321 files changed, 11189 insertions, 3499 deletions
diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst index 2df7b067ab93..0e15f9b05c9d 100644 --- a/Documentation/bpf/bpf_design_QA.rst +++ b/Documentation/bpf/bpf_design_QA.rst @@ -208,6 +208,12 @@ data structures and compile with kernel internal headers. Both of these kernel internals are subject to change and can break with newer kernels such that the program needs to be adapted accordingly. +Q: Are tracepoints part of the stable ABI? +------------------------------------------ +A: NO. Tracepoints are tied to internal implementation details, hence they are +subject to change and can break with newer kernels. BPF programs need to change +accordingly when this happens. + Q: How much stack space a BPF program uses? ------------------------------------------- A: Currently all program types are limited to 512 bytes of stack diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst index 5b613d2a5f1a..2ed89abbf9a4 100644 --- a/Documentation/bpf/bpf_devel_QA.rst +++ b/Documentation/bpf/bpf_devel_QA.rst @@ -501,16 +501,19 @@ All LLVM releases can be found at: http://releases.llvm.org/ Q: Got it, so how do I build LLVM manually anyway? -------------------------------------------------- -A: You need cmake and gcc-c++ as build requisites for LLVM. Once you have -that set up, proceed with building the latest LLVM and clang version +A: We recommend that developers who want the fastest incremental builds +use the Ninja build system; you can find it in your system's package +manager, usually as ninja or ninja-build. + +You need ninja, cmake and gcc-c++ as build requisites for LLVM. Once you +have that set up, proceed with building the latest LLVM and clang version from the git repositories:: $ git clone https://github.com/llvm/llvm-project.git - $ mkdir -p llvm-project/llvm/build/install + $ mkdir -p llvm-project/llvm/build $ cd llvm-project/llvm/build $ cmake .. 
-G "Ninja" -DLLVM_TARGETS_TO_BUILD="BPF;X86" \ -DLLVM_ENABLE_PROJECTS="clang" \ - -DBUILD_SHARED_LIBS=OFF \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_BUILD_RUNTIME=OFF $ ninja diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml index dac4aadb6e2e..f599c1d9c961 100644 --- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml @@ -89,6 +89,7 @@ properties: - trgmii - 1000base-x - 2500base-x + - 5gbase-r - rxaui - xaui diff --git a/Documentation/devicetree/bindings/net/toshiba,visconti-dwmac.yaml b/Documentation/devicetree/bindings/net/toshiba,visconti-dwmac.yaml new file mode 100644 index 000000000000..59724d18e6f3 --- /dev/null +++ b/Documentation/devicetree/bindings/net/toshiba,visconti-dwmac.yaml @@ -0,0 +1,85 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: "http://devicetree.org/schemas/net/toshiba,visconti-dwmac.yaml#" +$schema: "http://devicetree.org/meta-schemas/core.yaml#" + +title: Toshiba Visconti DWMAC Ethernet controller + +maintainers: + - Nobuhiro Iwamatsu <[email protected]> + +select: + properties: + compatible: + contains: + enum: + - toshiba,visconti-dwmac + required: + - compatible + +allOf: + - $ref: "snps,dwmac.yaml#" + +properties: + compatible: + oneOf: + - items: + - enum: + - toshiba,visconti-dwmac + - const: snps,dwmac-4.20a + + reg: + maxItems: 1 + + clocks: + items: + - description: main clock + - description: PHY reference clock + + clock-names: + items: + - const: stmmaceth + - const: phy_ref_clk + +required: + - compatible + - reg + - clocks + - clock-names + +unevaluatedProperties: false + +examples: + - | + #include <dt-bindings/interrupt-controller/arm-gic.h> + + soc { + #address-cells = <2>; + #size-cells = <2>; + + piether: ethernet@28000000 { + compatible = "toshiba,visconti-dwmac", "snps,dwmac-4.20a"; + reg = <0 0x28000000 0 0x10000>; + interrupts = <GIC_SPI 156 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq"; + clocks = <&clk300mhz>, <&clk125mhz>; + clock-names = "stmmaceth", "phy_ref_clk"; + snps,txpbl = <4>; + snps,rxpbl = <4>; + snps,tso; + phy-mode = "rgmii-id"; + phy-handle = <&phy0>; + + mdio0 { + #address-cells = <0x1>; + #size-cells = <0x0>; + compatible = "snps,dwmac-mdio"; + + phy0: ethernet-phy@1 { + device_type = "ethernet-phy"; + reg = <0x1>; + }; + }; + }; + }; diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index f6d8f90e9a56..251c6bd73d15 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -1048,12 +1048,12 @@ Unlike classic BPF instruction set, eBPF has generic load/store operations:: Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. It also includes atomic operations, which use the immediate field for extra -encoding. +encoding:: .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg -The basic atomic operations supported are: +The basic atomic operations supported are:: BPF_ADD BPF_AND @@ -1066,33 +1066,35 @@ memory location addresed by ``dst_reg + off`` is atomically modified, with immediate, then these operations also overwrite ``src_reg`` with the value that was in memory before it was modified. 
-The more special operations are: +The more special operations are:: BPF_XCHG This atomically exchanges ``src_reg`` with the value addressed by ``dst_reg + -off``. +off``. :: BPF_CMPXCHG This atomically compares the value addressed by ``dst_reg + off`` with -``R0``. If they match it is replaced with ``src_reg``, The value that was there -before is loaded back to ``R0``. +``R0``. If they match, it is replaced with ``src_reg``. In either case, the +value that was there before is zero-extended and loaded back to ``R0``. Note that 1 and 2 byte atomic operations are not supported. -Except ``BPF_ADD`` _without_ ``BPF_FETCH`` (for legacy reasons), all 4 byte -atomic operations require alu32 mode. Clang enables this mode by default in -architecture v3 (``-mcpu=v3``). For older versions it can be enabled with +Clang can generate atomic instructions by default when ``-mcpu=v3`` is +enabled. If a lower ``-mcpu`` version is set, the only atomic instruction +Clang can generate is ``BPF_ADD`` *without* ``BPF_FETCH``. If you need to enable +the atomic features while keeping a lower ``-mcpu`` version, you can use ``-Xclang -target-feature -Xclang +alu32``. -You may encounter BPF_XADD - this is a legacy name for BPF_ATOMIC, referring to -the exclusive-add operation encoded when the immediate field is zero. +You may encounter ``BPF_XADD`` - this is a legacy name for ``BPF_ATOMIC``, +referring to the exclusive-add operation encoded when the immediate field is +zero. -eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists +eBPF has one 16-byte instruction: ``BPF_LD | BPF_DW | BPF_IMM`` which consists of two consecutive ``struct bpf_insn`` 8-byte blocks and interpreted as single instruction that loads 64-bit immediate value into a dst_reg. -Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads +Classic BPF has similar instruction: ``BPF_LD | BPF_W | BPF_IMM`` which loads 32-bit immediate value into a register. eBPF verifier diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 581bfce86dca..c7952ac5bd2f 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -651,16 +651,15 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max default: initial size of receive buffer used by TCP sockets. This value overrides net.core.rmem_default used by other protocols. - Default: 87380 bytes. This value results in window of 65535 with - default setting of tcp_adv_win_scale and tcp_app_win:0 and a bit - less for default tcp_app_win. See below about these variables. + Default: 131072 bytes. + This value results in an initial window of 65535. max: maximal size of receive buffer allowed for automatically selected receiver buffers for TCP socket. This value does not override net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables automatic tuning of that socket's receive buffer size, in which case this value is ignored. - Default: between 87380B and 6MB, depending on RAM size. + Default: between 131072 and 6MB, depending on RAM size. tcp_sack - BOOLEAN Enable select acknowledgments (SACKS). diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index ae2ae37cd921..a64c01b52b4c 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -272,6 +272,22 @@ to the mailing list, e.g.:: Posting as one thread is discouraged because it confuses patchwork (as of patchwork 2.2.2). 
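On the tcp_rmem hunk above, the note that calling setsockopt() with SO_RCVBUF disables receive-buffer autotuning can be observed from userspace. A minimal sketch (the buffer size is an arbitrary example; the kernel doubles the requested value for bookkeeping overhead and clamps it against net.core.rmem_max)::

    /* rcvbuf_example.c - userspace sketch */
    #include <stdio.h>
    #include <sys/socket.h>

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int val = 262144; /* explicit request; opts this socket out of autotuning */
        socklen_t len = sizeof(val);

        if (fd < 0 ||
            setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)))
            return 1;

        /* read back the doubled (and possibly clamped) effective size */
        getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
        printf("effective SO_RCVBUF: %d\n", val);
        return 0;
    }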
+Can I reproduce the checks from patchwork on my local machine? +-------------------------------------------------------------- + +Checks in patchwork are mostly simple wrappers around existing kernel +scripts; the sources are available at: + +https://github.com/kuba-moo/nipa/tree/master/tests + +Running all the builds and checks locally is a pain, can I post my patches and have the patchwork bot validate them? +-------------------------------------------------------------------------------------------------------------------- + +No, you must ensure that your patches are ready by testing them locally +before posting to the mailing list. The patchwork build bot instance +gets overloaded very easily and netdev@vger really doesn't need more +traffic if we can help it. + Any other tips to help ensure my net/net-next patch gets OK'd? -------------------------------------------------------------- Attention to detail. Re-read your own work as if you were the diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index 399f17976a6c..06adfc2afcf0 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -216,7 +216,7 @@ put into an unsupported state. Lastly, once the controller is ready to handle network traffic, you call phy_start(phydev). This tells the PAL that you are ready, and configures the PHY to connect to the network. If the MAC interrupt of your network driver -also handles PHY status changes, just set phydev->irq to PHY_IGNORE_INTERRUPT +also handles PHY status changes, just set phydev->irq to PHY_MAC_INTERRUPT before you call phy_start and use phy_mac_interrupt() from the network driver. If you don't want to use interrupts, set phydev->irq to PHY_POLL. phy_start() enables the PHY interrupts (if applicable) and starts the @@ -267,6 +267,12 @@ Some of the interface modes are described below: duplex, pause or other settings. This is dependent on the MAC and/or PHY behaviour. +``PHY_INTERFACE_MODE_5GBASER`` + This is the IEEE 802.3 Clause 129 defined 5GBASE-R protocol. It is + identical to the 10GBASE-R protocol defined in Clause 49, with the + exception that it operates at half the frequency. Please refer to the + IEEE standard for the definition. + ``PHY_INTERFACE_MODE_10GBASER`` This is the IEEE 802.3 Clause 49 defined 10GBASE-R protocol used with various different mediums. Please refer to the IEEE standard for a diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst index 5aec7c8857d0..328f8d39eeb3 100644 --- a/Documentation/networking/sfp-phylink.rst +++ b/Documentation/networking/sfp-phylink.rst @@ -163,7 +163,7 @@ this documentation. err = phylink_of_phy_connect(priv->phylink, node, flags); For the most part, ``flags`` can be zero; these flags are passed to - the of_phy_attach() inside this function call if a PHY is specified + the phy_attach_direct() inside this function call if a PHY is specified in the DT node ``node``. 
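To make the PHY_MAC_INTERRUPT flow from the phy.rst hunk above concrete, here is a minimal driver-side sketch; all foo_* names are hypothetical, while phy_mac_interrupt(), phy_start() and PHY_MAC_INTERRUPT are the kernel symbols the documentation refers to::

    #include <linux/interrupt.h>
    #include <linux/netdevice.h>
    #include <linux/phy.h>

    struct foo_priv {
        struct phy_device *phydev;
    };

    /* hypothetical: did the shared MAC IRQ fire for a PHY event? */
    static bool foo_irq_cause_is_phy(struct foo_priv *priv)
    {
        return true; /* placeholder for a real cause-register check */
    }

    static irqreturn_t foo_isr(int irq, void *dev_id)
    {
        struct foo_priv *priv = dev_id;

        if (foo_irq_cause_is_phy(priv))
            phy_mac_interrupt(priv->phydev); /* phylib takes it from here */

        return IRQ_HANDLED;
    }

    static int foo_open(struct net_device *ndev)
    {
        struct foo_priv *priv = netdev_priv(ndev);

        /* the MAC IRQ reports PHY changes: no polling, no PHY IRQ line */
        priv->phydev->irq = PHY_MAC_INTERRUPT;
        phy_start(priv->phydev);
        return 0;
    }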
``node`` should be the DT node which contains the network phy property, diff --git a/MAINTAINERS b/MAINTAINERS index 99335fd22c0a..986a8eef8633 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2641,8 +2641,10 @@ L: [email protected] (moderated for non-subscribers) S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/iwamatsu/linux-visconti.git F: Documentation/devicetree/bindings/arm/toshiba.yaml +F: Documentation/devicetree/bindings/net/toshiba,visconti-dwmac.yaml F: Documentation/devicetree/bindings/pinctrl/toshiba,tmpv7700-pinctrl.yaml F: arch/arm64/boot/dts/toshiba/ +F: drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c F: drivers/pinctrl/visconti/ N: visconti @@ -1082,6 +1082,17 @@ ifdef CONFIG_STACK_VALIDATION endif endif +PHONY += resolve_btfids_clean + +resolve_btfids_O = $(abspath $(objtree))/tools/bpf/resolve_btfids + +# tools/bpf/resolve_btfids directory might not exist +# in output directory, skip its clean in that case +resolve_btfids_clean: +ifneq ($(wildcard $(resolve_btfids_O)),) + $(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean +endif + ifdef CONFIG_BPF ifdef CONFIG_DEBUG_INFO_BTF ifeq ($(has_libelf),1) @@ -1491,7 +1502,7 @@ vmlinuxclean: $(Q)$(CONFIG_SHELL) $(srctree)/scripts/link-vmlinux.sh clean $(Q)$(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) clean) -clean: archclean vmlinuxclean +clean: archclean vmlinuxclean resolve_btfids_clean # mrproper - Delete all generated files, including .config # diff --git a/arch/arm64/boot/dts/toshiba/tmpv7708-rm-mbrc.dts b/arch/arm64/boot/dts/toshiba/tmpv7708-rm-mbrc.dts index ed0bf7f13f54..48fa8776e36f 100644 --- a/arch/arm64/boot/dts/toshiba/tmpv7708-rm-mbrc.dts +++ b/arch/arm64/boot/dts/toshiba/tmpv7708-rm-mbrc.dts @@ -41,3 +41,21 @@ clocks = <&uart_clk>; clock-names = "apb_pclk"; }; + +&piether { + status = "okay"; + phy-handle = <&phy0>; + phy-mode = "rgmii-id"; + clocks = <&clk300mhz>, <&clk125mhz>; + clock-names = "stmmaceth", "phy_ref_clk"; + + mdio0 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "snps,dwmac-mdio"; + phy0: ethernet-phy@1 { + device_type = "ethernet-phy"; + reg = <0x1>; + }; + }; +}; diff --git a/arch/arm64/boot/dts/toshiba/tmpv7708.dtsi b/arch/arm64/boot/dts/toshiba/tmpv7708.dtsi index 242f25f4e12a..3366786699fc 100644 --- a/arch/arm64/boot/dts/toshiba/tmpv7708.dtsi +++ b/arch/arm64/boot/dts/toshiba/tmpv7708.dtsi @@ -134,6 +134,20 @@ #clock-cells = <0>; }; + clk125mhz: clk125mhz { + compatible = "fixed-clock"; + clock-frequency = <125000000>; + #clock-cells = <0>; + clock-output-names = "clk125mhz"; + }; + + clk300mhz: clk300mhz { + compatible = "fixed-clock"; + clock-frequency = <300000000>; + #clock-cells = <0>; + clock-output-names = "clk300mhz"; + }; + soc { #address-cells = <2>; #size-cells = <2>; @@ -384,6 +398,17 @@ #size-cells = <0>; status = "disabled"; }; + + piether: ethernet@28000000 { + compatible = "toshiba,visconti-dwmac", "snps,dwmac-4.20a"; + reg = <0 0x28000000 0 0x10000>; + interrupts = <GIC_SPI 156 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq"; + snps,txpbl = <4>; + snps,rxpbl = <4>; + snps,tso; + status = "disabled"; + }; }; }; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 1d4d50199293..79e7a0ec1da5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -869,8 +869,31 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt, } } +static int emit_nops(u8 **pprog, int len) +{ + u8 *prog = *pprog; + int i, noplen, cnt = 0; + + while (len > 0) { + noplen = len; + 
+ if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + + for (i = 0; i < noplen; i++) + EMIT1(ideal_nops[noplen][i]); + len -= noplen; + } + + *pprog = prog; + + return cnt; +} + +#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, - int oldproglen, struct jit_context *ctx) + int oldproglen, struct jit_context *ctx, bool jmp_padding) { bool tail_call_reachable = bpf_prog->aux->tail_call_reachable; struct bpf_insn *insn = bpf_prog->insnsi; @@ -880,7 +903,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; int i, cnt = 0, excnt = 0; - int proglen = 0; + int ilen, proglen = 0; u8 *prog = temp; int err; @@ -894,17 +917,24 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, bpf_prog_was_classic(bpf_prog), tail_call_reachable, bpf_prog->aux->func_idx != 0); push_callee_regs(&prog, callee_regs_used); - addrs[0] = prog - temp; + + ilen = prog - temp; + if (image) + memcpy(image + proglen, temp, ilen); + proglen += ilen; + addrs[0] = proglen; + prog = temp; for (i = 1; i <= insn_cnt; i++, insn++) { const s32 imm32 = insn->imm; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; u8 b2 = 0, b3 = 0; + u8 *start_of_ldx; s64 jmp_offset; u8 jmp_cond; - int ilen; u8 *func; + int nops; switch (insn->code) { /* ALU */ @@ -1249,12 +1279,30 @@ st: if (is_imm8(insn->off)) case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_MEM | BPF_DW: case BPF_LDX | BPF_PROBE_MEM | BPF_DW: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { + /* test src_reg, src_reg */ + maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */ + EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg)); + /* jne start_of_ldx */ + EMIT2(X86_JNE, 0); + /* xor dst_reg, dst_reg */ + emit_mov_imm32(&prog, false, dst_reg, 0); + /* jmp byte_after_ldx */ + EMIT2(0xEB, 0); + + /* populate jmp_offset for JNE above */ + temp[4] = prog - temp - 5 /* sizeof(test + jne) */; + start_of_ldx = prog; + } emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { struct exception_table_entry *ex; u8 *_insn = image + proglen; s64 delta; + /* populate jmp_offset for JMP above */ + start_of_ldx[-1] = prog - start_of_ldx; + if (!bpf_prog->aux->extable) break; @@ -1502,6 +1550,30 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ } jmp_offset = addrs[i + insn->off] - addrs[i]; if (is_imm8(jmp_offset)) { + if (jmp_padding) { + /* To keep the jmp_offset valid, the extra bytes are + * padded before the jump insn, so we subtract the + * 2 bytes of jmp_cond insn from INSN_SZ_DIFF. + * + * If the previous pass already emits an imm8 + * jmp_cond, then this BPF insn won't shrink, so + * "nops" is 0. + * + * On the other hand, if the previous pass emits an + * imm32 jmp_cond, the extra 4 bytes(*) are padded to + * keep the image from shrinking further. + * + * (*) imm32 jmp_cond is 6 bytes, and imm8 jmp_cond + * is 2 bytes, so the size difference is 4 bytes. 
+ */ + nops = INSN_SZ_DIFF - 2; + if (nops != 0 && nops != 4) { + pr_err("unexpected jmp_cond padding: %d bytes\n", + nops); + return -EFAULT; + } + cnt += emit_nops(&prog, nops); + } EMIT2(jmp_cond, jmp_offset); } else if (is_simm32(jmp_offset)) { EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset); @@ -1524,11 +1596,55 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ else jmp_offset = addrs[i + insn->off] - addrs[i]; - if (!jmp_offset) - /* Optimize out nop jumps */ + if (!jmp_offset) { + /* + * If jmp_padding is enabled, the extra nops will + * be inserted. Otherwise, optimize out nop jumps. + */ + if (jmp_padding) { + /* There are 3 possible conditions. + * (1) This BPF_JA is already optimized out in + * the previous run, so there is no need + * to pad any extra byte (0 byte). + * (2) The previous pass emits an imm8 jmp, + * so we pad 2 bytes to match the previous + * insn size. + * (3) Similarly, the previous pass emits an + * imm32 jmp, and 5 bytes are padded. + */ + nops = INSN_SZ_DIFF; + if (nops != 0 && nops != 2 && nops != 5) { + pr_err("unexpected nop jump padding: %d bytes\n", + nops); + return -EFAULT; + } + cnt += emit_nops(&prog, nops); + } break; + } emit_jmp: if (is_imm8(jmp_offset)) { + if (jmp_padding) { + /* To avoid breaking jmp_offset, the extra bytes + * are padded before the actual jmp insn, so + * 2 bytes are subtracted from INSN_SZ_DIFF. + * + * If the previous pass already emits an imm8 + * jmp, there is nothing to pad (0 byte). + * + * If it emits an imm32 jmp (5 bytes) previously + * and now an imm8 jmp (2 bytes), then we pad + * (5 - 2 = 3) bytes to stop the image from + * shrinking further. + */ + nops = INSN_SZ_DIFF - 2; + if (nops != 0 && nops != 3) { + pr_err("unexpected jump padding: %d bytes\n", + nops); + return -EFAULT; + } + cnt += emit_nops(&prog, INSN_SZ_DIFF - 2); + } EMIT2(0xEB, jmp_offset); } else if (is_simm32(jmp_offset)) { EMIT1_off32(0xE9, jmp_offset); @@ -1624,17 +1740,25 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, struct bpf_prog *p, int stack_size, bool mod_ret) { u8 *prog = *pprog; + u8 *jmp_insn; int cnt = 0; - if (p->aux->sleepable) { - if (emit_call(&prog, __bpf_prog_enter_sleepable, prog)) - return -EINVAL; - } else { - if (emit_call(&prog, __bpf_prog_enter, prog)) + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); + if (emit_call(&prog, + p->aux->sleepable ? 
__bpf_prog_enter_sleepable : + __bpf_prog_enter, prog)) return -EINVAL; - /* remember prog start time returned by __bpf_prog_enter */ - emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); - } + /* remember prog start time returned by __bpf_prog_enter */ + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + + /* if (__bpf_prog_enter*(prog) == 0) + * goto skip_exec_of_prog; + */ + EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ + /* emit 2 nops that will be replaced with JE insn */ + jmp_insn = prog; + emit_nops(&prog, 2); /* arg1: lea rdi, [rbp - stack_size] */ EMIT4(0x48, 0x8D, 0x7D, -stack_size); @@ -1654,43 +1778,23 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (mod_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - if (p->aux->sleepable) { - if (emit_call(&prog, __bpf_prog_exit_sleepable, prog)) - return -EINVAL; - } else { - /* arg1: mov rdi, progs[i] */ - emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, - (u32) (long) p); - /* arg2: mov rsi, rbx <- start time in nsec */ - emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); - if (emit_call(&prog, __bpf_prog_exit, prog)) + /* replace 2 nops with JE insn, since jmp target is known */ + jmp_insn[0] = X86_JE; + jmp_insn[1] = prog - jmp_insn - 2; + + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); + /* arg2: mov rsi, rbx <- start time in nsec */ + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + if (emit_call(&prog, + p->aux->sleepable ? __bpf_prog_exit_sleepable : + __bpf_prog_exit, prog)) return -EINVAL; - } *pprog = prog; return 0; } -static void emit_nops(u8 **pprog, unsigned int len) -{ - unsigned int i, noplen; - u8 *prog = *pprog; - int cnt = 0; - - while (len > 0) { - noplen = len; - - if (noplen > ASM_NOP_MAX) - noplen = ASM_NOP_MAX; - - for (i = 0; i < noplen; i++) - EMIT1(ideal_nops[noplen][i]); - len -= noplen; - } - - *pprog = prog; -} - static void emit_align(u8 **pprog, u32 align) { u8 *target, *prog = *pprog; @@ -2065,6 +2169,9 @@ struct x64_jit_data { struct jit_context ctx; }; +#define MAX_PASSES 20 +#define PADDING_PASSES (MAX_PASSES - 5) + struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; @@ -2074,6 +2181,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) struct jit_context ctx = {}; bool tmp_blinded = false; bool extra_pass = false; + bool padding = false; u8 *image = NULL; int *addrs; int pass; @@ -2110,6 +2218,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) image = jit_data->image; header = jit_data->header; extra_pass = true; + padding = true; goto skip_init_addrs; } addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); @@ -2135,8 +2244,10 @@ skip_init_addrs: * may converge on the last pass. In such case do one more * pass to emit the final image. 
*/ - for (pass = 0; pass < 20 || image; pass++) { - proglen = do_jit(prog, addrs, image, oldproglen, &ctx); + for (pass = 0; pass < MAX_PASSES || image; pass++) { + if (!padding && pass >= PADDING_PASSES) + padding = true; + proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding); if (proglen <= 0) { out_image: image = NULL; diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index 5f0472c18bcb..0c13cac903de 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -3743,16 +3743,7 @@ static int __init idt77252_init(void) struct sk_buff *skb; printk("%s: at %p\n", __func__, idt77252_init); - - if (sizeof(skb->cb) < sizeof(struct atm_skb_data) + - sizeof(struct idt77252_skb_prv)) { - printk(KERN_ERR "%s: skb->cb is too small (%lu < %lu)\n", - __func__, (unsigned long) sizeof(skb->cb), - (unsigned long) sizeof(struct atm_skb_data) + - sizeof(struct idt77252_skb_prv)); - return -EIO; - } - + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct idt77252_skb_prv) + sizeof(struct atm_skb_data)); return pci_register_driver(&idt77252_driver); } diff --git a/drivers/atm/idt77252.h b/drivers/atm/idt77252.h index 9339197d701c..b059d31364dd 100644 --- a/drivers/atm/idt77252.h +++ b/drivers/atm/idt77252.h @@ -789,7 +789,7 @@ struct idt77252_skb_prv { struct scqe tbd; /* Transmit Buffer Descriptor */ dma_addr_t paddr; /* DMA handle */ u32 pool; /* sb_pool handle */ -}; +} __packed; #define IDT77252_PRV_TBD(skb) \ (((struct idt77252_skb_prv *)(ATM_SKB(skb)+1))->tbd) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 72c75c7bdb65..ae86ded1e2a1 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1409,7 +1409,8 @@ void b53_phylink_mac_link_up(struct dsa_switch *ds, int port, } EXPORT_SYMBOL(b53_phylink_mac_link_up); -int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) +int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, + struct netlink_ext_ack *extack) { struct b53_device *dev = ds->priv; @@ -1444,7 +1445,8 @@ static int b53_vlan_prepare(struct dsa_switch *ds, int port, } int b53_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct b53_device *dev = ds->priv; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index ae72ef46b0b6..faf983fbca82 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -346,9 +346,11 @@ void b53_phylink_mac_link_up(struct dsa_switch *ds, int port, struct phy_device *phydev, int speed, int duplex, bool tx_pause, bool rx_pause); -int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering); +int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, + struct netlink_ext_ack *extack); int b53_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan); + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack); int b53_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); int b53_fdb_add(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c index 178218cf73a3..a7e2fcf2df2c 100644 --- a/drivers/net/dsa/bcm_sf2_cfp.c +++ b/drivers/net/dsa/bcm_sf2_cfp.c @@ -891,7 +891,7 @@ static int bcm_sf2_cfp_rule_insert(struct dsa_switch *ds, int port, else 
vlan.flags = 0; - ret = ds->ops->port_vlan_add(ds, port_num, &vlan); + ret = ds->ops->port_vlan_add(ds, port_num, &vlan, NULL); if (ret) return ret; } diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index 8c283f59158b..bfdf3324aac3 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -190,7 +190,8 @@ static void dsa_loop_port_stp_state_set(struct dsa_switch *ds, int port, } static int dsa_loop_port_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering) + bool vlan_filtering, + struct netlink_ext_ack *extack) { dev_dbg(ds->dev, "%s: port: %d, vlan_filtering: %d\n", __func__, port, vlan_filtering); @@ -199,7 +200,8 @@ static int dsa_loop_port_vlan_filtering(struct dsa_switch *ds, int port, } static int dsa_loop_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; diff --git a/drivers/net/dsa/hirschmann/hellcreek.c b/drivers/net/dsa/hirschmann/hellcreek.c index f984ca75a71f..463137c39db2 100644 --- a/drivers/net/dsa/hirschmann/hellcreek.c +++ b/drivers/net/dsa/hirschmann/hellcreek.c @@ -341,7 +341,8 @@ static u16 hellcreek_private_vid(int port) } static int hellcreek_vlan_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct hellcreek *hellcreek = ds->priv; int i; @@ -358,8 +359,10 @@ static int hellcreek_vlan_prepare(struct dsa_switch *ds, int port, if (!dsa_is_user_port(ds, i)) continue; - if (vlan->vid == restricted_vid) + if (vlan->vid == restricted_vid) { + NL_SET_ERR_MSG_MOD(extack, "VID restricted by driver"); return -EBUSY; + } } return 0; @@ -445,14 +448,15 @@ static void hellcreek_unapply_vlan(struct hellcreek *hellcreek, int port, } static int hellcreek_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; struct hellcreek *hellcreek = ds->priv; int err; - err = hellcreek_vlan_prepare(ds, port, vlan); + err = hellcreek_vlan_prepare(ds, port, vlan, extack); if (err) return err; @@ -871,7 +875,8 @@ static int hellcreek_fdb_dump(struct dsa_switch *ds, int port, } static int hellcreek_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering) + bool vlan_filtering, + struct netlink_ext_ack *extack) { struct hellcreek *hellcreek = ds->priv; diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 9fec97773a15..52e865a3912c 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -727,14 +727,18 @@ static int gswip_pce_load_microcode(struct gswip_priv *priv) } static int gswip_port_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering) + bool vlan_filtering, + struct netlink_ext_ack *extack) { struct net_device *bridge = dsa_to_port(ds, port)->bridge_dev; struct gswip_priv *priv = ds->priv; /* Do not allow changing the VLAN filtering options while in bridge */ - if (bridge && !!(priv->port_vlan_filter & BIT(port)) != vlan_filtering) + if (bridge && !!(priv->port_vlan_filter & BIT(port)) != vlan_filtering) { + NL_SET_ERR_MSG_MOD(extack, + "Dynamic toggling of vlan_filtering not 
supported"); return -EIO; + } if (vlan_filtering) { /* Use port based VLAN tag */ @@ -773,7 +777,7 @@ static int gswip_setup(struct dsa_switch *ds) /* disable port fetch/store dma on all ports */ for (i = 0; i < priv->hw_info->max_ports; i++) { gswip_port_disable(ds, i); - gswip_port_vlan_filtering(ds, i, false); + gswip_port_vlan_filtering(ds, i, false, NULL); } /* enable Switch */ @@ -1128,7 +1132,8 @@ static void gswip_port_bridge_leave(struct dsa_switch *ds, int port, } static int gswip_port_vlan_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct gswip_priv *priv = ds->priv; struct net_device *bridge = dsa_to_port(ds, port)->bridge_dev; @@ -1163,15 +1168,18 @@ static int gswip_port_vlan_prepare(struct dsa_switch *ds, int port, } } - if (idx == -1) + if (idx == -1) { + NL_SET_ERR_MSG_MOD(extack, "No slot in VLAN table"); return -ENOSPC; + } } return 0; } static int gswip_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct gswip_priv *priv = ds->priv; struct net_device *bridge = dsa_to_port(ds, port)->bridge_dev; @@ -1179,7 +1187,7 @@ static int gswip_port_vlan_add(struct dsa_switch *ds, int port, bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; int err; - err = gswip_port_vlan_prepare(ds, port, vlan); + err = gswip_port_vlan_prepare(ds, port, vlan, extack); if (err) return err; diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index c87d445b30fd..b4b7de63ca79 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -783,7 +783,8 @@ static void ksz8795_flush_dyn_mac_table(struct ksz_device *dev, int port) } static int ksz8795_port_vlan_filtering(struct dsa_switch *ds, int port, - bool flag) + bool flag, + struct netlink_ext_ack *extack) { struct ksz_device *dev = ds->priv; @@ -793,7 +794,8 @@ static int ksz8795_port_vlan_filtering(struct dsa_switch *ds, int port, } static int ksz8795_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; struct ksz_device *dev = ds->priv; diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index 00e38c8e0d01..55e5d479acce 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -493,7 +493,8 @@ static void ksz9477_flush_dyn_mac_table(struct ksz_device *dev, int port) } static int ksz9477_port_vlan_filtering(struct dsa_switch *ds, int port, - bool flag) + bool flag, + struct netlink_ext_ack *extack) { struct ksz_device *dev = ds->priv; @@ -511,7 +512,8 @@ static int ksz9477_port_vlan_filtering(struct dsa_switch *ds, int port, } static int ksz9477_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct ksz_device *dev = ds->priv; u32 vlan_table[3]; @@ -520,7 +522,7 @@ static int ksz9477_port_vlan_add(struct dsa_switch *ds, int port, err = ksz9477_get_vlan_table(dev, vlan->vid, vlan_table); if (err) { - dev_dbg(dev->dev, "Failed to get vlan table\n"); + NL_SET_ERR_MSG_MOD(extack, "Failed to get vlan table"); return err; } @@ -535,7 +537,7 @@ static int 
ksz9477_port_vlan_add(struct dsa_switch *ds, int port, err = ksz9477_set_vlan_table(dev, vlan->vid, vlan_table); if (err) { - dev_dbg(dev->dev, "Failed to set vlan table\n"); + NL_SET_ERR_MSG_MOD(extack, "Failed to set vlan table"); return err; } diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index eb13ba79dd01..c17de2bcf2fe 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1376,8 +1376,8 @@ mt7530_vlan_cmd(struct mt7530_priv *priv, enum mt7530_vlan_cmd cmd, u16 vid) } static int -mt7530_port_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering) +mt7530_port_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, + struct netlink_ext_ack *extack) { if (vlan_filtering) { /* The port is being kept as VLAN-unaware port when bridge is @@ -1483,7 +1483,8 @@ mt7530_hw_vlan_update(struct mt7530_priv *priv, u16 vid, static int mt7530_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 0ef1fadfec68..903d619e08ed 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1600,7 +1600,8 @@ static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port, } static int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering) + bool vlan_filtering, + struct netlink_ext_ack *extack) { struct mv88e6xxx_chip *chip = ds->priv; u16 mode = vlan_filtering ? MV88E6XXX_PORT_CTL2_8021Q_MODE_SECURE : @@ -1982,7 +1983,8 @@ static int mv88e6xxx_port_vlan_join(struct mv88e6xxx_chip *chip, int port, } static int mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct mv88e6xxx_chip *chip = ds->priv; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index d3180b0f2307..628afb47b579 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -14,8 +14,9 @@ #include <soc/mscc/ocelot_ptp.h> #include <soc/mscc/ocelot.h> #include <linux/dsa/8021q.h> +#include <linux/dsa/ocelot.h> #include <linux/platform_device.h> -#include <linux/packing.h> +#include <linux/ptp_classify.h> #include <linux/module.h> #include <linux/of_net.h> #include <linux/pci.h> @@ -264,6 +265,129 @@ static void felix_8021q_cpu_port_deinit(struct ocelot *ocelot, int port) ocelot_apply_bridge_fwd_mask(ocelot); } +/* Set up a VCAP IS2 rule for delivering PTP frames to the CPU port module. + * If the quirk_no_xtr_irq is in place, then also copy those PTP frames to the + * tag_8021q CPU port. 
+ */ +static int felix_setup_mmio_filtering(struct felix *felix) +{ + unsigned long user_ports = 0, cpu_ports = 0; + struct ocelot_vcap_filter *redirect_rule; + struct ocelot_vcap_filter *tagging_rule; + struct ocelot *ocelot = &felix->ocelot; + struct dsa_switch *ds = felix->ds; + int port, ret; + + tagging_rule = kzalloc(sizeof(struct ocelot_vcap_filter), GFP_KERNEL); + if (!tagging_rule) + return -ENOMEM; + + redirect_rule = kzalloc(sizeof(struct ocelot_vcap_filter), GFP_KERNEL); + if (!redirect_rule) { + kfree(tagging_rule); + return -ENOMEM; + } + + for (port = 0; port < ocelot->num_phys_ports; port++) { + if (dsa_is_user_port(ds, port)) + user_ports |= BIT(port); + if (dsa_is_cpu_port(ds, port)) + cpu_ports |= BIT(port); + } + + tagging_rule->key_type = OCELOT_VCAP_KEY_ETYPE; + *(__be16 *)tagging_rule->key.etype.etype.value = htons(ETH_P_1588); + *(__be16 *)tagging_rule->key.etype.etype.mask = htons(0xffff); + tagging_rule->ingress_port_mask = user_ports; + tagging_rule->prio = 1; + tagging_rule->id.cookie = ocelot->num_phys_ports; + tagging_rule->id.tc_offload = false; + tagging_rule->block_id = VCAP_IS1; + tagging_rule->type = OCELOT_VCAP_FILTER_OFFLOAD; + tagging_rule->lookup = 0; + tagging_rule->action.pag_override_mask = 0xff; + tagging_rule->action.pag_val = ocelot->num_phys_ports; + + ret = ocelot_vcap_filter_add(ocelot, tagging_rule, NULL); + if (ret) { + kfree(tagging_rule); + kfree(redirect_rule); + return ret; + } + + redirect_rule->key_type = OCELOT_VCAP_KEY_ANY; + redirect_rule->ingress_port_mask = user_ports; + redirect_rule->pag = ocelot->num_phys_ports; + redirect_rule->prio = 1; + redirect_rule->id.cookie = ocelot->num_phys_ports; + redirect_rule->id.tc_offload = false; + redirect_rule->block_id = VCAP_IS2; + redirect_rule->type = OCELOT_VCAP_FILTER_OFFLOAD; + redirect_rule->lookup = 0; + redirect_rule->action.cpu_copy_ena = true; + if (felix->info->quirk_no_xtr_irq) { + /* Redirect to the tag_8021q CPU but also copy PTP packets to + * the CPU port module + */ + redirect_rule->action.mask_mode = OCELOT_MASK_MODE_REDIRECT; + redirect_rule->action.port_mask = cpu_ports; + } else { + /* Trap PTP packets only to the CPU port module (which is + * redirected to the NPI port) + */ + redirect_rule->action.mask_mode = OCELOT_MASK_MODE_PERMIT_DENY; + redirect_rule->action.port_mask = 0; + } + + ret = ocelot_vcap_filter_add(ocelot, redirect_rule, NULL); + if (ret) { + ocelot_vcap_filter_del(ocelot, tagging_rule); + kfree(redirect_rule); + return ret; + } + + /* The ownership of the CPU port module's queues might have just been + * transferred to the tag_8021q tagger from the NPI-based tagger. + * So there might still be all sorts of crap in the queues. On the + * other hand, the MMIO-based matching of PTP frames is very brittle, + * so we need to be careful that there are no extra frames to be + * dequeued over MMIO, since we would never know to discard them. 
+ */ + ocelot_drain_cpu_queue(ocelot, 0); + + return 0; +} + +static int felix_teardown_mmio_filtering(struct felix *felix) +{ + struct ocelot_vcap_filter *tagging_rule, *redirect_rule; + struct ocelot_vcap_block *block_vcap_is1; + struct ocelot_vcap_block *block_vcap_is2; + struct ocelot *ocelot = &felix->ocelot; + int err; + + block_vcap_is1 = &ocelot->block[VCAP_IS1]; + block_vcap_is2 = &ocelot->block[VCAP_IS2]; + + tagging_rule = ocelot_vcap_block_find_filter_by_id(block_vcap_is1, + ocelot->num_phys_ports, + false); + if (!tagging_rule) + return -ENOENT; + + err = ocelot_vcap_filter_del(ocelot, tagging_rule); + if (err) + return err; + + redirect_rule = ocelot_vcap_block_find_filter_by_id(block_vcap_is2, + ocelot->num_phys_ports, + false); + if (!redirect_rule) + return -ENOENT; + + return ocelot_vcap_filter_del(ocelot, redirect_rule); +} + static int felix_setup_tag_8021q(struct dsa_switch *ds, int cpu) { struct ocelot *ocelot = ds->priv; @@ -292,9 +416,9 @@ static int felix_setup_tag_8021q(struct dsa_switch *ds, int cpu) ANA_PORT_CPU_FWD_BPDU_CFG, port); } - /* In tag_8021q mode, the CPU port module is unused. So we - * want to disable flooding of any kind to the CPU port module, - * since packets going there will end in a black hole. + /* In tag_8021q mode, the CPU port module is unused, except for PTP + * frames. So we want to disable flooding of any kind to the CPU port + * module, since packets going there will end in a black hole. */ cpu_flood = ANA_PGID_PGID_PGID(BIT(ocelot->num_phys_ports)); ocelot_rmw_rix(ocelot, 0, cpu_flood, ANA_PGID_PGID, PGID_UC); @@ -314,8 +438,14 @@ static int felix_setup_tag_8021q(struct dsa_switch *ds, int cpu) if (err) goto out_free_dsa_8021_ctx; + err = felix_setup_mmio_filtering(felix); + if (err) + goto out_teardown_dsa_8021q; + return 0; +out_teardown_dsa_8021q: + dsa_8021q_setup(felix->dsa_8021q_ctx, false); out_free_dsa_8021_ctx: kfree(felix->dsa_8021q_ctx); return err; @@ -327,6 +457,11 @@ static void felix_teardown_tag_8021q(struct dsa_switch *ds, int cpu) struct felix *felix = ocelot_to_felix(ocelot); int err, port; + err = felix_teardown_mmio_filtering(felix); + if (err) + dev_err(ds->dev, "felix_teardown_mmio_filtering returned %d", + err); + err = dsa_8021q_setup(felix->dsa_8021q_ctx, false); if (err) dev_err(ds->dev, "dsa_8021q_setup returned %d", err); @@ -431,6 +566,7 @@ static int felix_set_tag_protocol(struct dsa_switch *ds, int cpu, int err; switch (proto) { + case DSA_TAG_PROTO_SEVILLE: case DSA_TAG_PROTO_OCELOT: err = felix_setup_tag_npi(ds, cpu); break; @@ -448,6 +584,7 @@ static void felix_del_tag_protocol(struct dsa_switch *ds, int cpu, enum dsa_tag_protocol proto) { switch (proto) { + case DSA_TAG_PROTO_SEVILLE: case DSA_TAG_PROTO_OCELOT: felix_teardown_tag_npi(ds, cpu); break; @@ -471,7 +608,8 @@ static int felix_change_tag_protocol(struct dsa_switch *ds, int cpu, enum dsa_tag_protocol old_proto = felix->tag_proto; int err; - if (proto != DSA_TAG_PROTO_OCELOT && + if (proto != DSA_TAG_PROTO_SEVILLE && + proto != DSA_TAG_PROTO_OCELOT && proto != DSA_TAG_PROTO_OCELOT_8021Q) return -EPROTONOSUPPORT; @@ -643,7 +781,8 @@ static int felix_vlan_prepare(struct dsa_switch *ds, int port, flags & BRIDGE_VLAN_INFO_UNTAGGED); } -static int felix_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) +static int felix_vlan_filtering(struct dsa_switch *ds, int port, bool enabled, + struct netlink_ext_ack *extack) { struct ocelot *ocelot = ds->priv; @@ -651,7 +790,8 @@ static int felix_vlan_filtering(struct dsa_switch *ds, int port, 
bool enabled) } static int felix_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct ocelot *ocelot = ds->priv; u16 flags = vlan->flags; @@ -1003,7 +1143,6 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) for (port = 0; port < num_phys_ports; port++) { struct ocelot_port *ocelot_port; struct regmap *target; - u8 *template; ocelot_port = devm_kzalloc(ocelot->dev, sizeof(struct ocelot_port), @@ -1029,22 +1168,10 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) return PTR_ERR(target); } - template = devm_kzalloc(ocelot->dev, OCELOT_TOTAL_TAG_LEN, - GFP_KERNEL); - if (!template) { - dev_err(ocelot->dev, - "Failed to allocate memory for DSA tag\n"); - kfree(port_phy_modes); - return -ENOMEM; - } - ocelot_port->phy_mode = port_phy_modes[port]; ocelot_port->ocelot = ocelot; ocelot_port->target = target; - ocelot_port->xmit_template = template; ocelot->ports[port] = ocelot_port; - - felix->info->xmit_template_populate(ocelot, port); } kfree(port_phy_modes); @@ -1075,7 +1202,7 @@ static int felix_setup(struct dsa_switch *ds) err = ocelot_init(ocelot); if (err) - return err; + goto out_mdiobus_free; if (ocelot->ptp) { err = ocelot_init_timestamp(ocelot, felix->info->ptp_caps); @@ -1100,7 +1227,7 @@ static int felix_setup(struct dsa_switch *ds) err = ocelot_devlink_sb_register(ocelot); if (err) - return err; + goto out_deinit_ports; for (port = 0; port < ds->num_ports; port++) { if (!dsa_is_cpu_port(ds, port)) @@ -1116,6 +1243,23 @@ static int felix_setup(struct dsa_switch *ds) ds->assisted_learning_on_cpu_port = true; return 0; + +out_deinit_ports: + for (port = 0; port < ocelot->num_phys_ports; port++) { + if (dsa_is_unused_port(ds, port)) + continue; + + ocelot_deinit_port(ocelot, port); + } + + ocelot_deinit_timestamp(ocelot); + ocelot_deinit(ocelot); + +out_mdiobus_free: + if (felix->info->mdio_bus_free) + felix->info->mdio_bus_free(ocelot); + + return err; } static void felix_teardown(struct dsa_switch *ds) @@ -1135,8 +1279,12 @@ static void felix_teardown(struct dsa_switch *ds) ocelot_deinit_timestamp(ocelot); ocelot_deinit(ocelot); - for (port = 0; port < ocelot->num_phys_ports; port++) + for (port = 0; port < ocelot->num_phys_ports; port++) { + if (dsa_is_unused_port(ds, port)) + continue; + ocelot_deinit_port(ocelot, port); + } if (felix->info->mdio_bus_free) felix->info->mdio_bus_free(ocelot); @@ -1158,20 +1306,79 @@ static int felix_hwtstamp_set(struct dsa_switch *ds, int port, return ocelot_hwstamp_set(ocelot, port, ifr); } +static bool felix_check_xtr_pkt(struct ocelot *ocelot, unsigned int ptp_type) +{ + struct felix *felix = ocelot_to_felix(ocelot); + int err, grp = 0; + + if (felix->tag_proto != DSA_TAG_PROTO_OCELOT_8021Q) + return false; + + if (!felix->info->quirk_no_xtr_irq) + return false; + + if (ptp_type == PTP_CLASS_NONE) + return false; + + while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)) { + struct sk_buff *skb; + unsigned int type; + + err = ocelot_xtr_poll_frame(ocelot, grp, &skb); + if (err) + goto out; + + /* We trap to the CPU port module all PTP frames, but + * felix_rxtstamp() only gets called for event frames. + * So we need to avoid sending duplicate general + * message frames by running a second BPF classifier + * here and dropping those. 
+ */ + __skb_push(skb, ETH_HLEN); + + type = ptp_classify_raw(skb); + + __skb_pull(skb, ETH_HLEN); + + if (type == PTP_CLASS_NONE) { + kfree_skb(skb); + continue; + } + + netif_rx(skb); + } + +out: + if (err < 0) + ocelot_drain_cpu_queue(ocelot, 0); + + return true; +} + static bool felix_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type) { + u8 *extraction = skb->data - ETH_HLEN - OCELOT_TAG_LEN; struct skb_shared_hwtstamps *shhwtstamps; struct ocelot *ocelot = ds->priv; - u8 *extraction = skb->data - ETH_HLEN - OCELOT_TAG_LEN; u32 tstamp_lo, tstamp_hi; struct timespec64 ts; u64 tstamp, val; + /* If the "no XTR IRQ" workaround is in use, tell DSA to defer this skb + * for RX timestamping. Then free it, and poll for its copy through + * MMIO in the CPU port module, and inject that into the stack from + * ocelot_xtr_poll(). + */ + if (felix_check_xtr_pkt(ocelot, type)) { + kfree_skb(skb); + return true; + } + ocelot_ptp_gettime64(&ocelot->ptp_info, &ts); tstamp = ktime_set(ts.tv_sec, ts.tv_nsec); - packing(extraction, &val, 116, 85, OCELOT_TAG_LEN, UNPACK, 0); + ocelot_xfh_get_rew_val(extraction, &val); tstamp_lo = (u32)val; tstamp_hi = tstamp >> 32; @@ -1375,6 +1582,40 @@ static int felix_sb_occ_tc_port_bind_get(struct dsa_switch *ds, int port, pool_type, p_cur, p_max); } +static int felix_mrp_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_mrp *mrp) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_mrp_add(ocelot, port, mrp); +} + +static int felix_mrp_del(struct dsa_switch *ds, int port, + const struct switchdev_obj_mrp *mrp) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_mrp_add(ocelot, port, mrp); +} + +static int +felix_mrp_add_ring_role(struct dsa_switch *ds, int port, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_mrp_add_ring_role(ocelot, port, mrp); +} + +static int +felix_mrp_del_ring_role(struct dsa_switch *ds, int port, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_mrp_del_ring_role(ocelot, port, mrp); +} + const struct dsa_switch_ops felix_switch_ops = { .get_tag_protocol = felix_get_tag_protocol, .change_tag_protocol = felix_change_tag_protocol, @@ -1429,6 +1670,10 @@ const struct dsa_switch_ops felix_switch_ops = { .devlink_sb_occ_max_clear = felix_sb_occ_max_clear, .devlink_sb_occ_port_pool_get = felix_sb_occ_port_pool_get, .devlink_sb_occ_tc_port_bind_get= felix_sb_occ_tc_port_bind_get, + .port_mrp_add = felix_mrp_add, + .port_mrp_del = felix_mrp_del, + .port_mrp_add_ring_role = felix_mrp_add_ring_role, + .port_mrp_del_ring_role = felix_mrp_del_ring_role, }; struct net_device *felix_port_to_netdev(struct ocelot *ocelot, int port) diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h index 9d4459f2fffb..4d96cad815d5 100644 --- a/drivers/net/dsa/ocelot/felix.h +++ b/drivers/net/dsa/ocelot/felix.h @@ -23,6 +23,19 @@ struct felix_info { int switch_pci_bar; int imdio_pci_bar; const struct ptp_clock_info *ptp_caps; + + /* Some Ocelot switches are integrated into the SoC without the + * extraction IRQ line connected to the ARM GIC. By enabling this + * workaround, the few packets that are delivered to the CPU port + * module (currently only PTP) are copied not only to the hardware CPU + * port module, but also to the 802.1Q Ethernet CPU port, and polling + * the extraction registers is triggered once the DSA tagger sees a PTP + * frame. 
The Ethernet frame is only used as a notification: it is + * dropped, and the original frame is extracted over MMIO and annotated + * with the RX timestamp. + */ + bool quirk_no_xtr_irq; + int (*mdio_bus_alloc)(struct ocelot *ocelot); void (*mdio_bus_free)(struct ocelot *ocelot); void (*phylink_validate)(struct ocelot *ocelot, int port, @@ -34,7 +47,6 @@ struct felix_info { enum tc_setup_type type, void *type_data); void (*port_sched_speed_set)(struct ocelot *ocelot, int port, u32 speed); - void (*xmit_template_populate)(struct ocelot *ocelot, int port); }; extern const struct dsa_switch_ops felix_switch_ops; diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index e944868cc120..5ff623ee76a6 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -8,7 +8,7 @@ #include <soc/mscc/ocelot_ptp.h> #include <soc/mscc/ocelot_sys.h> #include <soc/mscc/ocelot.h> -#include <linux/packing.h> +#include <linux/dsa/ocelot.h> #include <linux/pcs-lynx.h> #include <net/pkt_sched.h> #include <linux/iopoll.h> @@ -1339,31 +1339,6 @@ static int vsc9959_port_setup_tc(struct dsa_switch *ds, int port, } } -static void vsc9959_xmit_template_populate(struct ocelot *ocelot, int port) -{ - struct ocelot_port *ocelot_port = ocelot->ports[port]; - u8 *template = ocelot_port->xmit_template; - u64 bypass, dest, src; - __be32 *prefix; - u8 *injection; - - /* Set the source port as the CPU port module and not the - * NPI port - */ - src = ocelot->num_phys_ports; - dest = BIT(port); - bypass = true; - - injection = template + OCELOT_SHORT_PREFIX_LEN; - prefix = (__be32 *)template; - - packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0); - packing(injection, &dest, 68, 56, OCELOT_TAG_LEN, PACK, 0); - packing(injection, &src, 46, 43, OCELOT_TAG_LEN, PACK, 0); - - *prefix = cpu_to_be32(0x8880000a); -} - static const struct felix_info felix_info_vsc9959 = { .target_io_res = vsc9959_target_io_res, .port_io_res = vsc9959_port_io_res, @@ -1379,6 +1354,7 @@ static const struct felix_info felix_info_vsc9959 = { .num_tx_queues = OCELOT_NUM_TC, .switch_pci_bar = 4, .imdio_pci_bar = 0, + .quirk_no_xtr_irq = true, .ptp_caps = &vsc9959_ptp_caps, .mdio_bus_alloc = vsc9959_mdio_bus_alloc, .mdio_bus_free = vsc9959_mdio_bus_free, @@ -1386,7 +1362,6 @@ static const struct felix_info felix_info_vsc9959 = { .prevalidate_phy_mode = vsc9959_prevalidate_phy_mode, .port_setup_tc = vsc9959_port_setup_tc, .port_sched_speed_set = vsc9959_sched_speed_set, - .xmit_template_populate = vsc9959_xmit_template_populate, }; static irqreturn_t felix_irq_handler(int irq, void *data) diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c b/drivers/net/dsa/ocelot/seville_vsc9953.c index 512f677a6c1c..84f93a874d50 100644 --- a/drivers/net/dsa/ocelot/seville_vsc9953.c +++ b/drivers/net/dsa/ocelot/seville_vsc9953.c @@ -8,7 +8,7 @@ #include <soc/mscc/ocelot.h> #include <linux/of_platform.h> #include <linux/pcs-lynx.h> -#include <linux/packing.h> +#include <linux/dsa/ocelot.h> #include <linux/iopoll.h> #include "felix.h" @@ -1165,31 +1165,6 @@ static void vsc9953_mdio_bus_free(struct ocelot *ocelot) mdiobus_unregister(felix->imdio); } -static void vsc9953_xmit_template_populate(struct ocelot *ocelot, int port) -{ - struct ocelot_port *ocelot_port = ocelot->ports[port]; - u8 *template = ocelot_port->xmit_template; - u64 bypass, dest, src; - __be32 *prefix; - u8 *injection; - - /* Set the source port as the CPU port module and not the - * NPI port - */ - src = 
ocelot->num_phys_ports; - dest = BIT(port); - bypass = true; - - injection = template + OCELOT_SHORT_PREFIX_LEN; - prefix = (__be32 *)template; - - packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0); - packing(injection, &dest, 67, 57, OCELOT_TAG_LEN, PACK, 0); - packing(injection, &src, 46, 43, OCELOT_TAG_LEN, PACK, 0); - - *prefix = cpu_to_be32(0x88800005); -} - static const struct felix_info seville_info_vsc9953 = { .target_io_res = vsc9953_target_io_res, .port_io_res = vsc9953_port_io_res, @@ -1206,7 +1181,6 @@ static const struct felix_info seville_info_vsc9953 = { .mdio_bus_free = vsc9953_mdio_bus_free, .phylink_validate = vsc9953_phylink_validate, .prevalidate_phy_mode = vsc9953_prevalidate_phy_mode, - .xmit_template_populate = vsc9953_xmit_template_populate, }; static int seville_probe(struct platform_device *pdev) @@ -1246,7 +1220,7 @@ static int seville_probe(struct platform_device *pdev) ds->ops = &felix_switch_ops; ds->priv = ocelot; felix->ds = ds; - felix->tag_proto = DSA_TAG_PROTO_OCELOT; + felix->tag_proto = DSA_TAG_PROTO_SEVILLE; err = dsa_register_switch(ds); if (err) { diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 6127823d6c2e..cdaf9f85a2cb 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -1294,7 +1294,8 @@ qca8k_port_fdb_dump(struct dsa_switch *ds, int port, } static int -qca8k_port_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) +qca8k_port_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, + struct netlink_ext_ack *extack) { struct qca8k_priv *priv = ds->priv; @@ -1313,7 +1314,8 @@ qca8k_port_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) static int qca8k_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; diff --git a/drivers/net/dsa/realtek-smi-core.h b/drivers/net/dsa/realtek-smi-core.h index 26376b052594..fcf465f7f922 100644 --- a/drivers/net/dsa/realtek-smi-core.h +++ b/drivers/net/dsa/realtek-smi-core.h @@ -130,10 +130,11 @@ int rtl8366_enable_vlan4k(struct realtek_smi *smi, bool enable); int rtl8366_enable_vlan(struct realtek_smi *smi, bool enable); int rtl8366_reset_vlan(struct realtek_smi *smi); int rtl8366_init_vlan(struct realtek_smi *smi); -int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering); +int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, + struct netlink_ext_ack *extack); int rtl8366_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan); + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack); int rtl8366_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); void rtl8366_get_strings(struct dsa_switch *ds, int port, u32 stringset, diff --git a/drivers/net/dsa/rtl8366.c b/drivers/net/dsa/rtl8366.c index 3b24f2e13200..75897a369096 100644 --- a/drivers/net/dsa/rtl8366.c +++ b/drivers/net/dsa/rtl8366.c @@ -340,7 +340,8 @@ int rtl8366_init_vlan(struct realtek_smi *smi) } EXPORT_SYMBOL_GPL(rtl8366_init_vlan); -int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) +int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, + struct netlink_ext_ack *extack) { struct realtek_smi *smi = ds->priv; struct rtl8366_vlan_4k 
vlan4k; @@ -375,7 +376,8 @@ int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) EXPORT_SYMBOL_GPL(rtl8366_vlan_filtering); int rtl8366_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { bool untagged = !!(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED); bool pvid = !!(vlan->flags & BRIDGE_VLAN_INFO_PVID); @@ -384,16 +386,20 @@ int rtl8366_vlan_add(struct dsa_switch *ds, int port, u32 untag = 0; int ret; - if (!smi->ops->is_vlan_valid(smi, vlan->vid)) + if (!smi->ops->is_vlan_valid(smi, vlan->vid)) { + NL_SET_ERR_MSG_MOD(extack, "VLAN ID not valid"); return -EINVAL; + } /* Enable VLAN in the hardware * FIXME: what's with this 4k business? * Just rtl8366_enable_vlan() seems inconclusive. */ ret = rtl8366_enable_vlan4k(smi, true); - if (ret) + if (ret) { + NL_SET_ERR_MSG_MOD(extack, "Failed to enable VLAN 4K"); return ret; + } dev_info(smi->dev, "add VLAN %d on port %d, %s, %s\n", vlan->vid, port, untagged ? "untagged" : "tagged", diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 15a0893d0ff1..f9e87fb33da0 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -206,6 +206,8 @@ struct sja1105_private { bool rgmii_tx_delay[SJA1105_NUM_PORTS]; bool best_effort_vlan_filtering; unsigned long learn_ena; + unsigned long ucast_egress_floods; + unsigned long bcast_egress_floods; const struct sja1105_info *info; struct gpio_desc *reset_gpio; struct spi_device *spidev; @@ -247,7 +249,8 @@ enum sja1105_reset_reason { int sja1105_static_config_reload(struct sja1105_private *priv, enum sja1105_reset_reason reason); -int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled); +int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled, + struct netlink_ext_ack *extack); void sja1105_frame_memory_partitioning(struct sja1105_private *priv); /* From sja1105_devlink.c */ diff --git a/drivers/net/dsa/sja1105/sja1105_devlink.c b/drivers/net/dsa/sja1105/sja1105_devlink.c index b4bf1b10e66c..b6a4a16b8c7e 100644 --- a/drivers/net/dsa/sja1105/sja1105_devlink.c +++ b/drivers/net/dsa/sja1105/sja1105_devlink.c @@ -143,7 +143,7 @@ static int sja1105_best_effort_vlan_filtering_set(struct sja1105_private *priv, dp = dsa_to_port(ds, port); vlan_filtering = dsa_port_is_vlan_filtering(dp); - rc = sja1105_vlan_filtering(ds, port, vlan_filtering); + rc = sja1105_vlan_filtering(ds, port, vlan_filtering, NULL); if (rc) break; } diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 1dad94540cc9..7692338730df 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -50,6 +50,12 @@ sja1105_port_allow_traffic(struct sja1105_l2_forwarding_entry *l2_fwd, l2_fwd[from].reach_port &= ~BIT(to); } +static bool sja1105_can_forward(struct sja1105_l2_forwarding_entry *l2_fwd, + int from, int to) +{ + return !!(l2_fwd[from].reach_port & BIT(to)); +} + /* Structure used to temporarily transport device tree * settings into sja1105_setup */ @@ -408,6 +414,12 @@ static int sja1105_init_l2_forwarding(struct sja1105_private *priv) for (j = 0; j < SJA1105_NUM_TC; j++) l2fwd[i].vlan_pmap[j] = j; + /* All ports start up with egress flooding enabled, + * including the CPU port. 
+ */ + priv->ucast_egress_floods |= BIT(i); + priv->bcast_egress_floods |= BIT(i); + if (i == upstream) continue; @@ -1571,6 +1583,50 @@ static int sja1105_mdb_del(struct dsa_switch *ds, int port, return sja1105_fdb_del(ds, port, mdb->addr, mdb->vid); } +/* Common function for unicast and broadcast flood configuration. + * Flooding is configured between each {ingress, egress} port pair, and since + * the bridge's semantics are those of "egress flooding", it means we must + * enable flooding towards this port from all ingress ports that are in the + * same forwarding domain. + */ +static int sja1105_manage_flood_domains(struct sja1105_private *priv) +{ + struct sja1105_l2_forwarding_entry *l2_fwd; + struct dsa_switch *ds = priv->ds; + int from, to, rc; + + l2_fwd = priv->static_config.tables[BLK_IDX_L2_FORWARDING].entries; + + for (from = 0; from < ds->num_ports; from++) { + u64 fl_domain = 0, bc_domain = 0; + + for (to = 0; to < priv->ds->num_ports; to++) { + if (!sja1105_can_forward(l2_fwd, from, to)) + continue; + + if (priv->ucast_egress_floods & BIT(to)) + fl_domain |= BIT(to); + if (priv->bcast_egress_floods & BIT(to)) + bc_domain |= BIT(to); + } + + /* Nothing changed, nothing to do */ + if (l2_fwd[from].fl_domain == fl_domain && + l2_fwd[from].bc_domain == bc_domain) + continue; + + l2_fwd[from].fl_domain = fl_domain; + l2_fwd[from].bc_domain = bc_domain; + + rc = sja1105_dynamic_config_write(priv, BLK_IDX_L2_FORWARDING, + from, &l2_fwd[from], true); + if (rc < 0) + return rc; + } + + return 0; +} + static int sja1105_bridge_member(struct dsa_switch *ds, int port, struct net_device *br, bool member) { @@ -1608,8 +1664,12 @@ static int sja1105_bridge_member(struct dsa_switch *ds, int port, return rc; } - return sja1105_dynamic_config_write(priv, BLK_IDX_L2_FORWARDING, - port, &l2_fwd[port], true); + rc = sja1105_dynamic_config_write(priv, BLK_IDX_L2_FORWARDING, + port, &l2_fwd[port], true); + if (rc) + return rc; + + return sja1105_manage_flood_domains(priv); } static void sja1105_bridge_stp_state_set(struct dsa_switch *ds, int port, @@ -2639,7 +2699,8 @@ out: * which can only be partially reconfigured at runtime (and not the TPID). * So a switch reset is required. */ -int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) +int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled, + struct netlink_ext_ack *extack) { struct sja1105_l2_lookup_params_entry *l2_lookup_params; struct sja1105_general_params_entry *general_params; @@ -2653,8 +2714,8 @@ int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) list_for_each_entry(rule, &priv->flow_block.rules, list) { if (rule->type == SJA1105_RULE_VL) { - dev_err(ds->dev, - "Cannot change VLAN filtering with active VL rules\n"); + NL_SET_ERR_MSG_MOD(extack, + "Cannot change VLAN filtering with active VL rules"); return -EBUSY; } } @@ -2736,7 +2797,7 @@ int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) rc = sja1105_static_config_reload(priv, SJA1105_VLAN_FILTERING); if (rc) - dev_err(ds->dev, "Failed to change VLAN Ethertype\n"); + NL_SET_ERR_MSG_MOD(extack, "Failed to change VLAN Ethertype"); /* Switch port identification based on 802.1Q is only passable * if we are not under a vlan_filtering bridge. 
So make sure @@ -2795,7 +2856,8 @@ static int sja1105_vlan_del_one(struct dsa_switch *ds, int port, u16 vid, } static int sja1105_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct sja1105_private *priv = ds->priv; bool vlan_table_changed = false; @@ -2807,7 +2869,8 @@ static int sja1105_vlan_add(struct dsa_switch *ds, int port, */ if (priv->vlan_state != SJA1105_VLAN_FILTERING_FULL && vid_is_dsa_8021q(vlan->vid)) { - dev_err(ds->dev, "Range 1024-3071 reserved for dsa_8021q operation\n"); + NL_SET_ERR_MSG_MOD(extack, + "Range 1024-3071 reserved for dsa_8021q operation"); return -EBUSY; } @@ -2936,6 +2999,8 @@ static int sja1105_setup(struct dsa_switch *ds) ds->mtu_enforcement_ingress = true; + priv->best_effort_vlan_filtering = true; + rc = sja1105_devlink_setup(ds); if (rc < 0) return rc; @@ -3282,7 +3347,7 @@ static int sja1105_port_set_learning(struct sja1105_private *priv, int port, mac = priv->static_config.tables[BLK_IDX_MAC_CONFIG].entries; - mac[port].dyn_learn = !!(priv->learn_ena & BIT(port)); + mac[port].dyn_learn = enabled; rc = sja1105_dynamic_config_write(priv, BLK_IDX_MAC_CONFIG, port, &mac[port], true); @@ -3297,51 +3362,24 @@ static int sja1105_port_set_learning(struct sja1105_private *priv, int port, return 0; } -/* Common function for unicast and broadcast flood configuration. - * Flooding is configured between each {ingress, egress} port pair, and since - * the bridge's semantics are those of "egress flooding", it means we must - * enable flooding towards this port from all ingress ports that are in the - * same bridge. In practice, we just enable flooding from all possible ingress - * ports regardless of whether they're in the same bridge or not, since the - * reach_port configuration will not allow flooded frames to leak across - * bridging domains anyway. 
- */ static int sja1105_port_ucast_bcast_flood(struct sja1105_private *priv, int to, struct switchdev_brport_flags flags) { - struct sja1105_l2_forwarding_entry *l2_fwd; - int from, rc; - - l2_fwd = priv->static_config.tables[BLK_IDX_L2_FORWARDING].entries; - - for (from = 0; from < priv->ds->num_ports; from++) { - if (dsa_is_unused_port(priv->ds, from)) - continue; - if (from == to) - continue; - - /* Unicast */ - if (flags.mask & BR_FLOOD) { - if (flags.val & BR_FLOOD) - l2_fwd[from].fl_domain |= BIT(to); - else - l2_fwd[from].fl_domain &= ~BIT(to); - } - /* Broadcast */ - if (flags.mask & BR_BCAST_FLOOD) { - if (flags.val & BR_BCAST_FLOOD) - l2_fwd[from].bc_domain |= BIT(to); - else - l2_fwd[from].bc_domain &= ~BIT(to); - } + if (flags.mask & BR_FLOOD) { + if (flags.val & BR_FLOOD) + priv->ucast_egress_floods |= BIT(to); + else + priv->ucast_egress_floods &= ~BIT(to); + } - rc = sja1105_dynamic_config_write(priv, BLK_IDX_L2_FORWARDING, - from, &l2_fwd[from], true); - if (rc < 0) - return rc; + if (flags.mask & BR_BCAST_FLOOD) { + if (flags.val & BR_BCAST_FLOOD) + priv->bcast_egress_floods |= BIT(to); + else + priv->bcast_egress_floods &= ~BIT(to); } - return 0; + return sja1105_manage_flood_domains(priv); } static int sja1105_port_mcast_flood(struct sja1105_private *priv, int to, diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h b/drivers/net/ethernet/amd/xgbe/xgbe-common.h index b40d4377cc71..b2cd3bdba9f8 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h @@ -1279,10 +1279,18 @@ #define MDIO_PMA_10GBR_FECCTRL 0x00ab #endif +#ifndef MDIO_PMA_RX_CTRL1 +#define MDIO_PMA_RX_CTRL1 0x8051 +#endif + #ifndef MDIO_PCS_DIG_CTRL #define MDIO_PCS_DIG_CTRL 0x8000 #endif +#ifndef MDIO_PCS_DIGITAL_STAT +#define MDIO_PCS_DIGITAL_STAT 0x8010 +#endif + #ifndef MDIO_AN_XNP #define MDIO_AN_XNP 0x0016 #endif @@ -1358,6 +1366,8 @@ #define XGBE_KR_TRAINING_ENABLE BIT(1) #define XGBE_PCS_CL37_BP BIT(12) +#define XGBE_PCS_PSEQ_STATE_MASK 0x1c +#define XGBE_PCS_PSEQ_STATE_POWER_GOOD 0x10 #define XGBE_AN_CL37_INT_CMPLT BIT(0) #define XGBE_AN_CL37_INT_MASK 0x01 @@ -1375,6 +1385,10 @@ #define XGBE_PMA_CDR_TRACK_EN_OFF 0x00 #define XGBE_PMA_CDR_TRACK_EN_ON 0x01 +#define XGBE_PMA_RX_RST_0_MASK BIT(4) +#define XGBE_PMA_RX_RST_0_RESET_ON 0x10 +#define XGBE_PMA_RX_RST_0_RESET_OFF 0x00 + /* Bit setting and getting macros * The get macro will extract the current bit field value from within * the variable diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 99b6d5a9f1d9..4f714f874c4f 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -1368,6 +1368,7 @@ static void xgbe_stop(struct xgbe_prv_data *pdata) return; netif_tx_stop_all_queues(netdev); + netif_carrier_off(pdata->netdev); xgbe_stop_timers(pdata); flush_workqueue(pdata->dev_workqueue); diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c index 93ef5a30cb8d..4e97b4869522 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c @@ -1345,7 +1345,7 @@ static void xgbe_phy_status(struct xgbe_prv_data *pdata) &an_restart); if (an_restart) { xgbe_phy_config_aneg(pdata); - return; + goto adjust_link; } if (pdata->phy.link) { @@ -1396,7 +1396,6 @@ static void xgbe_phy_stop(struct xgbe_prv_data *pdata) pdata->phy_if.phy_impl.stop(pdata); pdata->phy.link = 0; - netif_carrier_off(pdata->netdev); xgbe_phy_adjust_link(pdata); }
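The sja1105 hunks above replace the old per-{ingress, egress} pair flood programming with two driver-level bitmasks, ucast_egress_floods and bcast_egress_floods, which sja1105_manage_flood_domains() intersects with each ingress port's reach_port forwarding domain. A minimal standalone sketch of that recomputation follows; the fixed-size struct and port count are illustrative stand-ins for the kernel's structures, and the hardware write-back (sja1105_dynamic_config_write) is omitted.

#include <stdint.h>
#include <stdio.h>

#define NUM_PORTS 5

/* Illustrative stand-in for the relevant fields of
 * struct sja1105_l2_forwarding_entry; not the kernel layout.
 */
struct l2_fwd_entry {
	uint64_t reach_port;	/* egress ports reachable from this ingress port */
	uint64_t fl_domain;	/* ports that receive flooded unknown unicast */
	uint64_t bc_domain;	/* ports that receive flooded broadcast */
};

/* Same shape as sja1105_manage_flood_domains(): a port is flooded from
 * ingress port "from" only if it is both reachable from "from" and has
 * egress flooding enabled in the corresponding bitmask.
 */
static void manage_flood_domains(struct l2_fwd_entry *l2_fwd,
				 uint64_t ucast_floods, uint64_t bcast_floods)
{
	for (int from = 0; from < NUM_PORTS; from++) {
		uint64_t fl = 0, bc = 0;

		for (int to = 0; to < NUM_PORTS; to++) {
			if (!(l2_fwd[from].reach_port & (1ULL << to)))
				continue;
			if (ucast_floods & (1ULL << to))
				fl |= 1ULL << to;
			if (bcast_floods & (1ULL << to))
				bc |= 1ULL << to;
		}
		l2_fwd[from].fl_domain = fl;
		l2_fwd[from].bc_domain = bc;
	}
}

int main(void)
{
	struct l2_fwd_entry l2_fwd[NUM_PORTS] = { 0 };

	/* One bridge spanning all ports: each port reaches every other port */
	for (int i = 0; i < NUM_PORTS; i++)
		l2_fwd[i].reach_port = ((1ULL << NUM_PORTS) - 1) & ~(1ULL << i);

	/* Unicast flooding disabled towards port 3, broadcast enabled everywhere */
	manage_flood_domains(l2_fwd, 0x17, 0x1f);

	for (int i = 0; i < NUM_PORTS; i++)
		printf("port %d: fl_domain 0x%02llx bc_domain 0x%02llx\n", i,
		       (unsigned long long)l2_fwd[i].fl_domain,
		       (unsigned long long)l2_fwd[i].bc_domain);
	return 0;
}

Recomputing both domains from scratch on every change, rather than toggling individual bits, is what lets the bridge-join/leave path above reuse the same helper after updating reach_port.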
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c index 859ded0c06b0..18e48b3bc402 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c @@ -922,6 +922,9 @@ static bool xgbe_phy_belfuse_phy_quirks(struct xgbe_prv_data *pdata) if ((phy_id & 0xfffffff0) != 0x03625d10) return false; + /* Reset PHY - wait for self-clearing reset bit to clear */ + genphy_soft_reset(phy_data->phydev); + /* Disable RGMII mode */ phy_write(phy_data->phydev, 0x18, 0x7007); reg = phy_read(phy_data->phydev, 0x18); @@ -1953,6 +1956,27 @@ static void xgbe_phy_set_redrv_mode(struct xgbe_prv_data *pdata) xgbe_phy_put_comm_ownership(pdata); } +static void xgbe_phy_rx_reset(struct xgbe_prv_data *pdata) +{ + int reg; + + reg = XMDIO_READ_BITS(pdata, MDIO_MMD_PCS, MDIO_PCS_DIGITAL_STAT, + XGBE_PCS_PSEQ_STATE_MASK); + if (reg == XGBE_PCS_PSEQ_STATE_POWER_GOOD) { + /* Mailbox command timed out, reset of RX block is required. + * This can be done by asserting the reset bit and waiting for + * its completion. + */ + XMDIO_WRITE_BITS(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_RX_CTRL1, + XGBE_PMA_RX_RST_0_MASK, XGBE_PMA_RX_RST_0_RESET_ON); + ndelay(20); + XMDIO_WRITE_BITS(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_RX_CTRL1, + XGBE_PMA_RX_RST_0_MASK, XGBE_PMA_RX_RST_0_RESET_OFF); + usleep_range(40, 50); + netif_err(pdata, link, pdata->netdev, "firmware mailbox reset performed\n"); + } +} + static void xgbe_phy_perform_ratechange(struct xgbe_prv_data *pdata, unsigned int cmd, unsigned int sub_cmd) { @@ -1960,9 +1984,11 @@ static void xgbe_phy_perform_ratechange(struct xgbe_prv_data *pdata, unsigned int wait; /* Log if a previous command did not complete */ - if (XP_IOREAD_BITS(pdata, XP_DRIVER_INT_RO, STATUS)) + if (XP_IOREAD_BITS(pdata, XP_DRIVER_INT_RO, STATUS)) { netif_dbg(pdata, link, pdata->netdev, "firmware mailbox not ready for command\n"); + xgbe_phy_rx_reset(pdata); + } /* Construct the command */ XP_SET_BITS(s0, XP_DRIVER_SCRATCH_0, COMMAND, cmd); @@ -1984,6 +2010,9 @@ static void xgbe_phy_perform_ratechange(struct xgbe_prv_data *pdata, netif_dbg(pdata, link, pdata->netdev, "firmware mailbox command did not complete\n"); + + /* Reset on error */ + xgbe_phy_rx_reset(pdata); } static void xgbe_phy_rrc(struct xgbe_prv_data *pdata) @@ -2584,6 +2613,14 @@ static int xgbe_phy_link_status(struct xgbe_prv_data *pdata, int *an_restart) if (reg & MDIO_STAT1_LSTATUS) return 1; + if (pdata->phy.autoneg == AUTONEG_ENABLE && + phy_data->port_mode == XGBE_PORT_MODE_BACKPLANE) { + if (!test_bit(XGBE_LINK_INIT, &pdata->dev_state)) { + netif_carrier_off(pdata->netdev); + *an_restart = 1; + } + } + /* No link, attempt a receiver reset cycle */ if (phy_data->rrc_count++ > XGBE_RRC_FREQUENCY) { phy_data->rrc_count = 0; diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_main.c b/drivers/net/ethernet/aquantia/atlantic/aq_main.c index 8f70a3909929..4af0cd9530de 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_main.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_main.c @@ -71,8 +71,10 @@ static int aq_ndev_open(struct net_device *ndev) goto err_exit; err = aq_nic_start(aq_nic); - if (err < 0) + if (err < 0) { + aq_nic_stop(aq_nic); goto err_exit; + } err_exit: if (err < 0) diff --git a/drivers/net/ethernet/broadcom/bcm4908_enet.c b/drivers/net/ethernet/broadcom/bcm4908_enet.c index 0da8c8c73ba7..9be33dc98072 100644 --- a/drivers/net/ethernet/broadcom/bcm4908_enet.c +++ b/drivers/net/ethernet/broadcom/bcm4908_enet.c @@ -5,6 +5,7 @@ #include
<linux/delay.h> #include <linux/etherdevice.h> +#include <linux/if_vlan.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/of.h> @@ -29,9 +30,10 @@ ENET_DMA_CH_CFG_INT_BUFF_DONE) #define ENET_DMA_MAX_BURST_LEN 8 /* in 64 bit words */ -#define ENET_MTU_MIN 60 -#define ENET_MTU_MAX 1500 /* Is it possible to support 2044? */ -#define ENET_MTU_MAX_EXTRA_SIZE 32 /* L2 */ +#define ENET_MTU_MAX ETH_DATA_LEN /* Is it possible to support 2044? */ +#define BRCM_MAX_TAG_LEN 6 +#define ENET_MAX_ETH_OVERHEAD (ETH_HLEN + BRCM_MAX_TAG_LEN + VLAN_HLEN + \ + ETH_FCS_LEN + 4) /* 32 */ struct bcm4908_enet_dma_ring_bd { __le32 ctl; @@ -135,6 +137,11 @@ static void bcm4908_enet_intrs_ack(struct bcm4908_enet *enet) enet_write(enet, ENET_DMA_CH_RX_CFG + ENET_DMA_CH_CFG_INT_STAT, ENET_DMA_INT_DEFAULTS); } +static void bcm4908_enet_set_mtu(struct bcm4908_enet *enet, int mtu) +{ + enet_umac_write(enet, UMAC_MAX_FRAME_LEN, mtu + ENET_MAX_ETH_OVERHEAD); +} + /*** * DMA */ @@ -246,7 +253,7 @@ static int bcm4908_enet_dma_alloc_rx_buf(struct bcm4908_enet *enet, unsigned int u32 tmp; int err; - slot->len = ENET_MTU_MAX + ENET_MTU_MAX_EXTRA_SIZE; + slot->len = ENET_MTU_MAX + ENET_MAX_ETH_OVERHEAD; slot->skb = netdev_alloc_skb(enet->netdev, slot->len); if (!slot->skb) @@ -374,6 +381,8 @@ static void bcm4908_enet_gmac_init(struct bcm4908_enet *enet) { u32 cmd; + bcm4908_enet_set_mtu(enet, enet->netdev->mtu); + cmd = enet_umac_read(enet, UMAC_CMD); enet_umac_write(enet, UMAC_CMD, cmd | CMD_SW_RESET); enet_umac_write(enet, UMAC_CMD, cmd & ~CMD_SW_RESET); @@ -559,7 +568,7 @@ static int bcm4908_enet_poll(struct napi_struct *napi, int weight) len = (ctl & DMA_CTL_LEN_DESC_BUFLENGTH) >> DMA_CTL_LEN_DESC_BUFLENGTH_SHIFT; - if (len < ENET_MTU_MIN || + if (len < ETH_ZLEN || (ctl & (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) != (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) { enet->netdev->stats.rx_dropped++; break; @@ -583,11 +592,21 @@ static int bcm4908_enet_poll(struct napi_struct *napi, int weight) return handled; } +static int bcm4908_enet_change_mtu(struct net_device *netdev, int new_mtu) +{ + struct bcm4908_enet *enet = netdev_priv(netdev); + + bcm4908_enet_set_mtu(enet, new_mtu); + + return 0; +} + static const struct net_device_ops bcm4908_enet_netdev_ops = { .ndo_open = bcm4908_enet_open, .ndo_stop = bcm4908_enet_stop, .ndo_start_xmit = bcm4908_enet_start_xmit, .ndo_set_mac_address = eth_mac_addr, + .ndo_change_mtu = bcm4908_enet_change_mtu, }; static int bcm4908_enet_probe(struct platform_device *pdev) @@ -625,7 +644,7 @@ static int bcm4908_enet_probe(struct platform_device *pdev) eth_hw_addr_random(netdev); netdev->netdev_ops = &bcm4908_enet_netdev_ops; netdev->min_mtu = ETH_ZLEN; - netdev->mtu = ENET_MTU_MAX; + netdev->mtu = ETH_DATA_LEN; netdev->max_mtu = ENET_MTU_MAX; netif_napi_add(netdev, &enet->napi, bcm4908_enet_poll, 64); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index f508c5c61a30..a680fd9c68ea 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -257,6 +257,7 @@ static const u16 bnxt_async_events_arr[] = { ASYNC_EVENT_CMPL_EVENT_ID_ERROR_RECOVERY, ASYNC_EVENT_CMPL_EVENT_ID_DEBUG_NOTIFICATION, ASYNC_EVENT_CMPL_EVENT_ID_RING_MONITOR_MSG, + ASYNC_EVENT_CMPL_EVENT_ID_ECHO_REQUEST, }; static struct workqueue_struct *bnxt_pf_wq; @@ -2054,14 +2055,11 @@ static int bnxt_async_event_process(struct bnxt *bp, fw_health->enabled = EVENT_DATA1_RECOVERY_ENABLED(data1); fw_health->master = 
EVENT_DATA1_RECOVERY_MASTER_FUNC(data1); - if (!fw_health->enabled) + if (!fw_health->enabled) { + netif_info(bp, drv, bp->dev, + "Error recovery info: error recovery[0]\n"); break; - - netif_info(bp, drv, bp->dev, - "Error recovery info: error recovery[%d], master[%d], reset count[0x%x], health status: 0x%x\n", - fw_health->enabled, fw_health->master, - bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG), - bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG)); + } fw_health->tmr_multiplier = DIV_ROUND_UP(fw_health->polling_dsecs * HZ, bp->current_interval * 10); @@ -2070,6 +2068,10 @@ static int bnxt_async_event_process(struct bnxt *bp, bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG); fw_health->last_fw_reset_cnt = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG); + netif_info(bp, drv, bp->dev, + "Error recovery info: error recovery[1], master[%d], reset count[%u], health status: 0x%x\n", + fw_health->master, fw_health->last_fw_reset_cnt, + bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG)); goto async_event_process_exit; } case ASYNC_EVENT_CMPL_EVENT_ID_DEBUG_NOTIFICATION: @@ -2099,6 +2101,20 @@ static int bnxt_async_event_process(struct bnxt *bp, bnxt_sched_reset(bp, rxr); goto async_event_process_exit; } + case ASYNC_EVENT_CMPL_EVENT_ID_ECHO_REQUEST: { + struct bnxt_fw_health *fw_health = bp->fw_health; + + netif_notice(bp, hw, bp->dev, + "Received firmware echo request, data1: 0x%x, data2: 0x%x\n", + data1, data2); + if (fw_health) { + fw_health->echo_req_data1 = data1; + fw_health->echo_req_data2 = data2; + set_bit(BNXT_FW_ECHO_REQUEST_SP_EVENT, &bp->sp_event); + break; + } + goto async_event_process_exit; + } default: goto async_event_process_exit; } @@ -2688,6 +2704,23 @@ static void bnxt_free_skbs(struct bnxt *bp) bnxt_free_rx_skbs(bp); } +static void bnxt_init_ctx_mem(struct bnxt_mem_init *mem_init, void *p, int len) +{ + u8 init_val = mem_init->init_val; + u16 offset = mem_init->offset; + u8 *p2 = p; + int i; + + if (!init_val) + return; + if (offset == BNXT_MEM_INVALID_OFFSET) { + memset(p, init_val, len); + return; + } + for (i = 0; i < len; i += mem_init->size) + *(p2 + i + offset) = init_val; +} + static void bnxt_free_ring(struct bnxt *bp, struct bnxt_ring_mem_info *rmem) { struct pci_dev *pdev = bp->pdev; @@ -2747,9 +2780,9 @@ static int bnxt_alloc_ring(struct bnxt *bp, struct bnxt_ring_mem_info *rmem) if (!rmem->pg_arr[i]) return -ENOMEM; - if (rmem->init_val) - memset(rmem->pg_arr[i], rmem->init_val, - rmem->page_size); + if (rmem->mem_init) + bnxt_init_ctx_mem(rmem->mem_init, rmem->pg_arr[i], + rmem->page_size); if (rmem->nr_pages > 1 || rmem->depth > 0) { if (i == rmem->nr_pages - 2 && (rmem->flags & BNXT_RMEM_RING_PTE_FLAG)) @@ -6750,6 +6783,39 @@ func_qcfg_exit: return rc; } +static void bnxt_init_ctx_initializer(struct bnxt_ctx_mem_info *ctx, + struct hwrm_func_backing_store_qcaps_output *resp) +{ + struct bnxt_mem_init *mem_init; + u16 init_mask; + u8 init_val; + u8 *offset; + int i; + + init_val = resp->ctx_kind_initializer; + init_mask = le16_to_cpu(resp->ctx_init_mask); + offset = &resp->qp_init_offset; + mem_init = &ctx->mem_init[BNXT_CTX_MEM_INIT_QP]; + for (i = 0; i < BNXT_CTX_MEM_INIT_MAX; i++, mem_init++, offset++) { + mem_init->init_val = init_val; + mem_init->offset = BNXT_MEM_INVALID_OFFSET; + if (!init_mask) + continue; + if (i == BNXT_CTX_MEM_INIT_STAT) + offset = &resp->stat_init_offset; + if (init_mask & (1 << i)) + mem_init->offset = *offset * 4; + else + mem_init->init_val = 0; + } + ctx->mem_init[BNXT_CTX_MEM_INIT_QP].size = ctx->qp_entry_size; + 
ctx->mem_init[BNXT_CTX_MEM_INIT_SRQ].size = ctx->srq_entry_size; + ctx->mem_init[BNXT_CTX_MEM_INIT_CQ].size = ctx->cq_entry_size; + ctx->mem_init[BNXT_CTX_MEM_INIT_VNIC].size = ctx->vnic_entry_size; + ctx->mem_init[BNXT_CTX_MEM_INIT_STAT].size = ctx->stat_entry_size; + ctx->mem_init[BNXT_CTX_MEM_INIT_MRAV].size = ctx->mrav_entry_size; +} + static int bnxt_hwrm_func_backing_store_qcaps(struct bnxt *bp) { struct hwrm_func_backing_store_qcaps_input req = {0}; @@ -6804,7 +6870,9 @@ static int bnxt_hwrm_func_backing_store_qcaps(struct bnxt *bp) le16_to_cpu(resp->mrav_num_entries_units); ctx->tim_entry_size = le16_to_cpu(resp->tim_entry_size); ctx->tim_max_entries = le32_to_cpu(resp->tim_max_entries); - ctx->ctx_kind_initializer = resp->ctx_kind_initializer; + + bnxt_init_ctx_initializer(ctx, resp); + ctx->tqm_fp_rings_count = resp->tqm_fp_rings_count; if (!ctx->tqm_fp_rings_count) ctx->tqm_fp_rings_count = bp->max_q; @@ -6834,6 +6902,9 @@ static void bnxt_hwrm_set_pg_attr(struct bnxt_ring_mem_info *rmem, u8 *pg_attr, { u8 pg_size = 0; + if (!rmem->nr_pages) + return; + if (BNXT_PAGE_SHIFT == 13) pg_size = 1 << 4; else if (BNXT_PAGE_SIZE == 16) @@ -6978,7 +7049,7 @@ static int bnxt_alloc_ctx_mem_blk(struct bnxt *bp, static int bnxt_alloc_ctx_pg_tbls(struct bnxt *bp, struct bnxt_ctx_pg_info *ctx_pg, u32 mem_size, - u8 depth, bool use_init_val) + u8 depth, struct bnxt_mem_init *mem_init) { struct bnxt_ring_mem_info *rmem = &ctx_pg->ring_mem; int rc; @@ -7016,8 +7087,7 @@ static int bnxt_alloc_ctx_pg_tbls(struct bnxt *bp, rmem->pg_tbl_map = ctx_pg->ctx_dma_arr[i]; rmem->depth = 1; rmem->nr_pages = MAX_CTX_PAGES; - if (use_init_val) - rmem->init_val = bp->ctx->ctx_kind_initializer; + rmem->mem_init = mem_init; if (i == (nr_tbls - 1)) { int rem = ctx_pg->nr_pages % MAX_CTX_PAGES; @@ -7032,8 +7102,7 @@ static int bnxt_alloc_ctx_pg_tbls(struct bnxt *bp, rmem->nr_pages = DIV_ROUND_UP(mem_size, BNXT_PAGE_SIZE); if (rmem->nr_pages > 1 || depth) rmem->depth = 1; - if (use_init_val) - rmem->init_val = bp->ctx->ctx_kind_initializer; + rmem->mem_init = mem_init; rc = bnxt_alloc_ctx_mem_blk(bp, ctx_pg); } return rc; @@ -7097,6 +7166,7 @@ static int bnxt_alloc_ctx_mem(struct bnxt *bp) { struct bnxt_ctx_pg_info *ctx_pg; struct bnxt_ctx_mem_info *ctx; + struct bnxt_mem_init *init; u32 mem_size, ena, entries; u32 entries_sp, min; u32 num_mr, num_ah; @@ -7124,39 +7194,54 @@ static int bnxt_alloc_ctx_mem(struct bnxt *bp) ctx_pg = &ctx->qp_mem; ctx_pg->entries = ctx->qp_min_qp1_entries + ctx->qp_max_l2_entries + extra_qps; - mem_size = ctx->qp_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, true); - if (rc) - return rc; + if (ctx->qp_entry_size) { + mem_size = ctx->qp_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_QP]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, init); + if (rc) + return rc; + } ctx_pg = &ctx->srq_mem; ctx_pg->entries = ctx->srq_max_l2_entries + extra_srqs; - mem_size = ctx->srq_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, true); - if (rc) - return rc; + if (ctx->srq_entry_size) { + mem_size = ctx->srq_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_SRQ]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, init); + if (rc) + return rc; + } ctx_pg = &ctx->cq_mem; ctx_pg->entries = ctx->cq_max_l2_entries + extra_qps * 2; - mem_size = ctx->cq_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, true); - if 
(rc) - return rc; + if (ctx->cq_entry_size) { + mem_size = ctx->cq_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_CQ]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, pg_lvl, init); + if (rc) + return rc; + } ctx_pg = &ctx->vnic_mem; ctx_pg->entries = ctx->vnic_max_vnic_entries + ctx->vnic_max_ring_table_entries; - mem_size = ctx->vnic_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, true); - if (rc) - return rc; + if (ctx->vnic_entry_size) { + mem_size = ctx->vnic_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_VNIC]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, init); + if (rc) + return rc; + } ctx_pg = &ctx->stat_mem; ctx_pg->entries = ctx->stat_max_entries; - mem_size = ctx->stat_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, true); - if (rc) - return rc; + if (ctx->stat_entry_size) { + mem_size = ctx->stat_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_STAT]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, init); + if (rc) + return rc; + } ena = 0; if (!(bp->flags & BNXT_FLAG_ROCE_CAP)) @@ -7169,10 +7254,13 @@ static int bnxt_alloc_ctx_mem(struct bnxt *bp) num_mr = 1024 * 256; num_ah = 1024 * 128; ctx_pg->entries = num_mr + num_ah; - mem_size = ctx->mrav_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 2, true); - if (rc) - return rc; + if (ctx->mrav_entry_size) { + mem_size = ctx->mrav_entry_size * ctx_pg->entries; + init = &ctx->mem_init[BNXT_CTX_MEM_INIT_MRAV]; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 2, init); + if (rc) + return rc; + } ena = FUNC_BACKING_STORE_CFG_REQ_ENABLES_MRAV; if (ctx->mrav_num_entries_units) ctx_pg->entries = @@ -7181,10 +7269,12 @@ static int bnxt_alloc_ctx_mem(struct bnxt *bp) ctx_pg = &ctx->tim_mem; ctx_pg->entries = ctx->qp_mem.entries; - mem_size = ctx->tim_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, false); - if (rc) - return rc; + if (ctx->tim_entry_size) { + mem_size = ctx->tim_entry_size * ctx_pg->entries; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, NULL); + if (rc) + return rc; + } ena |= FUNC_BACKING_STORE_CFG_REQ_ENABLES_TIM; skip_rdma: @@ -7198,10 +7288,13 @@ skip_rdma: for (i = 0; i < ctx->tqm_fp_rings_count + 1; i++) { ctx_pg = ctx->tqm_mem[i]; ctx_pg->entries = i ? entries : entries_sp; - mem_size = ctx->tqm_entry_size * ctx_pg->entries; - rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, false); - if (rc) - return rc; + if (ctx->tqm_entry_size) { + mem_size = ctx->tqm_entry_size * ctx_pg->entries; + rc = bnxt_alloc_ctx_pg_tbls(bp, ctx_pg, mem_size, 1, + NULL); + if (rc) + return rc; + } ena |= FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_SP << i; } ena |= FUNC_BACKING_STORE_CFG_REQ_DFLT_ENABLES; @@ -8891,9 +8984,10 @@ void bnxt_tx_disable(struct bnxt *bp) txr->dev_state = BNXT_DEV_STATE_CLOSING; } } + /* Drop carrier first to prevent TX timeout */ + netif_carrier_off(bp->dev); /* Stop all TX queues */ netif_tx_disable(bp->dev); - netif_carrier_off(bp->dev); } void bnxt_tx_enable(struct bnxt *bp) @@ -10915,6 +11009,11 @@ static void bnxt_fw_reset_close(struct bnxt *bp) * kernel memory. 
*/ if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) { + u16 val = 0; + + pci_read_config_word(bp->pdev, PCI_SUBSYSTEM_ID, &val); + if (val == 0xffff) + bp->fw_reset_min_dsecs = 0; bnxt_tx_disable(bp); bnxt_disable_napi(bp); bnxt_disable_int_sync(bp); @@ -11126,6 +11225,17 @@ static void bnxt_init_ethtool_link_settings(struct bnxt *bp) link_info->req_flow_ctrl = link_info->force_pause_setting; } +static void bnxt_fw_echo_reply(struct bnxt *bp) +{ + struct bnxt_fw_health *fw_health = bp->fw_health; + struct hwrm_func_echo_response_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_ECHO_RESPONSE, -1, -1); + req.event_data1 = cpu_to_le32(fw_health->echo_req_data1); + req.event_data2 = cpu_to_le32(fw_health->echo_req_data2); + hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + static void bnxt_sp_task(struct work_struct *work) { struct bnxt *bp = container_of(work, struct bnxt, sp_task); @@ -11193,6 +11303,9 @@ static void bnxt_sp_task(struct work_struct *work) if (test_and_clear_bit(BNXT_RING_COAL_NOW_SP_EVENT, &bp->sp_event)) bnxt_chk_missed_irq(bp); + if (test_and_clear_bit(BNXT_FW_ECHO_REQUEST_SP_EVENT, &bp->sp_event)) + bnxt_fw_echo_reply(bp); + /* These functions below will clear BNXT_STATE_IN_SP_TASK. They * must be the last functions to be called before exiting. */ @@ -11600,6 +11713,20 @@ static void bnxt_fw_reset_task(struct work_struct *work) if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) { u32 val; + if (!bp->fw_reset_min_dsecs) { + u16 val; + + pci_read_config_word(bp->pdev, PCI_SUBSYSTEM_ID, + &val); + if (val == 0xffff) { + if (bnxt_fw_reset_timeout(bp)) { + netdev_err(bp->dev, "Firmware reset aborted, PCI config space invalid\n"); + goto fw_reset_abort; + } + bnxt_queue_fw_reset_work(bp, HZ / 1000); + return; + } + } val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_INPROG_REG); if (val) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 4ef6888acdc6..1259e68cba2a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -18,7 +18,7 @@ */ #define DRV_VER_MAJ 1 #define DRV_VER_MIN 10 -#define DRV_VER_UPD 1 +#define DRV_VER_UPD 2 #include <linux/ethtool.h> #include <linux/interrupt.h> @@ -714,6 +714,13 @@ struct bnxt_sw_rx_agg_bd { dma_addr_t mapping; }; +struct bnxt_mem_init { + u8 init_val; + u16 offset; +#define BNXT_MEM_INVALID_OFFSET 0xffff + u16 size; +}; + struct bnxt_ring_mem_info { int nr_pages; int page_size; @@ -723,7 +730,7 @@ struct bnxt_ring_mem_info { #define BNXT_RMEM_USE_FULL_PAGE_FLAG 4 u16 depth; - u8 init_val; + struct bnxt_mem_init *mem_init; void **pg_arr; dma_addr_t *dma_arr; @@ -1474,7 +1481,6 @@ struct bnxt_ctx_mem_info { u32 tim_max_entries; u16 mrav_num_entries_units; u8 tqm_entries_multiple; - u8 ctx_kind_initializer; u8 tqm_fp_rings_count; u32 flags; @@ -1488,6 +1494,15 @@ struct bnxt_ctx_mem_info { struct bnxt_ctx_pg_info mrav_mem; struct bnxt_ctx_pg_info tim_mem; struct bnxt_ctx_pg_info *tqm_mem[BNXT_MAX_TQM_RINGS]; + +#define BNXT_CTX_MEM_INIT_QP 0 +#define BNXT_CTX_MEM_INIT_SRQ 1 +#define BNXT_CTX_MEM_INIT_CQ 2 +#define BNXT_CTX_MEM_INIT_VNIC 3 +#define BNXT_CTX_MEM_INIT_STAT 4 +#define BNXT_CTX_MEM_INIT_MRAV 5 +#define BNXT_CTX_MEM_INIT_MAX 6 + struct bnxt_mem_init mem_init[BNXT_CTX_MEM_INIT_MAX]; }; struct bnxt_fw_health { @@ -1516,6 +1531,8 @@ struct bnxt_fw_health { u32 fw_reset_seq_regs[16]; u32 fw_reset_seq_vals[16]; u32 fw_reset_seq_delay_msec[16]; + u32 echo_req_data1; + u32 echo_req_data2; struct 
devlink_health_reporter *fw_reporter; struct devlink_health_reporter *fw_reset_reporter; struct devlink_health_reporter *fw_fatal_reporter; @@ -1925,6 +1942,7 @@ struct bnxt { #define BNXT_FW_RESET_NOTIFY_SP_EVENT 18 #define BNXT_FW_EXCEPTION_SP_EVENT 19 #define BNXT_LINK_CFG_CHANGE_SP_EVENT 21 +#define BNXT_FW_ECHO_REQUEST_SP_EVENT 23 struct delayed_work fw_reset_task; int fw_reset_state; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 90a31b4a3020..64381be935a8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -471,8 +471,8 @@ static int bnxt_dl_info_get(struct devlink *dl, struct devlink_info_req *req, if (BNXT_PF(bp) && !bnxt_hwrm_get_nvm_cfg_ver(bp, &nvm_cfg_ver)) { u32 ver = nvm_cfg_ver.vu32; - sprintf(buf, "%X.%X.%X", (ver >> 16) & 0xF, (ver >> 8) & 0xF, - ver & 0xF); + sprintf(buf, "%d.%d.%d", (ver >> 16) & 0xf, (ver >> 8) & 0xf, + ver & 0xf); rc = bnxt_dl_info_put(bp, req, BNXT_VERSION_STORED, DEVLINK_INFO_VERSION_GENERIC_FW_PSID, buf); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h index d5c6e6a3d22d..6199f125bc13 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h @@ -103,6 +103,7 @@ struct hwrm_short_input { struct cmd_nums { __le16 req_type; #define HWRM_VER_GET 0x0UL + #define HWRM_FUNC_ECHO_RESPONSE 0xbUL #define HWRM_ERROR_RECOVERY_QCFG 0xcUL #define HWRM_FUNC_DRV_IF_CHANGE 0xdUL #define HWRM_FUNC_BUF_UNRGTR 0xeUL @@ -501,8 +502,8 @@ struct hwrm_err_output { #define HWRM_VERSION_MAJOR 1 #define HWRM_VERSION_MINOR 10 #define HWRM_VERSION_UPDATE 2 -#define HWRM_VERSION_RSVD 11 -#define HWRM_VERSION_STR "1.10.2.11" +#define HWRM_VERSION_RSVD 16 +#define HWRM_VERSION_STR "1.10.2.16" /* hwrm_ver_get_input (size:192b/24B) */ struct hwrm_ver_get_input { @@ -723,7 +724,8 @@ struct hwrm_async_event_cmpl { #define ASYNC_EVENT_CMPL_EVENT_ID_QUIESCE_DONE 0x3fUL #define ASYNC_EVENT_CMPL_EVENT_ID_DEFERRED_RESPONSE 0x40UL #define ASYNC_EVENT_CMPL_EVENT_ID_PFC_WATCHDOG_CFG_CHANGE 0x41UL - #define ASYNC_EVENT_CMPL_EVENT_ID_MAX_RGTR_EVENT_ID 0x42UL + #define ASYNC_EVENT_CMPL_EVENT_ID_ECHO_REQUEST 0x42UL + #define ASYNC_EVENT_CMPL_EVENT_ID_MAX_RGTR_EVENT_ID 0x43UL #define ASYNC_EVENT_CMPL_EVENT_ID_FW_TRACE_MSG 0xfeUL #define ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR 0xffUL #define ASYNC_EVENT_CMPL_EVENT_ID_LAST ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR @@ -1052,6 +1054,26 @@ struct hwrm_async_event_cmpl_deferred_response { __le32 event_data1; }; +/* hwrm_async_event_cmpl_echo_request (size:128b/16B) */ +struct hwrm_async_event_cmpl_echo_request { + __le16 type; + #define ASYNC_EVENT_CMPL_ECHO_REQUEST_TYPE_MASK 0x3fUL + #define ASYNC_EVENT_CMPL_ECHO_REQUEST_TYPE_SFT 0 + #define ASYNC_EVENT_CMPL_ECHO_REQUEST_TYPE_HWRM_ASYNC_EVENT 0x2eUL + #define ASYNC_EVENT_CMPL_ECHO_REQUEST_TYPE_LAST ASYNC_EVENT_CMPL_ECHO_REQUEST_TYPE_HWRM_ASYNC_EVENT + __le16 event_id; + #define ASYNC_EVENT_CMPL_ECHO_REQUEST_EVENT_ID_ECHO_REQUEST 0x42UL + #define ASYNC_EVENT_CMPL_ECHO_REQUEST_EVENT_ID_LAST ASYNC_EVENT_CMPL_ECHO_REQUEST_EVENT_ID_ECHO_REQUEST + __le32 event_data2; + u8 opaque_v; + #define ASYNC_EVENT_CMPL_ECHO_REQUEST_V 0x1UL + #define ASYNC_EVENT_CMPL_ECHO_REQUEST_OPAQUE_MASK 0xfeUL + #define ASYNC_EVENT_CMPL_ECHO_REQUEST_OPAQUE_SFT 1 + u8 timestamp_lo; + __le16 timestamp_hi; + __le32 event_data1; +}; + /* hwrm_func_reset_input (size:192b/24B) */ struct 
hwrm_func_reset_input { __le16 req_type; @@ -1294,6 +1316,10 @@ struct hwrm_func_qcaps_output { #define FUNC_QCAPS_RESP_FLAGS_EXT_SOC_SPD_SUPPORTED 0x200UL #define FUNC_QCAPS_RESP_FLAGS_EXT_FW_LIVEPATCH_SUPPORTED 0x400UL #define FUNC_QCAPS_RESP_FLAGS_EXT_FAST_RESET_CAPABLE 0x800UL + #define FUNC_QCAPS_RESP_FLAGS_EXT_TX_METADATA_CFG_CAPABLE 0x1000UL + #define FUNC_QCAPS_RESP_FLAGS_EXT_NVM_OPTION_ACTION_SUPPORTED 0x2000UL + #define FUNC_QCAPS_RESP_FLAGS_EXT_BD_METADATA_SUPPORTED 0x4000UL + #define FUNC_QCAPS_RESP_FLAGS_EXT_ECHO_REQUEST_SUPPORTED 0x8000UL u8 max_schqs; u8 mpc_chnls_cap; #define FUNC_QCAPS_RESP_MPC_CHNLS_CAP_TCE 0x1UL @@ -1339,6 +1365,7 @@ struct hwrm_func_qcfg_output { #define FUNC_QCFG_RESP_FLAGS_PPP_PUSH_MODE_ENABLED 0x400UL #define FUNC_QCFG_RESP_FLAGS_RING_MONITOR_ENABLED 0x800UL #define FUNC_QCFG_RESP_FLAGS_FAST_RESET_ALLOWED 0x1000UL + #define FUNC_QCFG_RESP_FLAGS_MULTI_ROOT 0x2000UL u8 mac_address[6]; __le16 pci_id; __le16 alloc_rsscos_ctx; @@ -1474,6 +1501,8 @@ struct hwrm_func_cfg_input { #define FUNC_CFG_REQ_FLAGS_HOT_RESET_IF_EN_DIS 0x4000000UL #define FUNC_CFG_REQ_FLAGS_PPP_PUSH_MODE_ENABLE 0x8000000UL #define FUNC_CFG_REQ_FLAGS_PPP_PUSH_MODE_DISABLE 0x10000000UL + #define FUNC_CFG_REQ_FLAGS_BD_METADATA_ENABLE 0x20000000UL + #define FUNC_CFG_REQ_FLAGS_BD_METADATA_DISABLE 0x40000000UL __le32 enables; #define FUNC_CFG_REQ_ENABLES_MTU 0x1UL #define FUNC_CFG_REQ_ENABLES_MRU 0x2UL @@ -2063,8 +2092,32 @@ struct hwrm_func_backing_store_qcaps_output { u8 tqm_fp_rings_count; u8 stat_init_offset; u8 mrav_init_offset; - u8 rsvd[6]; - u8 valid; + u8 tqm_fp_rings_count_ext; + u8 rsvd[5]; + u8 valid; +}; + +/* tqm_fp_ring_cfg (size:128b/16B) */ +struct tqm_fp_ring_cfg { + u8 tqm_ring_pg_size_tqm_ring_lvl; + #define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_MASK 0xfUL + #define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_SFT 0 + #define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_LVL_0 0x0UL + #define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_LVL_1 0x1UL + #define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_LVL_2 0x2UL + #define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_LAST TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_LVL_2 + #define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_MASK 0xf0UL + #define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_SFT 4 + #define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_4K (0x0UL << 4) + #define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_8K (0x1UL << 4) + #define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_64K (0x2UL << 4) + #define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_2M (0x3UL << 4) + #define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_8M (0x4UL << 4) + #define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_1G (0x5UL << 4) + #define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_LAST TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_1G + u8 unused[3]; + __le32 tqm_ring_num_entries; + __le64 tqm_ring_page_dir; }; /* hwrm_func_backing_store_cfg_input (size:2432b/304B) */ @@ -2560,6 +2613,27 @@ struct hwrm_error_recovery_qcfg_output { u8 valid; }; +/* hwrm_func_echo_response_input (size:192b/24B) */ +struct hwrm_func_echo_response_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 event_data1; + __le32 event_data2; +}; + +/* hwrm_func_echo_response_output (size:128b/16B) */ +struct hwrm_func_echo_response_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + /* hwrm_func_drv_if_change_input 
(size:192b/24B) */ struct hwrm_func_drv_if_change_input { __le16 req_type; @@ -3627,7 +3701,7 @@ struct hwrm_port_phy_qcaps_output { #define PORT_PHY_QCAPS_RESP_FLAGS_CUMULATIVE_COUNTERS_ON_RESET 0x10UL #define PORT_PHY_QCAPS_RESP_FLAGS_LOCAL_LPBK_NOT_SUPPORTED 0x20UL #define PORT_PHY_QCAPS_RESP_FLAGS_FW_MANAGED_LINK_DOWN 0x40UL - #define PORT_PHY_QCAPS_RESP_FLAGS_RSVD1 0x80UL + #define PORT_PHY_QCAPS_RESP_FLAGS_NO_FCS 0x80UL u8 port_cnt; #define PORT_PHY_QCAPS_RESP_PORT_CNT_UNKNOWN 0x0UL #define PORT_PHY_QCAPS_RESP_PORT_CNT_1 0x1UL @@ -5392,6 +5466,7 @@ struct hwrm_vnic_qcaps_output { #define VNIC_QCAPS_RESP_FLAGS_RX_CMPL_V2_CAP 0x200UL #define VNIC_QCAPS_RESP_FLAGS_VNIC_STATE_CAP 0x400UL #define VNIC_QCAPS_RESP_FLAGS_VIRTIO_NET_VNIC_ALLOC_CAP 0x800UL + #define VNIC_QCAPS_RESP_FLAGS_METADATA_FORMAT_CAP 0x1000UL __le16 max_aggs_supported; u8 unused_1[5]; u8 valid; @@ -5725,7 +5800,7 @@ struct hwrm_ring_alloc_output { u8 valid; }; -/* hwrm_ring_free_input (size:192b/24B) */ +/* hwrm_ring_free_input (size:256b/32B) */ struct hwrm_ring_free_input { __le16 req_type; __le16 cmpl_ring; @@ -5740,9 +5815,13 @@ struct hwrm_ring_free_input { #define RING_FREE_REQ_RING_TYPE_RX_AGG 0x4UL #define RING_FREE_REQ_RING_TYPE_NQ 0x5UL #define RING_FREE_REQ_RING_TYPE_LAST RING_FREE_REQ_RING_TYPE_NQ - u8 unused_0; + u8 flags; + #define RING_FREE_REQ_FLAGS_VIRTIO_RING_VALID 0x1UL + #define RING_FREE_REQ_FLAGS_LAST RING_FREE_REQ_FLAGS_VIRTIO_RING_VALID __le16 ring_id; - u8 unused_1[4]; + __le32 prod_idx; + __le32 opaque; + __le32 unused_1; }; /* hwrm_ring_free_output (size:128b/16B) */ @@ -7562,7 +7641,13 @@ struct hwrm_fw_qstatus_output { #define FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTPCIERST 0x2UL #define FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTPOWER 0x3UL #define FW_QSTATUS_RESP_SELFRST_STATUS_LAST FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTPOWER - u8 unused_0[6]; + u8 nvm_option_action_status; + #define FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_NVMOPT_ACTION_NONE 0x0UL + #define FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_NVMOPT_ACTION_HOTRESET 0x1UL + #define FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_NVMOPT_ACTION_WARMBOOT 0x2UL + #define FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_NVMOPT_ACTION_COLDBOOT 0x3UL + #define FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_LAST FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_NVMOPT_ACTION_COLDBOOT + u8 unused_0[5]; u8 valid; }; diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index 17f997ef950f..5335244e4577 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -359,7 +359,7 @@ int bcmgenet_mii_probe(struct net_device *dev) * those versions of GENET. 
*/ if (priv->internal_phy && !GENET_IS_V5(priv)) - dev->phydev->irq = PHY_IGNORE_INTERRUPT; + dev->phydev->irq = PHY_MAC_INTERRUPT; return 0; } diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 8936c2bc6286..d2381929931b 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -1580,12 +1580,6 @@ static int tg3_mdio_init(struct tg3 *tp) PHY_BRCM_RX_REFCLK_UNUSED | PHY_BRCM_DIS_TXCRXC_NOENRGY | PHY_BRCM_AUTO_PWRDWN_ENABLE; - if (tg3_flag(tp, RGMII_INBAND_DISABLE)) - phydev->dev_flags |= PHY_BRCM_STD_IBND_DISABLE; - if (tg3_flag(tp, RGMII_EXT_IBND_RX_EN)) - phydev->dev_flags |= PHY_BRCM_EXT_IBND_RX_ENABLE; - if (tg3_flag(tp, RGMII_EXT_IBND_TX_EN)) - phydev->dev_flags |= PHY_BRCM_EXT_IBND_TX_ENABLE; fallthrough; case PHY_ID_RTL8211C: phydev->interface = PHY_INTERFACE_MODE_RGMII; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h index 1b49f2fa9b18..34546f5312ee 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h @@ -46,6 +46,9 @@ #define MAX_ULD_QSETS 16 #define MAX_ULD_NPORTS 4 +/* ulp_mem_io + ulptx_idata + payload + padding */ +#define MAX_IMM_ULPTX_WR_LEN (32 + 8 + 256 + 8) + /* CPL message priority levels */ enum { CPL_PRIORITY_DATA = 0, /* data messages */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index 550cc065649f..256fae15e032 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -2846,17 +2846,22 @@ int t4_mgmt_tx(struct adapter *adap, struct sk_buff *skb) * @skb: the packet * * Returns true if a packet can be sent as an offload WR with immediate - * data. We currently use the same limit as for Ethernet packets. + * data. + * FW_OFLD_TX_DATA_WR limits the payload to 255 bytes due to 8-bit field. + * However, FW_ULPTX_WR commands have a 256 byte immediate only + * payload limit. 
*/ static inline int is_ofld_imm(const struct sk_buff *skb) { struct work_request_hdr *req = (struct work_request_hdr *)skb->data; unsigned long opcode = FW_WR_OP_G(ntohl(req->wr_hi)); - if (opcode == FW_CRYPTO_LOOKASIDE_WR) + if (unlikely(opcode == FW_ULPTX_WR)) + return skb->len <= MAX_IMM_ULPTX_WR_LEN; + else if (opcode == FW_CRYPTO_LOOKASIDE_WR) return skb->len <= SGE_MAX_WR_LEN; else - return skb->len <= MAX_IMM_TX_PKT_LEN; + return skb->len <= MAX_IMM_OFLD_TX_DATA_WR_LEN; } /** diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h index 47ba81e42f5d..b1161bdeda4d 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h @@ -50,9 +50,6 @@ #define MIN_RCV_WND (24 * 1024U) #define LOOPBACK(x) (((x) & htonl(0xff000000)) == htonl(0x7f000000)) -/* ulp_mem_io + ulptx_idata + payload + padding */ -#define MAX_IMM_ULPTX_WR_LEN (32 + 8 + 256 + 8) - /* for TX: a skb must have a headroom of at least TX_HEADER_LEN bytes */ #define TX_HEADER_LEN \ (sizeof(struct fw_ofld_tx_data_wr) + sizeof(struct sge_opaque_hdr)) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 19f74d4cbb4e..492943bb9c48 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -395,10 +395,20 @@ static u32 dpaa2_eth_run_xdp(struct dpaa2_eth_priv *priv, xdp.frame_sz = DPAA2_ETH_RX_BUF_RAW_SIZE; err = xdp_do_redirect(priv->net_dev, &xdp, xdp_prog); - if (unlikely(err)) + if (unlikely(err)) { + addr = dma_map_page(priv->net_dev->dev.parent, + virt_to_page(vaddr), 0, + priv->rx_buf_size, DMA_BIDIRECTIONAL); + if (unlikely(dma_mapping_error(priv->net_dev->dev.parent, addr))) { + free_pages((unsigned long)vaddr, 0); + } else { + ch->buf_count++; + dpaa2_eth_xdp_release_buf(priv, ch, addr); + } ch->stats.xdp_drop++; - else + } else { ch->stats.xdp_redirect++; + } break; } diff --git a/drivers/net/ethernet/freescale/enetc/Kconfig b/drivers/net/ethernet/freescale/enetc/Kconfig index d99ea0f4e4a6..ab92382c399a 100644 --- a/drivers/net/ethernet/freescale/enetc/Kconfig +++ b/drivers/net/ethernet/freescale/enetc/Kconfig @@ -27,7 +27,7 @@ config FSL_ENETC_VF config FSL_ENETC_MDIO tristate "ENETC MDIO driver" - depends on PCI + depends on PCI && MDIO_DEVRES && MDIO_BUS help This driver supports NXP ENETC Central MDIO controller as a PCIe physical function (PF) device. 
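The cxgb4 is_ofld_imm() change earlier in this hunk series dispatches the immediate-data decision on the work request opcode, because FW_ULPTX_WR tolerates a different immediate payload ceiling (MAX_IMM_ULPTX_WR_LEN, i.e. 32 + 8 + 256 + 8 = 304 bytes) than crypto or plain offload TX data WRs. A self-contained sketch of that dispatch follows; the enum values and the 288-byte offload limit are placeholders chosen for illustration, not the real firmware encodings.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* The FW_ULPTX_WR limit is taken from the hunk above; the other two
 * values are illustrative placeholders.
 */
#define MAX_IMM_ULPTX_WR_LEN		(32 + 8 + 256 + 8)	/* 304 */
#define SGE_MAX_WR_LEN			512
#define MAX_IMM_OFLD_TX_DATA_WR_LEN	288

/* Placeholder opcodes; the real ones are firmware-defined constants */
enum wr_opcode {
	FW_ULPTX_WR,
	FW_CRYPTO_LOOKASIDE_WR,
	FW_OFLD_TX_DATA_WR,
};

/* Mirrors the shape of is_ofld_imm(): a packet may be sent with
 * immediate data only if it fits the limit of its work request type.
 */
static bool is_ofld_imm(enum wr_opcode opcode, size_t len)
{
	switch (opcode) {
	case FW_ULPTX_WR:
		return len <= MAX_IMM_ULPTX_WR_LEN;
	case FW_CRYPTO_LOOKASIDE_WR:
		return len <= SGE_MAX_WR_LEN;
	default:
		return len <= MAX_IMM_OFLD_TX_DATA_WR_LEN;
	}
}

int main(void)
{
	/* A 300-byte payload fits a ULPTX WR but not an offload TX data WR */
	printf("ulptx 300B: %d\n", is_ofld_imm(FW_ULPTX_WR, 300));
	printf("ofld  300B: %d\n", is_ofld_imm(FW_OFLD_TX_DATA_WR, 300));
	return 0;
}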
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index 3eb5f1375bd4..515c5b29d7aa 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -1157,14 +1157,15 @@ static void enetc_pf_remove(struct pci_dev *pdev) struct enetc_ndev_priv *priv; priv = netdev_priv(si->ndev); - enetc_phylink_destroy(priv); - enetc_mdiobus_destroy(pf); if (pf->num_vfs) enetc_sriov_configure(pdev, 0); unregister_netdev(si->ndev); + enetc_phylink_destroy(priv); + enetc_mdiobus_destroy(pf); + enetc_free_msix(priv); enetc_free_si_resources(priv); diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 1774fbaab146..1c0e4beb56e7 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -247,46 +247,23 @@ static void free_long_term_buff(struct ibmvnic_adapter *adapter, if (!ltb->buff) return; + /* VIOS automatically unmaps the long term buffer at remote + * end for the following resets: + * FAILOVER, MOBILITY, TIMEOUT. + */ if (adapter->reset_reason != VNIC_RESET_FAILOVER && - adapter->reset_reason != VNIC_RESET_MOBILITY) + adapter->reset_reason != VNIC_RESET_MOBILITY && + adapter->reset_reason != VNIC_RESET_TIMEOUT) send_request_unmap(adapter, ltb->map_id); dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr); } -static int reset_long_term_buff(struct ibmvnic_adapter *adapter, - struct ibmvnic_long_term_buff *ltb) +static int reset_long_term_buff(struct ibmvnic_long_term_buff *ltb) { - struct device *dev = &adapter->vdev->dev; - int rc; + if (!ltb->buff) + return -EINVAL; memset(ltb->buff, 0, ltb->size); - - mutex_lock(&adapter->fw_lock); - adapter->fw_done_rc = 0; - - reinit_completion(&adapter->fw_done); - rc = send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id); - if (rc) { - mutex_unlock(&adapter->fw_lock); - return rc; - } - - rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000); - if (rc) { - dev_info(dev, - "Reset failed, long term map request timed out or aborted\n"); - mutex_unlock(&adapter->fw_lock); - return rc; - } - - if (adapter->fw_done_rc) { - dev_info(dev, - "Reset failed, attempting to free and reallocate buffer\n"); - free_long_term_buff(adapter, ltb); - mutex_unlock(&adapter->fw_lock); - return alloc_long_term_buff(adapter, ltb, ltb->size); - } - mutex_unlock(&adapter->fw_lock); return 0; } @@ -508,8 +485,7 @@ static int reset_rx_pools(struct ibmvnic_adapter *adapter) rx_pool->size * rx_pool->buff_size); } else { - rc = reset_long_term_buff(adapter, - &rx_pool->long_term_buff); + rc = reset_long_term_buff(&rx_pool->long_term_buff); } if (rc) @@ -632,12 +608,11 @@ static int init_rx_pools(struct net_device *netdev) return 0; } -static int reset_one_tx_pool(struct ibmvnic_adapter *adapter, - struct ibmvnic_tx_pool *tx_pool) +static int reset_one_tx_pool(struct ibmvnic_tx_pool *tx_pool) { int rc, i; - rc = reset_long_term_buff(adapter, &tx_pool->long_term_buff); + rc = reset_long_term_buff(&tx_pool->long_term_buff); if (rc) return rc; @@ -664,10 +639,10 @@ static int reset_tx_pools(struct ibmvnic_adapter *adapter) tx_scrqs = adapter->num_active_tx_pools; for (i = 0; i < tx_scrqs; i++) { - rc = reset_one_tx_pool(adapter, &adapter->tso_pool[i]); + rc = reset_one_tx_pool(&adapter->tso_pool[i]); if (rc) return rc; - rc = reset_one_tx_pool(adapter, &adapter->tx_pool[i]); + rc = reset_one_tx_pool(&adapter->tx_pool[i]); if (rc) return rc; } @@ -1352,10 +1327,8 @@ static int 
__ibmvnic_close(struct net_device *netdev) adapter->state = VNIC_CLOSING; rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN); - if (rc) - return rc; adapter->state = VNIC_CLOSED; - return 0; + return rc; } static int ibmvnic_close(struct net_device *netdev) @@ -1700,6 +1673,9 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) skb_copy_from_linear_data(skb, dst, skb->len); } + /* post changes to long_term_buff *dst before VIOS accessing it */ + dma_wmb(); + tx_pool->consumer_index = (tx_pool->consumer_index + 1) % tx_pool->num_buffers; @@ -2318,6 +2294,8 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, unsigned long flags; int ret; + spin_lock_irqsave(&adapter->rwi_lock, flags); + /* If failover is pending don't schedule any other reset. * Instead let the failover complete. If there is already a * a failover reset scheduled, we will detect and drop the @@ -2338,14 +2316,11 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, goto err; } - spin_lock_irqsave(&adapter->rwi_lock, flags); - list_for_each(entry, &adapter->rwi_list) { tmp = list_entry(entry, struct ibmvnic_rwi, list); if (tmp->reset_reason == reason) { netdev_dbg(netdev, "Skipping matching reset, reason=%d\n", reason); - spin_unlock_irqrestore(&adapter->rwi_lock, flags); ret = EBUSY; goto err; } @@ -2353,8 +2328,6 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, rwi = kzalloc(sizeof(*rwi), GFP_ATOMIC); if (!rwi) { - spin_unlock_irqrestore(&adapter->rwi_lock, flags); - ibmvnic_close(netdev); ret = ENOMEM; goto err; } @@ -2367,12 +2340,17 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, } rwi->reset_reason = reason; list_add_tail(&rwi->list, &adapter->rwi_list); - spin_unlock_irqrestore(&adapter->rwi_lock, flags); netdev_dbg(adapter->netdev, "Scheduling reset (reason %d)\n", reason); schedule_work(&adapter->ibmvnic_reset); - return 0; + ret = 0; err: + /* ibmvnic_close() below can block, so drop the lock first */ + spin_unlock_irqrestore(&adapter->rwi_lock, flags); + + if (ret == ENOMEM) + ibmvnic_close(netdev); + return -ret; } @@ -2463,6 +2441,8 @@ restart_poll: offset = be16_to_cpu(next->rx_comp.off_frame_data); flags = next->rx_comp.flags; skb = rx_buff->skb; + /* load long_term_buff before copying to skb */ + dma_rmb(); skb_copy_to_linear_data(skb, rx_buff->data + offset, length); @@ -3629,7 +3609,7 @@ static int send_subcrq_indirect(struct ibmvnic_adapter *adapter, int rc; /* Make sure the hypervisor sees the complete request */ - mb(); + dma_wmb(); rc = plpar_hcall_norets(H_SEND_SUB_CRQ_INDIRECT, ua, cpu_to_be64(remote_handle), ioba, num_entries); @@ -3659,7 +3639,7 @@ static int ibmvnic_send_crq(struct ibmvnic_adapter *adapter, } /* Make sure the hypervisor sees the complete request */ - mb(); + dma_wmb(); rc = plpar_hcall_norets(H_SEND_CRQ, ua, cpu_to_be64(u64_crq[0]), @@ -5376,7 +5356,18 @@ static int ibmvnic_remove(struct vio_dev *dev) unsigned long flags; spin_lock_irqsave(&adapter->state_lock, flags); + + /* If ibmvnic_reset() is scheduling a reset, wait for it to + * finish. Then, set the state to REMOVING to prevent it from + * scheduling any more work and to have reset functions ignore + * any resets that have already been scheduled. Drop the lock + * after setting state, so __ibmvnic_reset() which is called + * from the flush_work() below, can make progress. 
+ */ + spin_lock_irqsave(&adapter->rwi_lock, flags); adapter->state = VNIC_REMOVING; + spin_unlock_irqrestore(&adapter->rwi_lock, flags); + spin_unlock_irqrestore(&adapter->state_lock, flags); flush_work(&adapter->ibmvnic_reset); diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 270d1cac86a4..806aa75a4e86 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -31,7 +31,7 @@ #define IBMVNIC_BUFFS_PER_POOL 100 #define IBMVNIC_MAX_QUEUES 16 #define IBMVNIC_MAX_QUEUE_SZ 4096 -#define IBMVNIC_MAX_IND_DESCS 128 +#define IBMVNIC_MAX_IND_DESCS 16 #define IBMVNIC_IND_ARR_SZ (IBMVNIC_MAX_IND_DESCS * 32) #define IBMVNIC_TSO_BUF_SZ 65536 @@ -1081,11 +1081,15 @@ struct ibmvnic_adapter { struct tasklet_struct tasklet; enum vnic_state state; - /* Used for serializatin of state field */ + /* Used for serialization of state field. When taking both state + * and rwi locks, take state lock first. + */ spinlock_t state_lock; enum ibmvnic_reset_reason reset_reason; struct list_head rwi_list; - /* Used for serialization of rwi_list */ + /* Used for serialization of rwi_list. When taking both state + * and rwi locks, take state lock first + */ spinlock_t rwi_lock; struct work_struct ibmvnic_reset; struct delayed_work ibmvnic_delayed_reset; diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.c b/drivers/net/ethernet/intel/i40e/i40e_dcb.c index 7b73a279d46e..243b0d2b7b72 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_dcb.c +++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.c @@ -1636,7 +1636,7 @@ void i40e_dcb_hw_calculate_pool_sizes(struct i40e_hw *hw, u32 total_pool_size = 0; int shared_pool_size; /* Need signed variable */ u32 port_pb_size; - u32 mfs_max; + u32 mfs_max = 0; u32 pcirtt; u8 i; diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 8a4dd77a12da..a8a2b5f683a2 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -4250,7 +4250,7 @@ static int i40e_check_fdir_input_set(struct i40e_vsi *vsi, (struct in6_addr *)&ipv6_full_mask)) new_mask |= I40E_L3_V6_DST_MASK; else if (ipv6_addr_any((struct in6_addr *) - &tcp_ip6_spec->ip6src)) + &tcp_ip6_spec->ip6dst)) new_mask &= ~I40E_L3_V6_DST_MASK; else return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 6290bfb6494e..a635cf84608a 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -102,6 +102,8 @@ #define MVNETA_TX_NO_DATA_SWAP BIT(5) #define MVNETA_DESC_SWAP BIT(6) #define MVNETA_TX_BRST_SZ_MASK(burst) ((burst) << 22) +#define MVNETA_VLAN_PRIO_TO_RXQ 0x2440 +#define MVNETA_VLAN_PRIO_RXQ_MAP(prio, rxq) ((rxq) << ((prio) * 3)) #define MVNETA_PORT_STATUS 0x2444 #define MVNETA_TX_IN_PRGRS BIT(1) #define MVNETA_TX_FIFO_EMPTY BIT(8) @@ -490,6 +492,7 @@ struct mvneta_port { u8 mcast_count[256]; u16 tx_ring_size; u16 rx_ring_size; + u8 prio_tc_map[8]; phy_interface_t phy_interface; struct device_node *dn; @@ -3428,7 +3431,9 @@ static int mvneta_txq_sw_init(struct mvneta_port *pp, return -ENOMEM; /* Setup XPS mapping */ - if (txq_number > 1) + if (pp->neta_armada3700) + cpu = 0; + else if (txq_number > 1) cpu = txq->id % num_present_cpus(); else cpu = pp->rxq_def % num_present_cpus(); @@ -4206,6 +4211,11 @@ static int mvneta_cpu_online(unsigned int cpu, struct hlist_node *node) node_online); struct mvneta_pcpu_port *port = per_cpu_ptr(pp->ports, cpu); + /* Armada 
3700's per-cpu interrupt for mvneta is broken, all interrupts + * are routed to CPU 0, so we don't need all the cpu-hotplug support + */ + if (pp->neta_armada3700) + return 0; spin_lock(&pp->lock); /* @@ -4915,6 +4925,63 @@ static int mvneta_ethtool_set_eee(struct net_device *dev, return phylink_ethtool_set_eee(pp->phylink, eee); } +static void mvneta_clear_rx_prio_map(struct mvneta_port *pp) +{ + mvreg_write(pp, MVNETA_VLAN_PRIO_TO_RXQ, 0); +} + +static void mvneta_setup_rx_prio_map(struct mvneta_port *pp) +{ + u32 val = 0; + int i; + + for (i = 0; i < rxq_number; i++) + val |= MVNETA_VLAN_PRIO_RXQ_MAP(i, pp->prio_tc_map[i]); + + mvreg_write(pp, MVNETA_VLAN_PRIO_TO_RXQ, val); +} + +static int mvneta_setup_mqprio(struct net_device *dev, + struct tc_mqprio_qopt *qopt) +{ + struct mvneta_port *pp = netdev_priv(dev); + u8 num_tc; + int i; + + qopt->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + num_tc = qopt->num_tc; + + if (num_tc > rxq_number) + return -EINVAL; + + if (!num_tc) { + mvneta_clear_rx_prio_map(pp); + netdev_reset_tc(dev); + return 0; + } + + memcpy(pp->prio_tc_map, qopt->prio_tc_map, sizeof(pp->prio_tc_map)); + + mvneta_setup_rx_prio_map(pp); + + netdev_set_num_tc(dev, qopt->num_tc); + for (i = 0; i < qopt->num_tc; i++) + netdev_set_tc_queue(dev, i, qopt->count[i], qopt->offset[i]); + + return 0; +} + +static int mvneta_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + switch (type) { + case TC_SETUP_QDISC_MQPRIO: + return mvneta_setup_mqprio(dev, type_data); + default: + return -EOPNOTSUPP; + } +} + static const struct net_device_ops mvneta_netdev_ops = { .ndo_open = mvneta_open, .ndo_stop = mvneta_stop, @@ -4927,6 +4994,7 @@ static const struct net_device_ops mvneta_netdev_ops = { .ndo_do_ioctl = mvneta_ioctl, .ndo_bpf = mvneta_xdp, .ndo_xdp_xmit = mvneta_xdp_xmit, + .ndo_setup_tc = mvneta_setup_tc, }; static const struct ethtool_ops mvneta_eth_tool_ops = { diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h index da8715297a9a..8edba5ea90f0 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h @@ -673,9 +673,9 @@ #define GENCONF_PORT_CTRL1_EN(p) BIT(p) #define GENCONF_PORT_CTRL1_RESET(p) (BIT(p) << 28) #define GENCONF_CTRL0 0x1120 -#define GENCONF_CTRL0_PORT0_RGMII BIT(0) -#define GENCONF_CTRL0_PORT1_RGMII_MII BIT(1) -#define GENCONF_CTRL0_PORT1_RGMII BIT(2) +#define GENCONF_CTRL0_PORT2_RGMII BIT(0) +#define GENCONF_CTRL0_PORT3_RGMII_MII BIT(1) +#define GENCONF_CTRL0_PORT3_RGMII BIT(2) /* Various constants */ @@ -763,9 +763,9 @@ #define MVPP2_RX_FIFO_PORT_MIN_PKT 0x80 /* TX FIFO constants */ -#define MVPP22_TX_FIFO_DATA_SIZE_16KB 16 +#define MVPP22_TX_FIFO_DATA_SIZE_18KB 18 #define MVPP22_TX_FIFO_DATA_SIZE_10KB 10 -#define MVPP22_TX_FIFO_DATA_SIZE_3KB 3 +#define MVPP22_TX_FIFO_DATA_SIZE_1KB 1 #define MVPP2_TX_FIFO_THRESHOLD_MIN 256 /* Bytes */ #define MVPP2_TX_FIFO_THRESHOLD(kb) \ ((kb) * 1024 - MVPP2_TX_FIFO_THRESHOLD_MIN) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index e88272f8b600..0507369bb54d 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -329,7 +329,7 @@ static int mvpp2_get_nrxqs(struct mvpp2 *priv) { unsigned int nrxqs; - if (priv->hw_version != MVPP21 && queue_mode == MVPP2_QDIST_SINGLE_MODE) + if (priv->hw_version >= MVPP22 && queue_mode == MVPP2_QDIST_SINGLE_MODE) return 1; /* According to the PPv2.2 datasheet 
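mvneta_setup_rx_prio_map() above packs all eight VLAN priorities into one 32-bit register, three bits of rx queue number per priority. A standalone illustration of the encoding; prio_to_rxq_reg() and the sample mapping are invented for the example::

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  /* MVNETA_VLAN_PRIO_RXQ_MAP(prio, rxq) == rxq << (prio * 3): queue
   * numbers are 3 bits wide, so 8 priorities fit in the low 24 bits.
   */
  static uint32_t prio_to_rxq_reg(const uint8_t prio_to_queue[8])
  {
  	uint32_t val = 0;
  	int prio;

  	for (prio = 0; prio < 8; prio++)
  		val |= (uint32_t)(prio_to_queue[prio] & 0x7) << (prio * 3);

  	return val;
  }

  int main(void)
  {
  	/* priorities 0-3 on queue 0, priorities 4-7 on queue 1 */
  	const uint8_t map[8] = { 0, 0, 0, 0, 1, 1, 1, 1 };

  	printf("reg = 0x%08" PRIx32 "\n", prio_to_rxq_reg(map));
  	return 0;
  }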
and our experiments on @@ -469,7 +469,7 @@ static void mvpp2_bm_bufs_get_addrs(struct device *dev, struct mvpp2 *priv, MVPP2_BM_PHY_ALLOC_REG(bm_pool->id)); *phys_addr = mvpp2_thread_read(priv, thread, MVPP2_BM_VIRT_ALLOC_REG); - if (priv->hw_version != MVPP21) { + if (priv->hw_version >= MVPP22) { u32 val; u32 dma_addr_highbits, phys_addr_highbits; @@ -924,6 +924,25 @@ static void mvpp2_bm_pool_update_fc(struct mvpp2_port *port, spin_unlock_irqrestore(&port->priv->mss_spinlock, flags); } +/* disable/enable flow control for BM pool on all ports */ +static void mvpp2_bm_pool_update_priv_fc(struct mvpp2 *priv, bool en) +{ + struct mvpp2_port *port; + int i; + + for (i = 0; i < priv->port_count; i++) { + port = priv->port_list[i]; + if (port->priv->percpu_pools) { + for (i = 0; i < port->nrxqs; i++) + mvpp2_bm_pool_update_fc(port, &port->priv->bm_pools[i], + port->tx_fc & en); + } else { + mvpp2_bm_pool_update_fc(port, port->pool_long, port->tx_fc & en); + mvpp2_bm_pool_update_fc(port, port->pool_short, port->tx_fc & en); + } + } +} + static int mvpp2_enable_global_fc(struct mvpp2 *priv) { int val, timeout = 0; @@ -963,7 +982,7 @@ static inline void mvpp2_bm_pool_put(struct mvpp2_port *port, int pool, if (test_bit(thread, &port->priv->lock_map)) spin_lock_irqsave(&port->bm_lock[thread], flags); - if (port->priv->hw_version != MVPP21) { + if (port->priv->hw_version >= MVPP22) { u32 val = 0; if (sizeof(dma_addr_t) == 8) @@ -1462,7 +1481,7 @@ static bool mvpp2_port_supports_xlg(struct mvpp2_port *port) static bool mvpp2_port_supports_rgmii(struct mvpp2_port *port) { - return !(port->priv->hw_version != MVPP21 && port->gop_id == 0); + return !(port->priv->hw_version >= MVPP22 && port->gop_id == 0); } /* Port configuration routines */ @@ -1494,9 +1513,9 @@ static void mvpp22_gop_init_rgmii(struct mvpp2_port *port) regmap_read(priv->sysctrl_base, GENCONF_CTRL0, &val); if (port->gop_id == 2) - val |= GENCONF_CTRL0_PORT0_RGMII; + val |= GENCONF_CTRL0_PORT2_RGMII; else if (port->gop_id == 3) - val |= GENCONF_CTRL0_PORT1_RGMII_MII; + val |= GENCONF_CTRL0_PORT3_RGMII_MII; regmap_write(priv->sysctrl_base, GENCONF_CTRL0, val); } @@ -1513,9 +1532,9 @@ static void mvpp22_gop_init_sgmii(struct mvpp2_port *port) if (port->gop_id > 1) { regmap_read(priv->sysctrl_base, GENCONF_CTRL0, &val); if (port->gop_id == 2) - val &= ~GENCONF_CTRL0_PORT0_RGMII; + val &= ~GENCONF_CTRL0_PORT2_RGMII; else if (port->gop_id == 3) - val &= ~GENCONF_CTRL0_PORT1_RGMII_MII; + val &= ~GENCONF_CTRL0_PORT3_RGMII_MII; regmap_write(priv->sysctrl_base, GENCONF_CTRL0, val); } } @@ -2125,7 +2144,7 @@ static void mvpp2_mac_reset_assert(struct mvpp2_port *port) MVPP2_GMAC_PORT_RESET_MASK; writel(val, port->base + MVPP2_GMAC_CTRL_2_REG); - if (port->priv->hw_version != MVPP21 && port->gop_id == 0) { + if (port->priv->hw_version >= MVPP22 && port->gop_id == 0) { val = readl(port->base + MVPP22_XLG_CTRL0_REG) & ~MVPP22_XLG_CTRL0_MAC_RESET_DIS; writel(val, port->base + MVPP22_XLG_CTRL0_REG); @@ -4016,7 +4035,7 @@ static void mvpp2_txdesc_clear_ptp(struct mvpp2_port *port, struct mvpp2_tx_desc *desc) { /* We only need to clear the low bits */ - if (port->priv->hw_version != MVPP21) + if (port->priv->hw_version >= MVPP22) desc->pp22.ptp_descriptor &= cpu_to_le32(~MVPP22_PTP_DESC_MASK_LOW); } @@ -4528,7 +4547,7 @@ static void mvpp2_start_dev(struct mvpp2_port *port) /* Enable interrupts on all threads */ mvpp2_interrupts_enable(port); - if (port->priv->hw_version != MVPP21) + if (port->priv->hw_version >= MVPP22) mvpp22_mode_reconfigure(port); if 
(port->phylink) { @@ -4746,7 +4765,7 @@ static int mvpp2_open(struct net_device *dev) valid = true; } - if (priv->hw_version != MVPP21 && port->port_irq) { + if (priv->hw_version >= MVPP22 && port->port_irq) { err = request_irq(port->port_irq, mvpp2_port_isr, 0, dev->name, port); if (err) { @@ -4913,6 +4932,7 @@ static int mvpp2_set_mac_address(struct net_device *dev, void *p) */ static int mvpp2_bm_switch_buffers(struct mvpp2 *priv, bool percpu) { + bool change_percpu = (percpu != priv->percpu_pools); int numbufs = MVPP2_BM_POOLS_NUM, i; struct mvpp2_port *port = NULL; bool status[MVPP2_MAX_PORTS]; @@ -4928,6 +4948,9 @@ static int mvpp2_bm_switch_buffers(struct mvpp2 *priv, bool percpu) if (priv->percpu_pools) numbufs = port->nrxqs * 2; + if (change_percpu) + mvpp2_bm_pool_update_priv_fc(priv, false); + for (i = 0; i < numbufs; i++) mvpp2_bm_pool_destroy(port->dev->dev.parent, priv, &priv->bm_pools[i]); @@ -4942,6 +4965,9 @@ static int mvpp2_bm_switch_buffers(struct mvpp2 *priv, bool percpu) mvpp2_open(port->dev); } + if (change_percpu) + mvpp2_bm_pool_update_priv_fc(priv, true); + return 0; } @@ -6399,7 +6425,7 @@ static int mvpp2__mac_prepare(struct phylink_config *config, unsigned int mode, MVPP2_GMAC_PORT_RESET_MASK, MVPP2_GMAC_PORT_RESET_MASK); - if (port->priv->hw_version != MVPP21) { + if (port->priv->hw_version >= MVPP22) { mvpp22_gop_mask_irq(port); phy_power_off(port->comphy); @@ -6453,7 +6479,7 @@ static int mvpp2_mac_finish(struct phylink_config *config, unsigned int mode, { struct mvpp2_port *port = mvpp2_phylink_to_port(config); - if (port->priv->hw_version != MVPP21 && + if (port->priv->hw_version >= MVPP22 && port->phy_interface != interface) { port->phy_interface = interface; @@ -7093,8 +7119,8 @@ static void mvpp22_tx_fifo_set_hw(struct mvpp2 *priv, int port, int size) } /* Initialize TX FIFO's: the total FIFO size is 19kB on PPv2.2 and PPv2.3. - * 3kB fixed space must be assigned for the loopback port. - * Redistribute remaining avialable 16kB space among all active ports. + * 1kB fixed space must be assigned for the loopback port. + * Redistribute remaining available 18kB space among all active ports. * The 10G interface should use 10kB (which is maximum possible size * per single port). */ @@ -7105,9 +7131,9 @@ static void mvpp22_tx_fifo_init(struct mvpp2 *priv) int size_remainder; int port, size; - /* The loopback requires fixed 3kB of the FIFO space assignment. */ + /* The loopback requires fixed 1kB of the FIFO space assignment. */ mvpp22_tx_fifo_set_hw(priv, MVPP2_LOOPBACK_PORT_INDEX, - MVPP22_TX_FIFO_DATA_SIZE_3KB); + MVPP22_TX_FIFO_DATA_SIZE_1KB); port_map = priv->port_map & ~BIT(MVPP2_LOOPBACK_PORT_INDEX); /* Set TX FIFO size to 0 for inactive ports. */ @@ -7115,7 +7141,7 @@ mvpp22_tx_fifo_set_hw(priv, port, 0); /* Assign remaining TX FIFO space among all active ports.
*/ - size_remainder = MVPP22_TX_FIFO_DATA_SIZE_16KB; + size_remainder = MVPP22_TX_FIFO_DATA_SIZE_18KB; remaining_ports_count = hweight_long(port_map); for_each_set_bit(port, &port_map, MVPP2_LOOPBACK_PORT_INDEX) { @@ -7200,7 +7226,7 @@ static int mvpp2_init(struct platform_device *pdev, struct mvpp2 *priv) if (dram_target_info) mvpp2_conf_mbus_windows(dram_target_info, priv); - if (priv->hw_version != MVPP21) + if (priv->hw_version >= MVPP22) mvpp2_axi_init(priv); /* Disable HW PHY polling */ @@ -7277,10 +7303,8 @@ static int mvpp2_get_sram(struct platform_device *pdev, } priv->cm3_base = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(priv->cm3_base)) - return PTR_ERR(priv->cm3_base); - return 0; + return PTR_ERR_OR_ZERO(priv->cm3_base); } static int mvpp2_probe(struct platform_device *pdev) @@ -7350,7 +7374,7 @@ static int mvpp2_probe(struct platform_device *pdev) priv->global_tx_fc = true; } - if (priv->hw_version != MVPP21 && dev_of_node(&pdev->dev)) { + if (priv->hw_version >= MVPP22 && dev_of_node(&pdev->dev)) { priv->sysctrl_base = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, "marvell,system-controller"); @@ -7363,7 +7387,7 @@ static int mvpp2_probe(struct platform_device *pdev) priv->sysctrl_base = NULL; } - if (priv->hw_version != MVPP21 && + if (priv->hw_version >= MVPP22 && mvpp2_get_nrxqs(priv) * 2 <= MVPP2_BM_MAX_POOLS) priv->percpu_pools = 1; @@ -7408,7 +7432,7 @@ static int mvpp2_probe(struct platform_device *pdev) if (err < 0) goto err_pp_clk; - if (priv->hw_version != MVPP21) { + if (priv->hw_version >= MVPP22) { priv->mg_clk = devm_clk_get(&pdev->dev, "mg_clk"); if (IS_ERR(priv->mg_clk)) { err = PTR_ERR(priv->mg_clk); @@ -7449,7 +7473,7 @@ static int mvpp2_probe(struct platform_device *pdev) return -EINVAL; } - if (priv->hw_version != MVPP21) { + if (priv->hw_version >= MVPP22) { err = dma_set_mask(&pdev->dev, MVPP2_DESC_DMA_MASK); if (err) goto err_axi_clk; @@ -7469,10 +7493,8 @@ static int mvpp2_probe(struct platform_device *pdev) priv->port_map |= BIT(i); } - if (priv->hw_version != MVPP21) { - if (mvpp2_read(priv, MVPP2_VER_ID_REG) == MVPP2_VER_PP23) - priv->hw_version = MVPP23; - } + if (mvpp2_read(priv, MVPP2_VER_ID_REG) == MVPP2_VER_PP23) + priv->hw_version = MVPP23; /* Init mss lock */ spin_lock_init(&priv->mss_spinlock); @@ -7516,7 +7538,7 @@ static int mvpp2_probe(struct platform_device *pdev) goto err_port_probe; } - if (priv->global_tx_fc && priv->hw_version != MVPP21) { + if (priv->global_tx_fc && priv->hw_version >= MVPP22) { err = mvpp2_enable_global_fc(priv); if (err) dev_warn(&pdev->dev, "Minimum of CM3 firmware 18.09 and chip revision B0 required for flow control\n"); @@ -7538,10 +7560,10 @@ err_axi_clk: clk_disable_unprepare(priv->axi_clk); err_mg_core_clk: - if (priv->hw_version != MVPP21) + if (priv->hw_version >= MVPP22) clk_disable_unprepare(priv->mg_core_clk); err_mg_clk: - if (priv->hw_version != MVPP21) + if (priv->hw_version >= MVPP22) clk_disable_unprepare(priv->mg_clk); err_gop_clk: clk_disable_unprepare(priv->gop_clk); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c index 3a1809c28e83..e668e482383a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c @@ -722,12 +722,14 @@ u32 rvu_cgx_get_fifolen(struct rvu *rvu) static int rvu_cgx_config_intlbk(struct rvu *rvu, u16 pcifunc, bool en) { + int pf = rvu_get_pf(pcifunc); struct mac_ops *mac_ops; u8 cgx_id, lmac_id; if (!is_cgx_config_permitted(rvu, 
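mvpp22_tx_fifo_init() above hands the loopback port its fixed 1kB and then spreads the remaining 18kB across the active ports, recomputing the even share as each port is served and never exceeding the 10kB a single (10G) port may use. The division logic reduced to plain C; distribute_tx_fifo() and its constants are illustrative only::

  #include <stdio.h>

  #define FIFO_TOTAL_KB     18	/* left after the 1kB loopback slice */
  #define FIFO_MAX_PER_PORT 10	/* cap for a single 10G port */

  static void distribute_tx_fifo(const int active[], int nports)
  {
  	int remainder = FIFO_TOTAL_KB;
  	int remaining = 0;
  	int i, size;

  	for (i = 0; i < nports; i++)
  		remaining += active[i];

  	for (i = 0; i < nports && remaining; i++) {
  		if (!active[i])
  			continue;
  		/* even share of what is left, so the last active port
  		 * absorbs any rounding remainder
  		 */
  		size = remainder / remaining;
  		if (size > FIFO_MAX_PER_PORT)
  			size = FIFO_MAX_PER_PORT;
  		remainder -= size;
  		remaining--;
  		printf("port %d: %d kB\n", i, size);
  	}
  }

  int main(void)
  {
  	const int active[3] = { 1, 0, 1 };	/* ports 0 and 2 in use */

  	distribute_tx_fifo(active, 3);
  	return 0;
  }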
pcifunc)) return -EPERM; + rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id); mac_ops = get_mac_ops(rvu_cgx_pdata(cgx_id, rvu)); return mac_ops->mac_lmac_intl_lbk(rvu_cgx_pdata(cgx_id, rvu), diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c index 48a84c65804c..094124b695dc 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c @@ -2432,7 +2432,7 @@ void rvu_dbg_init(struct rvu *rvu) debugfs_create_file("rvu_pf_cgx_map", 0444, rvu->rvu_dbg.root, rvu, &rvu_dbg_rvu_pf_cgx_map_fops); else - debugfs_create_file("rvu_pf_cgx_map", 0444, rvu->rvu_dbg.root, + debugfs_create_file("rvu_pf_rpm_map", 0444, rvu->rvu_dbg.root, rvu, &rvu_dbg_rvu_pf_cgx_map_fops); create: diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index 4c472646a0ac..f14d388efb51 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -407,6 +407,9 @@ static inline void otx2_setup_dev_hw_settings(struct otx2_nic *pfvf) pfvf->hw.rq_skid = 600; pfvf->qset.rqe_cnt = Q_COUNT(Q_SIZE_1K); } + if (is_96xx_B0(pfvf->pdev)) + __clear_bit(HW_TSO, &hw->cap_flag); + if (!is_dev_otx2(pfvf->pdev)) { __set_bit(CN10K_MBOX, &hw->cap_flag); __set_bit(CN10K_LMTST, &hw->cap_flag); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index 3f778fc054b5..22ec03a618b1 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -816,22 +816,23 @@ static bool is_hw_tso_supported(struct otx2_nic *pfvf, { int payload_len, last_seg_size; + if (test_bit(HW_TSO, &pfvf->hw.cap_flag)) + return true; + + /* On 96xx A0, HW TSO not supported */ + if (!is_96xx_B0(pfvf->pdev)) + return false; /* HW has an issue due to which when the payload of the last LSO * segment is shorter than 16 bytes, some header fields may not * be correctly modified, hence don't offload such TSO segments. 
*/ - if (!is_96xx_B0(pfvf->pdev)) - return true; payload_len = skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb)); last_seg_size = payload_len % skb_shinfo(skb)->gso_size; if (last_seg_size && last_seg_size < 16) return false; - if (!test_bit(HW_TSO, &pfvf->hw.cap_flag)) - return false; - return true; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index aa76a6e0dae8..d7d8a68ef23d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -141,6 +141,11 @@ static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change, return -EOPNOTSUPP; } + if (mlx5_lag_is_active(dev)) { + NL_SET_ERR_MSG_MOD(extack, "reload is unsupported in Lag mode\n"); + return -EOPNOTSUPP; + } + switch (action) { case DEVLINK_RELOAD_ACTION_DRIVER_REINIT: mlx5_unload_one(dev, false); @@ -428,6 +433,10 @@ static int mlx5_devlink_enable_roce_validate(struct devlink *devlink, u32 id, NL_SET_ERR_MSG_MOD(extack, "Device doesn't support RoCE"); return -EOPNOTSUPP; } + if (mlx5_core_is_mp_slave(dev) || mlx5_lag_is_active(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Multi port slave/Lag device can't configure RoCE"); + return -EOPNOTSUPP; + } return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index a8e31cdd4a4e..7435fe6829b6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -57,6 +57,7 @@ #include "en/fs.h" #include "en/qos.h" #include "lib/hv_vhca.h" +#include "lib/clock.h" extern const struct net_device_ops mlx5e_netdev_ops; struct page_pool; @@ -393,6 +394,7 @@ struct mlx5e_txqsq { u32 rate_limit; struct work_struct recover_work; struct mlx5e_ptpsq *ptpsq; + cqe_ts_to_ns ptp_cyc2time; } ____cacheline_aligned_in_smp; struct mlx5e_dma_info { @@ -655,6 +657,7 @@ struct mlx5e_rq { /* XDP read-mostly */ struct xdp_rxq_info xdp_rxq; + cqe_ts_to_ns ptp_cyc2time; } ____cacheline_aligned_in_smp; enum mlx5e_channel_state { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index a76cfefec708..d57b6f06382f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -3,7 +3,6 @@ #include "en/ptp.h" #include "en/txrx.h" -#include "lib/clock.h" struct mlx5e_skb_cb_hwtstamp { ktime_t cqe_hwtstamp; @@ -70,6 +69,7 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, int budget) { struct sk_buff *skb = mlx5e_skb_fifo_pop(&ptpsq->skb_fifo); + struct mlx5e_txqsq *sq = &ptpsq->txqsq; ktime_t hwtstamp; if (unlikely(MLX5E_RX_ERR_CQE(cqe))) { @@ -77,7 +77,7 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, goto out; } - hwtstamp = mlx5_timecounter_cyc2time(ptpsq->txqsq.clock, get_cqe_ts(cqe)); + hwtstamp = mlx5e_cqe_ts_to_ns(sq->ptp_cyc2time, sq->clock, get_cqe_ts(cqe)); mlx5e_skb_cb_hwtstamp_handler(skb, MLX5E_SKB_CB_PORT_HWTSTAMP, hwtstamp, ptpsq->cq_stats); ptpsq->cq_stats->cqe++; @@ -183,6 +183,9 @@ static int mlx5e_ptp_alloc_txqsq(struct mlx5e_port_ptp *c, int txq_ix, if (!MLX5_CAP_ETH(mdev, wqe_vlan_insert)) set_bit(MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE, &sq->state); sq->stop_room = param->stop_room; + sq->ptp_cyc2time = mlx5_is_real_time_sq(mdev) ? 
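The en/ptp.c hunk above selects the CQE-timestamp conversion routine once, when the SQ is created, instead of testing mlx5_is_real_time_sq() on every completion. The shape of that pattern, with toy conversions standing in for mlx5_real_time_cyc2time()/mlx5_timecounter_cyc2time()::

  #include <stdint.h>

  typedef uint64_t (*ts_to_ns_fn)(uint64_t raw);

  /* Toy stand-ins: a free-running counter needs cycles-to-ns math,
   * a real-time counter already holds nanoseconds.
   */
  static uint64_t free_running_to_ns(uint64_t cycles) { return cycles * 3; }
  static uint64_t real_time_to_ns(uint64_t ts) { return ts; }

  struct example_sq {
  	ts_to_ns_fn ts_to_ns;	/* fixed for the queue's lifetime */
  };

  static void example_sq_init(struct example_sq *sq, int real_time_mode)
  {
  	/* one branch at create time instead of one per completion */
  	sq->ts_to_ns = real_time_mode ? real_time_to_ns : free_running_to_ns;
  }

  int main(void)
  {
  	struct example_sq sq;

  	example_sq_init(&sq, 1);
  	return (int)sq.ts_to_ns(0);
  }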
+ mlx5_real_time_cyc2time : + mlx5_timecounter_cyc2time; node = dev_to_node(mlx5_core_dma_dev(mdev)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index 0b503ebe59ec..f3f6eb081948 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -12,6 +12,7 @@ #include <net/flow_offload.h> #include <net/netfilter/nf_flow_table.h> #include <linux/workqueue.h> +#include <linux/refcount.h> #include <linux/xarray.h> #include "lib/fs_chains.h" @@ -52,11 +53,11 @@ struct mlx5_tc_ct_priv { struct mlx5_flow_table *ct_nat; struct mlx5_flow_table *post_ct; struct mutex control_lock; /* guards parallel adds/dels */ - struct mutex shared_counter_lock; struct mapping_ctx *zone_mapping; struct mapping_ctx *labels_mapping; enum mlx5_flow_namespace_type ns_type; struct mlx5_fs_chains *chains; + spinlock_t ht_lock; /* protects ft entries */ }; struct mlx5_ct_flow { @@ -125,6 +126,10 @@ struct mlx5_ct_counter { bool is_shared; }; +enum { + MLX5_CT_ENTRY_FLAG_VALID, +}; + struct mlx5_ct_entry { struct rhash_head node; struct rhash_head tuple_node; @@ -135,6 +140,12 @@ struct mlx5_ct_entry { struct mlx5_ct_tuple tuple; struct mlx5_ct_tuple tuple_nat; struct mlx5_ct_zone_rule zone_rules[2]; + + struct mlx5_tc_ct_priv *ct_priv; + struct work_struct work; + + refcount_t refcnt; + unsigned long flags; }; static const struct rhashtable_params cts_ht_params = { @@ -742,6 +753,87 @@ err_attr: return err; } +static bool +mlx5_tc_ct_entry_valid(struct mlx5_ct_entry *entry) +{ + return test_bit(MLX5_CT_ENTRY_FLAG_VALID, &entry->flags); +} + +static struct mlx5_ct_entry * +mlx5_tc_ct_entry_get(struct mlx5_tc_ct_priv *ct_priv, struct mlx5_ct_tuple *tuple) +{ + struct mlx5_ct_entry *entry; + + entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_ht, tuple, + tuples_ht_params); + if (entry && mlx5_tc_ct_entry_valid(entry) && + refcount_inc_not_zero(&entry->refcnt)) { + return entry; + } else if (!entry) { + entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_nat_ht, + tuple, tuples_nat_ht_params); + if (entry && mlx5_tc_ct_entry_valid(entry) && + refcount_inc_not_zero(&entry->refcnt)) + return entry; + } + + return entry ? 
ERR_PTR(-EINVAL) : NULL; +} + +static void mlx5_tc_ct_entry_remove_from_tuples(struct mlx5_ct_entry *entry) +{ + struct mlx5_tc_ct_priv *ct_priv = entry->ct_priv; + + rhashtable_remove_fast(&ct_priv->ct_tuples_nat_ht, + &entry->tuple_nat_node, + tuples_nat_ht_params); + rhashtable_remove_fast(&ct_priv->ct_tuples_ht, &entry->tuple_node, + tuples_ht_params); +} + +static void mlx5_tc_ct_entry_del(struct mlx5_ct_entry *entry) +{ + struct mlx5_tc_ct_priv *ct_priv = entry->ct_priv; + + mlx5_tc_ct_entry_del_rules(ct_priv, entry); + + spin_lock_bh(&ct_priv->ht_lock); + mlx5_tc_ct_entry_remove_from_tuples(entry); + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_tc_ct_counter_put(ct_priv, entry); + kfree(entry); +} + +static void +mlx5_tc_ct_entry_put(struct mlx5_ct_entry *entry) +{ + if (!refcount_dec_and_test(&entry->refcnt)) + return; + + mlx5_tc_ct_entry_del(entry); +} + +static void mlx5_tc_ct_entry_del_work(struct work_struct *work) +{ + struct mlx5_ct_entry *entry = container_of(work, struct mlx5_ct_entry, work); + + mlx5_tc_ct_entry_del(entry); +} + +static void +__mlx5_tc_ct_entry_put(struct mlx5_ct_entry *entry) +{ + struct mlx5e_priv *priv; + + if (!refcount_dec_and_test(&entry->refcnt)) + return; + + priv = netdev_priv(entry->ct_priv->netdev); + INIT_WORK(&entry->work, mlx5_tc_ct_entry_del_work); + queue_work(priv->wq, &entry->work); +} + static struct mlx5_ct_counter * mlx5_tc_ct_counter_create(struct mlx5_tc_ct_priv *ct_priv) { @@ -793,16 +885,26 @@ mlx5_tc_ct_shared_counter_get(struct mlx5_tc_ct_priv *ct_priv, } /* Use the same counter as the reverse direction */ - mutex_lock(&ct_priv->shared_counter_lock); - rev_entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_ht, &rev_tuple, - tuples_ht_params); - if (rev_entry) { - if (refcount_inc_not_zero(&rev_entry->counter->refcount)) { - mutex_unlock(&ct_priv->shared_counter_lock); - return rev_entry->counter; - } + spin_lock_bh(&ct_priv->ht_lock); + rev_entry = mlx5_tc_ct_entry_get(ct_priv, &rev_tuple); + + if (IS_ERR(rev_entry)) { + spin_unlock_bh(&ct_priv->ht_lock); + goto create_counter; } - mutex_unlock(&ct_priv->shared_counter_lock); + + if (rev_entry && refcount_inc_not_zero(&rev_entry->counter->refcount)) { + ct_dbg("Using shared counter entry=0x%p rev=0x%p\n", entry, rev_entry); + shared_counter = rev_entry->counter; + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_tc_ct_entry_put(rev_entry); + return shared_counter; + } + + spin_unlock_bh(&ct_priv->ht_lock); + +create_counter: shared_counter = mlx5_tc_ct_counter_create(ct_priv); if (IS_ERR(shared_counter)) @@ -865,10 +967,14 @@ mlx5_tc_ct_block_flow_offload_add(struct mlx5_ct_ft *ft, if (!meta_action) return -EOPNOTSUPP; - entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, - cts_ht_params); - if (entry) - return 0; + spin_lock_bh(&ct_priv->ht_lock); + entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, cts_ht_params); + if (entry && refcount_inc_not_zero(&entry->refcnt)) { + spin_unlock_bh(&ct_priv->ht_lock); + mlx5_tc_ct_entry_put(entry); + return -EEXIST; + } + spin_unlock_bh(&ct_priv->ht_lock); entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) @@ -877,6 +983,8 @@ mlx5_tc_ct_block_flow_offload_add(struct mlx5_ct_ft *ft, entry->tuple.zone = ft->zone; entry->cookie = flow->cookie; entry->restore_cookie = meta_action->ct_metadata.cookie; + refcount_set(&entry->refcnt, 2); + entry->ct_priv = ct_priv; err = mlx5_tc_ct_rule_to_tuple(&entry->tuple, flow_rule); if (err) @@ -887,35 +995,40 @@ mlx5_tc_ct_block_flow_offload_add(struct mlx5_ct_ft *ft, if (err) goto 
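The conntrack offload rework above moves entry lifetime onto a refcount_t guarded by a spinlock: a lookup only hands out an entry whose refcount could still be raised, and whoever drops the last reference frees it. Stripped to its core (the example_* names are hypothetical; the real code looks entries up in rhashtables)::

  #include <linux/refcount.h>
  #include <linux/slab.h>
  #include <linux/spinlock.h>

  struct example_entry {
  	refcount_t refcnt;
  };

  static DEFINE_SPINLOCK(example_ht_lock);	/* protects the table */
  static struct example_entry *example_tbl;	/* stand-in for an rhashtable */

  static struct example_entry *example_get(void)
  {
  	struct example_entry *entry;

  	spin_lock_bh(&example_ht_lock);
  	entry = example_tbl;
  	if (entry && !refcount_inc_not_zero(&entry->refcnt))
  		entry = NULL;	/* found, but already on its way out */
  	spin_unlock_bh(&example_ht_lock);

  	return entry;
  }

  static void example_put(struct example_entry *entry)
  {
  	if (refcount_dec_and_test(&entry->refcnt))
  		kfree(entry);	/* last reference frees */
  }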
err_set; - err = rhashtable_insert_fast(&ct_priv->ct_tuples_ht, - &entry->tuple_node, - tuples_ht_params); + spin_lock_bh(&ct_priv->ht_lock); + + err = rhashtable_lookup_insert_fast(&ft->ct_entries_ht, &entry->node, + cts_ht_params); + if (err) + goto err_entries; + + err = rhashtable_lookup_insert_fast(&ct_priv->ct_tuples_ht, + &entry->tuple_node, + tuples_ht_params); if (err) goto err_tuple; if (memcmp(&entry->tuple, &entry->tuple_nat, sizeof(entry->tuple))) { - err = rhashtable_insert_fast(&ct_priv->ct_tuples_nat_ht, - &entry->tuple_nat_node, - tuples_nat_ht_params); + err = rhashtable_lookup_insert_fast(&ct_priv->ct_tuples_nat_ht, + &entry->tuple_nat_node, + tuples_nat_ht_params); if (err) goto err_tuple_nat; } + spin_unlock_bh(&ct_priv->ht_lock); err = mlx5_tc_ct_entry_add_rules(ct_priv, flow_rule, entry, ft->zone_restore_id); if (err) goto err_rules; - err = rhashtable_insert_fast(&ft->ct_entries_ht, &entry->node, - cts_ht_params); - if (err) - goto err_insert; + set_bit(MLX5_CT_ENTRY_FLAG_VALID, &entry->flags); + mlx5_tc_ct_entry_put(entry); /* this function reference */ return 0; -err_insert: - mlx5_tc_ct_entry_del_rules(ct_priv, entry); err_rules: + spin_lock_bh(&ct_priv->ht_lock); if (mlx5_tc_ct_entry_has_nat(entry)) rhashtable_remove_fast(&ct_priv->ct_tuples_nat_ht, &entry->tuple_nat_node, tuples_nat_ht_params); @@ -924,47 +1037,43 @@ err_tuple_nat: &entry->tuple_node, tuples_ht_params); err_tuple: + rhashtable_remove_fast(&ft->ct_entries_ht, + &entry->node, + cts_ht_params); +err_entries: + spin_unlock_bh(&ct_priv->ht_lock); err_set: kfree(entry); - netdev_warn(ct_priv->netdev, - "Failed to offload ct entry, err: %d\n", err); + if (err != -EEXIST) + netdev_warn(ct_priv->netdev, "Failed to offload ct entry, err: %d\n", err); return err; } -static void -mlx5_tc_ct_del_ft_entry(struct mlx5_tc_ct_priv *ct_priv, - struct mlx5_ct_entry *entry) -{ - mlx5_tc_ct_entry_del_rules(ct_priv, entry); - mutex_lock(&ct_priv->shared_counter_lock); - if (mlx5_tc_ct_entry_has_nat(entry)) - rhashtable_remove_fast(&ct_priv->ct_tuples_nat_ht, - &entry->tuple_nat_node, - tuples_nat_ht_params); - rhashtable_remove_fast(&ct_priv->ct_tuples_ht, &entry->tuple_node, - tuples_ht_params); - mutex_unlock(&ct_priv->shared_counter_lock); - mlx5_tc_ct_counter_put(ct_priv, entry); - -} - static int mlx5_tc_ct_block_flow_offload_del(struct mlx5_ct_ft *ft, struct flow_cls_offload *flow) { + struct mlx5_tc_ct_priv *ct_priv = ft->ct_priv; unsigned long cookie = flow->cookie; struct mlx5_ct_entry *entry; - entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, - cts_ht_params); - if (!entry) + spin_lock_bh(&ct_priv->ht_lock); + entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, cts_ht_params); + if (!entry) { + spin_unlock_bh(&ct_priv->ht_lock); return -ENOENT; + } - mlx5_tc_ct_del_ft_entry(ft->ct_priv, entry); - WARN_ON(rhashtable_remove_fast(&ft->ct_entries_ht, - &entry->node, - cts_ht_params)); - kfree(entry); + if (!mlx5_tc_ct_entry_valid(entry)) { + spin_unlock_bh(&ct_priv->ht_lock); + return -EINVAL; + } + + rhashtable_remove_fast(&ft->ct_entries_ht, &entry->node, cts_ht_params); + mlx5_tc_ct_entry_remove_from_tuples(entry); + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_tc_ct_entry_put(entry); return 0; } @@ -973,19 +1082,30 @@ static int mlx5_tc_ct_block_flow_offload_stats(struct mlx5_ct_ft *ft, struct flow_cls_offload *f) { + struct mlx5_tc_ct_priv *ct_priv = ft->ct_priv; unsigned long cookie = f->cookie; struct mlx5_ct_entry *entry; u64 lastuse, packets, bytes; - entry = 
rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, - cts_ht_params); - if (!entry) + spin_lock_bh(&ct_priv->ht_lock); + entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, cts_ht_params); + if (!entry) { + spin_unlock_bh(&ct_priv->ht_lock); return -ENOENT; + } + + if (!mlx5_tc_ct_entry_valid(entry) || !refcount_inc_not_zero(&entry->refcnt)) { + spin_unlock_bh(&ct_priv->ht_lock); + return -EINVAL; + } + + spin_unlock_bh(&ct_priv->ht_lock); mlx5_fc_query_cached(entry->counter->counter, &bytes, &packets, &lastuse); flow_stats_update(&f->stats, bytes, packets, 0, lastuse, FLOW_ACTION_HW_STATS_DELAYED); + mlx5_tc_ct_entry_put(entry); return 0; } @@ -1481,11 +1601,9 @@ err_mapping: static void mlx5_tc_ct_flush_ft_entry(void *ptr, void *arg) { - struct mlx5_tc_ct_priv *ct_priv = arg; struct mlx5_ct_entry *entry = ptr; - mlx5_tc_ct_del_ft_entry(ct_priv, entry); - kfree(entry); + mlx5_tc_ct_entry_put(entry); } static void @@ -1962,6 +2080,7 @@ mlx5_tc_ct_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains, goto err_mapping_labels; } + spin_lock_init(&ct_priv->ht_lock); ct_priv->ns_type = ns_type; ct_priv->chains = chains; ct_priv->netdev = priv->netdev; @@ -1996,7 +2115,6 @@ mlx5_tc_ct_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains, idr_init(&ct_priv->fte_ids); mutex_init(&ct_priv->control_lock); - mutex_init(&ct_priv->shared_counter_lock); rhashtable_init(&ct_priv->zone_ht, &zone_params); rhashtable_init(&ct_priv->ct_tuples_ht, &tuples_ht_params); rhashtable_init(&ct_priv->ct_tuples_nat_ht, &tuples_nat_ht_params); @@ -2039,7 +2157,6 @@ mlx5_tc_ct_clean(struct mlx5_tc_ct_priv *ct_priv) rhashtable_destroy(&ct_priv->ct_tuples_nat_ht); rhashtable_destroy(&ct_priv->zone_ht); mutex_destroy(&ct_priv->control_lock); - mutex_destroy(&ct_priv->shared_counter_lock); idr_destroy(&ct_priv->fte_ids); kfree(ct_priv); } @@ -2061,14 +2178,22 @@ mlx5e_tc_ct_restore_flow(struct mlx5_tc_ct_priv *ct_priv, if (!mlx5_tc_ct_skb_to_tuple(skb, &tuple, zone)) return false; - entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_ht, &tuple, - tuples_ht_params); - if (!entry) - entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_nat_ht, - &tuple, tuples_nat_ht_params); - if (!entry) + spin_lock(&ct_priv->ht_lock); + + entry = mlx5_tc_ct_entry_get(ct_priv, &tuple); + if (!entry) { + spin_unlock(&ct_priv->ht_lock); + return false; + } + + if (IS_ERR(entry)) { + spin_unlock(&ct_priv->ht_lock); return false; + } + spin_unlock(&ct_priv->ht_lock); tcf_ct_flow_table_restore_skb(skb, entry->restore_cookie); + __mlx5_tc_ct_entry_put(entry); + return true; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index 4880f2179273..2371b83dad9c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -26,6 +26,13 @@ #define MLX5E_RX_ERR_CQE(cqe) (get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND) +static inline +ktime_t mlx5e_cqe_ts_to_ns(cqe_ts_to_ns func, struct mlx5_clock *clock, u64 cqe_ts) +{ + return INDIRECT_CALL_2(func, mlx5_real_time_cyc2time, mlx5_timecounter_cyc2time, + clock, cqe_ts); +} + enum mlx5e_icosq_wqe_type { MLX5E_ICOSQ_WQE_NOP, MLX5E_ICOSQ_WQE_UMR_RX, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h index d487e5e37162..8d991c3b7a50 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h @@ -83,7 +83,7 @@ static inline void mlx5e_xdp_tx_disable(struct 
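mlx5e_cqe_ts_to_ns() above wraps the stored function pointer in INDIRECT_CALL_2() so the two expected targets are reached through direct calls, sparing a retpoline-penalised indirect branch on the hot path. What the macro boils down to, in spirit (the *_cyc2ns functions here are invented)::

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  typedef uint64_t (*cyc2ns_fn)(uint64_t cycles);

  static uint64_t real_time_cyc2ns(uint64_t c)    { return c; }
  static uint64_t free_running_cyc2ns(uint64_t c) { return c * 3; }

  /* Compare against the likely targets and call them directly; only an
   * unexpected pointer pays for a true indirect call.
   */
  static uint64_t call_cyc2ns(cyc2ns_fn fn, uint64_t cycles)
  {
  	if (fn == real_time_cyc2ns)
  		return real_time_cyc2ns(cycles);
  	if (fn == free_running_cyc2ns)
  		return free_running_cyc2ns(cycles);
  	return fn(cycles);
  }

  int main(void)
  {
  	printf("%" PRIu64 "\n", call_cyc2ns(free_running_cyc2ns, 14));
  	return 0;
  }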
mlx5e_priv *priv) clear_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state); /* Let other device's napi(s) and XSK wakeups see our new state. */ - synchronize_rcu(); + synchronize_net(); } static inline bool mlx5e_xdp_tx_is_enabled(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c index d87c345878d3..f4bce1365639 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c @@ -111,7 +111,7 @@ err_free_cparam: void mlx5e_close_xsk(struct mlx5e_channel *c) { clear_bit(MLX5E_CHANNEL_STATE_XSK, c->state); - synchronize_rcu(); /* Sync with the XSK wakeup and with NAPI. */ + synchronize_net(); /* Sync with the XSK wakeup and with NAPI. */ mlx5e_close_rq(&c->xskrq); mlx5e_close_cq(&c->xskrq.cq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h index 959bb6cd7203..cc0efac7b812 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h @@ -173,7 +173,7 @@ static inline bool mlx5e_accel_tx_eseg(struct mlx5e_priv *priv, #endif #if IS_ENABLED(CONFIG_GENEVE) - if (skb->encapsulation) + if (skb->encapsulation && skb->ip_summed == CHECKSUM_PARTIAL) mlx5e_tx_tunnel_accel(skb, eseg, ihs); #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c index 6a1d82503ef8..d06532d0baa4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c @@ -57,6 +57,20 @@ struct mlx5e_ktls_offload_context_rx { struct mlx5e_ktls_rx_resync_ctx resync; }; +static bool mlx5e_ktls_priv_rx_put(struct mlx5e_ktls_offload_context_rx *priv_rx) +{ + if (!refcount_dec_and_test(&priv_rx->resync.refcnt)) + return false; + + kfree(priv_rx); + return true; +} + +static void mlx5e_ktls_priv_rx_get(struct mlx5e_ktls_offload_context_rx *priv_rx) +{ + refcount_inc(&priv_rx->resync.refcnt); +} + static int mlx5e_ktls_create_tir(struct mlx5_core_dev *mdev, u32 *tirn, u32 rqtn) { int err, inlen; @@ -326,7 +340,7 @@ static void resync_handle_work(struct work_struct *work) priv_rx = container_of(resync, struct mlx5e_ktls_offload_context_rx, resync); if (unlikely(test_bit(MLX5E_PRIV_RX_FLAG_DELETING, priv_rx->flags))) { - refcount_dec(&resync->refcnt); + mlx5e_ktls_priv_rx_put(priv_rx); return; } @@ -334,7 +348,7 @@ static void resync_handle_work(struct work_struct *work) sq = &c->async_icosq; if (resync_post_get_progress_params(sq, priv_rx)) - refcount_dec(&resync->refcnt); + mlx5e_ktls_priv_rx_put(priv_rx); } static void resync_init(struct mlx5e_ktls_rx_resync_ctx *resync, @@ -377,7 +391,11 @@ unlock: return err; } -/* Function is called with elevated refcount, it decreases it. */ +/* Function can be called with the refcount being either elevated or not. + * It decreases the refcount and may free the kTLS priv context. + * Refcount is not elevated only if tls_dev_del has been called, but GET_PSV was + * already in flight. 
+ */ void mlx5e_ktls_handle_get_psv_completion(struct mlx5e_icosq_wqe_info *wi, struct mlx5e_icosq *sq) { @@ -410,7 +428,7 @@ void mlx5e_ktls_handle_get_psv_completion(struct mlx5e_icosq_wqe_info *wi, tls_offload_rx_resync_async_request_end(priv_rx->sk, cpu_to_be32(hw_seq)); priv_rx->stats->tls_resync_req_end++; out: - refcount_dec(&resync->refcnt); + mlx5e_ktls_priv_rx_put(priv_rx); dma_unmap_single(dev, buf->dma_addr, PROGRESS_PARAMS_PADDED_SIZE, DMA_FROM_DEVICE); kfree(buf); } @@ -431,9 +449,9 @@ static bool resync_queue_get_psv(struct sock *sk) return false; resync = &priv_rx->resync; - refcount_inc(&resync->refcnt); + mlx5e_ktls_priv_rx_get(priv_rx); if (unlikely(!queue_work(resync->priv->tls->rx_wq, &resync->work))) - refcount_dec(&resync->refcnt); + mlx5e_ktls_priv_rx_put(priv_rx); return true; } @@ -625,31 +643,6 @@ err_create_key: return err; } -/* Elevated refcount on the resync object means there are - * outstanding operations (uncompleted GET_PSV WQEs) that - * will read the resync / priv_rx objects once completed. - * Wait for them to avoid use-after-free. - */ -static void wait_for_resync(struct net_device *netdev, - struct mlx5e_ktls_rx_resync_ctx *resync) -{ -#define MLX5E_KTLS_RX_RESYNC_TIMEOUT 20000 /* msecs */ - unsigned long exp_time = jiffies + msecs_to_jiffies(MLX5E_KTLS_RX_RESYNC_TIMEOUT); - unsigned int refcnt; - - do { - refcnt = refcount_read(&resync->refcnt); - if (refcnt == 1) - return; - - msleep(20); - } while (time_before(jiffies, exp_time)); - - netdev_warn(netdev, - "Failed waiting for kTLS RX resync refcnt to be released (%u).\n", - refcnt); -} - void mlx5e_ktls_del_rx(struct net_device *netdev, struct tls_context *tls_ctx) { struct mlx5e_ktls_offload_context_rx *priv_rx; @@ -663,7 +656,7 @@ void mlx5e_ktls_del_rx(struct net_device *netdev, struct tls_context *tls_ctx) priv_rx = mlx5e_get_ktls_rx_priv_ctx(tls_ctx); set_bit(MLX5E_PRIV_RX_FLAG_DELETING, priv_rx->flags); mlx5e_set_ktls_rx_priv_ctx(tls_ctx, NULL); - synchronize_rcu(); /* Sync with NAPI */ + synchronize_net(); /* Sync with NAPI */ if (!cancel_work_sync(&priv_rx->rule.work)) /* completion is needed, as the priv_rx in the add flow * is maintained on the wqe info (wi), not on the socket. @@ -671,8 +664,7 @@ void mlx5e_ktls_del_rx(struct net_device *netdev, struct tls_context *tls_ctx) wait_for_completion(&priv_rx->add_ctx); resync = &priv_rx->resync; if (cancel_work_sync(&resync->work)) - refcount_dec(&resync->refcnt); - wait_for_resync(netdev, resync); + mlx5e_ktls_priv_rx_put(priv_rx); priv_rx->stats->tls_del++; if (priv_rx->rule.rule) @@ -680,5 +672,9 @@ void mlx5e_ktls_del_rx(struct net_device *netdev, struct tls_context *tls_ctx) mlx5_core_destroy_tir(mdev, priv_rx->tirn); mlx5_ktls_destroy_key(mdev, priv_rx->key_id); - kfree(priv_rx); + /* priv_rx should normally be freed here, but if there is an outstanding + * GET_PSV, deallocation will be delayed until the CQE for GET_PSV is + * processed. 
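The kTLS RX hunks above replace the polling wait_for_resync() with plain reference counting: every outstanding GET_PSV holds a reference, and mlx5e_ktls_priv_rx_put() frees the context only when the last one drops. The teardown idiom, reduced to a sketch with hypothetical example_* names::

  #include <linux/refcount.h>
  #include <linux/slab.h>
  #include <linux/workqueue.h>

  struct example_ctx {
  	refcount_t refcnt;
  	struct work_struct work;
  };

  static void example_put(struct example_ctx *ctx)
  {
  	if (refcount_dec_and_test(&ctx->refcnt))
  		kfree(ctx);
  }

  /* A queued work item owns a reference. If cancel_work_sync() really
   * cancelled it, that reference is dropped here; otherwise the work
   * function drops it itself. The caller's reference goes last, so the
   * object may outlive this function until in-flight users are done.
   */
  static void example_del(struct example_ctx *ctx)
  {
  	if (cancel_work_sync(&ctx->work))
  		example_put(ctx);

  	example_put(ctx);
  }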
+ */ + mlx5e_ktls_priv_rx_put(priv_rx); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 5e9474dba4e5..abdf721bb264 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -536,7 +536,7 @@ static int mlx5e_get_coalesce(struct net_device *netdev, #define MLX5E_MAX_COAL_FRAMES MLX5_MAX_CQ_COUNT static void -mlx5e_set_priv_channels_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) +mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) { struct mlx5_core_dev *mdev = priv->mdev; int tc; @@ -551,6 +551,17 @@ mlx5e_set_priv_channels_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesc coal->tx_coalesce_usecs, coal->tx_max_coalesced_frames); } + } +} + +static void +mlx5e_set_priv_channels_rx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int i; + + for (i = 0; i < priv->channels.num; ++i) { + struct mlx5e_channel *c = priv->channels.c[i]; mlx5_core_modify_cq_moderation(mdev, &c->rq.cq.mcq, coal->rx_coalesce_usecs, @@ -597,21 +608,9 @@ int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv, tx_moder->pkts = coal->tx_max_coalesced_frames; new_channels.params.tx_dim_enabled = !!coal->use_adaptive_tx_coalesce; - if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { - priv->channels.params = new_channels.params; - goto out; - } - /* we are opened */ - reset_rx = !!coal->use_adaptive_rx_coalesce != priv->channels.params.rx_dim_enabled; reset_tx = !!coal->use_adaptive_tx_coalesce != priv->channels.params.tx_dim_enabled; - if (!reset_rx && !reset_tx) { - mlx5e_set_priv_channels_coalesce(priv, coal); - priv->channels.params = new_channels.params; - goto out; - } - if (reset_rx) { u8 mode = MLX5E_GET_PFLAG(&new_channels.params, MLX5E_PFLAG_RX_CQE_BASED_MODER); @@ -625,6 +624,20 @@ int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv, mlx5e_reset_tx_moderation(&new_channels.params, mode); } + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + priv->channels.params = new_channels.params; + goto out; + } + + if (!reset_rx && !reset_tx) { + if (!coal->use_adaptive_rx_coalesce) + mlx5e_set_priv_channels_rx_coalesce(priv, coal); + if (!coal->use_adaptive_tx_coalesce) + mlx5e_set_priv_channels_tx_coalesce(priv, coal); + priv->channels.params = new_channels.params; + goto out; + } + err = mlx5e_safe_switch_channels(priv, &new_channels, NULL, NULL); out: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index d3534b657b98..ec2fcb2a2977 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -67,6 +67,7 @@ #include "en/ptp.h" #include "qos.h" #include "en/trap.h" +#include "fpga/ipsec.h" bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) { @@ -108,7 +109,7 @@ bool mlx5e_striding_rq_possible(struct mlx5_core_dev *mdev, if (!mlx5e_check_fragmented_striding_rq_cap(mdev)) return false; - if (MLX5_IPSEC_DEV(mdev)) + if (mlx5_fpga_is_ipsec_device(mdev)) return false; if (params->xdp_prog) { @@ -449,6 +450,9 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); rq->xdpsq = &c->rq_xdpsq; rq->xsk_pool = xsk_pool; + rq->ptp_cyc2time = mlx5_is_real_time_rq(mdev) ? 
+ mlx5_real_time_cyc2time : + mlx5_timecounter_cyc2time; if (rq->xsk_pool) rq->stats = &c->priv->channel_stats[c->ix].xskrq; @@ -680,7 +684,7 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq) int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param) { struct mlx5_core_dev *mdev = rq->mdev; - + u8 ts_format; void *in; void *rqc; void *wq; @@ -693,6 +697,9 @@ int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param) if (!in) return -ENOMEM; + ts_format = mlx5_is_real_time_rq(mdev) ? + MLX5_RQC_TIMESTAMP_FORMAT_REAL_TIME : + MLX5_RQC_TIMESTAMP_FORMAT_FREE_RUNNING; rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); wq = MLX5_ADDR_OF(rqc, rqc, wq); @@ -700,6 +707,7 @@ int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param) MLX5_SET(rqc, rqc, cqn, rq->cq.mcq.cqn); MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, ts_format, ts_format); MLX5_SET(wq, wq, log_wq_pg_sz, rq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, rq->wq_ctrl.db.dma); @@ -940,7 +948,7 @@ void mlx5e_activate_rq(struct mlx5e_rq *rq) void mlx5e_deactivate_rq(struct mlx5e_rq *rq) { clear_bit(MLX5E_RQ_STATE_ENABLED, &rq->state); - synchronize_rcu(); /* Sync with NAPI to prevent mlx5e_post_rx_wqes. */ + synchronize_net(); /* Sync with NAPI to prevent mlx5e_post_rx_wqes. */ } void mlx5e_close_rq(struct mlx5e_rq *rq) @@ -1179,6 +1187,9 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, if (param->is_mpw) set_bit(MLX5E_SQ_STATE_MPWQE, &sq->state); sq->stop_room = param->stop_room; + sq->ptp_cyc2time = mlx5_is_real_time_sq(mdev) ? + mlx5_real_time_cyc2time : + mlx5_timecounter_cyc2time; param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, wq, &sq->wq_ctrl); @@ -1212,6 +1223,7 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev, struct mlx5e_create_sq_param *csp, u32 *sqn) { + u8 ts_format; void *in; void *sqc; void *wq; @@ -1224,6 +1236,9 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev, if (!in) return -ENOMEM; + ts_format = mlx5_is_real_time_sq(mdev) ? + MLX5_SQC_TIMESTAMP_FORMAT_REAL_TIME : + MLX5_SQC_TIMESTAMP_FORMAT_FREE_RUNNING; sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); wq = MLX5_ADDR_OF(sqc, sqc, wq); @@ -1232,6 +1247,8 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev, MLX5_SET(sqc, sqc, tis_num_0, csp->tisn); MLX5_SET(sqc, sqc, cqn, csp->cqn); MLX5_SET(sqc, sqc, ts_cqe_to_dest_cqn, csp->ts_cqe_to_dest_cqn); + MLX5_SET(sqc, sqc, ts_format, ts_format); + if (MLX5_CAP_ETH(mdev, wqe_inline_mode) == MLX5_CAP_INLINE_MODE_VPORT_CONTEXT) MLX5_SET(sqc, sqc, min_wqe_inline_mode, csp->min_inline_mode); @@ -1385,7 +1402,7 @@ void mlx5e_deactivate_txqsq(struct mlx5e_txqsq *sq) struct mlx5_wq_cyc *wq = &sq->wq; clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); - synchronize_rcu(); /* Sync with NAPI to prevent netif_tx_wake_queue. */ + synchronize_net(); /* Sync with NAPI to prevent netif_tx_wake_queue. */ mlx5e_tx_disable_queue(sq->txq); @@ -1460,7 +1477,7 @@ void mlx5e_activate_icosq(struct mlx5e_icosq *icosq) void mlx5e_deactivate_icosq(struct mlx5e_icosq *icosq) { clear_bit(MLX5E_SQ_STATE_ENABLED, &icosq->state); - synchronize_rcu(); /* Sync with NAPI. */ + synchronize_net(); /* Sync with NAPI. */ } void mlx5e_close_icosq(struct mlx5e_icosq *sq) @@ -1539,7 +1556,7 @@ void mlx5e_close_xdpsq(struct mlx5e_xdpsq *sq) struct mlx5e_channel *c = sq->channel; clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); - synchronize_rcu(); /* Sync with NAPI. */ + synchronize_net(); /* Sync with NAPI. 
*/ mlx5e_destroy_sq(c->mdev, sq->sqn); mlx5e_free_xdpsq_descs(sq); @@ -1863,12 +1880,12 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, mlx5e_build_create_cq_param(&ccp, c); - err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->icosq.cqp, &ccp, + err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->async_icosq.cqp, &ccp, &c->async_icosq.cq); if (err) return err; - err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->async_icosq.cqp, &ccp, + err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->icosq.cqp, &ccp, &c->icosq.cq); if (err) goto err_close_async_icosq_cq; @@ -2106,7 +2123,7 @@ static void mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev, u32 buf_size = 0; int i; - if (MLX5_IPSEC_DEV(mdev)) + if (mlx5_fpga_is_ipsec_device(mdev)) byte_count += MLX5E_METADATA_ETHER_LEN; if (mlx5e_rx_is_linear_skb(params, xsk)) { @@ -4574,8 +4591,9 @@ static int mlx5e_xdp_allowed(struct mlx5e_priv *priv, struct bpf_prog *prog) return -EINVAL; } - if (MLX5_IPSEC_DEV(priv->mdev)) { - netdev_warn(netdev, "can't set XDP with IPSec offload\n"); + if (mlx5_fpga_is_ipsec_device(priv->mdev)) { + netdev_warn(netdev, + "XDP is not available on Innova cards with IPsec support\n"); return -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 4de5a97ceac6..1b6ad94ebb10 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -47,7 +47,6 @@ #include "fpga/ipsec.h" #include "en_accel/ipsec_rxtx.h" #include "en_accel/tls_rxtx.h" -#include "lib/clock.h" #include "en/xdp.h" #include "en/xsk/rx.h" #include "en/health.h" @@ -1062,9 +1061,8 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe, } if (unlikely(mlx5e_rx_hw_stamp(rq->tstamp))) - skb_hwtstamps(skb)->hwtstamp = - mlx5_timecounter_cyc2time(rq->clock, get_cqe_ts(cqe)); - + skb_hwtstamps(skb)->hwtstamp = mlx5e_cqe_ts_to_ns(rq->ptp_cyc2time, + rq->clock, get_cqe_ts(cqe)); skb_record_rx_queue(skb, rq->ix); if (likely(netdev->features & NETIF_F_RXHASH)) @@ -1667,9 +1665,8 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, } if (unlikely(mlx5e_rx_hw_stamp(tstamp))) - skb_hwtstamps(skb)->hwtstamp = - mlx5_timecounter_cyc2time(rq->clock, get_cqe_ts(cqe)); - + skb_hwtstamps(skb)->hwtstamp = mlx5e_cqe_ts_to_ns(rq->ptp_cyc2time, + rq->clock, get_cqe_ts(cqe)); skb_record_rx_queue(skb, rq->ix); if (likely(netdev->features & NETIF_F_RXHASH)) @@ -1786,8 +1783,8 @@ int mlx5e_rq_set_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params, bool rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe; rq->handle_rx_cqe = priv->profile->rx_handlers->handle_rx_cqe_mpwqe; - if (MLX5_IPSEC_DEV(mdev)) { - netdev_err(netdev, "MPWQE RQ with IPSec offload not supported\n"); + if (mlx5_fpga_is_ipsec_device(mdev)) { + netdev_err(netdev, "MPWQE RQ with Innova IPSec offload not supported\n"); return -EINVAL; } if (!rq->handle_rx_cqe) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 9f126054d371..0da69b98f38f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -4445,7 +4445,7 @@ static int apply_police_params(struct mlx5e_priv *priv, u64 rate, */ if (rate) { rate = (rate * BITS_PER_BYTE) + 500000; - rate_mbps = max_t(u32, do_div(rate, 1000000), 1); + rate_mbps = max_t(u64, do_div(rate, 1000000), 1); } err = mlx5_esw_modify_vport_rate(esw, vport_num, rate_mbps); diff --git 
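The repeated synchronize_rcu() to synchronize_net() conversions in the mlx5e hunks above are safe because, for networking code, synchronize_net() gives the same grace-period guarantee while opportunistically taking the expedited path. Its definition in net/core/dev.c is essentially::

  #include <linux/rcupdate.h>
  #include <linux/rtnetlink.h>

  void synchronize_net(void)
  {
  	might_sleep();
  	if (rtnl_is_locked())
  		synchronize_rcu_expedited();
  	else
  		synchronize_rcu();
  }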
a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index da6a358a8a10..bdbffe484fce 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -39,7 +39,6 @@ #include "en/txrx.h" #include "ipoib/ipoib.h" #include "en_accel/en_accel.h" -#include "lib/clock.h" #include "en/ptp.h" static void mlx5e_dma_unmap_wqe_err(struct mlx5e_txqsq *sq, u8 num_dma) @@ -802,7 +801,7 @@ static void mlx5e_consume_skb(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct skb_shared_hwtstamps hwts = {}; u64 ts = get_cqe_ts(cqe); - hwts.hwtstamp = mlx5_timecounter_cyc2time(sq->clock, ts); + hwts.hwtstamp = mlx5e_cqe_ts_to_ns(sq->ptp_cyc2time, sq->clock, ts); if (sq->ptpsq) mlx5e_skb_cb_hwtstamp_handler(skb, MLX5E_SKB_CB_CQE_HWTSTAMP, hwts.hwtstamp, sq->ptpsq->cq_stats); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c index cc67366495b0..22bee4990232 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c @@ -124,7 +124,7 @@ struct mlx5_fpga_ipsec { struct ida halloc; }; -static bool mlx5_fpga_is_ipsec_device(struct mlx5_core_dev *mdev) +bool mlx5_fpga_is_ipsec_device(struct mlx5_core_dev *mdev) { if (!mdev->fpga || !MLX5_CAP_GEN(mdev, fpga)) return false; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h index db88eb4c49e3..8931b5584477 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h @@ -43,6 +43,7 @@ u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev); const struct mlx5_flow_cmds * mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type); void mlx5_fpga_ipsec_build_fs_cmds(void); +bool mlx5_fpga_is_ipsec_device(struct mlx5_core_dev *mdev); #else static inline const struct mlx5_accel_ipsec_ops *mlx5_fpga_ipsec_ops(struct mlx5_core_dev *mdev) @@ -55,6 +56,7 @@ mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type) } static inline void mlx5_fpga_ipsec_build_fs_cmds(void) {}; +static inline bool mlx5_fpga_is_ipsec_device(struct mlx5_core_dev *mdev) { return false; } #endif /* CONFIG_MLX5_FPGA_IPSEC */ #endif /* __MLX5_FPGA_IPSEC_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 54523bed16cd..0c32c485eb58 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -190,6 +190,16 @@ static bool reset_fw_if_needed(struct mlx5_core_dev *dev) return true; } +static void enter_error_state(struct mlx5_core_dev *dev, bool force) +{ + if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */ + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; + mlx5_cmd_flush(dev); + } + + mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1); +} + void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) { bool err_detected = false; @@ -208,12 +218,7 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) goto unlock; } - if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */ - dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; - mlx5_cmd_flush(dev); - } - - mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1); + enter_error_state(dev, force); unlock: 
mutex_unlock(&dev->intf_state_mutex); } @@ -613,7 +618,7 @@ static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work) priv = container_of(health, struct mlx5_priv, health); dev = container_of(priv, struct mlx5_core_dev, priv); - mlx5_enter_error_state(dev, false); + enter_error_state(dev, false); if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) { if (mlx5_health_try_recover(dev)) mlx5_core_err(dev, "health recovery failed\n"); @@ -707,8 +712,9 @@ static void poll_health(struct timer_list *t) mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error); dev->priv.health.fatal_error = fatal_error; print_health_info(dev); + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; mlx5_trigger_health_work(dev); - goto out; + return; } count = ioread32be(health->health_counter); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index c70c1f0ca0c1..b0e129d0f6d8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -67,39 +67,68 @@ enum { MLX5_MTPPS_FS_ENH_OUT_PER_ADJ = BIT(0x7), }; -static u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev, - struct ptp_system_timestamp *sts) +static bool mlx5_real_time_mode(struct mlx5_core_dev *mdev) +{ + return (mlx5_is_real_time_rq(mdev) || mlx5_is_real_time_sq(mdev)); +} + +static bool mlx5_modify_mtutc_allowed(struct mlx5_core_dev *mdev) +{ + return MLX5_CAP_MCAM_FEATURE(mdev, ptpcyc2realtime_modify); +} + +static int mlx5_set_mtutc(struct mlx5_core_dev *dev, u32 *mtutc, u32 size) +{ + u32 out[MLX5_ST_SZ_DW(mtutc_reg)] = {}; + + if (!MLX5_CAP_MCAM_REG(dev, mtutc)) + return -EOPNOTSUPP; + + return mlx5_core_access_reg(dev, mtutc, size, out, sizeof(out), + MLX5_REG_MTUTC, 0, 1); +} + +static u64 mlx5_read_time(struct mlx5_core_dev *dev, + struct ptp_system_timestamp *sts, + bool real_time) { u32 timer_h, timer_h1, timer_l; - timer_h = ioread32be(&dev->iseg->internal_timer_h); + timer_h = ioread32be(real_time ? &dev->iseg->real_time_h : + &dev->iseg->internal_timer_h); ptp_read_system_prets(sts); - timer_l = ioread32be(&dev->iseg->internal_timer_l); + timer_l = ioread32be(real_time ? &dev->iseg->real_time_l : + &dev->iseg->internal_timer_l); ptp_read_system_postts(sts); - timer_h1 = ioread32be(&dev->iseg->internal_timer_h); + timer_h1 = ioread32be(real_time ? &dev->iseg->real_time_h : + &dev->iseg->internal_timer_h); if (timer_h != timer_h1) { /* wrap around */ ptp_read_system_prets(sts); - timer_l = ioread32be(&dev->iseg->internal_timer_l); + timer_l = ioread32be(real_time ? &dev->iseg->real_time_l : + &dev->iseg->internal_timer_l); ptp_read_system_postts(sts); } - return (u64)timer_l | (u64)timer_h1 << 32; + return real_time ? 
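mlx5_read_time() above reads a 64-bit hardware clock exposed as two 32-bit registers, re-reading the low word whenever the high word changed in between, since that signals a carry during the sequence. The same idea in isolation; split_counter is a made-up stand-in for the iseg registers::

  #include <stdint.h>

  struct split_counter {
  	volatile uint32_t hi;
  	volatile uint32_t lo;
  };

  static uint64_t read_split_counter(const struct split_counter *c)
  {
  	uint32_t hi = c->hi;
  	uint32_t lo = c->lo;
  	uint32_t hi2 = c->hi;

  	if (hi != hi2)		/* low word wrapped between the reads */
  		lo = c->lo;	/* re-read: now consistent with hi2 */

  	return ((uint64_t)hi2 << 32) | lo;
  }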
REAL_TIME_TO_NS(timer_h1, timer_l) : + (u64)timer_l | (u64)timer_h1 << 32; } static u64 read_internal_timer(const struct cyclecounter *cc) { - struct mlx5_clock *clock = container_of(cc, struct mlx5_clock, cycles); + struct mlx5_timer *timer = container_of(cc, struct mlx5_timer, cycles); + struct mlx5_clock *clock = container_of(timer, struct mlx5_clock, timer); struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, clock); - return mlx5_read_internal_timer(mdev, NULL) & cc->mask; + return mlx5_read_time(mdev, NULL, false) & cc->mask; } static void mlx5_update_clock_info_page(struct mlx5_core_dev *mdev) { struct mlx5_ib_clock_info *clock_info = mdev->clock_info; struct mlx5_clock *clock = &mdev->clock; + struct mlx5_timer *timer; u32 sign; if (!clock_info) @@ -109,10 +138,11 @@ static void mlx5_update_clock_info_page(struct mlx5_core_dev *mdev) smp_store_mb(clock_info->sign, sign | MLX5_IB_CLOCK_INFO_KERNEL_UPDATING); - clock_info->cycles = clock->tc.cycle_last; - clock_info->mult = clock->cycles.mult; - clock_info->nsec = clock->tc.nsec; - clock_info->frac = clock->tc.frac; + timer = &clock->timer; + clock_info->cycles = timer->tc.cycle_last; + clock_info->mult = timer->cycles.mult; + clock_info->nsec = timer->tc.nsec; + clock_info->frac = timer->tc.frac; smp_store_release(&clock_info->sign, sign + MLX5_IB_CLOCK_INFO_KERNEL_UPDATING * 2); @@ -151,92 +181,184 @@ static void mlx5_timestamp_overflow(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct mlx5_core_dev *mdev; + struct mlx5_timer *timer; struct mlx5_clock *clock; unsigned long flags; - clock = container_of(dwork, struct mlx5_clock, overflow_work); + timer = container_of(dwork, struct mlx5_timer, overflow_work); + clock = container_of(timer, struct mlx5_clock, timer); mdev = container_of(clock, struct mlx5_core_dev, clock); + write_seqlock_irqsave(&clock->lock, flags); - timecounter_read(&clock->tc); + timecounter_read(&timer->tc); mlx5_update_clock_info_page(mdev); write_sequnlock_irqrestore(&clock->lock, flags); - schedule_delayed_work(&clock->overflow_work, clock->overflow_period); + schedule_delayed_work(&timer->overflow_work, timer->overflow_period); +} + +static int mlx5_ptp_settime_real_time(struct mlx5_core_dev *mdev, + const struct timespec64 *ts) +{ + u32 in[MLX5_ST_SZ_DW(mtutc_reg)] = {}; + + if (!mlx5_modify_mtutc_allowed(mdev)) + return 0; + + if (ts->tv_sec < 0 || ts->tv_sec > U32_MAX || + ts->tv_nsec < 0 || ts->tv_nsec > NSEC_PER_SEC) + return -EINVAL; + + MLX5_SET(mtutc_reg, in, operation, MLX5_MTUTC_OPERATION_SET_TIME_IMMEDIATE); + MLX5_SET(mtutc_reg, in, utc_sec, ts->tv_sec); + MLX5_SET(mtutc_reg, in, utc_nsec, ts->tv_nsec); + + return mlx5_set_mtutc(mdev, in, sizeof(in)); } static int mlx5_ptp_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts) { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); - u64 ns = timespec64_to_ns(ts); + struct mlx5_timer *timer = &clock->timer; struct mlx5_core_dev *mdev; unsigned long flags; + int err; mdev = container_of(clock, struct mlx5_core_dev, clock); + err = mlx5_ptp_settime_real_time(mdev, ts); + if (err) + return err; + write_seqlock_irqsave(&clock->lock, flags); - timecounter_init(&clock->tc, &clock->cycles, ns); + timecounter_init(&timer->tc, &timer->cycles, timespec64_to_ns(ts)); mlx5_update_clock_info_page(mdev); write_sequnlock_irqrestore(&clock->lock, flags); return 0; } +static +struct timespec64 mlx5_ptp_gettimex_real_time(struct mlx5_core_dev *mdev, + struct ptp_system_timestamp 
*sts) +{ + struct timespec64 ts; + u64 time; + + time = mlx5_read_time(mdev, sts, true); + ts = ns_to_timespec64(time); + return ts; +} + static int mlx5_ptp_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, struct ptp_system_timestamp *sts) { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_timer *timer = &clock->timer; struct mlx5_core_dev *mdev; unsigned long flags; u64 cycles, ns; mdev = container_of(clock, struct mlx5_core_dev, clock); + if (mlx5_real_time_mode(mdev)) { + *ts = mlx5_ptp_gettimex_real_time(mdev, sts); + goto out; + } + write_seqlock_irqsave(&clock->lock, flags); - cycles = mlx5_read_internal_timer(mdev, sts); - ns = timecounter_cyc2time(&clock->tc, cycles); + cycles = mlx5_read_time(mdev, sts, false); + ns = timecounter_cyc2time(&timer->tc, cycles); write_sequnlock_irqrestore(&clock->lock, flags); - *ts = ns_to_timespec64(ns); - +out: return 0; } +static int mlx5_ptp_adjtime_real_time(struct mlx5_core_dev *mdev, s64 delta) +{ + u32 in[MLX5_ST_SZ_DW(mtutc_reg)] = {}; + + if (!mlx5_modify_mtutc_allowed(mdev)) + return 0; + + /* HW time adjustment range is s16. If out of range, settime instead */ + if (delta < S16_MIN || delta > S16_MAX) { + struct timespec64 ts; + s64 ns; + + ts = mlx5_ptp_gettimex_real_time(mdev, NULL); + ns = timespec64_to_ns(&ts) + delta; + ts = ns_to_timespec64(ns); + return mlx5_ptp_settime_real_time(mdev, &ts); + } + + MLX5_SET(mtutc_reg, in, operation, MLX5_MTUTC_OPERATION_ADJUST_TIME); + MLX5_SET(mtutc_reg, in, time_adjustment, delta); + + return mlx5_set_mtutc(mdev, in, sizeof(in)); +} + static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_timer *timer = &clock->timer; struct mlx5_core_dev *mdev; unsigned long flags; + int err; mdev = container_of(clock, struct mlx5_core_dev, clock); + + err = mlx5_ptp_adjtime_real_time(mdev, delta); + if (err) + return err; write_seqlock_irqsave(&clock->lock, flags); - timecounter_adjtime(&clock->tc, delta); + timecounter_adjtime(&timer->tc, delta); mlx5_update_clock_info_page(mdev); write_sequnlock_irqrestore(&clock->lock, flags); return 0; } +static int mlx5_ptp_adjfreq_real_time(struct mlx5_core_dev *mdev, s32 freq) +{ + u32 in[MLX5_ST_SZ_DW(mtutc_reg)] = {}; + + if (!mlx5_modify_mtutc_allowed(mdev)) + return 0; + + MLX5_SET(mtutc_reg, in, operation, MLX5_MTUTC_OPERATION_ADJUST_FREQ_UTC); + MLX5_SET(mtutc_reg, in, freq_adjustment, freq); + + return mlx5_set_mtutc(mdev, in, sizeof(in)); +} + static int mlx5_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta) { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_timer *timer = &clock->timer; struct mlx5_core_dev *mdev; unsigned long flags; int neg_adj = 0; u32 diff; u64 adj; + int err; + mdev = container_of(clock, struct mlx5_core_dev, clock); + err = mlx5_ptp_adjfreq_real_time(mdev, delta); + if (err) + return err; if (delta < 0) { neg_adj = 1; delta = -delta; } - adj = clock->nominal_c_mult; + adj = timer->nominal_c_mult; adj *= delta; diff = div_u64(adj, 1000000000ULL); - mdev = container_of(clock, struct mlx5_core_dev, clock); write_seqlock_irqsave(&clock->lock, flags); - timecounter_read(&clock->tc); - clock->cycles.mult = neg_adj ? clock->nominal_c_mult - diff : - clock->nominal_c_mult + diff; + timecounter_read(&timer->tc); + timer->cycles.mult = neg_adj ? 
timer->nominal_c_mult - diff : + timer->nominal_c_mult + diff; mlx5_update_clock_info_page(mdev); write_sequnlock_irqrestore(&clock->lock, flags); @@ -305,6 +427,45 @@ static int mlx5_extts_configure(struct ptp_clock_info *ptp, MLX5_EVENT_MODE_REPETETIVE & on); } +static u64 find_target_cycles(struct mlx5_core_dev *mdev, s64 target_ns) +{ + struct mlx5_clock *clock = &mdev->clock; + u64 cycles_now, cycles_delta; + u64 nsec_now, nsec_delta; + struct mlx5_timer *timer; + unsigned long flags; + + timer = &clock->timer; + + cycles_now = mlx5_read_time(mdev, NULL, false); + write_seqlock_irqsave(&clock->lock, flags); + nsec_now = timecounter_cyc2time(&timer->tc, cycles_now); + nsec_delta = target_ns - nsec_now; + cycles_delta = div64_u64(nsec_delta << timer->cycles.shift, + timer->cycles.mult); + write_sequnlock_irqrestore(&clock->lock, flags); + + return cycles_now + cycles_delta; +} + +static u64 perout_conf_internal_timer(struct mlx5_core_dev *mdev, + s64 sec, u32 nsec) +{ + struct timespec64 ts; + s64 target_ns; + + ts.tv_sec = sec; + ts.tv_nsec = nsec; + target_ns = timespec64_to_ns(&ts); + + return find_target_cycles(mdev, target_ns); +} + +static u64 perout_conf_real_time(s64 sec, u32 nsec) +{ + return (u64)nsec | (u64)sec << 32; +} + static int mlx5_perout_configure(struct ptp_clock_info *ptp, struct ptp_clock_request *rq, int on) @@ -314,11 +475,9 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, clock); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; - u64 nsec_now, nsec_delta, time_stamp = 0; - u64 cycles_now, cycles_delta; struct timespec64 ts; - unsigned long flags; u32 field_select = 0; + u64 time_stamp = 0; u8 pin_mode = 0; u8 pattern = 0; int pin = -1; @@ -335,12 +494,16 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, if (rq->perout.index >= clock->ptp_info.n_pins) return -EINVAL; - pin = ptp_find_pin(clock->ptp, PTP_PF_PEROUT, - rq->perout.index); - if (pin < 0) - return -EBUSY; - + field_select = MLX5_MTPPS_FS_ENABLE; if (on) { + bool rt_mode = mlx5_real_time_mode(mdev); + u32 nsec; + s64 sec; + + pin = ptp_find_pin(clock->ptp, PTP_PF_PEROUT, rq->perout.index); + if (pin < 0) + return -EBUSY; + pin_mode = MLX5_PIN_MODE_OUT; pattern = MLX5_OUT_PATTERN_PERIODIC; ts.tv_sec = rq->perout.period.sec; @@ -350,23 +513,18 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, if ((ns >> 1) != 500000000LL) return -EINVAL; - ts.tv_sec = rq->perout.start.sec; - ts.tv_nsec = rq->perout.start.nsec; - ns = timespec64_to_ns(&ts); - cycles_now = mlx5_read_internal_timer(mdev, NULL); - write_seqlock_irqsave(&clock->lock, flags); - nsec_now = timecounter_cyc2time(&clock->tc, cycles_now); - nsec_delta = ns - nsec_now; - cycles_delta = div64_u64(nsec_delta << clock->cycles.shift, - clock->cycles.mult); - write_sequnlock_irqrestore(&clock->lock, flags); - time_stamp = cycles_now + cycles_delta; - field_select = MLX5_MTPPS_FS_PIN_MODE | - MLX5_MTPPS_FS_PATTERN | - MLX5_MTPPS_FS_ENABLE | - MLX5_MTPPS_FS_TIME_STAMP; - } else { - field_select = MLX5_MTPPS_FS_ENABLE; + nsec = rq->perout.start.nsec; + sec = rq->perout.start.sec; + + if (rt_mode && sec > U32_MAX) + return -EINVAL; + + time_stamp = rt_mode ? 
perout_conf_real_time(sec, nsec) : + perout_conf_internal_timer(mdev, sec, nsec); + + field_select |= MLX5_MTPPS_FS_PIN_MODE | + MLX5_MTPPS_FS_PATTERN | + MLX5_MTPPS_FS_TIME_STAMP; } MLX5_SET(mtpps_reg, in, pin, pin); @@ -537,25 +695,50 @@ static void mlx5_get_pps_caps(struct mlx5_core_dev *mdev) clock->pps_info.pin_caps[7] = MLX5_GET(mtpps_reg, out, cap_pin_7_mode); } +static void ts_next_sec(struct timespec64 *ts) +{ + ts->tv_sec += 1; + ts->tv_nsec = 0; +} + +static u64 perout_conf_next_event_timer(struct mlx5_core_dev *mdev, + struct mlx5_clock *clock) +{ + bool rt_mode = mlx5_real_time_mode(mdev); + struct timespec64 ts; + s64 target_ns; + + if (rt_mode) + ts = mlx5_ptp_gettimex_real_time(mdev, NULL); + else + mlx5_ptp_gettimex(&clock->ptp_info, &ts, NULL); + + ts_next_sec(&ts); + target_ns = timespec64_to_ns(&ts); + + return rt_mode ? perout_conf_real_time(ts.tv_sec, ts.tv_nsec) : + find_target_cycles(mdev, target_ns); +} + static int mlx5_pps_event(struct notifier_block *nb, unsigned long type, void *data) { struct mlx5_clock *clock = mlx5_nb_cof(nb, struct mlx5_clock, pps_nb); struct ptp_clock_event ptp_event; - u64 cycles_now, cycles_delta; - u64 nsec_now, nsec_delta, ns; struct mlx5_eqe *eqe = data; int pin = eqe->data.pps.pin; struct mlx5_core_dev *mdev; - struct timespec64 ts; unsigned long flags; + u64 ns; mdev = container_of(clock, struct mlx5_core_dev, clock); switch (clock->ptp_info.pin_config[pin].func) { case PTP_PF_EXTTS: ptp_event.index = pin; - ptp_event.timestamp = + ptp_event.timestamp = mlx5_real_time_mode(mdev) ? + mlx5_real_time_cyc2time(clock, + be64_to_cpu(eqe->data.pps.time_stamp)) : mlx5_timecounter_cyc2time(clock, be64_to_cpu(eqe->data.pps.time_stamp)); if (clock->pps_info.enabled) { @@ -569,17 +752,9 @@ static int mlx5_pps_event(struct notifier_block *nb, ptp_clock_event(clock->ptp, &ptp_event); break; case PTP_PF_PEROUT: - mlx5_ptp_gettimex(&clock->ptp_info, &ts, NULL); - cycles_now = mlx5_read_internal_timer(mdev, NULL); - ts.tv_sec += 1; - ts.tv_nsec = 0; - ns = timespec64_to_ns(&ts); + ns = perout_conf_next_event_timer(mdev, clock); write_seqlock_irqsave(&clock->lock, flags); - nsec_now = timecounter_cyc2time(&clock->tc, cycles_now); - nsec_delta = ns - nsec_now; - cycles_delta = div64_u64(nsec_delta << clock->cycles.shift, - clock->cycles.mult); - clock->pps_info.start[pin] = cycles_now + cycles_delta; + clock->pps_info.start[pin] = ns; write_sequnlock_irqrestore(&clock->lock, flags); schedule_work(&clock->pps_info.out_work); break; @@ -591,29 +766,32 @@ static int mlx5_pps_event(struct notifier_block *nb, return NOTIFY_OK; } -void mlx5_init_clock(struct mlx5_core_dev *mdev) +static void mlx5_timecounter_init(struct mlx5_core_dev *mdev) { struct mlx5_clock *clock = &mdev->clock; - u64 overflow_cycles; - u64 ns; - u64 frac = 0; + struct mlx5_timer *timer = &clock->timer; u32 dev_freq; dev_freq = MLX5_CAP_GEN(mdev, device_frequency_khz); - if (!dev_freq) { - mlx5_core_warn(mdev, "invalid device_frequency_khz, aborting HW clock init\n"); - return; - } - seqlock_init(&clock->lock); - clock->cycles.read = read_internal_timer; - clock->cycles.shift = MLX5_CYCLES_SHIFT; - clock->cycles.mult = clocksource_khz2mult(dev_freq, - clock->cycles.shift); - clock->nominal_c_mult = clock->cycles.mult; - clock->cycles.mask = CLOCKSOURCE_MASK(41); - - timecounter_init(&clock->tc, &clock->cycles, + timer->cycles.read = read_internal_timer; + timer->cycles.shift = MLX5_CYCLES_SHIFT; + timer->cycles.mult = clocksource_khz2mult(dev_freq, + timer->cycles.shift); + 
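
A minimal userspace sketch of the 64-bit real-time timestamp format used by perout_conf_real_time() and REAL_TIME_TO_NS() in this series — seconds in the upper 32 bits, nanoseconds in the lower 32. This is not part of the patch; NSEC_PER_SEC is assumed to have its usual value of 1000000000.

    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000ULL

    /* pack as in perout_conf_real_time(): sec in bits 63:32, nsec in 31:0 */
    static uint64_t rt_pack(uint64_t sec, uint32_t nsec)
    {
        return (uint64_t)nsec | (uint64_t)sec << 32;
    }

    /* decode as in REAL_TIME_TO_NS(ts >> 32, ts & 0xffffffff) */
    static uint64_t rt_to_ns(uint64_t ts)
    {
        return (ts >> 32) * NSEC_PER_SEC + (ts & 0xffffffff);
    }

    int main(void)
    {
        uint64_t ts = rt_pack(5, 250000000);
        printf("%llu\n", (unsigned long long)rt_to_ns(ts)); /* 5250000000 */
        return 0;
    }
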
timer->nominal_c_mult = timer->cycles.mult; + timer->cycles.mask = CLOCKSOURCE_MASK(41); + + timecounter_init(&timer->tc, &timer->cycles, ktime_to_ns(ktime_get_real())); +} + +static void mlx5_init_overflow_period(struct mlx5_clock *clock) +{ + struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, clock); + struct mlx5_ib_clock_info *clock_info = mdev->clock_info; + struct mlx5_timer *timer = &clock->timer; + u64 overflow_cycles; + u64 frac = 0; + u64 ns; /* Calculate period in seconds to call the overflow watchdog - to make * sure counter is checked at least twice every wrap around. @@ -622,32 +800,77 @@ void mlx5_init_clock(struct mlx5_core_dev *mdev) * multiplied by clock multiplier where the result doesn't exceed * 64bits. */ - overflow_cycles = div64_u64(~0ULL >> 1, clock->cycles.mult); - overflow_cycles = min(overflow_cycles, div_u64(clock->cycles.mask, 3)); + overflow_cycles = div64_u64(~0ULL >> 1, timer->cycles.mult); + overflow_cycles = min(overflow_cycles, div_u64(timer->cycles.mask, 3)); - ns = cyclecounter_cyc2ns(&clock->cycles, overflow_cycles, + ns = cyclecounter_cyc2ns(&timer->cycles, overflow_cycles, frac, &frac); do_div(ns, NSEC_PER_SEC / HZ); - clock->overflow_period = ns; + timer->overflow_period = ns; - mdev->clock_info = - (struct mlx5_ib_clock_info *)get_zeroed_page(GFP_KERNEL); - if (mdev->clock_info) { - mdev->clock_info->nsec = clock->tc.nsec; - mdev->clock_info->cycles = clock->tc.cycle_last; - mdev->clock_info->mask = clock->cycles.mask; - mdev->clock_info->mult = clock->nominal_c_mult; - mdev->clock_info->shift = clock->cycles.shift; - mdev->clock_info->frac = clock->tc.frac; - mdev->clock_info->overflow_period = clock->overflow_period; + INIT_DELAYED_WORK(&timer->overflow_work, mlx5_timestamp_overflow); + if (timer->overflow_period) + schedule_delayed_work(&timer->overflow_work, 0); + else + mlx5_core_warn(mdev, + "invalid overflow period, overflow_work is not scheduled\n"); + + if (clock_info) + clock_info->overflow_period = timer->overflow_period; +} + +static void mlx5_init_clock_info(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = &mdev->clock; + struct mlx5_ib_clock_info *info; + struct mlx5_timer *timer; + + mdev->clock_info = (struct mlx5_ib_clock_info *)get_zeroed_page(GFP_KERNEL); + if (!mdev->clock_info) { + mlx5_core_warn(mdev, "Failed to allocate IB clock info page\n"); + return; + } + + info = mdev->clock_info; + timer = &clock->timer; + + info->nsec = timer->tc.nsec; + info->cycles = timer->tc.cycle_last; + info->mask = timer->cycles.mask; + info->mult = timer->nominal_c_mult; + info->shift = timer->cycles.shift; + info->frac = timer->tc.frac; +} + +static void mlx5_init_timer_clock(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = &mdev->clock; + + mlx5_timecounter_init(mdev); + mlx5_init_clock_info(mdev); + mlx5_init_overflow_period(clock); + clock->ptp_info = mlx5_ptp_clock_info; + + if (mlx5_real_time_mode(mdev)) { + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + mlx5_ptp_settime(&clock->ptp_info, &ts); + } +} + +void mlx5_init_clock(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = &mdev->clock; + + if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) { + mlx5_core_warn(mdev, "invalid device_frequency_khz, aborting HW clock init\n"); + return; } + seqlock_init(&clock->lock); + mlx5_init_timer_clock(mdev); INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out); - INIT_DELAYED_WORK(&clock->overflow_work, mlx5_timestamp_overflow); - if (clock->overflow_period) - 
schedule_delayed_work(&clock->overflow_work, 0); - else - mlx5_core_warn(mdev, "invalid overflow period, overflow_work is not scheduled\n"); /* Configure the PHC */ clock->ptp_info = mlx5_ptp_clock_info; @@ -684,7 +907,7 @@ void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) } cancel_work_sync(&clock->pps_info.out_work); - cancel_delayed_work_sync(&clock->overflow_work); + cancel_delayed_work_sync(&clock->timer.overflow_work); if (mdev->clock_info) { free_page((unsigned long)mdev->clock_info); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h index 31600924bdc3..a12c7da618a7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h @@ -33,6 +33,24 @@ #ifndef __LIB_CLOCK_H__ #define __LIB_CLOCK_H__ +static inline bool mlx5_is_real_time_rq(struct mlx5_core_dev *mdev) +{ + u8 rq_ts_format_cap = MLX5_CAP_GEN(mdev, rq_ts_format); + + return (rq_ts_format_cap == MLX5_RQ_TIMESTAMP_FORMAT_CAP_REAL_TIME || + rq_ts_format_cap == MLX5_RQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME); +} + +static inline bool mlx5_is_real_time_sq(struct mlx5_core_dev *mdev) +{ + u8 sq_ts_format_cap = MLX5_CAP_GEN(mdev, sq_ts_format); + + return (sq_ts_format_cap == MLX5_SQ_TIMESTAMP_FORMAT_CAP_REAL_TIME || + sq_ts_format_cap == MLX5_SQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME); +} + +typedef ktime_t (*cqe_ts_to_ns)(struct mlx5_clock *, u64); + #if IS_ENABLED(CONFIG_PTP_1588_CLOCK) void mlx5_init_clock(struct mlx5_core_dev *mdev); void mlx5_cleanup_clock(struct mlx5_core_dev *mdev); @@ -45,17 +63,27 @@ static inline int mlx5_clock_get_ptp_index(struct mlx5_core_dev *mdev) static inline ktime_t mlx5_timecounter_cyc2time(struct mlx5_clock *clock, u64 timestamp) { + struct mlx5_timer *timer = &clock->timer; unsigned int seq; u64 nsec; do { seq = read_seqbegin(&clock->lock); - nsec = timecounter_cyc2time(&clock->tc, timestamp); + nsec = timecounter_cyc2time(&timer->tc, timestamp); } while (read_seqretry(&clock->lock, seq)); return ns_to_ktime(nsec); } +#define REAL_TIME_TO_NS(hi, low) (((u64)hi) * NSEC_PER_SEC + ((u64)low)) + +static inline ktime_t mlx5_real_time_cyc2time(struct mlx5_clock *clock, + u64 timestamp) +{ + u64 time = REAL_TIME_TO_NS(timestamp >> 32, timestamp & 0xFFFFFFFF); + + return ns_to_ktime(time); +} #else static inline void mlx5_init_clock(struct mlx5_core_dev *mdev) {} static inline void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) {} @@ -69,6 +97,12 @@ static inline ktime_t mlx5_timecounter_cyc2time(struct mlx5_clock *clock, { return 0; } + +static inline ktime_t mlx5_real_time_cyc2time(struct mlx5_clock *clock, + u64 timestamp) +{ + return 0; +} #endif #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index e4c9627485aa..2f2c352f301e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1445,7 +1445,8 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id) dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err); pci_save_state(pdev); - devlink_reload_enable(devlink); + if (!mlx5_core_is_mp_slave(dev)) + devlink_reload_enable(devlink); return 0; err_load_one: diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 51359ce650bd..dbdfabff3b00 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ 
b/drivers/net/ethernet/microchip/lan743x_main.c @@ -1926,15 +1926,6 @@ static int lan743x_rx_next_index(struct lan743x_rx *rx, int index) return ((++index) % rx->ring_size); } -static struct sk_buff *lan743x_rx_allocate_skb(struct lan743x_rx *rx) -{ - int length = 0; - - length = (LAN743X_MAX_FRAME_SIZE + ETH_HLEN + 4 + RX_HEAD_PADDING); - return __netdev_alloc_skb(rx->adapter->netdev, - length, GFP_ATOMIC | GFP_DMA); -} - static void lan743x_rx_update_tail(struct lan743x_rx *rx, int index) { /* update the tail once per 8 descriptors */ @@ -1943,36 +1934,57 @@ static void lan743x_rx_update_tail(struct lan743x_rx *rx, int index) index); } -static int lan743x_rx_init_ring_element(struct lan743x_rx *rx, int index, - struct sk_buff *skb) +static int lan743x_rx_init_ring_element(struct lan743x_rx *rx, int index) { + struct net_device *netdev = rx->adapter->netdev; + struct device *dev = &rx->adapter->pdev->dev; struct lan743x_rx_buffer_info *buffer_info; + unsigned int buffer_length, used_length; struct lan743x_rx_descriptor *descriptor; - int length = 0; + struct sk_buff *skb; + dma_addr_t dma_ptr; + + buffer_length = netdev->mtu + ETH_HLEN + 4 + RX_HEAD_PADDING; - length = (LAN743X_MAX_FRAME_SIZE + ETH_HLEN + 4 + RX_HEAD_PADDING); descriptor = &rx->ring_cpu_ptr[index]; buffer_info = &rx->buffer_info[index]; - buffer_info->skb = skb; - if (!(buffer_info->skb)) + skb = __netdev_alloc_skb(netdev, buffer_length, GFP_ATOMIC | GFP_DMA); + if (!skb) return -ENOMEM; - buffer_info->dma_ptr = dma_map_single(&rx->adapter->pdev->dev, - buffer_info->skb->data, - length, - DMA_FROM_DEVICE); - if (dma_mapping_error(&rx->adapter->pdev->dev, - buffer_info->dma_ptr)) { - buffer_info->dma_ptr = 0; + dma_ptr = dma_map_single(dev, skb->data, buffer_length, DMA_FROM_DEVICE); + if (dma_mapping_error(dev, dma_ptr)) { + dev_kfree_skb_any(skb); return -ENOMEM; } + if (buffer_info->dma_ptr) { + /* sync used area of buffer only */ + if (le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_LS_) + /* frame length is valid only if LS bit is set. + * it's a safe upper bound for the used area in this + * buffer. 
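
The lan743x hunk above sizes RX buffers from the current MTU and, when recycling an already-mapped buffer, syncs only the region the hardware can actually have written. A standalone sketch of that arithmetic follows; ETH_HLEN is 14, while the RX_HEAD_PADDING value here is an assumption for illustration, the driver defines its own.

    #include <stddef.h>
    #include <stdint.h>

    #define ETH_HLEN        14
    #define RX_HEAD_PADDING 2   /* assumed; see the driver's definition */

    /* matches: netdev->mtu + ETH_HLEN + 4 (FCS) + RX_HEAD_PADDING */
    static size_t rx_buffer_length(unsigned int mtu)
    {
        return (size_t)mtu + ETH_HLEN + 4 + RX_HEAD_PADDING;
    }

    /* the frame length in data0 is valid only when the LS bit is set;
     * otherwise the whole buffer must be treated as dirty */
    static size_t rx_used_length(uint32_t frame_length, int ls_set,
                                 size_t buffer_length)
    {
        if (!ls_set || (size_t)frame_length > buffer_length)
            return buffer_length;
        return frame_length;
    }
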
+ */ + used_length = min(RX_DESC_DATA0_FRAME_LENGTH_GET_ + (le32_to_cpu(descriptor->data0)), + buffer_info->buffer_length); + else + used_length = buffer_info->buffer_length; + dma_sync_single_for_cpu(dev, buffer_info->dma_ptr, + used_length, + DMA_FROM_DEVICE); + dma_unmap_single_attrs(dev, buffer_info->dma_ptr, + buffer_info->buffer_length, + DMA_FROM_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); + } - buffer_info->buffer_length = length; + buffer_info->skb = skb; + buffer_info->dma_ptr = dma_ptr; + buffer_info->buffer_length = buffer_length; descriptor->data1 = cpu_to_le32(DMA_ADDR_LOW32(buffer_info->dma_ptr)); descriptor->data2 = cpu_to_le32(DMA_ADDR_HIGH32(buffer_info->dma_ptr)); descriptor->data3 = 0; descriptor->data0 = cpu_to_le32((RX_DESC_DATA0_OWN_ | - (length & RX_DESC_DATA0_BUF_LENGTH_MASK_))); - skb_reserve(buffer_info->skb, RX_HEAD_PADDING); + (buffer_length & RX_DESC_DATA0_BUF_LENGTH_MASK_))); lan743x_rx_update_tail(rx, index); return 0; @@ -2021,16 +2033,32 @@ static void lan743x_rx_release_ring_element(struct lan743x_rx *rx, int index) memset(buffer_info, 0, sizeof(*buffer_info)); } -static int lan743x_rx_process_packet(struct lan743x_rx *rx) +static struct sk_buff * +lan743x_rx_trim_skb(struct sk_buff *skb, int frame_length) +{ + if (skb_linearize(skb)) { + dev_kfree_skb_irq(skb); + return NULL; + } + frame_length = max_t(int, 0, frame_length - RX_HEAD_PADDING - 2); + if (skb->len > frame_length) { + skb->tail -= skb->len - frame_length; + skb->len = frame_length; + } + return skb; +} + +static int lan743x_rx_process_buffer(struct lan743x_rx *rx) { - struct skb_shared_hwtstamps *hwtstamps = NULL; - int result = RX_PROCESS_RESULT_NOTHING_TO_DO; int current_head_index = le32_to_cpu(*rx->head_cpu_ptr); + struct lan743x_rx_descriptor *descriptor, *desc_ext; + struct net_device *netdev = rx->adapter->netdev; + int result = RX_PROCESS_RESULT_NOTHING_TO_DO; struct lan743x_rx_buffer_info *buffer_info; - struct lan743x_rx_descriptor *descriptor; + int frame_length, buffer_length; int extension_index = -1; - int first_index = -1; - int last_index = -1; + bool is_last, is_first; + struct sk_buff *skb; if (current_head_index < 0 || current_head_index >= rx->ring_size) goto done; @@ -2038,163 +2066,120 @@ static int lan743x_rx_process_packet(struct lan743x_rx *rx) if (rx->last_head < 0 || rx->last_head >= rx->ring_size) goto done; - if (rx->last_head != current_head_index) { - descriptor = &rx->ring_cpu_ptr[rx->last_head]; - if (le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_OWN_) - goto done; + if (rx->last_head == current_head_index) + goto done; - if (!(le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_FS_)) - goto done; + descriptor = &rx->ring_cpu_ptr[rx->last_head]; + if (le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_OWN_) + goto done; + buffer_info = &rx->buffer_info[rx->last_head]; - first_index = rx->last_head; - if (le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_LS_) { - last_index = rx->last_head; - } else { - int index; - - index = lan743x_rx_next_index(rx, first_index); - while (index != current_head_index) { - descriptor = &rx->ring_cpu_ptr[index]; - if (le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_OWN_) - goto done; - - if (le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_LS_) { - last_index = index; - break; - } - index = lan743x_rx_next_index(rx, index); - } - } - if (last_index >= 0) { - descriptor = &rx->ring_cpu_ptr[last_index]; - if (le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_EXT_) { - /* extension is expected to follow */ - int index = lan743x_rx_next_index(rx, - 
last_index); - if (index != current_head_index) { - descriptor = &rx->ring_cpu_ptr[index]; - if (le32_to_cpu(descriptor->data0) & - RX_DESC_DATA0_OWN_) { - goto done; - } - if (le32_to_cpu(descriptor->data0) & - RX_DESC_DATA0_EXT_) { - extension_index = index; - } else { - goto done; - } - } else { - /* extension is not yet available */ - /* prevent processing of this packet */ - first_index = -1; - last_index = -1; - } - } - } - } - if (first_index >= 0 && last_index >= 0) { - int real_last_index = last_index; - struct sk_buff *skb = NULL; - u32 ts_sec = 0; - u32 ts_nsec = 0; - - /* packet is available */ - if (first_index == last_index) { - /* single buffer packet */ - struct sk_buff *new_skb = NULL; - int packet_length; - - new_skb = lan743x_rx_allocate_skb(rx); - if (!new_skb) { - /* failed to allocate next skb. - * Memory is very low. - * Drop this packet and reuse buffer. - */ - lan743x_rx_reuse_ring_element(rx, first_index); - goto process_extension; - } + is_last = le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_LS_; + is_first = le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_FS_; - buffer_info = &rx->buffer_info[first_index]; - skb = buffer_info->skb; - descriptor = &rx->ring_cpu_ptr[first_index]; - - /* unmap from dma */ - if (buffer_info->dma_ptr) { - dma_unmap_single(&rx->adapter->pdev->dev, - buffer_info->dma_ptr, - buffer_info->buffer_length, - DMA_FROM_DEVICE); - buffer_info->dma_ptr = 0; - buffer_info->buffer_length = 0; - } - buffer_info->skb = NULL; - packet_length = RX_DESC_DATA0_FRAME_LENGTH_GET_ - (le32_to_cpu(descriptor->data0)); - skb_put(skb, packet_length - 4); - skb->protocol = eth_type_trans(skb, - rx->adapter->netdev); - lan743x_rx_init_ring_element(rx, first_index, new_skb); - } else { - int index = first_index; + if (is_last && le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_EXT_) { + /* extension is expected to follow */ + int index = lan743x_rx_next_index(rx, rx->last_head); - /* multi buffer packet not supported */ - /* this should not happen since - * buffers are allocated to be at least jumbo size - */ + if (index == current_head_index) + /* extension not yet available */ + goto done; + desc_ext = &rx->ring_cpu_ptr[index]; + if (le32_to_cpu(desc_ext->data0) & RX_DESC_DATA0_OWN_) + /* extension not yet available */ + goto done; + if (!(le32_to_cpu(desc_ext->data0) & RX_DESC_DATA0_EXT_)) + goto move_forward; + extension_index = index; + } - /* clean up buffers */ - if (first_index <= last_index) { - while ((index >= first_index) && - (index <= last_index)) { - lan743x_rx_reuse_ring_element(rx, - index); - index = lan743x_rx_next_index(rx, - index); - } - } else { - while ((index >= first_index) || - (index <= last_index)) { - lan743x_rx_reuse_ring_element(rx, - index); - index = lan743x_rx_next_index(rx, - index); - } - } - } + /* Only the last buffer in a multi-buffer frame contains the total frame + * length. The chip occasionally sends more buffers than strictly + * required to reach the total frame length. + * Handle this by adding all buffers to the skb in their entirety. + * Once the real frame length is known, trim the skb. + */ + frame_length = + RX_DESC_DATA0_FRAME_LENGTH_GET_(le32_to_cpu(descriptor->data0)); + buffer_length = buffer_info->buffer_length; + + netdev_dbg(netdev, "%s%schunk: %d/%d", + is_first ? "first " : " ", + is_last ? 
"last " : " ", + frame_length, buffer_length); + + /* save existing skb, allocate new skb and map to dma */ + skb = buffer_info->skb; + if (lan743x_rx_init_ring_element(rx, rx->last_head)) { + /* failed to allocate next skb. + * Memory is very low. + * Drop this packet and reuse buffer. + */ + lan743x_rx_reuse_ring_element(rx, rx->last_head); + /* drop packet that was being assembled */ + dev_kfree_skb_irq(rx->skb_head); + rx->skb_head = NULL; + goto process_extension; + } + + /* add buffers to skb via skb->frag_list */ + if (is_first) { + skb_reserve(skb, RX_HEAD_PADDING); + skb_put(skb, buffer_length - RX_HEAD_PADDING); + if (rx->skb_head) + dev_kfree_skb_irq(rx->skb_head); + rx->skb_head = skb; + } else if (rx->skb_head) { + skb_put(skb, buffer_length); + if (skb_shinfo(rx->skb_head)->frag_list) + rx->skb_tail->next = skb; + else + skb_shinfo(rx->skb_head)->frag_list = skb; + rx->skb_tail = skb; + rx->skb_head->len += skb->len; + rx->skb_head->data_len += skb->len; + rx->skb_head->truesize += skb->truesize; + } else { + /* packet to assemble has already been dropped because one or + * more of its buffers could not be allocated + */ + netdev_dbg(netdev, "drop buffer intended for dropped packet"); + dev_kfree_skb_irq(skb); + } process_extension: - if (extension_index >= 0) { - descriptor = &rx->ring_cpu_ptr[extension_index]; - buffer_info = &rx->buffer_info[extension_index]; - - ts_sec = le32_to_cpu(descriptor->data1); - ts_nsec = (le32_to_cpu(descriptor->data2) & - RX_DESC_DATA2_TS_NS_MASK_); - lan743x_rx_reuse_ring_element(rx, extension_index); - real_last_index = extension_index; - } + if (extension_index >= 0) { + u32 ts_sec; + u32 ts_nsec; - if (!skb) { - result = RX_PROCESS_RESULT_PACKET_DROPPED; - goto move_forward; - } + ts_sec = le32_to_cpu(desc_ext->data1); + ts_nsec = (le32_to_cpu(desc_ext->data2) & + RX_DESC_DATA2_TS_NS_MASK_); + if (rx->skb_head) + skb_hwtstamps(rx->skb_head)->hwtstamp = + ktime_set(ts_sec, ts_nsec); + lan743x_rx_reuse_ring_element(rx, extension_index); + rx->last_head = extension_index; + netdev_dbg(netdev, "process extension"); + } - if (extension_index < 0) - goto pass_packet_to_os; - hwtstamps = skb_hwtstamps(skb); - if (hwtstamps) - hwtstamps->hwtstamp = ktime_set(ts_sec, ts_nsec); + if (is_last && rx->skb_head) + rx->skb_head = lan743x_rx_trim_skb(rx->skb_head, frame_length); -pass_packet_to_os: - /* pass packet to OS */ - napi_gro_receive(&rx->napi, skb); - result = RX_PROCESS_RESULT_PACKET_RECEIVED; + if (is_last && rx->skb_head) { + rx->skb_head->protocol = eth_type_trans(rx->skb_head, + rx->adapter->netdev); + netdev_dbg(netdev, "sending %d byte frame to OS", + rx->skb_head->len); + napi_gro_receive(&rx->napi, rx->skb_head); + rx->skb_head = NULL; + } move_forward: - /* push tail and head forward */ - rx->last_tail = real_last_index; - rx->last_head = lan743x_rx_next_index(rx, real_last_index); - } + /* push tail and head forward */ + rx->last_tail = rx->last_head; + rx->last_head = lan743x_rx_next_index(rx, rx->last_head); + result = RX_PROCESS_RESULT_BUFFER_RECEIVED; done: return result; } @@ -2213,12 +2198,12 @@ static int lan743x_rx_napi_poll(struct napi_struct *napi, int weight) DMAC_INT_BIT_RXFRM_(rx->channel_number)); } for (count = 0; count < weight; count++) { - result = lan743x_rx_process_packet(rx); + result = lan743x_rx_process_buffer(rx); if (result == RX_PROCESS_RESULT_NOTHING_TO_DO) break; } rx->frame_count += count; - if (count == weight || result == RX_PROCESS_RESULT_PACKET_RECEIVED) + if (count == weight || result == 
RX_PROCESS_RESULT_BUFFER_RECEIVED) return weight; if (!napi_complete_done(napi, count)) @@ -2330,9 +2315,7 @@ static int lan743x_rx_ring_init(struct lan743x_rx *rx) rx->last_head = 0; for (index = 0; index < rx->ring_size; index++) { - struct sk_buff *new_skb = lan743x_rx_allocate_skb(rx); - - ret = lan743x_rx_init_ring_element(rx, index, new_skb); + ret = lan743x_rx_init_ring_element(rx, index); if (ret) goto cleanup; } diff --git a/drivers/net/ethernet/microchip/lan743x_main.h b/drivers/net/ethernet/microchip/lan743x_main.h index f3f778910fcc..6080028c1df2 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.h +++ b/drivers/net/ethernet/microchip/lan743x_main.h @@ -699,6 +699,8 @@ struct lan743x_rx { struct napi_struct napi; u32 frame_count; + + struct sk_buff *skb_head, *skb_tail; }; struct lan743x_adapter { @@ -831,8 +833,7 @@ struct lan743x_rx_buffer_info { #define LAN743X_RX_RING_SIZE (65) #define RX_PROCESS_RESULT_NOTHING_TO_DO (0) -#define RX_PROCESS_RESULT_PACKET_RECEIVED (1) -#define RX_PROCESS_RESULT_PACKET_DROPPED (2) +#define RX_PROCESS_RESULT_BUFFER_RECEIVED (1) u32 lan743x_csr_read(struct lan743x_adapter *adapter, int offset); void lan743x_csr_write(struct lan743x_adapter *adapter, int offset, u32 data); diff --git a/drivers/net/ethernet/mscc/Kconfig b/drivers/net/ethernet/mscc/Kconfig index ee7bb7e24e8e..c0ede0ca7115 100644 --- a/drivers/net/ethernet/mscc/Kconfig +++ b/drivers/net/ethernet/mscc/Kconfig @@ -14,6 +14,7 @@ if NET_VENDOR_MICROSEMI # Users should depend on NET_SWITCHDEV, HAS_IOMEM config MSCC_OCELOT_SWITCH_LIB select REGMAP_MMIO + select PACKING select PHYLIB tristate help diff --git a/drivers/net/ethernet/mscc/Makefile b/drivers/net/ethernet/mscc/Makefile index 346bba2730ad..722c27694b21 100644 --- a/drivers/net/ethernet/mscc/Makefile +++ b/drivers/net/ethernet/mscc/Makefile @@ -8,6 +8,7 @@ mscc_ocelot_switch_lib-y := \ ocelot_flower.o \ ocelot_ptp.o \ ocelot_devlink.o +mscc_ocelot_switch_lib-$(CONFIG_BRIDGE_MRP) += ocelot_mrp.o obj-$(CONFIG_MSCC_OCELOT_SWITCH) += mscc_ocelot.o mscc_ocelot-y := \ ocelot_vsc7514.o \ diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index d1a9cdbf7a3e..46e5c9136bac 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -4,6 +4,7 @@ * * Copyright (c) 2017 Microsemi Corporation */ +#include <linux/dsa/ocelot.h> #include <linux/if_bridge.h> #include <soc/mscc/ocelot_vcap.h> #include "ocelot.h" @@ -628,6 +629,229 @@ void ocelot_get_txtstamp(struct ocelot *ocelot) } EXPORT_SYMBOL(ocelot_get_txtstamp); +static int ocelot_rx_frame_word(struct ocelot *ocelot, u8 grp, bool ifh, + u32 *rval) +{ + u32 bytes_valid, val; + + val = ocelot_read_rix(ocelot, QS_XTR_RD, grp); + if (val == XTR_NOT_READY) { + if (ifh) + return -EIO; + + do { + val = ocelot_read_rix(ocelot, QS_XTR_RD, grp); + } while (val == XTR_NOT_READY); + } + + switch (val) { + case XTR_ABORT: + return -EIO; + case XTR_EOF_0: + case XTR_EOF_1: + case XTR_EOF_2: + case XTR_EOF_3: + case XTR_PRUNED: + bytes_valid = XTR_VALID_BYTES(val); + val = ocelot_read_rix(ocelot, QS_XTR_RD, grp); + if (val == XTR_ESCAPE) + *rval = ocelot_read_rix(ocelot, QS_XTR_RD, grp); + else + *rval = val; + + return bytes_valid; + case XTR_ESCAPE: + *rval = ocelot_read_rix(ocelot, QS_XTR_RD, grp); + + return 4; + default: + *rval = val; + + return 4; + } +} + +static int ocelot_xtr_poll_xfh(struct ocelot *ocelot, int grp, u32 *xfh) +{ + int i, err = 0; + + for (i = 0; i < OCELOT_TAG_LEN / 4; i++) { + err = 
ocelot_rx_frame_word(ocelot, grp, true, &xfh[i]); + if (err != 4) + return (err < 0) ? err : -EIO; + } + + return 0; +} + +int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb) +{ + struct skb_shared_hwtstamps *shhwtstamps; + u64 tod_in_ns, full_ts_in_ns, cpuq; + u64 timestamp, src_port, len; + u32 xfh[OCELOT_TAG_LEN / 4]; + struct net_device *dev; + struct timespec64 ts; + struct sk_buff *skb; + int sz, buf_len; + u32 val, *buf; + int err; + + err = ocelot_xtr_poll_xfh(ocelot, grp, xfh); + if (err) + return err; + + ocelot_xfh_get_src_port(xfh, &src_port); + ocelot_xfh_get_len(xfh, &len); + ocelot_xfh_get_rew_val(xfh, ×tamp); + ocelot_xfh_get_cpuq(xfh, &cpuq); + + if (WARN_ON(src_port >= ocelot->num_phys_ports)) + return -EINVAL; + + dev = ocelot->ops->port_to_netdev(ocelot, src_port); + if (!dev) + return -EINVAL; + + skb = netdev_alloc_skb(dev, len); + if (unlikely(!skb)) { + netdev_err(dev, "Unable to allocate sk_buff\n"); + return -ENOMEM; + } + + buf_len = len - ETH_FCS_LEN; + buf = (u32 *)skb_put(skb, buf_len); + + len = 0; + do { + sz = ocelot_rx_frame_word(ocelot, grp, false, &val); + if (sz < 0) { + err = sz; + goto out_free_skb; + } + *buf++ = val; + len += sz; + } while (len < buf_len); + + /* Read the FCS */ + sz = ocelot_rx_frame_word(ocelot, grp, false, &val); + if (sz < 0) { + err = sz; + goto out_free_skb; + } + + /* Update the statistics if part of the FCS was read before */ + len -= ETH_FCS_LEN - sz; + + if (unlikely(dev->features & NETIF_F_RXFCS)) { + buf = (u32 *)skb_put(skb, ETH_FCS_LEN); + *buf = val; + } + + if (ocelot->ptp) { + ocelot_ptp_gettime64(&ocelot->ptp_info, &ts); + + tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec); + if ((tod_in_ns & 0xffffffff) < timestamp) + full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) | + timestamp; + else + full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) | + timestamp; + + shhwtstamps = skb_hwtstamps(skb); + memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps)); + shhwtstamps->hwtstamp = full_ts_in_ns; + } + + /* Everything we see on an interface that is in the HW bridge + * has already been forwarded. 
+ */ + if (ocelot->bridge_mask & BIT(src_port)) + skb->offload_fwd_mark = 1; + + skb->protocol = eth_type_trans(skb, dev); + +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + if (skb->protocol == cpu_to_be16(ETH_P_MRP) && + cpuq & BIT(OCELOT_MRP_CPUQ)) + skb->offload_fwd_mark = 0; +#endif + + *nskb = skb; + + return 0; + +out_free_skb: + kfree_skb(skb); + return err; +} +EXPORT_SYMBOL(ocelot_xtr_poll_frame); + +bool ocelot_can_inject(struct ocelot *ocelot, int grp) +{ + u32 val = ocelot_read(ocelot, QS_INJ_STATUS); + + if (!(val & QS_INJ_STATUS_FIFO_RDY(BIT(grp)))) + return false; + if (val & QS_INJ_STATUS_WMARK_REACHED(BIT(grp))) + return false; + + return true; +} +EXPORT_SYMBOL(ocelot_can_inject); + +void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, + u32 rew_op, struct sk_buff *skb) +{ + u32 ifh[OCELOT_TAG_LEN / 4] = {0}; + unsigned int i, count, last; + + ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) | + QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp); + + ocelot_ifh_set_bypass(ifh, 1); + ocelot_ifh_set_dest(ifh, BIT_ULL(port)); + ocelot_ifh_set_tag_type(ifh, IFH_TAG_TYPE_C); + ocelot_ifh_set_vid(ifh, skb_vlan_tag_get(skb)); + ocelot_ifh_set_rew_op(ifh, rew_op); + + for (i = 0; i < OCELOT_TAG_LEN / 4; i++) + ocelot_write_rix(ocelot, ifh[i], QS_INJ_WR, grp); + + count = DIV_ROUND_UP(skb->len, 4); + last = skb->len % 4; + for (i = 0; i < count; i++) + ocelot_write_rix(ocelot, ((u32 *)skb->data)[i], QS_INJ_WR, grp); + + /* Add padding */ + while (i < (OCELOT_BUFFER_CELL_SZ / 4)) { + ocelot_write_rix(ocelot, 0, QS_INJ_WR, grp); + i++; + } + + /* Indicate EOF and valid bytes in last word */ + ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) | + QS_INJ_CTRL_VLD_BYTES(skb->len < OCELOT_BUFFER_CELL_SZ ? 0 : last) | + QS_INJ_CTRL_EOF, + QS_INJ_CTRL, grp); + + /* Add dummy CRC */ + ocelot_write_rix(ocelot, 0, QS_INJ_WR, grp); + skb_tx_timestamp(skb); + + skb->dev->stats.tx_packets++; + skb->dev->stats.tx_bytes += skb->len; +} +EXPORT_SYMBOL(ocelot_port_inject_frame); + +void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp) +{ + while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)) + ocelot_read_rix(ocelot, QS_XTR_RD, grp); +} +EXPORT_SYMBOL(ocelot_drain_cpu_queue); + int ocelot_fdb_add(struct ocelot *ocelot, int port, const unsigned char *addr, u16 vid) { diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h index c485795c606b..db6b1a4c3926 100644 --- a/drivers/net/ethernet/mscc/ocelot.h +++ b/drivers/net/ethernet/mscc/ocelot.h @@ -32,15 +32,6 @@ #define OCELOT_PTP_QUEUE_SZ 128 -struct frame_info { - u32 len; - u16 port; - u16 vid; - u8 tag_type; - u16 rew_op; - u32 timestamp; /* rew_val */ -}; - struct ocelot_port_tc { bool block_shared; unsigned long offload_cnt; diff --git a/drivers/net/ethernet/mscc/ocelot_mrp.c b/drivers/net/ethernet/mscc/ocelot_mrp.c new file mode 100644 index 000000000000..683da320bfd8 --- /dev/null +++ b/drivers/net/ethernet/mscc/ocelot_mrp.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Microsemi Ocelot Switch driver + * + * This contains glue logic between the switchdev driver operations and the + * mscc_ocelot_switch_lib. 
+ * + * Copyright (c) 2017, 2019 Microsemi Corporation + * Copyright 2020-2021 NXP Semiconductors + */ + +#include <linux/if_bridge.h> +#include <linux/mrp_bridge.h> +#include <soc/mscc/ocelot_vcap.h> +#include <uapi/linux/mrp_bridge.h> +#include "ocelot.h" +#include "ocelot_vcap.h" + +static int ocelot_mrp_del_vcap(struct ocelot *ocelot, int port) +{ + struct ocelot_vcap_block *block_vcap_is2; + struct ocelot_vcap_filter *filter; + + block_vcap_is2 = &ocelot->block[VCAP_IS2]; + filter = ocelot_vcap_block_find_filter_by_id(block_vcap_is2, port, + false); + if (!filter) + return 0; + + return ocelot_vcap_filter_del(ocelot, filter); +} + +int ocelot_mrp_add(struct ocelot *ocelot, int port, + const struct switchdev_obj_mrp *mrp) +{ + struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct ocelot_port_private *priv; + struct net_device *dev; + + if (!ocelot_port) + return -EOPNOTSUPP; + + priv = container_of(ocelot_port, struct ocelot_port_private, port); + dev = priv->dev; + + if (mrp->p_port != dev && mrp->s_port != dev) + return 0; + + if (ocelot->mrp_ring_id != 0 && + ocelot->mrp_s_port && + ocelot->mrp_p_port) + return -EINVAL; + + if (mrp->p_port == dev) + ocelot->mrp_p_port = dev; + + if (mrp->s_port == dev) + ocelot->mrp_s_port = dev; + + ocelot->mrp_ring_id = mrp->ring_id; + + return 0; +} +EXPORT_SYMBOL(ocelot_mrp_add); + +int ocelot_mrp_del(struct ocelot *ocelot, int port, + const struct switchdev_obj_mrp *mrp) +{ + struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct ocelot_port_private *priv; + struct net_device *dev; + + if (!ocelot_port) + return -EOPNOTSUPP; + + priv = container_of(ocelot_port, struct ocelot_port_private, port); + dev = priv->dev; + + if (ocelot->mrp_p_port != dev && ocelot->mrp_s_port != dev) + return 0; + + if (ocelot->mrp_ring_id == 0 && + !ocelot->mrp_s_port && + !ocelot->mrp_p_port) + return -EINVAL; + + if (ocelot_mrp_del_vcap(ocelot, priv->chip_port)) + return -EINVAL; + + if (ocelot->mrp_p_port == dev) + ocelot->mrp_p_port = NULL; + + if (ocelot->mrp_s_port == dev) + ocelot->mrp_s_port = NULL; + + ocelot->mrp_ring_id = 0; + + return 0; +} +EXPORT_SYMBOL(ocelot_mrp_del); + +int ocelot_mrp_add_ring_role(struct ocelot *ocelot, int port, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct ocelot_vcap_filter *filter; + struct ocelot_port_private *priv; + struct net_device *dev; + int err; + + if (!ocelot_port) + return -EOPNOTSUPP; + + priv = container_of(ocelot_port, struct ocelot_port_private, port); + dev = priv->dev; + + if (ocelot->mrp_ring_id != mrp->ring_id) + return -EINVAL; + + if (!mrp->sw_backup) + return -EOPNOTSUPP; + + if (ocelot->mrp_p_port != dev && ocelot->mrp_s_port != dev) + return 0; + + filter = kzalloc(sizeof(*filter), GFP_ATOMIC); + if (!filter) + return -ENOMEM; + + filter->key_type = OCELOT_VCAP_KEY_ETYPE; + filter->prio = 1; + filter->id.cookie = priv->chip_port; + filter->id.tc_offload = false; + filter->block_id = VCAP_IS2; + filter->type = OCELOT_VCAP_FILTER_OFFLOAD; + filter->ingress_port_mask = BIT(priv->chip_port); + *(__be16 *)filter->key.etype.etype.value = htons(ETH_P_MRP); + *(__be16 *)filter->key.etype.etype.mask = htons(0xffff); + filter->action.mask_mode = OCELOT_MASK_MODE_PERMIT_DENY; + filter->action.port_mask = 0x0; + filter->action.cpu_copy_ena = true; + filter->action.cpu_qu_num = OCELOT_MRP_CPUQ; + + err = ocelot_vcap_filter_add(ocelot, filter, NULL); + if (err) + kfree(filter); + + return err; +} 
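
ocelot_xtr_poll_frame() above rebuilds a full 64-bit nanosecond timestamp from the 32 low bits carried in the extraction header: if the low word of the current PTP time is already smaller than the captured low word, the capture predates the last 32-bit rollover, so one is borrowed from the high word. The same logic as a pure function (a sketch, not part of the patch):

    #include <stdint.h>

    static uint64_t full_ts_in_ns(uint64_t tod_in_ns, uint32_t ts_lo)
    {
        if ((tod_in_ns & 0xffffffff) < ts_lo)   /* captured before rollover */
            return (((tod_in_ns >> 32) - 1) << 32) | ts_lo;
        return (tod_in_ns & 0xffffffff00000000ULL) | ts_lo;
    }
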
+EXPORT_SYMBOL(ocelot_mrp_add_ring_role); + +int ocelot_mrp_del_ring_role(struct ocelot *ocelot, int port, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct ocelot_port_private *priv; + struct net_device *dev; + + if (!ocelot_port) + return -EOPNOTSUPP; + + priv = container_of(ocelot_port, struct ocelot_port_private, port); + dev = priv->dev; + + if (ocelot->mrp_ring_id != mrp->ring_id) + return -EINVAL; + + if (!mrp->sw_backup) + return -EOPNOTSUPP; + + if (ocelot->mrp_p_port != dev && ocelot->mrp_s_port != dev) + return 0; + + return ocelot_mrp_del_vcap(ocelot, priv->chip_port); +} +EXPORT_SYMBOL(ocelot_mrp_del_ring_role); diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index b5ffe6724eb7..12cb6867a2d0 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -488,53 +488,20 @@ static int ocelot_port_stop(struct net_device *dev) return 0; } -/* Generate the IFH for frame injection - * - * The IFH is a 128bit-value - * bit 127: bypass the analyzer processing - * bit 56-67: destination mask - * bit 28-29: pop_cnt: 3 disables all rewriting of the frame - * bit 20-27: cpu extraction queue mask - * bit 16: tag type 0: C-tag, 1: S-tag - * bit 0-11: VID - */ -static int ocelot_gen_ifh(u32 *ifh, struct frame_info *info) -{ - ifh[0] = IFH_INJ_BYPASS | ((0x1ff & info->rew_op) << 21); - ifh[1] = (0xf00 & info->port) >> 8; - ifh[2] = (0xff & info->port) << 24; - ifh[3] = (info->tag_type << 16) | info->vid; - - return 0; -} - -static int ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) { struct ocelot_port_private *priv = netdev_priv(dev); - struct skb_shared_info *shinfo = skb_shinfo(skb); struct ocelot_port *ocelot_port = &priv->port; struct ocelot *ocelot = ocelot_port->ocelot; - u32 val, ifh[OCELOT_TAG_LEN / 4]; - struct frame_info info = {}; - u8 grp = 0; /* Send everything on CPU group 0 */ - unsigned int i, count, last; int port = priv->chip_port; + u32 rew_op = 0; - val = ocelot_read(ocelot, QS_INJ_STATUS); - if (!(val & QS_INJ_STATUS_FIFO_RDY(BIT(grp))) || - (val & QS_INJ_STATUS_WMARK_REACHED(BIT(grp)))) + if (!ocelot_can_inject(ocelot, 0)) return NETDEV_TX_BUSY; - ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) | - QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp); - - info.port = BIT(port); - info.tag_type = IFH_TAG_TYPE_C; - info.vid = skb_vlan_tag_get(skb); - /* Check if timestamping is needed */ - if (ocelot->ptp && (shinfo->tx_flags & SKBTX_HW_TSTAMP)) { - info.rew_op = ocelot_port->ptp_cmd; + if (ocelot->ptp && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { + rew_op = ocelot_port->ptp_cmd; if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { struct sk_buff *clone; @@ -547,45 +514,11 @@ static int ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) ocelot_port_add_txtstamp_skb(ocelot, port, clone); - info.rew_op |= clone->cb[0] << 3; + rew_op |= clone->cb[0] << 3; } } - if (ocelot->ptp && shinfo->tx_flags & SKBTX_HW_TSTAMP) { - info.rew_op = ocelot_port->ptp_cmd; - if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) - info.rew_op |= skb->cb[0] << 3; - } - - ocelot_gen_ifh(ifh, &info); - - for (i = 0; i < OCELOT_TAG_LEN / 4; i++) - ocelot_write_rix(ocelot, (__force u32)cpu_to_be32(ifh[i]), - QS_INJ_WR, grp); - - count = (skb->len + 3) / 4; - last = skb->len % 4; - for (i = 0; i < count; i++) - ocelot_write_rix(ocelot, ((u32 *)skb->data)[i], 
QS_INJ_WR, grp); - - /* Add padding */ - while (i < (OCELOT_BUFFER_CELL_SZ / 4)) { - ocelot_write_rix(ocelot, 0, QS_INJ_WR, grp); - i++; - } - - /* Indicate EOF and valid bytes in last word */ - ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) | - QS_INJ_CTRL_VLD_BYTES(skb->len < OCELOT_BUFFER_CELL_SZ ? 0 : last) | - QS_INJ_CTRL_EOF, - QS_INJ_CTRL, grp); - - /* Add dummy CRC */ - ocelot_write_rix(ocelot, 0, QS_INJ_WR, grp); - skb_tx_timestamp(skb); - - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; + ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); kfree_skb(skb); @@ -1077,6 +1010,52 @@ static int ocelot_port_obj_del_mdb(struct net_device *dev, return ocelot_port_mdb_del(ocelot, port, mdb); } +static int ocelot_port_obj_mrp_add(struct net_device *dev, + const struct switchdev_obj_mrp *mrp) +{ + struct ocelot_port_private *priv = netdev_priv(dev); + struct ocelot_port *ocelot_port = &priv->port; + struct ocelot *ocelot = ocelot_port->ocelot; + int port = priv->chip_port; + + return ocelot_mrp_add(ocelot, port, mrp); +} + +static int ocelot_port_obj_mrp_del(struct net_device *dev, + const struct switchdev_obj_mrp *mrp) +{ + struct ocelot_port_private *priv = netdev_priv(dev); + struct ocelot_port *ocelot_port = &priv->port; + struct ocelot *ocelot = ocelot_port->ocelot; + int port = priv->chip_port; + + return ocelot_mrp_del(ocelot, port, mrp); +} + +static int +ocelot_port_obj_mrp_add_ring_role(struct net_device *dev, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct ocelot_port_private *priv = netdev_priv(dev); + struct ocelot_port *ocelot_port = &priv->port; + struct ocelot *ocelot = ocelot_port->ocelot; + int port = priv->chip_port; + + return ocelot_mrp_add_ring_role(ocelot, port, mrp); +} + +static int +ocelot_port_obj_mrp_del_ring_role(struct net_device *dev, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct ocelot_port_private *priv = netdev_priv(dev); + struct ocelot_port *ocelot_port = &priv->port; + struct ocelot *ocelot = ocelot_port->ocelot; + int port = priv->chip_port; + + return ocelot_mrp_del_ring_role(ocelot, port, mrp); +} + static int ocelot_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) @@ -1091,6 +1070,13 @@ static int ocelot_port_obj_add(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_MDB: ret = ocelot_port_obj_add_mdb(dev, SWITCHDEV_OBJ_PORT_MDB(obj)); break; + case SWITCHDEV_OBJ_ID_MRP: + ret = ocelot_port_obj_mrp_add(dev, SWITCHDEV_OBJ_MRP(obj)); + break; + case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: + ret = ocelot_port_obj_mrp_add_ring_role(dev, + SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); + break; default: return -EOPNOTSUPP; } @@ -1111,6 +1097,13 @@ static int ocelot_port_obj_del(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_MDB: ret = ocelot_port_obj_del_mdb(dev, SWITCHDEV_OBJ_PORT_MDB(obj)); break; + case SWITCHDEV_OBJ_ID_MRP: + ret = ocelot_port_obj_mrp_del(dev, SWITCHDEV_OBJ_MRP(obj)); + break; + case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: + ret = ocelot_port_obj_mrp_del_ring_role(dev, + SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); + break; default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index 6b6eb92149ba..4bd7e9d9ec61 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -4,6 +4,7 @@ * * Copyright (c) 2017 Microsemi Corporation */ +#include <linux/dsa/ocelot.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/of_net.h> @@ -18,8 
+19,6 @@ #include <soc/mscc/ocelot_hsio.h> #include "ocelot.h" -#define IFH_EXTRACT_BITFIELD64(x, o, w) (((x) >> (o)) & GENMASK_ULL((w) - 1, 0)) - static const u32 ocelot_ana_regmap[] = { REG(ANA_ADVLEARN, 0x009000), REG(ANA_VLANMASK, 0x009004), @@ -532,181 +531,28 @@ static int ocelot_chip_init(struct ocelot *ocelot, const struct ocelot_ops *ops) return 0; } -static int ocelot_parse_ifh(u32 *_ifh, struct frame_info *info) -{ - u8 llen, wlen; - u64 ifh[2]; - - ifh[0] = be64_to_cpu(((__force __be64 *)_ifh)[0]); - ifh[1] = be64_to_cpu(((__force __be64 *)_ifh)[1]); - - wlen = IFH_EXTRACT_BITFIELD64(ifh[0], 7, 8); - llen = IFH_EXTRACT_BITFIELD64(ifh[0], 15, 6); - - info->len = OCELOT_BUFFER_CELL_SZ * wlen + llen - 80; - - info->timestamp = IFH_EXTRACT_BITFIELD64(ifh[0], 21, 32); - - info->port = IFH_EXTRACT_BITFIELD64(ifh[1], 43, 4); - - info->tag_type = IFH_EXTRACT_BITFIELD64(ifh[1], 16, 1); - info->vid = IFH_EXTRACT_BITFIELD64(ifh[1], 0, 12); - - return 0; -} - -static int ocelot_rx_frame_word(struct ocelot *ocelot, u8 grp, bool ifh, - u32 *rval) -{ - u32 val; - u32 bytes_valid; - - val = ocelot_read_rix(ocelot, QS_XTR_RD, grp); - if (val == XTR_NOT_READY) { - if (ifh) - return -EIO; - - do { - val = ocelot_read_rix(ocelot, QS_XTR_RD, grp); - } while (val == XTR_NOT_READY); - } - - switch (val) { - case XTR_ABORT: - return -EIO; - case XTR_EOF_0: - case XTR_EOF_1: - case XTR_EOF_2: - case XTR_EOF_3: - case XTR_PRUNED: - bytes_valid = XTR_VALID_BYTES(val); - val = ocelot_read_rix(ocelot, QS_XTR_RD, grp); - if (val == XTR_ESCAPE) - *rval = ocelot_read_rix(ocelot, QS_XTR_RD, grp); - else - *rval = val; - - return bytes_valid; - case XTR_ESCAPE: - *rval = ocelot_read_rix(ocelot, QS_XTR_RD, grp); - - return 4; - default: - *rval = val; - - return 4; - } -} - static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg) { struct ocelot *ocelot = arg; - int i = 0, grp = 0; - int err = 0; - - if (!(ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp))) - return IRQ_NONE; + int grp = 0, err; - do { - struct skb_shared_hwtstamps *shhwtstamps; - struct ocelot_port_private *priv; - struct ocelot_port *ocelot_port; - u64 tod_in_ns, full_ts_in_ns; - struct frame_info info = {}; - struct net_device *dev; - u32 ifh[4], val, *buf; - struct timespec64 ts; - int sz, len, buf_len; + while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)) { struct sk_buff *skb; - for (i = 0; i < OCELOT_TAG_LEN / 4; i++) { - err = ocelot_rx_frame_word(ocelot, grp, true, &ifh[i]); - if (err != 4) - break; - } - - if (err != 4) - break; - - /* At this point the IFH was read correctly, so it is safe to - * presume that there is no error. The err needs to be reset - * otherwise a frame could come in CPU queue between the while - * condition and the check for error later on. And in that case - * the new frame is just removed and not processed. 
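
The removed ocelot_parse_ifh() decoded the extraction header with an open-coded bitfield macro; the shared ocelot_xfh_get_*() helpers used earlier in this series now provide those fields, presumably via the packing library selected in the mscc Kconfig hunk. The removed extraction primitive, modelled in plain C as a sketch:

    #include <stdint.h>

    /* IFH_EXTRACT_BITFIELD64(x, o, w): take w bits at offset o, LSB first */
    static uint64_t ifh_extract(uint64_t x, unsigned int o, unsigned int w)
    {
        return (x >> o) & (~0ULL >> (64 - w));  /* assumes 1 <= w <= 64 */
    }

    /* e.g. the source port lived at bits 43..46 of the second IFH word:
     *     port = ifh_extract(ifh[1], 43, 4);
     */
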
- */ - err = 0; - - ocelot_parse_ifh(ifh, &info); - - ocelot_port = ocelot->ports[info.port]; - priv = container_of(ocelot_port, struct ocelot_port_private, - port); - dev = priv->dev; - - skb = netdev_alloc_skb(dev, info.len); - - if (unlikely(!skb)) { - netdev_err(dev, "Unable to allocate sk_buff\n"); - err = -ENOMEM; - break; - } - buf_len = info.len - ETH_FCS_LEN; - buf = (u32 *)skb_put(skb, buf_len); - - len = 0; - do { - sz = ocelot_rx_frame_word(ocelot, grp, false, &val); - *buf++ = val; - len += sz; - } while (len < buf_len); - - /* Read the FCS */ - sz = ocelot_rx_frame_word(ocelot, grp, false, &val); - /* Update the statistics if part of the FCS was read before */ - len -= ETH_FCS_LEN - sz; - - if (unlikely(dev->features & NETIF_F_RXFCS)) { - buf = (u32 *)skb_put(skb, ETH_FCS_LEN); - *buf = val; - } - - if (sz < 0) { - err = sz; - break; - } - - if (ocelot->ptp) { - ocelot_ptp_gettime64(&ocelot->ptp_info, &ts); - - tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec); - if ((tod_in_ns & 0xffffffff) < info.timestamp) - full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) | - info.timestamp; - else - full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) | - info.timestamp; - - shhwtstamps = skb_hwtstamps(skb); - memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps)); - shhwtstamps->hwtstamp = full_ts_in_ns; - } + err = ocelot_xtr_poll_frame(ocelot, grp, &skb); + if (err) + goto out; - /* Everything we see on an interface that is in the HW bridge - * has already been forwarded. - */ - if (ocelot->bridge_mask & BIT(info.port)) - skb->offload_fwd_mark = 1; + skb->dev->stats.rx_bytes += skb->len; + skb->dev->stats.rx_packets++; - skb->protocol = eth_type_trans(skb, dev); if (!skb_defer_rx_timestamp(skb)) netif_rx(skb); - dev->stats.rx_bytes += len; - dev->stats.rx_packets++; - } while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)); + } - if (err) - while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)) - ocelot_read_rix(ocelot, QS_XTR_RD, grp); +out: + if (err < 0) + ocelot_drain_cpu_queue(ocelot, 0); return IRQ_HANDLED; } diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.h b/drivers/net/ethernet/pensando/ionic/ionic_lif.h index 9bed42719ae5..563dba384a53 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.h @@ -249,8 +249,6 @@ static inline u32 ionic_coal_usec_to_hw(struct ionic *ionic, u32 usecs) return (usecs * mult) / div; } -typedef void (*ionic_reset_cb)(struct ionic_lif *lif, void *arg); - void ionic_link_status_check_request(struct ionic_lif *lif, bool can_sleep); void ionic_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *ns); diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 9197da8e626a..cbc30df4e08a 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1252,6 +1252,7 @@ static void rtl_set_d3_pll_down(struct rtl8169_private *tp, bool enable) { switch (tp->mac_version) { case RTL_GIGA_MAC_VER_25 ... RTL_GIGA_MAC_VER_26: + case RTL_GIGA_MAC_VER_29 ... RTL_GIGA_MAC_VER_30: case RTL_GIGA_MAC_VER_32 ... RTL_GIGA_MAC_VER_37: case RTL_GIGA_MAC_VER_39 ... 
RTL_GIGA_MAC_VER_63: if (enable) @@ -5055,7 +5056,7 @@ static int r8169_mdio_register(struct rtl8169_private *tp) new_bus->name = "r8169"; new_bus->priv = tp; new_bus->parent = &pdev->dev; - new_bus->irq[0] = PHY_IGNORE_INTERRUPT; + new_bus->irq[0] = PHY_MAC_INTERRUPT; snprintf(new_bus->id, MII_BUS_ID_SIZE, "r8169-%x", pci_dev_id(pdev)); new_bus->read = r8169_mdio_read_reg; diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_mdio.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_mdio.c index b1e7f7ab281c..fceb6d637235 100644 --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_mdio.c +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_mdio.c @@ -203,8 +203,8 @@ int sxgbe_mdio_register(struct net_device *ndev) case PHY_POLL: irq_str = "POLL"; break; - case PHY_IGNORE_INTERRUPT: - irq_str = "IGNORE"; + case PHY_MAC_INTERRUPT: + irq_str = "MAC"; break; default: sprintf(irq_num, "%d", phy->irq); diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig index 53f14c5a9e02..e675ba12fde2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Kconfig +++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig @@ -219,6 +219,14 @@ config DWMAC_INTEL_PLAT This selects the Intel platform specific glue layer support for the stmmac device driver. This driver is used for the Intel Keem Bay SoC. + +config DWMAC_VISCONTI + tristate "Toshiba Visconti DWMAC support" + default ARCH_VISCONTI + depends on OF && COMMON_CLK && (ARCH_VISCONTI || COMPILE_TEST) + help + Support for ethernet controller on Visconti SoCs. + endif config DWMAC_INTEL diff --git a/drivers/net/ethernet/stmicro/stmmac/Makefile b/drivers/net/ethernet/stmicro/stmmac/Makefile index 24e6145d4eae..366740ab9c5a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Makefile +++ b/drivers/net/ethernet/stmicro/stmmac/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_DWMAC_DWC_QOS_ETH) += dwmac-dwc-qos-eth.o obj-$(CONFIG_DWMAC_INTEL_PLAT) += dwmac-intel-plat.o obj-$(CONFIG_DWMAC_GENERIC) += dwmac-generic.o obj-$(CONFIG_DWMAC_IMX8) += dwmac-imx.o +obj-$(CONFIG_DWMAC_VISCONTI) += dwmac-visconti.o stmmac-platform-objs:= stmmac_platform.o dwmac-altr-socfpga-objs := altr_tse_pcs.o dwmac-socfpga.o diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c new file mode 100644 index 000000000000..b7a0c57dfbfb --- /dev/null +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Toshiba Visconti Ethernet Support + * + * (C) Copyright 2020 TOSHIBA CORPORATION + * (C) Copyright 2020 Toshiba Electronic Devices & Storage Corporation + */ + +#include <linux/module.h> +#include <linux/of_device.h> +#include <linux/of_net.h> +#include <linux/stmmac.h> + +#include "stmmac_platform.h" +#include "dwmac4.h" + +#define REG_ETHER_CONTROL 0x52D4 +#define ETHER_ETH_CONTROL_RESET BIT(17) + +#define REG_ETHER_CLOCK_SEL 0x52D0 +#define ETHER_CLK_SEL_TX_CLK_EN BIT(0) +#define ETHER_CLK_SEL_RX_CLK_EN BIT(1) +#define ETHER_CLK_SEL_RMII_CLK_EN BIT(2) +#define ETHER_CLK_SEL_RMII_CLK_RST BIT(3) +#define ETHER_CLK_SEL_DIV_SEL_2 BIT(4) +#define ETHER_CLK_SEL_DIV_SEL_20 BIT(0) +#define ETHER_CLK_SEL_FREQ_SEL_125M (BIT(9) | BIT(8)) +#define ETHER_CLK_SEL_FREQ_SEL_50M BIT(9) +#define ETHER_CLK_SEL_FREQ_SEL_25M BIT(8) +#define ETHER_CLK_SEL_FREQ_SEL_2P5M BIT(0) +#define ETHER_CLK_SEL_TX_CLK_EXT_SEL_IN BIT(0) +#define ETHER_CLK_SEL_TX_CLK_EXT_SEL_TXC BIT(10) +#define ETHER_CLK_SEL_TX_CLK_EXT_SEL_DIV BIT(11) +#define ETHER_CLK_SEL_RX_CLK_EXT_SEL_IN BIT(0) +#define 
ETHER_CLK_SEL_RX_CLK_EXT_SEL_RXC BIT(12) +#define ETHER_CLK_SEL_RX_CLK_EXT_SEL_DIV BIT(13) +#define ETHER_CLK_SEL_TX_CLK_O_TX_I BIT(0) +#define ETHER_CLK_SEL_TX_CLK_O_RMII_I BIT(14) +#define ETHER_CLK_SEL_TX_O_E_N_IN BIT(15) +#define ETHER_CLK_SEL_RMII_CLK_SEL_IN BIT(0) +#define ETHER_CLK_SEL_RMII_CLK_SEL_RX_C BIT(16) + +#define ETHER_CLK_SEL_RX_TX_CLK_EN (ETHER_CLK_SEL_RX_CLK_EN | ETHER_CLK_SEL_TX_CLK_EN) + +#define ETHER_CONFIG_INTF_MII 0 +#define ETHER_CONFIG_INTF_RGMII BIT(0) +#define ETHER_CONFIG_INTF_RMII BIT(2) + +struct visconti_eth { + void __iomem *reg; + u32 phy_intf_sel; + struct clk *phy_ref_clk; + spinlock_t lock; /* lock to protect register update */ +}; + +static void visconti_eth_fix_mac_speed(void *priv, unsigned int speed) +{ + struct visconti_eth *dwmac = priv; + unsigned int val, clk_sel_val; + unsigned long flags; + + spin_lock_irqsave(&dwmac->lock, flags); + + /* adjust link */ + val = readl(dwmac->reg + MAC_CTRL_REG); + val &= ~(GMAC_CONFIG_PS | GMAC_CONFIG_FES); + + switch (speed) { + case SPEED_1000: + if (dwmac->phy_intf_sel == ETHER_CONFIG_INTF_RGMII) + clk_sel_val = ETHER_CLK_SEL_FREQ_SEL_125M; + break; + case SPEED_100: + if (dwmac->phy_intf_sel == ETHER_CONFIG_INTF_RGMII) + clk_sel_val = ETHER_CLK_SEL_FREQ_SEL_25M; + if (dwmac->phy_intf_sel == ETHER_CONFIG_INTF_RMII) + clk_sel_val = ETHER_CLK_SEL_DIV_SEL_2; + val |= GMAC_CONFIG_PS | GMAC_CONFIG_FES; + break; + case SPEED_10: + if (dwmac->phy_intf_sel == ETHER_CONFIG_INTF_RGMII) + clk_sel_val = ETHER_CLK_SEL_FREQ_SEL_2P5M; + if (dwmac->phy_intf_sel == ETHER_CONFIG_INTF_RMII) + clk_sel_val = ETHER_CLK_SEL_DIV_SEL_20; + val |= GMAC_CONFIG_PS; + break; + default: + /* No bit control */ + break; + } + + writel(val, dwmac->reg + MAC_CTRL_REG); + + /* Stop internal clock */ + val = readl(dwmac->reg + REG_ETHER_CLOCK_SEL); + val &= ~(ETHER_CLK_SEL_RMII_CLK_EN | ETHER_CLK_SEL_RX_TX_CLK_EN); + val |= ETHER_CLK_SEL_TX_O_E_N_IN; + writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); + + switch (dwmac->phy_intf_sel) { + case ETHER_CONFIG_INTF_RGMII: + val = clk_sel_val | ETHER_CLK_SEL_RX_CLK_EXT_SEL_RXC; + break; + case ETHER_CONFIG_INTF_RMII: + val = clk_sel_val | ETHER_CLK_SEL_RX_CLK_EXT_SEL_DIV | + ETHER_CLK_SEL_TX_CLK_EXT_SEL_TXC | ETHER_CLK_SEL_TX_O_E_N_IN | + ETHER_CLK_SEL_RMII_CLK_SEL_RX_C; + break; + case ETHER_CONFIG_INTF_MII: + default: + val = clk_sel_val | ETHER_CLK_SEL_RX_CLK_EXT_SEL_RXC | + ETHER_CLK_SEL_TX_CLK_EXT_SEL_DIV | ETHER_CLK_SEL_TX_O_E_N_IN | + ETHER_CLK_SEL_RMII_CLK_EN; + break; + } + + /* Start clock */ + writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); + val |= ETHER_CLK_SEL_RX_TX_CLK_EN; + writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); + + val &= ~ETHER_CLK_SEL_TX_O_E_N_IN; + writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); + + spin_unlock_irqrestore(&dwmac->lock, flags); +} + +static int visconti_eth_init_hw(struct platform_device *pdev, struct plat_stmmacenet_data *plat_dat) +{ + struct visconti_eth *dwmac = plat_dat->bsp_priv; + unsigned int reg_val, clk_sel_val; + + switch (plat_dat->phy_interface) { + case PHY_INTERFACE_MODE_RGMII: + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_RXID: + case PHY_INTERFACE_MODE_RGMII_TXID: + dwmac->phy_intf_sel = ETHER_CONFIG_INTF_RGMII; + break; + case PHY_INTERFACE_MODE_MII: + dwmac->phy_intf_sel = ETHER_CONFIG_INTF_MII; + break; + case PHY_INTERFACE_MODE_RMII: + dwmac->phy_intf_sel = ETHER_CONFIG_INTF_RMII; + break; + default: + dev_err(&pdev->dev, "Unsupported phy-mode (%d)\n", plat_dat->phy_interface); + return -EOPNOTSUPP; + } + + reg_val = 
dwmac->phy_intf_sel; + writel(reg_val, dwmac->reg + REG_ETHER_CONTROL); + + /* Enable TX/RX clock */ + clk_sel_val = ETHER_CLK_SEL_FREQ_SEL_125M; + writel(clk_sel_val, dwmac->reg + REG_ETHER_CLOCK_SEL); + + writel((clk_sel_val | ETHER_CLK_SEL_RMII_CLK_EN | ETHER_CLK_SEL_RX_TX_CLK_EN), + dwmac->reg + REG_ETHER_CLOCK_SEL); + + /* release internal-reset */ + reg_val |= ETHER_ETH_CONTROL_RESET; + writel(reg_val, dwmac->reg + REG_ETHER_CONTROL); + + return 0; +} + +static int visconti_eth_clock_probe(struct platform_device *pdev, + struct plat_stmmacenet_data *plat_dat) +{ + struct visconti_eth *dwmac = plat_dat->bsp_priv; + int err; + + dwmac->phy_ref_clk = devm_clk_get(&pdev->dev, "phy_ref_clk"); + if (IS_ERR(dwmac->phy_ref_clk)) { + dev_err(&pdev->dev, "phy_ref_clk clock not found.\n"); + return PTR_ERR(dwmac->phy_ref_clk); + } + + err = clk_prepare_enable(dwmac->phy_ref_clk); + if (err < 0) { + dev_err(&pdev->dev, "failed to enable phy_ref clock: %d\n", err); + return err; + } + + return 0; +} + +static int visconti_eth_clock_remove(struct platform_device *pdev) +{ + struct visconti_eth *dwmac = get_stmmac_bsp_priv(&pdev->dev); + struct net_device *ndev = platform_get_drvdata(pdev); + struct stmmac_priv *priv = netdev_priv(ndev); + + clk_disable_unprepare(dwmac->phy_ref_clk); + clk_disable_unprepare(priv->plat->stmmac_clk); + + return 0; +} + +static int visconti_eth_dwmac_probe(struct platform_device *pdev) +{ + struct plat_stmmacenet_data *plat_dat; + struct stmmac_resources stmmac_res; + struct visconti_eth *dwmac; + int ret; + + ret = stmmac_get_platform_resources(pdev, &stmmac_res); + if (ret) + return ret; + + plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + if (IS_ERR(plat_dat)) + return PTR_ERR(plat_dat); + + dwmac = devm_kzalloc(&pdev->dev, sizeof(*dwmac), GFP_KERNEL); + if (!dwmac) { + ret = -ENOMEM; + goto remove_config; + } + + dwmac->reg = stmmac_res.addr; + plat_dat->bsp_priv = dwmac; + plat_dat->fix_mac_speed = visconti_eth_fix_mac_speed; + + ret = visconti_eth_clock_probe(pdev, plat_dat); + if (ret) + goto remove_config; + + visconti_eth_init_hw(pdev, plat_dat); + + plat_dat->dma_cfg->aal = 1; + + ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); + if (ret) + goto remove; + + return ret; + +remove: + visconti_eth_clock_remove(pdev); +remove_config: + stmmac_remove_config_dt(pdev, plat_dat); + + return ret; +} + +static int visconti_eth_dwmac_remove(struct platform_device *pdev) +{ + struct net_device *ndev = platform_get_drvdata(pdev); + struct stmmac_priv *priv = netdev_priv(ndev); + int err; + + err = stmmac_pltfr_remove(pdev); + if (err < 0) + dev_err(&pdev->dev, "failed to remove platform: %d\n", err); + + err = visconti_eth_clock_remove(pdev); + if (err < 0) + dev_err(&pdev->dev, "failed to remove clock: %d\n", err); + + stmmac_remove_config_dt(pdev, priv->plat); + + return err; +} + +static const struct of_device_id visconti_eth_dwmac_match[] = { + { .compatible = "toshiba,visconti-dwmac" }, + { } +}; +MODULE_DEVICE_TABLE(of, visconti_eth_dwmac_match); + +static struct platform_driver visconti_eth_dwmac_driver = { + .probe = visconti_eth_dwmac_probe, + .remove = visconti_eth_dwmac_remove, + .driver = { + .name = "visconti-eth-dwmac", + .of_match_table = visconti_eth_dwmac_match, + }, +}; +module_platform_driver(visconti_eth_dwmac_driver); + +MODULE_AUTHOR("Toshiba"); +MODULE_DESCRIPTION("Toshiba Visconti Ethernet DWMAC glue driver"); +MODULE_AUTHOR("Nobuhiro Iwamatsu <[email protected]>"); +MODULE_LICENSE("GPL v2"); diff --git
a/drivers/net/ethernet/xilinx/Kconfig b/drivers/net/ethernet/xilinx/Kconfig index 3b2137d1f4c6..c6eb7f2368aa 100644 --- a/drivers/net/ethernet/xilinx/Kconfig +++ b/drivers/net/ethernet/xilinx/Kconfig @@ -18,7 +18,6 @@ if NET_VENDOR_XILINX config XILINX_EMACLITE tristate "Xilinx 10/100 Ethernet Lite support" - depends on PPC32 || MICROBLAZE || ARCH_ZYNQ || MIPS || COMPILE_TEST select PHYLIB help This driver supports the 10/100 Ethernet Lite from Xilinx. diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index e8febfb1924d..3a8775e0ca55 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1862,6 +1862,18 @@ static int axienet_probe(struct platform_device *pdev) lp->options = XAE_OPTION_DEFAULTS; lp->rx_bd_num = RX_BD_NUM_DEFAULT; lp->tx_bd_num = TX_BD_NUM_DEFAULT; + + lp->clk = devm_clk_get_optional(&pdev->dev, NULL); + if (IS_ERR(lp->clk)) { + ret = PTR_ERR(lp->clk); + goto free_netdev; + } + ret = clk_prepare_enable(lp->clk); + if (ret) { + dev_err(&pdev->dev, "Unable to enable clock: %d\n", ret); + goto free_netdev; + } + /* Map device registers */ ethres = platform_get_resource(pdev, IORESOURCE_MEM, 0); lp->regs = devm_ioremap_resource(&pdev->dev, ethres); @@ -2046,20 +2058,6 @@ static int axienet_probe(struct platform_device *pdev) lp->phy_node = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); if (lp->phy_node) { - lp->clk = devm_clk_get(&pdev->dev, NULL); - if (IS_ERR(lp->clk)) { - dev_warn(&pdev->dev, "Failed to get clock: %ld\n", - PTR_ERR(lp->clk)); - lp->clk = NULL; - } else { - ret = clk_prepare_enable(lp->clk); - if (ret) { - dev_err(&pdev->dev, "Unable to enable clock: %d\n", - ret); - goto free_netdev; - } - } - ret = axienet_mdio_setup(lp); if (ret) dev_warn(&pdev->dev, diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index 008b9a40faad..007840d4a807 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -1193,8 +1193,8 @@ static int xemaclite_of_probe(struct platform_device *ofdev) } dev_info(dev, - "Xilinx EmacLite at 0x%08X mapped to 0x%08lX, irq=%d\n", - (unsigned int __force)ndev->mem_start, + "Xilinx EmacLite at 0x%08lX mapped to 0x%08lX, irq=%d\n", + (unsigned long __force)ndev->mem_start, (unsigned long __force)lp->base_addr, ndev->irq); return 0; diff --git a/drivers/net/ipa/ipa_main.c b/drivers/net/ipa/ipa_main.c index c10e7340b031..97c1b55405cb 100644 --- a/drivers/net/ipa/ipa_main.c +++ b/drivers/net/ipa/ipa_main.c @@ -580,10 +580,10 @@ ipa_resource_config(struct ipa *ipa, const struct ipa_resource_data *data) return -EINVAL; for (i = 0; i < data->resource_src_count; i++) - ipa_resource_config_src(ipa, data->resource_src); + ipa_resource_config_src(ipa, &data->resource_src[i]); for (i = 0; i < data->resource_dst_count; i++) - ipa_resource_config_dst(ipa, data->resource_dst); + ipa_resource_config_dst(ipa, &data->resource_dst[i]); return 0; } diff --git a/drivers/net/mdio/mdio-moxart.c b/drivers/net/mdio/mdio-moxart.c index b72c6d185175..f0cff584e176 100644 --- a/drivers/net/mdio/mdio-moxart.c +++ b/drivers/net/mdio/mdio-moxart.c @@ -125,7 +125,7 @@ static int moxart_mdio_probe(struct platform_device *pdev) snprintf(bus->id, MII_BUS_ID_SIZE, "%s-%d-mii", pdev->name, pdev->id); bus->parent = &pdev->dev; - /* Setting PHY_IGNORE_INTERRUPT here even if it has no effect, + /* Setting PHY_MAC_INTERRUPT here even if it 
has no effect, * of_mdiobus_register() sets these PHY_POLL. * Ideally, the interrupt from MAC controller could be used to * detect link state changes, not polling, i.e. if there was @@ -133,7 +133,7 @@ static int moxart_mdio_probe(struct platform_device *pdev) * interrupt handled in ethernet drivercode. */ for (i = 0; i < PHY_MAX_ADDR; i++) - bus->irq[i] = PHY_IGNORE_INTERRUPT; + bus->irq[i] = PHY_MAC_INTERRUPT; data = bus->priv; data->base = devm_platform_ioremap_resource(pdev, 0); diff --git a/drivers/net/mdio/of_mdio.c b/drivers/net/mdio/of_mdio.c index 4daf94bb56a5..ea9d5855fb52 100644 --- a/drivers/net/mdio/of_mdio.c +++ b/drivers/net/mdio/of_mdio.c @@ -462,36 +462,6 @@ struct phy_device *of_phy_get_and_connect(struct net_device *dev, } EXPORT_SYMBOL(of_phy_get_and_connect); -/** - * of_phy_attach - Attach to a PHY without starting the state machine - * @dev: pointer to net_device claiming the phy - * @phy_np: Node pointer for the PHY - * @flags: flags to pass to the PHY - * @iface: PHY data interface type - * - * If successful, returns a pointer to the phy_device with the embedded - * struct device refcount incremented by one, or NULL on failure. The - * refcount must be dropped by calling phy_disconnect() or phy_detach(). - */ -struct phy_device *of_phy_attach(struct net_device *dev, - struct device_node *phy_np, u32 flags, - phy_interface_t iface) -{ - struct phy_device *phy = of_phy_find_device(phy_np); - int ret; - - if (!phy) - return NULL; - - ret = phy_attach_direct(dev, phy, flags, iface); - - /* refcount is held by phy_attach_direct() on success */ - put_device(&phy->mdio.dev); - - return ret ? NULL : phy; -} -EXPORT_SYMBOL(of_phy_attach); - /* * of_phy_is_fixed_link() and of_phy_register_fixed_link() must * support two DT bindings: diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index d67bddc111e3..c2aa4c92edde 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -1190,6 +1190,7 @@ static struct phy_driver at803x_driver[] = { .probe = at803x_probe, .remove = at803x_remove, .config_init = at803x_config_init, + .config_aneg = at803x_config_aneg, .soft_reset = genphy_soft_reset, .set_wol = at803x_set_wol, .get_wol = at803x_get_wol, diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 0472b3470c59..fa0be591ae79 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -26,7 +26,46 @@ MODULE_DESCRIPTION("Broadcom PHY driver"); MODULE_AUTHOR("Maciej W. 
Rozycki"); MODULE_LICENSE("GPL"); -static int bcm54xx_config_clock_delay(struct phy_device *phydev); +static int bcm54xx_config_clock_delay(struct phy_device *phydev) +{ + int rc, val; + + /* handling PHY's internal RX clock delay */ + val = bcm54xx_auxctl_read(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC); + val |= MII_BCM54XX_AUXCTL_MISC_WREN; + if (phydev->interface == PHY_INTERFACE_MODE_RGMII || + phydev->interface == PHY_INTERFACE_MODE_RGMII_TXID) { + /* Disable RGMII RXC-RXD skew */ + val &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN; + } + if (phydev->interface == PHY_INTERFACE_MODE_RGMII_ID || + phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID) { + /* Enable RGMII RXC-RXD skew */ + val |= MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN; + } + rc = bcm54xx_auxctl_write(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC, + val); + if (rc < 0) + return rc; + + /* handling PHY's internal TX clock delay */ + val = bcm_phy_read_shadow(phydev, BCM54810_SHD_CLK_CTL); + if (phydev->interface == PHY_INTERFACE_MODE_RGMII || + phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID) { + /* Disable internal TX clock delay */ + val &= ~BCM54810_SHD_CLK_CTL_GTXCLK_EN; + } + if (phydev->interface == PHY_INTERFACE_MODE_RGMII_ID || + phydev->interface == PHY_INTERFACE_MODE_RGMII_TXID) { + /* Enable internal TX clock delay */ + val |= BCM54810_SHD_CLK_CTL_GTXCLK_EN; + } + rc = bcm_phy_write_shadow(phydev, BCM54810_SHD_CLK_CTL, val); + if (rc < 0) + return rc; + + return 0; +} static int bcm54210e_config_init(struct phy_device *phydev) { @@ -64,45 +103,62 @@ static int bcm54612e_config_init(struct phy_device *phydev) return 0; } -static int bcm54xx_config_clock_delay(struct phy_device *phydev) +static int bcm54616s_config_init(struct phy_device *phydev) { int rc, val; - /* handling PHY's internal RX clock delay */ + if (phydev->interface != PHY_INTERFACE_MODE_SGMII && + phydev->interface != PHY_INTERFACE_MODE_1000BASEX) + return 0; + + /* Ensure proper interface mode is selected. 
*/ + /* Disable RGMII mode */ val = bcm54xx_auxctl_read(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC); + if (val < 0) + return val; + val &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_EN; val |= MII_BCM54XX_AUXCTL_MISC_WREN; - if (phydev->interface == PHY_INTERFACE_MODE_RGMII || - phydev->interface == PHY_INTERFACE_MODE_RGMII_TXID) { - /* Disable RGMII RXC-RXD skew */ - val &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN; - } - if (phydev->interface == PHY_INTERFACE_MODE_RGMII_ID || - phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID) { - /* Enable RGMII RXC-RXD skew */ - val |= MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN; - } rc = bcm54xx_auxctl_write(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC, val); if (rc < 0) return rc; - /* handling PHY's internal TX clock delay */ - val = bcm_phy_read_shadow(phydev, BCM54810_SHD_CLK_CTL); - if (phydev->interface == PHY_INTERFACE_MODE_RGMII || - phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID) { - /* Disable internal TX clock delay */ - val &= ~BCM54810_SHD_CLK_CTL_GTXCLK_EN; - } - if (phydev->interface == PHY_INTERFACE_MODE_RGMII_ID || - phydev->interface == PHY_INTERFACE_MODE_RGMII_TXID) { - /* Enable internal TX clock delay */ - val |= BCM54810_SHD_CLK_CTL_GTXCLK_EN; - } - rc = bcm_phy_write_shadow(phydev, BCM54810_SHD_CLK_CTL, val); + /* Select 1000BASE-X register set (primary SerDes) */ + val = bcm_phy_read_shadow(phydev, BCM54XX_SHD_MODE); + if (val < 0) + return val; + val |= BCM54XX_SHD_MODE_1000BX; + rc = bcm_phy_write_shadow(phydev, BCM54XX_SHD_MODE, val); if (rc < 0) return rc; - return 0; + /* Power down SerDes interface */ + rc = phy_set_bits(phydev, MII_BMCR, BMCR_PDOWN); + if (rc < 0) + return rc; + + /* Select proper interface mode */ + val &= ~BCM54XX_SHD_INTF_SEL_MASK; + val |= phydev->interface == PHY_INTERFACE_MODE_SGMII ? 
+ BCM54XX_SHD_INTF_SEL_SGMII : + BCM54XX_SHD_INTF_SEL_GBIC; + rc = bcm_phy_write_shadow(phydev, BCM54XX_SHD_MODE, val); + if (rc < 0) + return rc; + + /* Power up SerDes interface */ + rc = phy_clear_bits(phydev, MII_BMCR, BMCR_PDOWN); + if (rc < 0) + return rc; + + /* Select copper register set */ + val &= ~BCM54XX_SHD_MODE_1000BX; + rc = bcm_phy_write_shadow(phydev, BCM54XX_SHD_MODE, val); + if (rc < 0) + return rc; + + /* Power up copper interface */ + return phy_clear_bits(phydev, MII_BMCR, BMCR_PDOWN); } /* Needs SMDSP clock enabled via bcm54xx_phydsp_config() */ @@ -195,6 +251,7 @@ static void bcm54xx_adjust_rxrefclk(struct phy_device *phydev) if (BRCM_PHY_MODEL(phydev) != PHY_ID_BCM57780 && BRCM_PHY_MODEL(phydev) != PHY_ID_BCM50610 && BRCM_PHY_MODEL(phydev) != PHY_ID_BCM50610M && + BRCM_PHY_MODEL(phydev) != PHY_ID_BCM54210E && BRCM_PHY_MODEL(phydev) != PHY_ID_BCM54810 && BRCM_PHY_MODEL(phydev) != PHY_ID_BCM54811) return; @@ -229,9 +286,10 @@ static void bcm54xx_adjust_rxrefclk(struct phy_device *phydev) val |= BCM54XX_SHD_SCR3_DLLAPD_DIS; if (phydev->dev_flags & PHY_BRCM_DIS_TXCRXC_NOENRGY) { - if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810 || - BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54811) - val |= BCM54810_SHD_SCR3_TRDDAPD; + if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54210E || + BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810 || + BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54811) + val |= BCM54XX_SHD_SCR3_RXCTXC_DIS; else val |= BCM54XX_SHD_SCR3_TRDDAPD; } @@ -283,15 +341,17 @@ static int bcm54xx_config_init(struct phy_device *phydev) bcm54xx_adjust_rxrefclk(phydev); - if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54210E) { + switch (BRCM_PHY_MODEL(phydev)) { + case PHY_ID_BCM54210E: err = bcm54210e_config_init(phydev); - if (err) - return err; - } else if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54612E) { + break; + case PHY_ID_BCM54612E: err = bcm54612e_config_init(phydev); - if (err) - return err; - } else if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810) { + break; + case PHY_ID_BCM54616S: + err = bcm54616s_config_init(phydev); + break; + case PHY_ID_BCM54810: /* For BCM54810, we need to disable BroadR-Reach function */ val = bcm_phy_read_exp(phydev, BCM54810_EXP_BROADREACH_LRE_MISC_CTL); @@ -299,24 +359,31 @@ static int bcm54xx_config_init(struct phy_device *phydev) err = bcm_phy_write_exp(phydev, BCM54810_EXP_BROADREACH_LRE_MISC_CTL, val); - if (err < 0) - return err; + break; } + if (err) + return err; bcm54xx_phydsp_config(phydev); - /* Encode link speed into LED1 and LED3 pair (green/amber). + /* For non-SFP setups, encode link speed into LED1 and LED3 pair + * (green/amber). * Also flash these two LEDs on activity. This means configuring * them for MULTICOLOR and encoding link/activity into them. + * Don't do this for devices on an SFP module, since some of these + * use the LED outputs to control the SFP LOS signal, and changing + * these settings will cause LOS to malfunction.
*/ - val = BCM5482_SHD_LEDS1_LED1(BCM_LED_SRC_MULTICOLOR1) | - BCM5482_SHD_LEDS1_LED3(BCM_LED_SRC_MULTICOLOR1); - bcm_phy_write_shadow(phydev, BCM5482_SHD_LEDS1, val); - - val = BCM_LED_MULTICOLOR_IN_PHASE | - BCM5482_SHD_LEDS1_LED1(BCM_LED_MULTICOLOR_LINK_ACT) | - BCM5482_SHD_LEDS1_LED3(BCM_LED_MULTICOLOR_LINK_ACT); - bcm_phy_write_exp(phydev, BCM_EXP_MULTICOLOR, val); + if (!phy_on_sfp(phydev)) { + val = BCM5482_SHD_LEDS1_LED1(BCM_LED_SRC_MULTICOLOR1) | + BCM5482_SHD_LEDS1_LED3(BCM_LED_SRC_MULTICOLOR1); + bcm_phy_write_shadow(phydev, BCM5482_SHD_LEDS1, val); + + val = BCM_LED_MULTICOLOR_IN_PHASE | + BCM5482_SHD_LEDS1_LED1(BCM_LED_MULTICOLOR_LINK_ACT) | + BCM5482_SHD_LEDS1_LED3(BCM_LED_MULTICOLOR_LINK_ACT); + bcm_phy_write_exp(phydev, BCM_EXP_MULTICOLOR, val); + } return 0; } @@ -383,9 +450,20 @@ static int bcm5481_config_aneg(struct phy_device *phydev) return ret; } +struct bcm54616s_phy_priv { + bool mode_1000bx_en; +}; + static int bcm54616s_probe(struct phy_device *phydev) { - int val, intf_sel; + struct bcm54616s_phy_priv *priv; + int val; + + priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + phydev->priv = priv; val = bcm_phy_read_shadow(phydev, BCM54XX_SHD_MODE); if (val < 0) @@ -397,8 +475,7 @@ static int bcm54616s_probe(struct phy_device *phydev) * RGMII-1000Base-X is properly supported, but RGMII-100Base-FX * support is still missing as of now. */ - intf_sel = (val & BCM54XX_SHD_INTF_SEL_MASK) >> 1; - if (intf_sel == 1) { + if ((val & BCM54XX_SHD_INTF_SEL_MASK) == BCM54XX_SHD_INTF_SEL_RGMII) { val = bcm_phy_read_shadow(phydev, BCM54616S_SHD_100FX_CTRL); if (val < 0) return val; @@ -409,7 +486,7 @@ static int bcm54616s_probe(struct phy_device *phydev) * 1000BASE-X configuration. */ if (!(val & BCM54616S_100FX_MODE)) - phydev->dev_flags |= PHY_BCM_FLAGS_MODE_1000BX; + priv->mode_1000bx_en = true; phydev->port = PORT_FIBRE; } @@ -419,10 +496,11 @@ static int bcm54616s_probe(struct phy_device *phydev) static int bcm54616s_config_aneg(struct phy_device *phydev) { + struct bcm54616s_phy_priv *priv = phydev->priv; int ret; /* Aneg firstly. 
*/ - if (phydev->dev_flags & PHY_BCM_FLAGS_MODE_1000BX) + if (priv->mode_1000bx_en) ret = genphy_c37_config_aneg(phydev); else ret = genphy_config_aneg(phydev); @@ -435,9 +513,10 @@ static int bcm54616s_config_aneg(struct phy_device *phydev) static int bcm54616s_read_status(struct phy_device *phydev) { + struct bcm54616s_phy_priv *priv = phydev->priv; int err; - if (phydev->dev_flags & PHY_BCM_FLAGS_MODE_1000BX) + if (priv->mode_1000bx_en) err = genphy_c37_read_status(phydev); else err = genphy_read_status(phydev); diff --git a/drivers/net/phy/icplus.c b/drivers/net/phy/icplus.c index 4e15d4d02488..3e431737c1ba 100644 --- a/drivers/net/phy/icplus.c +++ b/drivers/net/phy/icplus.c @@ -188,7 +188,7 @@ static int ip175c_read_status(struct phy_device *phydev) genphy_read_status(phydev); else /* Don't need to read status for switch ports */ - phydev->irq = PHY_IGNORE_INTERRUPT; + phydev->irq = PHY_MAC_INTERRUPT; return 0; } diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 3238d0fbf437..e26a5d663f8a 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -684,16 +684,19 @@ static int m88e1111_config_aneg(struct phy_device *phydev) if (err < 0) goto error; - /* Do not touch the fiber page if we're in copper->sgmii mode */ - if (phydev->interface == PHY_INTERFACE_MODE_SGMII) - return 0; - /* Then the fiber link */ err = marvell_set_page(phydev, MII_MARVELL_FIBER_PAGE); if (err < 0) goto error; - err = marvell_config_aneg_fiber(phydev); + if (phydev->interface == PHY_INTERFACE_MODE_SGMII) + /* Do not touch the fiber advertisement if we're in copper->sgmii mode. + * Just ensure that SGMII-side autonegotiation is enabled. + * If we switched from some other mode to SGMII it may not be. + */ + err = genphy_check_and_restart_aneg(phydev, false); + else + err = marvell_config_aneg_fiber(phydev); if (err < 0) goto error; diff --git a/drivers/net/phy/mscc/Makefile b/drivers/net/phy/mscc/Makefile index d8e22a4eeeff..78d84194f79a 100644 --- a/drivers/net/phy/mscc/Makefile +++ b/drivers/net/phy/mscc/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_MICROSEMI_PHY) := mscc.o mscc-objs := mscc_main.o +mscc-objs += mscc_serdes.o ifdef CONFIG_MACSEC mscc-objs += mscc_macsec.o diff --git a/drivers/net/phy/mscc/mscc.h b/drivers/net/phy/mscc/mscc.h index 9481bce94c2e..a50235fdf7d9 100644 --- a/drivers/net/phy/mscc/mscc.h +++ b/drivers/net/phy/mscc/mscc.h @@ -102,6 +102,7 @@ enum rgmii_clock_delay { #define PHY_MCB_S6G_READ BIT(30) #define PHY_S6G_PLL5G_CFG0 0x06 +#define PHY_S6G_PLL5G_CFG2 0x08 #define PHY_S6G_LCPLL_CFG 0x11 #define PHY_S6G_PLL_CFG 0x2b #define PHY_S6G_COMMON_CFG 0x2c @@ -121,6 +122,9 @@ enum rgmii_clock_delay { #define PHY_S6G_PLL_FSM_CTRL_DATA_POS 8 #define PHY_S6G_PLL_FSM_ENA_POS 7 +#define PHY_S6G_CFG2_FSM_DIS 1 +#define PHY_S6G_CFG2_FSM_CLK_BP 23 + #define MSCC_EXT_PAGE_ACCESS 31 #define MSCC_PHY_PAGE_STANDARD 0x0000 /* Standard registers */ #define MSCC_PHY_PAGE_EXTENDED 0x0001 /* Extended registers */ @@ -136,6 +140,10 @@ enum rgmii_clock_delay { #define MSCC_PHY_PAGE_1588 0x1588 /* PTP (1588) */ #define MSCC_PHY_PAGE_TEST 0x2a30 /* Test reg */ #define MSCC_PHY_PAGE_TR 0x52b5 /* Token ring registers */ +#define MSCC_PHY_GPIO_CONTROL_2 14 + +#define MSCC_PHY_COMA_MODE 0x2000 /* input(1) / output(0) */ +#define MSCC_PHY_COMA_OUTPUT 0x1000 /* value to output */ /* Extended Page 1 Registers */ #define MSCC_PHY_CU_MEDIA_CRC_VALID_CNT 18 @@ -335,6 +343,10 @@ enum rgmii_clock_delay { #define VSC8584_REVB 0x0001 #define MSCC_DEV_REV_MASK GENMASK(3, 0) +#define 
MSCC_ROM_TRAP_SERDES_6G_CFG 0x1E48 +#define MSCC_RAM_TRAP_SERDES_6G_CFG 0x1E4F +#define PATCH_VEC_ZERO_EN 0x0100 + struct reg_val { u16 reg; u32 val; @@ -412,6 +424,22 @@ struct vsc8531_edge_rate_table { }; #endif /* CONFIG_OF_MDIO */ +enum csr_target { + MACRO_CTRL = 0x07, +}; + +u32 vsc85xx_csr_read(struct phy_device *phydev, + enum csr_target target, u32 reg); + +int vsc85xx_csr_write(struct phy_device *phydev, + enum csr_target target, u32 reg, u32 val); + +int phy_base_write(struct phy_device *phydev, u32 regnum, u16 val); +int phy_base_read(struct phy_device *phydev, u32 regnum); +int phy_update_mcb_s6g(struct phy_device *phydev, u32 reg, u8 mcb); +int phy_commit_mcb_s6g(struct phy_device *phydev, u32 reg, u8 mcb); +int vsc8584_cmd(struct phy_device *phydev, u16 val); + #if IS_ENABLED(CONFIG_MACSEC) int vsc8584_macsec_init(struct phy_device *phydev); void vsc8584_handle_macsec_interrupt(struct phy_device *phydev); diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c index 2f2157e3deab..3a7705228ed5 100644 --- a/drivers/net/phy/mscc/mscc_main.c +++ b/drivers/net/phy/mscc/mscc_main.c @@ -17,7 +17,7 @@ #include <linux/of.h> #include <linux/netdevice.h> #include <dt-bindings/net/mscc-phy-vsc8531.h> - +#include "mscc_serdes.h" #include "mscc.h" static const struct vsc85xx_hw_stat vsc85xx_hw_stats[] = { @@ -689,7 +689,7 @@ out_unlock: } /* phydev->bus->mdio_lock should be locked when using this function */ -static int phy_base_write(struct phy_device *phydev, u32 regnum, u16 val) +int phy_base_write(struct phy_device *phydev, u32 regnum, u16 val) { if (unlikely(!mutex_is_locked(&phydev->mdio.bus->mdio_lock))) { dev_err(&phydev->mdio.dev, "MDIO bus lock not held!\n"); @@ -700,7 +700,7 @@ static int phy_base_write(struct phy_device *phydev, u32 regnum, u16 val) } /* phydev->bus->mdio_lock should be locked when using this function */ -static int phy_base_read(struct phy_device *phydev, u32 regnum) +int phy_base_read(struct phy_device *phydev, u32 regnum) { if (unlikely(!mutex_is_locked(&phydev->mdio.bus->mdio_lock))) { dev_err(&phydev->mdio.dev, "MDIO bus lock not held!\n"); @@ -710,6 +710,113 @@ static int phy_base_read(struct phy_device *phydev, u32 regnum) return __phy_package_read(phydev, regnum); } +u32 vsc85xx_csr_read(struct phy_device *phydev, + enum csr_target target, u32 reg) +{ + unsigned long deadline; + u32 val, val_l, val_h; + + phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_CSR_CNTL); + + /* CSR registers are grouped under different Target IDs. + * 6-bit Target_ID is split between MSCC_EXT_PAGE_CSR_CNTL_20 and + * MSCC_EXT_PAGE_CSR_CNTL_19 registers. + * Target_ID[5:2] maps to bits[3:0] of MSCC_EXT_PAGE_CSR_CNTL_20 + * and Target_ID[1:0] maps to bits[13:12] of MSCC_EXT_PAGE_CSR_CNTL_19. 
+ */ + + /* Setup the Target ID */ + phy_base_write(phydev, MSCC_EXT_PAGE_CSR_CNTL_20, + MSCC_PHY_CSR_CNTL_20_TARGET(target >> 2)); + + if ((target >> 2 == 0x1) || (target >> 2 == 0x3)) + /* non-MACsec access */ + target &= 0x3; + else + target = 0; + + /* Trigger CSR Action - Read into the CSR's */ + phy_base_write(phydev, MSCC_EXT_PAGE_CSR_CNTL_19, + MSCC_PHY_CSR_CNTL_19_CMD | MSCC_PHY_CSR_CNTL_19_READ | + MSCC_PHY_CSR_CNTL_19_REG_ADDR(reg) | + MSCC_PHY_CSR_CNTL_19_TARGET(target)); + + /* Wait for register access*/ + deadline = jiffies + msecs_to_jiffies(PROC_CMD_NCOMPLETED_TIMEOUT_MS); + do { + usleep_range(500, 1000); + val = phy_base_read(phydev, MSCC_EXT_PAGE_CSR_CNTL_19); + } while (time_before(jiffies, deadline) && + !(val & MSCC_PHY_CSR_CNTL_19_CMD)); + + if (!(val & MSCC_PHY_CSR_CNTL_19_CMD)) + return 0xffffffff; + + /* Read the Least Significant Word (LSW) (17) */ + val_l = phy_base_read(phydev, MSCC_EXT_PAGE_CSR_CNTL_17); + + /* Read the Most Significant Word (MSW) (18) */ + val_h = phy_base_read(phydev, MSCC_EXT_PAGE_CSR_CNTL_18); + + phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, + MSCC_PHY_PAGE_STANDARD); + + return (val_h << 16) | val_l; +} + +int vsc85xx_csr_write(struct phy_device *phydev, + enum csr_target target, u32 reg, u32 val) +{ + unsigned long deadline; + + phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_CSR_CNTL); + + /* CSR registers are grouped under different Target IDs. + * 6-bit Target_ID is split between MSCC_EXT_PAGE_CSR_CNTL_20 and + * MSCC_EXT_PAGE_CSR_CNTL_19 registers. + * Target_ID[5:2] maps to bits[3:0] of MSCC_EXT_PAGE_CSR_CNTL_20 + * and Target_ID[1:0] maps to bits[13:12] of MSCC_EXT_PAGE_CSR_CNTL_19. + */ + + /* Setup the Target ID */ + phy_base_write(phydev, MSCC_EXT_PAGE_CSR_CNTL_20, + MSCC_PHY_CSR_CNTL_20_TARGET(target >> 2)); + + /* Write the Least Significant Word (LSW) (17) */ + phy_base_write(phydev, MSCC_EXT_PAGE_CSR_CNTL_17, (u16)val); + + /* Write the Most Significant Word (MSW) (18) */ + phy_base_write(phydev, MSCC_EXT_PAGE_CSR_CNTL_18, (u16)(val >> 16)); + + if ((target >> 2 == 0x1) || (target >> 2 == 0x3)) + /* non-MACsec access */ + target &= 0x3; + else + target = 0; + + /* Trigger CSR Action - Write into the CSR's */ + phy_base_write(phydev, MSCC_EXT_PAGE_CSR_CNTL_19, + MSCC_PHY_CSR_CNTL_19_CMD | + MSCC_PHY_CSR_CNTL_19_REG_ADDR(reg) | + MSCC_PHY_CSR_CNTL_19_TARGET(target)); + + /* Wait for register access */ + deadline = jiffies + msecs_to_jiffies(PROC_CMD_NCOMPLETED_TIMEOUT_MS); + do { + usleep_range(500, 1000); + val = phy_base_read(phydev, MSCC_EXT_PAGE_CSR_CNTL_19); + } while (time_before(jiffies, deadline) && + !(val & MSCC_PHY_CSR_CNTL_19_CMD)); + + if (!(val & MSCC_PHY_CSR_CNTL_19_CMD)) + return -ETIMEDOUT; + + phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, + MSCC_PHY_PAGE_STANDARD); + + return 0; +} + /* bus->mdio_lock should be locked when using this function */ static void vsc8584_csr_write(struct phy_device *phydev, u16 addr, u32 val) { @@ -719,7 +826,7 @@ static void vsc8584_csr_write(struct phy_device *phydev, u16 addr, u32 val) } /* bus->mdio_lock should be locked when using this function */ -static int vsc8584_cmd(struct phy_device *phydev, u16 val) +int vsc8584_cmd(struct phy_device *phydev, u16 val) { unsigned long deadline; u16 reg_val; @@ -1131,6 +1238,92 @@ out: return ret; } +/* Access LCPLL Cfg_2 */ +static void vsc8584_pll5g_cfg2_wr(struct phy_device *phydev, + bool disable_fsm) +{ + u32 rd_dat; + + rd_dat = vsc85xx_csr_read(phydev, MACRO_CTRL, PHY_S6G_PLL5G_CFG2); + rd_dat &= 
~BIT(PHY_S6G_CFG2_FSM_DIS); + rd_dat |= (disable_fsm << PHY_S6G_CFG2_FSM_DIS); + vsc85xx_csr_write(phydev, MACRO_CTRL, PHY_S6G_PLL5G_CFG2, rd_dat); +} + +/* trigger a read to the specified MCB */ +static int vsc8584_mcb_rd_trig(struct phy_device *phydev, + u32 mcb_reg_addr, u8 mcb_slave_num) +{ + u32 rd_dat = 0; + + /* read MCB */ + vsc85xx_csr_write(phydev, MACRO_CTRL, mcb_reg_addr, + (0x40000000 | (1L << mcb_slave_num))); + + return read_poll_timeout(vsc85xx_csr_read, rd_dat, + !(rd_dat & 0x40000000), + 4000, 200000, 0, + phydev, MACRO_CTRL, mcb_reg_addr); +} + +/* trigger a write to the specified MCB */ +static int vsc8584_mcb_wr_trig(struct phy_device *phydev, + u32 mcb_reg_addr, + u8 mcb_slave_num) +{ + u32 rd_dat = 0; + + /* write back MCB */ + vsc85xx_csr_write(phydev, MACRO_CTRL, mcb_reg_addr, + (0x80000000 | (1L << mcb_slave_num))); + + return read_poll_timeout(vsc85xx_csr_read, rd_dat, + !(rd_dat & 0x80000000), + 4000, 200000, 0, + phydev, MACRO_CTRL, mcb_reg_addr); +} + +/* Sequence to Reset LCPLL for the VIPER and ELISE PHY */ +static int vsc8584_pll5g_reset(struct phy_device *phydev) +{ + bool dis_fsm; + int ret = 0; + + ret = vsc8584_mcb_rd_trig(phydev, 0x11, 0); + if (ret < 0) + goto done; + dis_fsm = 1; + + /* Reset LCPLL */ + vsc8584_pll5g_cfg2_wr(phydev, dis_fsm); + + /* write back LCPLL MCB */ + ret = vsc8584_mcb_wr_trig(phydev, 0x11, 0); + if (ret < 0) + goto done; + + /* 10 mSec sleep while LCPLL is held in reset */ + usleep_range(10000, 20000); + + /* read LCPLL MCB into CSRs */ + ret = vsc8584_mcb_rd_trig(phydev, 0x11, 0); + if (ret < 0) + goto done; + dis_fsm = 0; + + /* Release the Reset of LCPLL */ + vsc8584_pll5g_cfg2_wr(phydev, dis_fsm); + + /* write back LCPLL MCB */ + ret = vsc8584_mcb_wr_trig(phydev, 0x11, 0); + if (ret < 0) + goto done; + + usleep_range(110000, 200000); +done: + return ret; +} + /* bus->mdio_lock should be locked when using this function */ static int vsc8584_config_pre_init(struct phy_device *phydev) { @@ -1323,6 +1516,21 @@ static void vsc8584_get_base_addr(struct phy_device *phydev) vsc8531->addr = addr; } +static void vsc85xx_coma_mode_release(struct phy_device *phydev) +{ + /* The coma mode (pin or reg) provides an optional feature that + * may be used to control when the PHYs become active. + * Alternatively the COMA_MODE pin may be connected low + * so that the PHYs are fully active once out of reset.
+ */ + + /* Enable output (mode=0) and write zero to it */ + vsc85xx_phy_write_page(phydev, MSCC_PHY_PAGE_EXTENDED_GPIO); + __phy_modify(phydev, MSCC_PHY_GPIO_CONTROL_2, + MSCC_PHY_COMA_MODE | MSCC_PHY_COMA_OUTPUT, 0); + vsc85xx_phy_write_page(phydev, MSCC_PHY_PAGE_STANDARD); +} + static int vsc8584_config_init(struct phy_device *phydev) { struct vsc8531_private *vsc8531 = phydev->priv; @@ -1541,6 +1749,100 @@ static int vsc85xx_config_init(struct phy_device *phydev) return 0; } +static int __phy_write_mcb_s6g(struct phy_device *phydev, u32 reg, u8 mcb, + u32 op) +{ + unsigned long deadline; + u32 val; + int ret; + + ret = vsc85xx_csr_write(phydev, PHY_MCB_TARGET, reg, + op | (1 << mcb)); + if (ret) + return -EINVAL; + + deadline = jiffies + msecs_to_jiffies(PROC_CMD_NCOMPLETED_TIMEOUT_MS); + do { + usleep_range(500, 1000); + val = vsc85xx_csr_read(phydev, PHY_MCB_TARGET, reg); + + if (val == 0xffffffff) + return -EIO; + + } while (time_before(jiffies, deadline) && (val & op)); + + if (val & op) + return -ETIMEDOUT; + + return 0; +} + +/* Trigger a read to the specified MCB */ +int phy_update_mcb_s6g(struct phy_device *phydev, u32 reg, u8 mcb) +{ + return __phy_write_mcb_s6g(phydev, reg, mcb, PHY_MCB_S6G_READ); +} + +/* Trigger a write to the specified MCB */ +int phy_commit_mcb_s6g(struct phy_device *phydev, u32 reg, u8 mcb) +{ + return __phy_write_mcb_s6g(phydev, reg, mcb, PHY_MCB_S6G_WRITE); +} + +static int vsc8514_config_host_serdes(struct phy_device *phydev) +{ + int ret; + u16 val; + + ret = phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, + MSCC_PHY_PAGE_EXTENDED_GPIO); + if (ret) + return ret; + + val = phy_base_read(phydev, MSCC_PHY_MAC_CFG_FASTLINK); + val &= ~MAC_CFG_MASK; + val |= MAC_CFG_QSGMII; + ret = phy_base_write(phydev, MSCC_PHY_MAC_CFG_FASTLINK, val); + if (ret) + return ret; + + ret = phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, + MSCC_PHY_PAGE_STANDARD); + if (ret) + return ret; + + ret = vsc8584_cmd(phydev, PROC_CMD_NOP); + if (ret) + return ret; + + ret = vsc8584_cmd(phydev, + PROC_CMD_MCB_ACCESS_MAC_CONF | + PROC_CMD_RST_CONF_PORT | + PROC_CMD_READ_MOD_WRITE_PORT | PROC_CMD_QSGMII_MAC); + if (ret) { + dev_err(&phydev->mdio.dev, "%s: QSGMII error: %d\n", + __func__, ret); + return ret; + } + + /* Apply 6G SerDes FOJI Algorithm + * Initial condition requirement: + * 1. hold 8051 in reset + * 2. disable patch vector 0, in order to allow IB cal poll during FoJi + * 3. deassert 8051 reset after change patch vector status + * 4. 
proceed with FoJi (vsc85xx_sd6g_config_v2) + */ + vsc8584_micro_assert_reset(phydev); + val = phy_base_read(phydev, MSCC_INT_MEM_CNTL); + /* clear bit 8, to disable patch vector 0 */ + val &= ~PATCH_VEC_ZERO_EN; + ret = phy_base_write(phydev, MSCC_INT_MEM_CNTL, val); + /* Enable 8051 clock, don't set patch present, disable PRAM clock override */ + vsc8584_micro_deassert_reset(phydev, false); + + return vsc85xx_sd6g_config_v2(phydev); +} + static int vsc8514_config_pre_init(struct phy_device *phydev) { /* These are the settings to override the silicon default @@ -1569,8 +1871,16 @@ static int vsc8514_config_pre_init(struct phy_device *phydev) {0x16b2, 0x00007000}, {0x16b4, 0x00000814}, }; + struct device *dev = &phydev->mdio.dev; unsigned int i; u16 reg; + int ret; + + ret = vsc8584_pll5g_reset(phydev); + if (ret < 0) { + dev_err(dev, "failed LCPLL reset, ret: %d\n", ret); + return ret; + } phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_STANDARD); @@ -1602,151 +1912,48 @@ static int vsc8514_config_pre_init(struct phy_device *phydev) reg &= ~SMI_BROADCAST_WR_EN; phy_base_write(phydev, MSCC_PHY_EXT_CNTL_STATUS, reg); - return 0; -} - -static u32 vsc85xx_csr_ctrl_phy_read(struct phy_device *phydev, - u32 target, u32 reg) -{ - unsigned long deadline; - u32 val, val_l, val_h; - - phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_CSR_CNTL); - - /* CSR registers are grouped under different Target IDs. - * 6-bit Target_ID is split between MSCC_EXT_PAGE_CSR_CNTL_20 and - * MSCC_EXT_PAGE_CSR_CNTL_19 registers. - * Target_ID[5:2] maps to bits[3:0] of MSCC_EXT_PAGE_CSR_CNTL_20 - * and Target_ID[1:0] maps to bits[13:12] of MSCC_EXT_PAGE_CSR_CNTL_19. - */ - - /* Setup the Target ID */ - phy_base_write(phydev, MSCC_EXT_PAGE_CSR_CNTL_20, - MSCC_PHY_CSR_CNTL_20_TARGET(target >> 2)); - - /* Trigger CSR Action - Read into the CSR's */ - phy_base_write(phydev, MSCC_EXT_PAGE_CSR_CNTL_19, - MSCC_PHY_CSR_CNTL_19_CMD | MSCC_PHY_CSR_CNTL_19_READ | - MSCC_PHY_CSR_CNTL_19_REG_ADDR(reg) | - MSCC_PHY_CSR_CNTL_19_TARGET(target & 0x3)); - - /* Wait for register access*/ - deadline = jiffies + msecs_to_jiffies(PROC_CMD_NCOMPLETED_TIMEOUT_MS); - do { - usleep_range(500, 1000); - val = phy_base_read(phydev, MSCC_EXT_PAGE_CSR_CNTL_19); - } while (time_before(jiffies, deadline) && - !(val & MSCC_PHY_CSR_CNTL_19_CMD)); - - if (!(val & MSCC_PHY_CSR_CNTL_19_CMD)) - return 0xffffffff; - - /* Read the Least Significant Word (LSW) (17) */ - val_l = phy_base_read(phydev, MSCC_EXT_PAGE_CSR_CNTL_17); - - /* Read the Most Significant Word (MSW) (18) */ - val_h = phy_base_read(phydev, MSCC_EXT_PAGE_CSR_CNTL_18); - - phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, - MSCC_PHY_PAGE_STANDARD); - - return (val_h << 16) | val_l; -} - -static int vsc85xx_csr_ctrl_phy_write(struct phy_device *phydev, - u32 target, u32 reg, u32 val) -{ - unsigned long deadline; - - phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_CSR_CNTL); - - /* CSR registers are grouped under different Target IDs. - * 6-bit Target_ID is split between MSCC_EXT_PAGE_CSR_CNTL_20 and - * MSCC_EXT_PAGE_CSR_CNTL_19 registers. - * Target_ID[5:2] maps to bits[3:0] of MSCC_EXT_PAGE_CSR_CNTL_20 - * and Target_ID[1:0] maps to bits[13:12] of MSCC_EXT_PAGE_CSR_CNTL_19. + /* Add pre-patching commands to: + * 1. enable 8051 clock, operate 8051 clock at 125 MHz + * instead of HW default 62.5MHz + * 2. 
write patch vector 0, to skip IB cal polling executed + * as part of the 0x80E0 ROM command */ + vsc8584_micro_deassert_reset(phydev, false); - /* Setup the Target ID */ - phy_base_write(phydev, MSCC_EXT_PAGE_CSR_CNTL_20, - MSCC_PHY_CSR_CNTL_20_TARGET(target >> 2)); - - /* Write the Least Significant Word (LSW) (17) */ - phy_base_write(phydev, MSCC_EXT_PAGE_CSR_CNTL_17, (u16)val); - - /* Write the Most Significant Word (MSW) (18) */ - phy_base_write(phydev, MSCC_EXT_PAGE_CSR_CNTL_18, (u16)(val >> 16)); - - /* Trigger CSR Action - Write into the CSR's */ - phy_base_write(phydev, MSCC_EXT_PAGE_CSR_CNTL_19, - MSCC_PHY_CSR_CNTL_19_CMD | - MSCC_PHY_CSR_CNTL_19_REG_ADDR(reg) | - MSCC_PHY_CSR_CNTL_19_TARGET(target & 0x3)); - - /* Wait for register access */ - deadline = jiffies + msecs_to_jiffies(PROC_CMD_NCOMPLETED_TIMEOUT_MS); - do { - usleep_range(500, 1000); - val = phy_base_read(phydev, MSCC_EXT_PAGE_CSR_CNTL_19); - } while (time_before(jiffies, deadline) && - !(val & MSCC_PHY_CSR_CNTL_19_CMD)); - - if (!(val & MSCC_PHY_CSR_CNTL_19_CMD)) - return -ETIMEDOUT; - + vsc8584_micro_assert_reset(phydev); phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, - MSCC_PHY_PAGE_STANDARD); - - return 0; -} - -static int __phy_write_mcb_s6g(struct phy_device *phydev, u32 reg, u8 mcb, - u32 op) -{ - unsigned long deadline; - u32 val; - int ret; - - ret = vsc85xx_csr_ctrl_phy_write(phydev, PHY_MCB_TARGET, reg, - op | (1 << mcb)); + MSCC_PHY_PAGE_EXTENDED_GPIO); + /* ROM address to trap, for patch vector 0 */ + reg = MSCC_ROM_TRAP_SERDES_6G_CFG; + ret = phy_base_write(phydev, MSCC_TRAP_ROM_ADDR(1), reg); if (ret) - return -EINVAL; - - deadline = jiffies + msecs_to_jiffies(PROC_CMD_NCOMPLETED_TIMEOUT_MS); - do { - usleep_range(500, 1000); - val = vsc85xx_csr_ctrl_phy_read(phydev, PHY_MCB_TARGET, reg); - - if (val == 0xffffffff) - return -EIO; - - } while (time_before(jiffies, deadline) && (val & op)); - - if (val & op) - return -ETIMEDOUT; - - return 0; -} - -/* Trigger a read to the specified MCB */ -static int phy_update_mcb_s6g(struct phy_device *phydev, u32 reg, u8 mcb) -{ - return __phy_write_mcb_s6g(phydev, reg, mcb, PHY_MCB_S6G_READ); -} + goto err; + /* RAM address to jump to, when patch vector 0 enabled */ + reg = MSCC_RAM_TRAP_SERDES_6G_CFG; + ret = phy_base_write(phydev, MSCC_PATCH_RAM_ADDR(1), reg); + if (ret) + goto err; + reg = phy_base_read(phydev, MSCC_INT_MEM_CNTL); + reg |= PATCH_VEC_ZERO_EN; /* bit 8, enable patch vector 0 */ + ret = phy_base_write(phydev, MSCC_INT_MEM_CNTL, reg); + if (ret) + goto err; -/* Trigger a write to the specified MCB */ -static int phy_commit_mcb_s6g(struct phy_device *phydev, u32 reg, u8 mcb) -{ - return __phy_write_mcb_s6g(phydev, reg, mcb, PHY_MCB_S6G_WRITE); + /* Enable 8051 clock, don't set patch present + * yet, disable PRAM clock override + */ + vsc8584_micro_deassert_reset(phydev, false); + return ret; + err: + /* restore 8051 and bail w error */ + vsc8584_micro_deassert_reset(phydev, false); + return ret; } static int vsc8514_config_init(struct phy_device *phydev) { struct vsc8531_private *vsc8531 = phydev->priv; - unsigned long deadline; int ret, i; - u16 val; - u32 reg; phydev->mdix_ctrl = ETH_TP_MDI_AUTO; @@ -1763,123 +1970,14 @@ static int vsc8514_config_init(struct phy_device *phydev) * do the correct init sequence for all PHYs that are package-critical * in this pre-init function. 
*/ - if (phy_package_init_once(phydev)) - vsc8514_config_pre_init(phydev); - - ret = phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, - MSCC_PHY_PAGE_EXTENDED_GPIO); - if (ret) - goto err; - - val = phy_base_read(phydev, MSCC_PHY_MAC_CFG_FASTLINK); - - val &= ~MAC_CFG_MASK; - val |= MAC_CFG_QSGMII; - ret = phy_base_write(phydev, MSCC_PHY_MAC_CFG_FASTLINK, val); - if (ret) - goto err; - - ret = phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, - MSCC_PHY_PAGE_STANDARD); - if (ret) - goto err; - - ret = vsc8584_cmd(phydev, - PROC_CMD_MCB_ACCESS_MAC_CONF | - PROC_CMD_RST_CONF_PORT | - PROC_CMD_READ_MOD_WRITE_PORT | PROC_CMD_QSGMII_MAC); - if (ret) - goto err; - - /* 6g mcb */ - phy_update_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); - /* lcpll mcb */ - phy_update_mcb_s6g(phydev, PHY_S6G_LCPLL_CFG, 0); - /* pll5gcfg0 */ - ret = vsc85xx_csr_ctrl_phy_write(phydev, PHY_MCB_TARGET, - PHY_S6G_PLL5G_CFG0, 0x7036f145); - if (ret) - goto err; - - phy_commit_mcb_s6g(phydev, PHY_S6G_LCPLL_CFG, 0); - /* pllcfg */ - ret = vsc85xx_csr_ctrl_phy_write(phydev, PHY_MCB_TARGET, - PHY_S6G_PLL_CFG, - (3 << PHY_S6G_PLL_ENA_OFFS_POS) | - (120 << PHY_S6G_PLL_FSM_CTRL_DATA_POS) - | (0 << PHY_S6G_PLL_FSM_ENA_POS)); - if (ret) - goto err; - - /* commoncfg */ - ret = vsc85xx_csr_ctrl_phy_write(phydev, PHY_MCB_TARGET, - PHY_S6G_COMMON_CFG, - (0 << PHY_S6G_SYS_RST_POS) | - (0 << PHY_S6G_ENA_LANE_POS) | - (0 << PHY_S6G_ENA_LOOP_POS) | - (0 << PHY_S6G_QRATE_POS) | - (3 << PHY_S6G_IF_MODE_POS)); - if (ret) - goto err; - - /* misccfg */ - ret = vsc85xx_csr_ctrl_phy_write(phydev, PHY_MCB_TARGET, - PHY_S6G_MISC_CFG, 1); - if (ret) - goto err; - - /* gpcfg */ - ret = vsc85xx_csr_ctrl_phy_write(phydev, PHY_MCB_TARGET, - PHY_S6G_GPC_CFG, 768); - if (ret) - goto err; - - phy_commit_mcb_s6g(phydev, PHY_S6G_DFT_CFG2, 0); - - deadline = jiffies + msecs_to_jiffies(PROC_CMD_NCOMPLETED_TIMEOUT_MS); - do { - usleep_range(500, 1000); - phy_update_mcb_s6g(phydev, PHY_MCB_S6G_CFG, - 0); /* read 6G MCB into CSRs */ - reg = vsc85xx_csr_ctrl_phy_read(phydev, PHY_MCB_TARGET, - PHY_S6G_PLL_STATUS); - if (reg == 0xffffffff) { - phy_unlock_mdio_bus(phydev); - return -EIO; - } - - } while (time_before(jiffies, deadline) && (reg & BIT(12))); - - if (reg & BIT(12)) { - phy_unlock_mdio_bus(phydev); - return -ETIMEDOUT; - } - - /* misccfg */ - ret = vsc85xx_csr_ctrl_phy_write(phydev, PHY_MCB_TARGET, - PHY_S6G_MISC_CFG, 0); - if (ret) - goto err; - - phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); - - deadline = jiffies + msecs_to_jiffies(PROC_CMD_NCOMPLETED_TIMEOUT_MS); - do { - usleep_range(500, 1000); - phy_update_mcb_s6g(phydev, PHY_MCB_S6G_CFG, - 0); /* read 6G MCB into CSRs */ - reg = vsc85xx_csr_ctrl_phy_read(phydev, PHY_MCB_TARGET, - PHY_S6G_IB_STATUS0); - if (reg == 0xffffffff) { - phy_unlock_mdio_bus(phydev); - return -EIO; - } - - } while (time_before(jiffies, deadline) && !(reg & BIT(8))); - - if (!(reg & BIT(8))) { - phy_unlock_mdio_bus(phydev); - return -ETIMEDOUT; + if (phy_package_init_once(phydev)) { + ret = vsc8514_config_pre_init(phydev); + if (ret) + goto err; + ret = vsc8514_config_host_serdes(phydev); + if (ret) + goto err; + vsc85xx_coma_mode_release(phydev); } phy_unlock_mdio_bus(phydev); diff --git a/drivers/net/phy/mscc/mscc_serdes.c b/drivers/net/phy/mscc/mscc_serdes.c new file mode 100644 index 000000000000..b3e854f53d67 --- /dev/null +++ b/drivers/net/phy/mscc/mscc_serdes.c @@ -0,0 +1,650 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* + * Driver for Microsemi VSC85xx PHYs + * + * Author: Bjarni Jonasson <[email protected]> + * License: 
Dual MIT/GPL + * Copyright (c) 2021 Microsemi Corporation + */ + +#include <linux/phy.h> +#include "mscc_serdes.h" +#include "mscc.h" + +static int pll5g_detune(struct phy_device *phydev) +{ + u32 rd_dat; + int ret; + + rd_dat = vsc85xx_csr_read(phydev, MACRO_CTRL, PHY_S6G_PLL5G_CFG2); + rd_dat &= ~PHY_S6G_PLL5G_CFG2_GAIN_MASK; + rd_dat |= PHY_S6G_PLL5G_CFG2_ENA_GAIN; + ret = vsc85xx_csr_write(phydev, MACRO_CTRL, + PHY_S6G_PLL5G_CFG2, rd_dat); + if (ret) + dev_err(&phydev->mdio.dev, "%s: write error\n", __func__); + return ret; +} + +static int pll5g_tune(struct phy_device *phydev) +{ + u32 rd_dat; + int ret; + + rd_dat = vsc85xx_csr_read(phydev, MACRO_CTRL, PHY_S6G_PLL5G_CFG2); + rd_dat &= ~PHY_S6G_PLL5G_CFG2_ENA_GAIN; + ret = vsc85xx_csr_write(phydev, MACRO_CTRL, + PHY_S6G_PLL5G_CFG2, rd_dat); + if (ret) + dev_err(&phydev->mdio.dev, "%s: write error\n", __func__); + return ret; +} + +static int vsc85xx_sd6g_pll_cfg_wr(struct phy_device *phydev, + const u32 pll_ena_offs, + const u32 pll_fsm_ctrl_data, + const u32 pll_fsm_ena) +{ + int ret; + + ret = vsc85xx_csr_write(phydev, MACRO_CTRL, + PHY_S6G_PLL_CFG, + (pll_fsm_ena << PHY_S6G_PLL_ENA_OFFS_POS) | + (pll_fsm_ctrl_data << PHY_S6G_PLL_FSM_CTRL_DATA_POS) | + (pll_ena_offs << PHY_S6G_PLL_FSM_ENA_POS)); + if (ret) + dev_err(&phydev->mdio.dev, "%s: write error\n", __func__); + return ret; +} + +static int vsc85xx_sd6g_common_cfg_wr(struct phy_device *phydev, + const u32 sys_rst, + const u32 ena_lane, + const u32 ena_loop, + const u32 qrate, + const u32 if_mode, + const u32 pwd_tx) +{ + /* ena_loop = 8 for eloop */ + /* = 4 for floop */ + /* = 2 for iloop */ + /* = 1 for ploop */ + /* qrate = 1 for SGMII, 0 for QSGMII */ + /* if_mode = 1 for SGMII, 3 for QSGMII */ + + int ret; + + ret = vsc85xx_csr_write(phydev, MACRO_CTRL, + PHY_S6G_COMMON_CFG, + (sys_rst << PHY_S6G_SYS_RST_POS) | + (ena_lane << PHY_S6G_ENA_LANE_POS) | + (ena_loop << PHY_S6G_ENA_LOOP_POS) | + (qrate << PHY_S6G_QRATE_POS) | + (if_mode << PHY_S6G_IF_MODE_POS)); + if (ret) + dev_err(&phydev->mdio.dev, "%s: write error\n", __func__); + return ret; +} + +static int vsc85xx_sd6g_des_cfg_wr(struct phy_device *phydev, + const u32 des_phy_ctrl, + const u32 des_mbtr_ctrl, + const u32 des_bw_hyst, + const u32 des_bw_ana, + const u32 des_cpmd_sel) +{ + u32 reg_val; + int ret; + + /* configurable terms */ + reg_val = (des_phy_ctrl << PHY_S6G_DES_PHY_CTRL_POS) | + (des_mbtr_ctrl << PHY_S6G_DES_MBTR_CTRL_POS) | + (des_cpmd_sel << PHY_S6G_DES_CPMD_SEL_POS) | + (des_bw_hyst << PHY_S6G_DES_BW_HYST_POS) | + (des_bw_ana << PHY_S6G_DES_BW_ANA_POS); + ret = vsc85xx_csr_write(phydev, MACRO_CTRL, + PHY_S6G_DES_CFG, + reg_val); + if (ret) + dev_err(&phydev->mdio.dev, "%s: write error\n", __func__); + return ret; +} + +static int vsc85xx_sd6g_ib_cfg0_wr(struct phy_device *phydev, + const u32 ib_rtrm_adj, + const u32 ib_sig_det_clk_sel, + const u32 ib_reg_pat_sel_offset, + const u32 ib_cal_ena) +{ + u32 base_val; + u32 reg_val; + int ret; + + /* constant terms */ + base_val = 0x60a85837; + /* configurable terms */ + reg_val = base_val | (ib_rtrm_adj << 25) | + (ib_sig_det_clk_sel << 16) | + (ib_reg_pat_sel_offset << 8) | + (ib_cal_ena << 3); + ret = vsc85xx_csr_write(phydev, MACRO_CTRL, + PHY_S6G_IB_CFG0, + reg_val); + if (ret) + dev_err(&phydev->mdio.dev, "%s: write error\n", __func__); + return ret; +} + +static int vsc85xx_sd6g_ib_cfg1_wr(struct phy_device *phydev, + const u32 ib_tjtag, + const u32 ib_tsdet, + const u32 ib_scaly, + const u32 ib_frc_offset, + const u32 ib_filt_offset) +{ + u32 
ib_filt_val; + u32 reg_val = 0; + int ret; + + /* constant terms */ + ib_filt_val = 0xe0; + /* configurable terms */ + reg_val = (ib_tjtag << 17) + (ib_tsdet << 12) + (ib_scaly << 8) + + ib_filt_val + (ib_filt_offset << 4) + (ib_frc_offset << 0); + ret = vsc85xx_csr_write(phydev, MACRO_CTRL, + PHY_S6G_IB_CFG1, + reg_val); + if (ret) + dev_err(&phydev->mdio.dev, "%s: write error\n", __func__); + return ret; +} + +static int vsc85xx_sd6g_ib_cfg2_wr(struct phy_device *phydev, + const u32 ib_tinfv, + const u32 ib_tcalv, + const u32 ib_ureg) +{ + u32 ib_cfg2_val; + u32 base_val; + int ret; + + /* constant terms */ + base_val = 0x0f878010; + /* configurable terms */ + ib_cfg2_val = base_val | ((ib_tinfv) << 28) | ((ib_tcalv) << 5) | + (ib_ureg << 0); + ret = vsc85xx_csr_write(phydev, MACRO_CTRL, + PHY_S6G_IB_CFG2, + ib_cfg2_val); + if (ret) + dev_err(&phydev->mdio.dev, "%s: write error\n", __func__); + return ret; +} + +static int vsc85xx_sd6g_ib_cfg3_wr(struct phy_device *phydev, + const u32 ib_ini_hp, + const u32 ib_ini_mid, + const u32 ib_ini_lp, + const u32 ib_ini_offset) +{ + u32 reg_val; + int ret; + + reg_val = (ib_ini_hp << 24) + (ib_ini_mid << 16) + + (ib_ini_lp << 8) + (ib_ini_offset << 0); + ret = vsc85xx_csr_write(phydev, MACRO_CTRL, + PHY_S6G_IB_CFG3, + reg_val); + if (ret) + dev_err(&phydev->mdio.dev, "%s: write error\n", __func__); + return ret; +} + +static int vsc85xx_sd6g_ib_cfg4_wr(struct phy_device *phydev, + const u32 ib_max_hp, + const u32 ib_max_mid, + const u32 ib_max_lp, + const u32 ib_max_offset) +{ + u32 reg_val; + int ret; + + reg_val = (ib_max_hp << 24) + (ib_max_mid << 16) + + (ib_max_lp << 8) + (ib_max_offset << 0); + ret = vsc85xx_csr_write(phydev, MACRO_CTRL, + PHY_S6G_IB_CFG4, + reg_val); + if (ret) + dev_err(&phydev->mdio.dev, "%s: write error\n", __func__); + return ret; +} + +static int vsc85xx_sd6g_misc_cfg_wr(struct phy_device *phydev, + const u32 lane_rst) +{ + int ret; + + ret = vsc85xx_csr_write(phydev, MACRO_CTRL, + PHY_S6G_MISC_CFG, + lane_rst); + if (ret) + dev_err(&phydev->mdio.dev, "%s: write error\n", __func__); + return ret; +} + +static int vsc85xx_sd6g_gp_cfg_wr(struct phy_device *phydev, const u32 gp_cfg_val) +{ + int ret; + + ret = vsc85xx_csr_write(phydev, MACRO_CTRL, + PHY_S6G_GP_CFG, + gp_cfg_val); + if (ret) + dev_err(&phydev->mdio.dev, "%s: write error\n", __func__); + return ret; +} + +static int vsc85xx_sd6g_dft_cfg2_wr(struct phy_device *phydev, + const u32 rx_ji_ampl, + const u32 rx_step_freq, + const u32 rx_ji_ena, + const u32 rx_waveform_sel, + const u32 rx_freqoff_dir, + const u32 rx_freqoff_ena) +{ + u32 reg_val; + int ret; + + /* configurable terms */ + reg_val = (rx_ji_ampl << 8) | (rx_step_freq << 4) | + (rx_ji_ena << 3) | (rx_waveform_sel << 2) | + (rx_freqoff_dir << 1) | rx_freqoff_ena; + ret = vsc85xx_csr_write(phydev, MACRO_CTRL, + PHY_S6G_IB_DFT_CFG2, + reg_val); + if (ret) + dev_err(&phydev->mdio.dev, "%s: write error\n", __func__); + return ret; +} + +static int vsc85xx_sd6g_dft_cfg0_wr(struct phy_device *phydev, + const u32 prbs_sel, + const u32 test_mode, + const u32 rx_dft_ena) +{ + u32 reg_val; + int ret; + + /* configurable terms */ + reg_val = (prbs_sel << 20) | (test_mode << 16) | (rx_dft_ena << 2); + ret = vsc85xx_csr_write(phydev, MACRO_CTRL, + PHY_S6G_DFT_CFG0, + reg_val); + if (ret) + dev_err(&phydev->mdio.dev, "%s: write error\n", __func__); + return ret; +} + +/* Access LCPLL Cfg_0 */ +static int vsc85xx_pll5g_cfg0_wr(struct phy_device *phydev, + const u32 selbgv820) +{ + u32 base_val; + u32 reg_val; + int 
ret; + + /* constant terms */ + base_val = 0x7036f145; + /* configurable terms */ + reg_val = base_val | (selbgv820 << 23); + ret = vsc85xx_csr_write(phydev, MACRO_CTRL, + PHY_S6G_PLL5G_CFG0, reg_val); + if (ret) + dev_err(&phydev->mdio.dev, "%s: write error\n", __func__); + return ret; +} + +int vsc85xx_sd6g_config_v2(struct phy_device *phydev) +{ + u32 ib_sig_det_clk_sel_cal = 0; + u32 ib_sig_det_clk_sel_mm = 7; + u32 pll_fsm_ctrl_data = 60; + unsigned long deadline; + u32 des_bw_ana_val = 3; + u32 ib_tsdet_cal = 16; + u32 ib_tsdet_mm = 5; + u32 ib_rtrm_adj; + u32 if_mode = 1; + u32 gp_iter = 5; + u32 val32 = 0; + u32 qrate = 1; + u32 iter; + int val = 0; + int ret; + + phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_STANDARD); + + /* Detune/Unlock LCPLL */ + ret = pll5g_detune(phydev); + if (ret) + return ret; + + /* 0. Reset RCPLL */ + ret = vsc85xx_sd6g_pll_cfg_wr(phydev, 3, pll_fsm_ctrl_data, 0); + if (ret) + return ret; + ret = vsc85xx_sd6g_common_cfg_wr(phydev, 0, 0, 0, qrate, if_mode, 0); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + ret = vsc85xx_sd6g_des_cfg_wr(phydev, 6, 2, 5, des_bw_ana_val, 0); + if (ret) + return ret; + + /* 1. Configure sd6g for SGMII prior to sd6g_IB_CAL */ + ib_rtrm_adj = 13; + ret = vsc85xx_sd6g_ib_cfg0_wr(phydev, ib_rtrm_adj, ib_sig_det_clk_sel_mm, 0, 0); + if (ret) + return ret; + ret = vsc85xx_sd6g_ib_cfg1_wr(phydev, 8, ib_tsdet_mm, 15, 0, 1); + if (ret) + return ret; + ret = vsc85xx_sd6g_ib_cfg2_wr(phydev, 3, 13, 5); + if (ret) + return ret; + ret = vsc85xx_sd6g_ib_cfg3_wr(phydev, 0, 31, 1, 31); + if (ret) + return ret; + ret = vsc85xx_sd6g_ib_cfg4_wr(phydev, 63, 63, 2, 63); + if (ret) + return ret; + ret = vsc85xx_sd6g_common_cfg_wr(phydev, 1, 1, 0, qrate, if_mode, 0); + if (ret) + return ret; + ret = vsc85xx_sd6g_misc_cfg_wr(phydev, 1); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + + /* 2. Start rcpll_fsm */ + ret = vsc85xx_sd6g_pll_cfg_wr(phydev, 3, pll_fsm_ctrl_data, 1); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + + deadline = jiffies + msecs_to_jiffies(PROC_CMD_NCOMPLETED_TIMEOUT_MS); + do { + usleep_range(500, 1000); + ret = phy_update_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + val32 = vsc85xx_csr_read(phydev, MACRO_CTRL, + PHY_S6G_PLL_STATUS); + /* wait for bit 12 to clear */ + } while (time_before(jiffies, deadline) && (val32 & BIT(12))); + + if (val32 & BIT(12)) + return -ETIMEDOUT; + + /* 4. Release digital reset and disable transmitter */ + ret = vsc85xx_sd6g_misc_cfg_wr(phydev, 0); + if (ret) + return ret; + ret = vsc85xx_sd6g_common_cfg_wr(phydev, 1, 1, 0, qrate, if_mode, 1); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + + /* 5. Apply a frequency offset on RX-side (using internal FoJi logic) */ + ret = vsc85xx_sd6g_gp_cfg_wr(phydev, 768); + if (ret) + return ret; + ret = vsc85xx_sd6g_dft_cfg2_wr(phydev, 0, 2, 0, 0, 0, 1); + if (ret) + return ret; + ret = vsc85xx_sd6g_dft_cfg0_wr(phydev, 0, 0, 1); + if (ret) + return ret; + ret = vsc85xx_sd6g_des_cfg_wr(phydev, 6, 2, 5, des_bw_ana_val, 2); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + + /* 6. 
Prepare required settings for IBCAL */ + ret = vsc85xx_sd6g_ib_cfg1_wr(phydev, 8, ib_tsdet_cal, 15, 1, 0); + if (ret) + return ret; + ret = vsc85xx_sd6g_ib_cfg0_wr(phydev, ib_rtrm_adj, ib_sig_det_clk_sel_cal, 0, 0); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + + /* 7. Start IB_CAL */ + ret = vsc85xx_sd6g_ib_cfg0_wr(phydev, ib_rtrm_adj, + ib_sig_det_clk_sel_cal, 0, 1); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + /* 11 cycles (for ViperA) or 5 cycles (for ViperB & Elise) w/ SW clock */ + for (iter = 0; iter < gp_iter; iter++) { + /* set gp(0) */ + ret = vsc85xx_sd6g_gp_cfg_wr(phydev, 769); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + /* clear gp(0) */ + ret = vsc85xx_sd6g_gp_cfg_wr(phydev, 768); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + } + + ret = vsc85xx_sd6g_ib_cfg1_wr(phydev, 8, ib_tsdet_cal, 15, 1, 1); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + ret = vsc85xx_sd6g_ib_cfg1_wr(phydev, 8, ib_tsdet_cal, 15, 0, 1); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + + /* 8. Wait for IB cal to complete */ + deadline = jiffies + msecs_to_jiffies(PROC_CMD_NCOMPLETED_TIMEOUT_MS); + do { + usleep_range(500, 1000); + ret = phy_update_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + val32 = vsc85xx_csr_read(phydev, MACRO_CTRL, + PHY_S6G_IB_STATUS0); + /* wait for bit 8 to set */ + } while (time_before(jiffies, deadline) && (~val32 & BIT(8))); + + if (~val32 & BIT(8)) + return -ETIMEDOUT; + + /* 9. Restore cfg values for mission mode */ + ret = vsc85xx_sd6g_ib_cfg0_wr(phydev, ib_rtrm_adj, ib_sig_det_clk_sel_mm, 0, 1); + if (ret) + return ret; + ret = vsc85xx_sd6g_ib_cfg1_wr(phydev, 8, ib_tsdet_mm, 15, 0, 1); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + + /* 10. Re-enable transmitter */ + ret = vsc85xx_sd6g_common_cfg_wr(phydev, 1, 1, 0, qrate, if_mode, 0); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + + /* 11. Disable frequency offset generation (using internal FoJi logic) */ + ret = vsc85xx_sd6g_dft_cfg2_wr(phydev, 0, 0, 0, 0, 0, 0); + if (ret) + return ret; + ret = vsc85xx_sd6g_dft_cfg0_wr(phydev, 0, 0, 0); + if (ret) + return ret; + ret = vsc85xx_sd6g_des_cfg_wr(phydev, 6, 2, 5, des_bw_ana_val, 0); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + + /* Tune/Re-lock LCPLL */ + ret = pll5g_tune(phydev); + if (ret) + return ret; + + /* 12. Configure for Final Configuration and Settings */ + /* a. Reset RCPLL */ + ret = vsc85xx_sd6g_pll_cfg_wr(phydev, 3, pll_fsm_ctrl_data, 0); + if (ret) + return ret; + ret = vsc85xx_sd6g_common_cfg_wr(phydev, 0, 1, 0, qrate, if_mode, 0); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + + /* b. 
Configure sd6g for desired operating mode */ + phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_EXTENDED_GPIO); + ret = phy_base_read(phydev, MSCC_PHY_MAC_CFG_FASTLINK); + if ((ret & MAC_CFG_MASK) == MAC_CFG_QSGMII) { + /* QSGMII */ + pll_fsm_ctrl_data = 120; + qrate = 0; + if_mode = 3; + des_bw_ana_val = 5; + val = PROC_CMD_MCB_ACCESS_MAC_CONF | PROC_CMD_RST_CONF_PORT | + PROC_CMD_READ_MOD_WRITE_PORT | PROC_CMD_QSGMII_MAC; + + ret = vsc8584_cmd(phydev, val); + if (ret) { + dev_err(&phydev->mdio.dev, "%s: QSGMII error: %d\n", + __func__, ret); + return ret; + } + + phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_STANDARD); + } else if ((ret & MAC_CFG_MASK) == MAC_CFG_SGMII) { + /* SGMII */ + pll_fsm_ctrl_data = 60; + qrate = 1; + if_mode = 1; + des_bw_ana_val = 3; + + val = PROC_CMD_MCB_ACCESS_MAC_CONF | PROC_CMD_RST_CONF_PORT | + PROC_CMD_READ_MOD_WRITE_PORT | PROC_CMD_SGMII_MAC; + + ret = vsc8584_cmd(phydev, val); + if (ret) { + dev_err(&phydev->mdio.dev, "%s: SGMII error: %d\n", + __func__, ret); + return ret; + } + + phy_base_write(phydev, MSCC_EXT_PAGE_ACCESS, MSCC_PHY_PAGE_STANDARD); + } else { + dev_err(&phydev->mdio.dev, "%s: invalid mac_if: %x\n", + __func__, ret); + } + + ret = phy_update_mcb_s6g(phydev, PHY_S6G_LCPLL_CFG, 0); + if (ret) + return ret; + ret = phy_update_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + ret = vsc85xx_pll5g_cfg0_wr(phydev, 4); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_S6G_LCPLL_CFG, 0); + if (ret) + return ret; + ret = vsc85xx_sd6g_des_cfg_wr(phydev, 6, 2, 5, des_bw_ana_val, 0); + if (ret) + return ret; + ret = vsc85xx_sd6g_ib_cfg0_wr(phydev, ib_rtrm_adj, ib_sig_det_clk_sel_mm, 0, 1); + if (ret) + return ret; + ret = vsc85xx_sd6g_ib_cfg1_wr(phydev, 8, ib_tsdet_mm, 15, 0, 1); + if (ret) + return ret; + ret = vsc85xx_sd6g_common_cfg_wr(phydev, 1, 1, 0, qrate, if_mode, 0); + if (ret) + return ret; + ret = vsc85xx_sd6g_ib_cfg2_wr(phydev, 3, 13, 5); + if (ret) + return ret; + ret = vsc85xx_sd6g_ib_cfg3_wr(phydev, 0, 31, 1, 31); + if (ret) + return ret; + ret = vsc85xx_sd6g_ib_cfg4_wr(phydev, 63, 63, 2, 63); + if (ret) + return ret; + ret = vsc85xx_sd6g_misc_cfg_wr(phydev, 1); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + + /* 13. Start rcpll_fsm */ + ret = vsc85xx_sd6g_pll_cfg_wr(phydev, 3, pll_fsm_ctrl_data, 1); + if (ret) + return ret; + ret = phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + + /* 14. 
Wait for PLL cal to complete */ + deadline = jiffies + msecs_to_jiffies(PROC_CMD_NCOMPLETED_TIMEOUT_MS); + do { + usleep_range(500, 1000); + ret = phy_update_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); + if (ret) + return ret; + val32 = vsc85xx_csr_read(phydev, MACRO_CTRL, + PHY_S6G_PLL_STATUS); + /* wait for bit 12 to clear */ + } while (time_before(jiffies, deadline) && (val32 & BIT(12))); + + if (val32 & BIT(12)) + return -ETIMEDOUT; + + /* release lane reset */ + ret = vsc85xx_sd6g_misc_cfg_wr(phydev, 0); + if (ret) + return ret; + + return phy_commit_mcb_s6g(phydev, PHY_MCB_S6G_CFG, 0); +} diff --git a/drivers/net/phy/mscc/mscc_serdes.h b/drivers/net/phy/mscc/mscc_serdes.h new file mode 100644 index 000000000000..2a6371322af9 --- /dev/null +++ b/drivers/net/phy/mscc/mscc_serdes.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/* + * Driver for Microsemi VSC85xx PHYs + * + * Copyright (c) 2021 Microsemi Corporation + */ + +#ifndef _MSCC_SERDES_PHY_H_ +#define _MSCC_SERDES_PHY_H_ + +#define PHY_S6G_PLL5G_CFG2_GAIN_MASK GENMASK(9, 5) +#define PHY_S6G_PLL5G_CFG2_ENA_GAIN 1 + +#define PHY_S6G_DES_PHY_CTRL_POS 13 +#define PHY_S6G_DES_MBTR_CTRL_POS 10 +#define PHY_S6G_DES_CPMD_SEL_POS 8 +#define PHY_S6G_DES_BW_HYST_POS 5 +#define PHY_S6G_DES_BW_ANA_POS 1 +#define PHY_S6G_DES_CFG 0x21 +#define PHY_S6G_IB_CFG0 0x22 +#define PHY_S6G_IB_CFG1 0x23 +#define PHY_S6G_IB_CFG2 0x24 +#define PHY_S6G_IB_CFG3 0x25 +#define PHY_S6G_IB_CFG4 0x26 +#define PHY_S6G_GP_CFG 0x2E +#define PHY_S6G_DFT_CFG0 0x35 +#define PHY_S6G_IB_DFT_CFG2 0x37 + +int vsc85xx_sd6g_config_v2(struct phy_device *phydev); + +#endif /* _MSCC_SERDES_PHY_H_ */ diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index fdb914b5b857..1be07e45d314 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -1145,7 +1145,7 @@ void phy_state_machine(struct work_struct *work) } /* Only re-schedule a PHY state machine change if we are polling the - * PHY, if PHY_IGNORE_INTERRUPT is set, then we will be moving + * PHY, if PHY_MAC_INTERRUPT is set, then we will be moving * between states from phy_mac_interrupt().
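* (PHY_MAC_INTERRUPT is the new name for PHY_IGNORE_INTERRUPT: it signals that the attached MAC driver handles the PHY interrupt and reports link changes via phy_mac_interrupt(), so the state machine need not poll.)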
* * In state PHY_HALTED the PHY gets suspended, so rescheduling the diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 30a20a29ae05..ce495473cd5d 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -300,50 +300,22 @@ static int mdio_bus_phy_resume(struct device *dev) phydev->suspended_by_mdio_bus = 0; - ret = phy_resume(phydev); + ret = phy_init_hw(phydev); if (ret < 0) return ret; -no_resume: - if (phydev->attached_dev && phydev->adjust_link) - phy_start_machine(phydev); - - return 0; -} - -static int mdio_bus_phy_restore(struct device *dev) -{ - struct phy_device *phydev = to_phy_device(dev); - struct net_device *netdev = phydev->attached_dev; - int ret; - - if (!netdev) - return 0; - - ret = phy_init_hw(phydev); + ret = phy_resume(phydev); if (ret < 0) return ret; - +no_resume: if (phydev->attached_dev && phydev->adjust_link) phy_start_machine(phydev); return 0; } -static const struct dev_pm_ops mdio_bus_phy_pm_ops = { - .suspend = mdio_bus_phy_suspend, - .resume = mdio_bus_phy_resume, - .freeze = mdio_bus_phy_suspend, - .thaw = mdio_bus_phy_resume, - .restore = mdio_bus_phy_restore, -}; - -#define MDIO_BUS_PHY_PM_OPS (&mdio_bus_phy_pm_ops) - -#else - -#define MDIO_BUS_PHY_PM_OPS NULL - +static SIMPLE_DEV_PM_OPS(mdio_bus_phy_pm_ops, mdio_bus_phy_suspend, + mdio_bus_phy_resume); #endif /* CONFIG_PM */ /** @@ -554,7 +526,7 @@ static const struct device_type mdio_bus_phy_type = { .name = "PHY", .groups = phy_dev_groups, .release = phy_device_release, - .pm = MDIO_BUS_PHY_PM_OPS, + .pm = pm_ptr(&mdio_bus_phy_pm_ops), }; static int phy_request_driver_module(struct phy_device *dev, u32 phy_id) @@ -1144,10 +1116,19 @@ int phy_init_hw(struct phy_device *phydev) if (ret < 0) return ret; - if (phydev->drv->config_init) + if (phydev->drv->config_init) { ret = phydev->drv->config_init(phydev); + if (ret < 0) + return ret; + } - return ret; + if (phydev->drv->config_intr) { + ret = phydev->drv->config_intr(phydev); + if (ret < 0) + return ret; + } + + return 0; } EXPORT_SYMBOL(phy_init_hw); @@ -1167,8 +1148,8 @@ char *phy_attached_info_irq(struct phy_device *phydev) case PHY_POLL: irq_str = "POLL"; break; - case PHY_IGNORE_INTERRUPT: - irq_str = "IGNORE"; + case PHY_MAC_INTERRUPT: + irq_str = "MAC"; break; default: snprintf(irq_num, sizeof(irq_num), "%d", phydev->irq); @@ -1377,6 +1358,8 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, if (phydev->sfp_bus_attached) dev->sfp_bus = phydev->sfp_bus; + else if (dev->sfp_bus) + phydev->is_on_sfp_module = true; } /* Some Ethernet drivers try to connect to a PHY device before diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 84f6e197f965..053c92e02cd8 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -306,6 +306,10 @@ static int phylink_parse_mode(struct phylink *pl, struct fwnode_handle *fwnode) phylink_set(pl->supported, 2500baseX_Full); break; + case PHY_INTERFACE_MODE_5GBASER: + phylink_set(pl->supported, 5000baseT_Full); + break; + case PHY_INTERFACE_MODE_USXGMII: case PHY_INTERFACE_MODE_10GKR: case PHY_INTERFACE_MODE_10GBASER: diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index 3cfd773ae5f4..2e11176c6b94 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -400,6 +400,9 @@ phy_interface_t sfp_select_interface(struct sfp_bus *bus, phylink_test(link_modes, 10000baseT_Full)) return PHY_INTERFACE_MODE_10GBASER; + if (phylink_test(link_modes, 5000baseT_Full)) + return 
PHY_INTERFACE_MODE_5GBASER; + if (phylink_test(link_modes, 2500baseX_Full)) return PHY_INTERFACE_MODE_2500BASEX; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 99caae7d1641..aa1a66ad2ce5 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -35,6 +35,7 @@ #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) #define VETH_XDP_TX_BULK_SIZE 16 +#define VETH_XDP_BATCH 16 struct veth_stats { u64 rx_drops; @@ -562,20 +563,13 @@ static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, return 0; } -static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, - struct xdp_frame *frame, - struct veth_xdp_tx_bq *bq, - struct veth_stats *stats) +static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, + struct xdp_frame *frame, + struct veth_xdp_tx_bq *bq, + struct veth_stats *stats) { - void *hard_start = frame->data - frame->headroom; - int len = frame->len, delta = 0; struct xdp_frame orig_frame; struct bpf_prog *xdp_prog; - unsigned int headroom; - struct sk_buff *skb; - - /* bpf_xdp_adjust_head() assures BPF cannot access xdp_frame area */ - hard_start -= sizeof(struct xdp_frame); rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); @@ -590,8 +584,8 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, switch (act) { case XDP_PASS: - delta = frame->data - xdp.data; - len = xdp.data_end - xdp.data; + if (xdp_update_frame_from_buff(&xdp, frame)) + goto err_xdp; break; case XDP_TX: orig_frame = *frame; @@ -629,19 +623,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, } rcu_read_unlock(); - headroom = sizeof(struct xdp_frame) + frame->headroom - delta; - skb = veth_build_skb(hard_start, headroom, len, frame->frame_sz); - if (!skb) { - xdp_return_frame(frame); - stats->rx_drops++; - goto err; - } - - xdp_release_frame(frame); - xdp_scrub_frame(frame); - skb->protocol = eth_type_trans(skb, rq->dev); -err: - return skb; + return frame; err_xdp: rcu_read_unlock(); xdp_return_frame(frame); @@ -649,6 +631,37 @@ xdp_xmit: return NULL; } +/* frames array contains VETH_XDP_BATCH at most */ +static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, + int n_xdpf, struct veth_xdp_tx_bq *bq, + struct veth_stats *stats) +{ + void *skbs[VETH_XDP_BATCH]; + int i; + + if (xdp_alloc_skb_bulk(skbs, n_xdpf, + GFP_ATOMIC | __GFP_ZERO) < 0) { + for (i = 0; i < n_xdpf; i++) + xdp_return_frame(frames[i]); + stats->rx_drops += n_xdpf; + + return; + } + + for (i = 0; i < n_xdpf; i++) { + struct sk_buff *skb = skbs[i]; + + skb = __xdp_build_skb_from_frame(frames[i], skb, + rq->dev); + if (!skb) { + xdp_return_frame(frames[i]); + stats->rx_drops++; + continue; + } + napi_gro_receive(&rq->xdp_napi, skb); + } +} + static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, struct veth_xdp_tx_bq *bq, @@ -796,32 +809,45 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { - int i, done = 0; + int i, done = 0, n_xdpf = 0; + void *xdpf[VETH_XDP_BATCH]; for (i = 0; i < budget; i++) { void *ptr = __ptr_ring_consume(&rq->xdp_ring); - struct sk_buff *skb; if (!ptr) break; if (veth_is_xdp_frame(ptr)) { + /* ndo_xdp_xmit */ struct xdp_frame *frame = veth_ptr_to_xdp(ptr); stats->xdp_bytes += frame->len; - skb = veth_xdp_rcv_one(rq, frame, bq, stats); + frame = veth_xdp_rcv_one(rq, frame, bq, stats); + if (frame) { + /* XDP_PASS */ + xdpf[n_xdpf++] = frame; + if (n_xdpf == VETH_XDP_BATCH) { + veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, + bq, stats); + n_xdpf = 0; + } + } } else { - skb 
= ptr; + /* ndo_start_xmit */ + struct sk_buff *skb = ptr; + stats->xdp_bytes += skb->len; skb = veth_xdp_rcv_skb(rq, skb, bq, stats); + if (skb) + napi_gro_receive(&rq->xdp_napi, skb); } - - if (skb) - napi_gro_receive(&rq->xdp_napi, skb); - done++; } + if (n_xdpf) + veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); + u64_stats_update_begin(&rq->stats.syncp); rq->stats.vs.xdp_redirect += stats->xdp_redirect; rq->stats.vs.xdp_bytes += stats->xdp_bytes; diff --git a/drivers/net/wan/lmc/lmc_main.c b/drivers/net/wan/lmc/lmc_main.c index 93c7e8502845..6c163db52835 100644 --- a/drivers/net/wan/lmc/lmc_main.c +++ b/drivers/net/wan/lmc/lmc_main.c @@ -854,7 +854,7 @@ static int lmc_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) spin_lock_init(&sc->lmc_lock); pci_set_master(pdev); - printk(KERN_INFO "%s: detected at %lx, irq %d\n", dev->name, + printk(KERN_INFO "hdlc: detected at %lx, irq %d\n", dev->base_addr, dev->irq); err = register_hdlc_device(dev); @@ -899,6 +899,8 @@ static int lmc_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) break; default: printk(KERN_WARNING "%s: LMC UNKNOWN CARD!\n", dev->name); + unregister_hdlc_device(dev); + return -EIO; break; } diff --git a/drivers/net/wireless/broadcom/b43/phy_n.c b/drivers/net/wireless/broadcom/b43/phy_n.c index b669dff24b6e..665b737fbb0d 100644 --- a/drivers/net/wireless/broadcom/b43/phy_n.c +++ b/drivers/net/wireless/broadcom/b43/phy_n.c @@ -5311,7 +5311,7 @@ static void b43_nphy_restore_cal(struct b43_wldev *dev) for (i = 0; i < 4; i++) { if (dev->phy.rev >= 3) - table[i] = coef[i]; + coef[i] = table[i]; else coef[i] = 0; } diff --git a/drivers/target/iscsi/cxgbit/cxgbit_target.c b/drivers/target/iscsi/cxgbit/cxgbit_target.c index 9b3eb2e8c92a..b926e1d6c7b8 100644 --- a/drivers/target/iscsi/cxgbit/cxgbit_target.c +++ b/drivers/target/iscsi/cxgbit/cxgbit_target.c @@ -86,8 +86,7 @@ static int cxgbit_is_ofld_imm(const struct sk_buff *skb) if (likely(cxgbit_skcb_flags(skb) & SKCBF_TX_ISO)) length += sizeof(struct cpl_tx_data_iso); -#define MAX_IMM_TX_PKT_LEN 256 - return length <= MAX_IMM_TX_PKT_LEN; + return length <= MAX_IMM_OFLD_TX_DATA_WR_LEN; } /* diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index d7493016cd46..60cd25c0461b 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -207,7 +207,7 @@ struct atm_skb_data { struct atm_vcc *vcc; /* ATM VCC */ unsigned long atm_options; /* ATM layer options */ unsigned int acct_truesize; /* truesize accounted to vcc */ -}; +} __packed; #define VCC_HTABLE_SIZE 32 diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 72e69a0e1e8c..c42e02b4d84b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -23,8 +23,8 @@ struct ctl_table_header; #ifdef CONFIG_CGROUP_BPF -extern struct static_key_false cgroup_bpf_enabled_key; -#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; +#define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) DECLARE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); @@ -125,7 +125,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, enum bpf_attach_type type, - void *t_ctx); + void *t_ctx, + u32 *flags); int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, @@ -147,6 +148,10 @@ int 
__cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int __user *optlen, int max_optlen, int retval); +int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, + int optname, void *optval, + int *optlen, int retval); + static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { @@ -185,7 +190,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ BPF_CGROUP_INET_INGRESS); \ \ @@ -195,7 +200,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled && sk && sk == skb->sk) { \ + if (cgroup_bpf_enabled(BPF_CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ if (sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ @@ -207,7 +212,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SK_PROG(sk, type) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) { \ + if (cgroup_bpf_enabled(type)) { \ __ret = __cgroup_bpf_run_filter_sk(sk, type); \ } \ __ret; \ @@ -227,33 +232,53 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ ({ \ + u32 __unused_flags; \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(type)) \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ - NULL); \ + NULL, \ + &__unused_flags); \ __ret; \ }) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ ({ \ + u32 __unused_flags; \ int __ret = 0; \ - if (cgroup_bpf_enabled) { \ + if (cgroup_bpf_enabled(type)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ - t_ctx); \ + t_ctx, \ + &__unused_flags); \ release_sock(sk); \ } \ __ret; \ }) -#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_BIND, NULL) - -#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL) +/* BPF_CGROUP_INET4_BIND and BPF_CGROUP_INET6_BIND can return extra flags + * via upper bits of return code. The only flag that is supported + * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check + * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE). 
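+ * Worked example (per the BPF_PROG_RUN_ARRAY_FLAGS logic in bpf.h, bit 0
+ * of a program's return value is the allow/deny verdict and the remaining
+ * bits are shifted down into the flags word): a bind hook returning 3
+ * allows the bind and requests the capability-check bypass, while a
+ * return of 1 allows it with the normal CAP_NET_BIND_SERVICE check.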
+ */ +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, bind_flags) \ +({ \ + u32 __flags = 0; \ + int __ret = 0; \ + if (cgroup_bpf_enabled(type)) { \ + lock_sock(sk); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + NULL, &__flags); \ + release_sock(sk); \ + if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ + *bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE; \ + } \ + __ret; \ +}) -#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ - sk->sk_prot->pre_connect) +#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ + ((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) || \ + cgroup_bpf_enabled(BPF_CGROUP_INET6_CONNECT)) && \ + (sk)->sk_prot->pre_connect) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) @@ -297,7 +322,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS)) \ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ sock_ops, \ BPF_CGROUP_SOCK_OPS); \ @@ -307,7 +332,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled && (sock_ops)->sk) { \ + if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ @@ -320,7 +345,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_DEVICE)) \ __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ access, \ BPF_CGROUP_DEVICE); \ @@ -332,7 +357,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_SYSCTL)) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ buf, count, pos, \ BPF_CGROUP_SYSCTL); \ @@ -343,7 +368,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, kernel_optval) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ @@ -354,7 +379,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ get_user(__ret, optlen); \ __ret; \ }) @@ -363,11 +388,24 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, max_optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled) \ - __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \ - optname, optval, \ - optlen, max_optlen, \ - retval); \ + if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ + !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ + tcp_bpf_bypass_getsockopt, \ + level, optname)) \ + __ret = __cgroup_bpf_run_filter_getsockopt( \ + sock, level, optname, optval, optlen, \ + max_optlen, retval); \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ + 
optlen, retval) \ +({ \ + int __ret = retval; \ + if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + __ret = __cgroup_bpf_run_filter_getsockopt_kern( \ + sock, level, optname, optval, optlen, retval); \ __ret; \ }) @@ -427,15 +465,14 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, return 0; } -#define cgroup_bpf_enabled (0) +#define cgroup_bpf_enabled(type) (0) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, flags) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) @@ -452,6 +489,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ optlen, max_optlen, retval) ({ retval; }) +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ + optlen, retval) ({ retval; }) #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ kernel_optval) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1aac2af12fed..cccaef1088ea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -14,7 +14,6 @@ #include <linux/numa.h> #include <linux/mm_types.h> #include <linux/wait.h> -#include <linux/u64_stats_sync.h> #include <linux/refcount.h> #include <linux/mutex.h> #include <linux/module.h> @@ -507,12 +506,6 @@ enum bpf_cgroup_storage_type { */ #define MAX_BPF_FUNC_ARGS 12 -struct bpf_prog_stats { - u64 cnt; - u64 nsecs; - struct u64_stats_sync syncp; -} __aligned(2 * sizeof(u64)); - struct btf_func_model { u8 ret_size; u8 nr_args; @@ -536,7 +529,7 @@ struct btf_func_model { /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 */ -#define BPF_MAX_TRAMP_PROGS 40 +#define BPF_MAX_TRAMP_PROGS 38 struct bpf_tramp_progs { struct bpf_prog *progs[BPF_MAX_TRAMP_PROGS]; @@ -568,10 +561,10 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, struct bpf_tramp_progs *tprogs, void *orig_call); /* these two functions are called from generated trampoline */ -u64 notrace __bpf_prog_enter(void); +u64 notrace __bpf_prog_enter(struct bpf_prog *prog); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); -void notrace __bpf_prog_enter_sleepable(void); -void notrace __bpf_prog_exit_sleepable(void); +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog); +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start); struct bpf_ksym { unsigned long start; @@ -845,7 +838,6 @@ struct bpf_prog_aux { u32 linfo_idx; u32 num_exentries; struct exception_table_entry *extable; - struct bpf_prog_stats __percpu *stats; union { struct work_struct work; struct rcu_head rcu; @@ -1073,6 +1065,34 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *include_prog, struct bpf_prog_array **new_array); +/* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. 
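+ * (Bit 0 of the ret_flags word assembled by BPF_PROG_RUN_ARRAY_FLAGS
+ * below, i.e. bit 1 of the raw program return value.)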
*/ +#define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE (1 << 0) +/* BPF program asks to set CN on the packet. */ +#define BPF_RET_SET_CN (1 << 0) + +#define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags) \ + ({ \ + struct bpf_prog_array_item *_item; \ + struct bpf_prog *_prog; \ + struct bpf_prog_array *_array; \ + u32 _ret = 1; \ + u32 func_ret; \ + migrate_disable(); \ + rcu_read_lock(); \ + _array = rcu_dereference(array); \ + _item = &_array->items[0]; \ + while ((_prog = READ_ONCE(_item->prog))) { \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ + func_ret = func(_prog, ctx); \ + _ret &= (func_ret & 1); \ + *(ret_flags) |= (func_ret >> 1); \ + _item++; \ + } \ + rcu_read_unlock(); \ + migrate_enable(); \ + _ret; \ + }) + #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \ ({ \ struct bpf_prog_array_item *_item; \ @@ -1120,25 +1140,11 @@ _out: \ */ #define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ ({ \ - struct bpf_prog_array_item *_item; \ - struct bpf_prog *_prog; \ - struct bpf_prog_array *_array; \ - u32 ret; \ - u32 _ret = 1; \ - u32 _cn = 0; \ - migrate_disable(); \ - rcu_read_lock(); \ - _array = rcu_dereference(array); \ - _item = &_array->items[0]; \ - while ((_prog = READ_ONCE(_item->prog))) { \ - bpf_cgroup_storage_set(_item->cgroup_storage); \ - ret = func(_prog, ctx); \ - _ret &= (ret & 1); \ - _cn |= (ret & 2); \ - _item++; \ - } \ - rcu_read_unlock(); \ - migrate_enable(); \ + u32 _flags = 0; \ + bool _cn; \ + u32 _ret; \ + _ret = BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, &_flags); \ + _cn = _flags & BPF_RET_SET_CN; \ if (_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ else \ @@ -1276,6 +1282,11 @@ static inline bool bpf_allow_ptr_leaks(void) return perfmon_capable(); } +static inline bool bpf_allow_uninit_stack(void) +{ + return perfmon_capable(); +} + static inline bool bpf_allow_ptr_to_map_access(void) { return perfmon_capable(); @@ -1874,6 +1885,7 @@ extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; +extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index dfe6f85d97dd..971b33aca13d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -195,7 +195,7 @@ struct bpf_func_state { * 0 = main function, 1 = first callee. 
*/ u32 frameno; - /* subprog number == index within subprog_stack_depth + /* subprog number == index within subprog_info * zero == main subprog */ u32 subprogno; @@ -404,6 +404,7 @@ struct bpf_verifier_env { u32 used_btf_cnt; /* number of used BTF objects */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool allow_uninit_stack; bool allow_ptr_to_map_access; bool bpf_capable; bool bypass_spec_v1; @@ -470,6 +471,8 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); int check_ctx_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno); +int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno, u32 mem_size); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index de9430d55c90..c2c2147dfeb8 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -61,19 +61,11 @@ #define PHY_BCM_OUI_5 0x03625e00 #define PHY_BCM_OUI_6 0xae025000 -#define PHY_BCM_FLAGS_MODE_COPPER 0x00000001 -#define PHY_BCM_FLAGS_MODE_1000BX 0x00000002 -#define PHY_BCM_FLAGS_INTF_SGMII 0x00000010 -#define PHY_BCM_FLAGS_INTF_XAUI 0x00000020 -#define PHY_BRCM_WIRESPEED_ENABLE 0x00000100 -#define PHY_BRCM_AUTO_PWRDWN_ENABLE 0x00000200 -#define PHY_BRCM_RX_REFCLK_UNUSED 0x00000400 -#define PHY_BRCM_STD_IBND_DISABLE 0x00000800 -#define PHY_BRCM_EXT_IBND_RX_ENABLE 0x00001000 -#define PHY_BRCM_EXT_IBND_TX_ENABLE 0x00002000 -#define PHY_BRCM_CLEAR_RGMII_MODE 0x00004000 -#define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00008000 -#define PHY_BRCM_EN_MASTER_MODE 0x00010000 +#define PHY_BRCM_AUTO_PWRDWN_ENABLE 0x00000001 +#define PHY_BRCM_RX_REFCLK_UNUSED 0x00000002 +#define PHY_BRCM_CLEAR_RGMII_MODE 0x00000004 +#define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00000008 +#define PHY_BRCM_EN_MASTER_MODE 0x00000010 /* Broadcom BCM7xxx specific workarounds */ #define PHY_BRCM_7XXX_REV(x) (((x) >> 8) & 0xff) @@ -137,6 +129,7 @@ #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC 0x07 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_WIRESPEED_EN 0x0010 +#define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_EN 0x0080 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN 0x0100 #define MII_BCM54XX_AUXCTL_MISC_FORCE_AMDIX 0x0200 #define MII_BCM54XX_AUXCTL_MISC_WREN 0x8000 @@ -198,6 +191,7 @@ #define BCM54XX_SHD_SCR3_DEF_CLK125 0x0001 #define BCM54XX_SHD_SCR3_DLLAPD_DIS 0x0002 #define BCM54XX_SHD_SCR3_TRDDAPD 0x0004 +#define BCM54XX_SHD_SCR3_RXCTXC_DIS 0x0100 /* 01010: Auto Power-Down */ #define BCM54XX_SHD_APD 0x0a @@ -223,6 +217,9 @@ /* 11111: Mode Control Register */ #define BCM54XX_SHD_MODE 0x1f #define BCM54XX_SHD_INTF_SEL_MASK GENMASK(2, 1) /* INTERF_SEL[1:0] */ +#define BCM54XX_SHD_INTF_SEL_RGMII 0x02 +#define BCM54XX_SHD_INTF_SEL_SGMII 0x04 +#define BCM54XX_SHD_INTF_SEL_GBIC 0x06 #define BCM54XX_SHD_MODE_1000BX BIT(0) /* Enable 1000-X registers */ /* @@ -258,7 +255,6 @@ #define BCM54810_EXP_BROADREACH_LRE_MISC_CTL_EN (1 << 0) #define BCM54810_SHD_CLK_CTL 0x3 #define BCM54810_SHD_CLK_CTL_GTXCLK_EN (1 << 9) -#define BCM54810_SHD_SCR3_TRDDAPD 0x0100 /* BCM54612E Registers */ #define BCM54612E_EXP_SPARE0 (MII_BCM54XX_EXP_SEL_ETC + 0x34) diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h new file mode 100644 index 000000000000..4265f328681a --- /dev/null +++ b/include/linux/dsa/ocelot.h @@ -0,0 +1,223 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright 2019-2021 NXP Semiconductors + */ 
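+/* A minimal extraction-side usage sketch (illustrative only; the helpers
+ * are defined below, and "xfh" is a hypothetical local pointing at the
+ * 16-byte extraction header of a frame received on the CPU port):
+ *
+ *	u64 src_port, qos_class, vlan_tci;
+ *
+ *	ocelot_xfh_get_src_port(xfh, &src_port);
+ *	ocelot_xfh_get_qos_class(xfh, &qos_class);
+ *	ocelot_xfh_get_vlan_tci(xfh, &vlan_tci);
+ */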
+ +#ifndef _NET_DSA_TAG_OCELOT_H +#define _NET_DSA_TAG_OCELOT_H + +#include <linux/packing.h> + +#define OCELOT_TAG_LEN 16 +#define OCELOT_SHORT_PREFIX_LEN 4 +#define OCELOT_LONG_PREFIX_LEN 16 +#define OCELOT_TOTAL_TAG_LEN (OCELOT_SHORT_PREFIX_LEN + OCELOT_TAG_LEN) + +/* The CPU injection header and the CPU extraction header can have 3 types of + * prefixes: long, short and no prefix. The format of the header itself is the + * same in all 3 cases. + * + * Extraction with long prefix: + * + * +-------------------+-------------------+------+------+------------+-------+ + * | ff:ff:ff:ff:ff:ff | fe:ff:ff:ff:ff:ff | 8880 | 000a | extraction | frame | + * | | | | | header | | + * +-------------------+-------------------+------+------+------------+-------+ + * 48 bits 48 bits 16 bits 16 bits 128 bits + * + * Extraction with short prefix: + * + * +------+------+------------+-------+ + * | 8880 | 000a | extraction | frame | + * | | | header | | + * +------+------+------------+-------+ + * 16 bits 16 bits 128 bits + * + * Extraction with no prefix: + * + * +------------+-------+ + * | extraction | frame | + * | header | | + * +------------+-------+ + * 128 bits + * + * + * Injection with long prefix: + * + * +-------------------+-------------------+------+------+------------+-------+ + * | any dmac | any smac | 8880 | 000a | injection | frame | + * | | | | | header | | + * +-------------------+-------------------+------+------+------------+-------+ + * 48 bits 48 bits 16 bits 16 bits 128 bits + * + * Injection with short prefix: + * + * +------+------+------------+-------+ + * | 8880 | 000a | injection | frame | + * | | | header | | + * +------+------+------------+-------+ + * 16 bits 16 bits 128 bits + * + * Injection with no prefix: + * + * +------------+-------+ + * | injection | frame | + * | header | | + * +------------+-------+ + * 128 bits + * + * The injection header looks like this (network byte order, bit 127 + * is part of lowest address byte in memory, bit 0 is part of highest + * address byte): + * + * +------+------+------+------+------+------+------+------+ + * 127:120 |BYPASS| MASQ | MASQ_PORT |REW_OP|REW_OP| + * +------+------+------+------+------+------+------+------+ + * 119:112 | REW_OP | + * +------+------+------+------+------+------+------+------+ + * 111:104 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 103: 96 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 95: 88 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 87: 80 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 79: 72 | RSV | + * +------+------+------+------+------+------+------+------+ + * 71: 64 | RSV | DEST | + * +------+------+------+------+------+------+------+------+ + * 63: 56 | DEST | + * +------+------+------+------+------+------+------+------+ + * 55: 48 | RSV | + * +------+------+------+------+------+------+------+------+ + * 47: 40 | RSV | SRC_PORT | RSV |TFRM_TIMER| + * +------+------+------+------+------+------+------+------+ + * 39: 32 | TFRM_TIMER | RSV | + * +------+------+------+------+------+------+------+------+ + * 31: 24 | RSV | DP | POP_CNT | CPUQ | + * +------+------+------+------+------+------+------+------+ + * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| + * +------+------+------+------+------+------+------+------+ + * 15: 8 | PCP | DEI | VID | + * +------+------+------+------+------+------+------+------+ + * 7: 0 | VID | + * +------+------+------+------+------+------+------+------+ 
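+ * For example, reading the injection layout above: DEST occupies bits
+ * 67:56, QOS_CLASS bits 19:17 and VID bits 11:0, which is exactly where
+ * ocelot_ifh_set_dest(), ocelot_ifh_set_qos_class() and
+ * ocelot_ifh_set_vid() further down pack their values.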
+ * + * And the extraction header looks like this: + * + * +------+------+------+------+------+------+------+------+ + * 127:120 | RSV | REW_OP | + * +------+------+------+------+------+------+------+------+ + * 119:112 | REW_OP | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 111:104 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 103: 96 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 95: 88 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 87: 80 | REW_VAL | LLEN | + * +------+------+------+------+------+------+------+------+ + * 79: 72 | LLEN | WLEN | + * +------+------+------+------+------+------+------+------+ + * 71: 64 | WLEN | RSV | + * +------+------+------+------+------+------+------+------+ + * 63: 56 | RSV | + * +------+------+------+------+------+------+------+------+ + * 55: 48 | RSV | + * +------+------+------+------+------+------+------+------+ + * 47: 40 | RSV | SRC_PORT | ACL_ID | + * +------+------+------+------+------+------+------+------+ + * 39: 32 | ACL_ID | RSV | SFLOW_ID | + * +------+------+------+------+------+------+------+------+ + * 31: 24 |ACL_HIT| DP | LRN_FLAGS | CPUQ | + * +------+------+------+------+------+------+------+------+ + * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| + * +------+------+------+------+------+------+------+------+ + * 15: 8 | PCP | DEI | VID | + * +------+------+------+------+------+------+------+------+ + * 7: 0 | VID | + * +------+------+------+------+------+------+------+------+ + */ + +static inline void ocelot_xfh_get_rew_val(void *extraction, u64 *rew_val) +{ + packing(extraction, rew_val, 116, 85, OCELOT_TAG_LEN, UNPACK, 0); +} + +static inline void ocelot_xfh_get_len(void *extraction, u64 *len) +{ + u64 llen, wlen; + + packing(extraction, &llen, 84, 79, OCELOT_TAG_LEN, UNPACK, 0); + packing(extraction, &wlen, 78, 71, OCELOT_TAG_LEN, UNPACK, 0); + + *len = 60 * wlen + llen - 80; +} + +static inline void ocelot_xfh_get_src_port(void *extraction, u64 *src_port) +{ + packing(extraction, src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0); +} + +static inline void ocelot_xfh_get_cpuq(void *extraction, u64 *cpuq) +{ + packing(extraction, cpuq, 28, 20, OCELOT_TAG_LEN, UNPACK, 0); +} + +static inline void ocelot_xfh_get_qos_class(void *extraction, u64 *qos_class) +{ + packing(extraction, qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0); +} + +static inline void ocelot_xfh_get_tag_type(void *extraction, u64 *tag_type) +{ + packing(extraction, tag_type, 16, 16, OCELOT_TAG_LEN, UNPACK, 0); +} + +static inline void ocelot_xfh_get_vlan_tci(void *extraction, u64 *vlan_tci) +{ + packing(extraction, vlan_tci, 15, 0, OCELOT_TAG_LEN, UNPACK, 0); +} + +static inline void ocelot_ifh_set_bypass(void *injection, u64 bypass) +{ + packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0); +} + +static inline void ocelot_ifh_set_rew_op(void *injection, u64 rew_op) +{ + packing(injection, &rew_op, 125, 117, OCELOT_TAG_LEN, PACK, 0); +} + +static inline void ocelot_ifh_set_dest(void *injection, u64 dest) +{ + packing(injection, &dest, 67, 56, OCELOT_TAG_LEN, PACK, 0); +} + +static inline void ocelot_ifh_set_qos_class(void *injection, u64 qos_class) +{ + packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0); +} + +static inline void seville_ifh_set_dest(void *injection, u64 dest) +{ + packing(injection, &dest, 67, 57, OCELOT_TAG_LEN, PACK, 0); +} + +static inline void ocelot_ifh_set_src(void *injection, u64 src) +{ + packing(injection, 
&src, 46, 43, OCELOT_TAG_LEN, PACK, 0); +} + +static inline void ocelot_ifh_set_tag_type(void *injection, u64 tag_type) +{ + packing(injection, &tag_type, 16, 16, OCELOT_TAG_LEN, PACK, 0); +} + +static inline void ocelot_ifh_set_vid(void *injection, u64 vid) +{ + packing(injection, &vid, 11, 0, OCELOT_TAG_LEN, PACK, 0); +} + +#endif diff --git a/include/linux/filter.h b/include/linux/filter.h index 7fdce5407214..3b00fc906ccd 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -22,6 +22,7 @@ #include <linux/vmalloc.h> #include <linux/sockptr.h> #include <crypto/sha1.h> +#include <linux/u64_stats_sync.h> #include <net/sch_generic.h> @@ -539,6 +540,13 @@ struct bpf_binary_header { u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; +struct bpf_prog_stats { + u64 cnt; + u64 nsecs; + u64 misses; + struct u64_stats_sync syncp; +} __aligned(2 * sizeof(u64)); + struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ @@ -557,10 +565,12 @@ struct bpf_prog { u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ u8 tag[BPF_TAG_SIZE]; - struct bpf_prog_aux *aux; /* Auxiliary fields */ - struct sock_fprog_kern *orig_prog; /* Original BPF program */ + struct bpf_prog_stats __percpu *stats; + int __percpu *active; unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn); + struct bpf_prog_aux *aux; /* Auxiliary fields */ + struct sock_fprog_kern *orig_prog; /* Original BPF program */ /* Instructions for interpreter */ struct sock_filter insns[0]; struct bpf_insn insnsi[]; @@ -581,7 +591,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); struct bpf_prog_stats *__stats; \ u64 __start = sched_clock(); \ __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ - __stats = this_cpu_ptr(prog->aux->stats); \ + __stats = this_cpu_ptr(prog->stats); \ u64_stats_update_begin(&__stats->syncp); \ __stats->cnt++; \ __stats->nsecs += sched_clock() - __start; \ @@ -1298,6 +1308,11 @@ struct bpf_sysctl_kern { u64 tmp_reg; }; +#define BPF_SOCKOPT_KERN_BUF_SIZE 32 +struct bpf_sockopt_buf { + u8 data[BPF_SOCKOPT_KERN_BUF_SIZE]; +}; + struct bpf_sockopt_kern { struct sock *sk; u8 *optval; diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index a8345c8a613d..c1c76a70a6ce 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -62,4 +62,10 @@ #define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__) #endif +#if IS_ENABLED(CONFIG_INET) +#define INDIRECT_CALL_INET_1(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_INET_1(f, f1, ...) 
f(__VA_ARGS__) +#endif + #endif diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 00057eae89ab..dc3d2508f5c6 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -582,7 +582,10 @@ struct mlx5_init_seg { __be32 internal_timer_l; __be32 rsvd3[2]; __be32 health_counter; - __be32 rsvd4[1019]; + __be32 rsvd4[11]; + __be32 real_time_h; + __be32 real_time_l; + __be32 rsvd5[1006]; __be64 ieee1588_clk; __be32 ieee1588_clk_type; __be32 clr_intx; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 936302b2c141..6ea8d67e3cb8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -143,6 +143,7 @@ enum { MLX5_REG_MPCNT = 0x9051, MLX5_REG_MTPPS = 0x9053, MLX5_REG_MTPPSE = 0x9054, + MLX5_REG_MTUTC = 0x9055, MLX5_REG_MPEGC = 0x9056, MLX5_REG_MCQS = 0x9060, MLX5_REG_MCQI = 0x9061, @@ -675,18 +676,22 @@ struct mlx5_pps { u8 enabled; }; -struct mlx5_clock { - struct mlx5_nb pps_nb; - seqlock_t lock; +struct mlx5_timer { struct cyclecounter cycles; struct timecounter tc; - struct hwtstamp_config hwtstamp_config; u32 nominal_c_mult; unsigned long overflow_period; struct delayed_work overflow_work; +}; + +struct mlx5_clock { + struct mlx5_nb pps_nb; + seqlock_t lock; + struct hwtstamp_config hwtstamp_config; struct ptp_clock *ptp; struct ptp_clock_info ptp_info; struct mlx5_pps pps_info; + struct mlx5_timer timer; }; struct mlx5_dm; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 71ae6aac3410..6f0b866fb495 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -937,11 +937,18 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 reserved_at_200[0x600]; }; +enum { + MLX5_QP_TIMESTAMP_FORMAT_CAP_FREE_RUNNING = 0x0, + MLX5_QP_TIMESTAMP_FORMAT_CAP_REAL_TIME = 0x1, + MLX5_QP_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME = 0x2, +}; + struct mlx5_ifc_roce_cap_bits { u8 roce_apm[0x1]; u8 reserved_at_1[0x3]; u8 sw_r_roce_src_udp_port[0x1]; - u8 reserved_at_5[0x1b]; + u8 reserved_at_5[0x19]; + u8 qp_ts_format[0x2]; u8 reserved_at_20[0x60]; @@ -1258,6 +1265,18 @@ enum { MLX5_STEERING_FORMAT_CONNECTX_6DX = 1, }; +enum { + MLX5_SQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING = 0x0, + MLX5_SQ_TIMESTAMP_FORMAT_CAP_REAL_TIME = 0x1, + MLX5_SQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME = 0x2, +}; + +enum { + MLX5_RQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING = 0x0, + MLX5_RQ_TIMESTAMP_FORMAT_CAP_REAL_TIME = 0x1, + MLX5_RQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME = 0x2, +}; + struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_0[0x1f]; u8 vhca_resource_manager[0x1]; @@ -1571,7 +1590,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 general_obj_types[0x40]; - u8 reserved_at_440[0x4]; + u8 sq_ts_format[0x2]; + u8 rq_ts_format[0x2]; u8 steering_format_version[0x4]; u8 create_qp_start_hint[0x18]; @@ -2875,6 +2895,12 @@ enum { MLX5_QPC_CS_RES_UP_TO_64B = 0x2, }; +enum { + MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING = 0x0, + MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT = 0x1, + MLX5_QPC_TIMESTAMP_FORMAT_REAL_TIME = 0x2, +}; + struct mlx5_ifc_qpc_bits { u8 state[0x4]; u8 lag_tx_port_affinity[0x4]; @@ -2903,7 +2929,9 @@ struct mlx5_ifc_qpc_bits { u8 log_rq_stride[0x3]; u8 no_sq[0x1]; u8 log_sq_size[0x4]; - u8 reserved_at_55[0x6]; + u8 reserved_at_55[0x3]; + u8 ts_format[0x2]; + u8 reserved_at_5a[0x1]; u8 rlky[0x1]; u8 ulp_stateless_offload_mode[0x4]; @@ -3319,6 +3347,12 @@ enum { MLX5_SQC_STATE_ERR = 0x3, }; +enum { + MLX5_SQC_TIMESTAMP_FORMAT_FREE_RUNNING = 0x0, + MLX5_SQC_TIMESTAMP_FORMAT_DEFAULT = 
0x1, + MLX5_SQC_TIMESTAMP_FORMAT_REAL_TIME = 0x2, +}; + struct mlx5_ifc_sqc_bits { u8 rlky[0x1]; u8 cd_master[0x1]; @@ -3330,7 +3364,9 @@ struct mlx5_ifc_sqc_bits { u8 reg_umr[0x1]; u8 allow_swp[0x1]; u8 hairpin[0x1]; - u8 reserved_at_f[0x11]; + u8 reserved_at_f[0xb]; + u8 ts_format[0x2]; + u8 reserved_at_1c[0x4]; u8 reserved_at_20[0x8]; u8 user_index[0x18]; @@ -3422,6 +3458,12 @@ enum { MLX5_RQC_STATE_ERR = 0x3, }; +enum { + MLX5_RQC_TIMESTAMP_FORMAT_FREE_RUNNING = 0x0, + MLX5_RQC_TIMESTAMP_FORMAT_DEFAULT = 0x1, + MLX5_RQC_TIMESTAMP_FORMAT_REAL_TIME = 0x2, +}; + struct mlx5_ifc_rqc_bits { u8 rlky[0x1]; u8 delay_drop_en[0x1]; @@ -3432,7 +3474,9 @@ struct mlx5_ifc_rqc_bits { u8 reserved_at_c[0x1]; u8 flush_in_error_en[0x1]; u8 hairpin[0x1]; - u8 reserved_at_f[0x11]; + u8 reserved_at_f[0xb]; + u8 ts_format[0x2]; + u8 reserved_at_1c[0x4]; u8 reserved_at_20[0x8]; u8 user_index[0x18]; @@ -5913,6 +5957,18 @@ struct mlx5_ifc_dealloc_modify_header_context_in_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_query_modify_header_context_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 modify_header_id[0x20]; + + u8 reserved_at_60[0xa0]; +}; + struct mlx5_ifc_query_dct_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -9103,6 +9159,28 @@ struct mlx5_ifc_mpegc_reg_bits { u8 reserved_at_60[0x100]; }; +enum { + MLX5_MTUTC_OPERATION_SET_TIME_IMMEDIATE = 0x1, + MLX5_MTUTC_OPERATION_ADJUST_TIME = 0x2, + MLX5_MTUTC_OPERATION_ADJUST_FREQ_UTC = 0x3, +}; + +struct mlx5_ifc_mtutc_reg_bits { + u8 reserved_at_0[0x1c]; + u8 operation[0x4]; + + u8 freq_adjustment[0x20]; + + u8 reserved_at_40[0x40]; + + u8 utc_sec[0x20]; + + u8 reserved_at_a0[0x2]; + u8 utc_nsec[0x1e]; + + u8 time_adjustment[0x20]; +}; + struct mlx5_ifc_pcam_enhanced_features_bits { u8 reserved_at_0[0x68]; u8 fec_50G_per_lane_in_pplm[0x1]; @@ -9161,7 +9239,9 @@ struct mlx5_ifc_pcam_reg_bits { }; struct mlx5_ifc_mcam_enhanced_features_bits { - u8 reserved_at_0[0x6e]; + u8 reserved_at_0[0x6b]; + u8 ptpcyc2realtime_modify[0x1]; + u8 reserved_at_6c[0x2]; u8 pci_status_and_power[0x1]; u8 reserved_at_6f[0x5]; u8 mark_tx_action_cnp[0x1]; @@ -9184,7 +9264,8 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 regs_95_to_87[0x9]; u8 mpegc[0x1]; - u8 regs_85_to_68[0x12]; + u8 mtutc[0x1]; + u8 regs_84_to_68[0x11]; u8 tracer_registers[0x4]; u8 regs_63_to_32[0x20]; @@ -9917,6 +9998,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_mcda_reg_bits mcda_reg; struct mlx5_ifc_mirc_reg_bits mirc_reg; struct mlx5_ifc_mfrl_reg_bits mfrl_reg; + struct mlx5_ifc_mtutc_reg_bits mtutc_reg; u8 reserved_at_0[0x60e0]; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bfadf3b82f9c..ddf4cfc12615 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3931,14 +3931,42 @@ int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); +int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb); +static __always_inline bool __is_skb_forwardable(const struct net_device *dev, + const struct sk_buff *skb, + const bool check_mtu) +{ + const u32 vlan_hdr_len = 4; /* VLAN_HLEN */ + unsigned int len; + + if (!(dev->flags & IFF_UP)) + return false; + + if (!check_mtu) + return true; + + len = dev->mtu + dev->hard_header_len + vlan_hdr_len; + if (skb->len 
<= len) + return true; + + /* if TSO is enabled, we don't care about the length as the packet + * could be forwarded without being segmented before + */ + if (skb_is_gso(skb)) + return true; + + return false; +} + static __always_inline int ____dev_forward_skb(struct net_device *dev, - struct sk_buff *skb) + struct sk_buff *skb, + const bool check_mtu) { if (skb_orphan_frags(skb, GFP_ATOMIC) || - unlikely(!is_skb_forwardable(dev, skb))) { + unlikely(!__is_skb_forwardable(dev, skb, check_mtu))) { atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index cfe8c607a628..2b05e7f7c238 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -26,9 +26,6 @@ of_phy_connect(struct net_device *dev, struct device_node *phy_np, struct phy_device * of_phy_get_and_connect(struct net_device *dev, struct device_node *np, void (*hndlr)(struct net_device *)); -struct phy_device * -of_phy_attach(struct net_device *dev, struct device_node *phy_np, - u32 flags, phy_interface_t iface); struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np); int of_phy_register_fixed_link(struct device_node *np); @@ -100,13 +97,6 @@ of_phy_get_and_connect(struct net_device *dev, struct device_node *np, return NULL; } -static inline struct phy_device *of_phy_attach(struct net_device *dev, - struct device_node *phy_np, - u32 flags, phy_interface_t iface) -{ - return NULL; -} - static inline struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np) { return NULL; diff --git a/include/linux/phy.h b/include/linux/phy.h index c130788306c8..1a12e4436b5b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -71,11 +71,11 @@ extern const int phy_10gbit_features_array[1]; /* * Set phydev->irq to PHY_POLL if interrupts are not supported, - * or not desired for this PHY. Set to PHY_IGNORE_INTERRUPT if - * the attached driver handles the interrupt + * or not desired for this PHY. Set to PHY_MAC_INTERRUPT if + * the attached MAC driver handles the interrupt */ #define PHY_POLL -1 -#define PHY_IGNORE_INTERRUPT -2 +#define PHY_MAC_INTERRUPT -2 #define PHY_IS_INTERNAL 0x00000001 #define PHY_RST_AFTER_CLK_EN 0x00000002 @@ -107,6 +107,7 @@ extern const int phy_10gbit_features_array[1]; * @PHY_INTERFACE_MODE_100BASEX: 100 BaseX * @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX * @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX + * @PHY_INTERFACE_MODE_5GBASER: 5G BaseR * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI * @PHY_INTERFACE_MODE_XAUI: 10 Gigabit Attachment Unit Interface * @PHY_INTERFACE_MODE_10GBASER: 10G BaseR @@ -139,6 +140,7 @@ typedef enum { PHY_INTERFACE_MODE_100BASEX, PHY_INTERFACE_MODE_1000BASEX, PHY_INTERFACE_MODE_2500BASEX, + PHY_INTERFACE_MODE_5GBASER, PHY_INTERFACE_MODE_RXAUI, PHY_INTERFACE_MODE_XAUI, /* 10GBASE-R, XFI, SFI - single lane 10G Serdes */ @@ -209,6 +211,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "1000base-x"; case PHY_INTERFACE_MODE_2500BASEX: return "2500base-x"; + case PHY_INTERFACE_MODE_5GBASER: + return "5gbase-r"; case PHY_INTERFACE_MODE_RXAUI: return "rxaui"; case PHY_INTERFACE_MODE_XAUI: @@ -488,6 +492,7 @@ struct macsec_ops; * @sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal. * @loopback_enabled: Set true if this PHY has been loopbacked successfully. * @downshifted_rate: Set true if link speed has been downshifted. + * @is_on_sfp_module: Set true if PHY is located on an SFP module. 
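* (set by phy_attach_direct() when the attached netdev already has an SFP bus; tested via phy_on_sfp() below)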
* @state: State of the PHY for management purposes * @dev_flags: Device-specific flags used by the PHY driver. * @irq: IRQ number of the PHY's interrupt (-1 if none) @@ -561,6 +566,7 @@ struct phy_device { unsigned sysfs_links:1; unsigned loopback_enabled:1; unsigned downshifted_rate:1; + unsigned is_on_sfp_module:1; unsigned autoneg:1; /* The most recently read link state */ @@ -1202,11 +1208,11 @@ static inline int phy_clear_bits_mmd(struct phy_device *phydev, int devad, * @phydev: the phy_device struct * * NOTE: must be kept in sync with addition/removal of PHY_POLL and - * PHY_IGNORE_INTERRUPT + * PHY_MAC_INTERRUPT */ static inline bool phy_interrupt_is_valid(struct phy_device *phydev) { - return phydev->irq != PHY_POLL && phydev->irq != PHY_IGNORE_INTERRUPT; + return phydev->irq != PHY_POLL && phydev->irq != PHY_MAC_INTERRUPT; } /** @@ -1293,6 +1299,15 @@ static inline bool phy_is_internal(struct phy_device *phydev) } /** + * phy_on_sfp - Convenience function for testing if a PHY is on an SFP module + * @phydev: the phy_device struct + */ +static inline bool phy_on_sfp(struct phy_device *phydev) +{ + return phydev->is_on_sfp_module; +} + +/** * phy_interface_mode_is_rgmii - Convenience function for testing if a * PHY interface mode is RGMII (all variants) * @mode: the &phy_interface_t enum diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index fec0c5ac1c4f..8edbbf5f2f93 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -390,7 +390,6 @@ static inline struct sk_psock *sk_psock_get(struct sock *sk) } void sk_psock_stop(struct sock *sk, struct sk_psock *psock); -void sk_psock_destroy(struct rcu_head *rcu); void sk_psock_drop(struct sock *sk, struct sk_psock *psock); static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) diff --git a/include/net/act_api.h b/include/net/act_api.h index 761c0e331915..2bf3092ae7ec 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -166,6 +166,7 @@ int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, u32 flags); +void tcf_idr_insert_many(struct tc_action *actions[]); void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind); diff --git a/include/net/dsa.h b/include/net/dsa.h index 74457aaffec7..83a933e563fe 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -48,6 +48,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_HELLCREEK_VALUE 18 #define DSA_TAG_PROTO_XRS700X_VALUE 19 #define DSA_TAG_PROTO_OCELOT_8021Q_VALUE 20 +#define DSA_TAG_PROTO_SEVILLE_VALUE 21 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -71,6 +72,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_HELLCREEK = DSA_TAG_PROTO_HELLCREEK_VALUE, DSA_TAG_PROTO_XRS700X = DSA_TAG_PROTO_XRS700X_VALUE, DSA_TAG_PROTO_OCELOT_8021Q = DSA_TAG_PROTO_OCELOT_8021Q_VALUE, + DSA_TAG_PROTO_SEVILLE = DSA_TAG_PROTO_SEVILLE_VALUE, }; struct packet_type; @@ -639,9 +641,11 @@ struct dsa_switch_ops { * VLAN support */ int (*port_vlan_filtering)(struct dsa_switch *ds, int port, - bool vlan_filtering); + bool vlan_filtering, + struct netlink_ext_ack *extack); int (*port_vlan_add)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan); + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); /* @@ -788,6 
+792,18 @@ struct dsa_switch_ops { struct net_device *hsr); int (*port_hsr_leave)(struct dsa_switch *ds, int port, struct net_device *hsr); + + /* + * MRP integration + */ + int (*port_mrp_add)(struct dsa_switch *ds, int port, + const struct switchdev_obj_mrp *mrp); + int (*port_mrp_del)(struct dsa_switch *ds, int port, + const struct switchdev_obj_mrp *mrp); + int (*port_mrp_add_ring_role)(struct dsa_switch *ds, int port, + const struct switchdev_obj_ring_role_mrp *mrp); + int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, + const struct switchdev_obj_ring_role_mrp *mrp); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ diff --git a/include/net/inet_common.h b/include/net/inet_common.h index cb2818862919..cad2a611efde 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -41,6 +41,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); #define BIND_WITH_LOCK (1 << 1) /* Called from BPF program. */ #define BIND_FROM_BPF (1 << 2) +/* Skip CAP_NET_BIND_SERVICE check. */ +#define BIND_NO_CAP_NET_BIND_SERVICE (1 << 3) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, diff --git a/include/net/sock.h b/include/net/sock.h index 855c068c6c86..636810ddcd9b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1174,6 +1174,8 @@ struct proto { int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); + bool (*bpf_bypass_getsockopt)(int level, + int optname); void (*release_cb)(struct sock *sk); diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 25d9e4570934..b7fc7d0f54e2 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -27,9 +27,7 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL, SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, -#if IS_ENABLED(CONFIG_BRIDGE_MRP) SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, -#endif }; struct switchdev_brport_flags { @@ -51,9 +49,7 @@ struct switchdev_attr { bool vlan_filtering; /* BRIDGE_VLAN_FILTERING */ u16 vlan_protocol; /* BRIDGE_VLAN_PROTOCOL */ bool mc_disabled; /* MC_DISABLED */ -#if IS_ENABLED(CONFIG_BRIDGE_MRP) u8 mrp_port_role; /* MRP_PORT_ROLE */ -#endif } u; }; @@ -62,7 +58,6 @@ enum switchdev_obj_id { SWITCHDEV_OBJ_ID_PORT_VLAN, SWITCHDEV_OBJ_ID_PORT_MDB, SWITCHDEV_OBJ_ID_HOST_MDB, -#if IS_ENABLED(CONFIG_BRIDGE_MRP) SWITCHDEV_OBJ_ID_MRP, SWITCHDEV_OBJ_ID_RING_TEST_MRP, SWITCHDEV_OBJ_ID_RING_ROLE_MRP, @@ -70,8 +65,6 @@ enum switchdev_obj_id { SWITCHDEV_OBJ_ID_IN_TEST_MRP, SWITCHDEV_OBJ_ID_IN_ROLE_MRP, SWITCHDEV_OBJ_ID_IN_STATE_MRP, - -#endif }; struct switchdev_obj { @@ -103,7 +96,6 @@ struct switchdev_obj_port_mdb { container_of((OBJ), struct switchdev_obj_port_mdb, obj) -#if IS_ENABLED(CONFIG_BRIDGE_MRP) /* SWITCHDEV_OBJ_ID_MRP */ struct switchdev_obj_mrp { struct switchdev_obj obj; @@ -135,6 +127,7 @@ struct switchdev_obj_ring_role_mrp { struct switchdev_obj obj; u8 ring_role; u32 ring_id; + u8 sw_backup; }; #define SWITCHDEV_OBJ_RING_ROLE_MRP(OBJ) \ @@ -169,6 +162,7 @@ struct switchdev_obj_in_role_mrp { u32 ring_id; u16 in_id; u8 in_role; + u8 sw_backup; }; #define SWITCHDEV_OBJ_IN_ROLE_MRP(OBJ) \ @@ -183,8 +177,6 @@ struct switchdev_obj_in_state_mrp { #define SWITCHDEV_OBJ_IN_STATE_MRP(OBJ) \ container_of((OBJ), struct switchdev_obj_in_state_mrp, obj) -#endif - typedef int switchdev_obj_dump_cb_t(struct switchdev_obj *obj); enum switchdev_notifier_type { @@ -247,7 +239,8 @@ switchdev_notifier_info_to_extack(const 
struct switchdev_notifier_info *info) void switchdev_deferred_process(void); int switchdev_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr); + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack); int switchdev_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack); @@ -295,7 +288,8 @@ static inline void switchdev_deferred_process(void) } static inline int switchdev_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } diff --git a/include/net/tcp.h b/include/net/tcp.h index 484eb2362645..963cd86d12dd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); +bool tcp_bpf_bypass_getsockopt(int level, int optname); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); void tcp_set_keepalive(struct sock *sk, int val); diff --git a/include/net/xdp.h b/include/net/xdp.h index 0cf3976ce77c..a5bc214a49d9 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -164,6 +164,12 @@ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); +struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct sk_buff *skb, + struct net_device *dev); +struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct net_device *dev); +int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); static inline void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 40792b37bb9f..425ff29d9389 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -87,9 +87,6 @@ /* Source PGIDs, one per physical port */ #define PGID_SRC 80 -#define IFH_INJ_BYPASS BIT(31) -#define IFH_INJ_POP_CNT_DISABLE (3 << 28) - #define IFH_TAG_TYPE_C 0 #define IFH_TAG_TYPE_S 1 @@ -100,10 +97,6 @@ #define IFH_REW_OP_ORIGIN_PTP 0x5 #define OCELOT_NUM_TC 8 -#define OCELOT_TAG_LEN 16 -#define OCELOT_SHORT_PREFIX_LEN 4 -#define OCELOT_LONG_PREFIX_LEN 16 -#define OCELOT_TOTAL_TAG_LEN (OCELOT_SHORT_PREFIX_LEN + OCELOT_TAG_LEN) #define OCELOT_SPEED_2500 0 #define OCELOT_SPEED_1000 1 @@ -119,6 +112,8 @@ #define REG_RESERVED_ADDR 0xffffffff #define REG_RESERVED(reg) REG(reg, REG_RESERVED_ADDR) +#define OCELOT_MRP_CPUQ 7 + enum ocelot_target { ANA = 1, QS, @@ -684,6 +679,12 @@ struct ocelot { /* Protects the PTP clock */ spinlock_t ptp_clock_lock; struct ptp_pin_desc ptp_pins[OCELOT_PTP_PINS_NUM]; + +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + u16 mrp_ring_id; + struct net_device *mrp_p_port; + struct net_device *mrp_s_port; +#endif }; struct ocelot_policer { @@ -742,6 +743,40 @@ u32 __ocelot_target_read_ix(struct ocelot *ocelot, enum ocelot_target target, void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target, u32 val, u32 reg, u32 offset); +/* Packet I/O */ +#if IS_ENABLED(CONFIG_MSCC_OCELOT_SWITCH_LIB) + +bool ocelot_can_inject(struct ocelot *ocelot, int grp); +void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, + u32 rew_op, struct sk_buff *skb); +int ocelot_xtr_poll_frame(struct ocelot *ocelot, int 
grp, struct sk_buff **skb); +void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp); + +#else + +static inline bool ocelot_can_inject(struct ocelot *ocelot, int grp) +{ + return false; +} + +static inline void ocelot_port_inject_frame(struct ocelot *ocelot, int port, + int grp, u32 rew_op, + struct sk_buff *skb) +{ +} + +static inline int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, + struct sk_buff **skb) +{ + return -EIO; +} + +static inline void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp) +{ +} + +#endif + /* Hardware initialization */ int ocelot_regfields_init(struct ocelot *ocelot, const struct reg_field *const regfields); @@ -847,4 +882,41 @@ int ocelot_sb_occ_tc_port_bind_get(struct ocelot *ocelot, int port, enum devlink_sb_pool_type pool_type, u32 *p_cur, u32 *p_max); +#if IS_ENABLED(CONFIG_BRIDGE_MRP) +int ocelot_mrp_add(struct ocelot *ocelot, int port, + const struct switchdev_obj_mrp *mrp); +int ocelot_mrp_del(struct ocelot *ocelot, int port, + const struct switchdev_obj_mrp *mrp); +int ocelot_mrp_add_ring_role(struct ocelot *ocelot, int port, + const struct switchdev_obj_ring_role_mrp *mrp); +int ocelot_mrp_del_ring_role(struct ocelot *ocelot, int port, + const struct switchdev_obj_ring_role_mrp *mrp); +#else +static inline int ocelot_mrp_add(struct ocelot *ocelot, int port, + const struct switchdev_obj_mrp *mrp) +{ + return -EOPNOTSUPP; +} + +static inline int ocelot_mrp_del(struct ocelot *ocelot, int port, + const struct switchdev_obj_mrp *mrp) +{ + return -EOPNOTSUPP; +} + +static inline int +ocelot_mrp_add_ring_role(struct ocelot *ocelot, int port, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + return -EOPNOTSUPP; +} + +static inline int +ocelot_mrp_del_ring_role(struct ocelot *ocelot, int port, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + return -EOPNOTSUPP; +} +#endif + #endif diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index cd74bffed5c6..a23be89119aa 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -55,8 +55,7 @@ /* tracepoints with more than 12 arguments will hit build error */ #define CAST_TO_U64(...) 
CONCATENATE(__CAST, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__) -#undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +#define __BPF_DECLARE_TRACE(call, proto, args) \ static notrace void \ __bpf_trace_##call(void *__data, proto) \ { \ @@ -64,6 +63,10 @@ __bpf_trace_##call(void *__data, proto) \ CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(prog, CAST_TO_U64(args)); \ } +#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ + __BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) + /* * This part is compiled out, it is only here as a build time check * to make sure that if the tracepoint handling changes, the @@ -111,6 +114,11 @@ __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size) #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) +#undef DECLARE_TRACE +#define DECLARE_TRACE(call, proto, args) \ + __BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) \ + __DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), 0) + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) #undef DEFINE_EVENT_WRITABLE diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c001766adcbc..4c24daa43bac 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1656,22 +1656,30 @@ union bpf_attr { * networking traffic statistics as it provides a global socket * identifier that can be assumed unique. * Return - * A 8-byte long non-decreasing number on success, or 0 if the - * socket field is missing inside *skb*. + * An 8-byte long unique number on success, or 0 if the socket + * field is missing inside *skb*. * * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts * *skb*, but gets socket from **struct bpf_sock_addr** context. * Return - * A 8-byte long non-decreasing number. + * An 8-byte long unique number. * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return - * A 8-byte long non-decreasing number. + * An 8-byte long unique number. + * + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * An 8-byte long unique number, or 0 if *sk* is NULL. * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return @@ -2231,6 +2239,9 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * + * If the lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU + * was exceeded and the output params->mtu_result contains the MTU. + * * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. @@ -3836,6 +3847,69 @@ union bpf_attr { * Return * A pointer to a struct socket on success or NULL if the file is * not a socket. + * + * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) + * Description + * Check the packet size in *ctx* against the MTU of a net device + * (based on *ifindex*). This helper will likely be used in combination + * with helpers that adjust/change the packet size.
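+ * + * For instance, a minimal sketch (assuming a tc cls_act program + * and no planned size change): + * + * __u32 mtu_len = 0; + * + * if (bpf_check_mtu(ctx, 0, &mtu_len, 0, 0)) + * return TC_ACT_SHOT; + * + * Any non-zero return here is either a violation or an error; the + * arguments and return codes are described in detail below.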
+ * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows checking the MTU prior to changing the + * packet ctx. Providing a *len_diff* adjustment that is larger than + * the actual packet size (resulting in a negative packet size) will + * in principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the planned + * size change, so the responsibility for catching a negative packet + * size belongs in those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed against + * the current net device. This is practical when the helper isn't + * used prior to a redirect. + * + * The Linux kernel route table can configure MTUs on a more specific + * per-route level, which is not provided by this helper. For + * route-level MTU checks use the **bpf_fib_lookup**\ () helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag only works for *ctx* **struct sk_buff**. + * If the packet context contains extra packet segment buffers + * (often known as a GSO skb), then the MTU is harder to check + * at this point, because in the transmit path it is possible + * for the skb packet to get re-segmented (depending on net + * device features). This could still be an MTU violation, so + * this flag enables performing the MTU check against segments, + * with a different violation return code to tell it apart. + * This check cannot use *len_diff*. + * + * On return the *mtu_len* pointer contains the MTU value of the net + * device. Remember that the net device's configured MTU is the L3 + * size, which is what is returned here, while XDP and TX lengths + * operate at L2. The helper takes this into account for you, but + * keep it in mind when using the MTU value in your BPF code. On + * input *mtu_len* must be a valid pointer and be initialized (to + * zero), else the verifier will reject the BPF program. + * + * Return + * * 0 on success; the MTU value is populated in the *mtu_len* + * pointer.
+ * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate the MTU + * value in the *mtu_len* pointer, as this can be needed for + * implementing PMTU handling: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4001,6 +4075,7 @@ union bpf_attr { FN(ktime_get_coarse_ns), \ FN(ima_inode_hash), \ FN(sock_from_file), \ + FN(check_mtu), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4501,6 +4576,7 @@ struct bpf_prog_info { __aligned_u64 prog_tags; __u64 run_time_ns; __u64 run_cnt; + __u64 recursion_misses; } __attribute__((aligned(8))); struct bpf_map_info { @@ -4981,9 +5057,13 @@ struct bpf_fib_lookup { __be16 sport; __be16 dport; - /* total length of packet from network header - used for MTU check */ - __u16 tot_len; + union { /* used for MTU check */ + /* input to lookup */ + __u16 tot_len; /* L3 length from network hdr (iph->tot_len) */ + /* output: MTU value */ + __u16 mtu_result; + }; /* input: L3 device index for lookup * output: device index from FIB lookup */ @@ -5029,6 +5109,17 @@ struct bpf_redir_neigh { }; }; +/* bpf_check_mtu flags */ +enum bpf_check_mtu_flags { + BPF_MTU_CHK_SEGS = (1U << 0), +}; + +enum bpf_check_mtu_ret { + BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */ + BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */ +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index c91578aaab32..e1172c1ffdfd 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -103,6 +103,8 @@ struct mptcp_info { __u64 mptcpi_write_seq; __u64 mptcpi_snd_una; __u64 mptcpi_rcv_nxt; + __u8 mptcpi_local_addr_used; + __u8 mptcpi_local_addr_max; }; /* diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index afe6836e44b1..7ea59cfe1fa7 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -593,6 +593,7 @@ enum { TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */ TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */ TCA_FLOWER_KEY_CT_FLAGS_REPLY = 1 << 5, /* Packet is in the reply direction.
*/ + __TCA_FLOWER_KEY_CT_FLAGS_MAX, }; enum { diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5454161407f1..a0d9eade9c80 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -287,7 +287,7 @@ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL); if (!tinfo) return -ENOMEM; diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 1b6b9349cb85..d99e89f113c4 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -502,13 +502,14 @@ struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) static void bpf_common_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { + u8 node_type = READ_ONCE(node->type); unsigned long flags; - if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) || - WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE)) + if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) || + WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE)) return; - if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) { + if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) { struct bpf_lru_locallist *loc_l; loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8962f988514f..2efeb5f4b343 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3540,11 +3540,6 @@ static s32 btf_datasec_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (!btf_type_vlen(t)) { - btf_verifier_log_type(env, t, "vlen == 0"); - return -EINVAL; - } - if (!t->size) { btf_verifier_log_type(env, t, "size == 0"); return -EINVAL; @@ -5296,15 +5291,16 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. */ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg) + struct bpf_reg_state *regs) { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; struct btf *btf = prog->aux->btf; const struct btf_param *args; - const struct btf_type *t; - u32 i, nargs, btf_id; + const struct btf_type *t, *ref_t; + u32 i, nargs, btf_id, type_size; const char *tname; + bool is_global; if (!prog->aux->func_info) return -EINVAL; @@ -5338,38 +5334,57 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs); goto out; } + + is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; /* check that BTF function arguments match actual types that the * verifier sees. */ for (i = 0; i < nargs; i++) { + struct bpf_reg_state *reg = ®s[i + 1]; + t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t) || btf_type_is_enum(t)) { - if (reg[i + 1].type == SCALAR_VALUE) + if (reg->type == SCALAR_VALUE) continue; bpf_log(log, "R%d is not a scalar\n", i + 1); goto out; } if (btf_type_is_ptr(t)) { - if (reg[i + 1].type == SCALAR_VALUE) { - bpf_log(log, "R%d is not a pointer\n", i + 1); - goto out; - } /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. 
*/ if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { - if (reg[i + 1].type != PTR_TO_CTX) { + if (reg->type != PTR_TO_CTX) { bpf_log(log, "arg#%d expected pointer to ctx, but got %s\n", i, btf_kind_str[BTF_INFO_KIND(t->info)]); goto out; } - if (check_ctx_reg(env, ®[i + 1], i + 1)) + if (check_ctx_reg(env, reg, i + 1)) goto out; continue; } + + if (!is_global) + goto out; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + ref_t = btf_resolve_size(btf, t, &type_size); + if (IS_ERR(ref_t)) { + bpf_log(log, + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), + PTR_ERR(ref_t)); + goto out; + } + + if (check_mem_reg(env, reg, i + 1, type_size)) + goto out; + + continue; } bpf_log(log, "Unrecognized arg#%d type %s\n", i, btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -5393,14 +5408,14 @@ out: * (either PTR_TO_CTX or SCALAR_VALUE). */ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg) + struct bpf_reg_state *regs) { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; enum bpf_prog_type prog_type = prog->type; struct btf *btf = prog->aux->btf; const struct btf_param *args; - const struct btf_type *t; + const struct btf_type *t, *ref_t; u32 i, nargs, btf_id; const char *tname; @@ -5464,16 +5479,35 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, * Only PTR_TO_CTX and SCALAR are supported atm. */ for (i = 0; i < nargs; i++) { + struct bpf_reg_state *reg = ®s[i + 1]; + t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t) || btf_type_is_enum(t)) { - reg[i + 1].type = SCALAR_VALUE; + reg->type = SCALAR_VALUE; continue; } - if (btf_type_is_ptr(t) && - btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { - reg[i + 1].type = PTR_TO_CTX; + if (btf_type_is_ptr(t)) { + if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + reg->type = PTR_TO_CTX; + continue; + } + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + ref_t = btf_resolve_size(btf, t, ®->mem_size); + if (IS_ERR(ref_t)) { + bpf_log(log, + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), + PTR_ERR(ref_t)); + return -EINVAL; + } + + reg->type = PTR_TO_MEM_OR_NULL; + reg->id = ++env->id_gen; + continue; } bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6aa9e10c6335..b567ca46555c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -19,7 +19,7 @@ #include "../cgroup/cgroup-internal.h" -DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); +DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); void cgroup_bpf_offline(struct cgroup *cgrp) @@ -128,7 +128,7 @@ static void cgroup_bpf_release(struct work_struct *work) if (pl->link) bpf_cgroup_link_auto_detach(pl->link); kfree(pl); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); } old_array = rcu_dereference_protected( cgrp->bpf.effective[type], @@ -499,7 +499,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (old_prog) bpf_prog_put(old_prog); else - static_branch_inc(&cgroup_bpf_enabled_key); + static_branch_inc(&cgroup_bpf_enabled_key[type]); bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; @@ -698,7 +698,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog 
*prog, cgrp->bpf.flags[type] = 0; if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); return 0; cleanup: @@ -1055,6 +1055,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * @uaddr: sockaddr struct provided by user * @type: The type of program to be exectuted * @t_ctx: Pointer to attach type specific context + * @flags: Pointer to u32 which contains higher bits of BPF program + * return value (OR'ed together). * * socket is expected to be of type INET or INET6. * @@ -1064,7 +1066,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, enum bpf_attach_type type, - void *t_ctx) + void *t_ctx, + u32 *flags) { struct bpf_sock_addr_kern ctx = { .sk = sk, @@ -1087,7 +1090,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx, + BPF_PROG_RUN, flags); return ret == 1 ? 0 : -EPERM; } @@ -1298,7 +1302,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, return empty; } -static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, + struct bpf_sockopt_buf *buf) { if (unlikely(max_optlen < 0)) return -EINVAL; @@ -1310,6 +1315,15 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) max_optlen = PAGE_SIZE; } + if (max_optlen <= sizeof(buf->data)) { + /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE + * bytes avoid the cost of kzalloc. + */ + ctx->optval = buf->data; + ctx->optval_end = ctx->optval + max_optlen; + return max_optlen; + } + ctx->optval = kzalloc(max_optlen, GFP_USER); if (!ctx->optval) return -ENOMEM; @@ -1319,16 +1333,26 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) return max_optlen; } -static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) { + if (ctx->optval == buf->data) + return; kfree(ctx->optval); } +static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) +{ + return ctx->optval != buf->data; +} + int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, int *optname, char __user *optval, int *optlen, char **kernel_optval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = *level, @@ -1340,8 +1364,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, * attached to the hook so we don't waste time allocating * memory and locking the socket. 
*/ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) return 0; /* Allocate a bit more than the initial user buffer for @@ -1350,7 +1373,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ max_optlen = max_t(int, 16, *optlen); - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1390,14 +1413,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ if (ctx.optlen != 0) { *optlen = ctx.optlen; - *kernel_optval = ctx.optval; + /* We've used bpf_sockopt_kern->buf as an intermediary + * storage, but the BPF program indicates that we need + * to pass this data to the kernel setsockopt handler. + * No way to export on-stack buf, have to allocate a + * new buffer. + */ + if (!sockopt_buf_allocated(&ctx, &buf)) { + void *p = kmalloc(ctx.optlen, GFP_USER); + + if (!p) { + ret = -ENOMEM; + goto out; + } + memcpy(p, ctx.optval, ctx.optlen); + *kernel_optval = p; + } else { + *kernel_optval = ctx.optval; + } /* export and don't free sockopt buf */ return 0; } } out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; } @@ -1407,6 +1447,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int retval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = level, @@ -1419,13 +1460,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) return retval; ctx.optlen = max_optlen; - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1488,9 +1528,55 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, ret = ctx.retval; out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; } + +int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, + int optname, void *optval, + int *optlen, int retval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = level, + .optname = optname, + .retval = retval, + .optlen = *optlen, + .optval = optval, + .optval_end = optval + *optlen, + }; + int ret; + + /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy + * user data back into BPF buffer when retval != 0. This is + * done as an optimization to avoid an extra copy, assuming + * the kernel won't populate the data in case of an error. + * Here we always pass the data and memset() should + * be called if that data shouldn't be "exported". + */ + + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, BPF_PROG_RUN); + if (!ret) + return -EPERM; + + if (ctx.optlen > *optlen) + return -EFAULT; + + /* BPF programs are only allowed to set retval to 0, not some + * arbitrary value. + */ + if (ctx.retval != 0 && ctx.retval != retval) + return -EFAULT; + + /* BPF programs can shrink the buffer, export the modifications.
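+ * For example (a sketch): a getsockopt program that rewrites a + * 16-byte option down to 4 bytes sets ctx.optlen = 4, and the + * in-kernel caller then sees only the 4 rewritten bytes through + * the updated *optlen.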
+ */ + if (ctx.optlen != 0) + *optlen = ctx.optlen; + + return ctx.retval; +} #endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5bbd4884ff7a..0ae015ad1e05 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -91,6 +91,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag vfree(fp); return NULL; } + fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags); + if (!fp->active) { + vfree(fp); + kfree(aux); + return NULL; + } fp->pages = size / PAGE_SIZE; fp->aux = aux; @@ -114,8 +120,9 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (!prog) return NULL; - prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); - if (!prog->aux->stats) { + prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); + if (!prog->stats) { + free_percpu(prog->active); kfree(prog->aux); vfree(prog); return NULL; @@ -124,7 +131,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) for_each_possible_cpu(cpu) { struct bpf_prog_stats *pstats; - pstats = per_cpu_ptr(prog->aux->stats, cpu); + pstats = per_cpu_ptr(prog->stats, cpu); u64_stats_init(&pstats->syncp); } return prog; @@ -238,6 +245,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, * reallocated structure. */ fp_old->aux = NULL; + fp_old->stats = NULL; + fp_old->active = NULL; __bpf_prog_free(fp_old); } @@ -249,10 +258,11 @@ void __bpf_prog_free(struct bpf_prog *fp) if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); mutex_destroy(&fp->aux->dst_mutex); - free_percpu(fp->aux->stats); kfree(fp->aux->poke_tab); kfree(fp->aux); } + free_percpu(fp->stats); + free_percpu(fp->active); vfree(fp); } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 747313698178..5d1469de6921 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -141,49 +141,6 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -static struct sk_buff *cpu_map_build_skb(struct xdp_frame *xdpf, - struct sk_buff *skb) -{ - unsigned int hard_start_headroom; - unsigned int frame_size; - void *pkt_data_start; - - /* Part of headroom was reserved to xdpf */ - hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; - - /* Memory size backing xdp_frame data already have reserved - * room for build_skb to place skb_shared_info in tailroom. 
- */ - frame_size = xdpf->frame_sz; - - pkt_data_start = xdpf->data - hard_start_headroom; - skb = build_skb_around(skb, pkt_data_start, frame_size); - if (unlikely(!skb)) - return NULL; - - skb_reserve(skb, hard_start_headroom); - __skb_put(skb, xdpf->len); - if (xdpf->metasize) - skb_metadata_set(skb, xdpf->metasize); - - /* Essential SKB info: protocol and skb->dev */ - skb->protocol = eth_type_trans(skb, xdpf->dev_rx); - - /* Optional SKB info, currently missing: - * - HW checksum info (skb->ip_summed) - * - HW RX hash (skb_set_hash) - * - RX ring dev queue index (skb_record_rx_queue) - */ - - /* Until page_pool get SKB return path, release DMA here */ - xdp_release_frame(xdpf); - - /* Allow SKB to reuse area used by xdp_frame */ - xdp_scrub_frame(xdpf); - - return skb; -} - static void __cpu_map_ring_cleanup(struct ptr_ring *ring) { /* The tear-down procedure should have made sure that queue is @@ -350,7 +307,8 @@ static int cpu_map_kthread_run(void *data) struct sk_buff *skb = skbs[i]; int ret; - skb = cpu_map_build_skb(xdpf, skb); + skb = __xdp_build_skb_from_frame(xdpf, skb, + xdpf->dev_rx); if (!skb) { xdp_return_frame(xdpf); continue; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f6e9c68afdd4..85d9d1b72a33 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -802,9 +802,7 @@ static int dev_map_notification(struct notifier_block *notifier, break; /* will be freed in free_netdev() */ - netdev->xdp_bulkq = - __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue), - sizeof(void *), GFP_ATOMIC); + netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue); if (!netdev->xdp_bulkq) return NOTIFY_BAD; diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 19ff8fed7f4b..3acc7e0b6916 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -161,7 +161,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_ATOMIC && - (insn->imm == BPF_ADD || insn->imm == BPF_ADD || + (insn->imm == BPF_ADD || insn->imm == BPF_AND || insn->imm == BPF_OR || insn->imm == BPF_XOR)) { verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n", insn->code, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c1ac7f964bc9..d63912e73ad9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1148,7 +1148,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1202,7 +1202,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 41ca280b1dc1..308427fe03a3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -720,14 +720,6 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; - case BPF_FUNC_trace_printk: - if (!perfmon_capable()) - return NULL; - return bpf_get_trace_printk_proto(); - case BPF_FUNC_snprintf_btf: - if (!perfmon_capable()) - return NULL; - return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; case BPF_FUNC_per_cpu_ptr: @@ -742,6 +734,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return 
NULL; switch (func_id) { + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_probe_read_user: @@ -752,6 +746,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: return &bpf_probe_read_kernel_str_proto; + case BPF_FUNC_snprintf_btf: + return &bpf_snprintf_btf_proto; default: return NULL; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e5999d86c76e..c859bc46d06c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1731,25 +1731,28 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) static void bpf_prog_get_stats(const struct bpf_prog *prog, struct bpf_prog_stats *stats) { - u64 nsecs = 0, cnt = 0; + u64 nsecs = 0, cnt = 0, misses = 0; int cpu; for_each_possible_cpu(cpu) { const struct bpf_prog_stats *st; unsigned int start; - u64 tnsecs, tcnt; + u64 tnsecs, tcnt, tmisses; - st = per_cpu_ptr(prog->aux->stats, cpu); + st = per_cpu_ptr(prog->stats, cpu); do { start = u64_stats_fetch_begin_irq(&st->syncp); tnsecs = st->nsecs; tcnt = st->cnt; + tmisses = st->misses; } while (u64_stats_fetch_retry_irq(&st->syncp, start)); nsecs += tnsecs; cnt += tcnt; + misses += tmisses; } stats->nsecs = nsecs; stats->cnt = cnt; + stats->misses = misses; } #ifdef CONFIG_PROC_FS @@ -1768,14 +1771,16 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) "memlock:\t%llu\n" "prog_id:\t%u\n" "run_time_ns:\t%llu\n" - "run_cnt:\t%llu\n", + "run_cnt:\t%llu\n" + "recursion_misses:\t%llu\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, prog->aux->id, stats.nsecs, - stats.cnt); + stats.cnt, + stats.misses); } #endif @@ -3438,6 +3443,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, bpf_prog_get_stats(prog, &stats); info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; + info.recursion_misses = stats.misses; if (!bpf_capable()) { info.jited_prog_len = 0; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 175b7b42bfc4..b68cb5d6d6eb 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -286,9 +286,248 @@ static const struct seq_operations task_file_seq_ops = { .show = task_file_seq_show, }; +struct bpf_iter_seq_task_vma_info { + /* The first field must be struct bpf_iter_seq_task_common. + * this is assumed by {init, fini}_seq_pidns() callback functions. + */ + struct bpf_iter_seq_task_common common; + struct task_struct *task; + struct vm_area_struct *vma; + u32 tid; + unsigned long prev_vm_start; + unsigned long prev_vm_end; +}; + +enum bpf_task_vma_iter_find_op { + task_vma_iter_first_vma, /* use mm->mmap */ + task_vma_iter_next_vma, /* use curr_vma->vm_next */ + task_vma_iter_find_vma, /* use find_vma() to find next vma */ +}; + +static struct vm_area_struct * +task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) +{ + struct pid_namespace *ns = info->common.ns; + enum bpf_task_vma_iter_find_op op; + struct vm_area_struct *curr_vma; + struct task_struct *curr_task; + u32 curr_tid = info->tid; + + /* If this function returns a non-NULL vma, it holds a reference to + * the task_struct, and holds read lock on vma->mm->mmap_lock. + * If this function returns NULL, it does not hold any reference or + * lock. + */ + if (info->task) { + curr_task = info->task; + curr_vma = info->vma; + /* In case of lock contention, drop mmap_lock to unblock + * the writer. 
+ * + * After relock, call find(mm, prev_vm_end - 1) to find + * new vma to process. + * + * +------+------+-----------+ + * | VMA1 | VMA2 | VMA3 | + * +------+------+-----------+ + * | | | | + * 4k 8k 16k 400k + * + * For example, curr_vma == VMA2. Before unlock, we set + * + * prev_vm_start = 8k + * prev_vm_end = 16k + * + * There are a few cases: + * + * 1) VMA2 is freed, but VMA3 exists. + * + * find_vma() will return VMA3, just process VMA3. + * + * 2) VMA2 still exists. + * + * find_vma() will return VMA2, process VMA2->next. + * + * 3) no more vma in this mm. + * + * Process the next task. + * + * 4) find_vma() returns a different vma, VMA2'. + * + * 4.1) If VMA2 covers same range as VMA2', skip VMA2', + * because we already covered the range; + * 4.2) VMA2 and VMA2' covers different ranges, process + * VMA2'. + */ + if (mmap_lock_is_contended(curr_task->mm)) { + info->prev_vm_start = curr_vma->vm_start; + info->prev_vm_end = curr_vma->vm_end; + op = task_vma_iter_find_vma; + mmap_read_unlock(curr_task->mm); + if (mmap_read_lock_killable(curr_task->mm)) + goto finish; + } else { + op = task_vma_iter_next_vma; + } + } else { +again: + curr_task = task_seq_get_next(ns, &curr_tid, true); + if (!curr_task) { + info->tid = curr_tid + 1; + goto finish; + } + + if (curr_tid != info->tid) { + info->tid = curr_tid; + /* new task, process the first vma */ + op = task_vma_iter_first_vma; + } else { + /* Found the same tid, which means the user space + * finished data in previous buffer and read more. + * We dropped mmap_lock before returning to user + * space, so it is necessary to use find_vma() to + * find the next vma to process. + */ + op = task_vma_iter_find_vma; + } + + if (!curr_task->mm) + goto next_task; + + if (mmap_read_lock_killable(curr_task->mm)) + goto finish; + } + + switch (op) { + case task_vma_iter_first_vma: + curr_vma = curr_task->mm->mmap; + break; + case task_vma_iter_next_vma: + curr_vma = curr_vma->vm_next; + break; + case task_vma_iter_find_vma: + /* We dropped mmap_lock so it is necessary to use find_vma + * to find the next vma. This is similar to the mechanism + * in show_smaps_rollup(). 
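+ * + * (Recall that find_vma() returns the first vma whose vm_end lies + * above the given address, so looking up prev_vm_end - 1 yields + * either the old vma itself, if it still exists, or its successor.)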
+ */ + curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1); + /* case 1) and 4.2) above just use curr_vma */ + + /* check for case 2) or case 4.1) above */ + if (curr_vma && + curr_vma->vm_start == info->prev_vm_start && + curr_vma->vm_end == info->prev_vm_end) + curr_vma = curr_vma->vm_next; + break; + } + if (!curr_vma) { + /* case 3) above, or case 2) 4.1) with vma->next == NULL */ + mmap_read_unlock(curr_task->mm); + goto next_task; + } + info->task = curr_task; + info->vma = curr_vma; + return curr_vma; + +next_task: + put_task_struct(curr_task); + info->task = NULL; + curr_tid++; + goto again; + +finish: + if (curr_task) + put_task_struct(curr_task); + info->task = NULL; + info->vma = NULL; + return NULL; +} + +static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + struct vm_area_struct *vma; + + vma = task_vma_seq_get_next(info); + if (vma && *pos == 0) + ++*pos; + + return vma; +} + +static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + + ++*pos; + return task_vma_seq_get_next(info); +} + +struct bpf_iter__task_vma { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); + __bpf_md_ptr(struct vm_area_struct *, vma); +}; + +DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta, + struct task_struct *task, struct vm_area_struct *vma) + +static int __task_vma_seq_show(struct seq_file *seq, bool in_stop) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + struct bpf_iter__task_vma ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.task = info->task; + ctx.vma = info->vma; + return bpf_iter_run_prog(prog, &ctx); +} + +static int task_vma_seq_show(struct seq_file *seq, void *v) +{ + return __task_vma_seq_show(seq, false); +} + +static void task_vma_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + + if (!v) { + (void)__task_vma_seq_show(seq, true); + } else { + /* info->vma has not been seen by the BPF program. If the + * user space reads more, task_vma_seq_get_next should + * return this vma again. Set prev_vm_start to ~0UL, + * so that we don't skip the vma returned by the next + * find_vma() (case task_vma_iter_find_vma in + * task_vma_seq_get_next()). 
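+ * (The skip in that path only fires when both vm_start and vm_end + * match the saved values, and no real vma can have + * vm_start == ~0UL, so the vma returned there is never skipped.)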
+ */ + info->prev_vm_start = ~0UL; + info->prev_vm_end = info->vma->vm_end; + mmap_read_unlock(info->task->mm); + put_task_struct(info->task); + info->task = NULL; + } +} + +static const struct seq_operations task_vma_seq_ops = { + .start = task_vma_seq_start, + .next = task_vma_seq_next, + .stop = task_vma_seq_stop, + .show = task_vma_seq_show, +}; + BTF_ID_LIST(btf_task_file_ids) BTF_ID(struct, task_struct) BTF_ID(struct, file) +BTF_ID(struct, vm_area_struct) static const struct bpf_iter_seq_info task_seq_info = { .seq_ops = &task_seq_ops, @@ -328,6 +567,26 @@ static struct bpf_iter_reg task_file_reg_info = { .seq_info = &task_file_seq_info, }; +static const struct bpf_iter_seq_info task_vma_seq_info = { + .seq_ops = &task_vma_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_vma_info), +}; + +static struct bpf_iter_reg task_vma_reg_info = { + .target = "task_vma", + .feature = BPF_ITER_RESCHED, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task_vma, task), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__task_vma, vma), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &task_vma_seq_info, +}; + static int __init task_iter_init(void) { int ret; @@ -339,6 +598,12 @@ static int __init task_iter_init(void) task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; - return bpf_iter_reg_target(&task_file_reg_info); + ret = bpf_iter_reg_target(&task_file_reg_info); + if (ret) + return ret; + + task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; + task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[2]; + return bpf_iter_reg_target(&task_vma_reg_info); } late_initcall(task_iter_init); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 35c5887d82ff..7bc3b3209224 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -381,55 +381,100 @@ out: mutex_unlock(&trampoline_mutex); } +#define NO_START_TIME 1 +static u64 notrace bpf_prog_start_time(void) +{ + u64 start = NO_START_TIME; + + if (static_branch_unlikely(&bpf_stats_enabled_key)) { + start = sched_clock(); + if (unlikely(!start)) + start = NO_START_TIME; + } + return start; +} + +static void notrace inc_misses_counter(struct bpf_prog *prog) +{ + struct bpf_prog_stats *stats; + + stats = this_cpu_ptr(prog->stats); + u64_stats_update_begin(&stats->syncp); + stats->misses++; + u64_stats_update_end(&stats->syncp); +} + /* The logic is similar to BPF_PROG_RUN, but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. The macro is split into - * call _bpf_prog_enter + * call __bpf_prog_enter * call prog->bpf_func * call __bpf_prog_exit + * + * __bpf_prog_enter returns: + * 0 - skip execution of the bpf prog + * 1 - execute bpf prog + * [2..MAX_U64] - execute bpf prog and record execution time. + * This is the start time.
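+ * + * A sketch of the call sequence a trampoline is expected to emit + * (illustrative only): + * + * start = __bpf_prog_enter(prog); + * if (start) + * prog->bpf_func(ctx, insns); + * __bpf_prog_exit(prog, start); + *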
*/ -u64 notrace __bpf_prog_enter(void) +u64 notrace __bpf_prog_enter(struct bpf_prog *prog) __acquires(RCU) { - u64 start = 0; - rcu_read_lock(); migrate_disable(); - if (static_branch_unlikely(&bpf_stats_enabled_key)) - start = sched_clock(); - return start; + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { + inc_misses_counter(prog); + return 0; + } + return bpf_prog_start_time(); } -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) - __releases(RCU) +static void notrace update_prog_stats(struct bpf_prog *prog, + u64 start) { struct bpf_prog_stats *stats; if (static_branch_unlikely(&bpf_stats_enabled_key) && - /* static_key could be enabled in __bpf_prog_enter - * and disabled in __bpf_prog_exit. + /* static_key could be enabled in __bpf_prog_enter* + * and disabled in __bpf_prog_exit*. * And vice versa. - * Hence check that 'start' is not zero. + * Hence check that 'start' is valid. */ - start) { - stats = this_cpu_ptr(prog->aux->stats); + start > NO_START_TIME) { + stats = this_cpu_ptr(prog->stats); u64_stats_update_begin(&stats->syncp); stats->cnt++; stats->nsecs += sched_clock() - start; u64_stats_update_end(&stats->syncp); } +} + +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) + __releases(RCU) +{ + update_prog_stats(prog, start); + __this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock(); } -void notrace __bpf_prog_enter_sleepable(void) +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog) { rcu_read_lock_trace(); + migrate_disable(); might_fault(); + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { + inc_misses_counter(prog); + return 0; + } + return bpf_prog_start_time(); } -void notrace __bpf_prog_exit_sleepable(void) +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start) { + update_prog_stats(prog, start); + __this_cpu_dec(*(prog->active)); + migrate_enable(); rcu_read_unlock_trace(); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1cffd4e84725..1dda9d81f12c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -228,6 +228,12 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) (poisoned ? 
BPF_MAP_KEY_POISON : 0ULL); } +static bool bpf_pseudo_call(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == BPF_PSEUDO_CALL; +} + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -1073,6 +1079,51 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, __mark_reg_known_zero(regs + regno); } +static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) +{ + switch (reg->type) { + case PTR_TO_MAP_VALUE_OR_NULL: { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { + reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; + } else { + reg->type = PTR_TO_MAP_VALUE; + } + break; + } + case PTR_TO_SOCKET_OR_NULL: + reg->type = PTR_TO_SOCKET; + break; + case PTR_TO_SOCK_COMMON_OR_NULL: + reg->type = PTR_TO_SOCK_COMMON; + break; + case PTR_TO_TCP_SOCK_OR_NULL: + reg->type = PTR_TO_TCP_SOCK; + break; + case PTR_TO_BTF_ID_OR_NULL: + reg->type = PTR_TO_BTF_ID; + break; + case PTR_TO_MEM_OR_NULL: + reg->type = PTR_TO_MEM; + break; + case PTR_TO_RDONLY_BUF_OR_NULL: + reg->type = PTR_TO_RDONLY_BUF; + break; + case PTR_TO_RDWR_BUF_OR_NULL: + reg->type = PTR_TO_RDWR_BUF; + break; + default: + WARN_ON("unknown nullable register type"); + } +} + static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) { return type_is_pkt_pointer(reg->type); @@ -1486,9 +1537,7 @@ static int check_subprogs(struct bpf_verifier_env *env) /* determine subprog starts. The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { - if (insn[i].code != (BPF_JMP | BPF_CALL)) - continue; - if (insn[i].src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn + i)) continue; if (!env->bpf_capable) { verbose(env, @@ -2271,12 +2320,14 @@ static void save_register_state(struct bpf_func_state *state, state->stack[spi].slot_type[i] = STACK_SPILL; } -/* check_stack_read/write functions track spill/fill of registers, +/* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct bpf_verifier_env *env, - struct bpf_func_state *state, /* func where register points to */ - int off, int size, int value_regno, int insn_idx) +static int check_stack_write_fixed_off(struct bpf_verifier_env *env, + /* stack frame we're writing to */ + struct bpf_func_state *state, + int off, int size, int value_regno, + int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; @@ -2402,9 +2453,175 @@ static int check_stack_write(struct bpf_verifier_env *env, return 0; } -static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_func_state *reg_state /* func where register points to */, - int off, int size, int value_regno) +/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is + * known to contain a variable offset. + * This function checks whether the write is permitted and conservatively + * tracks the effects of the write, considering that each stack slot in the + * dynamic range is potentially written to. + * + * 'off' includes 'regno->off'. + * 'value_regno' can be -1, meaning that an unknown value is being written to + * the stack. 
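+ * + * For example (a sketch): with r1 = r10 followed by r1 += r2, where + * r2 holds an unknown scalar in [-16, -8], a one-byte write through + * r1 may land anywhere in fp[-16] through fp[-8], so each of those + * slots is tracked as potentially written.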
+ * + * Spilled pointers in range are not marked as written because we don't know + * what's actually going to be written. This means that read propagation for + * future reads cannot be terminated by this write. + * + * For privileged programs, uninitialized stack slots are considered + * initialized by this write (even though we don't know exactly what offsets + * are going to be written to). The idea is that we don't want the verifier to + * reject future reads that access slots written to through variable offsets. + */ +static int check_stack_write_var_off(struct bpf_verifier_env *env, + /* func where register points to */ + struct bpf_func_state *state, + int ptr_regno, int off, int size, + int value_regno, int insn_idx) +{ + struct bpf_func_state *cur; /* state of the current function */ + int min_off, max_off; + int i, err; + struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; + bool writing_zero = false; + /* set if the fact that we're writing a zero is used to let any + * stack slots remain STACK_ZERO + */ + bool zero_used = false; + + cur = env->cur_state->frame[env->cur_state->curframe]; + ptr_reg = &cur->regs[ptr_regno]; + min_off = ptr_reg->smin_value + off; + max_off = ptr_reg->smax_value + off + size; + if (value_regno >= 0) + value_reg = &cur->regs[value_regno]; + if (value_reg && register_is_null(value_reg)) + writing_zero = true; + + err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE), + state->acquired_refs, true); + if (err) + return err; + + + /* Variable offset writes destroy any spilled pointers in range. */ + for (i = min_off; i < max_off; i++) { + u8 new_type, *stype; + int slot, spi; + + slot = -i - 1; + spi = slot / BPF_REG_SIZE; + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; + + if (!env->allow_ptr_leaks + && *stype != NOT_INIT + && *stype != SCALAR_VALUE) { + /* Reject the write if there are spilled pointers in + * range. If we didn't reject here, the ptr status + * would be erased below (even though not all slots are + * actually overwritten), possibly opening the door to + * leaks. + */ + verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d", + insn_idx, i); + return -EINVAL; + } + + /* Erase all spilled pointers. */ + state->stack[spi].spilled_ptr.type = NOT_INIT; + + /* Update the slot type. */ + new_type = STACK_MISC; + if (writing_zero && *stype == STACK_ZERO) { + new_type = STACK_ZERO; + zero_used = true; + } + /* If the slot is STACK_INVALID, we check whether it's OK to + * pretend that it will be initialized by this write. The slot + * might not actually be written to, and so if we mark it as + * initialized future reads might leak uninitialized memory. + * For privileged programs, we will accept such reads to slots + * that may or may not be written because, if we rejected + * them, the error would be too confusing. + */ + if (*stype == STACK_INVALID && !env->allow_uninit_stack) { + verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d", + insn_idx, i); + return -EINVAL; + } + *stype = new_type; + } + if (zero_used) { + /* backtracking doesn't work for STACK_ZERO yet. */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; + } + return 0; +} + +/* When register 'dst_regno' is assigned some values from stack[min_off, + * max_off), we set the register's type according to the types of the + * respective stack slots. If all the stack values are known to be zeros, then
Otherwise, the register is considered to be + * SCALAR. This function does not deal with register filling; the caller must + * ensure that all spilled registers in the stack range have been marked as + * read. + */ +static void mark_reg_stack_read(struct bpf_verifier_env *env, + /* func where src register points to */ + struct bpf_func_state *ptr_state, + int min_off, int max_off, int dst_regno) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + int i, slot, spi; + u8 *stype; + int zeros = 0; + + for (i = min_off; i < max_off; i++) { + slot = -i - 1; + spi = slot / BPF_REG_SIZE; + stype = ptr_state->stack[spi].slot_type; + if (stype[slot % BPF_REG_SIZE] != STACK_ZERO) + break; + zeros++; + } + if (zeros == max_off - min_off) { + /* any access_size read into register is zero extended, + * so the whole register == const_zero + */ + __mark_reg_const_zero(&state->regs[dst_regno]); + /* backtracking doesn't support STACK_ZERO yet, + * so mark it precise here, so that later + * backtracking can stop here. + * Backtracking may not need this if this register + * doesn't participate in pointer adjustment. + * Forward propagation of precise flag is not + * necessary either. This mark is only to stop + * backtracking. Any register that contributed + * to const 0 was marked precise before spill. + */ + state->regs[dst_regno].precise = true; + } else { + /* have read misc data from the stack */ + mark_reg_unknown(env, state->regs, dst_regno); + } + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; +} + +/* Read the stack at 'off' and put the results into the register indicated by + * 'dst_regno'. It handles reg filling if the addressed stack slot is a + * spilled reg. + * + * 'dst_regno' can be -1, meaning that the read value is not going to a + * register. + * + * The access is assumed to be within the current stack bounds. 
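+ * + * For example (a sketch): r0 = *(u64 *)(r10 - 8) is a fixed-offset + * read with off == -8 and size == 8; if those eight bytes hold a + * spilled register, the spilled state is restored into 'dst_regno'.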
+ */
+static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
+				      /* func where src register points to */
+				      struct bpf_func_state *reg_state,
+				      int off, int size, int dst_regno)
 {
 	struct bpf_verifier_state *vstate = env->cur_state;
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
@@ -2412,11 +2629,6 @@ static int check_stack_read(struct bpf_verifier_env *env,
 	struct bpf_reg_state *reg;
 	u8 *stype;
 
-	if (reg_state->allocated_stack <= slot) {
-		verbose(env, "invalid read from stack off %d+0 size %d\n",
-			off, size);
-		return -EACCES;
-	}
 	stype = reg_state->stack[spi].slot_type;
 	reg = &reg_state->stack[spi].spilled_ptr;
 
@@ -2427,9 +2639,9 @@ static int check_stack_read(struct bpf_verifier_env *env,
 			verbose(env, "invalid size of register fill\n");
 			return -EACCES;
 		}
-		if (value_regno >= 0) {
-			mark_reg_unknown(env, state->regs, value_regno);
-			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+		if (dst_regno >= 0) {
+			mark_reg_unknown(env, state->regs, dst_regno);
+			state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
 		}
 		mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
 		return 0;
@@ -2441,16 +2653,16 @@ static int check_stack_read(struct bpf_verifier_env *env,
 		}
 	}
 
-	if (value_regno >= 0) {
+	if (dst_regno >= 0) {
 		/* restore register state from stack */
-		state->regs[value_regno] = *reg;
+		state->regs[dst_regno] = *reg;
 		/* mark reg as written since spilled pointer state likely
 		 * has its liveness marks cleared by is_state_visited()
 		 * which resets stack/reg liveness for state transitions
 		 */
-		state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+		state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
 	} else if (__is_pointer_value(env->allow_ptr_leaks, reg)) {
-		/* If value_regno==-1, the caller is asking us whether
+		/* If dst_regno==-1, the caller is asking us whether
 		 * it is acceptable to use this value as a SCALAR_VALUE
 		 * (e.g. for XADD).
 		 * We must not allow unprivileged callers to do that
@@ -2462,70 +2674,167 @@ static int check_stack_read(struct bpf_verifier_env *env,
 		}
 		mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
 	} else {
-		int zeros = 0;
+		u8 type;
 
 		for (i = 0; i < size; i++) {
-			if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC)
+			type = stype[(slot - i) % BPF_REG_SIZE];
+			if (type == STACK_MISC)
 				continue;
-			if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) {
-				zeros++;
+			if (type == STACK_ZERO)
 				continue;
-			}
 			verbose(env, "invalid read from stack off %d+%d size %d\n",
 				off, i, size);
 			return -EACCES;
 		}
 		mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
-		if (value_regno >= 0) {
-			if (zeros == size) {
-				/* any size read into register is zero extended,
-				 * so the whole register == const_zero
-				 */
-				__mark_reg_const_zero(&state->regs[value_regno]);
-				/* backtracking doesn't support STACK_ZERO yet,
-				 * so mark it precise here, so that later
-				 * backtracking can stop here.
-				 * Backtracking may not need this if this register
-				 * doesn't participate in pointer adjustment.
-				 * Forward propagation of precise flag is not
-				 * necessary either. This mark is only to stop
-				 * backtracking. Any register that contributed
-				 * to const 0 was marked precise before spill.
- */ - state->regs[value_regno].precise = true; - } else { - /* have read misc data from the stack */ - mark_reg_unknown(env, state->regs, value_regno); - } - state->regs[value_regno].live |= REG_LIVE_WRITTEN; - } + if (dst_regno >= 0) + mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); } return 0; } -static int check_stack_access(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int off, int size) +enum stack_access_src { + ACCESS_DIRECT = 1, /* the access is performed by an instruction */ + ACCESS_HELPER = 2, /* the access is performed by a helper */ +}; + +static int check_stack_range_initialized(struct bpf_verifier_env *env, + int regno, int off, int access_size, + bool zero_size_allowed, + enum stack_access_src type, + struct bpf_call_arg_meta *meta); + +static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) +{ + return cur_regs(env) + regno; +} + +/* Read the stack at 'ptr_regno + off' and put the result into the register + * 'dst_regno'. + * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'), + * but not its variable offset. + * 'size' is assumed to be <= reg size and the access is assumed to be aligned. + * + * As opposed to check_stack_read_fixed_off, this function doesn't deal with + * filling registers (i.e. reads of spilled register cannot be detected when + * the offset is not fixed). We conservatively mark 'dst_regno' as containing + * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable + * offset; for a fixed offset check_stack_read_fixed_off should be used + * instead. + */ +static int check_stack_read_var_off(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, int dst_regno) { - /* Stack accesses must be at a fixed offset, so that we - * can determine what type of data were returned. See - * check_stack_read(). + /* The state of the source register. */ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *ptr_state = func(env, reg); + int err; + int min_off, max_off; + + /* Note that we pass a NULL meta, so raw access will not be permitted. */ - if (!tnum_is_const(reg->var_off)) { + err = check_stack_range_initialized(env, ptr_regno, off, size, + false, ACCESS_DIRECT, NULL); + if (err) + return err; + + min_off = reg->smin_value + off; + max_off = reg->smax_value + off; + mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); + return 0; +} + +/* check_stack_read dispatches to check_stack_read_fixed_off or + * check_stack_read_var_off. + * + * The caller must ensure that the offset falls within the allocated stack + * bounds. + * + * 'dst_regno' is a register which will receive the value from the stack. It + * can be -1, meaning that the read value is not going to a register. + */ +static int check_stack_read(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, + int dst_regno) +{ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *state = func(env, reg); + int err; + /* Some accesses are only permitted with a static offset. */ + bool var_off = !tnum_is_const(reg->var_off); + + /* The offset is required to be static when reads don't go to a + * register, in order to not leak pointers (see + * check_stack_read_fixed_off). 
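A sketch of the tnum_is_const() dispatch just described; the two-field value/mask representation mirrors the kernel's struct tnum, everything else is illustrative:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Tracked number: 'mask' bits are unknown, 'value' holds the known bits
     * (the same idea as the kernel's struct tnum).
     */
    struct tnum { uint64_t value; uint64_t mask; };

    static bool tnum_is_const(struct tnum a)
    {
    	return a.mask == 0;	/* no unknown bits: the offset is fixed */
    }

    int main(void)
    {
    	struct tnum fixed  = { 8, 0 };	/* exactly 8 */
    	struct tnum ranged = { 0, 24 };	/* bits 3-4 unknown: 0, 8, 16 or 24 */

    	printf("fixed : %s\n", tnum_is_const(fixed) ?
    	       "check_stack_read_fixed_off" : "check_stack_read_var_off");
    	printf("ranged: %s\n", tnum_is_const(ranged) ?
    	       "check_stack_read_fixed_off" : "check_stack_read_var_off");
    	return 0;
    }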
+ */
+	if (dst_regno < 0 && var_off) {
 		char tn_buf[48];
 
 		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-		verbose(env, "variable stack access var_off=%s off=%d size=%d\n",
+		verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n",
			tn_buf, off, size);
 		return -EACCES;
 	}
+	/* Variable offset is prohibited for unprivileged mode for simplicity
+	 * since it requires corresponding support in Spectre masking for stack
+	 * ALU. See also retrieve_ptr_limit().
+	 */
+	if (!env->bypass_spec_v1 && var_off) {
+		char tn_buf[48];
 
-	if (off >= 0 || off < -MAX_BPF_STACK) {
-		verbose(env, "invalid stack off=%d size=%d\n", off, size);
+		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+		verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
+			ptr_regno, tn_buf);
 		return -EACCES;
 	}
-	return 0;
+	if (!var_off) {
+		off += reg->var_off.value;
+		err = check_stack_read_fixed_off(env, state, off, size,
+						 dst_regno);
+	} else {
+		/* Variable offset stack reads need more conservative handling
+		 * than fixed offset ones. Note that dst_regno >= 0 on this
+		 * branch.
+		 */
+		err = check_stack_read_var_off(env, ptr_regno, off, size,
+					       dst_regno);
+	}
+	return err;
+}
+
+/* check_stack_write dispatches to check_stack_write_fixed_off or
+ * check_stack_write_var_off.
+ *
+ * 'ptr_regno' is the register used as a pointer into the stack.
+ * 'off' includes 'ptr_regno->off', but not its variable offset (if any).
+ * 'value_regno' is the register whose value we're writing to the stack. It can
+ * be -1, meaning that we're not writing from a register.
+ *
+ * The caller must ensure that the offset falls within the maximum stack size.
+ */
+static int check_stack_write(struct bpf_verifier_env *env,
+			     int ptr_regno, int off, int size,
+			     int value_regno, int insn_idx)
+{
+	struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+	struct bpf_func_state *state = func(env, reg);
+	int err;
+
+	if (tnum_is_const(reg->var_off)) {
+		off += reg->var_off.value;
+		err = check_stack_write_fixed_off(env, state, off, size,
+						  value_regno, insn_idx);
+	} else {
+		/* Variable offset stack writes need more conservative handling
+		 * than fixed offset ones.
+		 */
+		err = check_stack_write_var_off(env, state,
+						ptr_regno, off, size,
+						value_regno, insn_idx);
+	}
+	return err;
 }
 
 static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
@@ -2858,11 +3167,6 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
 	return -EACCES;
 }
 
-static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
-{
-	return cur_regs(env) + regno;
-}
-
 static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 {
 	return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno));
@@ -2981,8 +3285,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 		break;
 	case PTR_TO_STACK:
 		pointer_desc = "stack ";
-		/* The stack spill tracking logic in check_stack_write()
-		 * and check_stack_read() relies on stack accesses being
+		/* The stack spill tracking logic in check_stack_write_fixed_off()
+		 * and check_stack_read_fixed_off() relies on stack accesses being
 		 * aligned.
*/ strict = true; @@ -3074,9 +3378,7 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - if (insn[i].code != (BPF_JMP | BPF_CALL)) - continue; - if (insn[i].src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn + i)) continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; @@ -3400,6 +3702,91 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return 0; } +/* Check that the stack access at the given offset is within bounds. The + * maximum valid offset is -1. + * + * The minimum valid offset is -MAX_BPF_STACK for writes, and + * -state->allocated_stack for reads. + */ +static int check_stack_slot_within_bounds(int off, + struct bpf_func_state *state, + enum bpf_access_type t) +{ + int min_valid_off; + + if (t == BPF_WRITE) + min_valid_off = -MAX_BPF_STACK; + else + min_valid_off = -state->allocated_stack; + + if (off < min_valid_off || off > -1) + return -EACCES; + return 0; +} + +/* Check that the stack access at 'regno + off' falls within the maximum stack + * bounds. + * + * 'off' includes `regno->offset`, but not its dynamic part (if any). + */ +static int check_stack_access_within_bounds( + struct bpf_verifier_env *env, + int regno, int off, int access_size, + enum stack_access_src src, enum bpf_access_type type) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; + struct bpf_func_state *state = func(env, reg); + int min_off, max_off; + int err; + char *err_extra; + + if (src == ACCESS_HELPER) + /* We don't know if helpers are reading or writing (or both). */ + err_extra = " indirect access to"; + else if (type == BPF_READ) + err_extra = " read from"; + else + err_extra = " write to"; + + if (tnum_is_const(reg->var_off)) { + min_off = reg->var_off.value + off; + if (access_size > 0) + max_off = min_off + access_size - 1; + else + max_off = min_off; + } else { + if (reg->smax_value >= BPF_MAX_VAR_OFF || + reg->smin_value <= -BPF_MAX_VAR_OFF) { + verbose(env, "invalid unbounded variable-offset%s stack R%d\n", + err_extra, regno); + return -EACCES; + } + min_off = reg->smin_value + off; + if (access_size > 0) + max_off = reg->smax_value + off + access_size - 1; + else + max_off = min_off; + } + + err = check_stack_slot_within_bounds(min_off, state, type); + if (!err) + err = check_stack_slot_within_bounds(max_off, state, type); + + if (err) { + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid%s stack R%d off=%d size=%d\n", + err_extra, regno, off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n", + err_extra, regno, tn_buf, access_size); + } + } + return err; +} /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory @@ -3515,8 +3902,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } } else if (reg->type == PTR_TO_STACK) { - off += reg->var_off.value; - err = check_stack_access(env, reg, off, size); + /* Basic bounds checks. 
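A sketch of the bounds rule check_stack_slot_within_bounds() encodes above, with an assumed 64-byte allocated stack for the example (MAX_BPF_STACK is the verifier's limit; the other names are stand-ins):

    #include <stdio.h>

    #define MAX_BPF_STACK 512	/* the verifier's per-program stack limit */

    enum access_type { ACC_READ, ACC_WRITE };

    /* Writes may touch anything in [-MAX_BPF_STACK, -1]; reads only what was
     * already allocated, i.e. [-allocated_stack, -1]. Returns 0 when in bounds.
     */
    static int slot_within_bounds(int off, int allocated_stack, enum access_type t)
    {
    	int min_valid_off = t == ACC_WRITE ? -MAX_BPF_STACK : -allocated_stack;

    	return (off < min_valid_off || off > -1) ? -1 : 0;
    }

    int main(void)
    {
    	int allocated = 64;	/* example: 64 bytes allocated so far */

    	printf("write @ -128: %d\n", slot_within_bounds(-128, allocated, ACC_WRITE));
    	printf("read  @ -128: %d\n", slot_within_bounds(-128, allocated, ACC_READ));
    	printf("read  @    0: %d\n", slot_within_bounds(0, allocated, ACC_READ));
    	return 0;
    }

Note the asymmetry: a write below the allocated region is fine (it grows the stack), a read there is not.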
*/ + err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t); if (err) return err; @@ -3525,12 +3912,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; - if (t == BPF_WRITE) - err = check_stack_write(env, state, off, size, - value_regno, insn_idx); - else - err = check_stack_read(env, state, off, size, + if (t == BPF_READ) + err = check_stack_read(env, regno, off, size, value_regno); + else + err = check_stack_write(env, regno, off, size, + value_regno, insn_idx); } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose(env, "cannot write into packet\n"); @@ -3665,9 +4052,26 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return -EACCES; } + if (insn->imm & BPF_FETCH) { + if (insn->imm == BPF_CMPXCHG) + load_reg = BPF_REG_0; + else + load_reg = insn->src_reg; + + /* check and record load of old value */ + err = check_reg_arg(env, load_reg, DST_OP); + if (err) + return err; + } else { + /* This instruction accesses a memory location but doesn't + * actually load it into a register. + */ + load_reg = -1; + } + /* check whether we can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, -1, true); + BPF_SIZE(insn->code), BPF_READ, load_reg, true); if (err) return err; @@ -3677,65 +4081,56 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (err) return err; - if (!(insn->imm & BPF_FETCH)) - return 0; - - if (insn->imm == BPF_CMPXCHG) - load_reg = BPF_REG_0; - else - load_reg = insn->src_reg; - - /* check and record load of old value */ - err = check_reg_arg(env, load_reg, DST_OP); - if (err) - return err; - return 0; } -static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, - int off, int access_size, - bool zero_size_allowed) +/* When register 'regno' is used to read the stack (either directly or through + * a helper function) make sure that it's within stack boundary and, depending + * on the access type, that all elements of the stack are initialized. + * + * 'off' includes 'regno->off', but not its dynamic part (if any). + * + * All registers that have been spilled on the stack in the slots within the + * read offsets are marked as read. + */ +static int check_stack_range_initialized( + struct bpf_verifier_env *env, int regno, int off, + int access_size, bool zero_size_allowed, + enum stack_access_src type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); + struct bpf_func_state *state = func(env, reg); + int err, min_off, max_off, i, j, slot, spi; + char *err_extra = type == ACCESS_HELPER ? " indirect" : ""; + enum bpf_access_type bounds_check_type; + /* Some accesses can write anything into the stack, others are + * read-only. 
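A sketch of the load_reg selection check_atomic() now performs above; the opcode constants mirror the BPF UAPI, while the helper and its -1 convention are illustrative:

    #include <stdio.h>

    /* Opcode constants mirrored from include/uapi/linux/bpf.h. */
    #define BPF_ADD		0x00
    #define BPF_FETCH	0x01
    #define BPF_XCHG	(0xe0 | BPF_FETCH)
    #define BPF_CMPXCHG	(0xf0 | BPF_FETCH)
    #define BPF_REG_0	0

    /* Only fetching atomics define a register: BPF_CMPXCHG puts the old value
     * in R0, the other BPF_FETCH variants put it back in src_reg; non-fetch
     * atomics load nothing, signalled here by -1.
     */
    static int atomic_load_reg(int imm, int src_reg)
    {
    	if (!(imm & BPF_FETCH))
    		return -1;
    	return imm == BPF_CMPXCHG ? BPF_REG_0 : src_reg;
    }

    int main(void)
    {
    	printf("ADD         -> %d\n", atomic_load_reg(BPF_ADD, 2));
    	printf("ADD | FETCH -> %d\n", atomic_load_reg(BPF_ADD | BPF_FETCH, 2));
    	printf("XCHG        -> %d\n", atomic_load_reg(BPF_XCHG, 2));
    	printf("CMPXCHG     -> %d\n", atomic_load_reg(BPF_CMPXCHG, 2));
    	return 0;
    }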
+ */ + bool clobber = false; - if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size < 0 || (access_size == 0 && !zero_size_allowed)) { - if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid stack type R%d off=%d access_size=%d\n", - regno, off, access_size); - } else { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n", - regno, tn_buf, access_size); - } + if (access_size == 0 && !zero_size_allowed) { + verbose(env, "invalid zero-sized read\n"); return -EACCES; } - return 0; -} -/* when register 'regno' is passed into function that will read 'access_size' - * bytes from that pointer, make sure that it's within stack boundary - * and all elements of stack are initialized. - * Unlike most pointer bounds-checking functions, this one doesn't take an - * 'off' argument, so it has to add in reg->off itself. - */ -static int check_stack_boundary(struct bpf_verifier_env *env, int regno, - int access_size, bool zero_size_allowed, - struct bpf_call_arg_meta *meta) -{ - struct bpf_reg_state *reg = reg_state(env, regno); - struct bpf_func_state *state = func(env, reg); - int err, min_off, max_off, i, j, slot, spi; + if (type == ACCESS_HELPER) { + /* The bounds checks for writes are more permissive than for + * reads. However, if raw_mode is not set, we'll do extra + * checks below. + */ + bounds_check_type = BPF_WRITE; + clobber = true; + } else { + bounds_check_type = BPF_READ; + } + err = check_stack_access_within_bounds(env, regno, off, access_size, + type, bounds_check_type); + if (err) + return err; + if (tnum_is_const(reg->var_off)) { - min_off = max_off = reg->var_off.value + reg->off; - err = __check_stack_boundary(env, regno, min_off, access_size, - zero_size_allowed); - if (err) - return err; + min_off = max_off = reg->var_off.value + off; } else { /* Variable offset is prohibited for unprivileged mode for * simplicity since it requires corresponding support in @@ -3746,8 +4141,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n", - regno, tn_buf); + verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n", + regno, err_extra, tn_buf); return -EACCES; } /* Only initialized buffer on stack is allowed to be accessed @@ -3759,28 +4154,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (meta && meta->raw_mode) meta = NULL; - if (reg->smax_value >= BPF_MAX_VAR_OFF || - reg->smax_value <= -BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded indirect variable offset stack access\n", - regno); - return -EACCES; - } - min_off = reg->smin_value + reg->off; - max_off = reg->smax_value + reg->off; - err = __check_stack_boundary(env, regno, min_off, access_size, - zero_size_allowed); - if (err) { - verbose(env, "R%d min value is outside of stack bound\n", - regno); - return err; - } - err = __check_stack_boundary(env, regno, max_off, access_size, - zero_size_allowed); - if (err) { - verbose(env, "R%d max value is outside of stack bound\n", - regno); - return err; - } + min_off = reg->smin_value + off; + max_off = reg->smax_value + off; } if (meta && meta->raw_mode) { @@ -3800,8 +4175,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (*stype == STACK_MISC) goto mark; if (*stype == STACK_ZERO) { - /* helper can write 
anything into the stack */ - *stype = STACK_MISC; + if (clobber) { + /* helper can write anything into the stack */ + *stype = STACK_MISC; + } goto mark; } @@ -3812,22 +4189,24 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (state->stack[spi].slot_type[0] == STACK_SPILL && (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || env->allow_ptr_leaks)) { - __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); - for (j = 0; j < BPF_REG_SIZE; j++) - state->stack[spi].slot_type[j] = STACK_MISC; + if (clobber) { + __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + state->stack[spi].slot_type[j] = STACK_MISC; + } goto mark; } err: if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - min_off, i - min_off, access_size); + verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n", + err_extra, regno, min_off, i - min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n", - tn_buf, i - min_off, access_size); + verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n", + err_extra, regno, tn_buf, i - min_off, access_size); } return -EACCES; mark: @@ -3876,8 +4255,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, "rdwr", &env->prog->aux->max_rdwr_access); case PTR_TO_STACK: - return check_stack_boundary(env, regno, access_size, - zero_size_allowed, meta); + return check_stack_range_initialized( + env, + regno, reg->off, access_size, + zero_size_allowed, ACCESS_HELPER, meta); default: /* scalar_value or invalid ptr */ /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && @@ -3891,6 +4272,29 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno, u32 mem_size) +{ + if (register_is_null(reg)) + return 0; + + if (reg_type_may_be_null(reg->type)) { + /* Assuming that the register contains a value check if the memory + * access is safe. Temporarily save and restore the register's state as + * the conversion shouldn't be visible to a caller. + */ + const struct bpf_reg_state saved_reg = *reg; + int rv; + + mark_ptr_not_null_reg(reg); + rv = check_helper_mem_access(env, regno, mem_size, true, NULL); + *reg = saved_reg; + return rv; + } + + return check_helper_mem_access(env, regno, mem_size, true, NULL); +} + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL * Two bpf_map_lookups (even with the same key) will have different reg->id. @@ -4875,8 +5279,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, subprog); clear_caller_saved_regs(env, caller->regs); - /* All global functions return SCALAR_VALUE */ + /* All global functions return a 64-bit SCALAR_VALUE */ mark_reg_unknown(env, caller->regs, BPF_REG_0); + caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; /* continue with next insn after call */ return 0; @@ -5541,6 +5946,41 @@ do_sim: return !ret ? -EFAULT : 0; } +/* check that stack access falls within stack limits and that 'reg' doesn't + * have a variable offset. + * + * Variable offset is prohibited for unprivileged mode for simplicity since it + * requires corresponding support in Spectre masking for stack ALU. See also + * retrieve_ptr_limit(). 
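A sketch of the save/mutate/restore pattern check_mem_reg() uses above; the types and the inner check are stand-ins, the point being that the temporary non-NULL conversion never escapes to the caller:

    #include <stdio.h>

    enum reg_type { PTR_TO_MEM_OR_NULL, PTR_TO_MEM };

    struct reg { enum reg_type type; int id; };

    static int check_assuming_not_null(const struct reg *r)
    {
    	return r->type == PTR_TO_MEM ? 0 : -1;	/* stand-in for the real check */
    }

    /* Temporarily treat a maybe-NULL register as non-NULL for one check; the
     * snapshot guarantees the conversion is invisible to the caller.
     */
    static int check_mem(struct reg *r)
    {
    	const struct reg saved = *r;
    	int rv;

    	r->type = PTR_TO_MEM;
    	rv = check_assuming_not_null(r);
    	*r = saved;
    	return rv;
    }

    int main(void)
    {
    	struct reg r = { PTR_TO_MEM_OR_NULL, 1 };

    	printf("check=%d, type after=%d (unchanged)\n", check_mem(&r), r.type);
    	return 0;
    }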
+ * + * + * 'off' includes 'reg->off'. + */ +static int check_stack_access_for_ptr_arithmetic( + struct bpf_verifier_env *env, + int regno, + const struct bpf_reg_state *reg, + int off) +{ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n", + regno, tn_buf, off); + return -EACCES; + } + + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose(env, "R%d stack pointer arithmetic goes out of range, " + "prohibited for !root; off=%d\n", regno, off); + return -EACCES; + } + + return 0; +} + + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -5784,10 +6224,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, "prohibited for !root\n", dst); return -EACCES; } else if (dst_reg->type == PTR_TO_STACK && - check_stack_access(env, dst_reg, dst_reg->off + - dst_reg->var_off.value, 1)) { - verbose(env, "R%d stack pointer arithmetic goes out of range, " - "prohibited for !root\n", dst); + check_stack_access_for_ptr_arithmetic( + env, dst, dst_reg, dst_reg->off + + dst_reg->var_off.value)) { return -EACCES; } } @@ -6266,7 +6705,7 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, * 3) the signed bounds cross zero, so they tell us nothing * about the result * If the value in dst_reg is known nonnegative, then again the - * unsigned bounts capture the signed bounds. + * unsigned bounds capture the signed bounds. * Thus, in all cases it suffices to blow away our signed bounds * and rely on inferring new ones from the unsigned bounds and * var_off of the result. @@ -6297,7 +6736,7 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, * 3) the signed bounds cross zero, so they tell us nothing * about the result * If the value in dst_reg is known nonnegative, then again the - * unsigned bounts capture the signed bounds. + * unsigned bounds capture the signed bounds. * Thus, in all cases it suffices to blow away our signed bounds * and rely on inferring new ones from the unsigned bounds and * var_off of the result. 
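A sketch of the two rejection rules check_stack_access_for_ptr_arithmetic() applies above (MAX_BPF_STACK as in the verifier; the predicate's shape is illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_BPF_STACK 512

    /* Unprivileged stack-pointer arithmetic must yield a constant offset that
     * stays inside [-MAX_BPF_STACK, -1].
     */
    static bool ptr_arith_ok(bool off_is_const, int off)
    {
    	if (!off_is_const)
    		return false;			/* variable offset: rejected */
    	return off < 0 && off >= -MAX_BPF_STACK;
    }

    int main(void)
    {
    	printf("const  -8: %s\n", ptr_arith_ok(true, -8) ? "ok" : "rejected");
    	printf("const  +4: %s\n", ptr_arith_ok(true, 4) ? "ok" : "rejected");
    	printf("var    -8: %s\n", ptr_arith_ok(false, -8) ? "ok" : "rejected");
    	return 0;
    }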
@@ -7367,43 +7806,19 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } if (is_null) { reg->type = SCALAR_VALUE; - } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - const struct bpf_map *map = reg->map_ptr; - - if (map->inner_map_meta) { - reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = map->inner_map_meta; - } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { - reg->type = PTR_TO_XDP_SOCK; - } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || - map->map_type == BPF_MAP_TYPE_SOCKHASH) { - reg->type = PTR_TO_SOCKET; - } else { - reg->type = PTR_TO_MAP_VALUE; - } - } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { - reg->type = PTR_TO_SOCKET; - } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { - reg->type = PTR_TO_SOCK_COMMON; - } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { - reg->type = PTR_TO_TCP_SOCK; - } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { - reg->type = PTR_TO_BTF_ID; - } else if (reg->type == PTR_TO_MEM_OR_NULL) { - reg->type = PTR_TO_MEM; - } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) { - reg->type = PTR_TO_RDONLY_BUF; - } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) { - reg->type = PTR_TO_RDWR_BUF; - } - if (is_null) { /* We don't need id and ref_obj_id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect. */ reg->id = 0; reg->ref_obj_id = 0; - } else if (!reg_may_point_to_spin_lock(reg)) { + + return; + } + + mark_ptr_not_null_reg(reg); + + if (!reg_may_point_to_spin_lock(reg)) { /* For not-NULL ptr, reg->ref_obj_id will be reset * in release_reg_references(). * @@ -7986,6 +8401,9 @@ static int check_return_code(struct bpf_verifier_env *env) env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); + if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || + env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) + range = tnum_range(0, 3); break; case BPF_PROG_TYPE_CGROUP_SKB: if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { @@ -10015,15 +10433,22 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_LRU_HASH: case BPF_MAP_TYPE_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: if (!is_preallocated_map(map)) { verbose(env, - "Sleepable programs can only use preallocated hash maps\n"); + "Sleepable programs can only use preallocated maps\n"); return -EINVAL; } break; + case BPF_MAP_TYPE_RINGBUF: + break; default: verbose(env, - "Sleepable programs can only use array and hash maps\n"); + "Sleepable programs can only use array, hash, and ringbuf maps\n"); return -EINVAL; } @@ -10581,6 +11006,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, for (i = 0; i < len; i++) { int adj_idx = i + delta; struct bpf_insn insn; + u8 load_reg; insn = insns[adj_idx]; if (!aux[adj_idx].zext_dst) { @@ -10623,9 +11049,27 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, if (!bpf_jit_needs_zext()) continue; + /* zext_dst means that we want to zero-extend whatever register + * the insn defines, which is dst_reg most of the time, with + * the notable exception of BPF_STX + BPF_ATOMIC + BPF_FETCH. 
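A sketch of the zero-extension target choice the comment above describes: for fetching atomics the defined register is R0 (BPF_CMPXCHG) or src_reg, never dst_reg. Opcode constants mirror the BPF UAPI; the helper is illustrative:

    #include <stdbool.h>
    #include <stdio.h>

    #define BPF_FETCH	0x01
    #define BPF_ADD		0x00
    #define BPF_CMPXCHG	(0xf0 | BPF_FETCH)
    #define BPF_REG_0	0

    /* For a fetching atomic, the register that needs the zero-extension is
     * the one receiving the old value, not dst_reg (the memory pointer).
     */
    static int zext_target(bool is_atomic_stx, int imm, int src_reg, int dst_reg)
    {
    	if (is_atomic_stx)
    		return imm == BPF_CMPXCHG ? BPF_REG_0 : src_reg;
    	return dst_reg;
    }

    int main(void)
    {
    	/* fetch-add: the old value lands in src_reg, so that's what gets patched */
    	printf("atomic fetch-add: zext R%d\n",
    	       zext_target(true, BPF_ADD | BPF_FETCH, 3, 1));
    	/* cmpxchg returns through R0 */
    	printf("atomic cmpxchg:   zext R%d\n",
    	       zext_target(true, BPF_CMPXCHG, 3, 1));
    	/* any other 32-bit def: dst_reg as before */
    	printf("plain alu32:      zext R%d\n",
    	       zext_target(false, 0, 3, 1));
    	return 0;
    }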
+ */ + if (BPF_CLASS(insn.code) == BPF_STX && + BPF_MODE(insn.code) == BPF_ATOMIC) { + /* BPF_STX + BPF_ATOMIC insns without BPF_FETCH do not + * define any registers, therefore zext_dst cannot be + * set. + */ + if (WARN_ON(!(insn.imm & BPF_FETCH))) + return -EINVAL; + load_reg = insn.imm == BPF_CMPXCHG ? BPF_REG_0 + : insn.src_reg; + } else { + load_reg = insn.dst_reg; + } + zext_patch[0] = insn; - zext_patch[1].dst_reg = insn.dst_reg; - zext_patch[1].src_reg = insn.dst_reg; + zext_patch[1].dst_reg = load_reg; + zext_patch[1].src_reg = load_reg; patch = zext_patch; patch_len = 2; apply_patch_buffer: @@ -10841,8 +11285,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; /* Upon error here we cannot fall back to interpreter but * need a hard reject of the program. Thus -EFAULT is @@ -10883,7 +11326,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* BPF_PROG_RUN doesn't call subprogs directly, * hence main prog stats include the runtime of subprogs. * subprogs don't have IDs and not reachable via prog_get_next_id - * func[i]->aux->stats will never be accessed and stays NULL + * func[i]->stats will never be accessed and stays NULL */ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); if (!func[i]) @@ -10971,8 +11414,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; subprog = insn->off; insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) - @@ -11017,8 +11459,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); @@ -11047,8 +11488,7 @@ out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; insn->off = 0; insn->imm = env->insn_aux_data[i].call_imm; @@ -11083,8 +11523,7 @@ static int fixup_call_args(struct bpf_verifier_env *env) return -EINVAL; } for (i = 0; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; depth = get_callee_stack_depth(env, insn, i); if (depth < 0) @@ -11124,7 +11563,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) bool isdiv = BPF_OP(insn->code) == BPF_DIV; struct bpf_insn *patchlet; struct bpf_insn chk_and_div[] = { - /* Rx div 0 -> 0 */ + /* [R,W]x div 0 -> 0 */ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | BPF_JNE | BPF_K, insn->src_reg, 0, 2, 0), @@ -11133,16 +11572,18 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) *insn, }; struct bpf_insn chk_and_mod[] = { - /* Rx mod 0 -> Rx */ + /* [R,W]x mod 0 -> [R,W]x */ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | BPF_JEQ | BPF_K, insn->src_reg, - 0, 1, 0), + 0, 1 + (is64 ? 
0 : 1), 0), *insn, + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), }; patchlet = isdiv ? chk_and_div : chk_and_mod; cnt = isdiv ? ARRAY_SIZE(chk_and_div) : - ARRAY_SIZE(chk_and_mod); + ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0); new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog) @@ -11547,6 +11988,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_known_zero(env, regs, i); else if (regs[i].type == SCALAR_VALUE) mark_reg_unknown(env, regs, i); + else if (regs[i].type == PTR_TO_MEM_OR_NULL) { + const u32 mem_size = regs[i].mem_size; + + mark_reg_known_zero(env, regs, i); + regs[i].mem_size = mem_size; + regs[i].id = ++env->id_gen; + } } } else { /* 1st arg to a function */ @@ -12125,6 +12573,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->strict_alignment = false; env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->allow_uninit_stack = bpf_allow_uninit_stack(); env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 764400260eb6..b0c45d923f0f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1188,6 +1188,10 @@ BTF_SET_END(btf_allowlist_d_path) static bool bpf_d_path_allowed(const struct bpf_prog *prog) { + if (prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_ITER) + return true; + if (prog->type == BPF_PROG_TYPE_LSM) return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); @@ -1757,6 +1761,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_delete_tracing_proto; case BPF_FUNC_sock_from_file: return &bpf_sock_from_file_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_ptr_cookie_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/lib/parman.c b/lib/parman.c index c6e42a8db824..a11f2f667639 100644 --- a/lib/parman.c +++ b/lib/parman.c @@ -85,7 +85,6 @@ static int parman_shrink(struct parman *parman) } static bool parman_prio_used(struct parman_prio *prio) - { return !list_empty(&prio->item_list); } diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 49ec9e8d8aed..4dc4dcbecd12 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -345,7 +345,7 @@ static int __bpf_fill_ja(struct bpf_test *self, unsigned int len, static int bpf_fill_maxinsns11(struct bpf_test *self) { - /* Hits 70 passes on x86_64, so cannot get JITed there. */ + /* Hits 70 passes on x86_64 and triggers NOPs padding. 
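A sketch of the runtime semantics the chk_and_div/chk_and_mod patchlets earlier in this hunk enforce, under the usual eBPF rules (div by zero yields 0, mod by zero keeps the dividend, and 32-bit mod still truncates its destination, which is what the added BPF_MOV32_REG provides):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t bpf_div64(uint64_t x, uint64_t y) { return y ? x / y : 0; }
    static uint64_t bpf_mod64(uint64_t x, uint64_t y) { return y ? x % y : x; }

    static uint32_t bpf_mod32(uint64_t x, uint32_t y)
    {
    	uint32_t lo = (uint32_t)x;	/* truncated in both branches */

    	return y ? lo % y : lo;
    }

    int main(void)
    {
    	printf("10 / 0         = %llu\n", (unsigned long long)bpf_div64(10, 0));
    	printf("10 %% 0         = %llu\n", (unsigned long long)bpf_mod64(10, 0));
    	printf("(2^32 + 5) %% 0 = %u (32-bit)\n", bpf_mod32(4294967301ULL, 0));
    	return 0;
    }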
*/ return __bpf_fill_ja(self, BPF_MAXINSNS, 68); } @@ -5318,15 +5318,10 @@ static struct bpf_test tests[] = { { "BPF_MAXINSNS: Jump, gap, jump, ...", { }, -#if defined(CONFIG_BPF_JIT_ALWAYS_ON) && defined(CONFIG_X86) - CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL, -#else CLASSIC | FLAG_NO_DATA, -#endif { }, { { 0, 0xababcbac } }, .fill_helper = bpf_fill_maxinsns11, - .expected_errcode = -ENOTSUPP, }, { "BPF_MAXINSNS: jump over MSH", diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index ca1a0d07a087..ebda397fa95a 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1577,8 +1577,8 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) struct sk_buff *skb; struct net_device *dev; struct ddpehdr *ddp; - int size; - struct atalk_route *rt; + int size, hard_header_len; + struct atalk_route *rt, *rt_lo = NULL; int err; if (flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) @@ -1641,7 +1641,22 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) SOCK_DEBUG(sk, "SK %p: Size needed %d, device %s\n", sk, size, dev->name); - size += dev->hard_header_len; + hard_header_len = dev->hard_header_len; + /* Leave room for loopback hardware header if necessary */ + if (usat->sat_addr.s_node == ATADDR_BCAST && + (dev->flags & IFF_LOOPBACK || !(rt->flags & RTF_GATEWAY))) { + struct atalk_addr at_lo; + + at_lo.s_node = 0; + at_lo.s_net = 0; + + rt_lo = atrtr_find(&at_lo); + + if (rt_lo && rt_lo->dev->hard_header_len > hard_header_len) + hard_header_len = rt_lo->dev->hard_header_len; + } + + size += hard_header_len; release_sock(sk); skb = sock_alloc_send_skb(sk, size, (flags & MSG_DONTWAIT), &err); lock_sock(sk); @@ -1649,7 +1664,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) goto out; skb_reserve(skb, ddp_dl->header_length); - skb_reserve(skb, dev->hard_header_len); + skb_reserve(skb, hard_header_len); skb->dev = dev; SOCK_DEBUG(sk, "SK %p: Begin build.\n", sk); @@ -1700,18 +1715,12 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) /* loop back */ skb_orphan(skb); if (ddp->deh_dnode == ATADDR_BCAST) { - struct atalk_addr at_lo; - - at_lo.s_node = 0; - at_lo.s_net = 0; - - rt = atrtr_find(&at_lo); - if (!rt) { + if (!rt_lo) { kfree_skb(skb); err = -ENETUNREACH; goto out; } - dev = rt->dev; + dev = rt_lo->dev; skb->dev = dev; } ddp_dl->request(ddp_dl, skb, dev->dev_addr); diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 01c67ed727a9..12487f6fe9b4 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -639,7 +639,7 @@ int br_mrp_set_ring_role(struct net_bridge *br, struct br_mrp_ring_role *role) { struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id); - int err; + enum br_mrp_hw_support support; if (!mrp) return -EINVAL; @@ -647,9 +647,9 @@ int br_mrp_set_ring_role(struct net_bridge *br, mrp->ring_role = role->ring_role; /* If there is an error just bailed out */ - err = br_mrp_switchdev_set_ring_role(br, mrp, role->ring_role); - if (err && err != -EOPNOTSUPP) - return err; + support = br_mrp_switchdev_set_ring_role(br, mrp, role->ring_role); + if (support == BR_MRP_NONE) + return -EOPNOTSUPP; /* Now detect if the HW actually applied the role or not. If the HW * applied the role it means that the SW will not to do those operations @@ -657,7 +657,7 @@ int br_mrp_set_ring_role(struct net_bridge *br, * SW when ring is open, but if the is not pushed to the HW the SW will * need to detect when the ring is open */ - mrp->ring_role_offloaded = err == -EOPNOTSUPP ? 
0 : 1; + mrp->ring_role_offloaded = support == BR_MRP_SW ? 0 : 1; return 0; } @@ -670,6 +670,7 @@ int br_mrp_start_test(struct net_bridge *br, struct br_mrp_start_test *test) { struct br_mrp *mrp = br_mrp_find_id(br, test->ring_id); + enum br_mrp_hw_support support; if (!mrp) return -EINVAL; @@ -677,9 +678,13 @@ int br_mrp_start_test(struct net_bridge *br, /* Try to push it to the HW and if it fails then continue with SW * implementation and if that also fails then return error. */ - if (!br_mrp_switchdev_send_ring_test(br, mrp, test->interval, - test->max_miss, test->period, - test->monitor)) + support = br_mrp_switchdev_send_ring_test(br, mrp, test->interval, + test->max_miss, test->period, + test->monitor); + if (support == BR_MRP_NONE) + return -EOPNOTSUPP; + + if (support == BR_MRP_HW) return 0; mrp->test_interval = test->interval; @@ -721,8 +726,8 @@ int br_mrp_set_in_state(struct net_bridge *br, struct br_mrp_in_state *state) int br_mrp_set_in_role(struct net_bridge *br, struct br_mrp_in_role *role) { struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id); + enum br_mrp_hw_support support; struct net_bridge_port *p; - int err; if (!mrp) return -EINVAL; @@ -780,10 +785,10 @@ int br_mrp_set_in_role(struct net_bridge *br, struct br_mrp_in_role *role) mrp->in_id = role->in_id; /* If there is an error just bailed out */ - err = br_mrp_switchdev_set_in_role(br, mrp, role->in_id, - role->ring_id, role->in_role); - if (err && err != -EOPNOTSUPP) - return err; + support = br_mrp_switchdev_set_in_role(br, mrp, role->in_id, + role->ring_id, role->in_role); + if (support == BR_MRP_NONE) + return -EOPNOTSUPP; /* Now detect if the HW actually applied the role or not. If the HW * applied the role it means that the SW will not to do those operations @@ -791,7 +796,7 @@ int br_mrp_set_in_role(struct net_bridge *br, struct br_mrp_in_role *role) * SW when interconnect ring is open, but if the is not pushed to the HW * the SW will need to detect when the interconnect ring is open. */ - mrp->in_role_offloaded = err == -EOPNOTSUPP ? 0 : 1; + mrp->in_role_offloaded = support == BR_MRP_SW ? 0 : 1; return 0; } @@ -804,6 +809,7 @@ int br_mrp_start_in_test(struct net_bridge *br, struct br_mrp_start_in_test *in_test) { struct br_mrp *mrp = br_mrp_find_in_id(br, in_test->in_id); + enum br_mrp_hw_support support; if (!mrp) return -EINVAL; @@ -814,8 +820,13 @@ int br_mrp_start_in_test(struct net_bridge *br, /* Try to push it to the HW and if it fails then continue with SW * implementation and if that also fails then return error. 
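A sketch of how a switchdev return code maps onto the tri-state used throughout these br_mrp changes; the names here are stand-ins, and the classification mirrors what br_mrp_switchdev_port_obj() below does:

    #include <errno.h>
    #include <stdio.h>

    enum mrp_hw_support { MRP_NONE, MRP_SW, MRP_HW };

    /* 0 from switchdev means the HW runs the protocol itself, -EOPNOTSUPP
     * means fall back to the SW implementation, anything else is fatal.
     */
    static enum mrp_hw_support classify(int err)
    {
    	if (!err)
    		return MRP_HW;
    	if (err != -EOPNOTSUPP)
    		return MRP_NONE;
    	return MRP_SW;
    }

    int main(void)
    {
    	printf("0           -> %d (HW)\n", classify(0));
    	printf("-EOPNOTSUPP -> %d (SW)\n", classify(-EOPNOTSUPP));
    	printf("-EIO        -> %d (NONE)\n", classify(-EIO));
    	return 0;
    }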
*/ - if (!br_mrp_switchdev_send_in_test(br, mrp, in_test->interval, - in_test->max_miss, in_test->period)) + support = br_mrp_switchdev_send_in_test(br, mrp, in_test->interval, + in_test->max_miss, + in_test->period); + if (support == BR_MRP_NONE) + return -EOPNOTSUPP; + + if (support == BR_MRP_HW) return 0; mrp->in_test_interval = in_test->interval; diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c index 75a7e8d0a268..cb54b324fa8c 100644 --- a/net/bridge/br_mrp_switchdev.c +++ b/net/bridge/br_mrp_switchdev.c @@ -4,6 +4,30 @@ #include "br_private_mrp.h" +static enum br_mrp_hw_support +br_mrp_switchdev_port_obj(struct net_bridge *br, + const struct switchdev_obj *obj, bool add) +{ + int err; + + if (add) + err = switchdev_port_obj_add(br->dev, obj, NULL); + else + err = switchdev_port_obj_del(br->dev, obj); + + /* In case of success just return and notify the SW that doesn't need + * to do anything + */ + if (!err) + return BR_MRP_HW; + + if (err != -EOPNOTSUPP) + return BR_MRP_NONE; + + /* Continue with SW backup */ + return BR_MRP_SW; +} + int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp) { struct switchdev_obj_mrp mrp_obj = { @@ -14,14 +38,11 @@ int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp) .ring_id = mrp->ring_id, .prio = mrp->prio, }; - int err; - err = switchdev_port_obj_add(br->dev, &mrp_obj.obj, NULL); + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return 0; - if (err && err != -EOPNOTSUPP) - return err; - - return 0; + return switchdev_port_obj_add(br->dev, &mrp_obj.obj, NULL); } int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp) @@ -33,40 +54,54 @@ int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp) .s_port = NULL, .ring_id = mrp->ring_id, }; - int err; - - err = switchdev_port_obj_del(br->dev, &mrp_obj.obj); - if (err && err != -EOPNOTSUPP) - return err; + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return 0; - return 0; + return switchdev_port_obj_del(br->dev, &mrp_obj.obj); } -int br_mrp_switchdev_set_ring_role(struct net_bridge *br, - struct br_mrp *mrp, - enum br_mrp_ring_role_type role) +enum br_mrp_hw_support +br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_ring_role_type role) { struct switchdev_obj_ring_role_mrp mrp_role = { .obj.orig_dev = br->dev, .obj.id = SWITCHDEV_OBJ_ID_RING_ROLE_MRP, .ring_role = role, .ring_id = mrp->ring_id, + .sw_backup = false, }; + enum br_mrp_hw_support support; int err; - if (role == BR_MRP_RING_ROLE_DISABLED) - err = switchdev_port_obj_del(br->dev, &mrp_role.obj); - else + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return BR_MRP_SW; + + support = br_mrp_switchdev_port_obj(br, &mrp_role.obj, + role != BR_MRP_RING_ROLE_DISABLED); + if (support != BR_MRP_SW) + return support; + + /* If the driver can't configure to run completely the protocol in HW, + * then try again to configure the HW so the SW can run the protocol. 
+ */ + mrp_role.sw_backup = true; + if (role != BR_MRP_RING_ROLE_DISABLED) err = switchdev_port_obj_add(br->dev, &mrp_role.obj, NULL); + else + err = switchdev_port_obj_del(br->dev, &mrp_role.obj); - return err; + if (!err) + return BR_MRP_SW; + + return BR_MRP_NONE; } -int br_mrp_switchdev_send_ring_test(struct net_bridge *br, - struct br_mrp *mrp, u32 interval, - u8 max_miss, u32 period, - bool monitor) +enum br_mrp_hw_support +br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period, + bool monitor) { struct switchdev_obj_ring_test_mrp test = { .obj.orig_dev = br->dev, @@ -77,14 +112,11 @@ int br_mrp_switchdev_send_ring_test(struct net_bridge *br, .period = period, .monitor = monitor, }; - int err; - if (interval == 0) - err = switchdev_port_obj_del(br->dev, &test.obj); - else - err = switchdev_port_obj_add(br->dev, &test.obj, NULL); + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return BR_MRP_SW; - return err; + return br_mrp_switchdev_port_obj(br, &test.obj, interval != 0); } int br_mrp_switchdev_set_ring_state(struct net_bridge *br, @@ -97,19 +129,17 @@ int br_mrp_switchdev_set_ring_state(struct net_bridge *br, .ring_state = state, .ring_id = mrp->ring_id, }; - int err; - - err = switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL); - if (err && err != -EOPNOTSUPP) - return err; + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return 0; - return 0; + return switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL); } -int br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp, - u16 in_id, u32 ring_id, - enum br_mrp_in_role_type role) +enum br_mrp_hw_support +br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp, + u16 in_id, u32 ring_id, + enum br_mrp_in_role_type role) { struct switchdev_obj_in_role_mrp mrp_role = { .obj.orig_dev = br->dev, @@ -118,15 +148,32 @@ int br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp, .in_id = mrp->in_id, .ring_id = mrp->ring_id, .i_port = rtnl_dereference(mrp->i_port)->dev, + .sw_backup = false, }; + enum br_mrp_hw_support support; int err; - if (role == BR_MRP_IN_ROLE_DISABLED) - err = switchdev_port_obj_del(br->dev, &mrp_role.obj); - else + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return BR_MRP_SW; + + support = br_mrp_switchdev_port_obj(br, &mrp_role.obj, + role != BR_MRP_IN_ROLE_DISABLED); + if (support != BR_MRP_NONE) + return support; + + /* If the driver can't configure to run completely the protocol in HW, + * then try again to configure the HW so the SW can run the protocol. 
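The sw_backup handling above is a two-phase offer; a sketch with a stand-in try_offload() in place of the switchdev_port_obj_add/del pair:

    #include <errno.h>
    #include <stdio.h>

    enum mrp_hw_support { MRP_NONE, MRP_SW, MRP_HW };

    /* Pretend the driver can only assist: it rejects full offload. */
    static int try_offload(int sw_backup)
    {
    	return sw_backup ? 0 : -EOPNOTSUPP;
    }

    /* Phase 1: offer the role with sw_backup=0 (HW runs everything).
     * Phase 2: if the HW can only assist, offer again with sw_backup=1 so it
     * at least redirects MRP frames while the SW runs the state machine.
     */
    static enum mrp_hw_support set_role(void)
    {
    	int err = try_offload(0);

    	if (!err)
    		return MRP_HW;
    	if (err != -EOPNOTSUPP)
    		return MRP_NONE;
    	return try_offload(1) ? MRP_NONE : MRP_SW;
    }

    int main(void)
    {
    	printf("result: %d (0=NONE 1=SW 2=HW)\n", set_role());
    	return 0;
    }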
+ */ + mrp_role.sw_backup = true; + if (role != BR_MRP_IN_ROLE_DISABLED) err = switchdev_port_obj_add(br->dev, &mrp_role.obj, NULL); + else + err = switchdev_port_obj_del(br->dev, &mrp_role.obj); + + if (!err) + return BR_MRP_SW; - return err; + return BR_MRP_NONE; } int br_mrp_switchdev_set_in_state(struct net_bridge *br, struct br_mrp *mrp, @@ -138,18 +185,16 @@ int br_mrp_switchdev_set_in_state(struct net_bridge *br, struct br_mrp *mrp, .in_state = state, .in_id = mrp->in_id, }; - int err; - - err = switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL); - if (err && err != -EOPNOTSUPP) - return err; + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return 0; - return 0; + return switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL); } -int br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, - u32 interval, u8 max_miss, u32 period) +enum br_mrp_hw_support +br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period) { struct switchdev_obj_in_test_mrp test = { .obj.orig_dev = br->dev, @@ -159,14 +204,11 @@ int br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, .in_id = mrp->in_id, .period = period, }; - int err; - if (interval == 0) - err = switchdev_port_obj_del(br->dev, &test.obj); - else - err = switchdev_port_obj_add(br->dev, &test.obj, NULL); + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return BR_MRP_SW; - return err; + return br_mrp_switchdev_port_obj(br, &test.obj, interval != 0); } int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state) @@ -176,14 +218,11 @@ int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state) .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE, .u.stp_state = state, }; - int err; - err = switchdev_port_attr_set(p->dev, &attr); - if (err && err != -EOPNOTSUPP) - br_warn(p->br, "error setting offload MRP state on port %u(%s)\n", - (unsigned int)p->port_no, p->dev->name); + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return 0; - return err; + return switchdev_port_attr_set(p->dev, &attr, NULL); } int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, @@ -194,11 +233,9 @@ int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, .id = SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, .u.mrp_port_role = role, }; - int err; - err = switchdev_port_attr_set(p->dev, &attr); - if (err && err != -EOPNOTSUPP) - return err; + if (!IS_ENABLED(CONFIG_NET_SWITCHDEV)) + return 0; - return 0; + return switchdev_port_attr_set(p->dev, &attr, NULL); } diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index bf10ef5bbcd9..9d265447d654 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1381,7 +1381,7 @@ static void br_mc_router_state_change(struct net_bridge *p, .u.mrouter = is_mc_router, }; - switchdev_port_attr_set(p->dev, &attr); + switchdev_port_attr_set(p->dev, &attr, NULL); } static void br_multicast_local_router_expired(struct timer_list *t) @@ -1602,7 +1602,7 @@ static void br_mc_disabled_update(struct net_device *dev, bool value) .u.mc_disabled = !value, }; - switchdev_port_attr_set(dev, &attr); + switchdev_port_attr_set(dev, &attr, NULL); } int br_multicast_add_port(struct net_bridge_port *port) @@ -2645,7 +2645,7 @@ static void br_port_mc_router_state_change(struct net_bridge_port *p, .u.mrouter = is_mc_router, }; - switchdev_port_attr_set(p->dev, &attr); + switchdev_port_attr_set(p->dev, &attr, NULL); } /* diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 7b513c5d347f..f2b1343f8332 100644 --- 
a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1212,7 +1212,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_VLAN_FILTERING]) { u8 vlan_filter = nla_get_u8(data[IFLA_BR_VLAN_FILTERING]); - err = __br_vlan_filter_toggle(br, vlan_filter); + err = br_vlan_filter_toggle(br, vlan_filter, extack); if (err) return err; } @@ -1221,7 +1221,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_VLAN_PROTOCOL]) { __be16 vlan_proto = nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]); - err = __br_vlan_set_proto(br, vlan_proto); + err = __br_vlan_set_proto(br, vlan_proto, extack); if (err) return err; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a1639d41188b..d7d167e10b70 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1085,14 +1085,17 @@ int br_vlan_delete(struct net_bridge *br, u16 vid); void br_vlan_flush(struct net_bridge *br); struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid); void br_recalculate_fwd_mask(struct net_bridge *br); -int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); -int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); -int __br_vlan_set_proto(struct net_bridge *br, __be16 proto); -int br_vlan_set_proto(struct net_bridge *br, unsigned long val); +int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack); +int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, + struct netlink_ext_ack *extack); +int br_vlan_set_proto(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack); int br_vlan_set_stats(struct net_bridge *br, unsigned long val); int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); -int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); +int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack); int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, struct netlink_ext_ack *extack); int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, @@ -1261,8 +1264,9 @@ static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg) return 0; } -static inline int __br_vlan_filter_toggle(struct net_bridge *br, - unsigned long val) +static inline int br_vlan_filter_toggle(struct net_bridge *br, + unsigned long val, + struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 2514954c1431..9559aa2750fb 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -46,6 +46,20 @@ struct br_mrp { struct rcu_head rcu; }; +/* This type is returned by br_mrp_switchdev functions that allow to have a SW + * backup in case the HW can't implement completely the protocol. + * BR_MRP_NONE - means the HW can't run at all the protocol, so the SW stops + * configuring the node anymore. + * BR_MRP_SW - the HW can help the SW to run the protocol, by redirecting MRP + * frames to CPU. + * BR_MRP_HW - the HW can implement completely the protocol. 
+ */ +enum br_mrp_hw_support { + BR_MRP_NONE, + BR_MRP_SW, + BR_MRP_HW, +}; + /* br_mrp.c */ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance); int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance); @@ -65,23 +79,27 @@ int br_mrp_start_in_test(struct net_bridge *br, /* br_mrp_switchdev.c */ int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp); int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp); -int br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp, - enum br_mrp_ring_role_type role); +enum br_mrp_hw_support +br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_ring_role_type role); int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp, enum br_mrp_ring_state_type state); -int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, - u32 interval, u8 max_miss, u32 period, - bool monitor); +enum br_mrp_hw_support +br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period, + bool monitor); int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state); int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, enum br_mrp_port_role_type role); -int br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp, - u16 in_id, u32 ring_id, - enum br_mrp_in_role_type role); +enum br_mrp_hw_support +br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp, + u16 in_id, u32 ring_id, + enum br_mrp_in_role_type role); int br_mrp_switchdev_set_in_state(struct net_bridge *br, struct br_mrp *mrp, enum br_mrp_in_state_type state); -int br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, - u32 interval, u8 max_miss, u32 period); +enum br_mrp_hw_support +br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period); /* br_mrp_netlink.c */ int br_mrp_ring_port_open(struct net_device *dev, u8 loc); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index a3a5745660dd..21c6781906aa 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -43,7 +43,7 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) return; p->state = state; - err = switchdev_port_attr_set(p->dev, &attr); + err = switchdev_port_attr_set(p->dev, &attr, NULL); if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); @@ -591,7 +591,7 @@ int __set_ageing_time(struct net_device *dev, unsigned long t) }; int err; - err = switchdev_port_attr_set(dev, &attr); + err = switchdev_port_attr_set(dev, &attr, NULL); if (err && err != -EOPNOTSUPP) return err; diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 184cf4c8b06d..b89503832fcc 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -96,9 +96,11 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, attr.id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS; attr.flags = SWITCHDEV_F_DEFER; - err = switchdev_port_attr_set(p->dev, &attr); + err = switchdev_port_attr_set(p->dev, &attr, extack); if (err) { - NL_SET_ERR_MSG_MOD(extack, "error setting offload flag on port"); + if (extack && !extack->_msg) + NL_SET_ERR_MSG_MOD(extack, + "error setting offload flag on port"); return err; } diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 71f0f671c4ef..072e29840082 100644 --- a/net/bridge/br_sysfs_br.c +++ 
b/net/bridge/br_sysfs_br.c @@ -30,11 +30,13 @@ */ static ssize_t store_bridge_parm(struct device *d, const char *buf, size_t len, - int (*set)(struct net_bridge *, unsigned long)) + int (*set)(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack)) { struct net_bridge *br = to_bridge(d); - char *endp; + struct netlink_ext_ack extack = {0}; unsigned long val; + char *endp; int err; if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) @@ -47,9 +49,15 @@ static ssize_t store_bridge_parm(struct device *d, if (!rtnl_trylock()) return restart_syscall(); - err = (*set)(br, val); + err = (*set)(br, val, &extack); if (!err) netdev_state_change(br->dev); + if (extack._msg) { + if (err) + br_err(br, "%s\n", extack._msg); + else + br_warn(br, "%s\n", extack._msg); + } rtnl_unlock(); return err ? err : len; @@ -63,11 +71,17 @@ static ssize_t forward_delay_show(struct device *d, return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); } +static int set_forward_delay(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_set_forward_delay(br, val); +} + static ssize_t forward_delay_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_set_forward_delay); + return store_bridge_parm(d, buf, len, set_forward_delay); } static DEVICE_ATTR_RW(forward_delay); @@ -78,11 +92,17 @@ static ssize_t hello_time_show(struct device *d, struct device_attribute *attr, jiffies_to_clock_t(to_bridge(d)->hello_time)); } +static int set_hello_time(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_set_hello_time(br, val); +} + static ssize_t hello_time_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_set_hello_time); + return store_bridge_parm(d, buf, len, set_hello_time); } static DEVICE_ATTR_RW(hello_time); @@ -93,10 +113,16 @@ static ssize_t max_age_show(struct device *d, struct device_attribute *attr, jiffies_to_clock_t(to_bridge(d)->max_age)); } +static int set_max_age(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_set_max_age(br, val); +} + static ssize_t max_age_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_set_max_age); + return store_bridge_parm(d, buf, len, set_max_age); } static DEVICE_ATTR_RW(max_age); @@ -107,7 +133,8 @@ static ssize_t ageing_time_show(struct device *d, return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); } -static int set_ageing_time(struct net_bridge *br, unsigned long val) +static int set_ageing_time(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { return br_set_ageing_time(br, val); } @@ -128,9 +155,10 @@ static ssize_t stp_state_show(struct device *d, } -static int set_stp_state(struct net_bridge *br, unsigned long val) +static int set_stp_state(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { - return br_stp_set_enabled(br, val, NULL); + return br_stp_set_enabled(br, val, extack); } static ssize_t stp_state_store(struct device *d, @@ -149,7 +177,8 @@ static ssize_t group_fwd_mask_show(struct device *d, return sprintf(buf, "%#x\n", br->group_fwd_mask); } -static int set_group_fwd_mask(struct net_bridge *br, unsigned long val) +static int set_group_fwd_mask(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { 
if (val & BR_GROUPFWD_RESTRICTED) return -EINVAL; @@ -176,7 +205,8 @@ static ssize_t priority_show(struct device *d, struct device_attribute *attr, (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); } -static int set_priority(struct net_bridge *br, unsigned long val) +static int set_priority(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br_stp_set_bridge_priority(br, (u16) val); return 0; @@ -312,7 +342,8 @@ static ssize_t group_addr_store(struct device *d, static DEVICE_ATTR_RW(group_addr); -static int set_flush(struct net_bridge *br, unsigned long val) +static int set_flush(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br_fdb_flush(br); return 0; @@ -334,9 +365,10 @@ static ssize_t no_linklocal_learn_show(struct device *d, return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN)); } -static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val) +static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { - return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, NULL); + return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, extack); } static ssize_t no_linklocal_learn_store(struct device *d, @@ -355,11 +387,17 @@ static ssize_t multicast_router_show(struct device *d, return sprintf(buf, "%d\n", br->multicast_router); } +static int set_multicast_router(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_multicast_set_router(br, val); +} + static ssize_t multicast_router_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_multicast_set_router); + return store_bridge_parm(d, buf, len, set_multicast_router); } static DEVICE_ATTR_RW(multicast_router); @@ -371,11 +409,17 @@ static ssize_t multicast_snooping_show(struct device *d, return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED)); } +static int toggle_multicast(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_multicast_toggle(br, val); +} + static ssize_t multicast_snooping_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_multicast_toggle); + return store_bridge_parm(d, buf, len, toggle_multicast); } static DEVICE_ATTR_RW(multicast_snooping); @@ -388,7 +432,8 @@ static ssize_t multicast_query_use_ifaddr_show(struct device *d, br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)); } -static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val) +static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val); return 0; @@ -411,11 +456,17 @@ static ssize_t multicast_querier_show(struct device *d, return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_QUERIER)); } +static int set_multicast_querier(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_multicast_set_querier(br, val); +} + static ssize_t multicast_querier_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_multicast_set_querier); + return store_bridge_parm(d, buf, len, set_multicast_querier); } static DEVICE_ATTR_RW(multicast_querier); @@ -425,10 +476,12 @@ static ssize_t hash_elasticity_show(struct device *d, return sprintf(buf, "%u\n", 
RHT_ELASTICITY); } -static int set_elasticity(struct net_bridge *br, unsigned long val) +static int set_elasticity(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { - br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n", - RHT_ELASTICITY); + /* 16 is RHT_ELASTICITY */ + NL_SET_ERR_MSG_MOD(extack, + "the hash_elasticity option has been deprecated and is always 16"); return 0; } @@ -447,7 +500,8 @@ static ssize_t hash_max_show(struct device *d, struct device_attribute *attr, return sprintf(buf, "%u\n", br->hash_max); } -static int set_hash_max(struct net_bridge *br, unsigned long val) +static int set_hash_max(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->hash_max = val; return 0; @@ -469,11 +523,17 @@ static ssize_t multicast_igmp_version_show(struct device *d, return sprintf(buf, "%u\n", br->multicast_igmp_version); } +static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_multicast_set_igmp_version(br, val); +} + static ssize_t multicast_igmp_version_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_multicast_set_igmp_version); + return store_bridge_parm(d, buf, len, set_multicast_igmp_version); } static DEVICE_ATTR_RW(multicast_igmp_version); @@ -485,7 +545,8 @@ static ssize_t multicast_last_member_count_show(struct device *d, return sprintf(buf, "%u\n", br->multicast_last_member_count); } -static int set_last_member_count(struct net_bridge *br, unsigned long val) +static int set_last_member_count(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->multicast_last_member_count = val; return 0; @@ -506,7 +567,8 @@ static ssize_t multicast_startup_query_count_show( return sprintf(buf, "%u\n", br->multicast_startup_query_count); } -static int set_startup_query_count(struct net_bridge *br, unsigned long val) +static int set_startup_query_count(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->multicast_startup_query_count = val; return 0; @@ -528,7 +590,8 @@ static ssize_t multicast_last_member_interval_show( jiffies_to_clock_t(br->multicast_last_member_interval)); } -static int set_last_member_interval(struct net_bridge *br, unsigned long val) +static int set_last_member_interval(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->multicast_last_member_interval = clock_t_to_jiffies(val); return 0; @@ -550,7 +613,8 @@ static ssize_t multicast_membership_interval_show( jiffies_to_clock_t(br->multicast_membership_interval)); } -static int set_membership_interval(struct net_bridge *br, unsigned long val) +static int set_membership_interval(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->multicast_membership_interval = clock_t_to_jiffies(val); return 0; @@ -573,7 +637,8 @@ static ssize_t multicast_querier_interval_show(struct device *d, jiffies_to_clock_t(br->multicast_querier_interval)); } -static int set_querier_interval(struct net_bridge *br, unsigned long val) +static int set_querier_interval(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->multicast_querier_interval = clock_t_to_jiffies(val); return 0; @@ -596,7 +661,8 @@ static ssize_t multicast_query_interval_show(struct device *d, jiffies_to_clock_t(br->multicast_query_interval)); } -static int set_query_interval(struct net_bridge *br, unsigned 
long val) +static int set_query_interval(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->multicast_query_interval = clock_t_to_jiffies(val); return 0; @@ -619,7 +685,8 @@ static ssize_t multicast_query_response_interval_show( jiffies_to_clock_t(br->multicast_query_response_interval)); } -static int set_query_response_interval(struct net_bridge *br, unsigned long val) +static int set_query_response_interval(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->multicast_query_response_interval = clock_t_to_jiffies(val); return 0; @@ -642,7 +709,8 @@ static ssize_t multicast_startup_query_interval_show( jiffies_to_clock_t(br->multicast_startup_query_interval)); } -static int set_startup_query_interval(struct net_bridge *br, unsigned long val) +static int set_startup_query_interval(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br->multicast_startup_query_interval = clock_t_to_jiffies(val); return 0; @@ -666,7 +734,8 @@ static ssize_t multicast_stats_enabled_show(struct device *d, br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)); } -static int set_stats_enabled(struct net_bridge *br, unsigned long val) +static int set_stats_enabled(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!val); return 0; @@ -691,11 +760,17 @@ static ssize_t multicast_mld_version_show(struct device *d, return sprintf(buf, "%u\n", br->multicast_mld_version); } +static int set_multicast_mld_version(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_multicast_set_mld_version(br, val); +} + static ssize_t multicast_mld_version_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_multicast_set_mld_version); + return store_bridge_parm(d, buf, len, set_multicast_mld_version); } static DEVICE_ATTR_RW(multicast_mld_version); #endif @@ -708,7 +783,8 @@ static ssize_t nf_call_iptables_show( return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES)); } -static int set_nf_call_iptables(struct net_bridge *br, unsigned long val) +static int set_nf_call_iptables(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val); return 0; @@ -729,7 +805,8 @@ static ssize_t nf_call_ip6tables_show( return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES)); } -static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val) +static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val); return 0; @@ -750,7 +827,8 @@ static ssize_t nf_call_arptables_show( return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES)); } -static int set_nf_call_arptables(struct net_bridge *br, unsigned long val) +static int set_nf_call_arptables(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val); return 0; @@ -821,11 +899,17 @@ static ssize_t vlan_stats_enabled_show(struct device *d, return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED)); } +static int set_vlan_stats_enabled(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_vlan_set_stats(br, val); +} + static ssize_t vlan_stats_enabled_store(struct device *d, struct device_attribute *attr, const char 
*buf, size_t len) { - return store_bridge_parm(d, buf, len, br_vlan_set_stats); + return store_bridge_parm(d, buf, len, set_vlan_stats_enabled); } static DEVICE_ATTR_RW(vlan_stats_enabled); @@ -837,11 +921,17 @@ static ssize_t vlan_stats_per_port_show(struct device *d, return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)); } +static int set_vlan_stats_per_port(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) +{ + return br_vlan_set_stats_per_port(br, val); +} + static ssize_t vlan_stats_per_port_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_vlan_set_stats_per_port); + return store_bridge_parm(d, buf, len, set_vlan_stats_per_port); } static DEVICE_ATTR_RW(vlan_stats_per_port); #endif diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index bb2909738518..8829f621b8ec 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -806,7 +806,8 @@ void br_recalculate_fwd_mask(struct net_bridge *br) ~(1u << br->group_addr[5]); } -int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) +int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = br->dev, @@ -819,7 +820,7 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val) return 0; - err = switchdev_port_attr_set(br->dev, &attr); + err = switchdev_port_attr_set(br->dev, &attr, extack); if (err && err != -EOPNOTSUPP) return err; @@ -831,11 +832,6 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) return 0; } -int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) -{ - return __br_vlan_filter_toggle(br, val); -} - bool br_vlan_enabled(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -854,7 +850,8 @@ int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto) } EXPORT_SYMBOL_GPL(br_vlan_get_proto); -int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) +int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, + struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = br->dev, @@ -871,7 +868,7 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) if (br->vlan_proto == proto) return 0; - err = switchdev_port_attr_set(br->dev, &attr); + err = switchdev_port_attr_set(br->dev, &attr, extack); if (err && err != -EOPNOTSUPP) return err; @@ -901,7 +898,7 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) err_filt: attr.u.vlan_protocol = ntohs(oldproto); - switchdev_port_attr_set(br->dev, &attr); + switchdev_port_attr_set(br->dev, &attr, NULL); list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) vlan_vid_del(p->dev, proto, vlan->vid); @@ -915,12 +912,13 @@ err_filt: return err; } -int br_vlan_set_proto(struct net_bridge *br, unsigned long val) +int br_vlan_set_proto(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { if (!eth_type_vlan(htons(val))) return -EPROTONOSUPPORT; - return __br_vlan_set_proto(br, htons(val)); + return __br_vlan_set_proto(br, htons(val), extack); } int br_vlan_set_stats(struct net_bridge *br, unsigned long val) @@ -1100,7 +1098,8 @@ err_port: goto out; } -int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val) +int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { u16 pvid = val; int err = 0; @@ -1117,7 
+1116,7 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val) err = -EPERM; goto out; } - err = __br_vlan_set_default_pvid(br, pvid, NULL); + err = __br_vlan_set_default_pvid(br, pvid, extack); out: return err; } @@ -1167,7 +1166,7 @@ int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) if (!vg) goto out; - ret = switchdev_port_attr_set(p->dev, &attr); + ret = switchdev_port_attr_set(p->dev, &attr, extack); if (ret && ret != -EOPNOTSUPP) goto err_vlan_enabled; diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 79b6a04d8eb6..fadc7c8a3107 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -115,10 +115,7 @@ static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) else skb->ip_summed = CHECKSUM_NONE; - if (in_interrupt()) - netif_rx(skb); - else - netif_rx_ni(skb); + netif_rx_any_context(skb); /* Update statistics. */ priv->netdev->stats.rx_packets++; diff --git a/net/core/dev.c b/net/core/dev.c index ea9b46318d23..6c5967e80132 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2217,28 +2217,14 @@ static inline void net_timestamp_set(struct sk_buff *skb) bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { - unsigned int len; - - if (!(dev->flags & IFF_UP)) - return false; - - len = dev->mtu + dev->hard_header_len + VLAN_HLEN; - if (skb->len <= len) - return true; - - /* if TSO is enabled, we don't care about the length as the packet - * could be forwarded without being segmented before - */ - if (skb_is_gso(skb)) - return true; - - return false; + return __is_skb_forwardable(dev, skb, true); } EXPORT_SYMBOL_GPL(is_skb_forwardable); -int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, + bool check_mtu) { - int ret = ____dev_forward_skb(dev, skb); + int ret = ____dev_forward_skb(dev, skb, check_mtu); if (likely(!ret)) { skb->protocol = eth_type_trans(skb, dev); @@ -2247,6 +2233,11 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) return ret; } + +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, true); +} EXPORT_SYMBOL_GPL(__dev_forward_skb); /** @@ -2273,6 +2264,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dev_forward_skb); +int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); +} + static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) diff --git a/net/core/filter.c b/net/core/filter.c index 3b728ab79a61..adfdad234674 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2083,13 +2083,13 @@ static const struct bpf_func_proto bpf_csum_level_proto = { static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { - return dev_forward_skb(dev, skb); + return dev_forward_skb_nomtu(dev, skb); } static inline int __bpf_rx_skb_no_mac(struct net_device *dev, struct sk_buff *skb) { - int ret = ____dev_forward_skb(dev, skb); + int ret = ____dev_forward_skb(dev, skb, false); if (likely(!ret)) { skb->dev = dev; @@ -2480,7 +2480,7 @@ int skb_do_redirect(struct sk_buff *skb) goto out_drop; dev = ops->ndo_get_peer_dev(dev); if (unlikely(!dev || - !is_skb_forwardable(dev, skb) || + !(dev->flags & IFF_UP) || net_eq(net, dev_net(dev)))) goto out_drop; skb->dev = dev; @@ -3552,11 +3552,7 @@ static int bpf_skb_net_shrink(struct sk_buff 
*skb, u32 off, u32 len_diff, return 0; } -static u32 __bpf_skb_max_len(const struct sk_buff *skb) -{ - return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len : - SKB_MAX_ALLOC; -} +#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) @@ -3605,7 +3601,7 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, { u32 len_cur, len_diff_abs = abs(len_diff); u32 len_min = bpf_skb_net_base_len(skb); - u32 len_max = __bpf_skb_max_len(skb); + u32 len_max = BPF_SKB_MAX_LEN; __be16 proto = skb->protocol; bool shrink = len_diff < 0; u32 off; @@ -3688,7 +3684,7 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, u64 flags) { - u32 max_len = __bpf_skb_max_len(skb); + u32 max_len = BPF_SKB_MAX_LEN; u32 min_len = __bpf_skb_min_len(skb); int ret; @@ -3764,7 +3760,7 @@ static const struct bpf_func_proto sk_skb_change_tail_proto = { static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, u64 flags) { - u32 max_len = __bpf_skb_max_len(skb); + u32 max_len = BPF_SKB_MAX_LEN; u32 new_len = skb->len + head_room; int ret; @@ -4631,6 +4627,18 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) +{ + return sk ? sock_gen_cookie(sk) : 0; +} + +const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { + .func = bpf_get_socket_ptr_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +}; + BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __sock_gen_cookie(ctx->sk); @@ -5291,12 +5299,14 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, const struct neighbour *neigh, - const struct net_device *dev) + const struct net_device *dev, u32 mtu) { memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; + if (mtu) + params->mtu_result = mtu; /* union with tot_len */ return 0; } @@ -5312,8 +5322,8 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct net_device *dev; struct fib_result res; struct flowi4 fl4; + u32 mtu = 0; int err; - u32 mtu; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) @@ -5380,8 +5390,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); - if (params->tot_len > mtu) + if (params->tot_len > mtu) { + params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; + } } nhc = res.nhc; @@ -5415,7 +5427,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (!neigh) return BPF_FIB_LKUP_RET_NO_NEIGH; - return bpf_fib_set_fwd_params(params, neigh, dev); + return bpf_fib_set_fwd_params(params, neigh, dev, mtu); } #endif @@ -5432,7 +5444,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct flowi6 fl6; int strict = 0; int oif, err; - u32 mtu; + u32 mtu = 0; /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) @@ -5507,8 +5519,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct 
bpf_fib_lookup *params,
 	if (check_mtu) {
 		mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
-		if (params->tot_len > mtu)
+		if (params->tot_len > mtu) {
+			params->mtu_result = mtu; /* union with tot_len */
 			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
+		}
 	}
 
 	if (res.nh->fib_nh_lws)
@@ -5528,7 +5542,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	if (!neigh)
 		return BPF_FIB_LKUP_RET_NO_NEIGH;
 
-	return bpf_fib_set_fwd_params(params, neigh, dev);
+	return bpf_fib_set_fwd_params(params, neigh, dev, mtu);
 }
 #endif
 
@@ -5571,6 +5585,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
 {
 	struct net *net = dev_net(skb->dev);
 	int rc = -EAFNOSUPPORT;
+	bool check_mtu = false;
 
 	if (plen < sizeof(*params))
 		return -EINVAL;
@@ -5578,25 +5593,33 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
 	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
 		return -EINVAL;
 
+	if (params->tot_len)
+		check_mtu = true;
+
 	switch (params->family) {
 #if IS_ENABLED(CONFIG_INET)
 	case AF_INET:
-		rc = bpf_ipv4_fib_lookup(net, params, flags, false);
+		rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
 		break;
 #endif
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		rc = bpf_ipv6_fib_lookup(net, params, flags, false);
+		rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
 		break;
 #endif
 	}
 
-	if (!rc) {
+	if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {
 		struct net_device *dev;
 
+		/* When tot_len isn't provided by the user, check the skb
+		 * against the MTU of the net_device found by the FIB lookup
+		 */
 		dev = dev_get_by_index_rcu(net, params->ifindex);
 		if (!is_skb_forwardable(dev, skb))
 			rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
+
+		params->mtu_result = dev->mtu; /* union with tot_len */
 	}
 
 	return rc;
@@ -5612,6 +5635,116 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
 	.arg4_type = ARG_ANYTHING,
 };
 
+static struct net_device *__dev_via_ifindex(struct net_device *dev_curr,
+					    u32 ifindex)
+{
+	struct net *netns = dev_net(dev_curr);
+
+	/* Non-redirect use-cases can use ifindex=0 and save an ifindex lookup */
+	if (ifindex == 0)
+		return dev_curr;
+
+	return dev_get_by_index_rcu(netns, ifindex);
+}
+
+BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
+	   u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
+{
+	int ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
+	struct net_device *dev = skb->dev;
+	int skb_len, dev_len;
+	int mtu;
+
+	if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
+		return -EINVAL;
+
+	if (unlikely(flags & BPF_MTU_CHK_SEGS && len_diff))
+		return -EINVAL;
+
+	dev = __dev_via_ifindex(dev, ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	mtu = READ_ONCE(dev->mtu);
+
+	dev_len = mtu + dev->hard_header_len;
+	skb_len = skb->len + len_diff; /* a negative len_diff can make the check pass */
+	if (skb_len <= dev_len) {
+		ret = BPF_MTU_CHK_RET_SUCCESS;
+		goto out;
+	}
+	/* At this point, skb->len exceeds the MTU, but as it includes the
+	 * length of all segments, it can still be below the MTU. The SKB can
+	 * possibly get re-segmented in the transmit path (see
+	 * validate_xmit_skb). Thus, the user must choose whether segs are to
+	 * be MTU checked.
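As a usage illustration, here is a minimal sketch (not part of this patch; the program and section names are invented) of how a tc BPF program might call the new helper, using the bpf_check_mtu() signature and BPF_MTU_CHK_* values this series adds to the UAPI:

	#include <linux/bpf.h>
	#include <linux/pkt_cls.h>
	#include <bpf/bpf_helpers.h>

	SEC("classifier")
	int mtu_guard(struct __sk_buff *skb)
	{
		__u32 mtu_len = 0;

		/* ifindex 0: check against skb->dev; len_diff 0: no size change */
		if (bpf_check_mtu(skb, 0, &mtu_len, 0, 0) != BPF_MTU_CHK_RET_SUCCESS)
			return TC_ACT_SHOT; /* mtu_len now holds the device MTU */

		return TC_ACT_OK;
	}

	char _license[] SEC("license") = "GPL"; /* the helper is gpl_only */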
+ */ + if (skb_is_gso(skb)) { + ret = BPF_MTU_CHK_RET_SUCCESS; + + if (flags & BPF_MTU_CHK_SEGS && + !skb_gso_validate_network_len(skb, mtu)) + ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + } +out: + /* BPF verifier guarantees valid pointer */ + *mtu_len = mtu; + + return ret; +} + +BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, + u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) +{ + struct net_device *dev = xdp->rxq->dev; + int xdp_len = xdp->data_end - xdp->data; + int ret = BPF_MTU_CHK_RET_SUCCESS; + int mtu, dev_len; + + /* XDP variant doesn't support multi-buffer segment check (yet) */ + if (unlikely(flags)) + return -EINVAL; + + dev = __dev_via_ifindex(dev, ifindex); + if (unlikely(!dev)) + return -ENODEV; + + mtu = READ_ONCE(dev->mtu); + + /* Add L2-header as dev MTU is L3 size */ + dev_len = mtu + dev->hard_header_len; + + xdp_len += len_diff; /* minus result pass check */ + if (xdp_len > dev_len) + ret = BPF_MTU_CHK_RET_FRAG_NEEDED; + + /* BPF verifier guarantees valid pointer */ + *mtu_len = mtu; + + return ret; +} + +static const struct bpf_func_proto bpf_skb_check_mtu_proto = { + .func = bpf_skb_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_INT, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { + .func = bpf_xdp_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_INT, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) { @@ -7021,6 +7154,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: return &bpf_sock_addr_setsockopt_proto; default: return NULL; @@ -7031,6 +7172,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: return &bpf_sock_addr_getsockopt_proto; default: return NULL; @@ -7181,6 +7330,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_skb_check_mtu_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: @@ -7250,6 +7401,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_xdp_check_mtu_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_udp: return &bpf_xdp_sk_lookup_udp_proto; diff --git a/net/core/flow_dissector.c 
b/net/core/flow_dissector.c
index c565c7a17091..2ef2224b3bff 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1056,6 +1056,9 @@ proto_again:
 			key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
 		}
 
+		__skb_flow_dissect_ipv4(skb, flow_dissector,
+					target_container, data, iph);
+
 		if (ip_is_fragment(iph)) {
 			key_control->flags |= FLOW_DIS_IS_FRAGMENT;
 
@@ -1072,9 +1075,6 @@ proto_again:
 			}
 		}
 
-		__skb_flow_dissect_ipv4(skb, flow_dissector,
-					target_container, data, iph);
-
 		break;
 	}
 	case htons(ETH_P_IPV6): {
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 25cdbb20f3a0..1261512d6807 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -669,14 +669,13 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
 	kfree(psock);
 }
 
-void sk_psock_destroy(struct rcu_head *rcu)
+static void sk_psock_destroy(struct rcu_head *rcu)
 {
 	struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
 
 	INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
 	schedule_work(&psock->gc);
 }
-EXPORT_SYMBOL_GPL(sk_psock_destroy);
 
 void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 {
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 3a8c9ab4ecbe..05354976c1fc 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -513,3 +513,73 @@ void xdp_warn(const char *msg, const char *func, const int line)
 	WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg);
 };
 EXPORT_SYMBOL_GPL(xdp_warn);
+
+int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp)
+{
+	n_skb = kmem_cache_alloc_bulk(skbuff_head_cache, gfp,
+				      n_skb, skbs);
+	if (unlikely(!n_skb))
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk);
+
+struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
+					   struct sk_buff *skb,
+					   struct net_device *dev)
+{
+	unsigned int headroom, frame_size;
+	void *hard_start;
+
+	/* Part of headroom was reserved for xdpf */
+	headroom = sizeof(*xdpf) + xdpf->headroom;
+
+	/* The memory size backing the xdp_frame data already has reserved
+	 * room for build_skb to place skb_shared_info in the tailroom.
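+	 *
+	 * (Illustrative note, not in this patch: the expected callers are
+	 * cpumap or drivers converting a redirected xdp_frame back into an
+	 * sk_buff, e.g. skb = xdp_build_skb_from_frame(xdpf, dev); followed
+	 * by napi_gro_receive(), with xdp_return_frame(xdpf) on failure.)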
+ */ + frame_size = xdpf->frame_sz; + + hard_start = xdpf->data - headroom; + skb = build_skb_around(skb, hard_start, frame_size); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, headroom); + __skb_put(skb, xdpf->len); + if (xdpf->metasize) + skb_metadata_set(skb, xdpf->metasize); + + /* Essential SKB info: protocol and skb->dev */ + skb->protocol = eth_type_trans(skb, dev); + + /* Optional SKB info, currently missing: + * - HW checksum info (skb->ip_summed) + * - HW RX hash (skb_set_hash) + * - RX ring dev queue index (skb_record_rx_queue) + */ + + /* Until page_pool get SKB return path, release DMA here */ + xdp_release_frame(xdpf); + + /* Allow SKB to reuse area used by xdp_frame */ + xdp_scrub_frame(xdpf); + + return skb; +} +EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame); + +struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct net_device *dev) +{ + struct sk_buff *skb; + + skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + + return __xdp_build_skb_from_frame(xdpf, skb, dev); +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index f5949b39f6f7..2eeaa42f2e08 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -31,6 +31,10 @@ enum { DSA_NOTIFIER_VLAN_DEL, DSA_NOTIFIER_MTU, DSA_NOTIFIER_TAG_PROTO, + DSA_NOTIFIER_MRP_ADD, + DSA_NOTIFIER_MRP_DEL, + DSA_NOTIFIER_MRP_ADD_RING_ROLE, + DSA_NOTIFIER_MRP_DEL_RING_ROLE, }; /* DSA_NOTIFIER_AGEING_TIME */ @@ -75,6 +79,7 @@ struct dsa_notifier_vlan_info { const struct switchdev_obj_port_vlan *vlan; int sw_index; int port; + struct netlink_ext_ack *extack; }; /* DSA_NOTIFIER_MTU */ @@ -90,6 +95,20 @@ struct dsa_notifier_tag_proto_info { const struct dsa_device_ops *tag_ops; }; +/* DSA_NOTIFIER_MRP_* */ +struct dsa_notifier_mrp_info { + const struct switchdev_obj_mrp *mrp; + int sw_index; + int port; +}; + +/* DSA_NOTIFIER_MRP_* */ +struct dsa_notifier_mrp_ring_role_info { + const struct switchdev_obj_ring_role_mrp *mrp; + int sw_index; + int port; +}; + struct dsa_switchdev_event_work { struct dsa_switch *ds; int port; @@ -169,7 +188,8 @@ int dsa_port_lag_change(struct dsa_port *dp, int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev, struct netdev_lag_upper_info *uinfo); void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); -int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering); +int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, + struct netlink_ext_ack *extack); bool dsa_port_skip_vlan_configuration(struct dsa_port *dp); int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock); int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, @@ -192,9 +212,18 @@ int dsa_port_bridge_flags(const struct dsa_port *dp, int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, struct netlink_ext_ack *extack); int dsa_port_vlan_add(struct dsa_port *dp, - const struct switchdev_obj_port_vlan *vlan); + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack); int dsa_port_vlan_del(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan); +int dsa_port_mrp_add(const struct dsa_port *dp, + const struct switchdev_obj_mrp *mrp); +int dsa_port_mrp_del(const struct dsa_port *dp, + const struct switchdev_obj_mrp *mrp); +int dsa_port_mrp_add_ring_role(const struct dsa_port *dp, + const struct switchdev_obj_ring_role_mrp *mrp); +int dsa_port_mrp_del_ring_role(const struct dsa_port 
*dp, + const struct switchdev_obj_ring_role_mrp *mrp); int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr); diff --git a/net/dsa/port.c b/net/dsa/port.c index 80e6471a7a5c..c9c6d7ab3f47 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -294,7 +294,8 @@ void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag) /* Must be called under rcu_read_lock() */ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, - bool vlan_filtering) + bool vlan_filtering, + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; int err, i; @@ -324,8 +325,8 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, */ err = br_vlan_get_info(br, vid, &br_info); if (err == 0) { - dev_err(ds->dev, "Must remove upper %s first\n", - upper_dev->name); + NL_SET_ERR_MSG_MOD(extack, + "Must first remove VLAN uppers having VIDs also present in bridge"); return false; } } @@ -351,14 +352,16 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, if (other_bridge == dp->bridge_dev) continue; if (br_vlan_enabled(other_bridge) != vlan_filtering) { - dev_err(ds->dev, "VLAN filtering is a global setting\n"); + NL_SET_ERR_MSG_MOD(extack, + "VLAN filtering is a global setting"); return false; } } return true; } -int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering) +int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; bool apply; @@ -372,7 +375,7 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering) * dsa_slave_switchdev_event(). */ rcu_read_lock(); - apply = dsa_port_can_apply_vlan_filtering(dp, vlan_filtering); + apply = dsa_port_can_apply_vlan_filtering(dp, vlan_filtering, extack); rcu_read_unlock(); if (!apply) return -EINVAL; @@ -380,7 +383,8 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering) if (dsa_port_is_vlan_filtering(dp) == vlan_filtering) return 0; - err = ds->ops->port_vlan_filtering(ds, dp->index, vlan_filtering); + err = ds->ops->port_vlan_filtering(ds, dp->index, vlan_filtering, + extack); if (err) return err; @@ -535,12 +539,14 @@ int dsa_port_mdb_del(const struct dsa_port *dp, } int dsa_port_vlan_add(struct dsa_port *dp, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct dsa_notifier_vlan_info info = { .sw_index = dp->ds->index, .port = dp->index, .vlan = vlan, + .extack = extack, }; return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); @@ -558,6 +564,54 @@ int dsa_port_vlan_del(struct dsa_port *dp, return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); } +int dsa_port_mrp_add(const struct dsa_port *dp, + const struct switchdev_obj_mrp *mrp) +{ + struct dsa_notifier_mrp_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mrp = mrp, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_MRP_ADD, &info); +} + +int dsa_port_mrp_del(const struct dsa_port *dp, + const struct switchdev_obj_mrp *mrp) +{ + struct dsa_notifier_mrp_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mrp = mrp, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_MRP_DEL, &info); +} + +int dsa_port_mrp_add_ring_role(const struct dsa_port *dp, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct dsa_notifier_mrp_ring_role_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mrp = mrp, + }; + 
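+	/* Like the other MRP ops, dsa_port_notify() fans this request out
+	 * to every switch in the tree via the DSA notifier chain.
+	 */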
+ return dsa_port_notify(dp, DSA_NOTIFIER_MRP_ADD_RING_ROLE, &info); +} + +int dsa_port_mrp_del_ring_role(const struct dsa_port *dp, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct dsa_notifier_mrp_ring_role_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mrp = mrp, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_MRP_DEL_RING_ROLE, &info); +} + void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, const struct dsa_device_ops *tag_ops) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 8c9a41a7209a..491e3761b5f4 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -286,7 +286,8 @@ static int dsa_slave_port_attr_set(struct net_device *dev, ret = dsa_port_set_state(dp, attr->u.stp_state); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: - ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering); + ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering, + extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: ret = dsa_port_ageing_time(dp, attr->u.ageing_time); @@ -357,11 +358,14 @@ static int dsa_slave_vlan_add(struct net_device *dev, rcu_read_lock(); err = dsa_slave_vlan_check_for_8021q_uppers(dev, &vlan); rcu_read_unlock(); - if (err) + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Port already has a VLAN upper with this VID"); return err; + } } - err = dsa_port_vlan_add(dp, &vlan); + err = dsa_port_vlan_add(dp, &vlan, extack); if (err) return err; @@ -371,7 +375,7 @@ static int dsa_slave_vlan_add(struct net_device *dev, */ vlan.flags &= ~BRIDGE_VLAN_INFO_PVID; - err = dsa_port_vlan_add(dp->cpu_dp, &vlan); + err = dsa_port_vlan_add(dp->cpu_dp, &vlan, extack); if (err) return err; @@ -400,6 +404,17 @@ static int dsa_slave_port_obj_add(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dsa_slave_vlan_add(dev, obj, extack); break; + case SWITCHDEV_OBJ_ID_MRP: + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + return -EOPNOTSUPP; + err = dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj)); + break; + case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + return -EOPNOTSUPP; + err = dsa_port_mrp_add_ring_role(dp, + SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); + break; default: err = -EOPNOTSUPP; break; @@ -457,6 +472,17 @@ static int dsa_slave_port_obj_del(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dsa_slave_vlan_del(dev, obj); break; + case SWITCHDEV_OBJ_ID_MRP: + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + return -EOPNOTSUPP; + err = dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj)); + break; + case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + return -EOPNOTSUPP; + err = dsa_port_mrp_del_ring_role(dp, + SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); + break; default: err = -EOPNOTSUPP; break; @@ -1287,17 +1313,25 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; + struct netlink_ext_ack extack = {0}; int ret; /* User port... */ - ret = dsa_port_vlan_add(dp, &vlan); - if (ret) + ret = dsa_port_vlan_add(dp, &vlan, &extack); + if (ret) { + if (extack._msg) + netdev_err(dev, "%s\n", extack._msg); return ret; + } /* And CPU port... 
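	 * (the CPU port must join the VLAN as well, so tagged traffic for
	 * this VID can pass between the user port and the DSA master)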
*/ - ret = dsa_port_vlan_add(dp->cpu_dp, &vlan); - if (ret) + ret = dsa_port_vlan_add(dp->cpu_dp, &vlan, &extack); + if (ret) { + if (extack._msg) + netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index, + extack._msg); return ret; + } return vlan_vid_add(master, proto, vid); } diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 1906179e59f7..4b5da89dc27a 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -106,6 +106,7 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, { bool unset_vlan_filtering = br_vlan_enabled(info->br); struct dsa_switch_tree *dst = ds->dst; + struct netlink_ext_ack extack = {0}; int err, i; if (dst->index == info->tree_index && ds->index == info->sw_index && @@ -137,7 +138,10 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, } if (unset_vlan_filtering) { err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port), - false); + false, &extack); + if (extack._msg) + dev_err(ds->dev, "port %d: %s\n", info->port, + extack._msg); if (err && err != EOPNOTSUPP) return err; } @@ -291,7 +295,8 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds, for (port = 0; port < ds->num_ports; port++) { if (dsa_switch_vlan_match(ds, port, info)) { - err = ds->ops->port_vlan_add(ds, port, info->vlan); + err = ds->ops->port_vlan_add(ds, port, info->vlan, + info->extack); if (err) return err; } @@ -367,6 +372,99 @@ static int dsa_switch_change_tag_proto(struct dsa_switch *ds, return 0; } +static bool dsa_switch_mrp_match(struct dsa_switch *ds, int port, + struct dsa_notifier_mrp_info *info) +{ + if (ds->index == info->sw_index && port == info->port) + return true; + + if (dsa_is_dsa_port(ds, port)) + return true; + + return false; +} + +static int dsa_switch_mrp_add(struct dsa_switch *ds, + struct dsa_notifier_mrp_info *info) +{ + int err = 0; + int port; + + if (!ds->ops->port_mrp_add) + return -EOPNOTSUPP; + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_mrp_match(ds, port, info)) { + err = ds->ops->port_mrp_add(ds, port, info->mrp); + if (err) + break; + } + } + + return err; +} + +static int dsa_switch_mrp_del(struct dsa_switch *ds, + struct dsa_notifier_mrp_info *info) +{ + if (!ds->ops->port_mrp_del) + return -EOPNOTSUPP; + + if (ds->index == info->sw_index) + return ds->ops->port_mrp_del(ds, info->port, info->mrp); + + return 0; +} + +static bool +dsa_switch_mrp_ring_role_match(struct dsa_switch *ds, int port, + struct dsa_notifier_mrp_ring_role_info *info) +{ + if (ds->index == info->sw_index && port == info->port) + return true; + + if (dsa_is_dsa_port(ds, port)) + return true; + + return false; +} + +static int +dsa_switch_mrp_add_ring_role(struct dsa_switch *ds, + struct dsa_notifier_mrp_ring_role_info *info) +{ + int err = 0; + int port; + + if (!ds->ops->port_mrp_add) + return -EOPNOTSUPP; + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_mrp_ring_role_match(ds, port, info)) { + err = ds->ops->port_mrp_add_ring_role(ds, port, + info->mrp); + if (err) + break; + } + } + + return err; +} + +static int +dsa_switch_mrp_del_ring_role(struct dsa_switch *ds, + struct dsa_notifier_mrp_ring_role_info *info) +{ + if (!ds->ops->port_mrp_del) + return -EOPNOTSUPP; + + if (ds->index == info->sw_index) + return ds->ops->port_mrp_del_ring_role(ds, info->port, + info->mrp); + + return 0; +} + static int dsa_switch_event(struct notifier_block *nb, unsigned long event, void *info) { @@ -422,6 +520,18 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_TAG_PROTO: err = dsa_switch_change_tag_proto(ds, 
info); break; + case DSA_NOTIFIER_MRP_ADD: + err = dsa_switch_mrp_add(ds, info); + break; + case DSA_NOTIFIER_MRP_DEL: + err = dsa_switch_mrp_del(ds, info); + break; + case DSA_NOTIFIER_MRP_ADD_RING_ROLE: + err = dsa_switch_mrp_add_ring_role(ds, info); + break; + case DSA_NOTIFIER_MRP_DEL_RING_ROLE: + err = dsa_switch_mrp_del_ring_role(ds, info); + break; default: err = -EOPNOTSUPP; break; diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 16a1afd5b8e1..743809b5806b 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -1,174 +1,74 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright 2019 NXP Semiconductors */ +#include <linux/dsa/ocelot.h> #include <soc/mscc/ocelot.h> -#include <linux/packing.h> #include "dsa_priv.h" -/* The CPU injection header and the CPU extraction header can have 3 types of - * prefixes: long, short and no prefix. The format of the header itself is the - * same in all 3 cases. - * - * Extraction with long prefix: - * - * +-------------------+-------------------+------+------+------------+-------+ - * | ff:ff:ff:ff:ff:ff | ff:ff:ff:ff:ff:ff | 8880 | 000a | extraction | frame | - * | | | | | header | | - * +-------------------+-------------------+------+------+------------+-------+ - * 48 bits 48 bits 16 bits 16 bits 128 bits - * - * Extraction with short prefix: - * - * +------+------+------------+-------+ - * | 8880 | 000a | extraction | frame | - * | | | header | | - * +------+------+------------+-------+ - * 16 bits 16 bits 128 bits - * - * Extraction with no prefix: - * - * +------------+-------+ - * | extraction | frame | - * | header | | - * +------------+-------+ - * 128 bits - * - * - * Injection with long prefix: - * - * +-------------------+-------------------+------+------+------------+-------+ - * | any dmac | any smac | 8880 | 000a | injection | frame | - * | | | | | header | | - * +-------------------+-------------------+------+------+------------+-------+ - * 48 bits 48 bits 16 bits 16 bits 128 bits - * - * Injection with short prefix: - * - * +------+------+------------+-------+ - * | 8880 | 000a | injection | frame | - * | | | header | | - * +------+------+------------+-------+ - * 16 bits 16 bits 128 bits - * - * Injection with no prefix: - * - * +------------+-------+ - * | injection | frame | - * | header | | - * +------------+-------+ - * 128 bits - * - * The injection header looks like this (network byte order, bit 127 - * is part of lowest address byte in memory, bit 0 is part of highest - * address byte): - * - * +------+------+------+------+------+------+------+------+ - * 127:120 |BYPASS| MASQ | MASQ_PORT |REW_OP|REW_OP| - * +------+------+------+------+------+------+------+------+ - * 119:112 | REW_OP | - * +------+------+------+------+------+------+------+------+ - * 111:104 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 103: 96 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 95: 88 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 87: 80 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 79: 72 | RSV | - * +------+------+------+------+------+------+------+------+ - * 71: 64 | RSV | DEST | - * +------+------+------+------+------+------+------+------+ - * 63: 56 | DEST | - * +------+------+------+------+------+------+------+------+ - * 55: 48 | RSV | - * +------+------+------+------+------+------+------+------+ - * 47: 40 | RSV | SRC_PORT | RSV |TFRM_TIMER| - * 
+------+------+------+------+------+------+------+------+ - * 39: 32 | TFRM_TIMER | RSV | - * +------+------+------+------+------+------+------+------+ - * 31: 24 | RSV | DP | POP_CNT | CPUQ | - * +------+------+------+------+------+------+------+------+ - * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| - * +------+------+------+------+------+------+------+------+ - * 15: 8 | PCP | DEI | VID | - * +------+------+------+------+------+------+------+------+ - * 7: 0 | VID | - * +------+------+------+------+------+------+------+------+ - * - * And the extraction header looks like this: - * - * +------+------+------+------+------+------+------+------+ - * 127:120 | RSV | REW_OP | - * +------+------+------+------+------+------+------+------+ - * 119:112 | REW_OP | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 111:104 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 103: 96 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 95: 88 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 87: 80 | REW_VAL | LLEN | - * +------+------+------+------+------+------+------+------+ - * 79: 72 | LLEN | WLEN | - * +------+------+------+------+------+------+------+------+ - * 71: 64 | WLEN | RSV | - * +------+------+------+------+------+------+------+------+ - * 63: 56 | RSV | - * +------+------+------+------+------+------+------+------+ - * 55: 48 | RSV | - * +------+------+------+------+------+------+------+------+ - * 47: 40 | RSV | SRC_PORT | ACL_ID | - * +------+------+------+------+------+------+------+------+ - * 39: 32 | ACL_ID | RSV | SFLOW_ID | - * +------+------+------+------+------+------+------+------+ - * 31: 24 |ACL_HIT| DP | LRN_FLAGS | CPUQ | - * +------+------+------+------+------+------+------+------+ - * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| - * +------+------+------+------+------+------+------+------+ - * 15: 8 | PCP | DEI | VID | - * +------+------+------+------+------+------+------+------+ - * 7: 0 | VID | - * +------+------+------+------+------+------+------+------+ - */ +static void ocelot_xmit_ptp(struct dsa_port *dp, void *injection, + struct sk_buff *clone) +{ + struct ocelot *ocelot = dp->ds->priv; + struct ocelot_port *ocelot_port; + u64 rew_op; -static struct sk_buff *ocelot_xmit(struct sk_buff *skb, - struct net_device *netdev) + ocelot_port = ocelot->ports[dp->index]; + rew_op = ocelot_port->ptp_cmd; + + /* Retrieve timestamp ID populated inside skb->cb[0] of the + * clone by ocelot_port_add_txtstamp_skb + */ + if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) + rew_op |= clone->cb[0] << 3; + + ocelot_ifh_set_rew_op(injection, rew_op); +} + +static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, + __be32 ifh_prefix, void **ifh) { struct dsa_port *dp = dsa_slave_to_port(netdev); struct sk_buff *clone = DSA_SKB_CB(skb)->clone; struct dsa_switch *ds = dp->ds; - struct ocelot *ocelot = ds->priv; - struct ocelot_port *ocelot_port; - u8 *prefix, *injection; - u64 qos_class, rew_op; - - ocelot_port = ocelot->ports[dp->index]; + void *injection; + __be32 *prefix; injection = skb_push(skb, OCELOT_TAG_LEN); - prefix = skb_push(skb, OCELOT_SHORT_PREFIX_LEN); - memcpy(prefix, ocelot_port->xmit_template, OCELOT_TOTAL_TAG_LEN); - - /* Fix up the fields which are not statically determined - * in the template - */ - qos_class = skb->priority; - packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0); + *prefix = ifh_prefix; + memset(injection, 0, 
OCELOT_TAG_LEN); + ocelot_ifh_set_bypass(injection, 1); + ocelot_ifh_set_src(injection, ds->num_ports); + ocelot_ifh_set_qos_class(injection, skb->priority); /* TX timestamping was requested */ - if (clone) { - rew_op = ocelot_port->ptp_cmd; - /* Retrieve timestamp ID populated inside skb->cb[0] of the - * clone by ocelot_port_add_txtstamp_skb - */ - if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) - rew_op |= clone->cb[0] << 3; + if (clone) + ocelot_xmit_ptp(dp, injection, clone); - packing(injection, &rew_op, 125, 117, OCELOT_TAG_LEN, PACK, 0); - } + *ifh = injection; +} + +static struct sk_buff *ocelot_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct dsa_port *dp = dsa_slave_to_port(netdev); + void *injection; + + ocelot_xmit_common(skb, netdev, cpu_to_be32(0x8880000a), &injection); + ocelot_ifh_set_dest(injection, BIT_ULL(dp->index)); + + return skb; +} + +static struct sk_buff *seville_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct dsa_port *dp = dsa_slave_to_port(netdev); + void *injection; + + ocelot_xmit_common(skb, netdev, cpu_to_be32(0x88800005), &injection); + seville_ifh_set_dest(injection, BIT_ULL(dp->index)); return skb; } @@ -177,14 +77,13 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) { - struct dsa_port *cpu_dp = netdev->dsa_ptr; - struct dsa_switch *ds = cpu_dp->ds; - struct ocelot *ocelot = ds->priv; u64 src_port, qos_class; u64 vlan_tci, tag_type; u8 *start = skb->data; + struct dsa_port *dp; u8 *extraction; u16 vlan_tpid; + u64 cpuq; /* Revert skb->data by the amount consumed by the DSA master, * so it points to the beginning of the frame. @@ -210,10 +109,11 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, /* Remove from inet csum the extraction header */ skb_postpull_rcsum(skb, start, OCELOT_TOTAL_TAG_LEN); - packing(extraction, &src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0); - packing(extraction, &qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0); - packing(extraction, &tag_type, 16, 16, OCELOT_TAG_LEN, UNPACK, 0); - packing(extraction, &vlan_tci, 15, 0, OCELOT_TAG_LEN, UNPACK, 0); + ocelot_xfh_get_src_port(extraction, &src_port); + ocelot_xfh_get_qos_class(extraction, &qos_class); + ocelot_xfh_get_tag_type(extraction, &tag_type); + ocelot_xfh_get_vlan_tci(extraction, &vlan_tci); + ocelot_xfh_get_cpuq(extraction, &cpuq); skb->dev = dsa_master_find_slave(netdev, 0, src_port); if (!skb->dev) @@ -228,6 +128,12 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, skb->offload_fwd_mark = 1; skb->priority = qos_class; +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + if (eth_hdr(skb)->h_proto == cpu_to_be16(ETH_P_MRP) && + cpuq & BIT(OCELOT_MRP_CPUQ)) + skb->offload_fwd_mark = 0; +#endif + /* Ocelot switches copy frames unmodified to the CPU. However, it is * possible for the user to request a VLAN modification through * VCAP_IS1_ACT_VID_REPLACE_ENA. In this case, what will happen is that @@ -243,9 +149,10 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, * equal to the pvid of the ingress port and should not be used for * processing. */ + dp = dsa_slave_to_port(skb->dev); vlan_tpid = tag_type ? 
ETH_P_8021AD : ETH_P_8021Q; - if (ocelot->ports[src_port]->vlan_aware && + if (dsa_port_is_vlan_filtering(dp) && eth_hdr(skb)->h_proto == htons(vlan_tpid)) { u16 dummy_vlan_tci; @@ -267,7 +174,26 @@ static const struct dsa_device_ops ocelot_netdev_ops = { .promisc_on_master = true, }; -MODULE_LICENSE("GPL v2"); +DSA_TAG_DRIVER(ocelot_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT); -module_dsa_tag_driver(ocelot_netdev_ops); +static const struct dsa_device_ops seville_netdev_ops = { + .name = "seville", + .proto = DSA_TAG_PROTO_SEVILLE, + .xmit = seville_xmit, + .rcv = ocelot_rcv, + .overhead = OCELOT_TOTAL_TAG_LEN, + .promisc_on_master = true, +}; + +DSA_TAG_DRIVER(seville_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SEVILLE); + +static struct dsa_tag_driver *ocelot_tag_driver_array[] = { + &DSA_TAG_DRIVER_NAME(ocelot_netdev_ops), + &DSA_TAG_DRIVER_NAME(seville_netdev_ops), +}; + +module_dsa_tag_drivers(ocelot_tag_driver_array); + +MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 8991ebf098a3..5f3e8e124a82 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -9,8 +9,36 @@ * that on egress */ #include <linux/dsa/8021q.h> +#include <soc/mscc/ocelot.h> +#include <soc/mscc/ocelot_ptp.h> #include "dsa_priv.h" +static struct sk_buff *ocelot_xmit_ptp(struct dsa_port *dp, + struct sk_buff *skb, + struct sk_buff *clone) +{ + struct ocelot *ocelot = dp->ds->priv; + struct ocelot_port *ocelot_port; + int port = dp->index; + u32 rew_op; + + if (!ocelot_can_inject(ocelot, 0)) + return NULL; + + ocelot_port = ocelot->ports[port]; + rew_op = ocelot_port->ptp_cmd; + + /* Retrieve timestamp ID populated inside skb->cb[0] of the + * clone by ocelot_port_add_txtstamp_skb + */ + if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) + rew_op |= clone->cb[0] << 3; + + ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); + + return NULL; +} + static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct net_device *netdev) { @@ -18,6 +46,11 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); + struct sk_buff *clone = DSA_SKB_CB(skb)->clone; + + /* TX timestamping was requested, so inject through MMIO */ + if (clone) + return ocelot_xmit_ptp(dp, skb, clone); return dsa_8021q_xmit(skb, netdev, ETH_P_8021Q, ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); @@ -60,6 +93,7 @@ static const struct dsa_device_ops ocelot_8021q_netdev_ops = { .xmit = ocelot_xmit, .rcv = ocelot_rcv, .overhead = VLAN_HLEN, + .promisc_on_master = true, }; MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c index 2646abe5a69e..c17d39b4a1a0 100644 --- a/net/dsa/tag_rtl4_a.c +++ b/net/dsa/tag_rtl4_a.c @@ -12,9 +12,7 @@ * * The 2 bytes tag form a 16 bit big endian word. The exact * meaning has been guessed from packet dumps from ingress - * frames, as no working egress traffic has been available - * we do not know the format of the egress tags or if they - * are even supported. + * frames. */ #include <linux/etherdevice.h> @@ -36,17 +34,34 @@ static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb, struct net_device *dev) { - /* - * Just let it pass thru, we don't know if it is possible - * to tag a frame with the 0x8899 ethertype and direct it - * to a specific port, all attempts at reverse-engineering have - * ended up with the frames getting dropped. 
- * - * The VLAN set-up needs to restrict the frames to the right port. - * - * If you have documentation on the tagging format for RTL8366RB - * (tag type A) then please contribute. - */ + struct dsa_port *dp = dsa_slave_to_port(dev); + u8 *tag; + u16 *p; + u16 out; + + /* Pad out to at least 60 bytes */ + if (unlikely(eth_skb_pad(skb))) + return NULL; + if (skb_cow_head(skb, RTL4_A_HDR_LEN) < 0) + return NULL; + + netdev_dbg(dev, "add realtek tag to packet to port %d\n", + dp->index); + skb_push(skb, RTL4_A_HDR_LEN); + + memmove(skb->data, skb->data + RTL4_A_HDR_LEN, 2 * ETH_ALEN); + tag = skb->data + 2 * ETH_ALEN; + + /* Set Ethertype */ + p = (u16 *)tag; + *p = htons(RTL4_A_ETHERTYPE); + + out = (RTL4_A_PROTOCOL_RTL8366RB << 12) | (2 << 8); + /* The lower bits are the port number */ + out |= (u8)dp->index; + p = (u16 *)(tag + 2); + *p = htons(out); + return skb; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 2ff5d8058ce6..a02ce89b56b5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -438,6 +438,7 @@ EXPORT_SYMBOL(inet_release); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; + u32 flags = BIND_WITH_LOCK; int err; /* If the socket has its own bind function then use it. (RAW) */ @@ -450,11 +451,12 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, + BPF_CGROUP_INET4_BIND, &flags); if (err) return err; - return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); + return __inet_bind(sk, uaddr, addr_len, flags); } EXPORT_SYMBOL(inet_bind); @@ -499,7 +501,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, snum = ntohs(addr->sin_port); err = -EACCES; - if (snum && inet_port_requires_bind_service(net, snum) && + if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && + snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) goto out; @@ -777,18 +780,19 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, return -ENOTCONN; sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + BPF_CGROUP_INET4_GETPEERNAME, + NULL); } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; - } - if (cgroup_bpf_enabled) BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - peer ? BPF_CGROUP_INET4_GETPEERNAME : - BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET4_GETSOCKNAME, NULL); + } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7a6b58ae408d..a3422e42784e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4162,6 +4162,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EINVAL; lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc, &tss); + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, + &zc, &len, err); release_sock(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) goto zerocopy_rcv_cmsg; @@ -4208,6 +4210,18 @@ zerocopy_rcv_out: return 0; } +bool tcp_bpf_bypass_getsockopt(int level, int optname) +{ + /* TCP do_tcp_getsockopt has an optimized getsockopt implementation + * to avoid an extra socket lock for TCP_ZEROCOPY_RECEIVE.
+ */ + if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) + return true; + + return false; +} +EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); + int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e32a7056cb76..69a545db80d2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4924,7 +4924,7 @@ err: void tcp_data_ready(struct sock *sk) { - if (tcp_epollin_ready(sk, sk->sk_rcvlowat)) + if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE)) sk->sk_data_ready(sk); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 611039207d30..daad4f99db32 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2796,6 +2796,7 @@ struct proto tcp_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 48208fb4e895..4a0478b17243 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1130,7 +1130,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - if (cgroup_bpf_enabled && !connected) { + if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) @@ -1864,9 +1864,8 @@ try_again: memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); - if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, - (struct sockaddr *)sin); + BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, + (struct sockaddr *)sin); } if (udp_sk(sk)->gro_enabled) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 0e9994e0ecd7..1fb75f01756c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -295,7 +295,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, return -EINVAL; snum = ntohs(addr->sin6_port); - if (snum && inet_port_requires_bind_service(net, snum) && + if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && + snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; @@ -439,6 +440,7 @@ out_unlock: int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; + u32 flags = BIND_WITH_LOCK; int err = 0; /* If the socket has its own bind function then use it. */ @@ -451,11 +453,12 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. 
*/ - err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, + BPF_CGROUP_INET6_BIND, &flags); if (err) return err; - return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); + return __inet6_bind(sk, uaddr, addr_len, flags); } EXPORT_SYMBOL(inet6_bind); @@ -527,18 +530,19 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = sk->sk_v6_daddr; if (np->sndflow) sin->sin6_flowinfo = np->flow_label; + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + BPF_CGROUP_INET6_GETPEERNAME, + NULL); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; - } - if (cgroup_bpf_enabled) BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - peer ? BPF_CGROUP_INET6_GETPEERNAME : - BPF_CGROUP_INET6_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, NULL); + } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d093ef3ef060..bd44ded7e50c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2124,6 +2124,7 @@ struct proto tcpv6_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d75429205084..d25e5a9252fd 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -409,9 +409,8 @@ try_again: } *addr_len = sizeof(*sin6); - if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, - (struct sockaddr *)sin6); + BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, + (struct sockaddr *)sin6); } if (udp_sk(sk)->gro_enabled) @@ -1462,7 +1461,7 @@ do_udp_sendmsg: fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; - if (cgroup_bpf_enabled && !connected) { + if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &fl6.saddr); if (err) diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 00ed742f48a4..f16d9b5ee978 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -128,11 +128,13 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); + info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); info->mptcpi_subflows_max = mptcp_pm_get_subflows_max(msk); val = mptcp_pm_get_add_addr_signal_max(msk); info->mptcpi_add_addr_signal_max = val; val = mptcp_pm_get_add_addr_accept_max(msk); info->mptcpi_add_addr_accepted_max = val; + info->mptcpi_local_addr_max = mptcp_pm_get_local_addr_max(msk); if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) flags |= MPTCP_INFO_FLAG_FALLBACK; if (READ_ONCE(msk->can_ack)) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index bb874c5d663a..b63574d6b812 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -508,8 +508,8 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); + u64 snd_data_fin_enable, ack_seq; unsigned int dss_size = 0; - u64 snd_data_fin_enable; struct mptcp_ext *mpext; unsigned 
int ack_size; bool ret = false; @@ -541,13 +541,14 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, return ret; } + ack_seq = READ_ONCE(msk->ack_seq); if (READ_ONCE(msk->use_64bit_ack)) { ack_size = TCPOLEN_MPTCP_DSS_ACK64; - opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq); + opts->ext_copy.data_ack = ack_seq; opts->ext_copy.ack64 = 1; } else { ack_size = TCPOLEN_MPTCP_DSS_ACK32; - opts->ext_copy.data_ack32 = (uint32_t)READ_ONCE(msk->ack_seq); + opts->ext_copy.data_ack32 = (uint32_t)ack_seq; opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; @@ -918,8 +919,7 @@ static void ack_update_msk(struct mptcp_sock *msk, msk->wnd_end = new_wnd_end; /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */ - if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt)) && - sk_stream_memory_free(ssk)) + if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt))) __mptcp_check_push(sk, ssk); if (after64(new_snd_una, old_snd_una)) { diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 229fd1af2e29..8e8e35fa4002 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -228,13 +228,14 @@ unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); -static unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk) +unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk) { struct pm_nl_pernet *pernet; pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); return READ_ONCE(pernet->local_addr_max); } +EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); static void check_work_pending(struct mptcp_sock *msk) { diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c2a8392254dc..a57f3eab7b6a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -363,8 +363,6 @@ static void mptcp_check_data_fin_ack(struct sock *sk) /* Look for an acknowledged DATA_FIN */ if (mptcp_pending_data_fin_ack(sk)) { - mptcp_stop_timer(sk); - WRITE_ONCE(msk->snd_data_fin_enable, 0); switch (sk->sk_state) { @@ -458,7 +456,18 @@ static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk) static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) { struct sock *ack_hint = READ_ONCE(msk->ack_hint); + int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + bool cleanup; + + /* this is a simple superset of what tcp_cleanup_rbuf() implements + * so that we don't have to acquire the ssk socket lock most of the time + * to do actually nothing + */ + cleanup = __mptcp_space(sk) - old_space >= max(0, old_space); + if (!cleanup) + return; /* if the hinted ssk is still active, try to use it */ if (likely(ack_hint)) { @@ -1565,6 +1574,9 @@ out: mptcp_set_timeout(sk, ssk); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + if (msk->snd_data_fin_enable && msk->snd_nxt + 1 == msk->write_seq) mptcp_schedule_work(sk); @@ -1868,7 +1880,7 @@ static void __mptcp_splice_receive_queue(struct sock *sk) skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); } -static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv) +static bool __mptcp_move_skbs(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; @@ -1888,13 +1900,10 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv) slowpath = lock_sock_fast(ssk); mptcp_data_lock(sk); + __mptcp_update_rmem(sk); done = 
__mptcp_move_skbs_from_subflow(msk, ssk, &moved); mptcp_data_unlock(sk); - if (moved && rcv) { - WRITE_ONCE(msk->rmem_pending, min(rcv, moved)); - tcp_cleanup_rbuf(ssk, 1); - WRITE_ONCE(msk->rmem_pending, 0); - } + tcp_cleanup_rbuf(ssk, moved); unlock_sock_fast(ssk, slowpath); } while (!done); @@ -1907,6 +1916,7 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv) ret |= __mptcp_ofo_queue(msk); __mptcp_splice_receive_queue(sk); mptcp_data_unlock(sk); + mptcp_cleanup_rbuf(msk); } if (ret) mptcp_check_data_fin((struct sock *)msk); @@ -1936,7 +1946,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); while (copied < len) { - int bytes_read, old_space; + int bytes_read; bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied); if (unlikely(bytes_read < 0)) { @@ -1947,14 +1957,11 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; - if (skb_queue_empty(&msk->receive_queue) && - __mptcp_move_skbs(msk, len - copied)) - continue; - /* be sure to advertise window change */ - old_space = READ_ONCE(msk->old_wspace); - if ((tcp_space(sk) - old_space) >= old_space) - mptcp_cleanup_rbuf(msk); + mptcp_cleanup_rbuf(msk); + + if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) + continue; /* only the master socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg() @@ -1982,7 +1989,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, /* race breaker: the shutdown could be after the * previous receive queue check */ - if (__mptcp_move_skbs(msk, len - copied)) + if (__mptcp_move_skbs(msk)) continue; break; } @@ -2015,7 +2022,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, /* .. race-breaker: ssk might have gotten new data * after last __mptcp_move_skbs() returned false. 
*/ - if (unlikely(__mptcp_move_skbs(msk, 0))) + if (unlikely(__mptcp_move_skbs(msk))) set_bit(MPTCP_DATA_READY, &msk->flags); } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) { /* data to read but mptcp_wait_data() cleared DATA_READY */ @@ -2275,6 +2282,7 @@ static void mptcp_worker(struct work_struct *work) if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) goto unlock; + __mptcp_clean_una(sk); dfrag = mptcp_rtx_head(sk); if (!dfrag) goto unlock; @@ -2943,6 +2951,8 @@ static void mptcp_release_cb(struct sock *sk) mptcp_push_pending(sk, 0); spin_lock_bh(&sk->sk_lock.slock); } + if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) + __mptcp_error_report(sk); /* clear any wmem reservation and errors */ __mptcp_update_wmem(sk); @@ -3319,7 +3329,7 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) struct sock *sk = (struct sock *)msk; if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) - return 0; + return EPOLLOUT | EPOLLWRNORM; if (sk_stream_is_writeable(sk)) return EPOLLOUT | EPOLLWRNORM; @@ -3352,9 +3362,16 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, mask |= mptcp_check_readable(msk); mask |= mptcp_check_writeable(msk); } + if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) + mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + /* This barrier is coupled with smp_wmb() in tcp_reset() */ + smp_rmb(); + if (sk->sk_err) + mask |= EPOLLERR; + return mask; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d31edbae8da8..91827d949766 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -102,6 +102,7 @@ #define MPTCP_WORK_CLOSE_SUBFLOW 5 #define MPTCP_PUSH_PENDING 6 #define MPTCP_CLEAN_UNA 7 +#define MPTCP_ERROR_REPORT 8 static inline bool before64(__u64 seq1, __u64 seq2) { @@ -237,7 +238,6 @@ struct mptcp_sock { u64 wnd_end; unsigned long timer_ival; u32 token; - int rmem_pending; int rmem_released; unsigned long flags; bool can_ack; @@ -301,7 +301,7 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) static inline int __mptcp_space(const struct sock *sk) { - return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_pending); + return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_released); } static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) @@ -334,20 +334,13 @@ static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk) return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); } -static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk) +static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (!before64(msk->snd_nxt, READ_ONCE(msk->snd_una))) + if (msk->snd_una == READ_ONCE(msk->snd_nxt)) return NULL; - return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); -} - -static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); } @@ -436,6 +429,7 @@ struct mptcp_subflow_context { void (*tcp_data_ready)(struct sock *sk); void (*tcp_state_change)(struct sock *sk); void (*tcp_write_space)(struct sock *sk); + void (*tcp_error_report)(struct sock *sk); struct rcu_head rcu; }; @@ -560,6 +554,7 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk, sk->sk_data_ready = ctx->tcp_data_ready; sk->sk_state_change = ctx->tcp_state_change; 
sk->sk_write_space = ctx->tcp_write_space; + sk->sk_error_report = ctx->tcp_error_report; inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } @@ -587,6 +582,7 @@ bool mptcp_finish_join(struct sock *sk); bool mptcp_schedule_work(struct sock *sk); void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); +void __mptcp_error_report(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); void __mptcp_flush_join_list(struct mptcp_sock *msk); @@ -725,6 +721,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); +unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk); static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) { diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ce2dea2a6e0a..06e233410e0e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -100,7 +100,7 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) return msk; } -static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener) +static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); @@ -108,16 +108,6 @@ static int __subflow_init_req(struct request_sock *req, const struct sock *sk_li subflow_req->mp_join = 0; subflow_req->msk = NULL; mptcp_token_init_request(req); - -#ifdef CONFIG_TCP_MD5SIG - /* no MPTCP if MD5SIG is enabled on this socket or we may run out of - * TCP option space. - */ - if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) - return -EINVAL; -#endif - - return 0; } static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk) @@ -130,20 +120,23 @@ static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct soc * Returns an error code if a JOIN has failed and a TCP reset * should be sent. */ -static int subflow_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static int subflow_check_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; - int ret; pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); - ret = __subflow_init_req(req, sk_listener); - if (ret) - return 0; +#ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. 
+ */ + if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) + return -EINVAL; +#endif mptcp_get_options(skb, &mp_opt); @@ -236,10 +229,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, struct mptcp_options_received mp_opt; int err; - err = __subflow_init_req(req, sk_listener); - if (err) - return err; - + subflow_init_req(req, sk_listener); mptcp_get_options(skb, &mp_opt); if (mp_opt.mp_capable && mp_opt.mp_join) @@ -279,12 +269,13 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk, int err; tcp_rsk(req)->is_mptcp = 1; + subflow_init_req(req, sk); dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); if (!dst) return NULL; - err = subflow_init_req(req, sk, skb); + err = subflow_check_req(req, sk, skb); if (err == 0) return dst; @@ -304,12 +295,13 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk, int err; tcp_rsk(req)->is_mptcp = 1; + subflow_init_req(req, sk); dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); if (!dst) return NULL; - err = subflow_init_req(req, sk, skb); + err = subflow_check_req(req, sk, skb); if (err == 0) return dst; @@ -1124,6 +1116,46 @@ static void subflow_write_space(struct sock *ssk) mptcp_write_space(sk); } +void __mptcp_error_report(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + int err = sock_error(ssk); + + if (!err) + continue; + + /* only propagate errors on fallen-back sockets or + * on MPC connect + */ + if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk)) + continue; + + inet_sk_state_store(sk, inet_sk_state_load(ssk)); + sk->sk_err = -err; + + /* This barrier is coupled with smp_rmb() in mptcp_poll() */ + smp_wmb(); + sk->sk_error_report(sk); + break; + } +} + +static void subflow_error_report(struct sock *ssk) +{ + struct sock *sk = mptcp_subflow_ctx(ssk)->conn; + + mptcp_data_lock(sk); + if (!sock_owned_by_user(sk)) + __mptcp_error_report(sk); + else + set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags); + mptcp_data_unlock(sk); +} + static struct inet_connection_sock_af_ops * subflow_default_af_ops(struct sock *sk) { @@ -1470,9 +1502,11 @@ static int subflow_ulp_init(struct sock *sk) ctx->tcp_data_ready = sk->sk_data_ready; ctx->tcp_state_change = sk->sk_state_change; ctx->tcp_write_space = sk->sk_write_space; + ctx->tcp_error_report = sk->sk_error_report; sk->sk_data_ready = subflow_data_ready; sk->sk_write_space = subflow_write_space; sk->sk_state_change = subflow_state_change; + sk->sk_error_report = subflow_error_report; out: return err; } @@ -1526,6 +1560,7 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; new_ctx->tcp_state_change = old_ctx->tcp_state_change; new_ctx->tcp_write_space = old_ctx->tcp_write_space; + new_ctx->tcp_error_report = old_ctx->tcp_error_report; new_ctx->rel_write_seq = 1; new_ctx->tcp_sock = newsk; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 4dd235ce9a07..b919826939e0 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -908,7 +908,7 @@ static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), }; -static void tcf_idr_insert_many(struct tc_action *actions[]) +void tcf_idr_insert_many(struct tc_action *actions[]) { int i; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index a67c66a512a4..e37556cc37ab 100644 --- 
a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3060,6 +3060,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, act->type = exts->type = TCA_OLD_COMPAT; exts->actions[0] = act; exts->nr_actions = 1; + tcf_idr_insert_many(exts->actions); } else if (exts->action && tb[exts->action]) { int err; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index caf7643e9c83..2409e522c68f 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -30,6 +30,11 @@ #include <uapi/linux/netfilter/nf_conntrack_common.h> +#define TCA_FLOWER_KEY_CT_FLAGS_MAX \ + ((__TCA_FLOWER_KEY_CT_FLAGS_MAX - 1) << 1) +#define TCA_FLOWER_KEY_CT_FLAGS_MASK \ + (TCA_FLOWER_KEY_CT_FLAGS_MAX - 1) + struct fl_flow_key { struct flow_dissector_key_meta meta; struct flow_dissector_key_control control; @@ -690,8 +695,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED }, - [TCA_FLOWER_KEY_CT_STATE] = { .type = NLA_U16 }, - [TCA_FLOWER_KEY_CT_STATE_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CT_STATE] = + NLA_POLICY_MASK(NLA_U16, TCA_FLOWER_KEY_CT_FLAGS_MASK), + [TCA_FLOWER_KEY_CT_STATE_MASK] = + NLA_POLICY_MASK(NLA_U16, TCA_FLOWER_KEY_CT_FLAGS_MASK), [TCA_FLOWER_KEY_CT_ZONE] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_CT_ZONE_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_CT_MARK] = { .type = NLA_U32 }, @@ -1394,12 +1401,33 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return 0; } +static int fl_validate_ct_state(u16 state, struct nlattr *tb, + struct netlink_ext_ack *extack) +{ + if (state && !(state & TCA_FLOWER_KEY_CT_FLAGS_TRACKED)) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "no trk, so no other flag can be set"); + return -EINVAL; + } + + if (state & TCA_FLOWER_KEY_CT_FLAGS_NEW && + state & TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "new and est are mutually exclusive"); + return -EINVAL; + } + + return 0; +} + static int fl_set_key_ct(struct nlattr **tb, struct flow_dissector_key_ct *key, struct flow_dissector_key_ct *mask, struct netlink_ext_ack *extack) { if (tb[TCA_FLOWER_KEY_CT_STATE]) { + int err; + if (!IS_ENABLED(CONFIG_NF_CONNTRACK)) { NL_SET_ERR_MSG(extack, "Conntrack isn't enabled"); return -EOPNOTSUPP; @@ -1407,6 +1435,13 @@ static int fl_set_key_ct(struct nlattr **tb, fl_set_key_val(tb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE, &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK, sizeof(key->ct_state)); + + err = fl_validate_ct_state(mask->ct_state, + tb[TCA_FLOWER_KEY_CT_STATE_MASK], + extack); + if (err) + return err; + } if (tb[TCA_FLOWER_KEY_CT_ZONE]) { if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) { diff --git a/net/socket.c b/net/socket.c index 33e8b6c4e1d3..7f0617ab5437 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2126,6 +2126,9 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, return __sys_setsockopt(fd, level, optname, optval, optlen); } +INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level, + int optname)); + /* * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. 
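The INDIRECT_CALLABLE_DECLARE() of tcp_bpf_bypass_getsockopt() above exists so the cgroup getsockopt path can skip re-running BPF programs for TCP_ZEROCOPY_RECEIVE, whose fast path already runs the hook under the socket lock inside do_tcp_getsockopt(). The actual in-kernel consumer of ->bpf_bypass_getsockopt is not part of the hunks quoted here, so the following is only a minimal userspace sketch of the dispatch pattern, assuming just what the hunks above show (the hook's signature and its TCP implementation); the cgroup_bpf_should_run() helper and the mock constants are illustrative, not kernel API::

    /* Self-contained mock of the bypass dispatch wired into
     * tcp_prot/tcpv6_prot above; compiles standalone, is not the
     * in-tree implementation.
     */
    #include <stdbool.h>
    #include <stdio.h>

    #define SOL_TCP                 6   /* mirrors <netinet/tcp.h> */
    #define TCP_ZEROCOPY_RECEIVE    35  /* mirrors <linux/tcp.h> */

    struct proto {
            /* returns true when the generic cgroup BPF getsockopt
             * hook must be skipped for this (level, optname) pair
             */
            bool (*bpf_bypass_getsockopt)(int level, int optname);
    };

    static bool tcp_bpf_bypass_getsockopt(int level, int optname)
    {
            /* TCP_ZEROCOPY_RECEIVE already runs the BPF hook under
             * the socket lock in do_tcp_getsockopt(), so the outer
             * path can skip it instead of taking the lock twice.
             */
            return level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE;
    }

    static bool cgroup_bpf_should_run(const struct proto *prot,
                                      int level, int optname)
    {
            if (prot->bpf_bypass_getsockopt &&
                prot->bpf_bypass_getsockopt(level, optname))
                    return false;   /* fast path: skip generic hook */
            return true;
    }

    int main(void)
    {
            const struct proto tcp_prot = {
                    .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
            };

            printf("TCP_ZEROCOPY_RECEIVE runs generic hook: %d\n",
                   cgroup_bpf_should_run(&tcp_prot, SOL_TCP,
                                         TCP_ZEROCOPY_RECEIVE));
            printf("other optnames run generic hook: %d\n",
                   cgroup_bpf_should_run(&tcp_prot, SOL_TCP,
                                         1 /* TCP_NODELAY */));
            return 0;
    }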
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 0b84f076591e..89a36db47ab4 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -100,7 +100,8 @@ static int switchdev_deferred_enqueue(struct net_device *dev, static int switchdev_port_attr_notify(enum switchdev_notifier_type nt, struct net_device *dev, - const struct switchdev_attr *attr) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack) { int err; int rc; @@ -111,7 +112,7 @@ static int switchdev_port_attr_notify(enum switchdev_notifier_type nt, }; rc = call_switchdev_blocking_notifiers(nt, dev, - &attr_info.info, NULL); + &attr_info.info, extack); err = notifier_to_errno(rc); if (err) { WARN_ON(!attr_info.handled); @@ -125,9 +126,11 @@ static int switchdev_port_attr_notify(enum switchdev_notifier_type nt, } static int switchdev_port_attr_set_now(struct net_device *dev, - const struct switchdev_attr *attr) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack) { - return switchdev_port_attr_notify(SWITCHDEV_PORT_ATTR_SET, dev, attr); + return switchdev_port_attr_notify(SWITCHDEV_PORT_ATTR_SET, dev, attr, + extack); } static void switchdev_port_attr_set_deferred(struct net_device *dev, @@ -136,7 +139,7 @@ static void switchdev_port_attr_set_deferred(struct net_device *dev, const struct switchdev_attr *attr = data; int err; - err = switchdev_port_attr_set_now(dev, attr); + err = switchdev_port_attr_set_now(dev, attr, NULL); if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to set attribute (id=%d)\n", err, attr->id); @@ -156,17 +159,19 @@ static int switchdev_port_attr_set_defer(struct net_device *dev, * * @dev: port device * @attr: attribute to set + * @extack: netlink extended ack, for error message propagation * * rtnl_lock must be held and must not be in atomic section, * in case SWITCHDEV_F_DEFER flag is not set. 
*/ int switchdev_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack) { if (attr->flags & SWITCHDEV_F_DEFER) return switchdev_port_attr_set_defer(dev, attr); ASSERT_RTNL(); - return switchdev_port_attr_set_now(dev, attr); + return switchdev_port_attr_set_now(dev, attr, extack); } EXPORT_SYMBOL_GPL(switchdev_port_attr_set); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4a83117507f5..4faabd1ecfd1 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -184,12 +184,13 @@ static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) memcpy(to_buf, from_buf, len + metalen); } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, - bool explicit_free) +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { struct xdp_buff *xsk_xdp; int err; + u32 len; + len = xdp->data_end - xdp->data; if (len > xsk_pool_get_rx_frame_size(xs->pool)) { xs->rx_dropped++; return -ENOSPC; @@ -207,8 +208,6 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, xsk_buff_free(xsk_xdp); return err; } - if (explicit_free) - xdp_return_buff(xdp); return 0; } @@ -230,11 +229,8 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, - bool explicit_free) +static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp) { - u32 len; - if (!xsk_is_bound(xs)) return -EINVAL; @@ -242,11 +238,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, return -EINVAL; sk_mark_napi_id_once_xdp(&xs->sk, xdp); - len = xdp->data_end - xdp->data; - - return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? - __xsk_rcv_zc(xs, xdp, len) : - __xsk_rcv(xs, xdp, len, explicit_free); + return 0; } static void xsk_flush(struct xdp_sock *xs) @@ -261,18 +253,41 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) int err; spin_lock_bh(&xs->rx_lock); - err = xsk_rcv(xs, xdp, false); - xsk_flush(xs); + err = xsk_rcv_check(xs, xdp); + if (!err) { + err = __xsk_rcv(xs, xdp); + xsk_flush(xs); + } spin_unlock_bh(&xs->rx_lock); return err; } +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + int err; + u32 len; + + err = xsk_rcv_check(xs, xdp); + if (err) + return err; + + if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { + len = xdp->data_end - xdp->data; + return __xsk_rcv_zc(xs, xdp, len); + } + + err = __xsk_rcv(xs, xdp); + if (!err) + xdp_return_buff(xdp); + return err; +} + int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); int err; - err = xsk_rcv(xs, xdp, true); + err = xsk_rcv(xs, xdp); if (err) return err; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 20598eea658c..8de01aaac4a0 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -119,8 +119,8 @@ static void xp_disable_drv_zc(struct xsk_buff_pool *pool) } } -static int __xp_assign_dev(struct xsk_buff_pool *pool, - struct net_device *netdev, u16 queue_id, u16 flags) +int xp_assign_dev(struct xsk_buff_pool *pool, + struct net_device *netdev, u16 queue_id, u16 flags) { bool force_zc, force_copy; struct netdev_bpf bpf; @@ -191,12 +191,6 @@ err_unreg_pool: return err; } -int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, - u16 queue_id, u16 flags) -{ - return __xp_assign_dev(pool, dev, queue_id, flags); -} - int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, struct 
net_device *dev, u16 queue_id) { @@ -210,7 +204,7 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, if (pool->uses_need_wakeup) flags |= XDP_USE_NEED_WAKEUP; - return __xp_assign_dev(pool, dev, queue_id, flags); + return xp_assign_dev(pool, dev, queue_id, flags); } void xp_clear_dev(struct xsk_buff_pool *pool) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 26fc96ca619e..45ceca4e2c70 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -183,6 +183,14 @@ BPF_EXTRA_CFLAGS := $(ARM_ARCH_SELECTOR) TPROGS_CFLAGS += $(ARM_ARCH_SELECTOR) endif +ifeq ($(ARCH), mips) +TPROGS_CFLAGS += -D__SANE_USERSPACE_TYPES__ +ifdef CONFIG_MACH_LOONGSON64 +BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-loongson64 +BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-generic +endif +endif + TPROGS_CFLAGS += -Wall -O2 TPROGS_CFLAGS += -Wmissing-prototypes TPROGS_CFLAGS += -Wstrict-prototypes @@ -208,7 +216,7 @@ TPROGLDLIBS_xdpsock += -pthread -lcap TPROGLDLIBS_xsk_fwd += -pthread # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: -# make M=samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang +# make M=samples/bpf LLC=~/git/llvm-project/llvm/build/bin/llc CLANG=~/git/llvm-project/llvm/build/bin/clang LLC ?= llc CLANG ?= clang OPT ?= opt diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst index dd34b2d26f1c..60c6494adb1b 100644 --- a/samples/bpf/README.rst +++ b/samples/bpf/README.rst @@ -62,20 +62,26 @@ To generate a smaller llc binary one can use:: -DLLVM_TARGETS_TO_BUILD="BPF" +We recommend that developers who want the fastest incremental builds +use the Ninja build system; you can find it in your system's package +manager, usually as the package ninja or ninja-build. + Quick snippet for manually compiling LLVM and clang -(build dependencies are cmake and gcc-c++):: +(build dependencies are ninja, cmake and gcc-c++):: - $ git clone http://llvm.org/git/llvm.git - $ cd llvm/tools - $ git clone --depth 1 http://llvm.org/git/clang.git - $ cd ..; mkdir build; cd build - $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" - $ make -j $(getconf _NPROCESSORS_ONLN) + $ git clone https://github.com/llvm/llvm-project.git + $ mkdir -p llvm-project/llvm/build + $ cd llvm-project/llvm/build + $ cmake ..
-G "Ninja" -DLLVM_TARGETS_TO_BUILD="BPF;X86" \ + -DLLVM_ENABLE_PROJECTS="clang" \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_BUILD_RUNTIME=OFF + $ ninja It is also possible to point make to the newly compiled 'llc' or 'clang' command via redefining LLC or CLANG on the make command line:: - make M=samples/bpf LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang + make M=samples/bpf LLC=~/git/llvm-project/llvm/build/bin/llc CLANG=~/git/llvm-project/llvm/build/bin/clang Cross compiling samples ----------------------- diff --git a/samples/bpf/bpf_insn.h b/samples/bpf/bpf_insn.h index db67a2847395..aee04534483a 100644 --- a/samples/bpf/bpf_insn.h +++ b/samples/bpf/bpf_insn.h @@ -134,15 +134,31 @@ struct bpf_insn; .off = OFF, \ .imm = 0 }) -/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */ - -#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ +/* + * Atomic operations: + * + * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_AND *(uint *) (dst_reg + off16) &= src_reg + * BPF_OR *(uint *) (dst_reg + off16) |= src_reg + * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg + * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); + * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); + * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); + * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) + * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) + */ + +#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ - .imm = BPF_ADD }) + .imm = OP }) + +/* Legacy alias */ +#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c index c5ff7a13918c..cc3bce8d3aac 100644 --- a/samples/bpf/cookie_uid_helper_example.c +++ b/samples/bpf/cookie_uid_helper_example.c @@ -313,7 +313,7 @@ int main(int argc, char *argv[]) print_table(); printf("\n"); sleep(1); - }; + } } else if (cfg_test_cookie) { udp_client(); } diff --git a/samples/bpf/xdp_redirect_map_kern.c b/samples/bpf/xdp_redirect_map_kern.c index 6489352ab7a4..a92b8e567bdd 100644 --- a/samples/bpf/xdp_redirect_map_kern.c +++ b/samples/bpf/xdp_redirect_map_kern.c @@ -19,12 +19,22 @@ #include <linux/ipv6.h> #include <bpf/bpf_helpers.h> +/* The 2nd xdp prog on egress does not support skb mode, so we define two + * maps, tx_port_general and tx_port_native. + */ struct { __uint(type, BPF_MAP_TYPE_DEVMAP); __uint(key_size, sizeof(int)); __uint(value_size, sizeof(int)); __uint(max_entries, 100); -} tx_port SEC(".maps"); +} tx_port_general SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(struct bpf_devmap_val)); + __uint(max_entries, 100); +} tx_port_native SEC(".maps"); /* Count RX packets, as XDP bpf_prog doesn't get direct TX-success * feedback. Redirect TX errors can be caught via a tracepoint. 
@@ -36,6 +46,14 @@ struct { __uint(max_entries, 1); } rxcnt SEC(".maps"); +/* map to store egress interface mac address */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, __be64); + __uint(max_entries, 1); +} tx_mac SEC(".maps"); + static void swap_src_dst_mac(void *data) { unsigned short *p = data; @@ -52,17 +70,16 @@ static void swap_src_dst_mac(void *data) p[5] = dst[2]; } -SEC("xdp_redirect_map") -int xdp_redirect_map_prog(struct xdp_md *ctx) +static __always_inline int xdp_redirect_map(struct xdp_md *ctx, void *redirect_map) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct ethhdr *eth = data; int rc = XDP_DROP; - int vport, port = 0, m = 0; long *value; u32 key = 0; u64 nh_off; + int vport; nh_off = sizeof(*eth); if (data + nh_off > data_end) @@ -79,7 +96,40 @@ int xdp_redirect_map_prog(struct xdp_md *ctx) swap_src_dst_mac(data); /* send packet out physical port */ - return bpf_redirect_map(&tx_port, vport, 0); + return bpf_redirect_map(redirect_map, vport, 0); +} + +SEC("xdp_redirect_general") +int xdp_redirect_map_general(struct xdp_md *ctx) +{ + return xdp_redirect_map(ctx, &tx_port_general); +} + +SEC("xdp_redirect_native") +int xdp_redirect_map_native(struct xdp_md *ctx) +{ + return xdp_redirect_map(ctx, &tx_port_native); +} + +SEC("xdp_devmap/map_prog") +int xdp_redirect_map_egress(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + __be64 *mac; + u32 key = 0; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; + + mac = bpf_map_lookup_elem(&tx_mac, &key); + if (mac) + __builtin_memcpy(eth->h_source, mac, ETH_ALEN); + + return XDP_PASS; } /* Redirect require an XDP bpf_prog loaded on the TX device */ diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index 31131b6e7782..0e8192688dfc 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -14,6 +14,10 @@ #include <unistd.h> #include <libgen.h> #include <sys/resource.h> +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> #include "bpf_util.h" #include <bpf/bpf.h> @@ -22,6 +26,7 @@ static int ifindex_in; static int ifindex_out; static bool ifindex_out_xdp_dummy_attached = true; +static bool xdp_devmap_attached; static __u32 prog_id; static __u32 dummy_prog_id; @@ -83,6 +88,32 @@ static void poll_stats(int interval, int ifindex) } } +static int get_mac_addr(unsigned int ifindex_out, void *mac_addr) +{ + char ifname[IF_NAMESIZE]; + struct ifreq ifr; + int fd, ret = -1; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) + return ret; + + if (!if_indextoname(ifindex_out, ifname)) + goto err_out; + + strcpy(ifr.ifr_name, ifname); + + if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0) + goto err_out; + + memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char)); + ret = 0; + +err_out: + close(fd); + return ret; +} + static void usage(const char *prog) { fprintf(stderr, @@ -90,24 +121,26 @@ static void usage(const char *prog) "OPTS:\n" " -S use skb-mode\n" " -N enforce native mode\n" - " -F force loading prog\n", + " -F force loading prog\n" + " -X load xdp program on egress\n", prog); } int main(int argc, char **argv) { struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, + .prog_type = BPF_PROG_TYPE_UNSPEC, }; - struct bpf_program *prog, *dummy_prog; + struct bpf_program *prog, *dummy_prog, 
*devmap_prog; + int prog_fd, dummy_prog_fd, devmap_prog_fd = 0; + int tx_port_map_fd, tx_mac_map_fd; + struct bpf_devmap_val devmap_val; struct bpf_prog_info info = {}; __u32 info_len = sizeof(info); - int prog_fd, dummy_prog_fd; - const char *optstr = "FSN"; + const char *optstr = "FSNX"; struct bpf_object *obj; int ret, opt, key = 0; char filename[256]; - int tx_port_map_fd; while ((opt = getopt(argc, argv, optstr)) != -1) { switch (opt) { @@ -120,14 +153,21 @@ int main(int argc, char **argv) case 'F': xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; break; + case 'X': + xdp_devmap_attached = true; + break; default: usage(basename(argv[0])); return 1; } } - if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) { xdp_flags |= XDP_FLAGS_DRV_MODE; + } else if (xdp_devmap_attached) { + printf("Loading an xdp program on egress in SKB mode is not supported yet\n"); + return 1; + } if (optind == argc) { printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]); @@ -150,24 +190,28 @@ int main(int argc, char **argv) if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) return 1; - prog = bpf_program__next(NULL, obj); - dummy_prog = bpf_program__next(prog, obj); - if (!prog || !dummy_prog) { - printf("finding a prog in obj file failed\n"); - return 1; + if (xdp_flags & XDP_FLAGS_SKB_MODE) { + prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_general"); + tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port_general"); + } else { + prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_native"); + tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port_native"); + } + dummy_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_dummy_prog"); + if (!prog || !dummy_prog || tx_port_map_fd < 0) { + printf("finding prog/dummy_prog/tx_port_map in obj file failed\n"); + goto out; } - /* bpf_prog_load_xattr gives us the pointer to first prog's fd, - * so we're missing only the fd for dummy prog - */ + prog_fd = bpf_program__fd(prog); dummy_prog_fd = bpf_program__fd(dummy_prog); - if (prog_fd < 0 || dummy_prog_fd < 0) { + if (prog_fd < 0 || dummy_prog_fd < 0 || tx_port_map_fd < 0) { printf("bpf_prog_load_xattr: %s\n", strerror(errno)); return 1; } - tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port"); + tx_mac_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_mac"); rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); - if (tx_port_map_fd < 0 || rxcnt_map_fd < 0) { + if (tx_mac_map_fd < 0 || rxcnt_map_fd < 0) { printf("bpf_object__find_map_fd_by_name failed\n"); return 1; } @@ -199,11 +243,39 @@ int main(int argc, char **argv) } dummy_prog_id = info.id; + /* Load 2nd xdp prog on egress.
*/ + if (xdp_devmap_attached) { + unsigned char mac_addr[6]; + + devmap_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_egress"); + if (!devmap_prog) { + printf("finding devmap_prog in obj file failed\n"); + goto out; + } + devmap_prog_fd = bpf_program__fd(devmap_prog); + if (devmap_prog_fd < 0) { + printf("finding devmap_prog fd failed\n"); + goto out; + } + + if (get_mac_addr(ifindex_out, mac_addr) < 0) { + printf("get interface %d mac failed\n", ifindex_out); + goto out; + } + + ret = bpf_map_update_elem(tx_mac_map_fd, &key, mac_addr, 0); + if (ret) { + perror("bpf_update_elem tx_mac_map_fd"); + goto out; + } + } + signal(SIGINT, int_exit); signal(SIGTERM, int_exit); - /* populate virtual to physical port map */ - ret = bpf_map_update_elem(tx_port_map_fd, &key, &ifindex_out, 0); + devmap_val.ifindex = ifindex_out; + devmap_val.bpf_prog.fd = devmap_prog_fd; + ret = bpf_map_update_elem(tx_port_map_fd, &key, &devmap_val, 0); if (ret) { perror("bpf_update_elem"); goto out; diff --git a/tools/bpf/bpf_dbg.c b/tools/bpf/bpf_dbg.c index a0ebcdf59c31..a07dfc479270 100644 --- a/tools/bpf/bpf_dbg.c +++ b/tools/bpf/bpf_dbg.c @@ -890,7 +890,7 @@ static int bpf_run_stepping(struct sock_filter *f, uint16_t bpf_len, bool stop = false; int i = 1; - while (bpf_curr.Rs == false && stop == false) { + while (!bpf_curr.Rs && !stop) { bpf_safe_regs(); if (i++ == next) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 45ac2f9e0aa9..8ced1655fea6 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -75,8 +75,6 @@ endif INSTALL ?= install RM ?= rm -f -CLANG ?= clang -LLVM_STRIP ?= llvm-strip FEATURE_USER = .bpftool FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib libcap \ diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 1fe3ba255bad..f2b915b20546 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -368,6 +368,8 @@ static void print_prog_header_json(struct bpf_prog_info *info) jsonw_uint_field(json_wtr, "run_time_ns", info->run_time_ns); jsonw_uint_field(json_wtr, "run_cnt", info->run_cnt); } + if (info->recursion_misses) + jsonw_uint_field(json_wtr, "recursion_misses", info->recursion_misses); } static void print_prog_json(struct bpf_prog_info *info, int fd) @@ -446,6 +448,8 @@ static void print_prog_header_plain(struct bpf_prog_info *info) if (info->run_time_ns) printf(" run_time_ns %lld run_cnt %lld", info->run_time_ns, info->run_cnt); + if (info->recursion_misses) + printf(" recursion_misses %lld", info->recursion_misses); printf("\n"); } diff --git a/tools/bpf/resolve_btfids/.gitignore b/tools/bpf/resolve_btfids/.gitignore index a026df7dc280..16913fffc985 100644 --- a/tools/bpf/resolve_btfids/.gitignore +++ b/tools/bpf/resolve_btfids/.gitignore @@ -1,4 +1,3 @@ -/FEATURE-DUMP.libbpf -/bpf_helper_defs.h /fixdep /resolve_btfids +/libbpf/ diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index bf656432ad73..bb9fa8de7e62 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -2,11 +2,7 @@ include ../../scripts/Makefile.include include ../../scripts/Makefile.arch -ifeq ($(srctree),) -srctree := $(patsubst %/,%,$(dir $(CURDIR))) -srctree := $(patsubst %/,%,$(dir $(srctree))) -srctree := $(patsubst %/,%,$(dir $(srctree))) -endif +srctree := $(abspath $(CURDIR)/../../../) ifeq ($(V),1) Q = @@ -22,28 +18,29 @@ AR = $(HOSTAR) CC = $(HOSTCC) LD = $(HOSTLD) ARCH = $(HOSTARCH) +RM ?= rm OUTPUT ?= $(srctree)/tools/bpf/resolve_btfids/ 
LIBBPF_SRC := $(srctree)/tools/lib/bpf/ SUBCMD_SRC := $(srctree)/tools/lib/subcmd/ -BPFOBJ := $(OUTPUT)/libbpf.a -SUBCMDOBJ := $(OUTPUT)/libsubcmd.a +BPFOBJ := $(OUTPUT)/libbpf/libbpf.a +SUBCMDOBJ := $(OUTPUT)/libsubcmd/libsubcmd.a BINARY := $(OUTPUT)/resolve_btfids BINARY_IN := $(BINARY)-in.o all: $(BINARY) -$(OUTPUT): +$(OUTPUT) $(OUTPUT)/libbpf $(OUTPUT)/libsubcmd: $(call msg,MKDIR,,$@) - $(Q)mkdir -p $(OUTPUT) + $(Q)mkdir -p $(@) -$(SUBCMDOBJ): fixdep FORCE - $(Q)$(MAKE) -C $(SUBCMD_SRC) OUTPUT=$(OUTPUT) +$(SUBCMDOBJ): fixdep FORCE | $(OUTPUT)/libsubcmd + $(Q)$(MAKE) -C $(SUBCMD_SRC) OUTPUT=$(abspath $(dir $@))/ $(abspath $@) -$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT) +$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(abspath $(dir $@))/ $(abspath $@) CFLAGS := -g \ @@ -57,24 +54,27 @@ LIBS = -lelf -lz export srctree OUTPUT CFLAGS Q include $(srctree)/tools/build/Makefile.include -$(BINARY_IN): fixdep FORCE +$(BINARY_IN): fixdep FORCE | $(OUTPUT) $(Q)$(MAKE) $(build)=resolve_btfids $(BINARY): $(BPFOBJ) $(SUBCMDOBJ) $(BINARY_IN) $(call msg,LINK,$@) $(Q)$(CC) $(BINARY_IN) $(LDFLAGS) -o $@ $(BPFOBJ) $(SUBCMDOBJ) $(LIBS) -libsubcmd-clean: - $(Q)$(MAKE) -C $(SUBCMD_SRC) OUTPUT=$(OUTPUT) clean - -libbpf-clean: - $(Q)$(MAKE) -C $(LIBBPF_SRC) OUTPUT=$(OUTPUT) clean +clean_objects := $(wildcard $(OUTPUT)/*.o \ + $(OUTPUT)/.*.o.cmd \ + $(OUTPUT)/.*.o.d \ + $(OUTPUT)/libbpf \ + $(OUTPUT)/libsubcmd \ + $(OUTPUT)/resolve_btfids) -clean: libsubcmd-clean libbpf-clean fixdep-clean +ifneq ($(clean_objects),) +clean: fixdep-clean $(call msg,CLEAN,$(BINARY)) - $(Q)$(RM) -f $(BINARY); \ - $(RM) -rf $(if $(OUTPUT),$(OUTPUT),.)/feature; \ - find $(if $(OUTPUT),$(OUTPUT),.) 
-name \*.o -or -name \*.o.cmd -or -name \*.o.d | xargs $(RM) + $(Q)$(RM) -rf $(clean_objects) +else +clean: +endif tags: $(call msg,GEN,,tags) diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index 4d5ca54fcd4c..9d9fb6209be1 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -3,9 +3,6 @@ include ../../scripts/Makefile.include OUTPUT ?= $(abspath .output)/ -CLANG ?= clang -LLC ?= llc -LLVM_STRIP ?= llvm-strip BPFTOOL_OUTPUT := $(OUTPUT)bpftool/ DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)bpftool BPFTOOL ?= $(DEFAULT_BPFTOOL) diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 89ba522e377d..3e55edb3ea54 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -1,4 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 +include ../../scripts/Makefile.include + FILES= \ test-all.bin \ test-backtrace.bin \ @@ -76,8 +78,6 @@ FILES= \ FILES := $(addprefix $(OUTPUT),$(FILES)) PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config -LLVM_CONFIG ?= llvm-config -CLANG ?= clang all: $(FILES) diff --git a/tools/include/linux/types.h b/tools/include/linux/types.h index 154eb4e3ca7c..e9c5a215837d 100644 --- a/tools/include/linux/types.h +++ b/tools/include/linux/types.h @@ -6,7 +6,10 @@ #include <stddef.h> #include <stdint.h> +#ifndef __SANE_USERSPACE_TYPES__ #define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */ +#endif + #include <asm/types.h> #include <asm/posix_types.h> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c001766adcbc..4c24daa43bac 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1656,22 +1656,30 @@ union bpf_attr { * networking traffic statistics as it provides a global socket * identifier that can be assumed unique. * Return - * A 8-byte long non-decreasing number on success, or 0 if the - * socket field is missing inside *skb*. + * An 8-byte long unique number on success, or 0 if the socket + * field is missing inside *skb*. * * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts * *skb*, but gets socket from **struct bpf_sock_addr** context. * Return - * A 8-byte long non-decreasing number. + * An 8-byte long unique number. * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return - * A 8-byte long non-decreasing number. + * An 8-byte long unique number. + + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * An 8-byte long unique number or 0 if *sk* is NULL. * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return @@ -2231,6 +2239,9 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * + * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU + * was exceeded and the output params->mtu_result contains the MTU. + * * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. @@ -3836,6 +3847,69 @@ union bpf_attr { * Return * A pointer to a struct socket on success or NULL if the file is * not a socket.
+ * + * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) + * Description + * Check the ctx packet size against the MTU of the net device + * (based on *ifindex*). This helper will likely be used in + * combination with helpers that adjust/change the packet size. + * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows checking the MTU prior to changing the + * packet ctx. Providing a *len_diff* adjustment that is larger + * than the actual packet size (resulting in a negative packet + * size) will in principle not exceed the MTU, which is why it is + * not considered a failure. Other BPF helpers are needed for + * performing the planned size change, which is why the + * responsibility for catching a negative packet size belongs in + * those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed + * against the current net device. This is practical if the check + * isn't done prior to a redirect. + * + * The Linux kernel route table can configure MTUs on a more + * specific per-route level, which is not provided by this helper. + * For route-level MTU checks use the **bpf_fib_lookup**\ () + * helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag only works for *ctx* **struct sk_buff**. + * If the packet context contains extra packet segment buffers + * (often known as a GSO skb), then the MTU is harder to + * check at this point, because in the transmit path it is + * possible for the skb packet to get re-segmented + * (depending on net device features). This could still be + * an MTU violation, so this flag enables performing the MTU + * check against segments, with a different violation + * return code to tell it apart. The check cannot use + * *len_diff*. + * + * On return the *mtu_len* pointer contains the MTU value of the + * net device. Remember that the net device's configured MTU is + * the L3 size, which is what is returned here, while XDP and TX + * lengths operate at L2. The helper takes this into account for + * you, but remember it when using the MTU value in your BPF code. + * On input *mtu_len* must be a valid pointer, initialized to + * zero, else the verifier will reject the BPF program. + * + * Return + * * 0 on success, and populate the MTU value in the *mtu_len* pointer.
+ * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate the + * MTU value in the *mtu_len* pointer, as this can be needed for + * implementing PMTU handling: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4001,6 +4075,7 @@ union bpf_attr { FN(ktime_get_coarse_ns), \ FN(ima_inode_hash), \ FN(sock_from_file), \ + FN(check_mtu), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4501,6 +4576,7 @@ struct bpf_prog_info { __aligned_u64 prog_tags; __u64 run_time_ns; __u64 run_cnt; + __u64 recursion_misses; } __attribute__((aligned(8))); struct bpf_map_info { @@ -4981,9 +5057,13 @@ struct bpf_fib_lookup { __be16 sport; __be16 dport; - /* total length of packet from network header - used for MTU check */ - __u16 tot_len; + union { /* used for MTU check */ + /* input to lookup */ + __u16 tot_len; /* L3 length from network hdr (iph->tot_len) */ + /* output: MTU value */ + __u16 mtu_result; + }; /* input: L3 device index for lookup * output: device index from FIB lookup */ @@ -5029,6 +5109,17 @@ struct bpf_redir_neigh { }; }; +/* bpf_check_mtu flags */ +enum bpf_check_mtu_flags { + BPF_MTU_CHK_SEGS = (1U << 0), +}; + +enum bpf_check_mtu_ret { + BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */ + BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */ +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ diff --git a/tools/include/uapi/linux/bpf_perf_event.h b/tools/include/uapi/linux/bpf_perf_event.h index 8f95303f9d80..eb1b9d21250c 100644 --- a/tools/include/uapi/linux/bpf_perf_event.h +++ b/tools/include/uapi/linux/bpf_perf_event.h @@ -13,6 +13,7 @@ struct bpf_perf_event_data { bpf_user_pt_regs_t regs; __u64 sample_period; + __u64 addr; }; #endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */ diff --git a/tools/include/uapi/linux/tcp.h b/tools/include/uapi/linux/tcp.h new file mode 100644 index 000000000000..13ceeb395eb8 --- /dev/null +++ b/tools/include/uapi/linux/tcp.h @@ -0,0 +1,357 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the TCP protocol. + * + * Version: @(#)tcp.h 1.0.2 04/28/93 + * + * Author: Fred N. van Kempen, <[email protected]> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version.
+ */ +#ifndef _UAPI_LINUX_TCP_H +#define _UAPI_LINUX_TCP_H + +#include <linux/types.h> +#include <asm/byteorder.h> +#include <linux/socket.h> + +struct tcphdr { + __be16 source; + __be16 dest; + __be32 seq; + __be32 ack_seq; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 res1:4, + doff:4, + fin:1, + syn:1, + rst:1, + psh:1, + ack:1, + urg:1, + ece:1, + cwr:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 doff:4, + res1:4, + cwr:1, + ece:1, + urg:1, + ack:1, + psh:1, + rst:1, + syn:1, + fin:1; +#else +#error "Adjust your <asm/byteorder.h> defines" +#endif + __be16 window; + __sum16 check; + __be16 urg_ptr; +}; + +/* + * The union cast uses a gcc extension to avoid aliasing problems + * (union is compatible with any of its members) + * This means this part of the code is -fstrict-aliasing safe now. + */ +union tcp_word_hdr { + struct tcphdr hdr; + __be32 words[5]; +}; + +#define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) + +enum { + TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000), + TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000), + TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000), + TCP_FLAG_ACK = __constant_cpu_to_be32(0x00100000), + TCP_FLAG_PSH = __constant_cpu_to_be32(0x00080000), + TCP_FLAG_RST = __constant_cpu_to_be32(0x00040000), + TCP_FLAG_SYN = __constant_cpu_to_be32(0x00020000), + TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000), + TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000), + TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000) +}; + +/* + * TCP general constants + */ +#define TCP_MSS_DEFAULT 536U /* IPv4 (RFC1122, RFC2581) */ +#define TCP_MSS_DESIRED 1220U /* IPv6 (tunneled), EDNS0 (RFC3226) */ + +/* TCP socket options */ +#define TCP_NODELAY 1 /* Turn off Nagle's algorithm. */ +#define TCP_MAXSEG 2 /* Limit MSS */ +#define TCP_CORK 3 /* Never send partially complete segments */ +#define TCP_KEEPIDLE 4 /* Start keepalives after this period */ +#define TCP_KEEPINTVL 5 /* Interval between keepalives */ +#define TCP_KEEPCNT 6 /* Number of keepalives before death */ +#define TCP_SYNCNT 7 /* Number of SYN retransmits */ +#define TCP_LINGER2 8 /* Lifetime of orphaned FIN-WAIT-2 state */ +#define TCP_DEFER_ACCEPT 9 /* Wake up listener only when data arrive */ +#define TCP_WINDOW_CLAMP 10 /* Bound advertised window */ +#define TCP_INFO 11 /* Information about this connection. */ +#define TCP_QUICKACK 12 /* Block/reenable quick acks */ +#define TCP_CONGESTION 13 /* Congestion control algorithm */ +#define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ +#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams */ +#define TCP_THIN_DUPACK 17 /* Fast retrans.
after 1 dupack */ +#define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ +#define TCP_REPAIR 19 /* TCP sock is under repair right now */ +#define TCP_REPAIR_QUEUE 20 +#define TCP_QUEUE_SEQ 21 +#define TCP_REPAIR_OPTIONS 22 +#define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ +#define TCP_TIMESTAMP 24 +#define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ +#define TCP_CC_INFO 26 /* Get Congestion Control (optional) info */ +#define TCP_SAVE_SYN 27 /* Record SYN headers for new connections */ +#define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */ +#define TCP_REPAIR_WINDOW 29 /* Get/set window parameters */ +#define TCP_FASTOPEN_CONNECT 30 /* Attempt FastOpen with connect */ +#define TCP_ULP 31 /* Attach a ULP to a TCP connection */ +#define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ +#define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ +#define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */ +#define TCP_ZEROCOPY_RECEIVE 35 +#define TCP_INQ 36 /* Notify bytes available to read as a cmsg on read */ + +#define TCP_CM_INQ TCP_INQ + +#define TCP_TX_DELAY 37 /* delay outgoing packets by XX usec */ + + +#define TCP_REPAIR_ON 1 +#define TCP_REPAIR_OFF 0 +#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ + +struct tcp_repair_opt { + __u32 opt_code; + __u32 opt_val; +}; + +struct tcp_repair_window { + __u32 snd_wl1; + __u32 snd_wnd; + __u32 max_window; + + __u32 rcv_wnd; + __u32 rcv_wup; +}; + +enum { + TCP_NO_QUEUE, + TCP_RECV_QUEUE, + TCP_SEND_QUEUE, + TCP_QUEUES_NR, +}; + +/* why fastopen failed, from the client's perspective */ +enum tcp_fastopen_client_fail { + TFO_STATUS_UNSPEC, /* catch-all */ + TFO_COOKIE_UNAVAILABLE, /* if not in TFO_CLIENT_NO_COOKIE mode */ + TFO_DATA_NOT_ACKED, /* SYN-ACK did not ack SYN data */ + TFO_SYN_RETRANSMITTED, /* SYN-ACK did not ack SYN data after timeout */ +}; + +/* for TCP_INFO socket option */ +#define TCPI_OPT_TIMESTAMPS 1 +#define TCPI_OPT_SACK 2 +#define TCPI_OPT_WSCALE 4 +#define TCPI_OPT_ECN 8 /* ECN was negotiated at TCP session init */ +#define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ +#define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + +/* + * Sender's congestion state indicating normal or abnormal situations + * in the last round of packets sent. The state is driven by the ACK + * information and timer events. + */ +enum tcp_ca_state { + /* + * Nothing bad has been observed recently. + * No apparent reordering, packet loss, or ECN marks. + */ + TCP_CA_Open = 0, +#define TCPF_CA_Open (1<<TCP_CA_Open) + /* + * The sender enters the disordered state when it has received DUPACKs or + * SACKs in the last round of packets sent. This could be due to packet + * loss or reordering but needs further information to confirm packets + * have been lost. + */ + TCP_CA_Disorder = 1, +#define TCPF_CA_Disorder (1<<TCP_CA_Disorder) + /* + * The sender enters Congestion Window Reduction (CWR) state when it + * has received ACKs with ECN-ECE marks, or has experienced congestion + * or packet discard on the sender host (e.g. qdisc). + */ + TCP_CA_CWR = 2, +#define TCPF_CA_CWR (1<<TCP_CA_CWR) + /* + * The sender is in fast recovery and retransmitting lost packets, + * typically triggered by ACK events. + */ + TCP_CA_Recovery = 3, +#define TCPF_CA_Recovery (1<<TCP_CA_Recovery) + /* + * The sender is in loss recovery triggered by retransmission timeout.
+ */ + TCP_CA_Loss = 4 +#define TCPF_CA_Loss (1<<TCP_CA_Loss) +}; + +struct tcp_info { + __u8 tcpi_state; + __u8 tcpi_ca_state; + __u8 tcpi_retransmits; + __u8 tcpi_probes; + __u8 tcpi_backoff; + __u8 tcpi_options; + __u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; + __u8 tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2; + + __u32 tcpi_rto; + __u32 tcpi_ato; + __u32 tcpi_snd_mss; + __u32 tcpi_rcv_mss; + + __u32 tcpi_unacked; + __u32 tcpi_sacked; + __u32 tcpi_lost; + __u32 tcpi_retrans; + __u32 tcpi_fackets; + + /* Times. */ + __u32 tcpi_last_data_sent; + __u32 tcpi_last_ack_sent; /* Not remembered, sorry. */ + __u32 tcpi_last_data_recv; + __u32 tcpi_last_ack_recv; + + /* Metrics. */ + __u32 tcpi_pmtu; + __u32 tcpi_rcv_ssthresh; + __u32 tcpi_rtt; + __u32 tcpi_rttvar; + __u32 tcpi_snd_ssthresh; + __u32 tcpi_snd_cwnd; + __u32 tcpi_advmss; + __u32 tcpi_reordering; + + __u32 tcpi_rcv_rtt; + __u32 tcpi_rcv_space; + + __u32 tcpi_total_retrans; + + __u64 tcpi_pacing_rate; + __u64 tcpi_max_pacing_rate; + __u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ + __u64 tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ + __u32 tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */ + __u32 tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */ + + __u32 tcpi_notsent_bytes; + __u32 tcpi_min_rtt; + __u32 tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */ + __u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */ + + __u64 tcpi_delivery_rate; + + __u64 tcpi_busy_time; /* Time (usec) busy sending data */ + __u64 tcpi_rwnd_limited; /* Time (usec) limited by receive window */ + __u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ + + __u32 tcpi_delivered; + __u32 tcpi_delivered_ce; + + __u64 tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ + __u64 tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ + __u32 tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */ + __u32 tcpi_reord_seen; /* reordering events seen */ + + __u32 tcpi_rcv_ooopack; /* Out-of-order packets received */ + + __u32 tcpi_snd_wnd; /* peer's advertised receive window after * scaling (bytes) */ +}; + +/* netlink attribute types for SCM_TIMESTAMPING_OPT_STATS */ +enum { + TCP_NLA_PAD, + TCP_NLA_BUSY, /* Time (usec) busy sending data */ + TCP_NLA_RWND_LIMITED, /* Time (usec) limited by receive window */ + TCP_NLA_SNDBUF_LIMITED, /* Time (usec) limited by send buffer */ + TCP_NLA_DATA_SEGS_OUT, /* Data pkts sent including retransmission */ + TCP_NLA_TOTAL_RETRANS, /* Data pkts retransmitted */ + TCP_NLA_PACING_RATE, /* Pacing rate in bytes per second */ + TCP_NLA_DELIVERY_RATE, /* Delivery rate in bytes per second */ + TCP_NLA_SND_CWND, /* Sending congestion window */ + TCP_NLA_REORDERING, /* Reordering metric */ + TCP_NLA_MIN_RTT, /* minimum RTT */ + TCP_NLA_RECUR_RETRANS, /* Recurring retransmits for the current pkt */ + TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited? */ + TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ + TCP_NLA_CA_STATE, /* ca_state of socket */ + TCP_NLA_SND_SSTHRESH, /* Slow start size threshold */ + TCP_NLA_DELIVERED, /* Data pkts delivered incl.
out-of-order */ + TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ + TCP_NLA_BYTES_SENT, /* Data bytes sent including retransmission */ + TCP_NLA_BYTES_RETRANS, /* Data bytes retransmitted */ + TCP_NLA_DSACK_DUPS, /* DSACK blocks received */ + TCP_NLA_REORD_SEEN, /* reordering events seen */ + TCP_NLA_SRTT, /* smoothed RTT in usecs */ + TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */ + TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */ + TCP_NLA_EDT, /* Earliest departure time (CLOCK_MONOTONIC) */ +}; + +/* for TCP_MD5SIG socket option */ +#define TCP_MD5SIG_MAXKEYLEN 80 + +/* tcp_md5sig extension flags for TCP_MD5SIG_EXT */ +#define TCP_MD5SIG_FLAG_PREFIX 0x1 /* address prefix length */ +#define TCP_MD5SIG_FLAG_IFINDEX 0x2 /* ifindex set */ + +struct tcp_md5sig { + struct __kernel_sockaddr_storage tcpm_addr; /* address associated */ + __u8 tcpm_flags; /* extension flags */ + __u8 tcpm_prefixlen; /* address prefix */ + __u16 tcpm_keylen; /* key length */ + int tcpm_ifindex; /* device index for scope */ + __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; /* key (binary) */ +}; + +/* INET_DIAG_MD5SIG */ +struct tcp_diag_md5sig { + __u8 tcpm_family; + __u8 tcpm_prefixlen; + __u16 tcpm_keylen; + __be32 tcpm_addr[4]; + __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; +}; + +/* getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */ + +#define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1 +struct tcp_zerocopy_receive { + __u64 address; /* in: address of mapping */ + __u32 length; /* in/out: number of bytes to map/mapped */ + __u32 recv_skip_hint; /* out: amount of bytes to skip */ + __u32 inq; /* out: amount of bytes in read queue */ + __s32 err; /* out: socket error */ + __u64 copybuf_address; /* in: copybuf address (small reads) */ + __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ + __u32 flags; /* in: flags */ +}; +#endif /* _UAPI_LINUX_TCP_H */ diff --git a/tools/lib/bpf/.gitignore b/tools/lib/bpf/.gitignore index 8a81b3679d2b..5d4cfac671d5 100644 --- a/tools/lib/bpf/.gitignore +++ b/tools/lib/bpf/.gitignore @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only libbpf_version.h libbpf.pc -FEATURE-DUMP.libbpf libbpf.so.* TAGS tags diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 55bd78b3496f..887a494ad5fc 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -58,28 +58,7 @@ ifndef VERBOSE VERBOSE = 0 endif -FEATURE_USER = .libbpf -FEATURE_TESTS = libelf zlib bpf -FEATURE_DISPLAY = libelf zlib bpf - INCLUDES = -I.
-I$(srctree)/tools/include -I$(srctree)/tools/include/uapi -FEATURE_CHECK_CFLAGS-bpf = $(INCLUDES) - -check_feat := 1 -NON_CHECK_FEAT_TARGETS := clean TAGS tags cscope help -ifdef MAKECMDGOALS -ifeq ($(filter-out $(NON_CHECK_FEAT_TARGETS),$(MAKECMDGOALS)),) - check_feat := 0 -endif -endif - -ifeq ($(check_feat),1) -ifeq ($(FEATURES_DUMP),) -include $(srctree)/tools/build/Makefile.feature -else -include $(FEATURES_DUMP) -endif -endif export prefix libdir src obj @@ -157,7 +136,7 @@ all: fixdep all_cmd: $(CMD_TARGETS) check -$(BPF_IN_SHARED): force elfdep zdep bpfdep $(BPF_HELPER_DEFS) +$(BPF_IN_SHARED): force $(BPF_HELPER_DEFS) @(test -f ../../include/uapi/linux/bpf.h -a -f ../../../include/uapi/linux/bpf.h && ( \ (diff -B ../../include/uapi/linux/bpf.h ../../../include/uapi/linux/bpf.h >/dev/null) || \ echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf.h' differs from latest version at 'include/uapi/linux/bpf.h'" >&2 )) || true @@ -175,7 +154,7 @@ $(BPF_IN_SHARED): force elfdep zdep bpfdep $(BPF_HELPER_DEFS) echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_xdp.h' differs from latest version at 'include/uapi/linux/if_xdp.h'" >&2 )) || true $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(SHARED_OBJDIR) CFLAGS="$(CFLAGS) $(SHLIB_FLAGS)" -$(BPF_IN_STATIC): force elfdep zdep bpfdep $(BPF_HELPER_DEFS) +$(BPF_IN_STATIC): force $(BPF_HELPER_DEFS) $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR) $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h @@ -264,34 +243,16 @@ install_pkgconfig: $(PC_FILE) install: install_lib install_pkgconfig install_headers -### Cleaning rules - -config-clean: - $(call QUIET_CLEAN, feature-detect) - $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ clean >/dev/null - -clean: config-clean +clean: $(call QUIET_CLEAN, libbpf) $(RM) -rf $(CMD_TARGETS) \ *~ .*.d .*.cmd LIBBPF-CFLAGS $(BPF_HELPER_DEFS) \ $(SHARED_OBJDIR) $(STATIC_OBJDIR) \ $(addprefix $(OUTPUT), \ *.o *.a *.so *.so.$(LIBBPF_MAJOR_VERSION) *.pc) - $(call QUIET_CLEAN, core-gen) $(RM) $(OUTPUT)FEATURE-DUMP.libbpf - - -PHONY += force elfdep zdep bpfdep cscope tags +PHONY += force cscope tags force: -elfdep: - @if [ "$(feature-libelf)" != "1" ]; then echo "No libelf found"; exit 1 ; fi - -zdep: - @if [ "$(feature-zlib)" != "1" ]; then echo "No zlib found"; exit 1 ; fi - -bpfdep: - @if [ "$(feature-bpf)" != "1" ]; then echo "BPF API too old"; exit 1 ; fi - cscope: ls *.c *.h > cscope.files cscope -b -q -I $(srctree)/include -f cscope.out diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 9970a288dda5..d9c10830d749 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -858,6 +858,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, Elf_Scn *scn = NULL; Elf *elf = NULL; GElf_Ehdr ehdr; + size_t shstrndx; if (elf_version(EV_CURRENT) == EV_NONE) { pr_warn("failed to init libelf for %s\n", path); @@ -882,7 +883,14 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, pr_warn("failed to get EHDR from %s\n", path); goto done; } - if (!elf_rawdata(elf_getscn(elf, ehdr.e_shstrndx), NULL)) { + + if (elf_getshdrstrndx(elf, &shstrndx)) { + pr_warn("failed to get section names section index for %s\n", + path); + goto done; + } + + if (!elf_rawdata(elf_getscn(elf, shstrndx), NULL)) { pr_warn("failed to get e_shstrndx from %s\n", path); goto done; } @@ -897,7 +905,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, idx, path); goto done; } - name = elf_strptr(elf, ehdr.e_shstrndx, sh.sh_name); + name = 
elf_strptr(elf, shstrndx, sh.sh_name); if (!name) { pr_warn("failed to get section(%d) name from %s\n", idx, path); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2abbc3800568..d43cc3f29dae 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -884,24 +884,24 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map, if (btf_is_ptr(mtype)) { struct bpf_program *prog; - mtype = skip_mods_and_typedefs(btf, mtype->type, &mtype_id); + prog = st_ops->progs[i]; + if (!prog) + continue; + kern_mtype = skip_mods_and_typedefs(kern_btf, kern_mtype->type, &kern_mtype_id); - if (!btf_is_func_proto(mtype) || - !btf_is_func_proto(kern_mtype)) { - pr_warn("struct_ops init_kern %s: non func ptr %s is not supported\n", + + /* mtype->type must be a func_proto which was + * guaranteed in bpf_object__collect_st_ops_relos(), + * so only check kern_mtype for func_proto here. + */ + if (!btf_is_func_proto(kern_mtype)) { + pr_warn("struct_ops init_kern %s: kernel member %s is not a func ptr\n", map->name, mname); return -ENOTSUP; } - prog = st_ops->progs[i]; - if (!prog) { - pr_debug("struct_ops init_kern %s: func ptr %s is not set\n", - map->name, mname); - continue; - } - prog->attach_btf_id = kern_type_id; prog->expected_attach_type = kern_member_idx; diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index e3e41ceeb1bc..ffbb588724d8 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -46,6 +46,11 @@ #define PF_XDP AF_XDP #endif +enum xsk_prog { + XSK_PROG_FALLBACK, + XSK_PROG_REDIRECT_FLAGS, +}; + struct xsk_umem { struct xsk_ring_prod *fill_save; struct xsk_ring_cons *comp_save; @@ -351,6 +356,54 @@ int xsk_umem__create_v0_0_2(struct xsk_umem **umem_ptr, void *umem_area, COMPAT_VERSION(xsk_umem__create_v0_0_2, xsk_umem__create, LIBBPF_0.0.2) DEFAULT_VERSION(xsk_umem__create_v0_0_4, xsk_umem__create, LIBBPF_0.0.4) +static enum xsk_prog get_xsk_prog(void) +{ + enum xsk_prog detected = XSK_PROG_FALLBACK; + struct bpf_load_program_attr prog_attr; + struct bpf_create_map_attr map_attr; + __u32 size_out, retval, duration; + char data_in = 0, data_out; + struct bpf_insn insns[] = { + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, XDP_PASS), + BPF_EMIT_CALL(BPF_FUNC_redirect_map), + BPF_EXIT_INSN(), + }; + int prog_fd, map_fd, ret; + + memset(&map_attr, 0, sizeof(map_attr)); + map_attr.map_type = BPF_MAP_TYPE_XSKMAP; + map_attr.key_size = sizeof(int); + map_attr.value_size = sizeof(int); + map_attr.max_entries = 1; + + map_fd = bpf_create_map_xattr(&map_attr); + if (map_fd < 0) + return detected; + + insns[0].imm = map_fd; + + memset(&prog_attr, 0, sizeof(prog_attr)); + prog_attr.prog_type = BPF_PROG_TYPE_XDP; + prog_attr.insns = insns; + prog_attr.insns_cnt = ARRAY_SIZE(insns); + prog_attr.license = "GPL"; + + prog_fd = bpf_load_program_xattr(&prog_attr, NULL, 0); + if (prog_fd < 0) { + close(map_fd); + return detected; + } + + ret = bpf_prog_test_run(prog_fd, 0, &data_in, 1, &data_out, &size_out, &retval, &duration); + if (!ret && retval == XDP_PASS) + detected = XSK_PROG_REDIRECT_FLAGS; + close(prog_fd); + close(map_fd); + return detected; +} + static int xsk_load_xdp_prog(struct xsk_socket *xsk) { static const int log_buf_size = 16 * 1024; @@ -358,7 +411,7 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk) char log_buf[log_buf_size]; int err, prog_fd; - /* This is the C-program: + /* This is the fallback C-program: * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) * { * int ret, index = ctx->rx_queue_index; 
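The get_xsk_prog() function above is a small runtime feature probe: it hand-assembles a tiny XDP program that calls **bpf_redirect_map**\ () with a non-zero *flags* argument (XDP_PASS) and test-runs it against an empty XSKMAP. Kernels v5.3 and later return the *flags* value when the map lookup fails, so the test run yields XDP_PASS and the short redirect program can be used; older kernels never return XDP_PASS here, and libbpf keeps the longer fallback program. A rough C equivalent of the hand-assembled probe (a sketch assuming BTF-style map definitions; the program and map names are illustrative)::

    // SPDX-License-Identifier: GPL-2.0
    /* Sketch of the probe program get_xsk_prog() assembles by hand. */
    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_XSKMAP);
            __uint(max_entries, 1);
            __uint(key_size, sizeof(int));
            __uint(value_size, sizeof(int));
    } xsks_map SEC(".maps");

    SEC("xdp")
    int probe_redirect_flags(struct xdp_md *ctx)
    {
            /* Index 0 holds no socket, so the lookup fails: kernels
             * that support the flags argument return XDP_PASS (the
             * flags value), older kernels return XDP_ABORTED.
             */
            return bpf_redirect_map(&xsks_map, 0, XDP_PASS);
    }

    char _license[] SEC("license") = "GPL";

Probing with bpf_prog_test_run() rather than parsing kernel version strings keeps the detection correct on distribution kernels that backport features.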
@@ -414,9 +467,31 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk) /* The jumps are to this instruction */ BPF_EXIT_INSN(), }; - size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); - prog_fd = bpf_load_program(BPF_PROG_TYPE_XDP, prog, insns_cnt, + /* This is the post-5.3 kernel C-program: + * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) + * { + * return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS); + * } + */ + struct bpf_insn prog_redirect_flags[] = { + /* r2 = *(u32 *)(r1 + 16) */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16), + /* r1 = xskmap[] */ + BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd), + /* r3 = XDP_PASS */ + BPF_MOV64_IMM(BPF_REG_3, 2), + /* call bpf_redirect_map */ + BPF_EMIT_CALL(BPF_FUNC_redirect_map), + BPF_EXIT_INSN(), + }; + size_t insns_cnt[] = {sizeof(prog) / sizeof(struct bpf_insn), + sizeof(prog_redirect_flags) / sizeof(struct bpf_insn), + }; + struct bpf_insn *progs[] = {prog, prog_redirect_flags}; + enum xsk_prog option = get_xsk_prog(); + + prog_fd = bpf_load_program(BPF_PROG_TYPE_XDP, progs[option], insns_cnt[option], "LGPL-2.1 or BSD-2-Clause", 0, log_buf, log_buf_size); if (prog_fd < 0) { @@ -442,7 +517,7 @@ static int xsk_get_max_queues(struct xsk_socket *xsk) struct ifreq ifr = {}; int fd, err, ret; - fd = socket(AF_INET, SOCK_DGRAM, 0); + fd = socket(AF_LOCAL, SOCK_DGRAM, 0); if (fd < 0) return -errno; diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 62f3deb1d3a8..f4df7534026d 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -176,7 +176,6 @@ endef LD += $(EXTRA_LDFLAGS) PKG_CONFIG = $(CROSS_COMPILE)pkg-config -LLVM_CONFIG ?= llvm-config RM = rm -f LN = ln -f diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index 1358e89cdf7d..4255e71f72b7 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -69,6 +69,13 @@ HOSTCC ?= gcc HOSTLD ?= ld endif +# Some tools require Clang, LLC and/or LLVM utils +CLANG ?= clang +LLC ?= llc +LLVM_CONFIG ?= llvm-config +LLVM_OBJCOPY ?= llvm-objcopy +LLVM_STRIP ?= llvm-strip + ifeq ($(CC_NO_CLANG), 1) EXTRA_WARNINGS += -Wstrict-aliasing=3 endif diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index f5b7ef93618c..c0c48fdb9ac1 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -17,7 +17,6 @@ test_sockmap test_lirc_mode2_user get_cgroup_id_user test_skb_cgroup_id_user -test_socket_cookie test_cgroup_storage test_flow_dissector flow_dissector_load @@ -26,7 +25,6 @@ test_tcpnotify_user test_libbpf test_tcp_check_syncookie_user test_sysctl -test_current_pid_tgid_new_ns xdping test_cpp *.skel.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 0552b07717b6..044bfdcf5b74 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -19,8 +19,6 @@ ifneq ($(wildcard $(GENHDR)),) GENFLAGS := -DHAVE_GENHDR endif -CLANG ?= clang -LLVM_OBJCOPY ?= llvm-objcopy BPF_GCC ?= $(shell command -v bpf-gcc;) SAN_CFLAGS ?= CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) $(SAN_CFLAGS) \ @@ -33,11 +31,10 @@ LDLIBS += -lcap -lelf -lz -lrt -lpthread # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_verifier_log test_dev_cgroup \ - test_sock test_sockmap get_cgroup_id_user test_socket_cookie \ + test_sock test_sockmap get_cgroup_id_user \ test_cgroup_storage \ test_netcnt 
test_tcpnotify_user test_sysctl \ - test_progs-no_alu32 \ - test_current_pid_tgid_new_ns + test_progs-no_alu32 # Also test bpf-gcc, if present ifneq ($(BPF_GCC),) @@ -188,7 +185,6 @@ $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c $(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c $(OUTPUT)/test_sock: cgroup_helpers.c $(OUTPUT)/test_sock_addr: cgroup_helpers.c -$(OUTPUT)/test_socket_cookie: cgroup_helpers.c $(OUTPUT)/test_sockmap: cgroup_helpers.c $(OUTPUT)/test_tcpnotify_user: cgroup_helpers.c trace_helpers.c $(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index ca064180d4d0..fd148b8410fa 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -6,6 +6,30 @@ General instructions on running selftests can be found in __ /Documentation/bpf/bpf_devel_QA.rst#q-how-to-run-bpf-selftests +========================= +Running Selftests in a VM +========================= + +It's now possible to run the selftests using ``tools/testing/selftests/bpf/vmtest.sh``. +The script tries to ensure that the tests are run in the same environment as they +would be run post-submit in the CI used by the maintainers. + +This script downloads a suitable Kconfig and VM userspace image from the system used by +the CI. It builds the kernel (without overwriting your existing Kconfig), recompiles the +bpf selftests, runs them (by default ``tools/testing/selftests/bpf/test_progs``) and +saves the resulting output (by default in ``~/.bpf_selftests``). + +For more information about using the script, run: + +.. code-block:: console + + $ tools/testing/selftests/bpf/vmtest.sh -h + +.. note:: The script uses pahole and clang based on the host environment settings. + If you want to use a different pahole or llvm, you can change the `PATH` + environment variable at the beginning of the script. + +.. note:: The script currently only supports x86_64. Additional information about selftest failures are documented here. diff --git a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c index da87c7f31891..bde6c9d4cbd4 100644 --- a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c +++ b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c @@ -319,7 +319,7 @@ static void ringbuf_custom_process_ring(struct ringbuf_custom *r) smp_store_release(r->consumer_pos, cons_pos); else break; - }; + } } static void *ringbuf_custom_consumer(void *input) diff --git a/tools/testing/selftests/bpf/bpf_sockopt_helpers.h b/tools/testing/selftests/bpf/bpf_sockopt_helpers.h new file mode 100644 index 000000000000..11f3a0976174 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_sockopt_helpers.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <sys/socket.h> +#include <bpf/bpf_helpers.h> + +int get_set_sk_priority(void *ctx) +{ + int prio; + + /* Verify that context allows calling bpf_getsockopt and + * bpf_setsockopt by reading and writing back socket + * priority.
+ */ + + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio))) + return 0; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio))) + return 0; + + return 1; +} diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h index 6a9053162cf2..91f0fac632f4 100644 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h @@ -177,6 +177,7 @@ struct tcp_congestion_ops { * after all the ca_state processing. (optional) */ void (*cong_control)(struct sock *sk, const struct rate_sample *rs); + void *owner; }; #define min(a, b) ((a) < (b) ? (a) : (b)) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h index b83ea448bc79..89c6d58e5dd6 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h @@ -28,6 +28,12 @@ TRACE_EVENT(bpf_testmod_test_read, __entry->pid, __entry->comm, __entry->off, __entry->len) ); +/* A bare tracepoint with no event associated with it */ +DECLARE_TRACE(bpf_testmod_test_write_bare, + TP_PROTO(struct task_struct *task, struct bpf_testmod_test_write_ctx *ctx), + TP_ARGS(task, ctx) +); + #endif /* _BPF_TESTMOD_EVENTS_H */ #undef TRACE_INCLUDE_PATH diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 0b991e115d1f..141d8da687d2 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -31,9 +31,28 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, EXPORT_SYMBOL(bpf_testmod_test_read); ALLOW_ERROR_INJECTION(bpf_testmod_test_read, ERRNO); +noinline ssize_t +bpf_testmod_test_write(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + struct bpf_testmod_test_write_ctx ctx = { + .buf = buf, + .off = off, + .len = len, + }; + + trace_bpf_testmod_test_write_bare(current, &ctx); + + return -EIO; /* always fail */ +} +EXPORT_SYMBOL(bpf_testmod_test_write); +ALLOW_ERROR_INJECTION(bpf_testmod_test_write, ERRNO); + static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { - .attr = { .name = "bpf_testmod", .mode = 0444, }, + .attr = { .name = "bpf_testmod", .mode = 0666, }, .read = bpf_testmod_test_read, + .write = bpf_testmod_test_write, }; static int bpf_testmod_init(void) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h index b81adfedb4f6..b3892dc40111 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h @@ -11,4 +11,10 @@ struct bpf_testmod_test_read_ctx { size_t len; }; +struct bpf_testmod_test_write_ctx { + char *buf; + loff_t off; + size_t len; +}; + #endif /* _BPF_TESTMOD_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/atomic_bounds.c b/tools/testing/selftests/bpf/prog_tests/atomic_bounds.c new file mode 100644 index 000000000000..69bd7853e8f1 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/atomic_bounds.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> + +#include "atomic_bounds.skel.h" + +void test_atomic_bounds(void) +{ + struct atomic_bounds *skel; + __u32 duration = 0; + + skel = atomic_bounds__open_and_load(); + if (CHECK(!skel, "skel_load", "couldn't load 
program\n")) + return; + + atomic_bounds__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bind_perm.c b/tools/testing/selftests/bpf/prog_tests/bind_perm.c new file mode 100644 index 000000000000..d0f06e40c16d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bind_perm.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include "bind_perm.skel.h" + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/capability.h> + +static int duration; + +void try_bind(int family, int port, int expected_errno) +{ + struct sockaddr_storage addr = {}; + struct sockaddr_in6 *sin6; + struct sockaddr_in *sin; + int fd = -1; + + fd = socket(family, SOCK_STREAM, 0); + if (CHECK(fd < 0, "fd", "errno %d", errno)) + goto close_socket; + + if (family == AF_INET) { + sin = (struct sockaddr_in *)&addr; + sin->sin_family = family; + sin->sin_port = htons(port); + } else { + sin6 = (struct sockaddr_in6 *)&addr; + sin6->sin6_family = family; + sin6->sin6_port = htons(port); + } + + errno = 0; + bind(fd, (struct sockaddr *)&addr, sizeof(addr)); + ASSERT_EQ(errno, expected_errno, "bind"); + +close_socket: + if (fd >= 0) + close(fd); +} + +bool cap_net_bind_service(cap_flag_value_t flag) +{ + const cap_value_t cap_net_bind_service = CAP_NET_BIND_SERVICE; + cap_flag_value_t original_value; + bool was_effective = false; + cap_t caps; + + caps = cap_get_proc(); + if (CHECK(!caps, "cap_get_proc", "errno %d", errno)) + goto free_caps; + + if (CHECK(cap_get_flag(caps, CAP_NET_BIND_SERVICE, CAP_EFFECTIVE, + &original_value), + "cap_get_flag", "errno %d", errno)) + goto free_caps; + + was_effective = (original_value == CAP_SET); + + if (CHECK(cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_net_bind_service, + flag), + "cap_set_flag", "errno %d", errno)) + goto free_caps; + + if (CHECK(cap_set_proc(caps), "cap_set_proc", "errno %d", errno)) + goto free_caps; + +free_caps: + CHECK(cap_free(caps), "cap_free", "errno %d", errno); + return was_effective; +} + +void test_bind_perm(void) +{ + bool cap_was_effective; + struct bind_perm *skel; + int cgroup_fd; + + cgroup_fd = test__join_cgroup("/bind_perm"); + if (CHECK(cgroup_fd < 0, "cg-join", "errno %d", errno)) + return; + + skel = bind_perm__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + goto close_cgroup_fd; + + skel->links.bind_v4_prog = bpf_program__attach_cgroup(skel->progs.bind_v4_prog, cgroup_fd); + if (!ASSERT_OK_PTR(skel, "bind_v4_prog")) + goto close_skeleton; + + skel->links.bind_v6_prog = bpf_program__attach_cgroup(skel->progs.bind_v6_prog, cgroup_fd); + if (!ASSERT_OK_PTR(skel, "bind_v6_prog")) + goto close_skeleton; + + cap_was_effective = cap_net_bind_service(CAP_CLEAR); + + try_bind(AF_INET, 110, EACCES); + try_bind(AF_INET6, 110, EACCES); + + try_bind(AF_INET, 111, 0); + try_bind(AF_INET6, 111, 0); + + if (cap_was_effective) + cap_net_bind_service(CAP_SET); + +close_skeleton: + bind_perm__destroy(skel); +close_cgroup_fd: + close(cgroup_fd); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 0e586368948d..74c45d557a2b 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -7,6 +7,7 @@ #include "bpf_iter_task.skel.h" #include "bpf_iter_task_stack.skel.h" #include "bpf_iter_task_file.skel.h" +#include "bpf_iter_task_vma.skel.h" #include "bpf_iter_task_btf.skel.h" #include "bpf_iter_tcp4.skel.h" #include "bpf_iter_tcp6.skel.h" @@ -64,6 +65,22 @@ free_link: 
bpf_link__destroy(link); } +static int read_fd_into_buffer(int fd, char *buf, int size) +{ + int bufleft = size; + int len; + + do { + len = read(fd, buf, bufleft); + if (len > 0) { + buf += len; + bufleft -= len; + } + } while (len > 0); + + return len < 0 ? len : size - bufleft; +} + static void test_ipv6_route(void) { struct bpf_iter_ipv6_route *skel; @@ -177,7 +194,7 @@ static int do_btf_read(struct bpf_iter_task_btf *skel) { struct bpf_program *prog = skel->progs.dump_task_struct; struct bpf_iter_task_btf__bss *bss = skel->bss; - int iter_fd = -1, len = 0, bufleft = TASKBUFSZ; + int iter_fd = -1, err; struct bpf_link *link; char *buf = taskbuf; int ret = 0; @@ -190,14 +207,7 @@ static int do_btf_read(struct bpf_iter_task_btf *skel) if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) goto free_link; - do { - len = read(iter_fd, buf, bufleft); - if (len > 0) { - buf += len; - bufleft -= len; - } - } while (len > 0); - + err = read_fd_into_buffer(iter_fd, buf, TASKBUFSZ); if (bss->skip) { printf("%s:SKIP:no __builtin_btf_type_id\n", __func__); ret = 1; @@ -205,7 +215,7 @@ static int do_btf_read(struct bpf_iter_task_btf *skel) goto free_link; } - if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + if (CHECK(err < 0, "read", "read failed: %s\n", strerror(errno))) goto free_link; CHECK(strstr(taskbuf, "(struct task_struct)") == NULL, @@ -1133,6 +1143,92 @@ static void test_buf_neg_offset(void) bpf_iter_test_kern6__destroy(skel); } +#define CMP_BUFFER_SIZE 1024 +static char task_vma_output[CMP_BUFFER_SIZE]; +static char proc_maps_output[CMP_BUFFER_SIZE]; + +/* strip spaces and tabs from str, and only keep the first line */ +static void str_strip_first_line(char *str) +{ + char *dst = str, *src = str; + + do { + if (*src == ' ' || *src == '\t') + src++; + else + *(dst++) = *(src++); + + } while (*src != '\0' && *src != '\n'); + + *dst = '\0'; } + +#define min(a, b) ((a) < (b) ? (a) : (b)) + +static void test_task_vma(void) +{ + int err, iter_fd = -1, proc_maps_fd = -1; + struct bpf_iter_task_vma *skel; + int len, read_size = 4; + char maps_path[64]; + + skel = bpf_iter_task_vma__open(); + if (CHECK(!skel, "bpf_iter_task_vma__open", "skeleton open failed\n")) + return; + + skel->bss->pid = getpid(); + + err = bpf_iter_task_vma__load(skel); + if (CHECK(err, "bpf_iter_task_vma__load", "skeleton load failed\n")) + goto out; + + skel->links.proc_maps = bpf_program__attach_iter( + skel->progs.proc_maps, NULL); + + if (CHECK(IS_ERR(skel->links.proc_maps), "bpf_program__attach_iter", + "attach iterator failed\n")) { + skel->links.proc_maps = NULL; + goto out; + } + + iter_fd = bpf_iter_create(bpf_link__fd(skel->links.proc_maps)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto out; + + /* Read CMP_BUFFER_SIZE (1kB) from bpf_iter. Read in small chunks + * to trigger seq_file corner cases. The expected output is much + * longer than 1kB, so the while loop will terminate.
+ */ + len = 0; + while (len < CMP_BUFFER_SIZE) { + err = read_fd_into_buffer(iter_fd, task_vma_output + len, + min(read_size, CMP_BUFFER_SIZE - len)); + if (CHECK(err < 0, "read_iter_fd", "read_iter_fd failed\n")) + goto out; + len += err; + } + + /* read CMP_BUFFER_SIZE (1kB) from /proc/pid/maps */ + snprintf(maps_path, 64, "/proc/%u/maps", skel->bss->pid); + proc_maps_fd = open(maps_path, O_RDONLY); + if (CHECK(proc_maps_fd < 0, "open_proc_maps", "open_proc_maps failed\n")) + goto out; + err = read_fd_into_buffer(proc_maps_fd, proc_maps_output, CMP_BUFFER_SIZE); + if (CHECK(err < 0, "read_prog_maps_fd", "read_prog_maps_fd failed\n")) + goto out; + + /* strip and compare the first line of the two files */ + str_strip_first_line(task_vma_output); + str_strip_first_line(proc_maps_output); + + CHECK(strcmp(task_vma_output, proc_maps_output), "compare_output", + "found mismatch\n"); +out: + close(proc_maps_fd); + close(iter_fd); + bpf_iter_task_vma__destroy(skel); +} + void test_bpf_iter(void) { if (test__start_subtest("btf_id_or_null")) @@ -1149,6 +1245,8 @@ void test_bpf_iter(void) test_task_stack(); if (test__start_subtest("task_file")) test_task_file(); + if (test__start_subtest("task_vma")) + test_task_vma(); if (test__start_subtest("task_btf")) test_task_btf(); if (test__start_subtest("tcp4")) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 9a8f47fc0b91..37c5494a0381 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -2,6 +2,7 @@ /* Copyright (c) 2019 Facebook */ #include <linux/err.h> +#include <netinet/tcp.h> #include <test_progs.h> #include "bpf_dctcp.skel.h" #include "bpf_cubic.skel.h" diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 8ae97e2a4b9d..6a7ee7420701 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -914,7 +914,7 @@ static struct btf_raw_test raw_tests[] = { .err_str = "Member exceeds struct_size", }, -/* Test member exeeds the size of struct +/* Test member exceeds the size of struct * * struct A { * int m; @@ -948,7 +948,7 @@ static struct btf_raw_test raw_tests[] = { .err_str = "Member exceeds struct_size", }, -/* Test member exeeds the size of struct +/* Test member exceeds the size of struct * * struct A { * int m; @@ -3509,6 +3509,27 @@ static struct btf_raw_test raw_tests[] = { .value_type_id = 3 /* arr_t */, .max_entries = 4, }, +/* + * elf .rodata section size 4 and btf .rodata section vlen 0. 
+ */ +{ + .descr = "datasec: vlen == 0", + .raw_types = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* .rodata section */ + BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 0), 4), + /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0.rodata"), + .map_type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 1, +}, }; /* struct btf_raw_test raw_tests[] */ diff --git a/tools/testing/selftests/bpf/prog_tests/check_mtu.c b/tools/testing/selftests/bpf/prog_tests/check_mtu.c new file mode 100644 index 000000000000..36af1c138faf --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/check_mtu.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Jesper Dangaard Brouer */ + +#include <linux/if_link.h> /* before test_progs.h, avoid bpf_util.h redefines */ +#include <test_progs.h> +#include "test_check_mtu.skel.h" +#include "network_helpers.h" + +#include <stdlib.h> +#include <inttypes.h> + +#define IFINDEX_LO 1 + +static __u32 duration; /* Hint: needed for CHECK macro */ + +static int read_mtu_device_lo(void) +{ + const char *filename = "/sys/class/net/lo/mtu"; + char buf[11] = {}; + int value, n, fd; + + fd = open(filename, O_RDONLY); + if (fd == -1) + return -1; + + n = read(fd, buf, sizeof(buf)); + close(fd); + + if (n == -1) + return -2; + + errno = 0; /* reset before the strtoimax() range check below */ + value = strtoimax(buf, NULL, 10); + if (errno == ERANGE) + return -3; + + return value; +} + +static void test_check_mtu_xdp_attach(void) +{ + struct bpf_link_info link_info; + __u32 link_info_len = sizeof(link_info); + struct test_check_mtu *skel; + struct bpf_program *prog; + struct bpf_link *link; + int err = 0; + int fd; + + skel = test_check_mtu__open_and_load(); + if (CHECK(!skel, "open and load skel", "failed")) + return; /* Exit if e.g.
helper unknown to kernel */ + + prog = skel->progs.xdp_use_helper_basic; + + link = bpf_program__attach_xdp(prog, IFINDEX_LO); + if (CHECK(IS_ERR(link), "link_attach", "failed: %ld\n", PTR_ERR(link))) + goto out; + skel->links.xdp_use_helper_basic = link; + + memset(&link_info, 0, sizeof(link_info)); + fd = bpf_link__fd(link); + err = bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len); + if (CHECK(err, "link_info", "failed: %d\n", err)) + goto out; + + CHECK(link_info.type != BPF_LINK_TYPE_XDP, "link_type", + "got %u != exp %u\n", link_info.type, BPF_LINK_TYPE_XDP); + CHECK(link_info.xdp.ifindex != IFINDEX_LO, "link_ifindex", + "got %u != exp %u\n", link_info.xdp.ifindex, IFINDEX_LO); + + err = bpf_link__detach(link); + CHECK(err, "link_detach", "failed %d\n", err); + +out: + test_check_mtu__destroy(skel); +} + +static void test_check_mtu_run_xdp(struct test_check_mtu *skel, + struct bpf_program *prog, + __u32 mtu_expect) +{ + const char *prog_name = bpf_program__name(prog); + int retval_expect = XDP_PASS; + __u32 mtu_result = 0; + char buf[256] = {}; + int err; + struct bpf_prog_test_run_attr tattr = { + .repeat = 1, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .data_out = buf, + .data_size_out = sizeof(buf), + .prog_fd = bpf_program__fd(prog), + }; + + err = bpf_prog_test_run_xattr(&tattr); + CHECK_ATTR(err != 0, "bpf_prog_test_run", + "prog_name:%s (err %d errno %d retval %d)\n", + prog_name, err, errno, tattr.retval); + + CHECK(tattr.retval != retval_expect, "retval", + "progname:%s unexpected retval=%d expected=%d\n", + prog_name, tattr.retval, retval_expect); + + /* Extract MTU that BPF-prog got */ + mtu_result = skel->bss->global_bpf_mtu_xdp; + ASSERT_EQ(mtu_result, mtu_expect, "MTU-compare-user"); +} + + +static void test_check_mtu_xdp(__u32 mtu, __u32 ifindex) +{ + struct test_check_mtu *skel; + int err; + + skel = test_check_mtu__open(); + if (CHECK(!skel, "skel_open", "failed")) + return; + + /* Update "constants" in BPF-prog *BEFORE* libbpf load */ + skel->rodata->GLOBAL_USER_MTU = mtu; + skel->rodata->GLOBAL_USER_IFINDEX = ifindex; + + err = test_check_mtu__load(skel); + if (CHECK(err, "skel_load", "failed: %d\n", err)) + goto cleanup; + + test_check_mtu_run_xdp(skel, skel->progs.xdp_use_helper, mtu); + test_check_mtu_run_xdp(skel, skel->progs.xdp_exceed_mtu, mtu); + test_check_mtu_run_xdp(skel, skel->progs.xdp_minus_delta, mtu); + +cleanup: + test_check_mtu__destroy(skel); +} + +static void test_check_mtu_run_tc(struct test_check_mtu *skel, + struct bpf_program *prog, + __u32 mtu_expect) +{ + const char *prog_name = bpf_program__name(prog); + int retval_expect = BPF_OK; + __u32 mtu_result = 0; + char buf[256] = {}; + int err; + struct bpf_prog_test_run_attr tattr = { + .repeat = 1, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .data_out = buf, + .data_size_out = sizeof(buf), + .prog_fd = bpf_program__fd(prog), + }; + + err = bpf_prog_test_run_xattr(&tattr); + CHECK_ATTR(err != 0, "bpf_prog_test_run", + "prog_name:%s (err %d errno %d retval %d)\n", + prog_name, err, errno, tattr.retval); + + CHECK(tattr.retval != retval_expect, "retval", + "progname:%s unexpected retval=%d expected=%d\n", + prog_name, tattr.retval, retval_expect); + + /* Extract MTU that BPF-prog got */ + mtu_result = skel->bss->global_bpf_mtu_tc; + ASSERT_EQ(mtu_result, mtu_expect, "MTU-compare-user"); +} + + +static void test_check_mtu_tc(__u32 mtu, __u32 ifindex) +{ + struct test_check_mtu *skel; + int err; + + skel = test_check_mtu__open(); + if (CHECK(!skel, "skel_open", 
"failed")) + return; + + /* Update "constants" in BPF-prog *BEFORE* libbpf load */ + skel->rodata->GLOBAL_USER_MTU = mtu; + skel->rodata->GLOBAL_USER_IFINDEX = ifindex; + + err = test_check_mtu__load(skel); + if (CHECK(err, "skel_load", "failed: %d\n", err)) + goto cleanup; + + test_check_mtu_run_tc(skel, skel->progs.tc_use_helper, mtu); + test_check_mtu_run_tc(skel, skel->progs.tc_exceed_mtu, mtu); + test_check_mtu_run_tc(skel, skel->progs.tc_exceed_mtu_da, mtu); + test_check_mtu_run_tc(skel, skel->progs.tc_minus_delta, mtu); +cleanup: + test_check_mtu__destroy(skel); +} + +void test_check_mtu(void) +{ + __u32 mtu_lo; + + if (test__start_subtest("bpf_check_mtu XDP-attach")) + test_check_mtu_xdp_attach(); + + mtu_lo = read_mtu_device_lo(); + if (CHECK(mtu_lo < 0, "reading MTU value", "failed (err:%d)", mtu_lo)) + return; + + if (test__start_subtest("bpf_check_mtu XDP-run")) + test_check_mtu_xdp(mtu_lo, 0); + + if (test__start_subtest("bpf_check_mtu XDP-run ifindex-lookup")) + test_check_mtu_xdp(mtu_lo, IFINDEX_LO); + + if (test__start_subtest("bpf_check_mtu TC-run")) + test_check_mtu_tc(mtu_lo, 0); + + if (test__start_subtest("bpf_check_mtu TC-run ifindex-lookup")) + test_check_mtu_tc(mtu_lo, IFINDEX_LO); +} diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c index 9781d85cb223..e075d03ab630 100644 --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -7,6 +7,7 @@ #include <string.h> #include <linux/pkt_cls.h> +#include <netinet/tcp.h> #include <test_progs.h> diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c index 3b9dbf7433f0..7c9b62e971f1 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c @@ -2,8 +2,8 @@ /* Copyright (c) 2019 Facebook */ #include <test_progs.h> -/* x86-64 fits 55 JITed and 43 interpreted progs into half page */ -#define CNT 40 +/* that's kernel internal BPF_MAX_TRAMP_PROGS define */ +#define CNT 38 void test_fexit_stress(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/global_func_args.c b/tools/testing/selftests/bpf/prog_tests/global_func_args.c new file mode 100644 index 000000000000..8bcc2869102f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/global_func_args.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "test_progs.h" +#include "network_helpers.h" + +static __u32 duration; + +static void test_global_func_args0(struct bpf_object *obj) +{ + int err, i, map_fd, actual_value; + const char *map_name = "values"; + + map_fd = bpf_find_map(__func__, obj, map_name); + if (CHECK(map_fd < 0, "bpf_find_map", "cannot find BPF map %s: %s\n", + map_name, strerror(errno))) + return; + + struct { + const char *descr; + int expected_value; + } tests[] = { + {"passing NULL pointer", 0}, + {"returning value", 1}, + {"reading local variable", 100 }, + {"writing local variable", 101 }, + {"reading global variable", 42 }, + {"writing global variable", 43 }, + {"writing to pointer-to-pointer", 1 }, + }; + + for (i = 0; i < ARRAY_SIZE(tests); ++i) { + const int expected_value = tests[i].expected_value; + + err = bpf_map_lookup_elem(map_fd, &i, &actual_value); + + CHECK(err || actual_value != expected_value, tests[i].descr, + "err %d result %d expected %d\n", err, actual_value, expected_value); + } +} + +void test_global_func_args(void) +{ + const char *file = 
"./test_global_func_args.o"; + __u32 retval; + struct bpf_object *obj; + int err, prog_fd; + + err = bpf_prog_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd); + if (CHECK(err, "load program", "error %d loading %s\n", err, file)) + return; + + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, &retval, &duration); + CHECK(err || retval, "pass global func args run", + "err %d errno %d retval %d duration %d\n", + err, errno, retval, duration); + + test_global_func_args0(obj); + + bpf_object__close(obj); +} diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c index 50796b651f72..5bc53d53d86e 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c @@ -21,9 +21,34 @@ static int trigger_module_test_read(int read_sz) return 0; } +static int trigger_module_test_write(int write_sz) +{ + int fd, err; + char *buf = malloc(write_sz); + + if (!buf) + return -ENOMEM; + + memset(buf, 'a', write_sz); + buf[write_sz-1] = '\0'; + + fd = open("/sys/kernel/bpf_testmod", O_WRONLY); + err = -errno; + if (CHECK(fd < 0, "testmod_file_open", "failed: %d\n", err)) { + free(buf); + return err; + } + + write(fd, buf, write_sz); + close(fd); + free(buf); + return 0; +} + void test_module_attach(void) { const int READ_SZ = 456; + const int WRITE_SZ = 457; struct test_module_attach* skel; struct test_module_attach__bss *bss; int err; @@ -48,8 +73,10 @@ void test_module_attach(void) /* trigger tracepoint */ ASSERT_OK(trigger_module_test_read(READ_SZ), "trigger_read"); + ASSERT_OK(trigger_module_test_write(WRITE_SZ), "trigger_write"); ASSERT_EQ(bss->raw_tp_read_sz, READ_SZ, "raw_tp"); + ASSERT_EQ(bss->raw_tp_bare_write_sz, WRITE_SZ, "raw_tp_bare"); ASSERT_EQ(bss->tp_btf_read_sz, READ_SZ, "tp_btf"); ASSERT_EQ(bss->fentry_read_sz, READ_SZ, "fentry"); ASSERT_EQ(bss->fentry_manual_read_sz, READ_SZ, "fentry_manual"); diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index e74dc501b27f..31a3114906e2 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -1,85 +1,87 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Carlos Neira [email protected] */ + +#define _GNU_SOURCE #include <test_progs.h> +#include "test_ns_current_pid_tgid.skel.h" #include <sys/stat.h> #include <sys/types.h> #include <unistd.h> #include <sys/syscall.h> +#include <sched.h> +#include <sys/wait.h> +#include <sys/mount.h> +#include <sys/fcntl.h> -struct bss { - __u64 dev; - __u64 ino; - __u64 pid_tgid; - __u64 user_pid_tgid; -}; +#define STACK_SIZE (1024 * 1024) +static char child_stack[STACK_SIZE]; -void test_ns_current_pid_tgid(void) +static int test_current_pid_tgid(void *args) { - const char *probe_name = "raw_tracepoint/sys_enter"; - const char *file = "test_ns_current_pid_tgid.o"; - int err, key = 0, duration = 0; - struct bpf_link *link = NULL; - struct bpf_program *prog; - struct bpf_map *bss_map; - struct bpf_object *obj; - struct bss bss; + struct test_ns_current_pid_tgid__bss *bss; + struct test_ns_current_pid_tgid *skel; + int err = -1, duration = 0; + pid_t tgid, pid; struct stat st; - __u64 id; - - obj = bpf_object__open_file(file, NULL); - if (CHECK(IS_ERR(obj), "obj_open", "err %ld\n", PTR_ERR(obj))) - return; - err = bpf_object__load(obj); - if (CHECK(err, "obj_load", "err %d errno %d\n", 
err, errno)) + skel = test_ns_current_pid_tgid__open_and_load(); + if (CHECK(!skel, "skel_open_load", "failed to load skeleton\n")) goto cleanup; - bss_map = bpf_object__find_map_by_name(obj, "test_ns_.bss"); - if (CHECK(!bss_map, "find_bss_map", "failed\n")) + pid = syscall(SYS_gettid); + tgid = getpid(); + + err = stat("/proc/self/ns/pid", &st); + if (CHECK(err, "stat", "failed /proc/self/ns/pid: %d\n", err)) goto cleanup; - prog = bpf_object__find_program_by_title(obj, probe_name); - if (CHECK(!prog, "find_prog", "prog '%s' not found\n", - probe_name)) + bss = skel->bss; + bss->dev = st.st_dev; + bss->ino = st.st_ino; + bss->user_pid = 0; + bss->user_tgid = 0; + + err = test_ns_current_pid_tgid__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) goto cleanup; - memset(&bss, 0, sizeof(bss)); - pid_t tid = syscall(SYS_gettid); - pid_t pid = getpid(); + /* trigger tracepoint */ + usleep(1); + ASSERT_EQ(bss->user_pid, pid, "pid"); + ASSERT_EQ(bss->user_tgid, tgid, "tgid"); + err = 0; - id = (__u64) tid << 32 | pid; - bss.user_pid_tgid = id; +cleanup: + test_ns_current_pid_tgid__destroy(skel); - if (CHECK_FAIL(stat("/proc/self/ns/pid", &st))) { - perror("Failed to stat /proc/self/ns/pid"); - goto cleanup; - } + return err; +} - bss.dev = st.st_dev; - bss.ino = st.st_ino; +static void test_ns_current_pid_tgid_new_ns(void) +{ + int wstatus, duration = 0; + pid_t cpid; - err = bpf_map_update_elem(bpf_map__fd(bss_map), &key, &bss, 0); - if (CHECK(err, "setting_bss", "failed to set bss : %d\n", err)) - goto cleanup; + /* Create a process in a new namespace, this process + * will be the init process of this new namespace hence will be pid 1. + */ + cpid = clone(test_current_pid_tgid, child_stack + STACK_SIZE, + CLONE_NEWPID | SIGCHLD, NULL); - link = bpf_program__attach_raw_tracepoint(prog, "sys_enter"); - if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", - PTR_ERR(link))) { - link = NULL; - goto cleanup; - } + if (CHECK(cpid == -1, "clone", strerror(errno))) + return; - /* trigger some syscalls */ - usleep(1); + if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", strerror(errno))) + return; - err = bpf_map_lookup_elem(bpf_map__fd(bss_map), &key, &bss); - if (CHECK(err, "set_bss", "failed to get bss : %d\n", err)) - goto cleanup; + if (CHECK(WEXITSTATUS(wstatus) != 0, "newns_pidtgid", "failed")) + return; +} - if (CHECK(id != bss.pid_tgid, "Compare user pid/tgid vs. 
bpf pid/tgid", - "User pid/tgid %llu BPF pid/tgid %llu\n", id, bss.pid_tgid)) - goto cleanup; -cleanup: - bpf_link__destroy(link); - bpf_object__close(obj); +void test_ns_current_pid_tgid(void) +{ + if (test__start_subtest("ns_current_pid_tgid_root_ns")) + test_current_pid_tgid(NULL); + if (test__start_subtest("ns_current_pid_tgid_new_ns")) + test_ns_current_pid_tgid_new_ns(); } diff --git a/tools/testing/selftests/bpf/prog_tests/recursion.c b/tools/testing/selftests/bpf/prog_tests/recursion.c new file mode 100644 index 000000000000..0e378d63fe18 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/recursion.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <test_progs.h> +#include "recursion.skel.h" + +void test_recursion(void) +{ + struct bpf_prog_info prog_info = {}; + __u32 prog_info_len = sizeof(prog_info); + struct recursion *skel; + int key = 0; + int err; + + skel = recursion__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = recursion__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + + ASSERT_EQ(skel->bss->pass1, 0, "pass1 == 0"); + bpf_map_lookup_elem(bpf_map__fd(skel->maps.hash1), &key, 0); + ASSERT_EQ(skel->bss->pass1, 1, "pass1 == 1"); + bpf_map_lookup_elem(bpf_map__fd(skel->maps.hash1), &key, 0); + ASSERT_EQ(skel->bss->pass1, 2, "pass1 == 2"); + + ASSERT_EQ(skel->bss->pass2, 0, "pass2 == 0"); + bpf_map_lookup_elem(bpf_map__fd(skel->maps.hash2), &key, 0); + ASSERT_EQ(skel->bss->pass2, 1, "pass2 == 1"); + bpf_map_lookup_elem(bpf_map__fd(skel->maps.hash2), &key, 0); + ASSERT_EQ(skel->bss->pass2, 2, "pass2 == 2"); + + err = bpf_obj_get_info_by_fd(bpf_program__fd(skel->progs.on_lookup), + &prog_info, &prog_info_len); + if (!ASSERT_OK(err, "get_prog_info")) + goto out; + ASSERT_EQ(prog_info.recursion_misses, 2, "recursion_misses"); +out: + recursion__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/socket_cookie.c b/tools/testing/selftests/bpf/prog_tests/socket_cookie.c new file mode 100644 index 000000000000..232db28dde18 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/socket_cookie.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Google LLC. 
+// Copyright (c) 2018 Facebook + +#include <test_progs.h> +#include "socket_cookie_prog.skel.h" +#include "network_helpers.h" + +static int duration; + +struct socket_cookie { + __u64 cookie_key; + __u32 cookie_value; +}; + +void test_socket_cookie(void) +{ + int server_fd = 0, client_fd = 0, cgroup_fd = 0, err = 0; + socklen_t addr_len = sizeof(struct sockaddr_in6); + struct socket_cookie_prog *skel; + __u32 cookie_expected_value; + struct sockaddr_in6 addr; + struct socket_cookie val; + + skel = socket_cookie_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + cgroup_fd = test__join_cgroup("/socket_cookie"); + if (CHECK(cgroup_fd < 0, "join_cgroup", "cgroup creation failed\n")) + goto out; + + skel->links.set_cookie = bpf_program__attach_cgroup( + skel->progs.set_cookie, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.set_cookie, "prog_attach")) + goto close_cgroup_fd; + + skel->links.update_cookie_sockops = bpf_program__attach_cgroup( + skel->progs.update_cookie_sockops, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.update_cookie_sockops, "prog_attach")) + goto close_cgroup_fd; + + skel->links.update_cookie_tracing = bpf_program__attach( + skel->progs.update_cookie_tracing); + if (!ASSERT_OK_PTR(skel->links.update_cookie_tracing, "prog_attach")) + goto close_cgroup_fd; + + server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (CHECK(server_fd < 0, "start_server", "errno %d\n", errno)) + goto close_cgroup_fd; + + client_fd = connect_to_fd(server_fd, 0); + if (CHECK(client_fd < 0, "connect_to_fd", "errno %d\n", errno)) + goto close_server_fd; + + err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.socket_cookies), + &client_fd, &val); + if (!ASSERT_OK(err, "map_lookup(socket_cookies)")) + goto close_client_fd; + + err = getsockname(client_fd, (struct sockaddr *)&addr, &addr_len); + if (!ASSERT_OK(err, "getsockname")) + goto close_client_fd; + + cookie_expected_value = (ntohs(addr.sin6_port) << 8) | 0xFF; + ASSERT_EQ(val.cookie_value, cookie_expected_value, "cookie_value"); + +close_client_fd: + close(client_fd); +close_server_fd: + close(server_fd); +close_cgroup_fd: + close(cgroup_fd); +out: + socket_cookie_prog__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 85f73261fab0..b8b48cac2ac3 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Cloudflare #include <error.h> +#include <netinet/tcp.h> #include "test_progs.h" #include "test_skmsg_load_helpers.skel.h" diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index b25c9c45c148..d5b44b135c00 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -2,6 +2,12 @@ #include <test_progs.h> #include "cgroup_helpers.h" +#include <linux/tcp.h> + +#ifndef SOL_TCP +#define SOL_TCP IPPROTO_TCP +#endif + #define SOL_CUSTOM 0xdeadbeef static int getsetsockopt(void) @@ -11,6 +17,7 @@ static int getsetsockopt(void) char u8[4]; __u32 u32; char cc[16]; /* TCP_CA_NAME_MAX */ + struct tcp_zerocopy_receive zc; } buf = {}; socklen_t optlen; char *big_buf = NULL; @@ -154,6 +161,27 @@ static int getsetsockopt(void) goto err; } + /* TCP_ZEROCOPY_RECEIVE triggers */ + memset(&buf, 0, sizeof(buf)); + optlen = sizeof(buf.zc); + err = getsockopt(fd, 
SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen); + if (err) { + log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d", + err, errno); + goto err; + } + + memset(&buf, 0, sizeof(buf)); + buf.zc.address = 12345; /* rejected by BPF */ + optlen = sizeof(buf.zc); + errno = 0; + err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen); + if (errno != EPERM) { + log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d", + err, errno); + goto err; + } + free(big_buf); close(fd); return 0; diff --git a/tools/testing/selftests/bpf/prog_tests/stack_var_off.c b/tools/testing/selftests/bpf/prog_tests/stack_var_off.c new file mode 100644 index 000000000000..2ce9deefa59c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/stack_var_off.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include "test_stack_var_off.skel.h" + +/* Test read and writes to the stack performed with offsets that are not + * statically known. + */ +void test_stack_var_off(void) +{ + int duration = 0; + struct test_stack_var_off *skel; + + skel = test_stack_var_off__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) + return; + + /* Give pid to bpf prog so it doesn't trigger for anyone else. */ + skel->bss->test_pid = getpid(); + /* Initialize the probe's input. */ + skel->bss->input[0] = 2; + skel->bss->input[1] = 42; /* This will be returned in probe_res. */ + + if (!ASSERT_OK(test_stack_var_off__attach(skel), "skel_attach")) + goto cleanup; + + /* Trigger probe. */ + usleep(1); + + if (CHECK(skel->bss->probe_res != 42, "check_probe_res", + "wrong probe res: %d\n", skel->bss->probe_res)) + goto cleanup; + +cleanup: + test_stack_var_off__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c index 32e4348b714b..7e13129f593a 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c +++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c @@ -61,6 +61,14 @@ void test_test_global_funcs(void) { "test_global_func6.o" , "modified ctx ptr R2" }, { "test_global_func7.o" , "foo() doesn't return scalar" }, { "test_global_func8.o" }, + { "test_global_func9.o" }, + { "test_global_func10.o", "invalid indirect read from stack" }, + { "test_global_func11.o", "Caller passes invalid args into func#1" }, + { "test_global_func12.o", "invalid mem access 'mem_or_null'" }, + { "test_global_func13.o", "Caller passes invalid args into func#1" }, + { "test_global_func14.o", "reference type('FWD S') size cannot be determined" }, + { "test_global_func15.o", "At program exit the register R0 has value" }, + { "test_global_func16.o", "invalid indirect read from stack" }, }; libbpf_print_fn_t old_print_fn = NULL; int err, i, duration = 0; diff --git a/tools/testing/selftests/bpf/prog_tests/test_ima.c b/tools/testing/selftests/bpf/prog_tests/test_ima.c index 61fca681d524..b54bc0c351b7 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_ima.c +++ b/tools/testing/selftests/bpf/prog_tests/test_ima.c @@ -9,6 +9,7 @@ #include <unistd.h> #include <sys/wait.h> #include <test_progs.h> +#include <linux/ring_buffer.h> #include "ima.skel.h" @@ -31,9 +32,18 @@ static int run_measured_process(const char *measured_dir, u32 *monitored_pid) return -EINVAL; } +static u64 ima_hash_from_bpf; + +static int process_sample(void *ctx, void *data, size_t len) +{ + ima_hash_from_bpf = *((u64 *)data); + return 0; +} + void test_test_ima(void) { char 
measured_dir_template[] = "/tmp/ima_measuredXXXXXX"; + struct ring_buffer *ringbuf; const char *measured_dir; char cmd[256]; @@ -44,6 +54,11 @@ void test_test_ima(void) if (CHECK(!skel, "skel_load", "skeleton failed\n")) goto close_prog; + ringbuf = ring_buffer__new(bpf_map__fd(skel->maps.ringbuf), + process_sample, NULL, NULL); + if (!ASSERT_OK_PTR(ringbuf, "ringbuf")) + goto close_prog; + err = ima__attach(skel); if (CHECK(err, "attach", "attach failed: %d\n", err)) goto close_prog; @@ -60,11 +75,9 @@ void test_test_ima(void) if (CHECK(err, "run_measured_process", "err = %d\n", err)) goto close_clean; - CHECK(skel->data->ima_hash_ret < 0, "ima_hash_ret", - "ima_hash_ret = %ld\n", skel->data->ima_hash_ret); - - CHECK(skel->bss->ima_hash == 0, "ima_hash", - "ima_hash = %lu\n", skel->bss->ima_hash); + err = ring_buffer__consume(ringbuf); + ASSERT_EQ(err, 1, "num_samples_or_err"); + ASSERT_NEQ(ima_hash_from_bpf, 0, "ima_hash"); close_clean: snprintf(cmd, sizeof(cmd), "./ima_setup.sh cleanup %s", measured_dir); diff --git a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c index 3bfcf00c0a67..d2c16eaae367 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c @@ -113,7 +113,7 @@ static bool check_syscall_operations(int map_fd, int obj_fd) void test_test_local_storage(void) { - char tmp_dir_path[64] = "/tmp/local_storageXXXXXX"; + char tmp_dir_path[] = "/tmp/local_storageXXXXXX"; int err, serv_sk = -1, task_fd = -1, rm_fd = -1; struct local_storage *skel = NULL; char tmp_exec_path[64]; diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c index 781c8d11604b..f3022d934e2d 100644 --- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c +++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c @@ -4,7 +4,7 @@ #include <sys/prctl.h> #include <test_progs.h> -#define MAX_TRAMP_PROGS 40 +#define MAX_TRAMP_PROGS 38 struct inst { struct bpf_object *obj; @@ -52,7 +52,7 @@ void test_trampoline_count(void) struct bpf_link *link; char comm[16] = {}; - /* attach 'allowed' 40 trampoline programs */ + /* attach 'allowed' trampoline programs */ for (i = 0; i < MAX_TRAMP_PROGS; i++) { obj = bpf_object__open_file(object, NULL); if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) { diff --git a/tools/testing/selftests/bpf/progs/atomic_bounds.c b/tools/testing/selftests/bpf/progs/atomic_bounds.c new file mode 100644 index 000000000000..e5fff7fc7f8f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/atomic_bounds.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <stdbool.h> + +#ifdef ENABLE_ATOMICS_TESTS +bool skip_tests __attribute((__section__(".data"))) = false; +#else +bool skip_tests = true; +#endif + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(sub, int x) +{ +#ifdef ENABLE_ATOMICS_TESTS + int a = 0; + int b = __sync_fetch_and_add(&a, 1); + /* b is certainly 0 here. Can the verifier tell? 
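+	 * __sync_fetch_and_add() returns the value 'a' held before the add,
+	 * i.e. 0, so the loop below is dead code -- but only a verifier that
+	 * tracks value bounds through the atomic's return value can prove
+	 * that and prune the loop instead of rejecting it as unbounded.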
*/ + while (b) + continue; +#endif + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bind_perm.c b/tools/testing/selftests/bpf/progs/bind_perm.c new file mode 100644 index 000000000000..7bd2a027025d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bind_perm.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/stddef.h> +#include <linux/bpf.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +static __always_inline int bind_prog(struct bpf_sock_addr *ctx, int family) +{ + struct bpf_sock *sk; + + sk = ctx->sk; + if (!sk) + return 0; + + if (sk->family != family) + return 0; + + if (ctx->type != SOCK_STREAM) + return 0; + + /* Return 1 OR'ed with the first bit set to indicate + * that CAP_NET_BIND_SERVICE should be bypassed. + */ + if (ctx->user_port == bpf_htons(111)) + return (1 | 2); + + return 1; +} + +SEC("cgroup/bind4") +int bind_v4_prog(struct bpf_sock_addr *ctx) +{ + return bind_prog(ctx, AF_INET); +} + +SEC("cgroup/bind6") +int bind_v6_prog(struct bpf_sock_addr *ctx) +{ + return bind_prog(ctx, AF_INET6); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h index 6a1255465fd6..3d83b185c4bc 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter.h +++ b/tools/testing/selftests/bpf/progs/bpf_iter.h @@ -7,6 +7,7 @@ #define bpf_iter__netlink bpf_iter__netlink___not_used #define bpf_iter__task bpf_iter__task___not_used #define bpf_iter__task_file bpf_iter__task_file___not_used +#define bpf_iter__task_vma bpf_iter__task_vma___not_used #define bpf_iter__tcp bpf_iter__tcp___not_used #define tcp6_sock tcp6_sock___not_used #define bpf_iter__udp bpf_iter__udp___not_used @@ -26,6 +27,7 @@ #undef bpf_iter__netlink #undef bpf_iter__task #undef bpf_iter__task_file +#undef bpf_iter__task_vma #undef bpf_iter__tcp #undef tcp6_sock #undef bpf_iter__udp @@ -67,6 +69,12 @@ struct bpf_iter__task_file { struct file *file; } __attribute__((preserve_access_index)); +struct bpf_iter__task_vma { + struct bpf_iter_meta *meta; + struct task_struct *task; + struct vm_area_struct *vma; +} __attribute__((preserve_access_index)); + struct bpf_iter__bpf_map { struct bpf_iter_meta *meta; struct bpf_map *map; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c new file mode 100644 index 000000000000..11d1aa37cf11 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "bpf_iter.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +/* Copied from mm.h */ +#define VM_READ 0x00000001 +#define VM_WRITE 0x00000002 +#define VM_EXEC 0x00000004 +#define VM_MAYSHARE 0x00000080 + +/* Copied from kdev_t.h */ +#define MINORBITS 20 +#define MINORMASK ((1U << MINORBITS) - 1) +#define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS)) +#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK)) + +#define D_PATH_BUF_SIZE 1024 +char d_path_buf[D_PATH_BUF_SIZE] = {}; +__u32 pid = 0; + +SEC("iter/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx) +{ + struct vm_area_struct *vma = ctx->vma; + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + struct file *file; + char perm_str[] = "----"; + + if (task == (void *)0 || vma == (void *)0) + return 0; + + file = vma->vm_file; + if 
(task->tgid != pid) + return 0; + perm_str[0] = (vma->vm_flags & VM_READ) ? 'r' : '-'; + perm_str[1] = (vma->vm_flags & VM_WRITE) ? 'w' : '-'; + perm_str[2] = (vma->vm_flags & VM_EXEC) ? 'x' : '-'; + perm_str[3] = (vma->vm_flags & VM_MAYSHARE) ? 's' : 'p'; + BPF_SEQ_PRINTF(seq, "%08llx-%08llx %s ", vma->vm_start, vma->vm_end, perm_str); + + if (file) { + __u32 dev = file->f_inode->i_sb->s_dev; + + bpf_d_path(&file->f_path, d_path_buf, D_PATH_BUF_SIZE); + + BPF_SEQ_PRINTF(seq, "%08llx ", vma->vm_pgoff << 12); + BPF_SEQ_PRINTF(seq, "%02x:%02x %u", MAJOR(dev), MINOR(dev), + file->f_inode->i_ino); + BPF_SEQ_PRINTF(seq, "\t%s\n", d_path_buf); + } else { + BPF_SEQ_PRINTF(seq, "%08llx 00:00 0\n", 0ULL); + } + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/connect_force_port4.c b/tools/testing/selftests/bpf/progs/connect_force_port4.c index 7396308677a3..a979aaef2a76 100644 --- a/tools/testing/selftests/bpf/progs/connect_force_port4.c +++ b/tools/testing/selftests/bpf/progs/connect_force_port4.c @@ -10,6 +10,8 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> +#include <bpf_sockopt_helpers.h> + char _license[] SEC("license") = "GPL"; int _version SEC("version") = 1; @@ -58,6 +60,9 @@ int connect4(struct bpf_sock_addr *ctx) SEC("cgroup/getsockname4") int getsockname4(struct bpf_sock_addr *ctx) { + if (!get_set_sk_priority(ctx)) + return 1; + /* Expose local server as 1.2.3.4:60000 to client. */ if (ctx->user_port == bpf_htons(60123)) { ctx->user_ip4 = bpf_htonl(0x01020304); @@ -71,6 +76,9 @@ int getpeername4(struct bpf_sock_addr *ctx) { struct svc_addr *orig; + if (!get_set_sk_priority(ctx)) + return 1; + /* Expose service 1.2.3.4:60000 as peer instead of backend. */ if (ctx->user_port == bpf_htons(60123)) { orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0); diff --git a/tools/testing/selftests/bpf/progs/connect_force_port6.c b/tools/testing/selftests/bpf/progs/connect_force_port6.c index c1a2b555e9ad..afc8f1c5a9d6 100644 --- a/tools/testing/selftests/bpf/progs/connect_force_port6.c +++ b/tools/testing/selftests/bpf/progs/connect_force_port6.c @@ -9,6 +9,8 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> +#include <bpf_sockopt_helpers.h> + char _license[] SEC("license") = "GPL"; int _version SEC("version") = 1; @@ -63,6 +65,9 @@ int connect6(struct bpf_sock_addr *ctx) SEC("cgroup/getsockname6") int getsockname6(struct bpf_sock_addr *ctx) { + if (!get_set_sk_priority(ctx)) + return 1; + /* Expose local server as [fc00::1]:60000 to client. */ if (ctx->user_port == bpf_htons(60124)) { ctx->user_ip6[0] = bpf_htonl(0xfc000000); @@ -79,6 +84,9 @@ int getpeername6(struct bpf_sock_addr *ctx) { struct svc_addr *orig; + if (!get_set_sk_priority(ctx)) + return 1; + /* Expose service [fc00::1]:60000 as peer instead of backend. 
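	 * (service_mapping holds the original service address saved for this
	 *  socket at connect() time, so the client is shown the service it
	 *  asked for rather than the backend it was redirected to.)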
*/ if (ctx->user_port == bpf_htons(60124)) { orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0); diff --git a/tools/testing/selftests/bpf/progs/ima.c b/tools/testing/selftests/bpf/progs/ima.c index 86b21aff4bc5..96060ff4ffc6 100644 --- a/tools/testing/selftests/bpf/progs/ima.c +++ b/tools/testing/selftests/bpf/progs/ima.c @@ -9,20 +9,37 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> -long ima_hash_ret = -1; -u64 ima_hash = 0; u32 monitored_pid = 0; +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 12); +} ringbuf SEC(".maps"); + char _license[] SEC("license") = "GPL"; SEC("lsm.s/bprm_committed_creds") -int BPF_PROG(ima, struct linux_binprm *bprm) +void BPF_PROG(ima, struct linux_binprm *bprm) { - u32 pid = bpf_get_current_pid_tgid() >> 32; + u64 ima_hash = 0; + u64 *sample; + int ret; + u32 pid; + + pid = bpf_get_current_pid_tgid() >> 32; + if (pid == monitored_pid) { + ret = bpf_ima_inode_hash(bprm->file->f_inode, &ima_hash, + sizeof(ima_hash)); + if (ret < 0 || ima_hash == 0) + return; + + sample = bpf_ringbuf_reserve(&ringbuf, sizeof(u64), 0); + if (!sample) + return; - if (pid == monitored_pid) - ima_hash_ret = bpf_ima_inode_hash(bprm->file->f_inode, - &ima_hash, sizeof(ima_hash)); + *sample = ima_hash; + bpf_ringbuf_submit(sample, 0); + } - return 0; + return; } diff --git a/tools/testing/selftests/bpf/progs/lsm.c b/tools/testing/selftests/bpf/progs/lsm.c index ff4d343b94b5..33694ef8acfa 100644 --- a/tools/testing/selftests/bpf/progs/lsm.c +++ b/tools/testing/selftests/bpf/progs/lsm.c @@ -30,6 +30,53 @@ struct { __type(value, __u64); } lru_hash SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} percpu_array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} percpu_hash SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_PERCPU_HASH); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} lru_percpu_hash SEC(".maps"); + +struct inner_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, __u64); +} inner_map SEC(".maps"); + +struct outer_arr { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 1); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __array(values, struct inner_map); +} outer_arr SEC(".maps") = { + .values = { [0] = &inner_map }, +}; + +struct outer_hash { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 1); + __uint(key_size, sizeof(int)); + __array(values, struct inner_map); +} outer_hash SEC(".maps") = { + .values = { [0] = &inner_map }, +}; + char _license[] SEC("license") = "GPL"; int monitored_pid = 0; @@ -61,6 +108,7 @@ SEC("lsm.s/bprm_committed_creds") int BPF_PROG(test_void_hook, struct linux_binprm *bprm) { __u32 pid = bpf_get_current_pid_tgid() >> 32; + struct inner_map *inner_map; char args[64]; __u32 key = 0; __u64 *value; @@ -80,6 +128,27 @@ int BPF_PROG(test_void_hook, struct linux_binprm *bprm) value = bpf_map_lookup_elem(&lru_hash, &key); if (value) *value = 0; + value = bpf_map_lookup_elem(&percpu_array, &key); + if (value) + *value = 0; + value = bpf_map_lookup_elem(&percpu_hash, &key); + if (value) + *value = 0; + value = bpf_map_lookup_elem(&lru_percpu_hash, &key); + if (value) + *value = 0; + inner_map = bpf_map_lookup_elem(&outer_arr, &key); + if (inner_map) { + value = bpf_map_lookup_elem(inner_map, 
&key); + if (value) + *value = 0; + } + inner_map = bpf_map_lookup_elem(&outer_hash, &key); + if (inner_map) { + value = bpf_map_lookup_elem(inner_map, &key); + if (value) + *value = 0; + } return 0; } diff --git a/tools/testing/selftests/bpf/progs/recursion.c b/tools/testing/selftests/bpf/progs/recursion.c new file mode 100644 index 000000000000..49f679375b9d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/recursion.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, int); + __type(value, long); +} hash1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, int); + __type(value, long); +} hash2 SEC(".maps"); + +int pass1 = 0; +int pass2 = 0; + +SEC("fentry/__htab_map_lookup_elem") +int BPF_PROG(on_lookup, struct bpf_map *map) +{ + int key = 0; + + if (map == (void *)&hash1) { + pass1++; + return 0; + } + if (map == (void *)&hash2) { + pass2++; + /* htab_map_gen_lookup() will inline below call + * into direct call to __htab_map_lookup_elem() + */ + bpf_map_lookup_elem(&hash2, &key); + return 0; + } + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/recvmsg4_prog.c b/tools/testing/selftests/bpf/progs/recvmsg4_prog.c new file mode 100644 index 000000000000..3d1ae8b3402f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/recvmsg4_prog.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/stddef.h> +#include <linux/bpf.h> +#include <linux/in.h> +#include <sys/socket.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#include <bpf_sockopt_helpers.h> + +#define SERV4_IP 0xc0a801feU /* 192.168.1.254 */ +#define SERV4_PORT 4040 + +SEC("cgroup/recvmsg4") +int recvmsg4_prog(struct bpf_sock_addr *ctx) +{ + struct bpf_sock *sk; + __u32 user_ip4; + __u16 user_port; + + sk = ctx->sk; + if (!sk) + return 1; + + if (sk->family != AF_INET) + return 1; + + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) + return 1; + + if (!get_set_sk_priority(ctx)) + return 1; + + ctx->user_ip4 = bpf_htonl(SERV4_IP); + ctx->user_port = bpf_htons(SERV4_PORT); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/recvmsg6_prog.c b/tools/testing/selftests/bpf/progs/recvmsg6_prog.c new file mode 100644 index 000000000000..27dfb21b21b4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/recvmsg6_prog.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/stddef.h> +#include <linux/bpf.h> +#include <linux/in6.h> +#include <sys/socket.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#include <bpf_sockopt_helpers.h> + +#define SERV6_IP_0 0xfaceb00c /* face:b00c:1234:5678::abcd */ +#define SERV6_IP_1 0x12345678 +#define SERV6_IP_2 0x00000000 +#define SERV6_IP_3 0x0000abcd +#define SERV6_PORT 6060 + +SEC("cgroup/recvmsg6") +int recvmsg6_prog(struct bpf_sock_addr *ctx) +{ + struct bpf_sock *sk; + __u32 user_ip4; + __u16 user_port; + + sk = ctx->sk; + if (!sk) + return 1; + + if (sk->family != AF_INET6) + return 1; + + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) + return 1; + + if (!get_set_sk_priority(ctx)) + return 1; + + ctx->user_ip6[0] = bpf_htonl(SERV6_IP_0); + ctx->user_ip6[1] = bpf_htonl(SERV6_IP_1); + ctx->user_ip6[2] = bpf_htonl(SERV6_IP_2); + ctx->user_ip6[3] 
= bpf_htonl(SERV6_IP_3); + ctx->user_port = bpf_htons(SERV6_PORT); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sendmsg4_prog.c b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c index 092d9da536f3..ac5abc34cde8 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg4_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c @@ -8,6 +8,8 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> +#include <bpf_sockopt_helpers.h> + #define SRC1_IP4 0xAC100001U /* 172.16.0.1 */ #define SRC2_IP4 0x00000000U #define SRC_REWRITE_IP4 0x7f000004U @@ -21,9 +23,14 @@ int _version SEC("version") = 1; SEC("cgroup/sendmsg4") int sendmsg_v4_prog(struct bpf_sock_addr *ctx) { + int prio; + if (ctx->type != SOCK_DGRAM) return 0; + if (!get_set_sk_priority(ctx)) + return 0; + /* Rewrite source. */ if (ctx->msg_src_ip4 == bpf_htonl(SRC1_IP4) || ctx->msg_src_ip4 == bpf_htonl(SRC2_IP4)) { diff --git a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c index 255a432bc163..24694b1a8d82 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c @@ -8,6 +8,8 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> +#include <bpf_sockopt_helpers.h> + #define SRC_REWRITE_IP6_0 0 #define SRC_REWRITE_IP6_1 0 #define SRC_REWRITE_IP6_2 0 @@ -28,6 +30,9 @@ int sendmsg_v6_prog(struct bpf_sock_addr *ctx) if (ctx->type != SOCK_DGRAM) return 0; + if (!get_set_sk_priority(ctx)) + return 0; + /* Rewrite source. */ if (ctx->msg_src_ip6[3] == bpf_htonl(1) || ctx->msg_src_ip6[3] == bpf_htonl(0)) { diff --git a/tools/testing/selftests/bpf/progs/socket_cookie_prog.c b/tools/testing/selftests/bpf/progs/socket_cookie_prog.c index 0cb5656a22b0..35630a5aaf5f 100644 --- a/tools/testing/selftests/bpf/progs/socket_cookie_prog.c +++ b/tools/testing/selftests/bpf/progs/socket_cookie_prog.c @@ -1,11 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2018 Facebook -#include <linux/bpf.h> -#include <sys/socket.h> +#include "vmlinux.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> +#include <bpf/bpf_tracing.h> + +#define AF_INET6 10 struct socket_cookie { __u64 cookie_key; @@ -19,6 +21,14 @@ struct { __type(value, struct socket_cookie); } socket_cookies SEC(".maps"); +/* + * These three programs get executed in a row on connect() syscalls. The + * userspace side of the test creates a client socket, issues a connect() on it + * and then checks that the local storage associated with this socket has: + * cookie_value == local_port << 8 | 0xFF + * The different parts of this cookie_value are appended by those hooks if they + * all agree on the output of bpf_get_socket_cookie(). 
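+ *
+ * E.g. with a local port of 8080 (0x1f90) -- any port works, 8080 is
+ * just for illustration:
+ *   connect6:       cookie_value  = 0x0f
+ *   sockops:        cookie_value |= 0x1f90 << 8  -> 0x1f900f
+ *   fexit tracing:  cookie_value |= 0xf0         -> 0x1f90ff
+ * i.e. (8080 << 8) | 0xff, which is exactly what the userspace check
+ * recomputes from getsockname().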
+ */ SEC("cgroup/connect6") int set_cookie(struct bpf_sock_addr *ctx) { @@ -32,16 +42,16 @@ int set_cookie(struct bpf_sock_addr *ctx) if (!p) return 1; - p->cookie_value = 0xFF; + p->cookie_value = 0xF; p->cookie_key = bpf_get_socket_cookie(ctx); return 1; } SEC("sockops") -int update_cookie(struct bpf_sock_ops *ctx) +int update_cookie_sockops(struct bpf_sock_ops *ctx) { - struct bpf_sock *sk; + struct bpf_sock *sk = ctx->sk; struct socket_cookie *p; if (ctx->family != AF_INET6) @@ -50,21 +60,40 @@ int update_cookie(struct bpf_sock_ops *ctx) if (ctx->op != BPF_SOCK_OPS_TCP_CONNECT_CB) return 1; - if (!ctx->sk) + if (!sk) return 1; - p = bpf_sk_storage_get(&socket_cookies, ctx->sk, 0, 0); + p = bpf_sk_storage_get(&socket_cookies, sk, 0, 0); if (!p) return 1; if (p->cookie_key != bpf_get_socket_cookie(ctx)) return 1; - p->cookie_value = (ctx->local_port << 8) | p->cookie_value; + p->cookie_value |= (ctx->local_port << 8); return 1; } -int _version SEC("version") = 1; +SEC("fexit/inet_stream_connect") +int BPF_PROG(update_cookie_tracing, struct socket *sock, + struct sockaddr *uaddr, int addr_len, int flags) +{ + struct socket_cookie *p; + + if (uaddr->sa_family != AF_INET6) + return 0; + + p = bpf_sk_storage_get(&socket_cookies, sock->sk, 0, 0); + if (!p) + return 0; + + if (p->cookie_key != bpf_get_socket_cookie(sock->sk)) + return 0; + + p->cookie_value |= 0xF0; + + return 0; +} char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index 712df7b49cb1..d3597f81e6e9 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include <string.h> -#include <netinet/in.h> -#include <netinet/tcp.h> +#include <linux/tcp.h> #include <linux/bpf.h> +#include <netinet/in.h> #include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; @@ -12,6 +12,10 @@ __u32 _version SEC("version") = 1; #define PAGE_SIZE 4096 #endif +#ifndef SOL_TCP +#define SOL_TCP IPPROTO_TCP +#endif + #define SOL_CUSTOM 0xdeadbeef struct sockopt_sk { @@ -57,6 +61,21 @@ int _getsockopt(struct bpf_sockopt *ctx) return 1; } + if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) { + /* Verify that TCP_ZEROCOPY_RECEIVE triggers. + * It has a custom implementation for performance + * reasons. 
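+	 *
+	 * The userspace side (prog_tests/sockopt_sk.c) first issues a
+	 * well-formed getsockopt() that must succeed, then sets zc.address
+	 * to a nonzero value, which the check below turns into EPERM.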
+ */ + + if (optval + sizeof(struct tcp_zerocopy_receive) > optval_end) + return 0; /* EPERM, bounds check */ + + if (((struct tcp_zerocopy_receive *)optval)->address != 0) + return 0; /* EPERM, unexpected data */ + + return 1; + } + if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { if (optval + 1 > optval_end) return 0; /* EPERM, bounds check */ diff --git a/tools/testing/selftests/bpf/progs/test_check_mtu.c b/tools/testing/selftests/bpf/progs/test_check_mtu.c new file mode 100644 index 000000000000..b7787b43f9db --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_check_mtu.c @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Jesper Dangaard Brouer */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <linux/if_ether.h> + +#include <stddef.h> +#include <stdint.h> + +char _license[] SEC("license") = "GPL"; + +/* Userspace will update with MTU it can see on device */ +static volatile const int GLOBAL_USER_MTU; +static volatile const __u32 GLOBAL_USER_IFINDEX; + +/* BPF-prog will update these with MTU values it can see */ +__u32 global_bpf_mtu_xdp = 0; +__u32 global_bpf_mtu_tc = 0; + +SEC("xdp") +int xdp_use_helper_basic(struct xdp_md *ctx) +{ + __u32 mtu_len = 0; + + if (bpf_check_mtu(ctx, 0, &mtu_len, 0, 0)) + return XDP_ABORTED; + + return XDP_PASS; +} + +SEC("xdp") +int xdp_use_helper(struct xdp_md *ctx) +{ + int retval = XDP_PASS; /* Expected retval on successful test */ + __u32 mtu_len = 0; + __u32 ifindex = 0; + int delta = 0; + + /* When ifindex is zero, save net_device lookup and use ctx netdev */ + if (GLOBAL_USER_IFINDEX > 0) + ifindex = GLOBAL_USER_IFINDEX; + + if (bpf_check_mtu(ctx, ifindex, &mtu_len, delta, 0)) { + /* mtu_len is also valid when check fail */ + retval = XDP_ABORTED; + goto out; + } + + if (mtu_len != GLOBAL_USER_MTU) + retval = XDP_DROP; + +out: + global_bpf_mtu_xdp = mtu_len; + return retval; +} + +SEC("xdp") +int xdp_exceed_mtu(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + __u32 ifindex = GLOBAL_USER_IFINDEX; + __u32 data_len = data_end - data; + int retval = XDP_ABORTED; /* Fail */ + __u32 mtu_len = 0; + int delta; + int err; + + /* Exceed MTU with 1 via delta adjust */ + delta = GLOBAL_USER_MTU - (data_len - ETH_HLEN) + 1; + + err = bpf_check_mtu(ctx, ifindex, &mtu_len, delta, 0); + if (err) { + retval = XDP_PASS; /* Success in exceeding MTU check */ + if (err != BPF_MTU_CHK_RET_FRAG_NEEDED) + retval = XDP_DROP; + } + + global_bpf_mtu_xdp = mtu_len; + return retval; +} + +SEC("xdp") +int xdp_minus_delta(struct xdp_md *ctx) +{ + int retval = XDP_PASS; /* Expected retval on successful test */ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + __u32 ifindex = GLOBAL_USER_IFINDEX; + __u32 data_len = data_end - data; + __u32 mtu_len = 0; + int delta; + + /* Borderline test case: Minus delta exceeding packet length allowed */ + delta = -((data_len - ETH_HLEN) + 1); + + /* Minus length (adjusted via delta) still pass MTU check, other helpers + * are responsible for catching this, when doing actual size adjust + */ + if (bpf_check_mtu(ctx, ifindex, &mtu_len, delta, 0)) + retval = XDP_ABORTED; + + global_bpf_mtu_xdp = mtu_len; + return retval; +} + +SEC("classifier") +int tc_use_helper(struct __sk_buff *ctx) +{ + int retval = BPF_OK; /* Expected retval on successful test */ + __u32 mtu_len = 0; + int delta = 0; + + if (bpf_check_mtu(ctx, 0, &mtu_len, delta, 0)) { + retval = BPF_DROP; + goto out; + } + + if 
(mtu_len != GLOBAL_USER_MTU) + retval = BPF_REDIRECT; +out: + global_bpf_mtu_tc = mtu_len; + return retval; +} + +SEC("classifier") +int tc_exceed_mtu(struct __sk_buff *ctx) +{ + __u32 ifindex = GLOBAL_USER_IFINDEX; + int retval = BPF_DROP; /* Fail */ + __u32 skb_len = ctx->len; + __u32 mtu_len = 0; + int delta; + int err; + + /* Exceed MTU with 1 via delta adjust */ + delta = GLOBAL_USER_MTU - (skb_len - ETH_HLEN) + 1; + + err = bpf_check_mtu(ctx, ifindex, &mtu_len, delta, 0); + if (err) { + retval = BPF_OK; /* Success in exceeding MTU check */ + if (err != BPF_MTU_CHK_RET_FRAG_NEEDED) + retval = BPF_DROP; + } + + global_bpf_mtu_tc = mtu_len; + return retval; +} + +SEC("classifier") +int tc_exceed_mtu_da(struct __sk_buff *ctx) +{ + /* SKB Direct-Access variant */ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + __u32 ifindex = GLOBAL_USER_IFINDEX; + __u32 data_len = data_end - data; + int retval = BPF_DROP; /* Fail */ + __u32 mtu_len = 0; + int delta; + int err; + + /* Exceed MTU with 1 via delta adjust */ + delta = GLOBAL_USER_MTU - (data_len - ETH_HLEN) + 1; + + err = bpf_check_mtu(ctx, ifindex, &mtu_len, delta, 0); + if (err) { + retval = BPF_OK; /* Success in exceeding MTU check */ + if (err != BPF_MTU_CHK_RET_FRAG_NEEDED) + retval = BPF_DROP; + } + + global_bpf_mtu_tc = mtu_len; + return retval; +} + +SEC("classifier") +int tc_minus_delta(struct __sk_buff *ctx) +{ + int retval = BPF_OK; /* Expected retval on successful test */ + __u32 ifindex = GLOBAL_USER_IFINDEX; + __u32 skb_len = ctx->len; + __u32 mtu_len = 0; + int delta; + + /* Borderline test case: Minus delta exceeding packet length allowed */ + delta = -((skb_len - ETH_HLEN) + 1); + + /* Minus length (adjusted via delta) still pass MTU check, other helpers + * are responsible for catching this, when doing actual size adjust + */ + if (bpf_check_mtu(ctx, ifindex, &mtu_len, delta, 0)) + retval = BPF_DROP; + + global_bpf_mtu_xdp = mtu_len; + return retval; +} diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c index c9f8464996ea..3c1e042962e6 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -70,6 +70,7 @@ typedef struct { uint64_t errors_total_encap_adjust_failed; uint64_t errors_total_encap_buffer_too_small; uint64_t errors_total_redirect_loop; + uint64_t errors_total_encap_mtu_violate; } metrics_t; typedef enum { @@ -407,6 +408,7 @@ static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *e payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr); int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead; uint16_t proto = ETH_P_IP; + uint32_t mtu_len = 0; /* Loop protection: the inner packet's TTL is decremented as a safeguard * against any forwarding loop. 
As the only interesting field is the TTL @@ -479,6 +481,11 @@ static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *e } } + if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) { + metrics->errors_total_encap_mtu_violate++; + return TC_ACT_SHOT; + } + if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET, BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_NO_CSUM_RESET) || diff --git a/tools/testing/selftests/bpf/progs/test_global_func10.c b/tools/testing/selftests/bpf/progs/test_global_func10.c new file mode 100644 index 000000000000..61c2ae92ce41 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func10.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stddef.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct Small { + int x; +}; + +struct Big { + int x; + int y; +}; + +__noinline int foo(const struct Big *big) +{ + if (big == 0) + return 0; + + return bpf_get_prandom_u32() < big->y; +} + +SEC("cgroup_skb/ingress") +int test_cls(struct __sk_buff *skb) +{ + const struct Small small = {.x = skb->len }; + + return foo((struct Big *)&small) ? 1 : 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_global_func11.c b/tools/testing/selftests/bpf/progs/test_global_func11.c new file mode 100644 index 000000000000..28488047c849 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func11.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stddef.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct S { + int x; +}; + +__noinline int foo(const struct S *s) +{ + return s ? bpf_get_prandom_u32() < s->x : 0; +} + +SEC("cgroup_skb/ingress") +int test_cls(struct __sk_buff *skb) +{ + return foo(skb); +} diff --git a/tools/testing/selftests/bpf/progs/test_global_func12.c b/tools/testing/selftests/bpf/progs/test_global_func12.c new file mode 100644 index 000000000000..62343527cc59 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func12.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stddef.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct S { + int x; +}; + +__noinline int foo(const struct S *s) +{ + return bpf_get_prandom_u32() < s->x; +} + +SEC("cgroup_skb/ingress") +int test_cls(struct __sk_buff *skb) +{ + const struct S s = {.x = skb->len }; + + return foo(&s); +} diff --git a/tools/testing/selftests/bpf/progs/test_global_func13.c b/tools/testing/selftests/bpf/progs/test_global_func13.c new file mode 100644 index 000000000000..ff8897c1ac22 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func13.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stddef.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct S { + int x; +}; + +__noinline int foo(const struct S *s) +{ + if (s) + return bpf_get_prandom_u32() < s->x; + + return 0; +} + +SEC("cgroup_skb/ingress") +int test_cls(struct __sk_buff *skb) +{ + const struct S *s = (const struct S *)(0xbedabeda); + + return foo(s); +} diff --git a/tools/testing/selftests/bpf/progs/test_global_func14.c b/tools/testing/selftests/bpf/progs/test_global_func14.c new file mode 100644 index 000000000000..698c77199ebf --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func14.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stddef.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct S; + +__noinline int foo(const struct S *s) +{ + if (s) + return bpf_get_prandom_u32() < *(const int *) s; + + 
return 0; +} + +SEC("cgroup_skb/ingress") +int test_cls(struct __sk_buff *skb) +{ + + return foo(NULL); +} diff --git a/tools/testing/selftests/bpf/progs/test_global_func15.c b/tools/testing/selftests/bpf/progs/test_global_func15.c new file mode 100644 index 000000000000..c19c435988d5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func15.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stddef.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +__noinline int foo(unsigned int *v) +{ + if (v) + *v = bpf_get_prandom_u32(); + + return 0; +} + +SEC("cgroup_skb/ingress") +int test_cls(struct __sk_buff *skb) +{ + unsigned int v = 1; + + foo(&v); + + return v; +} diff --git a/tools/testing/selftests/bpf/progs/test_global_func16.c b/tools/testing/selftests/bpf/progs/test_global_func16.c new file mode 100644 index 000000000000..0312d1e8d8c0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func16.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stddef.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +__noinline int foo(int (*arr)[10]) +{ + if (arr) + return (*arr)[9]; + + return 0; +} + +SEC("cgroup_skb/ingress") +int test_cls(struct __sk_buff *skb) +{ + int array[10]; + + const int rv = foo(&array); + + return rv ? 1 : 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_global_func9.c b/tools/testing/selftests/bpf/progs/test_global_func9.c new file mode 100644 index 000000000000..bd233ddede98 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func9.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stddef.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct S { + int x; +}; + +struct C { + int x; + int y; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct S); +} map SEC(".maps"); + +enum E { + E_ITEM +}; + +static int global_data_x = 100; +static int volatile global_data_y = 500; + +__noinline int foo(const struct S *s) +{ + if (s) + return bpf_get_prandom_u32() < s->x; + + return 0; +} + +__noinline int bar(int *x) +{ + if (x) + *x &= bpf_get_prandom_u32(); + + return 0; +} +__noinline int baz(volatile int *x) +{ + if (x) + *x &= bpf_get_prandom_u32(); + + return 0; +} + +__noinline int qux(enum E *e) +{ + if (e) + return *e; + + return 0; +} + +__noinline int quux(int (*arr)[10]) +{ + if (arr) + return (*arr)[9]; + + return 0; +} + +__noinline int quuz(int **p) +{ + if (p) + *p = NULL; + + return 0; +} + +SEC("cgroup_skb/ingress") +int test_cls(struct __sk_buff *skb) +{ + int result = 0; + + { + const struct S s = {.x = skb->len }; + + result |= foo(&s); + } + + { + const __u32 key = 1; + const struct S *s = bpf_map_lookup_elem(&map, &key); + + result |= foo(s); + } + + { + const struct C c = {.x = skb->len, .y = skb->family }; + + result |= foo((const struct S *)&c); + } + + { + result |= foo(NULL); + } + + { + bar(&result); + bar(&global_data_x); + } + + { + result |= baz(&global_data_y); + } + + { + enum E e = E_ITEM; + + result |= qux(&e); + } + + { + int array[10] = {0}; + + result |= quux(&array); + } + + { + int *p; + + result |= quuz(&p); + } + + return result ? 
1 : 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_global_func_args.c b/tools/testing/selftests/bpf/progs/test_global_func_args.c new file mode 100644 index 000000000000..cae309538a9e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func_args.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> + +#include <bpf/bpf_helpers.h> + +struct S { + int v; +}; + +static volatile struct S global_variable; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 7); + __type(key, __u32); + __type(value, int); +} values SEC(".maps"); + +static void save_value(__u32 index, int value) +{ + bpf_map_update_elem(&values, &index, &value, 0); +} + +__noinline int foo(__u32 index, struct S *s) +{ + if (s) { + save_value(index, s->v); + return ++s->v; + } + + save_value(index, 0); + + return 1; +} + +__noinline int bar(__u32 index, volatile struct S *s) +{ + if (s) { + save_value(index, s->v); + return ++s->v; + } + + save_value(index, 0); + + return 1; +} + +__noinline int baz(struct S **s) +{ + if (s) + *s = 0; + + return 0; +} + +SEC("cgroup_skb/ingress") +int test_cls(struct __sk_buff *skb) +{ + __u32 index = 0; + + { + const int v = foo(index++, 0); + + save_value(index++, v); + } + + { + struct S s = { .v = 100 }; + + foo(index++, &s); + save_value(index++, s.v); + } + + { + global_variable.v = 42; + bar(index++, &global_variable); + save_value(index++, global_variable.v); + } + + { + struct S v, *p = &v; + + baz(&p); + save_value(index++, !p); + } + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c index efd1e287ac17..bd37ceec5587 100644 --- a/tools/testing/selftests/bpf/progs/test_module_attach.c +++ b/tools/testing/selftests/bpf/progs/test_module_attach.c @@ -17,6 +17,16 @@ int BPF_PROG(handle_raw_tp, return 0; } +__u32 raw_tp_bare_write_sz = 0; + +SEC("raw_tp/bpf_testmod_test_write_bare") +int BPF_PROG(handle_raw_tp_bare, + struct task_struct *task, struct bpf_testmod_test_write_ctx *write_ctx) +{ + raw_tp_bare_write_sz = BPF_CORE_READ(write_ctx, len); + return 0; +} + __u32 tp_btf_read_sz = 0; SEC("tp_btf/bpf_testmod_test_read") diff --git a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c index 1dca70a6de2f..0763d49f9c42 100644 --- a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c @@ -5,31 +5,21 @@ #include <stdint.h> #include <bpf/bpf_helpers.h> -static volatile struct { - __u64 dev; - __u64 ino; - __u64 pid_tgid; - __u64 user_pid_tgid; -} res; +__u64 user_pid = 0; +__u64 user_tgid = 0; +__u64 dev = 0; +__u64 ino = 0; -SEC("raw_tracepoint/sys_enter") -int trace(void *ctx) +SEC("tracepoint/syscalls/sys_enter_nanosleep") +int handler(const void *ctx) { - __u64 ns_pid_tgid, expected_pid; struct bpf_pidns_info nsdata; - __u32 key = 0; - if (bpf_get_ns_current_pid_tgid(res.dev, res.ino, &nsdata, - sizeof(struct bpf_pidns_info))) + if (bpf_get_ns_current_pid_tgid(dev, ino, &nsdata, sizeof(struct bpf_pidns_info))) return 0; - ns_pid_tgid = (__u64)nsdata.tgid << 32 | nsdata.pid; - expected_pid = res.user_pid_tgid; - - if (expected_pid != ns_pid_tgid) - return 0; - - res.pid_tgid = ns_pid_tgid; + user_pid = nsdata.pid; + user_tgid = nsdata.tgid; return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_stack_var_off.c 
b/tools/testing/selftests/bpf/progs/test_stack_var_off.c new file mode 100644 index 000000000000..665e6ae09d37 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_stack_var_off.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +int probe_res; + +char input[4] = {}; +int test_pid; + +SEC("tracepoint/syscalls/sys_enter_nanosleep") +int probe(void *ctx) +{ + /* This BPF program performs variable-offset reads and writes on a + * stack-allocated buffer. + */ + char stack_buf[16]; + unsigned long len; + unsigned long last; + + if ((bpf_get_current_pid_tgid() >> 32) != test_pid) + return 0; + + /* Copy the input to the stack. */ + __builtin_memcpy(stack_buf, input, 4); + + /* The first byte in the buffer indicates the length. */ + len = stack_buf[0] & 0xf; + last = (len - 1) & 0xf; + + /* Append something to the buffer. The offset where we write is not + * statically known; this is a variable-offset stack write. + */ + stack_buf[len] = 42; + + /* Index into the buffer at an unknown offset. This is a + * variable-offset stack read. + * + * Note that if it wasn't for the preceding variable-offset write, this + * read would be rejected because the stack slot cannot be verified as + * being initialized. With the preceding variable-offset write, the + * stack slot still cannot be verified, but the write inhibits the + * respective check on the reasoning that, if there was a + * variable-offset write to a higher-or-equal spot, we're probably + * reading what we just wrote. + */ + probe_res = stack_buf[last]; + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_current_pid_tgid_new_ns.c b/tools/testing/selftests/bpf/test_current_pid_tgid_new_ns.c deleted file mode 100644 index ec53b1ef90d2..000000000000 --- a/tools/testing/selftests/bpf/test_current_pid_tgid_new_ns.c +++ /dev/null @@ -1,160 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2020 Carlos Neira [email protected] */ -#define _GNU_SOURCE -#include <sys/stat.h> -#include <sys/types.h> -#include <unistd.h> -#include <sys/syscall.h> -#include <sched.h> -#include <sys/wait.h> -#include <sys/mount.h> -#include "test_progs.h" - -#define CHECK_NEWNS(condition, tag, format...) 
({ \ - int __ret = !!(condition); \ - if (__ret) { \ - printf("%s:FAIL:%s ", __func__, tag); \ - printf(format); \ - } else { \ - printf("%s:PASS:%s\n", __func__, tag); \ - } \ - __ret; \ -}) - -struct bss { - __u64 dev; - __u64 ino; - __u64 pid_tgid; - __u64 user_pid_tgid; -}; - -int main(int argc, char **argv) -{ - pid_t pid; - int exit_code = 1; - struct stat st; - - printf("Testing bpf_get_ns_current_pid_tgid helper in new ns\n"); - - if (stat("/proc/self/ns/pid", &st)) { - perror("stat failed on /proc/self/ns/pid ns\n"); - printf("%s:FAILED\n", argv[0]); - return exit_code; - } - - if (CHECK_NEWNS(unshare(CLONE_NEWPID | CLONE_NEWNS), - "unshare CLONE_NEWPID | CLONE_NEWNS", "error errno=%d\n", errno)) - return exit_code; - - pid = fork(); - if (pid == -1) { - perror("Fork() failed\n"); - printf("%s:FAILED\n", argv[0]); - return exit_code; - } - - if (pid > 0) { - int status; - - usleep(5); - waitpid(pid, &status, 0); - return 0; - } else { - - pid = fork(); - if (pid == -1) { - perror("Fork() failed\n"); - printf("%s:FAILED\n", argv[0]); - return exit_code; - } - - if (pid > 0) { - int status; - waitpid(pid, &status, 0); - return 0; - } else { - if (CHECK_NEWNS(mount("none", "/proc", NULL, MS_PRIVATE|MS_REC, NULL), - "Unmounting proc", "Cannot umount proc! errno=%d\n", errno)) - return exit_code; - - if (CHECK_NEWNS(mount("proc", "/proc", "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL), - "Mounting proc", "Cannot mount proc! errno=%d\n", errno)) - return exit_code; - - const char *probe_name = "raw_tracepoint/sys_enter"; - const char *file = "test_ns_current_pid_tgid.o"; - struct bpf_link *link = NULL; - struct bpf_program *prog; - struct bpf_map *bss_map; - struct bpf_object *obj; - int exit_code = 1; - int err, key = 0; - struct bss bss; - struct stat st; - __u64 id; - - obj = bpf_object__open_file(file, NULL); - if (CHECK_NEWNS(IS_ERR(obj), "obj_open", "err %ld\n", PTR_ERR(obj))) - return exit_code; - - err = bpf_object__load(obj); - if (CHECK_NEWNS(err, "obj_load", "err %d errno %d\n", err, errno)) - goto cleanup; - - bss_map = bpf_object__find_map_by_name(obj, "test_ns_.bss"); - if (CHECK_NEWNS(!bss_map, "find_bss_map", "failed\n")) - goto cleanup; - - prog = bpf_object__find_program_by_title(obj, probe_name); - if (CHECK_NEWNS(!prog, "find_prog", "prog '%s' not found\n", - probe_name)) - goto cleanup; - - memset(&bss, 0, sizeof(bss)); - pid_t tid = syscall(SYS_gettid); - pid_t pid = getpid(); - - id = (__u64) tid << 32 | pid; - bss.user_pid_tgid = id; - - if (CHECK_NEWNS(stat("/proc/self/ns/pid", &st), - "stat new ns", "Failed to stat /proc/self/ns/pid errno=%d\n", errno)) - goto cleanup; - - bss.dev = st.st_dev; - bss.ino = st.st_ino; - - err = bpf_map_update_elem(bpf_map__fd(bss_map), &key, &bss, 0); - if (CHECK_NEWNS(err, "setting_bss", "failed to set bss : %d\n", err)) - goto cleanup; - - link = bpf_program__attach_raw_tracepoint(prog, "sys_enter"); - if (CHECK_NEWNS(IS_ERR(link), "attach_raw_tp", "err %ld\n", - PTR_ERR(link))) { - link = NULL; - goto cleanup; - } - - /* trigger some syscalls */ - usleep(1); - - err = bpf_map_lookup_elem(bpf_map__fd(bss_map), &key, &bss); - if (CHECK_NEWNS(err, "set_bss", "failed to get bss : %d\n", err)) - goto cleanup; - - if (CHECK_NEWNS(id != bss.pid_tgid, "Compare user pid/tgid vs. 
bpf pid/tgid", - "User pid/tgid %llu BPF pid/tgid %llu\n", id, bss.pid_tgid)) - goto cleanup; - - exit_code = 0; - printf("%s:PASS\n", argv[0]); -cleanup: - if (!link) { - bpf_link__destroy(link); - link = NULL; - } - bpf_object__close(obj); - } - } - return 0; -} diff --git a/tools/testing/selftests/bpf/test_flow_dissector.c b/tools/testing/selftests/bpf/test_flow_dissector.c index 01f0c634d548..571cc076dd7d 100644 --- a/tools/testing/selftests/bpf/test_flow_dissector.c +++ b/tools/testing/selftests/bpf/test_flow_dissector.c @@ -503,7 +503,7 @@ static int do_rx(int fd) if (rbuf != cfg_payload_char) error(1, 0, "recv: payload mismatch"); num++; - }; + } return num; } diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 213628ee721c..6396932b97e2 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -390,7 +390,7 @@ static void unload_bpf_testmod(void) return; } fprintf(env.stderr, "Failed to unload bpf_testmod.ko from kernel: %d\n", -errno); - exit(1); + return; } if (env.verbosity > VERBOSE_NONE) fprintf(stdout, "Successfully unloaded bpf_testmod.ko.\n"); diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index e49e2fdde942..f7c2fd89d01a 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -16,7 +16,6 @@ typedef __u16 __sum16; #include <linux/if_packet.h> #include <linux/ip.h> #include <linux/ipv6.h> -#include <netinet/tcp.h> #include <linux/filter.h> #include <linux/perf_event.h> #include <linux/socket.h> diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index dcb83ab02919..aa3f185fcb89 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -31,6 +31,8 @@ #define CONNECT6_PROG_PATH "./connect6_prog.o" #define SENDMSG4_PROG_PATH "./sendmsg4_prog.o" #define SENDMSG6_PROG_PATH "./sendmsg6_prog.o" +#define RECVMSG4_PROG_PATH "./recvmsg4_prog.o" +#define RECVMSG6_PROG_PATH "./recvmsg6_prog.o" #define BIND4_PROG_PATH "./bind4_prog.o" #define BIND6_PROG_PATH "./bind6_prog.o" @@ -94,10 +96,10 @@ static int sendmsg_deny_prog_load(const struct sock_addr_test *test); static int recvmsg_allow_prog_load(const struct sock_addr_test *test); static int recvmsg_deny_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); -static int recvmsg4_rw_asm_prog_load(const struct sock_addr_test *test); +static int recvmsg4_rw_c_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); -static int recvmsg6_rw_asm_prog_load(const struct sock_addr_test *test); +static int recvmsg6_rw_c_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test); @@ -573,8 +575,8 @@ static struct sock_addr_test tests[] = { LOAD_REJECT, }, { - "recvmsg4: rewrite IP & port (asm)", - recvmsg4_rw_asm_prog_load, + "recvmsg4: rewrite IP & port (C)", + recvmsg4_rw_c_prog_load, BPF_CGROUP_UDP4_RECVMSG, BPF_CGROUP_UDP4_RECVMSG, AF_INET, @@ -587,8 +589,8 @@ static struct sock_addr_test tests[] = { SUCCESS, }, { - "recvmsg6: rewrite IP & port 
(asm)", - recvmsg6_rw_asm_prog_load, + "recvmsg6: rewrite IP & port (C)", + recvmsg6_rw_c_prog_load, BPF_CGROUP_UDP6_RECVMSG, BPF_CGROUP_UDP6_RECVMSG, AF_INET6, @@ -786,45 +788,9 @@ static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn)); } -static int recvmsg4_rw_asm_prog_load(const struct sock_addr_test *test) +static int recvmsg4_rw_c_prog_load(const struct sock_addr_test *test) { - struct sockaddr_in src4_rw_addr; - - if (mk_sockaddr(AF_INET, SERV4_IP, SERV4_PORT, - (struct sockaddr *)&src4_rw_addr, - sizeof(src4_rw_addr)) == -1) - return -1; - - struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - - /* if (sk.family == AF_INET && */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 6), - - /* sk.type == SOCK_DGRAM) { */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, type)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 4), - - /* user_ip4 = src4_rw_addr.sin_addr */ - BPF_MOV32_IMM(BPF_REG_7, src4_rw_addr.sin_addr.s_addr), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_ip4)), - - /* user_port = src4_rw_addr.sin_port */ - BPF_MOV32_IMM(BPF_REG_7, src4_rw_addr.sin_port), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_port)), - /* } */ - - /* return 1 */ - BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_EXIT_INSN(), - }; - - return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn)); + return load_path(test, RECVMSG4_PROG_PATH); } static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test) @@ -890,37 +856,9 @@ static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test) return sendmsg6_rw_dst_asm_prog_load(test, SERV6_REWRITE_IP); } -static int recvmsg6_rw_asm_prog_load(const struct sock_addr_test *test) +static int recvmsg6_rw_c_prog_load(const struct sock_addr_test *test) { - struct sockaddr_in6 src6_rw_addr; - - if (mk_sockaddr(AF_INET6, SERV6_IP, SERV6_PORT, - (struct sockaddr *)&src6_rw_addr, - sizeof(src6_rw_addr)) == -1) - return -1; - - struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - - /* if (sk.family == AF_INET6) { */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET6, 10), - - STORE_IPV6(user_ip6, src6_rw_addr.sin6_addr.s6_addr32), - - /* user_port = dst6_rw_addr.sin6_port */ - BPF_MOV32_IMM(BPF_REG_7, src6_rw_addr.sin6_port), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_port)), - /* } */ - - /* return 1 */ - BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_EXIT_INSN(), - }; - - return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn)); + return load_path(test, RECVMSG6_PROG_PATH); } static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test) diff --git a/tools/testing/selftests/bpf/test_socket_cookie.c b/tools/testing/selftests/bpf/test_socket_cookie.c deleted file mode 100644 index ca7ca87e91aa..000000000000 --- a/tools/testing/selftests/bpf/test_socket_cookie.c +++ /dev/null @@ -1,208 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2018 Facebook - -#include <string.h> -#include <unistd.h> - -#include <arpa/inet.h> -#include <netinet/in.h> -#include <sys/types.h> -#include <sys/socket.h> - -#include <bpf/bpf.h> -#include <bpf/libbpf.h> - -#include "bpf_rlimit.h" -#include "cgroup_helpers.h" - -#define CG_PATH "/foo" 
-#define SOCKET_COOKIE_PROG "./socket_cookie_prog.o" - -struct socket_cookie { - __u64 cookie_key; - __u32 cookie_value; -}; - -static int start_server(void) -{ - struct sockaddr_in6 addr; - int fd; - - fd = socket(AF_INET6, SOCK_STREAM, 0); - if (fd == -1) { - log_err("Failed to create server socket"); - goto out; - } - - memset(&addr, 0, sizeof(addr)); - addr.sin6_family = AF_INET6; - addr.sin6_addr = in6addr_loopback; - addr.sin6_port = 0; - - if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) == -1) { - log_err("Failed to bind server socket"); - goto close_out; - } - - if (listen(fd, 128) == -1) { - log_err("Failed to listen on server socket"); - goto close_out; - } - - goto out; - -close_out: - close(fd); - fd = -1; -out: - return fd; -} - -static int connect_to_server(int server_fd) -{ - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int fd; - - fd = socket(AF_INET6, SOCK_STREAM, 0); - if (fd == -1) { - log_err("Failed to create client socket"); - goto out; - } - - if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { - log_err("Failed to get server addr"); - goto close_out; - } - - if (connect(fd, (const struct sockaddr *)&addr, len) == -1) { - log_err("Fail to connect to server"); - goto close_out; - } - - goto out; - -close_out: - close(fd); - fd = -1; -out: - return fd; -} - -static int validate_map(struct bpf_map *map, int client_fd) -{ - __u32 cookie_expected_value; - struct sockaddr_in6 addr; - socklen_t len = sizeof(addr); - struct socket_cookie val; - int err = 0; - int map_fd; - - if (!map) { - log_err("Map not found in BPF object"); - goto err; - } - - map_fd = bpf_map__fd(map); - - err = bpf_map_lookup_elem(map_fd, &client_fd, &val); - - err = getsockname(client_fd, (struct sockaddr *)&addr, &len); - if (err) { - log_err("Can't get client local addr"); - goto out; - } - - cookie_expected_value = (ntohs(addr.sin6_port) << 8) | 0xFF; - if (val.cookie_value != cookie_expected_value) { - log_err("Unexpected value in map: %x != %x", val.cookie_value, - cookie_expected_value); - goto err; - } - - goto out; -err: - err = -1; -out: - return err; -} - -static int run_test(int cgfd) -{ - enum bpf_attach_type attach_type; - struct bpf_prog_load_attr attr; - struct bpf_program *prog; - struct bpf_object *pobj; - const char *prog_name; - int server_fd = -1; - int client_fd = -1; - int prog_fd = -1; - int err = 0; - - memset(&attr, 0, sizeof(attr)); - attr.file = SOCKET_COOKIE_PROG; - attr.prog_type = BPF_PROG_TYPE_UNSPEC; - attr.prog_flags = BPF_F_TEST_RND_HI32; - - err = bpf_prog_load_xattr(&attr, &pobj, &prog_fd); - if (err) { - log_err("Failed to load %s", attr.file); - goto out; - } - - bpf_object__for_each_program(prog, pobj) { - prog_name = bpf_program__section_name(prog); - - if (libbpf_attach_type_by_name(prog_name, &attach_type)) - goto err; - - err = bpf_prog_attach(bpf_program__fd(prog), cgfd, attach_type, - BPF_F_ALLOW_OVERRIDE); - if (err) { - log_err("Failed to attach prog %s", prog_name); - goto out; - } - } - - server_fd = start_server(); - if (server_fd == -1) - goto err; - - client_fd = connect_to_server(server_fd); - if (client_fd == -1) - goto err; - - if (validate_map(bpf_map__next(NULL, pobj), client_fd)) - goto err; - - goto out; -err: - err = -1; -out: - close(client_fd); - close(server_fd); - bpf_object__close(pobj); - printf("%s\n", err ? 
"FAILED" : "PASSED"); - return err; -} - -int main(int argc, char **argv) -{ - int cgfd = -1; - int err = 0; - - cgfd = cgroup_setup_and_join(CG_PATH); - if (cgfd < 0) - goto err; - - if (run_test(cgfd)) - goto err; - - goto out; -err: - err = -1; -out: - close(cgfd); - cleanup_cgroup_environment(); - return err; -} diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index f8569f04064b..58b5a349d3ba 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -88,6 +88,10 @@ struct bpf_test { int fixup_map_event_output[MAX_FIXUPS]; int fixup_map_reuseport_array[MAX_FIXUPS]; int fixup_map_ringbuf[MAX_FIXUPS]; + /* Expected verifier log output for result REJECT or VERBOSE_ACCEPT. + * Can be a tab-separated sequence of expected strings. An empty string + * means no log verification. + */ const char *errstr; const char *errstr_unpriv; uint32_t insn_processed; @@ -297,6 +301,78 @@ static void bpf_fill_scale(struct bpf_test *self) } } +static int bpf_fill_torturous_jumps_insn_1(struct bpf_insn *insn) +{ + unsigned int len = 259, hlen = 128; + int i; + + insn[0] = BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32); + for (i = 1; i <= hlen; i++) { + insn[i] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, i, hlen); + insn[i + hlen] = BPF_JMP_A(hlen - i); + } + insn[len - 2] = BPF_MOV64_IMM(BPF_REG_0, 1); + insn[len - 1] = BPF_EXIT_INSN(); + + return len; +} + +static int bpf_fill_torturous_jumps_insn_2(struct bpf_insn *insn) +{ + unsigned int len = 4100, jmp_off = 2048; + int i, j; + + insn[0] = BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32); + for (i = 1; i <= jmp_off; i++) { + insn[i] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, i, jmp_off); + } + insn[i++] = BPF_JMP_A(jmp_off); + for (; i <= jmp_off * 2 + 1; i+=16) { + for (j = 0; j < 16; j++) { + insn[i + j] = BPF_JMP_A(16 - j - 1); + } + } + + insn[len - 2] = BPF_MOV64_IMM(BPF_REG_0, 2); + insn[len - 1] = BPF_EXIT_INSN(); + + return len; +} + +static void bpf_fill_torturous_jumps(struct bpf_test *self) +{ + struct bpf_insn *insn = self->fill_insns; + int i = 0; + + switch (self->retval) { + case 1: + self->prog_len = bpf_fill_torturous_jumps_insn_1(insn); + return; + case 2: + self->prog_len = bpf_fill_torturous_jumps_insn_2(insn); + return; + case 3: + /* main */ + insn[i++] = BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 4); + insn[i++] = BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 262); + insn[i++] = BPF_ST_MEM(BPF_B, BPF_REG_10, -32, 0); + insn[i++] = BPF_MOV64_IMM(BPF_REG_0, 3); + insn[i++] = BPF_EXIT_INSN(); + + /* subprog 1 */ + i += bpf_fill_torturous_jumps_insn_1(insn + i); + + /* subprog 2 */ + i += bpf_fill_torturous_jumps_insn_2(insn + i); + + self->prog_len = i; + return; + default: + self->prog_len = 0; + break; + } +} + /* BPF_SK_LOOKUP contains 13 instructions, if you need to fix up maps */ #define BPF_SK_LOOKUP(func) \ /* struct bpf_sock_tuple tuple = {} */ \ @@ -923,13 +999,19 @@ static int do_prog_test_run(int fd_prog, bool unpriv, uint32_t expected_val, return 0; } +/* Returns true if every part of exp (tab-separated) appears in log, in order. + * + * If exp is an empty string, returns true. 
+ */ static bool cmp_str_seq(const char *log, const char *exp) { - char needle[80]; + char needle[200]; const char *p, *q; int len; do { + if (!strlen(exp)) + break; p = strchr(exp, '\t'); if (!p) p = exp + strlen(exp); @@ -943,7 +1025,7 @@ static bool cmp_str_seq(const char *log, const char *exp) needle[len] = 0; q = strstr(log, needle); if (!q) { - printf("FAIL\nUnexpected verifier log in successful load!\n" + printf("FAIL\nUnexpected verifier log!\n" "EXP: %s\nRES:\n", needle); return false; } @@ -1058,7 +1140,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, printf("FAIL\nUnexpected success to load!\n"); goto fail_log; } - if (!expected_err || !strstr(bpf_vlog, expected_err)) { + if (!expected_err || !cmp_str_seq(bpf_vlog, expected_err)) { printf("FAIL\nUnexpected error message!\n\tEXP: %s\n\tRES: %s\n", expected_err, bpf_vlog); goto fail_log; diff --git a/tools/testing/selftests/bpf/test_xdp_redirect.sh b/tools/testing/selftests/bpf/test_xdp_redirect.sh index dd80f0c84afb..c033850886f4 100755 --- a/tools/testing/selftests/bpf/test_xdp_redirect.sh +++ b/tools/testing/selftests/bpf/test_xdp_redirect.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # Create 2 namespaces with two veth peers, and # forward packets in-between using generic XDP # @@ -57,12 +57,8 @@ test_xdp_redirect() ip link set dev veth1 $xdpmode obj test_xdp_redirect.o sec redirect_to_222 &> /dev/null ip link set dev veth2 $xdpmode obj test_xdp_redirect.o sec redirect_to_111 &> /dev/null - ip netns exec ns1 ping -c 1 10.1.1.22 &> /dev/null - local ret1=$? - ip netns exec ns2 ping -c 1 10.1.1.11 &> /dev/null - local ret2=$? - - if [ $ret1 -eq 0 -a $ret2 -eq 0 ]; then + if ip netns exec ns1 ping -c 1 10.1.1.22 &> /dev/null && + ip netns exec ns2 ping -c 1 10.1.1.11 &> /dev/null; then echo "selftests: test_xdp_redirect $xdpmode [PASS]"; else ret=1 diff --git a/tools/testing/selftests/bpf/verifier/atomic_and.c b/tools/testing/selftests/bpf/verifier/atomic_and.c index 600bc5e0f143..1bdc8e6684f7 100644 --- a/tools/testing/selftests/bpf/verifier/atomic_and.c +++ b/tools/testing/selftests/bpf/verifier/atomic_and.c @@ -7,7 +7,7 @@ BPF_MOV64_IMM(BPF_REG_1, 0x011), BPF_ATOMIC_OP(BPF_DW, BPF_AND, BPF_REG_10, BPF_REG_1, -8), /* if (val != 0x010) exit(2); */ - BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0x010, 2), BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), diff --git a/tools/testing/selftests/bpf/verifier/atomic_bounds.c b/tools/testing/selftests/bpf/verifier/atomic_bounds.c new file mode 100644 index 000000000000..e82183e4914f --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/atomic_bounds.c @@ -0,0 +1,27 @@ +{ + "BPF_ATOMIC bounds propagation, mem->reg", + .insns = { + /* a = 0; */ + /* + * Note this is implemented with two separate instructions, + * where you might think one would suffice: + * + * BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + * + * This is because BPF_ST_MEM doesn't seem to set the stack slot + * type to 0 when storing an immediate. + */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + /* b = atomic_fetch_add(&a, 1); */ + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD | BPF_FETCH, BPF_REG_10, BPF_REG_1, -8), + /* Verifier should be able to tell that this infinite loop isn't reachable. 
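+ * The BPF_FETCH flag makes the atomic add write the value previously in
+ * memory (0 here) back into src_reg, so the verifier can prove R1 == 0 and
+ * that the back-edge below is never taken.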
*/ + /* if (b) while (true) continue; */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, -1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "back-edge", +}, diff --git a/tools/testing/selftests/bpf/verifier/atomic_or.c b/tools/testing/selftests/bpf/verifier/atomic_or.c index ebe6e51455ba..70f982e1f9f0 100644 --- a/tools/testing/selftests/bpf/verifier/atomic_or.c +++ b/tools/testing/selftests/bpf/verifier/atomic_or.c @@ -7,7 +7,7 @@ BPF_MOV64_IMM(BPF_REG_1, 0x011), BPF_ATOMIC_OP(BPF_DW, BPF_OR, BPF_REG_10, BPF_REG_1, -8), /* if (val != 0x111) exit(2); */ - BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0x111, 2), BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), diff --git a/tools/testing/selftests/bpf/verifier/atomic_xor.c b/tools/testing/selftests/bpf/verifier/atomic_xor.c index eb791e547b47..74e8fb46694b 100644 --- a/tools/testing/selftests/bpf/verifier/atomic_xor.c +++ b/tools/testing/selftests/bpf/verifier/atomic_xor.c @@ -7,7 +7,7 @@ BPF_MOV64_IMM(BPF_REG_1, 0x011), BPF_ATOMIC_OP(BPF_DW, BPF_XOR, BPF_REG_10, BPF_REG_1, -8), /* if (val != 0x101) exit(2); */ - BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0x101, 2), BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), diff --git a/tools/testing/selftests/bpf/verifier/basic_stack.c b/tools/testing/selftests/bpf/verifier/basic_stack.c index b56f8117c09d..f995777dddb3 100644 --- a/tools/testing/selftests/bpf/verifier/basic_stack.c +++ b/tools/testing/selftests/bpf/verifier/basic_stack.c @@ -4,7 +4,7 @@ BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), BPF_EXIT_INSN(), }, - .errstr = "invalid stack", + .errstr = "invalid write to stack", .result = REJECT, }, { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index c4f5d909e58a..eb888c8479c3 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -1228,7 +1228,7 @@ .prog_type = BPF_PROG_TYPE_XDP, .fixup_map_hash_8b = { 23 }, .result = REJECT, - .errstr = "invalid read from stack off -16+0 size 8", + .errstr = "invalid read from stack R7 off=-16 size=8", }, { "calls: two calls that receive map_value via arg=ptr_stack_of_caller. 
test1", @@ -1958,7 +1958,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 6 }, - .errstr = "invalid indirect read from stack off -8+0 size 8", + .errstr = "invalid indirect read from stack R2 off -8+0 size 8", .result = REJECT, .prog_type = BPF_PROG_TYPE_XDP, }, diff --git a/tools/testing/selftests/bpf/verifier/const_or.c b/tools/testing/selftests/bpf/verifier/const_or.c index 6c214c58e8d4..0719b0ddec04 100644 --- a/tools/testing/selftests/bpf/verifier/const_or.c +++ b/tools/testing/selftests/bpf/verifier/const_or.c @@ -23,7 +23,7 @@ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, - .errstr = "invalid stack type R1 off=-48 access_size=58", + .errstr = "invalid indirect access to stack R1 off=-48 size=58", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -54,7 +54,7 @@ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, - .errstr = "invalid stack type R1 off=-48 access_size=58", + .errstr = "invalid indirect access to stack R1 off=-48 size=58", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, diff --git a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c index 87c4e7900083..0ab7f1dfc97a 100644 --- a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c +++ b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c @@ -39,7 +39,7 @@ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, - .errstr = "invalid indirect read from stack off -64+0 size 64", + .errstr = "invalid indirect read from stack R1 off -64+0 size 64", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -59,7 +59,7 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr = "invalid stack type R1 off=-64 access_size=65", + .errstr = "invalid indirect access to stack R1 off=-64 size=65", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -136,7 +136,7 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr = "invalid stack type R1 off=-64 access_size=65", + .errstr = "invalid indirect access to stack R1 off=-64 size=65", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -156,7 +156,7 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr = "invalid stack type R1 off=-64 access_size=65", + .errstr = "invalid indirect access to stack R1 off=-64 size=65", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -194,7 +194,7 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr = "invalid indirect read from stack off -64+0 size 64", + .errstr = "invalid indirect read from stack R1 off -64+0 size 64", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -584,7 +584,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_EXIT_INSN(), }, - .errstr = "invalid indirect read from stack off -64+32 size 64", + .errstr = "invalid indirect read from stack R1 off -64+32 size 64", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, diff --git a/tools/testing/selftests/bpf/verifier/int_ptr.c b/tools/testing/selftests/bpf/verifier/int_ptr.c index ca3b4729df66..070893fb2900 100644 --- a/tools/testing/selftests/bpf/verifier/int_ptr.c +++ b/tools/testing/selftests/bpf/verifier/int_ptr.c @@ -27,7 +27,7 @@ }, .result = REJECT, .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL, - .errstr = "invalid indirect read from stack off -16+0 size 8", + .errstr = "invalid indirect read from stack R4 off -16+0 size 8", }, { "ARG_PTR_TO_LONG half-uninitialized", @@ -59,7 +59,7 @@ }, .result = REJECT, .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL, 
- .errstr = "invalid indirect read from stack off -16+4 size 8", + .errstr = "invalid indirect read from stack R4 off -16+4 size 8", }, { "ARG_PTR_TO_LONG misaligned", @@ -125,7 +125,7 @@ }, .result = REJECT, .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL, - .errstr = "invalid stack type R4 off=-4 access_size=8", + .errstr = "invalid indirect access to stack R4 off=-4 size=8", }, { "ARG_PTR_TO_LONG initialized", diff --git a/tools/testing/selftests/bpf/verifier/jit.c b/tools/testing/selftests/bpf/verifier/jit.c index c33adf344fae..df215e004566 100644 --- a/tools/testing/selftests/bpf/verifier/jit.c +++ b/tools/testing/selftests/bpf/verifier/jit.c @@ -105,3 +105,27 @@ .result = ACCEPT, .retval = 2, }, +{ + "jit: torturous jumps, imm8 nop jmp and pure jump padding", + .insns = { }, + .fill_helper = bpf_fill_torturous_jumps, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 1, +}, +{ + "jit: torturous jumps, imm32 nop jmp and jmp_cond padding", + .insns = { }, + .fill_helper = bpf_fill_torturous_jumps, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 2, +}, +{ + "jit: torturous jumps in subprog", + .insns = { }, + .fill_helper = bpf_fill_torturous_jumps, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 3, +}, diff --git a/tools/testing/selftests/bpf/verifier/raw_stack.c b/tools/testing/selftests/bpf/verifier/raw_stack.c index 193d9e87d5a9..cc8e8c3cdc03 100644 --- a/tools/testing/selftests/bpf/verifier/raw_stack.c +++ b/tools/testing/selftests/bpf/verifier/raw_stack.c @@ -11,7 +11,7 @@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid read from stack off -8+0 size 8", + .errstr = "invalid read from stack R6 off=-8 size=8", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { @@ -59,7 +59,7 @@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid stack type R3", + .errstr = "invalid zero-sized read", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { @@ -205,7 +205,7 @@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid stack type R3 off=-513 access_size=8", + .errstr = "invalid indirect access to stack R3 off=-513 size=8", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { @@ -221,7 +221,7 @@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid stack type R3 off=-1 access_size=8", + .errstr = "invalid indirect access to stack R3 off=-1 size=8", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { @@ -285,7 +285,7 @@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid stack type R3 off=-512 access_size=0", + .errstr = "invalid zero-sized read", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { diff --git a/tools/testing/selftests/bpf/verifier/stack_ptr.c b/tools/testing/selftests/bpf/verifier/stack_ptr.c index 8bfeb77c60bd..07eaa04412ae 100644 --- a/tools/testing/selftests/bpf/verifier/stack_ptr.c +++ b/tools/testing/selftests/bpf/verifier/stack_ptr.c @@ -44,7 +44,7 @@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid stack off=-79992 size=8", + .errstr = "invalid write to stack R1 off=-79992 size=8", .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", }, { @@ -57,7 +57,7 @@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid stack off=0 size=8", + .errstr = "invalid write to stack R1 off=0 size=8", }, { "PTR_TO_STACK check high 1", @@ -106,7 +106,7 @@ BPF_EXIT_INSN(), }, .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", - .errstr = "invalid stack off=0 size=1", + .errstr = "invalid write to stack R1 off=0 size=1", .result = REJECT, }, { @@ -119,7 +119,8 @@ BPF_EXIT_INSN(), }, .result = REJECT, - 
.errstr = "invalid stack off", + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .errstr = "invalid write to stack R1", }, { "PTR_TO_STACK check high 6", @@ -131,7 +132,8 @@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid stack off", + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .errstr = "invalid write to stack", }, { "PTR_TO_STACK check high 7", @@ -183,7 +185,7 @@ BPF_EXIT_INSN(), }, .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", - .errstr = "invalid stack off=-513 size=1", + .errstr = "invalid write to stack R1 off=-513 size=1", .result = REJECT, }, { @@ -208,7 +210,8 @@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid stack off", + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .errstr = "invalid write to stack", }, { "PTR_TO_STACK check low 6", @@ -220,7 +223,8 @@ BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "invalid stack off", + .errstr = "invalid write to stack", + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", }, { "PTR_TO_STACK check low 7", @@ -292,7 +296,7 @@ BPF_EXIT_INSN(), }, .result_unpriv = REJECT, - .errstr_unpriv = "invalid stack off=0 size=1", + .errstr_unpriv = "invalid write to stack R1 off=0 size=1", .result = ACCEPT, .retval = 42, }, diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index ee298627abae..b018ad71e0a8 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -108,7 +108,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr_unpriv = "invalid indirect read from stack off -8+0 size 8", + .errstr_unpriv = "invalid indirect read from stack R2 off -8+0 size 8", .result_unpriv = REJECT, .result = ACCEPT, }, diff --git a/tools/testing/selftests/bpf/verifier/var_off.c b/tools/testing/selftests/bpf/verifier/var_off.c index 8504ac937809..eab1f7f56e2f 100644 --- a/tools/testing/selftests/bpf/verifier/var_off.c +++ b/tools/testing/selftests/bpf/verifier/var_off.c @@ -18,7 +18,7 @@ .prog_type = BPF_PROG_TYPE_LWT_IN, }, { - "variable-offset stack access", + "variable-offset stack read, priv vs unpriv", .insns = { /* Fill the top 8 bytes of the stack */ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), @@ -31,15 +31,110 @@ * we don't know which */ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10), - /* dereference it */ + /* dereference it for a stack read */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R2 variable stack access prohibited for !root", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, +}, +{ + "variable-offset stack read, uninitialized", + .insns = { + /* Get an unknown value */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0), + /* Make it small and 4-byte aligned */ + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 8), + /* add it to fp. 
We now have either fp-4 or fp-8, but
+ * we don't know which
+ */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+ /* dereference it for a stack read */
 BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
 BPF_EXIT_INSN(),
 },
- .errstr = "variable stack access var_off=(0xfffffffffffffff8; 0x4)",
 .result = REJECT,
+ .errstr = "invalid variable-offset read from stack R2",
 .prog_type = BPF_PROG_TYPE_LWT_IN,
},
{
+ "variable-offset stack write, priv vs unpriv",
+ .insns = {
+ /* Get an unknown value */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ /* Make it small and 8-byte aligned */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 8),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 16),
+ /* Add it to fp. We now have either fp-8 or fp-16, but
+ * we don't know which
+ */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+ /* Dereference it for a stack write */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+ /* Now read from the address we just wrote. This shows
+ * that, after a variable-offset write, a privileged
+ * program can read the slots that were in the range of
+ * that write (even if the verifier doesn't actually know
+ * if the slot being read was really written to or not).
+ */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ /* Variable stack access is rejected for unprivileged.
+ */
+ .errstr_unpriv = "R2 variable stack access prohibited for !root",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "variable-offset stack write clobbers spilled regs",
+ .insns = {
+ /* Dummy instruction; needed because we need to patch the next one
+ * and we can't patch the first instruction.
+ */
+ BPF_MOV64_IMM(BPF_REG_6, 0),
+ /* Make R0 a map ptr */
+ BPF_LD_MAP_FD(BPF_REG_0, 0),
+ /* Get an unknown value */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ /* Make it small and 8-byte aligned */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 8),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 16),
+ /* Add it to fp. We now have either fp-8 or fp-16, but
+ * we don't know which.
+ */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+ /* Spill R0 (map ptr) into stack */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ /* Dereference the unknown value for a stack write */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+ /* Fill the register back into R2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8),
+ /* Try to dereference R2 for a memory load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 8),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 1 },
+ /* The unprivileged case is not too interesting; variable
+ * stack access is rejected.
+ */
+ .errstr_unpriv = "R2 variable stack access prohibited for !root",
+ .result_unpriv = REJECT,
+ /* In the privileged case, dereferencing a spilled-and-then-filled
+ * register is rejected because the previous variable-offset stack
+ * write might have overwritten the spilled pointer (i.e. we lose track
+ * of the spilled register when we analyze the write).
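+ * A later fill from that range therefore yields an unknown scalar rather
+ * than the map pointer, which is why the final load is rejected with
+ * "invalid mem access 'inv'".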
+ */
+ .errstr = "R2 invalid mem access 'inv'",
+ .result = REJECT,
+},
+{
 "indirect variable-offset stack access, unbounded",
 .insns = {
 BPF_MOV64_IMM(BPF_REG_2, 6),
@@ -63,7 +158,7 @@
 BPF_MOV64_IMM(BPF_REG_0, 0),
 BPF_EXIT_INSN(),
 },
- .errstr = "R4 unbounded indirect variable offset stack access",
+ .errstr = "invalid unbounded variable-offset indirect access to stack R4",
 .result = REJECT,
 .prog_type = BPF_PROG_TYPE_SOCK_OPS,
},
@@ -88,7 +183,7 @@
 BPF_EXIT_INSN(),
 },
 .fixup_map_hash_8b = { 5 },
- .errstr = "R2 max value is outside of stack bound",
+ .errstr = "invalid variable-offset indirect access to stack R2",
 .result = REJECT,
 .prog_type = BPF_PROG_TYPE_LWT_IN,
},
@@ -113,7 +208,7 @@
 BPF_EXIT_INSN(),
 },
 .fixup_map_hash_8b = { 5 },
- .errstr = "R2 min value is outside of stack bound",
+ .errstr = "invalid variable-offset indirect access to stack R2",
 .result = REJECT,
 .prog_type = BPF_PROG_TYPE_LWT_IN,
},
@@ -138,7 +233,7 @@
 BPF_EXIT_INSN(),
 },
 .fixup_map_hash_8b = { 5 },
- .errstr = "invalid indirect read from stack var_off",
+ .errstr = "invalid indirect read from stack R2 var_off",
 .result = REJECT,
 .prog_type = BPF_PROG_TYPE_LWT_IN,
},
@@ -163,7 +258,7 @@
 BPF_EXIT_INSN(),
 },
 .fixup_map_hash_8b = { 5 },
- .errstr = "invalid indirect read from stack var_off",
+ .errstr = "invalid indirect read from stack R2 var_off",
 .result = REJECT,
 .prog_type = BPF_PROG_TYPE_LWT_IN,
},
@@ -189,7 +284,7 @@
 BPF_EXIT_INSN(),
 },
 .fixup_map_hash_8b = { 6 },
- .errstr_unpriv = "R2 stack pointer arithmetic goes out of range, prohibited for !root",
+ .errstr_unpriv = "R2 variable stack access prohibited for !root",
 .result_unpriv = REJECT,
 .result = ACCEPT,
 .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
@@ -217,7 +312,7 @@
 BPF_MOV64_IMM(BPF_REG_0, 0),
 BPF_EXIT_INSN(),
 },
- .errstr = "invalid indirect read from stack var_off",
+ .errstr = "invalid indirect read from stack R4 var_off",
 .result = REJECT,
 .prog_type = BPF_PROG_TYPE_SOCK_OPS,
},
diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh
new file mode 100755
index 000000000000..26ae8d0b6ce3
--- /dev/null
+++ b/tools/testing/selftests/bpf/vmtest.sh
@@ -0,0 +1,368 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+set -u
+set -e
+
+# This script currently only works for x86_64, as
+# it is based on the VM image used by the BPF CI, which is
+# x86_64.
+QEMU_BINARY="${QEMU_BINARY:="qemu-system-x86_64"}"
+X86_BZIMAGE="arch/x86/boot/bzImage"
+DEFAULT_COMMAND="./test_progs"
+MOUNT_DIR="mnt"
+ROOTFS_IMAGE="root.img"
+OUTPUT_DIR="$HOME/.bpf_selftests"
+KCONFIG_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vmtest/configs/latest.config"
+KCONFIG_API_URL="https://api.github.com/repos/libbpf/libbpf/contents/travis-ci/vmtest/configs/latest.config"
+INDEX_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vmtest/configs/INDEX"
+NUM_COMPILE_JOBS="$(nproc)"
+
+usage()
+{
+ cat <<EOF
+Usage: $0 [-i] [-j <num_jobs>] [-d <output_dir>] -- [<command>]
+
+<command> is the command you would normally run when you are in
+tools/testing/selftests/bpf. e.g.:
+
+ $0 -- ./test_progs -t test_lsm
+
+If no command is specified, "${DEFAULT_COMMAND}" will be run by
+default.
+
+If you build your kernel using KBUILD_OUTPUT= or O= options, these
+can be passed as environment variables to the script:
+
+ O=<kernel_build_path> $0 -- ./test_progs -t test_lsm
+
+or
+
+ KBUILD_OUTPUT=<kernel_build_path> $0 -- ./test_progs -t test_lsm
+
+Options:
+
+ -i) Update the rootfs image with a newer version.
+ -d) Update the output directory (default: ${OUTPUT_DIR})
+ -j) Number of jobs for compilation, similar to -j in make
+ (default: ${NUM_COMPILE_JOBS})
+EOF
+}
+
+unset URLS
+populate_url_map()
+{
+ if ! declare -p URLS &> /dev/null; then
+ # URLS contains the mapping from file names to the URLs where
+ # those files can be downloaded from.
+ declare -gA URLS
+ while IFS=$'\t' read -r name url; do
+ URLS["$name"]="$url"
+ done < <(curl -Lsf ${INDEX_URL})
+ fi
+}
+
+download()
+{
+ local file="$1"
+
+ if [[ ! -v URLS[$file] ]]; then
+ echo "$file not found" >&2
+ return 1
+ fi
+
+ echo "Downloading $file..." >&2
+ curl -Lsf "${URLS[$file]}" "${@:2}"
+}
+
+newest_rootfs_version()
+{
+ {
+ for file in "${!URLS[@]}"; do
+ if [[ $file =~ ^libbpf-vmtest-rootfs-(.*)\.tar\.zst$ ]]; then
+ echo "${BASH_REMATCH[1]}"
+ fi
+ done
+ } | sort -rV | head -1
+}
+
+download_rootfs()
+{
+ local rootfsversion="$1"
+ local dir="$2"
+
+ if ! which zstd &> /dev/null; then
+ echo 'Could not find "zstd" on the system; please install zstd'
+ exit 1
+ fi
+
+ download "libbpf-vmtest-rootfs-$rootfsversion.tar.zst" |
+ zstd -d | sudo tar -C "$dir" -x
+}
+
+recompile_kernel()
+{
+ local kernel_checkout="$1"
+ local make_command="$2"
+
+ cd "${kernel_checkout}"
+
+ ${make_command} olddefconfig
+ ${make_command}
+}
+
+mount_image()
+{
+ local rootfs_img="${OUTPUT_DIR}/${ROOTFS_IMAGE}"
+ local mount_dir="${OUTPUT_DIR}/${MOUNT_DIR}"
+
+ sudo mount -o loop "${rootfs_img}" "${mount_dir}"
+}
+
+unmount_image()
+{
+ local mount_dir="${OUTPUT_DIR}/${MOUNT_DIR}"
+
+ sudo umount "${mount_dir}" &> /dev/null
+}
+
+update_selftests()
+{
+ local kernel_checkout="$1"
+ local selftests_dir="${kernel_checkout}/tools/testing/selftests/bpf"
+
+ cd "${selftests_dir}"
+ ${make_command}
+
+ # Mount the image and copy the selftests to the image.
+ mount_image
+ sudo rm -rf "${mount_dir}/root/bpf"
+ sudo cp -r "${selftests_dir}" "${mount_dir}/root"
+ unmount_image
+}
+
+update_init_script()
+{
+ local init_script_dir="${OUTPUT_DIR}/${MOUNT_DIR}/etc/rcS.d"
+ local init_script="${init_script_dir}/S50-startup"
+ local command="$1"
+ local log_file="$2"
+
+ mount_image
+
+ if [[ ! -d "${init_script_dir}" ]]; then
+ cat <<EOF
+Could not find ${init_script_dir} in the mounted image.
+This likely indicates a bad rootfs image. Please download
+a new image by passing "-i" to the script.
+EOF
+ exit 1
+
+ fi
+
+ sudo bash -c "cat >${init_script}" <<EOF
+#!/bin/bash
+
+{
+ cd /root/bpf
+ echo ${command}
+ stdbuf -oL -eL ${command}
+} 2>&1 | tee /root/${log_file}
+poweroff -f
+EOF
+
+ sudo chmod a+x "${init_script}"
+ unmount_image
+}
+
+create_vm_image()
+{
+ local rootfs_img="${OUTPUT_DIR}/${ROOTFS_IMAGE}"
+ local mount_dir="${OUTPUT_DIR}/${MOUNT_DIR}"
+
+ rm -rf "${rootfs_img}"
+ touch "${rootfs_img}"
+ chattr +C "${rootfs_img}" >/dev/null 2>&1 || true
+
+ truncate -s 2G "${rootfs_img}"
+ mkfs.ext4 -q "${rootfs_img}"
+
+ mount_image
+ download_rootfs "$(newest_rootfs_version)" "${mount_dir}"
+ unmount_image
+}
+
+run_vm()
+{
+ local kernel_bzimage="$1"
+ local rootfs_img="${OUTPUT_DIR}/${ROOTFS_IMAGE}"
+
+ if ! which "${QEMU_BINARY}" &> /dev/null; then
+ cat <<EOF
+Could not find ${QEMU_BINARY}.
+Please install qemu or set the QEMU_BINARY environment variable.
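+(e.g. QEMU_BINARY=/path/to/qemu-system-x86_64 $0 -- ./test_progs)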
+EOF + exit 1 + fi + + ${QEMU_BINARY} \ + -nodefaults \ + -display none \ + -serial mon:stdio \ + -cpu kvm64 \ + -enable-kvm \ + -smp 4 \ + -m 2G \ + -drive file="${rootfs_img}",format=raw,index=1,media=disk,if=virtio,cache=none \ + -kernel "${kernel_bzimage}" \ + -append "root=/dev/vda rw console=ttyS0,115200" +} + +copy_logs() +{ + local mount_dir="${OUTPUT_DIR}/${MOUNT_DIR}" + local log_file="${mount_dir}/root/$1" + + mount_image + sudo cp ${log_file} "${OUTPUT_DIR}" + sudo rm -f ${log_file} + unmount_image +} + +is_rel_path() +{ + local path="$1" + + [[ ${path:0:1} != "/" ]] +} + +update_kconfig() +{ + local kconfig_file="$1" + local update_command="curl -sLf ${KCONFIG_URL} -o ${kconfig_file}" + # Github does not return the "last-modified" header when retrieving the + # raw contents of the file. Use the API call to get the last-modified + # time of the kernel config and only update the config if it has been + # updated after the previously cached config was created. This avoids + # unnecessarily compiling the kernel and selftests. + if [[ -f "${kconfig_file}" ]]; then + local last_modified_date="$(curl -sL -D - "${KCONFIG_API_URL}" -o /dev/null | \ + grep "last-modified" | awk -F ': ' '{print $2}')" + local remote_modified_timestamp="$(date -d "${last_modified_date}" +"%s")" + local local_creation_timestamp="$(stat -c %Y "${kconfig_file}")" + + if [[ "${remote_modified_timestamp}" -gt "${local_creation_timestamp}" ]]; then + ${update_command} + fi + else + ${update_command} + fi +} + +main() +{ + local script_dir="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" + local kernel_checkout=$(realpath "${script_dir}"/../../../../) + local log_file="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S.log")" + # By default the script searches for the kernel in the checkout directory but + # it also obeys environment variables O= and KBUILD_OUTPUT= + local kernel_bzimage="${kernel_checkout}/${X86_BZIMAGE}" + local command="${DEFAULT_COMMAND}" + local update_image="no" + + while getopts 'hkid:j:' opt; do + case ${opt} in + i) + update_image="yes" + ;; + d) + OUTPUT_DIR="$OPTARG" + ;; + j) + NUM_COMPILE_JOBS="$OPTARG" + ;; + h) + usage + exit 0 + ;; + \? ) + echo "Invalid Option: -$OPTARG" + usage + exit 1 + ;; + : ) + echo "Invalid Option: -$OPTARG requires an argument" + usage + exit 1 + ;; + esac + done + shift $((OPTIND -1)) + + if [[ $# -eq 0 ]]; then + echo "No command specified, will run ${DEFAULT_COMMAND} in the vm" + else + command="$@" + fi + + local kconfig_file="${OUTPUT_DIR}/latest.config" + local make_command="make -j ${NUM_COMPILE_JOBS} KCONFIG_CONFIG=${kconfig_file}" + + # Figure out where the kernel is being built. + # O takes precedence over KBUILD_OUTPUT. + if [[ "${O:=""}" != "" ]]; then + if is_rel_path "${O}"; then + O="$(realpath "${PWD}/${O}")" + fi + kernel_bzimage="${O}/${X86_BZIMAGE}" + make_command="${make_command} O=${O}" + elif [[ "${KBUILD_OUTPUT:=""}" != "" ]]; then + if is_rel_path "${KBUILD_OUTPUT}"; then + KBUILD_OUTPUT="$(realpath "${PWD}/${KBUILD_OUTPUT}")" + fi + kernel_bzimage="${KBUILD_OUTPUT}/${X86_BZIMAGE}" + make_command="${make_command} KBUILD_OUTPUT=${KBUILD_OUTPUT}" + fi + + populate_url_map + + local rootfs_img="${OUTPUT_DIR}/${ROOTFS_IMAGE}" + local mount_dir="${OUTPUT_DIR}/${MOUNT_DIR}" + + echo "Output directory: ${OUTPUT_DIR}" + + mkdir -p "${OUTPUT_DIR}" + mkdir -p "${mount_dir}" + update_kconfig "${kconfig_file}" + + recompile_kernel "${kernel_checkout}" "${make_command}" + + if [[ "${update_image}" == "no" && ! 
-f "${rootfs_img}" ]]; then + echo "rootfs image not found in ${rootfs_img}" + update_image="yes" + fi + + if [[ "${update_image}" == "yes" ]]; then + create_vm_image + fi + + update_selftests "${kernel_checkout}" "${make_command}" + update_init_script "${command}" "${log_file}" + run_vm "${kernel_bzimage}" + copy_logs "${log_file}" + echo "Logs saved in ${OUTPUT_DIR}/${log_file}" +} + +catch() +{ + local exit_code=$1 + # This is just a cleanup and the directory may + # have already been unmounted. So, don't let this + # clobber the error code we intend to return. + unmount_image || true + exit ${exit_code} +} + +trap 'catch "$?"' EXIT + +main "$@" diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 1e722ee76b1f..f4a96d5ff524 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -224,14 +224,14 @@ static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt return csum_tcpudp_magic(saddr, daddr, len, proto, csum); } -static void gen_eth_hdr(void *data, struct ethhdr *eth_hdr) +static void gen_eth_hdr(struct ifobject *ifobject, struct ethhdr *eth_hdr) { - memcpy(eth_hdr->h_dest, ((struct ifobject *)data)->dst_mac, ETH_ALEN); - memcpy(eth_hdr->h_source, ((struct ifobject *)data)->src_mac, ETH_ALEN); + memcpy(eth_hdr->h_dest, ifobject->dst_mac, ETH_ALEN); + memcpy(eth_hdr->h_source, ifobject->src_mac, ETH_ALEN); eth_hdr->h_proto = htons(ETH_P_IP); } -static void gen_ip_hdr(void *data, struct iphdr *ip_hdr) +static void gen_ip_hdr(struct ifobject *ifobject, struct iphdr *ip_hdr) { ip_hdr->version = IP_PKT_VER; ip_hdr->ihl = 0x5; @@ -241,18 +241,18 @@ static void gen_ip_hdr(void *data, struct iphdr *ip_hdr) ip_hdr->frag_off = 0; ip_hdr->ttl = IPDEFTTL; ip_hdr->protocol = IPPROTO_UDP; - ip_hdr->saddr = ((struct ifobject *)data)->src_ip; - ip_hdr->daddr = ((struct ifobject *)data)->dst_ip; + ip_hdr->saddr = ifobject->src_ip; + ip_hdr->daddr = ifobject->dst_ip; ip_hdr->check = 0; } -static void gen_udp_hdr(void *data, void *arg, struct udphdr *udp_hdr) +static void gen_udp_hdr(struct generic_data *data, struct ifobject *ifobject, + struct udphdr *udp_hdr) { - udp_hdr->source = htons(((struct ifobject *)arg)->src_port); - udp_hdr->dest = htons(((struct ifobject *)arg)->dst_port); + udp_hdr->source = htons(ifobject->src_port); + udp_hdr->dest = htons(ifobject->dst_port); udp_hdr->len = htons(UDP_PKT_SIZE); - memset32_htonl(pkt_data + PKT_HDR_SIZE, - htonl(((struct generic_data *)data)->seqnum), UDP_PKT_DATA_SIZE); + memset32_htonl(pkt_data + PKT_HDR_SIZE, htonl(data->seqnum), UDP_PKT_DATA_SIZE); } static void gen_udp_csum(struct udphdr *udp_hdr, struct iphdr *ip_hdr) @@ -382,21 +382,19 @@ static bool switch_namespace(int idx) static void *nsswitchthread(void *args) { - if (switch_namespace(((struct targs *)args)->idx)) { - ifdict[((struct targs *)args)->idx]->ifindex = - if_nametoindex(ifdict[((struct targs *)args)->idx]->ifname); - if (!ifdict[((struct targs *)args)->idx]->ifindex) { - ksft_test_result_fail - ("ERROR: [%s] interface \"%s\" does not exist\n", - __func__, ifdict[((struct targs *)args)->idx]->ifname); - ((struct targs *)args)->retptr = false; + struct targs *targs = args; + + targs->retptr = false; + + if (switch_namespace(targs->idx)) { + ifdict[targs->idx]->ifindex = if_nametoindex(ifdict[targs->idx]->ifname); + if (!ifdict[targs->idx]->ifindex) { + ksft_test_result_fail("ERROR: [%s] interface \"%s\" does not exist\n", + __func__, ifdict[targs->idx]->ifname); } 
else { - ksft_print_msg("Interface found: %s\n", - ifdict[((struct targs *)args)->idx]->ifname); - ((struct targs *)args)->retptr = true; + ksft_print_msg("Interface found: %s\n", ifdict[targs->idx]->ifname); + targs->retptr = true; } - } else { - ((struct targs *)args)->retptr = false; } pthread_exit(NULL); } @@ -413,12 +411,12 @@ static int validate_interfaces(void) if (strcmp(ifdict[i]->nsname, "")) { struct targs *targs; - targs = (struct targs *)malloc(sizeof(struct targs)); + targs = malloc(sizeof(*targs)); if (!targs) exit_with_error(errno); targs->idx = i; - if (pthread_create(&ns_thread, NULL, nsswitchthread, (void *)targs)) + if (pthread_create(&ns_thread, NULL, nsswitchthread, targs)) exit_with_error(errno); pthread_join(ns_thread, NULL); @@ -569,16 +567,18 @@ static void rx_pkt(struct xsk_socket_info *xsk, struct pollfd *fds) } for (i = 0; i < rcvd; i++) { - u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; - (void)xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; - u64 orig = xsk_umem__extract_addr(addr); + u64 addr, orig; + + addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; + xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); + orig = xsk_umem__extract_addr(addr); addr = xsk_umem__add_offset_to_addr(addr); pkt_node_rx = malloc(sizeof(struct pkt) + PKT_SIZE); if (!pkt_node_rx) exit_with_error(errno); - pkt_node_rx->pkt_frame = (char *)malloc(PKT_SIZE); + pkt_node_rx->pkt_frame = malloc(PKT_SIZE); if (!pkt_node_rx->pkt_frame) exit_with_error(errno); @@ -628,28 +628,27 @@ static inline int get_batch_size(int pkt_cnt) return opt_pkt_count - pkt_cnt; } -static void complete_tx_only_all(void *arg) +static void complete_tx_only_all(struct ifobject *ifobject) { bool pending; do { pending = false; - if (((struct ifobject *)arg)->xsk->outstanding_tx) { - complete_tx_only(((struct ifobject *) - arg)->xsk, BATCH_SIZE); - pending = !!((struct ifobject *)arg)->xsk->outstanding_tx; + if (ifobject->xsk->outstanding_tx) { + complete_tx_only(ifobject->xsk, BATCH_SIZE); + pending = !!ifobject->xsk->outstanding_tx; } } while (pending); } -static void tx_only_all(void *arg) +static void tx_only_all(struct ifobject *ifobject) { struct pollfd fds[MAX_SOCKS] = { }; u32 frame_nb = 0; int pkt_cnt = 0; int ret; - fds[0].fd = xsk_socket__fd(((struct ifobject *)arg)->xsk->xsk); + fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); fds[0].events = POLLOUT; while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { @@ -664,12 +663,12 @@ static void tx_only_all(void *arg) continue; } - tx_only(((struct ifobject *)arg)->xsk, &frame_nb, batch_size); + tx_only(ifobject->xsk, &frame_nb, batch_size); pkt_cnt += batch_size; } if (opt_pkt_count) - complete_tx_only_all(arg); + complete_tx_only_all(ifobject); } static void worker_pkt_dump(void) @@ -727,21 +726,21 @@ static void worker_pkt_dump(void) static void worker_pkt_validate(void) { u32 payloadseqnum = -2; + struct iphdr *iphdr; while (1) { - pkt_node_rx_q = malloc(sizeof(struct pkt)); pkt_node_rx_q = TAILQ_LAST(&head, head_s); if (!pkt_node_rx_q) break; + + iphdr = (struct iphdr *)(pkt_node_rx_q->pkt_frame + sizeof(struct ethhdr)); + /*do not increment pktcounter if !(tos=0x9 and ipv4) */ - if ((((struct iphdr *)(pkt_node_rx_q->pkt_frame + - sizeof(struct ethhdr)))->version == IP_PKT_VER) - && (((struct iphdr *)(pkt_node_rx_q->pkt_frame + sizeof(struct ethhdr)))->tos == - IP_PKT_TOS)) { - payloadseqnum = *((uint32_t *) (pkt_node_rx_q->pkt_frame + PKT_HDR_SIZE)); + if (iphdr->version == IP_PKT_VER && iphdr->tos == IP_PKT_TOS) { + payloadseqnum = 
*((uint32_t *)(pkt_node_rx_q->pkt_frame + PKT_HDR_SIZE)); if (debug_pkt_dump && payloadseqnum != EOT) { - pkt_obj = (struct pkt_frame *)malloc(sizeof(struct pkt_frame)); - pkt_obj->payload = (char *)malloc(PKT_SIZE); + pkt_obj = malloc(sizeof(*pkt_obj)); + pkt_obj->payload = malloc(PKT_SIZE); memcpy(pkt_obj->payload, pkt_node_rx_q->pkt_frame, PKT_SIZE); pkt_buf[payloadseqnum] = pkt_obj; } @@ -759,35 +758,29 @@ static void worker_pkt_validate(void) ksft_exit_xfail(); } - TAILQ_REMOVE(&head, pkt_node_rx_q, pkt_nodes); - free(pkt_node_rx_q->pkt_frame); - free(pkt_node_rx_q); - pkt_node_rx_q = NULL; prev_pkt = payloadseqnum; pkt_counter++; } else { ksft_print_msg("Invalid frame received: "); - ksft_print_msg("[IP_PKT_VER: %02X], [IP_PKT_TOS: %02X]\n", - ((struct iphdr *)(pkt_node_rx_q->pkt_frame + - sizeof(struct ethhdr)))->version, - ((struct iphdr *)(pkt_node_rx_q->pkt_frame + - sizeof(struct ethhdr)))->tos); - TAILQ_REMOVE(&head, pkt_node_rx_q, pkt_nodes); - free(pkt_node_rx_q->pkt_frame); - free(pkt_node_rx_q); - pkt_node_rx_q = NULL; + ksft_print_msg("[IP_PKT_VER: %02X], [IP_PKT_TOS: %02X]\n", iphdr->version, + iphdr->tos); } + + TAILQ_REMOVE(&head, pkt_node_rx_q, pkt_nodes); + free(pkt_node_rx_q->pkt_frame); + free(pkt_node_rx_q); + pkt_node_rx_q = NULL; } } -static void thread_common_ops(void *arg, void *bufs, pthread_mutex_t *mutexptr, +static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mutex_t *mutexptr, atomic_int *spinningptr) { int ctr = 0; int ret; - xsk_configure_umem((struct ifobject *)arg, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); - ret = xsk_configure_socket((struct ifobject *)arg); + xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); + ret = xsk_configure_socket(ifobject); /* Retry Create Socket if it fails as xsk_socket__create() * is asynchronous @@ -798,9 +791,8 @@ static void thread_common_ops(void *arg, void *bufs, pthread_mutex_t *mutexptr, pthread_mutex_lock(mutexptr); while (ret && ctr < SOCK_RECONF_CTR) { atomic_store(spinningptr, 1); - xsk_configure_umem((struct ifobject *)arg, - bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); - ret = xsk_configure_socket((struct ifobject *)arg); + xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); + ret = xsk_configure_socket(ifobject); usleep(USLEEP_MAX); ctr++; } @@ -815,9 +807,10 @@ static void *worker_testapp_validate(void *arg) { struct udphdr *udp_hdr = (struct udphdr *)(pkt_data + sizeof(struct ethhdr) + sizeof(struct iphdr)); - struct generic_data *data = (struct generic_data *)malloc(sizeof(struct generic_data)); struct iphdr *ip_hdr = (struct iphdr *)(pkt_data + sizeof(struct ethhdr)); struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; + struct ifobject *ifobject = (struct ifobject *)arg; + struct generic_data data; void *bufs = NULL; pthread_attr_setstacksize(&attr, THREAD_STACK); @@ -828,58 +821,56 @@ static void *worker_testapp_validate(void *arg) if (bufs == MAP_FAILED) exit_with_error(errno); - if (strcmp(((struct ifobject *)arg)->nsname, "")) - switch_namespace(((struct ifobject *)arg)->ifdict_index); + if (strcmp(ifobject->nsname, "")) + switch_namespace(ifobject->ifdict_index); } - if (((struct ifobject *)arg)->fv.vector == tx) { + if (ifobject->fv.vector == tx) { int spinningrxctr = 0; if (!bidi_pass) - thread_common_ops(arg, bufs, &sync_mutex_tx, &spinning_tx); + thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_tx); while (atomic_load(&spinning_rx) && spinningrxctr < SOCK_RECONF_CTR) { spinningrxctr++; 
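 /* the Rx thread sets spinning_rx while it retries xsk_socket__create();
 * give it time to finish before transmitting
 */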
usleep(USLEEP_MAX); } - ksft_print_msg("Interface [%s] vector [Tx]\n", ((struct ifobject *)arg)->ifname); + ksft_print_msg("Interface [%s] vector [Tx]\n", ifobject->ifname); for (int i = 0; i < num_frames; i++) { /*send EOT frame */ if (i == (num_frames - 1)) - data->seqnum = -1; + data.seqnum = -1; else - data->seqnum = i; - gen_udp_hdr((void *)data, (void *)arg, udp_hdr); - gen_ip_hdr((void *)arg, ip_hdr); + data.seqnum = i; + gen_udp_hdr(&data, ifobject, udp_hdr); + gen_ip_hdr(ifobject, ip_hdr); gen_udp_csum(udp_hdr, ip_hdr); - gen_eth_hdr((void *)arg, eth_hdr); - gen_eth_frame(((struct ifobject *)arg)->umem, - i * XSK_UMEM__DEFAULT_FRAME_SIZE); + gen_eth_hdr(ifobject, eth_hdr); + gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE); } - free(data); ksft_print_msg("Sending %d packets on interface %s\n", - (opt_pkt_count - 1), ((struct ifobject *)arg)->ifname); - tx_only_all(arg); - } else if (((struct ifobject *)arg)->fv.vector == rx) { + (opt_pkt_count - 1), ifobject->ifname); + tx_only_all(ifobject); + } else if (ifobject->fv.vector == rx) { struct pollfd fds[MAX_SOCKS] = { }; int ret; if (!bidi_pass) - thread_common_ops(arg, bufs, &sync_mutex_tx, &spinning_rx); + thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx); - ksft_print_msg("Interface [%s] vector [Rx]\n", ((struct ifobject *)arg)->ifname); - xsk_populate_fill_ring(((struct ifobject *)arg)->umem); + ksft_print_msg("Interface [%s] vector [Rx]\n", ifobject->ifname); + xsk_populate_fill_ring(ifobject->umem); TAILQ_INIT(&head); if (debug_pkt_dump) { - pkt_buf = malloc(sizeof(struct pkt_frame **) * num_frames); + pkt_buf = calloc(num_frames, sizeof(*pkt_buf)); if (!pkt_buf) exit_with_error(errno); } - fds[0].fd = xsk_socket__fd(((struct ifobject *)arg)->xsk->xsk); + fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); fds[0].events = POLLIN; pthread_mutex_lock(&sync_mutex); @@ -892,7 +883,7 @@ static void *worker_testapp_validate(void *arg) if (ret <= 0) continue; } - rx_pkt(((struct ifobject *)arg)->xsk, fds); + rx_pkt(ifobject->xsk, fds); worker_pkt_validate(); if (sigvar) @@ -900,21 +891,23 @@ static void *worker_testapp_validate(void *arg) } ksft_print_msg("Received %d packets on interface %s\n", - pkt_counter, ((struct ifobject *)arg)->ifname); + pkt_counter, ifobject->ifname); if (opt_teardown) ksft_print_msg("Destroying socket\n"); } - if (!opt_bidi || (opt_bidi && bidi_pass)) { - xsk_socket__delete(((struct ifobject *)arg)->xsk->xsk); - (void)xsk_umem__delete(((struct ifobject *)arg)->umem->umem); + if (!opt_bidi || bidi_pass) { + xsk_socket__delete(ifobject->xsk->xsk); + (void)xsk_umem__delete(ifobject->umem->umem); } pthread_exit(NULL); } static void testapp_validate(void) { + struct timespec max_wait = { 0, 0 }; + pthread_attr_init(&attr); pthread_attr_setstacksize(&attr, THREAD_STACK); @@ -929,18 +922,16 @@ static void testapp_validate(void) pthread_mutex_lock(&sync_mutex); /*Spawn RX thread */ - if (!opt_bidi || (opt_bidi && !bidi_pass)) { - if (pthread_create(&t0, &attr, worker_testapp_validate, (void *)ifdict[1])) + if (!opt_bidi || !bidi_pass) { + if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[1])) exit_with_error(errno); } else if (opt_bidi && bidi_pass) { /*switch Tx/Rx vectors */ ifdict[0]->fv.vector = rx; - if (pthread_create(&t0, &attr, worker_testapp_validate, (void *)ifdict[0])) + if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[0])) exit_with_error(errno); } - struct timespec max_wait = { 0, 0 }; - if (clock_gettime(CLOCK_REALTIME, &max_wait)) 
exit_with_error(errno); max_wait.tv_sec += TMOUT_SEC; @@ -951,13 +942,13 @@ static void testapp_validate(void) pthread_mutex_unlock(&sync_mutex); /*Spawn TX thread */ - if (!opt_bidi || (opt_bidi && !bidi_pass)) { - if (pthread_create(&t1, &attr, worker_testapp_validate, (void *)ifdict[0])) + if (!opt_bidi || !bidi_pass) { + if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[0])) exit_with_error(errno); } else if (opt_bidi && bidi_pass) { /*switch Tx/Rx vectors */ ifdict[1]->fv.vector = tx; - if (pthread_create(&t1, &attr, worker_testapp_validate, (void *)ifdict[1])) + if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[1])) exit_with_error(errno); } @@ -991,25 +982,25 @@ static void testapp_sockets(void) print_ksft_result(); } -static void init_iface_config(void *ifaceconfig) +static void init_iface_config(struct ifaceconfigobj *ifaceconfig) { /*Init interface0 */ ifdict[0]->fv.vector = tx; - memcpy(ifdict[0]->dst_mac, ((struct ifaceconfigobj *)ifaceconfig)->dst_mac, ETH_ALEN); - memcpy(ifdict[0]->src_mac, ((struct ifaceconfigobj *)ifaceconfig)->src_mac, ETH_ALEN); - ifdict[0]->dst_ip = ((struct ifaceconfigobj *)ifaceconfig)->dst_ip.s_addr; - ifdict[0]->src_ip = ((struct ifaceconfigobj *)ifaceconfig)->src_ip.s_addr; - ifdict[0]->dst_port = ((struct ifaceconfigobj *)ifaceconfig)->dst_port; - ifdict[0]->src_port = ((struct ifaceconfigobj *)ifaceconfig)->src_port; + memcpy(ifdict[0]->dst_mac, ifaceconfig->dst_mac, ETH_ALEN); + memcpy(ifdict[0]->src_mac, ifaceconfig->src_mac, ETH_ALEN); + ifdict[0]->dst_ip = ifaceconfig->dst_ip.s_addr; + ifdict[0]->src_ip = ifaceconfig->src_ip.s_addr; + ifdict[0]->dst_port = ifaceconfig->dst_port; + ifdict[0]->src_port = ifaceconfig->src_port; /*Init interface1 */ ifdict[1]->fv.vector = rx; - memcpy(ifdict[1]->dst_mac, ((struct ifaceconfigobj *)ifaceconfig)->src_mac, ETH_ALEN); - memcpy(ifdict[1]->src_mac, ((struct ifaceconfigobj *)ifaceconfig)->dst_mac, ETH_ALEN); - ifdict[1]->dst_ip = ((struct ifaceconfigobj *)ifaceconfig)->src_ip.s_addr; - ifdict[1]->src_ip = ((struct ifaceconfigobj *)ifaceconfig)->dst_ip.s_addr; - ifdict[1]->dst_port = ((struct ifaceconfigobj *)ifaceconfig)->src_port; - ifdict[1]->src_port = ((struct ifaceconfigobj *)ifaceconfig)->dst_port; + memcpy(ifdict[1]->dst_mac, ifaceconfig->src_mac, ETH_ALEN); + memcpy(ifdict[1]->src_mac, ifaceconfig->dst_mac, ETH_ALEN); + ifdict[1]->dst_ip = ifaceconfig->src_ip.s_addr; + ifdict[1]->src_ip = ifaceconfig->dst_ip.s_addr; + ifdict[1]->dst_port = ifaceconfig->src_port; + ifdict[1]->src_port = ifaceconfig->dst_port; } int main(int argc, char **argv) @@ -1026,7 +1017,7 @@ int main(int argc, char **argv) u16 UDP_DST_PORT = 2020; u16 UDP_SRC_PORT = 2121; - ifaceconfig = (struct ifaceconfigobj *)malloc(sizeof(struct ifaceconfigobj)); + ifaceconfig = malloc(sizeof(struct ifaceconfigobj)); memcpy(ifaceconfig->dst_mac, MAC1, ETH_ALEN); memcpy(ifaceconfig->src_mac, MAC2, ETH_ALEN); inet_aton(IP1, &ifaceconfig->dst_ip); @@ -1035,7 +1026,7 @@ int main(int argc, char **argv) ifaceconfig->src_port = UDP_SRC_PORT; for (int i = 0; i < MAX_INTERFACES; i++) { - ifdict[i] = (struct ifobject *)malloc(sizeof(struct ifobject)); + ifdict[i] = malloc(sizeof(struct ifobject)); if (!ifdict[i]) exit_with_error(errno); @@ -1048,7 +1039,7 @@ int main(int argc, char **argv) num_frames = ++opt_pkt_count; - init_iface_config((void *)ifaceconfig); + init_iface_config(ifaceconfig); pthread_init_mutex(); diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 
61f595b6f200..0e9f9b7e61c2 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.h
+++ b/tools/testing/selftests/bpf/xdpxceiver.h
@@ -92,8 +92,6 @@ struct flow_vector {
 enum fvector {
 tx,
 rx,
- bidi,
- undef,
 } vector;
};
diff --git a/tools/testing/selftests/net/forwarding/tc_flower.sh b/tools/testing/selftests/net/forwarding/tc_flower.sh
index a554838666c4..683711f41aa9 100755
--- a/tools/testing/selftests/net/forwarding/tc_flower.sh
+++ b/tools/testing/selftests/net/forwarding/tc_flower.sh
@@ -3,7 +3,8 @@
 ALL_TESTS="match_dst_mac_test match_src_mac_test match_dst_ip_test \
 match_src_ip_test match_ip_flags_test match_pcp_test match_vlan_test \
- match_ip_tos_test match_indev_test match_mpls_label_test \
+ match_ip_tos_test match_indev_test match_ip_ttl_test \
+ match_mpls_label_test \
 match_mpls_tc_test match_mpls_bos_test match_mpls_ttl_test \
 match_mpls_lse_test"
 NUM_NETIFS=2
@@ -312,6 +313,42 @@ match_ip_tos_test()
 log_test "ip_tos match ($tcflags)"
}
+
+match_ip_ttl_test()
+{
+ RET=0
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_ip 192.0.2.2 ip_ttl 63 action drop
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip "ttl=63" -q
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip "ttl=63,mf,frag=256" -q
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_fail $? "Matched on the wrong filter (no check on ttl)"
+
+ tc_check_packets "dev $h2 ingress" 101 2
+ check_err $? "Did not match on correct filter (ttl=63)"
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip "ttl=255" -q
+
+ tc_check_packets "dev $h2 ingress" 101 3
+ check_fail $? "Matched on the wrong filter (ttl=63)"
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_err $? "Did not match on correct filter (no check on ttl)"
+
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+
+ log_test "ip_ttl match ($tcflags)"
+}
+
 match_indev_test()
{
 RET=0
diff --git a/tools/testing/selftests/tc-testing/Makefile b/tools/testing/selftests/tc-testing/Makefile
index 91fee5c43274..4d639279f41e 100644
--- a/tools/testing/selftests/tc-testing/Makefile
+++ b/tools/testing/selftests/tc-testing/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
+include ../../../scripts/Makefile.include
 top_srcdir = $(abspath ../../../..)
 APIDIR := $(top_srcdir)/include/uapi
@@ -7,8 +8,6 @@
 TEST_GEN_FILES = action.o
 KSFT_KHDR_INSTALL := 1
 include ../lib.mk
-CLANG ?= clang
-LLC ?= llc
 PROBE := $(shell $(LLC) -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1)
 ifeq ($(PROBE),)