diff options
128 files changed, 4505 insertions, 1563 deletions
diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst index 2ed89abbf9a4..253496af8fef 100644 --- a/Documentation/bpf/bpf_devel_QA.rst +++ b/Documentation/bpf/bpf_devel_QA.rst @@ -29,7 +29,7 @@ list: This may also include issues related to XDP, BPF tracing, etc. Given netdev has a high volume of traffic, please also add the BPF -maintainers to Cc (from kernel MAINTAINERS_ file): +maintainers to Cc (from kernel ``MAINTAINERS`` file): * Alexei Starovoitov <[email protected]> * Daniel Borkmann <[email protected]> @@ -234,11 +234,11 @@ be subject to change. Q: samples/bpf preference vs selftests? --------------------------------------- -Q: When should I add code to `samples/bpf/`_ and when to BPF kernel -selftests_ ? +Q: When should I add code to ``samples/bpf/`` and when to BPF kernel +selftests_? A: In general, we prefer additions to BPF kernel selftests_ rather than -`samples/bpf/`_. The rationale is very simple: kernel selftests are +``samples/bpf/``. The rationale is very simple: kernel selftests are regularly run by various bots to test for kernel regressions. The more test cases we add to BPF selftests, the better the coverage @@ -246,9 +246,9 @@ and the less likely it is that those could accidentally break. It is not that BPF kernel selftests cannot demo how a specific feature can be used. -That said, `samples/bpf/`_ may be a good place for people to get started, +That said, ``samples/bpf/`` may be a good place for people to get started, so it might be advisable that simple demos of features could go into -`samples/bpf/`_, but advanced functional and corner-case testing rather +``samples/bpf/``, but advanced functional and corner-case testing rather into kernel selftests. If your sample looks like a test case, then go for BPF kernel selftests @@ -449,6 +449,19 @@ from source at https://github.com/acmel/dwarves +pahole starts to use libbpf definitions and APIs since v1.13 after the +commit 21507cd3e97b ("pahole: add libbpf as submodule under lib/bpf"). +It works well with the git repository because the libbpf submodule will +use "git submodule update --init --recursive" to update. + +Unfortunately, the default github release source code does not contain +libbpf submodule source code and this will cause build issues, the tarball +from https://git.kernel.org/pub/scm/devel/pahole/pahole.git/ is same with +github, you can get the source tarball with corresponding libbpf submodule +codes from + +https://fedorapeople.org/~acme/dwarves + Some distros have pahole version 1.16 packaged already, e.g. Fedora, Gentoo. @@ -645,10 +658,9 @@ when: .. Links .. _Documentation/process/: https://www.kernel.org/doc/html/latest/process/ -.. _MAINTAINERS: ../../MAINTAINERS .. _netdev-FAQ: ../networking/netdev-FAQ.rst -.. _samples/bpf/: ../../samples/bpf/ -.. _selftests: ../../tools/testing/selftests/bpf/ +.. _selftests: + https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/ .. _Documentation/dev-tools/kselftest.rst: https://www.kernel.org/doc/html/latest/dev-tools/kselftest.html .. _Documentation/bpf/btf.rst: btf.rst diff --git a/Documentation/devicetree/bindings/net/intel,ixp4xx-ethernet.yaml b/Documentation/devicetree/bindings/net/intel,ixp4xx-ethernet.yaml new file mode 100644 index 000000000000..f2e91d1bf7d7 --- /dev/null +++ b/Documentation/devicetree/bindings/net/intel,ixp4xx-ethernet.yaml @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +# Copyright 2018 Linaro Ltd. +%YAML 1.2 +--- +$id: "http://devicetree.org/schemas/net/intel,ixp4xx-ethernet.yaml#" +$schema: "http://devicetree.org/meta-schemas/core.yaml#" + +title: Intel IXP4xx ethernet + +allOf: + - $ref: "ethernet-controller.yaml#" + +maintainers: + - Linus Walleij <[email protected]> + +description: | + The Intel IXP4xx ethernet makes use of the IXP4xx NPE (Network + Processing Engine) and the IXP4xx Queue Manager to process + the ethernet frames. It can optionally contain an MDIO bus to + talk to PHYs. + +properties: + compatible: + const: intel,ixp4xx-ethernet + + reg: + maxItems: 1 + description: Ethernet MMIO address range + + queue-rx: + $ref: '/schemas/types.yaml#/definitions/phandle-array' + maxItems: 1 + description: phandle to the RX queue on the NPE + + queue-txready: + $ref: '/schemas/types.yaml#/definitions/phandle-array' + maxItems: 1 + description: phandle to the TX READY queue on the NPE + + phy-mode: true + + phy-handle: true + + intel,npe-handle: + $ref: '/schemas/types.yaml#/definitions/phandle-array' + maxItems: 1 + description: phandle to the NPE this ethernet instance is using + and the instance to use in the second cell + + mdio: + type: object + $ref: "mdio.yaml#" + description: optional node for embedded MDIO controller + +required: + - compatible + - reg + - queue-rx + - queue-txready + - intel,npe-handle + +additionalProperties: false + +examples: + - | + npe: npe@c8006000 { + compatible = "intel,ixp4xx-network-processing-engine"; + reg = <0xc8006000 0x1000>, <0xc8007000 0x1000>, <0xc8008000 0x1000>; + }; + + ethernet@c8009000 { + compatible = "intel,ixp4xx-ethernet"; + reg = <0xc8009000 0x1000>; + status = "disabled"; + queue-rx = <&qmgr 4>; + queue-txready = <&qmgr 21>; + intel,npe-handle = <&npe 1>; + phy-mode = "rgmii"; + phy-handle = <&phy1>; + }; + + ethernet@c800c000 { + compatible = "intel,ixp4xx-ethernet"; + reg = <0xc800c000 0x1000>; + status = "disabled"; + queue-rx = <&qmgr 3>; + queue-txready = <&qmgr 20>; + intel,npe-handle = <&npe 2>; + phy-mode = "rgmii"; + phy-handle = <&phy2>; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + phy1: ethernet-phy@1 { + reg = <1>; + }; + phy2: ethernet-phy@2 { + reg = <2>; + }; + }; + }; diff --git a/MAINTAINERS b/MAINTAINERS index c3c8fa572580..0d85ae9e61e2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8532,7 +8532,6 @@ IBM Power SRIOV Virtual NIC Device Driver M: Dany Madden <[email protected]> M: Sukadev Bhattiprolu <[email protected]> R: Thomas Falcon <[email protected]> -R: Lijun Pan <[email protected]> S: Supported F: drivers/net/ethernet/ibm/ibmvnic.* diff --git a/arch/arm/boot/dts/uniphier-pxs2.dtsi b/arch/arm/boot/dts/uniphier-pxs2.dtsi index b0b15c97306b..e81e5937a60a 100644 --- a/arch/arm/boot/dts/uniphier-pxs2.dtsi +++ b/arch/arm/boot/dts/uniphier-pxs2.dtsi @@ -583,7 +583,7 @@ clocks = <&sys_clk 6>; reset-names = "ether"; resets = <&sys_rst 6>; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; local-mac-address = [00 00 00 00 00 00]; socionext,syscon-phy-mode = <&soc_glue 0>; diff --git a/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi b/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi index a87b8a678719..8f2c1c1e2c64 100644 --- a/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi +++ b/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi @@ -734,7 +734,7 @@ clocks = <&sys_clk 6>; reset-names = "ether"; resets = <&sys_rst 6>; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; local-mac-address = [00 00 00 00 00 00]; socionext,syscon-phy-mode = <&soc_glue 0>; diff --git a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi index 0e52dadf54b3..be97da132258 100644 --- a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi +++ b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi @@ -564,7 +564,7 @@ clocks = <&sys_clk 6>; reset-names = "ether"; resets = <&sys_rst 6>; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; local-mac-address = [00 00 00 00 00 00]; socionext,syscon-phy-mode = <&soc_glue 0>; @@ -585,7 +585,7 @@ clocks = <&sys_clk 7>; reset-names = "ether"; resets = <&sys_rst 7>; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; local-mac-address = [00 00 00 00 00 00]; socionext,syscon-phy-mode = <&soc_glue 1>; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index d5ca38aa8aa9..20bbda1b36e1 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4391,9 +4391,7 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) int agg_id = 0; int ret = 0; -#ifdef CONFIG_LOCKDEP - WARN_ON(lockdep_is_held(&bond->mode_lock)); -#endif + might_sleep(); usable_slaves = kzalloc(struct_size(usable_slaves, arr, bond->slave_cnt), GFP_KERNEL); @@ -4406,7 +4404,9 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info ad_info; + spin_lock_bh(&bond->mode_lock); if (bond_3ad_get_active_agg_info(bond, &ad_info)) { + spin_unlock_bh(&bond->mode_lock); pr_debug("bond_3ad_get_active_agg_info failed\n"); /* No active aggragator means it's not safe to use * the previous array. @@ -4414,6 +4414,7 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) bond_reset_slave_arr(bond); goto out; } + spin_unlock_bh(&bond->mode_lock); agg_id = ad_info.aggregator_id; } bond_for_each_slave(bond, slave, iter) { diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index e15d454e33f0..39ac9e2f5118 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -122,7 +122,10 @@ enum board_idx { NETXTREME_E_VF, NETXTREME_C_VF, NETXTREME_S_VF, + NETXTREME_C_VF_HV, + NETXTREME_E_VF_HV, NETXTREME_E_P5_VF, + NETXTREME_E_P5_VF_HV, }; /* indexed by enum above */ @@ -170,7 +173,10 @@ static const struct { [NETXTREME_E_VF] = { "Broadcom NetXtreme-E Ethernet Virtual Function" }, [NETXTREME_C_VF] = { "Broadcom NetXtreme-C Ethernet Virtual Function" }, [NETXTREME_S_VF] = { "Broadcom NetXtreme-S Ethernet Virtual Function" }, + [NETXTREME_C_VF_HV] = { "Broadcom NetXtreme-C Virtual Function for Hyper-V" }, + [NETXTREME_E_VF_HV] = { "Broadcom NetXtreme-E Virtual Function for Hyper-V" }, [NETXTREME_E_P5_VF] = { "Broadcom BCM5750X NetXtreme-E Ethernet Virtual Function" }, + [NETXTREME_E_P5_VF_HV] = { "Broadcom BCM5750X NetXtreme-E Virtual Function for Hyper-V" }, }; static const struct pci_device_id bnxt_pci_tbl[] = { @@ -222,15 +228,25 @@ static const struct pci_device_id bnxt_pci_tbl[] = { { PCI_VDEVICE(BROADCOM, 0xd804), .driver_data = BCM58804 }, #ifdef CONFIG_BNXT_SRIOV { PCI_VDEVICE(BROADCOM, 0x1606), .driver_data = NETXTREME_E_VF }, + { PCI_VDEVICE(BROADCOM, 0x1607), .driver_data = NETXTREME_E_VF_HV }, + { PCI_VDEVICE(BROADCOM, 0x1608), .driver_data = NETXTREME_E_VF_HV }, { PCI_VDEVICE(BROADCOM, 0x1609), .driver_data = NETXTREME_E_VF }, + { PCI_VDEVICE(BROADCOM, 0x16bd), .driver_data = NETXTREME_E_VF_HV }, { PCI_VDEVICE(BROADCOM, 0x16c1), .driver_data = NETXTREME_E_VF }, + { PCI_VDEVICE(BROADCOM, 0x16c2), .driver_data = NETXTREME_C_VF_HV }, + { PCI_VDEVICE(BROADCOM, 0x16c3), .driver_data = NETXTREME_C_VF_HV }, + { PCI_VDEVICE(BROADCOM, 0x16c4), .driver_data = NETXTREME_E_VF_HV }, + { PCI_VDEVICE(BROADCOM, 0x16c5), .driver_data = NETXTREME_E_VF_HV }, { PCI_VDEVICE(BROADCOM, 0x16cb), .driver_data = NETXTREME_C_VF }, { PCI_VDEVICE(BROADCOM, 0x16d3), .driver_data = NETXTREME_E_VF }, { PCI_VDEVICE(BROADCOM, 0x16dc), .driver_data = NETXTREME_E_VF }, { PCI_VDEVICE(BROADCOM, 0x16e1), .driver_data = NETXTREME_C_VF }, { PCI_VDEVICE(BROADCOM, 0x16e5), .driver_data = NETXTREME_C_VF }, + { PCI_VDEVICE(BROADCOM, 0x16e6), .driver_data = NETXTREME_C_VF_HV }, { PCI_VDEVICE(BROADCOM, 0x1806), .driver_data = NETXTREME_E_P5_VF }, { PCI_VDEVICE(BROADCOM, 0x1807), .driver_data = NETXTREME_E_P5_VF }, + { PCI_VDEVICE(BROADCOM, 0x1808), .driver_data = NETXTREME_E_P5_VF_HV }, + { PCI_VDEVICE(BROADCOM, 0x1809), .driver_data = NETXTREME_E_P5_VF_HV }, { PCI_VDEVICE(BROADCOM, 0xd800), .driver_data = NETXTREME_S_VF }, #endif { 0 } @@ -265,7 +281,8 @@ static struct workqueue_struct *bnxt_pf_wq; static bool bnxt_vf_pciid(enum board_idx idx) { return (idx == NETXTREME_C_VF || idx == NETXTREME_E_VF || - idx == NETXTREME_S_VF || idx == NETXTREME_E_P5_VF); + idx == NETXTREME_S_VF || idx == NETXTREME_C_VF_HV || + idx == NETXTREME_E_VF_HV || idx == NETXTREME_E_P5_VF); } #define DB_CP_REARM_FLAGS (DB_KEY_CP | DB_IDX_VALID) @@ -358,6 +375,7 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) struct pci_dev *pdev = bp->pdev; struct bnxt_tx_ring_info *txr; struct bnxt_sw_tx_bd *tx_buf; + __le32 lflags = 0; i = skb_get_queue_mapping(skb); if (unlikely(i >= bp->tx_nr_rings)) { @@ -399,6 +417,11 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) vlan_tag_flags |= 1 << TX_BD_CFA_META_TPID_SHIFT; } + if (unlikely(skb->no_fcs)) { + lflags |= cpu_to_le32(TX_BD_FLAGS_NO_CRC); + goto normal_tx; + } + if (free_size == bp->tx_ring_size && length <= bp->tx_push_thresh) { struct tx_push_buffer *tx_push_buf = txr->tx_push; struct tx_push_bd *tx_push = &tx_push_buf->push_bd; @@ -500,7 +523,7 @@ normal_tx: txbd1 = (struct tx_bd_ext *) &txr->tx_desc_ring[TX_RING(prod)][TX_IDX(prod)]; - txbd1->tx_bd_hsize_lflags = 0; + txbd1->tx_bd_hsize_lflags = lflags; if (skb_is_gso(skb)) { u32 hdr_len; @@ -512,14 +535,14 @@ normal_tx: hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); - txbd1->tx_bd_hsize_lflags = cpu_to_le32(TX_BD_FLAGS_LSO | + txbd1->tx_bd_hsize_lflags |= cpu_to_le32(TX_BD_FLAGS_LSO | TX_BD_FLAGS_T_IPID | (hdr_len << (TX_BD_HSIZE_SHIFT - 1))); length = skb_shinfo(skb)->gso_size; txbd1->tx_bd_mss = cpu_to_le32(length); length += hdr_len; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { - txbd1->tx_bd_hsize_lflags = + txbd1->tx_bd_hsize_lflags |= cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM); txbd1->tx_bd_mss = 0; } @@ -1732,14 +1755,16 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, cons = rxcmp->rx_cmp_opaque; if (unlikely(cons != rxr->rx_next_cons)) { - int rc1 = bnxt_discard_rx(bp, cpr, raw_cons, rxcmp); + int rc1 = bnxt_discard_rx(bp, cpr, &tmp_raw_cons, rxcmp); /* 0xffff is forced error, don't print it */ if (rxr->rx_next_cons != 0xffff) netdev_warn(bp->dev, "RX cons %x != expected cons %x\n", cons, rxr->rx_next_cons); bnxt_sched_reset(bp, rxr); - return rc1; + if (rc1) + return rc1; + goto next_rx_no_prod_no_len; } rx_buf = &rxr->rx_buf_ring[cons]; data = rx_buf->data; @@ -4145,7 +4170,7 @@ static void bnxt_free_mem(struct bnxt *bp, bool irq_re_init) bnxt_free_ntp_fltrs(bp, irq_re_init); if (irq_re_init) { bnxt_free_ring_stats(bp); - if (!(bp->fw_cap & BNXT_FW_CAP_PORT_STATS_NO_RESET) || + if (!(bp->phy_flags & BNXT_PHY_FL_PORT_STATS_NO_RESET) || test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)) bnxt_free_port_stats(bp); bnxt_free_ring_grps(bp); @@ -8340,11 +8365,11 @@ static int bnxt_alloc_rfs_vnics(struct bnxt *bp) #endif } -/* Allow PF and VF with default VLAN to be in promiscuous mode */ +/* Allow PF, trusted VFs and VFs with default VLAN to be in promiscuous mode */ static bool bnxt_promisc_ok(struct bnxt *bp) { #ifdef CONFIG_BNXT_SRIOV - if (BNXT_VF(bp) && !bp->vf.vlan) + if (BNXT_VF(bp) && !bp->vf.vlan && !bnxt_is_trusted_vf(bp, &bp->vf)) return false; #endif return true; @@ -8441,7 +8466,7 @@ static int bnxt_init_chip(struct bnxt *bp, bool irq_re_init) if (bp->dev->flags & IFF_BROADCAST) vnic->rx_mask |= CFA_L2_SET_RX_MASK_REQ_MASK_BCAST; - if ((bp->dev->flags & IFF_PROMISC) && bnxt_promisc_ok(bp)) + if (bp->dev->flags & IFF_PROMISC) vnic->rx_mask |= CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS; if (bp->dev->flags & IFF_ALLMULTI) { @@ -9075,8 +9100,9 @@ static char *bnxt_report_fec(struct bnxt_link_info *link_info) static void bnxt_report_link(struct bnxt *bp) { if (bp->link_info.link_up) { - const char *duplex; + const char *signal = ""; const char *flow_ctrl; + const char *duplex; u32 speed; u16 fec; @@ -9098,9 +9124,24 @@ static void bnxt_report_link(struct bnxt *bp) flow_ctrl = "ON - receive"; else flow_ctrl = "none"; - netdev_info(bp->dev, "NIC Link is Up, %u Mbps %s duplex, Flow control: %s\n", - speed, duplex, flow_ctrl); - if (bp->flags & BNXT_FLAG_EEE_CAP) + if (bp->link_info.phy_qcfg_resp.option_flags & + PORT_PHY_QCFG_RESP_OPTION_FLAGS_SIGNAL_MODE_KNOWN) { + u8 sig_mode = bp->link_info.active_fec_sig_mode & + PORT_PHY_QCFG_RESP_SIGNAL_MODE_MASK; + switch (sig_mode) { + case PORT_PHY_QCFG_RESP_SIGNAL_MODE_NRZ: + signal = "(NRZ) "; + break; + case PORT_PHY_QCFG_RESP_SIGNAL_MODE_PAM4: + signal = "(PAM4) "; + break; + default: + break; + } + } + netdev_info(bp->dev, "NIC Link is Up, %u Mbps %s%s duplex, Flow control: %s\n", + speed, signal, duplex, flow_ctrl); + if (bp->phy_flags & BNXT_PHY_FL_EEE_CAP) netdev_info(bp->dev, "EEE is %s\n", bp->eee.eee_active ? "active" : "not active"); @@ -9132,10 +9173,6 @@ static int bnxt_hwrm_phy_qcaps(struct bnxt *bp) struct hwrm_port_phy_qcaps_output *resp = bp->hwrm_cmd_resp_addr; struct bnxt_link_info *link_info = &bp->link_info; - bp->flags &= ~BNXT_FLAG_EEE_CAP; - if (bp->test_info) - bp->test_info->flags &= ~(BNXT_TEST_FL_EXT_LPBK | - BNXT_TEST_FL_AN_PHY_LPBK); if (bp->hwrm_spec_code < 0x10201) return 0; @@ -9146,31 +9183,17 @@ static int bnxt_hwrm_phy_qcaps(struct bnxt *bp) if (rc) goto hwrm_phy_qcaps_exit; + bp->phy_flags = resp->flags; if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_EEE_SUPPORTED) { struct ethtool_eee *eee = &bp->eee; u16 fw_speeds = le16_to_cpu(resp->supported_speeds_eee_mode); - bp->flags |= BNXT_FLAG_EEE_CAP; eee->supported = _bnxt_fw_to_ethtool_adv_spds(fw_speeds, 0); bp->lpi_tmr_lo = le32_to_cpu(resp->tx_lpi_timer_low) & PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_LOW_MASK; bp->lpi_tmr_hi = le32_to_cpu(resp->valid_tx_lpi_timer_high) & PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_HIGH_MASK; } - if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_EXTERNAL_LPBK_SUPPORTED) { - if (bp->test_info) - bp->test_info->flags |= BNXT_TEST_FL_EXT_LPBK; - } - if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_AUTONEG_LPBK_SUPPORTED) { - if (bp->test_info) - bp->test_info->flags |= BNXT_TEST_FL_AN_PHY_LPBK; - } - if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_SHARED_PHY_CFG_SUPPORTED) { - if (BNXT_PF(bp)) - bp->fw_cap |= BNXT_FW_CAP_SHARED_PORT_CFG; - } - if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_CUMULATIVE_COUNTERS_ON_RESET) - bp->fw_cap |= BNXT_FW_CAP_PORT_STATS_NO_RESET; if (bp->hwrm_spec_code >= 0x10a01) { if (bnxt_phy_qcaps_no_speed(resp)) { @@ -9261,7 +9284,7 @@ int bnxt_update_link(struct bnxt *bp, bool chng_link_state) PORT_PHY_QCFG_RESP_PHY_ADDR_MASK; link_info->module_status = resp->module_status; - if (bp->flags & BNXT_FLAG_EEE_CAP) { + if (bp->phy_flags & BNXT_PHY_FL_EEE_CAP) { struct ethtool_eee *eee = &bp->eee; u16 fw_speeds; @@ -9497,7 +9520,8 @@ static int bnxt_hwrm_shutdown_link(struct bnxt *bp) if (!BNXT_SINGLE_PF(bp)) return 0; - if (pci_num_vf(bp->pdev)) + if (pci_num_vf(bp->pdev) && + !(bp->phy_flags & BNXT_PHY_FL_FW_MANAGED_LKDN)) return 0; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_PHY_CFG, -1, -1); @@ -9782,7 +9806,9 @@ static ssize_t bnxt_show_temp(struct device *dev, if (!rc) len = sprintf(buf, "%u\n", resp->temp * 1000); /* display millidegree */ mutex_unlock(&bp->hwrm_cmd_lock); - return rc ?: len; + if (rc) + return rc; + return len; } static SENSOR_DEVICE_ATTR(temp1_input, 0444, bnxt_show_temp, NULL, 0); @@ -9839,7 +9865,7 @@ static bool bnxt_eee_config_ok(struct bnxt *bp) struct ethtool_eee *eee = &bp->eee; struct bnxt_link_info *link_info = &bp->link_info; - if (!(bp->flags & BNXT_FLAG_EEE_CAP)) + if (!(bp->phy_flags & BNXT_PHY_FL_EEE_CAP)) return true; if (eee->eee_enabled) { @@ -10486,7 +10512,7 @@ static void bnxt_set_rx_mode(struct net_device *dev) CFA_L2_SET_RX_MASK_REQ_MASK_ALL_MCAST | CFA_L2_SET_RX_MASK_REQ_MASK_BCAST); - if ((dev->flags & IFF_PROMISC) && bnxt_promisc_ok(bp)) + if (dev->flags & IFF_PROMISC) mask |= CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS; uc_update = bnxt_uc_list_updated(bp); @@ -10562,6 +10588,9 @@ static int bnxt_cfg_rx_mode(struct bnxt *bp) } skip_uc: + if ((vnic->rx_mask & CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS) && + !bnxt_promisc_ok(bp)) + vnic->rx_mask &= ~CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS; rc = bnxt_hwrm_cfa_l2_set_rx_mask(bp, 0); if (rc && vnic->mc_list_count) { netdev_info(bp->dev, "Failed setting MC filters rc: %d, turning on ALL_MCAST mode\n", @@ -10756,6 +10785,40 @@ static int bnxt_set_features(struct net_device *dev, netdev_features_t features) return rc; } +static netdev_features_t bnxt_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + struct bnxt *bp; + __be16 udp_port; + u8 l4_proto = 0; + + features = vlan_features_check(skb, features); + if (!skb->encapsulation) + return features; + + switch (vlan_get_protocol(skb)) { + case htons(ETH_P_IP): + l4_proto = ip_hdr(skb)->protocol; + break; + case htons(ETH_P_IPV6): + l4_proto = ipv6_hdr(skb)->nexthdr; + break; + default: + return features; + } + + if (l4_proto != IPPROTO_UDP) + return features; + + bp = netdev_priv(dev); + /* For UDP, we can only handle 1 Vxlan port and 1 Geneve port. */ + udp_port = udp_hdr(skb)->dest; + if (udp_port == bp->vxlan_port || udp_port == bp->nge_port) + return features; + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); +} + int bnxt_dbg_hwrm_rd_reg(struct bnxt *bp, u32 reg_off, u16 num_words, u32 *reg_buf) { @@ -12263,10 +12326,13 @@ static int bnxt_udp_tunnel_sync(struct net_device *netdev, unsigned int table) unsigned int cmd; udp_tunnel_nic_get_port(netdev, table, 0, &ti); - if (ti.type == UDP_TUNNEL_TYPE_VXLAN) + if (ti.type == UDP_TUNNEL_TYPE_VXLAN) { + bp->vxlan_port = ti.port; cmd = TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN; - else + } else { + bp->nge_port = ti.port; cmd = TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE; + } if (ti.port) return bnxt_hwrm_tunnel_dst_port_alloc(bp, ti.port, cmd); @@ -12366,6 +12432,7 @@ static const struct net_device_ops bnxt_netdev_ops = { .ndo_change_mtu = bnxt_change_mtu, .ndo_fix_features = bnxt_fix_features, .ndo_set_features = bnxt_set_features, + .ndo_features_check = bnxt_features_check, .ndo_tx_timeout = bnxt_tx_timeout, #ifdef CONFIG_BNXT_SRIOV .ndo_get_vf_config = bnxt_get_vf_config, @@ -12434,12 +12501,17 @@ static int bnxt_probe_phy(struct bnxt *bp, bool fw_dflt) int rc = 0; struct bnxt_link_info *link_info = &bp->link_info; + bp->phy_flags = 0; rc = bnxt_hwrm_phy_qcaps(bp); if (rc) { netdev_err(bp->dev, "Probe phy can't get phy capabilities (rc: %x)\n", rc); return rc; } + if (bp->phy_flags & BNXT_PHY_FL_NO_FCS) + bp->dev->priv_flags |= IFF_SUPP_NOFCS; + else + bp->dev->priv_flags &= ~IFF_SUPP_NOFCS; if (!fw_dflt) return 0; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 29061c577baa..24d2ad6a8740 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1341,9 +1341,6 @@ struct bnxt_led_info { struct bnxt_test_info { u8 offline_mask; - u8 flags; -#define BNXT_TEST_FL_EXT_LPBK 0x1 -#define BNXT_TEST_FL_AN_PHY_LPBK 0x2 u16 timeout; char string[BNXT_MAX_TEST][ETH_GSTRING_LEN]; }; @@ -1693,7 +1690,6 @@ struct bnxt { #define BNXT_FLAG_SHARED_RINGS 0x200 #define BNXT_FLAG_PORT_STATS 0x400 #define BNXT_FLAG_UDP_RSS_CAP 0x800 - #define BNXT_FLAG_EEE_CAP 0x1000 #define BNXT_FLAG_NEW_RSS_CAP 0x2000 #define BNXT_FLAG_WOL_CAP 0x4000 #define BNXT_FLAG_ROCEV1_CAP 0x8000 @@ -1720,8 +1716,10 @@ struct bnxt { #define BNXT_NPAR(bp) ((bp)->port_partition_type) #define BNXT_MH(bp) ((bp)->flags & BNXT_FLAG_MULTI_HOST) #define BNXT_SINGLE_PF(bp) (BNXT_PF(bp) && !BNXT_NPAR(bp) && !BNXT_MH(bp)) +#define BNXT_SH_PORT_CFG_OK(bp) (BNXT_PF(bp) && \ + ((bp)->phy_flags & BNXT_PHY_FL_SHARED_PORT_CFG)) #define BNXT_PHY_CFG_ABLE(bp) ((BNXT_SINGLE_PF(bp) || \ - ((bp)->fw_cap & BNXT_FW_CAP_SHARED_PORT_CFG)) && \ + BNXT_SH_PORT_CFG_OK(bp)) && \ (bp)->link_info.phy_state == BNXT_PHY_STATE_ENABLED) #define BNXT_CHIP_TYPE_NITRO_A0(bp) ((bp)->flags & BNXT_FLAG_CHIP_NITRO_A0) #define BNXT_RX_PAGE_MODE(bp) ((bp)->flags & BNXT_FLAG_RX_PAGE_MODE) @@ -1871,11 +1869,9 @@ struct bnxt { #define BNXT_FW_CAP_EXT_STATS_SUPPORTED 0x00040000 #define BNXT_FW_CAP_ERR_RECOVER_RELOAD 0x00100000 #define BNXT_FW_CAP_HOT_RESET 0x00200000 - #define BNXT_FW_CAP_SHARED_PORT_CFG 0x00400000 #define BNXT_FW_CAP_VLAN_RX_STRIP 0x01000000 #define BNXT_FW_CAP_VLAN_TX_INSERT 0x02000000 #define BNXT_FW_CAP_EXT_HW_STATS_SUPPORTED 0x04000000 - #define BNXT_FW_CAP_PORT_STATS_NO_RESET 0x10000000 #define BNXT_FW_CAP_RING_MONITOR 0x40000000 #define BNXT_NEW_RM(bp) ((bp)->fw_cap & BNXT_FW_CAP_NEW_RM) @@ -1918,6 +1914,8 @@ struct bnxt { u16 vxlan_fw_dst_port_id; u16 nge_fw_dst_port_id; + __be16 vxlan_port; + __be16 nge_port; u8 port_partition_type; u8 port_count; u16 br_mode; @@ -2010,6 +2008,17 @@ struct bnxt { u32 lpi_tmr_lo; u32 lpi_tmr_hi; + /* copied from flags in hwrm_port_phy_qcaps_output */ + u8 phy_flags; +#define BNXT_PHY_FL_EEE_CAP PORT_PHY_QCAPS_RESP_FLAGS_EEE_SUPPORTED +#define BNXT_PHY_FL_EXT_LPBK PORT_PHY_QCAPS_RESP_FLAGS_EXTERNAL_LPBK_SUPPORTED +#define BNXT_PHY_FL_AN_PHY_LPBK PORT_PHY_QCAPS_RESP_FLAGS_AUTONEG_LPBK_SUPPORTED +#define BNXT_PHY_FL_SHARED_PORT_CFG PORT_PHY_QCAPS_RESP_FLAGS_SHARED_PHY_CFG_SUPPORTED +#define BNXT_PHY_FL_PORT_STATS_NO_RESET PORT_PHY_QCAPS_RESP_FLAGS_CUMULATIVE_COUNTERS_ON_RESET +#define BNXT_PHY_FL_NO_PHY_LPBK PORT_PHY_QCAPS_RESP_FLAGS_LOCAL_LPBK_NOT_SUPPORTED +#define BNXT_PHY_FL_FW_MANAGED_LKDN PORT_PHY_QCAPS_RESP_FLAGS_FW_MANAGED_LINK_DOWN +#define BNXT_PHY_FL_NO_FCS PORT_PHY_QCAPS_RESP_FLAGS_NO_FCS + u8 num_tests; struct bnxt_test_info *test_info; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 3b66e300c962..c664ec52ebcf 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -2912,7 +2912,7 @@ static int bnxt_set_eee(struct net_device *dev, struct ethtool_eee *edata) if (!BNXT_PHY_CFG_ABLE(bp)) return -EOPNOTSUPP; - if (!(bp->flags & BNXT_FLAG_EEE_CAP)) + if (!(bp->phy_flags & BNXT_PHY_FL_EEE_CAP)) return -EOPNOTSUPP; mutex_lock(&bp->link_lock); @@ -2963,7 +2963,7 @@ static int bnxt_get_eee(struct net_device *dev, struct ethtool_eee *edata) { struct bnxt *bp = netdev_priv(dev); - if (!(bp->flags & BNXT_FLAG_EEE_CAP)) + if (!(bp->phy_flags & BNXT_PHY_FL_EEE_CAP)) return -EOPNOTSUPP; *edata = bp->eee; @@ -3215,7 +3215,7 @@ static int bnxt_disable_an_for_lpbk(struct bnxt *bp, int rc; if (!link_info->autoneg || - (bp->test_info->flags & BNXT_TEST_FL_AN_PHY_LPBK)) + (bp->phy_flags & BNXT_PHY_FL_AN_PHY_LPBK)) return 0; rc = bnxt_query_force_speeds(bp, &fw_advertising); @@ -3416,7 +3416,7 @@ static void bnxt_self_test(struct net_device *dev, struct ethtool_test *etest, } if ((etest->flags & ETH_TEST_FL_EXTERNAL_LB) && - (bp->test_info->flags & BNXT_TEST_FL_EXT_LPBK)) + (bp->phy_flags & BNXT_PHY_FL_EXT_LPBK)) do_ext_lpbk = true; if (etest->flags & ETH_TEST_FL_OFFLINE) { diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index a217316228f4..eb00a219aa51 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -49,10 +49,6 @@ static int bnxt_hwrm_fwd_async_event_cmpl(struct bnxt *bp, static int bnxt_vf_ndo_prep(struct bnxt *bp, int vf_id) { - if (!test_bit(BNXT_STATE_OPEN, &bp->state)) { - netdev_err(bp->dev, "vf ndo called though PF is down\n"); - return -EINVAL; - } if (!bp->pf.active_vfs) { netdev_err(bp->dev, "vf ndo called though sriov is disabled\n"); return -EINVAL; @@ -113,7 +109,7 @@ static int bnxt_hwrm_func_qcfg_flags(struct bnxt *bp, struct bnxt_vf_info *vf) int rc; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCFG, -1, -1); - req.fid = cpu_to_le16(vf->fw_fid); + req.fid = cpu_to_le16(BNXT_PF(bp) ? vf->fw_fid : 0xffff); mutex_lock(&bp->hwrm_cmd_lock); rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); if (rc) { @@ -125,9 +121,9 @@ static int bnxt_hwrm_func_qcfg_flags(struct bnxt *bp, struct bnxt_vf_info *vf) return 0; } -static bool bnxt_is_trusted_vf(struct bnxt *bp, struct bnxt_vf_info *vf) +bool bnxt_is_trusted_vf(struct bnxt *bp, struct bnxt_vf_info *vf) { - if (!(bp->fw_cap & BNXT_FW_CAP_TRUSTED_VF)) + if (BNXT_PF(bp) && !(bp->fw_cap & BNXT_FW_CAP_TRUSTED_VF)) return !!(vf->flags & BNXT_VF_TRUST); bnxt_hwrm_func_qcfg_flags(bp, vf); @@ -1120,10 +1116,38 @@ void bnxt_hwrm_exec_fwd_req(struct bnxt *bp) } } +int bnxt_approve_mac(struct bnxt *bp, u8 *mac, bool strict) +{ + struct hwrm_func_vf_cfg_input req = {0}; + int rc = 0; + + if (!BNXT_VF(bp)) + return 0; + + if (bp->hwrm_spec_code < 0x10202) { + if (is_valid_ether_addr(bp->vf.mac_addr)) + rc = -EADDRNOTAVAIL; + goto mac_done; + } + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_VF_CFG, -1, -1); + req.enables = cpu_to_le32(FUNC_VF_CFG_REQ_ENABLES_DFLT_MAC_ADDR); + memcpy(req.dflt_mac_addr, mac, ETH_ALEN); + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +mac_done: + if (rc && strict) { + rc = -EADDRNOTAVAIL; + netdev_warn(bp->dev, "VF MAC address %pM not approved by the PF\n", + mac); + return rc; + } + return 0; +} + void bnxt_update_vf_mac(struct bnxt *bp) { struct hwrm_func_qcaps_input req = {0}; struct hwrm_func_qcaps_output *resp = bp->hwrm_cmd_resp_addr; + bool inform_pf = false; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCAPS, -1, -1); req.fid = cpu_to_le16(0xffff); @@ -1139,42 +1163,24 @@ void bnxt_update_vf_mac(struct bnxt *bp) * default but the stored zero MAC will allow the VF user to change * the random MAC address using ndo_set_mac_address() if he wants. */ - if (!ether_addr_equal(resp->mac_address, bp->vf.mac_addr)) + if (!ether_addr_equal(resp->mac_address, bp->vf.mac_addr)) { memcpy(bp->vf.mac_addr, resp->mac_address, ETH_ALEN); + /* This means we are now using our own MAC address, let + * the PF know about this MAC address. + */ + if (!is_valid_ether_addr(bp->vf.mac_addr)) + inform_pf = true; + } /* overwrite netdev dev_addr with admin VF MAC */ if (is_valid_ether_addr(bp->vf.mac_addr)) memcpy(bp->dev->dev_addr, bp->vf.mac_addr, ETH_ALEN); update_vf_mac_exit: mutex_unlock(&bp->hwrm_cmd_lock); + if (inform_pf) + bnxt_approve_mac(bp, bp->dev->dev_addr, false); } -int bnxt_approve_mac(struct bnxt *bp, u8 *mac, bool strict) -{ - struct hwrm_func_vf_cfg_input req = {0}; - int rc = 0; - - if (!BNXT_VF(bp)) - return 0; - - if (bp->hwrm_spec_code < 0x10202) { - if (is_valid_ether_addr(bp->vf.mac_addr)) - rc = -EADDRNOTAVAIL; - goto mac_done; - } - bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_VF_CFG, -1, -1); - req.enables = cpu_to_le32(FUNC_VF_CFG_REQ_ENABLES_DFLT_MAC_ADDR); - memcpy(req.dflt_mac_addr, mac, ETH_ALEN); - rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); -mac_done: - if (rc && strict) { - rc = -EADDRNOTAVAIL; - netdev_warn(bp->dev, "VF MAC address %pM not approved by the PF\n", - mac); - return rc; - } - return 0; -} #else int bnxt_cfg_hw_sriov(struct bnxt *bp, int *num_vfs, bool reset) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h index 629641bf6fc5..995535e4c11b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h @@ -34,6 +34,7 @@ int bnxt_set_vf_vlan(struct net_device *, int, u16, u8, __be16); int bnxt_set_vf_bw(struct net_device *, int, int, int); int bnxt_set_vf_link_state(struct net_device *, int, int); int bnxt_set_vf_spoofchk(struct net_device *, int, bool); +bool bnxt_is_trusted_vf(struct bnxt *bp, struct bnxt_vf_info *vf); int bnxt_set_vf_trust(struct net_device *dev, int vf_id, bool trust); int bnxt_sriov_configure(struct pci_dev *pdev, int num_vfs); int bnxt_cfg_hw_sriov(struct bnxt *bp, int *num_vfs, bool reset); diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index a3f5b80888e5..ef3f1e92632f 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -33,7 +33,6 @@ static int chcr_get_nfrags_to_send(struct sk_buff *skb, u32 start, u32 len) if (unlikely(start < skb_linear_data_len)) { frag_size = min(len, skb_linear_data_len - start); - start = 0; } else { start -= skb_linear_data_len; @@ -873,10 +872,10 @@ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, } /* update receive window */ if (first_wr || tx_info->prev_win != tcp_win) { - pos = chcr_write_cpl_set_tcb_ulp(tx_info, q, tx_info->tid, pos, - TCB_RCV_WND_W, - TCB_RCV_WND_V(TCB_RCV_WND_M), - TCB_RCV_WND_V(tcp_win), 0); + chcr_write_cpl_set_tcb_ulp(tx_info, q, tx_info->tid, pos, + TCB_RCV_WND_W, + TCB_RCV_WND_V(TCB_RCV_WND_M), + TCB_RCV_WND_V(tcp_win), 0); tx_info->prev_win = tcp_win; cpl++; } @@ -1485,7 +1484,6 @@ static int chcr_ktls_tx_plaintxt(struct chcr_ktls_info *tx_info, wr->op_to_compl = htonl(FW_WR_OP_V(FW_ULPTX_WR)); wr->flowid_len16 = htonl(wr_mid | FW_WR_LEN16_V(len16)); wr->cookie = 0; - pos += sizeof(*wr); /* ULP_TXPKT */ ulptx = (struct ulp_txpkt *)(wr + 1); ulptx->cmd_dest = htonl(ULPTX_CMD_V(ULP_TX_PKT) | diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index f08c420a5803..2768c78528a5 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -431,7 +431,8 @@ static void prestera_port_handle_event(struct prestera_switch *sw, netif_carrier_on(port->dev); if (!delayed_work_pending(caching_dw)) queue_delayed_work(prestera_wq, caching_dw, 0); - } else { + } else if (netif_running(port->dev) && + netif_carrier_ok(port->dev)) { netif_carrier_off(port->dev); if (delayed_work_pending(caching_dw)) cancel_delayed_work(caching_dw); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c index 3e19b1721303..0399a396d166 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c @@ -96,7 +96,7 @@ int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, } if (!vport->egress.acl) { - vport->egress.acl = esw_acl_table_create(esw, vport->vport, + vport->egress.acl = esw_acl_table_create(esw, vport, MLX5_FLOW_NAMESPACE_ESW_EGRESS, table_size); if (IS_ERR(vport->egress.acl)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c index 26b37a0f8762..505bf811984a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c @@ -148,7 +148,7 @@ static void esw_acl_egress_ofld_groups_destroy(struct mlx5_vport *vport) esw_acl_egress_vlan_grp_destroy(vport); } -static bool esw_acl_egress_needed(const struct mlx5_eswitch *esw, u16 vport_num) +static bool esw_acl_egress_needed(struct mlx5_eswitch *esw, u16 vport_num) { return mlx5_eswitch_is_vf_vport(esw, vport_num) || mlx5_esw_is_sf_vport(esw, vport_num); } @@ -171,7 +171,7 @@ int esw_acl_egress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport table_size++; if (MLX5_CAP_GEN(esw->dev, prio_tag_required)) table_size++; - vport->egress.acl = esw_acl_table_create(esw, vport->vport, + vport->egress.acl = esw_acl_table_create(esw, vport, MLX5_FLOW_NAMESPACE_ESW_EGRESS, table_size); if (IS_ERR(vport->egress.acl)) { err = PTR_ERR(vport->egress.acl); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c index 4a369669e51e..45b839116212 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c @@ -6,14 +6,14 @@ #include "helper.h" struct mlx5_flow_table * -esw_acl_table_create(struct mlx5_eswitch *esw, u16 vport_num, int ns, int size) +esw_acl_table_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport, int ns, int size) { struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_core_dev *dev = esw->dev; struct mlx5_flow_namespace *root_ns; struct mlx5_flow_table *acl; int acl_supported; - int vport_index; + u16 vport_num; int err; acl_supported = (ns == MLX5_FLOW_NAMESPACE_ESW_INGRESS) ? @@ -23,11 +23,11 @@ esw_acl_table_create(struct mlx5_eswitch *esw, u16 vport_num, int ns, int size) if (!acl_supported) return ERR_PTR(-EOPNOTSUPP); + vport_num = vport->vport; esw_debug(dev, "Create vport[%d] %s ACL table\n", vport_num, ns == MLX5_FLOW_NAMESPACE_ESW_INGRESS ? "ingress" : "egress"); - vport_index = mlx5_eswitch_vport_num_to_index(esw, vport_num); - root_ns = mlx5_get_flow_vport_acl_namespace(dev, ns, vport_index); + root_ns = mlx5_get_flow_vport_acl_namespace(dev, ns, vport->index); if (!root_ns) { esw_warn(dev, "Failed to get E-Switch root namespace for vport (%d)\n", vport_num); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h index 8dc4cab66a71..a47063fab57e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h @@ -8,7 +8,7 @@ /* General acl helper functions */ struct mlx5_flow_table * -esw_acl_table_create(struct mlx5_eswitch *esw, u16 vport_num, int ns, int size); +esw_acl_table_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport, int ns, int size); /* Egress acl helper functions */ void esw_acl_egress_table_destroy(struct mlx5_vport *vport); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c index d64fad2823e7..f75b86abaf1c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c @@ -177,7 +177,7 @@ int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, } if (!vport->ingress.acl) { - vport->ingress.acl = esw_acl_table_create(esw, vport->vport, + vport->ingress.acl = esw_acl_table_create(esw, vport, MLX5_FLOW_NAMESPACE_ESW_INGRESS, table_size); if (IS_ERR(vport->ingress.acl)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c index 548c005ea633..39e948bc1204 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c @@ -7,7 +7,7 @@ #include "ofld.h" static bool -esw_acl_ingress_prio_tag_enabled(const struct mlx5_eswitch *esw, +esw_acl_ingress_prio_tag_enabled(struct mlx5_eswitch *esw, const struct mlx5_vport *vport) { return (MLX5_CAP_GEN(esw->dev, prio_tag_required) && @@ -255,7 +255,7 @@ int esw_acl_ingress_ofld_setup(struct mlx5_eswitch *esw, if (esw_acl_ingress_prio_tag_enabled(esw, vport)) num_ftes++; - vport->ingress.acl = esw_acl_table_create(esw, vport->vport, + vport->ingress.acl = esw_acl_table_create(esw, vport, MLX5_FLOW_NAMESPACE_ESW_INGRESS, num_ftes); if (IS_ERR(vport->ingress.acl)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c index 7bfc84238b3d..1703384eca95 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c @@ -14,8 +14,7 @@ mlx5_esw_get_port_parent_id(struct mlx5_core_dev *dev, struct netdev_phys_item_i memcpy(ppid->id, &parent_id, sizeof(parent_id)); } -static bool -mlx5_esw_devlink_port_supported(const struct mlx5_eswitch *esw, u16 vport_num) +static bool mlx5_esw_devlink_port_supported(struct mlx5_eswitch *esw, u16 vport_num) { return vport_num == MLX5_VPORT_UPLINK || (mlx5_core_is_ecpf(esw->dev) && vport_num == MLX5_VPORT_PF) || @@ -124,7 +123,7 @@ struct devlink_port *mlx5_esw_offloads_devlink_port(struct mlx5_eswitch *esw, u1 } int mlx5_esw_devlink_sf_port_register(struct mlx5_eswitch *esw, struct devlink_port *dl_port, - u16 vport_num, u32 sfnum) + u16 vport_num, u32 controller, u32 sfnum) { struct mlx5_core_dev *dev = esw->dev; struct netdev_phys_item_id ppid = {}; @@ -142,7 +141,7 @@ int mlx5_esw_devlink_sf_port_register(struct mlx5_eswitch *esw, struct devlink_p mlx5_esw_get_port_parent_id(dev, &ppid); memcpy(dl_port->attrs.switch_id.id, &ppid.id[0], ppid.id_len); dl_port->attrs.switch_id.id_len = ppid.id_len; - devlink_port_attrs_pci_sf_set(dl_port, 0, pfnum, sfnum); + devlink_port_attrs_pci_sf_set(dl_port, controller, pfnum, sfnum, !!controller); devlink = priv_to_devlink(dev); dl_port_index = mlx5_esw_vport_to_devlink_port_index(dev, vport_num); err = devlink_port_register(devlink, dl_port, dl_port_index); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c index 8ab1224653a4..d9041b16611d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c @@ -216,7 +216,8 @@ static void esw_destroy_legacy_table(struct mlx5_eswitch *esw) int esw_legacy_enable(struct mlx5_eswitch *esw) { struct mlx5_vport *vport; - int ret, i; + unsigned long i; + int ret; ret = esw_create_legacy_table(esw); if (ret) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 1bb229ecd43b..570f2280823c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -88,20 +88,17 @@ struct mlx5_eswitch *mlx5_devlink_eswitch_get(struct devlink *devlink) struct mlx5_vport *__must_check mlx5_eswitch_get_vport(struct mlx5_eswitch *esw, u16 vport_num) { - u16 idx; + struct mlx5_vport *vport; if (!esw || !MLX5_CAP_GEN(esw->dev, vport_group_manager)) return ERR_PTR(-EPERM); - idx = mlx5_eswitch_vport_num_to_index(esw, vport_num); - - if (idx > esw->total_vports - 1) { - esw_debug(esw->dev, "vport out of range: num(0x%x), idx(0x%x)\n", - vport_num, idx); + vport = xa_load(&esw->vports, vport_num); + if (!vport) { + esw_debug(esw->dev, "vport out of range: num(0x%x)\n", vport_num); return ERR_PTR(-EINVAL); } - - return &esw->vports[idx]; + return vport; } static int arm_vport_context_events_cmd(struct mlx5_core_dev *dev, u16 vport, @@ -345,9 +342,10 @@ static void update_allmulti_vports(struct mlx5_eswitch *esw, { u8 *mac = vaddr->node.addr; struct mlx5_vport *vport; - u16 i, vport_num; + unsigned long i; + u16 vport_num; - mlx5_esw_for_all_vports(esw, i, vport) { + mlx5_esw_for_each_vport(esw, i, vport) { struct hlist_head *vport_hash = vport->mc_list; struct vport_addr *iter_vaddr = l2addr_hash_find(vport_hash, @@ -1175,7 +1173,7 @@ static void mlx5_eswitch_event_handlers_unregister(struct mlx5_eswitch *esw) static void mlx5_eswitch_clear_vf_vports_info(struct mlx5_eswitch *esw) { struct mlx5_vport *vport; - int i; + unsigned long i; mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs) { memset(&vport->qos, 0, sizeof(vport->qos)); @@ -1213,20 +1211,25 @@ void mlx5_eswitch_unload_vport(struct mlx5_eswitch *esw, u16 vport_num) void mlx5_eswitch_unload_vf_vports(struct mlx5_eswitch *esw, u16 num_vfs) { - int i; + struct mlx5_vport *vport; + unsigned long i; - mlx5_esw_for_each_vf_vport_num_reverse(esw, i, num_vfs) - mlx5_eswitch_unload_vport(esw, i); + mlx5_esw_for_each_vf_vport(esw, i, vport, num_vfs) { + if (!vport->enabled) + continue; + mlx5_eswitch_unload_vport(esw, vport->vport); + } } int mlx5_eswitch_load_vf_vports(struct mlx5_eswitch *esw, u16 num_vfs, enum mlx5_eswitch_vport_event enabled_events) { + struct mlx5_vport *vport; + unsigned long i; int err; - int i; - mlx5_esw_for_each_vf_vport_num(esw, i, num_vfs) { - err = mlx5_eswitch_load_vport(esw, i, enabled_events); + mlx5_esw_for_each_vf_vport(esw, i, vport, num_vfs) { + err = mlx5_eswitch_load_vport(esw, vport->vport, enabled_events); if (err) goto vf_err; } @@ -1234,7 +1237,7 @@ int mlx5_eswitch_load_vf_vports(struct mlx5_eswitch *esw, u16 num_vfs, return 0; vf_err: - mlx5_eswitch_unload_vf_vports(esw, i - 1); + mlx5_eswitch_unload_vf_vports(esw, num_vfs); return err; } @@ -1563,24 +1566,161 @@ void mlx5_eswitch_disable(struct mlx5_eswitch *esw, bool clear_vf) up_write(&esw->mode_lock); } +static int mlx5_query_hca_cap_host_pf(struct mlx5_core_dev *dev, void *out) +{ + u16 opmod = (MLX5_CAP_GENERAL << 1) | (HCA_CAP_OPMOD_GET_MAX & 0x01); + u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)] = {}; + + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, op_mod, opmod); + MLX5_SET(query_hca_cap_in, in, function_id, MLX5_VPORT_PF); + MLX5_SET(query_hca_cap_in, in, other_function, true); + return mlx5_cmd_exec_inout(dev, query_hca_cap, in, out); +} + +int mlx5_esw_sf_max_hpf_functions(struct mlx5_core_dev *dev, u16 *max_sfs, u16 *sf_base_id) + +{ + int query_out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + void *query_ctx; + void *hca_caps; + int err; + + if (!mlx5_core_is_ecpf(dev)) { + *max_sfs = 0; + return 0; + } + + query_ctx = kzalloc(query_out_sz, GFP_KERNEL); + if (!query_ctx) + return -ENOMEM; + + err = mlx5_query_hca_cap_host_pf(dev, query_ctx); + if (err) + goto out_free; + + hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability); + *max_sfs = MLX5_GET(cmd_hca_cap, hca_caps, max_num_sf); + *sf_base_id = MLX5_GET(cmd_hca_cap, hca_caps, sf_base_id); + +out_free: + kfree(query_ctx); + return err; +} + +static int mlx5_esw_vport_alloc(struct mlx5_eswitch *esw, struct mlx5_core_dev *dev, + int index, u16 vport_num) +{ + struct mlx5_vport *vport; + int err; + + vport = kzalloc(sizeof(*vport), GFP_KERNEL); + if (!vport) + return -ENOMEM; + + vport->dev = esw->dev; + vport->vport = vport_num; + vport->index = index; + vport->info.link_state = MLX5_VPORT_ADMIN_STATE_AUTO; + INIT_WORK(&vport->vport_change_handler, esw_vport_change_handler); + err = xa_insert(&esw->vports, vport_num, vport, GFP_KERNEL); + if (err) + goto insert_err; + + esw->total_vports++; + return 0; + +insert_err: + kfree(vport); + return err; +} + +static void mlx5_esw_vport_free(struct mlx5_eswitch *esw, struct mlx5_vport *vport) +{ + xa_erase(&esw->vports, vport->vport); + kfree(vport); +} + +static void mlx5_esw_vports_cleanup(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + + mlx5_esw_for_each_vport(esw, i, vport) + mlx5_esw_vport_free(esw, vport); + xa_destroy(&esw->vports); +} + +static int mlx5_esw_vports_init(struct mlx5_eswitch *esw) +{ + struct mlx5_core_dev *dev = esw->dev; + u16 max_host_pf_sfs; + u16 base_sf_num; + int idx = 0; + int err; + int i; + + xa_init(&esw->vports); + + err = mlx5_esw_vport_alloc(esw, dev, idx, MLX5_VPORT_PF); + if (err) + goto err; + if (esw->first_host_vport == MLX5_VPORT_PF) + xa_set_mark(&esw->vports, idx, MLX5_ESW_VPT_HOST_FN); + idx++; + + for (i = 0; i < mlx5_core_max_vfs(dev); i++) { + err = mlx5_esw_vport_alloc(esw, dev, idx, idx); + if (err) + goto err; + xa_set_mark(&esw->vports, idx, MLX5_ESW_VPT_VF); + xa_set_mark(&esw->vports, idx, MLX5_ESW_VPT_HOST_FN); + idx++; + } + base_sf_num = mlx5_sf_start_function_id(dev); + for (i = 0; i < mlx5_sf_max_functions(dev); i++) { + err = mlx5_esw_vport_alloc(esw, dev, idx, base_sf_num + i); + if (err) + goto err; + xa_set_mark(&esw->vports, base_sf_num + i, MLX5_ESW_VPT_SF); + idx++; + } + + err = mlx5_esw_sf_max_hpf_functions(dev, &max_host_pf_sfs, &base_sf_num); + if (err) + goto err; + for (i = 0; i < max_host_pf_sfs; i++) { + err = mlx5_esw_vport_alloc(esw, dev, idx, base_sf_num + i); + if (err) + goto err; + xa_set_mark(&esw->vports, base_sf_num + i, MLX5_ESW_VPT_SF); + idx++; + } + + if (mlx5_ecpf_vport_exists(dev)) { + err = mlx5_esw_vport_alloc(esw, dev, idx, MLX5_VPORT_ECPF); + if (err) + goto err; + idx++; + } + err = mlx5_esw_vport_alloc(esw, dev, idx, MLX5_VPORT_UPLINK); + if (err) + goto err; + return 0; + +err: + mlx5_esw_vports_cleanup(esw); + return err; +} + int mlx5_eswitch_init(struct mlx5_core_dev *dev) { struct mlx5_eswitch *esw; - struct mlx5_vport *vport; - int total_vports; - int err, i; + int err; if (!MLX5_VPORT_MANAGER(dev)) return 0; - total_vports = mlx5_eswitch_get_total_vports(dev); - - esw_info(dev, - "Total vports %d, per vport: max uc(%d) max mc(%d)\n", - total_vports, - MLX5_MAX_UC_PER_VPORT(dev), - MLX5_MAX_MC_PER_VPORT(dev)); - esw = kzalloc(sizeof(*esw), GFP_KERNEL); if (!esw) return -ENOMEM; @@ -1595,18 +1735,13 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) goto abort; } - esw->vports = kcalloc(total_vports, sizeof(struct mlx5_vport), - GFP_KERNEL); - if (!esw->vports) { - err = -ENOMEM; + err = mlx5_esw_vports_init(esw); + if (err) goto abort; - } - - esw->total_vports = total_vports; err = esw_offloads_init_reps(esw); if (err) - goto abort; + goto reps_err; mutex_init(&esw->offloads.encap_tbl_lock); hash_init(esw->offloads.encap_tbl); @@ -1619,25 +1754,25 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) mutex_init(&esw->state_lock); init_rwsem(&esw->mode_lock); - mlx5_esw_for_all_vports(esw, i, vport) { - vport->vport = mlx5_eswitch_index_to_vport_num(esw, i); - vport->info.link_state = MLX5_VPORT_ADMIN_STATE_AUTO; - vport->dev = dev; - INIT_WORK(&vport->vport_change_handler, - esw_vport_change_handler); - } - esw->enabled_vports = 0; esw->mode = MLX5_ESWITCH_NONE; esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE; dev->priv.eswitch = esw; BLOCKING_INIT_NOTIFIER_HEAD(&esw->n_head); + + esw_info(dev, + "Total vports %d, per vport: max uc(%d) max mc(%d)\n", + esw->total_vports, + MLX5_MAX_UC_PER_VPORT(dev), + MLX5_MAX_MC_PER_VPORT(dev)); return 0; + +reps_err: + mlx5_esw_vports_cleanup(esw); abort: if (esw->work_queue) destroy_workqueue(esw->work_queue); - kfree(esw->vports); kfree(esw); return err; } @@ -1659,7 +1794,7 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) mutex_destroy(&esw->offloads.encap_tbl_lock); mutex_destroy(&esw->offloads.decap_tbl_lock); esw_offloads_cleanup_reps(esw); - kfree(esw->vports); + mlx5_esw_vports_cleanup(esw); kfree(esw); } @@ -1718,8 +1853,29 @@ int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw, return err; } +static bool mlx5_esw_check_port_type(struct mlx5_eswitch *esw, u16 vport_num, xa_mark_t mark) +{ + struct mlx5_vport *vport; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR(vport)) + return false; + + return xa_get_mark(&esw->vports, vport_num, mark); +} + +bool mlx5_eswitch_is_vf_vport(struct mlx5_eswitch *esw, u16 vport_num) +{ + return mlx5_esw_check_port_type(esw, vport_num, MLX5_ESW_VPT_VF); +} + +bool mlx5_esw_is_sf_vport(struct mlx5_eswitch *esw, u16 vport_num) +{ + return mlx5_esw_check_port_type(esw, vport_num, MLX5_ESW_VPT_SF); +} + static bool -is_port_function_supported(const struct mlx5_eswitch *esw, u16 vport_num) +is_port_function_supported(struct mlx5_eswitch *esw, u16 vport_num) { return vport_num == MLX5_VPORT_PF || mlx5_eswitch_is_vf_vport(esw, vport_num) || @@ -1891,9 +2047,9 @@ static u32 calculate_vports_min_rate_divider(struct mlx5_eswitch *esw) u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share); struct mlx5_vport *evport; u32 max_guarantee = 0; - int i; + unsigned long i; - mlx5_esw_for_all_vports(esw, i, evport) { + mlx5_esw_for_each_vport(esw, i, evport) { if (!evport->enabled || evport->qos.min_rate < max_guarantee) continue; max_guarantee = evport->qos.min_rate; @@ -1911,11 +2067,11 @@ static int normalize_vports_min_rate(struct mlx5_eswitch *esw) struct mlx5_vport *evport; u32 vport_max_rate; u32 vport_min_rate; + unsigned long i; u32 bw_share; int err; - int i; - mlx5_esw_for_all_vports(esw, i, evport) { + mlx5_esw_for_each_vport(esw, i, evport) { if (!evport->enabled) continue; vport_min_rate = evport->qos.min_rate; @@ -2205,3 +2361,19 @@ void mlx5_esw_unlock(struct mlx5_eswitch *esw) { up_write(&esw->mode_lock); } + +/** + * mlx5_eswitch_get_total_vports - Get total vports of the eswitch + * + * @dev: Pointer to core device + * + * mlx5_eswitch_get_total_vports returns total number of eswitch vports. + */ +u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev) +{ + struct mlx5_eswitch *esw; + + esw = dev->priv.eswitch; + return mlx5_esw_allowed(esw) ? esw->total_vports : 0; +} +EXPORT_SYMBOL_GPL(mlx5_eswitch_get_total_vports); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index b289d756a7e4..64ccb2bc0b58 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -176,6 +176,7 @@ struct mlx5_vport { u16 vport; bool enabled; enum mlx5_eswitch_vport_event enabled_events; + int index; struct devlink_port *dl_port; }; @@ -228,7 +229,7 @@ struct mlx5_esw_offload { struct mlx5_flow_table *ft_offloads; struct mlx5_flow_group *vport_rx_group; - struct mlx5_eswitch_rep *vport_reps; + struct xarray vport_reps; struct list_head peer_flows; struct mutex peer_mutex; struct mutex encap_tbl_lock; /* protects encap_tbl */ @@ -278,7 +279,7 @@ struct mlx5_eswitch { struct esw_mc_addr mc_promisc; /* end of legacy */ struct workqueue_struct *work_queue; - struct mlx5_vport *vports; + struct xarray vports; u32 flags; int total_vports; int enabled_vports; @@ -545,94 +546,11 @@ static inline u16 mlx5_eswitch_first_host_vport_num(struct mlx5_core_dev *dev) MLX5_VPORT_PF : MLX5_VPORT_FIRST_VF; } -static inline int mlx5_esw_sf_start_idx(const struct mlx5_eswitch *esw) -{ - /* PF and VF vports indices start from 0 to max_vfs */ - return MLX5_VPORT_PF_PLACEHOLDER + mlx5_core_max_vfs(esw->dev); -} - -static inline int mlx5_esw_sf_end_idx(const struct mlx5_eswitch *esw) -{ - return mlx5_esw_sf_start_idx(esw) + mlx5_sf_max_functions(esw->dev); -} - -static inline int -mlx5_esw_sf_vport_num_to_index(const struct mlx5_eswitch *esw, u16 vport_num) -{ - return vport_num - mlx5_sf_start_function_id(esw->dev) + - MLX5_VPORT_PF_PLACEHOLDER + mlx5_core_max_vfs(esw->dev); -} - -static inline u16 -mlx5_esw_sf_vport_index_to_num(const struct mlx5_eswitch *esw, int idx) -{ - return mlx5_sf_start_function_id(esw->dev) + idx - - (MLX5_VPORT_PF_PLACEHOLDER + mlx5_core_max_vfs(esw->dev)); -} - -static inline bool -mlx5_esw_is_sf_vport(const struct mlx5_eswitch *esw, u16 vport_num) -{ - return mlx5_sf_supported(esw->dev) && - vport_num >= mlx5_sf_start_function_id(esw->dev) && - (vport_num < (mlx5_sf_start_function_id(esw->dev) + - mlx5_sf_max_functions(esw->dev))); -} - static inline bool mlx5_eswitch_is_funcs_handler(const struct mlx5_core_dev *dev) { return mlx5_core_is_ecpf_esw_manager(dev); } -static inline int mlx5_eswitch_uplink_idx(struct mlx5_eswitch *esw) -{ - /* Uplink always locate at the last element of the array.*/ - return esw->total_vports - 1; -} - -static inline int mlx5_eswitch_ecpf_idx(struct mlx5_eswitch *esw) -{ - return esw->total_vports - 2; -} - -static inline int mlx5_eswitch_vport_num_to_index(struct mlx5_eswitch *esw, - u16 vport_num) -{ - if (vport_num == MLX5_VPORT_ECPF) { - if (!mlx5_ecpf_vport_exists(esw->dev)) - esw_warn(esw->dev, "ECPF vport doesn't exist!\n"); - return mlx5_eswitch_ecpf_idx(esw); - } - - if (vport_num == MLX5_VPORT_UPLINK) - return mlx5_eswitch_uplink_idx(esw); - - if (mlx5_esw_is_sf_vport(esw, vport_num)) - return mlx5_esw_sf_vport_num_to_index(esw, vport_num); - - /* PF and VF vports start from 0 to max_vfs */ - return vport_num; -} - -static inline u16 mlx5_eswitch_index_to_vport_num(struct mlx5_eswitch *esw, - int index) -{ - if (index == mlx5_eswitch_ecpf_idx(esw) && - mlx5_ecpf_vport_exists(esw->dev)) - return MLX5_VPORT_ECPF; - - if (index == mlx5_eswitch_uplink_idx(esw)) - return MLX5_VPORT_UPLINK; - - /* SF vports indices are after VFs and before ECPF */ - if (mlx5_sf_supported(esw->dev) && - index > mlx5_core_max_vfs(esw->dev)) - return mlx5_esw_sf_vport_index_to_num(esw, index); - - /* PF and VF vports start from 0 to max_vfs */ - return index; -} - static inline unsigned int mlx5_esw_vport_to_devlink_port_index(const struct mlx5_core_dev *dev, u16 vport_num) @@ -649,82 +567,42 @@ mlx5_esw_devlink_port_index_to_vport_num(unsigned int dl_port_index) /* TODO: This mlx5e_tc function shouldn't be called by eswitch */ void mlx5e_tc_clean_fdb_peer_flows(struct mlx5_eswitch *esw); -/* The vport getter/iterator are only valid after esw->total_vports - * and vport->vport are initialized in mlx5_eswitch_init. +/* Each mark identifies eswitch vport type. + * MLX5_ESW_VPT_HOST_FN is used to identify both PF and VF ports using + * a single mark. + * MLX5_ESW_VPT_VF identifies a SRIOV VF vport. + * MLX5_ESW_VPT_SF identifies SF vport. */ -#define mlx5_esw_for_all_vports(esw, i, vport) \ - for ((i) = MLX5_VPORT_PF; \ - (vport) = &(esw)->vports[i], \ - (i) < (esw)->total_vports; (i)++) - -#define mlx5_esw_for_all_vports_reverse(esw, i, vport) \ - for ((i) = (esw)->total_vports - 1; \ - (vport) = &(esw)->vports[i], \ - (i) >= MLX5_VPORT_PF; (i)--) - -#define mlx5_esw_for_each_vf_vport(esw, i, vport, nvfs) \ - for ((i) = MLX5_VPORT_FIRST_VF; \ - (vport) = &(esw)->vports[(i)], \ - (i) <= (nvfs); (i)++) - -#define mlx5_esw_for_each_vf_vport_reverse(esw, i, vport, nvfs) \ - for ((i) = (nvfs); \ - (vport) = &(esw)->vports[(i)], \ - (i) >= MLX5_VPORT_FIRST_VF; (i)--) - -/* The rep getter/iterator are only valid after esw->total_vports - * and vport->vport are initialized in mlx5_eswitch_init. +#define MLX5_ESW_VPT_HOST_FN XA_MARK_0 +#define MLX5_ESW_VPT_VF XA_MARK_1 +#define MLX5_ESW_VPT_SF XA_MARK_2 + +/* The vport iterator is valid only after vport are initialized in mlx5_eswitch_init. + * Borrowed the idea from xa_for_each_marked() but with support for desired last element. */ -#define mlx5_esw_for_all_reps(esw, i, rep) \ - for ((i) = MLX5_VPORT_PF; \ - (rep) = &(esw)->offloads.vport_reps[i], \ - (i) < (esw)->total_vports; (i)++) - -#define mlx5_esw_for_each_vf_rep(esw, i, rep, nvfs) \ - for ((i) = MLX5_VPORT_FIRST_VF; \ - (rep) = &(esw)->offloads.vport_reps[i], \ - (i) <= (nvfs); (i)++) - -#define mlx5_esw_for_each_vf_rep_reverse(esw, i, rep, nvfs) \ - for ((i) = (nvfs); \ - (rep) = &(esw)->offloads.vport_reps[i], \ - (i) >= MLX5_VPORT_FIRST_VF; (i)--) - -#define mlx5_esw_for_each_vf_vport_num(esw, vport, nvfs) \ - for ((vport) = MLX5_VPORT_FIRST_VF; (vport) <= (nvfs); (vport)++) - -#define mlx5_esw_for_each_vf_vport_num_reverse(esw, vport, nvfs) \ - for ((vport) = (nvfs); (vport) >= MLX5_VPORT_FIRST_VF; (vport)--) - -/* Includes host PF (vport 0) if it's not esw manager. */ -#define mlx5_esw_for_each_host_func_rep(esw, i, rep, nvfs) \ - for ((i) = (esw)->first_host_vport; \ - (rep) = &(esw)->offloads.vport_reps[i], \ - (i) <= (nvfs); (i)++) - -#define mlx5_esw_for_each_host_func_rep_reverse(esw, i, rep, nvfs) \ - for ((i) = (nvfs); \ - (rep) = &(esw)->offloads.vport_reps[i], \ - (i) >= (esw)->first_host_vport; (i)--) - -#define mlx5_esw_for_each_host_func_vport(esw, vport, nvfs) \ - for ((vport) = (esw)->first_host_vport; \ - (vport) <= (nvfs); (vport)++) - -#define mlx5_esw_for_each_host_func_vport_reverse(esw, vport, nvfs) \ - for ((vport) = (nvfs); \ - (vport) >= (esw)->first_host_vport; (vport)--) - -#define mlx5_esw_for_each_sf_rep(esw, i, rep) \ - for ((i) = mlx5_esw_sf_start_idx(esw); \ - (rep) = &(esw)->offloads.vport_reps[(i)], \ - (i) < mlx5_esw_sf_end_idx(esw); (i++)) + +#define mlx5_esw_for_each_vport(esw, index, vport) \ + xa_for_each(&((esw)->vports), index, vport) + +#define mlx5_esw_for_each_entry_marked(xa, index, entry, last, filter) \ + for (index = 0, entry = xa_find(xa, &index, last, filter); \ + entry; entry = xa_find_after(xa, &index, last, filter)) + +#define mlx5_esw_for_each_vport_marked(esw, index, vport, last, filter) \ + mlx5_esw_for_each_entry_marked(&((esw)->vports), index, vport, last, filter) + +#define mlx5_esw_for_each_vf_vport(esw, index, vport, last) \ + mlx5_esw_for_each_vport_marked(esw, index, vport, last, MLX5_ESW_VPT_VF) + +#define mlx5_esw_for_each_host_func_vport(esw, index, vport, last) \ + mlx5_esw_for_each_vport_marked(esw, index, vport, last, MLX5_ESW_VPT_HOST_FN) struct mlx5_eswitch *mlx5_devlink_eswitch_get(struct devlink *devlink); struct mlx5_vport *__must_check mlx5_eswitch_get_vport(struct mlx5_eswitch *esw, u16 vport_num); -bool mlx5_eswitch_is_vf_vport(const struct mlx5_eswitch *esw, u16 vport_num); +bool mlx5_eswitch_is_vf_vport(struct mlx5_eswitch *esw, u16 vport_num); +bool mlx5_esw_is_sf_vport(struct mlx5_eswitch *esw, u16 vport_num); int mlx5_esw_funcs_changed_handler(struct notifier_block *nb, unsigned long type, void *data); @@ -784,12 +662,13 @@ void mlx5_esw_offloads_devlink_port_unregister(struct mlx5_eswitch *esw, u16 vpo struct devlink_port *mlx5_esw_offloads_devlink_port(struct mlx5_eswitch *esw, u16 vport_num); int mlx5_esw_devlink_sf_port_register(struct mlx5_eswitch *esw, struct devlink_port *dl_port, - u16 vport_num, u32 sfnum); + u16 vport_num, u32 controller, u32 sfnum); void mlx5_esw_devlink_sf_port_unregister(struct mlx5_eswitch *esw, u16 vport_num); int mlx5_esw_offloads_sf_vport_enable(struct mlx5_eswitch *esw, struct devlink_port *dl_port, - u16 vport_num, u32 sfnum); + u16 vport_num, u32 controller, u32 sfnum); void mlx5_esw_offloads_sf_vport_disable(struct mlx5_eswitch *esw, u16 vport_num); +int mlx5_esw_sf_max_hpf_functions(struct mlx5_core_dev *dev, u16 *max_sfs, u16 *sf_base_id); int mlx5_esw_vport_vhca_id_set(struct mlx5_eswitch *esw, u16 vport_num); void mlx5_esw_vport_vhca_id_clear(struct mlx5_eswitch *esw, u16 vport_num); @@ -816,6 +695,8 @@ void mlx5_esw_unlock(struct mlx5_eswitch *esw); void esw_vport_change_handle_locked(struct mlx5_vport *vport); +bool mlx5_esw_offloads_controller_valid(const struct mlx5_eswitch *esw, u32 controller); + #else /* CONFIG_MLX5_ESWITCH */ /* eswitch API stubs */ static inline int mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index bbb707117296..db1e74280e57 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -49,6 +49,16 @@ #include "en_tc.h" #include "en/mapping.h" +#define mlx5_esw_for_each_rep(esw, i, rep) \ + xa_for_each(&((esw)->offloads.vport_reps), i, rep) + +#define mlx5_esw_for_each_sf_rep(esw, i, rep) \ + xa_for_each_marked(&((esw)->offloads.vport_reps), i, rep, MLX5_ESW_VPT_SF) + +#define mlx5_esw_for_each_vf_rep(esw, index, rep) \ + mlx5_esw_for_each_entry_marked(&((esw)->offloads.vport_reps), index, \ + rep, (esw)->esw_funcs.num_vfs, MLX5_ESW_VPT_VF) + /* There are two match-all miss flows, one for unicast dst mac and * one for multicast. */ @@ -67,10 +77,7 @@ static const struct esw_vport_tbl_namespace mlx5_esw_vport_tbl_mirror_ns = { static struct mlx5_eswitch_rep *mlx5_eswitch_get_rep(struct mlx5_eswitch *esw, u16 vport_num) { - int idx = mlx5_eswitch_vport_num_to_index(esw, vport_num); - - WARN_ON(idx > esw->total_vports - 1); - return &esw->offloads.vport_reps[idx]; + return xa_load(&esw->offloads.vport_reps, vport_num); } static void @@ -720,10 +727,11 @@ mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw, static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val) { struct mlx5_eswitch_rep *rep; - int i, err = 0; + unsigned long i; + int err = 0; esw_debug(esw->dev, "%s applying global %s policy\n", __func__, val ? "pop" : "none"); - mlx5_esw_for_each_host_func_rep(esw, i, rep, esw->esw_funcs.num_vfs) { + mlx5_esw_for_each_host_func_vport(esw, i, rep, esw->esw_funcs.num_vfs) { if (atomic_read(&rep->rep_data[REP_ETH].state) != REP_LOADED) continue; @@ -972,13 +980,13 @@ void mlx5_eswitch_del_send_to_vport_rule(struct mlx5_flow_handle *rule) static void mlx5_eswitch_del_send_to_vport_meta_rules(struct mlx5_eswitch *esw) { struct mlx5_flow_handle **flows = esw->fdb_table.offloads.send_to_vport_meta_rules; - int i = 0, num_vfs = esw->esw_funcs.num_vfs, vport_num; + int i = 0, num_vfs = esw->esw_funcs.num_vfs; if (!num_vfs || !flows) return; - mlx5_esw_for_each_vf_vport_num(esw, vport_num, num_vfs) - mlx5_del_flow_rules(flows[i++]); + for (i = 0; i < num_vfs; i++) + mlx5_del_flow_rules(flows[i]); kvfree(flows); } @@ -992,6 +1000,8 @@ mlx5_eswitch_add_send_to_vport_meta_rules(struct mlx5_eswitch *esw) struct mlx5_flow_handle *flow_rule; struct mlx5_flow_handle **flows; struct mlx5_flow_spec *spec; + struct mlx5_vport *vport; + unsigned long i; u16 vport_num; num_vfs = esw->esw_funcs.num_vfs; @@ -1016,7 +1026,8 @@ mlx5_eswitch_add_send_to_vport_meta_rules(struct mlx5_eswitch *esw) dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - mlx5_esw_for_each_vf_vport_num(esw, vport_num, num_vfs) { + mlx5_esw_for_each_vf_vport(esw, i, vport, num_vfs) { + vport_num = vport->vport; MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_c_0, mlx5_eswitch_get_vport_metadata_for_match(esw, vport_num)); dest.vport.num = vport_num; @@ -1158,12 +1169,14 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, struct mlx5_flow_destination dest = {}; struct mlx5_flow_act flow_act = {0}; struct mlx5_flow_handle **flows; - struct mlx5_flow_handle *flow; - struct mlx5_flow_spec *spec; /* total vports is the same for both e-switches */ int nvports = esw->total_vports; + struct mlx5_flow_handle *flow; + struct mlx5_flow_spec *spec; + struct mlx5_vport *vport; + unsigned long i; void *misc; - int err, i; + int err; spec = kvzalloc(sizeof(*spec), GFP_KERNEL); if (!spec) @@ -1182,6 +1195,7 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, misc_parameters); if (mlx5_core_is_ecpf_esw_manager(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF); esw_set_peer_miss_rule_source_port(esw, peer_dev->priv.eswitch, spec, MLX5_VPORT_PF); @@ -1191,10 +1205,11 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, err = PTR_ERR(flow); goto add_pf_flow_err; } - flows[MLX5_VPORT_PF] = flow; + flows[vport->index] = flow; } if (mlx5_ecpf_vport_exists(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); MLX5_SET(fte_match_set_misc, misc, source_port, MLX5_VPORT_ECPF); flow = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec, &flow_act, &dest, 1); @@ -1202,13 +1217,13 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, err = PTR_ERR(flow); goto add_ecpf_flow_err; } - flows[mlx5_eswitch_ecpf_idx(esw)] = flow; + flows[vport->index] = flow; } - mlx5_esw_for_each_vf_vport_num(esw, i, mlx5_core_max_vfs(esw->dev)) { + mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) { esw_set_peer_miss_rule_source_port(esw, peer_dev->priv.eswitch, - spec, i); + spec, vport->vport); flow = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec, &flow_act, &dest, 1); @@ -1216,7 +1231,7 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, err = PTR_ERR(flow); goto add_vf_flow_err; } - flows[i] = flow; + flows[vport->index] = flow; } esw->fdb_table.offloads.peer_miss_rules = flows; @@ -1225,15 +1240,20 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, return 0; add_vf_flow_err: - nvports = --i; - mlx5_esw_for_each_vf_vport_num_reverse(esw, i, nvports) - mlx5_del_flow_rules(flows[i]); - - if (mlx5_ecpf_vport_exists(esw->dev)) - mlx5_del_flow_rules(flows[mlx5_eswitch_ecpf_idx(esw)]); + mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) { + if (!flows[vport->index]) + continue; + mlx5_del_flow_rules(flows[vport->index]); + } + if (mlx5_ecpf_vport_exists(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); + mlx5_del_flow_rules(flows[vport->index]); + } add_ecpf_flow_err: - if (mlx5_core_is_ecpf_esw_manager(esw->dev)) - mlx5_del_flow_rules(flows[MLX5_VPORT_PF]); + if (mlx5_core_is_ecpf_esw_manager(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF); + mlx5_del_flow_rules(flows[vport->index]); + } add_pf_flow_err: esw_warn(esw->dev, "FDB: Failed to add peer miss flow rule err %d\n", err); kvfree(flows); @@ -1245,20 +1265,23 @@ alloc_flows_err: static void esw_del_fdb_peer_miss_rules(struct mlx5_eswitch *esw) { struct mlx5_flow_handle **flows; - int i; + struct mlx5_vport *vport; + unsigned long i; flows = esw->fdb_table.offloads.peer_miss_rules; - mlx5_esw_for_each_vf_vport_num_reverse(esw, i, - mlx5_core_max_vfs(esw->dev)) - mlx5_del_flow_rules(flows[i]); - - if (mlx5_ecpf_vport_exists(esw->dev)) - mlx5_del_flow_rules(flows[mlx5_eswitch_ecpf_idx(esw)]); + mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) + mlx5_del_flow_rules(flows[vport->index]); - if (mlx5_core_is_ecpf_esw_manager(esw->dev)) - mlx5_del_flow_rules(flows[MLX5_VPORT_PF]); + if (mlx5_ecpf_vport_exists(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); + mlx5_del_flow_rules(flows[vport->index]); + } + if (mlx5_core_is_ecpf_esw_manager(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF); + mlx5_del_flow_rules(flows[vport->index]); + } kvfree(flows); } @@ -1402,11 +1425,11 @@ static void esw_vport_tbl_put(struct mlx5_eswitch *esw) { struct mlx5_vport_tbl_attr attr; struct mlx5_vport *vport; - int i; + unsigned long i; attr.chain = 0; attr.prio = 1; - mlx5_esw_for_all_vports(esw, i, vport) { + mlx5_esw_for_each_vport(esw, i, vport) { attr.vport = vport->vport; attr.vport_ns = &mlx5_esw_vport_tbl_mirror_ns; mlx5_esw_vporttbl_put(esw, &attr); @@ -1418,11 +1441,11 @@ static int esw_vport_tbl_get(struct mlx5_eswitch *esw) struct mlx5_vport_tbl_attr attr; struct mlx5_flow_table *fdb; struct mlx5_vport *vport; - int i; + unsigned long i; attr.chain = 0; attr.prio = 1; - mlx5_esw_for_all_vports(esw, i, vport) { + mlx5_esw_for_each_vport(esw, i, vport) { attr.vport = vport->vport; attr.vport_ns = &mlx5_esw_vport_tbl_mirror_ns; fdb = mlx5_esw_vporttbl_get(esw, &attr); @@ -1910,12 +1933,12 @@ out: return flow_rule; } - -static int mlx5_eswitch_inline_mode_get(const struct mlx5_eswitch *esw, u8 *mode) +static int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, u8 *mode) { u8 prev_mlx5_mode, mlx5_mode = MLX5_INLINE_MODE_L2; struct mlx5_core_dev *dev = esw->dev; - int vport; + struct mlx5_vport *vport; + unsigned long i; if (!MLX5_CAP_GEN(dev, vport_group_manager)) return -EOPNOTSUPP; @@ -1936,8 +1959,8 @@ static int mlx5_eswitch_inline_mode_get(const struct mlx5_eswitch *esw, u8 *mode query_vports: mlx5_query_nic_vport_min_inline(dev, esw->first_host_vport, &prev_mlx5_mode); - mlx5_esw_for_each_host_func_vport(esw, vport, esw->esw_funcs.num_vfs) { - mlx5_query_nic_vport_min_inline(dev, vport, &mlx5_mode); + mlx5_esw_for_each_host_func_vport(esw, i, vport, esw->esw_funcs.num_vfs) { + mlx5_query_nic_vport_min_inline(dev, vport->vport, &mlx5_mode); if (prev_mlx5_mode != mlx5_mode) return -EINVAL; prev_mlx5_mode = mlx5_mode; @@ -2080,34 +2103,82 @@ static int esw_offloads_start(struct mlx5_eswitch *esw, return err; } -void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw) +static void mlx5_esw_offloads_rep_mark_set(struct mlx5_eswitch *esw, + struct mlx5_eswitch_rep *rep, + xa_mark_t mark) { - kfree(esw->offloads.vport_reps); + bool mark_set; + + /* Copy the mark from vport to its rep */ + mark_set = xa_get_mark(&esw->vports, rep->vport, mark); + if (mark_set) + xa_set_mark(&esw->offloads.vport_reps, rep->vport, mark); } -int esw_offloads_init_reps(struct mlx5_eswitch *esw) +static int mlx5_esw_offloads_rep_init(struct mlx5_eswitch *esw, const struct mlx5_vport *vport) { - int total_vports = esw->total_vports; struct mlx5_eswitch_rep *rep; - int vport_index; - u8 rep_type; + int rep_type; + int err; - esw->offloads.vport_reps = kcalloc(total_vports, - sizeof(struct mlx5_eswitch_rep), - GFP_KERNEL); - if (!esw->offloads.vport_reps) + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) return -ENOMEM; - mlx5_esw_for_all_reps(esw, vport_index, rep) { - rep->vport = mlx5_eswitch_index_to_vport_num(esw, vport_index); - rep->vport_index = vport_index; + rep->vport = vport->vport; + rep->vport_index = vport->index; + for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) + atomic_set(&rep->rep_data[rep_type].state, REP_UNREGISTERED); - for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) - atomic_set(&rep->rep_data[rep_type].state, - REP_UNREGISTERED); - } + err = xa_insert(&esw->offloads.vport_reps, rep->vport, rep, GFP_KERNEL); + if (err) + goto insert_err; + mlx5_esw_offloads_rep_mark_set(esw, rep, MLX5_ESW_VPT_HOST_FN); + mlx5_esw_offloads_rep_mark_set(esw, rep, MLX5_ESW_VPT_VF); + mlx5_esw_offloads_rep_mark_set(esw, rep, MLX5_ESW_VPT_SF); return 0; + +insert_err: + kfree(rep); + return err; +} + +static void mlx5_esw_offloads_rep_cleanup(struct mlx5_eswitch *esw, + struct mlx5_eswitch_rep *rep) +{ + xa_erase(&esw->offloads.vport_reps, rep->vport); + kfree(rep); +} + +void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw) +{ + struct mlx5_eswitch_rep *rep; + unsigned long i; + + mlx5_esw_for_each_rep(esw, i, rep) + mlx5_esw_offloads_rep_cleanup(esw, rep); + xa_destroy(&esw->offloads.vport_reps); +} + +int esw_offloads_init_reps(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + int err; + + xa_init(&esw->offloads.vport_reps); + + mlx5_esw_for_each_vport(esw, i, vport) { + err = mlx5_esw_offloads_rep_init(esw, vport); + if (err) + goto err; + } + return 0; + +err: + esw_offloads_cleanup_reps(esw); + return err; } static void __esw_offloads_unload_rep(struct mlx5_eswitch *esw, @@ -2121,7 +2192,7 @@ static void __esw_offloads_unload_rep(struct mlx5_eswitch *esw, static void __unload_reps_sf_vport(struct mlx5_eswitch *esw, u8 rep_type) { struct mlx5_eswitch_rep *rep; - int i; + unsigned long i; mlx5_esw_for_each_sf_rep(esw, i, rep) __esw_offloads_unload_rep(esw, rep, rep_type); @@ -2130,11 +2201,11 @@ static void __unload_reps_sf_vport(struct mlx5_eswitch *esw, u8 rep_type) static void __unload_reps_all_vport(struct mlx5_eswitch *esw, u8 rep_type) { struct mlx5_eswitch_rep *rep; - int i; + unsigned long i; __unload_reps_sf_vport(esw, rep_type); - mlx5_esw_for_each_vf_rep_reverse(esw, i, rep, esw->esw_funcs.num_vfs) + mlx5_esw_for_each_vf_rep(esw, i, rep) __esw_offloads_unload_rep(esw, rep, rep_type); if (mlx5_ecpf_vport_exists(esw->dev)) { @@ -2421,25 +2492,25 @@ static void esw_offloads_vport_metadata_cleanup(struct mlx5_eswitch *esw, static void esw_offloads_metadata_uninit(struct mlx5_eswitch *esw) { struct mlx5_vport *vport; - int i; + unsigned long i; if (!mlx5_eswitch_vport_match_metadata_enabled(esw)) return; - mlx5_esw_for_all_vports_reverse(esw, i, vport) + mlx5_esw_for_each_vport(esw, i, vport) esw_offloads_vport_metadata_cleanup(esw, vport); } static int esw_offloads_metadata_init(struct mlx5_eswitch *esw) { struct mlx5_vport *vport; + unsigned long i; int err; - int i; if (!mlx5_eswitch_vport_match_metadata_enabled(esw)) return 0; - mlx5_esw_for_all_vports(esw, i, vport) { + mlx5_esw_for_each_vport(esw, i, vport) { err = esw_offloads_vport_metadata_setup(esw, vport); if (err) goto metadata_err; @@ -2676,11 +2747,25 @@ static int mlx5_esw_host_number_init(struct mlx5_eswitch *esw) return 0; } +bool mlx5_esw_offloads_controller_valid(const struct mlx5_eswitch *esw, u32 controller) +{ + /* Local controller is always valid */ + if (controller == 0) + return true; + + if (!mlx5_core_is_ecpf_esw_manager(esw->dev)) + return false; + + /* External host number starts with zero in device */ + return (controller == esw->offloads.host_number + 1); +} + int esw_offloads_enable(struct mlx5_eswitch *esw) { struct mapping_ctx *reg_c0_obj_pool; struct mlx5_vport *vport; - int err, i; + unsigned long i; + int err; if (MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, reformat) && MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, decap)) @@ -2926,13 +3011,44 @@ unlock: return err; } +static int mlx5_esw_vports_inline_set(struct mlx5_eswitch *esw, u8 mlx5_mode, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_vport *vport; + u16 err_vport_num = 0; + unsigned long i; + int err = 0; + + mlx5_esw_for_each_host_func_vport(esw, i, vport, esw->esw_funcs.num_vfs) { + err = mlx5_modify_nic_vport_min_inline(dev, vport->vport, mlx5_mode); + if (err) { + err_vport_num = vport->vport; + NL_SET_ERR_MSG_MOD(extack, + "Failed to set min inline on vport"); + goto revert_inline_mode; + } + } + return 0; + +revert_inline_mode: + mlx5_esw_for_each_host_func_vport(esw, i, vport, esw->esw_funcs.num_vfs) { + if (vport->vport == err_vport_num) + break; + mlx5_modify_nic_vport_min_inline(dev, + vport->vport, + esw->offloads.inline_mode); + } + return err; +} + int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode, struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); - int err, vport, num_vport; struct mlx5_eswitch *esw; u8 mlx5_mode; + int err; esw = mlx5_devlink_eswitch_get(devlink); if (IS_ERR(esw)) @@ -2967,25 +3083,14 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode, if (err) goto out; - mlx5_esw_for_each_host_func_vport(esw, vport, esw->esw_funcs.num_vfs) { - err = mlx5_modify_nic_vport_min_inline(dev, vport, mlx5_mode); - if (err) { - NL_SET_ERR_MSG_MOD(extack, - "Failed to set min inline on vport"); - goto revert_inline_mode; - } - } + err = mlx5_esw_vports_inline_set(esw, mlx5_mode, extack); + if (err) + goto out; esw->offloads.inline_mode = mlx5_mode; up_write(&esw->mode_lock); return 0; -revert_inline_mode: - num_vport = --vport; - mlx5_esw_for_each_host_func_vport_reverse(esw, vport, num_vport) - mlx5_modify_nic_vport_min_inline(dev, - vport, - esw->offloads.inline_mode); out: up_write(&esw->mode_lock); return err; @@ -3116,11 +3221,11 @@ void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw, { struct mlx5_eswitch_rep_data *rep_data; struct mlx5_eswitch_rep *rep; - int i; + unsigned long i; esw->offloads.rep_ops[rep_type] = ops; - mlx5_esw_for_all_reps(esw, i, rep) { - if (likely(mlx5_eswitch_vport_has_rep(esw, i))) { + mlx5_esw_for_each_rep(esw, i, rep) { + if (likely(mlx5_eswitch_vport_has_rep(esw, rep->vport))) { rep->esw = esw; rep_data = &rep->rep_data[rep_type]; atomic_set(&rep_data->state, REP_REGISTERED); @@ -3132,12 +3237,12 @@ EXPORT_SYMBOL(mlx5_eswitch_register_vport_reps); void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type) { struct mlx5_eswitch_rep *rep; - int i; + unsigned long i; if (esw->mode == MLX5_ESWITCH_OFFLOADS) __unload_reps_all_vport(esw, rep_type); - mlx5_esw_for_all_reps(esw, i, rep) + mlx5_esw_for_each_rep(esw, i, rep) atomic_set(&rep->rep_data[rep_type].state, REP_UNREGISTERED); } EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_reps); @@ -3178,12 +3283,6 @@ struct mlx5_eswitch_rep *mlx5_eswitch_vport_rep(struct mlx5_eswitch *esw, } EXPORT_SYMBOL(mlx5_eswitch_vport_rep); -bool mlx5_eswitch_is_vf_vport(const struct mlx5_eswitch *esw, u16 vport_num) -{ - return vport_num >= MLX5_VPORT_FIRST_VF && - vport_num <= esw->dev->priv.sriov.max_vfs; -} - bool mlx5_eswitch_reg_c1_loopback_enabled(const struct mlx5_eswitch *esw) { return !!(esw->flags & MLX5_ESWITCH_REG_C1_LOOPBACK_ENABLED); @@ -3209,7 +3308,7 @@ u32 mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, EXPORT_SYMBOL(mlx5_eswitch_get_vport_metadata_for_match); int mlx5_esw_offloads_sf_vport_enable(struct mlx5_eswitch *esw, struct devlink_port *dl_port, - u16 vport_num, u32 sfnum) + u16 vport_num, u32 controller, u32 sfnum) { int err; @@ -3217,7 +3316,7 @@ int mlx5_esw_offloads_sf_vport_enable(struct mlx5_eswitch *esw, struct devlink_p if (err) return err; - err = mlx5_esw_devlink_sf_port_register(esw, dl_port, vport_num, sfnum); + err = mlx5_esw_devlink_sf_port_register(esw, dl_port, vport_num, controller, sfnum); if (err) goto devlink_err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c index 90b524c59f3c..6a0c6f965ad1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c @@ -148,9 +148,19 @@ mlx5_sf_dev_state_change_handler(struct notifier_block *nb, unsigned long event_ struct mlx5_sf_dev_table *table = container_of(nb, struct mlx5_sf_dev_table, nb); const struct mlx5_vhca_state_event *event = data; struct mlx5_sf_dev *sf_dev; + u16 max_functions; u16 sf_index; + u16 base_id; + + max_functions = mlx5_sf_max_functions(table->dev); + if (!max_functions) + return 0; + + base_id = MLX5_CAP_GEN(table->dev, sf_base_id); + if (event->function_id < base_id || event->function_id >= (base_id + max_functions)) + return 0; - sf_index = event->function_id - MLX5_CAP_GEN(table->dev, sf_base_id); + sf_index = event->function_id - base_id; sf_dev = xa_load(&table->devices, sf_index); switch (event->new_vhca_state) { case MLX5_VHCA_STATE_ALLOCATED: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c index 52226d9b9a6d..a8e73c9ed1ea 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c @@ -12,6 +12,7 @@ struct mlx5_sf { struct devlink_port dl_port; unsigned int port_index; + u32 controller; u16 id; u16 hw_fn_id; u16 hw_state; @@ -58,7 +59,8 @@ static void mlx5_sf_id_erase(struct mlx5_sf_table *table, struct mlx5_sf *sf) } static struct mlx5_sf * -mlx5_sf_alloc(struct mlx5_sf_table *table, u32 sfnum, struct netlink_ext_ack *extack) +mlx5_sf_alloc(struct mlx5_sf_table *table, struct mlx5_eswitch *esw, + u32 controller, u32 sfnum, struct netlink_ext_ack *extack) { unsigned int dl_port_index; struct mlx5_sf *sf; @@ -66,7 +68,12 @@ mlx5_sf_alloc(struct mlx5_sf_table *table, u32 sfnum, struct netlink_ext_ack *ex int id_err; int err; - id_err = mlx5_sf_hw_table_sf_alloc(table->dev, sfnum); + if (!mlx5_esw_offloads_controller_valid(esw, controller)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid controller number"); + return ERR_PTR(-EINVAL); + } + + id_err = mlx5_sf_hw_table_sf_alloc(table->dev, controller, sfnum); if (id_err < 0) { err = id_err; goto id_err; @@ -78,11 +85,12 @@ mlx5_sf_alloc(struct mlx5_sf_table *table, u32 sfnum, struct netlink_ext_ack *ex goto alloc_err; } sf->id = id_err; - hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, sf->id); + hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, controller, sf->id); dl_port_index = mlx5_esw_vport_to_devlink_port_index(table->dev, hw_fn_id); sf->port_index = dl_port_index; sf->hw_fn_id = hw_fn_id; sf->hw_state = MLX5_VHCA_STATE_ALLOCATED; + sf->controller = controller; err = mlx5_sf_id_insert(table, sf); if (err) @@ -93,7 +101,7 @@ mlx5_sf_alloc(struct mlx5_sf_table *table, u32 sfnum, struct netlink_ext_ack *ex insert_err: kfree(sf); alloc_err: - mlx5_sf_hw_table_sf_free(table->dev, id_err); + mlx5_sf_hw_table_sf_free(table->dev, controller, id_err); id_err: if (err == -EEXIST) NL_SET_ERR_MSG_MOD(extack, "SF already exist. Choose different sfnum"); @@ -103,7 +111,7 @@ id_err: static void mlx5_sf_free(struct mlx5_sf_table *table, struct mlx5_sf *sf) { mlx5_sf_id_erase(table, sf); - mlx5_sf_hw_table_sf_free(table->dev, sf->id); + mlx5_sf_hw_table_sf_free(table->dev, sf->controller, sf->id); kfree(sf); } @@ -272,12 +280,12 @@ static int mlx5_sf_add(struct mlx5_core_dev *dev, struct mlx5_sf_table *table, struct mlx5_sf *sf; int err; - sf = mlx5_sf_alloc(table, new_attr->sfnum, extack); + sf = mlx5_sf_alloc(table, esw, new_attr->controller, new_attr->sfnum, extack); if (IS_ERR(sf)) return PTR_ERR(sf); err = mlx5_esw_offloads_sf_vport_enable(esw, &sf->dl_port, sf->hw_fn_id, - new_attr->sfnum); + new_attr->controller, new_attr->sfnum); if (err) goto esw_err; *new_port_index = sf->port_index; @@ -306,7 +314,8 @@ mlx5_sf_new_check_attr(struct mlx5_core_dev *dev, const struct devlink_port_new_ "User must provide unique sfnum. Driver does not support auto assignment"); return -EOPNOTSUPP; } - if (new_attr->controller_valid && new_attr->controller) { + if (new_attr->controller_valid && new_attr->controller && + !mlx5_core_is_ecpf_esw_manager(dev)) { NL_SET_ERR_MSG_MOD(extack, "External controller is unsupported"); return -EOPNOTSUPP; } @@ -352,10 +361,10 @@ static void mlx5_sf_dealloc(struct mlx5_sf_table *table, struct mlx5_sf *sf) * firmware gives confirmation that it is detached by the driver. */ mlx5_cmd_sf_disable_hca(table->dev, sf->hw_fn_id); - mlx5_sf_hw_table_sf_deferred_free(table->dev, sf->id); + mlx5_sf_hw_table_sf_deferred_free(table->dev, sf->controller, sf->id); kfree(sf); } else { - mlx5_sf_hw_table_sf_deferred_free(table->dev, sf->id); + mlx5_sf_hw_table_sf_deferred_free(table->dev, sf->controller, sf->id); kfree(sf); } } @@ -437,9 +446,6 @@ sf_err: static void mlx5_sf_table_enable(struct mlx5_sf_table *table) { - if (!mlx5_sf_max_functions(table->dev)) - return; - init_completion(&table->disable_complete); refcount_set(&table->refcount, 1); } @@ -462,9 +468,6 @@ static void mlx5_sf_deactivate_all(struct mlx5_sf_table *table) static void mlx5_sf_table_disable(struct mlx5_sf_table *table) { - if (!mlx5_sf_max_functions(table->dev)) - return; - if (!refcount_read(&table->refcount)) return; @@ -498,7 +501,8 @@ static int mlx5_sf_esw_event(struct notifier_block *nb, unsigned long event, voi static bool mlx5_sf_table_supported(const struct mlx5_core_dev *dev) { - return dev->priv.eswitch && MLX5_ESWITCH_MANAGER(dev) && mlx5_sf_supported(dev); + return dev->priv.eswitch && MLX5_ESWITCH_MANAGER(dev) && + mlx5_sf_hw_table_supported(dev); } int mlx5_sf_table_init(struct mlx5_core_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c index ec53c11c8344..ef5f892aafad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c @@ -8,6 +8,7 @@ #include "ecpf.h" #include "vhca_event.h" #include "mlx5_core.h" +#include "eswitch.h" struct mlx5_sf_hw { u32 usr_sfnum; @@ -15,59 +16,113 @@ struct mlx5_sf_hw { u8 pending_delete: 1; }; +struct mlx5_sf_hwc_table { + struct mlx5_sf_hw *sfs; + int max_fn; + u16 start_fn_id; +}; + +enum mlx5_sf_hwc_index { + MLX5_SF_HWC_LOCAL, + MLX5_SF_HWC_EXTERNAL, + MLX5_SF_HWC_MAX, +}; + struct mlx5_sf_hw_table { struct mlx5_core_dev *dev; - struct mlx5_sf_hw *sfs; - int max_local_functions; struct mutex table_lock; /* Serializes sf deletion and vhca state change handler. */ struct notifier_block vhca_nb; + struct mlx5_sf_hwc_table hwc[MLX5_SF_HWC_MAX]; }; -u16 mlx5_sf_sw_to_hw_id(const struct mlx5_core_dev *dev, u16 sw_id) +static struct mlx5_sf_hwc_table * +mlx5_sf_controller_to_hwc(struct mlx5_core_dev *dev, u32 controller) { - return sw_id + mlx5_sf_start_function_id(dev); + int idx = !!controller; + + return &dev->priv.sf_hw_table->hwc[idx]; } -static u16 mlx5_sf_hw_to_sw_id(const struct mlx5_core_dev *dev, u16 hw_id) +u16 mlx5_sf_sw_to_hw_id(struct mlx5_core_dev *dev, u32 controller, u16 sw_id) { - return hw_id - mlx5_sf_start_function_id(dev); + struct mlx5_sf_hwc_table *hwc; + + hwc = mlx5_sf_controller_to_hwc(dev, controller); + return hwc->start_fn_id + sw_id; } -int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 usr_sfnum) +static u16 mlx5_sf_hw_to_sw_id(struct mlx5_sf_hwc_table *hwc, u16 hw_id) +{ + return hw_id - hwc->start_fn_id; +} + +static struct mlx5_sf_hwc_table * +mlx5_sf_table_fn_to_hwc(struct mlx5_sf_hw_table *table, u16 fn_id) { - struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; - int sw_id = -ENOSPC; - u16 hw_fn_id; - int err; int i; - if (!table->max_local_functions) - return -EOPNOTSUPP; + for (i = 0; i < ARRAY_SIZE(table->hwc); i++) { + if (table->hwc[i].max_fn && + fn_id >= table->hwc[i].start_fn_id && + fn_id < (table->hwc[i].start_fn_id + table->hwc[i].max_fn)) + return &table->hwc[i]; + } + return NULL; +} + +static int mlx5_sf_hw_table_id_alloc(struct mlx5_sf_hw_table *table, u32 controller, + u32 usr_sfnum) +{ + struct mlx5_sf_hwc_table *hwc; + int i; + + hwc = mlx5_sf_controller_to_hwc(table->dev, controller); + if (!hwc->sfs) + return -ENOSPC; - mutex_lock(&table->table_lock); /* Check if sf with same sfnum already exists or not. */ - for (i = 0; i < table->max_local_functions; i++) { - if (table->sfs[i].allocated && table->sfs[i].usr_sfnum == usr_sfnum) { - err = -EEXIST; - goto exist_err; - } + for (i = 0; i < hwc->max_fn; i++) { + if (hwc->sfs[i].allocated && hwc->sfs[i].usr_sfnum == usr_sfnum) + return -EEXIST; } - /* Find the free entry and allocate the entry from the array */ - for (i = 0; i < table->max_local_functions; i++) { - if (!table->sfs[i].allocated) { - table->sfs[i].usr_sfnum = usr_sfnum; - table->sfs[i].allocated = true; - sw_id = i; - break; + for (i = 0; i < hwc->max_fn; i++) { + if (!hwc->sfs[i].allocated) { + hwc->sfs[i].usr_sfnum = usr_sfnum; + hwc->sfs[i].allocated = true; + return i; } } - if (sw_id == -ENOSPC) { - err = -ENOSPC; + return -ENOSPC; +} + +static void mlx5_sf_hw_table_id_free(struct mlx5_sf_hw_table *table, u32 controller, int id) +{ + struct mlx5_sf_hwc_table *hwc; + + hwc = mlx5_sf_controller_to_hwc(table->dev, controller); + hwc->sfs[id].allocated = false; + hwc->sfs[id].pending_delete = false; +} + +int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 controller, u32 usr_sfnum) +{ + struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; + u16 hw_fn_id; + int sw_id; + int err; + + if (!table) + return -EOPNOTSUPP; + + mutex_lock(&table->table_lock); + sw_id = mlx5_sf_hw_table_id_alloc(table, controller, usr_sfnum); + if (sw_id < 0) { + err = sw_id; goto exist_err; } - hw_fn_id = mlx5_sf_sw_to_hw_id(dev, sw_id); + hw_fn_id = mlx5_sf_sw_to_hw_id(dev, controller, sw_id); err = mlx5_cmd_alloc_sf(dev, hw_fn_id); if (err) goto err; @@ -76,47 +131,58 @@ int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 usr_sfnum) if (err) goto vhca_err; + if (controller) { + /* If this SF is for external controller, SF manager + * needs to arm firmware to receive the events. + */ + err = mlx5_vhca_event_arm(dev, hw_fn_id); + if (err) + goto vhca_err; + } + mutex_unlock(&table->table_lock); return sw_id; vhca_err: mlx5_cmd_dealloc_sf(dev, hw_fn_id); err: - table->sfs[i].allocated = false; + mlx5_sf_hw_table_id_free(table, controller, sw_id); exist_err: mutex_unlock(&table->table_lock); return err; } -static void _mlx5_sf_hw_id_free(struct mlx5_core_dev *dev, u16 id) +void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u32 controller, u16 id) { struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; u16 hw_fn_id; - hw_fn_id = mlx5_sf_sw_to_hw_id(dev, id); + mutex_lock(&table->table_lock); + hw_fn_id = mlx5_sf_sw_to_hw_id(dev, controller, id); mlx5_cmd_dealloc_sf(dev, hw_fn_id); - table->sfs[id].allocated = false; - table->sfs[id].pending_delete = false; + mlx5_sf_hw_table_id_free(table, controller, id); + mutex_unlock(&table->table_lock); } -void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u16 id) +static void mlx5_sf_hw_table_hwc_sf_free(struct mlx5_core_dev *dev, + struct mlx5_sf_hwc_table *hwc, int idx) { - struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; - - mutex_lock(&table->table_lock); - _mlx5_sf_hw_id_free(dev, id); - mutex_unlock(&table->table_lock); + mlx5_cmd_dealloc_sf(dev, hwc->start_fn_id + idx); + hwc->sfs[idx].allocated = false; + hwc->sfs[idx].pending_delete = false; } -void mlx5_sf_hw_table_sf_deferred_free(struct mlx5_core_dev *dev, u16 id) +void mlx5_sf_hw_table_sf_deferred_free(struct mlx5_core_dev *dev, u32 controller, u16 id) { struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; u32 out[MLX5_ST_SZ_DW(query_vhca_state_out)] = {}; + struct mlx5_sf_hwc_table *hwc; u16 hw_fn_id; u8 state; int err; - hw_fn_id = mlx5_sf_sw_to_hw_id(dev, id); + hw_fn_id = mlx5_sf_sw_to_hw_id(dev, controller, id); + hwc = mlx5_sf_controller_to_hwc(dev, controller); mutex_lock(&table->table_lock); err = mlx5_cmd_query_vhca_state(dev, hw_fn_id, out, sizeof(out)); if (err) @@ -124,53 +190,102 @@ void mlx5_sf_hw_table_sf_deferred_free(struct mlx5_core_dev *dev, u16 id) state = MLX5_GET(query_vhca_state_out, out, vhca_state_context.vhca_state); if (state == MLX5_VHCA_STATE_ALLOCATED) { mlx5_cmd_dealloc_sf(dev, hw_fn_id); - table->sfs[id].allocated = false; + hwc->sfs[id].allocated = false; } else { - table->sfs[id].pending_delete = true; + hwc->sfs[id].pending_delete = true; } err: mutex_unlock(&table->table_lock); } -static void mlx5_sf_hw_dealloc_all(struct mlx5_sf_hw_table *table) +static void mlx5_sf_hw_table_hwc_dealloc_all(struct mlx5_core_dev *dev, + struct mlx5_sf_hwc_table *hwc) { int i; - for (i = 0; i < table->max_local_functions; i++) { - if (table->sfs[i].allocated) - _mlx5_sf_hw_id_free(table->dev, i); + for (i = 0; i < hwc->max_fn; i++) { + if (hwc->sfs[i].allocated) + mlx5_sf_hw_table_hwc_sf_free(dev, hwc, i); } } +static void mlx5_sf_hw_table_dealloc_all(struct mlx5_sf_hw_table *table) +{ + mlx5_sf_hw_table_hwc_dealloc_all(table->dev, &table->hwc[MLX5_SF_HWC_EXTERNAL]); + mlx5_sf_hw_table_hwc_dealloc_all(table->dev, &table->hwc[MLX5_SF_HWC_LOCAL]); +} + +static int mlx5_sf_hw_table_hwc_init(struct mlx5_sf_hwc_table *hwc, u16 max_fn, u16 base_id) +{ + struct mlx5_sf_hw *sfs; + + if (!max_fn) + return 0; + + sfs = kcalloc(max_fn, sizeof(*sfs), GFP_KERNEL); + if (!sfs) + return -ENOMEM; + + hwc->sfs = sfs; + hwc->max_fn = max_fn; + hwc->start_fn_id = base_id; + return 0; +} + +static void mlx5_sf_hw_table_hwc_cleanup(struct mlx5_sf_hwc_table *hwc) +{ + kfree(hwc->sfs); +} + int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev) { struct mlx5_sf_hw_table *table; - struct mlx5_sf_hw *sfs; - int max_functions; + u16 max_ext_fn = 0; + u16 ext_base_id; + u16 max_fn = 0; + u16 base_id; + int err; - if (!mlx5_sf_supported(dev) || !mlx5_vhca_event_supported(dev)) + if (!mlx5_vhca_event_supported(dev)) + return 0; + + if (mlx5_sf_supported(dev)) + max_fn = mlx5_sf_max_functions(dev); + + err = mlx5_esw_sf_max_hpf_functions(dev, &max_ext_fn, &ext_base_id); + if (err) + return err; + + if (!max_fn && !max_ext_fn) return 0; - max_functions = mlx5_sf_max_functions(dev); table = kzalloc(sizeof(*table), GFP_KERNEL); if (!table) return -ENOMEM; - sfs = kcalloc(max_functions, sizeof(*sfs), GFP_KERNEL); - if (!sfs) - goto table_err; - mutex_init(&table->table_lock); table->dev = dev; - table->sfs = sfs; - table->max_local_functions = max_functions; dev->priv.sf_hw_table = table; - mlx5_core_dbg(dev, "SF HW table: max sfs = %d\n", max_functions); + + base_id = mlx5_sf_start_function_id(dev); + err = mlx5_sf_hw_table_hwc_init(&table->hwc[MLX5_SF_HWC_LOCAL], max_fn, base_id); + if (err) + goto table_err; + + err = mlx5_sf_hw_table_hwc_init(&table->hwc[MLX5_SF_HWC_EXTERNAL], + max_ext_fn, ext_base_id); + if (err) + goto ext_err; + + mlx5_core_dbg(dev, "SF HW table: max sfs = %d, ext sfs = %d\n", max_fn, max_ext_fn); return 0; +ext_err: + mlx5_sf_hw_table_hwc_cleanup(&table->hwc[MLX5_SF_HWC_LOCAL]); table_err: + mutex_destroy(&table->table_lock); kfree(table); - return -ENOMEM; + return err; } void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev) @@ -181,7 +296,8 @@ void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev) return; mutex_destroy(&table->table_lock); - kfree(table->sfs); + mlx5_sf_hw_table_hwc_cleanup(&table->hwc[MLX5_SF_HWC_EXTERNAL]); + mlx5_sf_hw_table_hwc_cleanup(&table->hwc[MLX5_SF_HWC_LOCAL]); kfree(table); } @@ -189,21 +305,26 @@ static int mlx5_sf_hw_vhca_event(struct notifier_block *nb, unsigned long opcode { struct mlx5_sf_hw_table *table = container_of(nb, struct mlx5_sf_hw_table, vhca_nb); const struct mlx5_vhca_state_event *event = data; + struct mlx5_sf_hwc_table *hwc; struct mlx5_sf_hw *sf_hw; u16 sw_id; if (event->new_vhca_state != MLX5_VHCA_STATE_ALLOCATED) return 0; - sw_id = mlx5_sf_hw_to_sw_id(table->dev, event->function_id); - sf_hw = &table->sfs[sw_id]; + hwc = mlx5_sf_table_fn_to_hwc(table, event->function_id); + if (!hwc) + return 0; + + sw_id = mlx5_sf_hw_to_sw_id(hwc, event->function_id); + sf_hw = &hwc->sfs[sw_id]; mutex_lock(&table->table_lock); /* SF driver notified through firmware that SF is finally detached. * Hence recycle the sf hardware id for reuse. */ if (sf_hw->allocated && sf_hw->pending_delete) - _mlx5_sf_hw_id_free(table->dev, sw_id); + mlx5_sf_hw_table_hwc_sf_free(table->dev, hwc, sw_id); mutex_unlock(&table->table_lock); return 0; } @@ -228,5 +349,10 @@ void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev) mlx5_vhca_event_notifier_unregister(dev, &table->vhca_nb); /* Dealloc SFs whose firmware event has been missed. */ - mlx5_sf_hw_dealloc_all(table); + mlx5_sf_hw_table_dealloc_all(table); +} + +bool mlx5_sf_hw_table_supported(const struct mlx5_core_dev *dev) +{ + return !!dev->priv.sf_hw_table; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h index cb02a51d0986..7114f3fc335f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h @@ -12,10 +12,11 @@ int mlx5_cmd_dealloc_sf(struct mlx5_core_dev *dev, u16 function_id); int mlx5_cmd_sf_enable_hca(struct mlx5_core_dev *dev, u16 func_id); int mlx5_cmd_sf_disable_hca(struct mlx5_core_dev *dev, u16 func_id); -u16 mlx5_sf_sw_to_hw_id(const struct mlx5_core_dev *dev, u16 sw_id); +u16 mlx5_sf_sw_to_hw_id(struct mlx5_core_dev *dev, u32 controller, u16 sw_id); -int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 usr_sfnum); -void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u16 id); -void mlx5_sf_hw_table_sf_deferred_free(struct mlx5_core_dev *dev, u16 id); +int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 controller, u32 usr_sfnum); +void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u32 controller, u16 id); +void mlx5_sf_hw_table_sf_deferred_free(struct mlx5_core_dev *dev, u32 controller, u16 id); +bool mlx5_sf_hw_table_supported(const struct mlx5_core_dev *dev); #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index e05c5c0f3ae1..457ad42eaa2a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -1151,20 +1151,6 @@ u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev) } EXPORT_SYMBOL_GPL(mlx5_query_nic_system_image_guid); -/** - * mlx5_eswitch_get_total_vports - Get total vports of the eswitch - * - * @dev: Pointer to core device - * - * mlx5_eswitch_get_total_vports returns total number of vports for - * the eswitch. - */ -u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev) -{ - return MLX5_SPECIAL_VPORTS(dev) + mlx5_core_max_vfs(dev) + mlx5_sf_max_functions(dev); -} -EXPORT_SYMBOL_GPL(mlx5_eswitch_get_total_vports); - int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out) { u16 opmod = (MLX5_CAP_GENERAL << 1) | (HCA_CAP_OPMOD_GET_MAX & 0x01); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c index 713ee3041d49..bea978df7713 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c @@ -364,6 +364,7 @@ int nfp_devlink_port_register(struct nfp_app *app, struct nfp_port *port) attrs.split = eth_port.is_split; attrs.splittable = !attrs.split; + attrs.lanes = eth_port.port_lanes; attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; attrs.phys.port_number = eth_port.label_port; attrs.phys.split_subport_number = eth_port.label_subport; diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 9e5dad41cdc9..4afff320dfd0 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -913,31 +913,20 @@ static int ravb_poll(struct napi_struct *napi, int budget) int q = napi - priv->napi; int mask = BIT(q); int quota = budget; - u32 ris0, tis; - for (;;) { - tis = ravb_read(ndev, TIS); - ris0 = ravb_read(ndev, RIS0); - if (!((ris0 & mask) || (tis & mask))) - break; + /* Processing RX Descriptor Ring */ + /* Clear RX interrupt */ + ravb_write(ndev, ~(mask | RIS0_RESERVED), RIS0); + if (ravb_rx(ndev, "a, q)) + goto out; - /* Processing RX Descriptor Ring */ - if (ris0 & mask) { - /* Clear RX interrupt */ - ravb_write(ndev, ~(mask | RIS0_RESERVED), RIS0); - if (ravb_rx(ndev, "a, q)) - goto out; - } - /* Processing TX Descriptor Ring */ - if (tis & mask) { - spin_lock_irqsave(&priv->lock, flags); - /* Clear TX interrupt */ - ravb_write(ndev, ~(mask | TIS_RESERVED), TIS); - ravb_tx_free(ndev, q, true); - netif_wake_subqueue(ndev, q); - spin_unlock_irqrestore(&priv->lock, flags); - } - } + /* Processing RX Descriptor Ring */ + spin_lock_irqsave(&priv->lock, flags); + /* Clear TX interrupt */ + ravb_write(ndev, ~(mask | TIS_RESERVED), TIS); + ravb_tx_free(ndev, q, true); + netif_wake_subqueue(ndev, q); + spin_unlock_irqrestore(&priv->lock, flags); napi_complete(napi); diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index c873f961d5a5..c3f35da1b82a 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -2944,8 +2944,7 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) /* Get the transmit queue */ tx_ev_q_label = EFX_QWORD_FIELD(*event, ESF_DZ_TX_QLABEL); - tx_queue = efx_channel_get_tx_queue(channel, - tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); + tx_queue = channel->tx_queue + (tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); if (!tx_queue->timestamping) { /* Transmit completion */ diff --git a/drivers/net/ethernet/sfc/farch.c b/drivers/net/ethernet/sfc/farch.c index d75cf5ff5686..49df02ecee91 100644 --- a/drivers/net/ethernet/sfc/farch.c +++ b/drivers/net/ethernet/sfc/farch.c @@ -835,14 +835,14 @@ efx_farch_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) /* Transmit completion */ tx_ev_desc_ptr = EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_DESC_PTR); tx_ev_q_label = EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_Q_LABEL); - tx_queue = efx_channel_get_tx_queue( - channel, tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); + tx_queue = channel->tx_queue + + (tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); efx_xmit_done(tx_queue, tx_ev_desc_ptr); } else if (EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_WQ_FF_FULL)) { /* Rewrite the FIFO write pointer */ tx_ev_q_label = EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_Q_LABEL); - tx_queue = efx_channel_get_tx_queue( - channel, tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); + tx_queue = channel->tx_queue + + (tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); netif_tx_lock(efx->net_dev); efx_farch_notify_tx_desc(tx_queue); @@ -1081,16 +1081,16 @@ static void efx_farch_handle_tx_flush_done(struct efx_nic *efx, efx_qword_t *event) { struct efx_tx_queue *tx_queue; + struct efx_channel *channel; int qid; qid = EFX_QWORD_FIELD(*event, FSF_AZ_DRIVER_EV_SUBDATA); if (qid < EFX_MAX_TXQ_PER_CHANNEL * (efx->n_tx_channels + efx->n_extra_tx_channels)) { - tx_queue = efx_get_tx_queue(efx, qid / EFX_MAX_TXQ_PER_CHANNEL, - qid % EFX_MAX_TXQ_PER_CHANNEL); - if (atomic_cmpxchg(&tx_queue->flush_outstanding, 1, 0)) { + channel = efx_get_tx_channel(efx, qid / EFX_MAX_TXQ_PER_CHANNEL); + tx_queue = channel->tx_queue + (qid % EFX_MAX_TXQ_PER_CHANNEL); + if (atomic_cmpxchg(&tx_queue->flush_outstanding, 1, 0)) efx_farch_magic_event(tx_queue->channel, EFX_CHANNEL_MAGIC_TX_DRAIN(tx_queue)); - } } } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 372090e8ee6f..a9a984c57d78 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3303,8 +3303,15 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp) /* Enable TSO */ if (priv->tso) { - for (chan = 0; chan < tx_cnt; chan++) + for (chan = 0; chan < tx_cnt; chan++) { + struct stmmac_tx_queue *tx_q = &priv->tx_queue[chan]; + + /* TSO and TBS cannot co-exist */ + if (tx_q->tbs & STMMAC_TBS_AVAIL) + continue; + stmmac_enable_tso(priv, priv->ioaddr, 1, chan); + } } /* Enable Split Header */ @@ -3674,9 +3681,8 @@ int stmmac_open(struct net_device *dev) struct stmmac_tx_queue *tx_q = &priv->tx_queue[chan]; int tbs_en = priv->plat->tx_queues_cfg[chan].tbs_en; + /* Setup per-TXQ tbs flag before TX descriptor alloc */ tx_q->tbs |= tbs_en ? STMMAC_TBS_AVAIL : 0; - if (stmmac_enable_tbs(priv, priv->ioaddr, tbs_en, chan)) - tx_q->tbs &= ~STMMAC_TBS_AVAIL; } ret = alloc_dma_desc_resources(priv); diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index 14e7da7d302f..f9417b44cae8 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -169,11 +169,11 @@ static const char emac_version_string[] = "TI DaVinci EMAC Linux v6.1"; /* EMAC mac_status register */ #define EMAC_MACSTATUS_TXERRCODE_MASK (0xF00000) #define EMAC_MACSTATUS_TXERRCODE_SHIFT (20) -#define EMAC_MACSTATUS_TXERRCH_MASK (0x7) +#define EMAC_MACSTATUS_TXERRCH_MASK (0x70000) #define EMAC_MACSTATUS_TXERRCH_SHIFT (16) #define EMAC_MACSTATUS_RXERRCODE_MASK (0xF000) #define EMAC_MACSTATUS_RXERRCODE_SHIFT (12) -#define EMAC_MACSTATUS_RXERRCH_MASK (0x7) +#define EMAC_MACSTATUS_RXERRCH_MASK (0x700) #define EMAC_MACSTATUS_RXERRCH_SHIFT (8) /* EMAC RX register masks */ diff --git a/drivers/net/ethernet/xscale/Kconfig b/drivers/net/ethernet/xscale/Kconfig index 7b83a6e5d894..468ffe3d1707 100644 --- a/drivers/net/ethernet/xscale/Kconfig +++ b/drivers/net/ethernet/xscale/Kconfig @@ -22,6 +22,7 @@ config IXP4XX_ETH tristate "Intel IXP4xx Ethernet support" depends on ARM && ARCH_IXP4XX && IXP4XX_NPE && IXP4XX_QMGR select PHYLIB + select OF_MDIO if OF select NET_PTP_CLASSIFY help Say Y here if you want to use built-in Ethernet ports diff --git a/drivers/net/ethernet/xscale/ixp4xx_eth.c b/drivers/net/ethernet/xscale/ixp4xx_eth.c index 0152f1e70783..cb89323855d8 100644 --- a/drivers/net/ethernet/xscale/ixp4xx_eth.c +++ b/drivers/net/ethernet/xscale/ixp4xx_eth.c @@ -28,6 +28,7 @@ #include <linux/kernel.h> #include <linux/net_tstamp.h> #include <linux/of.h> +#include <linux/of_mdio.h> #include <linux/phy.h> #include <linux/platform_data/eth_ixp4xx.h> #include <linux/platform_device.h> @@ -165,7 +166,6 @@ struct eth_regs { }; struct port { - struct resource *mem_res; struct eth_regs __iomem *regs; struct npe *npe; struct net_device *netdev; @@ -250,6 +250,7 @@ static inline void memcpy_swab32(u32 *dest, u32 *src, int cnt) static DEFINE_SPINLOCK(mdio_lock); static struct eth_regs __iomem *mdio_regs; /* mdio command and status only */ static struct mii_bus *mdio_bus; +static struct device_node *mdio_bus_np; static int ports_open; static struct port *npe_port_tab[MAX_NPES]; static struct dma_pool *dma_pool; @@ -533,7 +534,8 @@ static int ixp4xx_mdio_register(struct eth_regs __iomem *regs) mdio_bus->write = &ixp4xx_mdio_write; snprintf(mdio_bus->id, MII_BUS_ID_SIZE, "ixp4xx-eth-0"); - if ((err = mdiobus_register(mdio_bus))) + err = of_mdiobus_register(mdio_bus, mdio_bus_np); + if (err) mdiobus_free(mdio_bus); return err; } @@ -1085,7 +1087,7 @@ static int init_queues(struct port *port) int i; if (!ports_open) { - dma_pool = dma_pool_create(DRV_NAME, port->netdev->dev.parent, + dma_pool = dma_pool_create(DRV_NAME, &port->netdev->dev, POOL_ALLOC_SIZE, 32, 0); if (!dma_pool) return -ENOMEM; @@ -1358,19 +1360,118 @@ static const struct net_device_ops ixp4xx_netdev_ops = { .ndo_validate_addr = eth_validate_addr, }; +#ifdef CONFIG_OF +static struct eth_plat_info *ixp4xx_of_get_platdata(struct device *dev) +{ + struct device_node *np = dev->of_node; + struct of_phandle_args queue_spec; + struct of_phandle_args npe_spec; + struct device_node *mdio_np; + struct eth_plat_info *plat; + int ret; + + plat = devm_kzalloc(dev, sizeof(*plat), GFP_KERNEL); + if (!plat) + return NULL; + + ret = of_parse_phandle_with_fixed_args(np, "intel,npe-handle", 1, 0, + &npe_spec); + if (ret) { + dev_err(dev, "no NPE engine specified\n"); + return NULL; + } + /* NPE ID 0x00, 0x10, 0x20... */ + plat->npe = (npe_spec.args[0] << 4); + + /* Check if this device has an MDIO bus */ + mdio_np = of_get_child_by_name(np, "mdio"); + if (mdio_np) { + plat->has_mdio = true; + mdio_bus_np = mdio_np; + /* DO NOT put the mdio_np, it will be used */ + } + + /* Get the rx queue as a resource from queue manager */ + ret = of_parse_phandle_with_fixed_args(np, "queue-rx", 1, 0, + &queue_spec); + if (ret) { + dev_err(dev, "no rx queue phandle\n"); + return NULL; + } + plat->rxq = queue_spec.args[0]; + + /* Get the txready queue as resource from queue manager */ + ret = of_parse_phandle_with_fixed_args(np, "queue-txready", 1, 0, + &queue_spec); + if (ret) { + dev_err(dev, "no txready queue phandle\n"); + return NULL; + } + plat->txreadyq = queue_spec.args[0]; + + return plat; +} +#else +static struct eth_plat_info *ixp4xx_of_get_platdata(struct device *dev) +{ + return NULL; +} +#endif + static int ixp4xx_eth_probe(struct platform_device *pdev) { - char phy_id[MII_BUS_ID_SIZE + 3]; struct phy_device *phydev = NULL; struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; struct eth_plat_info *plat; - resource_size_t regs_phys; struct net_device *ndev; struct resource *res; struct port *port; int err; - plat = dev_get_platdata(dev); + if (np) { + plat = ixp4xx_of_get_platdata(dev); + if (!plat) + return -ENODEV; + } else { + plat = dev_get_platdata(dev); + if (!plat) + return -ENODEV; + plat->npe = pdev->id; + switch (plat->npe) { + case IXP4XX_ETH_NPEA: + /* If the MDIO bus is not up yet, defer probe */ + break; + case IXP4XX_ETH_NPEB: + /* On all except IXP43x, NPE-B is used for the MDIO bus. + * If there is no NPE-B in the feature set, bail out, + * else we have the MDIO bus here. + */ + if (!cpu_is_ixp43x()) { + if (!(ixp4xx_read_feature_bits() & + IXP4XX_FEATURE_NPEB_ETH0)) + return -ENODEV; + /* Else register the MDIO bus on NPE-B */ + plat->has_mdio = true; + } + break; + case IXP4XX_ETH_NPEC: + /* IXP43x lacks NPE-B and uses NPE-C for the MDIO bus + * access, if there is no NPE-C, no bus, nothing works, + * so bail out. + */ + if (cpu_is_ixp43x()) { + if (!(ixp4xx_read_feature_bits() & + IXP4XX_FEATURE_NPEC_ETH)) + return -ENODEV; + /* Else register the MDIO bus on NPE-B */ + plat->has_mdio = true; + } + break; + default: + return -ENODEV; + } + } if (!(ndev = devm_alloc_etherdev(dev, sizeof(struct port)))) return -ENOMEM; @@ -1378,75 +1479,42 @@ static int ixp4xx_eth_probe(struct platform_device *pdev) SET_NETDEV_DEV(ndev, dev); port = netdev_priv(ndev); port->netdev = ndev; - port->id = pdev->id; + port->id = plat->npe; /* Get the port resource and remap */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) return -ENODEV; - regs_phys = res->start; port->regs = devm_ioremap_resource(dev, res); if (IS_ERR(port->regs)) return PTR_ERR(port->regs); - switch (port->id) { - case IXP4XX_ETH_NPEA: - /* If the MDIO bus is not up yet, defer probe */ - if (!mdio_bus) - return -EPROBE_DEFER; - break; - case IXP4XX_ETH_NPEB: - /* - * On all except IXP43x, NPE-B is used for the MDIO bus. - * If there is no NPE-B in the feature set, bail out, else - * register the MDIO bus. - */ - if (!cpu_is_ixp43x()) { - if (!(ixp4xx_read_feature_bits() & - IXP4XX_FEATURE_NPEB_ETH0)) - return -ENODEV; - /* Else register the MDIO bus on NPE-B */ - if ((err = ixp4xx_mdio_register(port->regs))) - return err; - } - if (!mdio_bus) - return -EPROBE_DEFER; - break; - case IXP4XX_ETH_NPEC: - /* - * IXP43x lacks NPE-B and uses NPE-C for the MDIO bus access, - * of there is no NPE-C, no bus, nothing works, so bail out. - */ - if (cpu_is_ixp43x()) { - if (!(ixp4xx_read_feature_bits() & - IXP4XX_FEATURE_NPEC_ETH)) - return -ENODEV; - /* Else register the MDIO bus on NPE-C */ - if ((err = ixp4xx_mdio_register(port->regs))) - return err; + /* Register the MDIO bus if we have it */ + if (plat->has_mdio) { + err = ixp4xx_mdio_register(port->regs); + if (err) { + dev_err(dev, "failed to register MDIO bus\n"); + return err; } - if (!mdio_bus) - return -EPROBE_DEFER; - break; - default: - return -ENODEV; } + /* If the instance with the MDIO bus has not yet appeared, + * defer probing until it gets probed. + */ + if (!mdio_bus) + return -EPROBE_DEFER; ndev->netdev_ops = &ixp4xx_netdev_ops; ndev->ethtool_ops = &ixp4xx_ethtool_ops; ndev->tx_queue_len = 100; + /* Inherit the DMA masks from the platform device */ + ndev->dev.dma_mask = dev->dma_mask; + ndev->dev.coherent_dma_mask = dev->coherent_dma_mask; netif_napi_add(ndev, &port->napi, eth_poll, NAPI_WEIGHT); if (!(port->npe = npe_request(NPE_ID(port->id)))) return -EIO; - port->mem_res = request_mem_region(regs_phys, REGS_SIZE, ndev->name); - if (!port->mem_res) { - err = -EBUSY; - goto err_npe_rel; - } - port->plat = plat; npe_port_tab[NPE_ID(port->id)] = port; memcpy(ndev->dev_addr, plat->hwaddr, ETH_ALEN); @@ -1459,12 +1527,24 @@ static int ixp4xx_eth_probe(struct platform_device *pdev) __raw_writel(DEFAULT_CORE_CNTRL, &port->regs->core_control); udelay(50); - snprintf(phy_id, MII_BUS_ID_SIZE + 3, PHY_ID_FMT, - mdio_bus->id, plat->phy); - phydev = phy_connect(ndev, phy_id, &ixp4xx_adjust_link, - PHY_INTERFACE_MODE_MII); - if (IS_ERR(phydev)) { - err = PTR_ERR(phydev); + if (np) { + phydev = of_phy_get_and_connect(ndev, np, ixp4xx_adjust_link); + } else { + phydev = mdiobus_get_phy(mdio_bus, plat->phy); + if (IS_ERR(phydev)) { + err = PTR_ERR(phydev); + dev_err(dev, "could not connect phydev (%d)\n", err); + goto err_free_mem; + } + err = phy_connect_direct(ndev, phydev, ixp4xx_adjust_link, + PHY_INTERFACE_MODE_MII); + if (err) + goto err_free_mem; + + } + if (!phydev) { + err = -ENODEV; + dev_err(dev, "no phydev\n"); goto err_free_mem; } @@ -1482,8 +1562,6 @@ err_phy_dis: phy_disconnect(phydev); err_free_mem: npe_port_tab[NPE_ID(port->id)] = NULL; - release_resource(port->mem_res); -err_npe_rel: npe_release(port->npe); return err; } @@ -1499,12 +1577,21 @@ static int ixp4xx_eth_remove(struct platform_device *pdev) ixp4xx_mdio_remove(); npe_port_tab[NPE_ID(port->id)] = NULL; npe_release(port->npe); - release_resource(port->mem_res); return 0; } +static const struct of_device_id ixp4xx_eth_of_match[] = { + { + .compatible = "intel,ixp4xx-ethernet", + }, + { }, +}; + static struct platform_driver ixp4xx_eth_driver = { - .driver.name = DRV_NAME, + .driver = { + .name = DRV_NAME, + .of_match_table = of_match_ptr(ixp4xx_eth_of_match), + }, .probe = ixp4xx_eth_probe, .remove = ixp4xx_eth_remove, }; diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 072de880b99f..1ab94b5f9bbf 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -892,7 +892,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) + if (!pskb_inet_may_pull(skb)) return -EINVAL; sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); @@ -989,7 +989,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) + if (!pskb_inet_may_pull(skb)) return -EINVAL; sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 7349a70af083..f682a5572d84 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2297,6 +2297,7 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev) { struct device *parent = vf_netdev->dev.parent; struct net_device_context *ndev_ctx; + struct net_device *ndev; struct pci_dev *pdev; u32 serial; @@ -2319,8 +2320,17 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev) if (!ndev_ctx->vf_alloc) continue; - if (ndev_ctx->vf_serial == serial) - return hv_get_drvdata(ndev_ctx->device_ctx); + if (ndev_ctx->vf_serial != serial) + continue; + + ndev = hv_get_drvdata(ndev_ctx->device_ctx); + if (ndev->addr_len != vf_netdev->addr_len || + memcmp(ndev->perm_addr, vf_netdev->perm_addr, + ndev->addr_len) != 0) + continue; + + return ndev; + } netdev_notice(vf_netdev, diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 9a9a5cf36a4b..7427b989607e 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -423,18 +423,24 @@ static void macvlan_forward_source_one(struct sk_buff *skb, macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); } -static void macvlan_forward_source(struct sk_buff *skb, +static bool macvlan_forward_source(struct sk_buff *skb, struct macvlan_port *port, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &port->vlan_source_hash[idx]; + bool consume = false; hlist_for_each_entry_rcu(entry, h, hlist) { - if (ether_addr_equal_64bits(entry->addr, addr)) + if (ether_addr_equal_64bits(entry->addr, addr)) { + if (entry->vlan->flags & MACVLAN_FLAG_NODST) + consume = true; macvlan_forward_source_one(skb, entry->vlan); + } } + + return consume; } /* called under rcu_read_lock() from netif_receive_skb */ @@ -463,7 +469,8 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) return RX_HANDLER_CONSUMED; *pskb = skb; eth = eth_hdr(skb); - macvlan_forward_source(skb, port, eth->h_source); + if (macvlan_forward_source(skb, port, eth->h_source)) + return RX_HANDLER_CONSUMED; src = macvlan_hash_lookup(port, eth->h_source); if (src && src->mode != MACVLAN_MODE_VEPA && src->mode != MACVLAN_MODE_BRIDGE) { @@ -482,7 +489,8 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) return RX_HANDLER_PASS; } - macvlan_forward_source(skb, port, eth->h_source); + if (macvlan_forward_source(skb, port, eth->h_source)) + return RX_HANDLER_CONSUMED; if (macvlan_passthru(port)) vlan = list_first_or_null_rcu(&port->vlans, struct macvlan_dev, list); @@ -1286,7 +1294,8 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[], return 0; if (data[IFLA_MACVLAN_FLAGS] && - nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~MACVLAN_FLAG_NOPROMISC) + nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~(MACVLAN_FLAG_NOPROMISC | + MACVLAN_FLAG_NODST)) return -EINVAL; if (data[IFLA_MACVLAN_MODE]) { diff --git a/drivers/net/phy/intel-xway.c b/drivers/net/phy/intel-xway.c index 6eac50d4b42f..d453ec016168 100644 --- a/drivers/net/phy/intel-xway.c +++ b/drivers/net/phy/intel-xway.c @@ -11,6 +11,18 @@ #define XWAY_MDIO_IMASK 0x19 /* interrupt mask */ #define XWAY_MDIO_ISTAT 0x1A /* interrupt status */ +#define XWAY_MDIO_LED 0x1B /* led control */ + +/* bit 15:12 are reserved */ +#define XWAY_MDIO_LED_LED3_EN BIT(11) /* Enable the integrated function of LED3 */ +#define XWAY_MDIO_LED_LED2_EN BIT(10) /* Enable the integrated function of LED2 */ +#define XWAY_MDIO_LED_LED1_EN BIT(9) /* Enable the integrated function of LED1 */ +#define XWAY_MDIO_LED_LED0_EN BIT(8) /* Enable the integrated function of LED0 */ +/* bit 7:4 are reserved */ +#define XWAY_MDIO_LED_LED3_DA BIT(3) /* Direct Access to LED3 */ +#define XWAY_MDIO_LED_LED2_DA BIT(2) /* Direct Access to LED2 */ +#define XWAY_MDIO_LED_LED1_DA BIT(1) /* Direct Access to LED1 */ +#define XWAY_MDIO_LED_LED0_DA BIT(0) /* Direct Access to LED0 */ #define XWAY_MDIO_INIT_WOL BIT(15) /* Wake-On-LAN */ #define XWAY_MDIO_INIT_MSRE BIT(14) @@ -159,6 +171,15 @@ static int xway_gphy_config_init(struct phy_device *phydev) /* Clear all pending interrupts */ phy_read(phydev, XWAY_MDIO_ISTAT); + /* Ensure that integrated led function is enabled for all leds */ + err = phy_write(phydev, XWAY_MDIO_LED, + XWAY_MDIO_LED_LED0_EN | + XWAY_MDIO_LED_LED1_EN | + XWAY_MDIO_LED_LED2_EN | + XWAY_MDIO_LED_LED3_EN); + if (err) + return err; + phy_write_mmd(phydev, MDIO_MMD_VEND2, XWAY_MMD_LEDCH, XWAY_MMD_LEDCH_NACS_NONE | XWAY_MMD_LEDCH_SBF_F02HZ | diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index e2b2b20c0dc5..a61fde7013bd 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -978,22 +978,28 @@ static int m88e1111_get_downshift(struct phy_device *phydev, u8 *data) static int m88e1111_set_downshift(struct phy_device *phydev, u8 cnt) { - int val; + int val, err; if (cnt > MII_M1111_PHY_EXT_CR_DOWNSHIFT_MAX) return -E2BIG; - if (!cnt) - return phy_clear_bits(phydev, MII_M1111_PHY_EXT_CR, - MII_M1111_PHY_EXT_CR_DOWNSHIFT_EN); + if (!cnt) { + err = phy_clear_bits(phydev, MII_M1111_PHY_EXT_CR, + MII_M1111_PHY_EXT_CR_DOWNSHIFT_EN); + } else { + val = MII_M1111_PHY_EXT_CR_DOWNSHIFT_EN; + val |= FIELD_PREP(MII_M1111_PHY_EXT_CR_DOWNSHIFT_MASK, cnt - 1); - val = MII_M1111_PHY_EXT_CR_DOWNSHIFT_EN; - val |= FIELD_PREP(MII_M1111_PHY_EXT_CR_DOWNSHIFT_MASK, cnt - 1); + err = phy_modify(phydev, MII_M1111_PHY_EXT_CR, + MII_M1111_PHY_EXT_CR_DOWNSHIFT_EN | + MII_M1111_PHY_EXT_CR_DOWNSHIFT_MASK, + val); + } - return phy_modify(phydev, MII_M1111_PHY_EXT_CR, - MII_M1111_PHY_EXT_CR_DOWNSHIFT_EN | - MII_M1111_PHY_EXT_CR_DOWNSHIFT_MASK, - val); + if (err < 0) + return err; + + return genphy_soft_reset(phydev); } static int m88e1111_get_tunable(struct phy_device *phydev, @@ -1036,22 +1042,28 @@ static int m88e1011_get_downshift(struct phy_device *phydev, u8 *data) static int m88e1011_set_downshift(struct phy_device *phydev, u8 cnt) { - int val; + int val, err; if (cnt > MII_M1011_PHY_SCR_DOWNSHIFT_MAX) return -E2BIG; - if (!cnt) - return phy_clear_bits(phydev, MII_M1011_PHY_SCR, - MII_M1011_PHY_SCR_DOWNSHIFT_EN); + if (!cnt) { + err = phy_clear_bits(phydev, MII_M1011_PHY_SCR, + MII_M1011_PHY_SCR_DOWNSHIFT_EN); + } else { + val = MII_M1011_PHY_SCR_DOWNSHIFT_EN; + val |= FIELD_PREP(MII_M1011_PHY_SCR_DOWNSHIFT_MASK, cnt - 1); - val = MII_M1011_PHY_SCR_DOWNSHIFT_EN; - val |= FIELD_PREP(MII_M1011_PHY_SCR_DOWNSHIFT_MASK, cnt - 1); + err = phy_modify(phydev, MII_M1011_PHY_SCR, + MII_M1011_PHY_SCR_DOWNSHIFT_EN | + MII_M1011_PHY_SCR_DOWNSHIFT_MASK, + val); + } - return phy_modify(phydev, MII_M1011_PHY_SCR, - MII_M1011_PHY_SCR_DOWNSHIFT_EN | - MII_M1011_PHY_SCR_DOWNSHIFT_MASK, - val); + if (err < 0) + return err; + + return genphy_soft_reset(phydev); } static int m88e1011_get_tunable(struct phy_device *phydev, diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 9986f8969d02..136ea06540ff 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -767,8 +767,6 @@ enum rtl8152_flags { PHY_RESET, SCHEDULE_TASKLET, GREEN_ETHERNET, - DELL_TB_RX_AGG_BUG, - LENOVO_MACPASSTHRU, }; #define DEVICE_ID_THINKPAD_THUNDERBOLT3_DOCK_GEN2 0x3082 @@ -934,6 +932,8 @@ struct r8152 { u32 fc_pause_on, fc_pause_off; u32 support_2500full:1; + u32 lenovo_macpassthru:1; + u32 dell_tb_rx_agg_bug:1; u16 ocp_base; u16 speed; u16 eee_adv; @@ -1594,7 +1594,7 @@ static int vendor_mac_passthru_addr_read(struct r8152 *tp, struct sockaddr *sa) acpi_object_type mac_obj_type; int mac_strlen; - if (test_bit(LENOVO_MACPASSTHRU, &tp->flags)) { + if (tp->lenovo_macpassthru) { mac_obj_name = "\\MACA"; mac_obj_type = ACPI_TYPE_STRING; mac_strlen = 0x16; @@ -2283,7 +2283,7 @@ static int r8152_tx_agg_fill(struct r8152 *tp, struct tx_agg *agg) remain = agg_buf_sz - (int)(tx_agg_align(tx_data) - agg->head); - if (test_bit(DELL_TB_RX_AGG_BUG, &tp->flags)) + if (tp->dell_tb_rx_agg_bug) break; } @@ -6941,7 +6941,7 @@ static void r8153_init(struct r8152 *tp) /* rx aggregation */ ocp_data = ocp_read_word(tp, MCU_TYPE_USB, USB_USB_CTRL); ocp_data &= ~(RX_AGG_DISABLE | RX_ZERO_EN); - if (test_bit(DELL_TB_RX_AGG_BUG, &tp->flags)) + if (tp->dell_tb_rx_agg_bug) ocp_data |= RX_AGG_DISABLE; ocp_write_word(tp, MCU_TYPE_USB, USB_USB_CTRL, ocp_data); @@ -9447,7 +9447,7 @@ static int rtl8152_probe(struct usb_interface *intf, switch (le16_to_cpu(udev->descriptor.idProduct)) { case DEVICE_ID_THINKPAD_THUNDERBOLT3_DOCK_GEN2: case DEVICE_ID_THINKPAD_USB_C_DOCK_GEN2: - set_bit(LENOVO_MACPASSTHRU, &tp->flags); + tp->lenovo_macpassthru = 1; } } @@ -9455,7 +9455,7 @@ static int rtl8152_probe(struct usb_interface *intf, (!strcmp(udev->serial, "000001000000") || !strcmp(udev->serial, "000002000000"))) { dev_info(&udev->dev, "Dell TB16 Dock, disable RX aggregation"); - set_bit(DELL_TB_RX_AGG_BUG, &tp->flags); + tp->dell_tb_rx_agg_bug = 1; } netdev->ethtool_ops = &ops; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c index 4456abb9a074..34bde8c87324 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c @@ -40,6 +40,7 @@ int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans, const u8 *cmddata[IWL_MAX_CMD_TBS_PER_TFD]; u16 cmdlen[IWL_MAX_CMD_TBS_PER_TFD]; struct iwl_tfh_tfd *tfd; + unsigned long flags; copy_size = sizeof(struct iwl_cmd_header_wide); cmd_size = sizeof(struct iwl_cmd_header_wide); @@ -108,14 +109,14 @@ int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans, goto free_dup_buf; } - spin_lock_bh(&txq->lock); + spin_lock_irqsave(&txq->lock, flags); idx = iwl_txq_get_cmd_index(txq, txq->write_ptr); tfd = iwl_txq_get_tfd(trans, txq, txq->write_ptr); memset(tfd, 0, sizeof(*tfd)); if (iwl_txq_space(trans, txq) < ((cmd->flags & CMD_ASYNC) ? 2 : 1)) { - spin_unlock_bh(&txq->lock); + spin_unlock_irqrestore(&txq->lock, flags); IWL_ERR(trans, "No space in command queue\n"); iwl_op_mode_cmd_queue_full(trans->op_mode); @@ -250,7 +251,7 @@ int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans, spin_unlock(&trans_pcie->reg_lock); out: - spin_unlock_bh(&txq->lock); + spin_unlock_irqrestore(&txq->lock, flags); free_dup_buf: if (idx < 0) kfree(dup_buf); diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 6a29fe11485d..8b77d08d4b47 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -458,7 +458,6 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); #else -struct bpf_prog; struct cgroup_bpf {}; static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c9b7a876b0c8..ad4bcf1cadbb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -310,6 +310,7 @@ enum bpf_arg_type { ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ + ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ __BPF_ARG_TYPE_MAX, }; @@ -930,7 +931,6 @@ struct bpf_link_primer { }; struct bpf_struct_ops_value; -struct btf_type; struct btf_member; #define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64 @@ -1955,6 +1955,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; +extern const struct bpf_func_proto bpf_snprintf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; @@ -2080,4 +2081,24 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); +enum bpf_printf_mod_type { + BPF_PRINTF_INT, + BPF_PRINTF_LONG, + BPF_PRINTF_LONG_LONG, +}; + +/* Workaround for getting va_list handling working with different argument type + * combinations generically for 32 and 64 bit archs. + */ +#define BPF_CAST_FMT_ARG(arg_nb, args, mod) \ + (mod[arg_nb] == BPF_PRINTF_LONG_LONG || \ + (mod[arg_nb] == BPF_PRINTF_LONG && __BITS_PER_LONG == 64) \ + ? (u64)args[arg_nb] \ + : (u32)args[arg_nb]) + +int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u64 *final_args, enum bpf_printf_mod_type *mod, + u32 num_args); +void bpf_printf_cleanup(void); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 51c2ffa3d901..6023a1367853 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -487,6 +487,15 @@ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id; } +/* unpack the IDs from the key as constructed above */ +static inline void bpf_trampoline_unpack_key(u64 key, u32 *obj_id, u32 *btf_id) +{ + if (obj_id) + *obj_id = key >> 32; + if (btf_id) + *btf_id = key & 0x7FFFFFFF; +} + int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 9cf1da2883c6..17109b65c1ac 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -65,8 +65,6 @@ struct mlx5_flow_handle * mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw, struct mlx5_eswitch_rep *rep, u32 sqn); -u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev); - #ifdef CONFIG_MLX5_ESWITCH enum devlink_eswitch_encap_mode mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev); @@ -126,6 +124,8 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw, #define ESW_TUN_SLOW_TABLE_GOTO_VPORT_MARK ESW_TUN_OPTS_MASK u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev); +u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev); + #else /* CONFIG_MLX5_ESWITCH */ static inline u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev) @@ -162,10 +162,17 @@ mlx5_eswitch_get_vport_metadata_mask(void) { return 0; } + +static inline u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev) +{ + return 0; +} + #endif /* CONFIG_MLX5_ESWITCH */ static inline bool is_mdev_switchdev_mode(struct mlx5_core_dev *dev) { return mlx5_eswitch_mode(dev) == MLX5_ESWITCH_OFFLOADS; } + #endif diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 4db87bcfce7b..aad53cb72f17 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -36,14 +36,6 @@ #include <linux/mlx5/driver.h> #include <linux/mlx5/device.h> -#define MLX5_VPORT_PF_PLACEHOLDER (1u) -#define MLX5_VPORT_UPLINK_PLACEHOLDER (1u) -#define MLX5_VPORT_ECPF_PLACEHOLDER(mdev) (mlx5_ecpf_vport_exists(mdev)) - -#define MLX5_SPECIAL_VPORTS(mdev) (MLX5_VPORT_PF_PLACEHOLDER + \ - MLX5_VPORT_UPLINK_PLACEHOLDER + \ - MLX5_VPORT_ECPF_PLACEHOLDER(mdev)) - #define MLX5_VPORT_MANAGER(mdev) \ (MLX5_CAP_GEN(mdev, vport_group_manager) && \ (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && \ diff --git a/include/linux/platform_data/eth_ixp4xx.h b/include/linux/platform_data/eth_ixp4xx.h index 6f652ea0c6ae..114b0940729f 100644 --- a/include/linux/platform_data/eth_ixp4xx.h +++ b/include/linux/platform_data/eth_ixp4xx.h @@ -14,6 +14,8 @@ struct eth_plat_info { u8 rxq; /* configurable, currently 0 - 31 only */ u8 txreadyq; u8 hwaddr[6]; + u8 npe; /* NPE instance used by this interface */ + bool has_mdio; /* If this instance has an MDIO bus */ }; #endif diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e242bf3d2b4a..aba0f0f429be 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -99,7 +99,8 @@ struct sk_psock { void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); - int (*psock_update_sk_prot)(struct sock *sk, bool restore); + int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, + bool restore); struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; @@ -404,7 +405,7 @@ static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { if (psock->psock_update_sk_prot) - psock->psock_update_sk_prot(sk, true); + psock->psock_update_sk_prot(sk, psock, true); } static inline void sk_psock_set_state(struct sk_psock *psock, diff --git a/include/net/devlink.h b/include/net/devlink.h index 853420db5d32..7c984cadfec4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -98,11 +98,13 @@ struct devlink_port_pci_vf_attrs { * @controller: Associated controller number * @sf: Associated PCI SF for of the PCI PF for this port. * @pf: Associated PCI PF number for this port. + * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_sf_attrs { u32 controller; u32 sf; u16 pf; + u8 external:1; }; /** @@ -1508,7 +1510,8 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external); void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, - u32 controller, u16 pf, u32 sf); + u32 controller, u16 pf, u32 sf, + bool external); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/include/net/sock.h b/include/net/sock.h index cadcc12cc316..42bc5e1a627f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1118,6 +1118,7 @@ struct inet_hashinfo; struct raw_hashinfo; struct smc_hashinfo; struct module; +struct sk_psock; /* * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes @@ -1189,7 +1190,9 @@ struct proto { void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); #ifdef CONFIG_BPF_SYSCALL - int (*psock_update_sk_prot)(struct sock *sk, bool restore); + int (*psock_update_sk_prot)(struct sock *sk, + struct sk_psock *psock, + bool restore); #endif /* Keeping track of sockets in use */ diff --git a/include/net/tcp.h b/include/net/tcp.h index eaea43afcc97..d05193cb0d99 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2215,7 +2215,7 @@ struct sk_psock; #ifdef CONFIG_BPF_SYSCALL struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); -int tcp_bpf_update_proto(struct sock *sk, bool restore); +int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ diff --git a/include/net/udp.h b/include/net/udp.h index f55aaeef7e91..360df454356c 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -543,7 +543,7 @@ static inline void udp_post_segment_fix_csum(struct sk_buff *skb) #ifdef CONFIG_BPF_SYSCALL struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); -int udp_bpf_update_proto(struct sock *sk, bool restore); +int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); #endif #endif /* _UDP_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 49371eba98ba..ec6d85a81744 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -312,6 +312,27 @@ union bpf_iter_link_info { * *ctx_out*, *data_out* (for example, packet data), result of the * execution *retval*, and *duration* of the test run. * + * The sizes of the buffers provided as input and output + * parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must + * be provided in the corresponding variables *ctx_size_in*, + * *ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any + * of these parameters are not provided (ie set to NULL), the + * corresponding size field must be zero. + * + * Some program types have particular requirements: + * + * **BPF_PROG_TYPE_SK_LOOKUP** + * *data_in* and *data_out* must be NULL. + * + * **BPF_PROG_TYPE_XDP** + * *ctx_in* and *ctx_out* must be NULL. + * + * **BPF_PROG_TYPE_RAW_TRACEPOINT**, + * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** + * + * *ctx_out*, *data_in* and *data_out* must be NULL. + * *repeat* must be zero. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. @@ -4061,12 +4082,20 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. * Return * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. * Return * Valid pointer with *size* bytes of memory available; NULL, * otherwise. @@ -4078,6 +4107,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -4088,6 +4121,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -4671,6 +4708,33 @@ union bpf_attr { * Return * The number of traversed map elements for success, **-EINVAL** for * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4838,6 +4902,7 @@ union bpf_attr { FN(sock_from_file), \ FN(check_mtu), \ FN(for_each_map_elem), \ + FN(snprintf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5379,6 +5444,8 @@ struct bpf_link_info { } raw_tracepoint; struct { __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ } tracing; struct { __u64 cgroup_id; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 91c8dda6d95d..cd5b382a4138 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -614,6 +614,7 @@ enum macvlan_macaddr_mode { }; #define MACVLAN_FLAG_NOPROMISC 1 +#define MACVLAN_FLAG_NODST 2 /* skip dst macvlan if matching src macvlan */ /* VRF section */ enum { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f5423251c118..5e31ee9f7512 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1363,11 +1363,10 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * __bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers * @insn: is the array of eBPF instructions - * @stack: is the eBPF storage stack * * Decode and execute eBPF instructions. */ -static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z @@ -1701,7 +1700,7 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ - return ___bpf_prog_run(regs, insn, stack); \ + return ___bpf_prog_run(regs, insn); \ } #define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size @@ -1718,7 +1717,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ BPF_R3 = r3; \ BPF_R4 = r4; \ BPF_R5 = r5; \ - return ___bpf_prog_run(regs, insn, stack); \ + return ___bpf_prog_run(regs, insn); \ } #define EVAL1(FN, X) FN(X) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f306611c4ddf..85b26ca5aacd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -669,6 +669,310 @@ const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; +static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, + size_t bufsz) +{ + void __user *user_ptr = (__force void __user *)unsafe_ptr; + + buf[0] = 0; + + switch (fmt_ptype) { + case 's': +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)unsafe_ptr < TASK_SIZE) + return strncpy_from_user_nofault(buf, user_ptr, bufsz); + fallthrough; +#endif + case 'k': + return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); + case 'u': + return strncpy_from_user_nofault(buf, user_ptr, bufsz); + } + + return -EINVAL; +} + +/* Per-cpu temp buffers which can be used by printf-like helpers for %s or %p + */ +#define MAX_PRINTF_BUF_LEN 512 + +struct bpf_printf_buf { + char tmp_buf[MAX_PRINTF_BUF_LEN]; +}; +static DEFINE_PER_CPU(struct bpf_printf_buf, bpf_printf_buf); +static DEFINE_PER_CPU(int, bpf_printf_buf_used); + +static int try_get_fmt_tmp_buf(char **tmp_buf) +{ + struct bpf_printf_buf *bufs; + int used; + + if (*tmp_buf) + return 0; + + preempt_disable(); + used = this_cpu_inc_return(bpf_printf_buf_used); + if (WARN_ON_ONCE(used > 1)) { + this_cpu_dec(bpf_printf_buf_used); + preempt_enable(); + return -EBUSY; + } + bufs = this_cpu_ptr(&bpf_printf_buf); + *tmp_buf = bufs->tmp_buf; + + return 0; +} + +void bpf_printf_cleanup(void) +{ + if (this_cpu_read(bpf_printf_buf_used)) { + this_cpu_dec(bpf_printf_buf_used); + preempt_enable(); + } +} + +/* + * bpf_parse_fmt_str - Generic pass on format strings for printf-like helpers + * + * Returns a negative value if fmt is an invalid format string or 0 otherwise. + * + * This can be used in two ways: + * - Format string verification only: when final_args and mod are NULL + * - Arguments preparation: in addition to the above verification, it writes in + * final_args a copy of raw_args where pointers from BPF have been sanitized + * into pointers safe to use by snprintf. This also writes in the mod array + * the size requirement of each argument, usable by BPF_CAST_FMT_ARG for ex. + * + * In argument preparation mode, if 0 is returned, safe temporary buffers are + * allocated and bpf_printf_cleanup should be called to free them after use. + */ +int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u64 *final_args, enum bpf_printf_mod_type *mod, + u32 num_args) +{ + char *unsafe_ptr = NULL, *tmp_buf = NULL, *fmt_end; + size_t tmp_buf_len = MAX_PRINTF_BUF_LEN; + int err, i, num_spec = 0, copy_size; + enum bpf_printf_mod_type cur_mod; + u64 cur_arg; + char fmt_ptype; + + if (!!final_args != !!mod) + return -EINVAL; + + fmt_end = strnchr(fmt, fmt_size, 0); + if (!fmt_end) + return -EINVAL; + fmt_size = fmt_end - fmt; + + for (i = 0; i < fmt_size; i++) { + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + err = -EINVAL; + goto cleanup; + } + + if (fmt[i] != '%') + continue; + + if (fmt[i + 1] == '%') { + i++; + continue; + } + + if (num_spec >= num_args) { + err = -EINVAL; + goto cleanup; + } + + /* The string is zero-terminated so if fmt[i] != 0, we can + * always access fmt[i + 1], in the worst case it will be a 0 + */ + i++; + + /* skip optional "[0 +-][num]" width formatting field */ + while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || + fmt[i] == ' ') + i++; + if (fmt[i] >= '1' && fmt[i] <= '9') { + i++; + while (fmt[i] >= '0' && fmt[i] <= '9') + i++; + } + + if (fmt[i] == 'p') { + cur_mod = BPF_PRINTF_LONG; + + if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') && + fmt[i + 2] == 's') { + fmt_ptype = fmt[i + 1]; + i += 2; + goto fmt_str; + } + + if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || + ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' || + fmt[i + 1] == 'x' || fmt[i + 1] == 'B' || + fmt[i + 1] == 's' || fmt[i + 1] == 'S') { + /* just kernel pointers */ + if (final_args) + cur_arg = raw_args[num_spec]; + goto fmt_next; + } + + /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ + if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') || + (fmt[i + 2] != '4' && fmt[i + 2] != '6')) { + err = -EINVAL; + goto cleanup; + } + + i += 2; + if (!final_args) + goto fmt_next; + + if (try_get_fmt_tmp_buf(&tmp_buf)) { + err = -EBUSY; + goto out; + } + + copy_size = (fmt[i + 2] == '4') ? 4 : 16; + if (tmp_buf_len < copy_size) { + err = -ENOSPC; + goto cleanup; + } + + unsafe_ptr = (char *)(long)raw_args[num_spec]; + err = copy_from_kernel_nofault(tmp_buf, unsafe_ptr, + copy_size); + if (err < 0) + memset(tmp_buf, 0, copy_size); + cur_arg = (u64)(long)tmp_buf; + tmp_buf += copy_size; + tmp_buf_len -= copy_size; + + goto fmt_next; + } else if (fmt[i] == 's') { + cur_mod = BPF_PRINTF_LONG; + fmt_ptype = fmt[i]; +fmt_str: + if (fmt[i + 1] != 0 && + !isspace(fmt[i + 1]) && + !ispunct(fmt[i + 1])) { + err = -EINVAL; + goto cleanup; + } + + if (!final_args) + goto fmt_next; + + if (try_get_fmt_tmp_buf(&tmp_buf)) { + err = -EBUSY; + goto out; + } + + if (!tmp_buf_len) { + err = -ENOSPC; + goto cleanup; + } + + unsafe_ptr = (char *)(long)raw_args[num_spec]; + err = bpf_trace_copy_string(tmp_buf, unsafe_ptr, + fmt_ptype, tmp_buf_len); + if (err < 0) { + tmp_buf[0] = '\0'; + err = 1; + } + + cur_arg = (u64)(long)tmp_buf; + tmp_buf += err; + tmp_buf_len -= err; + + goto fmt_next; + } + + cur_mod = BPF_PRINTF_INT; + + if (fmt[i] == 'l') { + cur_mod = BPF_PRINTF_LONG; + i++; + } + if (fmt[i] == 'l') { + cur_mod = BPF_PRINTF_LONG_LONG; + i++; + } + + if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' && + fmt[i] != 'x' && fmt[i] != 'X') { + err = -EINVAL; + goto cleanup; + } + + if (final_args) + cur_arg = raw_args[num_spec]; +fmt_next: + if (final_args) { + mod[num_spec] = cur_mod; + final_args[num_spec] = cur_arg; + } + num_spec++; + } + + err = 0; +cleanup: + if (err) + bpf_printf_cleanup(); +out: + return err; +} + +#define MAX_SNPRINTF_VARARGS 12 + +BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, + const void *, data, u32, data_len) +{ + enum bpf_printf_mod_type mod[MAX_SNPRINTF_VARARGS]; + u64 args[MAX_SNPRINTF_VARARGS]; + int err, num_args; + + if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 || + (data_len && !data)) + return -EINVAL; + num_args = data_len / 8; + + /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we + * can safely give an unbounded size. + */ + err = bpf_printf_prepare(fmt, UINT_MAX, data, args, mod, num_args); + if (err < 0) + return err; + + /* Maximumly we can have MAX_SNPRINTF_VARARGS parameters, just give + * all of them to snprintf(). + */ + err = snprintf(str, str_size, fmt, BPF_CAST_FMT_ARG(0, args, mod), + BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod), + BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod), + BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod), + BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod), + BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod), + BPF_CAST_FMT_ARG(11, args, mod)); + + bpf_printf_cleanup(); + + return err + 1; +} + +const struct bpf_func_proto bpf_snprintf_proto = { + .func = bpf_snprintf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_CONST_STR, + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -757,6 +1061,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_probe_read_kernel_str_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; + case BPF_FUNC_snprintf: + return &bpf_snprintf_proto; default: return NULL; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index d2de2abec35b..b4ebd60a6c16 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -816,8 +816,6 @@ static int __init bpf_init(void) { int ret; - mutex_init(&bpf_preload_lock); - ret = sysfs_create_mount_point(fs_kobj, "bpf"); if (ret) return ret; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6428634da57e..fd495190115e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2551,6 +2551,9 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, container_of(link, struct bpf_tracing_link, link); info->tracing.attach_type = tr_link->attach_type; + bpf_trampoline_unpack_key(tr_link->trampoline->key, + &info->tracing.target_obj_id, + &info->tracing.target_btf_id); return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5682a02901d3..637462e9b6ee 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4787,6 +4787,7 @@ static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALU static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; +static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4817,6 +4818,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, [ARG_PTR_TO_FUNC] = &func_ptr_types, [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, + [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -5067,6 +5069,44 @@ skip_type_check: if (err) return err; err = check_ptr_alignment(env, reg, 0, size, true); + } else if (arg_type == ARG_PTR_TO_CONST_STR) { + struct bpf_map *map = reg->map_ptr; + int map_off; + u64 map_addr; + char *str_ptr; + + if (!bpf_map_is_rdonly(map)) { + verbose(env, "R%d does not point to a readonly map'\n", regno); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d is not a constant address'\n", regno); + return -EACCES; + } + + if (!map->ops->map_direct_value_addr) { + verbose(env, "no direct value access support for this map type\n"); + return -EACCES; + } + + err = check_map_access(env, regno, reg->off, + map->value_size - reg->off, false); + if (err) + return err; + + map_off = reg->off + reg->var_off.value; + err = map->ops->map_direct_value_addr(map, &map_addr, map_off); + if (err) { + verbose(env, "direct value access on string failed\n"); + return err; + } + + str_ptr = (char *)(long)(map_addr); + if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) { + verbose(env, "string is not zero-terminated\n"); + return -EINVAL; + } } return err; @@ -5767,6 +5807,7 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, if (ret_type != RET_INTEGER || (func_id != BPF_FUNC_get_stack && + func_id != BPF_FUNC_get_task_stack && func_id != BPF_FUNC_probe_read_str && func_id != BPF_FUNC_probe_read_kernel_str && func_id != BPF_FUNC_probe_read_user_str)) @@ -5877,6 +5918,43 @@ static int check_reference_leak(struct bpf_verifier_env *env) return state->acquired_refs ? -EINVAL : 0; } +static int check_bpf_snprintf_call(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) +{ + struct bpf_reg_state *fmt_reg = ®s[BPF_REG_3]; + struct bpf_reg_state *data_len_reg = ®s[BPF_REG_5]; + struct bpf_map *fmt_map = fmt_reg->map_ptr; + int err, fmt_map_off, num_args; + u64 fmt_addr; + char *fmt; + + /* data must be an array of u64 */ + if (data_len_reg->var_off.value % 8) + return -EINVAL; + num_args = data_len_reg->var_off.value / 8; + + /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const + * and map_direct_value_addr is set. + */ + fmt_map_off = fmt_reg->off + fmt_reg->var_off.value; + err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, + fmt_map_off); + if (err) { + verbose(env, "verifier bug\n"); + return -EFAULT; + } + fmt = (char *)(long)fmt_addr + fmt_map_off; + + /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we + * can focus on validating the format specifiers. + */ + err = bpf_printf_prepare(fmt, UINT_MAX, NULL, NULL, NULL, num_args); + if (err < 0) + verbose(env, "Invalid format string\n"); + + return err; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -5991,6 +6069,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } + if (func_id == BPF_FUNC_snprintf) { + err = check_bpf_snprintf_call(env, regs); + if (err < 0) + return err; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0d23755c2747..2a8bcdc927c7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -372,188 +372,38 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) return &bpf_probe_write_user_proto; } -static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, - size_t bufsz) -{ - void __user *user_ptr = (__force void __user *)unsafe_ptr; - - buf[0] = 0; - - switch (fmt_ptype) { - case 's': -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if ((unsigned long)unsafe_ptr < TASK_SIZE) { - strncpy_from_user_nofault(buf, user_ptr, bufsz); - break; - } - fallthrough; -#endif - case 'k': - strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); - break; - case 'u': - strncpy_from_user_nofault(buf, user_ptr, bufsz); - break; - } -} - static DEFINE_RAW_SPINLOCK(trace_printk_lock); -#define BPF_TRACE_PRINTK_SIZE 1024 +#define MAX_TRACE_PRINTK_VARARGS 3 +#define BPF_TRACE_PRINTK_SIZE 1024 -static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) +BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, + u64, arg2, u64, arg3) { + u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; + enum bpf_printf_mod_type mod[MAX_TRACE_PRINTK_VARARGS]; static char buf[BPF_TRACE_PRINTK_SIZE]; unsigned long flags; - va_list ap; int ret; - raw_spin_lock_irqsave(&trace_printk_lock, flags); - va_start(ap, fmt); - ret = vsnprintf(buf, sizeof(buf), fmt, ap); - va_end(ap); - /* vsnprintf() will not append null for zero-length strings */ + ret = bpf_printf_prepare(fmt, fmt_size, args, args, mod, + MAX_TRACE_PRINTK_VARARGS); + if (ret < 0) + return ret; + + ret = snprintf(buf, sizeof(buf), fmt, BPF_CAST_FMT_ARG(0, args, mod), + BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod)); + /* snprintf() will not append null for zero-length strings */ if (ret == 0) buf[0] = '\0'; + + raw_spin_lock_irqsave(&trace_printk_lock, flags); trace_bpf_trace_printk(buf); raw_spin_unlock_irqrestore(&trace_printk_lock, flags); - return ret; -} - -/* - * Only limited trace_printk() conversion specifiers allowed: - * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s - */ -BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, - u64, arg2, u64, arg3) -{ - int i, mod[3] = {}, fmt_cnt = 0; - char buf[64], fmt_ptype; - void *unsafe_ptr = NULL; - bool str_seen = false; + bpf_printf_cleanup(); - /* - * bpf_check()->check_func_arg()->check_stack_boundary() - * guarantees that fmt points to bpf program stack, - * fmt_size bytes of it were initialized and fmt_size > 0 - */ - if (fmt[--fmt_size] != 0) - return -EINVAL; - - /* check format string for allowed specifiers */ - for (i = 0; i < fmt_size; i++) { - if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) - return -EINVAL; - - if (fmt[i] != '%') - continue; - - if (fmt_cnt >= 3) - return -EINVAL; - - /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ - i++; - if (fmt[i] == 'l') { - mod[fmt_cnt]++; - i++; - } else if (fmt[i] == 'p') { - mod[fmt_cnt]++; - if ((fmt[i + 1] == 'k' || - fmt[i + 1] == 'u') && - fmt[i + 2] == 's') { - fmt_ptype = fmt[i + 1]; - i += 2; - goto fmt_str; - } - - if (fmt[i + 1] == 'B') { - i++; - goto fmt_next; - } - - /* disallow any further format extensions */ - if (fmt[i + 1] != 0 && - !isspace(fmt[i + 1]) && - !ispunct(fmt[i + 1])) - return -EINVAL; - - goto fmt_next; - } else if (fmt[i] == 's') { - mod[fmt_cnt]++; - fmt_ptype = fmt[i]; -fmt_str: - if (str_seen) - /* allow only one '%s' per fmt string */ - return -EINVAL; - str_seen = true; - - if (fmt[i + 1] != 0 && - !isspace(fmt[i + 1]) && - !ispunct(fmt[i + 1])) - return -EINVAL; - - switch (fmt_cnt) { - case 0: - unsafe_ptr = (void *)(long)arg1; - arg1 = (long)buf; - break; - case 1: - unsafe_ptr = (void *)(long)arg2; - arg2 = (long)buf; - break; - case 2: - unsafe_ptr = (void *)(long)arg3; - arg3 = (long)buf; - break; - } - - bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype, - sizeof(buf)); - goto fmt_next; - } - - if (fmt[i] == 'l') { - mod[fmt_cnt]++; - i++; - } - - if (fmt[i] != 'i' && fmt[i] != 'd' && - fmt[i] != 'u' && fmt[i] != 'x') - return -EINVAL; -fmt_next: - fmt_cnt++; - } - -/* Horrid workaround for getting va_list handling working with different - * argument type combinations generically for 32 and 64 bit archs. - */ -#define __BPF_TP_EMIT() __BPF_ARG3_TP() -#define __BPF_TP(...) \ - bpf_do_trace_printk(fmt, ##__VA_ARGS__) - -#define __BPF_ARG1_TP(...) \ - ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \ - ? __BPF_TP(arg1, ##__VA_ARGS__) \ - : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \ - ? __BPF_TP((long)arg1, ##__VA_ARGS__) \ - : __BPF_TP((u32)arg1, ##__VA_ARGS__))) - -#define __BPF_ARG2_TP(...) \ - ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \ - ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \ - : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \ - ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \ - : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__))) - -#define __BPF_ARG3_TP(...) \ - ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \ - ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \ - : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \ - ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \ - : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__))) - - return __BPF_TP_EMIT(); + return ret; } static const struct bpf_func_proto bpf_trace_printk_proto = { @@ -581,184 +431,37 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) } #define MAX_SEQ_PRINTF_VARARGS 12 -#define MAX_SEQ_PRINTF_MAX_MEMCPY 6 -#define MAX_SEQ_PRINTF_STR_LEN 128 - -struct bpf_seq_printf_buf { - char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN]; -}; -static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf); -static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used); BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, const void *, data, u32, data_len) { - int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0; - int i, buf_used, copy_size, num_args; - u64 params[MAX_SEQ_PRINTF_VARARGS]; - struct bpf_seq_printf_buf *bufs; - const u64 *args = data; - - buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used); - if (WARN_ON_ONCE(buf_used > 1)) { - err = -EBUSY; - goto out; - } - - bufs = this_cpu_ptr(&bpf_seq_printf_buf); - - /* - * bpf_check()->check_func_arg()->check_stack_boundary() - * guarantees that fmt points to bpf program stack, - * fmt_size bytes of it were initialized and fmt_size > 0 - */ - if (fmt[--fmt_size] != 0) - goto out; - - if (data_len & 7) - goto out; - - for (i = 0; i < fmt_size; i++) { - if (fmt[i] == '%') { - if (fmt[i + 1] == '%') - i++; - else if (!data || !data_len) - goto out; - } - } + enum bpf_printf_mod_type mod[MAX_SEQ_PRINTF_VARARGS]; + u64 args[MAX_SEQ_PRINTF_VARARGS]; + int err, num_args; + if (data_len & 7 || data_len > MAX_SEQ_PRINTF_VARARGS * 8 || + (data_len && !data)) + return -EINVAL; num_args = data_len / 8; - /* check format string for allowed specifiers */ - for (i = 0; i < fmt_size; i++) { - /* only printable ascii for now. */ - if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { - err = -EINVAL; - goto out; - } - - if (fmt[i] != '%') - continue; - - if (fmt[i + 1] == '%') { - i++; - continue; - } - - if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) { - err = -E2BIG; - goto out; - } - - if (fmt_cnt >= num_args) { - err = -EINVAL; - goto out; - } - - /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ - i++; - - /* skip optional "[0 +-][num]" width formating field */ - while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || - fmt[i] == ' ') - i++; - if (fmt[i] >= '1' && fmt[i] <= '9') { - i++; - while (fmt[i] >= '0' && fmt[i] <= '9') - i++; - } - - if (fmt[i] == 's') { - void *unsafe_ptr; - - /* try our best to copy */ - if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { - err = -E2BIG; - goto out; - } - - unsafe_ptr = (void *)(long)args[fmt_cnt]; - err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt], - unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN); - if (err < 0) - bufs->buf[memcpy_cnt][0] = '\0'; - params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; - - fmt_cnt++; - memcpy_cnt++; - continue; - } - - if (fmt[i] == 'p') { - if (fmt[i + 1] == 0 || - fmt[i + 1] == 'K' || - fmt[i + 1] == 'x' || - fmt[i + 1] == 'B') { - /* just kernel pointers */ - params[fmt_cnt] = args[fmt_cnt]; - fmt_cnt++; - continue; - } - - /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ - if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') { - err = -EINVAL; - goto out; - } - if (fmt[i + 2] != '4' && fmt[i + 2] != '6') { - err = -EINVAL; - goto out; - } - - if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { - err = -E2BIG; - goto out; - } - - - copy_size = (fmt[i + 2] == '4') ? 4 : 16; - - err = copy_from_kernel_nofault(bufs->buf[memcpy_cnt], - (void *) (long) args[fmt_cnt], - copy_size); - if (err < 0) - memset(bufs->buf[memcpy_cnt], 0, copy_size); - params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; - - i += 2; - fmt_cnt++; - memcpy_cnt++; - continue; - } - - if (fmt[i] == 'l') { - i++; - if (fmt[i] == 'l') - i++; - } - - if (fmt[i] != 'i' && fmt[i] != 'd' && - fmt[i] != 'u' && fmt[i] != 'x' && - fmt[i] != 'X') { - err = -EINVAL; - goto out; - } - - params[fmt_cnt] = args[fmt_cnt]; - fmt_cnt++; - } + err = bpf_printf_prepare(fmt, fmt_size, data, args, mod, num_args); + if (err < 0) + return err; /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give * all of them to seq_printf(). */ - seq_printf(m, fmt, params[0], params[1], params[2], params[3], - params[4], params[5], params[6], params[7], params[8], - params[9], params[10], params[11]); + seq_printf(m, fmt, BPF_CAST_FMT_ARG(0, args, mod), + BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod), + BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod), + BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod), + BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod), + BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod), + BPF_CAST_FMT_ARG(11, args, mod)); - err = seq_has_overflowed(m) ? -EOVERFLOW : 0; -out: - this_cpu_dec(bpf_seq_printf_buf_used); - return err; + bpf_printf_cleanup(); + + return seq_has_overflowed(m) ? -EOVERFLOW : 0; } BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file) @@ -1373,6 +1076,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_delete_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; + case BPF_FUNC_snprintf: + return &bpf_snprintf_proto; default: return NULL; } diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 8b644113715e..fb3d3262dc1a 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -71,6 +71,9 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg, if (array == NULL) return -ENOBUFS; + /* paired with smp_rmb() in __vlan_group_get_device() */ + smp_wmb(); + vg->vlan_devices_arrays[pidx][vidx] = array; return 0; } diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 953405362795..fa3ad3d4d58c 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -57,6 +57,10 @@ static inline struct net_device *__vlan_group_get_device(struct vlan_group *vg, array = vg->vlan_devices_arrays[pidx] [vlan_id / VLAN_GROUP_ARRAY_PART_LEN]; + + /* paired with smp_wmb() in vlan_group_prealloc_vid() */ + smp_rmb(); + return array ? array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] : NULL; } diff --git a/net/core/dev.c b/net/core/dev.c index d9bf63dbe4fd..222b1d322c96 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4723,10 +4723,10 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; u32 metalen, act = XDP_DROP; + bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; - bool orig_bcast; int off; /* Reinjected packets coming from act_mirred or similar should @@ -4773,6 +4773,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; + orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; @@ -4800,8 +4801,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || + (orig_host != ether_addr_equal_64bits(eth->h_dest, + skb->dev->dev_addr)) || (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); } @@ -5962,7 +5966,7 @@ static void gro_list_prepare(const struct list_head *head, } } -static void skb_gro_reset_offset(struct sk_buff *skb) +static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) { const struct skb_shared_info *pinfo = skb_shinfo(skb); const skb_frag_t *frag0 = &pinfo->frags[0]; @@ -5973,7 +5977,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb) if (!skb_headlen(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) && - (!NET_IP_ALIGN || !(skb_frag_off(frag0) & 3))) { + (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, skb_frag_size(frag0), @@ -6191,7 +6195,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); - skb_gro_reset_offset(skb); + skb_gro_reset_offset(skb, 0); ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_receive_exit(ret); @@ -6280,7 +6284,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) napi->skb = NULL; skb_reset_mac_header(skb); - skb_gro_reset_offset(skb); + skb_gro_reset_offset(skb, hlen); if (unlikely(skb_gro_header_hard(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); diff --git a/net/core/devlink.c b/net/core/devlink.c index 737b61c2976e..4eb969518ee0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8599,9 +8599,10 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @sf: associated SF of a PF for the devlink port instance + * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, - u16 pf, u32 sf) + u16 pf, u32 sf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; @@ -8615,6 +8616,7 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro attrs->pci_sf.controller = controller; attrs->pci_sf.pf = pf; attrs->pci_sf.sf = sf; + attrs->pci_sf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); @@ -8667,6 +8669,13 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, attrs->pci_vf.pf, attrs->pci_vf.vf); break; case DEVLINK_PORT_FLAVOUR_PCI_SF: + if (attrs->pci_sf.external) { + n = snprintf(name, len, "c%u", attrs->pci_sf.controller); + if (n >= len) + return -EINVAL; + len -= n; + name += n; + } n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, attrs->pci_sf.sf); break; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8379719d1dce..98f20efbfadf 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -131,6 +131,9 @@ static void neigh_update_gc_list(struct neighbour *n) write_lock_bh(&n->tbl->lock); write_lock(&n->lock); + if (n->dead) + goto out; + /* remove from the gc list if new state is permanent or if neighbor * is externally learned; otherwise entry should be on the gc list */ @@ -147,6 +150,7 @@ static void neigh_update_gc_list(struct neighbour *n) atomic_inc(&n->tbl->gc_entries); } +out: write_unlock(&n->lock); write_unlock_bh(&n->tbl->lock); } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 3d190d22b0d8..6f1b82b8ad49 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -188,7 +188,7 @@ static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; - return sk->sk_prot->psock_update_sk_prot(sk, false); + return sk->sk_prot->psock_update_sk_prot(sk, psock, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) @@ -1521,7 +1521,7 @@ void sock_map_close(struct sock *sk, long timeout) lock_sock(sk); rcu_read_lock(); - psock = sk_psock(sk); + psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); release_sock(sk); @@ -1532,6 +1532,7 @@ void sock_map_close(struct sock *sk, long timeout) sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock, true); + sk_psock_put(sk, psock); release_sock(sk); saved_close(sk, timeout); } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 4f49c12dae53..ad9d17923fc5 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -499,9 +499,8 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } -int tcp_bpf_update_proto(struct sock *sk, bool restore) +int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { - struct sk_psock *psock = sk_psock(sk); int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 7d5c4ebf42fe..954c4591a6fd 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -103,14 +103,12 @@ static int __init udp_bpf_v4_build_proto(void) } core_initcall(udp_bpf_v4_build_proto); -int udp_bpf_update_proto(struct sock *sk, bool restore) +int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; - struct sk_psock *psock = sk_psock(sk); if (restore) { sk->sk_write_space = psock->saved_write_space; - /* Pairs with lockless read in sk_clone_lock() */ WRITE_ONCE(sk->sk_prot, psock->sk_proto); return 0; } @@ -118,7 +116,6 @@ int udp_bpf_update_proto(struct sock *sk, bool restore) if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); - /* Pairs with lockless read in sk_clone_lock() */ WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]); return 0; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8bf21996734d..29a2d690d8d5 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -392,6 +392,14 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) return false; } +static void mptcp_set_datafin_timeout(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + mptcp_sk(sk)->timer_ival = min(TCP_RTO_MAX, + TCP_RTO_MIN << icsk->icsk_retransmits); +} + static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) { long tout = ssk && inet_csk(ssk)->icsk_pending ? @@ -1062,7 +1070,7 @@ out: } if (snd_una == READ_ONCE(msk->snd_nxt)) { - if (msk->timer_ival) + if (msk->timer_ival && !mptcp_data_fin_enabled(msk)) mptcp_stop_timer(sk); } else { mptcp_reset_timer(sk); @@ -2287,8 +2295,19 @@ static void __mptcp_retrans(struct sock *sk) __mptcp_clean_una_wakeup(sk); dfrag = mptcp_rtx_head(sk); - if (!dfrag) + if (!dfrag) { + if (mptcp_data_fin_enabled(msk)) { + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_retransmits++; + mptcp_set_datafin_timeout(sk); + mptcp_send_ack(msk); + + goto reset_timer; + } + return; + } ssk = mptcp_subflow_get_retrans(msk); if (!ssk) @@ -2474,6 +2493,8 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) pr_debug("Sending DATA_FIN on subflow %p", ssk); mptcp_set_timeout(sk, ssk); tcp_send_ack(ssk); + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); } break; } diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 15424d26e85d..96b524ceabca 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -392,7 +392,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) * * Start with a full bucket. */ - band->bucket = (band->burst_size + band->rate) * 1000ULL; + band->bucket = band->burst_size * 1000ULL; band_max_delta_t = div_u64(band->bucket, band->rate); if (band_max_delta_t > meter->max_delta_t) meter->max_delta_t = band_max_delta_t; @@ -641,7 +641,7 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, long long int max_bucket_size; band = &meter->bands[i]; - max_bucket_size = (band->burst_size + band->rate) * 1000LL; + max_bucket_size = band->burst_size * 1000LL; band->bucket += delta_ms * band->rate; if (band->bucket > max_bucket_size) diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c index 2bf2b1943e61..fa611678af05 100644 --- a/net/qrtr/mhi.c +++ b/net/qrtr/mhi.c @@ -50,6 +50,9 @@ static int qcom_mhi_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) struct qrtr_mhi_dev *qdev = container_of(ep, struct qrtr_mhi_dev, ep); int rc; + if (skb->sk) + sock_hold(skb->sk); + rc = skb_linearize(skb); if (rc) goto free_skb; @@ -59,12 +62,11 @@ static int qcom_mhi_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) if (rc) goto free_skb; - if (skb->sk) - sock_hold(skb->sk); - return rc; free_skb: + if (skb->sk) + sock_put(skb->sk); kfree_skb(skb); return rc; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 922ed6b91abb..5c91df52b8c2 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -945,6 +945,12 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, list_for_each_entry(entry, &new->entries, list) cycle = ktime_add_ns(cycle, entry->interval); + + if (!cycle) { + NL_SET_ERR_MSG(extack, "'cycle_time' can never be 0"); + return -EINVAL; + } + new->cycle_time = cycle; } diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index e4370b1b7494..902cb6dd710b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -733,6 +733,23 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, return t->send_pkt(reply); } +/* This function should be called with sk_lock held and SOCK_DONE set */ +static void virtio_transport_remove_sock(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt, *tmp; + + /* We don't need to take rx_lock, as the socket is closing and we are + * removing it. + */ + list_for_each_entry_safe(pkt, tmp, &vvs->rx_queue, list) { + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + + vsock_remove_sock(vsk); +} + static void virtio_transport_wait_close(struct sock *sk, long timeout) { if (timeout) { @@ -765,7 +782,7 @@ static void virtio_transport_do_close(struct vsock_sock *vsk, (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) { vsk->close_work_scheduled = false; - vsock_remove_sock(vsk); + virtio_transport_remove_sock(vsk); /* Release refcnt obtained when we scheduled the timeout */ sock_put(sk); @@ -828,22 +845,15 @@ static bool virtio_transport_close(struct vsock_sock *vsk) void virtio_transport_release(struct vsock_sock *vsk) { - struct virtio_vsock_sock *vvs = vsk->trans; - struct virtio_vsock_pkt *pkt, *tmp; struct sock *sk = &vsk->sk; bool remove_sock = true; if (sk->sk_type == SOCK_STREAM) remove_sock = virtio_transport_close(vsk); - list_for_each_entry_safe(pkt, tmp, &vvs->rx_queue, list) { - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } - if (remove_sock) { sock_set_flag(sk, SOCK_DONE); - vsock_remove_sock(vsk); + virtio_transport_remove_sock(vsk); } } EXPORT_SYMBOL_GPL(virtio_transport_release); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 8b65323207db..1c9ecb18b8e6 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -568,8 +568,7 @@ vmci_transport_queue_pair_alloc(struct vmci_qp **qpair, peer, flags, VMCI_NO_PRIVILEGE_FLAGS); out: if (err < 0) { - pr_err("Could not attach to queue pair with %d\n", - err); + pr_err_once("Could not attach to queue pair with %d\n", err); err = vmci_transport_error_to_vsock_error(err); } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index a71ed664da0a..cd62d4ba87a9 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -30,7 +30,7 @@ #include "xdp_umem.h" #include "xsk.h" -#define TX_BATCH_SIZE 16 +#define TX_BATCH_SIZE 32 static DEFINE_PER_CPU(struct list_head, xskmap_flush_list); diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c index 3f4599c9a202..ef30d2b353b0 100644 --- a/samples/bpf/tracex1_kern.c +++ b/samples/bpf/tracex1_kern.c @@ -26,7 +26,7 @@ SEC("kprobe/__netif_receive_skb_core") int bpf_prog1(struct pt_regs *ctx) { - /* attaches to kprobe netif_receive_skb, + /* attaches to kprobe __netif_receive_skb_core, * looks for packets on loobpack device and prints them */ char devname[IFNAMSIZ]; @@ -35,7 +35,7 @@ int bpf_prog1(struct pt_regs *ctx) int len; /* non-portable! works for the given kernel only */ - skb = (struct sk_buff *) PT_REGS_PARM1(ctx); + bpf_probe_read_kernel(&skb, sizeof(skb), (void *)PT_REGS_PARM1(ctx)); dev = _(skb->dev); len = _(skb->len); diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 3b261b0f74f0..667aacb9261c 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -213,6 +213,7 @@ vmlinux_link() gen_btf() { local pahole_ver + local extra_paholeopt= if ! [ -x "$(command -v ${PAHOLE})" ]; then echo >&2 "BTF: ${1}: pahole (${PAHOLE}) is not available" @@ -227,8 +228,12 @@ gen_btf() vmlinux_link ${1} + if [ "${pahole_ver}" -ge "121" ]; then + extra_paholeopt="${extra_paholeopt} --btf_gen_floats" + fi + info "BTF" ${2} - LLVM_OBJCOPY=${OBJCOPY} ${PAHOLE} -J ${1} + LLVM_OBJCOPY=${OBJCOPY} ${PAHOLE} -J ${extra_paholeopt} ${1} # Create ${2} which contains just .BTF section but no symbols. Add # SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 62953bbf68b4..385d5c955cf3 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -71,7 +71,9 @@ static const char *btf_var_linkage_str(__u32 linkage) case BTF_VAR_STATIC: return "static"; case BTF_VAR_GLOBAL_ALLOCATED: - return "global-alloc"; + return "global"; + case BTF_VAR_GLOBAL_EXTERN: + return "extern"; default: return "(unknown)"; } @@ -98,26 +100,28 @@ static const char *btf_str(const struct btf *btf, __u32 off) return btf__name_by_offset(btf, off) ? : "(invalid)"; } +static int btf_kind_safe(int kind) +{ + return kind <= BTF_KIND_MAX ? kind : BTF_KIND_UNKN; +} + static int dump_btf_type(const struct btf *btf, __u32 id, const struct btf_type *t) { json_writer_t *w = json_wtr; - int kind, safe_kind; - - kind = BTF_INFO_KIND(t->info); - safe_kind = kind <= BTF_KIND_MAX ? kind : BTF_KIND_UNKN; + int kind = btf_kind(t); if (json_output) { jsonw_start_object(w); jsonw_uint_field(w, "id", id); - jsonw_string_field(w, "kind", btf_kind_str[safe_kind]); + jsonw_string_field(w, "kind", btf_kind_str[btf_kind_safe(kind)]); jsonw_string_field(w, "name", btf_str(btf, t->name_off)); } else { - printf("[%u] %s '%s'", id, btf_kind_str[safe_kind], + printf("[%u] %s '%s'", id, btf_kind_str[btf_kind_safe(kind)], btf_str(btf, t->name_off)); } - switch (BTF_INFO_KIND(t->info)) { + switch (kind) { case BTF_KIND_INT: { __u32 v = *(__u32 *)(t + 1); const char *enc; @@ -300,7 +304,8 @@ static int dump_btf_type(const struct btf *btf, __u32 id, break; } case BTF_KIND_DATASEC: { - const struct btf_var_secinfo *v = (const void *)(t+1); + const struct btf_var_secinfo *v = (const void *)(t + 1); + const struct btf_type *vt; __u16 vlen = BTF_INFO_VLEN(t->info); int i; @@ -322,6 +327,13 @@ static int dump_btf_type(const struct btf *btf, __u32 id, } else { printf("\n\ttype_id=%u offset=%u size=%u", v->type, v->offset, v->size); + + if (v->type <= btf__get_nr_types(btf)) { + vt = btf__type_by_id(btf, v->type); + printf(" (%s '%s')", + btf_kind_str[btf_kind_safe(btf_kind(vt))], + btf_str(btf, vt->name_off)); + } } } if (json_output) diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index ff3aa0cf3997..f836d115d7d6 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -157,7 +157,7 @@ static int netlink_recv(int sock, __u32 nl_pid, __u32 seq, if (len == 0) break; - for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); + for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, (unsigned int)len); nh = NLMSG_NEXT(nh, len)) { if (nh->nlmsg_pid != nl_pid) { ret = -LIBBPF_ERRNO__WRNGPID; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 69902603012c..ec6d85a81744 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -312,6 +312,27 @@ union bpf_iter_link_info { * *ctx_out*, *data_out* (for example, packet data), result of the * execution *retval*, and *duration* of the test run. * + * The sizes of the buffers provided as input and output + * parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must + * be provided in the corresponding variables *ctx_size_in*, + * *ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any + * of these parameters are not provided (ie set to NULL), the + * corresponding size field must be zero. + * + * Some program types have particular requirements: + * + * **BPF_PROG_TYPE_SK_LOOKUP** + * *data_in* and *data_out* must be NULL. + * + * **BPF_PROG_TYPE_XDP** + * *ctx_in* and *ctx_out* must be NULL. + * + * **BPF_PROG_TYPE_RAW_TRACEPOINT**, + * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** + * + * *ctx_out*, *data_in* and *data_out* must be NULL. + * *repeat* must be zero. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. @@ -4061,12 +4082,20 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. * Return * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. * Return * Valid pointer with *size* bytes of memory available; NULL, * otherwise. @@ -4078,6 +4107,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -4088,6 +4121,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -4578,7 +4615,7 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based + * Check packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. * @@ -4595,6 +4632,14 @@ union bpf_attr { * against the current net device. This is practical if this isn't * used prior to redirect. * + * On input *mtu_len* must be a valid pointer, else verifier will + * reject BPF program. If the value *mtu_len* is initialized to + * zero then the ctx packet size is use. When value *mtu_len* is + * provided as input this specify the L3 length that the MTU check + * is done against. Remember XDP and TC length operate at L2, but + * this value is L3 as this correlate to MTU and IP-header tot_len + * values which are L3 (similar behavior as bpf_fib_lookup). + * * The Linux kernel route table can configure MTUs on a more * specific per route level, which is not provided by this helper. * For route level MTU checks use the **bpf_fib_lookup**\ () @@ -4619,11 +4664,9 @@ union bpf_attr { * * On return *mtu_len* pointer contains the MTU value of the net * device. Remember the net device configured MTU is the L3 size, - * which is returned here and XDP and TX length operate at L2. + * which is returned here and XDP and TC length operate at L2. * Helper take this into account for you, but remember when using - * MTU value in your BPF-code. On input *mtu_len* must be a valid - * pointer and be initialized (to zero), else verifier will reject - * BPF program. + * MTU value in your BPF-code. * * Return * * 0 on success, and populate MTU value in *mtu_len* pointer. @@ -4665,6 +4708,33 @@ union bpf_attr { * Return * The number of traversed map elements for success, **-EINVAL** for * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4832,6 +4902,7 @@ union bpf_attr { FN(sock_from_file), \ FN(check_mtu), \ FN(for_each_map_elem), \ + FN(snprintf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5373,6 +5444,8 @@ struct bpf_link_info { } raw_tracepoint; struct { __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ } tracing; struct { __u64 cgroup_id; diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index cc2e51c64a54..9720dc0b4605 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -25,9 +25,16 @@ /* * Helper macro to place programs, maps, license in * different sections in elf_bpf file. Section names - * are interpreted by elf_bpf loader + * are interpreted by libbpf depending on the context (BPF programs, BPF maps, + * extern variables, etc). + * To allow use of SEC() with externs (e.g., for extern .maps declarations), + * make sure __attribute__((unused)) doesn't trigger compilation warning. */ -#define SEC(NAME) __attribute__((section(NAME), used)) +#define SEC(name) \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wignored-attributes\"") \ + __attribute__((section(name), used)) \ + _Pragma("GCC diagnostic pop") \ /* Avoid 'linux/stddef.h' definition of '__always_inline'. */ #undef __always_inline @@ -40,6 +47,14 @@ #define __weak __attribute__((weak)) #endif +/* + * Use __hidden attribute to mark a non-static BPF subprogram effectively + * static for BPF verifier's verification algorithm purposes, allowing more + * extensive and permissive BPF verification process, taking into account + * subprogram's caller context. + */ +#define __hidden __attribute__((visibility("hidden"))) + /* When utilizing vmlinux.h with BPF CO-RE, user BPF programs can't include * any system-level headers (such as stddef.h, linux/version.h, etc), and * commonly-used macros like NULL and KERNEL_VERSION aren't available through @@ -51,7 +66,7 @@ #endif #ifndef KERNEL_VERSION -#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c)) +#define KERNEL_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c))) #endif /* diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index f9ef37707888..8c954ebc0c7c 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -413,20 +413,56 @@ typeof(name(0)) name(struct pt_regs *ctx) \ } \ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) +#define ___bpf_fill0(arr, p, x) do {} while (0) +#define ___bpf_fill1(arr, p, x) arr[p] = x +#define ___bpf_fill2(arr, p, x, args...) arr[p] = x; ___bpf_fill1(arr, p + 1, args) +#define ___bpf_fill3(arr, p, x, args...) arr[p] = x; ___bpf_fill2(arr, p + 1, args) +#define ___bpf_fill4(arr, p, x, args...) arr[p] = x; ___bpf_fill3(arr, p + 1, args) +#define ___bpf_fill5(arr, p, x, args...) arr[p] = x; ___bpf_fill4(arr, p + 1, args) +#define ___bpf_fill6(arr, p, x, args...) arr[p] = x; ___bpf_fill5(arr, p + 1, args) +#define ___bpf_fill7(arr, p, x, args...) arr[p] = x; ___bpf_fill6(arr, p + 1, args) +#define ___bpf_fill8(arr, p, x, args...) arr[p] = x; ___bpf_fill7(arr, p + 1, args) +#define ___bpf_fill9(arr, p, x, args...) arr[p] = x; ___bpf_fill8(arr, p + 1, args) +#define ___bpf_fill10(arr, p, x, args...) arr[p] = x; ___bpf_fill9(arr, p + 1, args) +#define ___bpf_fill11(arr, p, x, args...) arr[p] = x; ___bpf_fill10(arr, p + 1, args) +#define ___bpf_fill12(arr, p, x, args...) arr[p] = x; ___bpf_fill11(arr, p + 1, args) +#define ___bpf_fill(arr, args...) \ + ___bpf_apply(___bpf_fill, ___bpf_narg(args))(arr, 0, args) + /* * BPF_SEQ_PRINTF to wrap bpf_seq_printf to-be-printed values * in a structure. */ -#define BPF_SEQ_PRINTF(seq, fmt, args...) \ - ({ \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ - static const char ___fmt[] = fmt; \ - unsigned long long ___param[] = { args }; \ - _Pragma("GCC diagnostic pop") \ - int ___ret = bpf_seq_printf(seq, ___fmt, sizeof(___fmt), \ - ___param, sizeof(___param)); \ - ___ret; \ - }) +#define BPF_SEQ_PRINTF(seq, fmt, args...) \ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_seq_printf(seq, ___fmt, sizeof(___fmt), \ + ___param, sizeof(___param)); \ +}) + +/* + * BPF_SNPRINTF wraps the bpf_snprintf helper with variadic arguments instead of + * an array of u64. + */ +#define BPF_SNPRINTF(out, out_size, fmt, args...) \ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_snprintf(out, out_size, ___fmt, \ + ___param, sizeof(___param)); \ +}) #endif diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index d30e67e7e1e5..d57e13a13798 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1605,11 +1605,6 @@ static void *btf_add_type_mem(struct btf *btf, size_t add_sz) btf->hdr->type_len, UINT_MAX, add_sz); } -static __u32 btf_type_info(int kind, int vlen, int kflag) -{ - return (kflag << 31) | (kind << 24) | vlen; -} - static void btf_type_inc_vlen(struct btf_type *t) { t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, btf_kflag(t)); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7aad78dbb4b4..a1cddd17af7d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -69,8 +69,7 @@ #define __printf(a, b) __attribute__((format(printf, a, b))) static struct bpf_map *bpf_object__add_map(struct bpf_object *obj); -static const struct btf_type * -skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); +static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog); static int __base_pr(enum libbpf_print_level level, const char *format, va_list args) @@ -195,7 +194,6 @@ struct reloc_desc { int insn_idx; int map_idx; int sym_off; - bool processed; }; struct bpf_sec_def; @@ -275,6 +273,7 @@ struct bpf_program { bpf_program_clear_priv_t clear_priv; bool load; + bool mark_btf_static; enum bpf_prog_type type; enum bpf_attach_type expected_attach_type; int prog_ifindex; @@ -501,8 +500,6 @@ static Elf_Scn *elf_sec_by_name(const struct bpf_object *obj, const char *name); static int elf_sec_hdr(const struct bpf_object *obj, Elf_Scn *scn, GElf_Shdr *hdr); static const char *elf_sec_name(const struct bpf_object *obj, Elf_Scn *scn); static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn); -static int elf_sym_by_sec_off(const struct bpf_object *obj, size_t sec_idx, - size_t off, __u32 sym_type, GElf_Sym *sym); void bpf_program__unload(struct bpf_program *prog) { @@ -643,25 +640,29 @@ static int bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, const char *sec_name, int sec_idx) { + Elf_Data *symbols = obj->efile.symbols; struct bpf_program *prog, *progs; void *data = sec_data->d_buf; - size_t sec_sz = sec_data->d_size, sec_off, prog_sz; - int nr_progs, err; + size_t sec_sz = sec_data->d_size, sec_off, prog_sz, nr_syms; + int nr_progs, err, i; const char *name; GElf_Sym sym; progs = obj->programs; nr_progs = obj->nr_programs; + nr_syms = symbols->d_size / sizeof(GElf_Sym); sec_off = 0; - while (sec_off < sec_sz) { - if (elf_sym_by_sec_off(obj, sec_idx, sec_off, STT_FUNC, &sym)) { - pr_warn("sec '%s': failed to find program symbol at offset %zu\n", - sec_name, sec_off); - return -LIBBPF_ERRNO__FORMAT; - } + for (i = 0; i < nr_syms; i++) { + if (!gelf_getsym(symbols, i, &sym)) + continue; + if (sym.st_shndx != sec_idx) + continue; + if (GELF_ST_TYPE(sym.st_info) != STT_FUNC) + continue; prog_sz = sym.st_size; + sec_off = sym.st_value; name = elf_sym_str(obj, sym.st_name); if (!name) { @@ -699,10 +700,17 @@ bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, if (err) return err; + /* if function is a global/weak symbol, but has hidden + * visibility (STV_HIDDEN), mark its BTF FUNC as static to + * enable more permissive BPF verification mode with more + * outside context available to BPF verifier + */ + if (GELF_ST_BIND(sym.st_info) != STB_LOCAL + && GELF_ST_VISIBILITY(sym.st_other) == STV_HIDDEN) + prog->mark_btf_static = true; + nr_progs++; obj->nr_programs = nr_progs; - - sec_off += prog_sz; } return 0; @@ -1896,7 +1904,7 @@ static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict) return 0; } -static const struct btf_type * +const struct btf_type * skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id) { const struct btf_type *t = btf__type_by_id(btf, id); @@ -1951,16 +1959,11 @@ static const char *__btf_kind_str(__u16 kind) } } -static const char *btf_kind_str(const struct btf_type *t) +const char *btf_kind_str(const struct btf_type *t) { return __btf_kind_str(btf_kind(t)); } -static enum btf_func_linkage btf_func_linkage(const struct btf_type *t) -{ - return (enum btf_func_linkage)BTF_INFO_VLEN(t->info); -} - /* * Fetch integer attribute of BTF map definition. Such attributes are * represented using a pointer to an array, in which dimensionality of array @@ -2015,254 +2018,262 @@ static int build_map_pin_path(struct bpf_map *map, const char *path) return bpf_map__set_pin_path(map, buf); } - -static int parse_btf_map_def(struct bpf_object *obj, - struct bpf_map *map, - const struct btf_type *def, - bool strict, bool is_inner, - const char *pin_root_path) +int parse_btf_map_def(const char *map_name, struct btf *btf, + const struct btf_type *def_t, bool strict, + struct btf_map_def *map_def, struct btf_map_def *inner_def) { const struct btf_type *t; const struct btf_member *m; + bool is_inner = inner_def == NULL; int vlen, i; - vlen = btf_vlen(def); - m = btf_members(def); + vlen = btf_vlen(def_t); + m = btf_members(def_t); for (i = 0; i < vlen; i++, m++) { - const char *name = btf__name_by_offset(obj->btf, m->name_off); + const char *name = btf__name_by_offset(btf, m->name_off); if (!name) { - pr_warn("map '%s': invalid field #%d.\n", map->name, i); + pr_warn("map '%s': invalid field #%d.\n", map_name, i); return -EINVAL; } if (strcmp(name, "type") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, - &map->def.type)) + if (!get_map_field_int(map_name, btf, m, &map_def->map_type)) return -EINVAL; - pr_debug("map '%s': found type = %u.\n", - map->name, map->def.type); + map_def->parts |= MAP_DEF_MAP_TYPE; } else if (strcmp(name, "max_entries") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, - &map->def.max_entries)) + if (!get_map_field_int(map_name, btf, m, &map_def->max_entries)) return -EINVAL; - pr_debug("map '%s': found max_entries = %u.\n", - map->name, map->def.max_entries); + map_def->parts |= MAP_DEF_MAX_ENTRIES; } else if (strcmp(name, "map_flags") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, - &map->def.map_flags)) + if (!get_map_field_int(map_name, btf, m, &map_def->map_flags)) return -EINVAL; - pr_debug("map '%s': found map_flags = %u.\n", - map->name, map->def.map_flags); + map_def->parts |= MAP_DEF_MAP_FLAGS; } else if (strcmp(name, "numa_node") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, &map->numa_node)) + if (!get_map_field_int(map_name, btf, m, &map_def->numa_node)) return -EINVAL; - pr_debug("map '%s': found numa_node = %u.\n", map->name, map->numa_node); + map_def->parts |= MAP_DEF_NUMA_NODE; } else if (strcmp(name, "key_size") == 0) { __u32 sz; - if (!get_map_field_int(map->name, obj->btf, m, &sz)) + if (!get_map_field_int(map_name, btf, m, &sz)) return -EINVAL; - pr_debug("map '%s': found key_size = %u.\n", - map->name, sz); - if (map->def.key_size && map->def.key_size != sz) { + if (map_def->key_size && map_def->key_size != sz) { pr_warn("map '%s': conflicting key size %u != %u.\n", - map->name, map->def.key_size, sz); + map_name, map_def->key_size, sz); return -EINVAL; } - map->def.key_size = sz; + map_def->key_size = sz; + map_def->parts |= MAP_DEF_KEY_SIZE; } else if (strcmp(name, "key") == 0) { __s64 sz; - t = btf__type_by_id(obj->btf, m->type); + t = btf__type_by_id(btf, m->type); if (!t) { pr_warn("map '%s': key type [%d] not found.\n", - map->name, m->type); + map_name, m->type); return -EINVAL; } if (!btf_is_ptr(t)) { pr_warn("map '%s': key spec is not PTR: %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - sz = btf__resolve_size(obj->btf, t->type); + sz = btf__resolve_size(btf, t->type); if (sz < 0) { pr_warn("map '%s': can't determine key size for type [%u]: %zd.\n", - map->name, t->type, (ssize_t)sz); + map_name, t->type, (ssize_t)sz); return sz; } - pr_debug("map '%s': found key [%u], sz = %zd.\n", - map->name, t->type, (ssize_t)sz); - if (map->def.key_size && map->def.key_size != sz) { + if (map_def->key_size && map_def->key_size != sz) { pr_warn("map '%s': conflicting key size %u != %zd.\n", - map->name, map->def.key_size, (ssize_t)sz); + map_name, map_def->key_size, (ssize_t)sz); return -EINVAL; } - map->def.key_size = sz; - map->btf_key_type_id = t->type; + map_def->key_size = sz; + map_def->key_type_id = t->type; + map_def->parts |= MAP_DEF_KEY_SIZE | MAP_DEF_KEY_TYPE; } else if (strcmp(name, "value_size") == 0) { __u32 sz; - if (!get_map_field_int(map->name, obj->btf, m, &sz)) + if (!get_map_field_int(map_name, btf, m, &sz)) return -EINVAL; - pr_debug("map '%s': found value_size = %u.\n", - map->name, sz); - if (map->def.value_size && map->def.value_size != sz) { + if (map_def->value_size && map_def->value_size != sz) { pr_warn("map '%s': conflicting value size %u != %u.\n", - map->name, map->def.value_size, sz); + map_name, map_def->value_size, sz); return -EINVAL; } - map->def.value_size = sz; + map_def->value_size = sz; + map_def->parts |= MAP_DEF_VALUE_SIZE; } else if (strcmp(name, "value") == 0) { __s64 sz; - t = btf__type_by_id(obj->btf, m->type); + t = btf__type_by_id(btf, m->type); if (!t) { pr_warn("map '%s': value type [%d] not found.\n", - map->name, m->type); + map_name, m->type); return -EINVAL; } if (!btf_is_ptr(t)) { pr_warn("map '%s': value spec is not PTR: %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - sz = btf__resolve_size(obj->btf, t->type); + sz = btf__resolve_size(btf, t->type); if (sz < 0) { pr_warn("map '%s': can't determine value size for type [%u]: %zd.\n", - map->name, t->type, (ssize_t)sz); + map_name, t->type, (ssize_t)sz); return sz; } - pr_debug("map '%s': found value [%u], sz = %zd.\n", - map->name, t->type, (ssize_t)sz); - if (map->def.value_size && map->def.value_size != sz) { + if (map_def->value_size && map_def->value_size != sz) { pr_warn("map '%s': conflicting value size %u != %zd.\n", - map->name, map->def.value_size, (ssize_t)sz); + map_name, map_def->value_size, (ssize_t)sz); return -EINVAL; } - map->def.value_size = sz; - map->btf_value_type_id = t->type; + map_def->value_size = sz; + map_def->value_type_id = t->type; + map_def->parts |= MAP_DEF_VALUE_SIZE | MAP_DEF_VALUE_TYPE; } else if (strcmp(name, "values") == 0) { + char inner_map_name[128]; int err; if (is_inner) { pr_warn("map '%s': multi-level inner maps not supported.\n", - map->name); + map_name); return -ENOTSUP; } if (i != vlen - 1) { pr_warn("map '%s': '%s' member should be last.\n", - map->name, name); + map_name, name); return -EINVAL; } - if (!bpf_map_type__is_map_in_map(map->def.type)) { + if (!bpf_map_type__is_map_in_map(map_def->map_type)) { pr_warn("map '%s': should be map-in-map.\n", - map->name); + map_name); return -ENOTSUP; } - if (map->def.value_size && map->def.value_size != 4) { + if (map_def->value_size && map_def->value_size != 4) { pr_warn("map '%s': conflicting value size %u != 4.\n", - map->name, map->def.value_size); + map_name, map_def->value_size); return -EINVAL; } - map->def.value_size = 4; - t = btf__type_by_id(obj->btf, m->type); + map_def->value_size = 4; + t = btf__type_by_id(btf, m->type); if (!t) { pr_warn("map '%s': map-in-map inner type [%d] not found.\n", - map->name, m->type); + map_name, m->type); return -EINVAL; } if (!btf_is_array(t) || btf_array(t)->nelems) { pr_warn("map '%s': map-in-map inner spec is not a zero-sized array.\n", - map->name); + map_name); return -EINVAL; } - t = skip_mods_and_typedefs(obj->btf, btf_array(t)->type, - NULL); + t = skip_mods_and_typedefs(btf, btf_array(t)->type, NULL); if (!btf_is_ptr(t)) { pr_warn("map '%s': map-in-map inner def is of unexpected kind %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - t = skip_mods_and_typedefs(obj->btf, t->type, NULL); + t = skip_mods_and_typedefs(btf, t->type, NULL); if (!btf_is_struct(t)) { pr_warn("map '%s': map-in-map inner def is of unexpected kind %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - map->inner_map = calloc(1, sizeof(*map->inner_map)); - if (!map->inner_map) - return -ENOMEM; - map->inner_map->sec_idx = obj->efile.btf_maps_shndx; - map->inner_map->name = malloc(strlen(map->name) + - sizeof(".inner") + 1); - if (!map->inner_map->name) - return -ENOMEM; - sprintf(map->inner_map->name, "%s.inner", map->name); - - err = parse_btf_map_def(obj, map->inner_map, t, strict, - true /* is_inner */, NULL); + snprintf(inner_map_name, sizeof(inner_map_name), "%s.inner", map_name); + err = parse_btf_map_def(inner_map_name, btf, t, strict, inner_def, NULL); if (err) return err; + + map_def->parts |= MAP_DEF_INNER_MAP; } else if (strcmp(name, "pinning") == 0) { __u32 val; - int err; if (is_inner) { - pr_debug("map '%s': inner def can't be pinned.\n", - map->name); + pr_warn("map '%s': inner def can't be pinned.\n", map_name); return -EINVAL; } - if (!get_map_field_int(map->name, obj->btf, m, &val)) + if (!get_map_field_int(map_name, btf, m, &val)) return -EINVAL; - pr_debug("map '%s': found pinning = %u.\n", - map->name, val); - - if (val != LIBBPF_PIN_NONE && - val != LIBBPF_PIN_BY_NAME) { + if (val != LIBBPF_PIN_NONE && val != LIBBPF_PIN_BY_NAME) { pr_warn("map '%s': invalid pinning value %u.\n", - map->name, val); + map_name, val); return -EINVAL; } - if (val == LIBBPF_PIN_BY_NAME) { - err = build_map_pin_path(map, pin_root_path); - if (err) { - pr_warn("map '%s': couldn't build pin path.\n", - map->name); - return err; - } - } + map_def->pinning = val; + map_def->parts |= MAP_DEF_PINNING; } else { if (strict) { - pr_warn("map '%s': unknown field '%s'.\n", - map->name, name); + pr_warn("map '%s': unknown field '%s'.\n", map_name, name); return -ENOTSUP; } - pr_debug("map '%s': ignoring unknown field '%s'.\n", - map->name, name); + pr_debug("map '%s': ignoring unknown field '%s'.\n", map_name, name); } } - if (map->def.type == BPF_MAP_TYPE_UNSPEC) { - pr_warn("map '%s': map type isn't specified.\n", map->name); + if (map_def->map_type == BPF_MAP_TYPE_UNSPEC) { + pr_warn("map '%s': map type isn't specified.\n", map_name); return -EINVAL; } return 0; } +static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def) +{ + map->def.type = def->map_type; + map->def.key_size = def->key_size; + map->def.value_size = def->value_size; + map->def.max_entries = def->max_entries; + map->def.map_flags = def->map_flags; + + map->numa_node = def->numa_node; + map->btf_key_type_id = def->key_type_id; + map->btf_value_type_id = def->value_type_id; + + if (def->parts & MAP_DEF_MAP_TYPE) + pr_debug("map '%s': found type = %u.\n", map->name, def->map_type); + + if (def->parts & MAP_DEF_KEY_TYPE) + pr_debug("map '%s': found key [%u], sz = %u.\n", + map->name, def->key_type_id, def->key_size); + else if (def->parts & MAP_DEF_KEY_SIZE) + pr_debug("map '%s': found key_size = %u.\n", map->name, def->key_size); + + if (def->parts & MAP_DEF_VALUE_TYPE) + pr_debug("map '%s': found value [%u], sz = %u.\n", + map->name, def->value_type_id, def->value_size); + else if (def->parts & MAP_DEF_VALUE_SIZE) + pr_debug("map '%s': found value_size = %u.\n", map->name, def->value_size); + + if (def->parts & MAP_DEF_MAX_ENTRIES) + pr_debug("map '%s': found max_entries = %u.\n", map->name, def->max_entries); + if (def->parts & MAP_DEF_MAP_FLAGS) + pr_debug("map '%s': found map_flags = %u.\n", map->name, def->map_flags); + if (def->parts & MAP_DEF_PINNING) + pr_debug("map '%s': found pinning = %u.\n", map->name, def->pinning); + if (def->parts & MAP_DEF_NUMA_NODE) + pr_debug("map '%s': found numa_node = %u.\n", map->name, def->numa_node); + + if (def->parts & MAP_DEF_INNER_MAP) + pr_debug("map '%s': found inner map definition.\n", map->name); +} + static int bpf_object__init_user_btf_map(struct bpf_object *obj, const struct btf_type *sec, int var_idx, int sec_idx, const Elf_Data *data, bool strict, const char *pin_root_path) { + struct btf_map_def map_def = {}, inner_def = {}; const struct btf_type *var, *def; const struct btf_var_secinfo *vi; const struct btf_var *var_extra; const char *map_name; struct bpf_map *map; + int err; vi = btf_var_secinfos(sec) + var_idx; var = btf__type_by_id(obj->btf, vi->type); @@ -2316,7 +2327,35 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, pr_debug("map '%s': at sec_idx %d, offset %zu.\n", map_name, map->sec_idx, map->sec_offset); - return parse_btf_map_def(obj, map, def, strict, false, pin_root_path); + err = parse_btf_map_def(map->name, obj->btf, def, strict, &map_def, &inner_def); + if (err) + return err; + + fill_map_from_def(map, &map_def); + + if (map_def.pinning == LIBBPF_PIN_BY_NAME) { + err = build_map_pin_path(map, pin_root_path); + if (err) { + pr_warn("map '%s': couldn't build pin path.\n", map->name); + return err; + } + } + + if (map_def.parts & MAP_DEF_INNER_MAP) { + map->inner_map = calloc(1, sizeof(*map->inner_map)); + if (!map->inner_map) + return -ENOMEM; + map->inner_map->fd = -1; + map->inner_map->sec_idx = sec_idx; + map->inner_map->name = malloc(strlen(map_name) + sizeof(".inner") + 1); + if (!map->inner_map->name) + return -ENOMEM; + sprintf(map->inner_map->name, "%s.inner", map_name); + + fill_map_from_def(map->inner_map, &inner_def); + } + + return 0; } static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, @@ -2618,7 +2657,7 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) { struct btf *kern_btf = obj->btf; bool btf_mandatory, sanitize; - int err = 0; + int i, err = 0; if (!obj->btf) return 0; @@ -2632,6 +2671,38 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) return 0; } + /* Even though some subprogs are global/weak, user might prefer more + * permissive BPF verification process that BPF verifier performs for + * static functions, taking into account more context from the caller + * functions. In such case, they need to mark such subprogs with + * __attribute__((visibility("hidden"))) and libbpf will adjust + * corresponding FUNC BTF type to be marked as static and trigger more + * involved BPF verification process. + */ + for (i = 0; i < obj->nr_programs; i++) { + struct bpf_program *prog = &obj->programs[i]; + struct btf_type *t; + const char *name; + int j, n; + + if (!prog->mark_btf_static || !prog_is_subprog(obj, prog)) + continue; + + n = btf__get_nr_types(obj->btf); + for (j = 1; j <= n; j++) { + t = btf_type_by_id(obj->btf, j); + if (!btf_is_func(t) || btf_func_linkage(t) != BTF_FUNC_GLOBAL) + continue; + + name = btf__str_by_offset(obj->btf, t->name_off); + if (strcmp(name, prog->name) != 0) + continue; + + t->info = btf_type_info(BTF_KIND_FUNC, BTF_FUNC_STATIC, 0); + break; + } + } + sanitize = btf_needs_sanitization(obj); if (sanitize) { const void *raw_data; @@ -2782,26 +2853,6 @@ static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn) return data; } -static int elf_sym_by_sec_off(const struct bpf_object *obj, size_t sec_idx, - size_t off, __u32 sym_type, GElf_Sym *sym) -{ - Elf_Data *symbols = obj->efile.symbols; - size_t n = symbols->d_size / sizeof(GElf_Sym); - int i; - - for (i = 0; i < n; i++) { - if (!gelf_getsym(symbols, i, sym)) - continue; - if (sym->st_shndx != sec_idx || sym->st_value != off) - continue; - if (GELF_ST_TYPE(sym->st_info) != sym_type) - continue; - return 0; - } - - return -ENOENT; -} - static bool is_sec_name_dwarf(const char *name) { /* approximation, but the actual list is too long */ @@ -3498,8 +3549,6 @@ static int bpf_program__record_reloc(struct bpf_program *prog, const char *sym_sec_name; struct bpf_map *map; - reloc_desc->processed = false; - if (!is_call_insn(insn) && !is_ldimm64_insn(insn)) { pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n", prog->name, sym_name, insn_idx, insn->code); @@ -3682,11 +3731,16 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data int err, i, nrels; const char *sym_name; __u32 insn_idx; + Elf_Scn *scn; + Elf_Data *scn_data; GElf_Sym sym; GElf_Rel rel; + scn = elf_sec_by_idx(obj, sec_idx); + scn_data = elf_sec_data(obj, scn); + relo_sec_name = elf_sec_str(obj, shdr->sh_name); - sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); + sec_name = elf_sec_name(obj, scn); if (!relo_sec_name || !sec_name) return -EINVAL; @@ -3704,7 +3758,8 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data relo_sec_name, (size_t)GELF_R_SYM(rel.r_info), i); return -LIBBPF_ERRNO__FORMAT; } - if (rel.r_offset % BPF_INSN_SZ) { + + if (rel.r_offset % BPF_INSN_SZ || rel.r_offset >= scn_data->d_size) { pr_warn("sec '%s': invalid offset 0x%zx for relo #%d\n", relo_sec_name, (size_t)GELF_R_SYM(rel.r_info), i); return -LIBBPF_ERRNO__FORMAT; @@ -3728,9 +3783,9 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); if (!prog) { - pr_warn("sec '%s': relo #%d: program not found in section '%s' for insn #%u\n", + pr_debug("sec '%s': relo #%d: couldn't find program in section '%s' for insn #%u, probably overridden weak function, skipping...\n", relo_sec_name, i, sec_name, insn_idx); - return -LIBBPF_ERRNO__RELOC; + continue; } relos = libbpf_reallocarray(prog->reloc_desc, @@ -3845,6 +3900,14 @@ __u32 bpf_map__max_entries(const struct bpf_map *map) return map->def.max_entries; } +struct bpf_map *bpf_map__inner_map(struct bpf_map *map) +{ + if (!bpf_map_type__is_map_in_map(map->def.type)) + return NULL; + + return map->inner_map; +} + int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) { if (map->fd >= 0) @@ -6305,13 +6368,11 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) case RELO_LD64: insn[0].src_reg = BPF_PSEUDO_MAP_FD; insn[0].imm = obj->maps[relo->map_idx].fd; - relo->processed = true; break; case RELO_DATA: insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; insn[1].imm = insn[0].imm + relo->sym_off; insn[0].imm = obj->maps[relo->map_idx].fd; - relo->processed = true; break; case RELO_EXTERN_VAR: ext = &obj->externs[relo->sym_off]; @@ -6329,13 +6390,11 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) insn[1].imm = ext->ksym.addr >> 32; } } - relo->processed = true; break; case RELO_EXTERN_FUNC: ext = &obj->externs[relo->sym_off]; insn[0].src_reg = BPF_PSEUDO_KFUNC_CALL; insn[0].imm = ext->ksym.kernel_btf_id; - relo->processed = true; break; case RELO_SUBPROG_ADDR: insn[0].src_reg = BPF_PSEUDO_FUNC; @@ -6621,9 +6680,6 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, * different main programs */ insn->imm = subprog->sub_insn_off - (prog->sub_insn_off + insn_idx) - 1; - if (relo) - relo->processed = true; - pr_debug("prog '%s': insn #%zu relocated, imm %d points to subprog '%s' (now at %zu offset)\n", prog->name, insn_idx, insn->imm, subprog->name, subprog->sub_insn_off); } @@ -6716,7 +6772,7 @@ static int bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog) { struct bpf_program *subprog; - int i, j, err; + int i, err; /* mark all subprogs as not relocated (yet) within the context of * current main program @@ -6727,9 +6783,6 @@ bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog) continue; subprog->sub_insn_off = 0; - for (j = 0; j < subprog->nr_reloc; j++) - if (subprog->reloc_desc[j].type == RELO_CALL) - subprog->reloc_desc[j].processed = false; } err = bpf_object__reloc_code(obj, prog, prog); @@ -6976,7 +7029,7 @@ static bool insn_is_helper_call(struct bpf_insn *insn, enum bpf_func_id *func_id return false; } -static int bpf_object__sanitize_prog(struct bpf_object* obj, struct bpf_program *prog) +static int bpf_object__sanitize_prog(struct bpf_object *obj, struct bpf_program *prog) { struct bpf_insn *insn = prog->insns; enum bpf_func_id func_id; @@ -9476,6 +9529,7 @@ int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd) pr_warn("error: inner_map_fd already specified\n"); return -EINVAL; } + zfree(&map->inner_map); map->inner_map_fd = fd; return 0; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index f500621d28e5..bec4e6a6e31d 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -480,6 +480,7 @@ LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); LIBBPF_API int bpf_map__unpin(struct bpf_map *map, const char *path); LIBBPF_API int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd); +LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map); LIBBPF_API long libbpf_get_error(const void *ptr); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index f5990f7208ce..b9b29baf1df8 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -359,5 +359,6 @@ LIBBPF_0.4.0 { bpf_linker__finalize; bpf_linker__free; bpf_linker__new; + bpf_map__inner_map; bpf_object__set_kversion; } LIBBPF_0.3.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 6017902c687e..ee426226928f 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -19,6 +19,7 @@ #pragma GCC poison reallocarray #include "libbpf.h" +#include "btf.h" #ifndef EM_BPF #define EM_BPF 247 @@ -131,6 +132,50 @@ struct btf; struct btf_type; struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id); +const char *btf_kind_str(const struct btf_type *t); +const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); + +static inline enum btf_func_linkage btf_func_linkage(const struct btf_type *t) +{ + return (enum btf_func_linkage)(int)btf_vlen(t); +} + +static inline __u32 btf_type_info(int kind, int vlen, int kflag) +{ + return (kflag << 31) | (kind << 24) | vlen; +} + +enum map_def_parts { + MAP_DEF_MAP_TYPE = 0x001, + MAP_DEF_KEY_TYPE = 0x002, + MAP_DEF_KEY_SIZE = 0x004, + MAP_DEF_VALUE_TYPE = 0x008, + MAP_DEF_VALUE_SIZE = 0x010, + MAP_DEF_MAX_ENTRIES = 0x020, + MAP_DEF_MAP_FLAGS = 0x040, + MAP_DEF_NUMA_NODE = 0x080, + MAP_DEF_PINNING = 0x100, + MAP_DEF_INNER_MAP = 0x200, + + MAP_DEF_ALL = 0x3ff, /* combination of all above */ +}; + +struct btf_map_def { + enum map_def_parts parts; + __u32 map_type; + __u32 key_type_id; + __u32 key_size; + __u32 value_type_id; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; + __u32 numa_node; + __u32 pinning; +}; + +int parse_btf_map_def(const char *map_name, struct btf *btf, + const struct btf_type *def_t, bool strict, + struct btf_map_def *map_def, struct btf_map_def *inner_def); void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t cur_cnt, size_t max_cnt, size_t add_cnt); diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 46b16cbdcda3..9de084b1c699 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -22,6 +22,8 @@ #include "libbpf_internal.h" #include "strset.h" +#define BTF_EXTERN_SEC ".extern" + struct src_sec { const char *sec_name; /* positional (not necessarily ELF) index in an array of sections */ @@ -74,11 +76,36 @@ struct btf_ext_sec_data { void *recs; }; +struct glob_sym { + /* ELF symbol index */ + int sym_idx; + /* associated section id for .ksyms, .kconfig, etc, but not .extern */ + int sec_id; + /* extern name offset in STRTAB */ + int name_off; + /* optional associated BTF type ID */ + int btf_id; + /* BTF type ID to which VAR/FUNC type is pointing to; used for + * rewriting types when extern VAR/FUNC is resolved to a concrete + * definition + */ + int underlying_btf_id; + /* sec_var index in the corresponding dst_sec, if exists */ + int var_idx; + + /* extern or resolved/global symbol */ + bool is_extern; + /* weak or strong symbol, never goes back from strong to weak */ + bool is_weak; +}; + struct dst_sec { char *sec_name; /* positional (not necessarily ELF) index in an array of sections */ int id; + bool ephemeral; + /* ELF info */ size_t sec_idx; Elf_Scn *scn; @@ -120,22 +147,28 @@ struct bpf_linker { struct btf *btf; struct btf_ext *btf_ext; + + /* global (including extern) ELF symbols */ + int glob_sym_cnt; + struct glob_sym *glob_syms; }; #define pr_warn_elf(fmt, ...) \ -do { \ - libbpf_print(LIBBPF_WARN, "libbpf: " fmt ": %s\n", ##__VA_ARGS__, elf_errmsg(-1)); \ -} while (0) + libbpf_print(LIBBPF_WARN, "libbpf: " fmt ": %s\n", ##__VA_ARGS__, elf_errmsg(-1)) static int init_output_elf(struct bpf_linker *linker, const char *file); static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, struct src_obj *obj); static int linker_sanity_check_elf(struct src_obj *obj); +static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec); +static int linker_sanity_check_elf_relos(struct src_obj *obj, struct src_sec *sec); static int linker_sanity_check_btf(struct src_obj *obj); static int linker_sanity_check_btf_ext(struct src_obj *obj); static int linker_fixup_btf(struct src_obj *obj); static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj); static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, + Elf64_Sym *sym, const char *sym_name, int src_sym_idx); static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj); static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj); static int linker_append_btf_ext(struct bpf_linker *linker, struct src_obj *obj); @@ -282,7 +315,7 @@ static int init_output_elf(struct bpf_linker *linker, const char *file) /* ELF header */ linker->elf_hdr = elf64_newehdr(linker->elf); - if (!linker->elf_hdr){ + if (!linker->elf_hdr) { pr_warn_elf("failed to create ELF header"); return -EINVAL; } @@ -663,8 +696,8 @@ static bool is_pow_of_2(size_t x) static int linker_sanity_check_elf(struct src_obj *obj) { - struct src_sec *sec, *link_sec; - int i, j, n; + struct src_sec *sec; + int i, err; if (!obj->symtab_sec_idx) { pr_warn("ELF is missing SYMTAB section in %s\n", obj->filename); @@ -692,43 +725,11 @@ static int linker_sanity_check_elf(struct src_obj *obj) return -EINVAL; switch (sec->shdr->sh_type) { - case SHT_SYMTAB: { - Elf64_Sym *sym; - - if (sec->shdr->sh_entsize != sizeof(Elf64_Sym)) - return -EINVAL; - if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) - return -EINVAL; - - if (!sec->shdr->sh_link || sec->shdr->sh_link >= obj->sec_cnt) { - pr_warn("ELF SYMTAB section #%zu points to missing STRTAB section #%zu in %s\n", - sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); - return -EINVAL; - } - link_sec = &obj->secs[sec->shdr->sh_link]; - if (link_sec->shdr->sh_type != SHT_STRTAB) { - pr_warn("ELF SYMTAB section #%zu points to invalid STRTAB section #%zu in %s\n", - sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); - return -EINVAL; - } - - n = sec->shdr->sh_size / sec->shdr->sh_entsize; - sym = sec->data->d_buf; - for (j = 0; j < n; j++, sym++) { - if (sym->st_shndx - && sym->st_shndx < SHN_LORESERVE - && sym->st_shndx >= obj->sec_cnt) { - pr_warn("ELF sym #%d in section #%zu points to missing section #%zu in %s\n", - j, sec->sec_idx, (size_t)sym->st_shndx, obj->filename); - return -EINVAL; - } - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION) { - if (sym->st_value != 0) - return -EINVAL; - } - } + case SHT_SYMTAB: + err = linker_sanity_check_elf_symtab(obj, sec); + if (err) + return err; break; - } case SHT_STRTAB: break; case SHT_PROGBITS: @@ -739,87 +740,169 @@ static int linker_sanity_check_elf(struct src_obj *obj) break; case SHT_NOBITS: break; - case SHT_REL: { - Elf64_Rel *relo; - struct src_sec *sym_sec; + case SHT_REL: + err = linker_sanity_check_elf_relos(obj, sec); + if (err) + return err; + break; + case SHT_LLVM_ADDRSIG: + break; + default: + pr_warn("ELF section #%zu (%s) has unrecognized type %zu in %s\n", + sec->sec_idx, sec->sec_name, (size_t)sec->shdr->sh_type, obj->filename); + return -EINVAL; + } + } - if (sec->shdr->sh_entsize != sizeof(Elf64_Rel)) - return -EINVAL; - if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) - return -EINVAL; + return 0; +} - /* SHT_REL's sh_link should point to SYMTAB */ - if (sec->shdr->sh_link != obj->symtab_sec_idx) { - pr_warn("ELF relo section #%zu points to invalid SYMTAB section #%zu in %s\n", - sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); - return -EINVAL; - } +static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec) +{ + struct src_sec *link_sec; + Elf64_Sym *sym; + int i, n; + + if (sec->shdr->sh_entsize != sizeof(Elf64_Sym)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; + + if (!sec->shdr->sh_link || sec->shdr->sh_link >= obj->sec_cnt) { + pr_warn("ELF SYMTAB section #%zu points to missing STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_link]; + if (link_sec->shdr->sh_type != SHT_STRTAB) { + pr_warn("ELF SYMTAB section #%zu points to invalid STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } - /* SHT_REL's sh_info points to relocated section */ - if (!sec->shdr->sh_info || sec->shdr->sh_info >= obj->sec_cnt) { - pr_warn("ELF relo section #%zu points to missing section #%zu in %s\n", - sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + sym = sec->data->d_buf; + for (i = 0; i < n; i++, sym++) { + int sym_type = ELF64_ST_TYPE(sym->st_info); + int sym_bind = ELF64_ST_BIND(sym->st_info); + int sym_vis = ELF64_ST_VISIBILITY(sym->st_other); + + if (i == 0) { + if (sym->st_name != 0 || sym->st_info != 0 + || sym->st_other != 0 || sym->st_shndx != 0 + || sym->st_value != 0 || sym->st_size != 0) { + pr_warn("ELF sym #0 is invalid in %s\n", obj->filename); return -EINVAL; } - link_sec = &obj->secs[sec->shdr->sh_info]; + continue; + } + if (sym_bind != STB_LOCAL && sym_bind != STB_GLOBAL && sym_bind != STB_WEAK) { + pr_warn("ELF sym #%d in section #%zu has unsupported symbol binding %d\n", + i, sec->sec_idx, sym_bind); + return -EINVAL; + } + if (sym_vis != STV_DEFAULT && sym_vis != STV_HIDDEN) { + pr_warn("ELF sym #%d in section #%zu has unsupported symbol visibility %d\n", + i, sec->sec_idx, sym_vis); + return -EINVAL; + } + if (sym->st_shndx == 0) { + if (sym_type != STT_NOTYPE || sym_bind == STB_LOCAL + || sym->st_value != 0 || sym->st_size != 0) { + pr_warn("ELF sym #%d is invalid extern symbol in %s\n", + i, obj->filename); - /* .rel<secname> -> <secname> pattern is followed */ - if (strncmp(sec->sec_name, ".rel", sizeof(".rel") - 1) != 0 - || strcmp(sec->sec_name + sizeof(".rel") - 1, link_sec->sec_name) != 0) { - pr_warn("ELF relo section #%zu name has invalid name in %s\n", - sec->sec_idx, obj->filename); return -EINVAL; } + continue; + } + if (sym->st_shndx < SHN_LORESERVE && sym->st_shndx >= obj->sec_cnt) { + pr_warn("ELF sym #%d in section #%zu points to missing section #%zu in %s\n", + i, sec->sec_idx, (size_t)sym->st_shndx, obj->filename); + return -EINVAL; + } + if (sym_type == STT_SECTION) { + if (sym->st_value != 0) + return -EINVAL; + continue; + } + } - /* don't further validate relocations for ignored sections */ - if (link_sec->skipped) - break; + return 0; +} - /* relocatable section is data or instructions */ - if (link_sec->shdr->sh_type != SHT_PROGBITS - && link_sec->shdr->sh_type != SHT_NOBITS) { - pr_warn("ELF relo section #%zu points to invalid section #%zu in %s\n", - sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); - return -EINVAL; - } +static int linker_sanity_check_elf_relos(struct src_obj *obj, struct src_sec *sec) +{ + struct src_sec *link_sec, *sym_sec; + Elf64_Rel *relo; + int i, n; - /* check sanity of each relocation */ - n = sec->shdr->sh_size / sec->shdr->sh_entsize; - relo = sec->data->d_buf; - sym_sec = &obj->secs[obj->symtab_sec_idx]; - for (j = 0; j < n; j++, relo++) { - size_t sym_idx = ELF64_R_SYM(relo->r_info); - size_t sym_type = ELF64_R_TYPE(relo->r_info); - - if (sym_type != R_BPF_64_64 && sym_type != R_BPF_64_32) { - pr_warn("ELF relo #%d in section #%zu has unexpected type %zu in %s\n", - j, sec->sec_idx, sym_type, obj->filename); - return -EINVAL; - } + if (sec->shdr->sh_entsize != sizeof(Elf64_Rel)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; - if (!sym_idx || sym_idx * sizeof(Elf64_Sym) >= sym_sec->shdr->sh_size) { - pr_warn("ELF relo #%d in section #%zu points to invalid symbol #%zu in %s\n", - j, sec->sec_idx, sym_idx, obj->filename); - return -EINVAL; - } + /* SHT_REL's sh_link should point to SYMTAB */ + if (sec->shdr->sh_link != obj->symtab_sec_idx) { + pr_warn("ELF relo section #%zu points to invalid SYMTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } - if (link_sec->shdr->sh_flags & SHF_EXECINSTR) { - if (relo->r_offset % sizeof(struct bpf_insn) != 0) { - pr_warn("ELF relo #%d in section #%zu points to missing symbol #%zu in %s\n", - j, sec->sec_idx, sym_idx, obj->filename); - return -EINVAL; - } - } - } - break; + /* SHT_REL's sh_info points to relocated section */ + if (!sec->shdr->sh_info || sec->shdr->sh_info >= obj->sec_cnt) { + pr_warn("ELF relo section #%zu points to missing section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_info]; + + /* .rel<secname> -> <secname> pattern is followed */ + if (strncmp(sec->sec_name, ".rel", sizeof(".rel") - 1) != 0 + || strcmp(sec->sec_name + sizeof(".rel") - 1, link_sec->sec_name) != 0) { + pr_warn("ELF relo section #%zu name has invalid name in %s\n", + sec->sec_idx, obj->filename); + return -EINVAL; + } + + /* don't further validate relocations for ignored sections */ + if (link_sec->skipped) + return 0; + + /* relocatable section is data or instructions */ + if (link_sec->shdr->sh_type != SHT_PROGBITS && link_sec->shdr->sh_type != SHT_NOBITS) { + pr_warn("ELF relo section #%zu points to invalid section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + + /* check sanity of each relocation */ + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + relo = sec->data->d_buf; + sym_sec = &obj->secs[obj->symtab_sec_idx]; + for (i = 0; i < n; i++, relo++) { + size_t sym_idx = ELF64_R_SYM(relo->r_info); + size_t sym_type = ELF64_R_TYPE(relo->r_info); + + if (sym_type != R_BPF_64_64 && sym_type != R_BPF_64_32) { + pr_warn("ELF relo #%d in section #%zu has unexpected type %zu in %s\n", + i, sec->sec_idx, sym_type, obj->filename); + return -EINVAL; } - case SHT_LLVM_ADDRSIG: - break; - default: - pr_warn("ELF section #%zu (%s) has unrecognized type %zu in %s\n", - sec->sec_idx, sec->sec_name, (size_t)sec->shdr->sh_type, obj->filename); + + if (!sym_idx || sym_idx * sizeof(Elf64_Sym) >= sym_sec->shdr->sh_size) { + pr_warn("ELF relo #%d in section #%zu points to invalid symbol #%zu in %s\n", + i, sec->sec_idx, sym_idx, obj->filename); return -EINVAL; } + + if (link_sec->shdr->sh_flags & SHF_EXECINSTR) { + if (relo->r_offset % sizeof(struct bpf_insn) != 0) { + pr_warn("ELF relo #%d in section #%zu points to missing symbol #%zu in %s\n", + i, sec->sec_idx, sym_idx, obj->filename); + return -EINVAL; + } + } } return 0; @@ -897,6 +980,7 @@ static int init_sec(struct bpf_linker *linker, struct dst_sec *dst_sec, struct s dst_sec->sec_sz = 0; dst_sec->sec_idx = 0; + dst_sec->ephemeral = src_sec->ephemeral; /* ephemeral sections are just thin section shells lacking most parts */ if (src_sec->ephemeral) @@ -904,13 +988,13 @@ static int init_sec(struct bpf_linker *linker, struct dst_sec *dst_sec, struct s scn = elf_newscn(linker->elf); if (!scn) - return -1; + return -ENOMEM; data = elf_newdata(scn); if (!data) - return -1; + return -ENOMEM; shdr = elf64_getshdr(scn); if (!shdr) - return -1; + return -ENOMEM; dst_sec->scn = scn; dst_sec->shdr = shdr; @@ -960,6 +1044,9 @@ static struct dst_sec *find_dst_sec_by_name(struct bpf_linker *linker, const cha static bool secs_match(struct dst_sec *dst, struct src_sec *src) { + if (dst->ephemeral || src->ephemeral) + return true; + if (dst->shdr->sh_type != src->shdr->sh_type) { pr_warn("sec %s types mismatch\n", dst->sec_name); return false; @@ -985,13 +1072,33 @@ static bool sec_content_is_same(struct dst_sec *dst_sec, struct src_sec *src_sec return true; } -static int extend_sec(struct dst_sec *dst, struct src_sec *src) +static int extend_sec(struct bpf_linker *linker, struct dst_sec *dst, struct src_sec *src) { void *tmp; - size_t dst_align = dst->shdr->sh_addralign; - size_t src_align = src->shdr->sh_addralign; + size_t dst_align, src_align; size_t dst_align_sz, dst_final_sz; + int err; + + /* Ephemeral source section doesn't contribute anything to ELF + * section data. + */ + if (src->ephemeral) + return 0; + + /* Some sections (like .maps) can contain both externs (and thus be + * ephemeral) and non-externs (map definitions). So it's possible that + * it has to be "upgraded" from ephemeral to non-ephemeral when the + * first non-ephemeral entity appears. In such case, we add ELF + * section, data, etc. + */ + if (dst->ephemeral) { + err = init_sec(linker, dst, src); + if (err) + return err; + } + dst_align = dst->shdr->sh_addralign; + src_align = src->shdr->sh_addralign; if (dst_align == 0) dst_align = 1; if (dst_align < src_align) @@ -1087,10 +1194,7 @@ static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj /* record mapped section index */ src_sec->dst_id = dst_sec->id; - if (src_sec->ephemeral) - continue; - - err = extend_sec(dst_sec, src_sec); + err = extend_sec(linker, dst_sec, src_sec); if (err) return err; } @@ -1101,68 +1205,778 @@ static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj) { struct src_sec *symtab = &obj->secs[obj->symtab_sec_idx]; - Elf64_Sym *sym = symtab->data->d_buf, *dst_sym; - int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize; + Elf64_Sym *sym = symtab->data->d_buf; + int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize, err; int str_sec_idx = symtab->shdr->sh_link; + const char *sym_name; obj->sym_map = calloc(n + 1, sizeof(*obj->sym_map)); if (!obj->sym_map) return -ENOMEM; for (i = 0; i < n; i++, sym++) { - struct src_sec *src_sec = NULL; - struct dst_sec *dst_sec = NULL; - const char *sym_name; - size_t dst_sym_idx; - int name_off; - - /* we already have all-zero initial symbol */ - if (sym->st_name == 0 && sym->st_info == 0 && - sym->st_other == 0 && sym->st_shndx == SHN_UNDEF && - sym->st_value == 0 && sym->st_size ==0) + /* We already validated all-zero symbol #0 and we already + * appended it preventively to the final SYMTAB, so skip it. + */ + if (i == 0) continue; sym_name = elf_strptr(obj->elf, str_sec_idx, sym->st_name); if (!sym_name) { pr_warn("can't fetch symbol name for symbol #%d in '%s'\n", i, obj->filename); - return -1; + return -EINVAL; + } + + err = linker_append_elf_sym(linker, obj, sym, sym_name, i); + if (err) + return err; + } + + return 0; +} + +static Elf64_Sym *get_sym_by_idx(struct bpf_linker *linker, size_t sym_idx) +{ + struct dst_sec *symtab = &linker->secs[linker->symtab_sec_idx]; + Elf64_Sym *syms = symtab->raw_data; + + return &syms[sym_idx]; +} + +static struct glob_sym *find_glob_sym(struct bpf_linker *linker, const char *sym_name) +{ + struct glob_sym *glob_sym; + const char *name; + int i; + + for (i = 0; i < linker->glob_sym_cnt; i++) { + glob_sym = &linker->glob_syms[i]; + name = strset__data(linker->strtab_strs) + glob_sym->name_off; + + if (strcmp(name, sym_name) == 0) + return glob_sym; + } + + return NULL; +} + +static struct glob_sym *add_glob_sym(struct bpf_linker *linker) +{ + struct glob_sym *syms, *sym; + + syms = libbpf_reallocarray(linker->glob_syms, linker->glob_sym_cnt + 1, + sizeof(*linker->glob_syms)); + if (!syms) + return NULL; + + sym = &syms[linker->glob_sym_cnt]; + memset(sym, 0, sizeof(*sym)); + sym->var_idx = -1; + + linker->glob_syms = syms; + linker->glob_sym_cnt++; + + return sym; +} + +static bool glob_sym_btf_matches(const char *sym_name, bool exact, + const struct btf *btf1, __u32 id1, + const struct btf *btf2, __u32 id2) +{ + const struct btf_type *t1, *t2; + bool is_static1, is_static2; + const char *n1, *n2; + int i, n; + +recur: + n1 = n2 = NULL; + t1 = skip_mods_and_typedefs(btf1, id1, &id1); + t2 = skip_mods_and_typedefs(btf2, id2, &id2); + + /* check if only one side is FWD, otherwise handle with common logic */ + if (!exact && btf_is_fwd(t1) != btf_is_fwd(t2)) { + n1 = btf__str_by_offset(btf1, t1->name_off); + n2 = btf__str_by_offset(btf2, t2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible forward declaration names '%s' and '%s'\n", + sym_name, n1, n2); + return false; } + /* validate if FWD kind matches concrete kind */ + if (btf_is_fwd(t1)) { + if (btf_kflag(t1) && btf_is_union(t2)) + return true; + if (!btf_kflag(t1) && btf_is_struct(t2)) + return true; + pr_warn("global '%s': incompatible %s forward declaration and concrete kind %s\n", + sym_name, btf_kflag(t1) ? "union" : "struct", btf_kind_str(t2)); + } else { + if (btf_kflag(t2) && btf_is_union(t1)) + return true; + if (!btf_kflag(t2) && btf_is_struct(t1)) + return true; + pr_warn("global '%s': incompatible %s forward declaration and concrete kind %s\n", + sym_name, btf_kflag(t2) ? "union" : "struct", btf_kind_str(t1)); + } + return false; + } + + if (btf_kind(t1) != btf_kind(t2)) { + pr_warn("global '%s': incompatible BTF kinds %s and %s\n", + sym_name, btf_kind_str(t1), btf_kind_str(t2)); + return false; + } + + switch (btf_kind(t1)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + n1 = btf__str_by_offset(btf1, t1->name_off); + n2 = btf__str_by_offset(btf2, t2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible %s names '%s' and '%s'\n", + sym_name, btf_kind_str(t1), n1, n2); + return false; + } + break; + default: + break; + } + + switch (btf_kind(t1)) { + case BTF_KIND_UNKN: /* void */ + case BTF_KIND_FWD: + return true; + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + /* ignore encoding for int and enum values for enum */ + if (t1->size != t2->size) { + pr_warn("global '%s': incompatible %s '%s' size %u and %u\n", + sym_name, btf_kind_str(t1), n1, t1->size, t2->size); + return false; + } + return true; + case BTF_KIND_PTR: + /* just validate overall shape of the referenced type, so no + * contents comparison for struct/union, and allowd fwd vs + * struct/union + */ + exact = false; + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_ARRAY: + /* ignore index type and array size */ + id1 = btf_array(t1)->type; + id2 = btf_array(t2)->type; + goto recur; + case BTF_KIND_FUNC: + /* extern and global linkages are compatible */ + is_static1 = btf_func_linkage(t1) == BTF_FUNC_STATIC; + is_static2 = btf_func_linkage(t2) == BTF_FUNC_STATIC; + if (is_static1 != is_static2) { + pr_warn("global '%s': incompatible func '%s' linkage\n", sym_name, n1); + return false; + } + + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_VAR: + /* extern and global linkages are compatible */ + is_static1 = btf_var(t1)->linkage == BTF_VAR_STATIC; + is_static2 = btf_var(t2)->linkage == BTF_VAR_STATIC; + if (is_static1 != is_static2) { + pr_warn("global '%s': incompatible var '%s' linkage\n", sym_name, n1); + return false; + } + + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m1, *m2; + + if (!exact) + return true; + + if (btf_vlen(t1) != btf_vlen(t2)) { + pr_warn("global '%s': incompatible number of %s fields %u and %u\n", + sym_name, btf_kind_str(t1), btf_vlen(t1), btf_vlen(t2)); + return false; + } + + n = btf_vlen(t1); + m1 = btf_members(t1); + m2 = btf_members(t2); + for (i = 0; i < n; i++, m1++, m2++) { + n1 = btf__str_by_offset(btf1, m1->name_off); + n2 = btf__str_by_offset(btf2, m2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible field #%d names '%s' and '%s'\n", + sym_name, i, n1, n2); + return false; + } + if (m1->offset != m2->offset) { + pr_warn("global '%s': incompatible field #%d ('%s') offsets\n", + sym_name, i, n1); + return false; + } + if (!glob_sym_btf_matches(sym_name, exact, btf1, m1->type, btf2, m2->type)) + return false; + } + + return true; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *m1, *m2; + + if (btf_vlen(t1) != btf_vlen(t2)) { + pr_warn("global '%s': incompatible number of %s params %u and %u\n", + sym_name, btf_kind_str(t1), btf_vlen(t1), btf_vlen(t2)); + return false; + } + + n = btf_vlen(t1); + m1 = btf_params(t1); + m2 = btf_params(t2); + for (i = 0; i < n; i++, m1++, m2++) { + /* ignore func arg names */ + if (!glob_sym_btf_matches(sym_name, exact, btf1, m1->type, btf2, m2->type)) + return false; + } + + /* now check return type as well */ + id1 = t1->type; + id2 = t2->type; + goto recur; + } + + /* skip_mods_and_typedefs() make this impossible */ + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + /* DATASECs are never compared with each other */ + case BTF_KIND_DATASEC: + default: + pr_warn("global '%s': unsupported BTF kind %s\n", + sym_name, btf_kind_str(t1)); + return false; + } +} + +static bool map_defs_match(const char *sym_name, + const struct btf *main_btf, + const struct btf_map_def *main_def, + const struct btf_map_def *main_inner_def, + const struct btf *extra_btf, + const struct btf_map_def *extra_def, + const struct btf_map_def *extra_inner_def) +{ + const char *reason; + + if (main_def->map_type != extra_def->map_type) { + reason = "type"; + goto mismatch; + } + + /* check key type/size match */ + if (main_def->key_size != extra_def->key_size) { + reason = "key_size"; + goto mismatch; + } + if (!!main_def->key_type_id != !!extra_def->key_type_id) { + reason = "key type"; + goto mismatch; + } + if ((main_def->parts & MAP_DEF_KEY_TYPE) + && !glob_sym_btf_matches(sym_name, true /*exact*/, + main_btf, main_def->key_type_id, + extra_btf, extra_def->key_type_id)) { + reason = "key type"; + goto mismatch; + } + + /* validate value type/size match */ + if (main_def->value_size != extra_def->value_size) { + reason = "value_size"; + goto mismatch; + } + if (!!main_def->value_type_id != !!extra_def->value_type_id) { + reason = "value type"; + goto mismatch; + } + if ((main_def->parts & MAP_DEF_VALUE_TYPE) + && !glob_sym_btf_matches(sym_name, true /*exact*/, + main_btf, main_def->value_type_id, + extra_btf, extra_def->value_type_id)) { + reason = "key type"; + goto mismatch; + } + + if (main_def->max_entries != extra_def->max_entries) { + reason = "max_entries"; + goto mismatch; + } + if (main_def->map_flags != extra_def->map_flags) { + reason = "map_flags"; + goto mismatch; + } + if (main_def->numa_node != extra_def->numa_node) { + reason = "numa_node"; + goto mismatch; + } + if (main_def->pinning != extra_def->pinning) { + reason = "pinning"; + goto mismatch; + } + + if ((main_def->parts & MAP_DEF_INNER_MAP) != (extra_def->parts & MAP_DEF_INNER_MAP)) { + reason = "inner map"; + goto mismatch; + } - if (sym->st_shndx && sym->st_shndx < SHN_LORESERVE) { - src_sec = &obj->secs[sym->st_shndx]; - if (src_sec->skipped) + if (main_def->parts & MAP_DEF_INNER_MAP) { + char inner_map_name[128]; + + snprintf(inner_map_name, sizeof(inner_map_name), "%s.inner", sym_name); + + return map_defs_match(inner_map_name, + main_btf, main_inner_def, NULL, + extra_btf, extra_inner_def, NULL); + } + + return true; + +mismatch: + pr_warn("global '%s': map %s mismatch\n", sym_name, reason); + return false; +} + +static bool glob_map_defs_match(const char *sym_name, + struct bpf_linker *linker, struct glob_sym *glob_sym, + struct src_obj *obj, Elf64_Sym *sym, int btf_id) +{ + struct btf_map_def dst_def = {}, dst_inner_def = {}; + struct btf_map_def src_def = {}, src_inner_def = {}; + const struct btf_type *t; + int err; + + t = btf__type_by_id(obj->btf, btf_id); + if (!btf_is_var(t)) { + pr_warn("global '%s': invalid map definition type [%d]\n", sym_name, btf_id); + return false; + } + t = skip_mods_and_typedefs(obj->btf, t->type, NULL); + + err = parse_btf_map_def(sym_name, obj->btf, t, true /*strict*/, &src_def, &src_inner_def); + if (err) { + pr_warn("global '%s': invalid map definition\n", sym_name); + return false; + } + + /* re-parse existing map definition */ + t = btf__type_by_id(linker->btf, glob_sym->btf_id); + t = skip_mods_and_typedefs(linker->btf, t->type, NULL); + err = parse_btf_map_def(sym_name, linker->btf, t, true /*strict*/, &dst_def, &dst_inner_def); + if (err) { + /* this should not happen, because we already validated it */ + pr_warn("global '%s': invalid dst map definition\n", sym_name); + return false; + } + + /* Currently extern map definition has to be complete and match + * concrete map definition exactly. This restriction might be lifted + * in the future. + */ + return map_defs_match(sym_name, linker->btf, &dst_def, &dst_inner_def, + obj->btf, &src_def, &src_inner_def); +} + +static bool glob_syms_match(const char *sym_name, + struct bpf_linker *linker, struct glob_sym *glob_sym, + struct src_obj *obj, Elf64_Sym *sym, size_t sym_idx, int btf_id) +{ + const struct btf_type *src_t; + + /* if we are dealing with externs, BTF types describing both global + * and extern VARs/FUNCs should be completely present in all files + */ + if (!glob_sym->btf_id || !btf_id) { + pr_warn("BTF info is missing for global symbol '%s'\n", sym_name); + return false; + } + + src_t = btf__type_by_id(obj->btf, btf_id); + if (!btf_is_var(src_t) && !btf_is_func(src_t)) { + pr_warn("only extern variables and functions are supported, but got '%s' for '%s'\n", + btf_kind_str(src_t), sym_name); + return false; + } + + /* deal with .maps definitions specially */ + if (glob_sym->sec_id && strcmp(linker->secs[glob_sym->sec_id].sec_name, MAPS_ELF_SEC) == 0) + return glob_map_defs_match(sym_name, linker, glob_sym, obj, sym, btf_id); + + if (!glob_sym_btf_matches(sym_name, true /*exact*/, + linker->btf, glob_sym->btf_id, obj->btf, btf_id)) + return false; + + return true; +} + +static bool btf_is_non_static(const struct btf_type *t) +{ + return (btf_is_var(t) && btf_var(t)->linkage != BTF_VAR_STATIC) + || (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_STATIC); +} + +static int find_glob_sym_btf(struct src_obj *obj, Elf64_Sym *sym, const char *sym_name, + int *out_btf_sec_id, int *out_btf_id) +{ + int i, j, n = btf__get_nr_types(obj->btf), m, btf_id = 0; + const struct btf_type *t; + const struct btf_var_secinfo *vi; + const char *name; + + for (i = 1; i <= n; i++) { + t = btf__type_by_id(obj->btf, i); + + /* some global and extern FUNCs and VARs might not be associated with any + * DATASEC, so try to detect them in the same pass + */ + if (btf_is_non_static(t)) { + name = btf__str_by_offset(obj->btf, t->name_off); + if (strcmp(name, sym_name) != 0) continue; - dst_sec = &linker->secs[src_sec->dst_id]; - /* allow only one STT_SECTION symbol per section */ - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION && dst_sec->sec_sym_idx) { - obj->sym_map[i] = dst_sec->sec_sym_idx; + /* remember and still try to find DATASEC */ + btf_id = i; + continue; + } + + if (!btf_is_datasec(t)) + continue; + + vi = btf_var_secinfos(t); + for (j = 0, m = btf_vlen(t); j < m; j++, vi++) { + t = btf__type_by_id(obj->btf, vi->type); + name = btf__str_by_offset(obj->btf, t->name_off); + + if (strcmp(name, sym_name) != 0) + continue; + if (btf_is_var(t) && btf_var(t)->linkage == BTF_VAR_STATIC) continue; + if (btf_is_func(t) && btf_func_linkage(t) == BTF_FUNC_STATIC) + continue; + + if (btf_id && btf_id != vi->type) { + pr_warn("global/extern '%s' BTF is ambiguous: both types #%d and #%u match\n", + sym_name, btf_id, vi->type); + return -EINVAL; } + + *out_btf_sec_id = i; + *out_btf_id = vi->type; + + return 0; } + } + + /* free-floating extern or global FUNC */ + if (btf_id) { + *out_btf_sec_id = 0; + *out_btf_id = btf_id; + return 0; + } - name_off = strset__add_str(linker->strtab_strs, sym_name); - if (name_off < 0) - return name_off; + pr_warn("failed to find BTF info for global/extern symbol '%s'\n", sym_name); + return -ENOENT; +} - dst_sym = add_new_sym(linker, &dst_sym_idx); - if (!dst_sym) - return -ENOMEM; +static struct src_sec *find_src_sec_by_name(struct src_obj *obj, const char *sec_name) +{ + struct src_sec *sec; + int i; - dst_sym->st_name = name_off; - dst_sym->st_info = sym->st_info; - dst_sym->st_other = sym->st_other; - dst_sym->st_shndx = src_sec ? dst_sec->sec_idx : sym->st_shndx; - dst_sym->st_value = (src_sec ? src_sec->dst_off : 0) + sym->st_value; - dst_sym->st_size = sym->st_size; + for (i = 1; i < obj->sec_cnt; i++) { + sec = &obj->secs[i]; + + if (strcmp(sec->sec_name, sec_name) == 0) + return sec; + } + + return NULL; +} + +static int complete_extern_btf_info(struct btf *dst_btf, int dst_id, + struct btf *src_btf, int src_id) +{ + struct btf_type *dst_t = btf_type_by_id(dst_btf, dst_id); + struct btf_type *src_t = btf_type_by_id(src_btf, src_id); + struct btf_param *src_p, *dst_p; + const char *s; + int i, n, off; + + /* We already made sure that source and destination types (FUNC or + * VAR) match in terms of types and argument names. + */ + if (btf_is_var(dst_t)) { + btf_var(dst_t)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + return 0; + } + + dst_t->info = btf_type_info(BTF_KIND_FUNC, BTF_FUNC_GLOBAL, 0); + + /* now onto FUNC_PROTO types */ + src_t = btf_type_by_id(src_btf, src_t->type); + dst_t = btf_type_by_id(dst_btf, dst_t->type); + + /* Fill in all the argument names, which for extern FUNCs are missing. + * We'll end up with two copies of FUNCs/VARs for externs, but that + * will be taken care of by BTF dedup at the very end. + * It might be that BTF types for extern in one file has less/more BTF + * information (e.g., FWD instead of full STRUCT/UNION information), + * but that should be (in most cases, subject to BTF dedup rules) + * handled and resolved by BTF dedup algorithm as well, so we won't + * worry about it. Our only job is to make sure that argument names + * are populated on both sides, otherwise BTF dedup will pedantically + * consider them different. + */ + src_p = btf_params(src_t); + dst_p = btf_params(dst_t); + for (i = 0, n = btf_vlen(dst_t); i < n; i++, src_p++, dst_p++) { + if (!src_p->name_off) + continue; + + /* src_btf has more complete info, so add name to dst_btf */ + s = btf__str_by_offset(src_btf, src_p->name_off); + off = btf__add_str(dst_btf, s); + if (off < 0) + return off; + dst_p->name_off = off; + } + return 0; +} + +static void sym_update_bind(Elf64_Sym *sym, int sym_bind) +{ + sym->st_info = ELF64_ST_INFO(sym_bind, ELF64_ST_TYPE(sym->st_info)); +} + +static void sym_update_type(Elf64_Sym *sym, int sym_type) +{ + sym->st_info = ELF64_ST_INFO(ELF64_ST_BIND(sym->st_info), sym_type); +} + +static void sym_update_visibility(Elf64_Sym *sym, int sym_vis) +{ + /* libelf doesn't provide setters for ST_VISIBILITY, + * but it is stored in the lower 2 bits of st_other + */ + sym->st_other &= 0x03; + sym->st_other |= sym_vis; +} + +static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, + Elf64_Sym *sym, const char *sym_name, int src_sym_idx) +{ + struct src_sec *src_sec = NULL; + struct dst_sec *dst_sec = NULL; + struct glob_sym *glob_sym = NULL; + int name_off, sym_type, sym_bind, sym_vis, err; + int btf_sec_id = 0, btf_id = 0; + size_t dst_sym_idx; + Elf64_Sym *dst_sym; + bool sym_is_extern; + + sym_type = ELF64_ST_TYPE(sym->st_info); + sym_bind = ELF64_ST_BIND(sym->st_info); + sym_vis = ELF64_ST_VISIBILITY(sym->st_other); + sym_is_extern = sym->st_shndx == SHN_UNDEF; + + if (sym_is_extern) { + if (!obj->btf) { + pr_warn("externs without BTF info are not supported\n"); + return -ENOTSUP; + } + } else if (sym->st_shndx < SHN_LORESERVE) { + src_sec = &obj->secs[sym->st_shndx]; + if (src_sec->skipped) + return 0; + dst_sec = &linker->secs[src_sec->dst_id]; + + /* allow only one STT_SECTION symbol per section */ + if (sym_type == STT_SECTION && dst_sec->sec_sym_idx) { + obj->sym_map[src_sym_idx] = dst_sec->sec_sym_idx; + return 0; + } + } + + if (sym_bind == STB_LOCAL) + goto add_sym; + + /* find matching BTF info */ + err = find_glob_sym_btf(obj, sym, sym_name, &btf_sec_id, &btf_id); + if (err) + return err; + + if (sym_is_extern && btf_sec_id) { + const char *sec_name = NULL; + const struct btf_type *t; + + t = btf__type_by_id(obj->btf, btf_sec_id); + sec_name = btf__str_by_offset(obj->btf, t->name_off); + + /* Clang puts unannotated extern vars into + * '.extern' BTF DATASEC. Treat them the same + * as unannotated extern funcs (which are + * currently not put into any DATASECs). + * Those don't have associated src_sec/dst_sec. + */ + if (strcmp(sec_name, BTF_EXTERN_SEC) != 0) { + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("failed to find matching ELF sec '%s'\n", sec_name); + return -ENOENT; + } + dst_sec = &linker->secs[src_sec->dst_id]; + } + } + + glob_sym = find_glob_sym(linker, sym_name); + if (glob_sym) { + /* Preventively resolve to existing symbol. This is + * needed for further relocation symbol remapping in + * the next step of linking. + */ + obj->sym_map[src_sym_idx] = glob_sym->sym_idx; + + /* If both symbols are non-externs, at least one of + * them has to be STB_WEAK, otherwise they are in + * a conflict with each other. + */ + if (!sym_is_extern && !glob_sym->is_extern + && !glob_sym->is_weak && sym_bind != STB_WEAK) { + pr_warn("conflicting non-weak symbol #%d (%s) definition in '%s'\n", + src_sym_idx, sym_name, obj->filename); + return -EINVAL; + } - obj->sym_map[i] = dst_sym_idx; + if (!glob_syms_match(sym_name, linker, glob_sym, obj, sym, src_sym_idx, btf_id)) + return -EINVAL; + + dst_sym = get_sym_by_idx(linker, glob_sym->sym_idx); - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION && dst_sym) { - dst_sec->sec_sym_idx = dst_sym_idx; - dst_sym->st_value = 0; + /* If new symbol is strong, then force dst_sym to be strong as + * well; this way a mix of weak and non-weak extern + * definitions will end up being strong. + */ + if (sym_bind == STB_GLOBAL) { + /* We still need to preserve type (NOTYPE or + * OBJECT/FUNC, depending on whether the symbol is + * extern or not) + */ + sym_update_bind(dst_sym, STB_GLOBAL); + glob_sym->is_weak = false; } + /* Non-default visibility is "contaminating", with stricter + * visibility overwriting more permissive ones, even if more + * permissive visibility comes from just an extern definition. + * Currently only STV_DEFAULT and STV_HIDDEN are allowed and + * ensured by ELF symbol sanity checks above. + */ + if (sym_vis > ELF64_ST_VISIBILITY(dst_sym->st_other)) + sym_update_visibility(dst_sym, sym_vis); + + /* If the new symbol is extern, then regardless if + * existing symbol is extern or resolved global, just + * keep the existing one untouched. + */ + if (sym_is_extern) + return 0; + + /* If existing symbol is a strong resolved symbol, bail out, + * because we lost resolution battle have nothing to + * contribute. We already checked abover that there is no + * strong-strong conflict. We also already tightened binding + * and visibility, so nothing else to contribute at that point. + */ + if (!glob_sym->is_extern && sym_bind == STB_WEAK) + return 0; + + /* At this point, new symbol is strong non-extern, + * so overwrite glob_sym with new symbol information. + * Preserve binding and visibility. + */ + sym_update_type(dst_sym, sym_type); + dst_sym->st_shndx = dst_sec->sec_idx; + dst_sym->st_value = src_sec->dst_off + sym->st_value; + dst_sym->st_size = sym->st_size; + + /* see comment below about dst_sec->id vs dst_sec->sec_idx */ + glob_sym->sec_id = dst_sec->id; + glob_sym->is_extern = false; + + if (complete_extern_btf_info(linker->btf, glob_sym->btf_id, + obj->btf, btf_id)) + return -EINVAL; + + /* request updating VAR's/FUNC's underlying BTF type when appending BTF type */ + glob_sym->underlying_btf_id = 0; + + obj->sym_map[src_sym_idx] = glob_sym->sym_idx; + return 0; + } + +add_sym: + name_off = strset__add_str(linker->strtab_strs, sym_name); + if (name_off < 0) + return name_off; + + dst_sym = add_new_sym(linker, &dst_sym_idx); + if (!dst_sym) + return -ENOMEM; + + dst_sym->st_name = name_off; + dst_sym->st_info = sym->st_info; + dst_sym->st_other = sym->st_other; + dst_sym->st_shndx = dst_sec ? dst_sec->sec_idx : sym->st_shndx; + dst_sym->st_value = (src_sec ? src_sec->dst_off : 0) + sym->st_value; + dst_sym->st_size = sym->st_size; + + obj->sym_map[src_sym_idx] = dst_sym_idx; + + if (sym_type == STT_SECTION && dst_sym) { + dst_sec->sec_sym_idx = dst_sym_idx; + dst_sym->st_value = 0; + } + + if (sym_bind != STB_LOCAL) { + glob_sym = add_glob_sym(linker); + if (!glob_sym) + return -ENOMEM; + + glob_sym->sym_idx = dst_sym_idx; + /* we use dst_sec->id (and not dst_sec->sec_idx), because + * ephemeral sections (.kconfig, .ksyms, etc) don't have + * sec_idx (as they don't have corresponding ELF section), but + * still have id. .extern doesn't have even ephemeral section + * associated with it, so dst_sec->id == dst_sec->sec_idx == 0. + */ + glob_sym->sec_id = dst_sec ? dst_sec->id : 0; + glob_sym->name_off = name_off; + /* we will fill btf_id in during BTF merging step */ + glob_sym->btf_id = 0; + glob_sym->is_extern = sym_is_extern; + glob_sym->is_weak = sym_bind == STB_WEAK; } return 0; @@ -1200,7 +2014,7 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob return err; } } else if (!secs_match(dst_sec, src_sec)) { - pr_warn("Secs %s are not compatible\n", src_sec->sec_name); + pr_warn("sections %s are not compatible\n", src_sec->sec_name); return -1; } @@ -1212,7 +2026,7 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob dst_sec->shdr->sh_info = dst_linked_sec->sec_idx; src_sec->dst_id = dst_sec->id; - err = extend_sec(dst_sec, src_sec); + err = extend_sec(linker, dst_sec, src_sec); if (err) return err; @@ -1265,21 +2079,6 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob return 0; } -static struct src_sec *find_src_sec_by_name(struct src_obj *obj, const char *sec_name) -{ - struct src_sec *sec; - int i; - - for (i = 1; i < obj->sec_cnt; i++) { - sec = &obj->secs[i]; - - if (strcmp(sec->sec_name, sec_name) == 0) - return sec; - } - - return NULL; -} - static Elf64_Sym *find_sym_by_name(struct src_obj *obj, size_t sec_idx, int sym_type, const char *sym_name) { @@ -1334,12 +2133,32 @@ static int linker_fixup_btf(struct src_obj *obj) t->size = sec->shdr->sh_size; } else { /* BTF can have some sections that are not represented - * in ELF, e.g., .kconfig and .ksyms, which are used - * for special extern variables. Here we'll - * pre-create "section shells" for them to be able to - * keep track of extra per-section metadata later - * (e.g., BTF variables). + * in ELF, e.g., .kconfig, .ksyms, .extern, which are used + * for special extern variables. + * + * For all but one such special (ephemeral) + * sections, we pre-create "section shells" to be able + * to keep track of extra per-section metadata later + * (e.g., those BTF extern variables). + * + * .extern is even more special, though, because it + * contains extern variables that need to be resolved + * by static linker, not libbpf and kernel. When such + * externs are resolved, we are going to remove them + * from .extern BTF section and might end up not + * needing it at all. Each resolved extern should have + * matching non-extern VAR/FUNC in other sections. + * + * We do support leaving some of the externs + * unresolved, though, to support cases of building + * libraries, which will later be linked against final + * BPF applications. So if at finalization we still + * see unresolved externs, we'll create .extern + * section on our own. */ + if (strcmp(sec_name, BTF_EXTERN_SEC) == 0) + continue; + sec = add_src_sec(obj, sec_name); if (!sec) return -ENOMEM; @@ -1379,6 +2198,13 @@ static int linker_fixup_btf(struct src_obj *obj) static int remap_type_id(__u32 *type_id, void *ctx) { int *id_map = ctx; + int new_id = id_map[*type_id]; + + /* Error out if the type wasn't remapped. Ignore VOID which stays VOID. */ + if (new_id == 0 && *type_id != 0) { + pr_warn("failed to find new ID mapping for original BTF type ID %u\n", *type_id); + return -EINVAL; + } *type_id = id_map[*type_id]; @@ -1389,6 +2215,7 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) { const struct btf_type *t; int i, j, n, start_id, id; + const char *name; if (!obj->btf) return 0; @@ -1401,12 +2228,44 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) return -ENOMEM; for (i = 1; i <= n; i++) { + struct glob_sym *glob_sym = NULL; + t = btf__type_by_id(obj->btf, i); /* DATASECs are handled specially below */ if (btf_kind(t) == BTF_KIND_DATASEC) continue; + if (btf_is_non_static(t)) { + /* there should be glob_sym already */ + name = btf__str_by_offset(obj->btf, t->name_off); + glob_sym = find_glob_sym(linker, name); + + /* VARs without corresponding glob_sym are those that + * belong to skipped/deduplicated sections (i.e., + * license and version), so just skip them + */ + if (!glob_sym) + continue; + + /* linker_append_elf_sym() might have requested + * updating underlying type ID, if extern was resolved + * to strong symbol or weak got upgraded to non-weak + */ + if (glob_sym->underlying_btf_id == 0) + glob_sym->underlying_btf_id = -t->type; + + /* globals from previous object files that match our + * VAR/FUNC already have a corresponding associated + * BTF type, so just make sure to use it + */ + if (glob_sym->btf_id) { + /* reuse existing BTF type for global var/func */ + obj->btf_type_map[i] = glob_sym->btf_id; + continue; + } + } + id = btf__add_type(linker->btf, obj->btf, t); if (id < 0) { pr_warn("failed to append BTF type #%d from file '%s'\n", i, obj->filename); @@ -1414,6 +2273,12 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) } obj->btf_type_map[i] = id; + + /* record just appended BTF type for var/func */ + if (glob_sym) { + glob_sym->btf_id = id; + glob_sym->underlying_btf_id = -t->type; + } } /* remap all the types except DATASECs */ @@ -1425,6 +2290,22 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) return -EINVAL; } + /* Rewrite VAR/FUNC underlying types (i.e., FUNC's FUNC_PROTO and VAR's + * actual type), if necessary + */ + for (i = 0; i < linker->glob_sym_cnt; i++) { + struct glob_sym *glob_sym = &linker->glob_syms[i]; + struct btf_type *glob_t; + + if (glob_sym->underlying_btf_id >= 0) + continue; + + glob_sym->underlying_btf_id = obj->btf_type_map[-glob_sym->underlying_btf_id]; + + glob_t = btf_type_by_id(linker->btf, glob_sym->btf_id); + glob_t->type = glob_sym->underlying_btf_id; + } + /* append DATASEC info */ for (i = 1; i < obj->sec_cnt; i++) { struct src_sec *src_sec; @@ -1452,6 +2333,42 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) n = btf_vlen(t); for (j = 0; j < n; j++, src_var++) { void *sec_vars = dst_sec->sec_vars; + int new_id = obj->btf_type_map[src_var->type]; + struct glob_sym *glob_sym = NULL; + + t = btf_type_by_id(linker->btf, new_id); + if (btf_is_non_static(t)) { + name = btf__str_by_offset(linker->btf, t->name_off); + glob_sym = find_glob_sym(linker, name); + if (glob_sym->sec_id != dst_sec->id) { + pr_warn("global '%s': section mismatch %d vs %d\n", + name, glob_sym->sec_id, dst_sec->id); + return -EINVAL; + } + } + + /* If there is already a member (VAR or FUNC) mapped + * to the same type, don't add a duplicate entry. + * This will happen when multiple object files define + * the same extern VARs/FUNCs. + */ + if (glob_sym && glob_sym->var_idx >= 0) { + __s64 sz; + + dst_var = &dst_sec->sec_vars[glob_sym->var_idx]; + /* Because underlying BTF type might have + * changed, so might its size have changed, so + * re-calculate and update it in sec_var. + */ + sz = btf__resolve_size(linker->btf, glob_sym->underlying_btf_id); + if (sz < 0) { + pr_warn("global '%s': failed to resolve size of underlying type: %d\n", + name, (int)sz); + return -EINVAL; + } + dst_var->size = sz; + continue; + } sec_vars = libbpf_reallocarray(sec_vars, dst_sec->sec_var_cnt + 1, @@ -1466,6 +2383,9 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) dst_var->type = obj->btf_type_map[src_var->type]; dst_var->size = src_var->size; dst_var->offset = src_sec->dst_off + src_var->offset; + + if (glob_sym) + glob_sym->var_idx = dst_sec->sec_var_cnt - 1; } } @@ -1895,7 +2815,7 @@ static int finalize_btf_ext(struct bpf_linker *linker) hdr->func_info_len = funcs_sz; hdr->line_info_off = funcs_sz; hdr->line_info_len = lines_sz; - hdr->core_relo_off = funcs_sz + lines_sz;; + hdr->core_relo_off = funcs_sz + lines_sz; hdr->core_relo_len = core_relos_sz; if (funcs_sz) { diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index a402f32a145c..91130648d8e6 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -39,8 +39,6 @@ EXTRA_WARNINGS += -Wundef EXTRA_WARNINGS += -Wwrite-strings EXTRA_WARNINGS += -Wformat -CC_NO_CLANG := $(shell $(CC) -dM -E -x c /dev/null | grep -Fq "__clang__"; echo $$?) - # Makefiles suck: This macro sets a default value of $(2) for the # variable named by $(1), unless the variable has been set by # environment or command line. This is necessary for CC and AR @@ -52,12 +50,22 @@ define allow-override $(eval $(1) = $(2))) endef +ifneq ($(LLVM),) +$(call allow-override,CC,clang) +$(call allow-override,AR,llvm-ar) +$(call allow-override,LD,ld.lld) +$(call allow-override,CXX,clang++) +$(call allow-override,STRIP,llvm-strip) +else # Allow setting various cross-compile vars or setting CROSS_COMPILE as a prefix. $(call allow-override,CC,$(CROSS_COMPILE)gcc) $(call allow-override,AR,$(CROSS_COMPILE)ar) $(call allow-override,LD,$(CROSS_COMPILE)ld) $(call allow-override,CXX,$(CROSS_COMPILE)g++) $(call allow-override,STRIP,$(CROSS_COMPILE)strip) +endif + +CC_NO_CLANG := $(shell $(CC) -dM -E -x c /dev/null | grep -Fq "__clang__"; echo $$?) ifneq ($(LLVM),) HOSTAR ?= llvm-ar diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 6448c626498f..283e5ad8385e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -21,13 +21,18 @@ endif BPF_GCC ?= $(shell command -v bpf-gcc;) SAN_CFLAGS ?= -CFLAGS += -g -Og -rdynamic -Wall $(GENFLAGS) $(SAN_CFLAGS) \ +CFLAGS += -g -O0 -rdynamic -Wall $(GENFLAGS) $(SAN_CFLAGS) \ -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT) \ -Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_load_program=bpf_test_load_program LDLIBS += -lcap -lelf -lz -lrt -lpthread +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_verifier_log test_dev_cgroup \ @@ -182,7 +187,6 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) cp $(SCRATCH_DIR)/runqslower $@ $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ) -$(TEST_GEN_FILES): docs $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c $(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c @@ -201,10 +205,12 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ CC=$(HOSTCC) LD=$(HOSTLD) \ - EXTRA_CFLAGS='-g -Og' \ + EXTRA_CFLAGS='-g -O0' \ OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install +all: docs + docs: $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras) \ -f Makefile.docs \ @@ -219,7 +225,7 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ | $(INCLUDE_DIR) $(BUILD_DIR)/libbpf $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \ - EXTRA_CFLAGS='-g -Og' \ + EXTRA_CFLAGS='-g -O0' \ DESTDIR=$(SCRATCH_DIR) prefix= all install_headers ifneq ($(BPFOBJ),$(HOST_BPFOBJ)) @@ -227,7 +233,7 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ | $(INCLUDE_DIR) $(HOST_BUILD_DIR)/libbpf $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) \ - EXTRA_CFLAGS='-g -Og' \ + EXTRA_CFLAGS='-g -O0' \ OUTPUT=$(HOST_BUILD_DIR)/libbpf/ CC=$(HOSTCC) LD=$(HOSTLD) \ DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers endif @@ -303,9 +309,15 @@ endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c -LINKED_SKELS := test_static_linked.skel.h +LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ + linked_vars.skel.h linked_maps.skel.h test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o +linked_funcs.skel.h-deps := linked_funcs1.o linked_funcs2.o +linked_vars.skel.h-deps := linked_vars1.o linked_vars2.o +linked_maps.skel.h-deps := linked_maps1.o linked_maps2.o + +LINKED_BPF_SRCS := $(patsubst %.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. @@ -325,7 +337,7 @@ TRUNNER_TESTS_HDR := $(TRUNNER_TESTS_DIR)/tests.h TRUNNER_BPF_SRCS := $$(notdir $$(wildcard $(TRUNNER_BPF_PROGS_DIR)/*.c)) TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS)) TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ - $$(filter-out $(SKEL_BLACKLIST), \ + $$(filter-out $(SKEL_BLACKLIST) $(LINKED_BPF_SRCS),\ $$(TRUNNER_BPF_SRCS))) TRUNNER_BPF_SKELS_LINKED := $$(addprefix $$(TRUNNER_OUTPUT)/,$(LINKED_SKELS)) TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS) @@ -481,7 +493,7 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT) # Make sure we are able to include and link libbpf against c++. $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) $(call msg,CXX,,$@) - $(Q)$(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@ + $(Q)$(CXX) $(CFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ # Benchmark runner $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 65fe318d1e71..3353778c30f8 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -193,3 +193,12 @@ Without it, the error from compiling bpf selftests looks like: libbpf: failed to find BTF for extern 'tcp_slow_start' [25] section: -2 __ https://reviews.llvm.org/D93563 + +Clang dependencies for static linking tests +=========================================== + +linked_vars, linked_maps, and linked_funcs tests depend on `Clang fix`__ to +generate valid BTF information for weak variables. Please make sure you use +Clang that contains the fix. + +__ https://reviews.llvm.org/D100362 diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 37e1f303fc11..5192305159ec 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -44,3 +44,5 @@ CONFIG_SECURITYFS=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_READ_POLICY=y CONFIG_BLK_DEV_LOOP=y +CONFIG_FUNCTION_TRACER=y +CONFIG_DYNAMIC_FTRACE=y diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 74c45d557a2b..2d3590cfb5e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -147,6 +147,7 @@ static void test_task_stack(void) return; do_dummy_read(skel->progs.dump_task_stack); + do_dummy_read(skel->progs.get_task_user_stacks); bpf_iter_task_stack__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c index 5c0448910426..63990842d20f 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c @@ -58,42 +58,73 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, test_cb cb) { struct bpf_object *obj = NULL, *tgt_obj; + __u32 retval, tgt_prog_id, info_len; + struct bpf_prog_info prog_info = {}; struct bpf_program **prog = NULL; struct bpf_link **link = NULL; - __u32 duration = 0, retval; int err, tgt_fd, i; + struct btf *btf; err = bpf_prog_load(target_obj_file, BPF_PROG_TYPE_UNSPEC, &tgt_obj, &tgt_fd); - if (CHECK(err, "tgt_prog_load", "file %s err %d errno %d\n", - target_obj_file, err, errno)) + if (!ASSERT_OK(err, "tgt_prog_load")) return; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, .attach_prog_fd = tgt_fd, ); + info_len = sizeof(prog_info); + err = bpf_obj_get_info_by_fd(tgt_fd, &prog_info, &info_len); + if (!ASSERT_OK(err, "tgt_fd_get_info")) + goto close_prog; + + tgt_prog_id = prog_info.id; + btf = bpf_object__btf(tgt_obj); + link = calloc(sizeof(struct bpf_link *), prog_cnt); + if (!ASSERT_OK_PTR(link, "link_ptr")) + goto close_prog; + prog = calloc(sizeof(struct bpf_program *), prog_cnt); - if (CHECK(!link || !prog, "alloc_memory", "failed to alloc memory")) + if (!ASSERT_OK_PTR(prog, "prog_ptr")) goto close_prog; obj = bpf_object__open_file(obj_file, &opts); - if (CHECK(IS_ERR_OR_NULL(obj), "obj_open", - "failed to open %s: %ld\n", obj_file, - PTR_ERR(obj))) + if (!ASSERT_OK_PTR(obj, "obj_open")) goto close_prog; err = bpf_object__load(obj); - if (CHECK(err, "obj_load", "err %d\n", err)) + if (!ASSERT_OK(err, "obj_load")) goto close_prog; for (i = 0; i < prog_cnt; i++) { + struct bpf_link_info link_info; + char *tgt_name; + __s32 btf_id; + + tgt_name = strstr(prog_name[i], "/"); + if (!ASSERT_OK_PTR(tgt_name, "tgt_name")) + goto close_prog; + btf_id = btf__find_by_name_kind(btf, tgt_name + 1, BTF_KIND_FUNC); + prog[i] = bpf_object__find_program_by_title(obj, prog_name[i]); - if (CHECK(!prog[i], "find_prog", "prog %s not found\n", prog_name[i])) + if (!ASSERT_OK_PTR(prog[i], prog_name[i])) goto close_prog; + link[i] = bpf_program__attach_trace(prog[i]); - if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n")) + if (!ASSERT_OK_PTR(link[i], "attach_trace")) goto close_prog; + + info_len = sizeof(link_info); + memset(&link_info, 0, sizeof(link_info)); + err = bpf_obj_get_info_by_fd(bpf_link__fd(link[i]), + &link_info, &info_len); + ASSERT_OK(err, "link_fd_get_info"); + ASSERT_EQ(link_info.tracing.attach_type, + bpf_program__get_expected_attach_type(prog[i]), + "link_attach_type"); + ASSERT_EQ(link_info.tracing.target_obj_id, tgt_prog_id, "link_tgt_obj_id"); + ASSERT_EQ(link_info.tracing.target_btf_id, btf_id, "link_tgt_btf_id"); } if (cb) { @@ -106,10 +137,9 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, goto close_prog; err = bpf_prog_test_run(tgt_fd, 1, &pkt_v6, sizeof(pkt_v6), - NULL, NULL, &retval, &duration); - CHECK(err || retval, "ipv6", - "err %d errno %d retval %d duration %d\n", - err, errno, retval, duration); + NULL, NULL, &retval, NULL); + ASSERT_OK(err, "prog_run"); + ASSERT_EQ(retval, 0, "prog_run_ret"); if (check_data_map(obj, prog_cnt, false)) goto close_prog; diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c index 6c4d42a2386f..ccc7e8a34ab6 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c @@ -39,7 +39,7 @@ void test_fexit_sleep(void) goto cleanup; cpid = clone(do_sleep, child_stack + STACK_SIZE, CLONE_FILES | SIGCHLD, fexit_skel); - if (CHECK(cpid == -1, "clone", strerror(errno))) + if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno))) goto cleanup; /* wait until first sys_nanosleep ends and second sys_nanosleep starts */ @@ -65,7 +65,7 @@ void test_fexit_sleep(void) /* kill the thread to unwind sys_nanosleep stack through the trampoline */ kill(cpid, 9); - if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", strerror(errno))) + if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", "%s\n", strerror(errno))) goto cleanup; if (CHECK(WEXITSTATUS(wstatus) != 0, "exitstatus", "failed")) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/linked_funcs.c b/tools/testing/selftests/bpf/prog_tests/linked_funcs.c new file mode 100644 index 000000000000..e9916f2817ec --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/linked_funcs.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <test_progs.h> +#include <sys/syscall.h> +#include "linked_funcs.skel.h" + +void test_linked_funcs(void) +{ + int err; + struct linked_funcs *skel; + + skel = linked_funcs__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->rodata->my_tid = syscall(SYS_gettid); + skel->bss->syscall_id = SYS_getpgid; + + err = linked_funcs__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = linked_funcs__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger */ + syscall(SYS_getpgid); + + ASSERT_EQ(skel->bss->output_val1, 2000 + 2000, "output_val1"); + ASSERT_EQ(skel->bss->output_ctx1, SYS_getpgid, "output_ctx1"); + ASSERT_EQ(skel->bss->output_weak1, 42, "output_weak1"); + + ASSERT_EQ(skel->bss->output_val2, 2 * 1000 + 2 * (2 * 1000), "output_val2"); + ASSERT_EQ(skel->bss->output_ctx2, SYS_getpgid, "output_ctx2"); + /* output_weak2 should never be updated */ + ASSERT_EQ(skel->bss->output_weak2, 0, "output_weak2"); + +cleanup: + linked_funcs__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/linked_maps.c b/tools/testing/selftests/bpf/prog_tests/linked_maps.c new file mode 100644 index 000000000000..85dcaaaf2775 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/linked_maps.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <test_progs.h> +#include <sys/syscall.h> +#include "linked_maps.skel.h" + +void test_linked_maps(void) +{ + int err; + struct linked_maps *skel; + + skel = linked_maps__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + err = linked_maps__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger */ + syscall(SYS_getpgid); + + ASSERT_EQ(skel->bss->output_first1, 2000, "output_first1"); + ASSERT_EQ(skel->bss->output_second1, 2, "output_second1"); + ASSERT_EQ(skel->bss->output_weak1, 2, "output_weak1"); + +cleanup: + linked_maps__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/linked_vars.c b/tools/testing/selftests/bpf/prog_tests/linked_vars.c new file mode 100644 index 000000000000..267166abe4c1 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/linked_vars.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <test_progs.h> +#include <sys/syscall.h> +#include "linked_vars.skel.h" + +void test_linked_vars(void) +{ + int err; + struct linked_vars *skel; + + skel = linked_vars__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->bss->input_bss1 = 1000; + skel->bss->input_bss2 = 2000; + skel->bss->input_bss_weak = 3000; + + err = linked_vars__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = linked_vars__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger */ + syscall(SYS_getpgid); + + ASSERT_EQ(skel->bss->output_bss1, 1000 + 2000 + 3000, "output_bss1"); + ASSERT_EQ(skel->bss->output_bss2, 1000 + 2000 + 3000, "output_bss2"); + /* 10 comes from "winner" input_data_weak in first obj file */ + ASSERT_EQ(skel->bss->output_data1, 1 + 2 + 10, "output_bss1"); + ASSERT_EQ(skel->bss->output_data2, 1 + 2 + 10, "output_bss2"); + /* 100 comes from "winner" input_rodata_weak in first obj file */ + ASSERT_EQ(skel->bss->output_rodata1, 11 + 22 + 100, "output_weak1"); + ASSERT_EQ(skel->bss->output_rodata2, 11 + 22 + 100, "output_weak2"); + +cleanup: + linked_vars__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/map_ptr.c b/tools/testing/selftests/bpf/prog_tests/map_ptr.c index c230a573c373..4972f92205c7 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_ptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_ptr.c @@ -12,11 +12,22 @@ void test_map_ptr(void) __u32 duration = 0, retval; char buf[128]; int err; + int page_size = getpagesize(); - skel = map_ptr_kern__open_and_load(); - if (CHECK(!skel, "skel_open_load", "open_load failed\n")) + skel = map_ptr_kern__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) return; + err = bpf_map__set_max_entries(skel->maps.m_ringbuf, page_size); + if (!ASSERT_OK(err, "bpf_map__set_max_entries")) + goto cleanup; + + err = map_ptr_kern__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + skel->bss->page_size = page_size; + err = bpf_prog_test_run(bpf_program__fd(skel->progs.cg_skb), 1, &pkt_v4, sizeof(pkt_v4), buf, NULL, &retval, NULL); diff --git a/tools/testing/selftests/bpf/prog_tests/mmap.c b/tools/testing/selftests/bpf/prog_tests/mmap.c index 9c3c5c0f068f..37b002ca1167 100644 --- a/tools/testing/selftests/bpf/prog_tests/mmap.c +++ b/tools/testing/selftests/bpf/prog_tests/mmap.c @@ -29,22 +29,36 @@ void test_mmap(void) struct test_mmap *skel; __u64 val = 0; - skel = test_mmap__open_and_load(); - if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) + skel = test_mmap__open(); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; + err = bpf_map__set_max_entries(skel->maps.rdonly_map, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + /* at least 4 pages of data */ + err = bpf_map__set_max_entries(skel->maps.data_map, + 4 * (page_size / sizeof(u64))); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = test_mmap__load(skel); + if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) + goto cleanup; + bss_map = skel->maps.bss; data_map = skel->maps.data_map; data_map_fd = bpf_map__fd(data_map); rdmap_fd = bpf_map__fd(skel->maps.rdonly_map); - tmp1 = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, rdmap_fd, 0); + tmp1 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, rdmap_fd, 0); if (CHECK(tmp1 != MAP_FAILED, "rdonly_write_mmap", "unexpected success\n")) { - munmap(tmp1, 4096); + munmap(tmp1, page_size); goto cleanup; } /* now double-check if it's mmap()'able at all */ - tmp1 = mmap(NULL, 4096, PROT_READ, MAP_SHARED, rdmap_fd, 0); + tmp1 = mmap(NULL, page_size, PROT_READ, MAP_SHARED, rdmap_fd, 0); if (CHECK(tmp1 == MAP_FAILED, "rdonly_read_mmap", "failed: %d\n", errno)) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index 31a3114906e2..2535788e135f 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -68,10 +68,10 @@ static void test_ns_current_pid_tgid_new_ns(void) cpid = clone(test_current_pid_tgid, child_stack + STACK_SIZE, CLONE_NEWPID | SIGCHLD, NULL); - if (CHECK(cpid == -1, "clone", strerror(errno))) + if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno))) return; - if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", strerror(errno))) + if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", "%s\n", strerror(errno))) return; if (CHECK(WEXITSTATUS(wstatus) != 0, "newns_pidtgid", "failed")) diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index fddbc5db5d6a..de78617f6550 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -87,11 +87,20 @@ void test_ringbuf(void) pthread_t thread; long bg_ret = -1; int err, cnt; + int page_size = getpagesize(); - skel = test_ringbuf__open_and_load(); - if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + skel = test_ringbuf__open(); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; + err = bpf_map__set_max_entries(skel->maps.ringbuf, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = test_ringbuf__load(skel); + if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) + goto cleanup; + /* only trigger BPF program for current process */ skel->bss->pid = getpid(); @@ -110,9 +119,9 @@ void test_ringbuf(void) CHECK(skel->bss->avail_data != 3 * rec_sz, "err_avail_size", "exp %ld, got %ld\n", 3L * rec_sz, skel->bss->avail_data); - CHECK(skel->bss->ring_size != 4096, + CHECK(skel->bss->ring_size != page_size, "err_ring_size", "exp %ld, got %ld\n", - 4096L, skel->bss->ring_size); + (long)page_size, skel->bss->ring_size); CHECK(skel->bss->cons_pos != 0, "err_cons_pos", "exp %ld, got %ld\n", 0L, skel->bss->cons_pos); diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c index d37161e59bb2..cef63e703924 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c @@ -41,13 +41,42 @@ static int process_sample(void *ctx, void *data, size_t len) void test_ringbuf_multi(void) { struct test_ringbuf_multi *skel; - struct ring_buffer *ringbuf; + struct ring_buffer *ringbuf = NULL; int err; + int page_size = getpagesize(); + int proto_fd = -1; - skel = test_ringbuf_multi__open_and_load(); - if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + skel = test_ringbuf_multi__open(); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; + err = bpf_map__set_max_entries(skel->maps.ringbuf1, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = bpf_map__set_max_entries(skel->maps.ringbuf2, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = bpf_map__set_max_entries(bpf_map__inner_map(skel->maps.ringbuf_arr), page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + proto_fd = bpf_create_map(BPF_MAP_TYPE_RINGBUF, 0, 0, page_size, 0); + if (CHECK(proto_fd == -1, "bpf_create_map", "bpf_create_map failed\n")) + goto cleanup; + + err = bpf_map__set_inner_map_fd(skel->maps.ringbuf_hash, proto_fd); + if (CHECK(err != 0, "bpf_map__set_inner_map_fd", "bpf_map__set_inner_map_fd failed\n")) + goto cleanup; + + err = test_ringbuf_multi__load(skel); + if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) + goto cleanup; + + close(proto_fd); + proto_fd = -1; + /* only trigger BPF program for current process */ skel->bss->pid = getpid(); @@ -97,6 +126,8 @@ void test_ringbuf_multi(void) 2L, skel->bss->total); cleanup: + if (proto_fd >= 0) + close(proto_fd); ring_buffer__free(ringbuf); test_ringbuf_multi__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/snprintf.c b/tools/testing/selftests/bpf/prog_tests/snprintf.c new file mode 100644 index 000000000000..a958c22aec75 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/snprintf.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google LLC. */ + +#include <test_progs.h> +#include "test_snprintf.skel.h" +#include "test_snprintf_single.skel.h" + +#define EXP_NUM_OUT "-8 9 96 -424242 1337 DABBAD00" +#define EXP_NUM_RET sizeof(EXP_NUM_OUT) + +#define EXP_IP_OUT "127.000.000.001 0000:0000:0000:0000:0000:0000:0000:0001" +#define EXP_IP_RET sizeof(EXP_IP_OUT) + +/* The third specifier, %pB, depends on compiler inlining so don't check it */ +#define EXP_SYM_OUT "schedule schedule+0x0/" +#define MIN_SYM_RET sizeof(EXP_SYM_OUT) + +/* The third specifier, %p, is a hashed pointer which changes on every reboot */ +#define EXP_ADDR_OUT "0000000000000000 ffff00000add4e55 " +#define EXP_ADDR_RET sizeof(EXP_ADDR_OUT "unknownhashedptr") + +#define EXP_STR_OUT "str1 longstr" +#define EXP_STR_RET sizeof(EXP_STR_OUT) + +#define EXP_OVER_OUT "%over" +#define EXP_OVER_RET 10 + +#define EXP_PAD_OUT " 4 000" +#define EXP_PAD_RET 900007 + +#define EXP_NO_ARG_OUT "simple case" +#define EXP_NO_ARG_RET 12 + +#define EXP_NO_BUF_RET 29 + +void test_snprintf_positive(void) +{ + char exp_addr_out[] = EXP_ADDR_OUT; + char exp_sym_out[] = EXP_SYM_OUT; + struct test_snprintf *skel; + + skel = test_snprintf__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + if (!ASSERT_OK(test_snprintf__attach(skel), "skel_attach")) + goto cleanup; + + /* trigger tracepoint */ + usleep(1); + + ASSERT_STREQ(skel->bss->num_out, EXP_NUM_OUT, "num_out"); + ASSERT_EQ(skel->bss->num_ret, EXP_NUM_RET, "num_ret"); + + ASSERT_STREQ(skel->bss->ip_out, EXP_IP_OUT, "ip_out"); + ASSERT_EQ(skel->bss->ip_ret, EXP_IP_RET, "ip_ret"); + + ASSERT_OK(memcmp(skel->bss->sym_out, exp_sym_out, + sizeof(exp_sym_out) - 1), "sym_out"); + ASSERT_LT(MIN_SYM_RET, skel->bss->sym_ret, "sym_ret"); + + ASSERT_OK(memcmp(skel->bss->addr_out, exp_addr_out, + sizeof(exp_addr_out) - 1), "addr_out"); + ASSERT_EQ(skel->bss->addr_ret, EXP_ADDR_RET, "addr_ret"); + + ASSERT_STREQ(skel->bss->str_out, EXP_STR_OUT, "str_out"); + ASSERT_EQ(skel->bss->str_ret, EXP_STR_RET, "str_ret"); + + ASSERT_STREQ(skel->bss->over_out, EXP_OVER_OUT, "over_out"); + ASSERT_EQ(skel->bss->over_ret, EXP_OVER_RET, "over_ret"); + + ASSERT_STREQ(skel->bss->pad_out, EXP_PAD_OUT, "pad_out"); + ASSERT_EQ(skel->bss->pad_ret, EXP_PAD_RET, "pad_ret"); + + ASSERT_STREQ(skel->bss->noarg_out, EXP_NO_ARG_OUT, "no_arg_out"); + ASSERT_EQ(skel->bss->noarg_ret, EXP_NO_ARG_RET, "no_arg_ret"); + + ASSERT_EQ(skel->bss->nobuf_ret, EXP_NO_BUF_RET, "no_buf_ret"); + +cleanup: + test_snprintf__destroy(skel); +} + +#define min(a, b) ((a) < (b) ? (a) : (b)) + +/* Loads an eBPF object calling bpf_snprintf with up to 10 characters of fmt */ +static int load_single_snprintf(char *fmt) +{ + struct test_snprintf_single *skel; + int ret; + + skel = test_snprintf_single__open(); + if (!skel) + return -EINVAL; + + memcpy(skel->rodata->fmt, fmt, min(strlen(fmt) + 1, 10)); + + ret = test_snprintf_single__load(skel); + test_snprintf_single__destroy(skel); + + return ret; +} + +void test_snprintf_negative(void) +{ + ASSERT_OK(load_single_snprintf("valid %d"), "valid usage"); + + ASSERT_ERR(load_single_snprintf("0123456789"), "no terminating zero"); + ASSERT_ERR(load_single_snprintf("%d %d"), "too many specifiers"); + ASSERT_ERR(load_single_snprintf("%pi5"), "invalid specifier 1"); + ASSERT_ERR(load_single_snprintf("%a"), "invalid specifier 2"); + ASSERT_ERR(load_single_snprintf("%"), "invalid specifier 3"); + ASSERT_ERR(load_single_snprintf("%12345678"), "invalid specifier 4"); + ASSERT_ERR(load_single_snprintf("%--------"), "invalid specifier 5"); + ASSERT_ERR(load_single_snprintf("\x80"), "non ascii character"); + ASSERT_ERR(load_single_snprintf("\x1"), "non printable character"); +} + +void test_snprintf(void) +{ + if (test__start_subtest("snprintf_positive")) + test_snprintf_positive(); + if (test__start_subtest("snprintf_negative")) + test_snprintf_negative(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index d5b44b135c00..4b937e5dbaca 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -3,6 +3,7 @@ #include "cgroup_helpers.h" #include <linux/tcp.h> +#include "sockopt_sk.skel.h" #ifndef SOL_TCP #define SOL_TCP IPPROTO_TCP @@ -191,60 +192,30 @@ err: return -1; } -static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title) +static void run_test(int cgroup_fd) { - enum bpf_attach_type attach_type; - enum bpf_prog_type prog_type; - struct bpf_program *prog; - int err; + struct sockopt_sk *skel; - err = libbpf_prog_type_by_name(title, &prog_type, &attach_type); - if (err) { - log_err("Failed to deduct types for %s BPF program", title); - return -1; - } + skel = sockopt_sk__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto cleanup; - prog = bpf_object__find_program_by_title(obj, title); - if (!prog) { - log_err("Failed to find %s BPF program", title); - return -1; - } - - err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, - attach_type, 0); - if (err) { - log_err("Failed to attach %s BPF program", title); - return -1; - } - - return 0; -} - -static void run_test(int cgroup_fd) -{ - struct bpf_prog_load_attr attr = { - .file = "./sockopt_sk.o", - }; - struct bpf_object *obj; - int ignored; - int err; - - err = bpf_prog_load_xattr(&attr, &obj, &ignored); - if (CHECK_FAIL(err)) - return; + skel->bss->page_size = getpagesize(); - err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt"); - if (CHECK_FAIL(err)) - goto close_bpf_object; + skel->links._setsockopt = + bpf_program__attach_cgroup(skel->progs._setsockopt, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._setsockopt, "setsockopt_link")) + goto cleanup; - err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt"); - if (CHECK_FAIL(err)) - goto close_bpf_object; + skel->links._getsockopt = + bpf_program__attach_cgroup(skel->progs._getsockopt, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._getsockopt, "getsockopt_link")) + goto cleanup; - CHECK_FAIL(getsetsockopt()); + ASSERT_OK(getsetsockopt(), "getsetsockopt"); -close_bpf_object: - bpf_object__close(obj); +cleanup: + sockopt_sk__destroy(skel); } void test_sockopt_sk(void) diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c index 50e59a2e142e..43c36f5f7649 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c @@ -35,3 +35,30 @@ int dump_task_stack(struct bpf_iter__task *ctx) return 0; } + +SEC("iter/task") +int get_task_user_stacks(struct bpf_iter__task *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + uint64_t buf_sz = 0; + int64_t res; + + if (task == (void *)0) + return 0; + + res = bpf_get_task_stack(task, entries, + MAX_STACK_TRACE_DEPTH * SIZE_OF_ULONG, BPF_F_USER_STACK); + if (res <= 0) + return 0; + + buf_sz += res; + + /* If the verifier doesn't refine bpf_get_task_stack res, and instead + * assumes res is entirely unknown, this program will fail to load as + * the verifier will believe that max buf_sz value allows reading + * past the end of entries in bpf_seq_write call + */ + bpf_seq_write(seq, &entries, buf_sz); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/linked_funcs1.c b/tools/testing/selftests/bpf/progs/linked_funcs1.c new file mode 100644 index 000000000000..b964ec1390c2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_funcs1.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +/* weak and shared between two files */ +const volatile int my_tid __weak; +long syscall_id __weak; + +int output_val1; +int output_ctx1; +int output_weak1; + +/* same "subprog" name in all files, but it's ok because they all are static */ +static __noinline int subprog(int x) +{ + /* but different formula */ + return x * 1; +} + +/* Global functions can't be void */ +int set_output_val1(int x) +{ + output_val1 = x + subprog(x); + return x; +} + +/* This function can't be verified as global, as it assumes raw_tp/sys_enter + * context and accesses syscall id (second argument). So we mark it as + * __hidden, so that libbpf will mark it as static in the final object file, + * right before verifying it in the kernel. + * + * But we don't mark it as __hidden here, rather at extern site. __hidden is + * "contaminating" visibility, so it will get propagated from either extern or + * actual definition (including from the losing __weak definition). + */ +void set_output_ctx1(__u64 *ctx) +{ + output_ctx1 = ctx[1]; /* long id, same as in BPF_PROG below */ +} + +/* this weak instance should win because it's the first one */ +__weak int set_output_weak(int x) +{ + output_weak1 = x; + return x; +} + +extern int set_output_val2(int x); + +/* here we'll force set_output_ctx2() to be __hidden in the final obj file */ +__hidden extern void set_output_ctx2(__u64 *ctx); + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler1, struct pt_regs *regs, long id) +{ + if (my_tid != (u32)bpf_get_current_pid_tgid() || id != syscall_id) + return 0; + + set_output_val2(1000); + set_output_ctx2(ctx); /* ctx definition is hidden in BPF_PROG macro */ + + /* keep input value the same across both files to avoid dependency on + * handler call order; differentiate by output_weak1 vs output_weak2. + */ + set_output_weak(42); + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_funcs2.c b/tools/testing/selftests/bpf/progs/linked_funcs2.c new file mode 100644 index 000000000000..575e958e60b7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_funcs2.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +/* weak and shared between both files */ +const volatile int my_tid __weak; +long syscall_id __weak; + +int output_val2; +int output_ctx2; +int output_weak2; /* should stay zero */ + +/* same "subprog" name in all files, but it's ok because they all are static */ +static __noinline int subprog(int x) +{ + /* but different formula */ + return x * 2; +} + +/* Global functions can't be void */ +int set_output_val2(int x) +{ + output_val2 = 2 * x + 2 * subprog(x); + return 2 * x; +} + +/* This function can't be verified as global, as it assumes raw_tp/sys_enter + * context and accesses syscall id (second argument). So we mark it as + * __hidden, so that libbpf will mark it as static in the final object file, + * right before verifying it in the kernel. + * + * But we don't mark it as __hidden here, rather at extern site. __hidden is + * "contaminating" visibility, so it will get propagated from either extern or + * actual definition (including from the losing __weak definition). + */ +void set_output_ctx2(__u64 *ctx) +{ + output_ctx2 = ctx[1]; /* long id, same as in BPF_PROG below */ +} + +/* this weak instance should lose, because it will be processed second */ +__weak int set_output_weak(int x) +{ + output_weak2 = x; + return 2 * x; +} + +extern int set_output_val1(int x); + +/* here we'll force set_output_ctx1() to be __hidden in the final obj file */ +__hidden extern void set_output_ctx1(__u64 *ctx); + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler2, struct pt_regs *regs, long id) +{ + if (my_tid != (u32)bpf_get_current_pid_tgid() || id != syscall_id) + return 0; + + set_output_val1(2000); + set_output_ctx1(ctx); /* ctx definition is hidden in BPF_PROG macro */ + + /* keep input value the same across both files to avoid dependency on + * handler call order; differentiate by output_weak1 vs output_weak2. + */ + set_output_weak(42); + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_maps1.c b/tools/testing/selftests/bpf/progs/linked_maps1.c new file mode 100644 index 000000000000..52291515cc72 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_maps1.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +struct my_key { long x; }; +struct my_value { long x; }; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct my_key); + __type(value, struct my_value); + __uint(max_entries, 16); +} map1 SEC(".maps"); + + /* Matches map2 definition in linked_maps2.c. Order of the attributes doesn't + * matter. + */ +typedef struct { + __uint(max_entries, 8); + __type(key, int); + __type(value, int); + __uint(type, BPF_MAP_TYPE_ARRAY); +} map2_t; + +extern map2_t map2 SEC(".maps"); + +/* This should be the winning map definition, but we have no way of verifying, + * so we just make sure that it links and works without errors + */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 16); +} map_weak __weak SEC(".maps"); + +int output_first1; +int output_second1; +int output_weak1; + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler_enter1) +{ + /* update values with key = 1 */ + int key = 1, val = 1; + struct my_key key_struct = { .x = 1 }; + struct my_value val_struct = { .x = 1000 }; + + bpf_map_update_elem(&map1, &key_struct, &val_struct, 0); + bpf_map_update_elem(&map2, &key, &val, 0); + bpf_map_update_elem(&map_weak, &key, &val, 0); + + return 0; +} + +SEC("raw_tp/sys_exit") +int BPF_PROG(handler_exit1) +{ + /* lookup values with key = 2, set in another file */ + int key = 2, *val; + struct my_key key_struct = { .x = 2 }; + struct my_value *value_struct; + + value_struct = bpf_map_lookup_elem(&map1, &key_struct); + if (value_struct) + output_first1 = value_struct->x; + + val = bpf_map_lookup_elem(&map2, &key); + if (val) + output_second1 = *val; + + val = bpf_map_lookup_elem(&map_weak, &key); + if (val) + output_weak1 = *val; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_maps2.c b/tools/testing/selftests/bpf/progs/linked_maps2.c new file mode 100644 index 000000000000..0693687474ed --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_maps2.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +/* modifiers and typedefs are ignored when comparing key/value types */ +typedef struct my_key { long x; } key_type; +typedef struct my_value { long x; } value_type; + +extern struct { + __uint(max_entries, 16); + __type(key, key_type); + __type(value, value_type); + __uint(type, BPF_MAP_TYPE_HASH); +} map1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 8); +} map2 SEC(".maps"); + +/* this definition will lose, but it has to exactly match the winner */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 16); +} map_weak __weak SEC(".maps"); + +int output_first2; +int output_second2; +int output_weak2; + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler_enter2) +{ + /* update values with key = 2 */ + int key = 2, val = 2; + key_type key_struct = { .x = 2 }; + value_type val_struct = { .x = 2000 }; + + bpf_map_update_elem(&map1, &key_struct, &val_struct, 0); + bpf_map_update_elem(&map2, &key, &val, 0); + bpf_map_update_elem(&map_weak, &key, &val, 0); + + return 0; +} + +SEC("raw_tp/sys_exit") +int BPF_PROG(handler_exit2) +{ + /* lookup values with key = 1, set in another file */ + int key = 1, *val; + key_type key_struct = { .x = 1 }; + value_type *value_struct; + + value_struct = bpf_map_lookup_elem(&map1, &key_struct); + if (value_struct) + output_first2 = value_struct->x; + + val = bpf_map_lookup_elem(&map2, &key); + if (val) + output_second2 = *val; + + val = bpf_map_lookup_elem(&map_weak, &key); + if (val) + output_weak2 = *val; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_vars1.c b/tools/testing/selftests/bpf/progs/linked_vars1.c new file mode 100644 index 000000000000..ef9e9d0bb0ca --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_vars1.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +extern int LINUX_KERNEL_VERSION __kconfig; +/* this weak extern will be strict due to the other file's strong extern */ +extern bool CONFIG_BPF_SYSCALL __kconfig __weak; +extern const void bpf_link_fops __ksym __weak; + +int input_bss1; +int input_data1 = 1; +const volatile int input_rodata1 = 11; + +int input_bss_weak __weak; +/* these two definitions should win */ +int input_data_weak __weak = 10; +const volatile int input_rodata_weak __weak = 100; + +extern int input_bss2; +extern int input_data2; +extern const int input_rodata2; + +int output_bss1; +int output_data1; +int output_rodata1; + +long output_sink1; + +static __noinline int get_bss_res(void) +{ + /* just make sure all the relocations work against .text as well */ + return input_bss1 + input_bss2 + input_bss_weak; +} + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler1) +{ + output_bss1 = get_bss_res(); + output_data1 = input_data1 + input_data2 + input_data_weak; + output_rodata1 = input_rodata1 + input_rodata2 + input_rodata_weak; + + /* make sure we actually use above special externs, otherwise compiler + * will optimize them out + */ + output_sink1 = LINUX_KERNEL_VERSION + + CONFIG_BPF_SYSCALL + + (long)&bpf_link_fops; + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_vars2.c b/tools/testing/selftests/bpf/progs/linked_vars2.c new file mode 100644 index 000000000000..e4f5bd388a3c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_vars2.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +extern int LINUX_KERNEL_VERSION __kconfig; +/* when an extern is defined as both strong and weak, resulting symbol will be strong */ +extern bool CONFIG_BPF_SYSCALL __kconfig; +extern const void __start_BTF __ksym; + +int input_bss2; +int input_data2 = 2; +const volatile int input_rodata2 = 22; + +int input_bss_weak __weak; +/* these two weak variables should lose */ +int input_data_weak __weak = 20; +const volatile int input_rodata_weak __weak = 200; + +extern int input_bss1; +extern int input_data1; +extern const int input_rodata1; + +int output_bss2; +int output_data2; +int output_rodata2; + +int output_sink2; + +static __noinline int get_data_res(void) +{ + /* just make sure all the relocations work against .text as well */ + return input_data1 + input_data2 + input_data_weak; +} + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler2) +{ + output_bss2 = input_bss1 + input_bss2 + input_bss_weak; + output_data2 = get_data_res(); + output_rodata2 = input_rodata1 + input_rodata2 + input_rodata_weak; + + /* make sure we actually use above special externs, otherwise compiler + * will optimize them out + */ + output_sink2 = LINUX_KERNEL_VERSION + + CONFIG_BPF_SYSCALL + + (long)&__start_BTF; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c index d8850bc6a9f1..d1d304c980f0 100644 --- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c +++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c @@ -12,6 +12,7 @@ _Static_assert(MAX_ENTRIES < LOOP_BOUND, "MAX_ENTRIES must be < LOOP_BOUND"); enum bpf_map_type g_map_type = BPF_MAP_TYPE_UNSPEC; __u32 g_line = 0; +int page_size = 0; /* userspace should set it */ #define VERIFY_TYPE(type, func) ({ \ g_map_type = type; \ @@ -635,7 +636,6 @@ struct bpf_ringbuf_map { struct { __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 1 << 12); } m_ringbuf SEC(".maps"); static inline int check_ringbuf(void) @@ -643,7 +643,7 @@ static inline int check_ringbuf(void) struct bpf_ringbuf_map *ringbuf = (struct bpf_ringbuf_map *)&m_ringbuf; struct bpf_map *map = (struct bpf_map *)&m_ringbuf; - VERIFY(check(&ringbuf->map, map, 0, 0, 1 << 12)); + VERIFY(check(&ringbuf->map, map, 0, 0, page_size)); return 1; } diff --git a/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c b/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c index fdb4bf4408fa..eeaf6e75c9a2 100644 --- a/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c +++ b/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c @@ -8,18 +8,6 @@ int _version SEC("version") = 1; SEC("sk_msg1") int bpf_prog1(struct sk_msg_md *msg) { - void *data_end = (void *)(long) msg->data_end; - void *data = (void *)(long) msg->data; - - char *d; - - if (data + 8 > data_end) - return SK_DROP; - - bpf_printk("data length %i\n", (__u64)msg->data_end - (__u64)msg->data); - d = (char *)data; - bpf_printk("hello sendmsg hook %i %i\n", d[0], d[1]); - return SK_PASS; } diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index d3597f81e6e9..8acdb99b5959 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -6,11 +6,8 @@ #include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; -__u32 _version SEC("version") = 1; -#ifndef PAGE_SIZE -#define PAGE_SIZE 4096 -#endif +int page_size = 0; /* userspace should set it */ #ifndef SOL_TCP #define SOL_TCP IPPROTO_TCP @@ -90,7 +87,7 @@ int _getsockopt(struct bpf_sockopt *ctx) * program can only see the first PAGE_SIZE * bytes of data. */ - if (optval_end - optval != PAGE_SIZE) + if (optval_end - optval != page_size) return 0; /* EPERM, unexpected data size */ return 1; @@ -161,7 +158,7 @@ int _setsockopt(struct bpf_sockopt *ctx) if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { /* Original optlen is larger than PAGE_SIZE. */ - if (ctx->optlen != PAGE_SIZE * 2) + if (ctx->optlen != page_size * 2) return 0; /* EPERM, unexpected data size */ if (optval + 1 > optval_end) @@ -175,7 +172,7 @@ int _setsockopt(struct bpf_sockopt *ctx) * program can only see the first PAGE_SIZE * bytes of data. */ - if (optval_end - optval != PAGE_SIZE) + if (optval_end - optval != page_size) return 0; /* EPERM, unexpected data size */ return 1; diff --git a/tools/testing/selftests/bpf/progs/test_mmap.c b/tools/testing/selftests/bpf/progs/test_mmap.c index 4eb42cff5fe9..5a5cc19a15bf 100644 --- a/tools/testing/selftests/bpf/progs/test_mmap.c +++ b/tools/testing/selftests/bpf/progs/test_mmap.c @@ -9,7 +9,6 @@ char _license[] SEC("license") = "GPL"; struct { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 4096); __uint(map_flags, BPF_F_MMAPABLE | BPF_F_RDONLY_PROG); __type(key, __u32); __type(value, char); @@ -17,7 +16,6 @@ struct { struct { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 512 * 4); /* at least 4 pages of data */ __uint(map_flags, BPF_F_MMAPABLE); __type(key, __u32); __type(value, __u64); diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf.c b/tools/testing/selftests/bpf/progs/test_ringbuf.c index 8ba9959b036b..6b3f288b7c63 100644 --- a/tools/testing/selftests/bpf/progs/test_ringbuf.c +++ b/tools/testing/selftests/bpf/progs/test_ringbuf.c @@ -15,7 +15,6 @@ struct sample { struct { __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 1 << 12); } ringbuf SEC(".maps"); /* inputs */ diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c index edf3b6953533..197b86546dca 100644 --- a/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c @@ -15,7 +15,6 @@ struct sample { struct ringbuf_map { __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 1 << 12); } ringbuf1 SEC(".maps"), ringbuf2 SEC(".maps"); @@ -31,6 +30,17 @@ struct { }, }; +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 1); + __type(key, int); + __array(values, struct ringbuf_map); +} ringbuf_hash SEC(".maps") = { + .values = { + [0] = &ringbuf1, + }, +}; + /* inputs */ int pid = 0; int target_ring = 0; diff --git a/tools/testing/selftests/bpf/progs/test_snprintf.c b/tools/testing/selftests/bpf/progs/test_snprintf.c new file mode 100644 index 000000000000..951a0301c553 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_snprintf.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google LLC. */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char num_out[64] = {}; +long num_ret = 0; + +char ip_out[64] = {}; +long ip_ret = 0; + +char sym_out[64] = {}; +long sym_ret = 0; + +char addr_out[64] = {}; +long addr_ret = 0; + +char str_out[64] = {}; +long str_ret = 0; + +char over_out[6] = {}; +long over_ret = 0; + +char pad_out[10] = {}; +long pad_ret = 0; + +char noarg_out[64] = {}; +long noarg_ret = 0; + +long nobuf_ret = 0; + +extern const void schedule __ksym; + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + /* Convenient values to pretty-print */ + const __u8 ex_ipv4[] = {127, 0, 0, 1}; + const __u8 ex_ipv6[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + static const char str1[] = "str1"; + static const char longstr[] = "longstr"; + + /* Integer types */ + num_ret = BPF_SNPRINTF(num_out, sizeof(num_out), + "%d %u %x %li %llu %lX", + -8, 9, 150, -424242, 1337, 0xDABBAD00); + /* IP addresses */ + ip_ret = BPF_SNPRINTF(ip_out, sizeof(ip_out), "%pi4 %pI6", + &ex_ipv4, &ex_ipv6); + /* Symbol lookup formatting */ + sym_ret = BPF_SNPRINTF(sym_out, sizeof(sym_out), "%ps %pS %pB", + &schedule, &schedule, &schedule); + /* Kernel pointers */ + addr_ret = BPF_SNPRINTF(addr_out, sizeof(addr_out), "%pK %px %p", + 0, 0xFFFF00000ADD4E55, 0xFFFF00000ADD4E55); + /* Strings embedding */ + str_ret = BPF_SNPRINTF(str_out, sizeof(str_out), "%s %+05s", + str1, longstr); + /* Overflow */ + over_ret = BPF_SNPRINTF(over_out, sizeof(over_out), "%%overflow"); + /* Padding of fixed width numbers */ + pad_ret = BPF_SNPRINTF(pad_out, sizeof(pad_out), "%5d %0900000X", 4, 4); + /* No args */ + noarg_ret = BPF_SNPRINTF(noarg_out, sizeof(noarg_out), "simple case"); + /* No buffer */ + nobuf_ret = BPF_SNPRINTF(NULL, 0, "only interested in length %d", 60); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_snprintf_single.c b/tools/testing/selftests/bpf/progs/test_snprintf_single.c new file mode 100644 index 000000000000..402adaf344f9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_snprintf_single.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google LLC. */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +/* The format string is filled from the userspace such that loading fails */ +static const char fmt[10]; + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + unsigned long long arg = 42; + + bpf_snprintf(NULL, 0, fmt, &arg, sizeof(arg)); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index ba6eadfec565..e7b673117436 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -396,7 +396,7 @@ int _ip6vxlan_get_tunnel(struct __sk_buff *skb) SEC("geneve_set_tunnel") int _geneve_set_tunnel(struct __sk_buff *skb) { - int ret, ret2; + int ret; struct bpf_tunnel_key key; struct geneve_opt gopt; diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index e87c8546230e..ee7e3b45182a 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -210,7 +210,7 @@ extern int test__join_cgroup(const char *path); #define ASSERT_ERR_PTR(ptr, name) ({ \ static int duration = 0; \ const void *___res = (ptr); \ - bool ___ok = IS_ERR(___res) \ + bool ___ok = IS_ERR(___res); \ CHECK(!___ok, (name), "unexpected pointer: %p\n", ___res); \ ___ok; \ }) diff --git a/tools/testing/selftests/bpf/verifier/bpf_get_stack.c b/tools/testing/selftests/bpf/verifier/bpf_get_stack.c index 69b048cf46d9..3e024c891178 100644 --- a/tools/testing/selftests/bpf/verifier/bpf_get_stack.c +++ b/tools/testing/selftests/bpf/verifier/bpf_get_stack.c @@ -42,3 +42,46 @@ .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, +{ + "bpf_get_task_stack return R0 range is refined", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_6, 0), // ctx->meta->seq + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_1, 8), // ctx->task + BPF_LD_MAP_FD(BPF_REG_1, 0), // fixup_map_array_48b + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), // keep buf for seq_write + BPF_MOV64_IMM(BPF_REG_3, 48), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_EMIT_CALL(BPF_FUNC_get_task_stack), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_9), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_seq_write), + + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACING, + .expected_attach_type = BPF_TRACE_ITER, + .kfunc = "task", + .runs = -1, // Don't run, just load + .fixup_map_array_48b = { 3 }, +}, diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index a5ce26d548e4..9a41d8bb9ff1 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -1,6 +1,10 @@ # This mimics the top-level Makefile. We do it explicitly here so that this # Makefile can operate with or without the kbuild infrastructure. +ifneq ($(LLVM),) +CC := clang +else CC := $(CROSS_COMPILE)gcc +endif ifeq (0,$(MAKELEVEL)) ifeq ($(OUTPUT),) |