diff options
193 files changed, 6005 insertions, 2830 deletions
diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst index 2ed89abbf9a4..253496af8fef 100644 --- a/Documentation/bpf/bpf_devel_QA.rst +++ b/Documentation/bpf/bpf_devel_QA.rst @@ -29,7 +29,7 @@ list: This may also include issues related to XDP, BPF tracing, etc. Given netdev has a high volume of traffic, please also add the BPF -maintainers to Cc (from kernel MAINTAINERS_ file): +maintainers to Cc (from kernel ``MAINTAINERS`` file): * Alexei Starovoitov <[email protected]> * Daniel Borkmann <[email protected]> @@ -234,11 +234,11 @@ be subject to change. Q: samples/bpf preference vs selftests? --------------------------------------- -Q: When should I add code to `samples/bpf/`_ and when to BPF kernel -selftests_ ? +Q: When should I add code to ``samples/bpf/`` and when to BPF kernel +selftests_? A: In general, we prefer additions to BPF kernel selftests_ rather than -`samples/bpf/`_. The rationale is very simple: kernel selftests are +``samples/bpf/``. The rationale is very simple: kernel selftests are regularly run by various bots to test for kernel regressions. The more test cases we add to BPF selftests, the better the coverage @@ -246,9 +246,9 @@ and the less likely it is that those could accidentally break. It is not that BPF kernel selftests cannot demo how a specific feature can be used. -That said, `samples/bpf/`_ may be a good place for people to get started, +That said, ``samples/bpf/`` may be a good place for people to get started, so it might be advisable that simple demos of features could go into -`samples/bpf/`_, but advanced functional and corner-case testing rather +``samples/bpf/``, but advanced functional and corner-case testing rather into kernel selftests. If your sample looks like a test case, then go for BPF kernel selftests @@ -449,6 +449,19 @@ from source at https://github.com/acmel/dwarves +pahole starts to use libbpf definitions and APIs since v1.13 after the +commit 21507cd3e97b ("pahole: add libbpf as submodule under lib/bpf"). +It works well with the git repository because the libbpf submodule will +use "git submodule update --init --recursive" to update. + +Unfortunately, the default github release source code does not contain +libbpf submodule source code and this will cause build issues, the tarball +from https://git.kernel.org/pub/scm/devel/pahole/pahole.git/ is same with +github, you can get the source tarball with corresponding libbpf submodule +codes from + +https://fedorapeople.org/~acme/dwarves + Some distros have pahole version 1.16 packaged already, e.g. Fedora, Gentoo. @@ -645,10 +658,9 @@ when: .. Links .. _Documentation/process/: https://www.kernel.org/doc/html/latest/process/ -.. _MAINTAINERS: ../../MAINTAINERS .. _netdev-FAQ: ../networking/netdev-FAQ.rst -.. _samples/bpf/: ../../samples/bpf/ -.. _selftests: ../../tools/testing/selftests/bpf/ +.. _selftests: + https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/ .. _Documentation/dev-tools/kselftest.rst: https://www.kernel.org/doc/html/latest/dev-tools/kselftest.html .. _Documentation/bpf/btf.rst: btf.rst diff --git a/Documentation/devicetree/bindings/net/intel,ixp4xx-ethernet.yaml b/Documentation/devicetree/bindings/net/intel,ixp4xx-ethernet.yaml new file mode 100644 index 000000000000..f2e91d1bf7d7 --- /dev/null +++ b/Documentation/devicetree/bindings/net/intel,ixp4xx-ethernet.yaml @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +# Copyright 2018 Linaro Ltd. +%YAML 1.2 +--- +$id: "http://devicetree.org/schemas/net/intel,ixp4xx-ethernet.yaml#" +$schema: "http://devicetree.org/meta-schemas/core.yaml#" + +title: Intel IXP4xx ethernet + +allOf: + - $ref: "ethernet-controller.yaml#" + +maintainers: + - Linus Walleij <[email protected]> + +description: | + The Intel IXP4xx ethernet makes use of the IXP4xx NPE (Network + Processing Engine) and the IXP4xx Queue Manager to process + the ethernet frames. It can optionally contain an MDIO bus to + talk to PHYs. + +properties: + compatible: + const: intel,ixp4xx-ethernet + + reg: + maxItems: 1 + description: Ethernet MMIO address range + + queue-rx: + $ref: '/schemas/types.yaml#/definitions/phandle-array' + maxItems: 1 + description: phandle to the RX queue on the NPE + + queue-txready: + $ref: '/schemas/types.yaml#/definitions/phandle-array' + maxItems: 1 + description: phandle to the TX READY queue on the NPE + + phy-mode: true + + phy-handle: true + + intel,npe-handle: + $ref: '/schemas/types.yaml#/definitions/phandle-array' + maxItems: 1 + description: phandle to the NPE this ethernet instance is using + and the instance to use in the second cell + + mdio: + type: object + $ref: "mdio.yaml#" + description: optional node for embedded MDIO controller + +required: + - compatible + - reg + - queue-rx + - queue-txready + - intel,npe-handle + +additionalProperties: false + +examples: + - | + npe: npe@c8006000 { + compatible = "intel,ixp4xx-network-processing-engine"; + reg = <0xc8006000 0x1000>, <0xc8007000 0x1000>, <0xc8008000 0x1000>; + }; + + ethernet@c8009000 { + compatible = "intel,ixp4xx-ethernet"; + reg = <0xc8009000 0x1000>; + status = "disabled"; + queue-rx = <&qmgr 4>; + queue-txready = <&qmgr 21>; + intel,npe-handle = <&npe 1>; + phy-mode = "rgmii"; + phy-handle = <&phy1>; + }; + + ethernet@c800c000 { + compatible = "intel,ixp4xx-ethernet"; + reg = <0xc800c000 0x1000>; + status = "disabled"; + queue-rx = <&qmgr 3>; + queue-txready = <&qmgr 20>; + intel,npe-handle = <&npe 2>; + phy-mode = "rgmii"; + phy-handle = <&phy2>; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + phy1: ethernet-phy@1 { + reg = <1>; + }; + phy2: ethernet-phy@2 { + reg = <2>; + }; + }; + }; diff --git a/MAINTAINERS b/MAINTAINERS index c3c8fa572580..0d85ae9e61e2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8532,7 +8532,6 @@ IBM Power SRIOV Virtual NIC Device Driver M: Dany Madden <[email protected]> M: Sukadev Bhattiprolu <[email protected]> R: Thomas Falcon <[email protected]> -R: Lijun Pan <[email protected]> S: Supported F: drivers/net/ethernet/ibm/ibmvnic.* diff --git a/arch/arm/boot/dts/uniphier-pxs2.dtsi b/arch/arm/boot/dts/uniphier-pxs2.dtsi index b0b15c97306b..e81e5937a60a 100644 --- a/arch/arm/boot/dts/uniphier-pxs2.dtsi +++ b/arch/arm/boot/dts/uniphier-pxs2.dtsi @@ -583,7 +583,7 @@ clocks = <&sys_clk 6>; reset-names = "ether"; resets = <&sys_rst 6>; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; local-mac-address = [00 00 00 00 00 00]; socionext,syscon-phy-mode = <&soc_glue 0>; diff --git a/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi b/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi index a87b8a678719..8f2c1c1e2c64 100644 --- a/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi +++ b/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi @@ -734,7 +734,7 @@ clocks = <&sys_clk 6>; reset-names = "ether"; resets = <&sys_rst 6>; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; local-mac-address = [00 00 00 00 00 00]; socionext,syscon-phy-mode = <&soc_glue 0>; diff --git a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi index 0e52dadf54b3..be97da132258 100644 --- a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi +++ b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi @@ -564,7 +564,7 @@ clocks = <&sys_clk 6>; reset-names = "ether"; resets = <&sys_rst 6>; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; local-mac-address = [00 00 00 00 00 00]; socionext,syscon-phy-mode = <&soc_glue 0>; @@ -585,7 +585,7 @@ clocks = <&sys_clk 7>; reset-names = "ether"; resets = <&sys_rst 7>; - phy-mode = "rgmii"; + phy-mode = "rgmii-id"; local-mac-address = [00 00 00 00 00 00]; socionext,syscon-phy-mode = <&soc_glue 1>; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index d5ca38aa8aa9..20bbda1b36e1 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4391,9 +4391,7 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) int agg_id = 0; int ret = 0; -#ifdef CONFIG_LOCKDEP - WARN_ON(lockdep_is_held(&bond->mode_lock)); -#endif + might_sleep(); usable_slaves = kzalloc(struct_size(usable_slaves, arr, bond->slave_cnt), GFP_KERNEL); @@ -4406,7 +4404,9 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info ad_info; + spin_lock_bh(&bond->mode_lock); if (bond_3ad_get_active_agg_info(bond, &ad_info)) { + spin_unlock_bh(&bond->mode_lock); pr_debug("bond_3ad_get_active_agg_info failed\n"); /* No active aggragator means it's not safe to use * the previous array. @@ -4414,6 +4414,7 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) bond_reset_slave_arr(bond); goto out; } + spin_unlock_bh(&bond->mode_lock); agg_id = ad_info.aggregator_id; } bond_for_each_slave(bond, slave, iter) { diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c index f78daba60b35..aa412506832d 100644 --- a/drivers/net/ethernet/amd/pcnet32.c +++ b/drivers/net/ethernet/amd/pcnet32.c @@ -2853,8 +2853,7 @@ static void pcnet32_check_media(struct net_device *dev, int verbose) netif_info(lp, link, dev, "link down\n"); } if (lp->phycount > 1) { - curr_link = pcnet32_check_otherphy(dev); - prev_link = 0; + pcnet32_check_otherphy(dev); } } else if (verbose || !prev_link) { netif_carrier_on(dev); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index e15d454e33f0..39ac9e2f5118 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -122,7 +122,10 @@ enum board_idx { NETXTREME_E_VF, NETXTREME_C_VF, NETXTREME_S_VF, + NETXTREME_C_VF_HV, + NETXTREME_E_VF_HV, NETXTREME_E_P5_VF, + NETXTREME_E_P5_VF_HV, }; /* indexed by enum above */ @@ -170,7 +173,10 @@ static const struct { [NETXTREME_E_VF] = { "Broadcom NetXtreme-E Ethernet Virtual Function" }, [NETXTREME_C_VF] = { "Broadcom NetXtreme-C Ethernet Virtual Function" }, [NETXTREME_S_VF] = { "Broadcom NetXtreme-S Ethernet Virtual Function" }, + [NETXTREME_C_VF_HV] = { "Broadcom NetXtreme-C Virtual Function for Hyper-V" }, + [NETXTREME_E_VF_HV] = { "Broadcom NetXtreme-E Virtual Function for Hyper-V" }, [NETXTREME_E_P5_VF] = { "Broadcom BCM5750X NetXtreme-E Ethernet Virtual Function" }, + [NETXTREME_E_P5_VF_HV] = { "Broadcom BCM5750X NetXtreme-E Virtual Function for Hyper-V" }, }; static const struct pci_device_id bnxt_pci_tbl[] = { @@ -222,15 +228,25 @@ static const struct pci_device_id bnxt_pci_tbl[] = { { PCI_VDEVICE(BROADCOM, 0xd804), .driver_data = BCM58804 }, #ifdef CONFIG_BNXT_SRIOV { PCI_VDEVICE(BROADCOM, 0x1606), .driver_data = NETXTREME_E_VF }, + { PCI_VDEVICE(BROADCOM, 0x1607), .driver_data = NETXTREME_E_VF_HV }, + { PCI_VDEVICE(BROADCOM, 0x1608), .driver_data = NETXTREME_E_VF_HV }, { PCI_VDEVICE(BROADCOM, 0x1609), .driver_data = NETXTREME_E_VF }, + { PCI_VDEVICE(BROADCOM, 0x16bd), .driver_data = NETXTREME_E_VF_HV }, { PCI_VDEVICE(BROADCOM, 0x16c1), .driver_data = NETXTREME_E_VF }, + { PCI_VDEVICE(BROADCOM, 0x16c2), .driver_data = NETXTREME_C_VF_HV }, + { PCI_VDEVICE(BROADCOM, 0x16c3), .driver_data = NETXTREME_C_VF_HV }, + { PCI_VDEVICE(BROADCOM, 0x16c4), .driver_data = NETXTREME_E_VF_HV }, + { PCI_VDEVICE(BROADCOM, 0x16c5), .driver_data = NETXTREME_E_VF_HV }, { PCI_VDEVICE(BROADCOM, 0x16cb), .driver_data = NETXTREME_C_VF }, { PCI_VDEVICE(BROADCOM, 0x16d3), .driver_data = NETXTREME_E_VF }, { PCI_VDEVICE(BROADCOM, 0x16dc), .driver_data = NETXTREME_E_VF }, { PCI_VDEVICE(BROADCOM, 0x16e1), .driver_data = NETXTREME_C_VF }, { PCI_VDEVICE(BROADCOM, 0x16e5), .driver_data = NETXTREME_C_VF }, + { PCI_VDEVICE(BROADCOM, 0x16e6), .driver_data = NETXTREME_C_VF_HV }, { PCI_VDEVICE(BROADCOM, 0x1806), .driver_data = NETXTREME_E_P5_VF }, { PCI_VDEVICE(BROADCOM, 0x1807), .driver_data = NETXTREME_E_P5_VF }, + { PCI_VDEVICE(BROADCOM, 0x1808), .driver_data = NETXTREME_E_P5_VF_HV }, + { PCI_VDEVICE(BROADCOM, 0x1809), .driver_data = NETXTREME_E_P5_VF_HV }, { PCI_VDEVICE(BROADCOM, 0xd800), .driver_data = NETXTREME_S_VF }, #endif { 0 } @@ -265,7 +281,8 @@ static struct workqueue_struct *bnxt_pf_wq; static bool bnxt_vf_pciid(enum board_idx idx) { return (idx == NETXTREME_C_VF || idx == NETXTREME_E_VF || - idx == NETXTREME_S_VF || idx == NETXTREME_E_P5_VF); + idx == NETXTREME_S_VF || idx == NETXTREME_C_VF_HV || + idx == NETXTREME_E_VF_HV || idx == NETXTREME_E_P5_VF); } #define DB_CP_REARM_FLAGS (DB_KEY_CP | DB_IDX_VALID) @@ -358,6 +375,7 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) struct pci_dev *pdev = bp->pdev; struct bnxt_tx_ring_info *txr; struct bnxt_sw_tx_bd *tx_buf; + __le32 lflags = 0; i = skb_get_queue_mapping(skb); if (unlikely(i >= bp->tx_nr_rings)) { @@ -399,6 +417,11 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) vlan_tag_flags |= 1 << TX_BD_CFA_META_TPID_SHIFT; } + if (unlikely(skb->no_fcs)) { + lflags |= cpu_to_le32(TX_BD_FLAGS_NO_CRC); + goto normal_tx; + } + if (free_size == bp->tx_ring_size && length <= bp->tx_push_thresh) { struct tx_push_buffer *tx_push_buf = txr->tx_push; struct tx_push_bd *tx_push = &tx_push_buf->push_bd; @@ -500,7 +523,7 @@ normal_tx: txbd1 = (struct tx_bd_ext *) &txr->tx_desc_ring[TX_RING(prod)][TX_IDX(prod)]; - txbd1->tx_bd_hsize_lflags = 0; + txbd1->tx_bd_hsize_lflags = lflags; if (skb_is_gso(skb)) { u32 hdr_len; @@ -512,14 +535,14 @@ normal_tx: hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); - txbd1->tx_bd_hsize_lflags = cpu_to_le32(TX_BD_FLAGS_LSO | + txbd1->tx_bd_hsize_lflags |= cpu_to_le32(TX_BD_FLAGS_LSO | TX_BD_FLAGS_T_IPID | (hdr_len << (TX_BD_HSIZE_SHIFT - 1))); length = skb_shinfo(skb)->gso_size; txbd1->tx_bd_mss = cpu_to_le32(length); length += hdr_len; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { - txbd1->tx_bd_hsize_lflags = + txbd1->tx_bd_hsize_lflags |= cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM); txbd1->tx_bd_mss = 0; } @@ -1732,14 +1755,16 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, cons = rxcmp->rx_cmp_opaque; if (unlikely(cons != rxr->rx_next_cons)) { - int rc1 = bnxt_discard_rx(bp, cpr, raw_cons, rxcmp); + int rc1 = bnxt_discard_rx(bp, cpr, &tmp_raw_cons, rxcmp); /* 0xffff is forced error, don't print it */ if (rxr->rx_next_cons != 0xffff) netdev_warn(bp->dev, "RX cons %x != expected cons %x\n", cons, rxr->rx_next_cons); bnxt_sched_reset(bp, rxr); - return rc1; + if (rc1) + return rc1; + goto next_rx_no_prod_no_len; } rx_buf = &rxr->rx_buf_ring[cons]; data = rx_buf->data; @@ -4145,7 +4170,7 @@ static void bnxt_free_mem(struct bnxt *bp, bool irq_re_init) bnxt_free_ntp_fltrs(bp, irq_re_init); if (irq_re_init) { bnxt_free_ring_stats(bp); - if (!(bp->fw_cap & BNXT_FW_CAP_PORT_STATS_NO_RESET) || + if (!(bp->phy_flags & BNXT_PHY_FL_PORT_STATS_NO_RESET) || test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)) bnxt_free_port_stats(bp); bnxt_free_ring_grps(bp); @@ -8340,11 +8365,11 @@ static int bnxt_alloc_rfs_vnics(struct bnxt *bp) #endif } -/* Allow PF and VF with default VLAN to be in promiscuous mode */ +/* Allow PF, trusted VFs and VFs with default VLAN to be in promiscuous mode */ static bool bnxt_promisc_ok(struct bnxt *bp) { #ifdef CONFIG_BNXT_SRIOV - if (BNXT_VF(bp) && !bp->vf.vlan) + if (BNXT_VF(bp) && !bp->vf.vlan && !bnxt_is_trusted_vf(bp, &bp->vf)) return false; #endif return true; @@ -8441,7 +8466,7 @@ static int bnxt_init_chip(struct bnxt *bp, bool irq_re_init) if (bp->dev->flags & IFF_BROADCAST) vnic->rx_mask |= CFA_L2_SET_RX_MASK_REQ_MASK_BCAST; - if ((bp->dev->flags & IFF_PROMISC) && bnxt_promisc_ok(bp)) + if (bp->dev->flags & IFF_PROMISC) vnic->rx_mask |= CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS; if (bp->dev->flags & IFF_ALLMULTI) { @@ -9075,8 +9100,9 @@ static char *bnxt_report_fec(struct bnxt_link_info *link_info) static void bnxt_report_link(struct bnxt *bp) { if (bp->link_info.link_up) { - const char *duplex; + const char *signal = ""; const char *flow_ctrl; + const char *duplex; u32 speed; u16 fec; @@ -9098,9 +9124,24 @@ static void bnxt_report_link(struct bnxt *bp) flow_ctrl = "ON - receive"; else flow_ctrl = "none"; - netdev_info(bp->dev, "NIC Link is Up, %u Mbps %s duplex, Flow control: %s\n", - speed, duplex, flow_ctrl); - if (bp->flags & BNXT_FLAG_EEE_CAP) + if (bp->link_info.phy_qcfg_resp.option_flags & + PORT_PHY_QCFG_RESP_OPTION_FLAGS_SIGNAL_MODE_KNOWN) { + u8 sig_mode = bp->link_info.active_fec_sig_mode & + PORT_PHY_QCFG_RESP_SIGNAL_MODE_MASK; + switch (sig_mode) { + case PORT_PHY_QCFG_RESP_SIGNAL_MODE_NRZ: + signal = "(NRZ) "; + break; + case PORT_PHY_QCFG_RESP_SIGNAL_MODE_PAM4: + signal = "(PAM4) "; + break; + default: + break; + } + } + netdev_info(bp->dev, "NIC Link is Up, %u Mbps %s%s duplex, Flow control: %s\n", + speed, signal, duplex, flow_ctrl); + if (bp->phy_flags & BNXT_PHY_FL_EEE_CAP) netdev_info(bp->dev, "EEE is %s\n", bp->eee.eee_active ? "active" : "not active"); @@ -9132,10 +9173,6 @@ static int bnxt_hwrm_phy_qcaps(struct bnxt *bp) struct hwrm_port_phy_qcaps_output *resp = bp->hwrm_cmd_resp_addr; struct bnxt_link_info *link_info = &bp->link_info; - bp->flags &= ~BNXT_FLAG_EEE_CAP; - if (bp->test_info) - bp->test_info->flags &= ~(BNXT_TEST_FL_EXT_LPBK | - BNXT_TEST_FL_AN_PHY_LPBK); if (bp->hwrm_spec_code < 0x10201) return 0; @@ -9146,31 +9183,17 @@ static int bnxt_hwrm_phy_qcaps(struct bnxt *bp) if (rc) goto hwrm_phy_qcaps_exit; + bp->phy_flags = resp->flags; if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_EEE_SUPPORTED) { struct ethtool_eee *eee = &bp->eee; u16 fw_speeds = le16_to_cpu(resp->supported_speeds_eee_mode); - bp->flags |= BNXT_FLAG_EEE_CAP; eee->supported = _bnxt_fw_to_ethtool_adv_spds(fw_speeds, 0); bp->lpi_tmr_lo = le32_to_cpu(resp->tx_lpi_timer_low) & PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_LOW_MASK; bp->lpi_tmr_hi = le32_to_cpu(resp->valid_tx_lpi_timer_high) & PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_HIGH_MASK; } - if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_EXTERNAL_LPBK_SUPPORTED) { - if (bp->test_info) - bp->test_info->flags |= BNXT_TEST_FL_EXT_LPBK; - } - if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_AUTONEG_LPBK_SUPPORTED) { - if (bp->test_info) - bp->test_info->flags |= BNXT_TEST_FL_AN_PHY_LPBK; - } - if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_SHARED_PHY_CFG_SUPPORTED) { - if (BNXT_PF(bp)) - bp->fw_cap |= BNXT_FW_CAP_SHARED_PORT_CFG; - } - if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_CUMULATIVE_COUNTERS_ON_RESET) - bp->fw_cap |= BNXT_FW_CAP_PORT_STATS_NO_RESET; if (bp->hwrm_spec_code >= 0x10a01) { if (bnxt_phy_qcaps_no_speed(resp)) { @@ -9261,7 +9284,7 @@ int bnxt_update_link(struct bnxt *bp, bool chng_link_state) PORT_PHY_QCFG_RESP_PHY_ADDR_MASK; link_info->module_status = resp->module_status; - if (bp->flags & BNXT_FLAG_EEE_CAP) { + if (bp->phy_flags & BNXT_PHY_FL_EEE_CAP) { struct ethtool_eee *eee = &bp->eee; u16 fw_speeds; @@ -9497,7 +9520,8 @@ static int bnxt_hwrm_shutdown_link(struct bnxt *bp) if (!BNXT_SINGLE_PF(bp)) return 0; - if (pci_num_vf(bp->pdev)) + if (pci_num_vf(bp->pdev) && + !(bp->phy_flags & BNXT_PHY_FL_FW_MANAGED_LKDN)) return 0; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_PHY_CFG, -1, -1); @@ -9782,7 +9806,9 @@ static ssize_t bnxt_show_temp(struct device *dev, if (!rc) len = sprintf(buf, "%u\n", resp->temp * 1000); /* display millidegree */ mutex_unlock(&bp->hwrm_cmd_lock); - return rc ?: len; + if (rc) + return rc; + return len; } static SENSOR_DEVICE_ATTR(temp1_input, 0444, bnxt_show_temp, NULL, 0); @@ -9839,7 +9865,7 @@ static bool bnxt_eee_config_ok(struct bnxt *bp) struct ethtool_eee *eee = &bp->eee; struct bnxt_link_info *link_info = &bp->link_info; - if (!(bp->flags & BNXT_FLAG_EEE_CAP)) + if (!(bp->phy_flags & BNXT_PHY_FL_EEE_CAP)) return true; if (eee->eee_enabled) { @@ -10486,7 +10512,7 @@ static void bnxt_set_rx_mode(struct net_device *dev) CFA_L2_SET_RX_MASK_REQ_MASK_ALL_MCAST | CFA_L2_SET_RX_MASK_REQ_MASK_BCAST); - if ((dev->flags & IFF_PROMISC) && bnxt_promisc_ok(bp)) + if (dev->flags & IFF_PROMISC) mask |= CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS; uc_update = bnxt_uc_list_updated(bp); @@ -10562,6 +10588,9 @@ static int bnxt_cfg_rx_mode(struct bnxt *bp) } skip_uc: + if ((vnic->rx_mask & CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS) && + !bnxt_promisc_ok(bp)) + vnic->rx_mask &= ~CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS; rc = bnxt_hwrm_cfa_l2_set_rx_mask(bp, 0); if (rc && vnic->mc_list_count) { netdev_info(bp->dev, "Failed setting MC filters rc: %d, turning on ALL_MCAST mode\n", @@ -10756,6 +10785,40 @@ static int bnxt_set_features(struct net_device *dev, netdev_features_t features) return rc; } +static netdev_features_t bnxt_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + struct bnxt *bp; + __be16 udp_port; + u8 l4_proto = 0; + + features = vlan_features_check(skb, features); + if (!skb->encapsulation) + return features; + + switch (vlan_get_protocol(skb)) { + case htons(ETH_P_IP): + l4_proto = ip_hdr(skb)->protocol; + break; + case htons(ETH_P_IPV6): + l4_proto = ipv6_hdr(skb)->nexthdr; + break; + default: + return features; + } + + if (l4_proto != IPPROTO_UDP) + return features; + + bp = netdev_priv(dev); + /* For UDP, we can only handle 1 Vxlan port and 1 Geneve port. */ + udp_port = udp_hdr(skb)->dest; + if (udp_port == bp->vxlan_port || udp_port == bp->nge_port) + return features; + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); +} + int bnxt_dbg_hwrm_rd_reg(struct bnxt *bp, u32 reg_off, u16 num_words, u32 *reg_buf) { @@ -12263,10 +12326,13 @@ static int bnxt_udp_tunnel_sync(struct net_device *netdev, unsigned int table) unsigned int cmd; udp_tunnel_nic_get_port(netdev, table, 0, &ti); - if (ti.type == UDP_TUNNEL_TYPE_VXLAN) + if (ti.type == UDP_TUNNEL_TYPE_VXLAN) { + bp->vxlan_port = ti.port; cmd = TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN; - else + } else { + bp->nge_port = ti.port; cmd = TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE; + } if (ti.port) return bnxt_hwrm_tunnel_dst_port_alloc(bp, ti.port, cmd); @@ -12366,6 +12432,7 @@ static const struct net_device_ops bnxt_netdev_ops = { .ndo_change_mtu = bnxt_change_mtu, .ndo_fix_features = bnxt_fix_features, .ndo_set_features = bnxt_set_features, + .ndo_features_check = bnxt_features_check, .ndo_tx_timeout = bnxt_tx_timeout, #ifdef CONFIG_BNXT_SRIOV .ndo_get_vf_config = bnxt_get_vf_config, @@ -12434,12 +12501,17 @@ static int bnxt_probe_phy(struct bnxt *bp, bool fw_dflt) int rc = 0; struct bnxt_link_info *link_info = &bp->link_info; + bp->phy_flags = 0; rc = bnxt_hwrm_phy_qcaps(bp); if (rc) { netdev_err(bp->dev, "Probe phy can't get phy capabilities (rc: %x)\n", rc); return rc; } + if (bp->phy_flags & BNXT_PHY_FL_NO_FCS) + bp->dev->priv_flags |= IFF_SUPP_NOFCS; + else + bp->dev->priv_flags &= ~IFF_SUPP_NOFCS; if (!fw_dflt) return 0; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 29061c577baa..24d2ad6a8740 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1341,9 +1341,6 @@ struct bnxt_led_info { struct bnxt_test_info { u8 offline_mask; - u8 flags; -#define BNXT_TEST_FL_EXT_LPBK 0x1 -#define BNXT_TEST_FL_AN_PHY_LPBK 0x2 u16 timeout; char string[BNXT_MAX_TEST][ETH_GSTRING_LEN]; }; @@ -1693,7 +1690,6 @@ struct bnxt { #define BNXT_FLAG_SHARED_RINGS 0x200 #define BNXT_FLAG_PORT_STATS 0x400 #define BNXT_FLAG_UDP_RSS_CAP 0x800 - #define BNXT_FLAG_EEE_CAP 0x1000 #define BNXT_FLAG_NEW_RSS_CAP 0x2000 #define BNXT_FLAG_WOL_CAP 0x4000 #define BNXT_FLAG_ROCEV1_CAP 0x8000 @@ -1720,8 +1716,10 @@ struct bnxt { #define BNXT_NPAR(bp) ((bp)->port_partition_type) #define BNXT_MH(bp) ((bp)->flags & BNXT_FLAG_MULTI_HOST) #define BNXT_SINGLE_PF(bp) (BNXT_PF(bp) && !BNXT_NPAR(bp) && !BNXT_MH(bp)) +#define BNXT_SH_PORT_CFG_OK(bp) (BNXT_PF(bp) && \ + ((bp)->phy_flags & BNXT_PHY_FL_SHARED_PORT_CFG)) #define BNXT_PHY_CFG_ABLE(bp) ((BNXT_SINGLE_PF(bp) || \ - ((bp)->fw_cap & BNXT_FW_CAP_SHARED_PORT_CFG)) && \ + BNXT_SH_PORT_CFG_OK(bp)) && \ (bp)->link_info.phy_state == BNXT_PHY_STATE_ENABLED) #define BNXT_CHIP_TYPE_NITRO_A0(bp) ((bp)->flags & BNXT_FLAG_CHIP_NITRO_A0) #define BNXT_RX_PAGE_MODE(bp) ((bp)->flags & BNXT_FLAG_RX_PAGE_MODE) @@ -1871,11 +1869,9 @@ struct bnxt { #define BNXT_FW_CAP_EXT_STATS_SUPPORTED 0x00040000 #define BNXT_FW_CAP_ERR_RECOVER_RELOAD 0x00100000 #define BNXT_FW_CAP_HOT_RESET 0x00200000 - #define BNXT_FW_CAP_SHARED_PORT_CFG 0x00400000 #define BNXT_FW_CAP_VLAN_RX_STRIP 0x01000000 #define BNXT_FW_CAP_VLAN_TX_INSERT 0x02000000 #define BNXT_FW_CAP_EXT_HW_STATS_SUPPORTED 0x04000000 - #define BNXT_FW_CAP_PORT_STATS_NO_RESET 0x10000000 #define BNXT_FW_CAP_RING_MONITOR 0x40000000 #define BNXT_NEW_RM(bp) ((bp)->fw_cap & BNXT_FW_CAP_NEW_RM) @@ -1918,6 +1914,8 @@ struct bnxt { u16 vxlan_fw_dst_port_id; u16 nge_fw_dst_port_id; + __be16 vxlan_port; + __be16 nge_port; u8 port_partition_type; u8 port_count; u16 br_mode; @@ -2010,6 +2008,17 @@ struct bnxt { u32 lpi_tmr_lo; u32 lpi_tmr_hi; + /* copied from flags in hwrm_port_phy_qcaps_output */ + u8 phy_flags; +#define BNXT_PHY_FL_EEE_CAP PORT_PHY_QCAPS_RESP_FLAGS_EEE_SUPPORTED +#define BNXT_PHY_FL_EXT_LPBK PORT_PHY_QCAPS_RESP_FLAGS_EXTERNAL_LPBK_SUPPORTED +#define BNXT_PHY_FL_AN_PHY_LPBK PORT_PHY_QCAPS_RESP_FLAGS_AUTONEG_LPBK_SUPPORTED +#define BNXT_PHY_FL_SHARED_PORT_CFG PORT_PHY_QCAPS_RESP_FLAGS_SHARED_PHY_CFG_SUPPORTED +#define BNXT_PHY_FL_PORT_STATS_NO_RESET PORT_PHY_QCAPS_RESP_FLAGS_CUMULATIVE_COUNTERS_ON_RESET +#define BNXT_PHY_FL_NO_PHY_LPBK PORT_PHY_QCAPS_RESP_FLAGS_LOCAL_LPBK_NOT_SUPPORTED +#define BNXT_PHY_FL_FW_MANAGED_LKDN PORT_PHY_QCAPS_RESP_FLAGS_FW_MANAGED_LINK_DOWN +#define BNXT_PHY_FL_NO_FCS PORT_PHY_QCAPS_RESP_FLAGS_NO_FCS + u8 num_tests; struct bnxt_test_info *test_info; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 3b66e300c962..c664ec52ebcf 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -2912,7 +2912,7 @@ static int bnxt_set_eee(struct net_device *dev, struct ethtool_eee *edata) if (!BNXT_PHY_CFG_ABLE(bp)) return -EOPNOTSUPP; - if (!(bp->flags & BNXT_FLAG_EEE_CAP)) + if (!(bp->phy_flags & BNXT_PHY_FL_EEE_CAP)) return -EOPNOTSUPP; mutex_lock(&bp->link_lock); @@ -2963,7 +2963,7 @@ static int bnxt_get_eee(struct net_device *dev, struct ethtool_eee *edata) { struct bnxt *bp = netdev_priv(dev); - if (!(bp->flags & BNXT_FLAG_EEE_CAP)) + if (!(bp->phy_flags & BNXT_PHY_FL_EEE_CAP)) return -EOPNOTSUPP; *edata = bp->eee; @@ -3215,7 +3215,7 @@ static int bnxt_disable_an_for_lpbk(struct bnxt *bp, int rc; if (!link_info->autoneg || - (bp->test_info->flags & BNXT_TEST_FL_AN_PHY_LPBK)) + (bp->phy_flags & BNXT_PHY_FL_AN_PHY_LPBK)) return 0; rc = bnxt_query_force_speeds(bp, &fw_advertising); @@ -3416,7 +3416,7 @@ static void bnxt_self_test(struct net_device *dev, struct ethtool_test *etest, } if ((etest->flags & ETH_TEST_FL_EXTERNAL_LB) && - (bp->test_info->flags & BNXT_TEST_FL_EXT_LPBK)) + (bp->phy_flags & BNXT_PHY_FL_EXT_LPBK)) do_ext_lpbk = true; if (etest->flags & ETH_TEST_FL_OFFLINE) { diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index a217316228f4..eb00a219aa51 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -49,10 +49,6 @@ static int bnxt_hwrm_fwd_async_event_cmpl(struct bnxt *bp, static int bnxt_vf_ndo_prep(struct bnxt *bp, int vf_id) { - if (!test_bit(BNXT_STATE_OPEN, &bp->state)) { - netdev_err(bp->dev, "vf ndo called though PF is down\n"); - return -EINVAL; - } if (!bp->pf.active_vfs) { netdev_err(bp->dev, "vf ndo called though sriov is disabled\n"); return -EINVAL; @@ -113,7 +109,7 @@ static int bnxt_hwrm_func_qcfg_flags(struct bnxt *bp, struct bnxt_vf_info *vf) int rc; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCFG, -1, -1); - req.fid = cpu_to_le16(vf->fw_fid); + req.fid = cpu_to_le16(BNXT_PF(bp) ? vf->fw_fid : 0xffff); mutex_lock(&bp->hwrm_cmd_lock); rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); if (rc) { @@ -125,9 +121,9 @@ static int bnxt_hwrm_func_qcfg_flags(struct bnxt *bp, struct bnxt_vf_info *vf) return 0; } -static bool bnxt_is_trusted_vf(struct bnxt *bp, struct bnxt_vf_info *vf) +bool bnxt_is_trusted_vf(struct bnxt *bp, struct bnxt_vf_info *vf) { - if (!(bp->fw_cap & BNXT_FW_CAP_TRUSTED_VF)) + if (BNXT_PF(bp) && !(bp->fw_cap & BNXT_FW_CAP_TRUSTED_VF)) return !!(vf->flags & BNXT_VF_TRUST); bnxt_hwrm_func_qcfg_flags(bp, vf); @@ -1120,10 +1116,38 @@ void bnxt_hwrm_exec_fwd_req(struct bnxt *bp) } } +int bnxt_approve_mac(struct bnxt *bp, u8 *mac, bool strict) +{ + struct hwrm_func_vf_cfg_input req = {0}; + int rc = 0; + + if (!BNXT_VF(bp)) + return 0; + + if (bp->hwrm_spec_code < 0x10202) { + if (is_valid_ether_addr(bp->vf.mac_addr)) + rc = -EADDRNOTAVAIL; + goto mac_done; + } + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_VF_CFG, -1, -1); + req.enables = cpu_to_le32(FUNC_VF_CFG_REQ_ENABLES_DFLT_MAC_ADDR); + memcpy(req.dflt_mac_addr, mac, ETH_ALEN); + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +mac_done: + if (rc && strict) { + rc = -EADDRNOTAVAIL; + netdev_warn(bp->dev, "VF MAC address %pM not approved by the PF\n", + mac); + return rc; + } + return 0; +} + void bnxt_update_vf_mac(struct bnxt *bp) { struct hwrm_func_qcaps_input req = {0}; struct hwrm_func_qcaps_output *resp = bp->hwrm_cmd_resp_addr; + bool inform_pf = false; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCAPS, -1, -1); req.fid = cpu_to_le16(0xffff); @@ -1139,42 +1163,24 @@ void bnxt_update_vf_mac(struct bnxt *bp) * default but the stored zero MAC will allow the VF user to change * the random MAC address using ndo_set_mac_address() if he wants. */ - if (!ether_addr_equal(resp->mac_address, bp->vf.mac_addr)) + if (!ether_addr_equal(resp->mac_address, bp->vf.mac_addr)) { memcpy(bp->vf.mac_addr, resp->mac_address, ETH_ALEN); + /* This means we are now using our own MAC address, let + * the PF know about this MAC address. + */ + if (!is_valid_ether_addr(bp->vf.mac_addr)) + inform_pf = true; + } /* overwrite netdev dev_addr with admin VF MAC */ if (is_valid_ether_addr(bp->vf.mac_addr)) memcpy(bp->dev->dev_addr, bp->vf.mac_addr, ETH_ALEN); update_vf_mac_exit: mutex_unlock(&bp->hwrm_cmd_lock); + if (inform_pf) + bnxt_approve_mac(bp, bp->dev->dev_addr, false); } -int bnxt_approve_mac(struct bnxt *bp, u8 *mac, bool strict) -{ - struct hwrm_func_vf_cfg_input req = {0}; - int rc = 0; - - if (!BNXT_VF(bp)) - return 0; - - if (bp->hwrm_spec_code < 0x10202) { - if (is_valid_ether_addr(bp->vf.mac_addr)) - rc = -EADDRNOTAVAIL; - goto mac_done; - } - bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_VF_CFG, -1, -1); - req.enables = cpu_to_le32(FUNC_VF_CFG_REQ_ENABLES_DFLT_MAC_ADDR); - memcpy(req.dflt_mac_addr, mac, ETH_ALEN); - rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); -mac_done: - if (rc && strict) { - rc = -EADDRNOTAVAIL; - netdev_warn(bp->dev, "VF MAC address %pM not approved by the PF\n", - mac); - return rc; - } - return 0; -} #else int bnxt_cfg_hw_sriov(struct bnxt *bp, int *num_vfs, bool reset) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h index 629641bf6fc5..995535e4c11b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h @@ -34,6 +34,7 @@ int bnxt_set_vf_vlan(struct net_device *, int, u16, u8, __be16); int bnxt_set_vf_bw(struct net_device *, int, int, int); int bnxt_set_vf_link_state(struct net_device *, int, int); int bnxt_set_vf_spoofchk(struct net_device *, int, bool); +bool bnxt_is_trusted_vf(struct bnxt *bp, struct bnxt_vf_info *vf); int bnxt_set_vf_trust(struct net_device *dev, int vf_id, bool trust); int bnxt_sriov_configure(struct pci_dev *pdev, int num_vfs); int bnxt_cfg_hw_sriov(struct bnxt *bp, int *num_vfs, bool reset); diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index a3f5b80888e5..ef3f1e92632f 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -33,7 +33,6 @@ static int chcr_get_nfrags_to_send(struct sk_buff *skb, u32 start, u32 len) if (unlikely(start < skb_linear_data_len)) { frag_size = min(len, skb_linear_data_len - start); - start = 0; } else { start -= skb_linear_data_len; @@ -873,10 +872,10 @@ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, } /* update receive window */ if (first_wr || tx_info->prev_win != tcp_win) { - pos = chcr_write_cpl_set_tcb_ulp(tx_info, q, tx_info->tid, pos, - TCB_RCV_WND_W, - TCB_RCV_WND_V(TCB_RCV_WND_M), - TCB_RCV_WND_V(tcp_win), 0); + chcr_write_cpl_set_tcb_ulp(tx_info, q, tx_info->tid, pos, + TCB_RCV_WND_W, + TCB_RCV_WND_V(TCB_RCV_WND_M), + TCB_RCV_WND_V(tcp_win), 0); tx_info->prev_win = tcp_win; cpl++; } @@ -1485,7 +1484,6 @@ static int chcr_ktls_tx_plaintxt(struct chcr_ktls_info *tx_info, wr->op_to_compl = htonl(FW_WR_OP_V(FW_ULPTX_WR)); wr->flowid_len16 = htonl(wr_mid | FW_WR_LEN16_V(len16)); wr->cookie = 0; - pos += sizeof(*wr); /* ULP_TXPKT */ ulptx = (struct ulp_txpkt *)(wr + 1); ulptx->cmd_dest = htonl(ULPTX_CMD_V(ULP_TX_PKT) | diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c index e7f7121821be..2a8bf53c2f75 100644 --- a/drivers/net/ethernet/davicom/dm9000.c +++ b/drivers/net/ethernet/davicom/dm9000.c @@ -1522,7 +1522,6 @@ dm9000_probe(struct platform_device *pdev) if (ret) { dev_err(db->dev, "irq %d cannot set wakeup (%d)\n", db->irq_wake, ret); - ret = 0; } else { irq_set_irq_wake(db->irq_wake, 0); db->wake_supported = 1; diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index f08c420a5803..2768c78528a5 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -431,7 +431,8 @@ static void prestera_port_handle_event(struct prestera_switch *sw, netif_carrier_on(port->dev); if (!delayed_work_pending(caching_dw)) queue_delayed_work(prestera_wq, caching_dw, 0); - } else { + } else if (netif_running(port->dev) && + netif_carrier_ok(port->dev)) { netif_carrier_off(port->dev); if (delayed_work_pending(caching_dw)) cancel_delayed_work(caching_dw); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c index 3e19b1721303..0399a396d166 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c @@ -96,7 +96,7 @@ int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, } if (!vport->egress.acl) { - vport->egress.acl = esw_acl_table_create(esw, vport->vport, + vport->egress.acl = esw_acl_table_create(esw, vport, MLX5_FLOW_NAMESPACE_ESW_EGRESS, table_size); if (IS_ERR(vport->egress.acl)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c index 26b37a0f8762..505bf811984a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c @@ -148,7 +148,7 @@ static void esw_acl_egress_ofld_groups_destroy(struct mlx5_vport *vport) esw_acl_egress_vlan_grp_destroy(vport); } -static bool esw_acl_egress_needed(const struct mlx5_eswitch *esw, u16 vport_num) +static bool esw_acl_egress_needed(struct mlx5_eswitch *esw, u16 vport_num) { return mlx5_eswitch_is_vf_vport(esw, vport_num) || mlx5_esw_is_sf_vport(esw, vport_num); } @@ -171,7 +171,7 @@ int esw_acl_egress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport table_size++; if (MLX5_CAP_GEN(esw->dev, prio_tag_required)) table_size++; - vport->egress.acl = esw_acl_table_create(esw, vport->vport, + vport->egress.acl = esw_acl_table_create(esw, vport, MLX5_FLOW_NAMESPACE_ESW_EGRESS, table_size); if (IS_ERR(vport->egress.acl)) { err = PTR_ERR(vport->egress.acl); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c index 4a369669e51e..45b839116212 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c @@ -6,14 +6,14 @@ #include "helper.h" struct mlx5_flow_table * -esw_acl_table_create(struct mlx5_eswitch *esw, u16 vport_num, int ns, int size) +esw_acl_table_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport, int ns, int size) { struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_core_dev *dev = esw->dev; struct mlx5_flow_namespace *root_ns; struct mlx5_flow_table *acl; int acl_supported; - int vport_index; + u16 vport_num; int err; acl_supported = (ns == MLX5_FLOW_NAMESPACE_ESW_INGRESS) ? @@ -23,11 +23,11 @@ esw_acl_table_create(struct mlx5_eswitch *esw, u16 vport_num, int ns, int size) if (!acl_supported) return ERR_PTR(-EOPNOTSUPP); + vport_num = vport->vport; esw_debug(dev, "Create vport[%d] %s ACL table\n", vport_num, ns == MLX5_FLOW_NAMESPACE_ESW_INGRESS ? "ingress" : "egress"); - vport_index = mlx5_eswitch_vport_num_to_index(esw, vport_num); - root_ns = mlx5_get_flow_vport_acl_namespace(dev, ns, vport_index); + root_ns = mlx5_get_flow_vport_acl_namespace(dev, ns, vport->index); if (!root_ns) { esw_warn(dev, "Failed to get E-Switch root namespace for vport (%d)\n", vport_num); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h index 8dc4cab66a71..a47063fab57e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h @@ -8,7 +8,7 @@ /* General acl helper functions */ struct mlx5_flow_table * -esw_acl_table_create(struct mlx5_eswitch *esw, u16 vport_num, int ns, int size); +esw_acl_table_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport, int ns, int size); /* Egress acl helper functions */ void esw_acl_egress_table_destroy(struct mlx5_vport *vport); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c index d64fad2823e7..f75b86abaf1c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c @@ -177,7 +177,7 @@ int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, } if (!vport->ingress.acl) { - vport->ingress.acl = esw_acl_table_create(esw, vport->vport, + vport->ingress.acl = esw_acl_table_create(esw, vport, MLX5_FLOW_NAMESPACE_ESW_INGRESS, table_size); if (IS_ERR(vport->ingress.acl)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c index 548c005ea633..39e948bc1204 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c @@ -7,7 +7,7 @@ #include "ofld.h" static bool -esw_acl_ingress_prio_tag_enabled(const struct mlx5_eswitch *esw, +esw_acl_ingress_prio_tag_enabled(struct mlx5_eswitch *esw, const struct mlx5_vport *vport) { return (MLX5_CAP_GEN(esw->dev, prio_tag_required) && @@ -255,7 +255,7 @@ int esw_acl_ingress_ofld_setup(struct mlx5_eswitch *esw, if (esw_acl_ingress_prio_tag_enabled(esw, vport)) num_ftes++; - vport->ingress.acl = esw_acl_table_create(esw, vport->vport, + vport->ingress.acl = esw_acl_table_create(esw, vport, MLX5_FLOW_NAMESPACE_ESW_INGRESS, num_ftes); if (IS_ERR(vport->ingress.acl)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c index 7bfc84238b3d..1703384eca95 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c @@ -14,8 +14,7 @@ mlx5_esw_get_port_parent_id(struct mlx5_core_dev *dev, struct netdev_phys_item_i memcpy(ppid->id, &parent_id, sizeof(parent_id)); } -static bool -mlx5_esw_devlink_port_supported(const struct mlx5_eswitch *esw, u16 vport_num) +static bool mlx5_esw_devlink_port_supported(struct mlx5_eswitch *esw, u16 vport_num) { return vport_num == MLX5_VPORT_UPLINK || (mlx5_core_is_ecpf(esw->dev) && vport_num == MLX5_VPORT_PF) || @@ -124,7 +123,7 @@ struct devlink_port *mlx5_esw_offloads_devlink_port(struct mlx5_eswitch *esw, u1 } int mlx5_esw_devlink_sf_port_register(struct mlx5_eswitch *esw, struct devlink_port *dl_port, - u16 vport_num, u32 sfnum) + u16 vport_num, u32 controller, u32 sfnum) { struct mlx5_core_dev *dev = esw->dev; struct netdev_phys_item_id ppid = {}; @@ -142,7 +141,7 @@ int mlx5_esw_devlink_sf_port_register(struct mlx5_eswitch *esw, struct devlink_p mlx5_esw_get_port_parent_id(dev, &ppid); memcpy(dl_port->attrs.switch_id.id, &ppid.id[0], ppid.id_len); dl_port->attrs.switch_id.id_len = ppid.id_len; - devlink_port_attrs_pci_sf_set(dl_port, 0, pfnum, sfnum); + devlink_port_attrs_pci_sf_set(dl_port, controller, pfnum, sfnum, !!controller); devlink = priv_to_devlink(dev); dl_port_index = mlx5_esw_vport_to_devlink_port_index(dev, vport_num); err = devlink_port_register(devlink, dl_port, dl_port_index); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c index 8ab1224653a4..d9041b16611d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c @@ -216,7 +216,8 @@ static void esw_destroy_legacy_table(struct mlx5_eswitch *esw) int esw_legacy_enable(struct mlx5_eswitch *esw) { struct mlx5_vport *vport; - int ret, i; + unsigned long i; + int ret; ret = esw_create_legacy_table(esw); if (ret) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 1bb229ecd43b..570f2280823c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -88,20 +88,17 @@ struct mlx5_eswitch *mlx5_devlink_eswitch_get(struct devlink *devlink) struct mlx5_vport *__must_check mlx5_eswitch_get_vport(struct mlx5_eswitch *esw, u16 vport_num) { - u16 idx; + struct mlx5_vport *vport; if (!esw || !MLX5_CAP_GEN(esw->dev, vport_group_manager)) return ERR_PTR(-EPERM); - idx = mlx5_eswitch_vport_num_to_index(esw, vport_num); - - if (idx > esw->total_vports - 1) { - esw_debug(esw->dev, "vport out of range: num(0x%x), idx(0x%x)\n", - vport_num, idx); + vport = xa_load(&esw->vports, vport_num); + if (!vport) { + esw_debug(esw->dev, "vport out of range: num(0x%x)\n", vport_num); return ERR_PTR(-EINVAL); } - - return &esw->vports[idx]; + return vport; } static int arm_vport_context_events_cmd(struct mlx5_core_dev *dev, u16 vport, @@ -345,9 +342,10 @@ static void update_allmulti_vports(struct mlx5_eswitch *esw, { u8 *mac = vaddr->node.addr; struct mlx5_vport *vport; - u16 i, vport_num; + unsigned long i; + u16 vport_num; - mlx5_esw_for_all_vports(esw, i, vport) { + mlx5_esw_for_each_vport(esw, i, vport) { struct hlist_head *vport_hash = vport->mc_list; struct vport_addr *iter_vaddr = l2addr_hash_find(vport_hash, @@ -1175,7 +1173,7 @@ static void mlx5_eswitch_event_handlers_unregister(struct mlx5_eswitch *esw) static void mlx5_eswitch_clear_vf_vports_info(struct mlx5_eswitch *esw) { struct mlx5_vport *vport; - int i; + unsigned long i; mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs) { memset(&vport->qos, 0, sizeof(vport->qos)); @@ -1213,20 +1211,25 @@ void mlx5_eswitch_unload_vport(struct mlx5_eswitch *esw, u16 vport_num) void mlx5_eswitch_unload_vf_vports(struct mlx5_eswitch *esw, u16 num_vfs) { - int i; + struct mlx5_vport *vport; + unsigned long i; - mlx5_esw_for_each_vf_vport_num_reverse(esw, i, num_vfs) - mlx5_eswitch_unload_vport(esw, i); + mlx5_esw_for_each_vf_vport(esw, i, vport, num_vfs) { + if (!vport->enabled) + continue; + mlx5_eswitch_unload_vport(esw, vport->vport); + } } int mlx5_eswitch_load_vf_vports(struct mlx5_eswitch *esw, u16 num_vfs, enum mlx5_eswitch_vport_event enabled_events) { + struct mlx5_vport *vport; + unsigned long i; int err; - int i; - mlx5_esw_for_each_vf_vport_num(esw, i, num_vfs) { - err = mlx5_eswitch_load_vport(esw, i, enabled_events); + mlx5_esw_for_each_vf_vport(esw, i, vport, num_vfs) { + err = mlx5_eswitch_load_vport(esw, vport->vport, enabled_events); if (err) goto vf_err; } @@ -1234,7 +1237,7 @@ int mlx5_eswitch_load_vf_vports(struct mlx5_eswitch *esw, u16 num_vfs, return 0; vf_err: - mlx5_eswitch_unload_vf_vports(esw, i - 1); + mlx5_eswitch_unload_vf_vports(esw, num_vfs); return err; } @@ -1563,24 +1566,161 @@ void mlx5_eswitch_disable(struct mlx5_eswitch *esw, bool clear_vf) up_write(&esw->mode_lock); } +static int mlx5_query_hca_cap_host_pf(struct mlx5_core_dev *dev, void *out) +{ + u16 opmod = (MLX5_CAP_GENERAL << 1) | (HCA_CAP_OPMOD_GET_MAX & 0x01); + u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)] = {}; + + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, op_mod, opmod); + MLX5_SET(query_hca_cap_in, in, function_id, MLX5_VPORT_PF); + MLX5_SET(query_hca_cap_in, in, other_function, true); + return mlx5_cmd_exec_inout(dev, query_hca_cap, in, out); +} + +int mlx5_esw_sf_max_hpf_functions(struct mlx5_core_dev *dev, u16 *max_sfs, u16 *sf_base_id) + +{ + int query_out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + void *query_ctx; + void *hca_caps; + int err; + + if (!mlx5_core_is_ecpf(dev)) { + *max_sfs = 0; + return 0; + } + + query_ctx = kzalloc(query_out_sz, GFP_KERNEL); + if (!query_ctx) + return -ENOMEM; + + err = mlx5_query_hca_cap_host_pf(dev, query_ctx); + if (err) + goto out_free; + + hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability); + *max_sfs = MLX5_GET(cmd_hca_cap, hca_caps, max_num_sf); + *sf_base_id = MLX5_GET(cmd_hca_cap, hca_caps, sf_base_id); + +out_free: + kfree(query_ctx); + return err; +} + +static int mlx5_esw_vport_alloc(struct mlx5_eswitch *esw, struct mlx5_core_dev *dev, + int index, u16 vport_num) +{ + struct mlx5_vport *vport; + int err; + + vport = kzalloc(sizeof(*vport), GFP_KERNEL); + if (!vport) + return -ENOMEM; + + vport->dev = esw->dev; + vport->vport = vport_num; + vport->index = index; + vport->info.link_state = MLX5_VPORT_ADMIN_STATE_AUTO; + INIT_WORK(&vport->vport_change_handler, esw_vport_change_handler); + err = xa_insert(&esw->vports, vport_num, vport, GFP_KERNEL); + if (err) + goto insert_err; + + esw->total_vports++; + return 0; + +insert_err: + kfree(vport); + return err; +} + +static void mlx5_esw_vport_free(struct mlx5_eswitch *esw, struct mlx5_vport *vport) +{ + xa_erase(&esw->vports, vport->vport); + kfree(vport); +} + +static void mlx5_esw_vports_cleanup(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + + mlx5_esw_for_each_vport(esw, i, vport) + mlx5_esw_vport_free(esw, vport); + xa_destroy(&esw->vports); +} + +static int mlx5_esw_vports_init(struct mlx5_eswitch *esw) +{ + struct mlx5_core_dev *dev = esw->dev; + u16 max_host_pf_sfs; + u16 base_sf_num; + int idx = 0; + int err; + int i; + + xa_init(&esw->vports); + + err = mlx5_esw_vport_alloc(esw, dev, idx, MLX5_VPORT_PF); + if (err) + goto err; + if (esw->first_host_vport == MLX5_VPORT_PF) + xa_set_mark(&esw->vports, idx, MLX5_ESW_VPT_HOST_FN); + idx++; + + for (i = 0; i < mlx5_core_max_vfs(dev); i++) { + err = mlx5_esw_vport_alloc(esw, dev, idx, idx); + if (err) + goto err; + xa_set_mark(&esw->vports, idx, MLX5_ESW_VPT_VF); + xa_set_mark(&esw->vports, idx, MLX5_ESW_VPT_HOST_FN); + idx++; + } + base_sf_num = mlx5_sf_start_function_id(dev); + for (i = 0; i < mlx5_sf_max_functions(dev); i++) { + err = mlx5_esw_vport_alloc(esw, dev, idx, base_sf_num + i); + if (err) + goto err; + xa_set_mark(&esw->vports, base_sf_num + i, MLX5_ESW_VPT_SF); + idx++; + } + + err = mlx5_esw_sf_max_hpf_functions(dev, &max_host_pf_sfs, &base_sf_num); + if (err) + goto err; + for (i = 0; i < max_host_pf_sfs; i++) { + err = mlx5_esw_vport_alloc(esw, dev, idx, base_sf_num + i); + if (err) + goto err; + xa_set_mark(&esw->vports, base_sf_num + i, MLX5_ESW_VPT_SF); + idx++; + } + + if (mlx5_ecpf_vport_exists(dev)) { + err = mlx5_esw_vport_alloc(esw, dev, idx, MLX5_VPORT_ECPF); + if (err) + goto err; + idx++; + } + err = mlx5_esw_vport_alloc(esw, dev, idx, MLX5_VPORT_UPLINK); + if (err) + goto err; + return 0; + +err: + mlx5_esw_vports_cleanup(esw); + return err; +} + int mlx5_eswitch_init(struct mlx5_core_dev *dev) { struct mlx5_eswitch *esw; - struct mlx5_vport *vport; - int total_vports; - int err, i; + int err; if (!MLX5_VPORT_MANAGER(dev)) return 0; - total_vports = mlx5_eswitch_get_total_vports(dev); - - esw_info(dev, - "Total vports %d, per vport: max uc(%d) max mc(%d)\n", - total_vports, - MLX5_MAX_UC_PER_VPORT(dev), - MLX5_MAX_MC_PER_VPORT(dev)); - esw = kzalloc(sizeof(*esw), GFP_KERNEL); if (!esw) return -ENOMEM; @@ -1595,18 +1735,13 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) goto abort; } - esw->vports = kcalloc(total_vports, sizeof(struct mlx5_vport), - GFP_KERNEL); - if (!esw->vports) { - err = -ENOMEM; + err = mlx5_esw_vports_init(esw); + if (err) goto abort; - } - - esw->total_vports = total_vports; err = esw_offloads_init_reps(esw); if (err) - goto abort; + goto reps_err; mutex_init(&esw->offloads.encap_tbl_lock); hash_init(esw->offloads.encap_tbl); @@ -1619,25 +1754,25 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) mutex_init(&esw->state_lock); init_rwsem(&esw->mode_lock); - mlx5_esw_for_all_vports(esw, i, vport) { - vport->vport = mlx5_eswitch_index_to_vport_num(esw, i); - vport->info.link_state = MLX5_VPORT_ADMIN_STATE_AUTO; - vport->dev = dev; - INIT_WORK(&vport->vport_change_handler, - esw_vport_change_handler); - } - esw->enabled_vports = 0; esw->mode = MLX5_ESWITCH_NONE; esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE; dev->priv.eswitch = esw; BLOCKING_INIT_NOTIFIER_HEAD(&esw->n_head); + + esw_info(dev, + "Total vports %d, per vport: max uc(%d) max mc(%d)\n", + esw->total_vports, + MLX5_MAX_UC_PER_VPORT(dev), + MLX5_MAX_MC_PER_VPORT(dev)); return 0; + +reps_err: + mlx5_esw_vports_cleanup(esw); abort: if (esw->work_queue) destroy_workqueue(esw->work_queue); - kfree(esw->vports); kfree(esw); return err; } @@ -1659,7 +1794,7 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) mutex_destroy(&esw->offloads.encap_tbl_lock); mutex_destroy(&esw->offloads.decap_tbl_lock); esw_offloads_cleanup_reps(esw); - kfree(esw->vports); + mlx5_esw_vports_cleanup(esw); kfree(esw); } @@ -1718,8 +1853,29 @@ int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw, return err; } +static bool mlx5_esw_check_port_type(struct mlx5_eswitch *esw, u16 vport_num, xa_mark_t mark) +{ + struct mlx5_vport *vport; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR(vport)) + return false; + + return xa_get_mark(&esw->vports, vport_num, mark); +} + +bool mlx5_eswitch_is_vf_vport(struct mlx5_eswitch *esw, u16 vport_num) +{ + return mlx5_esw_check_port_type(esw, vport_num, MLX5_ESW_VPT_VF); +} + +bool mlx5_esw_is_sf_vport(struct mlx5_eswitch *esw, u16 vport_num) +{ + return mlx5_esw_check_port_type(esw, vport_num, MLX5_ESW_VPT_SF); +} + static bool -is_port_function_supported(const struct mlx5_eswitch *esw, u16 vport_num) +is_port_function_supported(struct mlx5_eswitch *esw, u16 vport_num) { return vport_num == MLX5_VPORT_PF || mlx5_eswitch_is_vf_vport(esw, vport_num) || @@ -1891,9 +2047,9 @@ static u32 calculate_vports_min_rate_divider(struct mlx5_eswitch *esw) u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share); struct mlx5_vport *evport; u32 max_guarantee = 0; - int i; + unsigned long i; - mlx5_esw_for_all_vports(esw, i, evport) { + mlx5_esw_for_each_vport(esw, i, evport) { if (!evport->enabled || evport->qos.min_rate < max_guarantee) continue; max_guarantee = evport->qos.min_rate; @@ -1911,11 +2067,11 @@ static int normalize_vports_min_rate(struct mlx5_eswitch *esw) struct mlx5_vport *evport; u32 vport_max_rate; u32 vport_min_rate; + unsigned long i; u32 bw_share; int err; - int i; - mlx5_esw_for_all_vports(esw, i, evport) { + mlx5_esw_for_each_vport(esw, i, evport) { if (!evport->enabled) continue; vport_min_rate = evport->qos.min_rate; @@ -2205,3 +2361,19 @@ void mlx5_esw_unlock(struct mlx5_eswitch *esw) { up_write(&esw->mode_lock); } + +/** + * mlx5_eswitch_get_total_vports - Get total vports of the eswitch + * + * @dev: Pointer to core device + * + * mlx5_eswitch_get_total_vports returns total number of eswitch vports. + */ +u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev) +{ + struct mlx5_eswitch *esw; + + esw = dev->priv.eswitch; + return mlx5_esw_allowed(esw) ? esw->total_vports : 0; +} +EXPORT_SYMBOL_GPL(mlx5_eswitch_get_total_vports); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index b289d756a7e4..64ccb2bc0b58 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -176,6 +176,7 @@ struct mlx5_vport { u16 vport; bool enabled; enum mlx5_eswitch_vport_event enabled_events; + int index; struct devlink_port *dl_port; }; @@ -228,7 +229,7 @@ struct mlx5_esw_offload { struct mlx5_flow_table *ft_offloads; struct mlx5_flow_group *vport_rx_group; - struct mlx5_eswitch_rep *vport_reps; + struct xarray vport_reps; struct list_head peer_flows; struct mutex peer_mutex; struct mutex encap_tbl_lock; /* protects encap_tbl */ @@ -278,7 +279,7 @@ struct mlx5_eswitch { struct esw_mc_addr mc_promisc; /* end of legacy */ struct workqueue_struct *work_queue; - struct mlx5_vport *vports; + struct xarray vports; u32 flags; int total_vports; int enabled_vports; @@ -545,94 +546,11 @@ static inline u16 mlx5_eswitch_first_host_vport_num(struct mlx5_core_dev *dev) MLX5_VPORT_PF : MLX5_VPORT_FIRST_VF; } -static inline int mlx5_esw_sf_start_idx(const struct mlx5_eswitch *esw) -{ - /* PF and VF vports indices start from 0 to max_vfs */ - return MLX5_VPORT_PF_PLACEHOLDER + mlx5_core_max_vfs(esw->dev); -} - -static inline int mlx5_esw_sf_end_idx(const struct mlx5_eswitch *esw) -{ - return mlx5_esw_sf_start_idx(esw) + mlx5_sf_max_functions(esw->dev); -} - -static inline int -mlx5_esw_sf_vport_num_to_index(const struct mlx5_eswitch *esw, u16 vport_num) -{ - return vport_num - mlx5_sf_start_function_id(esw->dev) + - MLX5_VPORT_PF_PLACEHOLDER + mlx5_core_max_vfs(esw->dev); -} - -static inline u16 -mlx5_esw_sf_vport_index_to_num(const struct mlx5_eswitch *esw, int idx) -{ - return mlx5_sf_start_function_id(esw->dev) + idx - - (MLX5_VPORT_PF_PLACEHOLDER + mlx5_core_max_vfs(esw->dev)); -} - -static inline bool -mlx5_esw_is_sf_vport(const struct mlx5_eswitch *esw, u16 vport_num) -{ - return mlx5_sf_supported(esw->dev) && - vport_num >= mlx5_sf_start_function_id(esw->dev) && - (vport_num < (mlx5_sf_start_function_id(esw->dev) + - mlx5_sf_max_functions(esw->dev))); -} - static inline bool mlx5_eswitch_is_funcs_handler(const struct mlx5_core_dev *dev) { return mlx5_core_is_ecpf_esw_manager(dev); } -static inline int mlx5_eswitch_uplink_idx(struct mlx5_eswitch *esw) -{ - /* Uplink always locate at the last element of the array.*/ - return esw->total_vports - 1; -} - -static inline int mlx5_eswitch_ecpf_idx(struct mlx5_eswitch *esw) -{ - return esw->total_vports - 2; -} - -static inline int mlx5_eswitch_vport_num_to_index(struct mlx5_eswitch *esw, - u16 vport_num) -{ - if (vport_num == MLX5_VPORT_ECPF) { - if (!mlx5_ecpf_vport_exists(esw->dev)) - esw_warn(esw->dev, "ECPF vport doesn't exist!\n"); - return mlx5_eswitch_ecpf_idx(esw); - } - - if (vport_num == MLX5_VPORT_UPLINK) - return mlx5_eswitch_uplink_idx(esw); - - if (mlx5_esw_is_sf_vport(esw, vport_num)) - return mlx5_esw_sf_vport_num_to_index(esw, vport_num); - - /* PF and VF vports start from 0 to max_vfs */ - return vport_num; -} - -static inline u16 mlx5_eswitch_index_to_vport_num(struct mlx5_eswitch *esw, - int index) -{ - if (index == mlx5_eswitch_ecpf_idx(esw) && - mlx5_ecpf_vport_exists(esw->dev)) - return MLX5_VPORT_ECPF; - - if (index == mlx5_eswitch_uplink_idx(esw)) - return MLX5_VPORT_UPLINK; - - /* SF vports indices are after VFs and before ECPF */ - if (mlx5_sf_supported(esw->dev) && - index > mlx5_core_max_vfs(esw->dev)) - return mlx5_esw_sf_vport_index_to_num(esw, index); - - /* PF and VF vports start from 0 to max_vfs */ - return index; -} - static inline unsigned int mlx5_esw_vport_to_devlink_port_index(const struct mlx5_core_dev *dev, u16 vport_num) @@ -649,82 +567,42 @@ mlx5_esw_devlink_port_index_to_vport_num(unsigned int dl_port_index) /* TODO: This mlx5e_tc function shouldn't be called by eswitch */ void mlx5e_tc_clean_fdb_peer_flows(struct mlx5_eswitch *esw); -/* The vport getter/iterator are only valid after esw->total_vports - * and vport->vport are initialized in mlx5_eswitch_init. +/* Each mark identifies eswitch vport type. + * MLX5_ESW_VPT_HOST_FN is used to identify both PF and VF ports using + * a single mark. + * MLX5_ESW_VPT_VF identifies a SRIOV VF vport. + * MLX5_ESW_VPT_SF identifies SF vport. */ -#define mlx5_esw_for_all_vports(esw, i, vport) \ - for ((i) = MLX5_VPORT_PF; \ - (vport) = &(esw)->vports[i], \ - (i) < (esw)->total_vports; (i)++) - -#define mlx5_esw_for_all_vports_reverse(esw, i, vport) \ - for ((i) = (esw)->total_vports - 1; \ - (vport) = &(esw)->vports[i], \ - (i) >= MLX5_VPORT_PF; (i)--) - -#define mlx5_esw_for_each_vf_vport(esw, i, vport, nvfs) \ - for ((i) = MLX5_VPORT_FIRST_VF; \ - (vport) = &(esw)->vports[(i)], \ - (i) <= (nvfs); (i)++) - -#define mlx5_esw_for_each_vf_vport_reverse(esw, i, vport, nvfs) \ - for ((i) = (nvfs); \ - (vport) = &(esw)->vports[(i)], \ - (i) >= MLX5_VPORT_FIRST_VF; (i)--) - -/* The rep getter/iterator are only valid after esw->total_vports - * and vport->vport are initialized in mlx5_eswitch_init. +#define MLX5_ESW_VPT_HOST_FN XA_MARK_0 +#define MLX5_ESW_VPT_VF XA_MARK_1 +#define MLX5_ESW_VPT_SF XA_MARK_2 + +/* The vport iterator is valid only after vport are initialized in mlx5_eswitch_init. + * Borrowed the idea from xa_for_each_marked() but with support for desired last element. */ -#define mlx5_esw_for_all_reps(esw, i, rep) \ - for ((i) = MLX5_VPORT_PF; \ - (rep) = &(esw)->offloads.vport_reps[i], \ - (i) < (esw)->total_vports; (i)++) - -#define mlx5_esw_for_each_vf_rep(esw, i, rep, nvfs) \ - for ((i) = MLX5_VPORT_FIRST_VF; \ - (rep) = &(esw)->offloads.vport_reps[i], \ - (i) <= (nvfs); (i)++) - -#define mlx5_esw_for_each_vf_rep_reverse(esw, i, rep, nvfs) \ - for ((i) = (nvfs); \ - (rep) = &(esw)->offloads.vport_reps[i], \ - (i) >= MLX5_VPORT_FIRST_VF; (i)--) - -#define mlx5_esw_for_each_vf_vport_num(esw, vport, nvfs) \ - for ((vport) = MLX5_VPORT_FIRST_VF; (vport) <= (nvfs); (vport)++) - -#define mlx5_esw_for_each_vf_vport_num_reverse(esw, vport, nvfs) \ - for ((vport) = (nvfs); (vport) >= MLX5_VPORT_FIRST_VF; (vport)--) - -/* Includes host PF (vport 0) if it's not esw manager. */ -#define mlx5_esw_for_each_host_func_rep(esw, i, rep, nvfs) \ - for ((i) = (esw)->first_host_vport; \ - (rep) = &(esw)->offloads.vport_reps[i], \ - (i) <= (nvfs); (i)++) - -#define mlx5_esw_for_each_host_func_rep_reverse(esw, i, rep, nvfs) \ - for ((i) = (nvfs); \ - (rep) = &(esw)->offloads.vport_reps[i], \ - (i) >= (esw)->first_host_vport; (i)--) - -#define mlx5_esw_for_each_host_func_vport(esw, vport, nvfs) \ - for ((vport) = (esw)->first_host_vport; \ - (vport) <= (nvfs); (vport)++) - -#define mlx5_esw_for_each_host_func_vport_reverse(esw, vport, nvfs) \ - for ((vport) = (nvfs); \ - (vport) >= (esw)->first_host_vport; (vport)--) - -#define mlx5_esw_for_each_sf_rep(esw, i, rep) \ - for ((i) = mlx5_esw_sf_start_idx(esw); \ - (rep) = &(esw)->offloads.vport_reps[(i)], \ - (i) < mlx5_esw_sf_end_idx(esw); (i++)) + +#define mlx5_esw_for_each_vport(esw, index, vport) \ + xa_for_each(&((esw)->vports), index, vport) + +#define mlx5_esw_for_each_entry_marked(xa, index, entry, last, filter) \ + for (index = 0, entry = xa_find(xa, &index, last, filter); \ + entry; entry = xa_find_after(xa, &index, last, filter)) + +#define mlx5_esw_for_each_vport_marked(esw, index, vport, last, filter) \ + mlx5_esw_for_each_entry_marked(&((esw)->vports), index, vport, last, filter) + +#define mlx5_esw_for_each_vf_vport(esw, index, vport, last) \ + mlx5_esw_for_each_vport_marked(esw, index, vport, last, MLX5_ESW_VPT_VF) + +#define mlx5_esw_for_each_host_func_vport(esw, index, vport, last) \ + mlx5_esw_for_each_vport_marked(esw, index, vport, last, MLX5_ESW_VPT_HOST_FN) struct mlx5_eswitch *mlx5_devlink_eswitch_get(struct devlink *devlink); struct mlx5_vport *__must_check mlx5_eswitch_get_vport(struct mlx5_eswitch *esw, u16 vport_num); -bool mlx5_eswitch_is_vf_vport(const struct mlx5_eswitch *esw, u16 vport_num); +bool mlx5_eswitch_is_vf_vport(struct mlx5_eswitch *esw, u16 vport_num); +bool mlx5_esw_is_sf_vport(struct mlx5_eswitch *esw, u16 vport_num); int mlx5_esw_funcs_changed_handler(struct notifier_block *nb, unsigned long type, void *data); @@ -784,12 +662,13 @@ void mlx5_esw_offloads_devlink_port_unregister(struct mlx5_eswitch *esw, u16 vpo struct devlink_port *mlx5_esw_offloads_devlink_port(struct mlx5_eswitch *esw, u16 vport_num); int mlx5_esw_devlink_sf_port_register(struct mlx5_eswitch *esw, struct devlink_port *dl_port, - u16 vport_num, u32 sfnum); + u16 vport_num, u32 controller, u32 sfnum); void mlx5_esw_devlink_sf_port_unregister(struct mlx5_eswitch *esw, u16 vport_num); int mlx5_esw_offloads_sf_vport_enable(struct mlx5_eswitch *esw, struct devlink_port *dl_port, - u16 vport_num, u32 sfnum); + u16 vport_num, u32 controller, u32 sfnum); void mlx5_esw_offloads_sf_vport_disable(struct mlx5_eswitch *esw, u16 vport_num); +int mlx5_esw_sf_max_hpf_functions(struct mlx5_core_dev *dev, u16 *max_sfs, u16 *sf_base_id); int mlx5_esw_vport_vhca_id_set(struct mlx5_eswitch *esw, u16 vport_num); void mlx5_esw_vport_vhca_id_clear(struct mlx5_eswitch *esw, u16 vport_num); @@ -816,6 +695,8 @@ void mlx5_esw_unlock(struct mlx5_eswitch *esw); void esw_vport_change_handle_locked(struct mlx5_vport *vport); +bool mlx5_esw_offloads_controller_valid(const struct mlx5_eswitch *esw, u32 controller); + #else /* CONFIG_MLX5_ESWITCH */ /* eswitch API stubs */ static inline int mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index bbb707117296..db1e74280e57 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -49,6 +49,16 @@ #include "en_tc.h" #include "en/mapping.h" +#define mlx5_esw_for_each_rep(esw, i, rep) \ + xa_for_each(&((esw)->offloads.vport_reps), i, rep) + +#define mlx5_esw_for_each_sf_rep(esw, i, rep) \ + xa_for_each_marked(&((esw)->offloads.vport_reps), i, rep, MLX5_ESW_VPT_SF) + +#define mlx5_esw_for_each_vf_rep(esw, index, rep) \ + mlx5_esw_for_each_entry_marked(&((esw)->offloads.vport_reps), index, \ + rep, (esw)->esw_funcs.num_vfs, MLX5_ESW_VPT_VF) + /* There are two match-all miss flows, one for unicast dst mac and * one for multicast. */ @@ -67,10 +77,7 @@ static const struct esw_vport_tbl_namespace mlx5_esw_vport_tbl_mirror_ns = { static struct mlx5_eswitch_rep *mlx5_eswitch_get_rep(struct mlx5_eswitch *esw, u16 vport_num) { - int idx = mlx5_eswitch_vport_num_to_index(esw, vport_num); - - WARN_ON(idx > esw->total_vports - 1); - return &esw->offloads.vport_reps[idx]; + return xa_load(&esw->offloads.vport_reps, vport_num); } static void @@ -720,10 +727,11 @@ mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw, static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val) { struct mlx5_eswitch_rep *rep; - int i, err = 0; + unsigned long i; + int err = 0; esw_debug(esw->dev, "%s applying global %s policy\n", __func__, val ? "pop" : "none"); - mlx5_esw_for_each_host_func_rep(esw, i, rep, esw->esw_funcs.num_vfs) { + mlx5_esw_for_each_host_func_vport(esw, i, rep, esw->esw_funcs.num_vfs) { if (atomic_read(&rep->rep_data[REP_ETH].state) != REP_LOADED) continue; @@ -972,13 +980,13 @@ void mlx5_eswitch_del_send_to_vport_rule(struct mlx5_flow_handle *rule) static void mlx5_eswitch_del_send_to_vport_meta_rules(struct mlx5_eswitch *esw) { struct mlx5_flow_handle **flows = esw->fdb_table.offloads.send_to_vport_meta_rules; - int i = 0, num_vfs = esw->esw_funcs.num_vfs, vport_num; + int i = 0, num_vfs = esw->esw_funcs.num_vfs; if (!num_vfs || !flows) return; - mlx5_esw_for_each_vf_vport_num(esw, vport_num, num_vfs) - mlx5_del_flow_rules(flows[i++]); + for (i = 0; i < num_vfs; i++) + mlx5_del_flow_rules(flows[i]); kvfree(flows); } @@ -992,6 +1000,8 @@ mlx5_eswitch_add_send_to_vport_meta_rules(struct mlx5_eswitch *esw) struct mlx5_flow_handle *flow_rule; struct mlx5_flow_handle **flows; struct mlx5_flow_spec *spec; + struct mlx5_vport *vport; + unsigned long i; u16 vport_num; num_vfs = esw->esw_funcs.num_vfs; @@ -1016,7 +1026,8 @@ mlx5_eswitch_add_send_to_vport_meta_rules(struct mlx5_eswitch *esw) dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - mlx5_esw_for_each_vf_vport_num(esw, vport_num, num_vfs) { + mlx5_esw_for_each_vf_vport(esw, i, vport, num_vfs) { + vport_num = vport->vport; MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_c_0, mlx5_eswitch_get_vport_metadata_for_match(esw, vport_num)); dest.vport.num = vport_num; @@ -1158,12 +1169,14 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, struct mlx5_flow_destination dest = {}; struct mlx5_flow_act flow_act = {0}; struct mlx5_flow_handle **flows; - struct mlx5_flow_handle *flow; - struct mlx5_flow_spec *spec; /* total vports is the same for both e-switches */ int nvports = esw->total_vports; + struct mlx5_flow_handle *flow; + struct mlx5_flow_spec *spec; + struct mlx5_vport *vport; + unsigned long i; void *misc; - int err, i; + int err; spec = kvzalloc(sizeof(*spec), GFP_KERNEL); if (!spec) @@ -1182,6 +1195,7 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, misc_parameters); if (mlx5_core_is_ecpf_esw_manager(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF); esw_set_peer_miss_rule_source_port(esw, peer_dev->priv.eswitch, spec, MLX5_VPORT_PF); @@ -1191,10 +1205,11 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, err = PTR_ERR(flow); goto add_pf_flow_err; } - flows[MLX5_VPORT_PF] = flow; + flows[vport->index] = flow; } if (mlx5_ecpf_vport_exists(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); MLX5_SET(fte_match_set_misc, misc, source_port, MLX5_VPORT_ECPF); flow = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec, &flow_act, &dest, 1); @@ -1202,13 +1217,13 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, err = PTR_ERR(flow); goto add_ecpf_flow_err; } - flows[mlx5_eswitch_ecpf_idx(esw)] = flow; + flows[vport->index] = flow; } - mlx5_esw_for_each_vf_vport_num(esw, i, mlx5_core_max_vfs(esw->dev)) { + mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) { esw_set_peer_miss_rule_source_port(esw, peer_dev->priv.eswitch, - spec, i); + spec, vport->vport); flow = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec, &flow_act, &dest, 1); @@ -1216,7 +1231,7 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, err = PTR_ERR(flow); goto add_vf_flow_err; } - flows[i] = flow; + flows[vport->index] = flow; } esw->fdb_table.offloads.peer_miss_rules = flows; @@ -1225,15 +1240,20 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, return 0; add_vf_flow_err: - nvports = --i; - mlx5_esw_for_each_vf_vport_num_reverse(esw, i, nvports) - mlx5_del_flow_rules(flows[i]); - - if (mlx5_ecpf_vport_exists(esw->dev)) - mlx5_del_flow_rules(flows[mlx5_eswitch_ecpf_idx(esw)]); + mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) { + if (!flows[vport->index]) + continue; + mlx5_del_flow_rules(flows[vport->index]); + } + if (mlx5_ecpf_vport_exists(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); + mlx5_del_flow_rules(flows[vport->index]); + } add_ecpf_flow_err: - if (mlx5_core_is_ecpf_esw_manager(esw->dev)) - mlx5_del_flow_rules(flows[MLX5_VPORT_PF]); + if (mlx5_core_is_ecpf_esw_manager(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF); + mlx5_del_flow_rules(flows[vport->index]); + } add_pf_flow_err: esw_warn(esw->dev, "FDB: Failed to add peer miss flow rule err %d\n", err); kvfree(flows); @@ -1245,20 +1265,23 @@ alloc_flows_err: static void esw_del_fdb_peer_miss_rules(struct mlx5_eswitch *esw) { struct mlx5_flow_handle **flows; - int i; + struct mlx5_vport *vport; + unsigned long i; flows = esw->fdb_table.offloads.peer_miss_rules; - mlx5_esw_for_each_vf_vport_num_reverse(esw, i, - mlx5_core_max_vfs(esw->dev)) - mlx5_del_flow_rules(flows[i]); - - if (mlx5_ecpf_vport_exists(esw->dev)) - mlx5_del_flow_rules(flows[mlx5_eswitch_ecpf_idx(esw)]); + mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) + mlx5_del_flow_rules(flows[vport->index]); - if (mlx5_core_is_ecpf_esw_manager(esw->dev)) - mlx5_del_flow_rules(flows[MLX5_VPORT_PF]); + if (mlx5_ecpf_vport_exists(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); + mlx5_del_flow_rules(flows[vport->index]); + } + if (mlx5_core_is_ecpf_esw_manager(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF); + mlx5_del_flow_rules(flows[vport->index]); + } kvfree(flows); } @@ -1402,11 +1425,11 @@ static void esw_vport_tbl_put(struct mlx5_eswitch *esw) { struct mlx5_vport_tbl_attr attr; struct mlx5_vport *vport; - int i; + unsigned long i; attr.chain = 0; attr.prio = 1; - mlx5_esw_for_all_vports(esw, i, vport) { + mlx5_esw_for_each_vport(esw, i, vport) { attr.vport = vport->vport; attr.vport_ns = &mlx5_esw_vport_tbl_mirror_ns; mlx5_esw_vporttbl_put(esw, &attr); @@ -1418,11 +1441,11 @@ static int esw_vport_tbl_get(struct mlx5_eswitch *esw) struct mlx5_vport_tbl_attr attr; struct mlx5_flow_table *fdb; struct mlx5_vport *vport; - int i; + unsigned long i; attr.chain = 0; attr.prio = 1; - mlx5_esw_for_all_vports(esw, i, vport) { + mlx5_esw_for_each_vport(esw, i, vport) { attr.vport = vport->vport; attr.vport_ns = &mlx5_esw_vport_tbl_mirror_ns; fdb = mlx5_esw_vporttbl_get(esw, &attr); @@ -1910,12 +1933,12 @@ out: return flow_rule; } - -static int mlx5_eswitch_inline_mode_get(const struct mlx5_eswitch *esw, u8 *mode) +static int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, u8 *mode) { u8 prev_mlx5_mode, mlx5_mode = MLX5_INLINE_MODE_L2; struct mlx5_core_dev *dev = esw->dev; - int vport; + struct mlx5_vport *vport; + unsigned long i; if (!MLX5_CAP_GEN(dev, vport_group_manager)) return -EOPNOTSUPP; @@ -1936,8 +1959,8 @@ static int mlx5_eswitch_inline_mode_get(const struct mlx5_eswitch *esw, u8 *mode query_vports: mlx5_query_nic_vport_min_inline(dev, esw->first_host_vport, &prev_mlx5_mode); - mlx5_esw_for_each_host_func_vport(esw, vport, esw->esw_funcs.num_vfs) { - mlx5_query_nic_vport_min_inline(dev, vport, &mlx5_mode); + mlx5_esw_for_each_host_func_vport(esw, i, vport, esw->esw_funcs.num_vfs) { + mlx5_query_nic_vport_min_inline(dev, vport->vport, &mlx5_mode); if (prev_mlx5_mode != mlx5_mode) return -EINVAL; prev_mlx5_mode = mlx5_mode; @@ -2080,34 +2103,82 @@ static int esw_offloads_start(struct mlx5_eswitch *esw, return err; } -void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw) +static void mlx5_esw_offloads_rep_mark_set(struct mlx5_eswitch *esw, + struct mlx5_eswitch_rep *rep, + xa_mark_t mark) { - kfree(esw->offloads.vport_reps); + bool mark_set; + + /* Copy the mark from vport to its rep */ + mark_set = xa_get_mark(&esw->vports, rep->vport, mark); + if (mark_set) + xa_set_mark(&esw->offloads.vport_reps, rep->vport, mark); } -int esw_offloads_init_reps(struct mlx5_eswitch *esw) +static int mlx5_esw_offloads_rep_init(struct mlx5_eswitch *esw, const struct mlx5_vport *vport) { - int total_vports = esw->total_vports; struct mlx5_eswitch_rep *rep; - int vport_index; - u8 rep_type; + int rep_type; + int err; - esw->offloads.vport_reps = kcalloc(total_vports, - sizeof(struct mlx5_eswitch_rep), - GFP_KERNEL); - if (!esw->offloads.vport_reps) + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) return -ENOMEM; - mlx5_esw_for_all_reps(esw, vport_index, rep) { - rep->vport = mlx5_eswitch_index_to_vport_num(esw, vport_index); - rep->vport_index = vport_index; + rep->vport = vport->vport; + rep->vport_index = vport->index; + for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) + atomic_set(&rep->rep_data[rep_type].state, REP_UNREGISTERED); - for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) - atomic_set(&rep->rep_data[rep_type].state, - REP_UNREGISTERED); - } + err = xa_insert(&esw->offloads.vport_reps, rep->vport, rep, GFP_KERNEL); + if (err) + goto insert_err; + mlx5_esw_offloads_rep_mark_set(esw, rep, MLX5_ESW_VPT_HOST_FN); + mlx5_esw_offloads_rep_mark_set(esw, rep, MLX5_ESW_VPT_VF); + mlx5_esw_offloads_rep_mark_set(esw, rep, MLX5_ESW_VPT_SF); return 0; + +insert_err: + kfree(rep); + return err; +} + +static void mlx5_esw_offloads_rep_cleanup(struct mlx5_eswitch *esw, + struct mlx5_eswitch_rep *rep) +{ + xa_erase(&esw->offloads.vport_reps, rep->vport); + kfree(rep); +} + +void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw) +{ + struct mlx5_eswitch_rep *rep; + unsigned long i; + + mlx5_esw_for_each_rep(esw, i, rep) + mlx5_esw_offloads_rep_cleanup(esw, rep); + xa_destroy(&esw->offloads.vport_reps); +} + +int esw_offloads_init_reps(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + int err; + + xa_init(&esw->offloads.vport_reps); + + mlx5_esw_for_each_vport(esw, i, vport) { + err = mlx5_esw_offloads_rep_init(esw, vport); + if (err) + goto err; + } + return 0; + +err: + esw_offloads_cleanup_reps(esw); + return err; } static void __esw_offloads_unload_rep(struct mlx5_eswitch *esw, @@ -2121,7 +2192,7 @@ static void __esw_offloads_unload_rep(struct mlx5_eswitch *esw, static void __unload_reps_sf_vport(struct mlx5_eswitch *esw, u8 rep_type) { struct mlx5_eswitch_rep *rep; - int i; + unsigned long i; mlx5_esw_for_each_sf_rep(esw, i, rep) __esw_offloads_unload_rep(esw, rep, rep_type); @@ -2130,11 +2201,11 @@ static void __unload_reps_sf_vport(struct mlx5_eswitch *esw, u8 rep_type) static void __unload_reps_all_vport(struct mlx5_eswitch *esw, u8 rep_type) { struct mlx5_eswitch_rep *rep; - int i; + unsigned long i; __unload_reps_sf_vport(esw, rep_type); - mlx5_esw_for_each_vf_rep_reverse(esw, i, rep, esw->esw_funcs.num_vfs) + mlx5_esw_for_each_vf_rep(esw, i, rep) __esw_offloads_unload_rep(esw, rep, rep_type); if (mlx5_ecpf_vport_exists(esw->dev)) { @@ -2421,25 +2492,25 @@ static void esw_offloads_vport_metadata_cleanup(struct mlx5_eswitch *esw, static void esw_offloads_metadata_uninit(struct mlx5_eswitch *esw) { struct mlx5_vport *vport; - int i; + unsigned long i; if (!mlx5_eswitch_vport_match_metadata_enabled(esw)) return; - mlx5_esw_for_all_vports_reverse(esw, i, vport) + mlx5_esw_for_each_vport(esw, i, vport) esw_offloads_vport_metadata_cleanup(esw, vport); } static int esw_offloads_metadata_init(struct mlx5_eswitch *esw) { struct mlx5_vport *vport; + unsigned long i; int err; - int i; if (!mlx5_eswitch_vport_match_metadata_enabled(esw)) return 0; - mlx5_esw_for_all_vports(esw, i, vport) { + mlx5_esw_for_each_vport(esw, i, vport) { err = esw_offloads_vport_metadata_setup(esw, vport); if (err) goto metadata_err; @@ -2676,11 +2747,25 @@ static int mlx5_esw_host_number_init(struct mlx5_eswitch *esw) return 0; } +bool mlx5_esw_offloads_controller_valid(const struct mlx5_eswitch *esw, u32 controller) +{ + /* Local controller is always valid */ + if (controller == 0) + return true; + + if (!mlx5_core_is_ecpf_esw_manager(esw->dev)) + return false; + + /* External host number starts with zero in device */ + return (controller == esw->offloads.host_number + 1); +} + int esw_offloads_enable(struct mlx5_eswitch *esw) { struct mapping_ctx *reg_c0_obj_pool; struct mlx5_vport *vport; - int err, i; + unsigned long i; + int err; if (MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, reformat) && MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, decap)) @@ -2926,13 +3011,44 @@ unlock: return err; } +static int mlx5_esw_vports_inline_set(struct mlx5_eswitch *esw, u8 mlx5_mode, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_vport *vport; + u16 err_vport_num = 0; + unsigned long i; + int err = 0; + + mlx5_esw_for_each_host_func_vport(esw, i, vport, esw->esw_funcs.num_vfs) { + err = mlx5_modify_nic_vport_min_inline(dev, vport->vport, mlx5_mode); + if (err) { + err_vport_num = vport->vport; + NL_SET_ERR_MSG_MOD(extack, + "Failed to set min inline on vport"); + goto revert_inline_mode; + } + } + return 0; + +revert_inline_mode: + mlx5_esw_for_each_host_func_vport(esw, i, vport, esw->esw_funcs.num_vfs) { + if (vport->vport == err_vport_num) + break; + mlx5_modify_nic_vport_min_inline(dev, + vport->vport, + esw->offloads.inline_mode); + } + return err; +} + int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode, struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); - int err, vport, num_vport; struct mlx5_eswitch *esw; u8 mlx5_mode; + int err; esw = mlx5_devlink_eswitch_get(devlink); if (IS_ERR(esw)) @@ -2967,25 +3083,14 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode, if (err) goto out; - mlx5_esw_for_each_host_func_vport(esw, vport, esw->esw_funcs.num_vfs) { - err = mlx5_modify_nic_vport_min_inline(dev, vport, mlx5_mode); - if (err) { - NL_SET_ERR_MSG_MOD(extack, - "Failed to set min inline on vport"); - goto revert_inline_mode; - } - } + err = mlx5_esw_vports_inline_set(esw, mlx5_mode, extack); + if (err) + goto out; esw->offloads.inline_mode = mlx5_mode; up_write(&esw->mode_lock); return 0; -revert_inline_mode: - num_vport = --vport; - mlx5_esw_for_each_host_func_vport_reverse(esw, vport, num_vport) - mlx5_modify_nic_vport_min_inline(dev, - vport, - esw->offloads.inline_mode); out: up_write(&esw->mode_lock); return err; @@ -3116,11 +3221,11 @@ void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw, { struct mlx5_eswitch_rep_data *rep_data; struct mlx5_eswitch_rep *rep; - int i; + unsigned long i; esw->offloads.rep_ops[rep_type] = ops; - mlx5_esw_for_all_reps(esw, i, rep) { - if (likely(mlx5_eswitch_vport_has_rep(esw, i))) { + mlx5_esw_for_each_rep(esw, i, rep) { + if (likely(mlx5_eswitch_vport_has_rep(esw, rep->vport))) { rep->esw = esw; rep_data = &rep->rep_data[rep_type]; atomic_set(&rep_data->state, REP_REGISTERED); @@ -3132,12 +3237,12 @@ EXPORT_SYMBOL(mlx5_eswitch_register_vport_reps); void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type) { struct mlx5_eswitch_rep *rep; - int i; + unsigned long i; if (esw->mode == MLX5_ESWITCH_OFFLOADS) __unload_reps_all_vport(esw, rep_type); - mlx5_esw_for_all_reps(esw, i, rep) + mlx5_esw_for_each_rep(esw, i, rep) atomic_set(&rep->rep_data[rep_type].state, REP_UNREGISTERED); } EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_reps); @@ -3178,12 +3283,6 @@ struct mlx5_eswitch_rep *mlx5_eswitch_vport_rep(struct mlx5_eswitch *esw, } EXPORT_SYMBOL(mlx5_eswitch_vport_rep); -bool mlx5_eswitch_is_vf_vport(const struct mlx5_eswitch *esw, u16 vport_num) -{ - return vport_num >= MLX5_VPORT_FIRST_VF && - vport_num <= esw->dev->priv.sriov.max_vfs; -} - bool mlx5_eswitch_reg_c1_loopback_enabled(const struct mlx5_eswitch *esw) { return !!(esw->flags & MLX5_ESWITCH_REG_C1_LOOPBACK_ENABLED); @@ -3209,7 +3308,7 @@ u32 mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, EXPORT_SYMBOL(mlx5_eswitch_get_vport_metadata_for_match); int mlx5_esw_offloads_sf_vport_enable(struct mlx5_eswitch *esw, struct devlink_port *dl_port, - u16 vport_num, u32 sfnum) + u16 vport_num, u32 controller, u32 sfnum) { int err; @@ -3217,7 +3316,7 @@ int mlx5_esw_offloads_sf_vport_enable(struct mlx5_eswitch *esw, struct devlink_p if (err) return err; - err = mlx5_esw_devlink_sf_port_register(esw, dl_port, vport_num, sfnum); + err = mlx5_esw_devlink_sf_port_register(esw, dl_port, vport_num, controller, sfnum); if (err) goto devlink_err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c index 90b524c59f3c..6a0c6f965ad1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c @@ -148,9 +148,19 @@ mlx5_sf_dev_state_change_handler(struct notifier_block *nb, unsigned long event_ struct mlx5_sf_dev_table *table = container_of(nb, struct mlx5_sf_dev_table, nb); const struct mlx5_vhca_state_event *event = data; struct mlx5_sf_dev *sf_dev; + u16 max_functions; u16 sf_index; + u16 base_id; + + max_functions = mlx5_sf_max_functions(table->dev); + if (!max_functions) + return 0; + + base_id = MLX5_CAP_GEN(table->dev, sf_base_id); + if (event->function_id < base_id || event->function_id >= (base_id + max_functions)) + return 0; - sf_index = event->function_id - MLX5_CAP_GEN(table->dev, sf_base_id); + sf_index = event->function_id - base_id; sf_dev = xa_load(&table->devices, sf_index); switch (event->new_vhca_state) { case MLX5_VHCA_STATE_ALLOCATED: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c index 52226d9b9a6d..a8e73c9ed1ea 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c @@ -12,6 +12,7 @@ struct mlx5_sf { struct devlink_port dl_port; unsigned int port_index; + u32 controller; u16 id; u16 hw_fn_id; u16 hw_state; @@ -58,7 +59,8 @@ static void mlx5_sf_id_erase(struct mlx5_sf_table *table, struct mlx5_sf *sf) } static struct mlx5_sf * -mlx5_sf_alloc(struct mlx5_sf_table *table, u32 sfnum, struct netlink_ext_ack *extack) +mlx5_sf_alloc(struct mlx5_sf_table *table, struct mlx5_eswitch *esw, + u32 controller, u32 sfnum, struct netlink_ext_ack *extack) { unsigned int dl_port_index; struct mlx5_sf *sf; @@ -66,7 +68,12 @@ mlx5_sf_alloc(struct mlx5_sf_table *table, u32 sfnum, struct netlink_ext_ack *ex int id_err; int err; - id_err = mlx5_sf_hw_table_sf_alloc(table->dev, sfnum); + if (!mlx5_esw_offloads_controller_valid(esw, controller)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid controller number"); + return ERR_PTR(-EINVAL); + } + + id_err = mlx5_sf_hw_table_sf_alloc(table->dev, controller, sfnum); if (id_err < 0) { err = id_err; goto id_err; @@ -78,11 +85,12 @@ mlx5_sf_alloc(struct mlx5_sf_table *table, u32 sfnum, struct netlink_ext_ack *ex goto alloc_err; } sf->id = id_err; - hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, sf->id); + hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, controller, sf->id); dl_port_index = mlx5_esw_vport_to_devlink_port_index(table->dev, hw_fn_id); sf->port_index = dl_port_index; sf->hw_fn_id = hw_fn_id; sf->hw_state = MLX5_VHCA_STATE_ALLOCATED; + sf->controller = controller; err = mlx5_sf_id_insert(table, sf); if (err) @@ -93,7 +101,7 @@ mlx5_sf_alloc(struct mlx5_sf_table *table, u32 sfnum, struct netlink_ext_ack *ex insert_err: kfree(sf); alloc_err: - mlx5_sf_hw_table_sf_free(table->dev, id_err); + mlx5_sf_hw_table_sf_free(table->dev, controller, id_err); id_err: if (err == -EEXIST) NL_SET_ERR_MSG_MOD(extack, "SF already exist. Choose different sfnum"); @@ -103,7 +111,7 @@ id_err: static void mlx5_sf_free(struct mlx5_sf_table *table, struct mlx5_sf *sf) { mlx5_sf_id_erase(table, sf); - mlx5_sf_hw_table_sf_free(table->dev, sf->id); + mlx5_sf_hw_table_sf_free(table->dev, sf->controller, sf->id); kfree(sf); } @@ -272,12 +280,12 @@ static int mlx5_sf_add(struct mlx5_core_dev *dev, struct mlx5_sf_table *table, struct mlx5_sf *sf; int err; - sf = mlx5_sf_alloc(table, new_attr->sfnum, extack); + sf = mlx5_sf_alloc(table, esw, new_attr->controller, new_attr->sfnum, extack); if (IS_ERR(sf)) return PTR_ERR(sf); err = mlx5_esw_offloads_sf_vport_enable(esw, &sf->dl_port, sf->hw_fn_id, - new_attr->sfnum); + new_attr->controller, new_attr->sfnum); if (err) goto esw_err; *new_port_index = sf->port_index; @@ -306,7 +314,8 @@ mlx5_sf_new_check_attr(struct mlx5_core_dev *dev, const struct devlink_port_new_ "User must provide unique sfnum. Driver does not support auto assignment"); return -EOPNOTSUPP; } - if (new_attr->controller_valid && new_attr->controller) { + if (new_attr->controller_valid && new_attr->controller && + !mlx5_core_is_ecpf_esw_manager(dev)) { NL_SET_ERR_MSG_MOD(extack, "External controller is unsupported"); return -EOPNOTSUPP; } @@ -352,10 +361,10 @@ static void mlx5_sf_dealloc(struct mlx5_sf_table *table, struct mlx5_sf *sf) * firmware gives confirmation that it is detached by the driver. */ mlx5_cmd_sf_disable_hca(table->dev, sf->hw_fn_id); - mlx5_sf_hw_table_sf_deferred_free(table->dev, sf->id); + mlx5_sf_hw_table_sf_deferred_free(table->dev, sf->controller, sf->id); kfree(sf); } else { - mlx5_sf_hw_table_sf_deferred_free(table->dev, sf->id); + mlx5_sf_hw_table_sf_deferred_free(table->dev, sf->controller, sf->id); kfree(sf); } } @@ -437,9 +446,6 @@ sf_err: static void mlx5_sf_table_enable(struct mlx5_sf_table *table) { - if (!mlx5_sf_max_functions(table->dev)) - return; - init_completion(&table->disable_complete); refcount_set(&table->refcount, 1); } @@ -462,9 +468,6 @@ static void mlx5_sf_deactivate_all(struct mlx5_sf_table *table) static void mlx5_sf_table_disable(struct mlx5_sf_table *table) { - if (!mlx5_sf_max_functions(table->dev)) - return; - if (!refcount_read(&table->refcount)) return; @@ -498,7 +501,8 @@ static int mlx5_sf_esw_event(struct notifier_block *nb, unsigned long event, voi static bool mlx5_sf_table_supported(const struct mlx5_core_dev *dev) { - return dev->priv.eswitch && MLX5_ESWITCH_MANAGER(dev) && mlx5_sf_supported(dev); + return dev->priv.eswitch && MLX5_ESWITCH_MANAGER(dev) && + mlx5_sf_hw_table_supported(dev); } int mlx5_sf_table_init(struct mlx5_core_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c index ec53c11c8344..ef5f892aafad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c @@ -8,6 +8,7 @@ #include "ecpf.h" #include "vhca_event.h" #include "mlx5_core.h" +#include "eswitch.h" struct mlx5_sf_hw { u32 usr_sfnum; @@ -15,59 +16,113 @@ struct mlx5_sf_hw { u8 pending_delete: 1; }; +struct mlx5_sf_hwc_table { + struct mlx5_sf_hw *sfs; + int max_fn; + u16 start_fn_id; +}; + +enum mlx5_sf_hwc_index { + MLX5_SF_HWC_LOCAL, + MLX5_SF_HWC_EXTERNAL, + MLX5_SF_HWC_MAX, +}; + struct mlx5_sf_hw_table { struct mlx5_core_dev *dev; - struct mlx5_sf_hw *sfs; - int max_local_functions; struct mutex table_lock; /* Serializes sf deletion and vhca state change handler. */ struct notifier_block vhca_nb; + struct mlx5_sf_hwc_table hwc[MLX5_SF_HWC_MAX]; }; -u16 mlx5_sf_sw_to_hw_id(const struct mlx5_core_dev *dev, u16 sw_id) +static struct mlx5_sf_hwc_table * +mlx5_sf_controller_to_hwc(struct mlx5_core_dev *dev, u32 controller) { - return sw_id + mlx5_sf_start_function_id(dev); + int idx = !!controller; + + return &dev->priv.sf_hw_table->hwc[idx]; } -static u16 mlx5_sf_hw_to_sw_id(const struct mlx5_core_dev *dev, u16 hw_id) +u16 mlx5_sf_sw_to_hw_id(struct mlx5_core_dev *dev, u32 controller, u16 sw_id) { - return hw_id - mlx5_sf_start_function_id(dev); + struct mlx5_sf_hwc_table *hwc; + + hwc = mlx5_sf_controller_to_hwc(dev, controller); + return hwc->start_fn_id + sw_id; } -int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 usr_sfnum) +static u16 mlx5_sf_hw_to_sw_id(struct mlx5_sf_hwc_table *hwc, u16 hw_id) +{ + return hw_id - hwc->start_fn_id; +} + +static struct mlx5_sf_hwc_table * +mlx5_sf_table_fn_to_hwc(struct mlx5_sf_hw_table *table, u16 fn_id) { - struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; - int sw_id = -ENOSPC; - u16 hw_fn_id; - int err; int i; - if (!table->max_local_functions) - return -EOPNOTSUPP; + for (i = 0; i < ARRAY_SIZE(table->hwc); i++) { + if (table->hwc[i].max_fn && + fn_id >= table->hwc[i].start_fn_id && + fn_id < (table->hwc[i].start_fn_id + table->hwc[i].max_fn)) + return &table->hwc[i]; + } + return NULL; +} + +static int mlx5_sf_hw_table_id_alloc(struct mlx5_sf_hw_table *table, u32 controller, + u32 usr_sfnum) +{ + struct mlx5_sf_hwc_table *hwc; + int i; + + hwc = mlx5_sf_controller_to_hwc(table->dev, controller); + if (!hwc->sfs) + return -ENOSPC; - mutex_lock(&table->table_lock); /* Check if sf with same sfnum already exists or not. */ - for (i = 0; i < table->max_local_functions; i++) { - if (table->sfs[i].allocated && table->sfs[i].usr_sfnum == usr_sfnum) { - err = -EEXIST; - goto exist_err; - } + for (i = 0; i < hwc->max_fn; i++) { + if (hwc->sfs[i].allocated && hwc->sfs[i].usr_sfnum == usr_sfnum) + return -EEXIST; } - /* Find the free entry and allocate the entry from the array */ - for (i = 0; i < table->max_local_functions; i++) { - if (!table->sfs[i].allocated) { - table->sfs[i].usr_sfnum = usr_sfnum; - table->sfs[i].allocated = true; - sw_id = i; - break; + for (i = 0; i < hwc->max_fn; i++) { + if (!hwc->sfs[i].allocated) { + hwc->sfs[i].usr_sfnum = usr_sfnum; + hwc->sfs[i].allocated = true; + return i; } } - if (sw_id == -ENOSPC) { - err = -ENOSPC; + return -ENOSPC; +} + +static void mlx5_sf_hw_table_id_free(struct mlx5_sf_hw_table *table, u32 controller, int id) +{ + struct mlx5_sf_hwc_table *hwc; + + hwc = mlx5_sf_controller_to_hwc(table->dev, controller); + hwc->sfs[id].allocated = false; + hwc->sfs[id].pending_delete = false; +} + +int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 controller, u32 usr_sfnum) +{ + struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; + u16 hw_fn_id; + int sw_id; + int err; + + if (!table) + return -EOPNOTSUPP; + + mutex_lock(&table->table_lock); + sw_id = mlx5_sf_hw_table_id_alloc(table, controller, usr_sfnum); + if (sw_id < 0) { + err = sw_id; goto exist_err; } - hw_fn_id = mlx5_sf_sw_to_hw_id(dev, sw_id); + hw_fn_id = mlx5_sf_sw_to_hw_id(dev, controller, sw_id); err = mlx5_cmd_alloc_sf(dev, hw_fn_id); if (err) goto err; @@ -76,47 +131,58 @@ int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 usr_sfnum) if (err) goto vhca_err; + if (controller) { + /* If this SF is for external controller, SF manager + * needs to arm firmware to receive the events. + */ + err = mlx5_vhca_event_arm(dev, hw_fn_id); + if (err) + goto vhca_err; + } + mutex_unlock(&table->table_lock); return sw_id; vhca_err: mlx5_cmd_dealloc_sf(dev, hw_fn_id); err: - table->sfs[i].allocated = false; + mlx5_sf_hw_table_id_free(table, controller, sw_id); exist_err: mutex_unlock(&table->table_lock); return err; } -static void _mlx5_sf_hw_id_free(struct mlx5_core_dev *dev, u16 id) +void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u32 controller, u16 id) { struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; u16 hw_fn_id; - hw_fn_id = mlx5_sf_sw_to_hw_id(dev, id); + mutex_lock(&table->table_lock); + hw_fn_id = mlx5_sf_sw_to_hw_id(dev, controller, id); mlx5_cmd_dealloc_sf(dev, hw_fn_id); - table->sfs[id].allocated = false; - table->sfs[id].pending_delete = false; + mlx5_sf_hw_table_id_free(table, controller, id); + mutex_unlock(&table->table_lock); } -void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u16 id) +static void mlx5_sf_hw_table_hwc_sf_free(struct mlx5_core_dev *dev, + struct mlx5_sf_hwc_table *hwc, int idx) { - struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; - - mutex_lock(&table->table_lock); - _mlx5_sf_hw_id_free(dev, id); - mutex_unlock(&table->table_lock); + mlx5_cmd_dealloc_sf(dev, hwc->start_fn_id + idx); + hwc->sfs[idx].allocated = false; + hwc->sfs[idx].pending_delete = false; } -void mlx5_sf_hw_table_sf_deferred_free(struct mlx5_core_dev *dev, u16 id) +void mlx5_sf_hw_table_sf_deferred_free(struct mlx5_core_dev *dev, u32 controller, u16 id) { struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; u32 out[MLX5_ST_SZ_DW(query_vhca_state_out)] = {}; + struct mlx5_sf_hwc_table *hwc; u16 hw_fn_id; u8 state; int err; - hw_fn_id = mlx5_sf_sw_to_hw_id(dev, id); + hw_fn_id = mlx5_sf_sw_to_hw_id(dev, controller, id); + hwc = mlx5_sf_controller_to_hwc(dev, controller); mutex_lock(&table->table_lock); err = mlx5_cmd_query_vhca_state(dev, hw_fn_id, out, sizeof(out)); if (err) @@ -124,53 +190,102 @@ void mlx5_sf_hw_table_sf_deferred_free(struct mlx5_core_dev *dev, u16 id) state = MLX5_GET(query_vhca_state_out, out, vhca_state_context.vhca_state); if (state == MLX5_VHCA_STATE_ALLOCATED) { mlx5_cmd_dealloc_sf(dev, hw_fn_id); - table->sfs[id].allocated = false; + hwc->sfs[id].allocated = false; } else { - table->sfs[id].pending_delete = true; + hwc->sfs[id].pending_delete = true; } err: mutex_unlock(&table->table_lock); } -static void mlx5_sf_hw_dealloc_all(struct mlx5_sf_hw_table *table) +static void mlx5_sf_hw_table_hwc_dealloc_all(struct mlx5_core_dev *dev, + struct mlx5_sf_hwc_table *hwc) { int i; - for (i = 0; i < table->max_local_functions; i++) { - if (table->sfs[i].allocated) - _mlx5_sf_hw_id_free(table->dev, i); + for (i = 0; i < hwc->max_fn; i++) { + if (hwc->sfs[i].allocated) + mlx5_sf_hw_table_hwc_sf_free(dev, hwc, i); } } +static void mlx5_sf_hw_table_dealloc_all(struct mlx5_sf_hw_table *table) +{ + mlx5_sf_hw_table_hwc_dealloc_all(table->dev, &table->hwc[MLX5_SF_HWC_EXTERNAL]); + mlx5_sf_hw_table_hwc_dealloc_all(table->dev, &table->hwc[MLX5_SF_HWC_LOCAL]); +} + +static int mlx5_sf_hw_table_hwc_init(struct mlx5_sf_hwc_table *hwc, u16 max_fn, u16 base_id) +{ + struct mlx5_sf_hw *sfs; + + if (!max_fn) + return 0; + + sfs = kcalloc(max_fn, sizeof(*sfs), GFP_KERNEL); + if (!sfs) + return -ENOMEM; + + hwc->sfs = sfs; + hwc->max_fn = max_fn; + hwc->start_fn_id = base_id; + return 0; +} + +static void mlx5_sf_hw_table_hwc_cleanup(struct mlx5_sf_hwc_table *hwc) +{ + kfree(hwc->sfs); +} + int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev) { struct mlx5_sf_hw_table *table; - struct mlx5_sf_hw *sfs; - int max_functions; + u16 max_ext_fn = 0; + u16 ext_base_id; + u16 max_fn = 0; + u16 base_id; + int err; - if (!mlx5_sf_supported(dev) || !mlx5_vhca_event_supported(dev)) + if (!mlx5_vhca_event_supported(dev)) + return 0; + + if (mlx5_sf_supported(dev)) + max_fn = mlx5_sf_max_functions(dev); + + err = mlx5_esw_sf_max_hpf_functions(dev, &max_ext_fn, &ext_base_id); + if (err) + return err; + + if (!max_fn && !max_ext_fn) return 0; - max_functions = mlx5_sf_max_functions(dev); table = kzalloc(sizeof(*table), GFP_KERNEL); if (!table) return -ENOMEM; - sfs = kcalloc(max_functions, sizeof(*sfs), GFP_KERNEL); - if (!sfs) - goto table_err; - mutex_init(&table->table_lock); table->dev = dev; - table->sfs = sfs; - table->max_local_functions = max_functions; dev->priv.sf_hw_table = table; - mlx5_core_dbg(dev, "SF HW table: max sfs = %d\n", max_functions); + + base_id = mlx5_sf_start_function_id(dev); + err = mlx5_sf_hw_table_hwc_init(&table->hwc[MLX5_SF_HWC_LOCAL], max_fn, base_id); + if (err) + goto table_err; + + err = mlx5_sf_hw_table_hwc_init(&table->hwc[MLX5_SF_HWC_EXTERNAL], + max_ext_fn, ext_base_id); + if (err) + goto ext_err; + + mlx5_core_dbg(dev, "SF HW table: max sfs = %d, ext sfs = %d\n", max_fn, max_ext_fn); return 0; +ext_err: + mlx5_sf_hw_table_hwc_cleanup(&table->hwc[MLX5_SF_HWC_LOCAL]); table_err: + mutex_destroy(&table->table_lock); kfree(table); - return -ENOMEM; + return err; } void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev) @@ -181,7 +296,8 @@ void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev) return; mutex_destroy(&table->table_lock); - kfree(table->sfs); + mlx5_sf_hw_table_hwc_cleanup(&table->hwc[MLX5_SF_HWC_EXTERNAL]); + mlx5_sf_hw_table_hwc_cleanup(&table->hwc[MLX5_SF_HWC_LOCAL]); kfree(table); } @@ -189,21 +305,26 @@ static int mlx5_sf_hw_vhca_event(struct notifier_block *nb, unsigned long opcode { struct mlx5_sf_hw_table *table = container_of(nb, struct mlx5_sf_hw_table, vhca_nb); const struct mlx5_vhca_state_event *event = data; + struct mlx5_sf_hwc_table *hwc; struct mlx5_sf_hw *sf_hw; u16 sw_id; if (event->new_vhca_state != MLX5_VHCA_STATE_ALLOCATED) return 0; - sw_id = mlx5_sf_hw_to_sw_id(table->dev, event->function_id); - sf_hw = &table->sfs[sw_id]; + hwc = mlx5_sf_table_fn_to_hwc(table, event->function_id); + if (!hwc) + return 0; + + sw_id = mlx5_sf_hw_to_sw_id(hwc, event->function_id); + sf_hw = &hwc->sfs[sw_id]; mutex_lock(&table->table_lock); /* SF driver notified through firmware that SF is finally detached. * Hence recycle the sf hardware id for reuse. */ if (sf_hw->allocated && sf_hw->pending_delete) - _mlx5_sf_hw_id_free(table->dev, sw_id); + mlx5_sf_hw_table_hwc_sf_free(table->dev, hwc, sw_id); mutex_unlock(&table->table_lock); return 0; } @@ -228,5 +349,10 @@ void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev) mlx5_vhca_event_notifier_unregister(dev, &table->vhca_nb); /* Dealloc SFs whose firmware event has been missed. */ - mlx5_sf_hw_dealloc_all(table); + mlx5_sf_hw_table_dealloc_all(table); +} + +bool mlx5_sf_hw_table_supported(const struct mlx5_core_dev *dev) +{ + return !!dev->priv.sf_hw_table; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h index cb02a51d0986..7114f3fc335f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h @@ -12,10 +12,11 @@ int mlx5_cmd_dealloc_sf(struct mlx5_core_dev *dev, u16 function_id); int mlx5_cmd_sf_enable_hca(struct mlx5_core_dev *dev, u16 func_id); int mlx5_cmd_sf_disable_hca(struct mlx5_core_dev *dev, u16 func_id); -u16 mlx5_sf_sw_to_hw_id(const struct mlx5_core_dev *dev, u16 sw_id); +u16 mlx5_sf_sw_to_hw_id(struct mlx5_core_dev *dev, u32 controller, u16 sw_id); -int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 usr_sfnum); -void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u16 id); -void mlx5_sf_hw_table_sf_deferred_free(struct mlx5_core_dev *dev, u16 id); +int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 controller, u32 usr_sfnum); +void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u32 controller, u16 id); +void mlx5_sf_hw_table_sf_deferred_free(struct mlx5_core_dev *dev, u32 controller, u16 id); +bool mlx5_sf_hw_table_supported(const struct mlx5_core_dev *dev); #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index e05c5c0f3ae1..457ad42eaa2a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -1151,20 +1151,6 @@ u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev) } EXPORT_SYMBOL_GPL(mlx5_query_nic_system_image_guid); -/** - * mlx5_eswitch_get_total_vports - Get total vports of the eswitch - * - * @dev: Pointer to core device - * - * mlx5_eswitch_get_total_vports returns total number of vports for - * the eswitch. - */ -u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev) -{ - return MLX5_SPECIAL_VPORTS(dev) + mlx5_core_max_vfs(dev) + mlx5_sf_max_functions(dev); -} -EXPORT_SYMBOL_GPL(mlx5_eswitch_get_total_vports); - int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out) { u16 opmod = (MLX5_CAP_GENERAL << 1) | (HCA_CAP_OPMOD_GET_MAX & 0x01); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c index 713ee3041d49..bea978df7713 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c @@ -364,6 +364,7 @@ int nfp_devlink_port_register(struct nfp_app *app, struct nfp_port *port) attrs.split = eth_port.is_split; attrs.splittable = !attrs.split; + attrs.lanes = eth_port.port_lanes; attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; attrs.phys.port_number = eth_port.label_port; attrs.phys.split_subport_number = eth_port.label_subport; diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 9e5dad41cdc9..4afff320dfd0 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -913,31 +913,20 @@ static int ravb_poll(struct napi_struct *napi, int budget) int q = napi - priv->napi; int mask = BIT(q); int quota = budget; - u32 ris0, tis; - for (;;) { - tis = ravb_read(ndev, TIS); - ris0 = ravb_read(ndev, RIS0); - if (!((ris0 & mask) || (tis & mask))) - break; + /* Processing RX Descriptor Ring */ + /* Clear RX interrupt */ + ravb_write(ndev, ~(mask | RIS0_RESERVED), RIS0); + if (ravb_rx(ndev, "a, q)) + goto out; - /* Processing RX Descriptor Ring */ - if (ris0 & mask) { - /* Clear RX interrupt */ - ravb_write(ndev, ~(mask | RIS0_RESERVED), RIS0); - if (ravb_rx(ndev, "a, q)) - goto out; - } - /* Processing TX Descriptor Ring */ - if (tis & mask) { - spin_lock_irqsave(&priv->lock, flags); - /* Clear TX interrupt */ - ravb_write(ndev, ~(mask | TIS_RESERVED), TIS); - ravb_tx_free(ndev, q, true); - netif_wake_subqueue(ndev, q); - spin_unlock_irqrestore(&priv->lock, flags); - } - } + /* Processing RX Descriptor Ring */ + spin_lock_irqsave(&priv->lock, flags); + /* Clear TX interrupt */ + ravb_write(ndev, ~(mask | TIS_RESERVED), TIS); + ravb_tx_free(ndev, q, true); + netif_wake_subqueue(ndev, q); + spin_unlock_irqrestore(&priv->lock, flags); napi_complete(napi); diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index c873f961d5a5..c3f35da1b82a 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -2944,8 +2944,7 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) /* Get the transmit queue */ tx_ev_q_label = EFX_QWORD_FIELD(*event, ESF_DZ_TX_QLABEL); - tx_queue = efx_channel_get_tx_queue(channel, - tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); + tx_queue = channel->tx_queue + (tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); if (!tx_queue->timestamping) { /* Transmit completion */ diff --git a/drivers/net/ethernet/sfc/farch.c b/drivers/net/ethernet/sfc/farch.c index d75cf5ff5686..49df02ecee91 100644 --- a/drivers/net/ethernet/sfc/farch.c +++ b/drivers/net/ethernet/sfc/farch.c @@ -835,14 +835,14 @@ efx_farch_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) /* Transmit completion */ tx_ev_desc_ptr = EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_DESC_PTR); tx_ev_q_label = EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_Q_LABEL); - tx_queue = efx_channel_get_tx_queue( - channel, tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); + tx_queue = channel->tx_queue + + (tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); efx_xmit_done(tx_queue, tx_ev_desc_ptr); } else if (EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_WQ_FF_FULL)) { /* Rewrite the FIFO write pointer */ tx_ev_q_label = EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_Q_LABEL); - tx_queue = efx_channel_get_tx_queue( - channel, tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); + tx_queue = channel->tx_queue + + (tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); netif_tx_lock(efx->net_dev); efx_farch_notify_tx_desc(tx_queue); @@ -1081,16 +1081,16 @@ static void efx_farch_handle_tx_flush_done(struct efx_nic *efx, efx_qword_t *event) { struct efx_tx_queue *tx_queue; + struct efx_channel *channel; int qid; qid = EFX_QWORD_FIELD(*event, FSF_AZ_DRIVER_EV_SUBDATA); if (qid < EFX_MAX_TXQ_PER_CHANNEL * (efx->n_tx_channels + efx->n_extra_tx_channels)) { - tx_queue = efx_get_tx_queue(efx, qid / EFX_MAX_TXQ_PER_CHANNEL, - qid % EFX_MAX_TXQ_PER_CHANNEL); - if (atomic_cmpxchg(&tx_queue->flush_outstanding, 1, 0)) { + channel = efx_get_tx_channel(efx, qid / EFX_MAX_TXQ_PER_CHANNEL); + tx_queue = channel->tx_queue + (qid % EFX_MAX_TXQ_PER_CHANNEL); + if (atomic_cmpxchg(&tx_queue->flush_outstanding, 1, 0)) efx_farch_magic_event(tx_queue->channel, EFX_CHANNEL_MAGIC_TX_DRAIN(tx_queue)); - } } } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 372090e8ee6f..a9a984c57d78 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3303,8 +3303,15 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp) /* Enable TSO */ if (priv->tso) { - for (chan = 0; chan < tx_cnt; chan++) + for (chan = 0; chan < tx_cnt; chan++) { + struct stmmac_tx_queue *tx_q = &priv->tx_queue[chan]; + + /* TSO and TBS cannot co-exist */ + if (tx_q->tbs & STMMAC_TBS_AVAIL) + continue; + stmmac_enable_tso(priv, priv->ioaddr, 1, chan); + } } /* Enable Split Header */ @@ -3674,9 +3681,8 @@ int stmmac_open(struct net_device *dev) struct stmmac_tx_queue *tx_q = &priv->tx_queue[chan]; int tbs_en = priv->plat->tx_queues_cfg[chan].tbs_en; + /* Setup per-TXQ tbs flag before TX descriptor alloc */ tx_q->tbs |= tbs_en ? STMMAC_TBS_AVAIL : 0; - if (stmmac_enable_tbs(priv, priv->ioaddr, tbs_en, chan)) - tx_q->tbs &= ~STMMAC_TBS_AVAIL; } ret = alloc_dma_desc_resources(priv); diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index 14e7da7d302f..f9417b44cae8 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -169,11 +169,11 @@ static const char emac_version_string[] = "TI DaVinci EMAC Linux v6.1"; /* EMAC mac_status register */ #define EMAC_MACSTATUS_TXERRCODE_MASK (0xF00000) #define EMAC_MACSTATUS_TXERRCODE_SHIFT (20) -#define EMAC_MACSTATUS_TXERRCH_MASK (0x7) +#define EMAC_MACSTATUS_TXERRCH_MASK (0x70000) #define EMAC_MACSTATUS_TXERRCH_SHIFT (16) #define EMAC_MACSTATUS_RXERRCODE_MASK (0xF000) #define EMAC_MACSTATUS_RXERRCODE_SHIFT (12) -#define EMAC_MACSTATUS_RXERRCH_MASK (0x7) +#define EMAC_MACSTATUS_RXERRCH_MASK (0x700) #define EMAC_MACSTATUS_RXERRCH_SHIFT (8) /* EMAC RX register masks */ diff --git a/drivers/net/ethernet/xscale/Kconfig b/drivers/net/ethernet/xscale/Kconfig index 7b83a6e5d894..468ffe3d1707 100644 --- a/drivers/net/ethernet/xscale/Kconfig +++ b/drivers/net/ethernet/xscale/Kconfig @@ -22,6 +22,7 @@ config IXP4XX_ETH tristate "Intel IXP4xx Ethernet support" depends on ARM && ARCH_IXP4XX && IXP4XX_NPE && IXP4XX_QMGR select PHYLIB + select OF_MDIO if OF select NET_PTP_CLASSIFY help Say Y here if you want to use built-in Ethernet ports diff --git a/drivers/net/ethernet/xscale/ixp4xx_eth.c b/drivers/net/ethernet/xscale/ixp4xx_eth.c index 0152f1e70783..cb89323855d8 100644 --- a/drivers/net/ethernet/xscale/ixp4xx_eth.c +++ b/drivers/net/ethernet/xscale/ixp4xx_eth.c @@ -28,6 +28,7 @@ #include <linux/kernel.h> #include <linux/net_tstamp.h> #include <linux/of.h> +#include <linux/of_mdio.h> #include <linux/phy.h> #include <linux/platform_data/eth_ixp4xx.h> #include <linux/platform_device.h> @@ -165,7 +166,6 @@ struct eth_regs { }; struct port { - struct resource *mem_res; struct eth_regs __iomem *regs; struct npe *npe; struct net_device *netdev; @@ -250,6 +250,7 @@ static inline void memcpy_swab32(u32 *dest, u32 *src, int cnt) static DEFINE_SPINLOCK(mdio_lock); static struct eth_regs __iomem *mdio_regs; /* mdio command and status only */ static struct mii_bus *mdio_bus; +static struct device_node *mdio_bus_np; static int ports_open; static struct port *npe_port_tab[MAX_NPES]; static struct dma_pool *dma_pool; @@ -533,7 +534,8 @@ static int ixp4xx_mdio_register(struct eth_regs __iomem *regs) mdio_bus->write = &ixp4xx_mdio_write; snprintf(mdio_bus->id, MII_BUS_ID_SIZE, "ixp4xx-eth-0"); - if ((err = mdiobus_register(mdio_bus))) + err = of_mdiobus_register(mdio_bus, mdio_bus_np); + if (err) mdiobus_free(mdio_bus); return err; } @@ -1085,7 +1087,7 @@ static int init_queues(struct port *port) int i; if (!ports_open) { - dma_pool = dma_pool_create(DRV_NAME, port->netdev->dev.parent, + dma_pool = dma_pool_create(DRV_NAME, &port->netdev->dev, POOL_ALLOC_SIZE, 32, 0); if (!dma_pool) return -ENOMEM; @@ -1358,19 +1360,118 @@ static const struct net_device_ops ixp4xx_netdev_ops = { .ndo_validate_addr = eth_validate_addr, }; +#ifdef CONFIG_OF +static struct eth_plat_info *ixp4xx_of_get_platdata(struct device *dev) +{ + struct device_node *np = dev->of_node; + struct of_phandle_args queue_spec; + struct of_phandle_args npe_spec; + struct device_node *mdio_np; + struct eth_plat_info *plat; + int ret; + + plat = devm_kzalloc(dev, sizeof(*plat), GFP_KERNEL); + if (!plat) + return NULL; + + ret = of_parse_phandle_with_fixed_args(np, "intel,npe-handle", 1, 0, + &npe_spec); + if (ret) { + dev_err(dev, "no NPE engine specified\n"); + return NULL; + } + /* NPE ID 0x00, 0x10, 0x20... */ + plat->npe = (npe_spec.args[0] << 4); + + /* Check if this device has an MDIO bus */ + mdio_np = of_get_child_by_name(np, "mdio"); + if (mdio_np) { + plat->has_mdio = true; + mdio_bus_np = mdio_np; + /* DO NOT put the mdio_np, it will be used */ + } + + /* Get the rx queue as a resource from queue manager */ + ret = of_parse_phandle_with_fixed_args(np, "queue-rx", 1, 0, + &queue_spec); + if (ret) { + dev_err(dev, "no rx queue phandle\n"); + return NULL; + } + plat->rxq = queue_spec.args[0]; + + /* Get the txready queue as resource from queue manager */ + ret = of_parse_phandle_with_fixed_args(np, "queue-txready", 1, 0, + &queue_spec); + if (ret) { + dev_err(dev, "no txready queue phandle\n"); + return NULL; + } + plat->txreadyq = queue_spec.args[0]; + + return plat; +} +#else +static struct eth_plat_info *ixp4xx_of_get_platdata(struct device *dev) +{ + return NULL; +} +#endif + static int ixp4xx_eth_probe(struct platform_device *pdev) { - char phy_id[MII_BUS_ID_SIZE + 3]; struct phy_device *phydev = NULL; struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; struct eth_plat_info *plat; - resource_size_t regs_phys; struct net_device *ndev; struct resource *res; struct port *port; int err; - plat = dev_get_platdata(dev); + if (np) { + plat = ixp4xx_of_get_platdata(dev); + if (!plat) + return -ENODEV; + } else { + plat = dev_get_platdata(dev); + if (!plat) + return -ENODEV; + plat->npe = pdev->id; + switch (plat->npe) { + case IXP4XX_ETH_NPEA: + /* If the MDIO bus is not up yet, defer probe */ + break; + case IXP4XX_ETH_NPEB: + /* On all except IXP43x, NPE-B is used for the MDIO bus. + * If there is no NPE-B in the feature set, bail out, + * else we have the MDIO bus here. + */ + if (!cpu_is_ixp43x()) { + if (!(ixp4xx_read_feature_bits() & + IXP4XX_FEATURE_NPEB_ETH0)) + return -ENODEV; + /* Else register the MDIO bus on NPE-B */ + plat->has_mdio = true; + } + break; + case IXP4XX_ETH_NPEC: + /* IXP43x lacks NPE-B and uses NPE-C for the MDIO bus + * access, if there is no NPE-C, no bus, nothing works, + * so bail out. + */ + if (cpu_is_ixp43x()) { + if (!(ixp4xx_read_feature_bits() & + IXP4XX_FEATURE_NPEC_ETH)) + return -ENODEV; + /* Else register the MDIO bus on NPE-B */ + plat->has_mdio = true; + } + break; + default: + return -ENODEV; + } + } if (!(ndev = devm_alloc_etherdev(dev, sizeof(struct port)))) return -ENOMEM; @@ -1378,75 +1479,42 @@ static int ixp4xx_eth_probe(struct platform_device *pdev) SET_NETDEV_DEV(ndev, dev); port = netdev_priv(ndev); port->netdev = ndev; - port->id = pdev->id; + port->id = plat->npe; /* Get the port resource and remap */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) return -ENODEV; - regs_phys = res->start; port->regs = devm_ioremap_resource(dev, res); if (IS_ERR(port->regs)) return PTR_ERR(port->regs); - switch (port->id) { - case IXP4XX_ETH_NPEA: - /* If the MDIO bus is not up yet, defer probe */ - if (!mdio_bus) - return -EPROBE_DEFER; - break; - case IXP4XX_ETH_NPEB: - /* - * On all except IXP43x, NPE-B is used for the MDIO bus. - * If there is no NPE-B in the feature set, bail out, else - * register the MDIO bus. - */ - if (!cpu_is_ixp43x()) { - if (!(ixp4xx_read_feature_bits() & - IXP4XX_FEATURE_NPEB_ETH0)) - return -ENODEV; - /* Else register the MDIO bus on NPE-B */ - if ((err = ixp4xx_mdio_register(port->regs))) - return err; - } - if (!mdio_bus) - return -EPROBE_DEFER; - break; - case IXP4XX_ETH_NPEC: - /* - * IXP43x lacks NPE-B and uses NPE-C for the MDIO bus access, - * of there is no NPE-C, no bus, nothing works, so bail out. - */ - if (cpu_is_ixp43x()) { - if (!(ixp4xx_read_feature_bits() & - IXP4XX_FEATURE_NPEC_ETH)) - return -ENODEV; - /* Else register the MDIO bus on NPE-C */ - if ((err = ixp4xx_mdio_register(port->regs))) - return err; + /* Register the MDIO bus if we have it */ + if (plat->has_mdio) { + err = ixp4xx_mdio_register(port->regs); + if (err) { + dev_err(dev, "failed to register MDIO bus\n"); + return err; } - if (!mdio_bus) - return -EPROBE_DEFER; - break; - default: - return -ENODEV; } + /* If the instance with the MDIO bus has not yet appeared, + * defer probing until it gets probed. + */ + if (!mdio_bus) + return -EPROBE_DEFER; ndev->netdev_ops = &ixp4xx_netdev_ops; ndev->ethtool_ops = &ixp4xx_ethtool_ops; ndev->tx_queue_len = 100; + /* Inherit the DMA masks from the platform device */ + ndev->dev.dma_mask = dev->dma_mask; + ndev->dev.coherent_dma_mask = dev->coherent_dma_mask; netif_napi_add(ndev, &port->napi, eth_poll, NAPI_WEIGHT); if (!(port->npe = npe_request(NPE_ID(port->id)))) return -EIO; - port->mem_res = request_mem_region(regs_phys, REGS_SIZE, ndev->name); - if (!port->mem_res) { - err = -EBUSY; - goto err_npe_rel; - } - port->plat = plat; npe_port_tab[NPE_ID(port->id)] = port; memcpy(ndev->dev_addr, plat->hwaddr, ETH_ALEN); @@ -1459,12 +1527,24 @@ static int ixp4xx_eth_probe(struct platform_device *pdev) __raw_writel(DEFAULT_CORE_CNTRL, &port->regs->core_control); udelay(50); - snprintf(phy_id, MII_BUS_ID_SIZE + 3, PHY_ID_FMT, - mdio_bus->id, plat->phy); - phydev = phy_connect(ndev, phy_id, &ixp4xx_adjust_link, - PHY_INTERFACE_MODE_MII); - if (IS_ERR(phydev)) { - err = PTR_ERR(phydev); + if (np) { + phydev = of_phy_get_and_connect(ndev, np, ixp4xx_adjust_link); + } else { + phydev = mdiobus_get_phy(mdio_bus, plat->phy); + if (IS_ERR(phydev)) { + err = PTR_ERR(phydev); + dev_err(dev, "could not connect phydev (%d)\n", err); + goto err_free_mem; + } + err = phy_connect_direct(ndev, phydev, ixp4xx_adjust_link, + PHY_INTERFACE_MODE_MII); + if (err) + goto err_free_mem; + + } + if (!phydev) { + err = -ENODEV; + dev_err(dev, "no phydev\n"); goto err_free_mem; } @@ -1482,8 +1562,6 @@ err_phy_dis: phy_disconnect(phydev); err_free_mem: npe_port_tab[NPE_ID(port->id)] = NULL; - release_resource(port->mem_res); -err_npe_rel: npe_release(port->npe); return err; } @@ -1499,12 +1577,21 @@ static int ixp4xx_eth_remove(struct platform_device *pdev) ixp4xx_mdio_remove(); npe_port_tab[NPE_ID(port->id)] = NULL; npe_release(port->npe); - release_resource(port->mem_res); return 0; } +static const struct of_device_id ixp4xx_eth_of_match[] = { + { + .compatible = "intel,ixp4xx-ethernet", + }, + { }, +}; + static struct platform_driver ixp4xx_eth_driver = { - .driver.name = DRV_NAME, + .driver = { + .name = DRV_NAME, + .of_match_table = of_match_ptr(ixp4xx_eth_of_match), + }, .probe = ixp4xx_eth_probe, .remove = ixp4xx_eth_remove, }; diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 072de880b99f..1ab94b5f9bbf 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -892,7 +892,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) + if (!pskb_inet_may_pull(skb)) return -EINVAL; sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); @@ -989,7 +989,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) + if (!pskb_inet_may_pull(skb)) return -EINVAL; sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 7349a70af083..f682a5572d84 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2297,6 +2297,7 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev) { struct device *parent = vf_netdev->dev.parent; struct net_device_context *ndev_ctx; + struct net_device *ndev; struct pci_dev *pdev; u32 serial; @@ -2319,8 +2320,17 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev) if (!ndev_ctx->vf_alloc) continue; - if (ndev_ctx->vf_serial == serial) - return hv_get_drvdata(ndev_ctx->device_ctx); + if (ndev_ctx->vf_serial != serial) + continue; + + ndev = hv_get_drvdata(ndev_ctx->device_ctx); + if (ndev->addr_len != vf_netdev->addr_len || + memcmp(ndev->perm_addr, vf_netdev->perm_addr, + ndev->addr_len) != 0) + continue; + + return ndev; + } netdev_notice(vf_netdev, diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 9a9a5cf36a4b..7427b989607e 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -423,18 +423,24 @@ static void macvlan_forward_source_one(struct sk_buff *skb, macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); } -static void macvlan_forward_source(struct sk_buff *skb, +static bool macvlan_forward_source(struct sk_buff *skb, struct macvlan_port *port, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &port->vlan_source_hash[idx]; + bool consume = false; hlist_for_each_entry_rcu(entry, h, hlist) { - if (ether_addr_equal_64bits(entry->addr, addr)) + if (ether_addr_equal_64bits(entry->addr, addr)) { + if (entry->vlan->flags & MACVLAN_FLAG_NODST) + consume = true; macvlan_forward_source_one(skb, entry->vlan); + } } + + return consume; } /* called under rcu_read_lock() from netif_receive_skb */ @@ -463,7 +469,8 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) return RX_HANDLER_CONSUMED; *pskb = skb; eth = eth_hdr(skb); - macvlan_forward_source(skb, port, eth->h_source); + if (macvlan_forward_source(skb, port, eth->h_source)) + return RX_HANDLER_CONSUMED; src = macvlan_hash_lookup(port, eth->h_source); if (src && src->mode != MACVLAN_MODE_VEPA && src->mode != MACVLAN_MODE_BRIDGE) { @@ -482,7 +489,8 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) return RX_HANDLER_PASS; } - macvlan_forward_source(skb, port, eth->h_source); + if (macvlan_forward_source(skb, port, eth->h_source)) + return RX_HANDLER_CONSUMED; if (macvlan_passthru(port)) vlan = list_first_or_null_rcu(&port->vlans, struct macvlan_dev, list); @@ -1286,7 +1294,8 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[], return 0; if (data[IFLA_MACVLAN_FLAGS] && - nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~MACVLAN_FLAG_NOPROMISC) + nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~(MACVLAN_FLAG_NOPROMISC | + MACVLAN_FLAG_NODST)) return -EINVAL; if (data[IFLA_MACVLAN_MODE]) { diff --git a/drivers/net/phy/intel-xway.c b/drivers/net/phy/intel-xway.c index 6eac50d4b42f..d453ec016168 100644 --- a/drivers/net/phy/intel-xway.c +++ b/drivers/net/phy/intel-xway.c @@ -11,6 +11,18 @@ #define XWAY_MDIO_IMASK 0x19 /* interrupt mask */ #define XWAY_MDIO_ISTAT 0x1A /* interrupt status */ +#define XWAY_MDIO_LED 0x1B /* led control */ + +/* bit 15:12 are reserved */ +#define XWAY_MDIO_LED_LED3_EN BIT(11) /* Enable the integrated function of LED3 */ +#define XWAY_MDIO_LED_LED2_EN BIT(10) /* Enable the integrated function of LED2 */ +#define XWAY_MDIO_LED_LED1_EN BIT(9) /* Enable the integrated function of LED1 */ +#define XWAY_MDIO_LED_LED0_EN BIT(8) /* Enable the integrated function of LED0 */ +/* bit 7:4 are reserved */ +#define XWAY_MDIO_LED_LED3_DA BIT(3) /* Direct Access to LED3 */ +#define XWAY_MDIO_LED_LED2_DA BIT(2) /* Direct Access to LED2 */ +#define XWAY_MDIO_LED_LED1_DA BIT(1) /* Direct Access to LED1 */ +#define XWAY_MDIO_LED_LED0_DA BIT(0) /* Direct Access to LED0 */ #define XWAY_MDIO_INIT_WOL BIT(15) /* Wake-On-LAN */ #define XWAY_MDIO_INIT_MSRE BIT(14) @@ -159,6 +171,15 @@ static int xway_gphy_config_init(struct phy_device *phydev) /* Clear all pending interrupts */ phy_read(phydev, XWAY_MDIO_ISTAT); + /* Ensure that integrated led function is enabled for all leds */ + err = phy_write(phydev, XWAY_MDIO_LED, + XWAY_MDIO_LED_LED0_EN | + XWAY_MDIO_LED_LED1_EN | + XWAY_MDIO_LED_LED2_EN | + XWAY_MDIO_LED_LED3_EN); + if (err) + return err; + phy_write_mmd(phydev, MDIO_MMD_VEND2, XWAY_MMD_LEDCH, XWAY_MMD_LEDCH_NACS_NONE | XWAY_MMD_LEDCH_SBF_F02HZ | diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index e2b2b20c0dc5..a61fde7013bd 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -978,22 +978,28 @@ static int m88e1111_get_downshift(struct phy_device *phydev, u8 *data) static int m88e1111_set_downshift(struct phy_device *phydev, u8 cnt) { - int val; + int val, err; if (cnt > MII_M1111_PHY_EXT_CR_DOWNSHIFT_MAX) return -E2BIG; - if (!cnt) - return phy_clear_bits(phydev, MII_M1111_PHY_EXT_CR, - MII_M1111_PHY_EXT_CR_DOWNSHIFT_EN); + if (!cnt) { + err = phy_clear_bits(phydev, MII_M1111_PHY_EXT_CR, + MII_M1111_PHY_EXT_CR_DOWNSHIFT_EN); + } else { + val = MII_M1111_PHY_EXT_CR_DOWNSHIFT_EN; + val |= FIELD_PREP(MII_M1111_PHY_EXT_CR_DOWNSHIFT_MASK, cnt - 1); - val = MII_M1111_PHY_EXT_CR_DOWNSHIFT_EN; - val |= FIELD_PREP(MII_M1111_PHY_EXT_CR_DOWNSHIFT_MASK, cnt - 1); + err = phy_modify(phydev, MII_M1111_PHY_EXT_CR, + MII_M1111_PHY_EXT_CR_DOWNSHIFT_EN | + MII_M1111_PHY_EXT_CR_DOWNSHIFT_MASK, + val); + } - return phy_modify(phydev, MII_M1111_PHY_EXT_CR, - MII_M1111_PHY_EXT_CR_DOWNSHIFT_EN | - MII_M1111_PHY_EXT_CR_DOWNSHIFT_MASK, - val); + if (err < 0) + return err; + + return genphy_soft_reset(phydev); } static int m88e1111_get_tunable(struct phy_device *phydev, @@ -1036,22 +1042,28 @@ static int m88e1011_get_downshift(struct phy_device *phydev, u8 *data) static int m88e1011_set_downshift(struct phy_device *phydev, u8 cnt) { - int val; + int val, err; if (cnt > MII_M1011_PHY_SCR_DOWNSHIFT_MAX) return -E2BIG; - if (!cnt) - return phy_clear_bits(phydev, MII_M1011_PHY_SCR, - MII_M1011_PHY_SCR_DOWNSHIFT_EN); + if (!cnt) { + err = phy_clear_bits(phydev, MII_M1011_PHY_SCR, + MII_M1011_PHY_SCR_DOWNSHIFT_EN); + } else { + val = MII_M1011_PHY_SCR_DOWNSHIFT_EN; + val |= FIELD_PREP(MII_M1011_PHY_SCR_DOWNSHIFT_MASK, cnt - 1); - val = MII_M1011_PHY_SCR_DOWNSHIFT_EN; - val |= FIELD_PREP(MII_M1011_PHY_SCR_DOWNSHIFT_MASK, cnt - 1); + err = phy_modify(phydev, MII_M1011_PHY_SCR, + MII_M1011_PHY_SCR_DOWNSHIFT_EN | + MII_M1011_PHY_SCR_DOWNSHIFT_MASK, + val); + } - return phy_modify(phydev, MII_M1011_PHY_SCR, - MII_M1011_PHY_SCR_DOWNSHIFT_EN | - MII_M1011_PHY_SCR_DOWNSHIFT_MASK, - val); + if (err < 0) + return err; + + return genphy_soft_reset(phydev); } static int m88e1011_get_tunable(struct phy_device *phydev, diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 9986f8969d02..136ea06540ff 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -767,8 +767,6 @@ enum rtl8152_flags { PHY_RESET, SCHEDULE_TASKLET, GREEN_ETHERNET, - DELL_TB_RX_AGG_BUG, - LENOVO_MACPASSTHRU, }; #define DEVICE_ID_THINKPAD_THUNDERBOLT3_DOCK_GEN2 0x3082 @@ -934,6 +932,8 @@ struct r8152 { u32 fc_pause_on, fc_pause_off; u32 support_2500full:1; + u32 lenovo_macpassthru:1; + u32 dell_tb_rx_agg_bug:1; u16 ocp_base; u16 speed; u16 eee_adv; @@ -1594,7 +1594,7 @@ static int vendor_mac_passthru_addr_read(struct r8152 *tp, struct sockaddr *sa) acpi_object_type mac_obj_type; int mac_strlen; - if (test_bit(LENOVO_MACPASSTHRU, &tp->flags)) { + if (tp->lenovo_macpassthru) { mac_obj_name = "\\MACA"; mac_obj_type = ACPI_TYPE_STRING; mac_strlen = 0x16; @@ -2283,7 +2283,7 @@ static int r8152_tx_agg_fill(struct r8152 *tp, struct tx_agg *agg) remain = agg_buf_sz - (int)(tx_agg_align(tx_data) - agg->head); - if (test_bit(DELL_TB_RX_AGG_BUG, &tp->flags)) + if (tp->dell_tb_rx_agg_bug) break; } @@ -6941,7 +6941,7 @@ static void r8153_init(struct r8152 *tp) /* rx aggregation */ ocp_data = ocp_read_word(tp, MCU_TYPE_USB, USB_USB_CTRL); ocp_data &= ~(RX_AGG_DISABLE | RX_ZERO_EN); - if (test_bit(DELL_TB_RX_AGG_BUG, &tp->flags)) + if (tp->dell_tb_rx_agg_bug) ocp_data |= RX_AGG_DISABLE; ocp_write_word(tp, MCU_TYPE_USB, USB_USB_CTRL, ocp_data); @@ -9447,7 +9447,7 @@ static int rtl8152_probe(struct usb_interface *intf, switch (le16_to_cpu(udev->descriptor.idProduct)) { case DEVICE_ID_THINKPAD_THUNDERBOLT3_DOCK_GEN2: case DEVICE_ID_THINKPAD_USB_C_DOCK_GEN2: - set_bit(LENOVO_MACPASSTHRU, &tp->flags); + tp->lenovo_macpassthru = 1; } } @@ -9455,7 +9455,7 @@ static int rtl8152_probe(struct usb_interface *intf, (!strcmp(udev->serial, "000001000000") || !strcmp(udev->serial, "000002000000"))) { dev_info(&udev->dev, "Dell TB16 Dock, disable RX aggregation"); - set_bit(DELL_TB_RX_AGG_BUG, &tp->flags); + tp->dell_tb_rx_agg_bug = 1; } netdev->ethtool_ops = &ops; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c index 4456abb9a074..34bde8c87324 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c @@ -40,6 +40,7 @@ int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans, const u8 *cmddata[IWL_MAX_CMD_TBS_PER_TFD]; u16 cmdlen[IWL_MAX_CMD_TBS_PER_TFD]; struct iwl_tfh_tfd *tfd; + unsigned long flags; copy_size = sizeof(struct iwl_cmd_header_wide); cmd_size = sizeof(struct iwl_cmd_header_wide); @@ -108,14 +109,14 @@ int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans, goto free_dup_buf; } - spin_lock_bh(&txq->lock); + spin_lock_irqsave(&txq->lock, flags); idx = iwl_txq_get_cmd_index(txq, txq->write_ptr); tfd = iwl_txq_get_tfd(trans, txq, txq->write_ptr); memset(tfd, 0, sizeof(*tfd)); if (iwl_txq_space(trans, txq) < ((cmd->flags & CMD_ASYNC) ? 2 : 1)) { - spin_unlock_bh(&txq->lock); + spin_unlock_irqrestore(&txq->lock, flags); IWL_ERR(trans, "No space in command queue\n"); iwl_op_mode_cmd_queue_full(trans->op_mode); @@ -250,7 +251,7 @@ int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans, spin_unlock(&trans_pcie->reg_lock); out: - spin_unlock_bh(&txq->lock); + spin_unlock_irqrestore(&txq->lock, flags); free_dup_buf: if (idx < 0) kfree(dup_buf); diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 6a29fe11485d..8b77d08d4b47 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -458,7 +458,6 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); #else -struct bpf_prog; struct cgroup_bpf {}; static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c9b7a876b0c8..ad4bcf1cadbb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -310,6 +310,7 @@ enum bpf_arg_type { ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ + ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ __BPF_ARG_TYPE_MAX, }; @@ -930,7 +931,6 @@ struct bpf_link_primer { }; struct bpf_struct_ops_value; -struct btf_type; struct btf_member; #define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64 @@ -1955,6 +1955,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; +extern const struct bpf_func_proto bpf_snprintf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; @@ -2080,4 +2081,24 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); +enum bpf_printf_mod_type { + BPF_PRINTF_INT, + BPF_PRINTF_LONG, + BPF_PRINTF_LONG_LONG, +}; + +/* Workaround for getting va_list handling working with different argument type + * combinations generically for 32 and 64 bit archs. + */ +#define BPF_CAST_FMT_ARG(arg_nb, args, mod) \ + (mod[arg_nb] == BPF_PRINTF_LONG_LONG || \ + (mod[arg_nb] == BPF_PRINTF_LONG && __BITS_PER_LONG == 64) \ + ? (u64)args[arg_nb] \ + : (u32)args[arg_nb]) + +int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u64 *final_args, enum bpf_printf_mod_type *mod, + u32 num_args); +void bpf_printf_cleanup(void); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 51c2ffa3d901..6023a1367853 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -487,6 +487,15 @@ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id; } +/* unpack the IDs from the key as constructed above */ +static inline void bpf_trampoline_unpack_key(u64 key, u32 *obj_id, u32 *btf_id) +{ + if (obj_id) + *obj_id = key >> 32; + if (btf_id) + *btf_id = key & 0x7FFFFFFF; +} + int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 9cf1da2883c6..17109b65c1ac 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -65,8 +65,6 @@ struct mlx5_flow_handle * mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw, struct mlx5_eswitch_rep *rep, u32 sqn); -u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev); - #ifdef CONFIG_MLX5_ESWITCH enum devlink_eswitch_encap_mode mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev); @@ -126,6 +124,8 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw, #define ESW_TUN_SLOW_TABLE_GOTO_VPORT_MARK ESW_TUN_OPTS_MASK u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev); +u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev); + #else /* CONFIG_MLX5_ESWITCH */ static inline u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev) @@ -162,10 +162,17 @@ mlx5_eswitch_get_vport_metadata_mask(void) { return 0; } + +static inline u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev) +{ + return 0; +} + #endif /* CONFIG_MLX5_ESWITCH */ static inline bool is_mdev_switchdev_mode(struct mlx5_core_dev *dev) { return mlx5_eswitch_mode(dev) == MLX5_ESWITCH_OFFLOADS; } + #endif diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 4db87bcfce7b..aad53cb72f17 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -36,14 +36,6 @@ #include <linux/mlx5/driver.h> #include <linux/mlx5/device.h> -#define MLX5_VPORT_PF_PLACEHOLDER (1u) -#define MLX5_VPORT_UPLINK_PLACEHOLDER (1u) -#define MLX5_VPORT_ECPF_PLACEHOLDER(mdev) (mlx5_ecpf_vport_exists(mdev)) - -#define MLX5_SPECIAL_VPORTS(mdev) (MLX5_VPORT_PF_PLACEHOLDER + \ - MLX5_VPORT_UPLINK_PLACEHOLDER + \ - MLX5_VPORT_ECPF_PLACEHOLDER(mdev)) - #define MLX5_VPORT_MANAGER(mdev) \ (MLX5_CAP_GEN(mdev, vport_group_manager) && \ (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && \ diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index d4c14257db5d..515ce53aa20d 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -7,21 +7,26 @@ #include <net/netlink.h> #include <uapi/linux/netfilter/nfnetlink.h> +struct nfnl_info { + struct net *net; + struct sock *sk; + const struct nlmsghdr *nlh; + struct netlink_ext_ack *extack; +}; + +enum nfnl_callback_type { + NFNL_CB_UNSPEC = 0, + NFNL_CB_MUTEX, + NFNL_CB_RCU, + NFNL_CB_BATCH, +}; + struct nfnl_callback { - int (*call)(struct net *net, struct sock *nl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack); - int (*call_rcu)(struct net *net, struct sock *nl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack); - int (*call_batch)(struct net *net, struct sock *nl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack); - const struct nla_policy *policy; /* netlink attribute policy */ - const u_int16_t attr_count; /* number of nlattr's */ + int (*call)(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const cda[]); + const struct nla_policy *policy; + enum nfnl_callback_type type; + __u16 attr_count; }; enum nfnl_abort_action { diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 8ec48466410a..07c6ad8f2a02 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -158,7 +158,7 @@ struct xt_match { /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_mtdtor_param *); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, const void *src); int (*compat_to_user)(void __user *dst, const void *src); @@ -169,7 +169,7 @@ struct xt_match { const char *table; unsigned int matchsize; unsigned int usersize; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT unsigned int compatsize; #endif unsigned int hooks; @@ -199,7 +199,7 @@ struct xt_target { /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_tgdtor_param *); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, const void *src); int (*compat_to_user)(void __user *dst, const void *src); @@ -210,7 +210,7 @@ struct xt_target { const char *table; unsigned int targetsize; unsigned int usersize; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT unsigned int compatsize; #endif unsigned int hooks; @@ -229,6 +229,9 @@ struct xt_table { /* Man behind the curtain... */ struct xt_table_info *private; + /* hook ops that register the table with the netfilter core */ + struct nf_hook_ops *ops; + /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; @@ -322,6 +325,7 @@ struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision); int xt_find_revision(u8 af, const char *name, u8 revision, int target, int *err); +struct xt_table *xt_find_table(struct net *net, u8 af, const char *name); struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name); struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af, @@ -448,7 +452,7 @@ xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include <net/compat.h> struct compat_xt_entry_match { @@ -529,5 +533,5 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ #endif /* _X_TABLES_H */ diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index 26a13294318c..2aab9612f6ab 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -51,15 +51,15 @@ struct arpt_error { extern void *arpt_alloc_initial_table(const struct xt_table *); int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, - const struct nf_hook_ops *ops, struct xt_table **res); -void arpt_unregister_table(struct net *net, struct xt_table *table); -void arpt_unregister_table_pre_exit(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops); +void arpt_unregister_table(struct net *net, const char *name); +void arpt_unregister_table_pre_exit(struct net *net, const char *name, const struct nf_hook_ops *ops); extern unsigned int arpt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include <net/compat.h> struct compat_arpt_entry { diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 3a956145a25c..a8178253ce53 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -100,6 +100,7 @@ struct ebt_table { unsigned int valid_hooks); /* the data used by the kernel */ struct ebt_table_info *private; + struct nf_hook_ops *ops; struct module *me; }; @@ -108,11 +109,9 @@ struct ebt_table { extern int ebt_register_table(struct net *net, const struct ebt_table *table, - const struct nf_hook_ops *ops, - struct ebt_table **res); -extern void ebt_unregister_table(struct net *net, struct ebt_table *table); -void ebt_unregister_table_pre_exit(struct net *net, const char *tablename, - const struct nf_hook_ops *ops); + const struct nf_hook_ops *ops); +extern void ebt_unregister_table(struct net *net, const char *tablename); +void ebt_unregister_table_pre_exit(struct net *net, const char *tablename); extern unsigned int ebt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct ebt_table *table); diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index c4676d6feeff..8d09bfe850dc 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -24,15 +24,10 @@ int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, - const struct nf_hook_ops *ops, struct xt_table **res); - -void ipt_unregister_table_pre_exit(struct net *net, struct xt_table *table, const struct nf_hook_ops *ops); -void ipt_unregister_table_exit(struct net *net, struct xt_table *table); - -void ipt_unregister_table(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops); +void ipt_unregister_table_pre_exit(struct net *net, const char *name); +void ipt_unregister_table_exit(struct net *net, const char *name); /* Standard entry. */ struct ipt_standard { @@ -72,7 +67,7 @@ extern unsigned int ipt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include <net/compat.h> struct compat_ipt_entry { diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index 1547d5f9ae06..79e73fd7d965 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -26,17 +26,14 @@ extern void *ip6t_alloc_initial_table(const struct xt_table *); int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, - const struct nf_hook_ops *ops, struct xt_table **res); -void ip6t_unregister_table(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops); -void ip6t_unregister_table_pre_exit(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops); -void ip6t_unregister_table_exit(struct net *net, struct xt_table *table); + const struct nf_hook_ops *ops); +void ip6t_unregister_table_pre_exit(struct net *net, const char *name); +void ip6t_unregister_table_exit(struct net *net, const char *name); extern unsigned int ip6t_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include <net/compat.h> struct compat_ip6t_entry { diff --git a/include/linux/platform_data/eth_ixp4xx.h b/include/linux/platform_data/eth_ixp4xx.h index 6f652ea0c6ae..114b0940729f 100644 --- a/include/linux/platform_data/eth_ixp4xx.h +++ b/include/linux/platform_data/eth_ixp4xx.h @@ -14,6 +14,8 @@ struct eth_plat_info { u8 rxq; /* configurable, currently 0 - 31 only */ u8 txreadyq; u8 hwaddr[6]; + u8 npe; /* NPE instance used by this interface */ + bool has_mdio; /* If this instance has an MDIO bus */ }; #endif diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e242bf3d2b4a..aba0f0f429be 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -99,7 +99,8 @@ struct sk_psock { void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); - int (*psock_update_sk_prot)(struct sock *sk, bool restore); + int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, + bool restore); struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; @@ -404,7 +405,7 @@ static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { if (psock->psock_update_sk_prot) - psock->psock_update_sk_prot(sk, true); + psock->psock_update_sk_prot(sk, psock, true); } static inline void sk_psock_set_state(struct sk_psock *psock, diff --git a/include/net/devlink.h b/include/net/devlink.h index 853420db5d32..7c984cadfec4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -98,11 +98,13 @@ struct devlink_port_pci_vf_attrs { * @controller: Associated controller number * @sf: Associated PCI SF for of the PCI PF for this port. * @pf: Associated PCI PF number for this port. + * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_sf_attrs { u32 controller; u32 sf; u16 pf; + u8 external:1; }; /** @@ -1508,7 +1510,8 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external); void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, - u32 controller, u16 pf, u32 sf); + u32 controller, u16 pf, u32 sf, + bool external); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/include/net/netfilter/ipv4/nf_defrag_ipv4.h b/include/net/netfilter/ipv4/nf_defrag_ipv4.h index bcbd724cc048..7fda9ce9f694 100644 --- a/include/net/netfilter/ipv4/nf_defrag_ipv4.h +++ b/include/net/netfilter/ipv4/nf_defrag_ipv4.h @@ -3,6 +3,7 @@ #define _NF_DEFRAG_IPV4_H struct net; -int nf_defrag_ipv4_enable(struct net *); +int nf_defrag_ipv4_enable(struct net *net); +void nf_defrag_ipv4_disable(struct net *net); #endif /* _NF_DEFRAG_IPV4_H */ diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h index ece923e2035b..0fd8a4159662 100644 --- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h +++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h @@ -5,7 +5,8 @@ #include <linux/skbuff.h> #include <linux/types.h> -int nf_defrag_ipv6_enable(struct net *); +int nf_defrag_ipv6_enable(struct net *net); +void nf_defrag_ipv6_disable(struct net *net); int nf_ct_frag6_init(void); void nf_ct_frag6_cleanup(void); diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index 0d412dd63707..987111ae5240 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -104,8 +104,6 @@ unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); -int nf_xfrm_me_harder(struct net *n, struct sk_buff *s, unsigned int family); - static inline int nf_nat_initialized(struct nf_conn *ct, enum nf_nat_manip_type manip) { diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4a75da2a2e1d..eb708b77c4a5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -13,6 +13,7 @@ #include <net/netfilter/nf_flow_table.h> #include <net/netlink.h> #include <net/flow_offload.h> +#include <net/netns/generic.h> #define NFT_MAX_HOOKS (NF_INET_INGRESS + 1) @@ -1580,4 +1581,11 @@ struct nftables_pernet { u8 validate_state; }; +extern unsigned int nf_tables_net_id; + +static inline struct nftables_pernet *nft_pernet(const struct net *net) +{ + return net_generic(net, nf_tables_net_id); +} + #endif /* _NET_NF_TABLES_H */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 87e1612497ea..f6af8d96d3c6 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -76,16 +76,6 @@ struct netns_ipv4 { struct inet_peer_base *peers; struct sock * __percpu *tcp_sk; struct fqdir *fqdir; -#ifdef CONFIG_NETFILTER - struct xt_table *iptable_filter; - struct xt_table *iptable_mangle; - struct xt_table *iptable_raw; - struct xt_table *arptable_filter; -#ifdef CONFIG_SECURITY - struct xt_table *iptable_security; -#endif - struct xt_table *nat_table; -#endif u8 sysctl_icmp_echo_ignore_all; u8 sysctl_icmp_echo_enable_probe; diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 808f0f79ea9c..6153c8067009 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -63,15 +63,6 @@ struct netns_ipv6 { struct ipv6_devconf *devconf_dflt; struct inet_peer_base *peers; struct fqdir *fqdir; -#ifdef CONFIG_NETFILTER - struct xt_table *ip6table_filter; - struct xt_table *ip6table_mangle; - struct xt_table *ip6table_raw; -#ifdef CONFIG_SECURITY - struct xt_table *ip6table_security; -#endif - struct xt_table *ip6table_nat; -#endif struct fib6_info *fib6_null_entry; struct rt6_info *ip6_null_entry; struct rt6_statistics *rt6_stats; diff --git a/include/net/netns/x_tables.h b/include/net/netns/x_tables.h index 83c8ea2e87a6..d02316ec2906 100644 --- a/include/net/netns/x_tables.h +++ b/include/net/netns/x_tables.h @@ -5,16 +5,8 @@ #include <linux/list.h> #include <linux/netfilter_defs.h> -struct ebt_table; - struct netns_xt { bool notrack_deprecated_warning; bool clusterip_deprecated_warning; -#if defined(CONFIG_BRIDGE_NF_EBTABLES) || \ - defined(CONFIG_BRIDGE_NF_EBTABLES_MODULE) - struct ebt_table *broute_table; - struct ebt_table *frame_filter; - struct ebt_table *frame_nat; -#endif }; #endif diff --git a/include/net/sock.h b/include/net/sock.h index cadcc12cc316..42bc5e1a627f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1118,6 +1118,7 @@ struct inet_hashinfo; struct raw_hashinfo; struct smc_hashinfo; struct module; +struct sk_psock; /* * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes @@ -1189,7 +1190,9 @@ struct proto { void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); #ifdef CONFIG_BPF_SYSCALL - int (*psock_update_sk_prot)(struct sock *sk, bool restore); + int (*psock_update_sk_prot)(struct sock *sk, + struct sk_psock *psock, + bool restore); #endif /* Keeping track of sockets in use */ diff --git a/include/net/tcp.h b/include/net/tcp.h index eaea43afcc97..d05193cb0d99 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2215,7 +2215,7 @@ struct sk_psock; #ifdef CONFIG_BPF_SYSCALL struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); -int tcp_bpf_update_proto(struct sock *sk, bool restore); +int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ diff --git a/include/net/udp.h b/include/net/udp.h index f55aaeef7e91..360df454356c 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -543,7 +543,7 @@ static inline void udp_post_segment_fix_csum(struct sk_buff *skb) #ifdef CONFIG_BPF_SYSCALL struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); -int udp_bpf_update_proto(struct sock *sk, bool restore); +int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); #endif #endif /* _UDP_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 49371eba98ba..ec6d85a81744 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -312,6 +312,27 @@ union bpf_iter_link_info { * *ctx_out*, *data_out* (for example, packet data), result of the * execution *retval*, and *duration* of the test run. * + * The sizes of the buffers provided as input and output + * parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must + * be provided in the corresponding variables *ctx_size_in*, + * *ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any + * of these parameters are not provided (ie set to NULL), the + * corresponding size field must be zero. + * + * Some program types have particular requirements: + * + * **BPF_PROG_TYPE_SK_LOOKUP** + * *data_in* and *data_out* must be NULL. + * + * **BPF_PROG_TYPE_XDP** + * *ctx_in* and *ctx_out* must be NULL. + * + * **BPF_PROG_TYPE_RAW_TRACEPOINT**, + * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** + * + * *ctx_out*, *data_in* and *data_out* must be NULL. + * *repeat* must be zero. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. @@ -4061,12 +4082,20 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. * Return * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. * Return * Valid pointer with *size* bytes of memory available; NULL, * otherwise. @@ -4078,6 +4107,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -4088,6 +4121,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -4671,6 +4708,33 @@ union bpf_attr { * Return * The number of traversed map elements for success, **-EINVAL** for * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4838,6 +4902,7 @@ union bpf_attr { FN(sock_from_file), \ FN(check_mtu), \ FN(for_each_map_elem), \ + FN(snprintf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5379,6 +5444,8 @@ struct bpf_link_info { } raw_tracepoint; struct { __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ } tracing; struct { __u64 cgroup_id; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 91c8dda6d95d..cd5b382a4138 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -614,6 +614,7 @@ enum macvlan_macaddr_mode { }; #define MACVLAN_FLAG_NOPROMISC 1 +#define MACVLAN_FLAG_NODST 2 /* skip dst macvlan if matching src macvlan */ /* VRF section */ enum { diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 79bab7a36b30..467365ed59a7 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1014,11 +1014,13 @@ enum nft_rt_attributes { * * @NFTA_SOCKET_KEY: socket key to match * @NFTA_SOCKET_DREG: destination register + * @NFTA_SOCKET_LEVEL: cgroups2 ancestor level (only for cgroupsv2) */ enum nft_socket_attributes { NFTA_SOCKET_UNSPEC, NFTA_SOCKET_KEY, NFTA_SOCKET_DREG, + NFTA_SOCKET_LEVEL, __NFTA_SOCKET_MAX }; #define NFTA_SOCKET_MAX (__NFTA_SOCKET_MAX - 1) @@ -1029,11 +1031,13 @@ enum nft_socket_attributes { * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option * @NFT_SOCKET_MARK: Value of the socket mark * @NFT_SOCKET_WILDCARD: Whether the socket is zero-bound (e.g. 0.0.0.0 or ::0) + * @NFT_SOCKET_CGROUPV2: Match on cgroups version 2 */ enum nft_socket_keys { NFT_SOCKET_TRANSPARENT, NFT_SOCKET_MARK, NFT_SOCKET_WILDCARD, + NFT_SOCKET_CGROUPV2, __NFT_SOCKET_MAX }; #define NFT_SOCKET_MAX (__NFT_SOCKET_MAX - 1) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f5423251c118..5e31ee9f7512 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1363,11 +1363,10 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * __bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers * @insn: is the array of eBPF instructions - * @stack: is the eBPF storage stack * * Decode and execute eBPF instructions. */ -static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z @@ -1701,7 +1700,7 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ - return ___bpf_prog_run(regs, insn, stack); \ + return ___bpf_prog_run(regs, insn); \ } #define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size @@ -1718,7 +1717,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ BPF_R3 = r3; \ BPF_R4 = r4; \ BPF_R5 = r5; \ - return ___bpf_prog_run(regs, insn, stack); \ + return ___bpf_prog_run(regs, insn); \ } #define EVAL1(FN, X) FN(X) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f306611c4ddf..85b26ca5aacd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -669,6 +669,310 @@ const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; +static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, + size_t bufsz) +{ + void __user *user_ptr = (__force void __user *)unsafe_ptr; + + buf[0] = 0; + + switch (fmt_ptype) { + case 's': +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)unsafe_ptr < TASK_SIZE) + return strncpy_from_user_nofault(buf, user_ptr, bufsz); + fallthrough; +#endif + case 'k': + return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); + case 'u': + return strncpy_from_user_nofault(buf, user_ptr, bufsz); + } + + return -EINVAL; +} + +/* Per-cpu temp buffers which can be used by printf-like helpers for %s or %p + */ +#define MAX_PRINTF_BUF_LEN 512 + +struct bpf_printf_buf { + char tmp_buf[MAX_PRINTF_BUF_LEN]; +}; +static DEFINE_PER_CPU(struct bpf_printf_buf, bpf_printf_buf); +static DEFINE_PER_CPU(int, bpf_printf_buf_used); + +static int try_get_fmt_tmp_buf(char **tmp_buf) +{ + struct bpf_printf_buf *bufs; + int used; + + if (*tmp_buf) + return 0; + + preempt_disable(); + used = this_cpu_inc_return(bpf_printf_buf_used); + if (WARN_ON_ONCE(used > 1)) { + this_cpu_dec(bpf_printf_buf_used); + preempt_enable(); + return -EBUSY; + } + bufs = this_cpu_ptr(&bpf_printf_buf); + *tmp_buf = bufs->tmp_buf; + + return 0; +} + +void bpf_printf_cleanup(void) +{ + if (this_cpu_read(bpf_printf_buf_used)) { + this_cpu_dec(bpf_printf_buf_used); + preempt_enable(); + } +} + +/* + * bpf_parse_fmt_str - Generic pass on format strings for printf-like helpers + * + * Returns a negative value if fmt is an invalid format string or 0 otherwise. + * + * This can be used in two ways: + * - Format string verification only: when final_args and mod are NULL + * - Arguments preparation: in addition to the above verification, it writes in + * final_args a copy of raw_args where pointers from BPF have been sanitized + * into pointers safe to use by snprintf. This also writes in the mod array + * the size requirement of each argument, usable by BPF_CAST_FMT_ARG for ex. + * + * In argument preparation mode, if 0 is returned, safe temporary buffers are + * allocated and bpf_printf_cleanup should be called to free them after use. + */ +int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u64 *final_args, enum bpf_printf_mod_type *mod, + u32 num_args) +{ + char *unsafe_ptr = NULL, *tmp_buf = NULL, *fmt_end; + size_t tmp_buf_len = MAX_PRINTF_BUF_LEN; + int err, i, num_spec = 0, copy_size; + enum bpf_printf_mod_type cur_mod; + u64 cur_arg; + char fmt_ptype; + + if (!!final_args != !!mod) + return -EINVAL; + + fmt_end = strnchr(fmt, fmt_size, 0); + if (!fmt_end) + return -EINVAL; + fmt_size = fmt_end - fmt; + + for (i = 0; i < fmt_size; i++) { + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + err = -EINVAL; + goto cleanup; + } + + if (fmt[i] != '%') + continue; + + if (fmt[i + 1] == '%') { + i++; + continue; + } + + if (num_spec >= num_args) { + err = -EINVAL; + goto cleanup; + } + + /* The string is zero-terminated so if fmt[i] != 0, we can + * always access fmt[i + 1], in the worst case it will be a 0 + */ + i++; + + /* skip optional "[0 +-][num]" width formatting field */ + while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || + fmt[i] == ' ') + i++; + if (fmt[i] >= '1' && fmt[i] <= '9') { + i++; + while (fmt[i] >= '0' && fmt[i] <= '9') + i++; + } + + if (fmt[i] == 'p') { + cur_mod = BPF_PRINTF_LONG; + + if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') && + fmt[i + 2] == 's') { + fmt_ptype = fmt[i + 1]; + i += 2; + goto fmt_str; + } + + if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || + ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' || + fmt[i + 1] == 'x' || fmt[i + 1] == 'B' || + fmt[i + 1] == 's' || fmt[i + 1] == 'S') { + /* just kernel pointers */ + if (final_args) + cur_arg = raw_args[num_spec]; + goto fmt_next; + } + + /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ + if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') || + (fmt[i + 2] != '4' && fmt[i + 2] != '6')) { + err = -EINVAL; + goto cleanup; + } + + i += 2; + if (!final_args) + goto fmt_next; + + if (try_get_fmt_tmp_buf(&tmp_buf)) { + err = -EBUSY; + goto out; + } + + copy_size = (fmt[i + 2] == '4') ? 4 : 16; + if (tmp_buf_len < copy_size) { + err = -ENOSPC; + goto cleanup; + } + + unsafe_ptr = (char *)(long)raw_args[num_spec]; + err = copy_from_kernel_nofault(tmp_buf, unsafe_ptr, + copy_size); + if (err < 0) + memset(tmp_buf, 0, copy_size); + cur_arg = (u64)(long)tmp_buf; + tmp_buf += copy_size; + tmp_buf_len -= copy_size; + + goto fmt_next; + } else if (fmt[i] == 's') { + cur_mod = BPF_PRINTF_LONG; + fmt_ptype = fmt[i]; +fmt_str: + if (fmt[i + 1] != 0 && + !isspace(fmt[i + 1]) && + !ispunct(fmt[i + 1])) { + err = -EINVAL; + goto cleanup; + } + + if (!final_args) + goto fmt_next; + + if (try_get_fmt_tmp_buf(&tmp_buf)) { + err = -EBUSY; + goto out; + } + + if (!tmp_buf_len) { + err = -ENOSPC; + goto cleanup; + } + + unsafe_ptr = (char *)(long)raw_args[num_spec]; + err = bpf_trace_copy_string(tmp_buf, unsafe_ptr, + fmt_ptype, tmp_buf_len); + if (err < 0) { + tmp_buf[0] = '\0'; + err = 1; + } + + cur_arg = (u64)(long)tmp_buf; + tmp_buf += err; + tmp_buf_len -= err; + + goto fmt_next; + } + + cur_mod = BPF_PRINTF_INT; + + if (fmt[i] == 'l') { + cur_mod = BPF_PRINTF_LONG; + i++; + } + if (fmt[i] == 'l') { + cur_mod = BPF_PRINTF_LONG_LONG; + i++; + } + + if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' && + fmt[i] != 'x' && fmt[i] != 'X') { + err = -EINVAL; + goto cleanup; + } + + if (final_args) + cur_arg = raw_args[num_spec]; +fmt_next: + if (final_args) { + mod[num_spec] = cur_mod; + final_args[num_spec] = cur_arg; + } + num_spec++; + } + + err = 0; +cleanup: + if (err) + bpf_printf_cleanup(); +out: + return err; +} + +#define MAX_SNPRINTF_VARARGS 12 + +BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, + const void *, data, u32, data_len) +{ + enum bpf_printf_mod_type mod[MAX_SNPRINTF_VARARGS]; + u64 args[MAX_SNPRINTF_VARARGS]; + int err, num_args; + + if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 || + (data_len && !data)) + return -EINVAL; + num_args = data_len / 8; + + /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we + * can safely give an unbounded size. + */ + err = bpf_printf_prepare(fmt, UINT_MAX, data, args, mod, num_args); + if (err < 0) + return err; + + /* Maximumly we can have MAX_SNPRINTF_VARARGS parameters, just give + * all of them to snprintf(). + */ + err = snprintf(str, str_size, fmt, BPF_CAST_FMT_ARG(0, args, mod), + BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod), + BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod), + BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod), + BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod), + BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod), + BPF_CAST_FMT_ARG(11, args, mod)); + + bpf_printf_cleanup(); + + return err + 1; +} + +const struct bpf_func_proto bpf_snprintf_proto = { + .func = bpf_snprintf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_CONST_STR, + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -757,6 +1061,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_probe_read_kernel_str_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; + case BPF_FUNC_snprintf: + return &bpf_snprintf_proto; default: return NULL; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index d2de2abec35b..b4ebd60a6c16 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -816,8 +816,6 @@ static int __init bpf_init(void) { int ret; - mutex_init(&bpf_preload_lock); - ret = sysfs_create_mount_point(fs_kobj, "bpf"); if (ret) return ret; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6428634da57e..fd495190115e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2551,6 +2551,9 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, container_of(link, struct bpf_tracing_link, link); info->tracing.attach_type = tr_link->attach_type; + bpf_trampoline_unpack_key(tr_link->trampoline->key, + &info->tracing.target_obj_id, + &info->tracing.target_btf_id); return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5682a02901d3..637462e9b6ee 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4787,6 +4787,7 @@ static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALU static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; +static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4817,6 +4818,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, [ARG_PTR_TO_FUNC] = &func_ptr_types, [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, + [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -5067,6 +5069,44 @@ skip_type_check: if (err) return err; err = check_ptr_alignment(env, reg, 0, size, true); + } else if (arg_type == ARG_PTR_TO_CONST_STR) { + struct bpf_map *map = reg->map_ptr; + int map_off; + u64 map_addr; + char *str_ptr; + + if (!bpf_map_is_rdonly(map)) { + verbose(env, "R%d does not point to a readonly map'\n", regno); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d is not a constant address'\n", regno); + return -EACCES; + } + + if (!map->ops->map_direct_value_addr) { + verbose(env, "no direct value access support for this map type\n"); + return -EACCES; + } + + err = check_map_access(env, regno, reg->off, + map->value_size - reg->off, false); + if (err) + return err; + + map_off = reg->off + reg->var_off.value; + err = map->ops->map_direct_value_addr(map, &map_addr, map_off); + if (err) { + verbose(env, "direct value access on string failed\n"); + return err; + } + + str_ptr = (char *)(long)(map_addr); + if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) { + verbose(env, "string is not zero-terminated\n"); + return -EINVAL; + } } return err; @@ -5767,6 +5807,7 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, if (ret_type != RET_INTEGER || (func_id != BPF_FUNC_get_stack && + func_id != BPF_FUNC_get_task_stack && func_id != BPF_FUNC_probe_read_str && func_id != BPF_FUNC_probe_read_kernel_str && func_id != BPF_FUNC_probe_read_user_str)) @@ -5877,6 +5918,43 @@ static int check_reference_leak(struct bpf_verifier_env *env) return state->acquired_refs ? -EINVAL : 0; } +static int check_bpf_snprintf_call(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) +{ + struct bpf_reg_state *fmt_reg = ®s[BPF_REG_3]; + struct bpf_reg_state *data_len_reg = ®s[BPF_REG_5]; + struct bpf_map *fmt_map = fmt_reg->map_ptr; + int err, fmt_map_off, num_args; + u64 fmt_addr; + char *fmt; + + /* data must be an array of u64 */ + if (data_len_reg->var_off.value % 8) + return -EINVAL; + num_args = data_len_reg->var_off.value / 8; + + /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const + * and map_direct_value_addr is set. + */ + fmt_map_off = fmt_reg->off + fmt_reg->var_off.value; + err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, + fmt_map_off); + if (err) { + verbose(env, "verifier bug\n"); + return -EFAULT; + } + fmt = (char *)(long)fmt_addr + fmt_map_off; + + /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we + * can focus on validating the format specifiers. + */ + err = bpf_printf_prepare(fmt, UINT_MAX, NULL, NULL, NULL, num_args); + if (err < 0) + verbose(env, "Invalid format string\n"); + + return err; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -5991,6 +6069,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } + if (func_id == BPF_FUNC_snprintf) { + err = check_bpf_snprintf_call(env, regs); + if (err < 0) + return err; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0d23755c2747..2a8bcdc927c7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -372,188 +372,38 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) return &bpf_probe_write_user_proto; } -static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, - size_t bufsz) -{ - void __user *user_ptr = (__force void __user *)unsafe_ptr; - - buf[0] = 0; - - switch (fmt_ptype) { - case 's': -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if ((unsigned long)unsafe_ptr < TASK_SIZE) { - strncpy_from_user_nofault(buf, user_ptr, bufsz); - break; - } - fallthrough; -#endif - case 'k': - strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); - break; - case 'u': - strncpy_from_user_nofault(buf, user_ptr, bufsz); - break; - } -} - static DEFINE_RAW_SPINLOCK(trace_printk_lock); -#define BPF_TRACE_PRINTK_SIZE 1024 +#define MAX_TRACE_PRINTK_VARARGS 3 +#define BPF_TRACE_PRINTK_SIZE 1024 -static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) +BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, + u64, arg2, u64, arg3) { + u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; + enum bpf_printf_mod_type mod[MAX_TRACE_PRINTK_VARARGS]; static char buf[BPF_TRACE_PRINTK_SIZE]; unsigned long flags; - va_list ap; int ret; - raw_spin_lock_irqsave(&trace_printk_lock, flags); - va_start(ap, fmt); - ret = vsnprintf(buf, sizeof(buf), fmt, ap); - va_end(ap); - /* vsnprintf() will not append null for zero-length strings */ + ret = bpf_printf_prepare(fmt, fmt_size, args, args, mod, + MAX_TRACE_PRINTK_VARARGS); + if (ret < 0) + return ret; + + ret = snprintf(buf, sizeof(buf), fmt, BPF_CAST_FMT_ARG(0, args, mod), + BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod)); + /* snprintf() will not append null for zero-length strings */ if (ret == 0) buf[0] = '\0'; + + raw_spin_lock_irqsave(&trace_printk_lock, flags); trace_bpf_trace_printk(buf); raw_spin_unlock_irqrestore(&trace_printk_lock, flags); - return ret; -} - -/* - * Only limited trace_printk() conversion specifiers allowed: - * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s - */ -BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, - u64, arg2, u64, arg3) -{ - int i, mod[3] = {}, fmt_cnt = 0; - char buf[64], fmt_ptype; - void *unsafe_ptr = NULL; - bool str_seen = false; + bpf_printf_cleanup(); - /* - * bpf_check()->check_func_arg()->check_stack_boundary() - * guarantees that fmt points to bpf program stack, - * fmt_size bytes of it were initialized and fmt_size > 0 - */ - if (fmt[--fmt_size] != 0) - return -EINVAL; - - /* check format string for allowed specifiers */ - for (i = 0; i < fmt_size; i++) { - if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) - return -EINVAL; - - if (fmt[i] != '%') - continue; - - if (fmt_cnt >= 3) - return -EINVAL; - - /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ - i++; - if (fmt[i] == 'l') { - mod[fmt_cnt]++; - i++; - } else if (fmt[i] == 'p') { - mod[fmt_cnt]++; - if ((fmt[i + 1] == 'k' || - fmt[i + 1] == 'u') && - fmt[i + 2] == 's') { - fmt_ptype = fmt[i + 1]; - i += 2; - goto fmt_str; - } - - if (fmt[i + 1] == 'B') { - i++; - goto fmt_next; - } - - /* disallow any further format extensions */ - if (fmt[i + 1] != 0 && - !isspace(fmt[i + 1]) && - !ispunct(fmt[i + 1])) - return -EINVAL; - - goto fmt_next; - } else if (fmt[i] == 's') { - mod[fmt_cnt]++; - fmt_ptype = fmt[i]; -fmt_str: - if (str_seen) - /* allow only one '%s' per fmt string */ - return -EINVAL; - str_seen = true; - - if (fmt[i + 1] != 0 && - !isspace(fmt[i + 1]) && - !ispunct(fmt[i + 1])) - return -EINVAL; - - switch (fmt_cnt) { - case 0: - unsafe_ptr = (void *)(long)arg1; - arg1 = (long)buf; - break; - case 1: - unsafe_ptr = (void *)(long)arg2; - arg2 = (long)buf; - break; - case 2: - unsafe_ptr = (void *)(long)arg3; - arg3 = (long)buf; - break; - } - - bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype, - sizeof(buf)); - goto fmt_next; - } - - if (fmt[i] == 'l') { - mod[fmt_cnt]++; - i++; - } - - if (fmt[i] != 'i' && fmt[i] != 'd' && - fmt[i] != 'u' && fmt[i] != 'x') - return -EINVAL; -fmt_next: - fmt_cnt++; - } - -/* Horrid workaround for getting va_list handling working with different - * argument type combinations generically for 32 and 64 bit archs. - */ -#define __BPF_TP_EMIT() __BPF_ARG3_TP() -#define __BPF_TP(...) \ - bpf_do_trace_printk(fmt, ##__VA_ARGS__) - -#define __BPF_ARG1_TP(...) \ - ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \ - ? __BPF_TP(arg1, ##__VA_ARGS__) \ - : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \ - ? __BPF_TP((long)arg1, ##__VA_ARGS__) \ - : __BPF_TP((u32)arg1, ##__VA_ARGS__))) - -#define __BPF_ARG2_TP(...) \ - ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \ - ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \ - : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \ - ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \ - : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__))) - -#define __BPF_ARG3_TP(...) \ - ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \ - ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \ - : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \ - ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \ - : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__))) - - return __BPF_TP_EMIT(); + return ret; } static const struct bpf_func_proto bpf_trace_printk_proto = { @@ -581,184 +431,37 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) } #define MAX_SEQ_PRINTF_VARARGS 12 -#define MAX_SEQ_PRINTF_MAX_MEMCPY 6 -#define MAX_SEQ_PRINTF_STR_LEN 128 - -struct bpf_seq_printf_buf { - char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN]; -}; -static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf); -static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used); BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, const void *, data, u32, data_len) { - int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0; - int i, buf_used, copy_size, num_args; - u64 params[MAX_SEQ_PRINTF_VARARGS]; - struct bpf_seq_printf_buf *bufs; - const u64 *args = data; - - buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used); - if (WARN_ON_ONCE(buf_used > 1)) { - err = -EBUSY; - goto out; - } - - bufs = this_cpu_ptr(&bpf_seq_printf_buf); - - /* - * bpf_check()->check_func_arg()->check_stack_boundary() - * guarantees that fmt points to bpf program stack, - * fmt_size bytes of it were initialized and fmt_size > 0 - */ - if (fmt[--fmt_size] != 0) - goto out; - - if (data_len & 7) - goto out; - - for (i = 0; i < fmt_size; i++) { - if (fmt[i] == '%') { - if (fmt[i + 1] == '%') - i++; - else if (!data || !data_len) - goto out; - } - } + enum bpf_printf_mod_type mod[MAX_SEQ_PRINTF_VARARGS]; + u64 args[MAX_SEQ_PRINTF_VARARGS]; + int err, num_args; + if (data_len & 7 || data_len > MAX_SEQ_PRINTF_VARARGS * 8 || + (data_len && !data)) + return -EINVAL; num_args = data_len / 8; - /* check format string for allowed specifiers */ - for (i = 0; i < fmt_size; i++) { - /* only printable ascii for now. */ - if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { - err = -EINVAL; - goto out; - } - - if (fmt[i] != '%') - continue; - - if (fmt[i + 1] == '%') { - i++; - continue; - } - - if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) { - err = -E2BIG; - goto out; - } - - if (fmt_cnt >= num_args) { - err = -EINVAL; - goto out; - } - - /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ - i++; - - /* skip optional "[0 +-][num]" width formating field */ - while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || - fmt[i] == ' ') - i++; - if (fmt[i] >= '1' && fmt[i] <= '9') { - i++; - while (fmt[i] >= '0' && fmt[i] <= '9') - i++; - } - - if (fmt[i] == 's') { - void *unsafe_ptr; - - /* try our best to copy */ - if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { - err = -E2BIG; - goto out; - } - - unsafe_ptr = (void *)(long)args[fmt_cnt]; - err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt], - unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN); - if (err < 0) - bufs->buf[memcpy_cnt][0] = '\0'; - params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; - - fmt_cnt++; - memcpy_cnt++; - continue; - } - - if (fmt[i] == 'p') { - if (fmt[i + 1] == 0 || - fmt[i + 1] == 'K' || - fmt[i + 1] == 'x' || - fmt[i + 1] == 'B') { - /* just kernel pointers */ - params[fmt_cnt] = args[fmt_cnt]; - fmt_cnt++; - continue; - } - - /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ - if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') { - err = -EINVAL; - goto out; - } - if (fmt[i + 2] != '4' && fmt[i + 2] != '6') { - err = -EINVAL; - goto out; - } - - if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { - err = -E2BIG; - goto out; - } - - - copy_size = (fmt[i + 2] == '4') ? 4 : 16; - - err = copy_from_kernel_nofault(bufs->buf[memcpy_cnt], - (void *) (long) args[fmt_cnt], - copy_size); - if (err < 0) - memset(bufs->buf[memcpy_cnt], 0, copy_size); - params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; - - i += 2; - fmt_cnt++; - memcpy_cnt++; - continue; - } - - if (fmt[i] == 'l') { - i++; - if (fmt[i] == 'l') - i++; - } - - if (fmt[i] != 'i' && fmt[i] != 'd' && - fmt[i] != 'u' && fmt[i] != 'x' && - fmt[i] != 'X') { - err = -EINVAL; - goto out; - } - - params[fmt_cnt] = args[fmt_cnt]; - fmt_cnt++; - } + err = bpf_printf_prepare(fmt, fmt_size, data, args, mod, num_args); + if (err < 0) + return err; /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give * all of them to seq_printf(). */ - seq_printf(m, fmt, params[0], params[1], params[2], params[3], - params[4], params[5], params[6], params[7], params[8], - params[9], params[10], params[11]); + seq_printf(m, fmt, BPF_CAST_FMT_ARG(0, args, mod), + BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod), + BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod), + BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod), + BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod), + BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod), + BPF_CAST_FMT_ARG(11, args, mod)); - err = seq_has_overflowed(m) ? -EOVERFLOW : 0; -out: - this_cpu_dec(bpf_seq_printf_buf_used); - return err; + bpf_printf_cleanup(); + + return seq_has_overflowed(m) ? -EOVERFLOW : 0; } BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file) @@ -1373,6 +1076,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_delete_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; + case BPF_FUNC_snprintf: + return &bpf_snprintf_proto; default: return NULL; } diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 8b644113715e..fb3d3262dc1a 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -71,6 +71,9 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg, if (array == NULL) return -ENOBUFS; + /* paired with smp_rmb() in __vlan_group_get_device() */ + smp_wmb(); + vg->vlan_devices_arrays[pidx][vidx] = array; return 0; } diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 953405362795..fa3ad3d4d58c 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -57,6 +57,10 @@ static inline struct net_device *__vlan_group_get_device(struct vlan_group *vg, array = vg->vlan_devices_arrays[pidx] [vlan_id / VLAN_GROUP_ARRAY_PART_LEN]; + + /* paired with smp_wmb() in vlan_group_prealloc_vid() */ + smp_rmb(); + return array ? array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] : NULL; } diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c index fa199556e122..e16183bd1bb8 100644 --- a/net/bridge/netfilter/ebt_limit.c +++ b/net/bridge/netfilter/ebt_limit.c @@ -87,7 +87,7 @@ static int ebt_limit_mt_check(const struct xt_mtchk_param *par) } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* * no conversion function needed -- * only avg/burst have meaningful values in userspace. @@ -107,7 +107,7 @@ static struct xt_match ebt_limit_mt_reg __read_mostly = { .checkentry = ebt_limit_mt_check, .matchsize = sizeof(struct ebt_limit_info), .usersize = offsetof(struct ebt_limit_info, prev), -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct ebt_compat_limit_info), #endif .me = THIS_MODULE, diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c index 21fd3d3d77f6..8cf653c72fd8 100644 --- a/net/bridge/netfilter/ebt_mark.c +++ b/net/bridge/netfilter/ebt_mark.c @@ -53,7 +53,7 @@ static int ebt_mark_tg_check(const struct xt_tgchk_param *par) return -EINVAL; return 0; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ebt_mark_t_info { compat_ulong_t mark; compat_uint_t target; @@ -87,7 +87,7 @@ static struct xt_target ebt_mark_tg_reg __read_mostly = { .target = ebt_mark_tg, .checkentry = ebt_mark_tg_check, .targetsize = sizeof(struct ebt_mark_t_info), -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct compat_ebt_mark_t_info), .compat_from_user = mark_tg_compat_from_user, .compat_to_user = mark_tg_compat_to_user, diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c index 81fb59dec499..5872e73c741e 100644 --- a/net/bridge/netfilter/ebt_mark_m.c +++ b/net/bridge/netfilter/ebt_mark_m.c @@ -37,7 +37,7 @@ static int ebt_mark_mt_check(const struct xt_mtchk_param *par) } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ebt_mark_m_info { compat_ulong_t mark, mask; uint8_t invert, bitmask; @@ -75,7 +75,7 @@ static struct xt_match ebt_mark_mt_reg __read_mostly = { .match = ebt_mark_mt, .checkentry = ebt_mark_mt_check, .matchsize = sizeof(struct ebt_mark_m_info), -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct compat_ebt_mark_m_info), .compat_from_user = mark_mt_compat_from_user, .compat_to_user = mark_mt_compat_to_user, diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 32bc2821027f..020b1487ee0c 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -66,8 +66,7 @@ static unsigned int ebt_broute(void *priv, struct sk_buff *skb, NFPROTO_BRIDGE, s->in, NULL, NULL, s->net, NULL); - ret = ebt_do_table(skb, &state, state.net->xt.broute_table); - + ret = ebt_do_table(skb, &state, priv); if (ret != NF_DROP) return ret; @@ -101,18 +100,17 @@ static const struct nf_hook_ops ebt_ops_broute = { static int __net_init broute_net_init(struct net *net) { - return ebt_register_table(net, &broute_table, &ebt_ops_broute, - &net->xt.broute_table); + return ebt_register_table(net, &broute_table, &ebt_ops_broute); } static void __net_exit broute_net_pre_exit(struct net *net) { - ebt_unregister_table_pre_exit(net, "broute", &ebt_ops_broute); + ebt_unregister_table_pre_exit(net, "broute"); } static void __net_exit broute_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.broute_table); + ebt_unregister_table(net, "broute"); } static struct pernet_operations broute_net_ops = { diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index bcf982e12f16..8ec0b3736803 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -59,34 +59,27 @@ static const struct ebt_table frame_filter = { }; static unsigned int -ebt_in_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) +ebt_filter_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) { - return ebt_do_table(skb, state, state->net->xt.frame_filter); -} - -static unsigned int -ebt_out_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ebt_do_table(skb, state, state->net->xt.frame_filter); + return ebt_do_table(skb, state, priv); } static const struct nf_hook_ops ebt_ops_filter[] = { { - .hook = ebt_in_hook, + .hook = ebt_filter_hook, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_IN, .priority = NF_BR_PRI_FILTER_BRIDGED, }, { - .hook = ebt_in_hook, + .hook = ebt_filter_hook, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_FORWARD, .priority = NF_BR_PRI_FILTER_BRIDGED, }, { - .hook = ebt_out_hook, + .hook = ebt_filter_hook, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_OUT, .priority = NF_BR_PRI_FILTER_OTHER, @@ -95,18 +88,17 @@ static const struct nf_hook_ops ebt_ops_filter[] = { static int __net_init frame_filter_net_init(struct net *net) { - return ebt_register_table(net, &frame_filter, ebt_ops_filter, - &net->xt.frame_filter); + return ebt_register_table(net, &frame_filter, ebt_ops_filter); } static void __net_exit frame_filter_net_pre_exit(struct net *net) { - ebt_unregister_table_pre_exit(net, "filter", ebt_ops_filter); + ebt_unregister_table_pre_exit(net, "filter"); } static void __net_exit frame_filter_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.frame_filter); + ebt_unregister_table(net, "filter"); } static struct pernet_operations frame_filter_net_ops = { diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 0d092773f816..7c8a1064a531 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -58,35 +58,27 @@ static const struct ebt_table frame_nat = { .me = THIS_MODULE, }; -static unsigned int -ebt_nat_in(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) +static unsigned int ebt_nat_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) { - return ebt_do_table(skb, state, state->net->xt.frame_nat); -} - -static unsigned int -ebt_nat_out(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ebt_do_table(skb, state, state->net->xt.frame_nat); + return ebt_do_table(skb, state, priv); } static const struct nf_hook_ops ebt_ops_nat[] = { { - .hook = ebt_nat_out, + .hook = ebt_nat_hook, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_OUT, .priority = NF_BR_PRI_NAT_DST_OTHER, }, { - .hook = ebt_nat_out, + .hook = ebt_nat_hook, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_POST_ROUTING, .priority = NF_BR_PRI_NAT_SRC, }, { - .hook = ebt_nat_in, + .hook = ebt_nat_hook, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_PRE_ROUTING, .priority = NF_BR_PRI_NAT_DST_BRIDGED, @@ -95,18 +87,17 @@ static const struct nf_hook_ops ebt_ops_nat[] = { static int __net_init frame_nat_net_init(struct net *net) { - return ebt_register_table(net, &frame_nat, ebt_ops_nat, - &net->xt.frame_nat); + return ebt_register_table(net, &frame_nat, ebt_ops_nat); } static void __net_exit frame_nat_net_pre_exit(struct net *net) { - ebt_unregister_table_pre_exit(net, "nat", ebt_ops_nat); + ebt_unregister_table_pre_exit(net, "nat"); } static void __net_exit frame_nat_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.frame_nat); + ebt_unregister_table(net, "nat"); } static struct pernet_operations frame_nat_net_ops = { diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 96d789c8d1c7..f022deb3721e 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -47,7 +47,7 @@ struct ebt_pernet { static unsigned int ebt_pernet_id __read_mostly; static DEFINE_MUTEX(ebt_mutex); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void ebt_standard_compat_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -73,7 +73,7 @@ static struct xt_target ebt_standard_target = { .revision = 0, .family = NFPROTO_BRIDGE, .targetsize = sizeof(int), -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = ebt_standard_compat_from_user, .compat_to_user = ebt_standard_compat_to_user, @@ -1136,15 +1136,18 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table) vfree(table->private->entries); ebt_free_table_info(table->private); vfree(table->private); + kfree(table->ops); kfree(table); } int ebt_register_table(struct net *net, const struct ebt_table *input_table, - const struct nf_hook_ops *ops, struct ebt_table **res) + const struct nf_hook_ops *template_ops) { struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); struct ebt_table_info *newinfo; struct ebt_table *t, *table; + struct nf_hook_ops *ops; + unsigned int num_ops; struct ebt_replace_kernel *repl; int ret, i, countersize; void *p; @@ -1213,15 +1216,31 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, ret = -ENOENT; goto free_unlock; } + + num_ops = hweight32(table->valid_hooks); + if (num_ops == 0) { + ret = -EINVAL; + goto free_unlock; + } + + ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + if (!ops) { + ret = -ENOMEM; + if (newinfo->nentries) + module_put(table->me); + goto free_unlock; + } + + for (i = 0; i < num_ops; i++) + ops[i].priv = table; + list_add(&table->list, &ebt_net->tables); mutex_unlock(&ebt_mutex); - WRITE_ONCE(*res, table); - ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); - if (ret) { + table->ops = ops; + ret = nf_register_net_hooks(net, ops, num_ops); + if (ret) __ebt_unregister_table(net, table); - *res = NULL; - } audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries, AUDIT_XT_OP_REGISTER, GFP_KERNEL); @@ -1257,18 +1276,21 @@ static struct ebt_table *__ebt_find_table(struct net *net, const char *name) return NULL; } -void ebt_unregister_table_pre_exit(struct net *net, const char *name, const struct nf_hook_ops *ops) +void ebt_unregister_table_pre_exit(struct net *net, const char *name) { struct ebt_table *table = __ebt_find_table(net, name); if (table) - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } EXPORT_SYMBOL(ebt_unregister_table_pre_exit); -void ebt_unregister_table(struct net *net, struct ebt_table *table) +void ebt_unregister_table(struct net *net, const char *name) { - __ebt_unregister_table(net, table); + struct ebt_table *table = __ebt_find_table(net, name); + + if (table) + __ebt_unregister_table(net, table); } /* userspace just supplied us with counters */ @@ -1480,7 +1502,7 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user, ebt_entry_to_user, entries, tmp.entries); } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* 32 bit-userspace compatibility definitions. */ struct compat_ebt_replace { char name[EBT_TABLE_MAXNAMELEN]; @@ -2345,7 +2367,7 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* try real handler in case userland supplied needed padding */ if (in_compat_syscall() && ((cmd != EBT_SO_GET_INFO && cmd != EBT_SO_GET_INIT_INFO) || @@ -2412,7 +2434,7 @@ static int do_ebt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, switch (cmd) { case EBT_SO_SET_ENTRIES: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(net, arg, len); else @@ -2420,7 +2442,7 @@ static int do_ebt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, ret = do_replace(net, arg, len); break; case EBT_SO_SET_COUNTERS: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_update_counters(net, arg, len); else diff --git a/net/core/dev.c b/net/core/dev.c index d9bf63dbe4fd..222b1d322c96 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4723,10 +4723,10 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; u32 metalen, act = XDP_DROP; + bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; - bool orig_bcast; int off; /* Reinjected packets coming from act_mirred or similar should @@ -4773,6 +4773,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; + orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; @@ -4800,8 +4801,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || + (orig_host != ether_addr_equal_64bits(eth->h_dest, + skb->dev->dev_addr)) || (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); } @@ -5962,7 +5966,7 @@ static void gro_list_prepare(const struct list_head *head, } } -static void skb_gro_reset_offset(struct sk_buff *skb) +static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) { const struct skb_shared_info *pinfo = skb_shinfo(skb); const skb_frag_t *frag0 = &pinfo->frags[0]; @@ -5973,7 +5977,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb) if (!skb_headlen(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) && - (!NET_IP_ALIGN || !(skb_frag_off(frag0) & 3))) { + (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, skb_frag_size(frag0), @@ -6191,7 +6195,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); - skb_gro_reset_offset(skb); + skb_gro_reset_offset(skb, 0); ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_receive_exit(ret); @@ -6280,7 +6284,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) napi->skb = NULL; skb_reset_mac_header(skb); - skb_gro_reset_offset(skb); + skb_gro_reset_offset(skb, hlen); if (unlikely(skb_gro_header_hard(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); diff --git a/net/core/devlink.c b/net/core/devlink.c index 737b61c2976e..4eb969518ee0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8599,9 +8599,10 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @sf: associated SF of a PF for the devlink port instance + * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, - u16 pf, u32 sf) + u16 pf, u32 sf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; @@ -8615,6 +8616,7 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro attrs->pci_sf.controller = controller; attrs->pci_sf.pf = pf; attrs->pci_sf.sf = sf; + attrs->pci_sf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); @@ -8667,6 +8669,13 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, attrs->pci_vf.pf, attrs->pci_vf.vf); break; case DEVLINK_PORT_FLAVOUR_PCI_SF: + if (attrs->pci_sf.external) { + n = snprintf(name, len, "c%u", attrs->pci_sf.controller); + if (n >= len) + return -EINVAL; + len -= n; + name += n; + } n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, attrs->pci_sf.sf); break; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8379719d1dce..98f20efbfadf 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -131,6 +131,9 @@ static void neigh_update_gc_list(struct neighbour *n) write_lock_bh(&n->tbl->lock); write_lock(&n->lock); + if (n->dead) + goto out; + /* remove from the gc list if new state is permanent or if neighbor * is externally learned; otherwise entry should be on the gc list */ @@ -147,6 +150,7 @@ static void neigh_update_gc_list(struct neighbour *n) atomic_inc(&n->tbl->gc_entries); } +out: write_unlock(&n->lock); write_unlock_bh(&n->tbl->lock); } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 3d190d22b0d8..6f1b82b8ad49 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -188,7 +188,7 @@ static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; - return sk->sk_prot->psock_update_sk_prot(sk, false); + return sk->sk_prot->psock_update_sk_prot(sk, psock, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) @@ -1521,7 +1521,7 @@ void sock_map_close(struct sock *sk, long timeout) lock_sock(sk); rcu_read_lock(); - psock = sk_psock(sk); + psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); release_sock(sk); @@ -1532,6 +1532,7 @@ void sock_map_close(struct sock *sk, long timeout) sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock, true); + sk_psock_put(sk, psock); release_sock(sk); saved_close(sk, timeout); } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index d6d45d820d79..cf20316094d0 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -713,7 +713,7 @@ static int copy_entries_to_user(unsigned int total_size, return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -800,7 +800,7 @@ static int get_info(struct net *net, void __user *user, const int *len) return -EFAULT; name[XT_TABLE_MAXNAMELEN-1] = '\0'; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_lock(NFPROTO_ARP); #endif @@ -808,7 +808,7 @@ static int get_info(struct net *net, void __user *user, const int *len) if (!IS_ERR(t)) { struct arpt_getinfo info; const struct xt_table_info *private = t->private; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; if (in_compat_syscall()) { @@ -835,7 +835,7 @@ static int get_info(struct net *net, void __user *user, const int *len) module_put(t->me); } else ret = PTR_ERR(t); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_unlock(NFPROTO_ARP); #endif @@ -1044,7 +1044,7 @@ static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_arpt_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; @@ -1412,7 +1412,7 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, switch (cmd) { case ARPT_SO_SET_REPLACE: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(sock_net(sk), arg, len); else @@ -1444,7 +1444,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len break; case ARPT_SO_GET_ENTRIES: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_get_entries(sock_net(sk), user, len); else @@ -1499,10 +1499,11 @@ static void __arpt_unregister_table(struct net *net, struct xt_table *table) int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, - const struct nf_hook_ops *ops, - struct xt_table **res) + const struct nf_hook_ops *template_ops) { - int ret; + struct nf_hook_ops *ops; + unsigned int num_ops; + int ret, i; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; @@ -1516,41 +1517,61 @@ int arpt_register_table(struct net *net, memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); - if (ret != 0) - goto out_free; + if (ret != 0) { + xt_free_table_info(newinfo); + return ret; + } new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { - ret = PTR_ERR(new_table); - goto out_free; + xt_free_table_info(newinfo); + return PTR_ERR(new_table); } - /* set res now, will see skbs right after nf_register_net_hooks */ - WRITE_ONCE(*res, new_table); + num_ops = hweight32(table->valid_hooks); + if (num_ops == 0) { + ret = -EINVAL; + goto out_free; + } - ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); - if (ret != 0) { - __arpt_unregister_table(net, new_table); - *res = NULL; + ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + if (!ops) { + ret = -ENOMEM; + goto out_free; } + for (i = 0; i < num_ops; i++) + ops[i].priv = new_table; + + new_table->ops = ops; + + ret = nf_register_net_hooks(net, ops, num_ops); + if (ret != 0) + goto out_free; + return ret; out_free: - xt_free_table_info(newinfo); + __arpt_unregister_table(net, new_table); return ret; } -void arpt_unregister_table_pre_exit(struct net *net, struct xt_table *table, +void arpt_unregister_table_pre_exit(struct net *net, const char *name, const struct nf_hook_ops *ops) { - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); + + if (table) + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); } EXPORT_SYMBOL(arpt_unregister_table_pre_exit); -void arpt_unregister_table(struct net *net, struct xt_table *table) +void arpt_unregister_table(struct net *net, const char *name) { - __arpt_unregister_table(net, table); + struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); + + if (table) + __arpt_unregister_table(net, table); } /* The built-in targets: standard (NULL) and error. */ @@ -1559,7 +1580,7 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_ARP, -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 6c300ba5634e..b8f45e9bbec8 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -34,7 +34,7 @@ static unsigned int arptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return arpt_do_table(skb, state, state->net->ipv4.arptable_filter); + return arpt_do_table(skb, state, priv); } static struct nf_hook_ops *arpfilter_ops __read_mostly; @@ -44,31 +44,22 @@ static int __net_init arptable_filter_table_init(struct net *net) struct arpt_replace *repl; int err; - if (net->ipv4.arptable_filter) - return 0; - repl = arpt_alloc_initial_table(&packet_filter); if (repl == NULL) return -ENOMEM; - err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops, - &net->ipv4.arptable_filter); + err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops); kfree(repl); return err; } static void __net_exit arptable_filter_net_pre_exit(struct net *net) { - if (net->ipv4.arptable_filter) - arpt_unregister_table_pre_exit(net, net->ipv4.arptable_filter, - arpfilter_ops); + arpt_unregister_table_pre_exit(net, "filter", arpfilter_ops); } static void __net_exit arptable_filter_net_exit(struct net *net) { - if (!net->ipv4.arptable_filter) - return; - arpt_unregister_table(net, net->ipv4.arptable_filter); - net->ipv4.arptable_filter = NULL; + arpt_unregister_table(net, "filter"); } static struct pernet_operations arptable_filter_net_ops = { diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index f77ea0dbe656..13acb687c19a 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -868,7 +868,7 @@ copy_entries_to_user(unsigned int total_size, return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -957,7 +957,7 @@ static int get_info(struct net *net, void __user *user, const int *len) return -EFAULT; name[XT_TABLE_MAXNAMELEN-1] = '\0'; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_lock(AF_INET); #endif @@ -965,7 +965,7 @@ static int get_info(struct net *net, void __user *user, const int *len) if (!IS_ERR(t)) { struct ipt_getinfo info; const struct xt_table_info *private = t->private; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; if (in_compat_syscall()) { @@ -993,7 +993,7 @@ static int get_info(struct net *net, void __user *user, const int *len) module_put(t->me); } else ret = PTR_ERR(t); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_unlock(AF_INET); #endif @@ -1199,7 +1199,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ipt_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; @@ -1621,7 +1621,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) switch (cmd) { case IPT_SO_SET_REPLACE: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(sock_net(sk), arg, len); else @@ -1654,7 +1654,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) break; case IPT_SO_GET_ENTRIES: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_get_entries(sock_net(sk), user, len); else @@ -1716,9 +1716,11 @@ static void __ipt_unregister_table(struct net *net, struct xt_table *table) int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, - const struct nf_hook_ops *ops, struct xt_table **res) + const struct nf_hook_ops *template_ops) { - int ret; + struct nf_hook_ops *ops; + unsigned int num_ops; + int ret, i; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; @@ -1732,50 +1734,65 @@ int ipt_register_table(struct net *net, const struct xt_table *table, memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); - if (ret != 0) - goto out_free; + if (ret != 0) { + xt_free_table_info(newinfo); + return ret; + } new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { - ret = PTR_ERR(new_table); - goto out_free; + xt_free_table_info(newinfo); + return PTR_ERR(new_table); } - /* set res now, will see skbs right after nf_register_net_hooks */ - WRITE_ONCE(*res, new_table); - if (!ops) + /* No template? No need to do anything. This is used by 'nat' table, it registers + * with the nat core instead of the netfilter core. + */ + if (!template_ops) return 0; - ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); - if (ret != 0) { - __ipt_unregister_table(net, new_table); - *res = NULL; + num_ops = hweight32(table->valid_hooks); + if (num_ops == 0) { + ret = -EINVAL; + goto out_free; + } + + ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + if (!ops) { + ret = -ENOMEM; + goto out_free; } + for (i = 0; i < num_ops; i++) + ops[i].priv = new_table; + + new_table->ops = ops; + + ret = nf_register_net_hooks(net, ops, num_ops); + if (ret != 0) + goto out_free; + return ret; out_free: - xt_free_table_info(newinfo); + __ipt_unregister_table(net, new_table); return ret; } -void ipt_unregister_table_pre_exit(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops) +void ipt_unregister_table_pre_exit(struct net *net, const char *name) { - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); -} + struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); -void ipt_unregister_table_exit(struct net *net, struct xt_table *table) -{ - __ipt_unregister_table(net, table); + if (table) + nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } -void ipt_unregister_table(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops) +void ipt_unregister_table_exit(struct net *net, const char *name) { - if (ops) - ipt_unregister_table_pre_exit(net, table, ops); - __ipt_unregister_table(net, table); + struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); + + if (table) + __ipt_unregister_table(net, table); } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ @@ -1829,7 +1846,7 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_IPV4, -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, @@ -1924,7 +1941,6 @@ static void __exit ip_tables_fini(void) } EXPORT_SYMBOL(ipt_register_table); -EXPORT_SYMBOL(ipt_unregister_table); EXPORT_SYMBOL(ipt_unregister_table_pre_exit); EXPORT_SYMBOL(ipt_unregister_table_exit); EXPORT_SYMBOL(ipt_do_table); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index a8b980ad11d4..8f7ca67475b7 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -541,7 +541,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ipt_clusterip_tgt_info { u_int32_t flags; @@ -553,7 +553,7 @@ struct compat_ipt_clusterip_tgt_info u_int32_t hash_initval; compat_uptr_t config; }; -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ static struct xt_target clusterip_tg_reg __read_mostly = { .name = "CLUSTERIP", @@ -563,9 +563,9 @@ static struct xt_target clusterip_tg_reg __read_mostly = { .destroy = clusterip_tg_destroy, .targetsize = sizeof(struct ipt_clusterip_tgt_info), .usersize = offsetof(struct ipt_clusterip_tgt_info, config), -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ .me = THIS_MODULE }; diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 8f7bc1ee7453..8272df7c6ad5 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -34,7 +34,7 @@ static unsigned int iptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ipt_do_table(skb, state, state->net->ipv4.iptable_filter); + return ipt_do_table(skb, state, priv); } static struct nf_hook_ops *filter_ops __read_mostly; @@ -48,9 +48,6 @@ static int __net_init iptable_filter_table_init(struct net *net) struct ipt_replace *repl; int err; - if (net->ipv4.iptable_filter) - return 0; - repl = ipt_alloc_initial_table(&packet_filter); if (repl == NULL) return -ENOMEM; @@ -58,8 +55,7 @@ static int __net_init iptable_filter_table_init(struct net *net) ((struct ipt_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; - err = ipt_register_table(net, &packet_filter, repl, filter_ops, - &net->ipv4.iptable_filter); + err = ipt_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); return err; } @@ -74,17 +70,12 @@ static int __net_init iptable_filter_net_init(struct net *net) static void __net_exit iptable_filter_net_pre_exit(struct net *net) { - if (net->ipv4.iptable_filter) - ipt_unregister_table_pre_exit(net, net->ipv4.iptable_filter, - filter_ops); + ipt_unregister_table_pre_exit(net, "filter"); } static void __net_exit iptable_filter_net_exit(struct net *net) { - if (!net->ipv4.iptable_filter) - return; - ipt_unregister_table_exit(net, net->ipv4.iptable_filter); - net->ipv4.iptable_filter = NULL; + ipt_unregister_table_exit(net, "filter"); } static struct pernet_operations iptable_filter_net_ops = { diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 833079589273..2abc3836f391 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -37,7 +37,7 @@ static const struct xt_table packet_mangler = { }; static unsigned int -ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) +ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *priv) { unsigned int ret; const struct iphdr *iph; @@ -53,7 +53,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) daddr = iph->daddr; tos = iph->tos; - ret = ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); + ret = ipt_do_table(skb, state, priv); /* Reroute for ANY change. */ if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); @@ -78,8 +78,8 @@ iptable_mangle_hook(void *priv, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) - return ipt_mangle_out(skb, state); - return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); + return ipt_mangle_out(skb, state, priv); + return ipt_do_table(skb, state, priv); } static struct nf_hook_ops *mangle_ops __read_mostly; @@ -88,31 +88,22 @@ static int __net_init iptable_mangle_table_init(struct net *net) struct ipt_replace *repl; int ret; - if (net->ipv4.iptable_mangle) - return 0; - repl = ipt_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; - ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops, - &net->ipv4.iptable_mangle); + ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops); kfree(repl); return ret; } static void __net_exit iptable_mangle_net_pre_exit(struct net *net) { - if (net->ipv4.iptable_mangle) - ipt_unregister_table_pre_exit(net, net->ipv4.iptable_mangle, - mangle_ops); + ipt_unregister_table_pre_exit(net, "mangle"); } static void __net_exit iptable_mangle_net_exit(struct net *net) { - if (!net->ipv4.iptable_mangle) - return; - ipt_unregister_table_exit(net, net->ipv4.iptable_mangle); - net->ipv4.iptable_mangle = NULL; + ipt_unregister_table_exit(net, "mangle"); } static struct pernet_operations iptable_mangle_net_ops = { diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index b0143b109f25..a9913842ef18 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -13,8 +13,14 @@ #include <net/netfilter/nf_nat.h> +struct iptable_nat_pernet { + struct nf_hook_ops *nf_nat_ops; +}; + static int __net_init iptable_nat_table_init(struct net *net); +static unsigned int iptable_nat_net_id __read_mostly; + static const struct xt_table nf_nat_ipv4_table = { .name = "nat", .valid_hooks = (1 << NF_INET_PRE_ROUTING) | @@ -30,7 +36,7 @@ static unsigned int iptable_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ipt_do_table(skb, state, state->net->ipv4.nat_table); + return ipt_do_table(skb, state, priv); } static const struct nf_hook_ops nf_nat_ipv4_ops[] = { @@ -62,27 +68,49 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { static int ipt_nat_register_lookups(struct net *net) { + struct iptable_nat_pernet *xt_nat_net; + struct nf_hook_ops *ops; + struct xt_table *table; int i, ret; + xt_nat_net = net_generic(net, iptable_nat_net_id); + table = xt_find_table(net, NFPROTO_IPV4, "nat"); + if (WARN_ON_ONCE(!table)) + return -ENOENT; + + ops = kmemdup(nf_nat_ipv4_ops, sizeof(nf_nat_ipv4_ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++) { - ret = nf_nat_ipv4_register_fn(net, &nf_nat_ipv4_ops[i]); + ops[i].priv = table; + ret = nf_nat_ipv4_register_fn(net, &ops[i]); if (ret) { while (i) - nf_nat_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[--i]); + nf_nat_ipv4_unregister_fn(net, &ops[--i]); + kfree(ops); return ret; } } + xt_nat_net->nf_nat_ops = ops; return 0; } static void ipt_nat_unregister_lookups(struct net *net) { + struct iptable_nat_pernet *xt_nat_net = net_generic(net, iptable_nat_net_id); + struct nf_hook_ops *ops = xt_nat_net->nf_nat_ops; int i; + if (!ops) + return; + for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++) - nf_nat_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[i]); + nf_nat_ipv4_unregister_fn(net, &ops[i]); + + kfree(ops); } static int __net_init iptable_nat_table_init(struct net *net) @@ -90,24 +118,19 @@ static int __net_init iptable_nat_table_init(struct net *net) struct ipt_replace *repl; int ret; - if (net->ipv4.nat_table) - return 0; - repl = ipt_alloc_initial_table(&nf_nat_ipv4_table); if (repl == NULL) return -ENOMEM; - ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, - NULL, &net->ipv4.nat_table); + + ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, NULL); if (ret < 0) { kfree(repl); return ret; } ret = ipt_nat_register_lookups(net); - if (ret < 0) { - ipt_unregister_table(net, net->ipv4.nat_table, NULL); - net->ipv4.nat_table = NULL; - } + if (ret < 0) + ipt_unregister_table_exit(net, "nat"); kfree(repl); return ret; @@ -115,21 +138,19 @@ static int __net_init iptable_nat_table_init(struct net *net) static void __net_exit iptable_nat_net_pre_exit(struct net *net) { - if (net->ipv4.nat_table) - ipt_nat_unregister_lookups(net); + ipt_nat_unregister_lookups(net); } static void __net_exit iptable_nat_net_exit(struct net *net) { - if (!net->ipv4.nat_table) - return; - ipt_unregister_table_exit(net, net->ipv4.nat_table); - net->ipv4.nat_table = NULL; + ipt_unregister_table_exit(net, "nat"); } static struct pernet_operations iptable_nat_net_ops = { .pre_exit = iptable_nat_net_pre_exit, .exit = iptable_nat_net_exit, + .id = &iptable_nat_net_id, + .size = sizeof(struct iptable_nat_pernet), }; static int __init iptable_nat_init(void) diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 9abfe6bf2cb9..ceef397c1f5f 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -41,7 +41,7 @@ static unsigned int iptable_raw_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ipt_do_table(skb, state, state->net->ipv4.iptable_raw); + return ipt_do_table(skb, state, priv); } static struct nf_hook_ops *rawtable_ops __read_mostly; @@ -55,31 +55,22 @@ static int __net_init iptable_raw_table_init(struct net *net) if (raw_before_defrag) table = &packet_raw_before_defrag; - if (net->ipv4.iptable_raw) - return 0; - repl = ipt_alloc_initial_table(table); if (repl == NULL) return -ENOMEM; - ret = ipt_register_table(net, table, repl, rawtable_ops, - &net->ipv4.iptable_raw); + ret = ipt_register_table(net, table, repl, rawtable_ops); kfree(repl); return ret; } static void __net_exit iptable_raw_net_pre_exit(struct net *net) { - if (net->ipv4.iptable_raw) - ipt_unregister_table_pre_exit(net, net->ipv4.iptable_raw, - rawtable_ops); + ipt_unregister_table_pre_exit(net, "raw"); } static void __net_exit iptable_raw_net_exit(struct net *net) { - if (!net->ipv4.iptable_raw) - return; - ipt_unregister_table_exit(net, net->ipv4.iptable_raw); - net->ipv4.iptable_raw = NULL; + ipt_unregister_table_exit(net, "raw"); } static struct pernet_operations iptable_raw_net_ops = { diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 415c1975d770..77973f5fd8f6 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -40,7 +40,7 @@ static unsigned int iptable_security_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ipt_do_table(skb, state, state->net->ipv4.iptable_security); + return ipt_do_table(skb, state, priv); } static struct nf_hook_ops *sectbl_ops __read_mostly; @@ -50,31 +50,22 @@ static int __net_init iptable_security_table_init(struct net *net) struct ipt_replace *repl; int ret; - if (net->ipv4.iptable_security) - return 0; - repl = ipt_alloc_initial_table(&security_table); if (repl == NULL) return -ENOMEM; - ret = ipt_register_table(net, &security_table, repl, sectbl_ops, - &net->ipv4.iptable_security); + ret = ipt_register_table(net, &security_table, repl, sectbl_ops); kfree(repl); return ret; } static void __net_exit iptable_security_net_pre_exit(struct net *net) { - if (net->ipv4.iptable_security) - ipt_unregister_table_pre_exit(net, net->ipv4.iptable_security, - sectbl_ops); + ipt_unregister_table_pre_exit(net, "security"); } static void __net_exit iptable_security_net_exit(struct net *net) { - if (!net->ipv4.iptable_security) - return; - ipt_unregister_table_exit(net, net->ipv4.iptable_security); - net->ipv4.iptable_security = NULL; + ipt_unregister_table_exit(net, "security"); } static struct pernet_operations iptable_security_net_ops = { diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index ffdcc2b9360f..613432a36f0a 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -141,14 +141,16 @@ int nf_defrag_ipv4_enable(struct net *net) struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); int err = 0; - might_sleep(); - - if (nf_defrag->users) - return 0; - mutex_lock(&defrag4_mutex); - if (nf_defrag->users) + if (nf_defrag->users == UINT_MAX) { + err = -EOVERFLOW; goto out_unlock; + } + + if (nf_defrag->users) { + nf_defrag->users++; + goto out_unlock; + } err = nf_register_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); @@ -161,6 +163,22 @@ int nf_defrag_ipv4_enable(struct net *net) } EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); +void nf_defrag_ipv4_disable(struct net *net) +{ + struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); + + mutex_lock(&defrag4_mutex); + if (nf_defrag->users) { + nf_defrag->users--; + if (nf_defrag->users == 0) + nf_unregister_net_hooks(net, ipv4_defrag_ops, + ARRAY_SIZE(ipv4_defrag_ops)); + } + + mutex_unlock(&defrag4_mutex); +} +EXPORT_SYMBOL_GPL(nf_defrag_ipv4_disable); + module_init(nf_defrag_init); module_exit(nf_defrag_fini); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 4f49c12dae53..ad9d17923fc5 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -499,9 +499,8 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } -int tcp_bpf_update_proto(struct sock *sk, bool restore) +int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { - struct sk_psock *psock = sk_psock(sk); int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 7d5c4ebf42fe..954c4591a6fd 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -103,14 +103,12 @@ static int __init udp_bpf_v4_build_proto(void) } core_initcall(udp_bpf_v4_build_proto); -int udp_bpf_update_proto(struct sock *sk, bool restore) +int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; - struct sk_psock *psock = sk_psock(sk); if (restore) { sk->sk_write_space = psock->saved_write_space; - /* Pairs with lockless read in sk_clone_lock() */ WRITE_ONCE(sk->sk_prot, psock->sk_proto); return 0; } @@ -118,7 +116,6 @@ int udp_bpf_update_proto(struct sock *sk, bool restore) if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); - /* Pairs with lockless read in sk_clone_lock() */ WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]); return 0; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index eb2b5404806c..e810a23baf99 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -884,7 +884,7 @@ copy_entries_to_user(unsigned int total_size, return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -973,7 +973,7 @@ static int get_info(struct net *net, void __user *user, const int *len) return -EFAULT; name[XT_TABLE_MAXNAMELEN-1] = '\0'; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_lock(AF_INET6); #endif @@ -981,7 +981,7 @@ static int get_info(struct net *net, void __user *user, const int *len) if (!IS_ERR(t)) { struct ip6t_getinfo info; const struct xt_table_info *private = t->private; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; if (in_compat_syscall()) { @@ -1009,7 +1009,7 @@ static int get_info(struct net *net, void __user *user, const int *len) module_put(t->me); } else ret = PTR_ERR(t); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_unlock(AF_INET6); #endif @@ -1215,7 +1215,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ip6t_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; @@ -1630,7 +1630,7 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) switch (cmd) { case IP6T_SO_SET_REPLACE: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(sock_net(sk), arg, len); else @@ -1663,7 +1663,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) break; case IP6T_SO_GET_ENTRIES: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_get_entries(sock_net(sk), user, len); else @@ -1725,10 +1725,11 @@ static void __ip6t_unregister_table(struct net *net, struct xt_table *table) int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, - const struct nf_hook_ops *ops, - struct xt_table **res) + const struct nf_hook_ops *template_ops) { - int ret; + struct nf_hook_ops *ops; + unsigned int num_ops; + int ret, i; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; @@ -1742,50 +1743,62 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); - if (ret != 0) - goto out_free; + if (ret != 0) { + xt_free_table_info(newinfo); + return ret; + } new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { - ret = PTR_ERR(new_table); - goto out_free; + xt_free_table_info(newinfo); + return PTR_ERR(new_table); } - /* set res now, will see skbs right after nf_register_net_hooks */ - WRITE_ONCE(*res, new_table); - if (!ops) + if (!template_ops) return 0; - ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); - if (ret != 0) { - __ip6t_unregister_table(net, new_table); - *res = NULL; + num_ops = hweight32(table->valid_hooks); + if (num_ops == 0) { + ret = -EINVAL; + goto out_free; } + ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + if (!ops) { + ret = -ENOMEM; + goto out_free; + } + + for (i = 0; i < num_ops; i++) + ops[i].priv = new_table; + + new_table->ops = ops; + + ret = nf_register_net_hooks(net, ops, num_ops); + if (ret != 0) + goto out_free; + return ret; out_free: - xt_free_table_info(newinfo); + __ip6t_unregister_table(net, new_table); return ret; } -void ip6t_unregister_table_pre_exit(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops) +void ip6t_unregister_table_pre_exit(struct net *net, const char *name) { - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); -} + struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); -void ip6t_unregister_table_exit(struct net *net, struct xt_table *table) -{ - __ip6t_unregister_table(net, table); + if (table) + nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } -void ip6t_unregister_table(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops) +void ip6t_unregister_table_exit(struct net *net, const char *name) { - if (ops) - ip6t_unregister_table_pre_exit(net, table, ops); - __ip6t_unregister_table(net, table); + struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); + + if (table) + __ip6t_unregister_table(net, table); } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ @@ -1840,7 +1853,7 @@ static struct xt_target ip6t_builtin_tg[] __read_mostly = { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_IPV6, -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, @@ -1935,7 +1948,6 @@ static void __exit ip6_tables_fini(void) } EXPORT_SYMBOL(ip6t_register_table); -EXPORT_SYMBOL(ip6t_unregister_table); EXPORT_SYMBOL(ip6t_unregister_table_pre_exit); EXPORT_SYMBOL(ip6t_unregister_table_exit); EXPORT_SYMBOL(ip6t_do_table); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 88337b51ffbf..bb784ea7bbd3 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -35,7 +35,7 @@ static unsigned int ip6table_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip6t_do_table(skb, state, state->net->ipv6.ip6table_filter); + return ip6t_do_table(skb, state, priv); } static struct nf_hook_ops *filter_ops __read_mostly; @@ -49,9 +49,6 @@ static int __net_init ip6table_filter_table_init(struct net *net) struct ip6t_replace *repl; int err; - if (net->ipv6.ip6table_filter) - return 0; - repl = ip6t_alloc_initial_table(&packet_filter); if (repl == NULL) return -ENOMEM; @@ -59,8 +56,7 @@ static int __net_init ip6table_filter_table_init(struct net *net) ((struct ip6t_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; - err = ip6t_register_table(net, &packet_filter, repl, filter_ops, - &net->ipv6.ip6table_filter); + err = ip6t_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); return err; } @@ -75,17 +71,12 @@ static int __net_init ip6table_filter_net_init(struct net *net) static void __net_exit ip6table_filter_net_pre_exit(struct net *net) { - if (net->ipv6.ip6table_filter) - ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_filter, - filter_ops); + ip6t_unregister_table_pre_exit(net, "filter"); } static void __net_exit ip6table_filter_net_exit(struct net *net) { - if (!net->ipv6.ip6table_filter) - return; - ip6t_unregister_table_exit(net, net->ipv6.ip6table_filter); - net->ipv6.ip6table_filter = NULL; + ip6t_unregister_table_exit(net, "filter"); } static struct pernet_operations ip6table_filter_net_ops = { diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index cee74803d7a1..c76cffd63041 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -32,7 +32,7 @@ static const struct xt_table packet_mangler = { }; static unsigned int -ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) +ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *priv) { unsigned int ret; struct in6_addr saddr, daddr; @@ -49,7 +49,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u_int32_t *)ipv6_hdr(skb)); - ret = ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle); + ret = ip6t_do_table(skb, state, priv); if (ret != NF_DROP && ret != NF_STOLEN && (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) || @@ -71,8 +71,8 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) - return ip6t_mangle_out(skb, state); - return ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle); + return ip6t_mangle_out(skb, state, priv); + return ip6t_do_table(skb, state, priv); } static struct nf_hook_ops *mangle_ops __read_mostly; @@ -81,32 +81,22 @@ static int __net_init ip6table_mangle_table_init(struct net *net) struct ip6t_replace *repl; int ret; - if (net->ipv6.ip6table_mangle) - return 0; - repl = ip6t_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; - ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops, - &net->ipv6.ip6table_mangle); + ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops); kfree(repl); return ret; } static void __net_exit ip6table_mangle_net_pre_exit(struct net *net) { - if (net->ipv6.ip6table_mangle) - ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_mangle, - mangle_ops); + ip6t_unregister_table_pre_exit(net, "mangle"); } static void __net_exit ip6table_mangle_net_exit(struct net *net) { - if (!net->ipv6.ip6table_mangle) - return; - - ip6t_unregister_table_exit(net, net->ipv6.ip6table_mangle); - net->ipv6.ip6table_mangle = NULL; + ip6t_unregister_table_exit(net, "mangle"); } static struct pernet_operations ip6table_mangle_net_ops = { diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 0a23265e3caa..b0292251e655 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -15,8 +15,14 @@ #include <net/netfilter/nf_nat.h> +struct ip6table_nat_pernet { + struct nf_hook_ops *nf_nat_ops; +}; + static int __net_init ip6table_nat_table_init(struct net *net); +static unsigned int ip6table_nat_net_id __read_mostly; + static const struct xt_table nf_nat_ipv6_table = { .name = "nat", .valid_hooks = (1 << NF_INET_PRE_ROUTING) | @@ -32,7 +38,7 @@ static unsigned int ip6table_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip6t_do_table(skb, state, state->net->ipv6.ip6table_nat); + return ip6t_do_table(skb, state, priv); } static const struct nf_hook_ops nf_nat_ipv6_ops[] = { @@ -64,27 +70,49 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { static int ip6t_nat_register_lookups(struct net *net) { + struct ip6table_nat_pernet *xt_nat_net; + struct nf_hook_ops *ops; + struct xt_table *table; int i, ret; + table = xt_find_table(net, NFPROTO_IPV6, "nat"); + if (WARN_ON_ONCE(!table)) + return -ENOENT; + + xt_nat_net = net_generic(net, ip6table_nat_net_id); + ops = kmemdup(nf_nat_ipv6_ops, sizeof(nf_nat_ipv6_ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++) { - ret = nf_nat_ipv6_register_fn(net, &nf_nat_ipv6_ops[i]); + ops[i].priv = table; + ret = nf_nat_ipv6_register_fn(net, &ops[i]); if (ret) { while (i) - nf_nat_ipv6_unregister_fn(net, &nf_nat_ipv6_ops[--i]); + nf_nat_ipv6_unregister_fn(net, &ops[--i]); + kfree(ops); return ret; } } + xt_nat_net->nf_nat_ops = ops; return 0; } static void ip6t_nat_unregister_lookups(struct net *net) { + struct ip6table_nat_pernet *xt_nat_net = net_generic(net, ip6table_nat_net_id); + struct nf_hook_ops *ops = xt_nat_net->nf_nat_ops; int i; + if (!ops) + return; + for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++) - nf_nat_ipv6_unregister_fn(net, &nf_nat_ipv6_ops[i]); + nf_nat_ipv6_unregister_fn(net, &ops[i]); + + kfree(ops); } static int __net_init ip6table_nat_table_init(struct net *net) @@ -92,45 +120,39 @@ static int __net_init ip6table_nat_table_init(struct net *net) struct ip6t_replace *repl; int ret; - if (net->ipv6.ip6table_nat) - return 0; - repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table); if (repl == NULL) return -ENOMEM; ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl, - NULL, &net->ipv6.ip6table_nat); + NULL); if (ret < 0) { kfree(repl); return ret; } ret = ip6t_nat_register_lookups(net); - if (ret < 0) { - ip6t_unregister_table(net, net->ipv6.ip6table_nat, NULL); - net->ipv6.ip6table_nat = NULL; - } + if (ret < 0) + ip6t_unregister_table_exit(net, "nat"); + kfree(repl); return ret; } static void __net_exit ip6table_nat_net_pre_exit(struct net *net) { - if (net->ipv6.ip6table_nat) - ip6t_nat_unregister_lookups(net); + ip6t_nat_unregister_lookups(net); } static void __net_exit ip6table_nat_net_exit(struct net *net) { - if (!net->ipv6.ip6table_nat) - return; - ip6t_unregister_table_exit(net, net->ipv6.ip6table_nat); - net->ipv6.ip6table_nat = NULL; + ip6t_unregister_table_exit(net, "nat"); } static struct pernet_operations ip6table_nat_net_ops = { .pre_exit = ip6table_nat_net_pre_exit, .exit = ip6table_nat_net_exit, + .id = &ip6table_nat_net_id, + .size = sizeof(struct ip6table_nat_pernet), }; static int __init ip6table_nat_init(void) diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 8f9e742226f7..f63c106c521e 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -40,7 +40,7 @@ static unsigned int ip6table_raw_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip6t_do_table(skb, state, state->net->ipv6.ip6table_raw); + return ip6t_do_table(skb, state, priv); } static struct nf_hook_ops *rawtable_ops __read_mostly; @@ -54,31 +54,22 @@ static int __net_init ip6table_raw_table_init(struct net *net) if (raw_before_defrag) table = &packet_raw_before_defrag; - if (net->ipv6.ip6table_raw) - return 0; - repl = ip6t_alloc_initial_table(table); if (repl == NULL) return -ENOMEM; - ret = ip6t_register_table(net, table, repl, rawtable_ops, - &net->ipv6.ip6table_raw); + ret = ip6t_register_table(net, table, repl, rawtable_ops); kfree(repl); return ret; } static void __net_exit ip6table_raw_net_pre_exit(struct net *net) { - if (net->ipv6.ip6table_raw) - ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_raw, - rawtable_ops); + ip6t_unregister_table_pre_exit(net, "raw"); } static void __net_exit ip6table_raw_net_exit(struct net *net) { - if (!net->ipv6.ip6table_raw) - return; - ip6t_unregister_table_exit(net, net->ipv6.ip6table_raw); - net->ipv6.ip6table_raw = NULL; + ip6t_unregister_table_exit(net, "raw"); } static struct pernet_operations ip6table_raw_net_ops = { diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 5e8c48fed032..8dc335cf450b 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -39,7 +39,7 @@ static unsigned int ip6table_security_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip6t_do_table(skb, state, state->net->ipv6.ip6table_security); + return ip6t_do_table(skb, state, priv); } static struct nf_hook_ops *sectbl_ops __read_mostly; @@ -49,31 +49,22 @@ static int __net_init ip6table_security_table_init(struct net *net) struct ip6t_replace *repl; int ret; - if (net->ipv6.ip6table_security) - return 0; - repl = ip6t_alloc_initial_table(&security_table); if (repl == NULL) return -ENOMEM; - ret = ip6t_register_table(net, &security_table, repl, sectbl_ops, - &net->ipv6.ip6table_security); + ret = ip6t_register_table(net, &security_table, repl, sectbl_ops); kfree(repl); return ret; } static void __net_exit ip6table_security_net_pre_exit(struct net *net) { - if (net->ipv6.ip6table_security) - ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_security, - sectbl_ops); + ip6t_unregister_table_pre_exit(net, "security"); } static void __net_exit ip6table_security_net_exit(struct net *net) { - if (!net->ipv6.ip6table_security) - return; - ip6t_unregister_table_exit(net, net->ipv6.ip6table_security); - net->ipv6.ip6table_security = NULL; + ip6t_unregister_table_exit(net, "security"); } static struct pernet_operations ip6table_security_net_ops = { diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index 402dc4ca9504..e8a59d8bf2ad 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -137,14 +137,16 @@ int nf_defrag_ipv6_enable(struct net *net) struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); int err = 0; - might_sleep(); - - if (nf_frag->users) - return 0; - mutex_lock(&defrag6_mutex); - if (nf_frag->users) + if (nf_frag->users == UINT_MAX) { + err = -EOVERFLOW; + goto out_unlock; + } + + if (nf_frag->users) { + nf_frag->users++; goto out_unlock; + } err = nf_register_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); @@ -157,6 +159,21 @@ int nf_defrag_ipv6_enable(struct net *net) } EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); +void nf_defrag_ipv6_disable(struct net *net) +{ + struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); + + mutex_lock(&defrag6_mutex); + if (nf_frag->users) { + nf_frag->users--; + if (nf_frag->users == 0) + nf_unregister_net_hooks(net, ipv6_defrag_ops, + ARRAY_SIZE(ipv6_defrag_ops)); + } + mutex_unlock(&defrag6_mutex); +} +EXPORT_SYMBOL_GPL(nf_defrag_ipv6_disable); + module_init(nf_defrag_init); module_exit(nf_defrag_fini); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8bf21996734d..29a2d690d8d5 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -392,6 +392,14 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) return false; } +static void mptcp_set_datafin_timeout(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + mptcp_sk(sk)->timer_ival = min(TCP_RTO_MAX, + TCP_RTO_MIN << icsk->icsk_retransmits); +} + static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) { long tout = ssk && inet_csk(ssk)->icsk_pending ? @@ -1062,7 +1070,7 @@ out: } if (snd_una == READ_ONCE(msk->snd_nxt)) { - if (msk->timer_ival) + if (msk->timer_ival && !mptcp_data_fin_enabled(msk)) mptcp_stop_timer(sk); } else { mptcp_reset_timer(sk); @@ -2287,8 +2295,19 @@ static void __mptcp_retrans(struct sock *sk) __mptcp_clean_una_wakeup(sk); dfrag = mptcp_rtx_head(sk); - if (!dfrag) + if (!dfrag) { + if (mptcp_data_fin_enabled(msk)) { + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_retransmits++; + mptcp_set_datafin_timeout(sk); + mptcp_send_ack(msk); + + goto reset_timer; + } + return; + } ssk = mptcp_subflow_get_retrans(msk); if (!ssk) @@ -2474,6 +2493,8 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) pr_debug("Sending DATA_FIN on subflow %p", ssk); mptcp_set_timeout(sk, ssk); tcp_send_ack(ssk); + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); } break; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index fcd8682704c4..56a2531a3402 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -728,6 +728,16 @@ config NETFILTER_XTABLES if NETFILTER_XTABLES +config NETFILTER_XTABLES_COMPAT + bool "Netfilter Xtables 32bit support" + depends on COMPAT + default y + help + This option provides a translation layer to run 32bit arp,ip(6),ebtables + binaries on 64bit kernels. + + If unsure, say N. + comment "Xtables combined modules" config NETFILTER_XT_MARK diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 359ff8ec236a..de2d20c37cda 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1031,26 +1031,22 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, return 0; } -static int ip_set_none(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_none(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { return -EOPNOTSUPP; } -static int ip_set_create(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set, *clash = NULL; ip_set_id_t index = IPSET_INVALID_ID; struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {}; const char *name, *typename; u8 family, revision; - u32 flags = flag_exist(nlh); + u32 flags = flag_exist(info->nlh); int ret = 0; if (unlikely(protocol_min_failed(attr) || @@ -1101,7 +1097,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Set create flags depending on the type revision */ set->flags |= set->type->create_flags[revision]; - ret = set->type->create(net, set, tb, flags); + ret = set->type->create(info->net, set, tb, flags); if (ret != 0) goto put_out; @@ -1183,12 +1179,10 @@ ip_set_destroy_set(struct ip_set *set) kfree(set); } -static int ip_set_destroy(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *s; ip_set_id_t i; int ret = 0; @@ -1230,7 +1224,7 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, /* Modified by ip_set_destroy() only, which is serialized */ inst->is_destroyed = false; } else { - u32 flags = flag_exist(nlh); + u32 flags = flag_exist(info->nlh); s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); if (!s) { @@ -1264,12 +1258,10 @@ ip_set_flush_set(struct ip_set *set) ip_set_unlock(set); } -static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_flush(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *s; ip_set_id_t i; @@ -1304,12 +1296,10 @@ ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = { .len = IPSET_MAXNAMELEN - 1 }, }; -static int ip_set_rename(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_rename(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set, *s; const char *name2; ip_set_id_t i; @@ -1354,12 +1344,10 @@ out: * so the ip_set_list always contains valid pointers to the sets. */ -static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *from, *to; ip_set_id_t from_id, to_id; char from_name[IPSET_MAXNAMELEN]; @@ -1669,10 +1657,8 @@ out: return ret < 0 ? ret : skb->len; } -static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_dump(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; @@ -1683,7 +1669,7 @@ static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb, .dump = ip_set_dump_do, .done = ip_set_dump_done, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } } @@ -1817,30 +1803,24 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, return ret; } -static int ip_set_uadd(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_uadd(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - return ip_set_ad(net, ctnl, skb, - IPSET_ADD, nlh, attr, extack); + return ip_set_ad(info->net, info->sk, skb, + IPSET_ADD, info->nlh, attr, info->extack); } -static int ip_set_udel(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_udel(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - return ip_set_ad(net, ctnl, skb, - IPSET_DEL, nlh, attr, extack); + return ip_set_ad(info->net, info->sk, skb, + IPSET_DEL, info->nlh, attr, info->extack); } -static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_utest(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; @@ -1872,12 +1852,10 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, /* Get headed data of a set */ -static int ip_set_header(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); const struct ip_set *set; struct sk_buff *skb2; struct nlmsghdr *nlh2; @@ -1895,7 +1873,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl, if (!skb2) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_HEADER); if (!nlh2) goto nlmsg_failure; @@ -1907,7 +1885,8 @@ static int ip_set_header(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret < 0) return ret; @@ -1929,10 +1908,8 @@ static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_FAMILY] = { .type = NLA_U8 }, }; -static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_type(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { struct sk_buff *skb2; struct nlmsghdr *nlh2; @@ -1955,7 +1932,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (!skb2) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_TYPE); if (!nlh2) goto nlmsg_failure; @@ -1968,7 +1945,8 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb, nlmsg_end(skb2, nlh2); pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret < 0) return ret; @@ -1988,10 +1966,8 @@ ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, }; -static int ip_set_protocol(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { struct sk_buff *skb2; struct nlmsghdr *nlh2; @@ -2004,7 +1980,7 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl, if (!skb2) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_PROTOCOL); if (!nlh2) goto nlmsg_failure; @@ -2014,7 +1990,8 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret < 0) return ret; @@ -2029,12 +2006,10 @@ nlmsg_failure: /* Get set by name or index, from userspace */ -static int ip_set_byname(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct sk_buff *skb2; struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; @@ -2053,7 +2028,7 @@ static int ip_set_byname(struct net *net, struct sock *ctnl, if (!skb2) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_GET_BYNAME); if (!nlh2) goto nlmsg_failure; @@ -2063,7 +2038,8 @@ static int ip_set_byname(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret < 0) return ret; @@ -2081,12 +2057,10 @@ static const struct nla_policy ip_set_index_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_INDEX] = { .type = NLA_U16 }, }; -static int ip_set_byindex(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct sk_buff *skb2; struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; @@ -2108,7 +2082,7 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl, if (!skb2) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_GET_BYINDEX); if (!nlh2) goto nlmsg_failure; @@ -2117,7 +2091,8 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret < 0) return ret; @@ -2133,80 +2108,96 @@ nlmsg_failure: static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { [IPSET_CMD_NONE] = { .call = ip_set_none, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, }, [IPSET_CMD_CREATE] = { .call = ip_set_create, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_create_policy, }, [IPSET_CMD_DESTROY] = { .call = ip_set_destroy, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_FLUSH] = { .call = ip_set_flush, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_RENAME] = { .call = ip_set_rename, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname2_policy, }, [IPSET_CMD_SWAP] = { .call = ip_set_swap, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname2_policy, }, [IPSET_CMD_LIST] = { .call = ip_set_dump, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_dump_policy, }, [IPSET_CMD_SAVE] = { .call = ip_set_dump, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_ADD] = { .call = ip_set_uadd, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_DEL] = { .call = ip_set_udel, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_TEST] = { .call = ip_set_utest, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_HEADER] = { .call = ip_set_header, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_TYPE] = { .call = ip_set_type, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_type_policy, }, [IPSET_CMD_PROTOCOL] = { .call = ip_set_protocol, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_protocol_policy, }, [IPSET_CMD_GET_BYNAME] = { .call = ip_set_byname, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_GET_BYINDEX] = { .call = ip_set_byindex, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_index_policy, }, diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 44e3cb80e2e0..8690fc07030f 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1524,17 +1524,15 @@ static int ctnetlink_flush_conntrack(struct net *net, return 0; } -static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_del_conntrack(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; - struct nf_conn *ct; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nf_conntrack_zone zone; + struct nf_conn *ct; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); @@ -1550,15 +1548,15 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, else { u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC; - return ctnetlink_flush_conntrack(net, cda, + return ctnetlink_flush_conntrack(info->net, cda, NETLINK_CB(skb).portid, - nlmsg_report(nlh), u3); + nlmsg_report(info->nlh), u3); } if (err < 0) return err; - h = nf_conntrack_find_get(net, &zone, &tuple); + h = nf_conntrack_find_get(info->net, &zone, &tuple); if (!h) return -ENOENT; @@ -1578,28 +1576,26 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, } } - nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); + nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); nf_ct_put(ct); return 0; } -static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_get_conntrack(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; - struct nf_conn *ct; - struct sk_buff *skb2 = NULL; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; + struct sk_buff *skb2; + struct nf_conn *ct; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = ctnetlink_start, .dump = ctnetlink_dump_table, @@ -1607,7 +1603,7 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, .data = (void *)cda, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); @@ -1626,7 +1622,7 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, if (err < 0) return err; - h = nf_conntrack_find_get(net, &zone, &tuple); + h = nf_conntrack_find_get(info->net, &zone, &tuple); if (!h) return -ENOENT; @@ -1639,13 +1635,16 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, return -ENOMEM; } - err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true, 0); + err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), ct, + true, 0); nf_ct_put(ct); if (err <= 0) goto free; - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (err < 0) goto out; @@ -1743,18 +1742,16 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) return ctnetlink_dump_list(skb, cb, true); } -static int ctnetlink_get_ct_dying(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_get_ct_dying(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_dying, .done = ctnetlink_done_list, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } return -EOPNOTSUPP; @@ -1766,18 +1763,16 @@ ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) return ctnetlink_dump_list(skb, cb, false); } -static int ctnetlink_get_ct_unconfirmed(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_unconfirmed, .done = ctnetlink_done_list, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } return -EOPNOTSUPP; @@ -2374,18 +2369,16 @@ err1: return ERR_PTR(err); } -static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_new_conntrack(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - struct nf_conn *ct; u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; + struct nf_conn *ct; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); @@ -2407,13 +2400,13 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, } if (cda[CTA_TUPLE_ORIG]) - h = nf_conntrack_find_get(net, &zone, &otuple); + h = nf_conntrack_find_get(info->net, &zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = nf_conntrack_find_get(net, &zone, &rtuple); + h = nf_conntrack_find_get(info->net, &zone, &rtuple); if (h == NULL) { err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) { + if (info->nlh->nlmsg_flags & NLM_F_CREATE) { enum ip_conntrack_events events; if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY]) @@ -2421,8 +2414,8 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, if (otuple.dst.protonum != rtuple.dst.protonum) return -EINVAL; - ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple, - &rtuple, u3); + ct = ctnetlink_create_conntrack(info->net, &zone, cda, + &otuple, &rtuple, u3); if (IS_ERR(ct)) return PTR_ERR(ct); @@ -2445,7 +2438,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, (1 << IPCT_SYNPROXY) | events, ct, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); nf_ct_put(ct); } @@ -2455,7 +2448,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, err = -EEXIST; ct = nf_ct_tuplehash_to_ctrack(h); - if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { + if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) { err = ctnetlink_change_conntrack(ct, cda); if (err == 0) { nf_conntrack_eventmask_report((1 << IPCT_REPLY) | @@ -2467,7 +2460,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, (1 << IPCT_MARK) | (1 << IPCT_SYNPROXY), ct, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); } } @@ -2539,17 +2532,15 @@ ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static int ctnetlink_stat_ct_cpu(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_stat_ct_cpu(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_ct_stat_cpu_dump, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } return 0; @@ -2585,10 +2576,8 @@ nlmsg_failure: return -1; } -static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_stat_ct(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const cda[]) { struct sk_buff *skb2; int err; @@ -2598,13 +2587,14 @@ static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl, return -ENOMEM; err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), sock_net(skb->sk)); if (err <= 0) goto free; - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (err < 0) goto out; @@ -3284,29 +3274,29 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, return err; } -static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_get_expect(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; - struct sk_buff *skb2; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; + struct sk_buff *skb2; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { if (cda[CTA_EXPECT_MASTER]) - return ctnetlink_dump_exp_ct(net, ctnl, skb, nlh, cda, - extack); + return ctnetlink_dump_exp_ct(info->net, info->sk, skb, + info->nlh, cda, + info->extack); else { struct netlink_dump_control c = { .dump = ctnetlink_exp_dump_table, .done = ctnetlink_exp_done, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } } @@ -3326,7 +3316,7 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, if (err < 0) return err; - exp = nf_ct_expect_find_get(net, &zone, &tuple); + exp = nf_ct_expect_find_get(info->net, &zone, &tuple); if (!exp) return -ENOENT; @@ -3348,13 +3338,15 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, rcu_read_lock(); err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); + info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, + exp); rcu_read_unlock(); nf_ct_expect_put(exp); if (err <= 0) goto free; - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (err < 0) goto out; @@ -3382,15 +3374,14 @@ static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data) return true; } -static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_del_expect(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; int err; @@ -3406,7 +3397,7 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, return err; /* bump usage count to 2 */ - exp = nf_ct_expect_find_get(net, &zone, &tuple); + exp = nf_ct_expect_find_get(info->net, &zone, &tuple); if (!exp) return -ENOENT; @@ -3422,7 +3413,7 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, spin_lock_bh(&nf_conntrack_expect_lock); if (del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); nf_ct_expect_put(exp); } spin_unlock_bh(&nf_conntrack_expect_lock); @@ -3432,14 +3423,14 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, } else if (cda[CTA_EXPECT_HELP_NAME]) { char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]); - nf_ct_expect_iterate_net(net, expect_iter_name, name, + nf_ct_expect_iterate_net(info->net, expect_iter_name, name, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); } else { /* This basically means we have to flush everything*/ - nf_ct_expect_iterate_net(net, expect_iter_all, NULL, + nf_ct_expect_iterate_net(info->net, expect_iter_all, NULL, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); } return 0; @@ -3635,15 +3626,14 @@ err_ct: return err; } -static int ctnetlink_new_expect(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_new_expect(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; int err; @@ -3662,20 +3652,20 @@ static int ctnetlink_new_expect(struct net *net, struct sock *ctnl, return err; spin_lock_bh(&nf_conntrack_expect_lock); - exp = __nf_ct_expect_find(net, &zone, &tuple); + exp = __nf_ct_expect_find(info->net, &zone, &tuple); if (!exp) { spin_unlock_bh(&nf_conntrack_expect_lock); err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) { - err = ctnetlink_create_expect(net, &zone, cda, u3, + if (info->nlh->nlmsg_flags & NLM_F_CREATE) { + err = ctnetlink_create_expect(info->net, &zone, cda, u3, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); } return err; } err = -EEXIST; - if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) err = ctnetlink_change_expect(exp, cda); spin_unlock_bh(&nf_conntrack_expect_lock); @@ -3736,17 +3726,15 @@ ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static int ctnetlink_stat_exp_cpu(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_stat_exp_cpu(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_exp_stat_cpu_dump, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } return 0; @@ -3763,35 +3751,71 @@ static struct nf_exp_event_notifier ctnl_notifier_exp = { #endif static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { - [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, - .attr_count = CTA_MAX, - .policy = ct_nla_policy }, - [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .policy = ct_nla_policy }, - [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, - .attr_count = CTA_MAX, - .policy = ct_nla_policy }, - [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .policy = ct_nla_policy }, - [IPCTNL_MSG_CT_GET_STATS_CPU] = { .call = ctnetlink_stat_ct_cpu }, - [IPCTNL_MSG_CT_GET_STATS] = { .call = ctnetlink_stat_ct }, - [IPCTNL_MSG_CT_GET_DYING] = { .call = ctnetlink_get_ct_dying }, - [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { .call = ctnetlink_get_ct_unconfirmed }, + [IPCTNL_MSG_CT_NEW] = { + .call = ctnetlink_new_conntrack, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_MAX, + .policy = ct_nla_policy + }, + [IPCTNL_MSG_CT_GET] = { + .call = ctnetlink_get_conntrack, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_MAX, + .policy = ct_nla_policy + }, + [IPCTNL_MSG_CT_DELETE] = { + .call = ctnetlink_del_conntrack, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_MAX, + .policy = ct_nla_policy + }, + [IPCTNL_MSG_CT_GET_CTRZERO] = { + .call = ctnetlink_get_conntrack, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_MAX, + .policy = ct_nla_policy + }, + [IPCTNL_MSG_CT_GET_STATS_CPU] = { + .call = ctnetlink_stat_ct_cpu, + .type = NFNL_CB_MUTEX, + }, + [IPCTNL_MSG_CT_GET_STATS] = { + .call = ctnetlink_stat_ct, + .type = NFNL_CB_MUTEX, + }, + [IPCTNL_MSG_CT_GET_DYING] = { + .call = ctnetlink_get_ct_dying, + .type = NFNL_CB_MUTEX, + }, + [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { + .call = ctnetlink_get_ct_unconfirmed, + .type = NFNL_CB_MUTEX, + }, }; static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { - [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, - .attr_count = CTA_EXPECT_MAX, - .policy = exp_nla_policy }, - [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, - .attr_count = CTA_EXPECT_MAX, - .policy = exp_nla_policy }, - [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, - .attr_count = CTA_EXPECT_MAX, - .policy = exp_nla_policy }, - [IPCTNL_MSG_EXP_GET_STATS_CPU] = { .call = ctnetlink_stat_exp_cpu }, + [IPCTNL_MSG_EXP_GET] = { + .call = ctnetlink_get_expect, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy + }, + [IPCTNL_MSG_EXP_NEW] = { + .call = ctnetlink_new_expect, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy + }, + [IPCTNL_MSG_EXP_DELETE] = { + .call = ctnetlink_del_expect, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy + }, + [IPCTNL_MSG_EXP_GET_STATS_CPU] = { + .call = ctnetlink_stat_exp_cpu, + .type = NFNL_CB_MUTEX, + }, }; static const struct nfnetlink_subsystem ctnl_subsys = { diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 47e9319d2cf3..89e5bac384d7 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -536,15 +536,19 @@ static void nf_ct_netns_do_put(struct net *net, u8 nfproto) mutex_lock(&nf_ct_proto_mutex); switch (nfproto) { case NFPROTO_IPV4: - if (cnet->users4 && (--cnet->users4 == 0)) + if (cnet->users4 && (--cnet->users4 == 0)) { nf_unregister_net_hooks(net, ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); + nf_defrag_ipv4_disable(net); + } break; #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: - if (cnet->users6 && (--cnet->users6 == 0)) + if (cnet->users6 && (--cnet->users6 == 0)) { nf_unregister_net_hooks(net, ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); + nf_defrag_ipv6_disable(net); + } break; #endif case NFPROTO_BRIDGE: diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index 2518818ed479..13234641cdb3 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -1011,6 +1011,7 @@ static void __net_exit nf_log_syslog_net_exit(struct net *net) nf_log_unset(net, &nf_arp_logger); nf_log_unset(net, &nf_ip6_logger); nf_log_unset(net, &nf_netdev_logger); + nf_log_unset(net, &nf_bridge_logger); } static struct pernet_operations nf_log_syslog_net_ops = { diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index b7c3c902290f..7de595ead06a 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -146,43 +146,6 @@ static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) return; } } - -int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) -{ - struct flowi fl; - unsigned int hh_len; - struct dst_entry *dst; - struct sock *sk = skb->sk; - int err; - - err = xfrm_decode_session(skb, &fl, family); - if (err < 0) - return err; - - dst = skb_dst(skb); - if (dst->xfrm) - dst = ((struct xfrm_dst *)dst)->route; - if (!dst_hold_safe(dst)) - return -EHOSTUNREACH; - - if (sk && !net_eq(net, sock_net(sk))) - sk = NULL; - - dst = xfrm_lookup(net, dst, &fl, sk, 0); - if (IS_ERR(dst)) - return PTR_ERR(dst); - - skb_dst_drop(skb); - skb_dst_set(skb, dst); - - /* Change in oif may mean change in hh_len. */ - hh_len = skb_dst(skb)->dev->hard_header_len; - if (skb_headroom(skb) < hh_len && - pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) - return -ENOMEM; - return 0; -} -EXPORT_SYMBOL(nf_xfrm_me_harder); #endif /* CONFIG_XFRM */ /* We keep an extra hash for each conntrack, for fast searching. */ diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 4731d21fc3ad..48cc60084d28 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -659,6 +659,44 @@ nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb, return ret; } +#ifdef CONFIG_XFRM +static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) +{ + struct sock *sk = skb->sk; + struct dst_entry *dst; + unsigned int hh_len; + struct flowi fl; + int err; + + err = xfrm_decode_session(skb, &fl, family); + if (err < 0) + return err; + + dst = skb_dst(skb); + if (dst->xfrm) + dst = ((struct xfrm_dst *)dst)->route; + if (!dst_hold_safe(dst)) + return -EHOSTUNREACH; + + if (sk && !net_eq(net, sock_net(sk))) + sk = NULL; + + dst = xfrm_lookup(net, dst, &fl, sk, 0); + if (IS_ERR(dst)) + return PTR_ERR(dst); + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + /* Change in oif may mean change in hh_len. */ + hh_len = skb_dst(skb)->dev->hard_header_len; + if (skb_headroom(skb) < hh_len && + pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) + return -ENOMEM; + return 0; +} +#endif + static unsigned int nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 357443b3c0e4..1050f23c0d29 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -21,7 +21,6 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> #include <net/net_namespace.h> -#include <net/netns/generic.h> #include <net/sock.h> #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) @@ -106,7 +105,7 @@ static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types static void nft_validate_state_update(struct net *net, u8 new_validate_state) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); switch (nft_net->validate_state) { case NFT_VALIDATE_SKIP: @@ -181,7 +180,7 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) if (!nft_set_is_anonymous(set)) return; - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { switch (trans->msg_type) { case NFT_MSG_NEWSET: @@ -278,9 +277,8 @@ static void nf_tables_unregister_hook(struct net *net, static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans) { - struct nftables_pernet *nft_net; + struct nftables_pernet *nft_net = nft_pernet(net); - nft_net = net_generic(net, nf_tables_net_id); list_add_tail(&trans->list, &nft_net->commit_list); } @@ -566,7 +564,7 @@ static struct nft_table *nft_table_lookup(const struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); list_for_each_entry_rcu(table, &nft_net->tables, list, lockdep_is_held(&nft_net->commit_mutex)) { if (!nla_strcmp(nla, table->name) && @@ -590,7 +588,7 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net, struct nftables_pernet *nft_net; struct nft_table *table; - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); list_for_each_entry(table, &nft_net->tables, list) { if (be64_to_cpu(nla_get_be64(nla)) == table->handle && nft_active_genmask(table, genmask)) @@ -655,7 +653,7 @@ __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, if (ret >= MODULE_NAME_LEN) return 0; - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); list_for_each_entry(req, &nft_net->module_list, list) { if (!strcmp(req->module, module_name)) { if (req->done) @@ -711,7 +709,7 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, static __be16 nft_base_seq(const struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); return htons(nft_net->base_seq & 0xffff); } @@ -793,7 +791,7 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) goto err; } - nft_net = net_generic(ctx->net, nf_tables_net_id); + nft_net = nft_pernet(ctx->net); nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: @@ -811,7 +809,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, int family = nfmsg->nfgen_family; rcu_read_lock(); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); cb->seq = nft_net->base_seq; list_for_each_entry_rcu(table, &nft_net->tables, list) { @@ -860,25 +858,25 @@ static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb, } /* called with rcu_read_lock held */ -static int nf_tables_gettable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_gettable(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_cur(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + int family = nfmsg->nfgen_family; const struct nft_table *table; + struct net *net = info->net; struct sk_buff *skb2; - int family = nfmsg->nfgen_family; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_tables, .module = THIS_MODULE, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask, 0); @@ -892,8 +890,8 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, return -ENOMEM; err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, - family, table); + info->nlh->nlmsg_seq, NFT_MSG_NEWTABLE, + 0, family, table); if (err < 0) goto err_fill_table_info; @@ -1057,15 +1055,15 @@ static int nft_objname_hash_cmp(struct rhashtable_compare_arg *arg, return strcmp(obj->key.name, k->name); } -static int nf_tables_newtable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + struct nftables_pernet *nft_net = nft_pernet(info->net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); int family = nfmsg->nfgen_family; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; @@ -1080,14 +1078,15 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } - if (nlh->nlmsg_flags & NLM_F_REPLACE) + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + return nf_tables_updtable(&ctx); } @@ -1128,7 +1127,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (table->flags & NFT_TABLE_F_OWNER) table->nlpid = NETLINK_CB(skb).portid; - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) goto err_trans; @@ -1221,9 +1220,9 @@ out: static int nft_flush(struct nft_ctx *ctx, int family) { - struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); - struct nft_table *table, *nt; + struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nlattr * const *nla = ctx->nla; + struct nft_table *table, *nt; int err = 0; list_for_each_entry_safe(table, nt, &nft_net->tables, list) { @@ -1252,19 +1251,19 @@ out: return err; } -static int nf_tables_deltable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); int family = nfmsg->nfgen_family; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; - nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, 0, NULL, NULL, nla); if (family == AF_UNSPEC || (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE])) return nft_flush(&ctx, family); @@ -1283,7 +1282,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - if (nlh->nlmsg_flags & NLM_F_NONREC && + if (info->nlh->nlmsg_flags & NLM_F_NONREC && table->use > 0) return -EBUSY; @@ -1345,7 +1344,7 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) static bool lockdep_commit_lock_is_held(const struct net *net) { #ifdef CONFIG_PROVE_LOCKING - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); return lockdep_is_held(&nft_net->commit_mutex); #else @@ -1570,7 +1569,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) goto err; } - nft_net = net_generic(ctx->net, nf_tables_net_id); + nft_net = nft_pernet(ctx->net); nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: @@ -1581,15 +1580,15 @@ static int nf_tables_dump_chains(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - const struct nft_table *table; - const struct nft_chain *chain; unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nftables_pernet *nft_net; + const struct nft_table *table; + const struct nft_chain *chain; rcu_read_lock(); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); cb->seq = nft_net->base_seq; list_for_each_entry_rcu(table, &nft_net->tables, list) { @@ -1625,26 +1624,26 @@ done: } /* called with rcu_read_lock held */ -static int nf_tables_getchain(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_cur(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + int family = nfmsg->nfgen_family; const struct nft_chain *chain; + struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; - int family = nfmsg->nfgen_family; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_chains, .module = THIS_MODULE, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, 0); @@ -1664,8 +1663,8 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, return -ENOMEM; err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, - family, table, chain); + info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, + 0, family, table, chain); if (err < 0) goto err_fill_chain_info; @@ -1908,7 +1907,7 @@ static int nft_chain_parse_hook(struct net *net, struct nft_chain_hook *hook, u8 family, bool autoload) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nft_chain_type *type; int err; @@ -2302,7 +2301,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (nla[NFTA_CHAIN_HANDLE] && nla[NFTA_CHAIN_NAME]) { - struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct nft_trans *tmp; char *name; @@ -2338,7 +2337,7 @@ err: static struct nft_chain *nft_chain_lookup_byid(const struct net *net, const struct nlattr *nla) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans *trans; @@ -2352,16 +2351,16 @@ static struct nft_chain *nft_chain_lookup_byid(const struct net *net, return ERR_PTR(-ENOENT); } -static int nf_tables_newchain(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + struct nftables_pernet *nft_net = nft_pernet(info->net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); int family = nfmsg->nfgen_family; struct nft_chain *chain = NULL; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; u8 policy = NF_ACCEPT; @@ -2433,14 +2432,14 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (flags & ~NFT_CHAIN_FLAGS) return -EOPNOTSUPP; - nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (chain != NULL) { - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } - if (nlh->nlmsg_flags & NLM_F_REPLACE) + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; flags |= chain->flags & NFT_CHAIN_BASE; @@ -2451,14 +2450,14 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, return nf_tables_addchain(&ctx, family, genmask, policy, flags); } -static int nf_tables_delchain(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); int family = nfmsg->nfgen_family; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_chain *chain; @@ -2488,11 +2487,11 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, return PTR_ERR(chain); } - if (nlh->nlmsg_flags & NLM_F_NONREC && + if (info->nlh->nlmsg_flags & NLM_F_NONREC && chain->use > 0) return -EBUSY; - nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); use = chain->use; list_for_each_entry(rule, &chain->rules, list) { @@ -2715,15 +2714,15 @@ err1: } static int nf_tables_newexpr(const struct nft_ctx *ctx, - const struct nft_expr_info *info, + const struct nft_expr_info *expr_info, struct nft_expr *expr) { - const struct nft_expr_ops *ops = info->ops; + const struct nft_expr_ops *ops = expr_info->ops; int err; expr->ops = ops; if (ops->init) { - err = ops->init(ctx, expr, (const struct nlattr **)info->tb); + err = ops->init(ctx, expr, (const struct nlattr **)expr_info->tb); if (err < 0) goto err1; } @@ -2747,21 +2746,21 @@ static void nf_tables_expr_destroy(const struct nft_ctx *ctx, static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, const struct nlattr *nla) { - struct nft_expr_info info; + struct nft_expr_info expr_info; struct nft_expr *expr; struct module *owner; int err; - err = nf_tables_expr_parse(ctx, nla, &info); + err = nf_tables_expr_parse(ctx, nla, &expr_info); if (err < 0) goto err1; err = -ENOMEM; - expr = kzalloc(info.ops->size, GFP_KERNEL); + expr = kzalloc(expr_info.ops->size, GFP_KERNEL); if (expr == NULL) goto err2; - err = nf_tables_newexpr(ctx, &info, expr); + err = nf_tables_newexpr(ctx, &expr_info, expr); if (err < 0) goto err3; @@ -2769,9 +2768,9 @@ static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, err3: kfree(expr); err2: - owner = info.ops->type->owner; - if (info.ops->type->release_ops) - info.ops->type->release_ops(info.ops); + owner = expr_info.ops->type->owner; + if (expr_info.ops->type->release_ops) + expr_info.ops->type->release_ops(expr_info.ops); module_put(owner); err1: @@ -2908,7 +2907,7 @@ nla_put_failure: static void nf_tables_rule_notify(const struct nft_ctx *ctx, const struct nft_rule *rule, int event) { - struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; int err; @@ -2989,7 +2988,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, struct nftables_pernet *nft_net; rcu_read_lock(); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); cb->seq = nft_net->base_seq; list_for_each_entry_rcu(table, &nft_net->tables, list) { @@ -3078,21 +3077,21 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb) } /* called with rcu_read_lock held */ -static int nf_tables_getrule(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_cur(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + int family = nfmsg->nfgen_family; const struct nft_chain *chain; const struct nft_rule *rule; + struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; - int family = nfmsg->nfgen_family; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start= nf_tables_dump_rules_start, .dump = nf_tables_dump_rules, @@ -3101,7 +3100,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, .data = (void *)nla, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0); @@ -3127,7 +3126,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return -ENOMEM; err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, + info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, family, table, chain, rule, NULL); if (err < 0) goto err_fill_rule_info; @@ -3218,28 +3217,28 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net, #define NFT_RULE_MAXEXPRS 128 -static int nf_tables_newrule(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); - struct nft_expr_info *info = NULL; + struct nftables_pernet *nft_net = nft_pernet(info->net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + unsigned int size, i, n, ulen = 0, usize = 0; + u8 genmask = nft_genmask_next(info->net); + struct nft_rule *rule, *old_rule = NULL; + struct nft_expr_info *expr_info = NULL; int family = nfmsg->nfgen_family; + struct net *net = info->net; struct nft_flow_rule *flow; + struct nft_userdata *udata; struct nft_table *table; struct nft_chain *chain; - struct nft_rule *rule, *old_rule = NULL; - struct nft_userdata *udata; - struct nft_trans *trans = NULL; + struct nft_trans *trans; + u64 handle, pos_handle; struct nft_expr *expr; struct nft_ctx ctx; struct nlattr *tmp; - unsigned int size, i, n, ulen = 0, usize = 0; int err, rem; - u64 handle, pos_handle; lockdep_assert_held(&nft_net->commit_mutex); @@ -3278,17 +3277,17 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return PTR_ERR(rule); } - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return -EEXIST; } - if (nlh->nlmsg_flags & NLM_F_REPLACE) + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) old_rule = rule; else return -EOPNOTSUPP; } else { - if (!(nlh->nlmsg_flags & NLM_F_CREATE) || - nlh->nlmsg_flags & NLM_F_REPLACE) + if (!(info->nlh->nlmsg_flags & NLM_F_CREATE) || + info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EINVAL; handle = nf_tables_alloc_handle(table); @@ -3311,15 +3310,15 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } } - nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); n = 0; size = 0; if (nla[NFTA_RULE_EXPRESSIONS]) { - info = kvmalloc_array(NFT_RULE_MAXEXPRS, - sizeof(struct nft_expr_info), - GFP_KERNEL); - if (!info) + expr_info = kvmalloc_array(NFT_RULE_MAXEXPRS, + sizeof(struct nft_expr_info), + GFP_KERNEL); + if (!expr_info) return -ENOMEM; nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) { @@ -3328,10 +3327,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, goto err1; if (n == NFT_RULE_MAXEXPRS) goto err1; - err = nf_tables_expr_parse(&ctx, tmp, &info[n]); + err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]); if (err < 0) goto err1; - size += info[n].ops->size; + size += expr_info[n].ops->size; n++; } } @@ -3365,20 +3364,20 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, expr = nft_expr_first(rule); for (i = 0; i < n; i++) { - err = nf_tables_newexpr(&ctx, &info[i], expr); + err = nf_tables_newexpr(&ctx, &expr_info[i], expr); if (err < 0) { - NL_SET_BAD_ATTR(extack, info[i].attr); + NL_SET_BAD_ATTR(extack, expr_info[i].attr); goto err2; } - if (info[i].ops->validate) + if (expr_info[i].ops->validate) nft_validate_state_update(net, NFT_VALIDATE_NEED); - info[i].ops = NULL; + expr_info[i].ops = NULL; expr = nft_expr_next(expr); } - if (nlh->nlmsg_flags & NLM_F_REPLACE) { + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; @@ -3398,7 +3397,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, goto err2; } - if (nlh->nlmsg_flags & NLM_F_APPEND) { + if (info->nlh->nlmsg_flags & NLM_F_APPEND) { if (old_rule) list_add_rcu(&rule->list, &old_rule->list); else @@ -3410,7 +3409,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, list_add_rcu(&rule->list, &chain->rules); } } - kvfree(info); + kvfree(expr_info); chain->use++; if (nft_net->validate_state == NFT_VALIDATE_DO) @@ -3429,20 +3428,21 @@ err2: nf_tables_rule_release(&ctx, rule); err1: for (i = 0; i < n; i++) { - if (info[i].ops) { - module_put(info[i].ops->type->owner); - if (info[i].ops->type->release_ops) - info[i].ops->type->release_ops(info[i].ops); + if (expr_info[i].ops) { + module_put(expr_info[i].ops->type->owner); + if (expr_info[i].ops->type->release_ops) + expr_info[i].ops->type->release_ops(expr_info[i].ops); } } - kvfree(info); + kvfree(expr_info); + return err; } static struct nft_rule *nft_rule_lookup_byid(const struct net *net, const struct nlattr *nla) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans *trans; @@ -3456,17 +3456,17 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net, return ERR_PTR(-ENOENT); } -static int nf_tables_delrule(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); - struct nft_table *table; + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + int family = nfmsg->nfgen_family, err = 0; + u8 genmask = nft_genmask_next(info->net); struct nft_chain *chain = NULL; + struct net *net = info->net; + struct nft_table *table; struct nft_rule *rule; - int family = nfmsg->nfgen_family, err = 0; struct nft_ctx ctx; table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, @@ -3487,7 +3487,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, return -EOPNOTSUPP; } - nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (chain) { if (nla[NFTA_RULE_HANDLE]) { @@ -3559,7 +3559,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc, enum nft_set_policies policy) { - struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nft_set_ops *ops, *bops; struct nft_set_estimate est, best; const struct nft_set_type *type; @@ -3704,9 +3704,9 @@ static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table, static struct nft_set *nft_set_lookup_byid(const struct net *net, const struct nlattr *nla, u8 genmask) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); - struct nft_trans *trans; + struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); + struct nft_trans *trans; list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->msg_type == NFT_MSG_NEWSET) { @@ -3942,7 +3942,7 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, const struct nft_set *set, int event, gfp_t gfp_flags) { - struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; u32 portid = ctx->portid; int err; @@ -3980,7 +3980,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; rcu_read_lock(); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); cb->seq = nft_net->base_seq; list_for_each_entry_rcu(table, &nft_net->tables, list) { @@ -4047,25 +4047,25 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb) } /* called with rcu_read_lock held */ -static int nf_tables_getset(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - u8 genmask = nft_genmask_cur(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + struct net *net = info->net; const struct nft_set *set; - struct nft_ctx ctx; struct sk_buff *skb2; - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_ctx ctx; int err; /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack, + err = nft_ctx_init_from_setattr(&ctx, net, skb, info->nlh, nla, extack, genmask, 0); if (err < 0) return err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_sets_start, .dump = nf_tables_dump_sets, @@ -4074,7 +4074,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, .module = THIS_MODULE, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } /* Only accept unspec with dump */ @@ -4168,28 +4168,27 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc, return err; } -static int nf_tables_newset(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u32 ktype, dtype, flags, policy, gc_int, objtype; + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); int family = nfmsg->nfgen_family; const struct nft_set_ops *ops; struct nft_expr *expr = NULL; + struct net *net = info->net; + struct nft_set_desc desc; struct nft_table *table; + unsigned char *udata; struct nft_set *set; struct nft_ctx ctx; - char *name; - u64 size; u64 timeout; - u32 ktype, dtype, flags, policy, gc_int, objtype; - struct nft_set_desc desc; - unsigned char *udata; + char *name; + int err, i; u16 udlen; - int err; - int i; + u64 size; if (nla[NFTA_SET_TABLE] == NULL || nla[NFTA_SET_NAME] == NULL || @@ -4297,7 +4296,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { @@ -4306,17 +4305,17 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, return PTR_ERR(set); } } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return -EEXIST; } - if (nlh->nlmsg_flags & NLM_F_REPLACE) + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; return 0; } - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; ops = nft_select_set_ops(&ctx, nla, &desc, policy); @@ -4450,13 +4449,13 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) kvfree(set); } -static int nf_tables_delset(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + struct net *net = info->net; const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; @@ -4467,7 +4466,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack, + err = nft_ctx_init_from_setattr(&ctx, net, skb, info->nlh, nla, extack, genmask, NETLINK_CB(skb).portid); if (err < 0) return err; @@ -4485,7 +4484,8 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, return PTR_ERR(set); } if (set->use || - (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) { + (info->nlh->nlmsg_flags & NLM_F_NONREC && + atomic_read(&set->nelems) > 0)) { NL_SET_BAD_ATTR(extack, attr); return -EBUSY; } @@ -4833,7 +4833,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) int event; rcu_read_lock(); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && dump_ctx->ctx.family != table->family) @@ -5065,18 +5065,19 @@ err_fill_setelem: } /* called with rcu_read_lock held */ -static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getsetelem(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { - u8 genmask = nft_genmask_cur(net); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + struct net *net = info->net; struct nft_set *set; struct nlattr *attr; struct nft_ctx ctx; int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, + err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack, genmask, NETLINK_CB(skb).portid); if (err < 0) return err; @@ -5085,7 +5086,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, if (IS_ERR(set)) return PTR_ERR(set); - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_set_start, .dump = nf_tables_dump_set, @@ -5098,7 +5099,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, }; c.data = &dump_ctx; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) @@ -5138,7 +5139,7 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, goto err; } - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: @@ -5655,13 +5656,14 @@ err_set_elem_expr_clone: return err; } -static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newsetelem(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); - u8 genmask = nft_genmask_next(net); + struct nftables_pernet *nft_net = nft_pernet(info->net); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + struct net *net = info->net; const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; @@ -5670,7 +5672,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, + err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack, genmask, NETLINK_CB(skb).portid); if (err < 0) return err; @@ -5684,7 +5686,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, return -EBUSY; nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags); + err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags); if (err < 0) return err; } @@ -5867,18 +5869,19 @@ err1: return err; } -static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delsetelem(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { - u8 genmask = nft_genmask_next(net); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + struct net *net = info->net; const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, + err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack, genmask, NETLINK_CB(skb).portid); if (err < 0) return err; @@ -6162,15 +6165,15 @@ err_free_trans: return err; } -static int nf_tables_newobj(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); const struct nft_object_type *type; - u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + struct net *net = info->net; struct nft_table *table; struct nft_object *obj; struct nft_ctx ctx; @@ -6198,20 +6201,20 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, return err; } } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return -EEXIST; } - if (nlh->nlmsg_flags & NLM_F_REPLACE) + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; type = __nft_obj_type_get(objtype); - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj); } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); type = nft_obj_type_get(net, objtype); if (IS_ERR(type)) @@ -6323,7 +6326,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) reset = true; rcu_read_lock(); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); cb->seq = nft_net->base_seq; list_for_each_entry_rcu(table, &nft_net->tables, list) { @@ -6418,22 +6421,22 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb) } /* called with rcu_read_lock held */ -static int nf_tables_getobj(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_cur(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); int family = nfmsg->nfgen_family; const struct nft_table *table; + struct net *net = info->net; struct nft_object *obj; struct sk_buff *skb2; bool reset = false; u32 objtype; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_obj_start, .dump = nf_tables_dump_obj, @@ -6442,7 +6445,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, .data = (void *)nla, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!nla[NFTA_OBJ_NAME] || @@ -6466,14 +6469,14 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, if (!skb2) return -ENOMEM; - if (NFNL_MSG_TYPE(nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) reset = true; if (reset) { const struct nftables_pernet *nft_net; char *buf; - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, nft_net->base_seq); audit_log_nfcfg(buf, @@ -6485,7 +6488,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, } err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, + info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, family, table, obj, reset); if (err < 0) goto err_fill_obj_info; @@ -6508,14 +6511,14 @@ static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) kfree(obj); } -static int nf_tables_delobj(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); int family = nfmsg->nfgen_family; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_object *obj; @@ -6551,7 +6554,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, return -EBUSY; } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nft_delobj(&ctx, obj); } @@ -6560,7 +6563,7 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, int family, int report, gfp_t gfp) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct sk_buff *skb; int err; char *buf = kasprintf(gfp, "%s:%u", @@ -6938,19 +6941,19 @@ err_flowtable_update_hook: } -static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newflowtable(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; struct nft_flowtable_hook flowtable_hook; + u8 genmask = nft_genmask_next(info->net); const struct nf_flowtable_type *type; - u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; struct nft_hook *hook, *next; + struct net *net = info->net; struct nft_table *table; struct nft_ctx ctx; int err; @@ -6976,17 +6979,17 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, return err; } } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return -EEXIST; } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); - return nft_flowtable_update(&ctx, nlh, flowtable); + return nft_flowtable_update(&ctx, info->nlh, flowtable); } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL); if (!flowtable) @@ -7127,16 +7130,16 @@ err_flowtable_del_hook: return err; } -static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delflowtable(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; @@ -7166,7 +7169,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, return PTR_ERR(flowtable); } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (nla[NFTA_FLOWTABLE_HOOK]) return nft_delflowtable_hook(&ctx, flowtable); @@ -7246,7 +7249,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, const struct nft_table *table; rcu_read_lock(); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); cb->seq = nft_net->base_seq; list_for_each_entry_rcu(table, &nft_net->tables, list) { @@ -7322,21 +7325,20 @@ static int nf_tables_dump_flowtable_done(struct netlink_callback *cb) } /* called with rcu_read_lock held */ -static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getflowtable(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_cur(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u8 genmask = nft_genmask_cur(info->net); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; const struct nft_table *table; + struct net *net = info->net; struct sk_buff *skb2; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_flowtable_start, .dump = nf_tables_dump_flowtable, @@ -7345,7 +7347,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, .data = (void *)nla, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!nla[NFTA_FLOWTABLE_NAME]) @@ -7366,7 +7368,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, return -ENOMEM; err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, + info->nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, 0, family, flowtable, &flowtable->hook_list); if (err < 0) @@ -7384,7 +7386,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, struct list_head *hook_list, int event) { - struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; int err; @@ -7429,7 +7431,7 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nlmsghdr *nlh; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); @@ -7482,7 +7484,7 @@ static int nf_tables_flowtable_event(struct notifier_block *this, return 0; net = dev_net(dev); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { list_for_each_entry(flowtable, &table->flowtables, list) { @@ -7528,10 +7530,8 @@ err: -ENOBUFS); } -static int nf_tables_getgen(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getgen(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { struct sk_buff *skb2; int err; @@ -7540,12 +7540,12 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk, if (skb2 == NULL) return -ENOMEM; - err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq); + err = nf_tables_fill_gen_info(skb2, info->net, NETLINK_CB(skb).portid, + info->nlh->nlmsg_seq); if (err < 0) goto err_fill_gen_info; - return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); err_fill_gen_info: kfree_skb(skb2); @@ -7554,115 +7554,138 @@ err_fill_gen_info: static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_NEWTABLE] = { - .call_batch = nf_tables_newtable, + .call = nf_tables_newtable, + .type = NFNL_CB_BATCH, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_GETTABLE] = { - .call_rcu = nf_tables_gettable, + .call = nf_tables_gettable, + .type = NFNL_CB_RCU, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_DELTABLE] = { - .call_batch = nf_tables_deltable, + .call = nf_tables_deltable, + .type = NFNL_CB_BATCH, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_NEWCHAIN] = { - .call_batch = nf_tables_newchain, + .call = nf_tables_newchain, + .type = NFNL_CB_BATCH, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_GETCHAIN] = { - .call_rcu = nf_tables_getchain, + .call = nf_tables_getchain, + .type = NFNL_CB_RCU, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_DELCHAIN] = { - .call_batch = nf_tables_delchain, + .call = nf_tables_delchain, + .type = NFNL_CB_BATCH, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_NEWRULE] = { - .call_batch = nf_tables_newrule, + .call = nf_tables_newrule, + .type = NFNL_CB_BATCH, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_GETRULE] = { - .call_rcu = nf_tables_getrule, + .call = nf_tables_getrule, + .type = NFNL_CB_RCU, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_DELRULE] = { - .call_batch = nf_tables_delrule, + .call = nf_tables_delrule, + .type = NFNL_CB_BATCH, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_NEWSET] = { - .call_batch = nf_tables_newset, + .call = nf_tables_newset, + .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_GETSET] = { - .call_rcu = nf_tables_getset, + .call = nf_tables_getset, + .type = NFNL_CB_RCU, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_DELSET] = { - .call_batch = nf_tables_delset, + .call = nf_tables_delset, + .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_NEWSETELEM] = { - .call_batch = nf_tables_newsetelem, + .call = nf_tables_newsetelem, + .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETSETELEM] = { - .call_rcu = nf_tables_getsetelem, + .call = nf_tables_getsetelem, + .type = NFNL_CB_RCU, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_DELSETELEM] = { - .call_batch = nf_tables_delsetelem, + .call = nf_tables_delsetelem, + .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETGEN] = { - .call_rcu = nf_tables_getgen, + .call = nf_tables_getgen, + .type = NFNL_CB_RCU, }, [NFT_MSG_NEWOBJ] = { - .call_batch = nf_tables_newobj, + .call = nf_tables_newobj, + .type = NFNL_CB_BATCH, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ] = { - .call_rcu = nf_tables_getobj, + .call = nf_tables_getobj, + .type = NFNL_CB_RCU, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_DELOBJ] = { - .call_batch = nf_tables_delobj, + .call = nf_tables_delobj, + .type = NFNL_CB_BATCH, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ_RESET] = { - .call_rcu = nf_tables_getobj, + .call = nf_tables_getobj, + .type = NFNL_CB_RCU, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_NEWFLOWTABLE] = { - .call_batch = nf_tables_newflowtable, + .call = nf_tables_newflowtable, + .type = NFNL_CB_BATCH, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, [NFT_MSG_GETFLOWTABLE] = { - .call_rcu = nf_tables_getflowtable, + .call = nf_tables_getflowtable, + .type = NFNL_CB_RCU, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, [NFT_MSG_DELFLOWTABLE] = { - .call_batch = nf_tables_delflowtable, + .call = nf_tables_delflowtable, + .type = NFNL_CB_BATCH, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, @@ -7670,7 +7693,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { static int nf_tables_validate(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_table *table; switch (nft_net->validate_state) { @@ -7855,7 +7878,7 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha static void nf_tables_commit_chain_prepare_cancel(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { @@ -7967,7 +7990,7 @@ static void nft_flowtable_hooks_del(struct nft_flowtable *flowtable, static void nf_tables_module_autoload_cleanup(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_module_request *req, *next; WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); @@ -7980,7 +8003,7 @@ static void nf_tables_module_autoload_cleanup(struct net *net) static void nf_tables_commit_release(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans; /* all side effects have to be made visible. @@ -8014,7 +8037,7 @@ static void nf_tables_commit_release(struct net *net) static void nft_commit_notify(struct net *net, u32 portid) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct sk_buff *batch_skb = NULL, *nskb, *skb; unsigned char *data; int len; @@ -8101,7 +8124,7 @@ static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation) static int nf_tables_commit(struct net *net, struct sk_buff *skb) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; struct nft_trans_elem *te; struct nft_chain *chain; @@ -8322,7 +8345,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) static void nf_tables_module_autoload(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_module_request *req, *next; LIST_HEAD(module_list); @@ -8370,7 +8393,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; struct nft_trans_elem *te; struct nft_hook *hook; @@ -8524,7 +8547,7 @@ static void nf_tables_cleanup(struct net *net) static int nf_tables_abort(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); int ret = __nf_tables_abort(net, action); mutex_unlock(&nft_net->commit_mutex); @@ -8534,7 +8557,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, static bool nf_tables_valid_genid(struct net *net, u32 genid) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); bool genid_ok; mutex_lock(&nft_net->commit_mutex); @@ -9096,7 +9119,7 @@ static void __nft_release_hook(struct net *net, struct nft_table *table) static void __nft_release_hooks(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_table *table; list_for_each_entry(table, &nft_net->tables, list) { @@ -9156,7 +9179,7 @@ static void __nft_release_table(struct net *net, struct nft_table *table) static void __nft_release_tables(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_table *table, *nt; list_for_each_entry_safe(table, nt, &nft_net->tables, list) { @@ -9179,7 +9202,7 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) return NOTIFY_DONE; - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && @@ -9207,7 +9230,7 @@ static struct notifier_block nft_nl_notifier = { static int __net_init nf_tables_init_net(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); INIT_LIST_HEAD(&nft_net->tables); INIT_LIST_HEAD(&nft_net->commit_list); @@ -9227,7 +9250,7 @@ static void __net_exit nf_tables_pre_exit_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); if (!list_empty(&nft_net->commit_list)) diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 19215e81dd66..a48c5fd53a80 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -7,8 +7,6 @@ #include <net/netfilter/nf_tables_offload.h> #include <net/pkt_cls.h> -extern unsigned int nf_tables_net_id; - static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) { struct nft_flow_rule *flow; @@ -389,7 +387,7 @@ static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, basechain, &extack); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); @@ -490,7 +488,7 @@ static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, static void nft_flow_rule_offload_abort(struct net *net, struct nft_trans *trans) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); int err = 0; list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) { @@ -539,7 +537,7 @@ static void nft_flow_rule_offload_abort(struct net *net, int nft_flow_rule_offload_commit(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans; int err = 0; u8 policy; @@ -663,7 +661,7 @@ static int nft_offload_netdev_event(struct notifier_block *this, if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); chain = __nft_offload_get_chain(nft_net, dev); if (chain) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 06f5886f652e..d7a9628b6cee 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -252,6 +252,12 @@ replay: struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; __u8 subsys_id = NFNL_SUBSYS_ID(type); + struct nfnl_info info = { + .net = net, + .sk = nfnlnet->nfnl, + .nlh = nlh, + .extack = extack, + }; /* Sanity-check NFNL_MAX_ATTR_COUNT */ if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) { @@ -267,24 +273,30 @@ replay: return err; } - if (nc->call_rcu) { - err = nc->call_rcu(net, nfnlnet->nfnl, skb, nlh, - (const struct nlattr **)cda, - extack); + if (!nc->call) { rcu_read_unlock(); - } else { + return -EINVAL; + } + + switch (nc->type) { + case NFNL_CB_RCU: + err = nc->call(skb, &info, (const struct nlattr **)cda); + rcu_read_unlock(); + break; + case NFNL_CB_MUTEX: rcu_read_unlock(); nfnl_lock(subsys_id); if (nfnl_dereference_protected(subsys_id) != ss || - nfnetlink_find_client(type, ss) != nc) + nfnetlink_find_client(type, ss) != nc) { err = -EAGAIN; - else if (nc->call) - err = nc->call(net, nfnlnet->nfnl, skb, nlh, - (const struct nlattr **)cda, - extack); - else - err = -EINVAL; + break; + } + err = nc->call(skb, &info, (const struct nlattr **)cda); nfnl_unlock(subsys_id); + break; + default: + err = -EINVAL; + break; } if (err == -EAGAIN) goto replay; @@ -462,12 +474,24 @@ replay_abort: goto ack; } + if (nc->type != NFNL_CB_BATCH) { + err = -EINVAL; + goto ack; + } + { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); - u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); + struct nfnl_net *nfnlnet = nfnl_pernet(net); struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1]; struct nlattr *attr = (void *)nlh + min_len; + u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); int attrlen = nlh->nlmsg_len - min_len; + struct nfnl_info info = { + .net = net, + .sk = nfnlnet->nfnl, + .nlh = nlh, + .extack = &extack, + }; /* Sanity-check NFTA_MAX_ATTR */ if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) { @@ -482,13 +506,7 @@ replay_abort: if (err < 0) goto ack; - if (nc->call_batch) { - struct nfnl_net *nfnlnet = nfnl_pernet(net); - - err = nc->call_batch(net, nfnlnet->nfnl, skb, nlh, - (const struct nlattr **)cda, - &extack); - } + err = nc->call(skb, &info, (const struct nlattr **)cda); /* The lock was released to autoload some module, we * have to abort and start from scratch using the diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 6895f31c5fbb..3c8cf8748cfb 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -56,15 +56,13 @@ static inline struct nfnl_acct_net *nfnl_acct_pernet(struct net *net) #define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) #define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */ -static int nfnl_acct_new(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_acct_new(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { - struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net); struct nf_acct *nfacct, *matching = NULL; - char *acct_name; unsigned int size = 0; + char *acct_name; u32 flags = 0; if (!tb[NFACCT_NAME]) @@ -78,7 +76,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) continue; - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; matching = nfacct; @@ -86,7 +84,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, } if (matching) { - if (nlh->nlmsg_flags & NLM_F_REPLACE) { + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { /* reset counters if you request a replacement. */ atomic64_set(&matching->pkts, 0); atomic64_set(&matching->bytes, 0); @@ -273,17 +271,15 @@ static int nfnl_acct_start(struct netlink_callback *cb) return 0; } -static int nfnl_acct_get(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_acct_get(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { - struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net); int ret = -ENOENT; struct nf_acct *cur; char *acct_name; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_acct_dump, .start = nfnl_acct_start, @@ -291,7 +287,7 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl, .data = (void *)tb[NFACCT_FILTER], }; - return netlink_dump_start(nfnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (!tb[NFACCT_NAME]) @@ -311,15 +307,15 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl, } ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), - NFNL_MSG_ACCT_NEW, cur); + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), + NFNL_MSG_ACCT_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } - ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret > 0) ret = 0; @@ -347,12 +343,10 @@ static int nfnl_acct_try_del(struct nf_acct *cur) return ret; } -static int nfnl_acct_del(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_acct_del(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { - struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net); struct nf_acct *cur, *tmp; int ret = -ENOENT; char *acct_name; @@ -388,18 +382,30 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { }; static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { - [NFNL_MSG_ACCT_NEW] = { .call = nfnl_acct_new, - .attr_count = NFACCT_MAX, - .policy = nfnl_acct_policy }, - [NFNL_MSG_ACCT_GET] = { .call = nfnl_acct_get, - .attr_count = NFACCT_MAX, - .policy = nfnl_acct_policy }, - [NFNL_MSG_ACCT_GET_CTRZERO] = { .call = nfnl_acct_get, - .attr_count = NFACCT_MAX, - .policy = nfnl_acct_policy }, - [NFNL_MSG_ACCT_DEL] = { .call = nfnl_acct_del, - .attr_count = NFACCT_MAX, - .policy = nfnl_acct_policy }, + [NFNL_MSG_ACCT_NEW] = { + .call = nfnl_acct_new, + .type = NFNL_CB_MUTEX, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy + }, + [NFNL_MSG_ACCT_GET] = { + .call = nfnl_acct_get, + .type = NFNL_CB_MUTEX, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy + }, + [NFNL_MSG_ACCT_GET_CTRZERO] = { + .call = nfnl_acct_get, + .type = NFNL_CB_MUTEX, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy + }, + [NFNL_MSG_ACCT_DEL] = { + .call = nfnl_acct_del, + .type = NFNL_CB_MUTEX, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy + }, }; static const struct nfnetlink_subsystem nfnl_acct_subsys = { diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 22f6f7fcc724..322ac5dd5402 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -408,10 +408,8 @@ nfnl_cthelper_update(const struct nlattr * const tb[], return 0; } -static int nfnl_cthelper_new(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_cthelper_new(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { const char *helper_name; struct nf_conntrack_helper *cur, *helper = NULL; @@ -441,7 +439,7 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl, tuple.dst.protonum != cur->tuple.dst.protonum)) continue; - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; helper = cur; @@ -607,10 +605,8 @@ out: return skb->len; } -static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_cthelper_get(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { int ret = -ENOENT; struct nf_conntrack_helper *cur; @@ -623,11 +619,11 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, if (!capable(CAP_NET_ADMIN)) return -EPERM; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_cthelper_dump_table, }; - return netlink_dump_start(nfnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (tb[NFCTH_NAME]) @@ -659,15 +655,15 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, } ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_CTHELPER_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } - ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); if (ret > 0) ret = 0; @@ -678,10 +674,8 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, return ret; } -static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_cthelper_del(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { char *helper_name = NULL; struct nf_conntrack_helper *cur; @@ -743,15 +737,24 @@ static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { }; static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = { - [NFNL_MSG_CTHELPER_NEW] = { .call = nfnl_cthelper_new, - .attr_count = NFCTH_MAX, - .policy = nfnl_cthelper_policy }, - [NFNL_MSG_CTHELPER_GET] = { .call = nfnl_cthelper_get, - .attr_count = NFCTH_MAX, - .policy = nfnl_cthelper_policy }, - [NFNL_MSG_CTHELPER_DEL] = { .call = nfnl_cthelper_del, - .attr_count = NFCTH_MAX, - .policy = nfnl_cthelper_policy }, + [NFNL_MSG_CTHELPER_NEW] = { + .call = nfnl_cthelper_new, + .type = NFNL_CB_MUTEX, + .attr_count = NFCTH_MAX, + .policy = nfnl_cthelper_policy + }, + [NFNL_MSG_CTHELPER_GET] = { + .call = nfnl_cthelper_get, + .type = NFNL_CB_MUTEX, + .attr_count = NFCTH_MAX, + .policy = nfnl_cthelper_policy + }, + [NFNL_MSG_CTHELPER_DEL] = { + .call = nfnl_cthelper_del, + .type = NFNL_CB_MUTEX, + .attr_count = NFCTH_MAX, + .policy = nfnl_cthelper_policy + }, }; static const struct nfnetlink_subsystem nfnl_cthelper_subsys = { diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 46da5548d0b3..38848ad68899 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -83,13 +83,11 @@ err: return ret; } -static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int cttimeout_new_timeout(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); __u16 l3num; __u8 l4num; const struct nf_conntrack_l4proto *l4proto; @@ -111,7 +109,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; matching = timeout; @@ -119,7 +117,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, } if (matching) { - if (nlh->nlmsg_flags & NLM_F_REPLACE) { + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { /* You cannot replace one timeout policy by another of * different kind, sorry. */ @@ -129,7 +127,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, return ctnl_timeout_parse_policy(&matching->timeout.data, matching->timeout.l4proto, - net, cda[CTA_TIMEOUT_DATA]); + info->net, + cda[CTA_TIMEOUT_DATA]); } return -EBUSY; @@ -150,8 +149,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, goto err_proto_put; } - ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, net, - cda[CTA_TIMEOUT_DATA]); + ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, + info->net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; @@ -248,22 +247,20 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static int cttimeout_get_timeout(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int cttimeout_get_timeout(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); int ret = -ENOENT; char *name; struct ctnl_timeout *cur; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnl_timeout_dump, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (!cda[CTA_TIMEOUT_NAME]) @@ -283,15 +280,15 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl, } ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret > 0) ret = 0; @@ -320,13 +317,11 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) return ret; } -static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int cttimeout_del_timeout(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); struct ctnl_timeout *cur, *tmp; int ret = -ENOENT; char *name; @@ -334,7 +329,7 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, if (!cda[CTA_TIMEOUT_NAME]) { list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) - ctnl_timeout_try_del(net, cur); + ctnl_timeout_try_del(info->net, cur); return 0; } @@ -344,7 +339,7 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; - ret = ctnl_timeout_try_del(net, cur); + ret = ctnl_timeout_try_del(info->net, cur); if (ret < 0) return ret; @@ -353,11 +348,9 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, return ret; } -static int cttimeout_default_set(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int cttimeout_default_set(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { const struct nf_conntrack_l4proto *l4proto; __u8 l4num; @@ -377,7 +370,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl, goto err; } - ret = ctnl_timeout_parse_policy(NULL, l4proto, net, + ret = ctnl_timeout_parse_policy(NULL, l4proto, info->net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; @@ -427,11 +420,9 @@ nla_put_failure: return -1; } -static int cttimeout_default_get(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int cttimeout_default_get(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { const struct nf_conntrack_l4proto *l4proto; unsigned int *timeouts = NULL; @@ -453,35 +444,35 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, switch (l4proto->l4proto) { case IPPROTO_ICMP: - timeouts = &nf_icmp_pernet(net)->timeout; + timeouts = &nf_icmp_pernet(info->net)->timeout; break; case IPPROTO_TCP: - timeouts = nf_tcp_pernet(net)->timeouts; + timeouts = nf_tcp_pernet(info->net)->timeouts; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: - timeouts = nf_udp_pernet(net)->timeouts; + timeouts = nf_udp_pernet(info->net)->timeouts; break; case IPPROTO_DCCP: #ifdef CONFIG_NF_CT_PROTO_DCCP - timeouts = nf_dccp_pernet(net)->dccp_timeout; + timeouts = nf_dccp_pernet(info->net)->dccp_timeout; #endif break; case IPPROTO_ICMPV6: - timeouts = &nf_icmpv6_pernet(net)->timeout; + timeouts = &nf_icmpv6_pernet(info->net)->timeout; break; case IPPROTO_SCTP: #ifdef CONFIG_NF_CT_PROTO_SCTP - timeouts = nf_sctp_pernet(net)->timeouts; + timeouts = nf_sctp_pernet(info->net)->timeouts; #endif break; case IPPROTO_GRE: #ifdef CONFIG_NF_CT_PROTO_GRE - timeouts = nf_gre_pernet(net)->timeouts; + timeouts = nf_gre_pernet(info->net)->timeouts; #endif break; case 255: - timeouts = &nf_generic_pernet(net)->timeout; + timeouts = &nf_generic_pernet(info->net)->timeout; break; default: WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto); @@ -497,9 +488,10 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, goto err; } - ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), + ret = cttimeout_default_fill_info(info->net, skb2, + NETLINK_CB(skb).portid, + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_DEFAULT_SET, l3num, l4proto, timeouts); if (ret <= 0) { @@ -507,7 +499,8 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, err = -ENOMEM; goto err; } - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret > 0) ret = 0; @@ -553,21 +546,36 @@ static void ctnl_timeout_put(struct nf_ct_timeout *t) } static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = { - [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout, - .attr_count = CTA_TIMEOUT_MAX, - .policy = cttimeout_nla_policy }, - [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout, - .attr_count = CTA_TIMEOUT_MAX, - .policy = cttimeout_nla_policy }, - [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout, - .attr_count = CTA_TIMEOUT_MAX, - .policy = cttimeout_nla_policy }, - [IPCTNL_MSG_TIMEOUT_DEFAULT_SET]= { .call = cttimeout_default_set, - .attr_count = CTA_TIMEOUT_MAX, - .policy = cttimeout_nla_policy }, - [IPCTNL_MSG_TIMEOUT_DEFAULT_GET]= { .call = cttimeout_default_get, - .attr_count = CTA_TIMEOUT_MAX, - .policy = cttimeout_nla_policy }, + [IPCTNL_MSG_TIMEOUT_NEW] = { + .call = cttimeout_new_timeout, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy + }, + [IPCTNL_MSG_TIMEOUT_GET] = { + .call = cttimeout_get_timeout, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy + }, + [IPCTNL_MSG_TIMEOUT_DELETE] = { + .call = cttimeout_del_timeout, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy + }, + [IPCTNL_MSG_TIMEOUT_DEFAULT_SET] = { + .call = cttimeout_default_set, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy + }, + [IPCTNL_MSG_TIMEOUT_DEFAULT_GET] = { + .call = cttimeout_default_get, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy + }, }; static const struct nfnetlink_subsystem cttimeout_subsys = { diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index d5f458d0ff3d..587086b18c36 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -845,10 +845,8 @@ static struct notifier_block nfulnl_rtnl_notifier = { .notifier_call = nfulnl_rcv_nl_event, }; -static int nfulnl_recv_unsupp(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[], - struct netlink_ext_ack *extack) +static int nfulnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nfula[]) { return -ENOTSUPP; } @@ -869,18 +867,16 @@ static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = { [NFULA_CFG_FLAGS] = { .type = NLA_U16 }, }; -static int nfulnl_recv_config(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfula[], - struct netlink_ext_ack *extack) +static int nfulnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nfula[]) { - struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nfnl_log_net *log = nfnl_log_pernet(info->net); + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); u_int16_t group_num = ntohs(nfmsg->res_id); - struct nfulnl_instance *inst; struct nfulnl_msg_config_cmd *cmd = NULL; - struct nfnl_log_net *log = nfnl_log_pernet(net); - int ret = 0; + struct nfulnl_instance *inst; u16 flags = 0; + int ret = 0; if (nfula[NFULA_CFG_CMD]) { u_int8_t pf = nfmsg->nfgen_family; @@ -889,9 +885,9 @@ static int nfulnl_recv_config(struct net *net, struct sock *ctnl, /* Commands without queue context */ switch (cmd->command) { case NFULNL_CFG_CMD_PF_BIND: - return nf_log_bind_pf(net, pf, &nfulnl_logger); + return nf_log_bind_pf(info->net, pf, &nfulnl_logger); case NFULNL_CFG_CMD_PF_UNBIND: - nf_log_unbind_pf(net, pf); + nf_log_unbind_pf(info->net, pf); return 0; } } @@ -932,7 +928,7 @@ static int nfulnl_recv_config(struct net *net, struct sock *ctnl, goto out_put; } - inst = instance_create(net, group_num, + inst = instance_create(info->net, group_num, NETLINK_CB(skb).portid, sk_user_ns(NETLINK_CB(skb).sk)); if (IS_ERR(inst)) { @@ -993,11 +989,17 @@ out: } static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { - [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, - .attr_count = NFULA_MAX, }, - [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, - .attr_count = NFULA_CFG_MAX, - .policy = nfula_cfg_policy }, + [NFULNL_MSG_PACKET] = { + .call = nfulnl_recv_unsupp, + .type = NFNL_CB_MUTEX, + .attr_count = NFULA_MAX, + }, + [NFULNL_MSG_CONFIG] = { + .call = nfulnl_recv_config, + .type = NFNL_CB_MUTEX, + .attr_count = NFULA_CFG_MAX, + .policy = nfula_cfg_policy + }, }; static const struct nfnetlink_subsystem nfulnl_subsys = { diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 916a3c7f9eaf..e8f8875c6884 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -292,10 +292,9 @@ static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = { [OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) }, }; -static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const osf_attrs[], - struct netlink_ext_ack *extack) +static int nfnl_osf_add_callback(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *kf = NULL, *sf; @@ -307,7 +306,7 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl, if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); @@ -325,7 +324,7 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl, kfree(kf); kf = NULL; - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (info->nlh->nlmsg_flags & NLM_F_EXCL) err = -EEXIST; break; } @@ -339,11 +338,9 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl, return err; } -static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const osf_attrs[], - struct netlink_ext_ack *extack) +static int nfnl_osf_remove_callback(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *sf; @@ -377,11 +374,13 @@ static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl, static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = { [OSF_MSG_ADD] = { .call = nfnl_osf_add_callback, + .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, [OSF_MSG_REMOVE] = { .call = nfnl_osf_remove_callback, + .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 37e81d895e61..f37a575ebd7f 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1046,20 +1046,18 @@ static int nfq_id_after(unsigned int id, unsigned int max) return (int)(id - max) > 0; } -static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[], - struct netlink_ext_ack *extack) +static int nfqnl_recv_verdict_batch(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nfqa[]) { - struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u16 queue_num = ntohs(nfmsg->res_id); struct nf_queue_entry *entry, *tmp; - unsigned int verdict, maxid; struct nfqnl_msg_verdict_hdr *vhdr; struct nfqnl_instance *queue; + unsigned int verdict, maxid; LIST_HEAD(batch_list); - u16 queue_num = ntohs(nfmsg->res_id); - struct nfnl_queue_net *q = nfnl_queue_pernet(net); queue = verdict_instance_lookup(q, queue_num, NETLINK_CB(skb).portid); @@ -1158,22 +1156,19 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry, return 0; } -static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[], - struct netlink_ext_ack *extack) +static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nfqa[]) { - struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); struct nfqnl_msg_verdict_hdr *vhdr; + enum ip_conntrack_info ctinfo; struct nfqnl_instance *queue; - unsigned int verdict; struct nf_queue_entry *entry; - enum ip_conntrack_info ctinfo; struct nfnl_ct_hook *nfnl_ct; struct nf_conn *ct = NULL; - struct nfnl_queue_net *q = nfnl_queue_pernet(net); + unsigned int verdict; int err; queue = verdict_instance_lookup(q, queue_num, @@ -1196,7 +1191,8 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, if (nfqa[NFQA_CT]) { if (nfnl_ct != NULL) - ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo); + ct = nfqnl_ct_parse(nfnl_ct, info->nlh, nfqa, entry, + &ctinfo); } if (entry->state.pf == PF_BRIDGE) { @@ -1224,10 +1220,8 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, return 0; } -static int nfqnl_recv_unsupp(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[], - struct netlink_ext_ack *extack) +static int nfqnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const cda[]) { return -ENOTSUPP; } @@ -1245,16 +1239,14 @@ static const struct nf_queue_handler nfqh = { .nf_hook_drop = nfqnl_nf_hook_drop, }; -static int nfqnl_recv_config(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[], - struct netlink_ext_ack *extack) +static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nfqa[]) { - struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); - struct nfqnl_instance *queue; struct nfqnl_msg_config_cmd *cmd = NULL; - struct nfnl_queue_net *q = nfnl_queue_pernet(net); + struct nfqnl_instance *queue; __u32 flags = 0, mask = 0; int ret = 0; @@ -1373,17 +1365,29 @@ err_out_unlock: } static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { - [NFQNL_MSG_PACKET] = { .call_rcu = nfqnl_recv_unsupp, - .attr_count = NFQA_MAX, }, - [NFQNL_MSG_VERDICT] = { .call_rcu = nfqnl_recv_verdict, - .attr_count = NFQA_MAX, - .policy = nfqa_verdict_policy }, - [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, - .attr_count = NFQA_CFG_MAX, - .policy = nfqa_cfg_policy }, - [NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch, - .attr_count = NFQA_MAX, - .policy = nfqa_verdict_batch_policy }, + [NFQNL_MSG_PACKET] = { + .call = nfqnl_recv_unsupp, + .type = NFNL_CB_RCU, + .attr_count = NFQA_MAX, + }, + [NFQNL_MSG_VERDICT] = { + .call = nfqnl_recv_verdict, + .type = NFNL_CB_RCU, + .attr_count = NFQA_MAX, + .policy = nfqa_verdict_policy + }, + [NFQNL_MSG_CONFIG] = { + .call = nfqnl_recv_config, + .type = NFNL_CB_MUTEX, + .attr_count = NFQA_CFG_MAX, + .policy = nfqa_cfg_policy + }, + [NFQNL_MSG_VERDICT_BATCH] = { + .call = nfqnl_recv_verdict_batch, + .type = NFNL_CB_RCU, + .attr_count = NFQA_MAX, + .policy = nfqa_verdict_batch_policy + }, }; static const struct nfnetlink_subsystem nfqnl_subsys = { diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 7a9aa57b195b..363bdd7044ec 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -2,7 +2,6 @@ #include <linux/kernel.h> #include <linux/netdevice.h> #include <net/net_namespace.h> -#include <net/netns/generic.h> #include <net/netfilter/nf_tables.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> @@ -11,8 +10,6 @@ #include <net/netfilter/nf_tables_ipv4.h> #include <net/netfilter/nf_tables_ipv6.h> -extern unsigned int nf_tables_net_id; - #ifdef CONFIG_NF_TABLES_IPV4 static unsigned int nft_do_chain_ipv4(void *priv, struct sk_buff *skb, @@ -369,7 +366,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, event != NETDEV_CHANGENAME) return NOTIFY_DONE; - nft_net = net_generic(ctx.net, nf_tables_net_id); + nft_net = nft_pernet(ctx.net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index b8dbd20a6a4c..5415ab14400d 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -613,17 +613,15 @@ nla_put_failure: return -1; } -static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_compat_get_rcu(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const tb[]) { - int ret = 0, target; struct nfgenmsg *nfmsg; - const char *fmt; - const char *name; - u32 rev; + const char *name, *fmt; struct sk_buff *skb2; + int ret = 0, target; + u32 rev; if (tb[NFTA_COMPAT_NAME] == NULL || tb[NFTA_COMPAT_REV] == NULL || @@ -634,7 +632,7 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV])); target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE])); - nfmsg = nlmsg_data(nlh); + nfmsg = nlmsg_data(info->nlh); switch(nfmsg->nfgen_family) { case AF_INET: @@ -673,8 +671,8 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, /* include the best revision for this extension in the message */ if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_COMPAT_GET, nfmsg->nfgen_family, name, ret, target) <= 0) { @@ -682,8 +680,8 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, goto out_put; } - ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret > 0) ret = 0; out_put: @@ -700,9 +698,12 @@ static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { }; static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = { - [NFNL_MSG_COMPAT_GET] = { .call_rcu = nfnl_compat_get_rcu, - .attr_count = NFTA_COMPAT_MAX, - .policy = nfnl_compat_policy_get }, + [NFNL_MSG_COMPAT_GET] = { + .call = nfnl_compat_get_rcu, + .type = NFNL_CB_RCU, + .attr_count = NFTA_COMPAT_MAX, + .policy = nfnl_compat_policy_get + }, }; static const struct nfnetlink_subsystem nfnl_compat_subsys = { diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index f9437a0dcfef..6ba3256fa844 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -11,9 +11,6 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> -#include <net/netns/generic.h> - -extern unsigned int nf_tables_net_id; struct nft_dynset { struct nft_set *set; @@ -164,7 +161,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { - struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct nft_dynset *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index c9b8a2b03b71..9c169d100651 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -9,6 +9,7 @@ struct nft_socket { enum nft_socket_keys key:8; + u8 level; union { u8 dreg; }; @@ -33,6 +34,26 @@ static void nft_socket_wildcard(const struct nft_pktinfo *pkt, } } +#ifdef CONFIG_CGROUPS +static noinline bool +nft_sock_get_eval_cgroupv2(u32 *dest, const struct nft_pktinfo *pkt, u32 level) +{ + struct sock *sk = skb_to_full_sk(pkt->skb); + struct cgroup *cgrp; + + if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) + return false; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + if (level > cgrp->level) + return false; + + memcpy(dest, &cgrp->ancestor_ids[level], sizeof(u64)); + + return true; +} +#endif + static void nft_socket_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -85,6 +106,14 @@ static void nft_socket_eval(const struct nft_expr *expr, } nft_socket_wildcard(pkt, regs, sk, dest); break; +#ifdef CONFIG_CGROUPS + case NFT_SOCKET_CGROUPV2: + if (!nft_sock_get_eval_cgroupv2(dest, pkt, priv->level)) { + regs->verdict.code = NFT_BREAK; + return; + } + break; +#endif default: WARN_ON(1); regs->verdict.code = NFT_BREAK; @@ -97,6 +126,7 @@ static void nft_socket_eval(const struct nft_expr *expr, static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = { [NFTA_SOCKET_KEY] = { .type = NLA_U32 }, [NFTA_SOCKET_DREG] = { .type = NLA_U32 }, + [NFTA_SOCKET_LEVEL] = { .type = NLA_U32 }, }; static int nft_socket_init(const struct nft_ctx *ctx, @@ -104,7 +134,7 @@ static int nft_socket_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_socket *priv = nft_expr_priv(expr); - unsigned int len; + unsigned int len, level; if (!tb[NFTA_SOCKET_DREG] || !tb[NFTA_SOCKET_KEY]) return -EINVAL; @@ -129,6 +159,19 @@ static int nft_socket_init(const struct nft_ctx *ctx, case NFT_SOCKET_MARK: len = sizeof(u32); break; +#ifdef CONFIG_CGROUPS + case NFT_SOCKET_CGROUPV2: + if (!tb[NFTA_SOCKET_LEVEL]) + return -EINVAL; + + level = ntohl(nla_get_u32(tb[NFTA_SOCKET_LEVEL])); + if (level > 255) + return -EOPNOTSUPP; + + priv->level = level; + len = sizeof(u64); + break; +#endif default: return -EOPNOTSUPP; } @@ -146,6 +189,9 @@ static int nft_socket_dump(struct sk_buff *skb, return -1; if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg)) return -1; + if (priv->key == NFT_SOCKET_CGROUPV2 && + nla_put_u32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level))) + return -1; return 0; } diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index 43a5a780a6d3..accef672088c 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -263,6 +263,29 @@ static int nft_tproxy_init(const struct nft_ctx *ctx, return 0; } +static void nft_tproxy_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_tproxy *priv = nft_expr_priv(expr); + + switch (priv->family) { + case NFPROTO_IPV4: + nf_defrag_ipv4_disable(ctx->net); + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + nf_defrag_ipv6_disable(ctx->net); + break; +#endif + case NFPROTO_UNSPEC: + nf_defrag_ipv4_disable(ctx->net); +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + nf_defrag_ipv6_disable(ctx->net); +#endif + break; + } +} + static int nft_tproxy_dump(struct sk_buff *skb, const struct nft_expr *expr) { @@ -288,6 +311,7 @@ static const struct nft_expr_ops nft_tproxy_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_tproxy)), .eval = nft_tproxy_eval, .init = nft_tproxy_init, + .destroy = nft_tproxy_destroy, .dump = nft_tproxy_dump, }; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index b7f8d2ed3cc2..84e58ee501a4 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -52,7 +52,7 @@ struct xt_af { struct mutex mutex; struct list_head match; struct list_head target; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct mutex compat_mutex; struct compat_delta *compat_tab; unsigned int number; /* number of slots in compat_tab[] */ @@ -647,7 +647,7 @@ static bool error_tg_ok(unsigned int usersize, unsigned int kernsize, return usersize == kernsize && strnlen(msg, msglen) < msglen; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta) { struct xt_af *xp = &xt[af]; @@ -850,7 +850,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems, __alignof__(struct compat_xt_entry_match)); } EXPORT_SYMBOL(xt_compat_check_entry_offsets); -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ /** * xt_check_entry_offsets - validate arp/ip/ip6t_entry @@ -868,7 +868,7 @@ EXPORT_SYMBOL(xt_compat_check_entry_offsets); * match structures are aligned, and that the last structure ends where * the target structure begins. * - * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version. + * Also see xt_compat_check_entry_offsets for CONFIG_NETFILTER_XTABLES_COMPAT version. * * The arp/ip/ip6t_entry structure @base must have passed following tests: * - it must point to a valid memory location @@ -1059,7 +1059,7 @@ void *xt_copy_counters(sockptr_t arg, unsigned int len, void *mem; u64 size; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) { /* structures only differ in size due to alignment */ struct compat_xt_counters_info compat_tmp; @@ -1106,7 +1106,7 @@ void *xt_copy_counters(sockptr_t arg, unsigned int len, } EXPORT_SYMBOL_GPL(xt_copy_counters); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT int xt_compat_target_offset(const struct xt_target *target) { u_int16_t csize = target->compatsize ? : target->targetsize; @@ -1199,6 +1199,23 @@ void xt_free_table_info(struct xt_table_info *info) } EXPORT_SYMBOL(xt_free_table_info); +struct xt_table *xt_find_table(struct net *net, u8 af, const char *name) +{ + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *t; + + mutex_lock(&xt[af].mutex); + list_for_each_entry(t, &xt_net->tables[af], list) { + if (strcmp(t->name, name) == 0) { + mutex_unlock(&xt[af].mutex); + return t; + } + } + mutex_unlock(&xt[af].mutex); + return NULL; +} +EXPORT_SYMBOL(xt_find_table); + /* Find table by name, grabs mutex & ref. Returns ERR_PTR on error. */ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) @@ -1276,7 +1293,7 @@ void xt_table_unlock(struct xt_table *table) } EXPORT_SYMBOL_GPL(xt_table_unlock); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT void xt_compat_lock(u_int8_t af) { mutex_lock(&xt[af].compat_mutex); @@ -1481,6 +1498,7 @@ void *xt_unregister_table(struct xt_table *table) mutex_unlock(&xt[table->af].mutex); audit_log_nfcfg(table->name, table->af, private->number, AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); + kfree(table->ops); kfree(table); return private; @@ -1913,7 +1931,7 @@ static int __init xt_init(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) { mutex_init(&xt[i].mutex); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT mutex_init(&xt[i].compat_mutex); xt[i].compat_tab = NULL; #endif diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 194dc03341f3..459d0696c91a 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -200,6 +200,11 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par) pr_info_ratelimited("Can be used only with -p tcp or -p udp\n"); return -EINVAL; } + +static void tproxy_tg6_destroy(const struct xt_tgdtor_param *par) +{ + nf_defrag_ipv6_disable(par->net); +} #endif static int tproxy_tg4_check(const struct xt_tgchk_param *par) @@ -219,6 +224,11 @@ static int tproxy_tg4_check(const struct xt_tgchk_param *par) return -EINVAL; } +static void tproxy_tg4_destroy(const struct xt_tgdtor_param *par) +{ + nf_defrag_ipv4_disable(par->net); +} + static struct xt_target tproxy_tg_reg[] __read_mostly = { { .name = "TPROXY", @@ -228,6 +238,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = { .revision = 0, .targetsize = sizeof(struct xt_tproxy_target_info), .checkentry = tproxy_tg4_check, + .destroy = tproxy_tg4_destroy, .hooks = 1 << NF_INET_PRE_ROUTING, .me = THIS_MODULE, }, @@ -239,6 +250,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = { .revision = 1, .targetsize = sizeof(struct xt_tproxy_target_info_v1), .checkentry = tproxy_tg4_check, + .destroy = tproxy_tg4_destroy, .hooks = 1 << NF_INET_PRE_ROUTING, .me = THIS_MODULE, }, @@ -251,6 +263,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = { .revision = 1, .targetsize = sizeof(struct xt_tproxy_target_info_v1), .checkentry = tproxy_tg6_check, + .destroy = tproxy_tg6_destroy, .hooks = 1 << NF_INET_PRE_ROUTING, .me = THIS_MODULE, }, diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index bd1dea9c7b88..24d4afb9988d 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -134,7 +134,7 @@ static void limit_mt_destroy(const struct xt_mtdtor_param *par) kfree(info->master); } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_xt_rateinfo { u_int32_t avg; u_int32_t burst; @@ -176,7 +176,7 @@ static int limit_mt_compat_to_user(void __user *dst, const void *src) }; return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0; } -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ static struct xt_match limit_mt_reg __read_mostly = { .name = "limit", @@ -186,7 +186,7 @@ static struct xt_match limit_mt_reg __read_mostly = { .checkentry = limit_mt_check, .destroy = limit_mt_destroy, .matchsize = sizeof(struct xt_rateinfo), -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct compat_xt_rateinfo), .compat_from_user = limit_mt_compat_from_user, .compat_to_user = limit_mt_compat_to_user, diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 5f973987265d..5e6459e11605 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -216,6 +216,14 @@ static int socket_mt_v3_check(const struct xt_mtchk_param *par) return 0; } +static void socket_mt_destroy(const struct xt_mtdtor_param *par) +{ + if (par->family == NFPROTO_IPV4) + nf_defrag_ipv4_disable(par->net); + else if (par->family == NFPROTO_IPV6) + nf_defrag_ipv4_disable(par->net); +} + static struct xt_match socket_mt_reg[] __read_mostly = { { .name = "socket", @@ -231,6 +239,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .revision = 1, .family = NFPROTO_IPV4, .match = socket_mt4_v1_v2_v3, + .destroy = socket_mt_destroy, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -245,6 +254,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), + .destroy = socket_mt_destroy, .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, @@ -256,6 +266,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .family = NFPROTO_IPV4, .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v2_check, + .destroy = socket_mt_destroy, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), @@ -268,6 +279,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .family = NFPROTO_IPV6, .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v2_check, + .destroy = socket_mt_destroy, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), @@ -280,6 +292,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .family = NFPROTO_IPV4, .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v3_check, + .destroy = socket_mt_destroy, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), @@ -292,6 +305,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .family = NFPROTO_IPV6, .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v3_check, + .destroy = socket_mt_destroy, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 15424d26e85d..96b524ceabca 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -392,7 +392,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) * * Start with a full bucket. */ - band->bucket = (band->burst_size + band->rate) * 1000ULL; + band->bucket = band->burst_size * 1000ULL; band_max_delta_t = div_u64(band->bucket, band->rate); if (band_max_delta_t > meter->max_delta_t) meter->max_delta_t = band_max_delta_t; @@ -641,7 +641,7 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, long long int max_bucket_size; band = &meter->bands[i]; - max_bucket_size = (band->burst_size + band->rate) * 1000LL; + max_bucket_size = band->burst_size * 1000LL; band->bucket += delta_ms * band->rate; if (band->bucket > max_bucket_size) diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c index 2bf2b1943e61..fa611678af05 100644 --- a/net/qrtr/mhi.c +++ b/net/qrtr/mhi.c @@ -50,6 +50,9 @@ static int qcom_mhi_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) struct qrtr_mhi_dev *qdev = container_of(ep, struct qrtr_mhi_dev, ep); int rc; + if (skb->sk) + sock_hold(skb->sk); + rc = skb_linearize(skb); if (rc) goto free_skb; @@ -59,12 +62,11 @@ static int qcom_mhi_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) if (rc) goto free_skb; - if (skb->sk) - sock_hold(skb->sk); - return rc; free_skb: + if (skb->sk) + sock_put(skb->sk); kfree_skb(skb); return rc; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 922ed6b91abb..5c91df52b8c2 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -945,6 +945,12 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, list_for_each_entry(entry, &new->entries, list) cycle = ktime_add_ns(cycle, entry->interval); + + if (!cycle) { + NL_SET_ERR_MSG(extack, "'cycle_time' can never be 0"); + return -EINVAL; + } + new->cycle_time = cycle; } diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index e4370b1b7494..902cb6dd710b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -733,6 +733,23 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, return t->send_pkt(reply); } +/* This function should be called with sk_lock held and SOCK_DONE set */ +static void virtio_transport_remove_sock(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt, *tmp; + + /* We don't need to take rx_lock, as the socket is closing and we are + * removing it. + */ + list_for_each_entry_safe(pkt, tmp, &vvs->rx_queue, list) { + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + + vsock_remove_sock(vsk); +} + static void virtio_transport_wait_close(struct sock *sk, long timeout) { if (timeout) { @@ -765,7 +782,7 @@ static void virtio_transport_do_close(struct vsock_sock *vsk, (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) { vsk->close_work_scheduled = false; - vsock_remove_sock(vsk); + virtio_transport_remove_sock(vsk); /* Release refcnt obtained when we scheduled the timeout */ sock_put(sk); @@ -828,22 +845,15 @@ static bool virtio_transport_close(struct vsock_sock *vsk) void virtio_transport_release(struct vsock_sock *vsk) { - struct virtio_vsock_sock *vvs = vsk->trans; - struct virtio_vsock_pkt *pkt, *tmp; struct sock *sk = &vsk->sk; bool remove_sock = true; if (sk->sk_type == SOCK_STREAM) remove_sock = virtio_transport_close(vsk); - list_for_each_entry_safe(pkt, tmp, &vvs->rx_queue, list) { - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } - if (remove_sock) { sock_set_flag(sk, SOCK_DONE); - vsock_remove_sock(vsk); + virtio_transport_remove_sock(vsk); } } EXPORT_SYMBOL_GPL(virtio_transport_release); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 8b65323207db..1c9ecb18b8e6 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -568,8 +568,7 @@ vmci_transport_queue_pair_alloc(struct vmci_qp **qpair, peer, flags, VMCI_NO_PRIVILEGE_FLAGS); out: if (err < 0) { - pr_err("Could not attach to queue pair with %d\n", - err); + pr_err_once("Could not attach to queue pair with %d\n", err); err = vmci_transport_error_to_vsock_error(err); } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index a71ed664da0a..cd62d4ba87a9 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -30,7 +30,7 @@ #include "xdp_umem.h" #include "xsk.h" -#define TX_BATCH_SIZE 16 +#define TX_BATCH_SIZE 32 static DEFINE_PER_CPU(struct list_head, xskmap_flush_list); diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c index 3f4599c9a202..ef30d2b353b0 100644 --- a/samples/bpf/tracex1_kern.c +++ b/samples/bpf/tracex1_kern.c @@ -26,7 +26,7 @@ SEC("kprobe/__netif_receive_skb_core") int bpf_prog1(struct pt_regs *ctx) { - /* attaches to kprobe netif_receive_skb, + /* attaches to kprobe __netif_receive_skb_core, * looks for packets on loobpack device and prints them */ char devname[IFNAMSIZ]; @@ -35,7 +35,7 @@ int bpf_prog1(struct pt_regs *ctx) int len; /* non-portable! works for the given kernel only */ - skb = (struct sk_buff *) PT_REGS_PARM1(ctx); + bpf_probe_read_kernel(&skb, sizeof(skb), (void *)PT_REGS_PARM1(ctx)); dev = _(skb->dev); len = _(skb->len); diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 3b261b0f74f0..667aacb9261c 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -213,6 +213,7 @@ vmlinux_link() gen_btf() { local pahole_ver + local extra_paholeopt= if ! [ -x "$(command -v ${PAHOLE})" ]; then echo >&2 "BTF: ${1}: pahole (${PAHOLE}) is not available" @@ -227,8 +228,12 @@ gen_btf() vmlinux_link ${1} + if [ "${pahole_ver}" -ge "121" ]; then + extra_paholeopt="${extra_paholeopt} --btf_gen_floats" + fi + info "BTF" ${2} - LLVM_OBJCOPY=${OBJCOPY} ${PAHOLE} -J ${1} + LLVM_OBJCOPY=${OBJCOPY} ${PAHOLE} -J ${extra_paholeopt} ${1} # Create ${2} which contains just .BTF section but no symbols. Add # SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 62953bbf68b4..385d5c955cf3 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -71,7 +71,9 @@ static const char *btf_var_linkage_str(__u32 linkage) case BTF_VAR_STATIC: return "static"; case BTF_VAR_GLOBAL_ALLOCATED: - return "global-alloc"; + return "global"; + case BTF_VAR_GLOBAL_EXTERN: + return "extern"; default: return "(unknown)"; } @@ -98,26 +100,28 @@ static const char *btf_str(const struct btf *btf, __u32 off) return btf__name_by_offset(btf, off) ? : "(invalid)"; } +static int btf_kind_safe(int kind) +{ + return kind <= BTF_KIND_MAX ? kind : BTF_KIND_UNKN; +} + static int dump_btf_type(const struct btf *btf, __u32 id, const struct btf_type *t) { json_writer_t *w = json_wtr; - int kind, safe_kind; - - kind = BTF_INFO_KIND(t->info); - safe_kind = kind <= BTF_KIND_MAX ? kind : BTF_KIND_UNKN; + int kind = btf_kind(t); if (json_output) { jsonw_start_object(w); jsonw_uint_field(w, "id", id); - jsonw_string_field(w, "kind", btf_kind_str[safe_kind]); + jsonw_string_field(w, "kind", btf_kind_str[btf_kind_safe(kind)]); jsonw_string_field(w, "name", btf_str(btf, t->name_off)); } else { - printf("[%u] %s '%s'", id, btf_kind_str[safe_kind], + printf("[%u] %s '%s'", id, btf_kind_str[btf_kind_safe(kind)], btf_str(btf, t->name_off)); } - switch (BTF_INFO_KIND(t->info)) { + switch (kind) { case BTF_KIND_INT: { __u32 v = *(__u32 *)(t + 1); const char *enc; @@ -300,7 +304,8 @@ static int dump_btf_type(const struct btf *btf, __u32 id, break; } case BTF_KIND_DATASEC: { - const struct btf_var_secinfo *v = (const void *)(t+1); + const struct btf_var_secinfo *v = (const void *)(t + 1); + const struct btf_type *vt; __u16 vlen = BTF_INFO_VLEN(t->info); int i; @@ -322,6 +327,13 @@ static int dump_btf_type(const struct btf *btf, __u32 id, } else { printf("\n\ttype_id=%u offset=%u size=%u", v->type, v->offset, v->size); + + if (v->type <= btf__get_nr_types(btf)) { + vt = btf__type_by_id(btf, v->type); + printf(" (%s '%s')", + btf_kind_str[btf_kind_safe(btf_kind(vt))], + btf_str(btf, vt->name_off)); + } } } if (json_output) diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index ff3aa0cf3997..f836d115d7d6 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -157,7 +157,7 @@ static int netlink_recv(int sock, __u32 nl_pid, __u32 seq, if (len == 0) break; - for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); + for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, (unsigned int)len); nh = NLMSG_NEXT(nh, len)) { if (nh->nlmsg_pid != nl_pid) { ret = -LIBBPF_ERRNO__WRNGPID; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 69902603012c..ec6d85a81744 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -312,6 +312,27 @@ union bpf_iter_link_info { * *ctx_out*, *data_out* (for example, packet data), result of the * execution *retval*, and *duration* of the test run. * + * The sizes of the buffers provided as input and output + * parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must + * be provided in the corresponding variables *ctx_size_in*, + * *ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any + * of these parameters are not provided (ie set to NULL), the + * corresponding size field must be zero. + * + * Some program types have particular requirements: + * + * **BPF_PROG_TYPE_SK_LOOKUP** + * *data_in* and *data_out* must be NULL. + * + * **BPF_PROG_TYPE_XDP** + * *ctx_in* and *ctx_out* must be NULL. + * + * **BPF_PROG_TYPE_RAW_TRACEPOINT**, + * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** + * + * *ctx_out*, *data_in* and *data_out* must be NULL. + * *repeat* must be zero. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. @@ -4061,12 +4082,20 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. * Return * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. * Return * Valid pointer with *size* bytes of memory available; NULL, * otherwise. @@ -4078,6 +4107,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -4088,6 +4121,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -4578,7 +4615,7 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based + * Check packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. * @@ -4595,6 +4632,14 @@ union bpf_attr { * against the current net device. This is practical if this isn't * used prior to redirect. * + * On input *mtu_len* must be a valid pointer, else verifier will + * reject BPF program. If the value *mtu_len* is initialized to + * zero then the ctx packet size is use. When value *mtu_len* is + * provided as input this specify the L3 length that the MTU check + * is done against. Remember XDP and TC length operate at L2, but + * this value is L3 as this correlate to MTU and IP-header tot_len + * values which are L3 (similar behavior as bpf_fib_lookup). + * * The Linux kernel route table can configure MTUs on a more * specific per route level, which is not provided by this helper. * For route level MTU checks use the **bpf_fib_lookup**\ () @@ -4619,11 +4664,9 @@ union bpf_attr { * * On return *mtu_len* pointer contains the MTU value of the net * device. Remember the net device configured MTU is the L3 size, - * which is returned here and XDP and TX length operate at L2. + * which is returned here and XDP and TC length operate at L2. * Helper take this into account for you, but remember when using - * MTU value in your BPF-code. On input *mtu_len* must be a valid - * pointer and be initialized (to zero), else verifier will reject - * BPF program. + * MTU value in your BPF-code. * * Return * * 0 on success, and populate MTU value in *mtu_len* pointer. @@ -4665,6 +4708,33 @@ union bpf_attr { * Return * The number of traversed map elements for success, **-EINVAL** for * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4832,6 +4902,7 @@ union bpf_attr { FN(sock_from_file), \ FN(check_mtu), \ FN(for_each_map_elem), \ + FN(snprintf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5373,6 +5444,8 @@ struct bpf_link_info { } raw_tracepoint; struct { __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ } tracing; struct { __u64 cgroup_id; diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index cc2e51c64a54..9720dc0b4605 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -25,9 +25,16 @@ /* * Helper macro to place programs, maps, license in * different sections in elf_bpf file. Section names - * are interpreted by elf_bpf loader + * are interpreted by libbpf depending on the context (BPF programs, BPF maps, + * extern variables, etc). + * To allow use of SEC() with externs (e.g., for extern .maps declarations), + * make sure __attribute__((unused)) doesn't trigger compilation warning. */ -#define SEC(NAME) __attribute__((section(NAME), used)) +#define SEC(name) \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wignored-attributes\"") \ + __attribute__((section(name), used)) \ + _Pragma("GCC diagnostic pop") \ /* Avoid 'linux/stddef.h' definition of '__always_inline'. */ #undef __always_inline @@ -40,6 +47,14 @@ #define __weak __attribute__((weak)) #endif +/* + * Use __hidden attribute to mark a non-static BPF subprogram effectively + * static for BPF verifier's verification algorithm purposes, allowing more + * extensive and permissive BPF verification process, taking into account + * subprogram's caller context. + */ +#define __hidden __attribute__((visibility("hidden"))) + /* When utilizing vmlinux.h with BPF CO-RE, user BPF programs can't include * any system-level headers (such as stddef.h, linux/version.h, etc), and * commonly-used macros like NULL and KERNEL_VERSION aren't available through @@ -51,7 +66,7 @@ #endif #ifndef KERNEL_VERSION -#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c)) +#define KERNEL_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c))) #endif /* diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index f9ef37707888..8c954ebc0c7c 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -413,20 +413,56 @@ typeof(name(0)) name(struct pt_regs *ctx) \ } \ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) +#define ___bpf_fill0(arr, p, x) do {} while (0) +#define ___bpf_fill1(arr, p, x) arr[p] = x +#define ___bpf_fill2(arr, p, x, args...) arr[p] = x; ___bpf_fill1(arr, p + 1, args) +#define ___bpf_fill3(arr, p, x, args...) arr[p] = x; ___bpf_fill2(arr, p + 1, args) +#define ___bpf_fill4(arr, p, x, args...) arr[p] = x; ___bpf_fill3(arr, p + 1, args) +#define ___bpf_fill5(arr, p, x, args...) arr[p] = x; ___bpf_fill4(arr, p + 1, args) +#define ___bpf_fill6(arr, p, x, args...) arr[p] = x; ___bpf_fill5(arr, p + 1, args) +#define ___bpf_fill7(arr, p, x, args...) arr[p] = x; ___bpf_fill6(arr, p + 1, args) +#define ___bpf_fill8(arr, p, x, args...) arr[p] = x; ___bpf_fill7(arr, p + 1, args) +#define ___bpf_fill9(arr, p, x, args...) arr[p] = x; ___bpf_fill8(arr, p + 1, args) +#define ___bpf_fill10(arr, p, x, args...) arr[p] = x; ___bpf_fill9(arr, p + 1, args) +#define ___bpf_fill11(arr, p, x, args...) arr[p] = x; ___bpf_fill10(arr, p + 1, args) +#define ___bpf_fill12(arr, p, x, args...) arr[p] = x; ___bpf_fill11(arr, p + 1, args) +#define ___bpf_fill(arr, args...) \ + ___bpf_apply(___bpf_fill, ___bpf_narg(args))(arr, 0, args) + /* * BPF_SEQ_PRINTF to wrap bpf_seq_printf to-be-printed values * in a structure. */ -#define BPF_SEQ_PRINTF(seq, fmt, args...) \ - ({ \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ - static const char ___fmt[] = fmt; \ - unsigned long long ___param[] = { args }; \ - _Pragma("GCC diagnostic pop") \ - int ___ret = bpf_seq_printf(seq, ___fmt, sizeof(___fmt), \ - ___param, sizeof(___param)); \ - ___ret; \ - }) +#define BPF_SEQ_PRINTF(seq, fmt, args...) \ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_seq_printf(seq, ___fmt, sizeof(___fmt), \ + ___param, sizeof(___param)); \ +}) + +/* + * BPF_SNPRINTF wraps the bpf_snprintf helper with variadic arguments instead of + * an array of u64. + */ +#define BPF_SNPRINTF(out, out_size, fmt, args...) \ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_snprintf(out, out_size, ___fmt, \ + ___param, sizeof(___param)); \ +}) #endif diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index d30e67e7e1e5..d57e13a13798 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1605,11 +1605,6 @@ static void *btf_add_type_mem(struct btf *btf, size_t add_sz) btf->hdr->type_len, UINT_MAX, add_sz); } -static __u32 btf_type_info(int kind, int vlen, int kflag) -{ - return (kflag << 31) | (kind << 24) | vlen; -} - static void btf_type_inc_vlen(struct btf_type *t) { t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, btf_kflag(t)); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7aad78dbb4b4..a1cddd17af7d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -69,8 +69,7 @@ #define __printf(a, b) __attribute__((format(printf, a, b))) static struct bpf_map *bpf_object__add_map(struct bpf_object *obj); -static const struct btf_type * -skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); +static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog); static int __base_pr(enum libbpf_print_level level, const char *format, va_list args) @@ -195,7 +194,6 @@ struct reloc_desc { int insn_idx; int map_idx; int sym_off; - bool processed; }; struct bpf_sec_def; @@ -275,6 +273,7 @@ struct bpf_program { bpf_program_clear_priv_t clear_priv; bool load; + bool mark_btf_static; enum bpf_prog_type type; enum bpf_attach_type expected_attach_type; int prog_ifindex; @@ -501,8 +500,6 @@ static Elf_Scn *elf_sec_by_name(const struct bpf_object *obj, const char *name); static int elf_sec_hdr(const struct bpf_object *obj, Elf_Scn *scn, GElf_Shdr *hdr); static const char *elf_sec_name(const struct bpf_object *obj, Elf_Scn *scn); static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn); -static int elf_sym_by_sec_off(const struct bpf_object *obj, size_t sec_idx, - size_t off, __u32 sym_type, GElf_Sym *sym); void bpf_program__unload(struct bpf_program *prog) { @@ -643,25 +640,29 @@ static int bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, const char *sec_name, int sec_idx) { + Elf_Data *symbols = obj->efile.symbols; struct bpf_program *prog, *progs; void *data = sec_data->d_buf; - size_t sec_sz = sec_data->d_size, sec_off, prog_sz; - int nr_progs, err; + size_t sec_sz = sec_data->d_size, sec_off, prog_sz, nr_syms; + int nr_progs, err, i; const char *name; GElf_Sym sym; progs = obj->programs; nr_progs = obj->nr_programs; + nr_syms = symbols->d_size / sizeof(GElf_Sym); sec_off = 0; - while (sec_off < sec_sz) { - if (elf_sym_by_sec_off(obj, sec_idx, sec_off, STT_FUNC, &sym)) { - pr_warn("sec '%s': failed to find program symbol at offset %zu\n", - sec_name, sec_off); - return -LIBBPF_ERRNO__FORMAT; - } + for (i = 0; i < nr_syms; i++) { + if (!gelf_getsym(symbols, i, &sym)) + continue; + if (sym.st_shndx != sec_idx) + continue; + if (GELF_ST_TYPE(sym.st_info) != STT_FUNC) + continue; prog_sz = sym.st_size; + sec_off = sym.st_value; name = elf_sym_str(obj, sym.st_name); if (!name) { @@ -699,10 +700,17 @@ bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, if (err) return err; + /* if function is a global/weak symbol, but has hidden + * visibility (STV_HIDDEN), mark its BTF FUNC as static to + * enable more permissive BPF verification mode with more + * outside context available to BPF verifier + */ + if (GELF_ST_BIND(sym.st_info) != STB_LOCAL + && GELF_ST_VISIBILITY(sym.st_other) == STV_HIDDEN) + prog->mark_btf_static = true; + nr_progs++; obj->nr_programs = nr_progs; - - sec_off += prog_sz; } return 0; @@ -1896,7 +1904,7 @@ static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict) return 0; } -static const struct btf_type * +const struct btf_type * skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id) { const struct btf_type *t = btf__type_by_id(btf, id); @@ -1951,16 +1959,11 @@ static const char *__btf_kind_str(__u16 kind) } } -static const char *btf_kind_str(const struct btf_type *t) +const char *btf_kind_str(const struct btf_type *t) { return __btf_kind_str(btf_kind(t)); } -static enum btf_func_linkage btf_func_linkage(const struct btf_type *t) -{ - return (enum btf_func_linkage)BTF_INFO_VLEN(t->info); -} - /* * Fetch integer attribute of BTF map definition. Such attributes are * represented using a pointer to an array, in which dimensionality of array @@ -2015,254 +2018,262 @@ static int build_map_pin_path(struct bpf_map *map, const char *path) return bpf_map__set_pin_path(map, buf); } - -static int parse_btf_map_def(struct bpf_object *obj, - struct bpf_map *map, - const struct btf_type *def, - bool strict, bool is_inner, - const char *pin_root_path) +int parse_btf_map_def(const char *map_name, struct btf *btf, + const struct btf_type *def_t, bool strict, + struct btf_map_def *map_def, struct btf_map_def *inner_def) { const struct btf_type *t; const struct btf_member *m; + bool is_inner = inner_def == NULL; int vlen, i; - vlen = btf_vlen(def); - m = btf_members(def); + vlen = btf_vlen(def_t); + m = btf_members(def_t); for (i = 0; i < vlen; i++, m++) { - const char *name = btf__name_by_offset(obj->btf, m->name_off); + const char *name = btf__name_by_offset(btf, m->name_off); if (!name) { - pr_warn("map '%s': invalid field #%d.\n", map->name, i); + pr_warn("map '%s': invalid field #%d.\n", map_name, i); return -EINVAL; } if (strcmp(name, "type") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, - &map->def.type)) + if (!get_map_field_int(map_name, btf, m, &map_def->map_type)) return -EINVAL; - pr_debug("map '%s': found type = %u.\n", - map->name, map->def.type); + map_def->parts |= MAP_DEF_MAP_TYPE; } else if (strcmp(name, "max_entries") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, - &map->def.max_entries)) + if (!get_map_field_int(map_name, btf, m, &map_def->max_entries)) return -EINVAL; - pr_debug("map '%s': found max_entries = %u.\n", - map->name, map->def.max_entries); + map_def->parts |= MAP_DEF_MAX_ENTRIES; } else if (strcmp(name, "map_flags") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, - &map->def.map_flags)) + if (!get_map_field_int(map_name, btf, m, &map_def->map_flags)) return -EINVAL; - pr_debug("map '%s': found map_flags = %u.\n", - map->name, map->def.map_flags); + map_def->parts |= MAP_DEF_MAP_FLAGS; } else if (strcmp(name, "numa_node") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, &map->numa_node)) + if (!get_map_field_int(map_name, btf, m, &map_def->numa_node)) return -EINVAL; - pr_debug("map '%s': found numa_node = %u.\n", map->name, map->numa_node); + map_def->parts |= MAP_DEF_NUMA_NODE; } else if (strcmp(name, "key_size") == 0) { __u32 sz; - if (!get_map_field_int(map->name, obj->btf, m, &sz)) + if (!get_map_field_int(map_name, btf, m, &sz)) return -EINVAL; - pr_debug("map '%s': found key_size = %u.\n", - map->name, sz); - if (map->def.key_size && map->def.key_size != sz) { + if (map_def->key_size && map_def->key_size != sz) { pr_warn("map '%s': conflicting key size %u != %u.\n", - map->name, map->def.key_size, sz); + map_name, map_def->key_size, sz); return -EINVAL; } - map->def.key_size = sz; + map_def->key_size = sz; + map_def->parts |= MAP_DEF_KEY_SIZE; } else if (strcmp(name, "key") == 0) { __s64 sz; - t = btf__type_by_id(obj->btf, m->type); + t = btf__type_by_id(btf, m->type); if (!t) { pr_warn("map '%s': key type [%d] not found.\n", - map->name, m->type); + map_name, m->type); return -EINVAL; } if (!btf_is_ptr(t)) { pr_warn("map '%s': key spec is not PTR: %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - sz = btf__resolve_size(obj->btf, t->type); + sz = btf__resolve_size(btf, t->type); if (sz < 0) { pr_warn("map '%s': can't determine key size for type [%u]: %zd.\n", - map->name, t->type, (ssize_t)sz); + map_name, t->type, (ssize_t)sz); return sz; } - pr_debug("map '%s': found key [%u], sz = %zd.\n", - map->name, t->type, (ssize_t)sz); - if (map->def.key_size && map->def.key_size != sz) { + if (map_def->key_size && map_def->key_size != sz) { pr_warn("map '%s': conflicting key size %u != %zd.\n", - map->name, map->def.key_size, (ssize_t)sz); + map_name, map_def->key_size, (ssize_t)sz); return -EINVAL; } - map->def.key_size = sz; - map->btf_key_type_id = t->type; + map_def->key_size = sz; + map_def->key_type_id = t->type; + map_def->parts |= MAP_DEF_KEY_SIZE | MAP_DEF_KEY_TYPE; } else if (strcmp(name, "value_size") == 0) { __u32 sz; - if (!get_map_field_int(map->name, obj->btf, m, &sz)) + if (!get_map_field_int(map_name, btf, m, &sz)) return -EINVAL; - pr_debug("map '%s': found value_size = %u.\n", - map->name, sz); - if (map->def.value_size && map->def.value_size != sz) { + if (map_def->value_size && map_def->value_size != sz) { pr_warn("map '%s': conflicting value size %u != %u.\n", - map->name, map->def.value_size, sz); + map_name, map_def->value_size, sz); return -EINVAL; } - map->def.value_size = sz; + map_def->value_size = sz; + map_def->parts |= MAP_DEF_VALUE_SIZE; } else if (strcmp(name, "value") == 0) { __s64 sz; - t = btf__type_by_id(obj->btf, m->type); + t = btf__type_by_id(btf, m->type); if (!t) { pr_warn("map '%s': value type [%d] not found.\n", - map->name, m->type); + map_name, m->type); return -EINVAL; } if (!btf_is_ptr(t)) { pr_warn("map '%s': value spec is not PTR: %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - sz = btf__resolve_size(obj->btf, t->type); + sz = btf__resolve_size(btf, t->type); if (sz < 0) { pr_warn("map '%s': can't determine value size for type [%u]: %zd.\n", - map->name, t->type, (ssize_t)sz); + map_name, t->type, (ssize_t)sz); return sz; } - pr_debug("map '%s': found value [%u], sz = %zd.\n", - map->name, t->type, (ssize_t)sz); - if (map->def.value_size && map->def.value_size != sz) { + if (map_def->value_size && map_def->value_size != sz) { pr_warn("map '%s': conflicting value size %u != %zd.\n", - map->name, map->def.value_size, (ssize_t)sz); + map_name, map_def->value_size, (ssize_t)sz); return -EINVAL; } - map->def.value_size = sz; - map->btf_value_type_id = t->type; + map_def->value_size = sz; + map_def->value_type_id = t->type; + map_def->parts |= MAP_DEF_VALUE_SIZE | MAP_DEF_VALUE_TYPE; } else if (strcmp(name, "values") == 0) { + char inner_map_name[128]; int err; if (is_inner) { pr_warn("map '%s': multi-level inner maps not supported.\n", - map->name); + map_name); return -ENOTSUP; } if (i != vlen - 1) { pr_warn("map '%s': '%s' member should be last.\n", - map->name, name); + map_name, name); return -EINVAL; } - if (!bpf_map_type__is_map_in_map(map->def.type)) { + if (!bpf_map_type__is_map_in_map(map_def->map_type)) { pr_warn("map '%s': should be map-in-map.\n", - map->name); + map_name); return -ENOTSUP; } - if (map->def.value_size && map->def.value_size != 4) { + if (map_def->value_size && map_def->value_size != 4) { pr_warn("map '%s': conflicting value size %u != 4.\n", - map->name, map->def.value_size); + map_name, map_def->value_size); return -EINVAL; } - map->def.value_size = 4; - t = btf__type_by_id(obj->btf, m->type); + map_def->value_size = 4; + t = btf__type_by_id(btf, m->type); if (!t) { pr_warn("map '%s': map-in-map inner type [%d] not found.\n", - map->name, m->type); + map_name, m->type); return -EINVAL; } if (!btf_is_array(t) || btf_array(t)->nelems) { pr_warn("map '%s': map-in-map inner spec is not a zero-sized array.\n", - map->name); + map_name); return -EINVAL; } - t = skip_mods_and_typedefs(obj->btf, btf_array(t)->type, - NULL); + t = skip_mods_and_typedefs(btf, btf_array(t)->type, NULL); if (!btf_is_ptr(t)) { pr_warn("map '%s': map-in-map inner def is of unexpected kind %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - t = skip_mods_and_typedefs(obj->btf, t->type, NULL); + t = skip_mods_and_typedefs(btf, t->type, NULL); if (!btf_is_struct(t)) { pr_warn("map '%s': map-in-map inner def is of unexpected kind %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - map->inner_map = calloc(1, sizeof(*map->inner_map)); - if (!map->inner_map) - return -ENOMEM; - map->inner_map->sec_idx = obj->efile.btf_maps_shndx; - map->inner_map->name = malloc(strlen(map->name) + - sizeof(".inner") + 1); - if (!map->inner_map->name) - return -ENOMEM; - sprintf(map->inner_map->name, "%s.inner", map->name); - - err = parse_btf_map_def(obj, map->inner_map, t, strict, - true /* is_inner */, NULL); + snprintf(inner_map_name, sizeof(inner_map_name), "%s.inner", map_name); + err = parse_btf_map_def(inner_map_name, btf, t, strict, inner_def, NULL); if (err) return err; + + map_def->parts |= MAP_DEF_INNER_MAP; } else if (strcmp(name, "pinning") == 0) { __u32 val; - int err; if (is_inner) { - pr_debug("map '%s': inner def can't be pinned.\n", - map->name); + pr_warn("map '%s': inner def can't be pinned.\n", map_name); return -EINVAL; } - if (!get_map_field_int(map->name, obj->btf, m, &val)) + if (!get_map_field_int(map_name, btf, m, &val)) return -EINVAL; - pr_debug("map '%s': found pinning = %u.\n", - map->name, val); - - if (val != LIBBPF_PIN_NONE && - val != LIBBPF_PIN_BY_NAME) { + if (val != LIBBPF_PIN_NONE && val != LIBBPF_PIN_BY_NAME) { pr_warn("map '%s': invalid pinning value %u.\n", - map->name, val); + map_name, val); return -EINVAL; } - if (val == LIBBPF_PIN_BY_NAME) { - err = build_map_pin_path(map, pin_root_path); - if (err) { - pr_warn("map '%s': couldn't build pin path.\n", - map->name); - return err; - } - } + map_def->pinning = val; + map_def->parts |= MAP_DEF_PINNING; } else { if (strict) { - pr_warn("map '%s': unknown field '%s'.\n", - map->name, name); + pr_warn("map '%s': unknown field '%s'.\n", map_name, name); return -ENOTSUP; } - pr_debug("map '%s': ignoring unknown field '%s'.\n", - map->name, name); + pr_debug("map '%s': ignoring unknown field '%s'.\n", map_name, name); } } - if (map->def.type == BPF_MAP_TYPE_UNSPEC) { - pr_warn("map '%s': map type isn't specified.\n", map->name); + if (map_def->map_type == BPF_MAP_TYPE_UNSPEC) { + pr_warn("map '%s': map type isn't specified.\n", map_name); return -EINVAL; } return 0; } +static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def) +{ + map->def.type = def->map_type; + map->def.key_size = def->key_size; + map->def.value_size = def->value_size; + map->def.max_entries = def->max_entries; + map->def.map_flags = def->map_flags; + + map->numa_node = def->numa_node; + map->btf_key_type_id = def->key_type_id; + map->btf_value_type_id = def->value_type_id; + + if (def->parts & MAP_DEF_MAP_TYPE) + pr_debug("map '%s': found type = %u.\n", map->name, def->map_type); + + if (def->parts & MAP_DEF_KEY_TYPE) + pr_debug("map '%s': found key [%u], sz = %u.\n", + map->name, def->key_type_id, def->key_size); + else if (def->parts & MAP_DEF_KEY_SIZE) + pr_debug("map '%s': found key_size = %u.\n", map->name, def->key_size); + + if (def->parts & MAP_DEF_VALUE_TYPE) + pr_debug("map '%s': found value [%u], sz = %u.\n", + map->name, def->value_type_id, def->value_size); + else if (def->parts & MAP_DEF_VALUE_SIZE) + pr_debug("map '%s': found value_size = %u.\n", map->name, def->value_size); + + if (def->parts & MAP_DEF_MAX_ENTRIES) + pr_debug("map '%s': found max_entries = %u.\n", map->name, def->max_entries); + if (def->parts & MAP_DEF_MAP_FLAGS) + pr_debug("map '%s': found map_flags = %u.\n", map->name, def->map_flags); + if (def->parts & MAP_DEF_PINNING) + pr_debug("map '%s': found pinning = %u.\n", map->name, def->pinning); + if (def->parts & MAP_DEF_NUMA_NODE) + pr_debug("map '%s': found numa_node = %u.\n", map->name, def->numa_node); + + if (def->parts & MAP_DEF_INNER_MAP) + pr_debug("map '%s': found inner map definition.\n", map->name); +} + static int bpf_object__init_user_btf_map(struct bpf_object *obj, const struct btf_type *sec, int var_idx, int sec_idx, const Elf_Data *data, bool strict, const char *pin_root_path) { + struct btf_map_def map_def = {}, inner_def = {}; const struct btf_type *var, *def; const struct btf_var_secinfo *vi; const struct btf_var *var_extra; const char *map_name; struct bpf_map *map; + int err; vi = btf_var_secinfos(sec) + var_idx; var = btf__type_by_id(obj->btf, vi->type); @@ -2316,7 +2327,35 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, pr_debug("map '%s': at sec_idx %d, offset %zu.\n", map_name, map->sec_idx, map->sec_offset); - return parse_btf_map_def(obj, map, def, strict, false, pin_root_path); + err = parse_btf_map_def(map->name, obj->btf, def, strict, &map_def, &inner_def); + if (err) + return err; + + fill_map_from_def(map, &map_def); + + if (map_def.pinning == LIBBPF_PIN_BY_NAME) { + err = build_map_pin_path(map, pin_root_path); + if (err) { + pr_warn("map '%s': couldn't build pin path.\n", map->name); + return err; + } + } + + if (map_def.parts & MAP_DEF_INNER_MAP) { + map->inner_map = calloc(1, sizeof(*map->inner_map)); + if (!map->inner_map) + return -ENOMEM; + map->inner_map->fd = -1; + map->inner_map->sec_idx = sec_idx; + map->inner_map->name = malloc(strlen(map_name) + sizeof(".inner") + 1); + if (!map->inner_map->name) + return -ENOMEM; + sprintf(map->inner_map->name, "%s.inner", map_name); + + fill_map_from_def(map->inner_map, &inner_def); + } + + return 0; } static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, @@ -2618,7 +2657,7 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) { struct btf *kern_btf = obj->btf; bool btf_mandatory, sanitize; - int err = 0; + int i, err = 0; if (!obj->btf) return 0; @@ -2632,6 +2671,38 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) return 0; } + /* Even though some subprogs are global/weak, user might prefer more + * permissive BPF verification process that BPF verifier performs for + * static functions, taking into account more context from the caller + * functions. In such case, they need to mark such subprogs with + * __attribute__((visibility("hidden"))) and libbpf will adjust + * corresponding FUNC BTF type to be marked as static and trigger more + * involved BPF verification process. + */ + for (i = 0; i < obj->nr_programs; i++) { + struct bpf_program *prog = &obj->programs[i]; + struct btf_type *t; + const char *name; + int j, n; + + if (!prog->mark_btf_static || !prog_is_subprog(obj, prog)) + continue; + + n = btf__get_nr_types(obj->btf); + for (j = 1; j <= n; j++) { + t = btf_type_by_id(obj->btf, j); + if (!btf_is_func(t) || btf_func_linkage(t) != BTF_FUNC_GLOBAL) + continue; + + name = btf__str_by_offset(obj->btf, t->name_off); + if (strcmp(name, prog->name) != 0) + continue; + + t->info = btf_type_info(BTF_KIND_FUNC, BTF_FUNC_STATIC, 0); + break; + } + } + sanitize = btf_needs_sanitization(obj); if (sanitize) { const void *raw_data; @@ -2782,26 +2853,6 @@ static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn) return data; } -static int elf_sym_by_sec_off(const struct bpf_object *obj, size_t sec_idx, - size_t off, __u32 sym_type, GElf_Sym *sym) -{ - Elf_Data *symbols = obj->efile.symbols; - size_t n = symbols->d_size / sizeof(GElf_Sym); - int i; - - for (i = 0; i < n; i++) { - if (!gelf_getsym(symbols, i, sym)) - continue; - if (sym->st_shndx != sec_idx || sym->st_value != off) - continue; - if (GELF_ST_TYPE(sym->st_info) != sym_type) - continue; - return 0; - } - - return -ENOENT; -} - static bool is_sec_name_dwarf(const char *name) { /* approximation, but the actual list is too long */ @@ -3498,8 +3549,6 @@ static int bpf_program__record_reloc(struct bpf_program *prog, const char *sym_sec_name; struct bpf_map *map; - reloc_desc->processed = false; - if (!is_call_insn(insn) && !is_ldimm64_insn(insn)) { pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n", prog->name, sym_name, insn_idx, insn->code); @@ -3682,11 +3731,16 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data int err, i, nrels; const char *sym_name; __u32 insn_idx; + Elf_Scn *scn; + Elf_Data *scn_data; GElf_Sym sym; GElf_Rel rel; + scn = elf_sec_by_idx(obj, sec_idx); + scn_data = elf_sec_data(obj, scn); + relo_sec_name = elf_sec_str(obj, shdr->sh_name); - sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); + sec_name = elf_sec_name(obj, scn); if (!relo_sec_name || !sec_name) return -EINVAL; @@ -3704,7 +3758,8 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data relo_sec_name, (size_t)GELF_R_SYM(rel.r_info), i); return -LIBBPF_ERRNO__FORMAT; } - if (rel.r_offset % BPF_INSN_SZ) { + + if (rel.r_offset % BPF_INSN_SZ || rel.r_offset >= scn_data->d_size) { pr_warn("sec '%s': invalid offset 0x%zx for relo #%d\n", relo_sec_name, (size_t)GELF_R_SYM(rel.r_info), i); return -LIBBPF_ERRNO__FORMAT; @@ -3728,9 +3783,9 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); if (!prog) { - pr_warn("sec '%s': relo #%d: program not found in section '%s' for insn #%u\n", + pr_debug("sec '%s': relo #%d: couldn't find program in section '%s' for insn #%u, probably overridden weak function, skipping...\n", relo_sec_name, i, sec_name, insn_idx); - return -LIBBPF_ERRNO__RELOC; + continue; } relos = libbpf_reallocarray(prog->reloc_desc, @@ -3845,6 +3900,14 @@ __u32 bpf_map__max_entries(const struct bpf_map *map) return map->def.max_entries; } +struct bpf_map *bpf_map__inner_map(struct bpf_map *map) +{ + if (!bpf_map_type__is_map_in_map(map->def.type)) + return NULL; + + return map->inner_map; +} + int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) { if (map->fd >= 0) @@ -6305,13 +6368,11 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) case RELO_LD64: insn[0].src_reg = BPF_PSEUDO_MAP_FD; insn[0].imm = obj->maps[relo->map_idx].fd; - relo->processed = true; break; case RELO_DATA: insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; insn[1].imm = insn[0].imm + relo->sym_off; insn[0].imm = obj->maps[relo->map_idx].fd; - relo->processed = true; break; case RELO_EXTERN_VAR: ext = &obj->externs[relo->sym_off]; @@ -6329,13 +6390,11 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) insn[1].imm = ext->ksym.addr >> 32; } } - relo->processed = true; break; case RELO_EXTERN_FUNC: ext = &obj->externs[relo->sym_off]; insn[0].src_reg = BPF_PSEUDO_KFUNC_CALL; insn[0].imm = ext->ksym.kernel_btf_id; - relo->processed = true; break; case RELO_SUBPROG_ADDR: insn[0].src_reg = BPF_PSEUDO_FUNC; @@ -6621,9 +6680,6 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, * different main programs */ insn->imm = subprog->sub_insn_off - (prog->sub_insn_off + insn_idx) - 1; - if (relo) - relo->processed = true; - pr_debug("prog '%s': insn #%zu relocated, imm %d points to subprog '%s' (now at %zu offset)\n", prog->name, insn_idx, insn->imm, subprog->name, subprog->sub_insn_off); } @@ -6716,7 +6772,7 @@ static int bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog) { struct bpf_program *subprog; - int i, j, err; + int i, err; /* mark all subprogs as not relocated (yet) within the context of * current main program @@ -6727,9 +6783,6 @@ bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog) continue; subprog->sub_insn_off = 0; - for (j = 0; j < subprog->nr_reloc; j++) - if (subprog->reloc_desc[j].type == RELO_CALL) - subprog->reloc_desc[j].processed = false; } err = bpf_object__reloc_code(obj, prog, prog); @@ -6976,7 +7029,7 @@ static bool insn_is_helper_call(struct bpf_insn *insn, enum bpf_func_id *func_id return false; } -static int bpf_object__sanitize_prog(struct bpf_object* obj, struct bpf_program *prog) +static int bpf_object__sanitize_prog(struct bpf_object *obj, struct bpf_program *prog) { struct bpf_insn *insn = prog->insns; enum bpf_func_id func_id; @@ -9476,6 +9529,7 @@ int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd) pr_warn("error: inner_map_fd already specified\n"); return -EINVAL; } + zfree(&map->inner_map); map->inner_map_fd = fd; return 0; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index f500621d28e5..bec4e6a6e31d 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -480,6 +480,7 @@ LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); LIBBPF_API int bpf_map__unpin(struct bpf_map *map, const char *path); LIBBPF_API int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd); +LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map); LIBBPF_API long libbpf_get_error(const void *ptr); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index f5990f7208ce..b9b29baf1df8 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -359,5 +359,6 @@ LIBBPF_0.4.0 { bpf_linker__finalize; bpf_linker__free; bpf_linker__new; + bpf_map__inner_map; bpf_object__set_kversion; } LIBBPF_0.3.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 6017902c687e..ee426226928f 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -19,6 +19,7 @@ #pragma GCC poison reallocarray #include "libbpf.h" +#include "btf.h" #ifndef EM_BPF #define EM_BPF 247 @@ -131,6 +132,50 @@ struct btf; struct btf_type; struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id); +const char *btf_kind_str(const struct btf_type *t); +const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); + +static inline enum btf_func_linkage btf_func_linkage(const struct btf_type *t) +{ + return (enum btf_func_linkage)(int)btf_vlen(t); +} + +static inline __u32 btf_type_info(int kind, int vlen, int kflag) +{ + return (kflag << 31) | (kind << 24) | vlen; +} + +enum map_def_parts { + MAP_DEF_MAP_TYPE = 0x001, + MAP_DEF_KEY_TYPE = 0x002, + MAP_DEF_KEY_SIZE = 0x004, + MAP_DEF_VALUE_TYPE = 0x008, + MAP_DEF_VALUE_SIZE = 0x010, + MAP_DEF_MAX_ENTRIES = 0x020, + MAP_DEF_MAP_FLAGS = 0x040, + MAP_DEF_NUMA_NODE = 0x080, + MAP_DEF_PINNING = 0x100, + MAP_DEF_INNER_MAP = 0x200, + + MAP_DEF_ALL = 0x3ff, /* combination of all above */ +}; + +struct btf_map_def { + enum map_def_parts parts; + __u32 map_type; + __u32 key_type_id; + __u32 key_size; + __u32 value_type_id; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; + __u32 numa_node; + __u32 pinning; +}; + +int parse_btf_map_def(const char *map_name, struct btf *btf, + const struct btf_type *def_t, bool strict, + struct btf_map_def *map_def, struct btf_map_def *inner_def); void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t cur_cnt, size_t max_cnt, size_t add_cnt); diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 46b16cbdcda3..9de084b1c699 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -22,6 +22,8 @@ #include "libbpf_internal.h" #include "strset.h" +#define BTF_EXTERN_SEC ".extern" + struct src_sec { const char *sec_name; /* positional (not necessarily ELF) index in an array of sections */ @@ -74,11 +76,36 @@ struct btf_ext_sec_data { void *recs; }; +struct glob_sym { + /* ELF symbol index */ + int sym_idx; + /* associated section id for .ksyms, .kconfig, etc, but not .extern */ + int sec_id; + /* extern name offset in STRTAB */ + int name_off; + /* optional associated BTF type ID */ + int btf_id; + /* BTF type ID to which VAR/FUNC type is pointing to; used for + * rewriting types when extern VAR/FUNC is resolved to a concrete + * definition + */ + int underlying_btf_id; + /* sec_var index in the corresponding dst_sec, if exists */ + int var_idx; + + /* extern or resolved/global symbol */ + bool is_extern; + /* weak or strong symbol, never goes back from strong to weak */ + bool is_weak; +}; + struct dst_sec { char *sec_name; /* positional (not necessarily ELF) index in an array of sections */ int id; + bool ephemeral; + /* ELF info */ size_t sec_idx; Elf_Scn *scn; @@ -120,22 +147,28 @@ struct bpf_linker { struct btf *btf; struct btf_ext *btf_ext; + + /* global (including extern) ELF symbols */ + int glob_sym_cnt; + struct glob_sym *glob_syms; }; #define pr_warn_elf(fmt, ...) \ -do { \ - libbpf_print(LIBBPF_WARN, "libbpf: " fmt ": %s\n", ##__VA_ARGS__, elf_errmsg(-1)); \ -} while (0) + libbpf_print(LIBBPF_WARN, "libbpf: " fmt ": %s\n", ##__VA_ARGS__, elf_errmsg(-1)) static int init_output_elf(struct bpf_linker *linker, const char *file); static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, struct src_obj *obj); static int linker_sanity_check_elf(struct src_obj *obj); +static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec); +static int linker_sanity_check_elf_relos(struct src_obj *obj, struct src_sec *sec); static int linker_sanity_check_btf(struct src_obj *obj); static int linker_sanity_check_btf_ext(struct src_obj *obj); static int linker_fixup_btf(struct src_obj *obj); static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj); static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, + Elf64_Sym *sym, const char *sym_name, int src_sym_idx); static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj); static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj); static int linker_append_btf_ext(struct bpf_linker *linker, struct src_obj *obj); @@ -282,7 +315,7 @@ static int init_output_elf(struct bpf_linker *linker, const char *file) /* ELF header */ linker->elf_hdr = elf64_newehdr(linker->elf); - if (!linker->elf_hdr){ + if (!linker->elf_hdr) { pr_warn_elf("failed to create ELF header"); return -EINVAL; } @@ -663,8 +696,8 @@ static bool is_pow_of_2(size_t x) static int linker_sanity_check_elf(struct src_obj *obj) { - struct src_sec *sec, *link_sec; - int i, j, n; + struct src_sec *sec; + int i, err; if (!obj->symtab_sec_idx) { pr_warn("ELF is missing SYMTAB section in %s\n", obj->filename); @@ -692,43 +725,11 @@ static int linker_sanity_check_elf(struct src_obj *obj) return -EINVAL; switch (sec->shdr->sh_type) { - case SHT_SYMTAB: { - Elf64_Sym *sym; - - if (sec->shdr->sh_entsize != sizeof(Elf64_Sym)) - return -EINVAL; - if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) - return -EINVAL; - - if (!sec->shdr->sh_link || sec->shdr->sh_link >= obj->sec_cnt) { - pr_warn("ELF SYMTAB section #%zu points to missing STRTAB section #%zu in %s\n", - sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); - return -EINVAL; - } - link_sec = &obj->secs[sec->shdr->sh_link]; - if (link_sec->shdr->sh_type != SHT_STRTAB) { - pr_warn("ELF SYMTAB section #%zu points to invalid STRTAB section #%zu in %s\n", - sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); - return -EINVAL; - } - - n = sec->shdr->sh_size / sec->shdr->sh_entsize; - sym = sec->data->d_buf; - for (j = 0; j < n; j++, sym++) { - if (sym->st_shndx - && sym->st_shndx < SHN_LORESERVE - && sym->st_shndx >= obj->sec_cnt) { - pr_warn("ELF sym #%d in section #%zu points to missing section #%zu in %s\n", - j, sec->sec_idx, (size_t)sym->st_shndx, obj->filename); - return -EINVAL; - } - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION) { - if (sym->st_value != 0) - return -EINVAL; - } - } + case SHT_SYMTAB: + err = linker_sanity_check_elf_symtab(obj, sec); + if (err) + return err; break; - } case SHT_STRTAB: break; case SHT_PROGBITS: @@ -739,87 +740,169 @@ static int linker_sanity_check_elf(struct src_obj *obj) break; case SHT_NOBITS: break; - case SHT_REL: { - Elf64_Rel *relo; - struct src_sec *sym_sec; + case SHT_REL: + err = linker_sanity_check_elf_relos(obj, sec); + if (err) + return err; + break; + case SHT_LLVM_ADDRSIG: + break; + default: + pr_warn("ELF section #%zu (%s) has unrecognized type %zu in %s\n", + sec->sec_idx, sec->sec_name, (size_t)sec->shdr->sh_type, obj->filename); + return -EINVAL; + } + } - if (sec->shdr->sh_entsize != sizeof(Elf64_Rel)) - return -EINVAL; - if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) - return -EINVAL; + return 0; +} - /* SHT_REL's sh_link should point to SYMTAB */ - if (sec->shdr->sh_link != obj->symtab_sec_idx) { - pr_warn("ELF relo section #%zu points to invalid SYMTAB section #%zu in %s\n", - sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); - return -EINVAL; - } +static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec) +{ + struct src_sec *link_sec; + Elf64_Sym *sym; + int i, n; + + if (sec->shdr->sh_entsize != sizeof(Elf64_Sym)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; + + if (!sec->shdr->sh_link || sec->shdr->sh_link >= obj->sec_cnt) { + pr_warn("ELF SYMTAB section #%zu points to missing STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_link]; + if (link_sec->shdr->sh_type != SHT_STRTAB) { + pr_warn("ELF SYMTAB section #%zu points to invalid STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } - /* SHT_REL's sh_info points to relocated section */ - if (!sec->shdr->sh_info || sec->shdr->sh_info >= obj->sec_cnt) { - pr_warn("ELF relo section #%zu points to missing section #%zu in %s\n", - sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + sym = sec->data->d_buf; + for (i = 0; i < n; i++, sym++) { + int sym_type = ELF64_ST_TYPE(sym->st_info); + int sym_bind = ELF64_ST_BIND(sym->st_info); + int sym_vis = ELF64_ST_VISIBILITY(sym->st_other); + + if (i == 0) { + if (sym->st_name != 0 || sym->st_info != 0 + || sym->st_other != 0 || sym->st_shndx != 0 + || sym->st_value != 0 || sym->st_size != 0) { + pr_warn("ELF sym #0 is invalid in %s\n", obj->filename); return -EINVAL; } - link_sec = &obj->secs[sec->shdr->sh_info]; + continue; + } + if (sym_bind != STB_LOCAL && sym_bind != STB_GLOBAL && sym_bind != STB_WEAK) { + pr_warn("ELF sym #%d in section #%zu has unsupported symbol binding %d\n", + i, sec->sec_idx, sym_bind); + return -EINVAL; + } + if (sym_vis != STV_DEFAULT && sym_vis != STV_HIDDEN) { + pr_warn("ELF sym #%d in section #%zu has unsupported symbol visibility %d\n", + i, sec->sec_idx, sym_vis); + return -EINVAL; + } + if (sym->st_shndx == 0) { + if (sym_type != STT_NOTYPE || sym_bind == STB_LOCAL + || sym->st_value != 0 || sym->st_size != 0) { + pr_warn("ELF sym #%d is invalid extern symbol in %s\n", + i, obj->filename); - /* .rel<secname> -> <secname> pattern is followed */ - if (strncmp(sec->sec_name, ".rel", sizeof(".rel") - 1) != 0 - || strcmp(sec->sec_name + sizeof(".rel") - 1, link_sec->sec_name) != 0) { - pr_warn("ELF relo section #%zu name has invalid name in %s\n", - sec->sec_idx, obj->filename); return -EINVAL; } + continue; + } + if (sym->st_shndx < SHN_LORESERVE && sym->st_shndx >= obj->sec_cnt) { + pr_warn("ELF sym #%d in section #%zu points to missing section #%zu in %s\n", + i, sec->sec_idx, (size_t)sym->st_shndx, obj->filename); + return -EINVAL; + } + if (sym_type == STT_SECTION) { + if (sym->st_value != 0) + return -EINVAL; + continue; + } + } - /* don't further validate relocations for ignored sections */ - if (link_sec->skipped) - break; + return 0; +} - /* relocatable section is data or instructions */ - if (link_sec->shdr->sh_type != SHT_PROGBITS - && link_sec->shdr->sh_type != SHT_NOBITS) { - pr_warn("ELF relo section #%zu points to invalid section #%zu in %s\n", - sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); - return -EINVAL; - } +static int linker_sanity_check_elf_relos(struct src_obj *obj, struct src_sec *sec) +{ + struct src_sec *link_sec, *sym_sec; + Elf64_Rel *relo; + int i, n; - /* check sanity of each relocation */ - n = sec->shdr->sh_size / sec->shdr->sh_entsize; - relo = sec->data->d_buf; - sym_sec = &obj->secs[obj->symtab_sec_idx]; - for (j = 0; j < n; j++, relo++) { - size_t sym_idx = ELF64_R_SYM(relo->r_info); - size_t sym_type = ELF64_R_TYPE(relo->r_info); - - if (sym_type != R_BPF_64_64 && sym_type != R_BPF_64_32) { - pr_warn("ELF relo #%d in section #%zu has unexpected type %zu in %s\n", - j, sec->sec_idx, sym_type, obj->filename); - return -EINVAL; - } + if (sec->shdr->sh_entsize != sizeof(Elf64_Rel)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; - if (!sym_idx || sym_idx * sizeof(Elf64_Sym) >= sym_sec->shdr->sh_size) { - pr_warn("ELF relo #%d in section #%zu points to invalid symbol #%zu in %s\n", - j, sec->sec_idx, sym_idx, obj->filename); - return -EINVAL; - } + /* SHT_REL's sh_link should point to SYMTAB */ + if (sec->shdr->sh_link != obj->symtab_sec_idx) { + pr_warn("ELF relo section #%zu points to invalid SYMTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } - if (link_sec->shdr->sh_flags & SHF_EXECINSTR) { - if (relo->r_offset % sizeof(struct bpf_insn) != 0) { - pr_warn("ELF relo #%d in section #%zu points to missing symbol #%zu in %s\n", - j, sec->sec_idx, sym_idx, obj->filename); - return -EINVAL; - } - } - } - break; + /* SHT_REL's sh_info points to relocated section */ + if (!sec->shdr->sh_info || sec->shdr->sh_info >= obj->sec_cnt) { + pr_warn("ELF relo section #%zu points to missing section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_info]; + + /* .rel<secname> -> <secname> pattern is followed */ + if (strncmp(sec->sec_name, ".rel", sizeof(".rel") - 1) != 0 + || strcmp(sec->sec_name + sizeof(".rel") - 1, link_sec->sec_name) != 0) { + pr_warn("ELF relo section #%zu name has invalid name in %s\n", + sec->sec_idx, obj->filename); + return -EINVAL; + } + + /* don't further validate relocations for ignored sections */ + if (link_sec->skipped) + return 0; + + /* relocatable section is data or instructions */ + if (link_sec->shdr->sh_type != SHT_PROGBITS && link_sec->shdr->sh_type != SHT_NOBITS) { + pr_warn("ELF relo section #%zu points to invalid section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + + /* check sanity of each relocation */ + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + relo = sec->data->d_buf; + sym_sec = &obj->secs[obj->symtab_sec_idx]; + for (i = 0; i < n; i++, relo++) { + size_t sym_idx = ELF64_R_SYM(relo->r_info); + size_t sym_type = ELF64_R_TYPE(relo->r_info); + + if (sym_type != R_BPF_64_64 && sym_type != R_BPF_64_32) { + pr_warn("ELF relo #%d in section #%zu has unexpected type %zu in %s\n", + i, sec->sec_idx, sym_type, obj->filename); + return -EINVAL; } - case SHT_LLVM_ADDRSIG: - break; - default: - pr_warn("ELF section #%zu (%s) has unrecognized type %zu in %s\n", - sec->sec_idx, sec->sec_name, (size_t)sec->shdr->sh_type, obj->filename); + + if (!sym_idx || sym_idx * sizeof(Elf64_Sym) >= sym_sec->shdr->sh_size) { + pr_warn("ELF relo #%d in section #%zu points to invalid symbol #%zu in %s\n", + i, sec->sec_idx, sym_idx, obj->filename); return -EINVAL; } + + if (link_sec->shdr->sh_flags & SHF_EXECINSTR) { + if (relo->r_offset % sizeof(struct bpf_insn) != 0) { + pr_warn("ELF relo #%d in section #%zu points to missing symbol #%zu in %s\n", + i, sec->sec_idx, sym_idx, obj->filename); + return -EINVAL; + } + } } return 0; @@ -897,6 +980,7 @@ static int init_sec(struct bpf_linker *linker, struct dst_sec *dst_sec, struct s dst_sec->sec_sz = 0; dst_sec->sec_idx = 0; + dst_sec->ephemeral = src_sec->ephemeral; /* ephemeral sections are just thin section shells lacking most parts */ if (src_sec->ephemeral) @@ -904,13 +988,13 @@ static int init_sec(struct bpf_linker *linker, struct dst_sec *dst_sec, struct s scn = elf_newscn(linker->elf); if (!scn) - return -1; + return -ENOMEM; data = elf_newdata(scn); if (!data) - return -1; + return -ENOMEM; shdr = elf64_getshdr(scn); if (!shdr) - return -1; + return -ENOMEM; dst_sec->scn = scn; dst_sec->shdr = shdr; @@ -960,6 +1044,9 @@ static struct dst_sec *find_dst_sec_by_name(struct bpf_linker *linker, const cha static bool secs_match(struct dst_sec *dst, struct src_sec *src) { + if (dst->ephemeral || src->ephemeral) + return true; + if (dst->shdr->sh_type != src->shdr->sh_type) { pr_warn("sec %s types mismatch\n", dst->sec_name); return false; @@ -985,13 +1072,33 @@ static bool sec_content_is_same(struct dst_sec *dst_sec, struct src_sec *src_sec return true; } -static int extend_sec(struct dst_sec *dst, struct src_sec *src) +static int extend_sec(struct bpf_linker *linker, struct dst_sec *dst, struct src_sec *src) { void *tmp; - size_t dst_align = dst->shdr->sh_addralign; - size_t src_align = src->shdr->sh_addralign; + size_t dst_align, src_align; size_t dst_align_sz, dst_final_sz; + int err; + + /* Ephemeral source section doesn't contribute anything to ELF + * section data. + */ + if (src->ephemeral) + return 0; + + /* Some sections (like .maps) can contain both externs (and thus be + * ephemeral) and non-externs (map definitions). So it's possible that + * it has to be "upgraded" from ephemeral to non-ephemeral when the + * first non-ephemeral entity appears. In such case, we add ELF + * section, data, etc. + */ + if (dst->ephemeral) { + err = init_sec(linker, dst, src); + if (err) + return err; + } + dst_align = dst->shdr->sh_addralign; + src_align = src->shdr->sh_addralign; if (dst_align == 0) dst_align = 1; if (dst_align < src_align) @@ -1087,10 +1194,7 @@ static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj /* record mapped section index */ src_sec->dst_id = dst_sec->id; - if (src_sec->ephemeral) - continue; - - err = extend_sec(dst_sec, src_sec); + err = extend_sec(linker, dst_sec, src_sec); if (err) return err; } @@ -1101,68 +1205,778 @@ static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj) { struct src_sec *symtab = &obj->secs[obj->symtab_sec_idx]; - Elf64_Sym *sym = symtab->data->d_buf, *dst_sym; - int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize; + Elf64_Sym *sym = symtab->data->d_buf; + int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize, err; int str_sec_idx = symtab->shdr->sh_link; + const char *sym_name; obj->sym_map = calloc(n + 1, sizeof(*obj->sym_map)); if (!obj->sym_map) return -ENOMEM; for (i = 0; i < n; i++, sym++) { - struct src_sec *src_sec = NULL; - struct dst_sec *dst_sec = NULL; - const char *sym_name; - size_t dst_sym_idx; - int name_off; - - /* we already have all-zero initial symbol */ - if (sym->st_name == 0 && sym->st_info == 0 && - sym->st_other == 0 && sym->st_shndx == SHN_UNDEF && - sym->st_value == 0 && sym->st_size ==0) + /* We already validated all-zero symbol #0 and we already + * appended it preventively to the final SYMTAB, so skip it. + */ + if (i == 0) continue; sym_name = elf_strptr(obj->elf, str_sec_idx, sym->st_name); if (!sym_name) { pr_warn("can't fetch symbol name for symbol #%d in '%s'\n", i, obj->filename); - return -1; + return -EINVAL; + } + + err = linker_append_elf_sym(linker, obj, sym, sym_name, i); + if (err) + return err; + } + + return 0; +} + +static Elf64_Sym *get_sym_by_idx(struct bpf_linker *linker, size_t sym_idx) +{ + struct dst_sec *symtab = &linker->secs[linker->symtab_sec_idx]; + Elf64_Sym *syms = symtab->raw_data; + + return &syms[sym_idx]; +} + +static struct glob_sym *find_glob_sym(struct bpf_linker *linker, const char *sym_name) +{ + struct glob_sym *glob_sym; + const char *name; + int i; + + for (i = 0; i < linker->glob_sym_cnt; i++) { + glob_sym = &linker->glob_syms[i]; + name = strset__data(linker->strtab_strs) + glob_sym->name_off; + + if (strcmp(name, sym_name) == 0) + return glob_sym; + } + + return NULL; +} + +static struct glob_sym *add_glob_sym(struct bpf_linker *linker) +{ + struct glob_sym *syms, *sym; + + syms = libbpf_reallocarray(linker->glob_syms, linker->glob_sym_cnt + 1, + sizeof(*linker->glob_syms)); + if (!syms) + return NULL; + + sym = &syms[linker->glob_sym_cnt]; + memset(sym, 0, sizeof(*sym)); + sym->var_idx = -1; + + linker->glob_syms = syms; + linker->glob_sym_cnt++; + + return sym; +} + +static bool glob_sym_btf_matches(const char *sym_name, bool exact, + const struct btf *btf1, __u32 id1, + const struct btf *btf2, __u32 id2) +{ + const struct btf_type *t1, *t2; + bool is_static1, is_static2; + const char *n1, *n2; + int i, n; + +recur: + n1 = n2 = NULL; + t1 = skip_mods_and_typedefs(btf1, id1, &id1); + t2 = skip_mods_and_typedefs(btf2, id2, &id2); + + /* check if only one side is FWD, otherwise handle with common logic */ + if (!exact && btf_is_fwd(t1) != btf_is_fwd(t2)) { + n1 = btf__str_by_offset(btf1, t1->name_off); + n2 = btf__str_by_offset(btf2, t2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible forward declaration names '%s' and '%s'\n", + sym_name, n1, n2); + return false; } + /* validate if FWD kind matches concrete kind */ + if (btf_is_fwd(t1)) { + if (btf_kflag(t1) && btf_is_union(t2)) + return true; + if (!btf_kflag(t1) && btf_is_struct(t2)) + return true; + pr_warn("global '%s': incompatible %s forward declaration and concrete kind %s\n", + sym_name, btf_kflag(t1) ? "union" : "struct", btf_kind_str(t2)); + } else { + if (btf_kflag(t2) && btf_is_union(t1)) + return true; + if (!btf_kflag(t2) && btf_is_struct(t1)) + return true; + pr_warn("global '%s': incompatible %s forward declaration and concrete kind %s\n", + sym_name, btf_kflag(t2) ? "union" : "struct", btf_kind_str(t1)); + } + return false; + } + + if (btf_kind(t1) != btf_kind(t2)) { + pr_warn("global '%s': incompatible BTF kinds %s and %s\n", + sym_name, btf_kind_str(t1), btf_kind_str(t2)); + return false; + } + + switch (btf_kind(t1)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + n1 = btf__str_by_offset(btf1, t1->name_off); + n2 = btf__str_by_offset(btf2, t2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible %s names '%s' and '%s'\n", + sym_name, btf_kind_str(t1), n1, n2); + return false; + } + break; + default: + break; + } + + switch (btf_kind(t1)) { + case BTF_KIND_UNKN: /* void */ + case BTF_KIND_FWD: + return true; + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + /* ignore encoding for int and enum values for enum */ + if (t1->size != t2->size) { + pr_warn("global '%s': incompatible %s '%s' size %u and %u\n", + sym_name, btf_kind_str(t1), n1, t1->size, t2->size); + return false; + } + return true; + case BTF_KIND_PTR: + /* just validate overall shape of the referenced type, so no + * contents comparison for struct/union, and allowd fwd vs + * struct/union + */ + exact = false; + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_ARRAY: + /* ignore index type and array size */ + id1 = btf_array(t1)->type; + id2 = btf_array(t2)->type; + goto recur; + case BTF_KIND_FUNC: + /* extern and global linkages are compatible */ + is_static1 = btf_func_linkage(t1) == BTF_FUNC_STATIC; + is_static2 = btf_func_linkage(t2) == BTF_FUNC_STATIC; + if (is_static1 != is_static2) { + pr_warn("global '%s': incompatible func '%s' linkage\n", sym_name, n1); + return false; + } + + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_VAR: + /* extern and global linkages are compatible */ + is_static1 = btf_var(t1)->linkage == BTF_VAR_STATIC; + is_static2 = btf_var(t2)->linkage == BTF_VAR_STATIC; + if (is_static1 != is_static2) { + pr_warn("global '%s': incompatible var '%s' linkage\n", sym_name, n1); + return false; + } + + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m1, *m2; + + if (!exact) + return true; + + if (btf_vlen(t1) != btf_vlen(t2)) { + pr_warn("global '%s': incompatible number of %s fields %u and %u\n", + sym_name, btf_kind_str(t1), btf_vlen(t1), btf_vlen(t2)); + return false; + } + + n = btf_vlen(t1); + m1 = btf_members(t1); + m2 = btf_members(t2); + for (i = 0; i < n; i++, m1++, m2++) { + n1 = btf__str_by_offset(btf1, m1->name_off); + n2 = btf__str_by_offset(btf2, m2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible field #%d names '%s' and '%s'\n", + sym_name, i, n1, n2); + return false; + } + if (m1->offset != m2->offset) { + pr_warn("global '%s': incompatible field #%d ('%s') offsets\n", + sym_name, i, n1); + return false; + } + if (!glob_sym_btf_matches(sym_name, exact, btf1, m1->type, btf2, m2->type)) + return false; + } + + return true; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *m1, *m2; + + if (btf_vlen(t1) != btf_vlen(t2)) { + pr_warn("global '%s': incompatible number of %s params %u and %u\n", + sym_name, btf_kind_str(t1), btf_vlen(t1), btf_vlen(t2)); + return false; + } + + n = btf_vlen(t1); + m1 = btf_params(t1); + m2 = btf_params(t2); + for (i = 0; i < n; i++, m1++, m2++) { + /* ignore func arg names */ + if (!glob_sym_btf_matches(sym_name, exact, btf1, m1->type, btf2, m2->type)) + return false; + } + + /* now check return type as well */ + id1 = t1->type; + id2 = t2->type; + goto recur; + } + + /* skip_mods_and_typedefs() make this impossible */ + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + /* DATASECs are never compared with each other */ + case BTF_KIND_DATASEC: + default: + pr_warn("global '%s': unsupported BTF kind %s\n", + sym_name, btf_kind_str(t1)); + return false; + } +} + +static bool map_defs_match(const char *sym_name, + const struct btf *main_btf, + const struct btf_map_def *main_def, + const struct btf_map_def *main_inner_def, + const struct btf *extra_btf, + const struct btf_map_def *extra_def, + const struct btf_map_def *extra_inner_def) +{ + const char *reason; + + if (main_def->map_type != extra_def->map_type) { + reason = "type"; + goto mismatch; + } + + /* check key type/size match */ + if (main_def->key_size != extra_def->key_size) { + reason = "key_size"; + goto mismatch; + } + if (!!main_def->key_type_id != !!extra_def->key_type_id) { + reason = "key type"; + goto mismatch; + } + if ((main_def->parts & MAP_DEF_KEY_TYPE) + && !glob_sym_btf_matches(sym_name, true /*exact*/, + main_btf, main_def->key_type_id, + extra_btf, extra_def->key_type_id)) { + reason = "key type"; + goto mismatch; + } + + /* validate value type/size match */ + if (main_def->value_size != extra_def->value_size) { + reason = "value_size"; + goto mismatch; + } + if (!!main_def->value_type_id != !!extra_def->value_type_id) { + reason = "value type"; + goto mismatch; + } + if ((main_def->parts & MAP_DEF_VALUE_TYPE) + && !glob_sym_btf_matches(sym_name, true /*exact*/, + main_btf, main_def->value_type_id, + extra_btf, extra_def->value_type_id)) { + reason = "key type"; + goto mismatch; + } + + if (main_def->max_entries != extra_def->max_entries) { + reason = "max_entries"; + goto mismatch; + } + if (main_def->map_flags != extra_def->map_flags) { + reason = "map_flags"; + goto mismatch; + } + if (main_def->numa_node != extra_def->numa_node) { + reason = "numa_node"; + goto mismatch; + } + if (main_def->pinning != extra_def->pinning) { + reason = "pinning"; + goto mismatch; + } + + if ((main_def->parts & MAP_DEF_INNER_MAP) != (extra_def->parts & MAP_DEF_INNER_MAP)) { + reason = "inner map"; + goto mismatch; + } - if (sym->st_shndx && sym->st_shndx < SHN_LORESERVE) { - src_sec = &obj->secs[sym->st_shndx]; - if (src_sec->skipped) + if (main_def->parts & MAP_DEF_INNER_MAP) { + char inner_map_name[128]; + + snprintf(inner_map_name, sizeof(inner_map_name), "%s.inner", sym_name); + + return map_defs_match(inner_map_name, + main_btf, main_inner_def, NULL, + extra_btf, extra_inner_def, NULL); + } + + return true; + +mismatch: + pr_warn("global '%s': map %s mismatch\n", sym_name, reason); + return false; +} + +static bool glob_map_defs_match(const char *sym_name, + struct bpf_linker *linker, struct glob_sym *glob_sym, + struct src_obj *obj, Elf64_Sym *sym, int btf_id) +{ + struct btf_map_def dst_def = {}, dst_inner_def = {}; + struct btf_map_def src_def = {}, src_inner_def = {}; + const struct btf_type *t; + int err; + + t = btf__type_by_id(obj->btf, btf_id); + if (!btf_is_var(t)) { + pr_warn("global '%s': invalid map definition type [%d]\n", sym_name, btf_id); + return false; + } + t = skip_mods_and_typedefs(obj->btf, t->type, NULL); + + err = parse_btf_map_def(sym_name, obj->btf, t, true /*strict*/, &src_def, &src_inner_def); + if (err) { + pr_warn("global '%s': invalid map definition\n", sym_name); + return false; + } + + /* re-parse existing map definition */ + t = btf__type_by_id(linker->btf, glob_sym->btf_id); + t = skip_mods_and_typedefs(linker->btf, t->type, NULL); + err = parse_btf_map_def(sym_name, linker->btf, t, true /*strict*/, &dst_def, &dst_inner_def); + if (err) { + /* this should not happen, because we already validated it */ + pr_warn("global '%s': invalid dst map definition\n", sym_name); + return false; + } + + /* Currently extern map definition has to be complete and match + * concrete map definition exactly. This restriction might be lifted + * in the future. + */ + return map_defs_match(sym_name, linker->btf, &dst_def, &dst_inner_def, + obj->btf, &src_def, &src_inner_def); +} + +static bool glob_syms_match(const char *sym_name, + struct bpf_linker *linker, struct glob_sym *glob_sym, + struct src_obj *obj, Elf64_Sym *sym, size_t sym_idx, int btf_id) +{ + const struct btf_type *src_t; + + /* if we are dealing with externs, BTF types describing both global + * and extern VARs/FUNCs should be completely present in all files + */ + if (!glob_sym->btf_id || !btf_id) { + pr_warn("BTF info is missing for global symbol '%s'\n", sym_name); + return false; + } + + src_t = btf__type_by_id(obj->btf, btf_id); + if (!btf_is_var(src_t) && !btf_is_func(src_t)) { + pr_warn("only extern variables and functions are supported, but got '%s' for '%s'\n", + btf_kind_str(src_t), sym_name); + return false; + } + + /* deal with .maps definitions specially */ + if (glob_sym->sec_id && strcmp(linker->secs[glob_sym->sec_id].sec_name, MAPS_ELF_SEC) == 0) + return glob_map_defs_match(sym_name, linker, glob_sym, obj, sym, btf_id); + + if (!glob_sym_btf_matches(sym_name, true /*exact*/, + linker->btf, glob_sym->btf_id, obj->btf, btf_id)) + return false; + + return true; +} + +static bool btf_is_non_static(const struct btf_type *t) +{ + return (btf_is_var(t) && btf_var(t)->linkage != BTF_VAR_STATIC) + || (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_STATIC); +} + +static int find_glob_sym_btf(struct src_obj *obj, Elf64_Sym *sym, const char *sym_name, + int *out_btf_sec_id, int *out_btf_id) +{ + int i, j, n = btf__get_nr_types(obj->btf), m, btf_id = 0; + const struct btf_type *t; + const struct btf_var_secinfo *vi; + const char *name; + + for (i = 1; i <= n; i++) { + t = btf__type_by_id(obj->btf, i); + + /* some global and extern FUNCs and VARs might not be associated with any + * DATASEC, so try to detect them in the same pass + */ + if (btf_is_non_static(t)) { + name = btf__str_by_offset(obj->btf, t->name_off); + if (strcmp(name, sym_name) != 0) continue; - dst_sec = &linker->secs[src_sec->dst_id]; - /* allow only one STT_SECTION symbol per section */ - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION && dst_sec->sec_sym_idx) { - obj->sym_map[i] = dst_sec->sec_sym_idx; + /* remember and still try to find DATASEC */ + btf_id = i; + continue; + } + + if (!btf_is_datasec(t)) + continue; + + vi = btf_var_secinfos(t); + for (j = 0, m = btf_vlen(t); j < m; j++, vi++) { + t = btf__type_by_id(obj->btf, vi->type); + name = btf__str_by_offset(obj->btf, t->name_off); + + if (strcmp(name, sym_name) != 0) + continue; + if (btf_is_var(t) && btf_var(t)->linkage == BTF_VAR_STATIC) continue; + if (btf_is_func(t) && btf_func_linkage(t) == BTF_FUNC_STATIC) + continue; + + if (btf_id && btf_id != vi->type) { + pr_warn("global/extern '%s' BTF is ambiguous: both types #%d and #%u match\n", + sym_name, btf_id, vi->type); + return -EINVAL; } + + *out_btf_sec_id = i; + *out_btf_id = vi->type; + + return 0; } + } + + /* free-floating extern or global FUNC */ + if (btf_id) { + *out_btf_sec_id = 0; + *out_btf_id = btf_id; + return 0; + } - name_off = strset__add_str(linker->strtab_strs, sym_name); - if (name_off < 0) - return name_off; + pr_warn("failed to find BTF info for global/extern symbol '%s'\n", sym_name); + return -ENOENT; +} - dst_sym = add_new_sym(linker, &dst_sym_idx); - if (!dst_sym) - return -ENOMEM; +static struct src_sec *find_src_sec_by_name(struct src_obj *obj, const char *sec_name) +{ + struct src_sec *sec; + int i; - dst_sym->st_name = name_off; - dst_sym->st_info = sym->st_info; - dst_sym->st_other = sym->st_other; - dst_sym->st_shndx = src_sec ? dst_sec->sec_idx : sym->st_shndx; - dst_sym->st_value = (src_sec ? src_sec->dst_off : 0) + sym->st_value; - dst_sym->st_size = sym->st_size; + for (i = 1; i < obj->sec_cnt; i++) { + sec = &obj->secs[i]; + + if (strcmp(sec->sec_name, sec_name) == 0) + return sec; + } + + return NULL; +} + +static int complete_extern_btf_info(struct btf *dst_btf, int dst_id, + struct btf *src_btf, int src_id) +{ + struct btf_type *dst_t = btf_type_by_id(dst_btf, dst_id); + struct btf_type *src_t = btf_type_by_id(src_btf, src_id); + struct btf_param *src_p, *dst_p; + const char *s; + int i, n, off; + + /* We already made sure that source and destination types (FUNC or + * VAR) match in terms of types and argument names. + */ + if (btf_is_var(dst_t)) { + btf_var(dst_t)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + return 0; + } + + dst_t->info = btf_type_info(BTF_KIND_FUNC, BTF_FUNC_GLOBAL, 0); + + /* now onto FUNC_PROTO types */ + src_t = btf_type_by_id(src_btf, src_t->type); + dst_t = btf_type_by_id(dst_btf, dst_t->type); + + /* Fill in all the argument names, which for extern FUNCs are missing. + * We'll end up with two copies of FUNCs/VARs for externs, but that + * will be taken care of by BTF dedup at the very end. + * It might be that BTF types for extern in one file has less/more BTF + * information (e.g., FWD instead of full STRUCT/UNION information), + * but that should be (in most cases, subject to BTF dedup rules) + * handled and resolved by BTF dedup algorithm as well, so we won't + * worry about it. Our only job is to make sure that argument names + * are populated on both sides, otherwise BTF dedup will pedantically + * consider them different. + */ + src_p = btf_params(src_t); + dst_p = btf_params(dst_t); + for (i = 0, n = btf_vlen(dst_t); i < n; i++, src_p++, dst_p++) { + if (!src_p->name_off) + continue; + + /* src_btf has more complete info, so add name to dst_btf */ + s = btf__str_by_offset(src_btf, src_p->name_off); + off = btf__add_str(dst_btf, s); + if (off < 0) + return off; + dst_p->name_off = off; + } + return 0; +} + +static void sym_update_bind(Elf64_Sym *sym, int sym_bind) +{ + sym->st_info = ELF64_ST_INFO(sym_bind, ELF64_ST_TYPE(sym->st_info)); +} + +static void sym_update_type(Elf64_Sym *sym, int sym_type) +{ + sym->st_info = ELF64_ST_INFO(ELF64_ST_BIND(sym->st_info), sym_type); +} + +static void sym_update_visibility(Elf64_Sym *sym, int sym_vis) +{ + /* libelf doesn't provide setters for ST_VISIBILITY, + * but it is stored in the lower 2 bits of st_other + */ + sym->st_other &= 0x03; + sym->st_other |= sym_vis; +} + +static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, + Elf64_Sym *sym, const char *sym_name, int src_sym_idx) +{ + struct src_sec *src_sec = NULL; + struct dst_sec *dst_sec = NULL; + struct glob_sym *glob_sym = NULL; + int name_off, sym_type, sym_bind, sym_vis, err; + int btf_sec_id = 0, btf_id = 0; + size_t dst_sym_idx; + Elf64_Sym *dst_sym; + bool sym_is_extern; + + sym_type = ELF64_ST_TYPE(sym->st_info); + sym_bind = ELF64_ST_BIND(sym->st_info); + sym_vis = ELF64_ST_VISIBILITY(sym->st_other); + sym_is_extern = sym->st_shndx == SHN_UNDEF; + + if (sym_is_extern) { + if (!obj->btf) { + pr_warn("externs without BTF info are not supported\n"); + return -ENOTSUP; + } + } else if (sym->st_shndx < SHN_LORESERVE) { + src_sec = &obj->secs[sym->st_shndx]; + if (src_sec->skipped) + return 0; + dst_sec = &linker->secs[src_sec->dst_id]; + + /* allow only one STT_SECTION symbol per section */ + if (sym_type == STT_SECTION && dst_sec->sec_sym_idx) { + obj->sym_map[src_sym_idx] = dst_sec->sec_sym_idx; + return 0; + } + } + + if (sym_bind == STB_LOCAL) + goto add_sym; + + /* find matching BTF info */ + err = find_glob_sym_btf(obj, sym, sym_name, &btf_sec_id, &btf_id); + if (err) + return err; + + if (sym_is_extern && btf_sec_id) { + const char *sec_name = NULL; + const struct btf_type *t; + + t = btf__type_by_id(obj->btf, btf_sec_id); + sec_name = btf__str_by_offset(obj->btf, t->name_off); + + /* Clang puts unannotated extern vars into + * '.extern' BTF DATASEC. Treat them the same + * as unannotated extern funcs (which are + * currently not put into any DATASECs). + * Those don't have associated src_sec/dst_sec. + */ + if (strcmp(sec_name, BTF_EXTERN_SEC) != 0) { + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("failed to find matching ELF sec '%s'\n", sec_name); + return -ENOENT; + } + dst_sec = &linker->secs[src_sec->dst_id]; + } + } + + glob_sym = find_glob_sym(linker, sym_name); + if (glob_sym) { + /* Preventively resolve to existing symbol. This is + * needed for further relocation symbol remapping in + * the next step of linking. + */ + obj->sym_map[src_sym_idx] = glob_sym->sym_idx; + + /* If both symbols are non-externs, at least one of + * them has to be STB_WEAK, otherwise they are in + * a conflict with each other. + */ + if (!sym_is_extern && !glob_sym->is_extern + && !glob_sym->is_weak && sym_bind != STB_WEAK) { + pr_warn("conflicting non-weak symbol #%d (%s) definition in '%s'\n", + src_sym_idx, sym_name, obj->filename); + return -EINVAL; + } - obj->sym_map[i] = dst_sym_idx; + if (!glob_syms_match(sym_name, linker, glob_sym, obj, sym, src_sym_idx, btf_id)) + return -EINVAL; + + dst_sym = get_sym_by_idx(linker, glob_sym->sym_idx); - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION && dst_sym) { - dst_sec->sec_sym_idx = dst_sym_idx; - dst_sym->st_value = 0; + /* If new symbol is strong, then force dst_sym to be strong as + * well; this way a mix of weak and non-weak extern + * definitions will end up being strong. + */ + if (sym_bind == STB_GLOBAL) { + /* We still need to preserve type (NOTYPE or + * OBJECT/FUNC, depending on whether the symbol is + * extern or not) + */ + sym_update_bind(dst_sym, STB_GLOBAL); + glob_sym->is_weak = false; } + /* Non-default visibility is "contaminating", with stricter + * visibility overwriting more permissive ones, even if more + * permissive visibility comes from just an extern definition. + * Currently only STV_DEFAULT and STV_HIDDEN are allowed and + * ensured by ELF symbol sanity checks above. + */ + if (sym_vis > ELF64_ST_VISIBILITY(dst_sym->st_other)) + sym_update_visibility(dst_sym, sym_vis); + + /* If the new symbol is extern, then regardless if + * existing symbol is extern or resolved global, just + * keep the existing one untouched. + */ + if (sym_is_extern) + return 0; + + /* If existing symbol is a strong resolved symbol, bail out, + * because we lost resolution battle have nothing to + * contribute. We already checked abover that there is no + * strong-strong conflict. We also already tightened binding + * and visibility, so nothing else to contribute at that point. + */ + if (!glob_sym->is_extern && sym_bind == STB_WEAK) + return 0; + + /* At this point, new symbol is strong non-extern, + * so overwrite glob_sym with new symbol information. + * Preserve binding and visibility. + */ + sym_update_type(dst_sym, sym_type); + dst_sym->st_shndx = dst_sec->sec_idx; + dst_sym->st_value = src_sec->dst_off + sym->st_value; + dst_sym->st_size = sym->st_size; + + /* see comment below about dst_sec->id vs dst_sec->sec_idx */ + glob_sym->sec_id = dst_sec->id; + glob_sym->is_extern = false; + + if (complete_extern_btf_info(linker->btf, glob_sym->btf_id, + obj->btf, btf_id)) + return -EINVAL; + + /* request updating VAR's/FUNC's underlying BTF type when appending BTF type */ + glob_sym->underlying_btf_id = 0; + + obj->sym_map[src_sym_idx] = glob_sym->sym_idx; + return 0; + } + +add_sym: + name_off = strset__add_str(linker->strtab_strs, sym_name); + if (name_off < 0) + return name_off; + + dst_sym = add_new_sym(linker, &dst_sym_idx); + if (!dst_sym) + return -ENOMEM; + + dst_sym->st_name = name_off; + dst_sym->st_info = sym->st_info; + dst_sym->st_other = sym->st_other; + dst_sym->st_shndx = dst_sec ? dst_sec->sec_idx : sym->st_shndx; + dst_sym->st_value = (src_sec ? src_sec->dst_off : 0) + sym->st_value; + dst_sym->st_size = sym->st_size; + + obj->sym_map[src_sym_idx] = dst_sym_idx; + + if (sym_type == STT_SECTION && dst_sym) { + dst_sec->sec_sym_idx = dst_sym_idx; + dst_sym->st_value = 0; + } + + if (sym_bind != STB_LOCAL) { + glob_sym = add_glob_sym(linker); + if (!glob_sym) + return -ENOMEM; + + glob_sym->sym_idx = dst_sym_idx; + /* we use dst_sec->id (and not dst_sec->sec_idx), because + * ephemeral sections (.kconfig, .ksyms, etc) don't have + * sec_idx (as they don't have corresponding ELF section), but + * still have id. .extern doesn't have even ephemeral section + * associated with it, so dst_sec->id == dst_sec->sec_idx == 0. + */ + glob_sym->sec_id = dst_sec ? dst_sec->id : 0; + glob_sym->name_off = name_off; + /* we will fill btf_id in during BTF merging step */ + glob_sym->btf_id = 0; + glob_sym->is_extern = sym_is_extern; + glob_sym->is_weak = sym_bind == STB_WEAK; } return 0; @@ -1200,7 +2014,7 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob return err; } } else if (!secs_match(dst_sec, src_sec)) { - pr_warn("Secs %s are not compatible\n", src_sec->sec_name); + pr_warn("sections %s are not compatible\n", src_sec->sec_name); return -1; } @@ -1212,7 +2026,7 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob dst_sec->shdr->sh_info = dst_linked_sec->sec_idx; src_sec->dst_id = dst_sec->id; - err = extend_sec(dst_sec, src_sec); + err = extend_sec(linker, dst_sec, src_sec); if (err) return err; @@ -1265,21 +2079,6 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob return 0; } -static struct src_sec *find_src_sec_by_name(struct src_obj *obj, const char *sec_name) -{ - struct src_sec *sec; - int i; - - for (i = 1; i < obj->sec_cnt; i++) { - sec = &obj->secs[i]; - - if (strcmp(sec->sec_name, sec_name) == 0) - return sec; - } - - return NULL; -} - static Elf64_Sym *find_sym_by_name(struct src_obj *obj, size_t sec_idx, int sym_type, const char *sym_name) { @@ -1334,12 +2133,32 @@ static int linker_fixup_btf(struct src_obj *obj) t->size = sec->shdr->sh_size; } else { /* BTF can have some sections that are not represented - * in ELF, e.g., .kconfig and .ksyms, which are used - * for special extern variables. Here we'll - * pre-create "section shells" for them to be able to - * keep track of extra per-section metadata later - * (e.g., BTF variables). + * in ELF, e.g., .kconfig, .ksyms, .extern, which are used + * for special extern variables. + * + * For all but one such special (ephemeral) + * sections, we pre-create "section shells" to be able + * to keep track of extra per-section metadata later + * (e.g., those BTF extern variables). + * + * .extern is even more special, though, because it + * contains extern variables that need to be resolved + * by static linker, not libbpf and kernel. When such + * externs are resolved, we are going to remove them + * from .extern BTF section and might end up not + * needing it at all. Each resolved extern should have + * matching non-extern VAR/FUNC in other sections. + * + * We do support leaving some of the externs + * unresolved, though, to support cases of building + * libraries, which will later be linked against final + * BPF applications. So if at finalization we still + * see unresolved externs, we'll create .extern + * section on our own. */ + if (strcmp(sec_name, BTF_EXTERN_SEC) == 0) + continue; + sec = add_src_sec(obj, sec_name); if (!sec) return -ENOMEM; @@ -1379,6 +2198,13 @@ static int linker_fixup_btf(struct src_obj *obj) static int remap_type_id(__u32 *type_id, void *ctx) { int *id_map = ctx; + int new_id = id_map[*type_id]; + + /* Error out if the type wasn't remapped. Ignore VOID which stays VOID. */ + if (new_id == 0 && *type_id != 0) { + pr_warn("failed to find new ID mapping for original BTF type ID %u\n", *type_id); + return -EINVAL; + } *type_id = id_map[*type_id]; @@ -1389,6 +2215,7 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) { const struct btf_type *t; int i, j, n, start_id, id; + const char *name; if (!obj->btf) return 0; @@ -1401,12 +2228,44 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) return -ENOMEM; for (i = 1; i <= n; i++) { + struct glob_sym *glob_sym = NULL; + t = btf__type_by_id(obj->btf, i); /* DATASECs are handled specially below */ if (btf_kind(t) == BTF_KIND_DATASEC) continue; + if (btf_is_non_static(t)) { + /* there should be glob_sym already */ + name = btf__str_by_offset(obj->btf, t->name_off); + glob_sym = find_glob_sym(linker, name); + + /* VARs without corresponding glob_sym are those that + * belong to skipped/deduplicated sections (i.e., + * license and version), so just skip them + */ + if (!glob_sym) + continue; + + /* linker_append_elf_sym() might have requested + * updating underlying type ID, if extern was resolved + * to strong symbol or weak got upgraded to non-weak + */ + if (glob_sym->underlying_btf_id == 0) + glob_sym->underlying_btf_id = -t->type; + + /* globals from previous object files that match our + * VAR/FUNC already have a corresponding associated + * BTF type, so just make sure to use it + */ + if (glob_sym->btf_id) { + /* reuse existing BTF type for global var/func */ + obj->btf_type_map[i] = glob_sym->btf_id; + continue; + } + } + id = btf__add_type(linker->btf, obj->btf, t); if (id < 0) { pr_warn("failed to append BTF type #%d from file '%s'\n", i, obj->filename); @@ -1414,6 +2273,12 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) } obj->btf_type_map[i] = id; + + /* record just appended BTF type for var/func */ + if (glob_sym) { + glob_sym->btf_id = id; + glob_sym->underlying_btf_id = -t->type; + } } /* remap all the types except DATASECs */ @@ -1425,6 +2290,22 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) return -EINVAL; } + /* Rewrite VAR/FUNC underlying types (i.e., FUNC's FUNC_PROTO and VAR's + * actual type), if necessary + */ + for (i = 0; i < linker->glob_sym_cnt; i++) { + struct glob_sym *glob_sym = &linker->glob_syms[i]; + struct btf_type *glob_t; + + if (glob_sym->underlying_btf_id >= 0) + continue; + + glob_sym->underlying_btf_id = obj->btf_type_map[-glob_sym->underlying_btf_id]; + + glob_t = btf_type_by_id(linker->btf, glob_sym->btf_id); + glob_t->type = glob_sym->underlying_btf_id; + } + /* append DATASEC info */ for (i = 1; i < obj->sec_cnt; i++) { struct src_sec *src_sec; @@ -1452,6 +2333,42 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) n = btf_vlen(t); for (j = 0; j < n; j++, src_var++) { void *sec_vars = dst_sec->sec_vars; + int new_id = obj->btf_type_map[src_var->type]; + struct glob_sym *glob_sym = NULL; + + t = btf_type_by_id(linker->btf, new_id); + if (btf_is_non_static(t)) { + name = btf__str_by_offset(linker->btf, t->name_off); + glob_sym = find_glob_sym(linker, name); + if (glob_sym->sec_id != dst_sec->id) { + pr_warn("global '%s': section mismatch %d vs %d\n", + name, glob_sym->sec_id, dst_sec->id); + return -EINVAL; + } + } + + /* If there is already a member (VAR or FUNC) mapped + * to the same type, don't add a duplicate entry. + * This will happen when multiple object files define + * the same extern VARs/FUNCs. + */ + if (glob_sym && glob_sym->var_idx >= 0) { + __s64 sz; + + dst_var = &dst_sec->sec_vars[glob_sym->var_idx]; + /* Because underlying BTF type might have + * changed, so might its size have changed, so + * re-calculate and update it in sec_var. + */ + sz = btf__resolve_size(linker->btf, glob_sym->underlying_btf_id); + if (sz < 0) { + pr_warn("global '%s': failed to resolve size of underlying type: %d\n", + name, (int)sz); + return -EINVAL; + } + dst_var->size = sz; + continue; + } sec_vars = libbpf_reallocarray(sec_vars, dst_sec->sec_var_cnt + 1, @@ -1466,6 +2383,9 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) dst_var->type = obj->btf_type_map[src_var->type]; dst_var->size = src_var->size; dst_var->offset = src_sec->dst_off + src_var->offset; + + if (glob_sym) + glob_sym->var_idx = dst_sec->sec_var_cnt - 1; } } @@ -1895,7 +2815,7 @@ static int finalize_btf_ext(struct bpf_linker *linker) hdr->func_info_len = funcs_sz; hdr->line_info_off = funcs_sz; hdr->line_info_len = lines_sz; - hdr->core_relo_off = funcs_sz + lines_sz;; + hdr->core_relo_off = funcs_sz + lines_sz; hdr->core_relo_len = core_relos_sz; if (funcs_sz) { diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index a402f32a145c..91130648d8e6 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -39,8 +39,6 @@ EXTRA_WARNINGS += -Wundef EXTRA_WARNINGS += -Wwrite-strings EXTRA_WARNINGS += -Wformat -CC_NO_CLANG := $(shell $(CC) -dM -E -x c /dev/null | grep -Fq "__clang__"; echo $$?) - # Makefiles suck: This macro sets a default value of $(2) for the # variable named by $(1), unless the variable has been set by # environment or command line. This is necessary for CC and AR @@ -52,12 +50,22 @@ define allow-override $(eval $(1) = $(2))) endef +ifneq ($(LLVM),) +$(call allow-override,CC,clang) +$(call allow-override,AR,llvm-ar) +$(call allow-override,LD,ld.lld) +$(call allow-override,CXX,clang++) +$(call allow-override,STRIP,llvm-strip) +else # Allow setting various cross-compile vars or setting CROSS_COMPILE as a prefix. $(call allow-override,CC,$(CROSS_COMPILE)gcc) $(call allow-override,AR,$(CROSS_COMPILE)ar) $(call allow-override,LD,$(CROSS_COMPILE)ld) $(call allow-override,CXX,$(CROSS_COMPILE)g++) $(call allow-override,STRIP,$(CROSS_COMPILE)strip) +endif + +CC_NO_CLANG := $(shell $(CC) -dM -E -x c /dev/null | grep -Fq "__clang__"; echo $$?) ifneq ($(LLVM),) HOSTAR ?= llvm-ar diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 6448c626498f..283e5ad8385e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -21,13 +21,18 @@ endif BPF_GCC ?= $(shell command -v bpf-gcc;) SAN_CFLAGS ?= -CFLAGS += -g -Og -rdynamic -Wall $(GENFLAGS) $(SAN_CFLAGS) \ +CFLAGS += -g -O0 -rdynamic -Wall $(GENFLAGS) $(SAN_CFLAGS) \ -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT) \ -Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_load_program=bpf_test_load_program LDLIBS += -lcap -lelf -lz -lrt -lpthread +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_verifier_log test_dev_cgroup \ @@ -182,7 +187,6 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) cp $(SCRATCH_DIR)/runqslower $@ $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ) -$(TEST_GEN_FILES): docs $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c $(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c @@ -201,10 +205,12 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ CC=$(HOSTCC) LD=$(HOSTLD) \ - EXTRA_CFLAGS='-g -Og' \ + EXTRA_CFLAGS='-g -O0' \ OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install +all: docs + docs: $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras) \ -f Makefile.docs \ @@ -219,7 +225,7 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ | $(INCLUDE_DIR) $(BUILD_DIR)/libbpf $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \ - EXTRA_CFLAGS='-g -Og' \ + EXTRA_CFLAGS='-g -O0' \ DESTDIR=$(SCRATCH_DIR) prefix= all install_headers ifneq ($(BPFOBJ),$(HOST_BPFOBJ)) @@ -227,7 +233,7 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ | $(INCLUDE_DIR) $(HOST_BUILD_DIR)/libbpf $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) \ - EXTRA_CFLAGS='-g -Og' \ + EXTRA_CFLAGS='-g -O0' \ OUTPUT=$(HOST_BUILD_DIR)/libbpf/ CC=$(HOSTCC) LD=$(HOSTLD) \ DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers endif @@ -303,9 +309,15 @@ endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c -LINKED_SKELS := test_static_linked.skel.h +LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ + linked_vars.skel.h linked_maps.skel.h test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o +linked_funcs.skel.h-deps := linked_funcs1.o linked_funcs2.o +linked_vars.skel.h-deps := linked_vars1.o linked_vars2.o +linked_maps.skel.h-deps := linked_maps1.o linked_maps2.o + +LINKED_BPF_SRCS := $(patsubst %.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. @@ -325,7 +337,7 @@ TRUNNER_TESTS_HDR := $(TRUNNER_TESTS_DIR)/tests.h TRUNNER_BPF_SRCS := $$(notdir $$(wildcard $(TRUNNER_BPF_PROGS_DIR)/*.c)) TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS)) TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ - $$(filter-out $(SKEL_BLACKLIST), \ + $$(filter-out $(SKEL_BLACKLIST) $(LINKED_BPF_SRCS),\ $$(TRUNNER_BPF_SRCS))) TRUNNER_BPF_SKELS_LINKED := $$(addprefix $$(TRUNNER_OUTPUT)/,$(LINKED_SKELS)) TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS) @@ -481,7 +493,7 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT) # Make sure we are able to include and link libbpf against c++. $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) $(call msg,CXX,,$@) - $(Q)$(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@ + $(Q)$(CXX) $(CFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ # Benchmark runner $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 65fe318d1e71..3353778c30f8 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -193,3 +193,12 @@ Without it, the error from compiling bpf selftests looks like: libbpf: failed to find BTF for extern 'tcp_slow_start' [25] section: -2 __ https://reviews.llvm.org/D93563 + +Clang dependencies for static linking tests +=========================================== + +linked_vars, linked_maps, and linked_funcs tests depend on `Clang fix`__ to +generate valid BTF information for weak variables. Please make sure you use +Clang that contains the fix. + +__ https://reviews.llvm.org/D100362 diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 37e1f303fc11..5192305159ec 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -44,3 +44,5 @@ CONFIG_SECURITYFS=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_READ_POLICY=y CONFIG_BLK_DEV_LOOP=y +CONFIG_FUNCTION_TRACER=y +CONFIG_DYNAMIC_FTRACE=y diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 74c45d557a2b..2d3590cfb5e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -147,6 +147,7 @@ static void test_task_stack(void) return; do_dummy_read(skel->progs.dump_task_stack); + do_dummy_read(skel->progs.get_task_user_stacks); bpf_iter_task_stack__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c index 5c0448910426..63990842d20f 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c @@ -58,42 +58,73 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, test_cb cb) { struct bpf_object *obj = NULL, *tgt_obj; + __u32 retval, tgt_prog_id, info_len; + struct bpf_prog_info prog_info = {}; struct bpf_program **prog = NULL; struct bpf_link **link = NULL; - __u32 duration = 0, retval; int err, tgt_fd, i; + struct btf *btf; err = bpf_prog_load(target_obj_file, BPF_PROG_TYPE_UNSPEC, &tgt_obj, &tgt_fd); - if (CHECK(err, "tgt_prog_load", "file %s err %d errno %d\n", - target_obj_file, err, errno)) + if (!ASSERT_OK(err, "tgt_prog_load")) return; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, .attach_prog_fd = tgt_fd, ); + info_len = sizeof(prog_info); + err = bpf_obj_get_info_by_fd(tgt_fd, &prog_info, &info_len); + if (!ASSERT_OK(err, "tgt_fd_get_info")) + goto close_prog; + + tgt_prog_id = prog_info.id; + btf = bpf_object__btf(tgt_obj); + link = calloc(sizeof(struct bpf_link *), prog_cnt); + if (!ASSERT_OK_PTR(link, "link_ptr")) + goto close_prog; + prog = calloc(sizeof(struct bpf_program *), prog_cnt); - if (CHECK(!link || !prog, "alloc_memory", "failed to alloc memory")) + if (!ASSERT_OK_PTR(prog, "prog_ptr")) goto close_prog; obj = bpf_object__open_file(obj_file, &opts); - if (CHECK(IS_ERR_OR_NULL(obj), "obj_open", - "failed to open %s: %ld\n", obj_file, - PTR_ERR(obj))) + if (!ASSERT_OK_PTR(obj, "obj_open")) goto close_prog; err = bpf_object__load(obj); - if (CHECK(err, "obj_load", "err %d\n", err)) + if (!ASSERT_OK(err, "obj_load")) goto close_prog; for (i = 0; i < prog_cnt; i++) { + struct bpf_link_info link_info; + char *tgt_name; + __s32 btf_id; + + tgt_name = strstr(prog_name[i], "/"); + if (!ASSERT_OK_PTR(tgt_name, "tgt_name")) + goto close_prog; + btf_id = btf__find_by_name_kind(btf, tgt_name + 1, BTF_KIND_FUNC); + prog[i] = bpf_object__find_program_by_title(obj, prog_name[i]); - if (CHECK(!prog[i], "find_prog", "prog %s not found\n", prog_name[i])) + if (!ASSERT_OK_PTR(prog[i], prog_name[i])) goto close_prog; + link[i] = bpf_program__attach_trace(prog[i]); - if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n")) + if (!ASSERT_OK_PTR(link[i], "attach_trace")) goto close_prog; + + info_len = sizeof(link_info); + memset(&link_info, 0, sizeof(link_info)); + err = bpf_obj_get_info_by_fd(bpf_link__fd(link[i]), + &link_info, &info_len); + ASSERT_OK(err, "link_fd_get_info"); + ASSERT_EQ(link_info.tracing.attach_type, + bpf_program__get_expected_attach_type(prog[i]), + "link_attach_type"); + ASSERT_EQ(link_info.tracing.target_obj_id, tgt_prog_id, "link_tgt_obj_id"); + ASSERT_EQ(link_info.tracing.target_btf_id, btf_id, "link_tgt_btf_id"); } if (cb) { @@ -106,10 +137,9 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, goto close_prog; err = bpf_prog_test_run(tgt_fd, 1, &pkt_v6, sizeof(pkt_v6), - NULL, NULL, &retval, &duration); - CHECK(err || retval, "ipv6", - "err %d errno %d retval %d duration %d\n", - err, errno, retval, duration); + NULL, NULL, &retval, NULL); + ASSERT_OK(err, "prog_run"); + ASSERT_EQ(retval, 0, "prog_run_ret"); if (check_data_map(obj, prog_cnt, false)) goto close_prog; diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c index 6c4d42a2386f..ccc7e8a34ab6 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c @@ -39,7 +39,7 @@ void test_fexit_sleep(void) goto cleanup; cpid = clone(do_sleep, child_stack + STACK_SIZE, CLONE_FILES | SIGCHLD, fexit_skel); - if (CHECK(cpid == -1, "clone", strerror(errno))) + if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno))) goto cleanup; /* wait until first sys_nanosleep ends and second sys_nanosleep starts */ @@ -65,7 +65,7 @@ void test_fexit_sleep(void) /* kill the thread to unwind sys_nanosleep stack through the trampoline */ kill(cpid, 9); - if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", strerror(errno))) + if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", "%s\n", strerror(errno))) goto cleanup; if (CHECK(WEXITSTATUS(wstatus) != 0, "exitstatus", "failed")) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/linked_funcs.c b/tools/testing/selftests/bpf/prog_tests/linked_funcs.c new file mode 100644 index 000000000000..e9916f2817ec --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/linked_funcs.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <test_progs.h> +#include <sys/syscall.h> +#include "linked_funcs.skel.h" + +void test_linked_funcs(void) +{ + int err; + struct linked_funcs *skel; + + skel = linked_funcs__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->rodata->my_tid = syscall(SYS_gettid); + skel->bss->syscall_id = SYS_getpgid; + + err = linked_funcs__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = linked_funcs__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger */ + syscall(SYS_getpgid); + + ASSERT_EQ(skel->bss->output_val1, 2000 + 2000, "output_val1"); + ASSERT_EQ(skel->bss->output_ctx1, SYS_getpgid, "output_ctx1"); + ASSERT_EQ(skel->bss->output_weak1, 42, "output_weak1"); + + ASSERT_EQ(skel->bss->output_val2, 2 * 1000 + 2 * (2 * 1000), "output_val2"); + ASSERT_EQ(skel->bss->output_ctx2, SYS_getpgid, "output_ctx2"); + /* output_weak2 should never be updated */ + ASSERT_EQ(skel->bss->output_weak2, 0, "output_weak2"); + +cleanup: + linked_funcs__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/linked_maps.c b/tools/testing/selftests/bpf/prog_tests/linked_maps.c new file mode 100644 index 000000000000..85dcaaaf2775 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/linked_maps.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <test_progs.h> +#include <sys/syscall.h> +#include "linked_maps.skel.h" + +void test_linked_maps(void) +{ + int err; + struct linked_maps *skel; + + skel = linked_maps__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + err = linked_maps__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger */ + syscall(SYS_getpgid); + + ASSERT_EQ(skel->bss->output_first1, 2000, "output_first1"); + ASSERT_EQ(skel->bss->output_second1, 2, "output_second1"); + ASSERT_EQ(skel->bss->output_weak1, 2, "output_weak1"); + +cleanup: + linked_maps__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/linked_vars.c b/tools/testing/selftests/bpf/prog_tests/linked_vars.c new file mode 100644 index 000000000000..267166abe4c1 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/linked_vars.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <test_progs.h> +#include <sys/syscall.h> +#include "linked_vars.skel.h" + +void test_linked_vars(void) +{ + int err; + struct linked_vars *skel; + + skel = linked_vars__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->bss->input_bss1 = 1000; + skel->bss->input_bss2 = 2000; + skel->bss->input_bss_weak = 3000; + + err = linked_vars__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = linked_vars__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger */ + syscall(SYS_getpgid); + + ASSERT_EQ(skel->bss->output_bss1, 1000 + 2000 + 3000, "output_bss1"); + ASSERT_EQ(skel->bss->output_bss2, 1000 + 2000 + 3000, "output_bss2"); + /* 10 comes from "winner" input_data_weak in first obj file */ + ASSERT_EQ(skel->bss->output_data1, 1 + 2 + 10, "output_bss1"); + ASSERT_EQ(skel->bss->output_data2, 1 + 2 + 10, "output_bss2"); + /* 100 comes from "winner" input_rodata_weak in first obj file */ + ASSERT_EQ(skel->bss->output_rodata1, 11 + 22 + 100, "output_weak1"); + ASSERT_EQ(skel->bss->output_rodata2, 11 + 22 + 100, "output_weak2"); + +cleanup: + linked_vars__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/map_ptr.c b/tools/testing/selftests/bpf/prog_tests/map_ptr.c index c230a573c373..4972f92205c7 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_ptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_ptr.c @@ -12,11 +12,22 @@ void test_map_ptr(void) __u32 duration = 0, retval; char buf[128]; int err; + int page_size = getpagesize(); - skel = map_ptr_kern__open_and_load(); - if (CHECK(!skel, "skel_open_load", "open_load failed\n")) + skel = map_ptr_kern__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) return; + err = bpf_map__set_max_entries(skel->maps.m_ringbuf, page_size); + if (!ASSERT_OK(err, "bpf_map__set_max_entries")) + goto cleanup; + + err = map_ptr_kern__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + skel->bss->page_size = page_size; + err = bpf_prog_test_run(bpf_program__fd(skel->progs.cg_skb), 1, &pkt_v4, sizeof(pkt_v4), buf, NULL, &retval, NULL); diff --git a/tools/testing/selftests/bpf/prog_tests/mmap.c b/tools/testing/selftests/bpf/prog_tests/mmap.c index 9c3c5c0f068f..37b002ca1167 100644 --- a/tools/testing/selftests/bpf/prog_tests/mmap.c +++ b/tools/testing/selftests/bpf/prog_tests/mmap.c @@ -29,22 +29,36 @@ void test_mmap(void) struct test_mmap *skel; __u64 val = 0; - skel = test_mmap__open_and_load(); - if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) + skel = test_mmap__open(); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; + err = bpf_map__set_max_entries(skel->maps.rdonly_map, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + /* at least 4 pages of data */ + err = bpf_map__set_max_entries(skel->maps.data_map, + 4 * (page_size / sizeof(u64))); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = test_mmap__load(skel); + if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) + goto cleanup; + bss_map = skel->maps.bss; data_map = skel->maps.data_map; data_map_fd = bpf_map__fd(data_map); rdmap_fd = bpf_map__fd(skel->maps.rdonly_map); - tmp1 = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, rdmap_fd, 0); + tmp1 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, rdmap_fd, 0); if (CHECK(tmp1 != MAP_FAILED, "rdonly_write_mmap", "unexpected success\n")) { - munmap(tmp1, 4096); + munmap(tmp1, page_size); goto cleanup; } /* now double-check if it's mmap()'able at all */ - tmp1 = mmap(NULL, 4096, PROT_READ, MAP_SHARED, rdmap_fd, 0); + tmp1 = mmap(NULL, page_size, PROT_READ, MAP_SHARED, rdmap_fd, 0); if (CHECK(tmp1 == MAP_FAILED, "rdonly_read_mmap", "failed: %d\n", errno)) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index 31a3114906e2..2535788e135f 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -68,10 +68,10 @@ static void test_ns_current_pid_tgid_new_ns(void) cpid = clone(test_current_pid_tgid, child_stack + STACK_SIZE, CLONE_NEWPID | SIGCHLD, NULL); - if (CHECK(cpid == -1, "clone", strerror(errno))) + if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno))) return; - if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", strerror(errno))) + if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", "%s\n", strerror(errno))) return; if (CHECK(WEXITSTATUS(wstatus) != 0, "newns_pidtgid", "failed")) diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index fddbc5db5d6a..de78617f6550 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -87,11 +87,20 @@ void test_ringbuf(void) pthread_t thread; long bg_ret = -1; int err, cnt; + int page_size = getpagesize(); - skel = test_ringbuf__open_and_load(); - if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + skel = test_ringbuf__open(); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; + err = bpf_map__set_max_entries(skel->maps.ringbuf, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = test_ringbuf__load(skel); + if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) + goto cleanup; + /* only trigger BPF program for current process */ skel->bss->pid = getpid(); @@ -110,9 +119,9 @@ void test_ringbuf(void) CHECK(skel->bss->avail_data != 3 * rec_sz, "err_avail_size", "exp %ld, got %ld\n", 3L * rec_sz, skel->bss->avail_data); - CHECK(skel->bss->ring_size != 4096, + CHECK(skel->bss->ring_size != page_size, "err_ring_size", "exp %ld, got %ld\n", - 4096L, skel->bss->ring_size); + (long)page_size, skel->bss->ring_size); CHECK(skel->bss->cons_pos != 0, "err_cons_pos", "exp %ld, got %ld\n", 0L, skel->bss->cons_pos); diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c index d37161e59bb2..cef63e703924 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c @@ -41,13 +41,42 @@ static int process_sample(void *ctx, void *data, size_t len) void test_ringbuf_multi(void) { struct test_ringbuf_multi *skel; - struct ring_buffer *ringbuf; + struct ring_buffer *ringbuf = NULL; int err; + int page_size = getpagesize(); + int proto_fd = -1; - skel = test_ringbuf_multi__open_and_load(); - if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + skel = test_ringbuf_multi__open(); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; + err = bpf_map__set_max_entries(skel->maps.ringbuf1, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = bpf_map__set_max_entries(skel->maps.ringbuf2, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = bpf_map__set_max_entries(bpf_map__inner_map(skel->maps.ringbuf_arr), page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + proto_fd = bpf_create_map(BPF_MAP_TYPE_RINGBUF, 0, 0, page_size, 0); + if (CHECK(proto_fd == -1, "bpf_create_map", "bpf_create_map failed\n")) + goto cleanup; + + err = bpf_map__set_inner_map_fd(skel->maps.ringbuf_hash, proto_fd); + if (CHECK(err != 0, "bpf_map__set_inner_map_fd", "bpf_map__set_inner_map_fd failed\n")) + goto cleanup; + + err = test_ringbuf_multi__load(skel); + if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) + goto cleanup; + + close(proto_fd); + proto_fd = -1; + /* only trigger BPF program for current process */ skel->bss->pid = getpid(); @@ -97,6 +126,8 @@ void test_ringbuf_multi(void) 2L, skel->bss->total); cleanup: + if (proto_fd >= 0) + close(proto_fd); ring_buffer__free(ringbuf); test_ringbuf_multi__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/snprintf.c b/tools/testing/selftests/bpf/prog_tests/snprintf.c new file mode 100644 index 000000000000..a958c22aec75 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/snprintf.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google LLC. */ + +#include <test_progs.h> +#include "test_snprintf.skel.h" +#include "test_snprintf_single.skel.h" + +#define EXP_NUM_OUT "-8 9 96 -424242 1337 DABBAD00" +#define EXP_NUM_RET sizeof(EXP_NUM_OUT) + +#define EXP_IP_OUT "127.000.000.001 0000:0000:0000:0000:0000:0000:0000:0001" +#define EXP_IP_RET sizeof(EXP_IP_OUT) + +/* The third specifier, %pB, depends on compiler inlining so don't check it */ +#define EXP_SYM_OUT "schedule schedule+0x0/" +#define MIN_SYM_RET sizeof(EXP_SYM_OUT) + +/* The third specifier, %p, is a hashed pointer which changes on every reboot */ +#define EXP_ADDR_OUT "0000000000000000 ffff00000add4e55 " +#define EXP_ADDR_RET sizeof(EXP_ADDR_OUT "unknownhashedptr") + +#define EXP_STR_OUT "str1 longstr" +#define EXP_STR_RET sizeof(EXP_STR_OUT) + +#define EXP_OVER_OUT "%over" +#define EXP_OVER_RET 10 + +#define EXP_PAD_OUT " 4 000" +#define EXP_PAD_RET 900007 + +#define EXP_NO_ARG_OUT "simple case" +#define EXP_NO_ARG_RET 12 + +#define EXP_NO_BUF_RET 29 + +void test_snprintf_positive(void) +{ + char exp_addr_out[] = EXP_ADDR_OUT; + char exp_sym_out[] = EXP_SYM_OUT; + struct test_snprintf *skel; + + skel = test_snprintf__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + if (!ASSERT_OK(test_snprintf__attach(skel), "skel_attach")) + goto cleanup; + + /* trigger tracepoint */ + usleep(1); + + ASSERT_STREQ(skel->bss->num_out, EXP_NUM_OUT, "num_out"); + ASSERT_EQ(skel->bss->num_ret, EXP_NUM_RET, "num_ret"); + + ASSERT_STREQ(skel->bss->ip_out, EXP_IP_OUT, "ip_out"); + ASSERT_EQ(skel->bss->ip_ret, EXP_IP_RET, "ip_ret"); + + ASSERT_OK(memcmp(skel->bss->sym_out, exp_sym_out, + sizeof(exp_sym_out) - 1), "sym_out"); + ASSERT_LT(MIN_SYM_RET, skel->bss->sym_ret, "sym_ret"); + + ASSERT_OK(memcmp(skel->bss->addr_out, exp_addr_out, + sizeof(exp_addr_out) - 1), "addr_out"); + ASSERT_EQ(skel->bss->addr_ret, EXP_ADDR_RET, "addr_ret"); + + ASSERT_STREQ(skel->bss->str_out, EXP_STR_OUT, "str_out"); + ASSERT_EQ(skel->bss->str_ret, EXP_STR_RET, "str_ret"); + + ASSERT_STREQ(skel->bss->over_out, EXP_OVER_OUT, "over_out"); + ASSERT_EQ(skel->bss->over_ret, EXP_OVER_RET, "over_ret"); + + ASSERT_STREQ(skel->bss->pad_out, EXP_PAD_OUT, "pad_out"); + ASSERT_EQ(skel->bss->pad_ret, EXP_PAD_RET, "pad_ret"); + + ASSERT_STREQ(skel->bss->noarg_out, EXP_NO_ARG_OUT, "no_arg_out"); + ASSERT_EQ(skel->bss->noarg_ret, EXP_NO_ARG_RET, "no_arg_ret"); + + ASSERT_EQ(skel->bss->nobuf_ret, EXP_NO_BUF_RET, "no_buf_ret"); + +cleanup: + test_snprintf__destroy(skel); +} + +#define min(a, b) ((a) < (b) ? (a) : (b)) + +/* Loads an eBPF object calling bpf_snprintf with up to 10 characters of fmt */ +static int load_single_snprintf(char *fmt) +{ + struct test_snprintf_single *skel; + int ret; + + skel = test_snprintf_single__open(); + if (!skel) + return -EINVAL; + + memcpy(skel->rodata->fmt, fmt, min(strlen(fmt) + 1, 10)); + + ret = test_snprintf_single__load(skel); + test_snprintf_single__destroy(skel); + + return ret; +} + +void test_snprintf_negative(void) +{ + ASSERT_OK(load_single_snprintf("valid %d"), "valid usage"); + + ASSERT_ERR(load_single_snprintf("0123456789"), "no terminating zero"); + ASSERT_ERR(load_single_snprintf("%d %d"), "too many specifiers"); + ASSERT_ERR(load_single_snprintf("%pi5"), "invalid specifier 1"); + ASSERT_ERR(load_single_snprintf("%a"), "invalid specifier 2"); + ASSERT_ERR(load_single_snprintf("%"), "invalid specifier 3"); + ASSERT_ERR(load_single_snprintf("%12345678"), "invalid specifier 4"); + ASSERT_ERR(load_single_snprintf("%--------"), "invalid specifier 5"); + ASSERT_ERR(load_single_snprintf("\x80"), "non ascii character"); + ASSERT_ERR(load_single_snprintf("\x1"), "non printable character"); +} + +void test_snprintf(void) +{ + if (test__start_subtest("snprintf_positive")) + test_snprintf_positive(); + if (test__start_subtest("snprintf_negative")) + test_snprintf_negative(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index d5b44b135c00..4b937e5dbaca 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -3,6 +3,7 @@ #include "cgroup_helpers.h" #include <linux/tcp.h> +#include "sockopt_sk.skel.h" #ifndef SOL_TCP #define SOL_TCP IPPROTO_TCP @@ -191,60 +192,30 @@ err: return -1; } -static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title) +static void run_test(int cgroup_fd) { - enum bpf_attach_type attach_type; - enum bpf_prog_type prog_type; - struct bpf_program *prog; - int err; + struct sockopt_sk *skel; - err = libbpf_prog_type_by_name(title, &prog_type, &attach_type); - if (err) { - log_err("Failed to deduct types for %s BPF program", title); - return -1; - } + skel = sockopt_sk__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto cleanup; - prog = bpf_object__find_program_by_title(obj, title); - if (!prog) { - log_err("Failed to find %s BPF program", title); - return -1; - } - - err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, - attach_type, 0); - if (err) { - log_err("Failed to attach %s BPF program", title); - return -1; - } - - return 0; -} - -static void run_test(int cgroup_fd) -{ - struct bpf_prog_load_attr attr = { - .file = "./sockopt_sk.o", - }; - struct bpf_object *obj; - int ignored; - int err; - - err = bpf_prog_load_xattr(&attr, &obj, &ignored); - if (CHECK_FAIL(err)) - return; + skel->bss->page_size = getpagesize(); - err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt"); - if (CHECK_FAIL(err)) - goto close_bpf_object; + skel->links._setsockopt = + bpf_program__attach_cgroup(skel->progs._setsockopt, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._setsockopt, "setsockopt_link")) + goto cleanup; - err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt"); - if (CHECK_FAIL(err)) - goto close_bpf_object; + skel->links._getsockopt = + bpf_program__attach_cgroup(skel->progs._getsockopt, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._getsockopt, "getsockopt_link")) + goto cleanup; - CHECK_FAIL(getsetsockopt()); + ASSERT_OK(getsetsockopt(), "getsetsockopt"); -close_bpf_object: - bpf_object__close(obj); +cleanup: + sockopt_sk__destroy(skel); } void test_sockopt_sk(void) diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c index 50e59a2e142e..43c36f5f7649 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c @@ -35,3 +35,30 @@ int dump_task_stack(struct bpf_iter__task *ctx) return 0; } + +SEC("iter/task") +int get_task_user_stacks(struct bpf_iter__task *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + uint64_t buf_sz = 0; + int64_t res; + + if (task == (void *)0) + return 0; + + res = bpf_get_task_stack(task, entries, + MAX_STACK_TRACE_DEPTH * SIZE_OF_ULONG, BPF_F_USER_STACK); + if (res <= 0) + return 0; + + buf_sz += res; + + /* If the verifier doesn't refine bpf_get_task_stack res, and instead + * assumes res is entirely unknown, this program will fail to load as + * the verifier will believe that max buf_sz value allows reading + * past the end of entries in bpf_seq_write call + */ + bpf_seq_write(seq, &entries, buf_sz); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/linked_funcs1.c b/tools/testing/selftests/bpf/progs/linked_funcs1.c new file mode 100644 index 000000000000..b964ec1390c2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_funcs1.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +/* weak and shared between two files */ +const volatile int my_tid __weak; +long syscall_id __weak; + +int output_val1; +int output_ctx1; +int output_weak1; + +/* same "subprog" name in all files, but it's ok because they all are static */ +static __noinline int subprog(int x) +{ + /* but different formula */ + return x * 1; +} + +/* Global functions can't be void */ +int set_output_val1(int x) +{ + output_val1 = x + subprog(x); + return x; +} + +/* This function can't be verified as global, as it assumes raw_tp/sys_enter + * context and accesses syscall id (second argument). So we mark it as + * __hidden, so that libbpf will mark it as static in the final object file, + * right before verifying it in the kernel. + * + * But we don't mark it as __hidden here, rather at extern site. __hidden is + * "contaminating" visibility, so it will get propagated from either extern or + * actual definition (including from the losing __weak definition). + */ +void set_output_ctx1(__u64 *ctx) +{ + output_ctx1 = ctx[1]; /* long id, same as in BPF_PROG below */ +} + +/* this weak instance should win because it's the first one */ +__weak int set_output_weak(int x) +{ + output_weak1 = x; + return x; +} + +extern int set_output_val2(int x); + +/* here we'll force set_output_ctx2() to be __hidden in the final obj file */ +__hidden extern void set_output_ctx2(__u64 *ctx); + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler1, struct pt_regs *regs, long id) +{ + if (my_tid != (u32)bpf_get_current_pid_tgid() || id != syscall_id) + return 0; + + set_output_val2(1000); + set_output_ctx2(ctx); /* ctx definition is hidden in BPF_PROG macro */ + + /* keep input value the same across both files to avoid dependency on + * handler call order; differentiate by output_weak1 vs output_weak2. + */ + set_output_weak(42); + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_funcs2.c b/tools/testing/selftests/bpf/progs/linked_funcs2.c new file mode 100644 index 000000000000..575e958e60b7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_funcs2.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +/* weak and shared between both files */ +const volatile int my_tid __weak; +long syscall_id __weak; + +int output_val2; +int output_ctx2; +int output_weak2; /* should stay zero */ + +/* same "subprog" name in all files, but it's ok because they all are static */ +static __noinline int subprog(int x) +{ + /* but different formula */ + return x * 2; +} + +/* Global functions can't be void */ +int set_output_val2(int x) +{ + output_val2 = 2 * x + 2 * subprog(x); + return 2 * x; +} + +/* This function can't be verified as global, as it assumes raw_tp/sys_enter + * context and accesses syscall id (second argument). So we mark it as + * __hidden, so that libbpf will mark it as static in the final object file, + * right before verifying it in the kernel. + * + * But we don't mark it as __hidden here, rather at extern site. __hidden is + * "contaminating" visibility, so it will get propagated from either extern or + * actual definition (including from the losing __weak definition). + */ +void set_output_ctx2(__u64 *ctx) +{ + output_ctx2 = ctx[1]; /* long id, same as in BPF_PROG below */ +} + +/* this weak instance should lose, because it will be processed second */ +__weak int set_output_weak(int x) +{ + output_weak2 = x; + return 2 * x; +} + +extern int set_output_val1(int x); + +/* here we'll force set_output_ctx1() to be __hidden in the final obj file */ +__hidden extern void set_output_ctx1(__u64 *ctx); + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler2, struct pt_regs *regs, long id) +{ + if (my_tid != (u32)bpf_get_current_pid_tgid() || id != syscall_id) + return 0; + + set_output_val1(2000); + set_output_ctx1(ctx); /* ctx definition is hidden in BPF_PROG macro */ + + /* keep input value the same across both files to avoid dependency on + * handler call order; differentiate by output_weak1 vs output_weak2. + */ + set_output_weak(42); + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_maps1.c b/tools/testing/selftests/bpf/progs/linked_maps1.c new file mode 100644 index 000000000000..52291515cc72 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_maps1.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +struct my_key { long x; }; +struct my_value { long x; }; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct my_key); + __type(value, struct my_value); + __uint(max_entries, 16); +} map1 SEC(".maps"); + + /* Matches map2 definition in linked_maps2.c. Order of the attributes doesn't + * matter. + */ +typedef struct { + __uint(max_entries, 8); + __type(key, int); + __type(value, int); + __uint(type, BPF_MAP_TYPE_ARRAY); +} map2_t; + +extern map2_t map2 SEC(".maps"); + +/* This should be the winning map definition, but we have no way of verifying, + * so we just make sure that it links and works without errors + */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 16); +} map_weak __weak SEC(".maps"); + +int output_first1; +int output_second1; +int output_weak1; + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler_enter1) +{ + /* update values with key = 1 */ + int key = 1, val = 1; + struct my_key key_struct = { .x = 1 }; + struct my_value val_struct = { .x = 1000 }; + + bpf_map_update_elem(&map1, &key_struct, &val_struct, 0); + bpf_map_update_elem(&map2, &key, &val, 0); + bpf_map_update_elem(&map_weak, &key, &val, 0); + + return 0; +} + +SEC("raw_tp/sys_exit") +int BPF_PROG(handler_exit1) +{ + /* lookup values with key = 2, set in another file */ + int key = 2, *val; + struct my_key key_struct = { .x = 2 }; + struct my_value *value_struct; + + value_struct = bpf_map_lookup_elem(&map1, &key_struct); + if (value_struct) + output_first1 = value_struct->x; + + val = bpf_map_lookup_elem(&map2, &key); + if (val) + output_second1 = *val; + + val = bpf_map_lookup_elem(&map_weak, &key); + if (val) + output_weak1 = *val; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_maps2.c b/tools/testing/selftests/bpf/progs/linked_maps2.c new file mode 100644 index 000000000000..0693687474ed --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_maps2.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +/* modifiers and typedefs are ignored when comparing key/value types */ +typedef struct my_key { long x; } key_type; +typedef struct my_value { long x; } value_type; + +extern struct { + __uint(max_entries, 16); + __type(key, key_type); + __type(value, value_type); + __uint(type, BPF_MAP_TYPE_HASH); +} map1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 8); +} map2 SEC(".maps"); + +/* this definition will lose, but it has to exactly match the winner */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 16); +} map_weak __weak SEC(".maps"); + +int output_first2; +int output_second2; +int output_weak2; + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler_enter2) +{ + /* update values with key = 2 */ + int key = 2, val = 2; + key_type key_struct = { .x = 2 }; + value_type val_struct = { .x = 2000 }; + + bpf_map_update_elem(&map1, &key_struct, &val_struct, 0); + bpf_map_update_elem(&map2, &key, &val, 0); + bpf_map_update_elem(&map_weak, &key, &val, 0); + + return 0; +} + +SEC("raw_tp/sys_exit") +int BPF_PROG(handler_exit2) +{ + /* lookup values with key = 1, set in another file */ + int key = 1, *val; + key_type key_struct = { .x = 1 }; + value_type *value_struct; + + value_struct = bpf_map_lookup_elem(&map1, &key_struct); + if (value_struct) + output_first2 = value_struct->x; + + val = bpf_map_lookup_elem(&map2, &key); + if (val) + output_second2 = *val; + + val = bpf_map_lookup_elem(&map_weak, &key); + if (val) + output_weak2 = *val; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_vars1.c b/tools/testing/selftests/bpf/progs/linked_vars1.c new file mode 100644 index 000000000000..ef9e9d0bb0ca --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_vars1.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +extern int LINUX_KERNEL_VERSION __kconfig; +/* this weak extern will be strict due to the other file's strong extern */ +extern bool CONFIG_BPF_SYSCALL __kconfig __weak; +extern const void bpf_link_fops __ksym __weak; + +int input_bss1; +int input_data1 = 1; +const volatile int input_rodata1 = 11; + +int input_bss_weak __weak; +/* these two definitions should win */ +int input_data_weak __weak = 10; +const volatile int input_rodata_weak __weak = 100; + +extern int input_bss2; +extern int input_data2; +extern const int input_rodata2; + +int output_bss1; +int output_data1; +int output_rodata1; + +long output_sink1; + +static __noinline int get_bss_res(void) +{ + /* just make sure all the relocations work against .text as well */ + return input_bss1 + input_bss2 + input_bss_weak; +} + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler1) +{ + output_bss1 = get_bss_res(); + output_data1 = input_data1 + input_data2 + input_data_weak; + output_rodata1 = input_rodata1 + input_rodata2 + input_rodata_weak; + + /* make sure we actually use above special externs, otherwise compiler + * will optimize them out + */ + output_sink1 = LINUX_KERNEL_VERSION + + CONFIG_BPF_SYSCALL + + (long)&bpf_link_fops; + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_vars2.c b/tools/testing/selftests/bpf/progs/linked_vars2.c new file mode 100644 index 000000000000..e4f5bd388a3c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_vars2.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +extern int LINUX_KERNEL_VERSION __kconfig; +/* when an extern is defined as both strong and weak, resulting symbol will be strong */ +extern bool CONFIG_BPF_SYSCALL __kconfig; +extern const void __start_BTF __ksym; + +int input_bss2; +int input_data2 = 2; +const volatile int input_rodata2 = 22; + +int input_bss_weak __weak; +/* these two weak variables should lose */ +int input_data_weak __weak = 20; +const volatile int input_rodata_weak __weak = 200; + +extern int input_bss1; +extern int input_data1; +extern const int input_rodata1; + +int output_bss2; +int output_data2; +int output_rodata2; + +int output_sink2; + +static __noinline int get_data_res(void) +{ + /* just make sure all the relocations work against .text as well */ + return input_data1 + input_data2 + input_data_weak; +} + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler2) +{ + output_bss2 = input_bss1 + input_bss2 + input_bss_weak; + output_data2 = get_data_res(); + output_rodata2 = input_rodata1 + input_rodata2 + input_rodata_weak; + + /* make sure we actually use above special externs, otherwise compiler + * will optimize them out + */ + output_sink2 = LINUX_KERNEL_VERSION + + CONFIG_BPF_SYSCALL + + (long)&__start_BTF; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c index d8850bc6a9f1..d1d304c980f0 100644 --- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c +++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c @@ -12,6 +12,7 @@ _Static_assert(MAX_ENTRIES < LOOP_BOUND, "MAX_ENTRIES must be < LOOP_BOUND"); enum bpf_map_type g_map_type = BPF_MAP_TYPE_UNSPEC; __u32 g_line = 0; +int page_size = 0; /* userspace should set it */ #define VERIFY_TYPE(type, func) ({ \ g_map_type = type; \ @@ -635,7 +636,6 @@ struct bpf_ringbuf_map { struct { __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 1 << 12); } m_ringbuf SEC(".maps"); static inline int check_ringbuf(void) @@ -643,7 +643,7 @@ static inline int check_ringbuf(void) struct bpf_ringbuf_map *ringbuf = (struct bpf_ringbuf_map *)&m_ringbuf; struct bpf_map *map = (struct bpf_map *)&m_ringbuf; - VERIFY(check(&ringbuf->map, map, 0, 0, 1 << 12)); + VERIFY(check(&ringbuf->map, map, 0, 0, page_size)); return 1; } diff --git a/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c b/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c index fdb4bf4408fa..eeaf6e75c9a2 100644 --- a/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c +++ b/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c @@ -8,18 +8,6 @@ int _version SEC("version") = 1; SEC("sk_msg1") int bpf_prog1(struct sk_msg_md *msg) { - void *data_end = (void *)(long) msg->data_end; - void *data = (void *)(long) msg->data; - - char *d; - - if (data + 8 > data_end) - return SK_DROP; - - bpf_printk("data length %i\n", (__u64)msg->data_end - (__u64)msg->data); - d = (char *)data; - bpf_printk("hello sendmsg hook %i %i\n", d[0], d[1]); - return SK_PASS; } diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index d3597f81e6e9..8acdb99b5959 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -6,11 +6,8 @@ #include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; -__u32 _version SEC("version") = 1; -#ifndef PAGE_SIZE -#define PAGE_SIZE 4096 -#endif +int page_size = 0; /* userspace should set it */ #ifndef SOL_TCP #define SOL_TCP IPPROTO_TCP @@ -90,7 +87,7 @@ int _getsockopt(struct bpf_sockopt *ctx) * program can only see the first PAGE_SIZE * bytes of data. */ - if (optval_end - optval != PAGE_SIZE) + if (optval_end - optval != page_size) return 0; /* EPERM, unexpected data size */ return 1; @@ -161,7 +158,7 @@ int _setsockopt(struct bpf_sockopt *ctx) if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { /* Original optlen is larger than PAGE_SIZE. */ - if (ctx->optlen != PAGE_SIZE * 2) + if (ctx->optlen != page_size * 2) return 0; /* EPERM, unexpected data size */ if (optval + 1 > optval_end) @@ -175,7 +172,7 @@ int _setsockopt(struct bpf_sockopt *ctx) * program can only see the first PAGE_SIZE * bytes of data. */ - if (optval_end - optval != PAGE_SIZE) + if (optval_end - optval != page_size) return 0; /* EPERM, unexpected data size */ return 1; diff --git a/tools/testing/selftests/bpf/progs/test_mmap.c b/tools/testing/selftests/bpf/progs/test_mmap.c index 4eb42cff5fe9..5a5cc19a15bf 100644 --- a/tools/testing/selftests/bpf/progs/test_mmap.c +++ b/tools/testing/selftests/bpf/progs/test_mmap.c @@ -9,7 +9,6 @@ char _license[] SEC("license") = "GPL"; struct { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 4096); __uint(map_flags, BPF_F_MMAPABLE | BPF_F_RDONLY_PROG); __type(key, __u32); __type(value, char); @@ -17,7 +16,6 @@ struct { struct { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 512 * 4); /* at least 4 pages of data */ __uint(map_flags, BPF_F_MMAPABLE); __type(key, __u32); __type(value, __u64); diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf.c b/tools/testing/selftests/bpf/progs/test_ringbuf.c index 8ba9959b036b..6b3f288b7c63 100644 --- a/tools/testing/selftests/bpf/progs/test_ringbuf.c +++ b/tools/testing/selftests/bpf/progs/test_ringbuf.c @@ -15,7 +15,6 @@ struct sample { struct { __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 1 << 12); } ringbuf SEC(".maps"); /* inputs */ diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c index edf3b6953533..197b86546dca 100644 --- a/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c @@ -15,7 +15,6 @@ struct sample { struct ringbuf_map { __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 1 << 12); } ringbuf1 SEC(".maps"), ringbuf2 SEC(".maps"); @@ -31,6 +30,17 @@ struct { }, }; +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 1); + __type(key, int); + __array(values, struct ringbuf_map); +} ringbuf_hash SEC(".maps") = { + .values = { + [0] = &ringbuf1, + }, +}; + /* inputs */ int pid = 0; int target_ring = 0; diff --git a/tools/testing/selftests/bpf/progs/test_snprintf.c b/tools/testing/selftests/bpf/progs/test_snprintf.c new file mode 100644 index 000000000000..951a0301c553 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_snprintf.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google LLC. */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char num_out[64] = {}; +long num_ret = 0; + +char ip_out[64] = {}; +long ip_ret = 0; + +char sym_out[64] = {}; +long sym_ret = 0; + +char addr_out[64] = {}; +long addr_ret = 0; + +char str_out[64] = {}; +long str_ret = 0; + +char over_out[6] = {}; +long over_ret = 0; + +char pad_out[10] = {}; +long pad_ret = 0; + +char noarg_out[64] = {}; +long noarg_ret = 0; + +long nobuf_ret = 0; + +extern const void schedule __ksym; + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + /* Convenient values to pretty-print */ + const __u8 ex_ipv4[] = {127, 0, 0, 1}; + const __u8 ex_ipv6[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + static const char str1[] = "str1"; + static const char longstr[] = "longstr"; + + /* Integer types */ + num_ret = BPF_SNPRINTF(num_out, sizeof(num_out), + "%d %u %x %li %llu %lX", + -8, 9, 150, -424242, 1337, 0xDABBAD00); + /* IP addresses */ + ip_ret = BPF_SNPRINTF(ip_out, sizeof(ip_out), "%pi4 %pI6", + &ex_ipv4, &ex_ipv6); + /* Symbol lookup formatting */ + sym_ret = BPF_SNPRINTF(sym_out, sizeof(sym_out), "%ps %pS %pB", + &schedule, &schedule, &schedule); + /* Kernel pointers */ + addr_ret = BPF_SNPRINTF(addr_out, sizeof(addr_out), "%pK %px %p", + 0, 0xFFFF00000ADD4E55, 0xFFFF00000ADD4E55); + /* Strings embedding */ + str_ret = BPF_SNPRINTF(str_out, sizeof(str_out), "%s %+05s", + str1, longstr); + /* Overflow */ + over_ret = BPF_SNPRINTF(over_out, sizeof(over_out), "%%overflow"); + /* Padding of fixed width numbers */ + pad_ret = BPF_SNPRINTF(pad_out, sizeof(pad_out), "%5d %0900000X", 4, 4); + /* No args */ + noarg_ret = BPF_SNPRINTF(noarg_out, sizeof(noarg_out), "simple case"); + /* No buffer */ + nobuf_ret = BPF_SNPRINTF(NULL, 0, "only interested in length %d", 60); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_snprintf_single.c b/tools/testing/selftests/bpf/progs/test_snprintf_single.c new file mode 100644 index 000000000000..402adaf344f9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_snprintf_single.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google LLC. */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +/* The format string is filled from the userspace such that loading fails */ +static const char fmt[10]; + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + unsigned long long arg = 42; + + bpf_snprintf(NULL, 0, fmt, &arg, sizeof(arg)); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index ba6eadfec565..e7b673117436 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -396,7 +396,7 @@ int _ip6vxlan_get_tunnel(struct __sk_buff *skb) SEC("geneve_set_tunnel") int _geneve_set_tunnel(struct __sk_buff *skb) { - int ret, ret2; + int ret; struct bpf_tunnel_key key; struct geneve_opt gopt; diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index e87c8546230e..ee7e3b45182a 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -210,7 +210,7 @@ extern int test__join_cgroup(const char *path); #define ASSERT_ERR_PTR(ptr, name) ({ \ static int duration = 0; \ const void *___res = (ptr); \ - bool ___ok = IS_ERR(___res) \ + bool ___ok = IS_ERR(___res); \ CHECK(!___ok, (name), "unexpected pointer: %p\n", ___res); \ ___ok; \ }) diff --git a/tools/testing/selftests/bpf/verifier/bpf_get_stack.c b/tools/testing/selftests/bpf/verifier/bpf_get_stack.c index 69b048cf46d9..3e024c891178 100644 --- a/tools/testing/selftests/bpf/verifier/bpf_get_stack.c +++ b/tools/testing/selftests/bpf/verifier/bpf_get_stack.c @@ -42,3 +42,46 @@ .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, +{ + "bpf_get_task_stack return R0 range is refined", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_6, 0), // ctx->meta->seq + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_1, 8), // ctx->task + BPF_LD_MAP_FD(BPF_REG_1, 0), // fixup_map_array_48b + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), // keep buf for seq_write + BPF_MOV64_IMM(BPF_REG_3, 48), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_EMIT_CALL(BPF_FUNC_get_task_stack), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_9), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_seq_write), + + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACING, + .expected_attach_type = BPF_TRACE_ITER, + .kfunc = "task", + .runs = -1, // Don't run, just load + .fixup_map_array_48b = { 3 }, +}, diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index a5ce26d548e4..9a41d8bb9ff1 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -1,6 +1,10 @@ # This mimics the top-level Makefile. We do it explicitly here so that this # Makefile can operate with or without the kbuild infrastructure. +ifneq ($(LLVM),) +CC := clang +else CC := $(CROSS_COMPILE)gcc +endif ifeq (0,$(MAKELEVEL)) ifeq ($(OUTPUT),) |