210 files changed, 9477 insertions(+), 5374 deletions(-)
diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst index 38c15c6fcb14..0b3db91dc100 100644 --- a/Documentation/bpf/bpf_devel_QA.rst +++ b/Documentation/bpf/bpf_devel_QA.rst @@ -437,6 +437,21 @@ needed:: See the kernels selftest `Documentation/dev-tools/kselftest.rst`_ document for further documentation. +To maximize the number of tests passing, the .config of the kernel +under test should match the config file fragment in +tools/testing/selftests/bpf as closely as possible. + +Finally to ensure support for latest BPF Type Format features - +discussed in `Documentation/bpf/btf.rst`_ - pahole version 1.16 +is required for kernels built with CONFIG_DEBUG_INFO_BTF=y. +pahole is delivered in the dwarves package or can be built +from source at + +https://github.com/acmel/dwarves + +Some distros have pahole version 1.16 packaged already, e.g. +Fedora, Gentoo. + Q: Which BPF kernel selftests version should I run my kernel against? --------------------------------------------------------------------- A: If you run a kernel ``xyz``, then always run the BPF kernel selftests diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.txt deleted file mode 100644 index ecf027a9003a..000000000000 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.txt +++ /dev/null @@ -1,36 +0,0 @@ -Mediatek pericfg controller -=========================== - -The Mediatek pericfg controller provides various clocks and reset -outputs to the system. - -Required Properties: - -- compatible: Should be one of: - - "mediatek,mt2701-pericfg", "syscon" - - "mediatek,mt2712-pericfg", "syscon" - - "mediatek,mt7622-pericfg", "syscon" - - "mediatek,mt7623-pericfg", "mediatek,mt2701-pericfg", "syscon" - - "mediatek,mt7629-pericfg", "syscon" - - "mediatek,mt8135-pericfg", "syscon" - - "mediatek,mt8173-pericfg", "syscon" - - "mediatek,mt8183-pericfg", "syscon" -- #clock-cells: Must be 1 -- #reset-cells: Must be 1 - -The pericfg controller uses the common clk binding from -Documentation/devicetree/bindings/clock/clock-bindings.txt -The available clocks are defined in dt-bindings/clock/mt*-clk.h. -Also it uses the common reset controller binding from -Documentation/devicetree/bindings/reset/reset.txt. -The available reset outputs are defined in -dt-bindings/reset/mt*-resets.h - -Example: - -pericfg: power-controller@10003000 { - compatible = "mediatek,mt8173-pericfg", "syscon"; - reg = <0 0x10003000 0 0x1000>; - #clock-cells = <1>; - #reset-cells = <1>; -}; diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.yaml b/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.yaml new file mode 100644 index 000000000000..55209a2baedc --- /dev/null +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.yaml @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: "http://devicetree.org/schemas/arm/mediatek/mediatek,pericfg.yaml#" +$schema: "http://devicetree.org/meta-schemas/core.yaml#" + +title: MediaTek Peripheral Configuration Controller + +maintainers: + - Bartosz Golaszewski <[email protected]> + +description: + The Mediatek pericfg controller provides various clocks and reset outputs + to the system. 
+ +properties: + compatible: + oneOf: + - items: + - enum: + - mediatek,mt2701-pericfg + - mediatek,mt2712-pericfg + - mediatek,mt7622-pericfg + - mediatek,mt7629-pericfg + - mediatek,mt8135-pericfg + - mediatek,mt8173-pericfg + - mediatek,mt8183-pericfg + - mediatek,mt8516-pericfg + - const: syscon + - items: + # Special case for mt7623 for backward compatibility + - const: mediatek,mt7623-pericfg + - const: mediatek,mt2701-pericfg + - const: syscon + + reg: + maxItems: 1 + + '#clock-cells': + const: 1 + + '#reset-cells': + const: 1 + +required: + - compatible + - reg + +examples: + - | + pericfg@10003000 { + compatible = "mediatek,mt8173-pericfg", "syscon"; + reg = <0x10003000 0x1000>; + #clock-cells = <1>; + #reset-cells = <1>; + }; + + - | + pericfg@10003000 { + compatible = "mediatek,mt7623-pericfg", "mediatek,mt2701-pericfg", "syscon"; + reg = <0x10003000 0x1000>; + #clock-cells = <1>; + #reset-cells = <1>; + }; diff --git a/Documentation/devicetree/bindings/net/mediatek,eth-mac.yaml b/Documentation/devicetree/bindings/net/mediatek,eth-mac.yaml new file mode 100644 index 000000000000..f85d91a9d6e5 --- /dev/null +++ b/Documentation/devicetree/bindings/net/mediatek,eth-mac.yaml @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/mediatek,eth-mac.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek STAR Ethernet MAC Controller + +maintainers: + - Bartosz Golaszewski <[email protected]> + +description: + This Ethernet MAC is used on the MT8* family of SoCs from MediaTek. + It's compliant with 802.3 standards and supports half- and full-duplex + modes with flow-control as well as CRC offloading and VLAN tags. + +allOf: + - $ref: "ethernet-controller.yaml#" + +properties: + compatible: + enum: + - mediatek,mt8516-eth + - mediatek,mt8518-eth + - mediatek,mt8175-eth + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + minItems: 3 + maxItems: 3 + + clock-names: + additionalItems: false + items: + - const: core + - const: reg + - const: trans + + mediatek,pericfg: + $ref: /schemas/types.yaml#definitions/phandle + description: + Phandle to the device containing the PERICFG register range. This is used + to control the MII mode. + + mdio: + type: object + description: + Creates and registers an MDIO bus. 
+ +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + - mediatek,pericfg + - phy-handle + +examples: + - | + #include <dt-bindings/interrupt-controller/arm-gic.h> + #include <dt-bindings/clock/mt8516-clk.h> + + ethernet: ethernet@11180000 { + compatible = "mediatek,mt8516-eth"; + reg = <0x11180000 0x1000>; + mediatek,pericfg = <&pericfg>; + interrupts = <GIC_SPI 111 IRQ_TYPE_LEVEL_LOW>; + clocks = <&topckgen CLK_TOP_RG_ETH>, + <&topckgen CLK_TOP_66M_ETH>, + <&topckgen CLK_TOP_133M_ETH>; + clock-names = "core", "reg", "trans"; + phy-handle = <ð_phy>; + phy-mode = "rmii"; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + + eth_phy: ethernet-phy@0 { + reg = <0>; + }; + }; + }; diff --git a/MAINTAINERS b/MAINTAINERS index b7844f6cfa4a..087e68b21f9f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18443,8 +18443,12 @@ R: Jonathan Lemon <[email protected]> S: Maintained -F: kernel/bpf/xskmap.c +F: include/net/xdp_sock* +F: include/net/xsk_buffer_pool.h +F: include/uapi/linux/if_xdp.h F: net/xdp/ +F: samples/bpf/xdpsock* +F: tools/lib/bpf/xsk* XEN BLOCK SUBSYSTEM M: Konrad Rzeszutek Wilk <[email protected]> diff --git a/arch/arm64/boot/dts/mediatek/mt8516.dtsi b/arch/arm64/boot/dts/mediatek/mt8516.dtsi index 2f8adf042195..89af661e7f63 100644 --- a/arch/arm64/boot/dts/mediatek/mt8516.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8516.dtsi @@ -191,6 +191,11 @@ #clock-cells = <1>; }; + pericfg: pericfg@10003050 { + compatible = "mediatek,mt8516-pericfg", "syscon"; + reg = <0 0x10003050 0 0x1000>; + }; + apmixedsys: apmixedsys@10018000 { compatible = "mediatek,mt8516-apmixedsys", "syscon"; reg = <0 0x10018000 0 0x710>; @@ -401,6 +406,18 @@ status = "disabled"; }; + ethernet: ethernet@11180000 { + compatible = "mediatek,mt8516-eth"; + reg = <0 0x11180000 0 0x1000>; + mediatek,pericfg = <&pericfg>; + interrupts = <GIC_SPI 111 IRQ_TYPE_LEVEL_LOW>; + clocks = <&topckgen CLK_TOP_RG_ETH>, + <&topckgen CLK_TOP_66M_ETH>, + <&topckgen CLK_TOP_133M_ETH>; + clock-names = "core", "reg", "trans"; + status = "disabled"; + }; + rng: rng@1020c000 { compatible = "mediatek,mt8516-rng", "mediatek,mt7623-rng"; diff --git a/arch/arm64/boot/dts/mediatek/pumpkin-common.dtsi b/arch/arm64/boot/dts/mediatek/pumpkin-common.dtsi index a31093d7142b..dfceffe6950a 100644 --- a/arch/arm64/boot/dts/mediatek/pumpkin-common.dtsi +++ b/arch/arm64/boot/dts/mediatek/pumpkin-common.dtsi @@ -9,6 +9,7 @@ / { aliases { serial0 = &uart0; + ethernet0 = ðernet; }; chosen { @@ -166,6 +167,24 @@ status = "okay"; }; +ðernet { + pinctrl-names = "default"; + pinctrl-0 = <ðernet_pins_default>; + phy-handle = <ð_phy>; + phy-mode = "rmii"; + mac-address = [00 00 00 00 00 00]; + status = "okay"; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + + eth_phy: ethernet-phy@0 { + reg = <0>; + }; + }; +}; + &usb0 { status = "okay"; dr_mode = "peripheral"; @@ -218,4 +237,19 @@ bias-pull-up; }; }; + + ethernet_pins_default: ethernet { + pins_ethernet { + pinmux = <MT8516_PIN_0_EINT0__FUNC_EXT_TXD0>, + <MT8516_PIN_1_EINT1__FUNC_EXT_TXD1>, + <MT8516_PIN_5_EINT5__FUNC_EXT_RXER>, + <MT8516_PIN_6_EINT6__FUNC_EXT_RXC>, + <MT8516_PIN_7_EINT7__FUNC_EXT_RXDV>, + <MT8516_PIN_8_EINT8__FUNC_EXT_RXD0>, + <MT8516_PIN_9_EINT9__FUNC_EXT_RXD1>, + <MT8516_PIN_12_EINT12__FUNC_EXT_TXEN>, + <MT8516_PIN_38_MRG_DI__FUNC_EXT_MDIO>, + <MT8516_PIN_39_MRG_DO__FUNC_EXT_MDC>; + }; + }; }; diff --git a/drivers/net/ethernet/amazon/ena/ena_admin_defs.h b/drivers/net/ethernet/amazon/ena/ena_admin_defs.h index 7be3dcbf3d16..336742f6e3c3 100644 --- 
a/drivers/net/ethernet/amazon/ena/ena_admin_defs.h +++ b/drivers/net/ethernet/amazon/ena/ena_admin_defs.h @@ -768,8 +768,8 @@ enum ena_admin_os_type { ENA_ADMIN_OS_DPDK = 3, ENA_ADMIN_OS_FREEBSD = 4, ENA_ADMIN_OS_IPXE = 5, - ENA_ADMIN_OS_ESXI = 6, - ENA_ADMIN_OS_GROUPS_NUM = 6, + ENA_ADMIN_OS_ESXI = 6, + ENA_ADMIN_OS_GROUPS_NUM = 6, }; struct ena_admin_host_info { @@ -813,7 +813,8 @@ struct ena_admin_host_info { u16 reserved; - /* 1 :0 : reserved + /* 0 : reserved + * 1 : rx_offset * 2 : interrupt_moderation * 31:3 : reserved */ @@ -1124,6 +1125,8 @@ struct ena_admin_ena_mmio_req_read_less_resp { #define ENA_ADMIN_HOST_INFO_DEVICE_MASK GENMASK(7, 3) #define ENA_ADMIN_HOST_INFO_BUS_SHIFT 8 #define ENA_ADMIN_HOST_INFO_BUS_MASK GENMASK(15, 8) +#define ENA_ADMIN_HOST_INFO_RX_OFFSET_SHIFT 1 +#define ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK BIT(1) #define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_SHIFT 2 #define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK BIT(2) @@ -1133,4 +1136,4 @@ struct ena_admin_ena_mmio_req_read_less_resp { /* aenq_link_change_desc */ #define ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK BIT(0) -#endif /*_ENA_ADMIN_H_ */ +#endif /* _ENA_ADMIN_H_ */ diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index b51bf62af11b..432f143559a1 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -62,7 +62,9 @@ #define ENA_REGS_ADMIN_INTR_MASK 1 -#define ENA_POLL_MS 5 +#define ENA_MIN_ADMIN_POLL_US 100 + +#define ENA_MAX_ADMIN_POLL_US 5000 /*****************************************************************************/ /*****************************************************************************/ @@ -200,17 +202,17 @@ static void comp_ctxt_release(struct ena_com_admin_queue *queue, static struct ena_comp_ctx *get_comp_ctxt(struct ena_com_admin_queue *queue, u16 command_id, bool capture) { - if (unlikely(!queue->comp_ctx)) { - pr_err("Completion context is NULL\n"); - return NULL; - } - if (unlikely(command_id >= queue->q_depth)) { pr_err("command id is larger than the queue size. 
cmd_id: %u queue size %d\n", command_id, queue->q_depth); return NULL; } + if (unlikely(!queue->comp_ctx)) { + pr_err("Completion context is NULL\n"); + return NULL; + } + if (unlikely(queue->comp_ctx[command_id].occupied && capture)) { pr_err("Completion context is occupied\n"); return NULL; @@ -375,7 +377,7 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, io_sq->bounce_buf_ctrl.next_to_use = 0; size = io_sq->bounce_buf_ctrl.buffer_size * - io_sq->bounce_buf_ctrl.buffers_num; + io_sq->bounce_buf_ctrl.buffers_num; dev_node = dev_to_node(ena_dev->dmadev); set_dev_node(ena_dev->dmadev, ctx->numa_node); @@ -523,9 +525,6 @@ static int ena_com_comp_status_to_errno(u8 comp_status) if (unlikely(comp_status != 0)) pr_err("admin command failed[%u]\n", comp_status); - if (unlikely(comp_status > ENA_ADMIN_UNKNOWN_ERROR)) - return -EINVAL; - switch (comp_status) { case ENA_ADMIN_SUCCESS: return 0; @@ -540,7 +539,14 @@ static int ena_com_comp_status_to_errno(u8 comp_status) return -EINVAL; } - return 0; + return -EINVAL; +} + +static void ena_delay_exponential_backoff_us(u32 exp, u32 delay_us) +{ + delay_us = max_t(u32, ENA_MIN_ADMIN_POLL_US, delay_us); + delay_us = min_t(u32, delay_us * (1U << exp), ENA_MAX_ADMIN_POLL_US); + usleep_range(delay_us, 2 * delay_us); } static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_ctx, @@ -549,6 +555,7 @@ static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_c unsigned long flags = 0; unsigned long timeout; int ret; + u32 exp = 0; timeout = jiffies + usecs_to_jiffies(admin_queue->completion_timeout); @@ -572,7 +579,8 @@ static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_c goto err; } - msleep(ENA_POLL_MS); + ena_delay_exponential_backoff_us(exp++, + admin_queue->ena_dev->ena_min_poll_delay_us); } if (unlikely(comp_ctx->status == ENA_CMD_ABORTED)) { @@ -702,8 +710,7 @@ static int ena_com_config_llq_info(struct ena_com_dev *ena_dev, /* The desc list entry size should be whole multiply of 8 * This requirement comes from __iowrite64_copy() */ - pr_err("illegal entry size %d\n", - llq_info->desc_list_entry_size); + pr_err("illegal entry size %d\n", llq_info->desc_list_entry_size); return -EINVAL; } @@ -775,7 +782,7 @@ static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *com if (admin_queue->auto_polling) admin_queue->polling = true; } else { - pr_err("The ena device doesn't send a completion for the admin cmd %d status %d\n", + pr_err("The ena device didn't send a completion for the admin cmd %d status %d\n", comp_ctx->cmd_opcode, comp_ctx->status); } /* Check if shifted to polling mode. @@ -943,12 +950,13 @@ static void ena_com_io_queue_free(struct ena_com_dev *ena_dev, static int wait_for_reset_state(struct ena_com_dev *ena_dev, u32 timeout, u16 exp_state) { - u32 val, i; + u32 val, exp = 0; + unsigned long timeout_stamp; - /* Convert timeout from resolution of 100ms to ENA_POLL_MS */ - timeout = (timeout * 100) / ENA_POLL_MS; + /* Convert timeout from resolution of 100ms to us resolution. 
*/ + timeout_stamp = jiffies + usecs_to_jiffies(100 * 1000 * timeout); - for (i = 0; i < timeout; i++) { + while (1) { val = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF); if (unlikely(val == ENA_MMIO_READ_TIMEOUT)) { @@ -960,10 +968,11 @@ static int wait_for_reset_state(struct ena_com_dev *ena_dev, u32 timeout, exp_state) return 0; - msleep(ENA_POLL_MS); - } + if (time_is_before_jiffies(timeout_stamp)) + return -ETIME; - return -ETIME; + ena_delay_exponential_backoff_us(exp++, ena_dev->ena_min_poll_delay_us); + } } static bool ena_com_check_supported_feature_id(struct ena_com_dev *ena_dev, @@ -1284,13 +1293,9 @@ static int ena_com_ind_tbl_convert_to_device(struct ena_com_dev *ena_dev) static void ena_com_update_intr_delay_resolution(struct ena_com_dev *ena_dev, u16 intr_delay_resolution) { - /* Initial value of intr_delay_resolution might be 0 */ - u16 prev_intr_delay_resolution = - ena_dev->intr_delay_resolution ? - ena_dev->intr_delay_resolution : - ENA_DEFAULT_INTR_DELAY_RESOLUTION; + u16 prev_intr_delay_resolution = ena_dev->intr_delay_resolution; - if (!intr_delay_resolution) { + if (unlikely(!intr_delay_resolution)) { pr_err("Illegal intr_delay_resolution provided. Going to use default 1 usec resolution\n"); intr_delay_resolution = ENA_DEFAULT_INTR_DELAY_RESOLUTION; } @@ -1444,11 +1449,13 @@ void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev) { struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue; unsigned long flags = 0; + u32 exp = 0; spin_lock_irqsave(&admin_queue->q_lock, flags); while (atomic_read(&admin_queue->outstanding_cmds) != 0) { spin_unlock_irqrestore(&admin_queue->q_lock, flags); - msleep(ENA_POLL_MS); + ena_delay_exponential_backoff_us(exp++, + ena_dev->ena_min_poll_delay_us); spin_lock_irqsave(&admin_queue->q_lock, flags); } spin_unlock_irqrestore(&admin_queue->q_lock, flags); @@ -1796,6 +1803,7 @@ int ena_com_admin_init(struct ena_com_dev *ena_dev, if (ret) goto error; + admin_queue->ena_dev = ena_dev; admin_queue->running_state = true; return 0; @@ -2003,7 +2011,7 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data) struct ena_admin_aenq_entry *aenq_e; struct ena_admin_aenq_common_desc *aenq_common; struct ena_com_aenq *aenq = &dev->aenq; - unsigned long long timestamp; + u64 timestamp; ena_aenq_handler handler_cb; u16 masked_head, processed = 0; u8 phase; @@ -2021,9 +2029,8 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data) */ dma_rmb(); - timestamp = - (unsigned long long)aenq_common->timestamp_low | - ((unsigned long long)aenq_common->timestamp_high << 32); + timestamp = (u64)aenq_common->timestamp_low | + ((u64)aenq_common->timestamp_high << 32); pr_debug("AENQ! 
Group[%x] Syndrom[%x] timestamp: [%llus]\n", aenq_common->group, aenq_common->syndrom, timestamp); @@ -2053,8 +2060,7 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data) /* write the aenq doorbell after all AENQ descriptors were read */ mb(); - writel_relaxed((u32)aenq->head, - dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); + writel_relaxed((u32)aenq->head, dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF); } int ena_com_dev_reset(struct ena_com_dev *ena_dev, @@ -2276,13 +2282,14 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, enum ena_admin_hash_functions func, const u8 *key, u16 key_len, u32 init_val) { - struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_feature_rss_flow_hash_control *hash_key; struct ena_admin_get_feat_resp get_resp; - struct ena_admin_feature_rss_flow_hash_control *hash_key = - rss->hash_key; enum ena_admin_hash_functions old_func; + struct ena_rss *rss = &ena_dev->rss; int rc; + hash_key = rss->hash_key; + /* Make sure size is a mult of DWs */ if (unlikely(key_len & 0x3)) return -EINVAL; @@ -2294,7 +2301,7 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, if (unlikely(rc)) return rc; - if (!((1 << func) & get_resp.u.flow_hash_func.supported_func)) { + if (!(BIT(func) & get_resp.u.flow_hash_func.supported_func)) { pr_err("Flow hash function %d isn't supported\n", func); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/amazon/ena/ena_com.h b/drivers/net/ethernet/amazon/ena/ena_com.h index 13a1b7812c46..bc187adf54e4 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.h +++ b/drivers/net/ethernet/amazon/ena/ena_com.h @@ -77,6 +77,8 @@ #define ENA_INTR_INITIAL_RX_INTERVAL_USECS 0 #define ENA_DEFAULT_INTR_DELAY_RESOLUTION 1 +#define ENA_HASH_KEY_SIZE 40 + #define ENA_HW_HINTS_NO_TIMEOUT 0xFFFF #define ENA_FEATURE_MAX_QUEUE_EXT_VER 1 @@ -237,6 +239,7 @@ struct ena_com_stats_admin { struct ena_com_admin_queue { void *q_dmadev; + struct ena_com_dev *ena_dev; spinlock_t q_lock; /* spinlock for the admin queue */ struct ena_comp_ctx *comp_ctx; @@ -349,6 +352,8 @@ struct ena_com_dev { struct ena_intr_moder_entry *intr_moder_tbl; struct ena_com_llq_info llq_info; + + u32 ena_min_poll_delay_us; }; struct ena_com_dev_get_features_ctx { @@ -393,7 +398,7 @@ struct ena_aenq_handlers { */ int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev); -/* ena_com_set_mmio_read_mode - Enable/disable the mmio reg read mechanism +/* ena_com_set_mmio_read_mode - Enable/disable the indirect mmio reg read mechanism * @ena_dev: ENA communication layer struct * @readless_supported: readless mode (enable/disable) */ @@ -515,7 +520,7 @@ void ena_com_set_admin_auto_polling_mode(struct ena_com_dev *ena_dev, /* ena_com_admin_q_comp_intr_handler - admin queue interrupt handler * @ena_dev: ENA communication layer struct * - * This method go over the admin completion queue and wake up all the pending + * This method goes over the admin completion queue and wakes up all the pending * threads that wait on the commands wait event. * * @note: Should be called after MSI-X interrupt. @@ -525,7 +530,7 @@ void ena_com_admin_q_comp_intr_handler(struct ena_com_dev *ena_dev); /* ena_com_aenq_intr_handler - AENQ interrupt handler * @ena_dev: ENA communication layer struct * - * This method go over the async event notification queue and call the proper + * This method goes over the async event notification queue and calls the proper * aenq handler. 
*/ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data); @@ -542,14 +547,14 @@ void ena_com_abort_admin_commands(struct ena_com_dev *ena_dev); /* ena_com_wait_for_abort_completion - Wait for admin commands abort. * @ena_dev: ENA communication layer struct * - * This method wait until all the outstanding admin commands will be completed. + * This method waits until all the outstanding admin commands are completed. */ void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev); /* ena_com_validate_version - Validate the device parameters * @ena_dev: ENA communication layer struct * - * This method validate the device parameters are the same as the saved + * This method verifies the device parameters are the same as the saved * parameters in ena_dev. * This method is useful after device reset, to validate the device mac address * and the device offloads are the same as before the reset. @@ -689,7 +694,7 @@ int ena_com_set_hash_function(struct ena_com_dev *ena_dev); * * Retrieve the hash function from the device. * - * @note: If the caller called ena_com_fill_hash_function but didn't flash + * @note: If the caller called ena_com_fill_hash_function but didn't flush * it to the device, the new configuration will be lost. * * @return: 0 on Success and negative value otherwise. @@ -703,7 +708,7 @@ int ena_com_get_hash_function(struct ena_com_dev *ena_dev, * * Retrieve the hash key. * - * @note: If the caller called ena_com_fill_hash_key but didn't flash + * @note: If the caller called ena_com_fill_hash_key but didn't flush * it to the device, the new configuration will be lost. * * @return: 0 on Success and negative value otherwise. @@ -743,7 +748,7 @@ int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev); * * Retrieve the hash control from the device. * - * @note, If the caller called ena_com_fill_hash_ctrl but didn't flash + * @note: If the caller called ena_com_fill_hash_ctrl but didn't flush * it to the device, the new configuration will be lost. * * @return: 0 on Success and negative value otherwise. @@ -795,7 +800,7 @@ int ena_com_indirect_table_set(struct ena_com_dev *ena_dev); * * Retrieve the RSS indirection table from the device. * - * @note: If the caller called ena_com_indirect_table_fill_entry but didn't flash + * @note: If the caller called ena_com_indirect_table_fill_entry but didn't flush * it to the device, the new configuration will be lost. * * @return: 0 on Success and negative value otherwise. @@ -821,14 +826,14 @@ int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev, /* ena_com_delete_debug_area - Free the debug area resources. * @ena_dev: ENA communication layer struct * - * Free the allocate debug area. + * Free the allocated debug area. */ void ena_com_delete_debug_area(struct ena_com_dev *ena_dev); /* ena_com_delete_host_info - Free the host info resources. * @ena_dev: ENA communication layer struct * - * Free the allocate host info. + * Free the allocated host info. */ void ena_com_delete_host_info(struct ena_com_dev *ena_dev); @@ -869,9 +874,9 @@ int ena_com_destroy_io_cq(struct ena_com_dev *ena_dev, * @cmd_completion: command completion return value. * @cmd_comp_size: command completion size. - * Submit an admin command and then wait until the device will return a + * Submit an admin command and then wait until the device returns a * completion. - * The completion will be copyed into cmd_comp. + * The completion will be copied into cmd_comp. * * @return - 0 on success, negative value on failure. 
*/ @@ -934,7 +939,7 @@ unsigned int ena_com_get_nonadaptive_moderation_interval_rx(struct ena_com_dev * /* ena_com_config_dev_mode - Configure the placement policy of the device. * @ena_dev: ENA communication layer struct * @llq_features: LLQ feature descriptor, retrieve via - * ena_com_get_dev_attr_feat. + * ena_com_get_dev_attr_feat. * @ena_llq_config: The default driver LLQ parameters configurations */ int ena_com_config_dev_mode(struct ena_com_dev *ena_dev, @@ -960,7 +965,7 @@ static inline void ena_com_disable_adaptive_moderation(struct ena_com_dev *ena_d * @intr_reg: interrupt register to update. * @rx_delay_interval: Rx interval in usecs * @tx_delay_interval: Tx interval in usecs - * @unmask: unask enable/disable + * @unmask: unmask enable/disable * * Prepare interrupt update register with the supplied parameters. */ diff --git a/drivers/net/ethernet/amazon/ena/ena_common_defs.h b/drivers/net/ethernet/amazon/ena/ena_common_defs.h index 23beb7e7ed7b..8a8ded0de9ac 100644 --- a/drivers/net/ethernet/amazon/ena/ena_common_defs.h +++ b/drivers/net/ethernet/amazon/ena/ena_common_defs.h @@ -45,4 +45,4 @@ struct ena_common_mem_addr { u16 reserved16; }; -#endif /*_ENA_COMMON_H_ */ +#endif /* _ENA_COMMON_H_ */ diff --git a/drivers/net/ethernet/amazon/ena/ena_eth_com.c b/drivers/net/ethernet/amazon/ena/ena_eth_com.c index 2845ac277724..ec8ea25e988d 100644 --- a/drivers/net/ethernet/amazon/ena/ena_eth_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_eth_com.c @@ -519,7 +519,7 @@ int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, struct ena_eth_io_rx_cdesc_base *cdesc = NULL; u16 cdesc_idx = 0; u16 nb_hw_desc; - u16 i; + u16 i = 0; WARN(io_cq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX, "wrong Q type"); @@ -538,13 +538,19 @@ int ena_com_rx_pkt(struct ena_com_io_cq *io_cq, return -ENOSPC; } - for (i = 0; i < nb_hw_desc; i++) { + cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx); + ena_rx_ctx->pkt_offset = cdesc->offset; + + do { + ena_buf[i].len = cdesc->length; + ena_buf[i].req_id = cdesc->req_id; + + if (++i >= nb_hw_desc) + break; + cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx + i); - ena_buf->len = cdesc->length; - ena_buf->req_id = cdesc->req_id; - ena_buf++; - } + } while (1); /* Update SQ head ptr */ io_sq->next_to_comp += nb_hw_desc; @@ -578,10 +584,10 @@ int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq, desc->length = ena_buf->len; - desc->ctrl = ENA_ETH_IO_RX_DESC_FIRST_MASK; - desc->ctrl |= ENA_ETH_IO_RX_DESC_LAST_MASK; - desc->ctrl |= io_sq->phase & ENA_ETH_IO_RX_DESC_PHASE_MASK; - desc->ctrl |= ENA_ETH_IO_RX_DESC_COMP_REQ_MASK; + desc->ctrl = ENA_ETH_IO_RX_DESC_FIRST_MASK | + ENA_ETH_IO_RX_DESC_LAST_MASK | + (io_sq->phase & ENA_ETH_IO_RX_DESC_PHASE_MASK) | + ENA_ETH_IO_RX_DESC_COMP_REQ_MASK; desc->req_id = req_id; diff --git a/drivers/net/ethernet/amazon/ena/ena_eth_com.h b/drivers/net/ethernet/amazon/ena/ena_eth_com.h index 77986c0ea52c..8b1afd3b32f2 100644 --- a/drivers/net/ethernet/amazon/ena/ena_eth_com.h +++ b/drivers/net/ethernet/amazon/ena/ena_eth_com.h @@ -73,6 +73,7 @@ struct ena_com_rx_ctx { u32 hash; u16 descs; int max_bufs; + u8 pkt_offset; }; int ena_com_prepare_tx(struct ena_com_io_sq *io_sq, @@ -95,7 +96,7 @@ static inline void ena_com_unmask_intr(struct ena_com_io_cq *io_cq, writel(intr_reg->intr_control, io_cq->unmask_reg); } -static inline int ena_com_free_desc(struct ena_com_io_sq *io_sq) +static inline int ena_com_free_q_entries(struct ena_com_io_sq *io_sq) { u16 tail, next_to_comp, cnt; @@ -113,7 +114,7 @@ static inline bool 
ena_com_sq_have_enough_space(struct ena_com_io_sq *io_sq, int temp; if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST) - return ena_com_free_desc(io_sq) >= required_buffers; + return ena_com_free_q_entries(io_sq) >= required_buffers; /* This calculation doesn't need to be 100% accurate. So to reduce * the calculation overhead just Subtract 2 lines from the free descs @@ -122,7 +123,7 @@ static inline bool ena_com_sq_have_enough_space(struct ena_com_io_sq *io_sq, */ temp = required_buffers / io_sq->llq_info.descs_per_entry + 2; - return ena_com_free_desc(io_sq) > temp; + return ena_com_free_q_entries(io_sq) > temp; } static inline bool ena_com_meta_desc_changed(struct ena_com_io_sq *io_sq, diff --git a/drivers/net/ethernet/amazon/ena/ena_eth_io_defs.h b/drivers/net/ethernet/amazon/ena/ena_eth_io_defs.h index 00e0f056a741..d105c9c56192 100644 --- a/drivers/net/ethernet/amazon/ena/ena_eth_io_defs.h +++ b/drivers/net/ethernet/amazon/ena/ena_eth_io_defs.h @@ -264,7 +264,9 @@ struct ena_eth_io_rx_cdesc_base { u16 sub_qid; - u16 reserved; + u8 offset; + + u8 reserved; }; /* 8-word format */ @@ -412,4 +414,4 @@ struct ena_eth_io_numa_node_cfg_reg { #define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_SHIFT 31 #define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_MASK BIT(31) -#endif /*_ENA_ETH_IO_H_ */ +#endif /* _ENA_ETH_IO_H_ */ diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c index 830d3711d6ee..e340b65af08c 100644 --- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c +++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c @@ -206,7 +206,7 @@ int ena_get_sset_count(struct net_device *netdev, int sset) if (sset != ETH_SS_STATS) return -EOPNOTSUPP; - return adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) + return adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX) + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM; } @@ -260,7 +260,6 @@ static void ena_get_strings(struct net_device *netdev, u32 sset, u8 *data) for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) { ena_stats = &ena_stats_global_strings[i]; - memcpy(data, ena_stats->name, ETH_GSTRING_LEN); data += ETH_GSTRING_LEN; } @@ -307,10 +306,8 @@ static int ena_get_coalesce(struct net_device *net_dev, struct ena_adapter *adapter = netdev_priv(net_dev); struct ena_com_dev *ena_dev = adapter->ena_dev; - if (!ena_com_interrupt_moderation_supported(ena_dev)) { - /* the devie doesn't support interrupt moderation */ + if (!ena_com_interrupt_moderation_supported(ena_dev)) return -EOPNOTSUPP; - } coalesce->tx_coalesce_usecs = ena_com_get_nonadaptive_moderation_interval_tx(ena_dev) * @@ -326,7 +323,7 @@ static int ena_get_coalesce(struct net_device *net_dev, return 0; } -static void ena_update_tx_rings_intr_moderation(struct ena_adapter *adapter) +static void ena_update_tx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter) { unsigned int val; int i; @@ -337,7 +334,7 @@ static void ena_update_tx_rings_intr_moderation(struct ena_adapter *adapter) adapter->tx_ring[i].smoothed_interval = val; } -static void ena_update_rx_rings_intr_moderation(struct ena_adapter *adapter) +static void ena_update_rx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter) { unsigned int val; int i; @@ -355,24 +352,22 @@ static int ena_set_coalesce(struct net_device *net_dev, struct ena_com_dev *ena_dev = adapter->ena_dev; int rc; - if (!ena_com_interrupt_moderation_supported(ena_dev)) { - /* the devie doesn't support interrupt moderation */ + if 
(!ena_com_interrupt_moderation_supported(ena_dev)) return -EOPNOTSUPP; - } rc = ena_com_update_nonadaptive_moderation_interval_tx(ena_dev, coalesce->tx_coalesce_usecs); if (rc) return rc; - ena_update_tx_rings_intr_moderation(adapter); + ena_update_tx_rings_nonadaptive_intr_moderation(adapter); rc = ena_com_update_nonadaptive_moderation_interval_rx(ena_dev, coalesce->rx_coalesce_usecs); if (rc) return rc; - ena_update_rx_rings_intr_moderation(adapter); + ena_update_rx_rings_nonadaptive_intr_moderation(adapter); if (coalesce->use_adaptive_rx_coalesce && !ena_com_get_adaptive_moderation_enabled(ena_dev)) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 85b87ed02dd5..46865d5bd7e7 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -1435,6 +1435,8 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring, skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page, rx_info->page_offset, len, ENA_PAGE_SIZE); + /* The offset is non zero only for the first buffer */ + rx_info->page_offset = 0; netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "rx skb updated. len %d. data_len %d\n", @@ -1590,6 +1592,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, { u16 next_to_clean = rx_ring->next_to_clean; struct ena_com_rx_ctx ena_rx_ctx; + struct ena_rx_buffer *rx_info; struct ena_adapter *adapter; u32 res_budget, work_done; int rx_copybreak_pkt = 0; @@ -1614,6 +1617,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, ena_rx_ctx.ena_bufs = rx_ring->ena_bufs; ena_rx_ctx.max_bufs = rx_ring->sgl_size; ena_rx_ctx.descs = 0; + ena_rx_ctx.pkt_offset = 0; rc = ena_com_rx_pkt(rx_ring->ena_com_io_cq, rx_ring->ena_com_io_sq, &ena_rx_ctx); @@ -1623,6 +1627,9 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, if (unlikely(ena_rx_ctx.descs == 0)) break; + rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; + rx_info->page_offset = ena_rx_ctx.pkt_offset; + netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "rx_poll: q %d got packet from ena. descs #: %d l3 proto %d l4 proto %d hash: %x\n", rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto, @@ -1684,7 +1691,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, rx_ring->next_to_clean = next_to_clean; - refill_required = ena_com_free_desc(rx_ring->ena_com_io_sq); + refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); refill_threshold = min_t(int, rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER, ENA_RX_REFILL_THRESH_PACKET); @@ -2235,7 +2242,7 @@ static int ena_rss_configure(struct ena_adapter *adapter) rc = ena_rss_init_default(adapter); if (rc && (rc != -EOPNOTSUPP)) { netif_err(adapter, ifup, adapter->netdev, - "Failed to init RSS rc: %d\n", rc); + "Failed to init RSS rc: %d\n", rc); return rc; } } @@ -2308,7 +2315,7 @@ static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid) if (rc) { netif_err(adapter, ifup, adapter->netdev, "Failed to create I/O TX queue num %d rc: %d\n", - qid, rc); + qid, rc); return rc; } @@ -2457,7 +2464,7 @@ static int create_queues_with_size_backoff(struct ena_adapter *adapter) * ones due to past queue allocation failures. 
*/ set_io_rings_size(adapter, adapter->requested_tx_ring_size, - adapter->requested_rx_ring_size); + adapter->requested_rx_ring_size); while (1) { if (ena_xdp_present(adapter)) { @@ -2498,7 +2505,7 @@ err_setup_tx: if (rc != -ENOMEM) { netif_err(adapter, ifup, adapter->netdev, "Queue creation failed with error code %d\n", - rc); + rc); return rc; } @@ -2521,7 +2528,7 @@ err_setup_tx: new_rx_ring_size = cur_rx_ring_size / 2; if (new_tx_ring_size < ENA_MIN_RING_SIZE || - new_rx_ring_size < ENA_MIN_RING_SIZE) { + new_rx_ring_size < ENA_MIN_RING_SIZE) { netif_err(adapter, ifup, adapter->netdev, "Queue creation failed with the smallest possible queue size of %d for both queues. Not retrying with smaller queues\n", ENA_MIN_RING_SIZE); @@ -3080,8 +3087,7 @@ static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, return qid; } -static void ena_config_host_info(struct ena_com_dev *ena_dev, - struct pci_dev *pdev) +static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pdev) { struct ena_admin_host_info *host_info; int rc; @@ -3111,6 +3117,7 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev, host_info->num_cpus = num_online_cpus(); host_info->driver_supported_features = + ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK | ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK; rc = ena_com_set_host_attributes(ena_dev); @@ -3686,8 +3693,7 @@ static void check_for_empty_rx_ring(struct ena_adapter *adapter) for (i = 0; i < adapter->num_io_queues; i++) { rx_ring = &adapter->rx_ring[i]; - refill_required = - ena_com_free_desc(rx_ring->ena_com_io_sq); + refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq); if (unlikely(refill_required == (rx_ring->ring_size - 1))) { rx_ring->empty_rx_queue++; @@ -3825,11 +3831,11 @@ static void ena_timer_service(struct timer_list *t) mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); } -static int ena_calc_max_io_queue_num(struct pci_dev *pdev, +static u32 ena_calc_max_io_queue_num(struct pci_dev *pdev, struct ena_com_dev *ena_dev, struct ena_com_dev_get_features_ctx *get_feat_ctx) { - int io_tx_sq_num, io_tx_cq_num, io_rx_num, max_num_io_queues; + u32 io_tx_sq_num, io_tx_cq_num, io_rx_num, max_num_io_queues; if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) { struct ena_admin_queue_ext_feature_fields *max_queue_ext = @@ -4115,8 +4121,8 @@ static int ena_calc_io_queue_size(struct ena_calc_queue_size_ctx *ctx) */ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { - struct ena_com_dev_get_features_ctx get_feat_ctx; struct ena_calc_queue_size_ctx calc_queue_ctx = { 0 }; + struct ena_com_dev_get_features_ctx get_feat_ctx; struct ena_llq_configurations llq_config; struct ena_com_dev *ena_dev = NULL; struct ena_adapter *adapter; @@ -4160,6 +4166,8 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_free_region; } + ena_dev->ena_min_poll_delay_us = ENA_ADMIN_POLL_DELAY_US; + ena_dev->dmadev = &pdev->dev; rc = ena_device_init(ena_dev, pdev, &get_feat_ctx, &wd_state); @@ -4183,7 +4191,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) calc_queue_ctx.get_feat_ctx = &get_feat_ctx; calc_queue_ctx.pdev = pdev; - /* Initial Tx and RX interrupt delay. Assumes 1 usec granularity. + /* Initial TX and RX interrupt delay. Assumes 1 usec granularity. 
* Updated during device initialization with the real granularity */ ena_dev->intr_moder_tx_interval = ENA_INTR_INITIAL_TX_INTERVAL_USECS; @@ -4227,12 +4235,11 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->num_io_queues = max_num_io_queues; adapter->max_num_io_queues = max_num_io_queues; + adapter->last_monitored_tx_qid = 0; adapter->xdp_first_ring = 0; adapter->xdp_num_queues = 0; - adapter->last_monitored_tx_qid = 0; - adapter->rx_copybreak = ENA_DEFAULT_RX_COPYBREAK; adapter->wd_state = wd_state; diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h index 680099afcccf..ba030d260940 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.h +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h @@ -50,12 +50,6 @@ #define DRV_MODULE_GEN_SUBMINOR 0 #define DRV_MODULE_NAME "ena" -#ifndef DRV_MODULE_GENERATION -#define DRV_MODULE_GENERATION \ - __stringify(DRV_MODULE_GEN_MAJOR) "." \ - __stringify(DRV_MODULE_GEN_MINOR) "." \ - __stringify(DRV_MODULE_GEN_SUBMINOR) "K" -#endif #define DEVICE_NAME "Elastic Network Adapter (ENA)" @@ -104,8 +98,6 @@ #define ENA_RX_RSS_TABLE_LOG_SIZE 7 #define ENA_RX_RSS_TABLE_SIZE (1 << ENA_RX_RSS_TABLE_LOG_SIZE) -#define ENA_HASH_KEY_SIZE 40 - /* The number of tx packet completions that will be handled each NAPI poll * cycle is ring_size / ENA_TX_POLL_BUDGET_DIVIDER. */ @@ -137,6 +129,8 @@ #define ENA_IO_IRQ_FIRST_IDX 1 #define ENA_IO_IRQ_IDX(q) (ENA_IO_IRQ_FIRST_IDX + (q)) +#define ENA_ADMIN_POLL_DELAY_US 100 + /* ENA device should send keep alive msg every 1 sec. * We wait for 6 sec just to be on the safe side. */ diff --git a/drivers/net/ethernet/amazon/ena/ena_regs_defs.h b/drivers/net/ethernet/amazon/ena/ena_regs_defs.h index 04fcafcc059c..b514bb1b855d 100644 --- a/drivers/net/ethernet/amazon/ena/ena_regs_defs.h +++ b/drivers/net/ethernet/amazon/ena/ena_regs_defs.h @@ -154,4 +154,4 @@ enum ena_regs_reset_reason_types { #define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_SHIFT 16 #define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_MASK 0xffff0000 -#endif /*_ENA_REGS_H_ */ +#endif /* _ENA_REGS_H_ */ diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c index 86fc77d85fda..743d3b13b39d 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c @@ -88,13 +88,13 @@ static const char aq_ethtool_stat_names[][ETH_GSTRING_LEN] = { "InDroppedDma", }; -static const char aq_ethtool_queue_stat_names[][ETH_GSTRING_LEN] = { - "Queue[%d] InPackets", - "Queue[%d] OutPackets", - "Queue[%d] Restarts", - "Queue[%d] InJumboPackets", - "Queue[%d] InLroPackets", - "Queue[%d] InErrors", +static const char * const aq_ethtool_queue_stat_names[] = { + "%sQueue[%d] InPackets", + "%sQueue[%d] OutPackets", + "%sQueue[%d] Restarts", + "%sQueue[%d] InJumboPackets", + "%sQueue[%d] InLroPackets", + "%sQueue[%d] InErrors", }; #if IS_ENABLED(CONFIG_MACSEC) @@ -166,7 +166,8 @@ static u32 aq_ethtool_n_stats(struct net_device *ndev) struct aq_nic_s *nic = netdev_priv(ndev); struct aq_nic_cfg_s *cfg = aq_nic_get_cfg(nic); u32 n_stats = ARRAY_SIZE(aq_ethtool_stat_names) + - ARRAY_SIZE(aq_ethtool_queue_stat_names) * cfg->vecs; + ARRAY_SIZE(aq_ethtool_queue_stat_names) * cfg->vecs * + cfg->tcs; #if IS_ENABLED(CONFIG_MACSEC) if (nic->macsec_cfg) { @@ -223,7 +224,7 @@ static void aq_ethtool_get_drvinfo(struct net_device *ndev, static void aq_ethtool_get_strings(struct net_device *ndev, u32 stringset, u8 
*data) { - struct aq_nic_s *aq_nic = netdev_priv(ndev); + struct aq_nic_s *nic = netdev_priv(ndev); struct aq_nic_cfg_s *cfg; u8 *p = data; int i, si; @@ -231,24 +232,35 @@ static void aq_ethtool_get_strings(struct net_device *ndev, int sa; #endif - cfg = aq_nic_get_cfg(aq_nic); + cfg = aq_nic_get_cfg(nic); switch (stringset) { - case ETH_SS_STATS: + case ETH_SS_STATS: { + const int stat_cnt = ARRAY_SIZE(aq_ethtool_queue_stat_names); + char tc_string[8]; + int tc; + + memset(tc_string, 0, sizeof(tc_string)); memcpy(p, aq_ethtool_stat_names, sizeof(aq_ethtool_stat_names)); p = p + sizeof(aq_ethtool_stat_names); - for (i = 0; i < cfg->vecs; i++) { - for (si = 0; - si < ARRAY_SIZE(aq_ethtool_queue_stat_names); - si++) { - snprintf(p, ETH_GSTRING_LEN, - aq_ethtool_queue_stat_names[si], i); - p += ETH_GSTRING_LEN; + + for (tc = 0; tc < cfg->tcs; tc++) { + if (cfg->is_qos) + snprintf(tc_string, 8, "TC%d ", tc); + + for (i = 0; i < cfg->vecs; i++) { + for (si = 0; si < stat_cnt; si++) { + snprintf(p, ETH_GSTRING_LEN, + aq_ethtool_queue_stat_names[si], + tc_string, + AQ_NIC_CFG_TCVEC2RING(cfg, tc, i)); + p += ETH_GSTRING_LEN; + } } } #if IS_ENABLED(CONFIG_MACSEC) - if (!aq_nic->macsec_cfg) + if (!nic->macsec_cfg) break; memcpy(p, aq_macsec_stat_names, sizeof(aq_macsec_stat_names)); @@ -256,7 +268,7 @@ static void aq_ethtool_get_strings(struct net_device *ndev, for (i = 0; i < AQ_MACSEC_MAX_SC; i++) { struct aq_macsec_txsc *aq_txsc; - if (!(test_bit(i, &aq_nic->macsec_cfg->txsc_idx_busy))) + if (!(test_bit(i, &nic->macsec_cfg->txsc_idx_busy))) continue; for (si = 0; @@ -266,7 +278,7 @@ static void aq_ethtool_get_strings(struct net_device *ndev, aq_macsec_txsc_stat_names[si], i); p += ETH_GSTRING_LEN; } - aq_txsc = &aq_nic->macsec_cfg->aq_txsc[i]; + aq_txsc = &nic->macsec_cfg->aq_txsc[i]; for (sa = 0; sa < MACSEC_NUM_AN; sa++) { if (!(test_bit(sa, &aq_txsc->tx_sa_idx_busy))) continue; @@ -283,10 +295,10 @@ static void aq_ethtool_get_strings(struct net_device *ndev, for (i = 0; i < AQ_MACSEC_MAX_SC; i++) { struct aq_macsec_rxsc *aq_rxsc; - if (!(test_bit(i, &aq_nic->macsec_cfg->rxsc_idx_busy))) + if (!(test_bit(i, &nic->macsec_cfg->rxsc_idx_busy))) continue; - aq_rxsc = &aq_nic->macsec_cfg->aq_rxsc[i]; + aq_rxsc = &nic->macsec_cfg->aq_rxsc[i]; for (sa = 0; sa < MACSEC_NUM_AN; sa++) { if (!(test_bit(sa, &aq_rxsc->rx_sa_idx_busy))) continue; @@ -302,6 +314,7 @@ static void aq_ethtool_get_strings(struct net_device *ndev, } #endif break; + } case ETH_SS_PRIV_FLAGS: memcpy(p, aq_ethtool_priv_flag_names, sizeof(aq_ethtool_priv_flag_names)); @@ -780,8 +793,6 @@ static int aq_set_ringparam(struct net_device *ndev, dev_close(ndev); } - aq_nic_free_vectors(aq_nic); - cfg->rxds = max(ring->rx_pending, hw_caps->rxds_min); cfg->rxds = min(cfg->rxds, hw_caps->rxds_max); cfg->rxds = ALIGN(cfg->rxds, AQ_HW_RXD_MULTIPLE); @@ -790,15 +801,10 @@ static int aq_set_ringparam(struct net_device *ndev, cfg->txds = min(cfg->txds, hw_caps->txds_max); cfg->txds = ALIGN(cfg->txds, AQ_HW_TXD_MULTIPLE); - for (aq_nic->aq_vecs = 0; aq_nic->aq_vecs < cfg->vecs; - aq_nic->aq_vecs++) { - aq_nic->aq_vec[aq_nic->aq_vecs] = - aq_vec_alloc(aq_nic, aq_nic->aq_vecs, cfg); - if (unlikely(!aq_nic->aq_vec[aq_nic->aq_vecs])) { - err = -ENOMEM; - goto err_exit; - } - } + err = aq_nic_realloc_vectors(aq_nic); + if (err) + goto err_exit; + if (ndev_running) err = dev_open(ndev, NULL); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_filters.c b/drivers/net/ethernet/aquantia/atlantic/aq_filters.c index 03ff92bc4a7f..1bc4d33a0ce5 100644 --- 
a/drivers/net/ethernet/aquantia/atlantic/aq_filters.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_filters.c @@ -153,6 +153,8 @@ aq_check_approve_fvlan(struct aq_nic_s *aq_nic, struct aq_hw_rx_fltrs_s *rx_fltrs, struct ethtool_rx_flow_spec *fsp) { + struct aq_nic_cfg_s *cfg = &aq_nic->aq_nic_cfg; + if (fsp->location < AQ_RX_FIRST_LOC_FVLANID || fsp->location > AQ_RX_LAST_LOC_FVLANID) { netdev_err(aq_nic->ndev, @@ -170,10 +172,10 @@ aq_check_approve_fvlan(struct aq_nic_s *aq_nic, return -EINVAL; } - if (fsp->ring_cookie > aq_nic->aq_nic_cfg.num_rss_queues) { + if (fsp->ring_cookie > cfg->num_rss_queues * cfg->tcs) { netdev_err(aq_nic->ndev, "ethtool: queue number must be in range [0, %d]", - aq_nic->aq_nic_cfg.num_rss_queues - 1); + cfg->num_rss_queues * cfg->tcs - 1); return -EINVAL; } return 0; @@ -262,6 +264,7 @@ static bool __must_check aq_rule_is_not_correct(struct aq_nic_s *aq_nic, struct ethtool_rx_flow_spec *fsp) { + struct aq_nic_cfg_s *cfg = &aq_nic->aq_nic_cfg; bool rule_is_not_correct = false; if (!aq_nic) { @@ -274,11 +277,11 @@ aq_rule_is_not_correct(struct aq_nic_s *aq_nic, } else if (aq_check_filter(aq_nic, fsp)) { rule_is_not_correct = true; } else if (fsp->ring_cookie != RX_CLS_FLOW_DISC) { - if (fsp->ring_cookie >= aq_nic->aq_nic_cfg.num_rss_queues) { + if (fsp->ring_cookie >= cfg->num_rss_queues * cfg->tcs) { netdev_err(aq_nic->ndev, "ethtool: The specified action is invalid.\n" "Maximum allowable value action is %u.\n", - aq_nic->aq_nic_cfg.num_rss_queues - 1); + cfg->num_rss_queues * cfg->tcs - 1); rule_is_not_correct = true; } } diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h index 03fea9469f01..ed5b465bc664 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h @@ -18,6 +18,12 @@ #define AQ_HW_MAC_COUNTER_HZ 312500000ll #define AQ_HW_PHY_COUNTER_HZ 160000000ll +enum aq_tc_mode { + AQ_TC_MODE_INVALID = -1, + AQ_TC_MODE_8TCS, + AQ_TC_MODE_4TCS, +}; + #define AQ_RX_FIRST_LOC_FVLANID 0U #define AQ_RX_LAST_LOC_FVLANID 15U #define AQ_RX_FIRST_LOC_FETHERT 16U @@ -29,6 +35,9 @@ (AQ_RX_LAST_LOC_FVLANID - AQ_RX_FIRST_LOC_FVLANID + 1U) #define AQ_RX_QUEUE_NOT_ASSIGNED 0xFFU +/* Used for rate to Mbps conversion */ +#define AQ_MBPS_DIVISOR 125000 /* 1000000 / 8 */ + /* NIC H/W capabilities */ struct aq_hw_caps_s { u64 hw_features; @@ -46,7 +55,7 @@ struct aq_hw_caps_s { u32 mac_regs_count; u32 hw_alive_check_addr; u8 msix_irqs; - u8 tcs; + u8 tcs_max; u8 rxd_alignment; u8 rxd_size; u8 txd_alignment; @@ -118,8 +127,11 @@ struct aq_stats_s { #define AQ_HW_TXD_MULTIPLE 8U #define AQ_HW_RXD_MULTIPLE 8U +#define AQ_HW_QUEUES_MAX 32U #define AQ_HW_MULTICAST_ADDRESS_MAX 32U +#define AQ_HW_PTP_TC 2U + #define AQ_HW_LED_BLINK 0x2U #define AQ_HW_LED_DEFAULT 0x0U @@ -268,6 +280,8 @@ struct aq_hw_ops { int (*hw_rss_hash_set)(struct aq_hw_s *self, struct aq_rss_parameters *rss_params); + int (*hw_tc_rate_limit_set)(struct aq_hw_s *self); + int (*hw_get_regs)(struct aq_hw_s *self, const struct aq_hw_caps_s *aq_hw_caps, u32 *regs_buff); @@ -279,10 +293,6 @@ struct aq_hw_ops { int (*hw_set_offload)(struct aq_hw_s *self, struct aq_nic_cfg_s *aq_nic_cfg); - int (*hw_tx_tc_mode_get)(struct aq_hw_s *self, u32 *tc_mode); - - int (*hw_rx_tc_mode_get)(struct aq_hw_s *self, u32 *tc_mode); - int (*hw_ring_hwts_rx_fill)(struct aq_hw_s *self, struct aq_ring_s *aq_ring); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c 
b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c index 7dbf49adcea6..342c5179f846 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c @@ -79,3 +79,29 @@ int aq_hw_err_from_flags(struct aq_hw_s *hw) err_exit: return err; } + +int aq_hw_num_tcs(struct aq_hw_s *hw) +{ + switch (hw->aq_nic_cfg->tc_mode) { + case AQ_TC_MODE_8TCS: + return 8; + case AQ_TC_MODE_4TCS: + return 4; + default: + break; + } + + return 1; +} + +int aq_hw_q_per_tc(struct aq_hw_s *hw) +{ + switch (hw->aq_nic_cfg->tc_mode) { + case AQ_TC_MODE_8TCS: + return 4; + case AQ_TC_MODE_4TCS: + return 8; + default: + return 4; + } +} diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h index 9ef82d487e01..32aa5f2fb840 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h @@ -34,5 +34,7 @@ u32 aq_hw_read_reg(struct aq_hw_s *hw, u32 reg); void aq_hw_write_reg(struct aq_hw_s *hw, u32 reg, u32 value); u64 aq_hw_read_reg64(struct aq_hw_s *hw, u32 reg); int aq_hw_err_from_flags(struct aq_hw_s *hw); +int aq_hw_num_tcs(struct aq_hw_s *hw); +int aq_hw_q_per_tc(struct aq_hw_s *hw); #endif /* AQ_HW_UTILS_H */ diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c b/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c index 91870ceaf3fe..4a6dfac857ca 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c @@ -478,7 +478,7 @@ static int aq_mdo_add_secy(struct macsec_context *ctx) set_bit(txsc_idx, &cfg->txsc_idx_busy); - return 0; + return ret; } static int aq_mdo_upd_secy(struct macsec_context *ctx) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_main.c b/drivers/net/ethernet/aquantia/atlantic/aq_main.c index 9fcab646cbd5..8a1da044e908 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_main.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_main.c @@ -12,11 +12,13 @@ #include "aq_ethtool.h" #include "aq_ptp.h" #include "aq_filters.h" +#include "aq_hw_utils.h" #include <linux/netdevice.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/udp.h> +#include <net/pkt_cls.h> MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(AQ_CFG_DRV_AUTHOR); @@ -38,7 +40,7 @@ struct net_device *aq_ndev_alloc(void) struct net_device *ndev = NULL; struct aq_nic_s *aq_nic = NULL; - ndev = alloc_etherdev_mq(sizeof(struct aq_nic_s), AQ_CFG_VECS_MAX); + ndev = alloc_etherdev_mq(sizeof(struct aq_nic_s), AQ_HW_QUEUES_MAX); if (!ndev) return NULL; @@ -330,6 +332,73 @@ static int aq_ndo_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto, return 0; } +static int aq_validate_mqprio_opt(struct aq_nic_s *self, + struct tc_mqprio_qopt_offload *mqprio, + const unsigned int num_tc) +{ + const bool has_min_rate = !!(mqprio->flags & TC_MQPRIO_F_MIN_RATE); + struct aq_nic_cfg_s *aq_nic_cfg = aq_nic_get_cfg(self); + const unsigned int tcs_max = min_t(u8, aq_nic_cfg->aq_hw_caps->tcs_max, + AQ_CFG_TCS_MAX); + + if (num_tc > tcs_max) { + netdev_err(self->ndev, "Too many TCs requested\n"); + return -EOPNOTSUPP; + } + + if (num_tc != 0 && !is_power_of_2(num_tc)) { + netdev_err(self->ndev, "TC count should be power of 2\n"); + return -EOPNOTSUPP; + } + + if (has_min_rate && !ATL_HW_IS_CHIP_FEATURE(self->aq_hw, ANTIGUA)) { + netdev_err(self->ndev, "Min tx rate is not supported\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +static int aq_ndo_setup_tc(struct net_device *dev, enum 
tc_setup_type type, + void *type_data) +{ + struct tc_mqprio_qopt_offload *mqprio = type_data; + struct aq_nic_s *aq_nic = netdev_priv(dev); + bool has_min_rate; + bool has_max_rate; + int err; + int i; + + if (type != TC_SETUP_QDISC_MQPRIO) + return -EOPNOTSUPP; + + has_min_rate = !!(mqprio->flags & TC_MQPRIO_F_MIN_RATE); + has_max_rate = !!(mqprio->flags & TC_MQPRIO_F_MAX_RATE); + + err = aq_validate_mqprio_opt(aq_nic, mqprio, mqprio->qopt.num_tc); + if (err) + return err; + + for (i = 0; i < mqprio->qopt.num_tc; i++) { + if (has_max_rate) { + u64 max_rate = mqprio->max_rate[i]; + + do_div(max_rate, AQ_MBPS_DIVISOR); + aq_nic_setup_tc_max_rate(aq_nic, i, (u32)max_rate); + } + + if (has_min_rate) { + u64 min_rate = mqprio->min_rate[i]; + + do_div(min_rate, AQ_MBPS_DIVISOR); + aq_nic_setup_tc_min_rate(aq_nic, i, (u32)min_rate); + } + } + + return aq_nic_setup_tc_mqprio(aq_nic, mqprio->qopt.num_tc, + mqprio->qopt.prio_tc_map); +} + static const struct net_device_ops aq_ndev_ops = { .ndo_open = aq_ndev_open, .ndo_stop = aq_ndev_close, @@ -341,6 +410,7 @@ static const struct net_device_ops aq_ndev_ops = { .ndo_do_ioctl = aq_ndev_ioctl, .ndo_vlan_rx_add_vid = aq_ndo_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = aq_ndo_vlan_rx_kill_vid, + .ndo_setup_tc = aq_ndo_setup_tc, }; static int __init aq_ndev_init_module(void) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 1c6d12deb47a..4435c6374f7e 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -26,6 +26,7 @@ #include <linux/ip.h> #include <linux/tcp.h> #include <net/ip.h> +#include <net/pkt_cls.h> static unsigned int aq_itr = AQ_CFG_INTERRUPT_MODERATION_AUTO; module_param_named(aq_itr, aq_itr, uint, 0644); @@ -64,10 +65,38 @@ static void aq_nic_rss_init(struct aq_nic_s *self, unsigned int num_rss_queues) rss_params->indirection_table[i] = i & (num_rss_queues - 1); } +/* Recalculate the number of vectors */ +static void aq_nic_cfg_update_num_vecs(struct aq_nic_s *self) +{ + struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg; + + cfg->vecs = min(cfg->aq_hw_caps->vecs, AQ_CFG_VECS_DEF); + cfg->vecs = min(cfg->vecs, num_online_cpus()); + if (self->irqvecs > AQ_HW_SERVICE_IRQS) + cfg->vecs = min(cfg->vecs, self->irqvecs - AQ_HW_SERVICE_IRQS); + /* cfg->vecs should be power of 2 for RSS */ + cfg->vecs = rounddown_pow_of_two(cfg->vecs); + + if (ATL_HW_IS_CHIP_FEATURE(self->aq_hw, ANTIGUA)) { + if (cfg->tcs > 2) + cfg->vecs = min(cfg->vecs, 4U); + } + + if (cfg->vecs <= 4) + cfg->tc_mode = AQ_TC_MODE_8TCS; + else + cfg->tc_mode = AQ_TC_MODE_4TCS; + + /*rss rings */ + cfg->num_rss_queues = min(cfg->vecs, AQ_CFG_NUM_RSS_QUEUES_DEF); + aq_nic_rss_init(self, cfg->num_rss_queues); +} + /* Checks hw_caps and 'corrects' aq_nic_cfg in runtime */ void aq_nic_cfg_start(struct aq_nic_s *self) { struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg; + int i; cfg->tcs = AQ_CFG_TCS_DEF; @@ -79,7 +108,6 @@ void aq_nic_cfg_start(struct aq_nic_s *self) cfg->rxpageorder = AQ_CFG_RX_PAGEORDER; cfg->is_rss = AQ_CFG_IS_RSS_DEF; - cfg->num_rss_queues = AQ_CFG_NUM_RSS_QUEUES_DEF; cfg->aq_rss.base_cpu_number = AQ_CFG_RSS_BASE_CPU_NUM_DEF; cfg->fc.req = AQ_CFG_FC_MODE; cfg->wol = AQ_CFG_WOL_MODES; @@ -89,29 +117,13 @@ void aq_nic_cfg_start(struct aq_nic_s *self) cfg->is_autoneg = AQ_CFG_IS_AUTONEG_DEF; cfg->is_lro = AQ_CFG_IS_LRO_DEF; + cfg->is_ptp = true; /*descriptors */ cfg->rxds = min(cfg->aq_hw_caps->rxds_max, AQ_CFG_RXDS_DEF); cfg->txds = 
min(cfg->aq_hw_caps->txds_max, AQ_CFG_TXDS_DEF); - /*rss rings */ - cfg->vecs = min(cfg->aq_hw_caps->vecs, AQ_CFG_VECS_DEF); - cfg->vecs = min(cfg->vecs, num_online_cpus()); - if (self->irqvecs > AQ_HW_SERVICE_IRQS) - cfg->vecs = min(cfg->vecs, self->irqvecs - AQ_HW_SERVICE_IRQS); - /* cfg->vecs should be power of 2 for RSS */ - if (cfg->vecs >= 8U) - cfg->vecs = 8U; - else if (cfg->vecs >= 4U) - cfg->vecs = 4U; - else if (cfg->vecs >= 2U) - cfg->vecs = 2U; - else - cfg->vecs = 1U; - - cfg->num_rss_queues = min(cfg->vecs, AQ_CFG_NUM_RSS_QUEUES_DEF); - - aq_nic_rss_init(self, cfg->num_rss_queues); + aq_nic_cfg_update_num_vecs(self); cfg->irq_type = aq_pci_func_get_irq_type(self); @@ -136,6 +148,9 @@ void aq_nic_cfg_start(struct aq_nic_s *self) cfg->is_vlan_rx_strip = !!(cfg->features & NETIF_F_HW_VLAN_CTAG_RX); cfg->is_vlan_tx_insert = !!(cfg->features & NETIF_F_HW_VLAN_CTAG_TX); cfg->is_vlan_force_promisc = true; + + for (i = 0; i < sizeof(cfg->prio_tc_map); i++) + cfg->prio_tc_map[i] = cfg->tcs * i / 8; } static int aq_nic_update_link_status(struct aq_nic_s *self) @@ -181,6 +196,9 @@ static int aq_nic_update_link_status(struct aq_nic_s *self) #if IS_ENABLED(CONFIG_MACSEC) aq_macsec_enable(self); #endif + if (self->aq_hw_ops->hw_tc_rate_limit_set) + self->aq_hw_ops->hw_tc_rate_limit_set(self->aq_hw); + netif_tx_wake_all_queues(self->ndev); } if (netif_carrier_ok(self->ndev) && !self->link_status.mbps) { @@ -399,21 +417,29 @@ int aq_nic_init(struct aq_nic_s *self) err = aq_phy_init(self->aq_hw); } - for (i = 0U, aq_vec = self->aq_vec[0]; - self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) + for (i = 0U; i < self->aq_vecs; i++) { + aq_vec = self->aq_vec[i]; + err = aq_vec_ring_alloc(aq_vec, self, i, + aq_nic_get_cfg(self)); + if (err) + goto err_exit; + aq_vec_init(aq_vec, self->aq_hw_ops, self->aq_hw); + } - err = aq_ptp_init(self, self->irqvecs - 1); - if (err < 0) - goto err_exit; + if (aq_nic_get_cfg(self)->is_ptp) { + err = aq_ptp_init(self, self->irqvecs - 1); + if (err < 0) + goto err_exit; - err = aq_ptp_ring_alloc(self); - if (err < 0) - goto err_exit; + err = aq_ptp_ring_alloc(self); + if (err < 0) + goto err_exit; - err = aq_ptp_ring_init(self); - if (err < 0) - goto err_exit; + err = aq_ptp_ring_init(self); + if (err < 0) + goto err_exit; + } netif_carrier_off(self->ndev); @@ -424,9 +450,12 @@ err_exit: int aq_nic_start(struct aq_nic_s *self) { struct aq_vec_s *aq_vec = NULL; + struct aq_nic_cfg_s *cfg; unsigned int i = 0U; int err = 0; + cfg = aq_nic_get_cfg(self); + err = self->aq_hw_ops->hw_multicast_list_set(self->aq_hw, self->mc_list.ar, self->mc_list.count); @@ -464,7 +493,7 @@ int aq_nic_start(struct aq_nic_s *self) timer_setup(&self->service_timer, aq_nic_service_timer_cb, 0); aq_nic_service_timer_cb(&self->service_timer); - if (self->aq_nic_cfg.is_polling) { + if (cfg->is_polling) { timer_setup(&self->polling_timer, aq_nic_polling_timer_cb, 0); mod_timer(&self->polling_timer, jiffies + AQ_CFG_POLLING_TIMER_INTERVAL); @@ -482,16 +511,16 @@ int aq_nic_start(struct aq_nic_s *self) if (err < 0) goto err_exit; - if (self->aq_nic_cfg.link_irq_vec) { + if (cfg->link_irq_vec) { int irqvec = pci_irq_vector(self->pdev, - self->aq_nic_cfg.link_irq_vec); + cfg->link_irq_vec); err = request_threaded_irq(irqvec, NULL, aq_linkstate_threaded_isr, IRQF_SHARED | IRQF_ONESHOT, self->ndev->name, self); if (err < 0) goto err_exit; - self->msix_entry_mask |= (1 << self->aq_nic_cfg.link_irq_vec); + self->msix_entry_mask |= (1 << cfg->link_irq_vec); } err = 
self->aq_hw_ops->hw_irq_enable(self->aq_hw, @@ -500,14 +529,21 @@ int aq_nic_start(struct aq_nic_s *self) goto err_exit; } - err = netif_set_real_num_tx_queues(self->ndev, self->aq_vecs); + err = netif_set_real_num_tx_queues(self->ndev, + self->aq_vecs * cfg->tcs); if (err < 0) goto err_exit; - err = netif_set_real_num_rx_queues(self->ndev, self->aq_vecs); + err = netif_set_real_num_rx_queues(self->ndev, + self->aq_vecs * cfg->tcs); if (err < 0) goto err_exit; + for (i = 0; i < cfg->tcs; i++) { + u16 offset = self->aq_vecs * i; + + netdev_set_tc_queue(self->ndev, i, self->aq_vecs, offset); + } netif_tx_start_all_queues(self->ndev); err_exit: @@ -518,6 +554,8 @@ unsigned int aq_nic_map_skb(struct aq_nic_s *self, struct sk_buff *skb, struct aq_ring_s *ring) { unsigned int nr_frags = skb_shinfo(skb)->nr_frags; + struct aq_nic_cfg_s *cfg = aq_nic_get_cfg(self); + struct device *dev = aq_nic_get_dev(self); struct aq_ring_buff_s *first = NULL; u8 ipver = ip_hdr(skb)->version; struct aq_ring_buff_s *dx_buff; @@ -559,7 +597,7 @@ unsigned int aq_nic_map_skb(struct aq_nic_s *self, struct sk_buff *skb, need_context_tag = true; } - if (self->aq_nic_cfg.is_vlan_tx_insert && skb_vlan_tag_present(skb)) { + if (cfg->is_vlan_tx_insert && skb_vlan_tag_present(skb)) { dx_buff->vlan_tx_tag = skb_vlan_tag_get(skb); dx_buff->len_pkt = skb->len; dx_buff->is_vlan = 1U; @@ -574,12 +612,12 @@ unsigned int aq_nic_map_skb(struct aq_nic_s *self, struct sk_buff *skb, } dx_buff->len = skb_headlen(skb); - dx_buff->pa = dma_map_single(aq_nic_get_dev(self), + dx_buff->pa = dma_map_single(dev, skb->data, dx_buff->len, DMA_TO_DEVICE); - if (unlikely(dma_mapping_error(aq_nic_get_dev(self), dx_buff->pa))) { + if (unlikely(dma_mapping_error(dev, dx_buff->pa))) { ret = 0; goto exit; } @@ -611,13 +649,13 @@ unsigned int aq_nic_map_skb(struct aq_nic_s *self, struct sk_buff *skb, else buff_size = frag_len; - frag_pa = skb_frag_dma_map(aq_nic_get_dev(self), + frag_pa = skb_frag_dma_map(dev, frag, buff_offset, buff_size, DMA_TO_DEVICE); - if (unlikely(dma_mapping_error(aq_nic_get_dev(self), + if (unlikely(dma_mapping_error(dev, frag_pa))) goto mapping_error; @@ -651,12 +689,12 @@ mapping_error: if (!(dx_buff->is_gso_tcp || dx_buff->is_gso_udp) && !dx_buff->is_vlan && dx_buff->pa) { if (unlikely(dx_buff->is_sop)) { - dma_unmap_single(aq_nic_get_dev(self), + dma_unmap_single(dev, dx_buff->pa, dx_buff->len, DMA_TO_DEVICE); } else { - dma_unmap_page(aq_nic_get_dev(self), + dma_unmap_page(dev, dx_buff->pa, dx_buff->len, DMA_TO_DEVICE); @@ -670,15 +708,16 @@ exit: int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb) { - unsigned int vec = skb->queue_mapping % self->aq_nic_cfg.vecs; + struct aq_nic_cfg_s *cfg = aq_nic_get_cfg(self); + unsigned int vec = skb->queue_mapping % cfg->vecs; + unsigned int tc = skb->queue_mapping / cfg->vecs; struct aq_ring_s *ring = NULL; unsigned int frags = 0U; int err = NETDEV_TX_OK; - unsigned int tc = 0U; frags = skb_shinfo(skb)->nr_frags + 1; - ring = self->aq_ring_tx[AQ_NIC_TCVEC2RING(self, tc, vec)]; + ring = self->aq_ring_tx[AQ_NIC_CFG_TCVEC2RING(cfg, tc, vec)]; if (frags > AQ_CFG_SKB_FRAGS_MAX) { dev_kfree_skb_any(skb); @@ -687,13 +726,14 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb) aq_ring_update_queue_state(ring); - if (self->aq_nic_cfg.priv_flags & BIT(AQ_HW_LOOPBACK_DMA_NET)) { + if (cfg->priv_flags & BIT(AQ_HW_LOOPBACK_DMA_NET)) { err = NETDEV_TX_BUSY; goto err_exit; } /* Above status update may stop the queue. Check this. 
*/ - if (__netif_subqueue_stopped(self->ndev, ring->idx)) { + if (__netif_subqueue_stopped(self->ndev, + AQ_NIC_RING2QMAP(self, ring->idx))) { err = NETDEV_TX_BUSY; goto err_exit; } @@ -823,6 +863,7 @@ u64 *aq_nic_get_stats(struct aq_nic_s *self, u64 *data) struct aq_stats_s *stats; unsigned int count = 0U; unsigned int i = 0U; + unsigned int tc; if (self->aq_fw_ops->update_stats) { mutex_lock(&self->fwreq_mutex); @@ -861,10 +902,13 @@ u64 *aq_nic_get_stats(struct aq_nic_s *self, u64 *data) data += i; - for (i = 0U, aq_vec = self->aq_vec[0]; - aq_vec && self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) { - data += count; - aq_vec_get_sw_stats(aq_vec, data, &count); + for (tc = 0U; tc < self->aq_nic_cfg.tcs; tc++) { + for (i = 0U, aq_vec = self->aq_vec[0]; + aq_vec && self->aq_vecs > i; + ++i, aq_vec = self->aq_vec[i]) { + data += count; + aq_vec_get_sw_stats(aq_vec, tc, data, &count); + } } data += count; @@ -1145,9 +1189,11 @@ void aq_nic_deinit(struct aq_nic_s *self, bool link_down) if (!self) goto err_exit; - for (i = 0U, aq_vec = self->aq_vec[0]; - self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) + for (i = 0U; i < self->aq_vecs; i++) { + aq_vec = self->aq_vec[i]; aq_vec_deinit(aq_vec); + aq_vec_ring_free(aq_vec); + } aq_ptp_unregister(self); aq_ptp_ring_deinit(self); @@ -1180,6 +1226,22 @@ void aq_nic_free_vectors(struct aq_nic_s *self) err_exit:; } +int aq_nic_realloc_vectors(struct aq_nic_s *self) +{ + struct aq_nic_cfg_s *cfg = aq_nic_get_cfg(self); + + aq_nic_free_vectors(self); + + for (self->aq_vecs = 0; self->aq_vecs < cfg->vecs; self->aq_vecs++) { + self->aq_vec[self->aq_vecs] = aq_vec_alloc(self, self->aq_vecs, + cfg); + if (unlikely(!self->aq_vec[self->aq_vecs])) + return -ENOMEM; + } + + return 0; +} + void aq_nic_shutdown(struct aq_nic_s *self) { int err = 0; @@ -1245,3 +1307,98 @@ void aq_nic_release_filter(struct aq_nic_s *self, enum aq_rx_filter_type type, break; } } + +int aq_nic_setup_tc_mqprio(struct aq_nic_s *self, u32 tcs, u8 *prio_tc_map) +{ + struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg; + const unsigned int prev_vecs = cfg->vecs; + bool ndev_running; + int err = 0; + int i; + + /* if already the same configuration or + * disable request (tcs is 0) and we already is disabled + */ + if (tcs == cfg->tcs || (tcs == 0 && !cfg->is_qos)) + return 0; + + ndev_running = netif_running(self->ndev); + if (ndev_running) + dev_close(self->ndev); + + cfg->tcs = tcs; + if (cfg->tcs == 0) + cfg->tcs = 1; + if (prio_tc_map) + memcpy(cfg->prio_tc_map, prio_tc_map, sizeof(cfg->prio_tc_map)); + else + for (i = 0; i < sizeof(cfg->prio_tc_map); i++) + cfg->prio_tc_map[i] = cfg->tcs * i / 8; + + cfg->is_qos = (tcs != 0 ? 
true : false); + cfg->is_ptp = (cfg->tcs <= AQ_HW_PTP_TC); + if (!cfg->is_ptp) + netdev_warn(self->ndev, "%s\n", + "PTP is auto disabled due to requested TC count."); + + netdev_set_num_tc(self->ndev, cfg->tcs); + + /* Changing the number of TCs might change the number of vectors */ + aq_nic_cfg_update_num_vecs(self); + if (prev_vecs != cfg->vecs) { + err = aq_nic_realloc_vectors(self); + if (err) + goto err_exit; + } + + if (ndev_running) + err = dev_open(self->ndev, NULL); + +err_exit: + return err; +} + +int aq_nic_setup_tc_max_rate(struct aq_nic_s *self, const unsigned int tc, + const u32 max_rate) +{ + struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg; + + if (tc >= AQ_CFG_TCS_MAX) + return -EINVAL; + + if (max_rate && max_rate < 10) { + netdev_warn(self->ndev, + "Setting %s to the minimum usable value of %dMbps.\n", + "max rate", 10); + cfg->tc_max_rate[tc] = 10; + } else { + cfg->tc_max_rate[tc] = max_rate; + } + + return 0; +} + +int aq_nic_setup_tc_min_rate(struct aq_nic_s *self, const unsigned int tc, + const u32 min_rate) +{ + struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg; + + if (tc >= AQ_CFG_TCS_MAX) + return -EINVAL; + + if (min_rate) + set_bit(tc, &cfg->tc_min_rate_msk); + else + clear_bit(tc, &cfg->tc_min_rate_msk); + + if (min_rate && min_rate < 20) { + netdev_warn(self->ndev, + "Setting %s to the minimum usable value of %dMbps.\n", + "min rate", 20); + cfg->tc_min_rate[tc] = 20; + } else { + cfg->tc_min_rate[tc] = min_rate; + } + + return 0; +} diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h index 0663b8d0220d..2ab003065e62 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h @@ -59,8 +59,15 @@ struct aq_nic_cfg_s { bool is_polling; bool is_rss; bool is_lro; + bool is_qos; + bool is_ptp; + enum aq_tc_mode tc_mode; u32 priv_flags; u8 tcs; + u8 prio_tc_map[8]; + u32 tc_max_rate[AQ_CFG_TCS_MAX]; + unsigned long tc_min_rate_msk; + u32 tc_min_rate[AQ_CFG_TCS_MAX]; struct aq_rss_parameters aq_rss; u32 eee_speeds; }; @@ -77,8 +84,16 @@ struct aq_nic_cfg_s { #define AQ_NIC_WOL_MODES (WAKE_MAGIC |\ WAKE_PHY) -#define AQ_NIC_TCVEC2RING(_NIC_, _TC_, _VEC_) \ - ((_TC_) * AQ_CFG_TCS_MAX + (_VEC_)) +#define AQ_NIC_CFG_RING_PER_TC(_NIC_CFG_) \ + (((_NIC_CFG_)->tc_mode == AQ_TC_MODE_4TCS) ? 
8 : 4) + +#define AQ_NIC_CFG_TCVEC2RING(_NIC_CFG_, _TC_, _VEC_) \ + ((_TC_) * AQ_NIC_CFG_RING_PER_TC(_NIC_CFG_) + (_VEC_)) + +#define AQ_NIC_RING2QMAP(_NIC_, _ID_) \ + ((_ID_) / AQ_NIC_CFG_RING_PER_TC(&(_NIC_)->aq_nic_cfg) * \ + (_NIC_)->aq_vecs + \ + ((_ID_) % AQ_NIC_CFG_RING_PER_TC(&(_NIC_)->aq_nic_cfg))) struct aq_hw_rx_fl2 { struct aq_rx_filter_vlan aq_vlans[AQ_VLAN_MAX_FILTERS]; @@ -104,7 +119,7 @@ struct aq_nic_s { atomic_t flags; u32 msg_enable; struct aq_vec_s *aq_vec[AQ_CFG_VECS_MAX]; - struct aq_ring_s *aq_ring_tx[AQ_CFG_VECS_MAX * AQ_CFG_TCS_MAX]; + struct aq_ring_s *aq_ring_tx[AQ_HW_QUEUES_MAX]; struct aq_hw_s *aq_hw; struct net_device *ndev; unsigned int aq_vecs; @@ -164,6 +179,7 @@ void aq_nic_deinit(struct aq_nic_s *self, bool link_down); void aq_nic_set_power(struct aq_nic_s *self); void aq_nic_free_hot_resources(struct aq_nic_s *self); void aq_nic_free_vectors(struct aq_nic_s *self); +int aq_nic_realloc_vectors(struct aq_nic_s *self); int aq_nic_set_mtu(struct aq_nic_s *self, int new_mtu); int aq_nic_set_mac(struct aq_nic_s *self, struct net_device *ndev); int aq_nic_set_packet_filter(struct aq_nic_s *self, unsigned int flags); @@ -181,4 +197,9 @@ void aq_nic_shutdown(struct aq_nic_s *self); u8 aq_nic_reserve_filter(struct aq_nic_s *self, enum aq_rx_filter_type type); void aq_nic_release_filter(struct aq_nic_s *self, enum aq_rx_filter_type type, u32 location); +int aq_nic_setup_tc_mqprio(struct aq_nic_s *self, u32 tcs, u8 *prio_tc_map); +int aq_nic_setup_tc_max_rate(struct aq_nic_s *self, const unsigned int tc, + const u32 max_rate); +int aq_nic_setup_tc_min_rate(struct aq_nic_s *self, const unsigned int tc, + const u32 min_rate); #endif /* AQ_NIC_H */ diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c index d10fff8a8c71..41c0f560f95b 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c @@ -431,6 +431,9 @@ static int atl_resume_common(struct device *dev, bool deep) netif_tx_start_all_queues(nic->ndev); err_exit: + if (ret < 0) + aq_nic_deinit(nic, true); + rtnl_unlock(); return ret; diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ptp.c b/drivers/net/ethernet/aquantia/atlantic/aq_ptp.c index 58e8c641e8b3..599ced261b2a 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ptp.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ptp.c @@ -945,26 +945,29 @@ void aq_ptp_ring_deinit(struct aq_nic_s *aq_nic) #define PTP_4TC_RING_IDX 16 #define PTP_HWST_RING_IDX 31 +/* Index must be 8 (8 TCs) or 16 (4 TCs). + * It depends on Traffic Class mode. + */ +static unsigned int ptp_ring_idx(const enum aq_tc_mode tc_mode) +{ + if (tc_mode == AQ_TC_MODE_8TCS) + return PTP_8TC_RING_IDX; + + return PTP_4TC_RING_IDX; +} + int aq_ptp_ring_alloc(struct aq_nic_s *aq_nic) { struct aq_ptp_s *aq_ptp = aq_nic->aq_ptp; unsigned int tx_ring_idx, rx_ring_idx; struct aq_ring_s *hwts; - u32 tx_tc_mode, rx_tc_mode; struct aq_ring_s *ring; int err; if (!aq_ptp) return 0; - /* Index must to be 8 (8 TCs) or 16 (4 TCs). - * It depends from Traffic Class mode. 
- */ - aq_nic->aq_hw_ops->hw_tx_tc_mode_get(aq_nic->aq_hw, &tx_tc_mode); - if (tx_tc_mode == 0) - tx_ring_idx = PTP_8TC_RING_IDX; - else - tx_ring_idx = PTP_4TC_RING_IDX; + tx_ring_idx = ptp_ring_idx(aq_nic->aq_nic_cfg.tc_mode); ring = aq_ring_tx_alloc(&aq_ptp->ptp_tx, aq_nic, tx_ring_idx, &aq_nic->aq_nic_cfg); @@ -973,11 +976,7 @@ int aq_ptp_ring_alloc(struct aq_nic_s *aq_nic) goto err_exit; } - aq_nic->aq_hw_ops->hw_rx_tc_mode_get(aq_nic->aq_hw, &rx_tc_mode); - if (rx_tc_mode == 0) - rx_ring_idx = PTP_8TC_RING_IDX; - else - rx_ring_idx = PTP_4TC_RING_IDX; + rx_ring_idx = ptp_ring_idx(aq_nic->aq_nic_cfg.tc_mode); ring = aq_ring_rx_alloc(&aq_ptp->ptp_rx, aq_nic, rx_ring_idx, &aq_nic->aq_nic_cfg); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index bae95a618560..68fdb3994088 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -232,8 +232,11 @@ void aq_ring_queue_wake(struct aq_ring_s *ring) { struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic); - if (__netif_subqueue_stopped(ndev, ring->idx)) { - netif_wake_subqueue(ndev, ring->idx); + if (__netif_subqueue_stopped(ndev, + AQ_NIC_RING2QMAP(ring->aq_nic, + ring->idx))) { + netif_wake_subqueue(ndev, + AQ_NIC_RING2QMAP(ring->aq_nic, ring->idx)); ring->stats.tx.queue_restarts++; } } @@ -242,8 +245,11 @@ void aq_ring_queue_stop(struct aq_ring_s *ring) { struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic); - if (!__netif_subqueue_stopped(ndev, ring->idx)) - netif_stop_subqueue(ndev, ring->idx); + if (!__netif_subqueue_stopped(ndev, + AQ_NIC_RING2QMAP(ring->aq_nic, + ring->idx))) + netif_stop_subqueue(ndev, + AQ_NIC_RING2QMAP(ring->aq_nic, ring->idx)); } bool aq_ring_tx_clean(struct aq_ring_s *self) @@ -466,7 +472,10 @@ int aq_ring_rx_clean(struct aq_ring_s *self, buff->is_hash_l4 ? PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_NONE); /* Send all PTP traffic to 0 queue */ - skb_record_rx_queue(skb, is_ptp_ring ? 0 : self->idx); + skb_record_rx_queue(skb, + is_ptp_ring ? 
0 + : AQ_NIC_RING2QMAP(self->aq_nic, + self->idx)); ++self->stats.rx.packets; self->stats.rx.bytes += skb->len; diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c index f40a427970dc..d1d43c8ce400 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c @@ -103,16 +103,11 @@ err_exit: struct aq_vec_s *aq_vec_alloc(struct aq_nic_s *aq_nic, unsigned int idx, struct aq_nic_cfg_s *aq_nic_cfg) { - struct aq_ring_s *ring = NULL; struct aq_vec_s *self = NULL; - unsigned int i = 0U; - int err = 0; self = kzalloc(sizeof(*self), GFP_KERNEL); - if (!self) { - err = -ENOMEM; + if (!self) goto err_exit; - } self->aq_nic = aq_nic; self->aq_ring_param.vec_idx = idx; @@ -128,10 +123,20 @@ struct aq_vec_s *aq_vec_alloc(struct aq_nic_s *aq_nic, unsigned int idx, netif_napi_add(aq_nic_get_ndev(aq_nic), &self->napi, aq_vec_poll, AQ_CFG_NAPI_WEIGHT); +err_exit: + return self; +} + +int aq_vec_ring_alloc(struct aq_vec_s *self, struct aq_nic_s *aq_nic, + unsigned int idx, struct aq_nic_cfg_s *aq_nic_cfg) +{ + struct aq_ring_s *ring = NULL; + unsigned int i = 0U; + int err = 0; + for (i = 0; i < aq_nic_cfg->tcs; ++i) { - unsigned int idx_ring = AQ_NIC_TCVEC2RING(self->nic, - self->tx_rings, - self->aq_ring_param.vec_idx); + const unsigned int idx_ring = AQ_NIC_CFG_TCVEC2RING(aq_nic_cfg, + i, idx); ring = aq_ring_tx_alloc(&self->ring[i][AQ_VEC_TX_ID], aq_nic, idx_ring, aq_nic_cfg); @@ -156,11 +161,11 @@ struct aq_vec_s *aq_vec_alloc(struct aq_nic_s *aq_nic, unsigned int idx, err_exit: if (err < 0) { - aq_vec_free(self); + aq_vec_ring_free(self); self = NULL; } - return self; + return err; } int aq_vec_init(struct aq_vec_s *self, const struct aq_hw_ops *aq_hw_ops, @@ -270,6 +275,18 @@ err_exit:; void aq_vec_free(struct aq_vec_s *self) { + if (!self) + goto err_exit; + + netif_napi_del(&self->napi); + + kfree(self); + +err_exit:; +} + +void aq_vec_ring_free(struct aq_vec_s *self) +{ struct aq_ring_s *ring = NULL; unsigned int i = 0U; @@ -279,13 +296,12 @@ void aq_vec_free(struct aq_vec_s *self) for (i = 0U, ring = self->ring[0]; self->tx_rings > i; ++i, ring = self->ring[i]) { aq_ring_free(&ring[AQ_VEC_TX_ID]); - aq_ring_free(&ring[AQ_VEC_RX_ID]); + if (i < self->rx_rings) + aq_ring_free(&ring[AQ_VEC_RX_ID]); } - netif_napi_del(&self->napi); - - kfree(self); - + self->tx_rings = 0; + self->rx_rings = 0; err_exit:; } @@ -333,16 +349,14 @@ cpumask_t *aq_vec_get_affinity_mask(struct aq_vec_s *self) return &self->aq_ring_param.affinity_mask; } -void aq_vec_add_stats(struct aq_vec_s *self, - struct aq_ring_stats_rx_s *stats_rx, - struct aq_ring_stats_tx_s *stats_tx) +static void aq_vec_add_stats(struct aq_vec_s *self, + const unsigned int tc, + struct aq_ring_stats_rx_s *stats_rx, + struct aq_ring_stats_tx_s *stats_tx) { - struct aq_ring_s *ring = NULL; - unsigned int r = 0U; + struct aq_ring_s *ring = self->ring[tc]; - for (r = 0U, ring = self->ring[0]; - self->tx_rings > r; ++r, ring = self->ring[r]) { - struct aq_ring_stats_tx_s *tx = &ring[AQ_VEC_TX_ID].stats.tx; + if (tc < self->rx_rings) { struct aq_ring_stats_rx_s *rx = &ring[AQ_VEC_RX_ID].stats.rx; stats_rx->packets += rx->packets; @@ -353,6 +367,10 @@ void aq_vec_add_stats(struct aq_vec_s *self, stats_rx->pg_losts += rx->pg_losts; stats_rx->pg_flips += rx->pg_flips; stats_rx->pg_reuses += rx->pg_reuses; + } + + if (tc < self->tx_rings) { + struct aq_ring_stats_tx_s *tx = &ring[AQ_VEC_TX_ID].stats.tx; stats_tx->packets += tx->packets; 
stats_tx->bytes += tx->bytes; @@ -361,7 +379,8 @@ void aq_vec_add_stats(struct aq_vec_s *self, } } -int aq_vec_get_sw_stats(struct aq_vec_s *self, u64 *data, unsigned int *p_count) +int aq_vec_get_sw_stats(struct aq_vec_s *self, const unsigned int tc, u64 *data, + unsigned int *p_count) { struct aq_ring_stats_rx_s stats_rx; struct aq_ring_stats_tx_s stats_tx; @@ -369,7 +388,8 @@ int aq_vec_get_sw_stats(struct aq_vec_s *self, u64 *data, unsigned int *p_count) memset(&stats_rx, 0U, sizeof(struct aq_ring_stats_rx_s)); memset(&stats_tx, 0U, sizeof(struct aq_ring_stats_tx_s)); - aq_vec_add_stats(self, &stats_rx, &stats_tx); + + aq_vec_add_stats(self, tc, &stats_rx, &stats_tx); /* This data should mimic aq_ethtool_queue_stat_names structure */ diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.h b/drivers/net/ethernet/aquantia/atlantic/aq_vec.h index 0fe8e0904c7f..541af85e6510 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.h @@ -25,17 +25,17 @@ irqreturn_t aq_vec_isr(int irq, void *private); irqreturn_t aq_vec_isr_legacy(int irq, void *private); struct aq_vec_s *aq_vec_alloc(struct aq_nic_s *aq_nic, unsigned int idx, struct aq_nic_cfg_s *aq_nic_cfg); +int aq_vec_ring_alloc(struct aq_vec_s *self, struct aq_nic_s *aq_nic, + unsigned int idx, struct aq_nic_cfg_s *aq_nic_cfg); int aq_vec_init(struct aq_vec_s *self, const struct aq_hw_ops *aq_hw_ops, struct aq_hw_s *aq_hw); void aq_vec_deinit(struct aq_vec_s *self); void aq_vec_free(struct aq_vec_s *self); +void aq_vec_ring_free(struct aq_vec_s *self); int aq_vec_start(struct aq_vec_s *self); void aq_vec_stop(struct aq_vec_s *self); cpumask_t *aq_vec_get_affinity_mask(struct aq_vec_s *self); -int aq_vec_get_sw_stats(struct aq_vec_s *self, u64 *data, +int aq_vec_get_sw_stats(struct aq_vec_s *self, const unsigned int tc, u64 *data, unsigned int *p_count); -void aq_vec_add_stats(struct aq_vec_s *self, - struct aq_ring_stats_rx_s *stats_rx, - struct aq_ring_stats_tx_s *stats_tx); #endif /* AQ_VEC_H */ diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c index 1b0670a8ae33..a312864969af 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c @@ -21,7 +21,7 @@ .msix_irqs = 4U, \ .irq_mask = ~0U, \ .vecs = HW_ATL_A0_RSS_MAX, \ - .tcs = HW_ATL_A0_TC_MAX, \ + .tcs_max = HW_ATL_A0_TC_MAX, \ .rxd_alignment = 1U, \ .rxd_size = HW_ATL_A0_RXD_SIZE, \ .rxds_max = HW_ATL_A0_MAX_RXD, \ @@ -136,10 +136,10 @@ static int hw_atl_a0_hw_qos_set(struct aq_hw_s *self) hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U); hw_atl_tps_tx_pkt_shed_data_arb_mode_set(self, 0U); - hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(self, 0xFFF, 0U); - hw_atl_tps_tx_pkt_shed_tc_data_weight_set(self, 0x64, 0U); - hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, 0x50, 0U); - hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, 0x1E, 0U); + hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(self, 0U, 0xFFF); + hw_atl_tps_tx_pkt_shed_tc_data_weight_set(self, 0U, 0x64); + hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, 0U, 0x50); + hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, 0U, 0x1E); /* Tx buf size */ buff_size = HW_ATL_A0_TXBUF_MAX; diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index fa3cd7e9954b..14d79f70cad7 100644 --- 
a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -23,7 +23,7 @@ .msix_irqs = 8U, \ .irq_mask = ~0U, \ .vecs = HW_ATL_B0_RSS_MAX, \ - .tcs = HW_ATL_B0_TC_MAX, \ + .tcs_max = HW_ATL_B0_TC_MAX, \ .rxd_alignment = 1U, \ .rxd_size = HW_ATL_B0_RXD_SIZE, \ .rxds_max = HW_ATL_B0_MAX_RXD, \ @@ -46,7 +46,8 @@ NETIF_F_HW_VLAN_CTAG_RX | \ NETIF_F_HW_VLAN_CTAG_TX | \ NETIF_F_GSO_UDP_L4 | \ - NETIF_F_GSO_PARTIAL, \ + NETIF_F_GSO_PARTIAL | \ + NETIF_F_HW_TC, \ .hw_priv_flags = IFF_UNICAST_FLT, \ .flow_control = true, \ .mtu = HW_ATL_B0_MTU_JUMBO, \ @@ -114,12 +115,34 @@ static int hw_atl_b0_set_fc(struct aq_hw_s *self, u32 fc, u32 tc) return 0; } +static int hw_atl_b0_tc_ptp_set(struct aq_hw_s *self) +{ + /* Init TC2 for PTP_TX */ + hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, HW_ATL_B0_PTP_TXBUF_SIZE, + AQ_HW_PTP_TC); + + /* Init TC2 for PTP_RX */ + hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, HW_ATL_B0_PTP_RXBUF_SIZE, + AQ_HW_PTP_TC); + /* No flow control for PTP */ + hw_atl_rpb_rx_xoff_en_per_tc_set(self, 0U, AQ_HW_PTP_TC); + + return aq_hw_err_from_flags(self); +} + static int hw_atl_b0_hw_qos_set(struct aq_hw_s *self) { - unsigned int i_priority = 0U; - u32 buff_size = 0U; + struct aq_nic_cfg_s *cfg = self->aq_nic_cfg; + u32 tx_buff_size = HW_ATL_B0_TXBUF_MAX; + u32 rx_buff_size = HW_ATL_B0_RXBUF_MAX; + unsigned int prio = 0U; u32 tc = 0U; + if (cfg->is_ptp) { + tx_buff_size -= HW_ATL_B0_PTP_TXBUF_SIZE; + rx_buff_size -= HW_ATL_B0_PTP_RXBUF_SIZE; + } + /* TPS Descriptor rate init */ hw_atl_tps_tx_pkt_shed_desc_rate_curr_time_res_set(self, 0x0U); hw_atl_tps_tx_pkt_shed_desc_rate_lim_set(self, 0xA); @@ -127,63 +150,39 @@ static int hw_atl_b0_hw_qos_set(struct aq_hw_s *self) /* TPS VM init */ hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(self, 0U); - /* TPS TC credits init */ - hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U); - hw_atl_tps_tx_pkt_shed_data_arb_mode_set(self, 0U); - - tc = 0; - - /* TX Packet Scheduler Data TC0 */ - hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(self, 0xFFF, tc); - hw_atl_tps_tx_pkt_shed_tc_data_weight_set(self, 0x64, tc); - hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, 0x50, tc); - hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, 0x1E, tc); - - /* Tx buf size TC0 */ - buff_size = HW_ATL_B0_TXBUF_MAX - HW_ATL_B0_PTP_TXBUF_SIZE; - - hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, buff_size, tc); - hw_atl_tpb_tx_buff_hi_threshold_per_tc_set(self, - (buff_size * - (1024 / 32U) * 66U) / - 100U, tc); - hw_atl_tpb_tx_buff_lo_threshold_per_tc_set(self, - (buff_size * - (1024 / 32U) * 50U) / - 100U, tc); - /* Init TC2 for PTP_TX */ - tc = 2; + tx_buff_size /= cfg->tcs; + rx_buff_size /= cfg->tcs; + for (tc = 0; tc < cfg->tcs; tc++) { + u32 threshold = 0U; - hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, HW_ATL_B0_PTP_TXBUF_SIZE, - tc); + /* Tx buf size TC0 */ + hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, tx_buff_size, tc); - /* QoS Rx buf size per TC */ - tc = 0; - buff_size = HW_ATL_B0_RXBUF_MAX - HW_ATL_B0_PTP_RXBUF_SIZE; + threshold = (tx_buff_size * (1024 / 32U) * 66U) / 100U; + hw_atl_tpb_tx_buff_hi_threshold_per_tc_set(self, threshold, tc); - hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, buff_size, tc); - hw_atl_rpb_rx_buff_hi_threshold_per_tc_set(self, - (buff_size * - (1024U / 32U) * 66U) / - 100U, tc); - hw_atl_rpb_rx_buff_lo_threshold_per_tc_set(self, - (buff_size * - (1024U / 32U) * 50U) / - 100U, tc); + threshold = (tx_buff_size * (1024 / 32U) * 50U) / 100U; + 
hw_atl_tpb_tx_buff_lo_threshold_per_tc_set(self, threshold, tc); - hw_atl_b0_set_fc(self, self->aq_nic_cfg->fc.req, tc); + /* QoS Rx buf size per TC */ + hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, rx_buff_size, tc); - /* Init TC2 for PTP_RX */ - tc = 2; + threshold = (rx_buff_size * (1024U / 32U) * 66U) / 100U; + hw_atl_rpb_rx_buff_hi_threshold_per_tc_set(self, threshold, tc); - hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, HW_ATL_B0_PTP_RXBUF_SIZE, - tc); - /* No flow control for PTP */ - hw_atl_rpb_rx_xoff_en_per_tc_set(self, 0U, tc); + threshold = (rx_buff_size * (1024U / 32U) * 50U) / 100U; + hw_atl_rpb_rx_buff_lo_threshold_per_tc_set(self, threshold, tc); + + hw_atl_b0_set_fc(self, self->aq_nic_cfg->fc.req, tc); + } + + if (cfg->is_ptp) + hw_atl_b0_tc_ptp_set(self); /* QoS 802.1p priority -> TC mapping */ - for (i_priority = 8U; i_priority--;) - hw_atl_rpf_rpb_user_priority_tc_map_set(self, i_priority, 0U); + for (prio = 0; prio < 8; ++prio) + hw_atl_rpf_rpb_user_priority_tc_map_set(self, prio, + cfg->prio_tc_map[prio]); return aq_hw_err_from_flags(self); } @@ -311,10 +310,124 @@ int hw_atl_b0_hw_offload_set(struct aq_hw_s *self, return aq_hw_err_from_flags(self); } +static int hw_atl_b0_hw_init_tx_tc_rate_limit(struct aq_hw_s *self) +{ + static const u32 max_weight = BIT(HW_ATL_TPS_DATA_TCTWEIGHT_WIDTH) - 1; + /* Scale factor is based on the number of bits in fractional portion */ + static const u32 scale = BIT(HW_ATL_TPS_DESC_RATE_Y_WIDTH); + static const u32 frac_msk = HW_ATL_TPS_DESC_RATE_Y_MSK >> + HW_ATL_TPS_DESC_RATE_Y_SHIFT; + const u32 link_speed = self->aq_link_status.mbps; + struct aq_nic_cfg_s *nic_cfg = self->aq_nic_cfg; + unsigned long num_min_rated_tcs = 0; + u32 tc_weight[AQ_CFG_TCS_MAX]; + u32 fixed_max_credit; + u8 min_rate_msk = 0; + u32 sum_weight = 0; + int tc; + + /* By default max_credit is based upon MTU (in unit of 64b) */ + fixed_max_credit = nic_cfg->aq_hw_caps->mtu / 64; + + if (link_speed) { + min_rate_msk = nic_cfg->tc_min_rate_msk & + (BIT(nic_cfg->tcs) - 1); + num_min_rated_tcs = hweight8(min_rate_msk); + } + + /* First, calculate weights where min_rate is specified */ + if (num_min_rated_tcs) { + for (tc = 0; tc != nic_cfg->tcs; tc++) { + if (!nic_cfg->tc_min_rate[tc]) { + tc_weight[tc] = 0; + continue; + } + + tc_weight[tc] = (-1L + link_speed + + nic_cfg->tc_min_rate[tc] * + max_weight) / + link_speed; + tc_weight[tc] = min(tc_weight[tc], max_weight); + sum_weight += tc_weight[tc]; + } + } + + /* WSP, if min_rate is set for at least one TC. + * RR otherwise. + */ + hw_atl_tps_tx_pkt_shed_data_arb_mode_set(self, min_rate_msk ? 1U : 0U); + /* Data TC Arbiter takes precedence over Descriptor TC Arbiter, + * leave Descriptor TC Arbiter as RR. + */ + hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U); + + hw_atl_tps_tx_desc_rate_mode_set(self, nic_cfg->is_qos ? 1U : 0U); + + for (tc = 0; tc != nic_cfg->tcs; tc++) { + const u32 en = (nic_cfg->tc_max_rate[tc] != 0) ? 
1U : 0U; + const u32 desc = AQ_NIC_CFG_TCVEC2RING(nic_cfg, tc, 0); + u32 weight, max_credit; + + hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, tc, + fixed_max_credit); + hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, tc, 0x1E); + + if (num_min_rated_tcs) { + weight = tc_weight[tc]; + + if (!weight && sum_weight < max_weight) + weight = (max_weight - sum_weight) / + (nic_cfg->tcs - num_min_rated_tcs); + else if (!weight) + weight = 0x64; + + max_credit = max(8 * weight, fixed_max_credit); + } else { + weight = 0x64; + max_credit = 0xFFF; + } + + hw_atl_tps_tx_pkt_shed_tc_data_weight_set(self, tc, weight); + hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(self, tc, + max_credit); + + hw_atl_tps_tx_desc_rate_en_set(self, desc, en); + + if (en) { + /* Nominal rate is always 10G */ + const u32 rate = 10000U * scale / + nic_cfg->tc_max_rate[tc]; + const u32 rate_int = rate >> + HW_ATL_TPS_DESC_RATE_Y_WIDTH; + const u32 rate_frac = rate & frac_msk; + + hw_atl_tps_tx_desc_rate_x_set(self, desc, rate_int); + hw_atl_tps_tx_desc_rate_y_set(self, desc, rate_frac); + } else { + /* A value of 1 indicates the queue is not + * rate controlled. + */ + hw_atl_tps_tx_desc_rate_x_set(self, desc, 1U); + hw_atl_tps_tx_desc_rate_y_set(self, desc, 0U); + } + } + for (tc = nic_cfg->tcs; tc != AQ_CFG_TCS_MAX; tc++) { + const u32 desc = AQ_NIC_CFG_TCVEC2RING(nic_cfg, tc, 0); + + hw_atl_tps_tx_desc_rate_en_set(self, desc, 0U); + hw_atl_tps_tx_desc_rate_x_set(self, desc, 1U); + hw_atl_tps_tx_desc_rate_y_set(self, desc, 0U); + } + + return aq_hw_err_from_flags(self); +} + static int hw_atl_b0_hw_init_tx_path(struct aq_hw_s *self) { + struct aq_nic_cfg_s *nic_cfg = self->aq_nic_cfg; + /* Tx TC/Queue number config */ - hw_atl_tpb_tps_tx_tc_mode_set(self, 1U); + hw_atl_tpb_tps_tx_tc_mode_set(self, nic_cfg->tc_mode); hw_atl_thm_lso_tcp_flag_of_first_pkt_set(self, 0x0FF6U); hw_atl_thm_lso_tcp_flag_of_middle_pkt_set(self, 0x0FF6U); @@ -334,20 +447,32 @@ static int hw_atl_b0_hw_init_tx_path(struct aq_hw_s *self) return aq_hw_err_from_flags(self); } +void hw_atl_b0_hw_init_rx_rss_ctrl1(struct aq_hw_s *self) +{ + struct aq_nic_cfg_s *cfg = self->aq_nic_cfg; + u32 rss_ctrl1 = HW_ATL_RSS_DISABLED; + + if (cfg->is_rss) + rss_ctrl1 = (cfg->tc_mode == AQ_TC_MODE_8TCS) ? + HW_ATL_RSS_ENABLED_8TCS_2INDEX_BITS : + HW_ATL_RSS_ENABLED_4TCS_3INDEX_BITS; + + hw_atl_reg_rx_flr_rss_control1set(self, rss_ctrl1); +} + static int hw_atl_b0_hw_init_rx_path(struct aq_hw_s *self) { struct aq_nic_cfg_s *cfg = self->aq_nic_cfg; int i; /* Rx TC/RSS number config */ - hw_atl_rpb_rpf_rx_traf_class_mode_set(self, 1U); + hw_atl_rpb_rpf_rx_traf_class_mode_set(self, cfg->tc_mode); /* Rx flow control */ hw_atl_rpb_rx_flow_ctl_mode_set(self, 1U); /* RSS Ring selection */ - hw_atl_reg_rx_flr_rss_control1set(self, cfg->is_rss ? 
- 0xB3333333U : 0x00000000U); + hw_atl_b0_hw_init_rx_rss_ctrl1(self); /* Multicast filters */ for (i = HW_ATL_B0_MAC_MAX; i--;) { @@ -1078,18 +1203,6 @@ int hw_atl_b0_hw_ring_rx_stop(struct aq_hw_s *self, struct aq_ring_s *ring) return aq_hw_err_from_flags(self); } -static int hw_atl_b0_tx_tc_mode_get(struct aq_hw_s *self, u32 *tc_mode) -{ - *tc_mode = hw_atl_tpb_tps_tx_tc_mode_get(self); - return aq_hw_err_from_flags(self); -} - -static int hw_atl_b0_rx_tc_mode_get(struct aq_hw_s *self, u32 *tc_mode) -{ - *tc_mode = hw_atl_rpb_rpf_rx_traf_class_mode_get(self); - return aq_hw_err_from_flags(self); -} - #define get_ptp_ts_val_u64(self, indx) \ ((u64)(hw_atl_pcs_ptp_clock_get(self, indx) & 0xffff)) @@ -1503,13 +1616,11 @@ const struct aq_hw_ops hw_atl_ops_b0 = { .hw_interrupt_moderation_set = hw_atl_b0_hw_interrupt_moderation_set, .hw_rss_set = hw_atl_b0_hw_rss_set, .hw_rss_hash_set = hw_atl_b0_hw_rss_hash_set, + .hw_tc_rate_limit_set = hw_atl_b0_hw_init_tx_tc_rate_limit, .hw_get_regs = hw_atl_utils_hw_get_regs, .hw_get_hw_stats = hw_atl_utils_get_hw_stats, .hw_get_fw_version = hw_atl_utils_get_fw_version, - .hw_tx_tc_mode_get = hw_atl_b0_tx_tc_mode_get, - .hw_rx_tc_mode_get = hw_atl_b0_rx_tc_mode_get, - .hw_ring_hwts_rx_fill = hw_atl_b0_hw_ring_hwts_rx_fill, .hw_ring_hwts_rx_receive = hw_atl_b0_hw_ring_hwts_rx_receive, diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.h index b855459272ca..30f468f2084d 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.h @@ -58,6 +58,8 @@ int hw_atl_b0_hw_ring_tx_head_update(struct aq_hw_s *self, int hw_atl_b0_hw_ring_tx_stop(struct aq_hw_s *self, struct aq_ring_s *ring); int hw_atl_b0_hw_ring_rx_stop(struct aq_hw_s *self, struct aq_ring_s *ring); +void hw_atl_b0_hw_init_rx_rss_ctrl1(struct aq_hw_s *self); + int hw_atl_b0_hw_mac_addr_set(struct aq_hw_s *self, u8 *mac_addr); int hw_atl_b0_hw_start(struct aq_hw_s *self); diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h index 7ab23a1751d3..cf460d61a45e 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h @@ -75,7 +75,7 @@ #define HW_ATL_B0_RSS_HASHKEY_BITS 320U #define HW_ATL_B0_TCRSS_4_8 1 -#define HW_ATL_B0_TC_MAX 1U +#define HW_ATL_B0_TC_MAX 8U #define HW_ATL_B0_RSS_MAX 8U #define HW_ATL_B0_LRO_RXD_MAX 16U @@ -151,6 +151,10 @@ #define HW_ATL_B0_MAX_RXD 8184U #define HW_ATL_B0_MAX_TXD 8184U +#define HW_ATL_RSS_DISABLED 0x00000000U +#define HW_ATL_RSS_ENABLED_8TCS_2INDEX_BITS 0xA2222222U +#define HW_ATL_RSS_ENABLED_4TCS_3INDEX_BITS 0x80003333U + /* HW layer capabilities */ #endif /* HW_ATL_B0_INTERNAL_H */ diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c index 9e2d01a6aac8..3c8e8047ea1e 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c @@ -754,7 +754,7 @@ void hw_atl_rpfl2_accept_all_mc_packets_set(struct aq_hw_s *aq_hw, } void hw_atl_rpf_rpb_user_priority_tc_map_set(struct aq_hw_s *aq_hw, - u32 user_priority_tc_map, u32 tc) + u32 user_priority, u32 tc) { /* register address for bitfield rx_tc_up{t}[2:0] */ static u32 rpf_rpb_rx_tc_upt_adr[8] = { @@ -773,10 +773,9 @@ void 
hw_atl_rpf_rpb_user_priority_tc_map_set(struct aq_hw_s *aq_hw, 0U, 4U, 8U, 12U, 16U, 20U, 24U, 28U }; - aq_hw_write_reg_bit(aq_hw, rpf_rpb_rx_tc_upt_adr[tc], - rpf_rpb_rx_tc_upt_msk[tc], - rpf_rpb_rx_tc_upt_shft[tc], - user_priority_tc_map); + aq_hw_write_reg_bit(aq_hw, rpf_rpb_rx_tc_upt_adr[user_priority], + rpf_rpb_rx_tc_upt_msk[user_priority], + rpf_rpb_rx_tc_upt_shft[user_priority], tc); } void hw_atl_rpf_rss_key_addr_set(struct aq_hw_s *aq_hw, u32 rss_key_addr) @@ -1464,8 +1463,8 @@ void hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(struct aq_hw_s *aq_hw, } void hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(struct aq_hw_s *aq_hw, - u32 max_credit, - u32 tc) + const u32 tc, + const u32 max_credit) { aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DESC_TCTCREDIT_MAX_ADR(tc), HW_ATL_TPS_DESC_TCTCREDIT_MAX_MSK, @@ -1474,13 +1473,13 @@ void hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(struct aq_hw_s *aq_hw, } void hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(struct aq_hw_s *aq_hw, - u32 tx_pkt_shed_desc_tc_weight, - u32 tc) + const u32 tc, + const u32 weight) { aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DESC_TCTWEIGHT_ADR(tc), HW_ATL_TPS_DESC_TCTWEIGHT_MSK, HW_ATL_TPS_DESC_TCTWEIGHT_SHIFT, - tx_pkt_shed_desc_tc_weight); + weight); } void hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(struct aq_hw_s *aq_hw, @@ -1493,8 +1492,8 @@ void hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(struct aq_hw_s *aq_hw, } void hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw, - u32 max_credit, - u32 tc) + const u32 tc, + const u32 max_credit) { aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DATA_TCTCREDIT_MAX_ADR(tc), HW_ATL_TPS_DATA_TCTCREDIT_MAX_MSK, @@ -1503,13 +1502,49 @@ void hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw, } void hw_atl_tps_tx_pkt_shed_tc_data_weight_set(struct aq_hw_s *aq_hw, - u32 tx_pkt_shed_tc_data_weight, - u32 tc) + const u32 tc, + const u32 weight) { aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DATA_TCTWEIGHT_ADR(tc), HW_ATL_TPS_DATA_TCTWEIGHT_MSK, HW_ATL_TPS_DATA_TCTWEIGHT_SHIFT, - tx_pkt_shed_tc_data_weight); + weight); +} + +void hw_atl_tps_tx_desc_rate_mode_set(struct aq_hw_s *aq_hw, + const u32 rate_mode) +{ + aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_TX_DESC_RATE_MODE_ADR, + HW_ATL_TPS_TX_DESC_RATE_MODE_MSK, + HW_ATL_TPS_TX_DESC_RATE_MODE_SHIFT, + rate_mode); +} + +void hw_atl_tps_tx_desc_rate_en_set(struct aq_hw_s *aq_hw, const u32 desc, + const u32 enable) +{ + aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DESC_RATE_EN_ADR(desc), + HW_ATL_TPS_DESC_RATE_EN_MSK, + HW_ATL_TPS_DESC_RATE_EN_SHIFT, + enable); +} + +void hw_atl_tps_tx_desc_rate_x_set(struct aq_hw_s *aq_hw, const u32 desc, + const u32 rate_int) +{ + aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DESC_RATE_X_ADR(desc), + HW_ATL_TPS_DESC_RATE_X_MSK, + HW_ATL_TPS_DESC_RATE_X_SHIFT, + rate_int); +} + +void hw_atl_tps_tx_desc_rate_y_set(struct aq_hw_s *aq_hw, const u32 desc, + const u32 rate_frac) +{ + aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DESC_RATE_Y_ADR(desc), + HW_ATL_TPS_DESC_RATE_Y_MSK, + HW_ATL_TPS_DESC_RATE_Y_SHIFT, + rate_frac); } /* tx */ diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.h index b88cb84805d5..61a6f70c51cd 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.h @@ -688,13 +688,13 @@ void hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(struct aq_hw_s *aq_hw, /* set tx packet scheduler descriptor tc max credit */ void 
hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(struct aq_hw_s *aq_hw, - u32 max_credit, - u32 tc); + const u32 tc, + const u32 max_credit); /* set tx packet scheduler descriptor tc weight */ void hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(struct aq_hw_s *aq_hw, - u32 tx_pkt_shed_desc_tc_weight, - u32 tc); + const u32 tc, + const u32 weight); /* set tx packet scheduler descriptor vm arbitration mode */ void hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(struct aq_hw_s *aq_hw, @@ -702,13 +702,29 @@ void hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(struct aq_hw_s *aq_hw, /* set tx packet scheduler tc data max credit */ void hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw, - u32 max_credit, - u32 tc); + const u32 tc, + const u32 max_credit); /* set tx packet scheduler tc data weight */ void hw_atl_tps_tx_pkt_shed_tc_data_weight_set(struct aq_hw_s *aq_hw, - u32 tx_pkt_shed_tc_data_weight, - u32 tc); + const u32 tc, + const u32 weight); + +/* set tx descriptor rate mode */ +void hw_atl_tps_tx_desc_rate_mode_set(struct aq_hw_s *aq_hw, + const u32 rate_mode); + +/* set tx packet scheduler descriptor rate enable */ +void hw_atl_tps_tx_desc_rate_en_set(struct aq_hw_s *aq_hw, const u32 desc, + const u32 enable); + +/* set tx packet scheduler descriptor rate integral value */ +void hw_atl_tps_tx_desc_rate_x_set(struct aq_hw_s *aq_hw, const u32 desc, + const u32 rate_int); + +/* set tx packet scheduler descriptor rate fractional value */ +void hw_atl_tps_tx_desc_rate_y_set(struct aq_hw_s *aq_hw, const u32 desc, + const u32 rate_frac); /* tx */ diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h index 18de2f7b8959..06220792daf1 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h @@ -2038,6 +2038,42 @@ /* default value of bitfield lso_tcp_flag_mid[b:0] */ #define HW_ATL_THM_LSO_TCP_FLAG_MID_DEFAULT 0x0 +/* tx tx_tc_mode bitfield definitions + * preprocessor definitions for the bitfield "tx_tc_mode". + * port="pif_tpb_tx_tc_mode_i,pif_tps_tx_tc_mode_i" + */ + +/* register address for bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_ADDR 0x00007900 +/* bitmask for bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_MSK 0x00000100 +/* inverted bitmask for bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_MSKN 0xFFFFFEFF +/* lower bit position of bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_SHIFT 8 +/* width of bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_WIDTH 1 +/* default value of bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_DEFAULT 0x0 + +/* tx tx_desc_rate_mode bitfield definitions + * preprocessor definitions for the bitfield "tx_desc_rate_mode". 
+ * port="pif_tps_desc_rate_mode_i" + */ + +/* register address for bitfield tx_desc_rate_mode */ +#define HW_ATL_TPS_TX_DESC_RATE_MODE_ADR 0x00007900 +/* bitmask for bitfield tx_desc_rate_mode */ +#define HW_ATL_TPS_TX_DESC_RATE_MODE_MSK 0x00000080 +/* inverted bitmask for bitfield tx_desc_rate_mode */ +#define HW_ATL_TPS_TX_DESC_RATE_MODE_MSKN 0xFFFFFF7F +/* lower bit position of bitfield tx_desc_rate_mode */ +#define HW_ATL_TPS_TX_DESC_RATE_MODE_SHIFT 7 +/* width of bitfield tx_desc_rate_mode */ +#define HW_ATL_TPS_TX_DESC_RATE_MODE_WIDTH 1 +/* default value of bitfield tx_desc_rate_mode */ +#define HW_ATL_TPS_TX_DESC_RATE_MODE_DEFAULT 0x0 + /* tx tx_buf_en bitfield definitions * preprocessor definitions for the bitfield "tx_buf_en". * port="pif_tpb_tx_buf_en_i" @@ -2056,19 +2092,6 @@ /* default value of bitfield tx_buf_en */ #define HW_ATL_TPB_TX_BUF_EN_DEFAULT 0x0 -/* register address for bitfield tx_tc_mode */ -#define HW_ATL_TPB_TX_TC_MODE_ADDR 0x00007900 -/* bitmask for bitfield tx_tc_mode */ -#define HW_ATL_TPB_TX_TC_MODE_MSK 0x00000100 -/* inverted bitmask for bitfield tx_tc_mode */ -#define HW_ATL_TPB_TX_TC_MODE_MSKN 0xFFFFFEFF -/* lower bit position of bitfield tx_tc_mode */ -#define HW_ATL_TPB_TX_TC_MODE_SHIFT 8 -/* width of bitfield tx_tc_mode */ -#define HW_ATL_TPB_TX_TC_MODE_WIDTH 1 -/* default value of bitfield tx_tc_mode */ -#define HW_ATL_TPB_TX_TC_MODE_DEFAULT 0x0 - /* tx tx{b}_hi_thresh[c:0] bitfield definitions * preprocessor definitions for the bitfield "tx{b}_hi_thresh[c:0]". * parameter: buffer {b} | stride size 0x10 | range [0, 7] @@ -2270,6 +2293,58 @@ /* default value of bitfield data_tc_arb_mode */ #define HW_ATL_TPS_DATA_TC_ARB_MODE_DEFAULT 0x0 +/* tx desc{r}_rate_en bitfield definitions + * preprocessor definitions for the bitfield "desc{r}_rate_en". + * port="pif_tps_desc_rate_en_i[0]" + */ + +/* register address for bitfield desc{r}_rate_en */ +#define HW_ATL_TPS_DESC_RATE_EN_ADR(desc) (0x00007408 + (desc) * 0x10) +/* bitmask for bitfield desc{r}_rate_en */ +#define HW_ATL_TPS_DESC_RATE_EN_MSK 0x80000000 +/* inverted bitmask for bitfield desc{r}_rate_en */ +#define HW_ATL_TPS_DESC_RATE_EN_MSKN 0x7FFFFFFF +/* lower bit position of bitfield desc{r}_rate_en */ +#define HW_ATL_TPS_DESC_RATE_EN_SHIFT 31 +/* width of bitfield desc{r}_rate_en */ +#define HW_ATL_TPS_DESC_RATE_EN_WIDTH 1 +/* default value of bitfield desc{r}_rate_en */ +#define HW_ATL_TPS_DESC_RATE_EN_DEFAULT 0x0 + +/* tx desc{r}_rate_x bitfield definitions + * preprocessor definitions for the bitfield "desc{r}_rate_x". + * port="pif_tps_desc0_rate_x" + */ +/* register address for bitfield desc{r}_rate_x */ +#define HW_ATL_TPS_DESC_RATE_X_ADR(desc) (0x00007408 + (desc) * 0x10) +/* bitmask for bitfield desc{r}_rate_x */ +#define HW_ATL_TPS_DESC_RATE_X_MSK 0x03FF0000 +/* inverted bitmask for bitfield desc{r}_rate_x */ +#define HW_ATL_TPS_DESC_RATE_X_MSKN 0xFC00FFFF +/* lower bit position of bitfield desc{r}_rate_x */ +#define HW_ATL_TPS_DESC_RATE_X_SHIFT 16 +/* width of bitfield desc{r}_rate_x */ +#define HW_ATL_TPS_DESC_RATE_X_WIDTH 10 +/* default value of bitfield desc{r}_rate_x */ +#define HW_ATL_TPS_DESC_RATE_X_DEFAULT 0x0 + +/* tx desc{r}_rate_y bitfield definitions + * preprocessor definitions for the bitfield "desc{r}_rate_y". 
+ * port="pif_tps_desc0_rate_y" + */ +/* register address for bitfield desc{r}_rate_y */ +#define HW_ATL_TPS_DESC_RATE_Y_ADR(desc) (0x00007408 + (desc) * 0x10) +/* bitmask for bitfield desc{r}_rate_y */ +#define HW_ATL_TPS_DESC_RATE_Y_MSK 0x00003FFF +/* inverted bitmask for bitfield desc{r}_rate_y */ +#define HW_ATL_TPS_DESC_RATE_Y_MSKN 0xFFFFC000 +/* lower bit position of bitfield desc{r}_rate_y */ +#define HW_ATL_TPS_DESC_RATE_Y_SHIFT 0 +/* width of bitfield desc{r}_rate_y */ +#define HW_ATL_TPS_DESC_RATE_Y_WIDTH 14 +/* default value of bitfield desc{r}_rate_y */ +#define HW_ATL_TPS_DESC_RATE_Y_DEFAULT 0x0 + /* tx desc_rate_ta_rst bitfield definitions * preprocessor definitions for the bitfield "desc_rate_ta_rst". * port="pif_tps_desc_rate_ta_rst_i" diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c index 6f2b33ae3d06..8df9d4ef36f0 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c @@ -10,6 +10,7 @@ #include "hw_atl/hw_atl_b0.h" #include "hw_atl/hw_atl_utils.h" #include "hw_atl/hw_atl_llh.h" +#include "hw_atl/hw_atl_llh_internal.h" #include "hw_atl2_utils.h" #include "hw_atl2_llh.h" #include "hw_atl2_internal.h" @@ -23,7 +24,7 @@ static int hw_atl2_act_rslvr_table_set(struct aq_hw_s *self, u8 location, .msix_irqs = 8U, \ .irq_mask = ~0U, \ .vecs = HW_ATL2_RSS_MAX, \ - .tcs = HW_ATL2_TC_MAX, \ + .tcs_max = HW_ATL2_TC_MAX, \ .rxd_alignment = 1U, \ .rxd_size = HW_ATL2_RXD_SIZE, \ .rxds_max = HW_ATL2_MAX_RXD, \ @@ -47,7 +48,8 @@ static int hw_atl2_act_rslvr_table_set(struct aq_hw_s *self, u8 location, NETIF_F_HW_VLAN_CTAG_RX | \ NETIF_F_HW_VLAN_CTAG_TX | \ NETIF_F_GSO_UDP_L4 | \ - NETIF_F_GSO_PARTIAL, \ + NETIF_F_GSO_PARTIAL | \ + NETIF_F_HW_TC, \ .hw_priv_flags = IFF_UNICAST_FLT, \ .flow_control = true, \ .mtu = HW_ATL2_MTU_JUMBO, \ @@ -91,16 +93,49 @@ static int hw_atl2_hw_reset(struct aq_hw_s *self) static int hw_atl2_hw_queue_to_tc_map_set(struct aq_hw_s *self) { - if (!hw_atl_rpb_rpf_rx_traf_class_mode_get(self)) { - aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(0), 0x11110000); - aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(8), 0x33332222); - aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(16), 0x55554444); - aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(24), 0x77776666); - } else { - aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(0), 0x00000000); - aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(8), 0x11111111); - aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(16), 0x22222222); - aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(24), 0x33333333); + struct aq_nic_cfg_s *cfg = self->aq_nic_cfg; + unsigned int tcs, q_per_tc; + unsigned int tc, q; + u32 rx_map = 0; + u32 tx_map = 0; + + hw_atl2_tpb_tx_tc_q_rand_map_en_set(self, 1U); + + switch (cfg->tc_mode) { + case AQ_TC_MODE_8TCS: + tcs = 8; + q_per_tc = 4; + break; + case AQ_TC_MODE_4TCS: + tcs = 4; + q_per_tc = 8; + break; + default: + return -EINVAL; + } + + for (tc = 0; tc != tcs; tc++) { + unsigned int tc_q_offset = tc * q_per_tc; + + for (q = tc_q_offset; q != tc_q_offset + q_per_tc; q++) { + rx_map |= tc << HW_ATL2_RX_Q_TC_MAP_SHIFT(q); + if (HW_ATL2_RX_Q_TC_MAP_ADR(q) != + HW_ATL2_RX_Q_TC_MAP_ADR(q + 1)) { + aq_hw_write_reg(self, + HW_ATL2_RX_Q_TC_MAP_ADR(q), + rx_map); + rx_map = 0; + } + + tx_map |= tc << HW_ATL2_TX_Q_TC_MAP_SHIFT(q); + if (HW_ATL2_TX_Q_TC_MAP_ADR(q) != + HW_ATL2_TX_Q_TC_MAP_ADR(q + 1)) { + aq_hw_write_reg(self, + HW_ATL2_TX_Q_TC_MAP_ADR(q), + tx_map); + tx_map 
= 0; + } + } } return aq_hw_err_from_flags(self); @@ -112,7 +147,6 @@ static int hw_atl2_hw_qos_set(struct aq_hw_s *self) u32 tx_buff_size = HW_ATL2_TXBUF_MAX; u32 rx_buff_size = HW_ATL2_RXBUF_MAX; unsigned int prio = 0U; - u32 threshold = 0U; u32 tc = 0U; /* TPS Descriptor rate init */ @@ -122,42 +156,36 @@ static int hw_atl2_hw_qos_set(struct aq_hw_s *self) /* TPS VM init */ hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(self, 0U); - /* TPS TC credits init */ - hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U); - hw_atl_tps_tx_pkt_shed_data_arb_mode_set(self, 0U); + tx_buff_size /= cfg->tcs; + rx_buff_size /= cfg->tcs; + for (tc = 0; tc < cfg->tcs; tc++) { + u32 threshold = 0U; - tc = 0; + /* Tx buf size TC0 */ + hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, tx_buff_size, tc); - /* TX Packet Scheduler Data TC0 */ - hw_atl2_tps_tx_pkt_shed_tc_data_max_credit_set(self, 0xFFF0, tc); - hw_atl2_tps_tx_pkt_shed_tc_data_weight_set(self, 0x640, tc); - hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, 0x50, tc); - hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, 0x1E, tc); + threshold = (tx_buff_size * (1024 / 32U) * 66U) / 100U; + hw_atl_tpb_tx_buff_hi_threshold_per_tc_set(self, threshold, tc); - /* Tx buf size TC0 */ - hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, tx_buff_size, tc); + threshold = (tx_buff_size * (1024 / 32U) * 50U) / 100U; + hw_atl_tpb_tx_buff_lo_threshold_per_tc_set(self, threshold, tc); - threshold = (tx_buff_size * (1024 / 32U) * 66U) / 100U; - hw_atl_tpb_tx_buff_hi_threshold_per_tc_set(self, threshold, tc); + /* QoS Rx buf size per TC */ + hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, rx_buff_size, tc); - threshold = (tx_buff_size * (1024 / 32U) * 50U) / 100U; - hw_atl_tpb_tx_buff_lo_threshold_per_tc_set(self, threshold, tc); + threshold = (rx_buff_size * (1024U / 32U) * 66U) / 100U; + hw_atl_rpb_rx_buff_hi_threshold_per_tc_set(self, threshold, tc); - /* QoS Rx buf size per TC */ - hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, rx_buff_size, tc); - - threshold = (rx_buff_size * (1024U / 32U) * 66U) / 100U; - hw_atl_rpb_rx_buff_hi_threshold_per_tc_set(self, threshold, tc); - - threshold = (rx_buff_size * (1024U / 32U) * 50U) / 100U; - hw_atl_rpb_rx_buff_lo_threshold_per_tc_set(self, threshold, tc); + threshold = (rx_buff_size * (1024U / 32U) * 50U) / 100U; + hw_atl_rpb_rx_buff_lo_threshold_per_tc_set(self, threshold, tc); + } /* QoS 802.1p priority -> TC mapping */ for (prio = 0; prio < 8; ++prio) hw_atl_rpf_rpb_user_priority_tc_map_set(self, prio, - cfg->tcs * prio / 8); + cfg->prio_tc_map[prio]); - /* ATL2 Apply legacy ring to TC mapping */ + /* ATL2 Apply ring to TC mapping */ hw_atl2_hw_queue_to_tc_map_set(self); return aq_hw_err_from_flags(self); @@ -166,19 +194,149 @@ static int hw_atl2_hw_qos_set(struct aq_hw_s *self) static int hw_atl2_hw_rss_set(struct aq_hw_s *self, struct aq_rss_parameters *rss_params) { - u8 *indirection_table = rss_params->indirection_table; + u8 *indirection_table = rss_params->indirection_table; + const u32 num_tcs = aq_hw_num_tcs(self); + u32 rpf_redir2_enable; + int tc; int i; - for (i = HW_ATL2_RSS_REDIRECTION_MAX; i--;) - hw_atl2_new_rpf_rss_redir_set(self, 0, i, indirection_table[i]); + rpf_redir2_enable = num_tcs > 4 ? 
1 : 0; + + hw_atl2_rpf_redirection_table2_select_set(self, rpf_redir2_enable); + + for (i = HW_ATL2_RSS_REDIRECTION_MAX; i--;) { + for (tc = 0; tc != num_tcs; tc++) { + hw_atl2_new_rpf_rss_redir_set(self, tc, i, + tc * + aq_hw_q_per_tc(self) + + indirection_table[i]); + } + } + + return aq_hw_err_from_flags(self); +} + +static int hw_atl2_hw_init_tx_tc_rate_limit(struct aq_hw_s *self) +{ + static const u32 max_weight = BIT(HW_ATL2_TPS_DATA_TCTWEIGHT_WIDTH) - 1; + /* Scale factor is based on the number of bits in fractional portion */ + static const u32 scale = BIT(HW_ATL_TPS_DESC_RATE_Y_WIDTH); + static const u32 frac_msk = HW_ATL_TPS_DESC_RATE_Y_MSK >> + HW_ATL_TPS_DESC_RATE_Y_SHIFT; + const u32 link_speed = self->aq_link_status.mbps; + struct aq_nic_cfg_s *nic_cfg = self->aq_nic_cfg; + unsigned long num_min_rated_tcs = 0; + u32 tc_weight[AQ_CFG_TCS_MAX]; + u32 fixed_max_credit_4b; + u32 fixed_max_credit; + u8 min_rate_msk = 0; + u32 sum_weight = 0; + int tc; + + /* By default max_credit is based upon MTU (in unit of 64b) */ + fixed_max_credit = nic_cfg->aq_hw_caps->mtu / 64; + /* in unit of 4b */ + fixed_max_credit_4b = nic_cfg->aq_hw_caps->mtu / 4; + + if (link_speed) { + min_rate_msk = nic_cfg->tc_min_rate_msk & + (BIT(nic_cfg->tcs) - 1); + num_min_rated_tcs = hweight8(min_rate_msk); + } + + /* First, calculate weights where min_rate is specified */ + if (num_min_rated_tcs) { + for (tc = 0; tc != nic_cfg->tcs; tc++) { + if (!nic_cfg->tc_min_rate[tc]) { + tc_weight[tc] = 0; + continue; + } + + tc_weight[tc] = (-1L + link_speed + + nic_cfg->tc_min_rate[tc] * + max_weight) / + link_speed; + tc_weight[tc] = min(tc_weight[tc], max_weight); + sum_weight += tc_weight[tc]; + } + } + + /* WSP, if min_rate is set for at least one TC. + * RR otherwise. + */ + hw_atl2_tps_tx_pkt_shed_data_arb_mode_set(self, min_rate_msk ? 1U : 0U); + /* Data TC Arbiter takes precedence over Descriptor TC Arbiter, + * leave Descriptor TC Arbiter as RR. + */ + hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U); + + hw_atl_tps_tx_desc_rate_mode_set(self, nic_cfg->is_qos ? 1U : 0U); + + for (tc = 0; tc != nic_cfg->tcs; tc++) { + const u32 en = (nic_cfg->tc_max_rate[tc] != 0) ? 1U : 0U; + const u32 desc = AQ_NIC_CFG_TCVEC2RING(nic_cfg, tc, 0); + u32 weight, max_credit; + + hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, tc, + fixed_max_credit); + hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, tc, 0x1E); + + if (num_min_rated_tcs) { + weight = tc_weight[tc]; + + if (!weight && sum_weight < max_weight) + weight = (max_weight - sum_weight) / + (nic_cfg->tcs - num_min_rated_tcs); + else if (!weight) + weight = 0x640; + + max_credit = max(2 * weight, fixed_max_credit_4b); + } else { + weight = 0x640; + max_credit = 0xFFF0; + } + + hw_atl2_tps_tx_pkt_shed_tc_data_weight_set(self, tc, weight); + hw_atl2_tps_tx_pkt_shed_tc_data_max_credit_set(self, tc, + max_credit); + + hw_atl_tps_tx_desc_rate_en_set(self, desc, en); + + if (en) { + /* Nominal rate is always 10G */ + const u32 rate = 10000U * scale / + nic_cfg->tc_max_rate[tc]; + const u32 rate_int = rate >> + HW_ATL_TPS_DESC_RATE_Y_WIDTH; + const u32 rate_frac = rate & frac_msk; + + hw_atl_tps_tx_desc_rate_x_set(self, desc, rate_int); + hw_atl_tps_tx_desc_rate_y_set(self, desc, rate_frac); + } else { + /* A value of 1 indicates the queue is not + * rate controlled. 
+ */ + hw_atl_tps_tx_desc_rate_x_set(self, desc, 1U); + hw_atl_tps_tx_desc_rate_y_set(self, desc, 0U); + } + } + for (tc = nic_cfg->tcs; tc != AQ_CFG_TCS_MAX; tc++) { + const u32 desc = AQ_NIC_CFG_TCVEC2RING(nic_cfg, tc, 0); + + hw_atl_tps_tx_desc_rate_en_set(self, desc, 0U); + hw_atl_tps_tx_desc_rate_x_set(self, desc, 1U); + hw_atl_tps_tx_desc_rate_y_set(self, desc, 0U); + } return aq_hw_err_from_flags(self); } static int hw_atl2_hw_init_tx_path(struct aq_hw_s *self) { + struct aq_nic_cfg_s *nic_cfg = self->aq_nic_cfg; + /* Tx TC/RSS number config */ - hw_atl_tpb_tps_tx_tc_mode_set(self, 1U); + hw_atl_tpb_tps_tx_tc_mode_set(self, nic_cfg->tc_mode); hw_atl_thm_lso_tcp_flag_of_first_pkt_set(self, 0x0FF6U); hw_atl_thm_lso_tcp_flag_of_middle_pkt_set(self, 0x0FF6U); @@ -201,13 +359,29 @@ static int hw_atl2_hw_init_tx_path(struct aq_hw_s *self) static void hw_atl2_hw_init_new_rx_filters(struct aq_hw_s *self) { struct hw_atl2_priv *priv = (struct hw_atl2_priv *)self->priv; + u8 *prio_tc_map = self->aq_nic_cfg->prio_tc_map; + u16 action; u8 index; + int i; + /* Action Resolver Table (ART) is used by RPF to decide which action + * to take with a packet based upon input tag and tag mask, where: + * - input tag is a combination of 3-bit VLan Prio (PTP) and + * 29-bit concatenation of all tags from filter block; + * - tag mask is a mask used for matching against input tag. + * The input_tag is compared with the all the Requested_tags in the + * Record table to find a match. Action field of the selected matched + * REC entry is used for further processing. If multiple entries match, + * the lowest REC entry, Action field will be selected. + */ hw_atl2_rpf_act_rslvr_section_en_set(self, 0xFFFF); hw_atl2_rpfl2_uc_flr_tag_set(self, HW_ATL2_RPF_TAG_BASE_UC, HW_ATL2_MAC_UC); hw_atl2_rpfl2_bc_flr_tag_set(self, HW_ATL2_RPF_TAG_BASE_UC); + /* FW reserves the beginning of ART, thus all driver entries must + * start from the offset specified in FW caps. + */ index = priv->art_base_index + HW_ATL2_RPF_L2_PROMISC_OFF_INDEX; hw_atl2_act_rslvr_table_set(self, index, 0, HW_ATL2_RPF_TAG_UC_MASK | @@ -220,33 +394,17 @@ static void hw_atl2_hw_init_new_rx_filters(struct aq_hw_s *self) HW_ATL2_RPF_TAG_UNTAG_MASK, HW_ATL2_ACTION_DROP); - index = priv->art_base_index + HW_ATL2_RPF_VLAN_INDEX; - hw_atl2_act_rslvr_table_set(self, index, HW_ATL2_RPF_TAG_BASE_VLAN, - HW_ATL2_RPF_TAG_VLAN_MASK, - HW_ATL2_ACTION_ASSIGN_TC(0)); - - index = priv->art_base_index + HW_ATL2_RPF_MAC_INDEX; - hw_atl2_act_rslvr_table_set(self, index, HW_ATL2_RPF_TAG_BASE_UC, - HW_ATL2_RPF_TAG_UC_MASK, - HW_ATL2_ACTION_ASSIGN_TC(0)); - - index = priv->art_base_index + HW_ATL2_RPF_ALLMC_INDEX; - hw_atl2_act_rslvr_table_set(self, index, HW_ATL2_RPF_TAG_BASE_ALLMC, - HW_ATL2_RPF_TAG_ALLMC_MASK, - HW_ATL2_ACTION_ASSIGN_TC(0)); + /* Configure ART to map given VLan Prio (PCP) to the TC index for + * RSS redirection table. 
+ */ + for (i = 0; i < 8; i++) { + action = HW_ATL2_ACTION_ASSIGN_TC(prio_tc_map[i]); - index = priv->art_base_index + HW_ATL2_RPF_UNTAG_INDEX; - hw_atl2_act_rslvr_table_set(self, index, HW_ATL2_RPF_TAG_UNTAG_MASK, - HW_ATL2_RPF_TAG_UNTAG_MASK, - HW_ATL2_ACTION_ASSIGN_TC(0)); - - index = priv->art_base_index + HW_ATL2_RPF_VLAN_PROMISC_ON_INDEX; - hw_atl2_act_rslvr_table_set(self, index, 0, HW_ATL2_RPF_TAG_VLAN_MASK, - HW_ATL2_ACTION_DISABLE); - - index = priv->art_base_index + HW_ATL2_RPF_L2_PROMISC_ON_INDEX; - hw_atl2_act_rslvr_table_set(self, index, 0, HW_ATL2_RPF_TAG_UC_MASK, - HW_ATL2_ACTION_DISABLE); + index = priv->art_base_index + HW_ATL2_RPF_PCP_TO_TC_INDEX + i; + hw_atl2_act_rslvr_table_set(self, index, + i << HW_ATL2_RPF_TAG_PCP_OFFSET, + HW_ATL2_RPF_TAG_PCP_MASK, action); + } } static void hw_atl2_hw_new_rx_filter_vlan_promisc(struct aq_hw_s *self, @@ -309,7 +467,7 @@ static int hw_atl2_hw_init_rx_path(struct aq_hw_s *self) int i; /* Rx TC/RSS number config */ - hw_atl_rpb_rpf_rx_traf_class_mode_set(self, 1U); + hw_atl_rpb_rpf_rx_traf_class_mode_set(self, cfg->tc_mode); /* Rx flow control */ hw_atl_rpb_rx_flow_ctl_mode_set(self, 1U); @@ -317,9 +475,7 @@ static int hw_atl2_hw_init_rx_path(struct aq_hw_s *self) hw_atl2_rpf_rss_hash_type_set(self, HW_ATL2_RPF_RSS_HASH_TYPE_ALL); /* RSS Ring selection */ - hw_atl_reg_rx_flr_rss_control1set(self, cfg->is_rss ? - HW_ATL_RSS_ENABLED_3INDEX_BITS : - HW_ATL_RSS_DISABLED); + hw_atl_b0_hw_init_rx_rss_ctrl1(self); /* Multicast filters */ for (i = HW_ATL2_MAC_MAX; i--;) { @@ -678,6 +834,7 @@ const struct aq_hw_ops hw_atl2_ops = { .hw_interrupt_moderation_set = hw_atl2_hw_interrupt_moderation_set, .hw_rss_set = hw_atl2_hw_rss_set, .hw_rss_hash_set = hw_atl_b0_hw_rss_hash_set, + .hw_tc_rate_limit_set = hw_atl2_hw_init_tx_tc_rate_limit, .hw_get_hw_stats = hw_atl2_utils_get_hw_stats, .hw_get_fw_version = hw_atl2_utils_get_fw_version, .hw_set_offload = hw_atl_b0_hw_offload_set, diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_internal.h index e66b3583bfe9..5a89bb8722f9 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_internal.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_internal.h @@ -31,7 +31,7 @@ #define HW_ATL2_RSS_REDIRECTION_MAX 64U -#define HW_ATL2_TC_MAX 1U +#define HW_ATL2_TC_MAX 8U #define HW_ATL2_RSS_MAX 8U #define HW_ATL2_INTR_MODER_MAX 0x1FF @@ -82,13 +82,6 @@ enum HW_ATL2_RPF_ART_INDEX { HW_ATL2_RPF_VLAN_USER_INDEX = HW_ATL2_RPF_ET_PCP_USER_INDEX + 16, HW_ATL2_RPF_PCP_TO_TC_INDEX = HW_ATL2_RPF_VLAN_USER_INDEX + HW_ATL_VLAN_MAX_FILTERS, - HW_ATL2_RPF_VLAN_INDEX = HW_ATL2_RPF_PCP_TO_TC_INDEX + - AQ_CFG_TCS_MAX, - HW_ATL2_RPF_MAC_INDEX, - HW_ATL2_RPF_ALLMC_INDEX, - HW_ATL2_RPF_UNTAG_INDEX, - HW_ATL2_RPF_VLAN_PROMISC_ON_INDEX, - HW_ATL2_RPF_L2_PROMISC_ON_INDEX, }; #define HW_ATL2_ACTION(ACTION, RSS, INDEX, VALID) \ @@ -124,9 +117,6 @@ enum HW_ATL2_RPF_RSS_HASH_TYPE { HW_ATL2_RPF_RSS_HASH_TYPE_IPV6_EX_UDP, }; -#define HW_ATL_RSS_DISABLED 0x00000000U -#define HW_ATL_RSS_ENABLED_3INDEX_BITS 0xB3333333U - #define HW_ATL_MCAST_FLT_ANY_TO_HOST 0x00010FFFU struct hw_atl2_priv { diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.c index e779d70fde66..cd954b11d24a 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.c @@ -7,6 +7,14 @@ #include 
"hw_atl2_llh_internal.h" #include "aq_hw_utils.h" +void hw_atl2_rpf_redirection_table2_select_set(struct aq_hw_s *aq_hw, + u32 select) +{ + aq_hw_write_reg_bit(aq_hw, HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_ADR, + HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_MSK, + HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_SHIFT, select); +} + void hw_atl2_rpf_rss_hash_type_set(struct aq_hw_s *aq_hw, u32 rss_hash_type) { aq_hw_write_reg_bit(aq_hw, HW_ATL2_RPF_PIF_RPF_RSS_HASH_TYPEI_ADR, @@ -60,6 +68,15 @@ void hw_atl2_rpf_vlan_flr_tag_set(struct aq_hw_s *aq_hw, u32 tag, u32 filter) /* TX */ +void hw_atl2_tpb_tx_tc_q_rand_map_en_set(struct aq_hw_s *aq_hw, + const u32 tc_q_rand_map_en) +{ + aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_ADR, + HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_MSK, + HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_SHIFT, + tc_q_rand_map_en); +} + void hw_atl2_tpb_tx_buf_clk_gate_en_set(struct aq_hw_s *aq_hw, u32 clk_gate_en) { aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPB_TX_BUF_CLK_GATE_EN_ADR, @@ -76,9 +93,18 @@ void hw_atl2_reg_tx_intr_moder_ctrl_set(struct aq_hw_s *aq_hw, tx_intr_moderation_ctl); } +void hw_atl2_tps_tx_pkt_shed_data_arb_mode_set(struct aq_hw_s *aq_hw, + const u32 data_arb_mode) +{ + aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPS_DATA_TC_ARB_MODE_ADR, + HW_ATL2_TPS_DATA_TC_ARB_MODE_MSK, + HW_ATL2_TPS_DATA_TC_ARB_MODE_SHIFT, + data_arb_mode); +} + void hw_atl2_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw, - u32 max_credit, - u32 tc) + const u32 tc, + const u32 max_credit) { aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPS_DATA_TCTCREDIT_MAX_ADR(tc), HW_ATL2_TPS_DATA_TCTCREDIT_MAX_MSK, @@ -87,13 +113,13 @@ void hw_atl2_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw, } void hw_atl2_tps_tx_pkt_shed_tc_data_weight_set(struct aq_hw_s *aq_hw, - u32 tx_pkt_shed_tc_data_weight, - u32 tc) + const u32 tc, + const u32 weight) { aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPS_DATA_TCTWEIGHT_ADR(tc), HW_ATL2_TPS_DATA_TCTWEIGHT_MSK, HW_ATL2_TPS_DATA_TCTWEIGHT_SHIFT, - tx_pkt_shed_tc_data_weight); + weight); } u32 hw_atl2_get_hw_version(struct aq_hw_s *aq_hw) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.h index 8c6d78a64d42..98c7a4621297 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.h @@ -15,6 +15,10 @@ void hw_atl2_reg_tx_intr_moder_ctrl_set(struct aq_hw_s *aq_hw, u32 tx_intr_moderation_ctl, u32 queue); +/* Set Redirection Table 2 Select */ +void hw_atl2_rpf_redirection_table2_select_set(struct aq_hw_s *aq_hw, + u32 select); + /** Set RSS HASH type */ void hw_atl2_rpf_rss_hash_type_set(struct aq_hw_s *aq_hw, u32 rss_hash_type); @@ -34,18 +38,25 @@ void hw_atl2_new_rpf_rss_redir_set(struct aq_hw_s *aq_hw, u32 tc, u32 index, /* Set VLAN filter tag */ void hw_atl2_rpf_vlan_flr_tag_set(struct aq_hw_s *aq_hw, u32 tag, u32 filter); +/* set tx random TC-queue mapping enable bit */ +void hw_atl2_tpb_tx_tc_q_rand_map_en_set(struct aq_hw_s *aq_hw, + const u32 tc_q_rand_map_en); + /* set tx buffer clock gate enable */ void hw_atl2_tpb_tx_buf_clk_gate_en_set(struct aq_hw_s *aq_hw, u32 clk_gate_en); +void hw_atl2_tps_tx_pkt_shed_data_arb_mode_set(struct aq_hw_s *aq_hw, + const u32 data_arb_mode); + /* set tx packet scheduler tc data max credit */ void hw_atl2_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw, - u32 max_credit, - u32 tc); + const u32 tc, + const u32 max_credit); /* set tx packet scheduler tc data weight */ void 
hw_atl2_tps_tx_pkt_shed_tc_data_weight_set(struct aq_hw_s *aq_hw, - u32 tx_pkt_shed_tc_data_weight, - u32 tc); + const u32 tc, + const u32 weight); u32 hw_atl2_get_hw_version(struct aq_hw_s *aq_hw); diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh_internal.h index cde9e9d2836d..e34c5cda061e 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh_internal.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh_internal.h @@ -6,6 +6,16 @@ #ifndef HW_ATL2_LLH_INTERNAL_H #define HW_ATL2_LLH_INTERNAL_H +/* RX pif_rpf_redir_2_en_i Bitfield Definitions + * PORT="pif_rpf_redir_2_en_i" + */ +#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_ADR 0x000054C8 +#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_MSK 0x00001000 +#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_MSKN 0xFFFFEFFF +#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_SHIFT 12 +#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_WIDTH 1 +#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_DEFAULT 0x0 + /* RX pif_rpf_rss_hash_type_i Bitfield Definitions */ #define HW_ATL2_RPF_PIF_RPF_RSS_HASH_TYPEI_ADR 0x000054C8 @@ -122,6 +132,24 @@ /* Default value of bitfield rx_q{Q}_tc_map[2:0] */ #define HW_ATL2_RX_Q_TC_MAP_DEFAULT 0x0 +/* tx tx_tc_q_rand_map_en bitfield definitions + * preprocessor definitions for the bitfield "tx_tc_q_rand_map_en". + * port="pif_tpb_tx_tc_q_rand_map_en_i" + */ + +/* register address for bitfield tx_tc_q_rand_map_en */ +#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_ADR 0x00007900 +/* bitmask for bitfield tx_tc_q_rand_map_en */ +#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_MSK 0x00000200 +/* inverted bitmask for bitfield tx_tc_q_rand_map_en */ +#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_MSKN 0xFFFFFDFF +/* lower bit position of bitfield tx_tc_q_rand_map_en */ +#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_SHIFT 9 +/* width of bitfield tx_tc_q_rand_map_en */ +#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_WIDTH 1 +/* default value of bitfield tx_tc_q_rand_map_en */ +#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_DEFAULT 0x0 + /* tx tx_buffer_clk_gate_en bitfield definitions * preprocessor definitions for the bitfield "tx_buffer_clk_gate_en". * port="pif_tpb_tx_buffer_clk_gate_en_i" @@ -140,42 +168,77 @@ /* default value of bitfield tx_buffer_clk_gate_en */ #define HW_ATL2_TPB_TX_BUF_CLK_GATE_EN_DEFAULT 0x0 -/* tx data_tc{t}_credit_max[b:0] bitfield definitions - * preprocessor definitions for the bitfield "data_tc{t}_credit_max[b:0]". +/* tx tx_q_tc_map{q} bitfield definitions + * preprocessor definitions for the bitfield "tx_q_tc_map{q}". + * parameter: queue {q} | bit-level stride | range [0, 31] + * port="pif_tpb_tx_q_tc_map0_i[2:0]" + */ + +/* register address for bitfield tx_q_tc_map{q} */ +#define HW_ATL2_TX_Q_TC_MAP_ADR(queue) \ + (((queue) < 32) ? 0x0000799C + ((queue) / 4) * 4 : 0) +/* lower bit position of bitfield tx_q_tc_map{q} */ +#define HW_ATL2_TX_Q_TC_MAP_SHIFT(queue) \ + (((queue) < 32) ? ((queue) * 8) % 32 : 0) +/* width of bitfield tx_q_tc_map{q} */ +#define HW_ATL2_TX_Q_TC_MAP_WIDTH 3 +/* default value of bitfield tx_q_tc_map{q} */ +#define HW_ATL2_TX_Q_TC_MAP_DEFAULT 0x0 + +/* tx data_tc_arb_mode bitfield definitions + * preprocessor definitions for the bitfield "data_tc_arb_mode". 
+ * port="pif_tps_data_tc_arb_mode_i" + */ + +/* register address for bitfield data_tc_arb_mode */ +#define HW_ATL2_TPS_DATA_TC_ARB_MODE_ADR 0x00007100 +/* bitmask for bitfield data_tc_arb_mode */ +#define HW_ATL2_TPS_DATA_TC_ARB_MODE_MSK 0x00000003 +/* inverted bitmask for bitfield data_tc_arb_mode */ +#define HW_ATL2_TPS_DATA_TC_ARB_MODE_MSKN 0xfffffffc +/* lower bit position of bitfield data_tc_arb_mode */ +#define HW_ATL2_TPS_DATA_TC_ARB_MODE_SHIFT 0 +/* width of bitfield data_tc_arb_mode */ +#define HW_ATL2_TPS_DATA_TC_ARB_MODE_WIDTH 2 +/* default value of bitfield data_tc_arb_mode */ +#define HW_ATL2_TPS_DATA_TC_ARB_MODE_DEFAULT 0x0 + +/* tx data_tc{t}_credit_max[f:0] bitfield definitions + * preprocessor definitions for the bitfield "data_tc{t}_credit_max[f:0]". * parameter: tc {t} | stride size 0x4 | range [0, 7] - * port="pif_tps_data_tc0_credit_max_i[11:0]" + * port="pif_tps_data_tc0_credit_max_i[15:0]" */ -/* register address for bitfield data_tc{t}_credit_max[b:0] */ +/* register address for bitfield data_tc{t}_credit_max[f:0] */ #define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_ADR(tc) (0x00007110 + (tc) * 0x4) -/* bitmask for bitfield data_tc{t}_credit_max[b:0] */ -#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_MSK 0x0fff0000 -/* inverted bitmask for bitfield data_tc{t}_credit_max[b:0] */ -#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_MSKN 0xf000ffff -/* lower bit position of bitfield data_tc{t}_credit_max[b:0] */ +/* bitmask for bitfield data_tc{t}_credit_max[f:0] */ +#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_MSK 0xffff0000 +/* inverted bitmask for bitfield data_tc{t}_credit_max[f:0] */ +#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_MSKN 0x0000ffff +/* lower bit position of bitfield data_tc{t}_credit_max[f:0] */ #define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_SHIFT 16 -/* width of bitfield data_tc{t}_credit_max[b:0] */ -#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_WIDTH 12 -/* default value of bitfield data_tc{t}_credit_max[b:0] */ +/* width of bitfield data_tc{t}_credit_max[f:0] */ +#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_WIDTH 16 +/* default value of bitfield data_tc{t}_credit_max[f:0] */ #define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_DEFAULT 0x0 -/* tx data_tc{t}_weight[8:0] bitfield definitions - * preprocessor definitions for the bitfield "data_tc{t}_weight[8:0]". +/* tx data_tc{t}_weight[e:0] bitfield definitions + * preprocessor definitions for the bitfield "data_tc{t}_weight[e:0]". 
* parameter: tc {t} | stride size 0x4 | range [0, 7] - * port="pif_tps_data_tc0_weight_i[8:0]" + * port="pif_tps_data_tc0_weight_i[14:0]" */ -/* register address for bitfield data_tc{t}_weight[8:0] */ +/* register address for bitfield data_tc{t}_weight[e:0] */ #define HW_ATL2_TPS_DATA_TCTWEIGHT_ADR(tc) (0x00007110 + (tc) * 0x4) -/* bitmask for bitfield data_tc{t}_weight[8:0] */ -#define HW_ATL2_TPS_DATA_TCTWEIGHT_MSK 0x000001ff -/* inverted bitmask for bitfield data_tc{t}_weight[8:0] */ -#define HW_ATL2_TPS_DATA_TCTWEIGHT_MSKN 0xfffffe00 -/* lower bit position of bitfield data_tc{t}_weight[8:0] */ +/* bitmask for bitfield data_tc{t}_weight[e:0] */ +#define HW_ATL2_TPS_DATA_TCTWEIGHT_MSK 0x00007fff +/* inverted bitmask for bitfield data_tc{t}_weight[e:0] */ +#define HW_ATL2_TPS_DATA_TCTWEIGHT_MSKN 0xffff8000 +/* lower bit position of bitfield data_tc{t}_weight[e:0] */ #define HW_ATL2_TPS_DATA_TCTWEIGHT_SHIFT 0 -/* width of bitfield data_tc{t}_weight[8:0] */ -#define HW_ATL2_TPS_DATA_TCTWEIGHT_WIDTH 9 -/* default value of bitfield data_tc{t}_weight[8:0] */ +/* width of bitfield data_tc{t}_weight[e:0] */ +#define HW_ATL2_TPS_DATA_TCTWEIGHT_WIDTH 15 +/* default value of bitfield data_tc{t}_weight[e:0] */ #define HW_ATL2_TPS_DATA_TCTWEIGHT_DEFAULT 0x0 /* tx interrupt moderation control register definitions diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index fc1405a8ed74..5a41801acb6a 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -60,6 +60,7 @@ #define CH_WARN(adap, fmt, ...) dev_warn(adap->pdev_dev, fmt, ## __VA_ARGS__) extern struct list_head adapter_list; +extern struct list_head uld_list; extern struct mutex uld_mutex; /* Suspend an Ethernet Tx queue with fewer available descriptors than this. 
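The hw_atl2_llh helpers and bitfield macro blocks quoted above all follow one pattern: each register field is described by address, mask and shift defines, and the accessor performs a masked read-modify-write through the aq_hw_write_reg_bit() helper. Below is a minimal userspace model of that pattern; the fake register file and the arbiter-mode values are illustrative only, even though they echo the DATA_TC_ARB_MODE defines in the hunk.

#include <stdint.h>
#include <stdio.h>

/* Fake register file standing in for the MMIO space behind aq_hw. */
static uint32_t regs[0x8000 / 4];

static uint32_t reg_read(uint32_t addr)
{
        return regs[addr / 4];
}

static void reg_write(uint32_t addr, uint32_t val)
{
        regs[addr / 4] = val;
}

/* Masked read-modify-write of one bitfield, the ADR/MSK/SHIFT pattern. */
static void write_reg_bit(uint32_t addr, uint32_t msk, uint32_t shift,
                          uint32_t val)
{
        uint32_t v = reg_read(addr);

        v = (v & ~msk) | ((val << shift) & msk);
        reg_write(addr, v);
}

/* Illustrative field: a 2-bit arbiter mode at bits [1:0] of register
 * 0x7100, loosely following the defines quoted in the diff above.
 */
#define ARB_MODE_ADR    0x00007100
#define ARB_MODE_MSK    0x00000003
#define ARB_MODE_SHIFT  0

int main(void)
{
        reg_write(ARB_MODE_ADR, 0xFFFFFFF0);   /* unrelated bits already set */
        write_reg_bit(ARB_MODE_ADR, ARB_MODE_MSK, ARB_MODE_SHIFT, 0x2);
        printf("reg 0x%04x = 0x%08x\n", (unsigned int)ARB_MODE_ADR,
               (unsigned int)reg_read(ARB_MODE_ADR));
        return 0;
}
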
@@ -822,6 +823,13 @@ struct sge_uld_txq_info { u16 ntxq; /* # of egress uld queues */ }; +/* struct to maintain ULD list to reallocate ULD resources on hotplug */ +struct cxgb4_uld_list { + struct cxgb4_uld_info uld_info; + struct list_head list_node; + enum cxgb4_uld uld_type; +}; + enum sge_eosw_state { CXGB4_EO_STATE_CLOSED = 0, /* Not ready to accept traffic */ CXGB4_EO_STATE_FLOWC_OPEN_SEND, /* Send FLOWC open request */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index d05c2371d8c7..7a0414f379be 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -180,6 +180,7 @@ static struct dentry *cxgb4_debugfs_root; LIST_HEAD(adapter_list); DEFINE_MUTEX(uld_mutex); +LIST_HEAD(uld_list); static int cfg_queues(struct adapter *adap); @@ -6519,11 +6520,8 @@ fw_attach_fail: /* PCIe EEH recovery on powerpc platforms needs fundamental reset */ pdev->needs_freset = 1; - if (is_uld(adapter)) { - mutex_lock(&uld_mutex); - list_add_tail(&adapter->list_node, &adapter_list); - mutex_unlock(&uld_mutex); - } + if (is_uld(adapter)) + cxgb4_uld_enable(adapter); if (!is_t4(adapter->params.chip)) cxgb4_ptp_init(adapter); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c index e65b52375dd8..6b1d3df4b9ba 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c @@ -681,6 +681,74 @@ static void cxgb4_set_ktls_feature(struct adapter *adap, bool enable) } #endif +static void cxgb4_uld_alloc_resources(struct adapter *adap, + enum cxgb4_uld type, + const struct cxgb4_uld_info *p) +{ + int ret = 0; + + if ((type == CXGB4_ULD_CRYPTO && !is_pci_uld(adap)) || + (type != CXGB4_ULD_CRYPTO && !is_offload(adap))) + return; + if (type == CXGB4_ULD_ISCSIT && is_t4(adap->params.chip)) + return; + ret = cfg_queues_uld(adap, type, p); + if (ret) + goto out; + ret = setup_sge_queues_uld(adap, type, p->lro); + if (ret) + goto free_queues; + if (adap->flags & CXGB4_USING_MSIX) { + ret = request_msix_queue_irqs_uld(adap, type); + if (ret) + goto free_rxq; + } + if (adap->flags & CXGB4_FULL_INIT_DONE) + enable_rx_uld(adap, type); +#ifdef CONFIG_CHELSIO_TLS_DEVICE + /* send mbox to enable ktls related settings. 
*/ + if (type == CXGB4_ULD_CRYPTO && + (adap->params.crypto & FW_CAPS_CONFIG_TX_TLS_HW)) + cxgb4_set_ktls_feature(adap, 1); +#endif + if (adap->uld[type].add) + goto free_irq; + ret = setup_sge_txq_uld(adap, type, p); + if (ret) + goto free_irq; + adap->uld[type] = *p; + ret = uld_attach(adap, type); + if (ret) + goto free_txq; + return; +free_txq: + release_sge_txq_uld(adap, type); +free_irq: + if (adap->flags & CXGB4_FULL_INIT_DONE) + quiesce_rx_uld(adap, type); + if (adap->flags & CXGB4_USING_MSIX) + free_msix_queue_irqs_uld(adap, type); +free_rxq: + free_sge_queues_uld(adap, type); +free_queues: + free_queues_uld(adap, type); +out: + dev_warn(adap->pdev_dev, + "ULD registration failed for uld type %d\n", type); +} + +void cxgb4_uld_enable(struct adapter *adap) +{ + struct cxgb4_uld_list *uld_entry; + + mutex_lock(&uld_mutex); + list_add_tail(&adap->list_node, &adapter_list); + list_for_each_entry(uld_entry, &uld_list, list_node) + cxgb4_uld_alloc_resources(adap, uld_entry->uld_type, + &uld_entry->uld_info); + mutex_unlock(&uld_mutex); +} + /* cxgb4_register_uld - register an upper-layer driver * @type: the ULD type * @p: the ULD methods @@ -691,63 +759,23 @@ static void cxgb4_set_ktls_feature(struct adapter *adap, bool enable) void cxgb4_register_uld(enum cxgb4_uld type, const struct cxgb4_uld_info *p) { + struct cxgb4_uld_list *uld_entry; struct adapter *adap; - int ret = 0; if (type >= CXGB4_ULD_MAX) return; + uld_entry = kzalloc(sizeof(*uld_entry), GFP_KERNEL); + if (!uld_entry) + return; + + memcpy(&uld_entry->uld_info, p, sizeof(struct cxgb4_uld_info)); mutex_lock(&uld_mutex); - list_for_each_entry(adap, &adapter_list, list_node) { - if ((type == CXGB4_ULD_CRYPTO && !is_pci_uld(adap)) || - (type != CXGB4_ULD_CRYPTO && !is_offload(adap))) - continue; - if (type == CXGB4_ULD_ISCSIT && is_t4(adap->params.chip)) - continue; - ret = cfg_queues_uld(adap, type, p); - if (ret) - goto out; - ret = setup_sge_queues_uld(adap, type, p->lro); - if (ret) - goto free_queues; - if (adap->flags & CXGB4_USING_MSIX) { - ret = request_msix_queue_irqs_uld(adap, type); - if (ret) - goto free_rxq; - } - if (adap->flags & CXGB4_FULL_INIT_DONE) - enable_rx_uld(adap, type); -#ifdef CONFIG_CHELSIO_TLS_DEVICE - /* send mbox to enable ktls related settings. 
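Stepping back to the cxgb4 hunks above: the driver now keeps a global uld_list next to adapter_list, so an adapter probed after an upper-layer driver has registered still gets its ULD resources allocated, which is what cxgb4_uld_enable() does at probe time. The sketch below models that two-list registration pattern with plain linked lists and a pthread mutex; every name in it is made up for the example and it is not the kernel code. Build with -pthread.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* A registered upper-layer driver ("ULD") and an adapter instance. */
struct uld {
        char name[16];
        void (*attach)(int adapter_id);
        struct uld *next;
};

struct adapter {
        int id;
        struct adapter *next;
};

static struct uld *uld_list;            /* all registered ULDs */
static struct adapter *adapter_list;    /* all probed adapters */
static pthread_mutex_t uld_mutex = PTHREAD_MUTEX_INITIALIZER;

static void demo_attach(int adapter_id)
{
        printf("attached to adapter %d\n", adapter_id);
}

/* ULD module load: remember the ULD and attach it to existing adapters. */
static void register_uld(const char *name, void (*attach)(int))
{
        struct uld *u = calloc(1, sizeof(*u));
        struct adapter *a;

        if (!u)
                return;
        strncpy(u->name, name, sizeof(u->name) - 1);
        u->attach = attach;

        pthread_mutex_lock(&uld_mutex);
        for (a = adapter_list; a; a = a->next)
                u->attach(a->id);
        u->next = uld_list;
        uld_list = u;
        pthread_mutex_unlock(&uld_mutex);
}

/* Adapter probe (possibly after the ULD loaded): attach every known ULD. */
static void adapter_probe(int id)
{
        struct adapter *a = calloc(1, sizeof(*a));
        struct uld *u;

        if (!a)
                return;
        a->id = id;
        pthread_mutex_lock(&uld_mutex);
        a->next = adapter_list;
        adapter_list = a;
        for (u = uld_list; u; u = u->next)
                u->attach(id);
        pthread_mutex_unlock(&uld_mutex);
}

int main(void)
{
        adapter_probe(0);               /* adapter present before the ULD */
        register_uld("demo", demo_attach);
        adapter_probe(1);               /* hotplugged adapter: still attached */
        return 0;
}
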
*/ - if (type == CXGB4_ULD_CRYPTO && - (adap->params.crypto & FW_CAPS_CONFIG_TX_TLS_HW)) - cxgb4_set_ktls_feature(adap, 1); -#endif - if (adap->uld[type].add) - goto free_irq; - ret = setup_sge_txq_uld(adap, type, p); - if (ret) - goto free_irq; - adap->uld[type] = *p; - ret = uld_attach(adap, type); - if (ret) - goto free_txq; - continue; -free_txq: - release_sge_txq_uld(adap, type); -free_irq: - if (adap->flags & CXGB4_FULL_INIT_DONE) - quiesce_rx_uld(adap, type); - if (adap->flags & CXGB4_USING_MSIX) - free_msix_queue_irqs_uld(adap, type); -free_rxq: - free_sge_queues_uld(adap, type); -free_queues: - free_queues_uld(adap, type); -out: - dev_warn(adap->pdev_dev, - "ULD registration failed for uld type %d\n", type); - } + list_for_each_entry(adap, &adapter_list, list_node) + cxgb4_uld_alloc_resources(adap, type, p); + + uld_entry->uld_type = type; + list_add_tail(&uld_entry->list_node, &uld_list); mutex_unlock(&uld_mutex); return; } @@ -761,6 +789,7 @@ EXPORT_SYMBOL(cxgb4_register_uld); */ int cxgb4_unregister_uld(enum cxgb4_uld type) { + struct cxgb4_uld_list *uld_entry, *tmp; struct adapter *adap; if (type >= CXGB4_ULD_MAX) @@ -783,6 +812,13 @@ int cxgb4_unregister_uld(enum cxgb4_uld type) cxgb4_set_ktls_feature(adap, 0); #endif } + + list_for_each_entry_safe(uld_entry, tmp, &uld_list, list_node) { + if (uld_entry->uld_type == type) { + list_del(&uld_entry->list_node); + kfree(uld_entry); + } + } mutex_unlock(&uld_mutex); return 0; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h index 16796785eea3..085fa1424f9a 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h @@ -327,6 +327,7 @@ enum cxgb4_control { CXGB4_CONTROL_DB_DROP, }; +struct adapter; struct pci_dev; struct l2t_data; struct net_device; @@ -465,6 +466,7 @@ struct cxgb4_uld_info { int (*tx_handler)(struct sk_buff *skb, struct net_device *dev); }; +void cxgb4_uld_enable(struct adapter *adap); void cxgb4_register_uld(enum cxgb4_uld type, const struct cxgb4_uld_info *p); int cxgb4_unregister_uld(enum cxgb4_uld type); int cxgb4_ofld_send(struct net_device *dev, struct sk_buff *skb); diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c index 05bc6e216bca..d9fa4600f745 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_main.c +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c @@ -542,8 +542,13 @@ void e1000_reinit_locked(struct e1000_adapter *adapter) WARN_ON(in_interrupt()); while (test_and_set_bit(__E1000_RESETTING, &adapter->flags)) msleep(1); - e1000_down(adapter); - e1000_up(adapter); + + /* only run the task if not already down */ + if (!test_bit(__E1000_DOWN, &adapter->flags)) { + e1000_down(adapter); + e1000_up(adapter); + } + clear_bit(__E1000_RESETTING, &adapter->flags); } @@ -1433,10 +1438,15 @@ int e1000_close(struct net_device *netdev) struct e1000_hw *hw = &adapter->hw; int count = E1000_CHECK_RESET_COUNT; - while (test_bit(__E1000_RESETTING, &adapter->flags) && count--) + while (test_and_set_bit(__E1000_RESETTING, &adapter->flags) && count--) usleep_range(10000, 20000); - WARN_ON(test_bit(__E1000_RESETTING, &adapter->flags)); + WARN_ON(count < 0); + + /* signal that we're down so that the reset task will no longer run */ + set_bit(__E1000_DOWN, &adapter->flags); + clear_bit(__E1000_RESETTING, &adapter->flags); + e1000_down(adapter); e1000_power_down_phy(adapter); e1000_free_irq(adapter); diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c 
b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 735bf25952fc..f999cca37a8a 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -300,7 +300,11 @@ static s32 e1000_init_phy_workarounds_pchlan(struct e1000_hw *hw) * so forcibly disable it. */ hw->dev_spec.ich8lan.ulp_state = e1000_ulp_state_unknown; - e1000_disable_ulp_lpt_lp(hw, true); + ret_val = e1000_disable_ulp_lpt_lp(hw, true); + if (ret_val) { + e_warn("Failed to disable ULP\n"); + goto out; + } ret_val = hw->phy.ops.acquire(hw); if (ret_val) { diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index e0b074820b47..32f23a15ff64 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -107,6 +107,45 @@ static const struct e1000_reg_info e1000_reg_info_tbl[] = { {0, NULL} }; +struct e1000e_me_supported { + u16 device_id; /* supported device ID */ +}; + +static const struct e1000e_me_supported me_supported[] = { + {E1000_DEV_ID_PCH_LPT_I217_LM}, + {E1000_DEV_ID_PCH_LPTLP_I218_LM}, + {E1000_DEV_ID_PCH_I218_LM2}, + {E1000_DEV_ID_PCH_I218_LM3}, + {E1000_DEV_ID_PCH_SPT_I219_LM}, + {E1000_DEV_ID_PCH_SPT_I219_LM2}, + {E1000_DEV_ID_PCH_LBG_I219_LM3}, + {E1000_DEV_ID_PCH_SPT_I219_LM4}, + {E1000_DEV_ID_PCH_SPT_I219_LM5}, + {E1000_DEV_ID_PCH_CNP_I219_LM6}, + {E1000_DEV_ID_PCH_CNP_I219_LM7}, + {E1000_DEV_ID_PCH_ICP_I219_LM8}, + {E1000_DEV_ID_PCH_ICP_I219_LM9}, + {E1000_DEV_ID_PCH_CMP_I219_LM10}, + {E1000_DEV_ID_PCH_CMP_I219_LM11}, + {E1000_DEV_ID_PCH_CMP_I219_LM12}, + {E1000_DEV_ID_PCH_TGP_I219_LM13}, + {E1000_DEV_ID_PCH_TGP_I219_LM14}, + {E1000_DEV_ID_PCH_TGP_I219_LM15}, + {0} +}; + +static bool e1000e_check_me(u16 device_id) +{ + struct e1000e_me_supported *id; + + for (id = (struct e1000e_me_supported *)me_supported; + id->device_id; id++) + if (device_id == id->device_id) + return true; + + return false; +} + /** * __ew32_prepare - prepare to write to MAC CSR register on certain parts * @hw: pointer to the HW structure @@ -5294,6 +5333,10 @@ static void e1000_watchdog_task(struct work_struct *work) /* oops */ break; } + if (hw->mac.type == e1000_pch_spt) { + netdev->features &= ~NETIF_F_TSO; + netdev->features &= ~NETIF_F_TSO6; + } } /* enable transmits in the hardware, need to do this @@ -6912,7 +6955,8 @@ static int e1000e_pm_suspend(struct device *dev) e1000e_pm_thaw(dev); /* Introduce S0ix implementation */ - if (hw->mac.type >= e1000_pch_cnp) + if (hw->mac.type >= e1000_pch_cnp && + !e1000e_check_me(hw->adapter->pdev->device)) e1000e_s0ix_entry_flow(adapter); return rc; @@ -6927,7 +6971,8 @@ static int e1000e_pm_resume(struct device *dev) int rc; /* Introduce S0ix implementation */ - if (hw->mac.type >= e1000_pch_cnp) + if (hw->mac.type >= e1000_pch_cnp && + !e1000e_check_me(hw->adapter->pdev->device)) e1000e_s0ix_exit_flow(adapter); rc = __e1000_resume(pdev); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 2a037ec244b9..ea7395b391e5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11,7 +11,7 @@ #include "i40e_diag.h" #include "i40e_xsk.h" #include <net/udp_tunnel.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> /* All i40e tracepoints are defined by the include below, which * must be included exactly once across the whole kernel with * CREATE_TRACE_POINTS defined @@ -3260,26 +3260,31 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) if 
(ring->vsi->type == I40E_VSI_MAIN) xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); + kfree(ring->rx_bi); ring->xsk_umem = i40e_xsk_umem(ring); if (ring->xsk_umem) { - ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr - - XDP_PACKET_HEADROOM; + ret = i40e_alloc_rx_bi_zc(ring); + if (ret) + return ret; + ring->rx_buf_len = xsk_umem_get_rx_frame_size(ring->xsk_umem); /* For AF_XDP ZC, we disallow packets to span on * multiple buffers, thus letting us skip that * handling in the fast-path. */ chain_len = 1; - ring->zca.free = i40e_zca_free; ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, - MEM_TYPE_ZERO_COPY, - &ring->zca); + MEM_TYPE_XSK_BUFF_POOL, + NULL); if (ret) return ret; dev_info(&vsi->back->pdev->dev, - "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n", + "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", ring->queue_index); } else { + ret = i40e_alloc_rx_bi(ring); + if (ret) + return ret; ring->rx_buf_len = vsi->rx_buf_len; if (ring->vsi->type == I40E_VSI_MAIN) { ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, @@ -3344,9 +3349,12 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q); writel(0, ring->tail); - ok = ring->xsk_umem ? - i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring)) : - !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring)); + if (ring->xsk_umem) { + xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq); + ok = i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring)); + } else { + ok = !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring)); + } if (!ok) { /* Log this in case the user has forgotten to give the kernel * any buffers, even later in the application. diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index a3772beffe02..f613782f2f56 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -521,28 +521,29 @@ int i40e_add_del_fdir(struct i40e_vsi *vsi, /** * i40e_fd_handle_status - check the Programming Status for FD * @rx_ring: the Rx ring for this descriptor - * @rx_desc: the Rx descriptor for programming Status, not a packet descriptor. + * @qword0_raw: qword0 + * @qword1: qword1 after le_to_cpu * @prog_id: the id originally used for programming * * This is used to verify if the FD programming or invalidation * requested by SW to the HW is successful or not and take actions accordingly. 
**/ -void i40e_fd_handle_status(struct i40e_ring *rx_ring, - union i40e_rx_desc *rx_desc, u8 prog_id) +static void i40e_fd_handle_status(struct i40e_ring *rx_ring, u64 qword0_raw, + u64 qword1, u8 prog_id) { struct i40e_pf *pf = rx_ring->vsi->back; struct pci_dev *pdev = pf->pdev; + struct i40e_32b_rx_wb_qw0 *qw0; u32 fcnt_prog, fcnt_avail; u32 error; - u64 qw; - qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len); - error = (qw & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >> + qw0 = (struct i40e_32b_rx_wb_qw0 *)&qword0_raw; + error = (qword1 & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >> I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT; if (error == BIT(I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT)) { - pf->fd_inv = le32_to_cpu(rx_desc->wb.qword0.hi_dword.fd_id); - if ((rx_desc->wb.qword0.hi_dword.fd_id != 0) || + pf->fd_inv = le32_to_cpu(qw0->hi_dword.fd_id); + if (qw0->hi_dword.fd_id != 0 || (I40E_DEBUG_FD & pf->hw.debug_mask)) dev_warn(&pdev->dev, "ntuple filter loc = %d, could not be added\n", pf->fd_inv); @@ -560,7 +561,7 @@ void i40e_fd_handle_status(struct i40e_ring *rx_ring, /* store the current atr filter count */ pf->fd_atr_cnt = i40e_get_current_atr_cnt(pf); - if ((rx_desc->wb.qword0.hi_dword.fd_id == 0) && + if (qw0->hi_dword.fd_id == 0 && test_bit(__I40E_FD_SB_AUTO_DISABLED, pf->state)) { /* These set_bit() calls aren't atomic with the * test_bit() here, but worse case we potentially @@ -589,7 +590,7 @@ void i40e_fd_handle_status(struct i40e_ring *rx_ring, } else if (error == BIT(I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT)) { if (I40E_DEBUG_FD & pf->hw.debug_mask) dev_info(&pdev->dev, "ntuple filter fd_id = %d, could not be removed\n", - rx_desc->wb.qword0.hi_dword.fd_id); + qw0->hi_dword.fd_id); } } @@ -1195,6 +1196,11 @@ clear_counts: rc->total_packets = 0; } +static struct i40e_rx_buffer *i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx) +{ + return &rx_ring->rx_bi[idx]; +} + /** * i40e_reuse_rx_page - page flip buffer and store it back on the ring * @rx_ring: rx descriptor ring to store buffers on @@ -1208,7 +1214,7 @@ static void i40e_reuse_rx_page(struct i40e_ring *rx_ring, struct i40e_rx_buffer *new_buff; u16 nta = rx_ring->next_to_alloc; - new_buff = &rx_ring->rx_bi[nta]; + new_buff = i40e_rx_bi(rx_ring, nta); /* update, and store next to alloc */ nta++; @@ -1227,29 +1233,10 @@ static void i40e_reuse_rx_page(struct i40e_ring *rx_ring, } /** - * i40e_rx_is_programming_status - check for programming status descriptor - * @qw: qword representing status_error_len in CPU ordering - * - * The value of in the descriptor length field indicate if this - * is a programming status descriptor for flow director or FCoE - * by the value of I40E_RX_PROG_STATUS_DESC_LENGTH, otherwise - * it is a packet descriptor. - **/ -static inline bool i40e_rx_is_programming_status(u64 qw) -{ - /* The Rx filter programming status and SPH bit occupy the same - * spot in the descriptor. Since we don't support packet split we - * can just reuse the bit as an indication that this is a - * programming status descriptor. 
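The i40e programming-status rework above stops passing the descriptor pointer down and instead hands i40e_fd_handle_status() the raw qword0 plus the little-endian-converted qword1, pulling individual fields out with mask/shift pairs. The fragment below shows that extraction style on its own; the field positions are invented for the example, only the mask-and-shift shape matches the I40E_RX_PROG_STATUS_DESC_QW1_* macros.

#include <stdint.h>
#include <stdio.h>

/* Assumed field layout inside a 64-bit writeback qword (illustrative only):
 * bits [37:30] carry the program id, bits [26:19] carry the error code.
 */
#define QW1_ERROR_SHIFT         19
#define QW1_ERROR_MASK          (0xFFULL << QW1_ERROR_SHIFT)
#define QW1_PROGID_SHIFT        30
#define QW1_PROGID_MASK         (0xFFULL << QW1_PROGID_SHIFT)

static unsigned int qw1_error(uint64_t qw1)
{
        return (unsigned int)((qw1 & QW1_ERROR_MASK) >> QW1_ERROR_SHIFT);
}

static unsigned int qw1_prog_id(uint64_t qw1)
{
        return (unsigned int)((qw1 & QW1_PROGID_MASK) >> QW1_PROGID_SHIFT);
}

int main(void)
{
        /* Build a descriptor word with prog id 1 and one error bit set. */
        uint64_t qw1 = (1ULL << QW1_PROGID_SHIFT) | (0x4ULL << QW1_ERROR_SHIFT);

        printf("prog id %u, error bits 0x%02x\n", qw1_prog_id(qw1),
               qw1_error(qw1));
        return 0;
}
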
- */ - return qw & I40E_RXD_QW1_LENGTH_SPH_MASK; -} - -/** - * i40e_clean_programming_status - try clean the programming status descriptor + * i40e_clean_programming_status - clean the programming status descriptor * @rx_ring: the rx ring that has this descriptor - * @rx_desc: the rx descriptor written back by HW - * @qw: qword representing status_error_len in CPU ordering + * @qword0_raw: qword0 + * @qword1: qword1 representing status_error_len in CPU ordering * * Flow director should handle FD_FILTER_STATUS to check its filter programming * status being successful or not and take actions accordingly. FCoE should @@ -1257,34 +1244,16 @@ static inline bool i40e_rx_is_programming_status(u64 qw) * * Returns an i40e_rx_buffer to reuse if the cleanup occurred, otherwise NULL. **/ -struct i40e_rx_buffer *i40e_clean_programming_status( - struct i40e_ring *rx_ring, - union i40e_rx_desc *rx_desc, - u64 qw) +void i40e_clean_programming_status(struct i40e_ring *rx_ring, u64 qword0_raw, + u64 qword1) { - struct i40e_rx_buffer *rx_buffer; - u32 ntc; u8 id; - if (!i40e_rx_is_programming_status(qw)) - return NULL; - - ntc = rx_ring->next_to_clean; - - /* fetch, update, and store next to clean */ - rx_buffer = &rx_ring->rx_bi[ntc++]; - ntc = (ntc < rx_ring->count) ? ntc : 0; - rx_ring->next_to_clean = ntc; - - prefetch(I40E_RX_DESC(rx_ring, ntc)); - - id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >> + id = (qword1 & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >> I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT; if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS) - i40e_fd_handle_status(rx_ring, rx_desc, id); - - return rx_buffer; + i40e_fd_handle_status(rx_ring, qword0_raw, qword1, id); } /** @@ -1336,13 +1305,25 @@ err: return -ENOMEM; } +int i40e_alloc_rx_bi(struct i40e_ring *rx_ring) +{ + unsigned long sz = sizeof(*rx_ring->rx_bi) * rx_ring->count; + + rx_ring->rx_bi = kzalloc(sz, GFP_KERNEL); + return rx_ring->rx_bi ? 
0 : -ENOMEM; +} + +static void i40e_clear_rx_bi(struct i40e_ring *rx_ring) +{ + memset(rx_ring->rx_bi, 0, sizeof(*rx_ring->rx_bi) * rx_ring->count); +} + /** * i40e_clean_rx_ring - Free Rx buffers * @rx_ring: ring to be cleaned **/ void i40e_clean_rx_ring(struct i40e_ring *rx_ring) { - unsigned long bi_size; u16 i; /* ring already cleared, nothing to do */ @@ -1361,7 +1342,7 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring) /* Free all the Rx ring sk_buffs */ for (i = 0; i < rx_ring->count; i++) { - struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i]; + struct i40e_rx_buffer *rx_bi = i40e_rx_bi(rx_ring, i); if (!rx_bi->page) continue; @@ -1388,8 +1369,10 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring) } skip_free: - bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count; - memset(rx_ring->rx_bi, 0, bi_size); + if (rx_ring->xsk_umem) + i40e_clear_rx_bi_zc(rx_ring); + else + i40e_clear_rx_bi(rx_ring); /* Zero out the descriptor ring */ memset(rx_ring->desc, 0, rx_ring->size); @@ -1430,15 +1413,7 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring) int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) { struct device *dev = rx_ring->dev; - int err = -ENOMEM; - int bi_size; - - /* warn if we are about to overwrite the pointer */ - WARN_ON(rx_ring->rx_bi); - bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count; - rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL); - if (!rx_ring->rx_bi) - goto err; + int err; u64_stats_init(&rx_ring->syncp); @@ -1451,7 +1426,7 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) if (!rx_ring->desc) { dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n", rx_ring->size); - goto err; + return -ENOMEM; } rx_ring->next_to_alloc = 0; @@ -1463,16 +1438,12 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->queue_index); if (err < 0) - goto err; + return err; } rx_ring->xdp_prog = rx_ring->vsi->xdp_prog; return 0; -err: - kfree(rx_ring->rx_bi); - rx_ring->rx_bi = NULL; - return err; } /** @@ -1592,7 +1563,7 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count) return false; rx_desc = I40E_RX_DESC(rx_ring, ntu); - bi = &rx_ring->rx_bi[ntu]; + bi = i40e_rx_bi(rx_ring, ntu); do { if (!i40e_alloc_mapped_page(rx_ring, bi)) @@ -1614,7 +1585,7 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count) ntu++; if (unlikely(ntu == rx_ring->count)) { rx_desc = I40E_RX_DESC(rx_ring, 0); - bi = rx_ring->rx_bi; + bi = i40e_rx_bi(rx_ring, 0); ntu = 0; } @@ -1981,7 +1952,7 @@ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring, { struct i40e_rx_buffer *rx_buffer; - rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean]; + rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); prefetchw(rx_buffer->page); /* we are reusing so sync this buffer for CPU use */ @@ -2382,9 +2353,12 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) */ dma_rmb(); - rx_buffer = i40e_clean_programming_status(rx_ring, rx_desc, - qword); - if (unlikely(rx_buffer)) { + if (i40e_rx_is_programming_status(qword)) { + i40e_clean_programming_status(rx_ring, + rx_desc->raw.qword[0], + qword); + rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); + i40e_inc_ntc(rx_ring); i40e_reuse_rx_page(rx_ring, rx_buffer); cleaned_count++; continue; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index 36d37f31a287..5c255977fd58 100644 --- 
a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -296,17 +296,9 @@ struct i40e_tx_buffer { struct i40e_rx_buffer { dma_addr_t dma; - union { - struct { - struct page *page; - __u32 page_offset; - __u16 pagecnt_bias; - }; - struct { - void *addr; - u64 handle; - }; - }; + struct page *page; + __u32 page_offset; + __u16 pagecnt_bias; }; struct i40e_queue_stats { @@ -358,6 +350,7 @@ struct i40e_ring { union { struct i40e_tx_buffer *tx_bi; struct i40e_rx_buffer *rx_bi; + struct xdp_buff **rx_bi_zc; }; DECLARE_BITMAP(state, __I40E_RING_STATE_NBITS); u16 queue_index; /* Queue number of ring */ @@ -419,7 +412,6 @@ struct i40e_ring { struct i40e_channel *ch; struct xdp_rxq_info xdp_rxq; struct xdp_umem *xsk_umem; - struct zero_copy_allocator zca; /* ZC allocator anchor */ } ____cacheline_internodealigned_in_smp; static inline bool ring_uses_build_skb(struct i40e_ring *ring) @@ -495,6 +487,7 @@ int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size); bool __i40e_chk_linearize(struct sk_buff *skb); int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); +int i40e_alloc_rx_bi(struct i40e_ring *rx_ring); /** * i40e_get_head - Retrieve head from head writeback diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h index 8af0e99c6c0d..667c4dc4b39f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h @@ -4,13 +4,9 @@ #ifndef I40E_TXRX_COMMON_ #define I40E_TXRX_COMMON_ -void i40e_fd_handle_status(struct i40e_ring *rx_ring, - union i40e_rx_desc *rx_desc, u8 prog_id); int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, struct i40e_ring *xdp_ring); -struct i40e_rx_buffer *i40e_clean_programming_status( - struct i40e_ring *rx_ring, - union i40e_rx_desc *rx_desc, - u64 qw); +void i40e_clean_programming_status(struct i40e_ring *rx_ring, u64 qword0_raw, + u64 qword1); void i40e_process_skb_fields(struct i40e_ring *rx_ring, union i40e_rx_desc *rx_desc, struct sk_buff *skb); void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring); @@ -84,6 +80,38 @@ static inline void i40e_arm_wb(struct i40e_ring *tx_ring, } } +/** + * i40e_rx_is_programming_status - check for programming status descriptor + * @qword1: qword1 representing status_error_len in CPU ordering + * + * The value of in the descriptor length field indicate if this + * is a programming status descriptor for flow director or FCoE + * by the value of I40E_RX_PROG_STATUS_DESC_LENGTH, otherwise + * it is a packet descriptor. + **/ +static inline bool i40e_rx_is_programming_status(u64 qword1) +{ + /* The Rx filter programming status and SPH bit occupy the same + * spot in the descriptor. Since we don't support packet split we + * can just reuse the bit as an indication that this is a + * programming status descriptor. + */ + return qword1 & I40E_RXD_QW1_LENGTH_SPH_MASK; +} + +/** + * i40e_inc_ntc: Advance the next_to_clean index + * @rx_ring: Rx ring + **/ +static inline void i40e_inc_ntc(struct i40e_ring *rx_ring) +{ + u32 ntc = rx_ring->next_to_clean + 1; + + ntc = (ntc < rx_ring->count) ? 
ntc : 0; + rx_ring->next_to_clean = ntc; + prefetch(I40E_RX_DESC(rx_ring, ntc)); +} + void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring); void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring); bool i40e_xsk_any_rx_ring_enabled(struct i40e_vsi *vsi); diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h index 6ea2867ff60f..63e098f7cb63 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_type.h +++ b/drivers/net/ethernet/intel/i40e/i40e_type.h @@ -689,7 +689,7 @@ union i40e_32byte_rx_desc { __le64 rsvd2; } read; struct { - struct { + struct i40e_32b_rx_wb_qw0 { struct { union { __le16 mirroring_status; @@ -727,6 +727,9 @@ union i40e_32byte_rx_desc { } hi_dword; } qword3; } wb; /* writeback */ + struct { + u64 qword[4]; + } raw; }; enum i40e_rx_desc_status_bits { diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 2b9184aead5f..f3953744c505 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -2,68 +2,30 @@ /* Copyright(c) 2018 Intel Corporation. */ #include <linux/bpf_trace.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include <net/xdp.h> #include "i40e.h" #include "i40e_txrx_common.h" #include "i40e_xsk.h" -/** - * i40e_xsk_umem_dma_map - DMA maps all UMEM memory for the netdev - * @vsi: Current VSI - * @umem: UMEM to DMA map - * - * Returns 0 on success, <0 on failure - **/ -static int i40e_xsk_umem_dma_map(struct i40e_vsi *vsi, struct xdp_umem *umem) +int i40e_alloc_rx_bi_zc(struct i40e_ring *rx_ring) { - struct i40e_pf *pf = vsi->back; - struct device *dev; - unsigned int i, j; - dma_addr_t dma; - - dev = &pf->pdev->dev; - for (i = 0; i < umem->npgs; i++) { - dma = dma_map_page_attrs(dev, umem->pgs[i], 0, PAGE_SIZE, - DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR); - if (dma_mapping_error(dev, dma)) - goto out_unmap; + unsigned long sz = sizeof(*rx_ring->rx_bi_zc) * rx_ring->count; - umem->pages[i].dma = dma; - } - - return 0; - -out_unmap: - for (j = 0; j < i; j++) { - dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR); - umem->pages[i].dma = 0; - } - - return -1; + rx_ring->rx_bi_zc = kzalloc(sz, GFP_KERNEL); + return rx_ring->rx_bi_zc ? 
0 : -ENOMEM; } -/** - * i40e_xsk_umem_dma_unmap - DMA unmaps all UMEM memory for the netdev - * @vsi: Current VSI - * @umem: UMEM to DMA map - **/ -static void i40e_xsk_umem_dma_unmap(struct i40e_vsi *vsi, struct xdp_umem *umem) +void i40e_clear_rx_bi_zc(struct i40e_ring *rx_ring) { - struct i40e_pf *pf = vsi->back; - struct device *dev; - unsigned int i; - - dev = &pf->pdev->dev; - - for (i = 0; i < umem->npgs; i++) { - dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR); + memset(rx_ring->rx_bi_zc, 0, + sizeof(*rx_ring->rx_bi_zc) * rx_ring->count); +} - umem->pages[i].dma = 0; - } +static struct xdp_buff **i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx) +{ + return &rx_ring->rx_bi_zc[idx]; } /** @@ -78,7 +40,6 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem, u16 qid) { struct net_device *netdev = vsi->netdev; - struct xdp_umem_fq_reuse *reuseq; bool if_running; int err; @@ -92,13 +53,7 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem, qid >= netdev->real_num_tx_queues) return -EINVAL; - reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count); - if (!reuseq) - return -ENOMEM; - - xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq)); - - err = i40e_xsk_umem_dma_map(vsi, umem); + err = xsk_buff_dma_map(umem, &vsi->back->pdev->dev, I40E_RX_DMA_ATTR); if (err) return err; @@ -151,7 +106,7 @@ static int i40e_xsk_umem_disable(struct i40e_vsi *vsi, u16 qid) } clear_bit(qid, vsi->af_xdp_zc_qps); - i40e_xsk_umem_dma_unmap(vsi, umem); + xsk_buff_dma_unmap(umem, I40E_RX_DMA_ATTR); if (if_running) { err = i40e_queue_pair_enable(vsi, qid); @@ -190,11 +145,9 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem, **/ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp) { - struct xdp_umem *umem = rx_ring->xsk_umem; int err, result = I40E_XDP_PASS; struct i40e_ring *xdp_ring; struct bpf_prog *xdp_prog; - u64 offset; u32 act; rcu_read_lock(); @@ -203,9 +156,6 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp) */ xdp_prog = READ_ONCE(rx_ring->xdp_prog); act = bpf_prog_run_xdp(xdp_prog, xdp); - offset = xdp->data - xdp->data_hard_start; - - xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset); switch (act) { case XDP_PASS: @@ -232,107 +182,26 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp) return result; } -/** - * i40e_alloc_buffer_zc - Allocates an i40e_rx_buffer - * @rx_ring: Rx ring - * @bi: Rx buffer to populate - * - * This function allocates an Rx buffer. The buffer can come from fill - * queue, or via the recycle queue (next_to_alloc). 
- * - * Returns true for a successful allocation, false otherwise - **/ -static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring, - struct i40e_rx_buffer *bi) -{ - struct xdp_umem *umem = rx_ring->xsk_umem; - void *addr = bi->addr; - u64 handle, hr; - - if (addr) { - rx_ring->rx_stats.page_reuse_count++; - return true; - } - - if (!xsk_umem_peek_addr(umem, &handle)) { - rx_ring->rx_stats.alloc_page_failed++; - return false; - } - - hr = umem->headroom + XDP_PACKET_HEADROOM; - - bi->dma = xdp_umem_get_dma(umem, handle); - bi->dma += hr; - - bi->addr = xdp_umem_get_data(umem, handle); - bi->addr += hr; - - bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - - xsk_umem_release_addr(umem); - return true; -} - -/** - * i40e_alloc_buffer_slow_zc - Allocates an i40e_rx_buffer - * @rx_ring: Rx ring - * @bi: Rx buffer to populate - * - * This function allocates an Rx buffer. The buffer can come from fill - * queue, or via the reuse queue. - * - * Returns true for a successful allocation, false otherwise - **/ -static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring, - struct i40e_rx_buffer *bi) -{ - struct xdp_umem *umem = rx_ring->xsk_umem; - u64 handle, hr; - - if (!xsk_umem_peek_addr_rq(umem, &handle)) { - rx_ring->rx_stats.alloc_page_failed++; - return false; - } - - handle &= rx_ring->xsk_umem->chunk_mask; - - hr = umem->headroom + XDP_PACKET_HEADROOM; - - bi->dma = xdp_umem_get_dma(umem, handle); - bi->dma += hr; - - bi->addr = xdp_umem_get_data(umem, handle); - bi->addr += hr; - - bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - - xsk_umem_release_addr_rq(umem); - return true; -} - -static __always_inline bool -__i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count, - bool alloc(struct i40e_ring *rx_ring, - struct i40e_rx_buffer *bi)) +bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count) { u16 ntu = rx_ring->next_to_use; union i40e_rx_desc *rx_desc; - struct i40e_rx_buffer *bi; + struct xdp_buff **bi, *xdp; + dma_addr_t dma; bool ok = true; rx_desc = I40E_RX_DESC(rx_ring, ntu); - bi = &rx_ring->rx_bi[ntu]; + bi = i40e_rx_bi(rx_ring, ntu); do { - if (!alloc(rx_ring, bi)) { + xdp = xsk_buff_alloc(rx_ring->xsk_umem); + if (!xdp) { ok = false; goto no_buffers; } - - dma_sync_single_range_for_device(rx_ring->dev, bi->dma, 0, - rx_ring->rx_buf_len, - DMA_BIDIRECTIONAL); - - rx_desc->read.pkt_addr = cpu_to_le64(bi->dma); + *bi = xdp; + dma = xsk_buff_xdp_get_dma(xdp); + rx_desc->read.pkt_addr = cpu_to_le64(dma); + rx_desc->read.hdr_addr = 0; rx_desc++; bi++; @@ -340,11 +209,10 @@ __i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count, if (unlikely(ntu == rx_ring->count)) { rx_desc = I40E_RX_DESC(rx_ring, 0); - bi = rx_ring->rx_bi; + bi = i40e_rx_bi(rx_ring, 0); ntu = 0; } - rx_desc->wb.qword1.status_error_len = 0; count--; } while (count); @@ -356,127 +224,8 @@ no_buffers: } /** - * i40e_alloc_rx_buffers_zc - Allocates a number of Rx buffers - * @rx_ring: Rx ring - * @count: The number of buffers to allocate - * - * This function allocates a number of Rx buffers from the reuse queue - * or fill ring and places them on the Rx ring. 
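The new i40e_alloc_rx_buffers_zc() above is a plain ring-refill loop: take a buffer from the XSK pool, write its DMA address into the next descriptor, advance next_to_use and wrap at the end of the ring, stopping early if the pool runs dry. The following is a self-contained model of that loop under the assumption that the allocator is just a malloc() wrapper with a fixed budget; it is not the xsk_buff_pool API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RING_SIZE 8

struct rx_desc { uint64_t pkt_addr; };

struct ring {
        struct rx_desc desc[RING_SIZE];
        void *bufs[RING_SIZE];          /* per-slot buffer pointers ("rx_bi") */
        uint16_t next_to_use;
};

/* Stand-in for xsk_buff_alloc(): fails once the fill budget is exhausted. */
static int pool_budget = 5;

static void *buf_alloc(void)
{
        if (!pool_budget)
                return NULL;
        pool_budget--;
        return malloc(2048);
}

/* Refill up to 'count' descriptors; returns false if the pool ran dry.
 * Buffers placed before the failure stay on the ring, as in the driver.
 */
static bool refill(struct ring *r, uint16_t count)
{
        uint16_t ntu = r->next_to_use;

        while (count--) {
                void *buf = buf_alloc();

                if (!buf) {
                        r->next_to_use = ntu;
                        return false;
                }
                r->bufs[ntu] = buf;
                r->desc[ntu].pkt_addr = (uint64_t)(uintptr_t)buf;

                if (++ntu == RING_SIZE)         /* wrap to the ring start */
                        ntu = 0;
        }
        r->next_to_use = ntu;
        return true;
}

int main(void)
{
        struct ring r = { .next_to_use = 0 };

        printf("refill %s, next_to_use=%u\n",
               refill(&r, 6) ? "ok" : "ran out of buffers",
               (unsigned int)r.next_to_use);
        return 0;
}
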
- * - * Returns true for a successful allocation, false otherwise - **/ -bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count) -{ - return __i40e_alloc_rx_buffers_zc(rx_ring, count, - i40e_alloc_buffer_slow_zc); -} - -/** - * i40e_alloc_rx_buffers_fast_zc - Allocates a number of Rx buffers - * @rx_ring: Rx ring - * @count: The number of buffers to allocate - * - * This function allocates a number of Rx buffers from the fill ring - * or the internal recycle mechanism and places them on the Rx ring. - * - * Returns true for a successful allocation, false otherwise - **/ -static bool i40e_alloc_rx_buffers_fast_zc(struct i40e_ring *rx_ring, u16 count) -{ - return __i40e_alloc_rx_buffers_zc(rx_ring, count, - i40e_alloc_buffer_zc); -} - -/** - * i40e_get_rx_buffer_zc - Return the current Rx buffer - * @rx_ring: Rx ring - * @size: The size of the rx buffer (read from descriptor) - * - * This function returns the current, received Rx buffer, and also - * does DMA synchronization. the Rx ring. - * - * Returns the received Rx buffer - **/ -static struct i40e_rx_buffer *i40e_get_rx_buffer_zc(struct i40e_ring *rx_ring, - const unsigned int size) -{ - struct i40e_rx_buffer *bi; - - bi = &rx_ring->rx_bi[rx_ring->next_to_clean]; - - /* we are reusing so sync this buffer for CPU use */ - dma_sync_single_range_for_cpu(rx_ring->dev, - bi->dma, 0, - size, - DMA_BIDIRECTIONAL); - - return bi; -} - -/** - * i40e_reuse_rx_buffer_zc - Recycle an Rx buffer - * @rx_ring: Rx ring - * @old_bi: The Rx buffer to recycle - * - * This function recycles a finished Rx buffer, and places it on the - * recycle queue (next_to_alloc). - **/ -static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring, - struct i40e_rx_buffer *old_bi) -{ - struct i40e_rx_buffer *new_bi = &rx_ring->rx_bi[rx_ring->next_to_alloc]; - u16 nta = rx_ring->next_to_alloc; - - /* update, and store next to alloc */ - nta++; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - - /* transfer page from old buffer to new buffer */ - new_bi->dma = old_bi->dma; - new_bi->addr = old_bi->addr; - new_bi->handle = old_bi->handle; - - old_bi->addr = NULL; -} - -/** - * i40e_zca_free - Free callback for MEM_TYPE_ZERO_COPY allocations - * @alloc: Zero-copy allocator - * @handle: Buffer handle - **/ -void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle) -{ - struct i40e_rx_buffer *bi; - struct i40e_ring *rx_ring; - u64 hr, mask; - u16 nta; - - rx_ring = container_of(alloc, struct i40e_ring, zca); - hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM; - mask = rx_ring->xsk_umem->chunk_mask; - - nta = rx_ring->next_to_alloc; - bi = &rx_ring->rx_bi[nta]; - - nta++; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - - handle &= mask; - - bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle); - bi->dma += hr; - - bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle); - bi->addr += hr; - - bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle, - rx_ring->xsk_umem->headroom); -} - -/** * i40e_construct_skb_zc - Create skbufff from zero-copy Rx buffer * @rx_ring: Rx ring - * @bi: Rx buffer * @xdp: xdp_buff * * This functions allocates a new skb from a zero-copy Rx buffer. @@ -484,7 +233,6 @@ void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle) * Returns the skb, or NULL on failure. 
**/ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring, - struct i40e_rx_buffer *bi, struct xdp_buff *xdp) { unsigned int metasize = xdp->data - xdp->data_meta; @@ -503,24 +251,11 @@ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring, if (metasize) skb_metadata_set(skb, metasize); - i40e_reuse_rx_buffer_zc(rx_ring, bi); + xsk_buff_free(xdp); return skb; } /** - * i40e_inc_ntc: Advance the next_to_clean index - * @rx_ring: Rx ring - **/ -static void i40e_inc_ntc(struct i40e_ring *rx_ring) -{ - u32 ntc = rx_ring->next_to_clean + 1; - - ntc = (ntc < rx_ring->count) ? ntc : 0; - rx_ring->next_to_clean = ntc; - prefetch(I40E_RX_DESC(rx_ring, ntc)); -} - -/** * i40e_clean_rx_irq_zc - Consumes Rx packets from the hardware ring * @rx_ring: Rx ring * @budget: NAPI budget @@ -531,25 +266,20 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; u16 cleaned_count = I40E_DESC_UNUSED(rx_ring); - struct xdp_umem *umem = rx_ring->xsk_umem; unsigned int xdp_res, xdp_xmit = 0; bool failure = false; struct sk_buff *skb; - struct xdp_buff xdp; - - xdp.rxq = &rx_ring->xdp_rxq; - xdp.frame_sz = xsk_umem_xdp_frame_sz(umem); while (likely(total_rx_packets < (unsigned int)budget)) { - struct i40e_rx_buffer *bi; union i40e_rx_desc *rx_desc; + struct xdp_buff **bi; unsigned int size; u64 qword; if (cleaned_count >= I40E_RX_BUFFER_WRITE) { failure = failure || - !i40e_alloc_rx_buffers_fast_zc(rx_ring, - cleaned_count); + !i40e_alloc_rx_buffers_zc(rx_ring, + cleaned_count); cleaned_count = 0; } @@ -562,35 +292,36 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) */ dma_rmb(); - bi = i40e_clean_programming_status(rx_ring, rx_desc, - qword); - if (unlikely(bi)) { - i40e_reuse_rx_buffer_zc(rx_ring, bi); + if (i40e_rx_is_programming_status(qword)) { + i40e_clean_programming_status(rx_ring, + rx_desc->raw.qword[0], + qword); + bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); + xsk_buff_free(*bi); + *bi = NULL; cleaned_count++; + i40e_inc_ntc(rx_ring); continue; } + bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >> I40E_RXD_QW1_LENGTH_PBUF_SHIFT; if (!size) break; - bi = i40e_get_rx_buffer_zc(rx_ring, size); - xdp.data = bi->addr; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; - xdp.data_end = xdp.data + size; - xdp.handle = bi->handle; + bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); + (*bi)->data_end = (*bi)->data + size; + xsk_buff_dma_sync_for_cpu(*bi); - xdp_res = i40e_run_xdp_zc(rx_ring, &xdp); + xdp_res = i40e_run_xdp_zc(rx_ring, *bi); if (xdp_res) { - if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) { + if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) xdp_xmit |= xdp_res; - bi->addr = NULL; - } else { - i40e_reuse_rx_buffer_zc(rx_ring, bi); - } + else + xsk_buff_free(*bi); + *bi = NULL; total_rx_bytes += size; total_rx_packets++; @@ -606,7 +337,8 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) * BIT(I40E_RXD_QW1_ERROR_SHIFT). This is due to that * SBP is *not* set in PRT_SBPVSI (default not set). 
*/ - skb = i40e_construct_skb_zc(rx_ring, bi, &xdp); + skb = i40e_construct_skb_zc(rx_ring, *bi); + *bi = NULL; if (!skb) { rx_ring->rx_stats.alloc_buff_failed++; break; @@ -664,10 +396,9 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget) if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc)) break; - dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr); - - dma_sync_single_for_device(xdp_ring->dev, dma, desc.len, - DMA_BIDIRECTIONAL); + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma, + desc.len); tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use]; tx_bi->bytecount = desc.len; @@ -826,13 +557,13 @@ void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring) u16 i; for (i = 0; i < rx_ring->count; i++) { - struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i]; + struct xdp_buff *rx_bi = *i40e_rx_bi(rx_ring, i); - if (!rx_bi->addr) + if (!rx_bi) continue; - xsk_umem_fq_reuse(rx_ring->xsk_umem, rx_bi->handle); - rx_bi->addr = NULL; + xsk_buff_free(rx_bi); + rx_bi = NULL; } } diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h index 9ed59c14eb55..ea919a7d60ec 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.h +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h @@ -12,12 +12,13 @@ int i40e_queue_pair_disable(struct i40e_vsi *vsi, int queue_pair); int i40e_queue_pair_enable(struct i40e_vsi *vsi, int queue_pair); int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem, u16 qid); -void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle); bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count); int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget); bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi, struct i40e_ring *tx_ring, int napi_budget); int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags); +int i40e_alloc_rx_bi_zc(struct i40e_ring *rx_ring); +void i40e_clear_rx_bi_zc(struct i40e_ring *rx_ring); #endif /* _I40E_XSK_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 00c072f61a32..94d833b4e745 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019, Intel Corporation. */ +#include <net/xdp_sock_drv.h> #include "ice_base.h" #include "ice_dcb_lib.h" @@ -309,24 +310,23 @@ int ice_setup_rx_ctx(struct ice_ring *ring) if (ring->xsk_umem) { xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); - ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr - - XDP_PACKET_HEADROOM; + ring->rx_buf_len = + xsk_umem_get_rx_frame_size(ring->xsk_umem); /* For AF_XDP ZC, we disallow packets to span on * multiple buffers, thus letting us skip that * handling in the fast-path. 
*/ chain_len = 1; - ring->zca.free = ice_zca_free; err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, - MEM_TYPE_ZERO_COPY, - &ring->zca); + MEM_TYPE_XSK_BUFF_POOL, + NULL); if (err) return err; + xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq); - dev_info(ice_pf_to_dev(vsi->back), "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n", + dev_info(ice_pf_to_dev(vsi->back), "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", ring->q_index); } else { - ring->zca.free = NULL; if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) /* coverity[check_return] */ xdp_rxq_info_reg(&ring->xdp_rxq, @@ -427,7 +427,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring) writel(0, ring->tail); err = ring->xsk_umem ? - ice_alloc_rx_bufs_slow_zc(ring, ICE_DESC_UNUSED(ring)) : + ice_alloc_rx_bufs_zc(ring, ICE_DESC_UNUSED(ring)) : ice_alloc_rx_bufs(ring, ICE_DESC_UNUSED(ring)); if (err) dev_info(ice_pf_to_dev(vsi->back), "Failed allocate some buffers on %sRx ring %d (pf_q %d)\n", diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index 7c4030caeea4..cf21b4fe928a 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -162,17 +162,16 @@ struct ice_tx_offload_params { }; struct ice_rx_buf { - struct sk_buff *skb; - dma_addr_t dma; union { struct { + struct sk_buff *skb; + dma_addr_t dma; struct page *page; unsigned int page_offset; u16 pagecnt_bias; }; struct { - void *addr; - u64 handle; + struct xdp_buff *xdp; }; }; }; @@ -296,7 +295,6 @@ struct ice_ring { struct rcu_head rcu; /* to avoid race on free */ struct bpf_prog *xdp_prog; struct xdp_umem *xsk_umem; - struct zero_copy_allocator zca; /* CL3 - 3rd cacheline starts here */ struct xdp_rxq_info xdp_rxq; /* CLX - the below items are only accessed infrequently and should be diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 20ac54e3156d..b6f928c9e9c9 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -2,7 +2,7 @@ /* Copyright (c) 2019, Intel Corporation. 
*/ #include <linux/bpf_trace.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include <net/xdp.h> #include "ice.h" #include "ice_base.h" @@ -280,28 +280,6 @@ static int ice_xsk_alloc_umems(struct ice_vsi *vsi) } /** - * ice_xsk_add_umem - add a UMEM region for XDP sockets - * @vsi: VSI to which the UMEM will be added - * @umem: pointer to a requested UMEM region - * @qid: queue ID - * - * Returns 0 on success, negative on error - */ -static int ice_xsk_add_umem(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid) -{ - int err; - - err = ice_xsk_alloc_umems(vsi); - if (err) - return err; - - vsi->xsk_umems[qid] = umem; - vsi->num_xsk_umems_used++; - - return 0; -} - -/** * ice_xsk_remove_umem - Remove an UMEM for a certain ring/qid * @vsi: VSI from which the VSI will be removed * @qid: Ring/qid associated with the UMEM @@ -318,65 +296,6 @@ static void ice_xsk_remove_umem(struct ice_vsi *vsi, u16 qid) } } -/** - * ice_xsk_umem_dma_map - DMA map UMEM region for XDP sockets - * @vsi: VSI to map the UMEM region - * @umem: UMEM to map - * - * Returns 0 on success, negative on error - */ -static int ice_xsk_umem_dma_map(struct ice_vsi *vsi, struct xdp_umem *umem) -{ - struct ice_pf *pf = vsi->back; - struct device *dev; - unsigned int i; - - dev = ice_pf_to_dev(pf); - for (i = 0; i < umem->npgs; i++) { - dma_addr_t dma = dma_map_page_attrs(dev, umem->pgs[i], 0, - PAGE_SIZE, - DMA_BIDIRECTIONAL, - ICE_RX_DMA_ATTR); - if (dma_mapping_error(dev, dma)) { - dev_dbg(dev, "XSK UMEM DMA mapping error on page num %d\n", - i); - goto out_unmap; - } - - umem->pages[i].dma = dma; - } - - return 0; - -out_unmap: - for (; i > 0; i--) { - dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR); - umem->pages[i].dma = 0; - } - - return -EFAULT; -} - -/** - * ice_xsk_umem_dma_unmap - DMA unmap UMEM region for XDP sockets - * @vsi: VSI from which the UMEM will be unmapped - * @umem: UMEM to unmap - */ -static void ice_xsk_umem_dma_unmap(struct ice_vsi *vsi, struct xdp_umem *umem) -{ - struct ice_pf *pf = vsi->back; - struct device *dev; - unsigned int i; - - dev = ice_pf_to_dev(pf); - for (i = 0; i < umem->npgs; i++) { - dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR); - - umem->pages[i].dma = 0; - } -} /** * ice_xsk_umem_disable - disable a UMEM region @@ -391,7 +310,7 @@ static int ice_xsk_umem_disable(struct ice_vsi *vsi, u16 qid) !vsi->xsk_umems[qid]) return -EINVAL; - ice_xsk_umem_dma_unmap(vsi, vsi->xsk_umems[qid]); + xsk_buff_dma_unmap(vsi->xsk_umems[qid], ICE_RX_DMA_ATTR); ice_xsk_remove_umem(vsi, qid); return 0; @@ -408,7 +327,6 @@ static int ice_xsk_umem_disable(struct ice_vsi *vsi, u16 qid) static int ice_xsk_umem_enable(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid) { - struct xdp_umem_fq_reuse *reuseq; int err; if (vsi->type != ICE_VSI_PF) @@ -419,20 +337,18 @@ ice_xsk_umem_enable(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid) if (qid >= vsi->num_xsk_umems) return -EINVAL; + err = ice_xsk_alloc_umems(vsi); + if (err) + return err; + if (vsi->xsk_umems && vsi->xsk_umems[qid]) return -EBUSY; - reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count); - if (!reuseq) - return -ENOMEM; - - xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq)); - - err = ice_xsk_umem_dma_map(vsi, umem); - if (err) - return err; + vsi->xsk_umems[qid] = umem; + vsi->num_xsk_umems_used++; - err = ice_xsk_add_umem(vsi, umem, qid); + err = xsk_buff_dma_map(vsi->xsk_umems[qid], ice_pf_to_dev(vsi->back), + ICE_RX_DMA_ATTR); if (err) 
return err; @@ -484,137 +400,22 @@ xsk_umem_if_up: } /** - * ice_zca_free - Callback for MEM_TYPE_ZERO_COPY allocations - * @zca: zero-cpoy allocator - * @handle: Buffer handle - */ -void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle) -{ - struct ice_rx_buf *rx_buf; - struct ice_ring *rx_ring; - struct xdp_umem *umem; - u64 hr, mask; - u16 nta; - - rx_ring = container_of(zca, struct ice_ring, zca); - umem = rx_ring->xsk_umem; - hr = umem->headroom + XDP_PACKET_HEADROOM; - - mask = umem->chunk_mask; - - nta = rx_ring->next_to_alloc; - rx_buf = &rx_ring->rx_buf[nta]; - - nta++; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - - handle &= mask; - - rx_buf->dma = xdp_umem_get_dma(umem, handle); - rx_buf->dma += hr; - - rx_buf->addr = xdp_umem_get_data(umem, handle); - rx_buf->addr += hr; - - rx_buf->handle = (u64)handle + umem->headroom; -} - -/** - * ice_alloc_buf_fast_zc - Retrieve buffer address from XDP umem - * @rx_ring: ring with an xdp_umem bound to it - * @rx_buf: buffer to which xsk page address will be assigned - * - * This function allocates an Rx buffer in the hot path. - * The buffer can come from fill queue or recycle queue. - * - * Returns true if an assignment was successful, false if not. - */ -static __always_inline bool -ice_alloc_buf_fast_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) -{ - struct xdp_umem *umem = rx_ring->xsk_umem; - void *addr = rx_buf->addr; - u64 handle, hr; - - if (addr) { - rx_ring->rx_stats.page_reuse_count++; - return true; - } - - if (!xsk_umem_peek_addr(umem, &handle)) { - rx_ring->rx_stats.alloc_page_failed++; - return false; - } - - hr = umem->headroom + XDP_PACKET_HEADROOM; - - rx_buf->dma = xdp_umem_get_dma(umem, handle); - rx_buf->dma += hr; - - rx_buf->addr = xdp_umem_get_data(umem, handle); - rx_buf->addr += hr; - - rx_buf->handle = handle + umem->headroom; - - xsk_umem_release_addr(umem); - return true; -} - -/** - * ice_alloc_buf_slow_zc - Retrieve buffer address from XDP umem - * @rx_ring: ring with an xdp_umem bound to it - * @rx_buf: buffer to which xsk page address will be assigned - * - * This function allocates an Rx buffer in the slow path. - * The buffer can come from fill queue or recycle queue. - * - * Returns true if an assignment was successful, false if not. - */ -static __always_inline bool -ice_alloc_buf_slow_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) -{ - struct xdp_umem *umem = rx_ring->xsk_umem; - u64 handle, headroom; - - if (!xsk_umem_peek_addr_rq(umem, &handle)) { - rx_ring->rx_stats.alloc_page_failed++; - return false; - } - - handle &= umem->chunk_mask; - headroom = umem->headroom + XDP_PACKET_HEADROOM; - - rx_buf->dma = xdp_umem_get_dma(umem, handle); - rx_buf->dma += headroom; - - rx_buf->addr = xdp_umem_get_data(umem, handle); - rx_buf->addr += headroom; - - rx_buf->handle = handle + umem->headroom; - - xsk_umem_release_addr_rq(umem); - return true; -} - -/** * ice_alloc_rx_bufs_zc - allocate a number of Rx buffers * @rx_ring: Rx ring * @count: The number of buffers to allocate - * @alloc: the function pointer to call for allocation * * This function allocates a number of Rx buffers from the fill ring * or the internal recycle mechanism and places them on the Rx ring. * * Returns false if all allocations were successful, true if any fail. 
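
With the conversion above, the ice driver no longer maps and recycles UMEM pages itself: the AF_XDP buffer-pool code behind net/xdp_sock_drv.h owns the DMA mapping and the fill/recycle queues, which is what lets ice_zca_free() and the fast/slow allocation helpers be deleted outright. Below is a condensed sketch of the resulting setup/teardown pattern; error handling and the driver's per-queue bookkeeping are omitted, and the example_* names are illustrative, not functions from this patch.

#include <net/xdp_sock_drv.h>
#include "ice.h"

/* Map the whole UMEM once when a socket is bound to the queue... */
static int example_umem_enable(struct ice_vsi *vsi, struct xdp_umem *umem,
                               u16 qid)
{
        vsi->xsk_umems[qid] = umem;

        /* one mapping for the entire region, using the driver's Rx attrs */
        return xsk_buff_dma_map(umem, ice_pf_to_dev(vsi->back),
                                ICE_RX_DMA_ATTR);
}

/* ...and drop it again when the socket goes away. */
static void example_umem_disable(struct ice_vsi *vsi, u16 qid)
{
        xsk_buff_dma_unmap(vsi->xsk_umems[qid], ICE_RX_DMA_ATTR);
        vsi->xsk_umems[qid] = NULL;
}
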
*/ -static bool -ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count, - bool (*alloc)(struct ice_ring *, struct ice_rx_buf *)) +bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count) { union ice_32b_rx_flex_desc *rx_desc; u16 ntu = rx_ring->next_to_use; struct ice_rx_buf *rx_buf; bool ret = false; + dma_addr_t dma; if (!count) return false; @@ -623,16 +424,14 @@ ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count, rx_buf = &rx_ring->rx_buf[ntu]; do { - if (!alloc(rx_ring, rx_buf)) { + rx_buf->xdp = xsk_buff_alloc(rx_ring->xsk_umem); + if (!rx_buf->xdp) { ret = true; break; } - dma_sync_single_range_for_device(rx_ring->dev, rx_buf->dma, 0, - rx_ring->rx_buf_len, - DMA_BIDIRECTIONAL); - - rx_desc->read.pkt_addr = cpu_to_le64(rx_buf->dma); + dma = xsk_buff_xdp_get_dma(rx_buf->xdp); + rx_desc->read.pkt_addr = cpu_to_le64(dma); rx_desc->wb.status_error0 = 0; rx_desc++; @@ -653,32 +452,6 @@ ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count, } /** - * ice_alloc_rx_bufs_fast_zc - allocate zero copy bufs in the hot path - * @rx_ring: Rx ring - * @count: number of bufs to allocate - * - * Returns false on success, true on failure. - */ -static bool ice_alloc_rx_bufs_fast_zc(struct ice_ring *rx_ring, u16 count) -{ - return ice_alloc_rx_bufs_zc(rx_ring, count, - ice_alloc_buf_fast_zc); -} - -/** - * ice_alloc_rx_bufs_slow_zc - allocate zero copy bufs in the slow path - * @rx_ring: Rx ring - * @count: number of bufs to allocate - * - * Returns false on success, true on failure. - */ -bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count) -{ - return ice_alloc_rx_bufs_zc(rx_ring, count, - ice_alloc_buf_slow_zc); -} - -/** * ice_bump_ntc - Bump the next_to_clean counter of an Rx ring * @rx_ring: Rx ring */ @@ -692,76 +465,21 @@ static void ice_bump_ntc(struct ice_ring *rx_ring) } /** - * ice_get_rx_buf_zc - Fetch the current Rx buffer - * @rx_ring: Rx ring - * @size: size of a buffer - * - * This function returns the current, received Rx buffer and does - * DMA synchronization. - * - * Returns a pointer to the received Rx buffer. - */ -static struct ice_rx_buf *ice_get_rx_buf_zc(struct ice_ring *rx_ring, int size) -{ - struct ice_rx_buf *rx_buf; - - rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean]; - - dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma, 0, - size, DMA_BIDIRECTIONAL); - - return rx_buf; -} - -/** - * ice_reuse_rx_buf_zc - reuse an Rx buffer - * @rx_ring: Rx ring - * @old_buf: The buffer to recycle - * - * This function recycles a finished Rx buffer, and places it on the recycle - * queue (next_to_alloc). - */ -static void -ice_reuse_rx_buf_zc(struct ice_ring *rx_ring, struct ice_rx_buf *old_buf) -{ - unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask; - u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM; - u16 nta = rx_ring->next_to_alloc; - struct ice_rx_buf *new_buf; - - new_buf = &rx_ring->rx_buf[nta++]; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - - new_buf->dma = old_buf->dma & mask; - new_buf->dma += hr; - - new_buf->addr = (void *)((unsigned long)old_buf->addr & mask); - new_buf->addr += hr; - - new_buf->handle = old_buf->handle & mask; - new_buf->handle += rx_ring->xsk_umem->headroom; - - old_buf->addr = NULL; -} - -/** * ice_construct_skb_zc - Create an sk_buff from zero-copy buffer * @rx_ring: Rx ring * @rx_buf: zero-copy Rx buffer - * @xdp: XDP buffer * * This function allocates a new skb from a zero-copy Rx buffer. * * Returns the skb on success, NULL on failure. 
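
The new ice_alloc_rx_bufs_zc() above boils the refill path down to one helper call per descriptor: xsk_buff_alloc() hands back an xdp_buff that is already DMA-mapped and sized by the pool, and xsk_buff_xdp_get_dma() yields the address to program into the descriptor, so the old per-buffer dma_sync_single_range_for_device() disappears. A minimal single-descriptor sketch of that pattern follows; the descriptor lookup and ring-wrap handling of the real loop are left out, and example_refill_one() is an illustrative name only.

#include <net/xdp_sock_drv.h>
#include "ice.h"

static bool example_refill_one(struct ice_ring *rx_ring,
                               union ice_32b_rx_flex_desc *rx_desc, u16 ntu)
{
        struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[ntu];

        /* pulls a pre-mapped buffer from the fill queue / pool cache */
        rx_buf->xdp = xsk_buff_alloc(rx_ring->xsk_umem);
        if (!rx_buf->xdp)
                return false;   /* fill queue empty; retry later */

        /* DMA address comes straight from the pool, no extra sync here */
        rx_desc->read.pkt_addr = cpu_to_le64(xsk_buff_xdp_get_dma(rx_buf->xdp));
        rx_desc->wb.status_error0 = 0;

        return true;
}
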
*/ static struct sk_buff * -ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, - struct xdp_buff *xdp) +ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) { - unsigned int metasize = xdp->data - xdp->data_meta; - unsigned int datasize = xdp->data_end - xdp->data; - unsigned int datasize_hard = xdp->data_end - - xdp->data_hard_start; + unsigned int metasize = rx_buf->xdp->data - rx_buf->xdp->data_meta; + unsigned int datasize = rx_buf->xdp->data_end - rx_buf->xdp->data; + unsigned int datasize_hard = rx_buf->xdp->data_end - + rx_buf->xdp->data_hard_start; struct sk_buff *skb; skb = __napi_alloc_skb(&rx_ring->q_vector->napi, datasize_hard, @@ -769,13 +487,13 @@ ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, if (unlikely(!skb)) return NULL; - skb_reserve(skb, xdp->data - xdp->data_hard_start); - memcpy(__skb_put(skb, datasize), xdp->data, datasize); + skb_reserve(skb, rx_buf->xdp->data - rx_buf->xdp->data_hard_start); + memcpy(__skb_put(skb, datasize), rx_buf->xdp->data, datasize); if (metasize) skb_metadata_set(skb, metasize); - ice_reuse_rx_buf_zc(rx_ring, rx_buf); - + xsk_buff_free(rx_buf->xdp); + rx_buf->xdp = NULL; return skb; } @@ -802,7 +520,6 @@ ice_run_xdp_zc(struct ice_ring *rx_ring, struct xdp_buff *xdp) } act = bpf_prog_run_xdp(xdp_prog, xdp); - xdp->handle += xdp->data - xdp->data_hard_start; switch (act) { case XDP_PASS: break; @@ -840,13 +557,8 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; u16 cleaned_count = ICE_DESC_UNUSED(rx_ring); - struct xdp_umem *umem = rx_ring->xsk_umem; unsigned int xdp_xmit = 0; bool failure = false; - struct xdp_buff xdp; - - xdp.rxq = &rx_ring->xdp_rxq; - xdp.frame_sz = xsk_umem_xdp_frame_sz(umem); while (likely(total_rx_packets < (unsigned int)budget)) { union ice_32b_rx_flex_desc *rx_desc; @@ -858,8 +570,8 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget) u8 rx_ptype; if (cleaned_count >= ICE_RX_BUF_WRITE) { - failure |= ice_alloc_rx_bufs_fast_zc(rx_ring, - cleaned_count); + failure |= ice_alloc_rx_bufs_zc(rx_ring, + cleaned_count); cleaned_count = 0; } @@ -880,25 +592,19 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget) if (!size) break; - rx_buf = ice_get_rx_buf_zc(rx_ring, size); - if (!rx_buf->addr) - break; - xdp.data = rx_buf->addr; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; - xdp.data_end = xdp.data + size; - xdp.handle = rx_buf->handle; + rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean]; + rx_buf->xdp->data_end = rx_buf->xdp->data + size; + xsk_buff_dma_sync_for_cpu(rx_buf->xdp); - xdp_res = ice_run_xdp_zc(rx_ring, &xdp); + xdp_res = ice_run_xdp_zc(rx_ring, rx_buf->xdp); if (xdp_res) { - if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) { + if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) xdp_xmit |= xdp_res; - rx_buf->addr = NULL; - } else { - ice_reuse_rx_buf_zc(rx_ring, rx_buf); - } + else + xsk_buff_free(rx_buf->xdp); + rx_buf->xdp = NULL; total_rx_bytes += size; total_rx_packets++; cleaned_count++; @@ -908,7 +614,7 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget) } /* XDP_PASS path */ - skb = ice_construct_skb_zc(rx_ring, rx_buf, &xdp); + skb = ice_construct_skb_zc(rx_ring, rx_buf); if (!skb) { rx_ring->rx_stats.alloc_buf_failed++; break; @@ -979,10 +685,9 @@ static bool ice_xmit_zc(struct ice_ring *xdp_ring, int budget) if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc)) break; - dma = 
xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr); - - dma_sync_single_for_device(xdp_ring->dev, dma, desc.len, - DMA_BIDIRECTIONAL); + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma, + desc.len); tx_buf->bytecount = desc.len; @@ -1165,11 +870,10 @@ void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring) for (i = 0; i < rx_ring->count; i++) { struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i]; - if (!rx_buf->addr) + if (!rx_buf->xdp) continue; - xsk_umem_fq_reuse(rx_ring->xsk_umem, rx_buf->handle); - rx_buf->addr = NULL; + rx_buf->xdp = NULL; } } diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.h b/drivers/net/ethernet/intel/ice/ice_xsk.h index 8a4ba7c6d549..fc1a06b4df36 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.h +++ b/drivers/net/ethernet/intel/ice/ice_xsk.h @@ -10,11 +10,10 @@ struct ice_vsi; #ifdef CONFIG_XDP_SOCKETS int ice_xsk_umem_setup(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid); -void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle); int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget); bool ice_clean_tx_irq_zc(struct ice_ring *xdp_ring, int budget); int ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags); -bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count); +bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count); bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi); void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring); void ice_xsk_clean_xdp_ring(struct ice_ring *xdp_ring); @@ -27,12 +26,6 @@ ice_xsk_umem_setup(struct ice_vsi __always_unused *vsi, return -EOPNOTSUPP; } -static inline void -ice_zca_free(struct zero_copy_allocator __always_unused *zca, - unsigned long __always_unused handle) -{ -} - static inline int ice_clean_rx_irq_zc(struct ice_ring __always_unused *rx_ring, int __always_unused budget) @@ -48,8 +41,8 @@ ice_clean_tx_irq_zc(struct ice_ring __always_unused *xdp_ring, } static inline bool -ice_alloc_rx_bufs_slow_zc(struct ice_ring __always_unused *rx_ring, - u16 __always_unused count) +ice_alloc_rx_bufs_zc(struct ice_ring __always_unused *rx_ring, + u16 __always_unused count) { return false; } diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index 39d3b76a6f5d..2cd003c5ad43 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -143,7 +143,8 @@ static int igb_get_link_ksettings(struct net_device *netdev, u32 speed; u32 supported, advertising; - status = rd32(E1000_STATUS); + status = pm_runtime_suspended(&adapter->pdev->dev) ? 
+ 0 : rd32(E1000_STATUS); if (hw->phy.media_type == e1000_media_type_copper) { supported = (SUPPORTED_10baseT_Half | diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 812e1cd695cf..14f9edaaaf83 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -16,8 +16,7 @@ #include "igc_hw.h" -/* forward declaration */ -void igc_set_ethtool_ops(struct net_device *); +void igc_ethtool_set_ops(struct net_device *); /* Transmit and receive queues */ #define IGC_MAX_RX_QUEUES 4 @@ -29,6 +28,11 @@ void igc_set_ethtool_ops(struct net_device *); #define MAX_ETYPE_FILTER 8 #define IGC_RETA_SIZE 128 +enum igc_mac_filter_type { + IGC_MAC_FILTER_TYPE_DST = 0, + IGC_MAC_FILTER_TYPE_SRC +}; + struct igc_tx_queue_stats { u64 packets; u64 bytes; @@ -183,14 +187,12 @@ struct igc_adapter { u32 rss_queues; u32 rss_indir_tbl_init; - /* RX network flow classification support */ - struct hlist_head nfc_filter_list; - unsigned int nfc_filter_count; - - /* lock for RX network flow classification filter */ - spinlock_t nfc_lock; - - struct igc_mac_addr *mac_table; + /* Any access to elements in nfc_rule_list is protected by the + * nfc_rule_lock. + */ + struct mutex nfc_rule_lock; + struct list_head nfc_rule_list; + unsigned int nfc_rule_count; u8 rss_indir_tbl[IGC_RETA_SIZE]; @@ -230,15 +232,6 @@ void igc_write_rss_indir_tbl(struct igc_adapter *adapter); bool igc_has_link(struct igc_adapter *adapter); void igc_reset(struct igc_adapter *adapter); int igc_set_spd_dplx(struct igc_adapter *adapter, u32 spd, u8 dplx); -int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, - const s8 queue, const u8 flags); -int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, - const u8 flags); -int igc_add_vlan_prio_filter(struct igc_adapter *adapter, int prio, - int queue); -void igc_del_vlan_prio_filter(struct igc_adapter *adapter, int prio); -int igc_add_etype_filter(struct igc_adapter *adapter, u16 etype, int queue); -int igc_del_etype_filter(struct igc_adapter *adapter, u16 etype); void igc_update_stats(struct igc_adapter *adapter); /* igc_dump declarations */ @@ -449,39 +442,22 @@ enum igc_filter_match_flags { IGC_FILTER_FLAG_DST_MAC_ADDR = 0x8, }; -/* RX network flow classification data structure */ -struct igc_nfc_input { - /* Byte layout in order, all values with MSB first: - * match_flags - 1 byte - * etype - 2 bytes - * vlan_tci - 2 bytes - */ +struct igc_nfc_filter { u8 match_flags; - __be16 etype; - __be16 vlan_tci; + u16 etype; + u16 vlan_tci; u8 src_addr[ETH_ALEN]; u8 dst_addr[ETH_ALEN]; }; -struct igc_nfc_filter { - struct hlist_node nfc_node; - struct igc_nfc_input filter; - unsigned long cookie; - u16 sw_idx; +struct igc_nfc_rule { + struct list_head list; + struct igc_nfc_filter filter; + u32 location; u16 action; }; -struct igc_mac_addr { - u8 addr[ETH_ALEN]; - s8 queue; - u8 state; /* bitmask */ -}; - -#define IGC_MAC_STATE_DEFAULT 0x1 -#define IGC_MAC_STATE_IN_USE 0x2 -#define IGC_MAC_STATE_SRC_ADDR 0x4 - -#define IGC_MAX_RXNFC_FILTERS 16 +#define IGC_MAX_RXNFC_RULES 16 /* igc_desc_unused - calculate if we have unused descriptors */ static inline u16 igc_desc_unused(const struct igc_ring *ring) @@ -557,12 +533,11 @@ static inline s32 igc_read_phy_reg(struct igc_hw *hw, u32 offset, u16 *data) return 0; } -/* forward declaration */ void igc_reinit_locked(struct igc_adapter *); -int igc_add_filter(struct igc_adapter *adapter, - struct igc_nfc_filter *input); -int igc_erase_filter(struct igc_adapter *adapter, - 
struct igc_nfc_filter *input); +struct igc_nfc_rule *igc_get_nfc_rule(struct igc_adapter *adapter, + u32 location); +int igc_add_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule); +void igc_del_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule); void igc_ptp_init(struct igc_adapter *adapter); void igc_ptp_reset(struct igc_adapter *adapter); diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 51d8a15e239c..3d8d40d6fa3f 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -62,6 +62,9 @@ * (RAR[15]) for our directed address used by controllers with * manageability enabled, allowing us room for 15 multicast addresses. */ +#define IGC_RAH_RAH_MASK 0x0000FFFF +#define IGC_RAH_ASEL_MASK 0x00030000 +#define IGC_RAH_ASEL_SRC_ADDR BIT(16) #define IGC_RAH_QSEL_MASK 0x000C0000 #define IGC_RAH_QSEL_SHIFT 18 #define IGC_RAH_QSEL_ENABLE BIT(28) @@ -262,13 +265,9 @@ #define IGC_TXD_POPTS_IXSM 0x01 /* Insert IP checksum */ #define IGC_TXD_POPTS_TXSM 0x02 /* Insert TCP/UDP checksum */ #define IGC_TXD_CMD_EOP 0x01000000 /* End of Packet */ -#define IGC_TXD_CMD_IFCS 0x02000000 /* Insert FCS (Ethernet CRC) */ #define IGC_TXD_CMD_IC 0x04000000 /* Insert Checksum */ -#define IGC_TXD_CMD_RS 0x08000000 /* Report Status */ -#define IGC_TXD_CMD_RPS 0x10000000 /* Report Packet Sent */ #define IGC_TXD_CMD_DEXT 0x20000000 /* Desc extension (0 = legacy) */ #define IGC_TXD_CMD_VLE 0x40000000 /* Add VLAN tag */ -#define IGC_TXD_CMD_IDE 0x80000000 /* Enable Tidv register */ #define IGC_TXD_STAT_DD 0x00000001 /* Descriptor Done */ #define IGC_TXD_STAT_EC 0x00000002 /* Excess Collisions */ #define IGC_TXD_STAT_LC 0x00000004 /* Late Collisions */ diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index c6586e2be3a8..946e775e34ae 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -124,8 +124,8 @@ static const char igc_priv_flags_strings[][ETH_GSTRING_LEN] = { #define IGC_PRIV_FLAGS_STR_LEN ARRAY_SIZE(igc_priv_flags_strings) -static void igc_get_drvinfo(struct net_device *netdev, - struct ethtool_drvinfo *drvinfo) +static void igc_ethtool_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) { struct igc_adapter *adapter = netdev_priv(netdev); @@ -139,13 +139,13 @@ static void igc_get_drvinfo(struct net_device *netdev, drvinfo->n_priv_flags = IGC_PRIV_FLAGS_STR_LEN; } -static int igc_get_regs_len(struct net_device *netdev) +static int igc_ethtool_get_regs_len(struct net_device *netdev) { return IGC_REGS_LEN * sizeof(u32); } -static void igc_get_regs(struct net_device *netdev, - struct ethtool_regs *regs, void *p) +static void igc_ethtool_get_regs(struct net_device *netdev, + struct ethtool_regs *regs, void *p) { struct igc_adapter *adapter = netdev_priv(netdev); struct igc_hw *hw = &adapter->hw; @@ -323,7 +323,8 @@ static void igc_get_regs(struct net_device *netdev, regs_buff[205 + i] = rd32(IGC_ETQF(i)); } -static void igc_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) +static void igc_ethtool_get_wol(struct net_device *netdev, + struct ethtool_wolinfo *wol) { struct igc_adapter *adapter = netdev_priv(netdev); @@ -354,7 +355,8 @@ static void igc_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) wol->wolopts |= WAKE_PHY; } -static int igc_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) +static int 
igc_ethtool_set_wol(struct net_device *netdev, + struct ethtool_wolinfo *wol) { struct igc_adapter *adapter = netdev_priv(netdev); @@ -382,21 +384,21 @@ static int igc_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) return 0; } -static u32 igc_get_msglevel(struct net_device *netdev) +static u32 igc_ethtool_get_msglevel(struct net_device *netdev) { struct igc_adapter *adapter = netdev_priv(netdev); return adapter->msg_enable; } -static void igc_set_msglevel(struct net_device *netdev, u32 data) +static void igc_ethtool_set_msglevel(struct net_device *netdev, u32 data) { struct igc_adapter *adapter = netdev_priv(netdev); adapter->msg_enable = data; } -static int igc_nway_reset(struct net_device *netdev) +static int igc_ethtool_nway_reset(struct net_device *netdev) { struct igc_adapter *adapter = netdev_priv(netdev); @@ -405,7 +407,7 @@ static int igc_nway_reset(struct net_device *netdev) return 0; } -static u32 igc_get_link(struct net_device *netdev) +static u32 igc_ethtool_get_link(struct net_device *netdev) { struct igc_adapter *adapter = netdev_priv(netdev); struct igc_mac_info *mac = &adapter->hw.mac; @@ -422,15 +424,15 @@ static u32 igc_get_link(struct net_device *netdev) return igc_has_link(adapter); } -static int igc_get_eeprom_len(struct net_device *netdev) +static int igc_ethtool_get_eeprom_len(struct net_device *netdev) { struct igc_adapter *adapter = netdev_priv(netdev); return adapter->hw.nvm.word_size * 2; } -static int igc_get_eeprom(struct net_device *netdev, - struct ethtool_eeprom *eeprom, u8 *bytes) +static int igc_ethtool_get_eeprom(struct net_device *netdev, + struct ethtool_eeprom *eeprom, u8 *bytes) { struct igc_adapter *adapter = netdev_priv(netdev); struct igc_hw *hw = &adapter->hw; @@ -476,8 +478,8 @@ static int igc_get_eeprom(struct net_device *netdev, return ret_val; } -static int igc_set_eeprom(struct net_device *netdev, - struct ethtool_eeprom *eeprom, u8 *bytes) +static int igc_ethtool_set_eeprom(struct net_device *netdev, + struct ethtool_eeprom *eeprom, u8 *bytes) { struct igc_adapter *adapter = netdev_priv(netdev); struct igc_hw *hw = &adapter->hw; @@ -544,8 +546,8 @@ static int igc_set_eeprom(struct net_device *netdev, return ret_val; } -static void igc_get_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) +static void igc_ethtool_get_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ring) { struct igc_adapter *adapter = netdev_priv(netdev); @@ -555,8 +557,8 @@ static void igc_get_ringparam(struct net_device *netdev, ring->tx_pending = adapter->tx_ring_count; } -static int igc_set_ringparam(struct net_device *netdev, - struct ethtool_ringparam *ring) +static int igc_ethtool_set_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ring) { struct igc_adapter *adapter = netdev_priv(netdev); struct igc_ring *temp_ring; @@ -670,8 +672,8 @@ clear_reset: return err; } -static void igc_get_pauseparam(struct net_device *netdev, - struct ethtool_pauseparam *pause) +static void igc_ethtool_get_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pause) { struct igc_adapter *adapter = netdev_priv(netdev); struct igc_hw *hw = &adapter->hw; @@ -689,8 +691,8 @@ static void igc_get_pauseparam(struct net_device *netdev, } } -static int igc_set_pauseparam(struct net_device *netdev, - struct ethtool_pauseparam *pause) +static int igc_ethtool_set_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pause) { struct igc_adapter *adapter = netdev_priv(netdev); struct igc_hw *hw = 
&adapter->hw; @@ -729,7 +731,8 @@ static int igc_set_pauseparam(struct net_device *netdev, return retval; } -static void igc_get_strings(struct net_device *netdev, u32 stringset, u8 *data) +static void igc_ethtool_get_strings(struct net_device *netdev, u32 stringset, + u8 *data) { struct igc_adapter *adapter = netdev_priv(netdev); u8 *p = data; @@ -780,7 +783,7 @@ static void igc_get_strings(struct net_device *netdev, u32 stringset, u8 *data) } } -static int igc_get_sset_count(struct net_device *netdev, int sset) +static int igc_ethtool_get_sset_count(struct net_device *netdev, int sset) { switch (sset) { case ETH_SS_STATS: @@ -794,7 +797,7 @@ static int igc_get_sset_count(struct net_device *netdev, int sset) } } -static void igc_get_ethtool_stats(struct net_device *netdev, +static void igc_ethtool_get_stats(struct net_device *netdev, struct ethtool_stats *stats, u64 *data) { struct igc_adapter *adapter = netdev_priv(netdev); @@ -850,8 +853,8 @@ static void igc_get_ethtool_stats(struct net_device *netdev, spin_unlock(&adapter->stats64_lock); } -static int igc_get_coalesce(struct net_device *netdev, - struct ethtool_coalesce *ec) +static int igc_ethtool_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ec) { struct igc_adapter *adapter = netdev_priv(netdev); @@ -870,8 +873,8 @@ static int igc_get_coalesce(struct net_device *netdev, return 0; } -static int igc_set_coalesce(struct net_device *netdev, - struct ethtool_coalesce *ec) +static int igc_ethtool_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ec) { struct igc_adapter *adapter = netdev_priv(netdev); int i; @@ -928,81 +931,83 @@ static int igc_set_coalesce(struct net_device *netdev, } #define ETHER_TYPE_FULL_MASK ((__force __be16)~0) -static int igc_get_ethtool_nfc_entry(struct igc_adapter *adapter, - struct ethtool_rxnfc *cmd) +static int igc_ethtool_get_nfc_rule(struct igc_adapter *adapter, + struct ethtool_rxnfc *cmd) { struct ethtool_rx_flow_spec *fsp = &cmd->fs; - struct igc_nfc_filter *rule = NULL; + struct igc_nfc_rule *rule = NULL; - /* report total rule count */ - cmd->data = IGC_MAX_RXNFC_FILTERS; + cmd->data = IGC_MAX_RXNFC_RULES; - hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) { - if (fsp->location <= rule->sw_idx) - break; + mutex_lock(&adapter->nfc_rule_lock); + + rule = igc_get_nfc_rule(adapter, fsp->location); + if (!rule) + goto out; + + fsp->flow_type = ETHER_FLOW; + fsp->ring_cookie = rule->action; + + if (rule->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) { + fsp->h_u.ether_spec.h_proto = htons(rule->filter.etype); + fsp->m_u.ether_spec.h_proto = ETHER_TYPE_FULL_MASK; } - if (!rule || fsp->location != rule->sw_idx) - return -EINVAL; + if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) { + fsp->flow_type |= FLOW_EXT; + fsp->h_ext.vlan_tci = htons(rule->filter.vlan_tci); + fsp->m_ext.vlan_tci = htons(VLAN_PRIO_MASK); + } - if (rule->filter.match_flags) { - fsp->flow_type = ETHER_FLOW; - fsp->ring_cookie = rule->action; - if (rule->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) { - fsp->h_u.ether_spec.h_proto = rule->filter.etype; - fsp->m_u.ether_spec.h_proto = ETHER_TYPE_FULL_MASK; - } - if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) { - fsp->flow_type |= FLOW_EXT; - fsp->h_ext.vlan_tci = rule->filter.vlan_tci; - fsp->m_ext.vlan_tci = htons(VLAN_PRIO_MASK); - } - if (rule->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) { - ether_addr_copy(fsp->h_u.ether_spec.h_dest, - rule->filter.dst_addr); - /* As we only support matching by the full - 
* mask, return the mask to userspace - */ - eth_broadcast_addr(fsp->m_u.ether_spec.h_dest); - } - if (rule->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) { - ether_addr_copy(fsp->h_u.ether_spec.h_source, - rule->filter.src_addr); - /* As we only support matching by the full - * mask, return the mask to userspace - */ - eth_broadcast_addr(fsp->m_u.ether_spec.h_source); - } + if (rule->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) { + ether_addr_copy(fsp->h_u.ether_spec.h_dest, + rule->filter.dst_addr); + eth_broadcast_addr(fsp->m_u.ether_spec.h_dest); + } - return 0; + if (rule->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) { + ether_addr_copy(fsp->h_u.ether_spec.h_source, + rule->filter.src_addr); + eth_broadcast_addr(fsp->m_u.ether_spec.h_source); } + + mutex_unlock(&adapter->nfc_rule_lock); + return 0; + +out: + mutex_unlock(&adapter->nfc_rule_lock); return -EINVAL; } -static int igc_get_ethtool_nfc_all(struct igc_adapter *adapter, - struct ethtool_rxnfc *cmd, - u32 *rule_locs) +static int igc_ethtool_get_nfc_rules(struct igc_adapter *adapter, + struct ethtool_rxnfc *cmd, + u32 *rule_locs) { - struct igc_nfc_filter *rule; + struct igc_nfc_rule *rule; int cnt = 0; - /* report total rule count */ - cmd->data = IGC_MAX_RXNFC_FILTERS; + cmd->data = IGC_MAX_RXNFC_RULES; + + mutex_lock(&adapter->nfc_rule_lock); - hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) { - if (cnt == cmd->rule_cnt) + list_for_each_entry(rule, &adapter->nfc_rule_list, list) { + if (cnt == cmd->rule_cnt) { + mutex_unlock(&adapter->nfc_rule_lock); return -EMSGSIZE; - rule_locs[cnt] = rule->sw_idx; + } + rule_locs[cnt] = rule->location; cnt++; } + mutex_unlock(&adapter->nfc_rule_lock); + cmd->rule_cnt = cnt; return 0; } -static int igc_get_rss_hash_opts(struct igc_adapter *adapter, - struct ethtool_rxnfc *cmd) +static int igc_ethtool_get_rss_hash_opts(struct igc_adapter *adapter, + struct ethtool_rxnfc *cmd) { cmd->data = 0; @@ -1051,41 +1056,33 @@ static int igc_get_rss_hash_opts(struct igc_adapter *adapter, return 0; } -static int igc_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, - u32 *rule_locs) +static int igc_ethtool_get_rxnfc(struct net_device *dev, + struct ethtool_rxnfc *cmd, u32 *rule_locs) { struct igc_adapter *adapter = netdev_priv(dev); - int ret = -EOPNOTSUPP; switch (cmd->cmd) { case ETHTOOL_GRXRINGS: cmd->data = adapter->num_rx_queues; - ret = 0; - break; + return 0; case ETHTOOL_GRXCLSRLCNT: - cmd->rule_cnt = adapter->nfc_filter_count; - ret = 0; - break; + cmd->rule_cnt = adapter->nfc_rule_count; + return 0; case ETHTOOL_GRXCLSRULE: - ret = igc_get_ethtool_nfc_entry(adapter, cmd); - break; + return igc_ethtool_get_nfc_rule(adapter, cmd); case ETHTOOL_GRXCLSRLALL: - ret = igc_get_ethtool_nfc_all(adapter, cmd, rule_locs); - break; + return igc_ethtool_get_nfc_rules(adapter, cmd, rule_locs); case ETHTOOL_GRXFH: - ret = igc_get_rss_hash_opts(adapter, cmd); - break; + return igc_ethtool_get_rss_hash_opts(adapter, cmd); default: - break; + return -EOPNOTSUPP; } - - return ret; } #define UDP_RSS_FLAGS (IGC_FLAG_RSS_FIELD_IPV4_UDP | \ IGC_FLAG_RSS_FIELD_IPV6_UDP) -static int igc_set_rss_hash_opt(struct igc_adapter *adapter, - struct ethtool_rxnfc *nfc) +static int igc_ethtool_set_rss_hash_opt(struct igc_adapter *adapter, + struct ethtool_rxnfc *nfc) { u32 flags = adapter->flags; @@ -1186,252 +1183,185 @@ static int igc_set_rss_hash_opt(struct igc_adapter *adapter, return 0; } -int igc_add_filter(struct igc_adapter *adapter, struct igc_nfc_filter *input) +static void 
igc_ethtool_init_nfc_rule(struct igc_nfc_rule *rule, + const struct ethtool_rx_flow_spec *fsp) { - struct igc_hw *hw = &adapter->hw; - int err = -EINVAL; - - if (hw->mac.type == igc_i225 && - !(input->filter.match_flags & ~IGC_FILTER_FLAG_SRC_MAC_ADDR)) { - netdev_err(adapter->netdev, - "i225 doesn't support flow classification rules specifying only source addresses\n"); - return -EOPNOTSUPP; - } + INIT_LIST_HEAD(&rule->list); - if (input->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) { - u16 etype = ntohs(input->filter.etype); + rule->action = fsp->ring_cookie; + rule->location = fsp->location; - err = igc_add_etype_filter(adapter, etype, input->action); - if (err) - return err; + if ((fsp->flow_type & FLOW_EXT) && fsp->m_ext.vlan_tci) { + rule->filter.vlan_tci = ntohs(fsp->h_ext.vlan_tci); + rule->filter.match_flags |= IGC_FILTER_FLAG_VLAN_TCI; } - if (input->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) { - err = igc_add_mac_filter(adapter, input->filter.dst_addr, - input->action, 0); - if (err) - return err; + if (fsp->m_u.ether_spec.h_proto == ETHER_TYPE_FULL_MASK) { + rule->filter.etype = ntohs(fsp->h_u.ether_spec.h_proto); + rule->filter.match_flags = IGC_FILTER_FLAG_ETHER_TYPE; } - if (input->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) { - err = igc_add_mac_filter(adapter, input->filter.src_addr, - input->action, - IGC_MAC_STATE_SRC_ADDR); - if (err) - return err; + /* Both source and destination address filters only support the full + * mask. + */ + if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_source)) { + rule->filter.match_flags |= IGC_FILTER_FLAG_SRC_MAC_ADDR; + ether_addr_copy(rule->filter.src_addr, + fsp->h_u.ether_spec.h_source); } - if (input->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) { - int prio = (ntohs(input->filter.vlan_tci) & VLAN_PRIO_MASK) >> - VLAN_PRIO_SHIFT; - err = igc_add_vlan_prio_filter(adapter, prio, input->action); - if (err) - return err; + if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_dest)) { + rule->filter.match_flags |= IGC_FILTER_FLAG_DST_MAC_ADDR; + ether_addr_copy(rule->filter.dst_addr, + fsp->h_u.ether_spec.h_dest); } - - return 0; } -int igc_erase_filter(struct igc_adapter *adapter, struct igc_nfc_filter *input) +/** + * igc_ethtool_check_nfc_rule() - Check if NFC rule is valid + * @adapter: Pointer to adapter + * @rule: Rule under evaluation + * + * Rules with both destination and source MAC addresses are considered invalid + * since the driver doesn't support them. + * + * Also, if there is already another rule with the same filter in a different + * location, @rule is considered invalid. + * + * Context: Expects adapter->nfc_rule_lock to be held by caller. + * + * Return: 0 in case of success, negative errno code otherwise. 
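
igc_ethtool_init_nfc_rule() above is fed a struct ethtool_rx_flow_spec that ultimately arrives from user space through the ETHTOOL_SRXCLSRLINS ioctl. The stand-alone sketch below shows roughly what such a request looks like from the other side, assuming n-tuple filtering has been enabled on the device; the interface name "eth0", EtherType 0x88F7 and queue 2 are placeholders, not values taken from the patch.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
        struct ethtool_rxnfc nfc = { .cmd = ETHTOOL_SRXCLSRLINS };
        struct ifreq ifr;
        int fd;

        nfc.fs.flow_type = ETHER_FLOW;
        nfc.fs.h_u.ether_spec.h_proto = htons(0x88F7); /* EtherType to match */
        nfc.fs.m_u.ether_spec.h_proto = 0xffff;        /* full mask only */
        nfc.fs.ring_cookie = 2;                        /* deliver to Rx queue 2 */
        nfc.fs.location = 0;                           /* rule slot 0 */

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
        ifr.ifr_data = (void *)&nfc;

        fd = socket(AF_INET, SOCK_DGRAM, 0);
        if (fd < 0 || ioctl(fd, SIOCETHTOOL, &ifr) < 0)
                perror("ETHTOOL_SRXCLSRLINS");

        return 0;
}

The same kind of rule can be expressed with the ethtool utility as something like "ethtool -N eth0 flow-type ether proto 0x88F7 action 2 loc 0".
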
+ */ +static int igc_ethtool_check_nfc_rule(struct igc_adapter *adapter, + struct igc_nfc_rule *rule) { - if (input->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) { - u16 etype = ntohs(input->filter.etype); - - igc_del_etype_filter(adapter, etype); - } + struct net_device *dev = adapter->netdev; + u8 flags = rule->filter.match_flags; + struct igc_nfc_rule *tmp; - if (input->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) { - int prio = (ntohs(input->filter.vlan_tci) & VLAN_PRIO_MASK) >> - VLAN_PRIO_SHIFT; - igc_del_vlan_prio_filter(adapter, prio); + if (!flags) { + netdev_dbg(dev, "Rule with no match\n"); + return -EINVAL; } - if (input->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) - igc_del_mac_filter(adapter, input->filter.src_addr, - IGC_MAC_STATE_SRC_ADDR); - - if (input->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) - igc_del_mac_filter(adapter, input->filter.dst_addr, 0); - - return 0; -} - -static int igc_update_ethtool_nfc_entry(struct igc_adapter *adapter, - struct igc_nfc_filter *input, - u16 sw_idx) -{ - struct igc_nfc_filter *rule, *parent; - int err = -EINVAL; - - parent = NULL; - rule = NULL; - - hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) { - /* hash found, or no matching entry */ - if (rule->sw_idx >= sw_idx) - break; - parent = rule; + if (flags & IGC_FILTER_FLAG_DST_MAC_ADDR && + flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) { + netdev_dbg(dev, "Filters with both dst and src are not supported\n"); + return -EOPNOTSUPP; } - /* if there is an old rule occupying our place remove it */ - if (rule && rule->sw_idx == sw_idx) { - if (!input) - err = igc_erase_filter(adapter, rule); - - hlist_del(&rule->nfc_node); - kfree(rule); - adapter->nfc_filter_count--; + list_for_each_entry(tmp, &adapter->nfc_rule_list, list) { + if (!memcmp(&rule->filter, &tmp->filter, + sizeof(rule->filter)) && + tmp->location != rule->location) { + netdev_dbg(dev, "Rule already exists\n"); + return -EEXIST; + } } - /* If no input this was a delete, err should be 0 if a rule was - * successfully found and removed from the list else -EINVAL - */ - if (!input) - return err; - - /* initialize node */ - INIT_HLIST_NODE(&input->nfc_node); - - /* add filter to the list */ - if (parent) - hlist_add_behind(&input->nfc_node, &parent->nfc_node); - else - hlist_add_head(&input->nfc_node, &adapter->nfc_filter_list); - - /* update counts */ - adapter->nfc_filter_count++; - return 0; } -static int igc_add_ethtool_nfc_entry(struct igc_adapter *adapter, - struct ethtool_rxnfc *cmd) +static int igc_ethtool_add_nfc_rule(struct igc_adapter *adapter, + struct ethtool_rxnfc *cmd) { struct net_device *netdev = adapter->netdev; struct ethtool_rx_flow_spec *fsp = (struct ethtool_rx_flow_spec *)&cmd->fs; - struct igc_nfc_filter *input, *rule; - int err = 0; + struct igc_nfc_rule *rule, *old_rule; + int err; - if (!(netdev->hw_features & NETIF_F_NTUPLE)) + if (!(netdev->hw_features & NETIF_F_NTUPLE)) { + netdev_dbg(netdev, "N-tuple filters disabled\n"); return -EOPNOTSUPP; + } - /* Don't allow programming if the action is a queue greater than - * the number of online Rx queues. 
- */ - if (fsp->ring_cookie == RX_CLS_FLOW_DISC || - fsp->ring_cookie >= adapter->num_rx_queues) { - netdev_err(netdev, - "ethtool -N: The specified action is invalid\n"); - return -EINVAL; + if ((fsp->flow_type & ~FLOW_EXT) != ETHER_FLOW) { + netdev_dbg(netdev, "Only ethernet flow type is supported\n"); + return -EOPNOTSUPP; } - /* Don't allow indexes to exist outside of available space */ - if (fsp->location >= IGC_MAX_RXNFC_FILTERS) { - netdev_err(netdev, "Location out of range\n"); - return -EINVAL; + if ((fsp->flow_type & FLOW_EXT) && + fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK)) { + netdev_dbg(netdev, "VLAN mask not supported\n"); + return -EOPNOTSUPP; } - if ((fsp->flow_type & ~FLOW_EXT) != ETHER_FLOW) + if (fsp->ring_cookie >= adapter->num_rx_queues) { + netdev_dbg(netdev, "Invalid action\n"); return -EINVAL; - - input = kzalloc(sizeof(*input), GFP_KERNEL); - if (!input) - return -ENOMEM; - - if (fsp->m_u.ether_spec.h_proto == ETHER_TYPE_FULL_MASK) { - input->filter.etype = fsp->h_u.ether_spec.h_proto; - input->filter.match_flags = IGC_FILTER_FLAG_ETHER_TYPE; } - /* Only support matching addresses by the full mask */ - if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_source)) { - input->filter.match_flags |= IGC_FILTER_FLAG_SRC_MAC_ADDR; - ether_addr_copy(input->filter.src_addr, - fsp->h_u.ether_spec.h_source); + if (fsp->location >= IGC_MAX_RXNFC_RULES) { + netdev_dbg(netdev, "Invalid location\n"); + return -EINVAL; } - /* Only support matching addresses by the full mask */ - if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_dest)) { - input->filter.match_flags |= IGC_FILTER_FLAG_DST_MAC_ADDR; - ether_addr_copy(input->filter.dst_addr, - fsp->h_u.ether_spec.h_dest); - } + rule = kzalloc(sizeof(*rule), GFP_KERNEL); + if (!rule) + return -ENOMEM; - if ((fsp->flow_type & FLOW_EXT) && fsp->m_ext.vlan_tci) { - if (fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK)) { - netdev_dbg(netdev, "VLAN mask not supported\n"); - err = -EOPNOTSUPP; - goto err_out; - } - input->filter.vlan_tci = fsp->h_ext.vlan_tci; - input->filter.match_flags |= IGC_FILTER_FLAG_VLAN_TCI; - } + igc_ethtool_init_nfc_rule(rule, fsp); - input->action = fsp->ring_cookie; - input->sw_idx = fsp->location; + mutex_lock(&adapter->nfc_rule_lock); - spin_lock(&adapter->nfc_lock); + err = igc_ethtool_check_nfc_rule(adapter, rule); + if (err) + goto err; - hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) { - if (!memcmp(&input->filter, &rule->filter, - sizeof(input->filter))) { - err = -EEXIST; - netdev_err(netdev, - "ethtool: this filter is already set\n"); - goto err_out_w_lock; - } - } + old_rule = igc_get_nfc_rule(adapter, fsp->location); + if (old_rule) + igc_del_nfc_rule(adapter, old_rule); - err = igc_add_filter(adapter, input); + err = igc_add_nfc_rule(adapter, rule); if (err) - goto err_out_w_lock; - - igc_update_ethtool_nfc_entry(adapter, input, input->sw_idx); + goto err; - spin_unlock(&adapter->nfc_lock); + mutex_unlock(&adapter->nfc_rule_lock); return 0; -err_out_w_lock: - spin_unlock(&adapter->nfc_lock); -err_out: - kfree(input); +err: + mutex_unlock(&adapter->nfc_rule_lock); + kfree(rule); return err; } -static int igc_del_ethtool_nfc_entry(struct igc_adapter *adapter, - struct ethtool_rxnfc *cmd) +static int igc_ethtool_del_nfc_rule(struct igc_adapter *adapter, + struct ethtool_rxnfc *cmd) { struct ethtool_rx_flow_spec *fsp = (struct ethtool_rx_flow_spec *)&cmd->fs; - int err; + struct igc_nfc_rule *rule; - spin_lock(&adapter->nfc_lock); - err = igc_update_ethtool_nfc_entry(adapter, NULL, 
fsp->location); - spin_unlock(&adapter->nfc_lock); + mutex_lock(&adapter->nfc_rule_lock); - return err; + rule = igc_get_nfc_rule(adapter, fsp->location); + if (!rule) { + mutex_unlock(&adapter->nfc_rule_lock); + return -EINVAL; + } + + igc_del_nfc_rule(adapter, rule); + + mutex_unlock(&adapter->nfc_rule_lock); + return 0; } -static int igc_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) +static int igc_ethtool_set_rxnfc(struct net_device *dev, + struct ethtool_rxnfc *cmd) { struct igc_adapter *adapter = netdev_priv(dev); - int ret = -EOPNOTSUPP; switch (cmd->cmd) { case ETHTOOL_SRXFH: - ret = igc_set_rss_hash_opt(adapter, cmd); - break; + return igc_ethtool_set_rss_hash_opt(adapter, cmd); case ETHTOOL_SRXCLSRLINS: - ret = igc_add_ethtool_nfc_entry(adapter, cmd); - break; + return igc_ethtool_add_nfc_rule(adapter, cmd); case ETHTOOL_SRXCLSRLDEL: - ret = igc_del_ethtool_nfc_entry(adapter, cmd); + return igc_ethtool_del_nfc_rule(adapter, cmd); default: - break; + return -EOPNOTSUPP; } - - return ret; } void igc_write_rss_indir_tbl(struct igc_adapter *adapter) @@ -1456,13 +1386,13 @@ void igc_write_rss_indir_tbl(struct igc_adapter *adapter) } } -static u32 igc_get_rxfh_indir_size(struct net_device *netdev) +static u32 igc_ethtool_get_rxfh_indir_size(struct net_device *netdev) { return IGC_RETA_SIZE; } -static int igc_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, - u8 *hfunc) +static int igc_ethtool_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, + u8 *hfunc) { struct igc_adapter *adapter = netdev_priv(netdev); int i; @@ -1477,8 +1407,8 @@ static int igc_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, return 0; } -static int igc_set_rxfh(struct net_device *netdev, const u32 *indir, - const u8 *key, const u8 hfunc) +static int igc_ethtool_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *key, const u8 hfunc) { struct igc_adapter *adapter = netdev_priv(netdev); u32 num_queues; @@ -1506,18 +1436,13 @@ static int igc_set_rxfh(struct net_device *netdev, const u32 *indir, return 0; } -static unsigned int igc_max_channels(struct igc_adapter *adapter) -{ - return igc_get_max_rss_queues(adapter); -} - -static void igc_get_channels(struct net_device *netdev, - struct ethtool_channels *ch) +static void igc_ethtool_get_channels(struct net_device *netdev, + struct ethtool_channels *ch) { struct igc_adapter *adapter = netdev_priv(netdev); /* Report maximum channels */ - ch->max_combined = igc_max_channels(adapter); + ch->max_combined = igc_get_max_rss_queues(adapter); /* Report info for other vector */ if (adapter->flags & IGC_FLAG_HAS_MSIX) { @@ -1528,8 +1453,8 @@ static void igc_get_channels(struct net_device *netdev, ch->combined_count = adapter->rss_queues; } -static int igc_set_channels(struct net_device *netdev, - struct ethtool_channels *ch) +static int igc_ethtool_set_channels(struct net_device *netdev, + struct ethtool_channels *ch) { struct igc_adapter *adapter = netdev_priv(netdev); unsigned int count = ch->combined_count; @@ -1544,7 +1469,7 @@ static int igc_set_channels(struct net_device *netdev, return -EINVAL; /* Verify the number of channels doesn't exceed hw limits */ - max_combined = igc_max_channels(adapter); + max_combined = igc_get_max_rss_queues(adapter); if (count > max_combined) return -EINVAL; @@ -1561,8 +1486,8 @@ static int igc_set_channels(struct net_device *netdev, return 0; } -static int igc_get_ts_info(struct net_device *dev, - struct ethtool_ts_info *info) +static int igc_ethtool_get_ts_info(struct net_device *dev, + 
struct ethtool_ts_info *info) { struct igc_adapter *adapter = netdev_priv(dev); @@ -1594,7 +1519,7 @@ static int igc_get_ts_info(struct net_device *dev, } } -static u32 igc_get_priv_flags(struct net_device *netdev) +static u32 igc_ethtool_get_priv_flags(struct net_device *netdev) { struct igc_adapter *adapter = netdev_priv(netdev); u32 priv_flags = 0; @@ -1605,7 +1530,7 @@ static u32 igc_get_priv_flags(struct net_device *netdev) return priv_flags; } -static int igc_set_priv_flags(struct net_device *netdev, u32 priv_flags) +static int igc_ethtool_set_priv_flags(struct net_device *netdev, u32 priv_flags) { struct igc_adapter *adapter = netdev_priv(netdev); unsigned int flags = adapter->flags; @@ -1640,8 +1565,8 @@ static void igc_ethtool_complete(struct net_device *netdev) pm_runtime_put(&adapter->pdev->dev); } -static int igc_get_link_ksettings(struct net_device *netdev, - struct ethtool_link_ksettings *cmd) +static int igc_ethtool_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *cmd) { struct igc_adapter *adapter = netdev_priv(netdev); struct igc_hw *hw = &adapter->hw; @@ -1747,8 +1672,9 @@ static int igc_get_link_ksettings(struct net_device *netdev, return 0; } -static int igc_set_link_ksettings(struct net_device *netdev, - const struct ethtool_link_ksettings *cmd) +static int +igc_ethtool_set_link_ksettings(struct net_device *netdev, + const struct ethtool_link_ksettings *cmd) { struct igc_adapter *adapter = netdev_priv(netdev); struct net_device *dev = adapter->netdev; @@ -1814,8 +1740,8 @@ static int igc_set_link_ksettings(struct net_device *netdev, return 0; } -static void igc_diag_test(struct net_device *netdev, - struct ethtool_test *eth_test, u64 *data) +static void igc_ethtool_diag_test(struct net_device *netdev, + struct ethtool_test *eth_test, u64 *data) { struct igc_adapter *adapter = netdev_priv(netdev); bool if_running = netif_running(netdev); @@ -1874,45 +1800,45 @@ static void igc_diag_test(struct net_device *netdev, static const struct ethtool_ops igc_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_USECS, - .get_drvinfo = igc_get_drvinfo, - .get_regs_len = igc_get_regs_len, - .get_regs = igc_get_regs, - .get_wol = igc_get_wol, - .set_wol = igc_set_wol, - .get_msglevel = igc_get_msglevel, - .set_msglevel = igc_set_msglevel, - .nway_reset = igc_nway_reset, - .get_link = igc_get_link, - .get_eeprom_len = igc_get_eeprom_len, - .get_eeprom = igc_get_eeprom, - .set_eeprom = igc_set_eeprom, - .get_ringparam = igc_get_ringparam, - .set_ringparam = igc_set_ringparam, - .get_pauseparam = igc_get_pauseparam, - .set_pauseparam = igc_set_pauseparam, - .get_strings = igc_get_strings, - .get_sset_count = igc_get_sset_count, - .get_ethtool_stats = igc_get_ethtool_stats, - .get_coalesce = igc_get_coalesce, - .set_coalesce = igc_set_coalesce, - .get_rxnfc = igc_get_rxnfc, - .set_rxnfc = igc_set_rxnfc, - .get_rxfh_indir_size = igc_get_rxfh_indir_size, - .get_rxfh = igc_get_rxfh, - .set_rxfh = igc_set_rxfh, - .get_ts_info = igc_get_ts_info, - .get_channels = igc_get_channels, - .set_channels = igc_set_channels, - .get_priv_flags = igc_get_priv_flags, - .set_priv_flags = igc_set_priv_flags, + .get_drvinfo = igc_ethtool_get_drvinfo, + .get_regs_len = igc_ethtool_get_regs_len, + .get_regs = igc_ethtool_get_regs, + .get_wol = igc_ethtool_get_wol, + .set_wol = igc_ethtool_set_wol, + .get_msglevel = igc_ethtool_get_msglevel, + .set_msglevel = igc_ethtool_set_msglevel, + .nway_reset = igc_ethtool_nway_reset, + .get_link = igc_ethtool_get_link, + 
.get_eeprom_len = igc_ethtool_get_eeprom_len, + .get_eeprom = igc_ethtool_get_eeprom, + .set_eeprom = igc_ethtool_set_eeprom, + .get_ringparam = igc_ethtool_get_ringparam, + .set_ringparam = igc_ethtool_set_ringparam, + .get_pauseparam = igc_ethtool_get_pauseparam, + .set_pauseparam = igc_ethtool_set_pauseparam, + .get_strings = igc_ethtool_get_strings, + .get_sset_count = igc_ethtool_get_sset_count, + .get_ethtool_stats = igc_ethtool_get_stats, + .get_coalesce = igc_ethtool_get_coalesce, + .set_coalesce = igc_ethtool_set_coalesce, + .get_rxnfc = igc_ethtool_get_rxnfc, + .set_rxnfc = igc_ethtool_set_rxnfc, + .get_rxfh_indir_size = igc_ethtool_get_rxfh_indir_size, + .get_rxfh = igc_ethtool_get_rxfh, + .set_rxfh = igc_ethtool_set_rxfh, + .get_ts_info = igc_ethtool_get_ts_info, + .get_channels = igc_ethtool_get_channels, + .set_channels = igc_ethtool_set_channels, + .get_priv_flags = igc_ethtool_get_priv_flags, + .set_priv_flags = igc_ethtool_set_priv_flags, .begin = igc_ethtool_begin, .complete = igc_ethtool_complete, - .get_link_ksettings = igc_get_link_ksettings, - .set_link_ksettings = igc_set_link_ksettings, - .self_test = igc_diag_test, + .get_link_ksettings = igc_ethtool_get_link_ksettings, + .set_link_ksettings = igc_ethtool_set_link_ksettings, + .self_test = igc_ethtool_diag_test, }; -void igc_set_ethtool_ops(struct net_device *netdev) +void igc_ethtool_set_ops(struct net_device *netdev) { netdev->ethtool_ops = &igc_ethtool_ops; } diff --git a/drivers/net/ethernet/intel/igc/igc_mac.c b/drivers/net/ethernet/intel/igc/igc_mac.c index 12aa6b5fcb5d..89445ab02a98 100644 --- a/drivers/net/ethernet/intel/igc/igc_mac.c +++ b/drivers/net/ethernet/intel/igc/igc_mac.c @@ -307,12 +307,8 @@ void igc_clear_hw_cntrs_base(struct igc_hw *hw) rd32(IGC_ICTXQMTC); rd32(IGC_ICRXDMTC); - rd32(IGC_CBTMPC); - rd32(IGC_HTDPMC); - rd32(IGC_CBRMPC); rd32(IGC_RPTHC); rd32(IGC_HGPTC); - rd32(IGC_HTCBDPC); rd32(IGC_HGORCL); rd32(IGC_HGORCH); rd32(IGC_HGOTCL); diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 0df5617eb9d0..97d26991c87e 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -766,12 +766,14 @@ static void igc_setup_tctl(struct igc_adapter *adapter) * igc_set_mac_filter_hw() - Set MAC address filter in hardware * @adapter: Pointer to adapter where the filter should be set * @index: Filter index - * @addr: Destination MAC address + * @type: MAC address filter type (source or destination) + * @addr: MAC address * @queue: If non-negative, queue assignment feature is enabled and frames * matching the filter are enqueued onto 'queue'. Otherwise, queue * assignment is disabled. 
*/ static void igc_set_mac_filter_hw(struct igc_adapter *adapter, int index, + enum igc_mac_filter_type type, const u8 *addr, int queue) { struct net_device *dev = adapter->netdev; @@ -784,6 +786,11 @@ static void igc_set_mac_filter_hw(struct igc_adapter *adapter, int index, ral = le32_to_cpup((__le32 *)(addr)); rah = le16_to_cpup((__le16 *)(addr + 4)); + if (type == IGC_MAC_FILTER_TYPE_SRC) { + rah &= ~IGC_RAH_ASEL_MASK; + rah |= IGC_RAH_ASEL_SRC_ADDR; + } + if (queue >= 0) { rah &= ~IGC_RAH_QSEL_MASK; rah |= (queue << IGC_RAH_QSEL_SHIFT); @@ -820,17 +827,12 @@ static void igc_clear_mac_filter_hw(struct igc_adapter *adapter, int index) /* Set default MAC address for the PF in the first RAR entry */ static void igc_set_default_mac_filter(struct igc_adapter *adapter) { - struct igc_mac_addr *mac_table = &adapter->mac_table[0]; struct net_device *dev = adapter->netdev; u8 *addr = adapter->hw.mac.addr; netdev_dbg(dev, "Set default MAC address filter: address %pM", addr); - ether_addr_copy(mac_table->addr, addr); - mac_table->state = IGC_MAC_STATE_DEFAULT | IGC_MAC_STATE_IN_USE; - mac_table->queue = -1; - - igc_set_mac_filter_hw(adapter, 0, addr, mac_table->queue); + igc_set_mac_filter_hw(adapter, 0, IGC_MAC_FILTER_TYPE_DST, addr, -1); } /** @@ -2172,34 +2174,26 @@ static bool igc_clean_tx_irq(struct igc_q_vector *q_vector, int napi_budget) return !!budget; } -static void igc_nfc_filter_restore(struct igc_adapter *adapter) -{ - struct igc_nfc_filter *rule; - - spin_lock(&adapter->nfc_lock); - - hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) - igc_add_filter(adapter, rule); - - spin_unlock(&adapter->nfc_lock); -} - -static int igc_find_mac_filter(struct igc_adapter *adapter, const u8 *addr, - u8 flags) +static int igc_find_mac_filter(struct igc_adapter *adapter, + enum igc_mac_filter_type type, const u8 *addr) { - int max_entries = adapter->hw.mac.rar_entry_count; - struct igc_mac_addr *entry; + struct igc_hw *hw = &adapter->hw; + int max_entries = hw->mac.rar_entry_count; + u32 ral, rah; int i; for (i = 0; i < max_entries; i++) { - entry = &adapter->mac_table[i]; + ral = rd32(IGC_RAL(i)); + rah = rd32(IGC_RAH(i)); - if (!(entry->state & IGC_MAC_STATE_IN_USE)) + if (!(rah & IGC_RAH_AV)) continue; - if (!ether_addr_equal(addr, entry->addr)) + if (!!(rah & IGC_RAH_ASEL_SRC_ADDR) != type) continue; - if ((entry->state & IGC_MAC_STATE_SRC_ADDR) != - (flags & IGC_MAC_STATE_SRC_ADDR)) + if ((rah & IGC_RAH_RAH_MASK) != + le16_to_cpup((__le16 *)(addr + 4))) + continue; + if (ral != le32_to_cpup((__le32 *)(addr))) continue; return i; @@ -2210,14 +2204,15 @@ static int igc_find_mac_filter(struct igc_adapter *adapter, const u8 *addr, static int igc_get_avail_mac_filter_slot(struct igc_adapter *adapter) { - int max_entries = adapter->hw.mac.rar_entry_count; - struct igc_mac_addr *entry; + struct igc_hw *hw = &adapter->hw; + int max_entries = hw->mac.rar_entry_count; + u32 rah; int i; for (i = 0; i < max_entries; i++) { - entry = &adapter->mac_table[i]; + rah = rd32(IGC_RAH(i)); - if (!(entry->state & IGC_MAC_STATE_IN_USE)) + if (!(rah & IGC_RAH_AV)) return i; } @@ -2227,91 +2222,70 @@ static int igc_get_avail_mac_filter_slot(struct igc_adapter *adapter) /** * igc_add_mac_filter() - Add MAC address filter * @adapter: Pointer to adapter where the filter should be added + * @type: MAC address filter type (source or destination) * @addr: MAC address * @queue: If non-negative, queue assignment feature is enabled and frames * matching the filter are enqueued onto 'queue'. 
Otherwise, queue * assignment is disabled. - * @flags: Set IGC_MAC_STATE_SRC_ADDR bit to indicate @address is a source - * address * * Return: 0 in case of success, negative errno code otherwise. */ -int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr, - const s8 queue, const u8 flags) +static int igc_add_mac_filter(struct igc_adapter *adapter, + enum igc_mac_filter_type type, const u8 *addr, + int queue) { struct net_device *dev = adapter->netdev; int index; - if (!is_valid_ether_addr(addr)) - return -EINVAL; - if (flags & IGC_MAC_STATE_SRC_ADDR) - return -ENOTSUPP; - - index = igc_find_mac_filter(adapter, addr, flags); + index = igc_find_mac_filter(adapter, type, addr); if (index >= 0) - goto update_queue_assignment; + goto update_filter; index = igc_get_avail_mac_filter_slot(adapter); if (index < 0) return -ENOSPC; - netdev_dbg(dev, "Add MAC address filter: index %d address %pM queue %d", - index, addr, queue); + netdev_dbg(dev, "Add MAC address filter: index %d type %s address %pM queue %d\n", + index, type == IGC_MAC_FILTER_TYPE_DST ? "dst" : "src", + addr, queue); - ether_addr_copy(adapter->mac_table[index].addr, addr); - adapter->mac_table[index].state |= IGC_MAC_STATE_IN_USE | flags; -update_queue_assignment: - adapter->mac_table[index].queue = queue; - - igc_set_mac_filter_hw(adapter, index, addr, queue); +update_filter: + igc_set_mac_filter_hw(adapter, index, type, addr, queue); return 0; } /** * igc_del_mac_filter() - Delete MAC address filter * @adapter: Pointer to adapter where the filter should be deleted from + * @type: MAC address filter type (source or destination) * @addr: MAC address - * @flags: Set IGC_MAC_STATE_SRC_ADDR bit to indicate @address is a source - * address - * - * Return: 0 in case of success, negative errno code otherwise. */ -int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, - const u8 flags) +static void igc_del_mac_filter(struct igc_adapter *adapter, + enum igc_mac_filter_type type, const u8 *addr) { struct net_device *dev = adapter->netdev; - struct igc_mac_addr *entry; int index; - if (!is_valid_ether_addr(addr)) - return -EINVAL; - - index = igc_find_mac_filter(adapter, addr, flags); + index = igc_find_mac_filter(adapter, type, addr); if (index < 0) - return -ENOENT; - - entry = &adapter->mac_table[index]; + return; - if (entry->state & IGC_MAC_STATE_DEFAULT) { + if (index == 0) { /* If this is the default filter, we don't actually delete it. * We just reset to its default value i.e. disable queue * assignment. */ netdev_dbg(dev, "Disable default MAC filter queue assignment"); - entry->queue = -1; - igc_set_mac_filter_hw(adapter, 0, addr, entry->queue); + igc_set_mac_filter_hw(adapter, 0, type, addr, -1); } else { - netdev_dbg(dev, "Delete MAC address filter: index %d address %pM", - index, addr); + netdev_dbg(dev, "Delete MAC address filter: index %d type %s address %pM\n", + index, + type == IGC_MAC_FILTER_TYPE_DST ? "dst" : "src", + addr); - entry->state = 0; - entry->queue = -1; - memset(entry->addr, 0, ETH_ALEN); igc_clear_mac_filter_hw(adapter, index); } - - return 0; } /** @@ -2322,7 +2296,8 @@ int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr, * * Return: 0 in case of success, negative errno code otherwise. 
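
The register programming above packs a MAC address into one RAL/RAH pair: the first four bytes land in RAL, bytes four and five in the low half of RAH, and the remaining RAH bits carry the new source/destination select (ASEL), the queue-steering field (QSEL) and the address-valid flag that igc_find_mac_filter() now reads back instead of consulting a shadow table. A helper-style sketch of that packing, using the defines added to igc_defines.h, is shown below; example_pack_rar() is illustrative, and the assumption that queue steering also wants IGC_RAH_QSEL_ENABLE set follows from the define's name rather than from text shown here.

static void example_pack_rar(const u8 *addr, bool src_match, int queue,
                             u32 *ral, u32 *rah)
{
        /* bytes 0..3 -> RAL, bytes 4..5 -> low 16 bits of RAH */
        *ral = le32_to_cpup((const __le32 *)addr);
        *rah = le16_to_cpup((const __le16 *)(addr + 4));

        if (src_match)
                *rah |= IGC_RAH_ASEL_SRC_ADDR;  /* match on source address */

        if (queue >= 0) {
                *rah &= ~IGC_RAH_QSEL_MASK;
                *rah |= queue << IGC_RAH_QSEL_SHIFT;
                *rah |= IGC_RAH_QSEL_ENABLE;    /* assumed needed for steering */
        }

        *rah |= IGC_RAH_AV;                     /* mark the RAR entry in use */
}
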
*/ -int igc_add_vlan_prio_filter(struct igc_adapter *adapter, int prio, int queue) +static int igc_add_vlan_prio_filter(struct igc_adapter *adapter, int prio, + int queue) { struct net_device *dev = adapter->netdev; struct igc_hw *hw = &adapter->hw; @@ -2350,7 +2325,7 @@ int igc_add_vlan_prio_filter(struct igc_adapter *adapter, int prio, int queue) * @adapter: Pointer to adapter where the filter should be deleted from * @prio: VLAN priority value */ -void igc_del_vlan_prio_filter(struct igc_adapter *adapter, int prio) +static void igc_del_vlan_prio_filter(struct igc_adapter *adapter, int prio) { struct igc_hw *hw = &adapter->hw; u32 vlanpqf; @@ -2391,7 +2366,8 @@ static int igc_get_avail_etype_filter_slot(struct igc_adapter *adapter) * * Return: 0 in case of success, negative errno code otherwise. */ -int igc_add_etype_filter(struct igc_adapter *adapter, u16 etype, int queue) +static int igc_add_etype_filter(struct igc_adapter *adapter, u16 etype, + int queue) { struct igc_hw *hw = &adapter->hw; int index; @@ -2440,37 +2416,194 @@ static int igc_find_etype_filter(struct igc_adapter *adapter, u16 etype) * igc_del_etype_filter() - Delete ethertype filter * @adapter: Pointer to adapter where the filter should be deleted from * @etype: Ethertype value - * - * Return: 0 in case of success, negative errno code otherwise. */ -int igc_del_etype_filter(struct igc_adapter *adapter, u16 etype) +static void igc_del_etype_filter(struct igc_adapter *adapter, u16 etype) { struct igc_hw *hw = &adapter->hw; int index; index = igc_find_etype_filter(adapter, etype); if (index < 0) - return -ENOENT; + return; wr32(IGC_ETQF(index), 0); netdev_dbg(adapter->netdev, "Delete ethertype filter: etype %04x\n", etype); +} + +static int igc_enable_nfc_rule(struct igc_adapter *adapter, + const struct igc_nfc_rule *rule) +{ + int err; + + if (rule->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) { + err = igc_add_etype_filter(adapter, rule->filter.etype, + rule->action); + if (err) + return err; + } + + if (rule->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) { + err = igc_add_mac_filter(adapter, IGC_MAC_FILTER_TYPE_SRC, + rule->filter.src_addr, rule->action); + if (err) + return err; + } + + if (rule->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) { + err = igc_add_mac_filter(adapter, IGC_MAC_FILTER_TYPE_DST, + rule->filter.dst_addr, rule->action); + if (err) + return err; + } + + if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) { + int prio = (rule->filter.vlan_tci & VLAN_PRIO_MASK) >> + VLAN_PRIO_SHIFT; + + err = igc_add_vlan_prio_filter(adapter, prio, rule->action); + if (err) + return err; + } + return 0; } +static void igc_disable_nfc_rule(struct igc_adapter *adapter, + const struct igc_nfc_rule *rule) +{ + if (rule->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) + igc_del_etype_filter(adapter, rule->filter.etype); + + if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) { + int prio = (rule->filter.vlan_tci & VLAN_PRIO_MASK) >> + VLAN_PRIO_SHIFT; + + igc_del_vlan_prio_filter(adapter, prio); + } + + if (rule->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) + igc_del_mac_filter(adapter, IGC_MAC_FILTER_TYPE_SRC, + rule->filter.src_addr); + + if (rule->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) + igc_del_mac_filter(adapter, IGC_MAC_FILTER_TYPE_DST, + rule->filter.dst_addr); +} + +/** + * igc_get_nfc_rule() - Get NFC rule + * @adapter: Pointer to adapter + * @location: Rule location + * + * Context: Expects adapter->nfc_rule_lock to be held by caller. 
+ * + * Return: Pointer to NFC rule at @location. If not found, NULL. + */ +struct igc_nfc_rule *igc_get_nfc_rule(struct igc_adapter *adapter, + u32 location) +{ + struct igc_nfc_rule *rule; + + list_for_each_entry(rule, &adapter->nfc_rule_list, list) { + if (rule->location == location) + return rule; + if (rule->location > location) + break; + } + + return NULL; +} + +/** + * igc_del_nfc_rule() - Delete NFC rule + * @adapter: Pointer to adapter + * @rule: Pointer to rule to be deleted + * + * Disable NFC rule in hardware and delete it from adapter. + * + * Context: Expects adapter->nfc_rule_lock to be held by caller. + */ +void igc_del_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule) +{ + igc_disable_nfc_rule(adapter, rule); + + list_del(&rule->list); + adapter->nfc_rule_count--; + + kfree(rule); +} + +static void igc_flush_nfc_rules(struct igc_adapter *adapter) +{ + struct igc_nfc_rule *rule, *tmp; + + mutex_lock(&adapter->nfc_rule_lock); + + list_for_each_entry_safe(rule, tmp, &adapter->nfc_rule_list, list) + igc_del_nfc_rule(adapter, rule); + + mutex_unlock(&adapter->nfc_rule_lock); +} + +/** + * igc_add_nfc_rule() - Add NFC rule + * @adapter: Pointer to adapter + * @rule: Pointer to rule to be added + * + * Enable NFC rule in hardware and add it to adapter. + * + * Context: Expects adapter->nfc_rule_lock to be held by caller. + * + * Return: 0 on success, negative errno on failure. + */ +int igc_add_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule) +{ + struct igc_nfc_rule *pred, *cur; + int err; + + err = igc_enable_nfc_rule(adapter, rule); + if (err) + return err; + + pred = NULL; + list_for_each_entry(cur, &adapter->nfc_rule_list, list) { + if (cur->location >= rule->location) + break; + pred = cur; + } + + list_add(&rule->list, pred ? &pred->list : &adapter->nfc_rule_list); + adapter->nfc_rule_count++; + return 0; +} + +static void igc_restore_nfc_rules(struct igc_adapter *adapter) +{ + struct igc_nfc_rule *rule; + + mutex_lock(&adapter->nfc_rule_lock); + + list_for_each_entry_reverse(rule, &adapter->nfc_rule_list, list) + igc_enable_nfc_rule(adapter, rule); + + mutex_unlock(&adapter->nfc_rule_lock); +} + static int igc_uc_sync(struct net_device *netdev, const unsigned char *addr) { struct igc_adapter *adapter = netdev_priv(netdev); - return igc_add_mac_filter(adapter, addr, -1, 0); + return igc_add_mac_filter(adapter, IGC_MAC_FILTER_TYPE_DST, addr, -1); } static int igc_uc_unsync(struct net_device *netdev, const unsigned char *addr) { struct igc_adapter *adapter = netdev_priv(netdev); - return igc_del_mac_filter(adapter, addr, 0); + igc_del_mac_filter(adapter, IGC_MAC_FILTER_TYPE_DST, addr); + return 0; } /** @@ -2541,7 +2674,7 @@ static void igc_configure(struct igc_adapter *adapter) igc_setup_rctl(adapter); igc_set_default_mac_filter(adapter); - igc_nfc_filter_restore(adapter); + igc_restore_nfc_rules(adapter); igc_configure_tx(adapter); igc_configure_rx(adapter); @@ -2735,12 +2868,7 @@ void igc_set_flag_queue_pairs(struct igc_adapter *adapter, unsigned int igc_get_max_rss_queues(struct igc_adapter *adapter) { - unsigned int max_rss_queues; - - /* Determine the maximum number of RSS queues supported. 
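igc_add_nfc_rule() above keeps the rule list sorted by location, and igc_flush_nfc_rules() deletes while walking with the _safe iterator. Below is a small userspace model of both idioms, using a plain singly linked list in place of the kernel's list_head; names and values are illustrative only.

#include <stdio.h>
#include <stdlib.h>

struct rule {
	unsigned int location;
	struct rule *next;
};

static struct rule *head;

static void add_rule(unsigned int location)
{
	struct rule *r = malloc(sizeof(*r)), **pos = &head;

	if (!r)
		return;
	r->location = location;
	while (*pos && (*pos)->location < location)	// find the first larger location
		pos = &(*pos)->next;
	r->next = *pos;
	*pos = r;
}

static void flush_rules(void)
{
	struct rule *r = head, *next;

	while (r) {			// remember the successor before freeing
		next = r->next;
		free(r);
		r = next;
	}
	head = NULL;
}

int main(void)
{
	add_rule(8); add_rule(2); add_rule(5);
	for (struct rule *r = head; r; r = r->next)
		printf("%u ", r->location);		// 2 5 8
	printf("\n");
	flush_rules();
	return 0;
}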
*/ - max_rss_queues = IGC_MAX_RX_QUEUES; - - return max_rss_queues; + return IGC_MAX_RX_QUEUES; } static void igc_init_queue_configuration(struct igc_adapter *adapter) @@ -3415,8 +3543,6 @@ static int igc_sw_init(struct igc_adapter *adapter) struct pci_dev *pdev = adapter->pdev; struct igc_hw *hw = &adapter->hw; - int size = sizeof(struct igc_mac_addr) * hw->mac.rar_entry_count; - pci_read_config_word(pdev, PCI_COMMAND, &hw->bus.pci_cmd_word); /* set default ring sizes */ @@ -3435,15 +3561,14 @@ static int igc_sw_init(struct igc_adapter *adapter) VLAN_HLEN; adapter->min_frame_size = ETH_ZLEN + ETH_FCS_LEN; - spin_lock_init(&adapter->nfc_lock); + mutex_init(&adapter->nfc_rule_lock); + INIT_LIST_HEAD(&adapter->nfc_rule_list); + adapter->nfc_rule_count = 0; + spin_lock_init(&adapter->stats64_lock); /* Assume MSI-X interrupts, will be checked during IRQ allocation */ adapter->flags |= IGC_FLAG_HAS_MSIX; - adapter->mac_table = kzalloc(size, GFP_ATOMIC); - if (!adapter->mac_table) - return -ENOMEM; - igc_init_queue_configuration(adapter); /* This call may decrease the number of queues */ @@ -3666,18 +3791,6 @@ void igc_update_stats(struct igc_adapter *adapter) adapter->stats.mgpdc += rd32(IGC_MGTPDC); } -static void igc_nfc_filter_exit(struct igc_adapter *adapter) -{ - struct igc_nfc_filter *rule; - - spin_lock(&adapter->nfc_lock); - - hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) - igc_erase_filter(adapter, rule); - - spin_unlock(&adapter->nfc_lock); -} - /** * igc_down - Close the interface * @adapter: board private structure @@ -3696,8 +3809,6 @@ void igc_down(struct igc_adapter *adapter) wr32(IGC_RCTL, rctl & ~IGC_RCTL_EN); /* flush and sleep below */ - igc_nfc_filter_exit(adapter); - /* set trans_start so we don't get spurious watchdogs during reset */ netif_trans_update(netdev); @@ -3846,20 +3957,8 @@ static int igc_set_features(struct net_device *netdev, if (!(changed & (NETIF_F_RXALL | NETIF_F_NTUPLE))) return 0; - if (!(features & NETIF_F_NTUPLE)) { - struct hlist_node *node2; - struct igc_nfc_filter *rule; - - spin_lock(&adapter->nfc_lock); - hlist_for_each_entry_safe(rule, node2, - &adapter->nfc_filter_list, nfc_node) { - igc_erase_filter(adapter, rule); - hlist_del(&rule->nfc_node); - kfree(rule); - } - spin_unlock(&adapter->nfc_lock); - adapter->nfc_filter_count = 0; - } + if (!(features & NETIF_F_NTUPLE)) + igc_flush_nfc_rules(adapter); netdev->features = features; @@ -4947,7 +5046,7 @@ static int igc_probe(struct pci_dev *pdev, hw->hw_addr = adapter->io_addr; netdev->netdev_ops = &igc_netdev_ops; - igc_set_ethtool_ops(netdev); + igc_ethtool_set_ops(netdev); netdev->watchdog_timeo = 5 * HZ; netdev->mem_start = pci_resource_start(pdev, 0); @@ -5126,6 +5225,8 @@ static void igc_remove(struct pci_dev *pdev) pm_runtime_get_noresume(&pdev->dev); + igc_flush_nfc_rules(adapter); + igc_ptp_stop(adapter); set_bit(__IGC_DOWN, &adapter->state); @@ -5146,7 +5247,6 @@ static void igc_remove(struct pci_dev *pdev) pci_iounmap(pdev, adapter->io_addr); pci_release_mem_regions(pdev); - kfree(adapter->mac_table); free_netdev(netdev); pci_disable_pcie_error_reporting(pdev); diff --git a/drivers/net/ethernet/intel/igc/igc_regs.h b/drivers/net/ethernet/intel/igc/igc_regs.h index 61db951f0947..7f999cfc9b39 100644 --- a/drivers/net/ethernet/intel/igc/igc_regs.h +++ b/drivers/net/ethernet/intel/igc/igc_regs.h @@ -68,13 +68,6 @@ #define IGC_ICRXDMTC 0x04120 /* Rx Descriptor Min Threshold Count */ #define IGC_ICRXOC 0x04124 /* Receiver Overrun Count */ -#define IGC_CBTMPC 0x0402C /* Circuit 
Breaker TX Packet Count */ -#define IGC_HTDPMC 0x0403C /* Host Transmit Discarded Packets */ -#define IGC_CBRMPC 0x040FC /* Circuit Breaker RX Packet Count */ -#define IGC_RPTHC 0x04104 /* Rx Packets To Host */ -#define IGC_HGPTC 0x04118 /* Host Good Packets TX Count */ -#define IGC_HTCBDPC 0x04124 /* Host TX Circ.Breaker Drop Count */ - /* MSI-X Table Register Descriptions */ #define IGC_PBACL 0x05B68 /* MSIx PBA Clear - R/W 1 to clear */ @@ -131,9 +124,6 @@ #define IGC_MMDAC 13 /* MMD Access Control */ #define IGC_MMDAAD 14 /* MMD Access Address/Data */ -/* Good transmitted packets counter registers */ -#define IGC_PQGPTC(_n) (0x010014 + (0x100 * (_n))) - /* Statistics Register Descriptions */ #define IGC_CRCERRS 0x04000 /* CRC Error Count - R/clr */ #define IGC_ALGNERRC 0x04004 /* Alignment Error Count - R/clr */ @@ -206,7 +196,6 @@ #define IGC_HGOTCL 0x04130 /* Host Good Octets Transmit Count Low */ #define IGC_HGOTCH 0x04134 /* Host Good Octets Transmit Count High */ #define IGC_LENERRS 0x04138 /* Length Errors Count */ -#define IGC_HRMPC 0x0A018 /* Header Redirection Missed Packet Count */ /* Time sync registers */ #define IGC_TSICR 0x0B66C /* Time Sync Interrupt Cause */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 2833e4f041ce..5ddfc83a1e46 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -224,17 +224,17 @@ struct ixgbe_tx_buffer { }; struct ixgbe_rx_buffer { - struct sk_buff *skb; - dma_addr_t dma; union { struct { + struct sk_buff *skb; + dma_addr_t dma; struct page *page; __u32 page_offset; __u16 pagecnt_bias; }; struct { - void *addr; - u64 handle; + bool discard; + struct xdp_buff *xdp; }; }; }; @@ -351,7 +351,6 @@ struct ixgbe_ring { }; struct xdp_rxq_info xdp_rxq; struct xdp_umem *xsk_umem; - struct zero_copy_allocator zca; /* ZC allocator anchor */ u16 ring_idx; /* {rx,tx,xdp}_ring back reference idx */ u16 rx_buf_len; } ____cacheline_internodealigned_in_smp; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index eab5934b04f5..45fc7ce1a543 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -35,7 +35,7 @@ #include <net/tc_act/tc_mirred.h> #include <net/vxlan.h> #include <net/mpls.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include <net/xfrm.h> #include "ixgbe.h" @@ -3745,8 +3745,7 @@ static void ixgbe_configure_srrctl(struct ixgbe_adapter *adapter, /* configure the packet buffer length */ if (rx_ring->xsk_umem) { - u32 xsk_buf_len = rx_ring->xsk_umem->chunk_size_nohr - - XDP_PACKET_HEADROOM; + u32 xsk_buf_len = xsk_umem_get_rx_frame_size(rx_ring->xsk_umem); /* If the MAC support setting RXDCTL.RLPML, the * SRRCTL[n].BSIZEPKT is set to PAGE_SIZE and @@ -4093,11 +4092,10 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter, xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); ring->xsk_umem = ixgbe_xsk_umem(adapter, ring); if (ring->xsk_umem) { - ring->zca.free = ixgbe_zca_free; WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, - MEM_TYPE_ZERO_COPY, - &ring->zca)); - + MEM_TYPE_XSK_BUFF_POOL, + NULL)); + xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq); } else { WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL)); @@ -4153,8 +4151,7 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter, } if (ring->xsk_umem && hw->mac.type != ixgbe_mac_82599EB) { - u32 xsk_buf_len = 
ring->xsk_umem->chunk_size_nohr - - XDP_PACKET_HEADROOM; + u32 xsk_buf_len = xsk_umem_get_rx_frame_size(ring->xsk_umem); rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK | IXGBE_RXDCTL_RLPML_EN); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h index 6d01700b46bc..7887ae4aaf4f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h @@ -35,7 +35,7 @@ int ixgbe_xsk_umem_setup(struct ixgbe_adapter *adapter, struct xdp_umem *umem, void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle); -void ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count); +bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count); int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, struct ixgbe_ring *rx_ring, const int budget); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index a656ee9a1fae..86add9fbd36c 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -2,7 +2,7 @@ /* Copyright(c) 2018 Intel Corporation. */ #include <linux/bpf_trace.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include <net/xdp.h> #include "ixgbe.h" @@ -20,54 +20,11 @@ struct xdp_umem *ixgbe_xsk_umem(struct ixgbe_adapter *adapter, return xdp_get_umem_from_qid(adapter->netdev, qid); } -static int ixgbe_xsk_umem_dma_map(struct ixgbe_adapter *adapter, - struct xdp_umem *umem) -{ - struct device *dev = &adapter->pdev->dev; - unsigned int i, j; - dma_addr_t dma; - - for (i = 0; i < umem->npgs; i++) { - dma = dma_map_page_attrs(dev, umem->pgs[i], 0, PAGE_SIZE, - DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR); - if (dma_mapping_error(dev, dma)) - goto out_unmap; - - umem->pages[i].dma = dma; - } - - return 0; - -out_unmap: - for (j = 0; j < i; j++) { - dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR); - umem->pages[i].dma = 0; - } - - return -1; -} - -static void ixgbe_xsk_umem_dma_unmap(struct ixgbe_adapter *adapter, - struct xdp_umem *umem) -{ - struct device *dev = &adapter->pdev->dev; - unsigned int i; - - for (i = 0; i < umem->npgs; i++) { - dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR); - - umem->pages[i].dma = 0; - } -} - static int ixgbe_xsk_umem_enable(struct ixgbe_adapter *adapter, struct xdp_umem *umem, u16 qid) { struct net_device *netdev = adapter->netdev; - struct xdp_umem_fq_reuse *reuseq; bool if_running; int err; @@ -78,13 +35,7 @@ static int ixgbe_xsk_umem_enable(struct ixgbe_adapter *adapter, qid >= netdev->real_num_tx_queues) return -EINVAL; - reuseq = xsk_reuseq_prepare(adapter->rx_ring[0]->count); - if (!reuseq) - return -ENOMEM; - - xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq)); - - err = ixgbe_xsk_umem_dma_map(adapter, umem); + err = xsk_buff_dma_map(umem, &adapter->pdev->dev, IXGBE_RX_DMA_ATTR); if (err) return err; @@ -124,7 +75,7 @@ static int ixgbe_xsk_umem_disable(struct ixgbe_adapter *adapter, u16 qid) ixgbe_txrx_ring_disable(adapter, qid); clear_bit(qid, adapter->af_xdp_zc_qps); - ixgbe_xsk_umem_dma_unmap(adapter, umem); + xsk_buff_dma_unmap(umem, IXGBE_RX_DMA_ATTR); if (if_running) ixgbe_txrx_ring_enable(adapter, qid); @@ -143,19 +94,14 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter, struct ixgbe_ring *rx_ring, struct xdp_buff *xdp) { - struct xdp_umem *umem = rx_ring->xsk_umem; int err, result = 
IXGBE_XDP_PASS; struct bpf_prog *xdp_prog; struct xdp_frame *xdpf; - u64 offset; u32 act; rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); act = bpf_prog_run_xdp(xdp_prog, xdp); - offset = xdp->data - xdp->data_hard_start; - - xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset); switch (act) { case XDP_PASS: @@ -186,140 +132,16 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter, return result; } -static struct -ixgbe_rx_buffer *ixgbe_get_rx_buffer_zc(struct ixgbe_ring *rx_ring, - unsigned int size) -{ - struct ixgbe_rx_buffer *bi; - - bi = &rx_ring->rx_buffer_info[rx_ring->next_to_clean]; - - /* we are reusing so sync this buffer for CPU use */ - dma_sync_single_range_for_cpu(rx_ring->dev, - bi->dma, 0, - size, - DMA_BIDIRECTIONAL); - - return bi; -} - -static void ixgbe_reuse_rx_buffer_zc(struct ixgbe_ring *rx_ring, - struct ixgbe_rx_buffer *obi) -{ - u16 nta = rx_ring->next_to_alloc; - struct ixgbe_rx_buffer *nbi; - - nbi = &rx_ring->rx_buffer_info[rx_ring->next_to_alloc]; - /* update, and store next to alloc */ - nta++; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - - /* transfer page from old buffer to new buffer */ - nbi->dma = obi->dma; - nbi->addr = obi->addr; - nbi->handle = obi->handle; - - obi->addr = NULL; - obi->skb = NULL; -} - -void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle) -{ - struct ixgbe_rx_buffer *bi; - struct ixgbe_ring *rx_ring; - u64 hr, mask; - u16 nta; - - rx_ring = container_of(alloc, struct ixgbe_ring, zca); - hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM; - mask = rx_ring->xsk_umem->chunk_mask; - - nta = rx_ring->next_to_alloc; - bi = rx_ring->rx_buffer_info; - - nta++; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - - handle &= mask; - - bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle); - bi->dma += hr; - - bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle); - bi->addr += hr; - - bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle, - rx_ring->xsk_umem->headroom); -} - -static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring, - struct ixgbe_rx_buffer *bi) -{ - struct xdp_umem *umem = rx_ring->xsk_umem; - void *addr = bi->addr; - u64 handle, hr; - - if (addr) - return true; - - if (!xsk_umem_peek_addr(umem, &handle)) { - rx_ring->rx_stats.alloc_rx_page_failed++; - return false; - } - - hr = umem->headroom + XDP_PACKET_HEADROOM; - - bi->dma = xdp_umem_get_dma(umem, handle); - bi->dma += hr; - - bi->addr = xdp_umem_get_data(umem, handle); - bi->addr += hr; - - bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - - xsk_umem_release_addr(umem); - return true; -} - -static bool ixgbe_alloc_buffer_slow_zc(struct ixgbe_ring *rx_ring, - struct ixgbe_rx_buffer *bi) -{ - struct xdp_umem *umem = rx_ring->xsk_umem; - u64 handle, hr; - - if (!xsk_umem_peek_addr_rq(umem, &handle)) { - rx_ring->rx_stats.alloc_rx_page_failed++; - return false; - } - - handle &= rx_ring->xsk_umem->chunk_mask; - - hr = umem->headroom + XDP_PACKET_HEADROOM; - - bi->dma = xdp_umem_get_dma(umem, handle); - bi->dma += hr; - - bi->addr = xdp_umem_get_data(umem, handle); - bi->addr += hr; - - bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - - xsk_umem_release_addr_rq(umem); - return true; -} - -static __always_inline bool -__ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count, - bool alloc(struct ixgbe_ring *rx_ring, - struct ixgbe_rx_buffer *bi)) +bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count) { 
union ixgbe_adv_rx_desc *rx_desc; struct ixgbe_rx_buffer *bi; u16 i = rx_ring->next_to_use; + dma_addr_t dma; bool ok = true; /* nothing to do */ - if (!cleaned_count) + if (!count) return true; rx_desc = IXGBE_RX_DESC(rx_ring, i); @@ -327,21 +149,18 @@ __ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count, i -= rx_ring->count; do { - if (!alloc(rx_ring, bi)) { + bi->xdp = xsk_buff_alloc(rx_ring->xsk_umem); + if (!bi->xdp) { ok = false; break; } - /* sync the buffer for use by the device */ - dma_sync_single_range_for_device(rx_ring->dev, bi->dma, - bi->page_offset, - rx_ring->rx_buf_len, - DMA_BIDIRECTIONAL); + dma = xsk_buff_xdp_get_dma(bi->xdp); /* Refresh the desc even if buffer_addrs didn't change * because each write-back erases this info. */ - rx_desc->read.pkt_addr = cpu_to_le64(bi->dma); + rx_desc->read.pkt_addr = cpu_to_le64(dma); rx_desc++; bi++; @@ -355,17 +174,14 @@ __ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count, /* clear the length for the next_to_use descriptor */ rx_desc->wb.upper.length = 0; - cleaned_count--; - } while (cleaned_count); + count--; + } while (count); i += rx_ring->count; if (rx_ring->next_to_use != i) { rx_ring->next_to_use = i; - /* update next to alloc since we have filled the ring */ - rx_ring->next_to_alloc = i; - /* Force memory writes to complete before letting h/w * know there are new descriptors to fetch. (Only * applicable for weak-ordered memory model archs, @@ -378,40 +194,27 @@ __ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count, return ok; } -void ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count) -{ - __ixgbe_alloc_rx_buffers_zc(rx_ring, count, - ixgbe_alloc_buffer_slow_zc); -} - -static bool ixgbe_alloc_rx_buffers_fast_zc(struct ixgbe_ring *rx_ring, - u16 count) -{ - return __ixgbe_alloc_rx_buffers_zc(rx_ring, count, - ixgbe_alloc_buffer_zc); -} - static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring, - struct ixgbe_rx_buffer *bi, - struct xdp_buff *xdp) + struct ixgbe_rx_buffer *bi) { - unsigned int metasize = xdp->data - xdp->data_meta; - unsigned int datasize = xdp->data_end - xdp->data; + unsigned int metasize = bi->xdp->data - bi->xdp->data_meta; + unsigned int datasize = bi->xdp->data_end - bi->xdp->data; struct sk_buff *skb; /* allocate a skb to store the frags */ skb = __napi_alloc_skb(&rx_ring->q_vector->napi, - xdp->data_end - xdp->data_hard_start, + bi->xdp->data_end - bi->xdp->data_hard_start, GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL; - skb_reserve(skb, xdp->data - xdp->data_hard_start); - memcpy(__skb_put(skb, datasize), xdp->data, datasize); + skb_reserve(skb, bi->xdp->data - bi->xdp->data_hard_start); + memcpy(__skb_put(skb, datasize), bi->xdp->data, datasize); if (metasize) skb_metadata_set(skb, metasize); - ixgbe_reuse_rx_buffer_zc(rx_ring, bi); + xsk_buff_free(bi->xdp); + bi->xdp = NULL; return skb; } @@ -431,14 +234,9 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, unsigned int total_rx_bytes = 0, total_rx_packets = 0; struct ixgbe_adapter *adapter = q_vector->adapter; u16 cleaned_count = ixgbe_desc_unused(rx_ring); - struct xdp_umem *umem = rx_ring->xsk_umem; unsigned int xdp_res, xdp_xmit = 0; bool failure = false; struct sk_buff *skb; - struct xdp_buff xdp; - - xdp.rxq = &rx_ring->xdp_rxq; - xdp.frame_sz = xsk_umem_xdp_frame_sz(umem); while (likely(total_rx_packets < budget)) { union ixgbe_adv_rx_desc *rx_desc; @@ -448,8 +246,8 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, 
/* return some buffers to hardware, one at a time is too slow */ if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) { failure = failure || - !ixgbe_alloc_rx_buffers_fast_zc(rx_ring, - cleaned_count); + !ixgbe_alloc_rx_buffers_zc(rx_ring, + cleaned_count); cleaned_count = 0; } @@ -464,42 +262,40 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, */ dma_rmb(); - bi = ixgbe_get_rx_buffer_zc(rx_ring, size); + bi = &rx_ring->rx_buffer_info[rx_ring->next_to_clean]; if (unlikely(!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))) { struct ixgbe_rx_buffer *next_bi; - ixgbe_reuse_rx_buffer_zc(rx_ring, bi); + xsk_buff_free(bi->xdp); + bi->xdp = NULL; ixgbe_inc_ntc(rx_ring); next_bi = &rx_ring->rx_buffer_info[rx_ring->next_to_clean]; - next_bi->skb = ERR_PTR(-EINVAL); + next_bi->discard = true; continue; } - if (unlikely(bi->skb)) { - ixgbe_reuse_rx_buffer_zc(rx_ring, bi); + if (unlikely(bi->discard)) { + xsk_buff_free(bi->xdp); + bi->xdp = NULL; + bi->discard = false; ixgbe_inc_ntc(rx_ring); continue; } - xdp.data = bi->addr; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; - xdp.data_end = xdp.data + size; - xdp.handle = bi->handle; - - xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, &xdp); + bi->xdp->data_end = bi->xdp->data + size; + xsk_buff_dma_sync_for_cpu(bi->xdp); + xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, bi->xdp); if (xdp_res) { - if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR)) { + if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR)) xdp_xmit |= xdp_res; - bi->addr = NULL; - bi->skb = NULL; - } else { - ixgbe_reuse_rx_buffer_zc(rx_ring, bi); - } + else + xsk_buff_free(bi->xdp); + + bi->xdp = NULL; total_rx_packets++; total_rx_bytes += size; @@ -509,7 +305,7 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, } /* XDP_PASS path */ - skb = ixgbe_construct_skb_zc(rx_ring, bi, &xdp); + skb = ixgbe_construct_skb_zc(rx_ring, bi); if (!skb) { rx_ring->rx_stats.alloc_rx_buff_failed++; break; @@ -561,17 +357,17 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, void ixgbe_xsk_clean_rx_ring(struct ixgbe_ring *rx_ring) { - u16 i = rx_ring->next_to_clean; - struct ixgbe_rx_buffer *bi = &rx_ring->rx_buffer_info[i]; + struct ixgbe_rx_buffer *bi; + u16 i; - while (i != rx_ring->next_to_alloc) { - xsk_umem_fq_reuse(rx_ring->xsk_umem, bi->handle); - i++; - bi++; - if (i == rx_ring->count) { - i = 0; - bi = rx_ring->rx_buffer_info; - } + for (i = 0; i < rx_ring->count; i++) { + bi = &rx_ring->rx_buffer_info[i]; + + if (!bi->xdp) + continue; + + xsk_buff_free(bi->xdp); + bi->xdp = NULL; } } @@ -594,10 +390,9 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget) if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc)) break; - dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr); - - dma_sync_single_for_device(xdp_ring->dev, dma, desc.len, - DMA_BIDIRECTIONAL); + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma, + desc.len); tx_bi = &xdp_ring->tx_buffer_info[xdp_ring->next_to_use]; tx_bi->bytecount = desc.len; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 41d2a0eac5fa..37947949345c 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -3566,10 +3566,6 @@ static void mvneta_start_dev(struct mvneta_port *pp) MVNETA_CAUSE_LINK_CHANGE); phylink_start(pp->phylink); - - /* We may have called phy_speed_down before */ - phy_speed_up(pp->dev->phydev); - 
netif_tx_start_all_queues(pp->dev); } @@ -3577,9 +3573,6 @@ static void mvneta_stop_dev(struct mvneta_port *pp) { unsigned int cpu; - if (device_may_wakeup(&pp->dev->dev)) - phy_speed_down(pp->dev->phydev, false); - phylink_stop(pp->phylink); if (!pp->neta_armada3700) { @@ -4052,10 +4045,6 @@ static int mvneta_mdio_probe(struct mvneta_port *pp) phylink_ethtool_get_wol(pp->phylink, &wol); device_set_wakeup_capable(&pp->dev->dev, !!wol.supported); - /* PHY WoL may be enabled but device wakeup disabled */ - if (wol.supported) - device_set_wakeup_enable(&pp->dev->dev, !!wol.wolopts); - return err; } diff --git a/drivers/net/ethernet/mediatek/Kconfig b/drivers/net/ethernet/mediatek/Kconfig index 4968352ba188..500c15e7ea4a 100644 --- a/drivers/net/ethernet/mediatek/Kconfig +++ b/drivers/net/ethernet/mediatek/Kconfig @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config NET_VENDOR_MEDIATEK - bool "MediaTek ethernet driver" + bool "MediaTek devices" depends on ARCH_MEDIATEK || SOC_MT7621 || SOC_MT7620 ---help--- If you have a Mediatek SoC with ethernet, say Y. @@ -14,4 +14,11 @@ config NET_MEDIATEK_SOC This driver supports the gigabit ethernet MACs in the MediaTek SoC family. +config NET_MEDIATEK_STAR_EMAC + tristate "MediaTek STAR Ethernet MAC support" + select PHYLIB + help + This driver supports the ethernet MAC IP first used on + MediaTek MT85** SoCs. + endif #NET_VENDOR_MEDIATEK diff --git a/drivers/net/ethernet/mediatek/Makefile b/drivers/net/ethernet/mediatek/Makefile index 2d8362f9341b..3a777b4a6cd3 100644 --- a/drivers/net/ethernet/mediatek/Makefile +++ b/drivers/net/ethernet/mediatek/Makefile @@ -3,5 +3,6 @@ # Makefile for the Mediatek SoCs built-in ethernet macs # -obj-$(CONFIG_NET_MEDIATEK_SOC) += mtk_eth.o +obj-$(CONFIG_NET_MEDIATEK_SOC) += mtk_eth.o mtk_eth-y := mtk_eth_soc.o mtk_sgmii.o mtk_eth_path.o +obj-$(CONFIG_NET_MEDIATEK_STAR_EMAC) += mtk_star_emac.o diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c new file mode 100644 index 000000000000..789c77af501f --- /dev/null +++ b/drivers/net/ethernet/mediatek/mtk_star_emac.c @@ -0,0 +1,1678 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020 MediaTek Corporation + * Copyright (c) 2020 BayLibre SAS + * + * Author: Bartosz Golaszewski <[email protected]> + */ + +#include <linux/bits.h> +#include <linux/clk.h> +#include <linux/compiler.h> +#include <linux/dma-mapping.h> +#include <linux/etherdevice.h> +#include <linux/kernel.h> +#include <linux/mfd/syscon.h> +#include <linux/mii.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/of.h> +#include <linux/of_mdio.h> +#include <linux/of_net.h> +#include <linux/platform_device.h> +#include <linux/pm.h> +#include <linux/regmap.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/workqueue.h> + +#define MTK_STAR_DRVNAME "mtk_star_emac" + +#define MTK_STAR_WAIT_TIMEOUT 300 +#define MTK_STAR_MAX_FRAME_SIZE 1514 +#define MTK_STAR_SKB_ALIGNMENT 16 +#define MTK_STAR_NAPI_WEIGHT 64 +#define MTK_STAR_HASHTABLE_MC_LIMIT 256 +#define MTK_STAR_HASHTABLE_SIZE_MAX 512 + +/* Normally we'd use NET_IP_ALIGN but on arm64 its value is 0 and it doesn't + * work for this controller. 
+ */ +#define MTK_STAR_IP_ALIGN 2 + +static const char *const mtk_star_clk_names[] = { "core", "reg", "trans" }; +#define MTK_STAR_NCLKS ARRAY_SIZE(mtk_star_clk_names) + +/* PHY Control Register 0 */ +#define MTK_STAR_REG_PHY_CTRL0 0x0000 +#define MTK_STAR_BIT_PHY_CTRL0_WTCMD BIT(13) +#define MTK_STAR_BIT_PHY_CTRL0_RDCMD BIT(14) +#define MTK_STAR_BIT_PHY_CTRL0_RWOK BIT(15) +#define MTK_STAR_MSK_PHY_CTRL0_PREG GENMASK(12, 8) +#define MTK_STAR_OFF_PHY_CTRL0_PREG 8 +#define MTK_STAR_MSK_PHY_CTRL0_RWDATA GENMASK(31, 16) +#define MTK_STAR_OFF_PHY_CTRL0_RWDATA 16 + +/* PHY Control Register 1 */ +#define MTK_STAR_REG_PHY_CTRL1 0x0004 +#define MTK_STAR_BIT_PHY_CTRL1_LINK_ST BIT(0) +#define MTK_STAR_BIT_PHY_CTRL1_AN_EN BIT(8) +#define MTK_STAR_OFF_PHY_CTRL1_FORCE_SPD 9 +#define MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_10M 0x00 +#define MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_100M 0x01 +#define MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_1000M 0x02 +#define MTK_STAR_BIT_PHY_CTRL1_FORCE_DPX BIT(11) +#define MTK_STAR_BIT_PHY_CTRL1_FORCE_FC_RX BIT(12) +#define MTK_STAR_BIT_PHY_CTRL1_FORCE_FC_TX BIT(13) + +/* MAC Configuration Register */ +#define MTK_STAR_REG_MAC_CFG 0x0008 +#define MTK_STAR_OFF_MAC_CFG_IPG 10 +#define MTK_STAR_VAL_MAC_CFG_IPG_96BIT GENMASK(4, 0) +#define MTK_STAR_BIT_MAC_CFG_MAXLEN_1522 BIT(16) +#define MTK_STAR_BIT_MAC_CFG_AUTO_PAD BIT(19) +#define MTK_STAR_BIT_MAC_CFG_CRC_STRIP BIT(20) +#define MTK_STAR_BIT_MAC_CFG_VLAN_STRIP BIT(22) +#define MTK_STAR_BIT_MAC_CFG_NIC_PD BIT(31) + +/* Flow-Control Configuration Register */ +#define MTK_STAR_REG_FC_CFG 0x000c +#define MTK_STAR_BIT_FC_CFG_BP_EN BIT(7) +#define MTK_STAR_BIT_FC_CFG_UC_PAUSE_DIR BIT(8) +#define MTK_STAR_OFF_FC_CFG_SEND_PAUSE_TH 16 +#define MTK_STAR_MSK_FC_CFG_SEND_PAUSE_TH GENMASK(27, 16) +#define MTK_STAR_VAL_FC_CFG_SEND_PAUSE_TH_2K 0x800 + +/* ARL Configuration Register */ +#define MTK_STAR_REG_ARL_CFG 0x0010 +#define MTK_STAR_BIT_ARL_CFG_HASH_ALG BIT(0) +#define MTK_STAR_BIT_ARL_CFG_MISC_MODE BIT(4) + +/* MAC High and Low Bytes Registers */ +#define MTK_STAR_REG_MY_MAC_H 0x0014 +#define MTK_STAR_REG_MY_MAC_L 0x0018 + +/* Hash Table Control Register */ +#define MTK_STAR_REG_HASH_CTRL 0x001c +#define MTK_STAR_MSK_HASH_CTRL_HASH_BIT_ADDR GENMASK(8, 0) +#define MTK_STAR_BIT_HASH_CTRL_HASH_BIT_DATA BIT(12) +#define MTK_STAR_BIT_HASH_CTRL_ACC_CMD BIT(13) +#define MTK_STAR_BIT_HASH_CTRL_CMD_START BIT(14) +#define MTK_STAR_BIT_HASH_CTRL_BIST_OK BIT(16) +#define MTK_STAR_BIT_HASH_CTRL_BIST_DONE BIT(17) +#define MTK_STAR_BIT_HASH_CTRL_BIST_EN BIT(31) + +/* TX DMA Control Register */ +#define MTK_STAR_REG_TX_DMA_CTRL 0x0034 +#define MTK_STAR_BIT_TX_DMA_CTRL_START BIT(0) +#define MTK_STAR_BIT_TX_DMA_CTRL_STOP BIT(1) +#define MTK_STAR_BIT_TX_DMA_CTRL_RESUME BIT(2) + +/* RX DMA Control Register */ +#define MTK_STAR_REG_RX_DMA_CTRL 0x0038 +#define MTK_STAR_BIT_RX_DMA_CTRL_START BIT(0) +#define MTK_STAR_BIT_RX_DMA_CTRL_STOP BIT(1) +#define MTK_STAR_BIT_RX_DMA_CTRL_RESUME BIT(2) + +/* DMA Address Registers */ +#define MTK_STAR_REG_TX_DPTR 0x003c +#define MTK_STAR_REG_RX_DPTR 0x0040 +#define MTK_STAR_REG_TX_BASE_ADDR 0x0044 +#define MTK_STAR_REG_RX_BASE_ADDR 0x0048 + +/* Interrupt Status Register */ +#define MTK_STAR_REG_INT_STS 0x0050 +#define MTK_STAR_REG_INT_STS_PORT_STS_CHG BIT(2) +#define MTK_STAR_REG_INT_STS_MIB_CNT_TH BIT(3) +#define MTK_STAR_BIT_INT_STS_FNRC BIT(6) +#define MTK_STAR_BIT_INT_STS_TNTC BIT(8) + +/* Interrupt Mask Register */ +#define MTK_STAR_REG_INT_MASK 0x0054 +#define MTK_STAR_BIT_INT_MASK_FNRC BIT(6) + +/* Misc. 
Config Register */ +#define MTK_STAR_REG_TEST1 0x005c +#define MTK_STAR_BIT_TEST1_RST_HASH_MBIST BIT(31) + +/* Extended Configuration Register */ +#define MTK_STAR_REG_EXT_CFG 0x0060 +#define MTK_STAR_OFF_EXT_CFG_SND_PAUSE_RLS 16 +#define MTK_STAR_MSK_EXT_CFG_SND_PAUSE_RLS GENMASK(26, 16) +#define MTK_STAR_VAL_EXT_CFG_SND_PAUSE_RLS_1K 0x400 + +/* EthSys Configuration Register */ +#define MTK_STAR_REG_SYS_CONF 0x0094 +#define MTK_STAR_BIT_MII_PAD_OUT_ENABLE BIT(0) +#define MTK_STAR_BIT_EXT_MDC_MODE BIT(1) +#define MTK_STAR_BIT_SWC_MII_MODE BIT(2) + +/* MAC Clock Configuration Register */ +#define MTK_STAR_REG_MAC_CLK_CONF 0x00ac +#define MTK_STAR_MSK_MAC_CLK_CONF GENMASK(7, 0) +#define MTK_STAR_BIT_CLK_DIV_10 0x0a + +/* Counter registers. */ +#define MTK_STAR_REG_C_RXOKPKT 0x0100 +#define MTK_STAR_REG_C_RXOKBYTE 0x0104 +#define MTK_STAR_REG_C_RXRUNT 0x0108 +#define MTK_STAR_REG_C_RXLONG 0x010c +#define MTK_STAR_REG_C_RXDROP 0x0110 +#define MTK_STAR_REG_C_RXCRC 0x0114 +#define MTK_STAR_REG_C_RXARLDROP 0x0118 +#define MTK_STAR_REG_C_RXVLANDROP 0x011c +#define MTK_STAR_REG_C_RXCSERR 0x0120 +#define MTK_STAR_REG_C_RXPAUSE 0x0124 +#define MTK_STAR_REG_C_TXOKPKT 0x0128 +#define MTK_STAR_REG_C_TXOKBYTE 0x012c +#define MTK_STAR_REG_C_TXPAUSECOL 0x0130 +#define MTK_STAR_REG_C_TXRTY 0x0134 +#define MTK_STAR_REG_C_TXSKIP 0x0138 +#define MTK_STAR_REG_C_TX_ARP 0x013c +#define MTK_STAR_REG_C_RX_RERR 0x01d8 +#define MTK_STAR_REG_C_RX_UNI 0x01dc +#define MTK_STAR_REG_C_RX_MULTI 0x01e0 +#define MTK_STAR_REG_C_RX_BROAD 0x01e4 +#define MTK_STAR_REG_C_RX_ALIGNERR 0x01e8 +#define MTK_STAR_REG_C_TX_UNI 0x01ec +#define MTK_STAR_REG_C_TX_MULTI 0x01f0 +#define MTK_STAR_REG_C_TX_BROAD 0x01f4 +#define MTK_STAR_REG_C_TX_TIMEOUT 0x01f8 +#define MTK_STAR_REG_C_TX_LATECOL 0x01fc +#define MTK_STAR_REG_C_RX_LENGTHERR 0x0214 +#define MTK_STAR_REG_C_RX_TWIST 0x0218 + +/* Ethernet CFG Control */ +#define MTK_PERICFG_REG_NIC_CFG_CON 0x03c4 +#define MTK_PERICFG_MSK_NIC_CFG_CON_CFG_MII GENMASK(3, 0) +#define MTK_PERICFG_BIT_NIC_CFG_CON_RMII BIT(0) + +/* Represents the actual structure of descriptors used by the MAC. We can + * reuse the same structure for both TX and RX - the layout is the same, only + * the flags differ slightly. + */ +struct mtk_star_ring_desc { + /* Contains both the status flags as well as packet length. */ + u32 status; + u32 data_ptr; + u32 vtag; + u32 reserved; +}; + +#define MTK_STAR_DESC_MSK_LEN GENMASK(15, 0) +#define MTK_STAR_DESC_BIT_RX_CRCE BIT(24) +#define MTK_STAR_DESC_BIT_RX_OSIZE BIT(25) +#define MTK_STAR_DESC_BIT_INT BIT(27) +#define MTK_STAR_DESC_BIT_LS BIT(28) +#define MTK_STAR_DESC_BIT_FS BIT(29) +#define MTK_STAR_DESC_BIT_EOR BIT(30) +#define MTK_STAR_DESC_BIT_COWN BIT(31) + +/* Helper structure for storing data read from/written to descriptors in order + * to limit reads from/writes to DMA memory. 
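The status word of mtk_star_ring_desc carries the frame length in bits 15:0 and control/status flags in the upper bits, so a single 32-bit read yields both. A quick standalone check with the same bit positions as the defines above (LS=28, FS=29, COWN=31); the length value is arbitrary.

#include <stdio.h>

#define DESC_MSK_LEN	0x0000ffffu	// GENMASK(15, 0)
#define DESC_BIT_LS	(1u << 28)
#define DESC_BIT_FS	(1u << 29)
#define DESC_BIT_COWN	(1u << 31)

int main(void)
{
	unsigned int status = DESC_BIT_COWN | DESC_BIT_FS | DESC_BIT_LS | 60;

	printf("len %u flags 0x%08x\n",
	       status & DESC_MSK_LEN, status & ~DESC_MSK_LEN);	// len 60 flags 0xb0000000
	return 0;
}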
+ */ +struct mtk_star_ring_desc_data { + unsigned int len; + unsigned int flags; + dma_addr_t dma_addr; + struct sk_buff *skb; +}; + +#define MTK_STAR_RING_NUM_DESCS 128 +#define MTK_STAR_NUM_TX_DESCS MTK_STAR_RING_NUM_DESCS +#define MTK_STAR_NUM_RX_DESCS MTK_STAR_RING_NUM_DESCS +#define MTK_STAR_NUM_DESCS_TOTAL (MTK_STAR_RING_NUM_DESCS * 2) +#define MTK_STAR_DMA_SIZE \ + (MTK_STAR_NUM_DESCS_TOTAL * sizeof(struct mtk_star_ring_desc)) + +struct mtk_star_ring { + struct mtk_star_ring_desc *descs; + struct sk_buff *skbs[MTK_STAR_RING_NUM_DESCS]; + dma_addr_t dma_addrs[MTK_STAR_RING_NUM_DESCS]; + unsigned int head; + unsigned int tail; +}; + +struct mtk_star_priv { + struct net_device *ndev; + + struct regmap *regs; + struct regmap *pericfg; + + struct clk_bulk_data clks[MTK_STAR_NCLKS]; + + void *ring_base; + struct mtk_star_ring_desc *descs_base; + dma_addr_t dma_addr; + struct mtk_star_ring tx_ring; + struct mtk_star_ring rx_ring; + + struct mii_bus *mii; + struct napi_struct napi; + + struct device_node *phy_node; + phy_interface_t phy_intf; + struct phy_device *phydev; + unsigned int link; + int speed; + int duplex; + int pause; + + /* Protects against concurrent descriptor access. */ + spinlock_t lock; + + struct rtnl_link_stats64 stats; + struct work_struct stats_work; +}; + +static struct device *mtk_star_get_dev(struct mtk_star_priv *priv) +{ + return priv->ndev->dev.parent; +} + +static const struct regmap_config mtk_star_regmap_config = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, + .disable_locking = true, +}; + +static void mtk_star_ring_init(struct mtk_star_ring *ring, + struct mtk_star_ring_desc *descs) +{ + memset(ring, 0, sizeof(*ring)); + ring->descs = descs; + ring->head = 0; + ring->tail = 0; +} + +static int mtk_star_ring_pop_tail(struct mtk_star_ring *ring, + struct mtk_star_ring_desc_data *desc_data) +{ + struct mtk_star_ring_desc *desc = &ring->descs[ring->tail]; + unsigned int status; + + status = READ_ONCE(desc->status); + dma_rmb(); /* Make sure we read the status bits before checking it. */ + + if (!(status & MTK_STAR_DESC_BIT_COWN)) + return -1; + + desc_data->len = status & MTK_STAR_DESC_MSK_LEN; + desc_data->flags = status & ~MTK_STAR_DESC_MSK_LEN; + desc_data->dma_addr = ring->dma_addrs[ring->tail]; + desc_data->skb = ring->skbs[ring->tail]; + + ring->dma_addrs[ring->tail] = 0; + ring->skbs[ring->tail] = NULL; + + status &= MTK_STAR_DESC_BIT_COWN | MTK_STAR_DESC_BIT_EOR; + + WRITE_ONCE(desc->data_ptr, 0); + WRITE_ONCE(desc->status, status); + + ring->tail = (ring->tail + 1) % MTK_STAR_RING_NUM_DESCS; + + return 0; +} + +static void mtk_star_ring_push_head(struct mtk_star_ring *ring, + struct mtk_star_ring_desc_data *desc_data, + unsigned int flags) +{ + struct mtk_star_ring_desc *desc = &ring->descs[ring->head]; + unsigned int status; + + status = READ_ONCE(desc->status); + + ring->skbs[ring->head] = desc_data->skb; + ring->dma_addrs[ring->head] = desc_data->dma_addr; + + status |= desc_data->len; + if (flags) + status |= flags; + + WRITE_ONCE(desc->data_ptr, desc_data->dma_addr); + WRITE_ONCE(desc->status, status); + status &= ~MTK_STAR_DESC_BIT_COWN; + /* Flush previous modifications before ownership change. 
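The ownership handoff around this point is the usual two-step: fill in every descriptor field while the CPU still owns it, then publish ownership behind a write barrier so the device never observes a half-written descriptor. A userspace analogue using a C11 release store where the driver uses dma_wmb(); the struct layout below is simplified and is not the real descriptor.

#include <stdatomic.h>
#include <stdint.h>

struct desc {
	uint32_t data_ptr;
	uint32_t len;
	_Atomic uint32_t owned_by_hw;
};

static void push(struct desc *d, uint32_t addr, uint32_t len)
{
	d->data_ptr = addr;
	d->len = len;
	// release ordering: both plain stores above become visible before the flag
	atomic_store_explicit(&d->owned_by_hw, 1, memory_order_release);
}

int main(void)
{
	struct desc d = { 0 };

	push(&d, 0x80100000u, 64);
	return (int)atomic_load(&d.owned_by_hw) - 1;	// exits 0 once handed over
}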
*/ + dma_wmb(); + WRITE_ONCE(desc->status, status); + + ring->head = (ring->head + 1) % MTK_STAR_RING_NUM_DESCS; +} + +static void +mtk_star_ring_push_head_rx(struct mtk_star_ring *ring, + struct mtk_star_ring_desc_data *desc_data) +{ + mtk_star_ring_push_head(ring, desc_data, 0); +} + +static void +mtk_star_ring_push_head_tx(struct mtk_star_ring *ring, + struct mtk_star_ring_desc_data *desc_data) +{ + static const unsigned int flags = MTK_STAR_DESC_BIT_FS | + MTK_STAR_DESC_BIT_LS | + MTK_STAR_DESC_BIT_INT; + + mtk_star_ring_push_head(ring, desc_data, flags); +} + +static unsigned int mtk_star_ring_num_used_descs(struct mtk_star_ring *ring) +{ + return abs(ring->head - ring->tail); +} + +static bool mtk_star_ring_full(struct mtk_star_ring *ring) +{ + return mtk_star_ring_num_used_descs(ring) == MTK_STAR_RING_NUM_DESCS; +} + +static bool mtk_star_ring_descs_available(struct mtk_star_ring *ring) +{ + return mtk_star_ring_num_used_descs(ring) > 0; +} + +static dma_addr_t mtk_star_dma_map_rx(struct mtk_star_priv *priv, + struct sk_buff *skb) +{ + struct device *dev = mtk_star_get_dev(priv); + + /* Data pointer for the RX DMA descriptor must be aligned to 4N + 2. */ + return dma_map_single(dev, skb_tail_pointer(skb) - 2, + skb_tailroom(skb), DMA_FROM_DEVICE); +} + +static void mtk_star_dma_unmap_rx(struct mtk_star_priv *priv, + struct mtk_star_ring_desc_data *desc_data) +{ + struct device *dev = mtk_star_get_dev(priv); + + dma_unmap_single(dev, desc_data->dma_addr, + skb_tailroom(desc_data->skb), DMA_FROM_DEVICE); +} + +static dma_addr_t mtk_star_dma_map_tx(struct mtk_star_priv *priv, + struct sk_buff *skb) +{ + struct device *dev = mtk_star_get_dev(priv); + + return dma_map_single(dev, skb->data, skb_headlen(skb), DMA_TO_DEVICE); +} + +static void mtk_star_dma_unmap_tx(struct mtk_star_priv *priv, + struct mtk_star_ring_desc_data *desc_data) +{ + struct device *dev = mtk_star_get_dev(priv); + + return dma_unmap_single(dev, desc_data->dma_addr, + skb_headlen(desc_data->skb), DMA_TO_DEVICE); +} + +static void mtk_star_nic_disable_pd(struct mtk_star_priv *priv) +{ + regmap_update_bits(priv->regs, MTK_STAR_REG_MAC_CFG, + MTK_STAR_BIT_MAC_CFG_NIC_PD, 0); +} + +/* Unmask the three interrupts we care about, mask all others. 
*/ +static void mtk_star_intr_enable(struct mtk_star_priv *priv) +{ + unsigned int val = MTK_STAR_BIT_INT_STS_TNTC | + MTK_STAR_BIT_INT_STS_FNRC | + MTK_STAR_REG_INT_STS_MIB_CNT_TH; + + regmap_write(priv->regs, MTK_STAR_REG_INT_MASK, ~val); +} + +static void mtk_star_intr_disable(struct mtk_star_priv *priv) +{ + regmap_write(priv->regs, MTK_STAR_REG_INT_MASK, ~0); +} + +static void mtk_star_intr_enable_tx(struct mtk_star_priv *priv) +{ + regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK, + MTK_STAR_BIT_INT_STS_TNTC, 0); +} + +static void mtk_star_intr_enable_rx(struct mtk_star_priv *priv) +{ + regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK, + MTK_STAR_BIT_INT_STS_FNRC, 0); +} + +static void mtk_star_intr_enable_stats(struct mtk_star_priv *priv) +{ + regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK, + MTK_STAR_REG_INT_STS_MIB_CNT_TH, 0); +} + +static void mtk_star_intr_disable_tx(struct mtk_star_priv *priv) +{ + regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK, + MTK_STAR_BIT_INT_STS_TNTC, + MTK_STAR_BIT_INT_STS_TNTC); +} + +static void mtk_star_intr_disable_rx(struct mtk_star_priv *priv) +{ + regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK, + MTK_STAR_BIT_INT_STS_FNRC, + MTK_STAR_BIT_INT_STS_FNRC); +} + +static void mtk_star_intr_disable_stats(struct mtk_star_priv *priv) +{ + regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK, + MTK_STAR_REG_INT_STS_MIB_CNT_TH, + MTK_STAR_REG_INT_STS_MIB_CNT_TH); +} + +static unsigned int mtk_star_intr_read(struct mtk_star_priv *priv) +{ + unsigned int val; + + regmap_read(priv->regs, MTK_STAR_REG_INT_STS, &val); + + return val; +} + +static unsigned int mtk_star_intr_ack_all(struct mtk_star_priv *priv) +{ + unsigned int val; + + val = mtk_star_intr_read(priv); + regmap_write(priv->regs, MTK_STAR_REG_INT_STS, val); + + return val; +} + +static void mtk_star_dma_init(struct mtk_star_priv *priv) +{ + struct mtk_star_ring_desc *desc; + unsigned int val; + int i; + + priv->descs_base = (struct mtk_star_ring_desc *)priv->ring_base; + + for (i = 0; i < MTK_STAR_NUM_DESCS_TOTAL; i++) { + desc = &priv->descs_base[i]; + + memset(desc, 0, sizeof(*desc)); + desc->status = MTK_STAR_DESC_BIT_COWN; + if ((i == MTK_STAR_NUM_TX_DESCS - 1) || + (i == MTK_STAR_NUM_DESCS_TOTAL - 1)) + desc->status |= MTK_STAR_DESC_BIT_EOR; + } + + mtk_star_ring_init(&priv->tx_ring, priv->descs_base); + mtk_star_ring_init(&priv->rx_ring, + priv->descs_base + MTK_STAR_NUM_TX_DESCS); + + /* Set DMA pointers. */ + val = (unsigned int)priv->dma_addr; + regmap_write(priv->regs, MTK_STAR_REG_TX_BASE_ADDR, val); + regmap_write(priv->regs, MTK_STAR_REG_TX_DPTR, val); + + val += sizeof(struct mtk_star_ring_desc) * MTK_STAR_NUM_TX_DESCS; + regmap_write(priv->regs, MTK_STAR_REG_RX_BASE_ADDR, val); + regmap_write(priv->regs, MTK_STAR_REG_RX_DPTR, val); +} + +static void mtk_star_dma_start(struct mtk_star_priv *priv) +{ + regmap_update_bits(priv->regs, MTK_STAR_REG_TX_DMA_CTRL, + MTK_STAR_BIT_TX_DMA_CTRL_START, + MTK_STAR_BIT_TX_DMA_CTRL_START); + regmap_update_bits(priv->regs, MTK_STAR_REG_RX_DMA_CTRL, + MTK_STAR_BIT_RX_DMA_CTRL_START, + MTK_STAR_BIT_RX_DMA_CTRL_START); +} + +static void mtk_star_dma_stop(struct mtk_star_priv *priv) +{ + regmap_write(priv->regs, MTK_STAR_REG_TX_DMA_CTRL, + MTK_STAR_BIT_TX_DMA_CTRL_STOP); + regmap_write(priv->regs, MTK_STAR_REG_RX_DMA_CTRL, + MTK_STAR_BIT_RX_DMA_CTRL_STOP); +} + +static void mtk_star_dma_disable(struct mtk_star_priv *priv) +{ + int i; + + mtk_star_dma_stop(priv); + + /* Take back all descriptors. 
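mtk_star_dma_init() lays both rings out in one DMA allocation: TX descriptors first, RX descriptors immediately after, with the EOR bit marking the last entry of each ring. The base-address arithmetic is easy to sanity-check in isolation; the DMA address below is made up and the 16-byte size is simply sizeof() of the four-word descriptor.

#include <stdio.h>

int main(void)
{
	unsigned int desc_sz = 16;			// four u32 fields per descriptor
	unsigned int num_tx = 128, num_rx = 128;	// MTK_STAR_NUM_TX/RX_DESCS
	unsigned long long base = 0x80100000ULL;	// example DMA address

	printf("tx base 0x%llx\n", base);
	printf("rx base 0x%llx\n", base + (unsigned long long)desc_sz * num_tx);
	printf("EOR on tx[%u] and rx[%u]\n", num_tx - 1, num_rx - 1);
	return 0;
}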
*/ + for (i = 0; i < MTK_STAR_NUM_DESCS_TOTAL; i++) + priv->descs_base[i].status |= MTK_STAR_DESC_BIT_COWN; +} + +static void mtk_star_dma_resume_rx(struct mtk_star_priv *priv) +{ + regmap_update_bits(priv->regs, MTK_STAR_REG_RX_DMA_CTRL, + MTK_STAR_BIT_RX_DMA_CTRL_RESUME, + MTK_STAR_BIT_RX_DMA_CTRL_RESUME); +} + +static void mtk_star_dma_resume_tx(struct mtk_star_priv *priv) +{ + regmap_update_bits(priv->regs, MTK_STAR_REG_TX_DMA_CTRL, + MTK_STAR_BIT_TX_DMA_CTRL_RESUME, + MTK_STAR_BIT_TX_DMA_CTRL_RESUME); +} + +static void mtk_star_set_mac_addr(struct net_device *ndev) +{ + struct mtk_star_priv *priv = netdev_priv(ndev); + u8 *mac_addr = ndev->dev_addr; + unsigned int high, low; + + high = mac_addr[0] << 8 | mac_addr[1] << 0; + low = mac_addr[2] << 24 | mac_addr[3] << 16 | + mac_addr[4] << 8 | mac_addr[5]; + + regmap_write(priv->regs, MTK_STAR_REG_MY_MAC_H, high); + regmap_write(priv->regs, MTK_STAR_REG_MY_MAC_L, low); +} + +static void mtk_star_reset_counters(struct mtk_star_priv *priv) +{ + static const unsigned int counter_regs[] = { + MTK_STAR_REG_C_RXOKPKT, + MTK_STAR_REG_C_RXOKBYTE, + MTK_STAR_REG_C_RXRUNT, + MTK_STAR_REG_C_RXLONG, + MTK_STAR_REG_C_RXDROP, + MTK_STAR_REG_C_RXCRC, + MTK_STAR_REG_C_RXARLDROP, + MTK_STAR_REG_C_RXVLANDROP, + MTK_STAR_REG_C_RXCSERR, + MTK_STAR_REG_C_RXPAUSE, + MTK_STAR_REG_C_TXOKPKT, + MTK_STAR_REG_C_TXOKBYTE, + MTK_STAR_REG_C_TXPAUSECOL, + MTK_STAR_REG_C_TXRTY, + MTK_STAR_REG_C_TXSKIP, + MTK_STAR_REG_C_TX_ARP, + MTK_STAR_REG_C_RX_RERR, + MTK_STAR_REG_C_RX_UNI, + MTK_STAR_REG_C_RX_MULTI, + MTK_STAR_REG_C_RX_BROAD, + MTK_STAR_REG_C_RX_ALIGNERR, + MTK_STAR_REG_C_TX_UNI, + MTK_STAR_REG_C_TX_MULTI, + MTK_STAR_REG_C_TX_BROAD, + MTK_STAR_REG_C_TX_TIMEOUT, + MTK_STAR_REG_C_TX_LATECOL, + MTK_STAR_REG_C_RX_LENGTHERR, + MTK_STAR_REG_C_RX_TWIST, + }; + + unsigned int i, val; + + for (i = 0; i < ARRAY_SIZE(counter_regs); i++) + regmap_read(priv->regs, counter_regs[i], &val); +} + +static void mtk_star_update_stat(struct mtk_star_priv *priv, + unsigned int reg, u64 *stat) +{ + unsigned int val; + + regmap_read(priv->regs, reg, &val); + *stat += val; +} + +/* Try to get as many stats as possible from the internal registers instead + * of tracking them ourselves. + */ +static void mtk_star_update_stats(struct mtk_star_priv *priv) +{ + struct rtnl_link_stats64 *stats = &priv->stats; + + /* OK packets and bytes. */ + mtk_star_update_stat(priv, MTK_STAR_REG_C_RXOKPKT, &stats->rx_packets); + mtk_star_update_stat(priv, MTK_STAR_REG_C_TXOKPKT, &stats->tx_packets); + mtk_star_update_stat(priv, MTK_STAR_REG_C_RXOKBYTE, &stats->rx_bytes); + mtk_star_update_stat(priv, MTK_STAR_REG_C_TXOKBYTE, &stats->tx_bytes); + + /* RX & TX multicast. */ + mtk_star_update_stat(priv, MTK_STAR_REG_C_RX_MULTI, &stats->multicast); + mtk_star_update_stat(priv, MTK_STAR_REG_C_TX_MULTI, &stats->multicast); + + /* Collisions. */ + mtk_star_update_stat(priv, MTK_STAR_REG_C_TXPAUSECOL, + &stats->collisions); + mtk_star_update_stat(priv, MTK_STAR_REG_C_TX_LATECOL, + &stats->collisions); + mtk_star_update_stat(priv, MTK_STAR_REG_C_RXRUNT, &stats->collisions); + + /* RX Errors. 
*/ + mtk_star_update_stat(priv, MTK_STAR_REG_C_RX_LENGTHERR, + &stats->rx_length_errors); + mtk_star_update_stat(priv, MTK_STAR_REG_C_RXLONG, + &stats->rx_over_errors); + mtk_star_update_stat(priv, MTK_STAR_REG_C_RXCRC, &stats->rx_crc_errors); + mtk_star_update_stat(priv, MTK_STAR_REG_C_RX_ALIGNERR, + &stats->rx_frame_errors); + mtk_star_update_stat(priv, MTK_STAR_REG_C_RXDROP, + &stats->rx_fifo_errors); + /* Sum of the general RX error counter + all of the above. */ + mtk_star_update_stat(priv, MTK_STAR_REG_C_RX_RERR, &stats->rx_errors); + stats->rx_errors += stats->rx_length_errors; + stats->rx_errors += stats->rx_over_errors; + stats->rx_errors += stats->rx_crc_errors; + stats->rx_errors += stats->rx_frame_errors; + stats->rx_errors += stats->rx_fifo_errors; +} + +/* This runs in process context and parallel TX and RX paths executing in + * napi context may result in losing some stats data but this should happen + * seldom enough to be acceptable. + */ +static void mtk_star_update_stats_work(struct work_struct *work) +{ + struct mtk_star_priv *priv = container_of(work, struct mtk_star_priv, + stats_work); + + mtk_star_update_stats(priv); + mtk_star_reset_counters(priv); + mtk_star_intr_enable_stats(priv); +} + +static struct sk_buff *mtk_star_alloc_skb(struct net_device *ndev) +{ + uintptr_t tail, offset; + struct sk_buff *skb; + + skb = dev_alloc_skb(MTK_STAR_MAX_FRAME_SIZE); + if (!skb) + return NULL; + + /* Align to 16 bytes. */ + tail = (uintptr_t)skb_tail_pointer(skb); + if (tail & (MTK_STAR_SKB_ALIGNMENT - 1)) { + offset = tail & (MTK_STAR_SKB_ALIGNMENT - 1); + skb_reserve(skb, MTK_STAR_SKB_ALIGNMENT - offset); + } + + /* Ensure 16-byte alignment of the skb pointer: eth_type_trans() will + * extract the Ethernet header (14 bytes) so we need two more bytes. 
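The allocation helper here first rounds the tail pointer up to a 16-byte boundary, then shifts the data start by another two bytes so the IP header behind the 14-byte Ethernet header lands on a 4-byte boundary. The same arithmetic in isolation, with an arbitrary starting pointer value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uintptr_t tail = 0x1003;		// example unaligned tail pointer
	uintptr_t align = 16, ip_align = 2;
	uintptr_t off = tail & (align - 1);

	if (off)
		tail += align - off;		// skb_reserve(MTK_STAR_SKB_ALIGNMENT - offset)
	tail += ip_align;			// skb_reserve(MTK_STAR_IP_ALIGN)

	printf("data 0x%lx, IP header 0x%lx\n",
	       (unsigned long)tail, (unsigned long)(tail + 14));	// 0x1012, 0x1020
	return 0;
}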
+ */ + skb_reserve(skb, MTK_STAR_IP_ALIGN); + + return skb; +} + +static int mtk_star_prepare_rx_skbs(struct net_device *ndev) +{ + struct mtk_star_priv *priv = netdev_priv(ndev); + struct mtk_star_ring *ring = &priv->rx_ring; + struct device *dev = mtk_star_get_dev(priv); + struct mtk_star_ring_desc *desc; + struct sk_buff *skb; + dma_addr_t dma_addr; + int i; + + for (i = 0; i < MTK_STAR_NUM_RX_DESCS; i++) { + skb = mtk_star_alloc_skb(ndev); + if (!skb) + return -ENOMEM; + + dma_addr = mtk_star_dma_map_rx(priv, skb); + if (dma_mapping_error(dev, dma_addr)) { + dev_kfree_skb(skb); + return -ENOMEM; + } + + desc = &ring->descs[i]; + desc->data_ptr = dma_addr; + desc->status |= skb_tailroom(skb) & MTK_STAR_DESC_MSK_LEN; + desc->status &= ~MTK_STAR_DESC_BIT_COWN; + ring->skbs[i] = skb; + ring->dma_addrs[i] = dma_addr; + } + + return 0; +} + +static void +mtk_star_ring_free_skbs(struct mtk_star_priv *priv, struct mtk_star_ring *ring, + void (*unmap_func)(struct mtk_star_priv *, + struct mtk_star_ring_desc_data *)) +{ + struct mtk_star_ring_desc_data desc_data; + struct mtk_star_ring_desc *desc; + int i; + + for (i = 0; i < MTK_STAR_RING_NUM_DESCS; i++) { + if (!ring->dma_addrs[i]) + continue; + + desc = &ring->descs[i]; + + desc_data.dma_addr = ring->dma_addrs[i]; + desc_data.skb = ring->skbs[i]; + + unmap_func(priv, &desc_data); + dev_kfree_skb(desc_data.skb); + } +} + +static void mtk_star_free_rx_skbs(struct mtk_star_priv *priv) +{ + struct mtk_star_ring *ring = &priv->rx_ring; + + mtk_star_ring_free_skbs(priv, ring, mtk_star_dma_unmap_rx); +} + +static void mtk_star_free_tx_skbs(struct mtk_star_priv *priv) +{ + struct mtk_star_ring *ring = &priv->tx_ring; + + mtk_star_ring_free_skbs(priv, ring, mtk_star_dma_unmap_tx); +} + +/* All processing for TX and RX happens in the napi poll callback. */ +static irqreturn_t mtk_star_handle_irq(int irq, void *data) +{ + struct mtk_star_priv *priv; + struct net_device *ndev; + bool need_napi = false; + unsigned int status; + + ndev = data; + priv = netdev_priv(ndev); + + if (netif_running(ndev)) { + status = mtk_star_intr_read(priv); + + if (status & MTK_STAR_BIT_INT_STS_TNTC) { + mtk_star_intr_disable_tx(priv); + need_napi = true; + } + + if (status & MTK_STAR_BIT_INT_STS_FNRC) { + mtk_star_intr_disable_rx(priv); + need_napi = true; + } + + if (need_napi) + napi_schedule(&priv->napi); + + /* One of the counters reached 0x8000000 - update stats and + * reset all counters. + */ + if (unlikely(status & MTK_STAR_REG_INT_STS_MIB_CNT_TH)) { + mtk_star_intr_disable_stats(priv); + schedule_work(&priv->stats_work); + } + + mtk_star_intr_ack_all(priv); + } + + return IRQ_HANDLED; +} + +/* Wait for the completion of any previous command - CMD_START bit must be + * cleared by hardware. + */ +static int mtk_star_hash_wait_cmd_start(struct mtk_star_priv *priv) +{ + unsigned int val; + + return regmap_read_poll_timeout_atomic(priv->regs, + MTK_STAR_REG_HASH_CTRL, val, + !(val & MTK_STAR_BIT_HASH_CTRL_CMD_START), + 10, MTK_STAR_WAIT_TIMEOUT); +} + +static int mtk_star_hash_wait_ok(struct mtk_star_priv *priv) +{ + unsigned int val; + int ret; + + /* Wait for BIST_DONE bit. */ + ret = regmap_read_poll_timeout_atomic(priv->regs, + MTK_STAR_REG_HASH_CTRL, val, + val & MTK_STAR_BIT_HASH_CTRL_BIST_DONE, + 10, MTK_STAR_WAIT_TIMEOUT); + if (ret) + return ret; + + /* Check the BIST_OK bit. 
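The hash-table helpers above all follow the same bounded-polling shape: read a status register until a bit clears (or sets) or a time budget runs out, then report -ETIMEDOUT. Below is a self-contained stand-in for regmap_read_poll_timeout_atomic(); the fake register and the third-read completion are invented purely so the loop terminates.

#include <stdio.h>

static unsigned int fake_reg = 0x4000;		// bit 14 set: command still running

static unsigned int read_reg(void)
{
	static int reads;

	if (++reads == 3)			// pretend the hardware finishes on the 3rd read
		fake_reg &= ~0x4000;
	return fake_reg;
}

static int wait_bit_clear(unsigned int bit, int max_polls)
{
	for (int i = 0; i < max_polls; i++)
		if (!(read_reg() & bit))
			return 0;		// condition met within budget
	return -1;				// timed out (-ETIMEDOUT in the driver)
}

int main(void)
{
	printf("%d\n", wait_bit_clear(0x4000, 30));	// 0: completed in time
	return 0;
}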
*/ + regmap_read(priv->regs, MTK_STAR_REG_HASH_CTRL, &val); + if (!(val & MTK_STAR_BIT_HASH_CTRL_BIST_OK)) + return -EIO; + + return 0; +} + +static int mtk_star_set_hashbit(struct mtk_star_priv *priv, + unsigned int hash_addr) +{ + unsigned int val; + int ret; + + ret = mtk_star_hash_wait_cmd_start(priv); + if (ret) + return ret; + + val = hash_addr & MTK_STAR_MSK_HASH_CTRL_HASH_BIT_ADDR; + val |= MTK_STAR_BIT_HASH_CTRL_ACC_CMD; + val |= MTK_STAR_BIT_HASH_CTRL_CMD_START; + val |= MTK_STAR_BIT_HASH_CTRL_BIST_EN; + val |= MTK_STAR_BIT_HASH_CTRL_HASH_BIT_DATA; + regmap_write(priv->regs, MTK_STAR_REG_HASH_CTRL, val); + + return mtk_star_hash_wait_ok(priv); +} + +static int mtk_star_reset_hash_table(struct mtk_star_priv *priv) +{ + int ret; + + ret = mtk_star_hash_wait_cmd_start(priv); + if (ret) + return ret; + + regmap_update_bits(priv->regs, MTK_STAR_REG_HASH_CTRL, + MTK_STAR_BIT_HASH_CTRL_BIST_EN, + MTK_STAR_BIT_HASH_CTRL_BIST_EN); + regmap_update_bits(priv->regs, MTK_STAR_REG_TEST1, + MTK_STAR_BIT_TEST1_RST_HASH_MBIST, + MTK_STAR_BIT_TEST1_RST_HASH_MBIST); + + return mtk_star_hash_wait_ok(priv); +} + +static void mtk_star_phy_config(struct mtk_star_priv *priv) +{ + unsigned int val; + + if (priv->speed == SPEED_1000) + val = MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_1000M; + else if (priv->speed == SPEED_100) + val = MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_100M; + else + val = MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_10M; + val <<= MTK_STAR_OFF_PHY_CTRL1_FORCE_SPD; + + val |= MTK_STAR_BIT_PHY_CTRL1_AN_EN; + val |= MTK_STAR_BIT_PHY_CTRL1_FORCE_FC_RX; + val |= MTK_STAR_BIT_PHY_CTRL1_FORCE_FC_TX; + /* Only full-duplex supported for now. */ + val |= MTK_STAR_BIT_PHY_CTRL1_FORCE_DPX; + + regmap_write(priv->regs, MTK_STAR_REG_PHY_CTRL1, val); + + if (priv->pause) { + val = MTK_STAR_VAL_FC_CFG_SEND_PAUSE_TH_2K; + val <<= MTK_STAR_OFF_FC_CFG_SEND_PAUSE_TH; + val |= MTK_STAR_BIT_FC_CFG_UC_PAUSE_DIR; + } else { + val = 0; + } + + regmap_update_bits(priv->regs, MTK_STAR_REG_FC_CFG, + MTK_STAR_MSK_FC_CFG_SEND_PAUSE_TH | + MTK_STAR_BIT_FC_CFG_UC_PAUSE_DIR, val); + + if (priv->pause) { + val = MTK_STAR_VAL_EXT_CFG_SND_PAUSE_RLS_1K; + val <<= MTK_STAR_OFF_EXT_CFG_SND_PAUSE_RLS; + } else { + val = 0; + } + + regmap_update_bits(priv->regs, MTK_STAR_REG_EXT_CFG, + MTK_STAR_MSK_EXT_CFG_SND_PAUSE_RLS, val); +} + +static void mtk_star_adjust_link(struct net_device *ndev) +{ + struct mtk_star_priv *priv = netdev_priv(ndev); + struct phy_device *phydev = priv->phydev; + bool new_state = false; + + if (phydev->link) { + if (!priv->link) { + priv->link = phydev->link; + new_state = true; + } + + if (priv->speed != phydev->speed) { + priv->speed = phydev->speed; + new_state = true; + } + + if (priv->pause != phydev->pause) { + priv->pause = phydev->pause; + new_state = true; + } + } else { + if (priv->link) { + priv->link = phydev->link; + new_state = true; + } + } + + if (new_state) { + if (phydev->link) + mtk_star_phy_config(priv); + + phy_print_status(ndev->phydev); + } +} + +static void mtk_star_init_config(struct mtk_star_priv *priv) +{ + unsigned int val; + + val = (MTK_STAR_BIT_MII_PAD_OUT_ENABLE | + MTK_STAR_BIT_EXT_MDC_MODE | + MTK_STAR_BIT_SWC_MII_MODE); + + regmap_write(priv->regs, MTK_STAR_REG_SYS_CONF, val); + regmap_update_bits(priv->regs, MTK_STAR_REG_MAC_CLK_CONF, + MTK_STAR_MSK_MAC_CLK_CONF, + MTK_STAR_BIT_CLK_DIV_10); +} + +static void mtk_star_set_mode_rmii(struct mtk_star_priv *priv) +{ + regmap_update_bits(priv->pericfg, MTK_PERICFG_REG_NIC_CFG_CON, + MTK_PERICFG_MSK_NIC_CFG_CON_CFG_MII, + 
MTK_PERICFG_BIT_NIC_CFG_CON_RMII); +} + +static int mtk_star_enable(struct net_device *ndev) +{ + struct mtk_star_priv *priv = netdev_priv(ndev); + unsigned int val; + int ret; + + mtk_star_nic_disable_pd(priv); + mtk_star_intr_disable(priv); + mtk_star_dma_stop(priv); + + mtk_star_set_mac_addr(ndev); + + /* Configure the MAC */ + val = MTK_STAR_VAL_MAC_CFG_IPG_96BIT; + val <<= MTK_STAR_OFF_MAC_CFG_IPG; + val |= MTK_STAR_BIT_MAC_CFG_MAXLEN_1522; + val |= MTK_STAR_BIT_MAC_CFG_AUTO_PAD; + val |= MTK_STAR_BIT_MAC_CFG_CRC_STRIP; + regmap_write(priv->regs, MTK_STAR_REG_MAC_CFG, val); + + /* Enable Hash Table BIST and reset it */ + ret = mtk_star_reset_hash_table(priv); + if (ret) + return ret; + + /* Setup the hashing algorithm */ + regmap_update_bits(priv->regs, MTK_STAR_REG_ARL_CFG, + MTK_STAR_BIT_ARL_CFG_HASH_ALG | + MTK_STAR_BIT_ARL_CFG_MISC_MODE, 0); + + /* Don't strip VLAN tags */ + regmap_update_bits(priv->regs, MTK_STAR_REG_MAC_CFG, + MTK_STAR_BIT_MAC_CFG_VLAN_STRIP, 0); + + /* Setup DMA */ + mtk_star_dma_init(priv); + + ret = mtk_star_prepare_rx_skbs(ndev); + if (ret) + goto err_out; + + /* Request the interrupt */ + ret = request_irq(ndev->irq, mtk_star_handle_irq, + IRQF_TRIGGER_FALLING, ndev->name, ndev); + if (ret) + goto err_free_skbs; + + napi_enable(&priv->napi); + + mtk_star_intr_ack_all(priv); + mtk_star_intr_enable(priv); + + /* Connect to and start PHY */ + priv->phydev = of_phy_connect(ndev, priv->phy_node, + mtk_star_adjust_link, 0, priv->phy_intf); + if (!priv->phydev) { + netdev_err(ndev, "failed to connect to PHY\n"); + goto err_free_irq; + } + + mtk_star_dma_start(priv); + phy_start(priv->phydev); + netif_start_queue(ndev); + + return 0; + +err_free_irq: + free_irq(ndev->irq, ndev); +err_free_skbs: + mtk_star_free_rx_skbs(priv); +err_out: + return ret; +} + +static void mtk_star_disable(struct net_device *ndev) +{ + struct mtk_star_priv *priv = netdev_priv(ndev); + + netif_stop_queue(ndev); + napi_disable(&priv->napi); + mtk_star_intr_disable(priv); + mtk_star_dma_disable(priv); + mtk_star_intr_ack_all(priv); + phy_stop(priv->phydev); + phy_disconnect(priv->phydev); + free_irq(ndev->irq, ndev); + mtk_star_free_rx_skbs(priv); + mtk_star_free_tx_skbs(priv); +} + +static int mtk_star_netdev_open(struct net_device *ndev) +{ + return mtk_star_enable(ndev); +} + +static int mtk_star_netdev_stop(struct net_device *ndev) +{ + mtk_star_disable(ndev); + + return 0; +} + +static int mtk_star_netdev_ioctl(struct net_device *ndev, + struct ifreq *req, int cmd) +{ + if (!netif_running(ndev)) + return -EINVAL; + + return phy_mii_ioctl(ndev->phydev, req, cmd); +} + +static int mtk_star_netdev_start_xmit(struct sk_buff *skb, + struct net_device *ndev) +{ + struct mtk_star_priv *priv = netdev_priv(ndev); + struct mtk_star_ring *ring = &priv->tx_ring; + struct device *dev = mtk_star_get_dev(priv); + struct mtk_star_ring_desc_data desc_data; + + desc_data.dma_addr = mtk_star_dma_map_tx(priv, skb); + if (dma_mapping_error(dev, desc_data.dma_addr)) + goto err_drop_packet; + + desc_data.skb = skb; + desc_data.len = skb->len; + + spin_lock_bh(&priv->lock); + + mtk_star_ring_push_head_tx(ring, &desc_data); + + netdev_sent_queue(ndev, skb->len); + + if (mtk_star_ring_full(ring)) + netif_stop_queue(ndev); + + spin_unlock_bh(&priv->lock); + + mtk_star_dma_resume_tx(priv); + + return NETDEV_TX_OK; + +err_drop_packet: + dev_kfree_skb(skb); + ndev->stats.tx_dropped++; + return NETDEV_TX_BUSY; +} + +/* Returns the number of bytes sent or a negative number on the first + * descriptor owned by DMA. 
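mtk_star_enable() uses the usual goto-unwind idiom on its error paths: each label undoes only the steps that already succeeded, in reverse order (err_free_irq, then err_free_skbs, then err_out). A minimal standalone sketch of that structure with stubbed-out steps, unrelated to the actual hardware setup:

#include <stdio.h>

static int step_a(void) { return 0; }
static int step_b(void) { return 0; }
static int step_c(void) { return -1; }		// pretend the last step fails
static void undo_a(void) { puts("undo a"); }
static void undo_b(void) { puts("undo b"); }

static int bring_up(void)
{
	int ret;

	ret = step_a();
	if (ret)
		goto err_out;
	ret = step_b();
	if (ret)
		goto err_undo_a;
	ret = step_c();
	if (ret)
		goto err_undo_b;
	return 0;

err_undo_b:
	undo_b();
err_undo_a:
	undo_a();
err_out:
	return ret;
}

int main(void)
{
	printf("bring_up: %d\n", bring_up());	// prints undo b, undo a, then -1
	return 0;
}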
+ */ +static int mtk_star_tx_complete_one(struct mtk_star_priv *priv) +{ + struct mtk_star_ring *ring = &priv->tx_ring; + struct mtk_star_ring_desc_data desc_data; + int ret; + + ret = mtk_star_ring_pop_tail(ring, &desc_data); + if (ret) + return ret; + + mtk_star_dma_unmap_tx(priv, &desc_data); + ret = desc_data.skb->len; + dev_kfree_skb_irq(desc_data.skb); + + return ret; +} + +static void mtk_star_tx_complete_all(struct mtk_star_priv *priv) +{ + struct mtk_star_ring *ring = &priv->tx_ring; + struct net_device *ndev = priv->ndev; + int ret, pkts_compl, bytes_compl; + bool wake = false; + + spin_lock(&priv->lock); + + for (pkts_compl = 0, bytes_compl = 0;; + pkts_compl++, bytes_compl += ret, wake = true) { + if (!mtk_star_ring_descs_available(ring)) + break; + + ret = mtk_star_tx_complete_one(priv); + if (ret < 0) + break; + } + + netdev_completed_queue(ndev, pkts_compl, bytes_compl); + + if (wake && netif_queue_stopped(ndev)) + netif_wake_queue(ndev); + + mtk_star_intr_enable_tx(priv); + + spin_unlock(&priv->lock); +} + +static void mtk_star_netdev_get_stats64(struct net_device *ndev, + struct rtnl_link_stats64 *stats) +{ + struct mtk_star_priv *priv = netdev_priv(ndev); + + mtk_star_update_stats(priv); + + memcpy(stats, &priv->stats, sizeof(*stats)); +} + +static void mtk_star_set_rx_mode(struct net_device *ndev) +{ + struct mtk_star_priv *priv = netdev_priv(ndev); + struct netdev_hw_addr *hw_addr; + unsigned int hash_addr, i; + int ret; + + if (ndev->flags & IFF_PROMISC) { + regmap_update_bits(priv->regs, MTK_STAR_REG_ARL_CFG, + MTK_STAR_BIT_ARL_CFG_MISC_MODE, + MTK_STAR_BIT_ARL_CFG_MISC_MODE); + } else if (netdev_mc_count(ndev) > MTK_STAR_HASHTABLE_MC_LIMIT || + ndev->flags & IFF_ALLMULTI) { + for (i = 0; i < MTK_STAR_HASHTABLE_SIZE_MAX; i++) { + ret = mtk_star_set_hashbit(priv, i); + if (ret) + goto hash_fail; + } + } else { + /* Clear previous settings. */ + ret = mtk_star_reset_hash_table(priv); + if (ret) + goto hash_fail; + + netdev_for_each_mc_addr(hw_addr, ndev) { + hash_addr = (hw_addr->addr[0] & 0x01) << 8; + hash_addr += hw_addr->addr[5]; + ret = mtk_star_set_hashbit(priv, hash_addr); + if (ret) + goto hash_fail; + } + } + + return; + +hash_fail: + if (ret == -ETIMEDOUT) + netdev_err(ndev, "setting hash bit timed out\n"); + else + /* Should be -EIO */ + netdev_err(ndev, "unable to set hash bit"); +} + +static const struct net_device_ops mtk_star_netdev_ops = { + .ndo_open = mtk_star_netdev_open, + .ndo_stop = mtk_star_netdev_stop, + .ndo_start_xmit = mtk_star_netdev_start_xmit, + .ndo_get_stats64 = mtk_star_netdev_get_stats64, + .ndo_set_rx_mode = mtk_star_set_rx_mode, + .ndo_do_ioctl = mtk_star_netdev_ioctl, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, +}; + +static void mtk_star_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + strlcpy(info->driver, MTK_STAR_DRVNAME, sizeof(info->driver)); +} + +/* TODO Add ethtool stats. 
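+ * For now only the driver name is reported through mtk_star_get_drvinfo().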
*/ +static const struct ethtool_ops mtk_star_ethtool_ops = { + .get_drvinfo = mtk_star_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_link_ksettings = phy_ethtool_get_link_ksettings, + .set_link_ksettings = phy_ethtool_set_link_ksettings, +}; + +static int mtk_star_receive_packet(struct mtk_star_priv *priv) +{ + struct mtk_star_ring *ring = &priv->rx_ring; + struct device *dev = mtk_star_get_dev(priv); + struct mtk_star_ring_desc_data desc_data; + struct net_device *ndev = priv->ndev; + struct sk_buff *curr_skb, *new_skb; + dma_addr_t new_dma_addr; + int ret; + + spin_lock(&priv->lock); + ret = mtk_star_ring_pop_tail(ring, &desc_data); + spin_unlock(&priv->lock); + if (ret) + return -1; + + curr_skb = desc_data.skb; + + if ((desc_data.flags & MTK_STAR_DESC_BIT_RX_CRCE) || + (desc_data.flags & MTK_STAR_DESC_BIT_RX_OSIZE)) { + /* Error packet -> drop and reuse skb. */ + new_skb = curr_skb; + goto push_new_skb; + } + + /* Prepare new skb before receiving the current one. Reuse the current + * skb if we fail at any point. + */ + new_skb = mtk_star_alloc_skb(ndev); + if (!new_skb) { + ndev->stats.rx_dropped++; + new_skb = curr_skb; + goto push_new_skb; + } + + new_dma_addr = mtk_star_dma_map_rx(priv, new_skb); + if (dma_mapping_error(dev, new_dma_addr)) { + ndev->stats.rx_dropped++; + dev_kfree_skb(new_skb); + new_skb = curr_skb; + netdev_err(ndev, "DMA mapping error of RX descriptor\n"); + goto push_new_skb; + } + + /* We can't fail anymore at this point: it's safe to unmap the skb. */ + mtk_star_dma_unmap_rx(priv, &desc_data); + + skb_put(desc_data.skb, desc_data.len); + desc_data.skb->ip_summed = CHECKSUM_NONE; + desc_data.skb->protocol = eth_type_trans(desc_data.skb, ndev); + desc_data.skb->dev = ndev; + netif_receive_skb(desc_data.skb); + +push_new_skb: + desc_data.dma_addr = new_dma_addr; + desc_data.len = skb_tailroom(new_skb); + desc_data.skb = new_skb; + + spin_lock(&priv->lock); + mtk_star_ring_push_head_rx(ring, &desc_data); + spin_unlock(&priv->lock); + + return 0; +} + +static int mtk_star_process_rx(struct mtk_star_priv *priv, int budget) +{ + int received, ret; + + for (received = 0, ret = 0; received < budget && ret == 0; received++) + ret = mtk_star_receive_packet(priv); + + mtk_star_dma_resume_rx(priv); + + return received; +} + +static int mtk_star_poll(struct napi_struct *napi, int budget) +{ + struct mtk_star_priv *priv; + int received = 0; + + priv = container_of(napi, struct mtk_star_priv, napi); + + /* Clean-up all TX descriptors. */ + mtk_star_tx_complete_all(priv); + /* Receive up to $budget packets. 
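+ * If fewer packets than $budget were received, NAPI is completed and the
+ * RX interrupt is re-enabled.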
*/ + received = mtk_star_process_rx(priv, budget); + + if (received < budget) { + napi_complete_done(napi, received); + mtk_star_intr_enable_rx(priv); + } + + return received; +} + +static void mtk_star_mdio_rwok_clear(struct mtk_star_priv *priv) +{ + regmap_write(priv->regs, MTK_STAR_REG_PHY_CTRL0, + MTK_STAR_BIT_PHY_CTRL0_RWOK); +} + +static int mtk_star_mdio_rwok_wait(struct mtk_star_priv *priv) +{ + unsigned int val; + + return regmap_read_poll_timeout(priv->regs, MTK_STAR_REG_PHY_CTRL0, + val, val & MTK_STAR_BIT_PHY_CTRL0_RWOK, + 10, MTK_STAR_WAIT_TIMEOUT); +} + +static int mtk_star_mdio_read(struct mii_bus *mii, int phy_id, int regnum) +{ + struct mtk_star_priv *priv = mii->priv; + unsigned int val, data; + int ret; + + if (regnum & MII_ADDR_C45) + return -EOPNOTSUPP; + + mtk_star_mdio_rwok_clear(priv); + + val = (regnum << MTK_STAR_OFF_PHY_CTRL0_PREG); + val &= MTK_STAR_MSK_PHY_CTRL0_PREG; + val |= MTK_STAR_BIT_PHY_CTRL0_RDCMD; + + regmap_write(priv->regs, MTK_STAR_REG_PHY_CTRL0, val); + + ret = mtk_star_mdio_rwok_wait(priv); + if (ret) + return ret; + + regmap_read(priv->regs, MTK_STAR_REG_PHY_CTRL0, &data); + + data &= MTK_STAR_MSK_PHY_CTRL0_RWDATA; + data >>= MTK_STAR_OFF_PHY_CTRL0_RWDATA; + + return data; +} + +static int mtk_star_mdio_write(struct mii_bus *mii, int phy_id, + int regnum, u16 data) +{ + struct mtk_star_priv *priv = mii->priv; + unsigned int val; + + if (regnum & MII_ADDR_C45) + return -EOPNOTSUPP; + + mtk_star_mdio_rwok_clear(priv); + + val = data; + val <<= MTK_STAR_OFF_PHY_CTRL0_RWDATA; + val &= MTK_STAR_MSK_PHY_CTRL0_RWDATA; + regnum <<= MTK_STAR_OFF_PHY_CTRL0_PREG; + regnum &= MTK_STAR_MSK_PHY_CTRL0_PREG; + val |= regnum; + val |= MTK_STAR_BIT_PHY_CTRL0_WTCMD; + + regmap_write(priv->regs, MTK_STAR_REG_PHY_CTRL0, val); + + return mtk_star_mdio_rwok_wait(priv); +} + +static int mtk_star_mdio_init(struct net_device *ndev) +{ + struct mtk_star_priv *priv = netdev_priv(ndev); + struct device *dev = mtk_star_get_dev(priv); + struct device_node *of_node, *mdio_node; + int ret; + + of_node = dev->of_node; + + mdio_node = of_get_child_by_name(of_node, "mdio"); + if (!mdio_node) + return -ENODEV; + + if (!of_device_is_available(mdio_node)) { + ret = -ENODEV; + goto out_put_node; + } + + priv->mii = devm_mdiobus_alloc(dev); + if (!priv->mii) { + ret = -ENOMEM; + goto out_put_node; + } + + snprintf(priv->mii->id, MII_BUS_ID_SIZE, "%s", dev_name(dev)); + priv->mii->name = "mtk-mac-mdio"; + priv->mii->parent = dev; + priv->mii->read = mtk_star_mdio_read; + priv->mii->write = mtk_star_mdio_write; + priv->mii->priv = priv; + + ret = of_mdiobus_register(priv->mii, mdio_node); + +out_put_node: + of_node_put(mdio_node); + return ret; +} + +static int mtk_star_suspend(struct device *dev) +{ + struct mtk_star_priv *priv; + struct net_device *ndev; + + ndev = dev_get_drvdata(dev); + priv = netdev_priv(ndev); + + if (netif_running(ndev)) + mtk_star_disable(ndev); + + clk_bulk_disable_unprepare(MTK_STAR_NCLKS, priv->clks); + + return 0; +} + +static int mtk_star_resume(struct device *dev) +{ + struct mtk_star_priv *priv; + struct net_device *ndev; + int ret; + + ndev = dev_get_drvdata(dev); + priv = netdev_priv(ndev); + + ret = clk_bulk_prepare_enable(MTK_STAR_NCLKS, priv->clks); + if (ret) + return ret; + + if (netif_running(ndev)) { + ret = mtk_star_enable(ndev); + if (ret) + clk_bulk_disable_unprepare(MTK_STAR_NCLKS, priv->clks); + } + + return ret; +} + +static void mtk_star_clk_disable_unprepare(void *data) +{ + struct mtk_star_priv *priv = data; + + 
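+ /* Devres callback registered with devm_add_action_or_reset() in
+ * mtk_star_probe(); it releases the bulk clocks on probe failure or on
+ * driver removal.
+ */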
clk_bulk_disable_unprepare(MTK_STAR_NCLKS, priv->clks); +} + +static void mtk_star_mdiobus_unregister(void *data) +{ + struct mtk_star_priv *priv = data; + + mdiobus_unregister(priv->mii); +} + +static void mtk_star_unregister_netdev(void *data) +{ + struct net_device *ndev = data; + + unregister_netdev(ndev); +} + +static int mtk_star_probe(struct platform_device *pdev) +{ + struct device_node *of_node; + struct mtk_star_priv *priv; + struct net_device *ndev; + struct device *dev; + void __iomem *base; + int ret, i; + + dev = &pdev->dev; + of_node = dev->of_node; + + ndev = devm_alloc_etherdev(dev, sizeof(*priv)); + if (!ndev) + return -ENOMEM; + + priv = netdev_priv(ndev); + priv->ndev = ndev; + SET_NETDEV_DEV(ndev, dev); + platform_set_drvdata(pdev, ndev); + + ndev->min_mtu = ETH_ZLEN; + ndev->max_mtu = MTK_STAR_MAX_FRAME_SIZE; + + spin_lock_init(&priv->lock); + INIT_WORK(&priv->stats_work, mtk_star_update_stats_work); + + base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(base)) + return PTR_ERR(base); + + /* We won't be checking the return values of regmap read & write + * functions. They can only fail for mmio if there's a clock attached + * to regmap which is not the case here. + */ + priv->regs = devm_regmap_init_mmio(dev, base, + &mtk_star_regmap_config); + if (IS_ERR(priv->regs)) + return PTR_ERR(priv->regs); + + priv->pericfg = syscon_regmap_lookup_by_phandle(of_node, + "mediatek,pericfg"); + if (IS_ERR(priv->pericfg)) { + dev_err(dev, "Failed to lookup the PERICFG syscon\n"); + return PTR_ERR(priv->pericfg); + } + + ndev->irq = platform_get_irq(pdev, 0); + if (ndev->irq < 0) + return ndev->irq; + + for (i = 0; i < MTK_STAR_NCLKS; i++) + priv->clks[i].id = mtk_star_clk_names[i]; + ret = devm_clk_bulk_get(dev, MTK_STAR_NCLKS, priv->clks); + if (ret) + return ret; + + ret = clk_bulk_prepare_enable(MTK_STAR_NCLKS, priv->clks); + if (ret) + return ret; + + ret = devm_add_action_or_reset(dev, + mtk_star_clk_disable_unprepare, priv); + if (ret) + return ret; + + ret = of_get_phy_mode(of_node, &priv->phy_intf); + if (ret) { + return ret; + } else if (priv->phy_intf != PHY_INTERFACE_MODE_RMII) { + dev_err(dev, "unsupported phy mode: %s\n", + phy_modes(priv->phy_intf)); + return -EINVAL; + } + + priv->phy_node = of_parse_phandle(of_node, "phy-handle", 0); + if (!priv->phy_node) { + dev_err(dev, "failed to retrieve the phy handle from device tree\n"); + return -ENODEV; + } + + mtk_star_set_mode_rmii(priv); + + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)); + if (ret) { + dev_err(dev, "unsupported DMA mask\n"); + return ret; + } + + priv->ring_base = dmam_alloc_coherent(dev, MTK_STAR_DMA_SIZE, + &priv->dma_addr, + GFP_KERNEL | GFP_DMA); + if (!priv->ring_base) + return -ENOMEM; + + mtk_star_nic_disable_pd(priv); + mtk_star_init_config(priv); + + ret = mtk_star_mdio_init(ndev); + if (ret) + return ret; + + ret = devm_add_action_or_reset(dev, mtk_star_mdiobus_unregister, priv); + if (ret) + return ret; + + ret = eth_platform_get_mac_address(dev, ndev->dev_addr); + if (ret || !is_valid_ether_addr(ndev->dev_addr)) + eth_hw_addr_random(ndev); + + ndev->netdev_ops = &mtk_star_netdev_ops; + ndev->ethtool_ops = &mtk_star_ethtool_ops; + + netif_napi_add(ndev, &priv->napi, mtk_star_poll, MTK_STAR_NAPI_WEIGHT); + + ret = register_netdev(ndev); + if (ret) + return ret; + + ret = devm_add_action_or_reset(dev, mtk_star_unregister_netdev, ndev); + if (ret) + return ret; + + return 0; +} + +static const struct of_device_id mtk_star_of_match[] = { + { .compatible = "mediatek,mt8516-eth", 
}, + { .compatible = "mediatek,mt8518-eth", }, + { .compatible = "mediatek,mt8175-eth", }, + { } +}; +MODULE_DEVICE_TABLE(of, mtk_star_of_match); + +static SIMPLE_DEV_PM_OPS(mtk_star_pm_ops, + mtk_star_suspend, mtk_star_resume); + +static struct platform_driver mtk_star_driver = { + .driver = { + .name = MTK_STAR_DRVNAME, + .pm = &mtk_star_pm_ops, + .of_match_table = of_match_ptr(mtk_star_of_match), + }, + .probe = mtk_star_probe, +}; +module_platform_driver(mtk_star_driver); + +MODULE_AUTHOR("Bartosz Golaszewski <[email protected]>"); +MODULE_DESCRIPTION("Mediatek STAR Ethernet MAC Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 7d69a3061f17..4256d59eca2b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -78,9 +78,24 @@ config MLX5_ESWITCH Legacy SRIOV mode (L2 mac vlan steering based). Switchdev mode (eswitch offloads). +config MLX5_CLS_ACT + bool "MLX5 TC classifier action support" + depends on MLX5_ESWITCH && NET_CLS_ACT + default y + help + mlx5 ConnectX offloads support for TC classifier action (NET_CLS_ACT), + works in both native NIC mode and Switchdev SRIOV mode. + Actions get attached to hardware offloaded classifiers and are + invoked after a successful classification. Actions are used to + overwrite the classification result, instantly drop or redirect and/or + reformat packets at wire speed without involving the host CPU. + + If set to N, TC offloads in both NIC and switchdev modes will be disabled. + If unsure, set to Y. + config MLX5_TC_CT bool "MLX5 TC connection tracking offload support" - depends on MLX5_CORE_EN && NET_SWITCHDEV && NF_FLOW_TABLE && NET_ACT_CT && NET_TC_SKB_EXT + depends on MLX5_CLS_ACT && NF_FLOW_TABLE && NET_ACT_CT && NET_TC_SKB_EXT default y help Say Y here if you want to support offloading connection tracking rules diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index d3c7dbd7f1d5..e5ee9103fefb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -33,17 +33,19 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ mlx5_core-$(CONFIG_MLX5_EN_ARFS) += en_arfs.o mlx5_core-$(CONFIG_MLX5_EN_RXNFC) += en_fs_ethtool.o mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o -mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o en_tc.o en/tc_tun.o lib/port_tun.o lag_mp.o \ - lib/geneve.o en/mapping.o en/tc_tun_vxlan.o en/tc_tun_gre.o \ - en/tc_tun_geneve.o diag/en_tc_tracepoint.o mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o +mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o lib/geneve.o lib/port_tun.o lag_mp.o +mlx5_core-$(CONFIG_MLX5_CLS_ACT) += en_tc.o en/rep/tc.o en/rep/neigh.o \ + en/mapping.o esw/chains.o en/tc_tun.o \ + en/tc_tun_vxlan.o en/tc_tun_gre.o en/tc_tun_geneve.o \ + en/tc_tun_mplsoudp.o diag/en_tc_tracepoint.o mlx5_core-$(CONFIG_MLX5_TC_CT) += en/tc_ct.o # # Core extra # mlx5_core-$(CONFIG_MLX5_ESWITCH) += eswitch.o eswitch_offloads.o eswitch_offloads_termtbl.o \ - ecpf.o rdma.o esw/chains.o + ecpf.o rdma.o mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o mlx5_core-$(CONFIG_VXLAN) += lib/vxlan.o mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 81fd53569463..4906aee6798d 100644 ---
a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -365,10 +365,7 @@ struct mlx5e_dma_info { dma_addr_t addr; union { struct page *page; - struct { - u64 handle; - void *data; - } xsk; + struct xdp_buff *xsk; }; }; @@ -581,7 +578,6 @@ struct mlx5e_rq { } mpwqe; }; struct { - u16 umem_headroom; u16 headroom; u32 frame0_sz; u8 map_dir; /* dma map direction */ @@ -614,7 +610,6 @@ struct mlx5e_rq { struct page_pool *page_pool; /* AF_XDP zero-copy */ - struct zero_copy_allocator zca; struct xdp_umem *umem; struct work_struct recover_work; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index eb2e1f2138e4..38e4f19d69f8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -12,15 +12,16 @@ static inline bool mlx5e_rx_is_xdp(struct mlx5e_params *params, u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk) { - u16 headroom = NET_IP_ALIGN; + u16 headroom; - if (mlx5e_rx_is_xdp(params, xsk)) { + if (xsk) + return xsk->headroom; + + headroom = NET_IP_ALIGN; + if (mlx5e_rx_is_xdp(params, xsk)) headroom += XDP_PACKET_HEADROOM; - if (xsk) - headroom += xsk->headroom; - } else { + else headroom += MLX5_RX_HEADROOM; - } return headroom; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c new file mode 100644 index 000000000000..baa162432e75 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c @@ -0,0 +1,368 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies. */ + +#include <linux/refcount.h> +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/rtnetlink.h> +#include <linux/workqueue.h> +#include <linux/rwlock.h> +#include <linux/spinlock.h> +#include <linux/notifier.h> +#include <net/netevent.h> +#include "neigh.h" +#include "tc.h" +#include "en_rep.h" +#include "fs_core.h" +#include "diag/en_rep_tracepoint.h" + +static unsigned long mlx5e_rep_ipv6_interval(void) +{ + if (IS_ENABLED(CONFIG_IPV6) && ipv6_stub->nd_tbl) + return NEIGH_VAR(&ipv6_stub->nd_tbl->parms, DELAY_PROBE_TIME); + + return ~0UL; +} + +static void mlx5e_rep_neigh_update_init_interval(struct mlx5e_rep_priv *rpriv) +{ + unsigned long ipv4_interval = NEIGH_VAR(&arp_tbl.parms, DELAY_PROBE_TIME); + unsigned long ipv6_interval = mlx5e_rep_ipv6_interval(); + struct net_device *netdev = rpriv->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + + rpriv->neigh_update.min_interval = min_t(unsigned long, ipv6_interval, ipv4_interval); + mlx5_fc_update_sampling_interval(priv->mdev, rpriv->neigh_update.min_interval); +} + +void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + + mlx5_fc_queue_stats_work(priv->mdev, + &neigh_update->neigh_stats_work, + neigh_update->min_interval); +} + +static bool mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe) +{ + return refcount_inc_not_zero(&nhe->refcnt); +} + +static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe); + +void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe) +{ + if (refcount_dec_and_test(&nhe->refcnt)) { + mlx5e_rep_neigh_entry_remove(nhe); + kfree_rcu(nhe, rcu); + } +} + +static struct mlx5e_neigh_hash_entry * 
+mlx5e_get_next_nhe(struct mlx5e_rep_priv *rpriv, + struct mlx5e_neigh_hash_entry *nhe) +{ + struct mlx5e_neigh_hash_entry *next = NULL; + + rcu_read_lock(); + + for (next = nhe ? + list_next_or_null_rcu(&rpriv->neigh_update.neigh_list, + &nhe->neigh_list, + struct mlx5e_neigh_hash_entry, + neigh_list) : + list_first_or_null_rcu(&rpriv->neigh_update.neigh_list, + struct mlx5e_neigh_hash_entry, + neigh_list); + next; + next = list_next_or_null_rcu(&rpriv->neigh_update.neigh_list, + &next->neigh_list, + struct mlx5e_neigh_hash_entry, + neigh_list)) + if (mlx5e_rep_neigh_entry_hold(next)) + break; + + rcu_read_unlock(); + + if (nhe) + mlx5e_rep_neigh_entry_release(nhe); + + return next; +} + +static void mlx5e_rep_neigh_stats_work(struct work_struct *work) +{ + struct mlx5e_rep_priv *rpriv = container_of(work, struct mlx5e_rep_priv, + neigh_update.neigh_stats_work.work); + struct net_device *netdev = rpriv->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_neigh_hash_entry *nhe = NULL; + + rtnl_lock(); + if (!list_empty(&rpriv->neigh_update.neigh_list)) + mlx5e_rep_queue_neigh_stats_work(priv); + + while ((nhe = mlx5e_get_next_nhe(rpriv, nhe)) != NULL) + mlx5e_tc_update_neigh_used_value(nhe); + + rtnl_unlock(); +} + +static void mlx5e_rep_neigh_update(struct work_struct *work) +{ + struct mlx5e_neigh_hash_entry *nhe = + container_of(work, struct mlx5e_neigh_hash_entry, neigh_update_work); + struct neighbour *n = nhe->n; + struct mlx5e_encap_entry *e; + unsigned char ha[ETH_ALEN]; + struct mlx5e_priv *priv; + bool neigh_connected; + u8 nud_state, dead; + + rtnl_lock(); + + /* If these parameters are changed after we release the lock, + * we'll receive another event letting us know about it. + * We use this lock to avoid inconsistency between the neigh validity + * and it's hw address. + */ + read_lock_bh(&n->lock); + memcpy(ha, n->ha, ETH_ALEN); + nud_state = n->nud_state; + dead = n->dead; + read_unlock_bh(&n->lock); + + neigh_connected = (nud_state & NUD_VALID) && !dead; + + trace_mlx5e_rep_neigh_update(nhe, ha, neigh_connected); + + list_for_each_entry(e, &nhe->encap_list, encap_list) { + if (!mlx5e_encap_take(e)) + continue; + + priv = netdev_priv(e->out_dev); + mlx5e_rep_update_flows(priv, e, neigh_connected, ha); + mlx5e_encap_put(priv, e); + } + mlx5e_rep_neigh_entry_release(nhe); + rtnl_unlock(); + neigh_release(n); +} + +static void mlx5e_rep_queue_neigh_update_work(struct mlx5e_priv *priv, + struct mlx5e_neigh_hash_entry *nhe, + struct neighbour *n) +{ + /* Take a reference to ensure the neighbour and mlx5 encap + * entry won't be destructed until we drop the reference in + * delayed work. 
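+ * Both references are dropped either at the end of mlx5e_rep_neigh_update()
+ * or right below if the update work could not be queued.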
+ */ + neigh_hold(n); + + /* This assignment is valid as long as the neigh reference + * is taken + */ + nhe->n = n; + + if (!queue_work(priv->wq, &nhe->neigh_update_work)) { + mlx5e_rep_neigh_entry_release(nhe); + neigh_release(n); + } +} + +static int mlx5e_rep_netevent_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv, + neigh_update.netevent_nb); + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + struct net_device *netdev = rpriv->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_neigh_hash_entry *nhe = NULL; + struct mlx5e_neigh m_neigh = {}; + struct neigh_parms *p; + struct neighbour *n; + bool found = false; + + switch (event) { + case NETEVENT_NEIGH_UPDATE: + n = ptr; +#if IS_ENABLED(CONFIG_IPV6) + if (n->tbl != ipv6_stub->nd_tbl && n->tbl != &arp_tbl) +#else + if (n->tbl != &arp_tbl) +#endif + return NOTIFY_DONE; + + m_neigh.dev = n->dev; + m_neigh.family = n->ops->family; + memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len); + + rcu_read_lock(); + nhe = mlx5e_rep_neigh_entry_lookup(priv, &m_neigh); + rcu_read_unlock(); + if (!nhe) + return NOTIFY_DONE; + + mlx5e_rep_queue_neigh_update_work(priv, nhe, n); + break; + + case NETEVENT_DELAY_PROBE_TIME_UPDATE: + p = ptr; + + /* We check the device is present since we don't care about + * changes in the default table, we only care about changes + * done per device delay probe time parameter. + */ +#if IS_ENABLED(CONFIG_IPV6) + if (!p->dev || (p->tbl != ipv6_stub->nd_tbl && p->tbl != &arp_tbl)) +#else + if (!p->dev || p->tbl != &arp_tbl) +#endif + return NOTIFY_DONE; + + rcu_read_lock(); + list_for_each_entry_rcu(nhe, &neigh_update->neigh_list, + neigh_list) { + if (p->dev == nhe->m_neigh.dev) { + found = true; + break; + } + } + rcu_read_unlock(); + if (!found) + return NOTIFY_DONE; + + neigh_update->min_interval = min_t(unsigned long, + NEIGH_VAR(p, DELAY_PROBE_TIME), + neigh_update->min_interval); + mlx5_fc_update_sampling_interval(priv->mdev, + neigh_update->min_interval); + break; + } + return NOTIFY_DONE; +} + +static const struct rhashtable_params mlx5e_neigh_ht_params = { + .head_offset = offsetof(struct mlx5e_neigh_hash_entry, rhash_node), + .key_offset = offsetof(struct mlx5e_neigh_hash_entry, m_neigh), + .key_len = sizeof(struct mlx5e_neigh), + .automatic_shrinking = true, +}; + +int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + int err; + + err = rhashtable_init(&neigh_update->neigh_ht, &mlx5e_neigh_ht_params); + if (err) + return err; + + INIT_LIST_HEAD(&neigh_update->neigh_list); + mutex_init(&neigh_update->encap_lock); + INIT_DELAYED_WORK(&neigh_update->neigh_stats_work, + mlx5e_rep_neigh_stats_work); + mlx5e_rep_neigh_update_init_interval(rpriv); + + rpriv->neigh_update.netevent_nb.notifier_call = mlx5e_rep_netevent_event; + err = register_netevent_notifier(&rpriv->neigh_update.netevent_nb); + if (err) + goto out_err; + return 0; + +out_err: + rhashtable_destroy(&neigh_update->neigh_ht); + return err; +} + +void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + + unregister_netevent_notifier(&neigh_update->netevent_nb); + + flush_workqueue(priv->wq); /* flush neigh update works */ + + cancel_delayed_work_sync(&rpriv->neigh_update.neigh_stats_work); + +
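+ /* All neigh update and stats work has been flushed or cancelled above, so
+ * it is now safe to destroy the lock and the hash table.
+ */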
mutex_destroy(&neigh_update->encap_lock); + rhashtable_destroy(&neigh_update->neigh_ht); +} + +static int mlx5e_rep_neigh_entry_insert(struct mlx5e_priv *priv, + struct mlx5e_neigh_hash_entry *nhe) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + int err; + + err = rhashtable_insert_fast(&rpriv->neigh_update.neigh_ht, + &nhe->rhash_node, + mlx5e_neigh_ht_params); + if (err) + return err; + + list_add_rcu(&nhe->neigh_list, &rpriv->neigh_update.neigh_list); + + return err; +} + +static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe) +{ + struct mlx5e_rep_priv *rpriv = nhe->priv->ppriv; + + mutex_lock(&rpriv->neigh_update.encap_lock); + + list_del_rcu(&nhe->neigh_list); + + rhashtable_remove_fast(&rpriv->neigh_update.neigh_ht, + &nhe->rhash_node, + mlx5e_neigh_ht_params); + mutex_unlock(&rpriv->neigh_update.encap_lock); +} + +/* This function must only be called under the representor's encap_lock or + * inside rcu read lock section. + */ +struct mlx5e_neigh_hash_entry * +mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv, + struct mlx5e_neigh *m_neigh) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + struct mlx5e_neigh_hash_entry *nhe; + + nhe = rhashtable_lookup_fast(&neigh_update->neigh_ht, m_neigh, + mlx5e_neigh_ht_params); + return nhe && mlx5e_rep_neigh_entry_hold(nhe) ? nhe : NULL; +} + +int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct mlx5e_neigh_hash_entry **nhe) +{ + int err; + + *nhe = kzalloc(sizeof(**nhe), GFP_KERNEL); + if (!*nhe) + return -ENOMEM; + + (*nhe)->priv = priv; + memcpy(&(*nhe)->m_neigh, &e->m_neigh, sizeof(e->m_neigh)); + INIT_WORK(&(*nhe)->neigh_update_work, mlx5e_rep_neigh_update); + spin_lock_init(&(*nhe)->encap_list_lock); + INIT_LIST_HEAD(&(*nhe)->encap_list); + refcount_set(&(*nhe)->refcnt, 1); + + err = mlx5e_rep_neigh_entry_insert(priv, *nhe); + if (err) + goto out_free; + return 0; + +out_free: + kfree(*nhe); + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.h new file mode 100644 index 000000000000..32b239189c95 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies. 
*/ + +#ifndef __MLX5_EN_REP_NEIGH__ +#define __MLX5_EN_REP_NEIGH__ + +#include "en.h" +#include "en_rep.h" + +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + +int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv); +void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv); + +struct mlx5e_neigh_hash_entry * +mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv, + struct mlx5e_neigh *m_neigh); +int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct mlx5e_neigh_hash_entry **nhe); +void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe); + +void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv); + +#else /* CONFIG_MLX5_CLS_ACT */ + +static inline int +mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv) { return 0; } +static inline void +mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv) {} + +#endif /* CONFIG_MLX5_CLS_ACT */ + +#endif /* __MLX5_EN_REP_NEIGH__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c new file mode 100644 index 000000000000..c609a5e50ebc --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c @@ -0,0 +1,711 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies. */ + +#include <net/dst_metadata.h> +#include <linux/netdevice.h> +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/rtnetlink.h> +#include <linux/workqueue.h> +#include <linux/spinlock.h> +#include "tc.h" +#include "neigh.h" +#include "en_rep.h" +#include "eswitch.h" +#include "esw/chains.h" +#include "en/tc_ct.h" +#include "en/mapping.h" +#include "en/tc_tun.h" +#include "lib/port_tun.h" + +struct mlx5e_rep_indr_block_priv { + struct net_device *netdev; + struct mlx5e_rep_priv *rpriv; + + struct list_head list; +}; + +int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; + struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy; + struct mlx5e_neigh_hash_entry *nhe; + int err; + + err = mlx5_tun_entropy_refcount_inc(tun_entropy, e->reformat_type); + if (err) + return err; + + mutex_lock(&rpriv->neigh_update.encap_lock); + nhe = mlx5e_rep_neigh_entry_lookup(priv, &e->m_neigh); + if (!nhe) { + err = mlx5e_rep_neigh_entry_create(priv, e, &nhe); + if (err) { + mutex_unlock(&rpriv->neigh_update.encap_lock); + mlx5_tun_entropy_refcount_dec(tun_entropy, + e->reformat_type); + return err; + } + } + + e->nhe = nhe; + spin_lock(&nhe->encap_list_lock); + list_add_rcu(&e->encap_list, &nhe->encap_list); + spin_unlock(&nhe->encap_list_lock); + + mutex_unlock(&rpriv->neigh_update.encap_lock); + + return 0; +} + +void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; + struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy; + + if (!e->nhe) + return; + + spin_lock(&e->nhe->encap_list_lock); + list_del_rcu(&e->encap_list); + spin_unlock(&e->nhe->encap_list_lock); + + mlx5e_rep_neigh_entry_release(e->nhe); + e->nhe = NULL; + mlx5_tun_entropy_refcount_dec(tun_entropy, e->reformat_type); +} + +void mlx5e_rep_update_flows(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + bool neigh_connected, + unsigned char ha[ETH_ALEN]) +{ + struct ethhdr *eth = (struct ethhdr *)e->encap_header; + struct 
mlx5_eswitch *esw = priv->mdev->priv.eswitch; + bool encap_connected; + LIST_HEAD(flow_list); + + ASSERT_RTNL(); + + /* wait for encap to be fully initialized */ + wait_for_completion(&e->res_ready); + + mutex_lock(&esw->offloads.encap_tbl_lock); + encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID); + if (e->compl_result < 0 || (encap_connected == neigh_connected && + ether_addr_equal(e->h_dest, ha))) + goto unlock; + + mlx5e_take_all_encap_flows(e, &flow_list); + + if ((e->flags & MLX5_ENCAP_ENTRY_VALID) && + (!neigh_connected || !ether_addr_equal(e->h_dest, ha))) + mlx5e_tc_encap_flows_del(priv, e, &flow_list); + + if (neigh_connected && !(e->flags & MLX5_ENCAP_ENTRY_VALID)) { + ether_addr_copy(e->h_dest, ha); + ether_addr_copy(eth->h_dest, ha); + /* Update the encap source mac, in case that we delete + * the flows when encap source mac changed. + */ + ether_addr_copy(eth->h_source, e->route_dev->dev_addr); + + mlx5e_tc_encap_flows_add(priv, e, &flow_list); + } +unlock: + mutex_unlock(&esw->offloads.encap_tbl_lock); + mlx5e_put_encap_flow_list(priv, &flow_list); +} + +static int +mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv, + struct flow_cls_offload *cls_flower, int flags) +{ + switch (cls_flower->command) { + case FLOW_CLS_REPLACE: + return mlx5e_configure_flower(priv->netdev, priv, cls_flower, + flags); + case FLOW_CLS_DESTROY: + return mlx5e_delete_flower(priv->netdev, priv, cls_flower, + flags); + case FLOW_CLS_STATS: + return mlx5e_stats_flower(priv->netdev, priv, cls_flower, + flags); + default: + return -EOPNOTSUPP; + } +} + +static +int mlx5e_rep_setup_tc_cls_matchall(struct mlx5e_priv *priv, + struct tc_cls_matchall_offload *ma) +{ + switch (ma->command) { + case TC_CLSMATCHALL_REPLACE: + return mlx5e_tc_configure_matchall(priv, ma); + case TC_CLSMATCHALL_DESTROY: + return mlx5e_tc_delete_matchall(priv, ma); + case TC_CLSMATCHALL_STATS: + mlx5e_tc_stats_matchall(priv, ma); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD); + struct mlx5e_priv *priv = cb_priv; + + switch (type) { + case TC_SETUP_CLSFLOWER: + return mlx5e_rep_setup_tc_cls_flower(priv, type_data, flags); + case TC_SETUP_CLSMATCHALL: + return mlx5e_rep_setup_tc_cls_matchall(priv, type_data); + default: + return -EOPNOTSUPP; + } +} + +static int mlx5e_rep_setup_ft_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + struct flow_cls_offload tmp, *f = type_data; + struct mlx5e_priv *priv = cb_priv; + struct mlx5_eswitch *esw; + unsigned long flags; + int err; + + flags = MLX5_TC_FLAG(INGRESS) | + MLX5_TC_FLAG(ESW_OFFLOAD) | + MLX5_TC_FLAG(FT_OFFLOAD); + esw = priv->mdev->priv.eswitch; + + switch (type) { + case TC_SETUP_CLSFLOWER: + memcpy(&tmp, f, sizeof(*f)); + + if (!mlx5_esw_chains_prios_supported(esw)) + return -EOPNOTSUPP; + + /* Re-use tc offload path by moving the ft flow to the + * reserved ft chain. + * + * FT offload can use prio range [0, INT_MAX], so we normalize + * it to range [1, mlx5_esw_chains_get_prio_range(esw)] + * as with tc, where prio 0 isn't supported. + * + * We only support chain 0 of FT offload. 
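+ * For example, an FT rule at chain 0, prio 0 is installed on the reserved
+ * FT chain with prio 1.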
+ */ + if (tmp.common.prio >= mlx5_esw_chains_get_prio_range(esw)) + return -EOPNOTSUPP; + if (tmp.common.chain_index != 0) + return -EOPNOTSUPP; + + tmp.common.chain_index = mlx5_esw_chains_get_ft_chain(esw); + tmp.common.prio++; + err = mlx5e_rep_setup_tc_cls_flower(priv, &tmp, flags); + memcpy(&f->stats, &tmp.stats, sizeof(f->stats)); + return err; + default: + return -EOPNOTSUPP; + } +} + +static LIST_HEAD(mlx5e_rep_block_tc_cb_list); +static LIST_HEAD(mlx5e_rep_block_ft_cb_list); +int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct flow_block_offload *f = type_data; + + f->unlocked_driver_cb = true; + + switch (type) { + case TC_SETUP_BLOCK: + return flow_block_cb_setup_simple(type_data, + &mlx5e_rep_block_tc_cb_list, + mlx5e_rep_setup_tc_cb, + priv, priv, true); + case TC_SETUP_FT: + return flow_block_cb_setup_simple(type_data, + &mlx5e_rep_block_ft_cb_list, + mlx5e_rep_setup_ft_cb, + priv, priv, true); + default: + return -EOPNOTSUPP; + } +} + +int mlx5e_rep_tc_init(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; + int err; + + mutex_init(&uplink_priv->unready_flows_lock); + INIT_LIST_HEAD(&uplink_priv->unready_flows); + + /* init shared tc flow table */ + err = mlx5e_tc_esw_init(&uplink_priv->tc_ht); + return err; +} + +void mlx5e_rep_tc_cleanup(struct mlx5e_rep_priv *rpriv) +{ + /* delete shared tc flow table */ + mlx5e_tc_esw_cleanup(&rpriv->uplink_priv.tc_ht); + mutex_destroy(&rpriv->uplink_priv.unready_flows_lock); +} + +void mlx5e_rep_tc_enable(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + INIT_WORK(&rpriv->uplink_priv.reoffload_flows_work, + mlx5e_tc_reoffload_flows_work); +} + +void mlx5e_rep_tc_disable(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + cancel_work_sync(&rpriv->uplink_priv.reoffload_flows_work); +} + +int mlx5e_rep_tc_event_port_affinity(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + queue_work(priv->wq, &rpriv->uplink_priv.reoffload_flows_work); + + return NOTIFY_OK; +} + +static struct mlx5e_rep_indr_block_priv * +mlx5e_rep_indr_block_priv_lookup(struct mlx5e_rep_priv *rpriv, + struct net_device *netdev) +{ + struct mlx5e_rep_indr_block_priv *cb_priv; + + /* All callback list access should be protected by RTNL. 
*/ + ASSERT_RTNL(); + + list_for_each_entry(cb_priv, + &rpriv->uplink_priv.tc_indr_block_priv_list, + list) + if (cb_priv->netdev == netdev) + return cb_priv; + + return NULL; +} + +static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv, + struct net_device *netdev); + +void mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5e_rep_indr_block_priv *cb_priv, *temp; + struct list_head *head = &rpriv->uplink_priv.tc_indr_block_priv_list; + + list_for_each_entry_safe(cb_priv, temp, head, list) { + mlx5e_rep_indr_unregister_block(rpriv, cb_priv->netdev); + kfree(cb_priv); + } +} + +static int +mlx5e_rep_indr_offload(struct net_device *netdev, + struct flow_cls_offload *flower, + struct mlx5e_rep_indr_block_priv *indr_priv, + unsigned long flags) +{ + struct mlx5e_priv *priv = netdev_priv(indr_priv->rpriv->netdev); + int err = 0; + + switch (flower->command) { + case FLOW_CLS_REPLACE: + err = mlx5e_configure_flower(netdev, priv, flower, flags); + break; + case FLOW_CLS_DESTROY: + err = mlx5e_delete_flower(netdev, priv, flower, flags); + break; + case FLOW_CLS_STATS: + err = mlx5e_stats_flower(netdev, priv, flower, flags); + break; + default: + err = -EOPNOTSUPP; + } + + return err; +} + +static int mlx5e_rep_indr_setup_tc_cb(enum tc_setup_type type, + void *type_data, void *indr_priv) +{ + unsigned long flags = MLX5_TC_FLAG(EGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD); + struct mlx5e_rep_indr_block_priv *priv = indr_priv; + + switch (type) { + case TC_SETUP_CLSFLOWER: + return mlx5e_rep_indr_offload(priv->netdev, type_data, priv, + flags); + default: + return -EOPNOTSUPP; + } +} + +static int mlx5e_rep_indr_setup_ft_cb(enum tc_setup_type type, + void *type_data, void *indr_priv) +{ + struct mlx5e_rep_indr_block_priv *priv = indr_priv; + struct flow_cls_offload *f = type_data; + struct flow_cls_offload tmp; + struct mlx5e_priv *mpriv; + struct mlx5_eswitch *esw; + unsigned long flags; + int err; + + mpriv = netdev_priv(priv->rpriv->netdev); + esw = mpriv->mdev->priv.eswitch; + + flags = MLX5_TC_FLAG(EGRESS) | + MLX5_TC_FLAG(ESW_OFFLOAD) | + MLX5_TC_FLAG(FT_OFFLOAD); + + switch (type) { + case TC_SETUP_CLSFLOWER: + memcpy(&tmp, f, sizeof(*f)); + + /* Re-use tc offload path by moving the ft flow to the + * reserved ft chain. + * + * FT offload can use prio range [0, INT_MAX], so we normalize + * it to range [1, mlx5_esw_chains_get_prio_range(esw)] + * as with tc, where prio 0 isn't supported. + * + * We only support chain 0 of FT offload. 
+ */ + if (!mlx5_esw_chains_prios_supported(esw) || + tmp.common.prio >= mlx5_esw_chains_get_prio_range(esw) || + tmp.common.chain_index) + return -EOPNOTSUPP; + + tmp.common.chain_index = mlx5_esw_chains_get_ft_chain(esw); + tmp.common.prio++; + err = mlx5e_rep_indr_offload(priv->netdev, &tmp, priv, flags); + memcpy(&f->stats, &tmp.stats, sizeof(f->stats)); + return err; + default: + return -EOPNOTSUPP; + } +} + +static void mlx5e_rep_indr_block_unbind(void *cb_priv) +{ + struct mlx5e_rep_indr_block_priv *indr_priv = cb_priv; + + list_del(&indr_priv->list); + kfree(indr_priv); +} + +static LIST_HEAD(mlx5e_block_cb_list); + +static int +mlx5e_rep_indr_setup_block(struct net_device *netdev, + struct mlx5e_rep_priv *rpriv, + struct flow_block_offload *f, + flow_setup_cb_t *setup_cb) +{ + struct mlx5e_rep_indr_block_priv *indr_priv; + struct flow_block_cb *block_cb; + + if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + return -EOPNOTSUPP; + + f->unlocked_driver_cb = true; + f->driver_block_list = &mlx5e_block_cb_list; + + switch (f->command) { + case FLOW_BLOCK_BIND: + indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev); + if (indr_priv) + return -EEXIST; + + indr_priv = kmalloc(sizeof(*indr_priv), GFP_KERNEL); + if (!indr_priv) + return -ENOMEM; + + indr_priv->netdev = netdev; + indr_priv->rpriv = rpriv; + list_add(&indr_priv->list, + &rpriv->uplink_priv.tc_indr_block_priv_list); + + block_cb = flow_block_cb_alloc(setup_cb, indr_priv, indr_priv, + mlx5e_rep_indr_block_unbind); + if (IS_ERR(block_cb)) { + list_del(&indr_priv->list); + kfree(indr_priv); + return PTR_ERR(block_cb); + } + flow_block_cb_add(block_cb, f); + list_add_tail(&block_cb->driver_list, &mlx5e_block_cb_list); + + return 0; + case FLOW_BLOCK_UNBIND: + indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev); + if (!indr_priv) + return -ENOENT; + + block_cb = flow_block_cb_lookup(f->block, setup_cb, indr_priv); + if (!block_cb) + return -ENOENT; + + flow_block_cb_remove(block_cb, f); + list_del(&block_cb->driver_list); + return 0; + default: + return -EOPNOTSUPP; + } + return 0; +} + +static +int mlx5e_rep_indr_setup_cb(struct net_device *netdev, void *cb_priv, + enum tc_setup_type type, void *type_data) +{ + switch (type) { + case TC_SETUP_BLOCK: + return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data, + mlx5e_rep_indr_setup_tc_cb); + case TC_SETUP_FT: + return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data, + mlx5e_rep_indr_setup_ft_cb); + default: + return -EOPNOTSUPP; + } +} + +static int mlx5e_rep_indr_register_block(struct mlx5e_rep_priv *rpriv, + struct net_device *netdev) +{ + int err; + + err = __flow_indr_block_cb_register(netdev, rpriv, + mlx5e_rep_indr_setup_cb, + rpriv); + if (err) { + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + + mlx5_core_err(priv->mdev, "Failed to register remote block notifier for %s err=%d\n", + netdev_name(netdev), err); + } + return err; +} + +static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv, + struct net_device *netdev) +{ + __flow_indr_block_cb_unregister(netdev, mlx5e_rep_indr_setup_cb, + rpriv); +} + +static int mlx5e_nic_rep_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv, + uplink_priv.netdevice_nb); + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + if (!mlx5e_tc_tun_device_to_offload(priv, netdev) && + !(is_vlan_dev(netdev) && 
vlan_dev_real_dev(netdev) == rpriv->netdev)) + return NOTIFY_OK; + + switch (event) { + case NETDEV_REGISTER: + mlx5e_rep_indr_register_block(rpriv, netdev); + break; + case NETDEV_UNREGISTER: + mlx5e_rep_indr_unregister_block(rpriv, netdev); + break; + } + return NOTIFY_OK; +} + +int mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; + int err; + + /* init indirect block notifications */ + INIT_LIST_HEAD(&uplink_priv->tc_indr_block_priv_list); + + uplink_priv->netdevice_nb.notifier_call = mlx5e_nic_rep_netdevice_event; + err = register_netdevice_notifier_dev_net(rpriv->netdev, + &uplink_priv->netdevice_nb, + &uplink_priv->netdevice_nn); + return err; +} + +void mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; + + /* clean indirect TC block notifications */ + unregister_netdevice_notifier_dev_net(rpriv->netdev, + &uplink_priv->netdevice_nb, + &uplink_priv->netdevice_nn); +} + +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) +static bool mlx5e_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb, + struct mlx5e_tc_update_priv *tc_priv, + u32 tunnel_id) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct tunnel_match_enc_opts enc_opts = {}; + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *uplink_rpriv; + struct metadata_dst *tun_dst; + struct tunnel_match_key key; + u32 tun_id, enc_opts_id; + struct net_device *dev; + int err; + + enc_opts_id = tunnel_id & ENC_OPTS_BITS_MASK; + tun_id = tunnel_id >> ENC_OPTS_BITS; + + if (!tun_id) + return true; + + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + + err = mapping_find(uplink_priv->tunnel_mapping, tun_id, &key); + if (err) { + WARN_ON_ONCE(true); + netdev_dbg(priv->netdev, + "Couldn't find tunnel for tun_id: %d, err: %d\n", + tun_id, err); + return false; + } + + if (enc_opts_id) { + err = mapping_find(uplink_priv->tunnel_enc_opts_mapping, + enc_opts_id, &enc_opts); + if (err) { + netdev_dbg(priv->netdev, + "Couldn't find tunnel (opts) for tun_id: %d, err: %d\n", + enc_opts_id, err); + return false; + } + } + + tun_dst = tun_rx_dst(enc_opts.key.len); + if (!tun_dst) { + WARN_ON_ONCE(true); + return false; + } + + ip_tunnel_key_init(&tun_dst->u.tun_info.key, + key.enc_ipv4.src, key.enc_ipv4.dst, + key.enc_ip.tos, key.enc_ip.ttl, + 0, /* label */ + key.enc_tp.src, key.enc_tp.dst, + key32_to_tunnel_id(key.enc_key_id.keyid), + TUNNEL_KEY); + + if (enc_opts.key.len) + ip_tunnel_info_opts_set(&tun_dst->u.tun_info, + enc_opts.key.data, + enc_opts.key.len, + enc_opts.key.dst_opt_type); + + skb_dst_set(skb, (struct dst_entry *)tun_dst); + dev = dev_get_by_index(&init_net, key.filter_ifindex); + if (!dev) { + netdev_dbg(priv->netdev, + "Couldn't find tunnel device with ifindex: %d\n", + key.filter_ifindex); + return false; + } + + /* Set tun_dev so we do dev_put() after datapath */ + tc_priv->tun_dev = dev; + + skb->dev = dev; + + return true; +} +#endif /* CONFIG_NET_TC_SKB_EXT */ + +bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe, + struct sk_buff *skb, + struct mlx5e_tc_update_priv *tc_priv) +{ +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + u32 chain = 0, reg_c0, reg_c1, tunnel_id, tuple_id; + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *uplink_rpriv; + struct tc_skb_ext *tc_skb_ext; + struct mlx5_eswitch *esw; + struct mlx5e_priv *priv; + int tunnel_moffset; + int err; 
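+ /* reg_c0 carries the flow tag written by the eswitch, from which the tc
+ * chain is looked up; reg_c1 packs the CT tuple id in its low bits and the
+ * tunnel mapping id in the bytes above the TUNNEL_TO_REG mapping offset,
+ * as extracted below.
+ */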
+ + reg_c0 = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK); + if (reg_c0 == MLX5_FS_DEFAULT_FLOW_TAG) + reg_c0 = 0; + reg_c1 = be32_to_cpu(cqe->ft_metadata); + + if (!reg_c0) + return true; + + priv = netdev_priv(skb->dev); + esw = priv->mdev->priv.eswitch; + + err = mlx5_eswitch_get_chain_for_tag(esw, reg_c0, &chain); + if (err) { + netdev_dbg(priv->netdev, + "Couldn't find chain for chain tag: %d, err: %d\n", + reg_c0, err); + return false; + } + + if (chain) { + tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT); + if (!tc_skb_ext) { + WARN_ON(1); + return false; + } + + tc_skb_ext->chain = chain; + + tuple_id = reg_c1 & TUPLE_ID_MAX; + + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + if (!mlx5e_tc_ct_restore_flow(uplink_priv, skb, tuple_id)) + return false; + } + + tunnel_moffset = mlx5e_tc_attr_to_reg_mappings[TUNNEL_TO_REG].moffset; + tunnel_id = reg_c1 >> (8 * tunnel_moffset); + return mlx5e_restore_tunnel(priv, skb, tc_priv, tunnel_id); +#endif /* CONFIG_NET_TC_SKB_EXT */ + + return true; +} + +void mlx5_rep_tc_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv) +{ + if (tc_priv->tun_dev) + dev_put(tc_priv->tun_dev); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h new file mode 100644 index 000000000000..86f92abf2fdd --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies. */ + +#ifndef __MLX5_EN_REP_TC_H__ +#define __MLX5_EN_REP_TC_H__ + +#include <linux/skbuff.h> +#include "en_tc.h" +#include "en_rep.h" + +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + +int mlx5e_rep_tc_init(struct mlx5e_rep_priv *rpriv); +void mlx5e_rep_tc_cleanup(struct mlx5e_rep_priv *rpriv); + +int mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv); +void mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv); + +void mlx5e_rep_tc_enable(struct mlx5e_priv *priv); +void mlx5e_rep_tc_disable(struct mlx5e_priv *priv); + +int mlx5e_rep_tc_event_port_affinity(struct mlx5e_priv *priv); + +void mlx5e_rep_update_flows(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + bool neigh_connected, + unsigned char ha[ETH_ALEN]); + +int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e); +void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e); + +int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data); +void mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv); + +bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe, + struct sk_buff *skb, + struct mlx5e_tc_update_priv *tc_priv); +void mlx5_rep_tc_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv); + +#else /* CONFIG_MLX5_CLS_ACT */ + +struct mlx5e_rep_priv; +static inline int +mlx5e_rep_tc_init(struct mlx5e_rep_priv *rpriv) { return 0; } +static inline void +mlx5e_rep_tc_cleanup(struct mlx5e_rep_priv *rpriv) {} + +static inline int +mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv) { return 0; } +static inline void +mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv) {} + +static inline void +mlx5e_rep_tc_enable(struct mlx5e_priv *priv) {} +static inline void +mlx5e_rep_tc_disable(struct mlx5e_priv *priv) {} + +static inline int +mlx5e_rep_tc_event_port_affinity(struct mlx5e_priv *priv) { return NOTIFY_DONE; } + +static inline 
int +mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) { return -EOPNOTSUPP; } + +static inline void +mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv) {} + +struct mlx5e_tc_update_priv; +static inline bool +mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe, + struct sk_buff *skb, + struct mlx5e_tc_update_priv *tc_priv) { return true; } +static inline void +mlx5_rep_tc_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv) {} + +#endif /* CONFIG_MLX5_CLS_ACT */ + +#endif /* __MLX5_EN_REP_TC_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index b45c3f46570b..e99382f58807 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -4,8 +4,11 @@ #include <net/vxlan.h> #include <net/gre.h> #include <net/geneve.h> +#include <net/bareudp.h> #include "en/tc_tun.h" #include "en_tc.h" +#include "rep/tc.h" +#include "rep/neigh.h" struct mlx5e_tc_tunnel *mlx5e_get_tc_tun(struct net_device *tunnel_dev) { @@ -16,6 +19,8 @@ struct mlx5e_tc_tunnel *mlx5e_get_tc_tun(struct net_device *tunnel_dev) else if (netif_is_gretap(tunnel_dev) || netif_is_ip6gretap(tunnel_dev)) return &gre_tunnel; + else if (netif_is_bareudp(tunnel_dev)) + return &mplsoudp_tunnel; else return NULL; } @@ -96,9 +101,8 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, } rt = ip_route_output_key(dev_net(mirred_dev), fl4); - ret = PTR_ERR_OR_ZERO(rt); - if (ret) - return ret; + if (IS_ERR(rt)) + return PTR_ERR(rt); if (mlx5_lag_is_multipath(mdev) && rt->rt_gw_family != AF_INET) { ip_rt_put(rt); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h index 1630f0ec3ad7..704359df6095 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h @@ -16,6 +16,7 @@ enum { MLX5E_TC_TUNNEL_TYPE_VXLAN, MLX5E_TC_TUNNEL_TYPE_GENEVE, MLX5E_TC_TUNNEL_TYPE_GRETAP, + MLX5E_TC_TUNNEL_TYPE_MPLSOUDP, }; struct mlx5e_tc_tunnel { @@ -46,6 +47,7 @@ struct mlx5e_tc_tunnel { extern struct mlx5e_tc_tunnel vxlan_tunnel; extern struct mlx5e_tc_tunnel geneve_tunnel; extern struct mlx5e_tc_tunnel gre_tunnel; +extern struct mlx5e_tc_tunnel mplsoudp_tunnel; struct mlx5e_tc_tunnel *mlx5e_get_tc_tun(struct net_device *tunnel_dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c new file mode 100644 index 000000000000..98ee62e427d2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2018 Mellanox Technologies. 
*/ + +#include <net/bareudp.h> +#include <net/mpls.h> +#include "en/tc_tun.h" + +static bool can_offload(struct mlx5e_priv *priv) +{ + return MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, reformat_l3_tunnel_to_l2); +} + +static int calc_hlen(struct mlx5e_encap_entry *e) +{ + return sizeof(struct udphdr) + MPLS_HLEN; +} + +static int init_encap_attr(struct net_device *tunnel_dev, + struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct netlink_ext_ack *extack) +{ + e->tunnel = &mplsoudp_tunnel; + e->reformat_type = MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL; + return 0; +} + +static inline __be32 mpls_label_id_field(__be32 label, u8 tos, u8 ttl) +{ + u32 res; + + /* mpls label is 32 bits long and construction as follows: + * 20 bits label + * 3 bits tos + * 1 bit bottom of stack. Since we support only one label, this bit is + * always set. + * 8 bits TTL + */ + res = be32_to_cpu(label) << 12 | 1 << 8 | (tos & 7) << 9 | ttl; + return cpu_to_be32(res); +} + +static int generate_ip_tun_hdr(char buf[], + __u8 *ip_proto, + struct mlx5e_encap_entry *r) +{ + const struct ip_tunnel_key *tun_key = &r->tun_info->key; + __be32 tun_id = tunnel_id_to_key32(tun_key->tun_id); + struct udphdr *udp = (struct udphdr *)(buf); + struct mpls_shim_hdr *mpls; + + mpls = (struct mpls_shim_hdr *)(udp + 1); + *ip_proto = IPPROTO_UDP; + + udp->dest = tun_key->tp_dst; + mpls->label_stack_entry = mpls_label_id_field(tun_id, tun_key->tos, tun_key->ttl); + + return 0; +} + +static int parse_udp_ports(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + void *headers_c, + void *headers_v) +{ + return mlx5e_tc_tun_parse_udp_ports(priv, spec, f, headers_c, headers_v); +} + +static int parse_tunnel(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + void *headers_c, + void *headers_v) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct flow_match_enc_keyid enc_keyid; + struct flow_match_mpls match; + void *misc2_c; + void *misc2_v; + + misc2_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_2); + misc2_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters_2); + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_MPLS)) + return 0; + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID)) + return 0; + + flow_rule_match_enc_keyid(rule, &enc_keyid); + + if (!enc_keyid.mask->keyid) + return 0; + + if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) & + MLX5_FLEX_PROTO_CW_MPLS_UDP)) + return -EOPNOTSUPP; + + flow_rule_match_mpls(rule, &match); + + MLX5_SET(fte_match_set_misc2, misc2_c, + outer_first_mpls_over_udp.mpls_label, match.mask->mpls_label); + MLX5_SET(fte_match_set_misc2, misc2_v, + outer_first_mpls_over_udp.mpls_label, match.key->mpls_label); + + MLX5_SET(fte_match_set_misc2, misc2_c, + outer_first_mpls_over_udp.mpls_exp, match.mask->mpls_tc); + MLX5_SET(fte_match_set_misc2, misc2_v, + outer_first_mpls_over_udp.mpls_exp, match.key->mpls_tc); + + MLX5_SET(fte_match_set_misc2, misc2_c, + outer_first_mpls_over_udp.mpls_s_bos, match.mask->mpls_bos); + MLX5_SET(fte_match_set_misc2, misc2_v, + outer_first_mpls_over_udp.mpls_s_bos, match.key->mpls_bos); + + MLX5_SET(fte_match_set_misc2, misc2_c, + outer_first_mpls_over_udp.mpls_ttl, match.mask->mpls_ttl); + MLX5_SET(fte_match_set_misc2, misc2_v, + outer_first_mpls_over_udp.mpls_ttl, match.key->mpls_ttl); + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2; + + return 0; +} + +struct mlx5e_tc_tunnel mplsoudp_tunnel = { + .tunnel_type 
= MLX5E_TC_TUNNEL_TYPE_MPLSOUDP, + .match_level = MLX5_MATCH_L4, + .can_offload = can_offload, + .calc_hlen = calc_hlen, + .init_encap_attr = init_encap_attr, + .generate_ip_tun_hdr = generate_ip_tun_hdr, + .parse_udp_ports = parse_udp_ports, + .parse_tunnel = parse_tunnel, +}; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 42202d19245c..3bea1d4be53b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -31,7 +31,7 @@ */ #include <linux/bpf_trace.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include "en/xdp.h" #include "en/params.h" @@ -71,7 +71,7 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq, xdptxd.data = xdpf->data; xdptxd.len = xdpf->len; - if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) { + if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { /* The xdp_buff was in the UMEM and was copied into a newly * allocated page. The UMEM page was returned via the ZCA, and * this new page has to be mapped at this point and has to be @@ -119,50 +119,33 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq, /* returns true if packet was consumed by xdp */ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, - void *va, u16 *rx_headroom, u32 *len, bool xsk) + u32 *len, struct xdp_buff *xdp) { struct bpf_prog *prog = READ_ONCE(rq->xdp_prog); - struct xdp_umem *umem = rq->umem; - struct xdp_buff xdp; u32 act; int err; if (!prog) return false; - xdp.data = va + *rx_headroom; - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = xdp.data + *len; - xdp.data_hard_start = va; - if (xsk) - xdp.handle = di->xsk.handle; - xdp.rxq = &rq->xdp_rxq; - xdp.frame_sz = rq->buff.frame0_sz; - - act = bpf_prog_run_xdp(prog, &xdp); - if (xsk) { - u64 off = xdp.data - xdp.data_hard_start; - - xdp.handle = xsk_umem_adjust_offset(umem, xdp.handle, off); - } + act = bpf_prog_run_xdp(prog, xdp); switch (act) { case XDP_PASS: - *rx_headroom = xdp.data - xdp.data_hard_start; - *len = xdp.data_end - xdp.data; + *len = xdp->data_end - xdp->data; return false; case XDP_TX: - if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, &xdp))) + if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, xdp))) goto xdp_abort; __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */ return true; case XDP_REDIRECT: /* When XDP enabled then page-refcnt==1 here */ - err = xdp_do_redirect(rq->netdev, &xdp, prog); + err = xdp_do_redirect(rq->netdev, xdp, prog); if (unlikely(err)) goto xdp_abort; __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); __set_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags); - if (!xsk) + if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL) mlx5e_page_dma_unmap(rq, di); rq->stats->xdp_redirect++; return true; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h index be64eb68f4e5..ca48c293151b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h @@ -61,7 +61,7 @@ struct mlx5e_xsk_param; int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk); bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, - void *va, u16 *rx_headroom, u32 *len, bool xsk); + u32 *len, struct xdp_buff *xdp); void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq); bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq); void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq); diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c index 62fc8a128a8d..a33a1f762c70 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c @@ -3,71 +3,10 @@ #include "rx.h" #include "en/xdp.h" -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> /* RX data path */ -bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count) -{ - /* Check in advance that we have enough frames, instead of allocating - * one-by-one, failing and moving frames to the Reuse Ring. - */ - return xsk_umem_has_addrs_rq(rq->umem, count); -} - -int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq, - struct mlx5e_dma_info *dma_info) -{ - struct xdp_umem *umem = rq->umem; - u64 handle; - - if (!xsk_umem_peek_addr_rq(umem, &handle)) - return -ENOMEM; - - dma_info->xsk.handle = xsk_umem_adjust_offset(umem, handle, - rq->buff.umem_headroom); - dma_info->xsk.data = xdp_umem_get_data(umem, dma_info->xsk.handle); - - /* No need to add headroom to the DMA address. In striding RQ case, we - * just provide pages for UMR, and headroom is counted at the setup - * stage when creating a WQE. In non-striding RQ case, headroom is - * accounted in mlx5e_alloc_rx_wqe. - */ - dma_info->addr = xdp_umem_get_dma(umem, handle); - - xsk_umem_release_addr_rq(umem); - - dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE, - DMA_BIDIRECTIONAL); - - return 0; -} - -static inline void mlx5e_xsk_recycle_frame(struct mlx5e_rq *rq, u64 handle) -{ - xsk_umem_fq_reuse(rq->umem, handle & rq->umem->chunk_mask); -} - -/* XSKRQ uses pages from UMEM, they must not be released. They are returned to - * the userspace if possible, and if not, this function is called to reuse them - * in the driver. - */ -void mlx5e_xsk_page_release(struct mlx5e_rq *rq, - struct mlx5e_dma_info *dma_info) -{ - mlx5e_xsk_recycle_frame(rq, dma_info->xsk.handle); -} - -/* Return a frame back to the hardware to fill in again. It is used by XDP when - * the XDP program returns XDP_TX or XDP_REDIRECT not to an XSKMAP. - */ -void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle) -{ - struct mlx5e_rq *rq = container_of(zca, struct mlx5e_rq, zca); - - mlx5e_xsk_recycle_frame(rq, handle); -} - static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, void *data, u32 cqe_bcnt) { @@ -90,11 +29,8 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, u32 head_offset, u32 page_idx) { - struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx]; - u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom; + struct xdp_buff *xdp = wi->umr.dma_info[page_idx].xsk; u32 cqe_bcnt32 = cqe_bcnt; - void *va, *data; - u32 frag_size; bool consumed; /* Check packet size. Note LRO doesn't use linear SKB */ @@ -103,22 +39,20 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, return NULL; } - /* head_offset is not used in this function, because di->xsk.data and - * di->addr point directly to the necessary place. Furthermore, in the - * current implementation, UMR pages are mapped to XSK frames, so + /* head_offset is not used in this function, because xdp->data and the + * DMA address point directly to the necessary place. Furthermore, in + * the current implementation, UMR pages are mapped to XSK frames, so * head_offset should always be 0. 
*/ WARN_ON_ONCE(head_offset); - va = di->xsk.data; - data = va + rx_headroom; - frag_size = rq->buff.headroom + cqe_bcnt32; - - dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL); - prefetch(data); + xdp->data_end = xdp->data + cqe_bcnt32; + xdp_set_data_meta_invalid(xdp); + xsk_buff_dma_sync_for_cpu(xdp); + prefetch(xdp->data); rcu_read_lock(); - consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, true); + consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt32, xdp); rcu_read_unlock(); /* Possible flows: @@ -145,7 +79,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, /* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the * frame. On SKB allocation failure, NULL is returned. */ - return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt32); + return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt32); } struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq, @@ -153,25 +87,20 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt) { - struct mlx5e_dma_info *di = wi->di; - u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom; - void *va, *data; + struct xdp_buff *xdp = wi->di->xsk; bool consumed; - u32 frag_size; - /* wi->offset is not used in this function, because di->xsk.data and - * di->addr point directly to the necessary place. Furthermore, in the - * current implementation, one page = one packet = one frame, so + /* wi->offset is not used in this function, because xdp->data and the + * DMA address point directly to the necessary place. Furthermore, the + * XSK allocator allocates frames per packet, instead of pages, so * wi->offset should always be 0. */ WARN_ON_ONCE(wi->offset); - va = di->xsk.data; - data = va + rx_headroom; - frag_size = rq->buff.headroom + cqe_bcnt; - - dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL); - prefetch(data); + xdp->data_end = xdp->data + cqe_bcnt; + xdp_set_data_meta_invalid(xdp); + xsk_buff_dma_sync_for_cpu(xdp); + prefetch(xdp->data); if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)) { rq->stats->wqe_err++; @@ -179,7 +108,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq, } rcu_read_lock(); - consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, true); + consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt, xdp); rcu_read_unlock(); if (likely(consumed)) @@ -189,5 +118,5 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq, * will be handled by mlx5e_put_rx_frag. * On SKB allocation failure, NULL is returned. 
*/ - return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt); + return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h index cab0e93497ae..d147b2f13b54 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h @@ -5,16 +5,10 @@ #define __MLX5_EN_XSK_RX_H__ #include "en.h" -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> /* RX data path */ -bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count); -int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq, - struct mlx5e_dma_info *dma_info); -void mlx5e_xsk_page_release(struct mlx5e_rq *rq, - struct mlx5e_dma_info *dma_info); -void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle); struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, u16 cqe_bcnt, @@ -25,6 +19,23 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt); +static inline int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info) +{ + dma_info->xsk = xsk_buff_alloc(rq->umem); + if (!dma_info->xsk) + return -ENOMEM; + + /* Store the DMA address without headroom. In striding RQ case, we just + * provide pages for UMR, and headroom is counted at the setup stage + * when creating a WQE. In non-striding RQ case, headroom is accounted + * in mlx5e_alloc_rx_wqe. + */ + dma_info->addr = xsk_buff_xdp_get_frame_dma(dma_info->xsk); + + return 0; +} + static inline bool mlx5e_xsk_update_rx_wakeup(struct mlx5e_rq *rq, bool alloc_err) { if (!xsk_umem_uses_need_wakeup(rq->umem)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c index 3bcdb5b2fc20..83dce9cdb8c2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c @@ -5,7 +5,7 @@ #include "umem.h" #include "en/xdp.h" #include "en/params.h" -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) { @@ -92,12 +92,11 @@ bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget) break; } - xdptxd.dma_addr = xdp_umem_get_dma(umem, desc.addr); - xdptxd.data = xdp_umem_get_data(umem, desc.addr); + xdptxd.dma_addr = xsk_buff_raw_get_dma(umem, desc.addr); + xdptxd.data = xsk_buff_raw_get_data(umem, desc.addr); xdptxd.len = desc.len; - dma_sync_single_for_device(sq->pdev, xdptxd.dma_addr, - xdptxd.len, DMA_BIDIRECTIONAL); + xsk_buff_raw_dma_sync_for_device(umem, xdptxd.dma_addr, xdptxd.len); if (unlikely(!sq->xmit_xdp_frame(sq, &xdptxd, &xdpi, check_result))) { if (sq->mpwqe.wqe) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h index 79b487d89757..39fa0a705856 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h @@ -5,7 +5,7 @@ #define __MLX5_EN_XSK_TX_H__ #include "en.h" -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> /* TX data path */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c index 4baaa5788320..7b17fcd0a56d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: 
GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2019 Mellanox Technologies. */ -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include "umem.h" #include "setup.h" #include "en/params.h" @@ -10,40 +10,14 @@ static int mlx5e_xsk_map_umem(struct mlx5e_priv *priv, struct xdp_umem *umem) { struct device *dev = priv->mdev->device; - u32 i; - for (i = 0; i < umem->npgs; i++) { - dma_addr_t dma = dma_map_page(dev, umem->pgs[i], 0, PAGE_SIZE, - DMA_BIDIRECTIONAL); - - if (unlikely(dma_mapping_error(dev, dma))) - goto err_unmap; - umem->pages[i].dma = dma; - } - - return 0; - -err_unmap: - while (i--) { - dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL); - umem->pages[i].dma = 0; - } - - return -ENOMEM; + return xsk_buff_dma_map(umem, dev, 0); } static void mlx5e_xsk_unmap_umem(struct mlx5e_priv *priv, struct xdp_umem *umem) { - struct device *dev = priv->mdev->device; - u32 i; - - for (i = 0; i < umem->npgs; i++) { - dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL); - umem->pages[i].dma = 0; - } + return xsk_buff_dma_unmap(umem, 0); } static int mlx5e_xsk_get_umems(struct mlx5e_xsk *xsk) @@ -90,13 +64,14 @@ static void mlx5e_xsk_remove_umem(struct mlx5e_xsk *xsk, u16 ix) static bool mlx5e_xsk_is_umem_sane(struct xdp_umem *umem) { - return umem->headroom <= 0xffff && umem->chunk_size_nohr <= 0xffff; + return xsk_umem_get_headroom(umem) <= 0xffff && + xsk_umem_get_chunk_size(umem) <= 0xffff; } void mlx5e_build_xsk_param(struct xdp_umem *umem, struct mlx5e_xsk_param *xsk) { - xsk->headroom = umem->headroom; - xsk->chunk_size = umem->chunk_size_nohr + umem->headroom; + xsk->headroom = xsk_umem_get_headroom(umem); + xsk->chunk_size = xsk_umem_get_chunk_size(umem); } static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv, @@ -241,18 +216,6 @@ int mlx5e_xsk_setup_umem(struct net_device *dev, struct xdp_umem *umem, u16 qid) mlx5e_xsk_disable_umem(priv, ix); } -int mlx5e_xsk_resize_reuseq(struct xdp_umem *umem, u32 nentries) -{ - struct xdp_umem_fq_reuse *reuseq; - - reuseq = xsk_reuseq_prepare(nentries); - if (unlikely(!reuseq)) - return -ENOMEM; - xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq)); - - return 0; -} - u16 mlx5e_xsk_first_unused_channel(struct mlx5e_params *params, struct mlx5e_xsk *xsk) { u16 res = xsk->refcnt ? params->num_channels : 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 07823abe5557..8ff9cbe6943d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -38,7 +38,7 @@ #include <linux/bpf.h> #include <linux/if_bridge.h> #include <net/page_pool.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include "eswitch.h" #include "en.h" #include "en/txrx.h" @@ -373,7 +373,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, struct mlx5_core_dev *mdev = c->mdev; void *rqc = rqp->rqc; void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq); - u32 num_xsk_frames = 0; u32 rq_xdp_ix; u32 pool_size; int wq_sz; @@ -413,7 +412,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE; rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params, xsk); - rq->buff.umem_headroom = xsk ? 
xsk->headroom : 0; pool_size = 1 << params->log_rq_mtu_frames; switch (rq->wq_type) { @@ -427,10 +425,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq); - if (xsk) - num_xsk_frames = wq_sz << - mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk); - pool_size = MLX5_MPWRQ_PAGES_PER_WQE << mlx5e_mpwqe_get_log_rq_size(params, xsk); @@ -482,9 +476,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq); - if (xsk) - num_xsk_frames = wq_sz << rq->wqe.info.log_num_frags; - rq->wqe.info = rqp->frags_info; rq->buff.frame0_sz = rq->wqe.info.arr[0].frag_stride; @@ -525,19 +516,9 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, } if (xsk) { - rq->buff.frame0_sz = xsk_umem_xdp_frame_sz(umem); - - err = mlx5e_xsk_resize_reuseq(umem, num_xsk_frames); - if (unlikely(err)) { - mlx5_core_err(mdev, "Unable to allocate the Reuse Ring for %u frames\n", - num_xsk_frames); - goto err_free; - } - - rq->zca.free = mlx5e_xsk_zca_free; err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, - MEM_TYPE_ZERO_COPY, - &rq->zca); + MEM_TYPE_XSK_BUFF_POOL, NULL); + xsk_buff_set_rxq_info(rq->umem, &rq->xdp_rxq); } else { /* Create a page_pool and register it with rxq */ pp_params.order = 0; @@ -3539,41 +3520,6 @@ out: return err; } -#ifdef CONFIG_MLX5_ESWITCH -static int mlx5e_setup_tc_cls_flower(struct mlx5e_priv *priv, - struct flow_cls_offload *cls_flower, - unsigned long flags) -{ - switch (cls_flower->command) { - case FLOW_CLS_REPLACE: - return mlx5e_configure_flower(priv->netdev, priv, cls_flower, - flags); - case FLOW_CLS_DESTROY: - return mlx5e_delete_flower(priv->netdev, priv, cls_flower, - flags); - case FLOW_CLS_STATS: - return mlx5e_stats_flower(priv->netdev, priv, cls_flower, - flags); - default: - return -EOPNOTSUPP; - } -} - -static int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, - void *cb_priv) -{ - unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(NIC_OFFLOAD); - struct mlx5e_priv *priv = cb_priv; - - switch (type) { - case TC_SETUP_CLSFLOWER: - return mlx5e_setup_tc_cls_flower(priv, type_data, flags); - default: - return -EOPNOTSUPP; - } -} -#endif - static LIST_HEAD(mlx5e_block_cb_list); static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, @@ -3582,7 +3528,6 @@ static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, struct mlx5e_priv *priv = netdev_priv(dev); switch (type) { -#ifdef CONFIG_MLX5_ESWITCH case TC_SETUP_BLOCK: { struct flow_block_offload *f = type_data; @@ -3592,7 +3537,6 @@ static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, mlx5e_setup_tc_block_cb, priv, priv, true); } -#endif case TC_SETUP_QDISC_MQPRIO: return mlx5e_setup_tc_mqprio(priv, type_data); default: @@ -3765,7 +3709,7 @@ static int set_feature_cvlan_filter(struct net_device *netdev, bool enable) return 0; } -#ifdef CONFIG_MLX5_ESWITCH +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) static int set_feature_tc_num_filters(struct net_device *netdev, bool enable) { struct mlx5e_priv *priv = netdev_priv(netdev); @@ -3876,7 +3820,7 @@ int mlx5e_set_features(struct net_device *netdev, netdev_features_t features) err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro); err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_FILTER, set_feature_cvlan_filter); -#ifdef CONFIG_MLX5_ESWITCH +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_tc_num_filters); #endif err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXALL, set_feature_rx_all); diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 52351c105627..a46405c6d560 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -35,7 +35,6 @@ #include <net/switchdev.h> #include <net/pkt_cls.h> #include <net/act_api.h> -#include <net/netevent.h> #include <net/arp.h> #include <net/devlink.h> #include <net/ipv6_stubs.h> @@ -45,9 +44,9 @@ #include "en.h" #include "en_rep.h" #include "en_tc.h" -#include "en/tc_tun.h" +#include "en/rep/tc.h" +#include "en/rep/neigh.h" #include "fs_core.h" -#include "lib/port_tun.h" #include "lib/mlx5.h" #define CREATE_TRACE_POINTS #include "diag/en_rep_tracepoint.h" @@ -58,16 +57,6 @@ static const char mlx5e_rep_driver_name[] = "mlx5e_rep"; -struct mlx5e_rep_indr_block_priv { - struct net_device *netdev; - struct mlx5e_rep_priv *rpriv; - - struct list_head list; -}; - -static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv, - struct net_device *netdev); - static void mlx5e_rep_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { @@ -485,706 +474,6 @@ void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv) mlx5e_sqs2vport_stop(esw, rep); } -static unsigned long mlx5e_rep_ipv6_interval(void) -{ - if (IS_ENABLED(CONFIG_IPV6) && ipv6_stub->nd_tbl) - return NEIGH_VAR(&ipv6_stub->nd_tbl->parms, DELAY_PROBE_TIME); - - return ~0UL; -} - -static void mlx5e_rep_neigh_update_init_interval(struct mlx5e_rep_priv *rpriv) -{ - unsigned long ipv4_interval = NEIGH_VAR(&arp_tbl.parms, DELAY_PROBE_TIME); - unsigned long ipv6_interval = mlx5e_rep_ipv6_interval(); - struct net_device *netdev = rpriv->netdev; - struct mlx5e_priv *priv = netdev_priv(netdev); - - rpriv->neigh_update.min_interval = min_t(unsigned long, ipv6_interval, ipv4_interval); - mlx5_fc_update_sampling_interval(priv->mdev, rpriv->neigh_update.min_interval); -} - -void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv) -{ - struct mlx5e_rep_priv *rpriv = priv->ppriv; - struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; - - mlx5_fc_queue_stats_work(priv->mdev, - &neigh_update->neigh_stats_work, - neigh_update->min_interval); -} - -static bool mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe) -{ - return refcount_inc_not_zero(&nhe->refcnt); -} - -static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe); - -static void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe) -{ - if (refcount_dec_and_test(&nhe->refcnt)) { - mlx5e_rep_neigh_entry_remove(nhe); - kfree_rcu(nhe, rcu); - } -} - -static struct mlx5e_neigh_hash_entry * -mlx5e_get_next_nhe(struct mlx5e_rep_priv *rpriv, - struct mlx5e_neigh_hash_entry *nhe) -{ - struct mlx5e_neigh_hash_entry *next = NULL; - - rcu_read_lock(); - - for (next = nhe ? 
- list_next_or_null_rcu(&rpriv->neigh_update.neigh_list, - &nhe->neigh_list, - struct mlx5e_neigh_hash_entry, - neigh_list) : - list_first_or_null_rcu(&rpriv->neigh_update.neigh_list, - struct mlx5e_neigh_hash_entry, - neigh_list); - next; - next = list_next_or_null_rcu(&rpriv->neigh_update.neigh_list, - &next->neigh_list, - struct mlx5e_neigh_hash_entry, - neigh_list)) - if (mlx5e_rep_neigh_entry_hold(next)) - break; - - rcu_read_unlock(); - - if (nhe) - mlx5e_rep_neigh_entry_release(nhe); - - return next; -} - -static void mlx5e_rep_neigh_stats_work(struct work_struct *work) -{ - struct mlx5e_rep_priv *rpriv = container_of(work, struct mlx5e_rep_priv, - neigh_update.neigh_stats_work.work); - struct net_device *netdev = rpriv->netdev; - struct mlx5e_priv *priv = netdev_priv(netdev); - struct mlx5e_neigh_hash_entry *nhe = NULL; - - rtnl_lock(); - if (!list_empty(&rpriv->neigh_update.neigh_list)) - mlx5e_rep_queue_neigh_stats_work(priv); - - while ((nhe = mlx5e_get_next_nhe(rpriv, nhe)) != NULL) - mlx5e_tc_update_neigh_used_value(nhe); - - rtnl_unlock(); -} - -static void mlx5e_rep_update_flows(struct mlx5e_priv *priv, - struct mlx5e_encap_entry *e, - bool neigh_connected, - unsigned char ha[ETH_ALEN]) -{ - struct ethhdr *eth = (struct ethhdr *)e->encap_header; - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - bool encap_connected; - LIST_HEAD(flow_list); - - ASSERT_RTNL(); - - /* wait for encap to be fully initialized */ - wait_for_completion(&e->res_ready); - - mutex_lock(&esw->offloads.encap_tbl_lock); - encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID); - if (e->compl_result < 0 || (encap_connected == neigh_connected && - ether_addr_equal(e->h_dest, ha))) - goto unlock; - - mlx5e_take_all_encap_flows(e, &flow_list); - - if ((e->flags & MLX5_ENCAP_ENTRY_VALID) && - (!neigh_connected || !ether_addr_equal(e->h_dest, ha))) - mlx5e_tc_encap_flows_del(priv, e, &flow_list); - - if (neigh_connected && !(e->flags & MLX5_ENCAP_ENTRY_VALID)) { - ether_addr_copy(e->h_dest, ha); - ether_addr_copy(eth->h_dest, ha); - /* Update the encap source mac, in case that we delete - * the flows when encap source mac changed. - */ - ether_addr_copy(eth->h_source, e->route_dev->dev_addr); - - mlx5e_tc_encap_flows_add(priv, e, &flow_list); - } -unlock: - mutex_unlock(&esw->offloads.encap_tbl_lock); - mlx5e_put_encap_flow_list(priv, &flow_list); -} - -static void mlx5e_rep_neigh_update(struct work_struct *work) -{ - struct mlx5e_neigh_hash_entry *nhe = - container_of(work, struct mlx5e_neigh_hash_entry, neigh_update_work); - struct neighbour *n = nhe->n; - struct mlx5e_encap_entry *e; - unsigned char ha[ETH_ALEN]; - struct mlx5e_priv *priv; - bool neigh_connected; - u8 nud_state, dead; - - rtnl_lock(); - - /* If these parameters are changed after we release the lock, - * we'll receive another event letting us know about it. - * We use this lock to avoid inconsistency between the neigh validity - * and it's hw address. 
- */ - read_lock_bh(&n->lock); - memcpy(ha, n->ha, ETH_ALEN); - nud_state = n->nud_state; - dead = n->dead; - read_unlock_bh(&n->lock); - - neigh_connected = (nud_state & NUD_VALID) && !dead; - - trace_mlx5e_rep_neigh_update(nhe, ha, neigh_connected); - - list_for_each_entry(e, &nhe->encap_list, encap_list) { - if (!mlx5e_encap_take(e)) - continue; - - priv = netdev_priv(e->out_dev); - mlx5e_rep_update_flows(priv, e, neigh_connected, ha); - mlx5e_encap_put(priv, e); - } - mlx5e_rep_neigh_entry_release(nhe); - rtnl_unlock(); - neigh_release(n); -} - -static struct mlx5e_rep_indr_block_priv * -mlx5e_rep_indr_block_priv_lookup(struct mlx5e_rep_priv *rpriv, - struct net_device *netdev) -{ - struct mlx5e_rep_indr_block_priv *cb_priv; - - /* All callback list access should be protected by RTNL. */ - ASSERT_RTNL(); - - list_for_each_entry(cb_priv, - &rpriv->uplink_priv.tc_indr_block_priv_list, - list) - if (cb_priv->netdev == netdev) - return cb_priv; - - return NULL; -} - -static void mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv) -{ - struct mlx5e_rep_indr_block_priv *cb_priv, *temp; - struct list_head *head = &rpriv->uplink_priv.tc_indr_block_priv_list; - - list_for_each_entry_safe(cb_priv, temp, head, list) { - mlx5e_rep_indr_unregister_block(rpriv, cb_priv->netdev); - kfree(cb_priv); - } -} - -static int -mlx5e_rep_indr_offload(struct net_device *netdev, - struct flow_cls_offload *flower, - struct mlx5e_rep_indr_block_priv *indr_priv, - unsigned long flags) -{ - struct mlx5e_priv *priv = netdev_priv(indr_priv->rpriv->netdev); - int err = 0; - - switch (flower->command) { - case FLOW_CLS_REPLACE: - err = mlx5e_configure_flower(netdev, priv, flower, flags); - break; - case FLOW_CLS_DESTROY: - err = mlx5e_delete_flower(netdev, priv, flower, flags); - break; - case FLOW_CLS_STATS: - err = mlx5e_stats_flower(netdev, priv, flower, flags); - break; - default: - err = -EOPNOTSUPP; - } - - return err; -} - -static int mlx5e_rep_indr_setup_tc_cb(enum tc_setup_type type, - void *type_data, void *indr_priv) -{ - unsigned long flags = MLX5_TC_FLAG(EGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD); - struct mlx5e_rep_indr_block_priv *priv = indr_priv; - - switch (type) { - case TC_SETUP_CLSFLOWER: - return mlx5e_rep_indr_offload(priv->netdev, type_data, priv, - flags); - default: - return -EOPNOTSUPP; - } -} - -static int mlx5e_rep_indr_setup_ft_cb(enum tc_setup_type type, - void *type_data, void *indr_priv) -{ - struct mlx5e_rep_indr_block_priv *priv = indr_priv; - struct flow_cls_offload *f = type_data; - struct flow_cls_offload tmp; - struct mlx5e_priv *mpriv; - struct mlx5_eswitch *esw; - unsigned long flags; - int err; - - mpriv = netdev_priv(priv->rpriv->netdev); - esw = mpriv->mdev->priv.eswitch; - - flags = MLX5_TC_FLAG(EGRESS) | - MLX5_TC_FLAG(ESW_OFFLOAD) | - MLX5_TC_FLAG(FT_OFFLOAD); - - switch (type) { - case TC_SETUP_CLSFLOWER: - memcpy(&tmp, f, sizeof(*f)); - - /* Re-use tc offload path by moving the ft flow to the - * reserved ft chain. - * - * FT offload can use prio range [0, INT_MAX], so we normalize - * it to range [1, mlx5_esw_chains_get_prio_range(esw)] - * as with tc, where prio 0 isn't supported. - * - * We only support chain 0 of FT offload. 
- */ - if (!mlx5_esw_chains_prios_supported(esw) || - tmp.common.prio >= mlx5_esw_chains_get_prio_range(esw) || - tmp.common.chain_index) - return -EOPNOTSUPP; - - tmp.common.chain_index = mlx5_esw_chains_get_ft_chain(esw); - tmp.common.prio++; - err = mlx5e_rep_indr_offload(priv->netdev, &tmp, priv, flags); - memcpy(&f->stats, &tmp.stats, sizeof(f->stats)); - return err; - default: - return -EOPNOTSUPP; - } -} - -static void mlx5e_rep_indr_block_unbind(void *cb_priv) -{ - struct mlx5e_rep_indr_block_priv *indr_priv = cb_priv; - - list_del(&indr_priv->list); - kfree(indr_priv); -} - -static LIST_HEAD(mlx5e_block_cb_list); - -static int -mlx5e_rep_indr_setup_block(struct net_device *netdev, - struct mlx5e_rep_priv *rpriv, - struct flow_block_offload *f, - flow_setup_cb_t *setup_cb) -{ - struct mlx5e_rep_indr_block_priv *indr_priv; - struct flow_block_cb *block_cb; - - if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) - return -EOPNOTSUPP; - - f->unlocked_driver_cb = true; - f->driver_block_list = &mlx5e_block_cb_list; - - switch (f->command) { - case FLOW_BLOCK_BIND: - indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev); - if (indr_priv) - return -EEXIST; - - indr_priv = kmalloc(sizeof(*indr_priv), GFP_KERNEL); - if (!indr_priv) - return -ENOMEM; - - indr_priv->netdev = netdev; - indr_priv->rpriv = rpriv; - list_add(&indr_priv->list, - &rpriv->uplink_priv.tc_indr_block_priv_list); - - block_cb = flow_block_cb_alloc(setup_cb, indr_priv, indr_priv, - mlx5e_rep_indr_block_unbind); - if (IS_ERR(block_cb)) { - list_del(&indr_priv->list); - kfree(indr_priv); - return PTR_ERR(block_cb); - } - flow_block_cb_add(block_cb, f); - list_add_tail(&block_cb->driver_list, &mlx5e_block_cb_list); - - return 0; - case FLOW_BLOCK_UNBIND: - indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev); - if (!indr_priv) - return -ENOENT; - - block_cb = flow_block_cb_lookup(f->block, setup_cb, indr_priv); - if (!block_cb) - return -ENOENT; - - flow_block_cb_remove(block_cb, f); - list_del(&block_cb->driver_list); - return 0; - default: - return -EOPNOTSUPP; - } - return 0; -} - -static -int mlx5e_rep_indr_setup_cb(struct net_device *netdev, void *cb_priv, - enum tc_setup_type type, void *type_data) -{ - switch (type) { - case TC_SETUP_BLOCK: - return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data, - mlx5e_rep_indr_setup_tc_cb); - case TC_SETUP_FT: - return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data, - mlx5e_rep_indr_setup_ft_cb); - default: - return -EOPNOTSUPP; - } -} - -static int mlx5e_rep_indr_register_block(struct mlx5e_rep_priv *rpriv, - struct net_device *netdev) -{ - int err; - - err = __flow_indr_block_cb_register(netdev, rpriv, - mlx5e_rep_indr_setup_cb, - rpriv); - if (err) { - struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); - - mlx5_core_err(priv->mdev, "Failed to register remote block notifier for %s err=%d\n", - netdev_name(netdev), err); - } - return err; -} - -static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv, - struct net_device *netdev) -{ - __flow_indr_block_cb_unregister(netdev, mlx5e_rep_indr_setup_cb, - rpriv); -} - -static int mlx5e_nic_rep_netdevice_event(struct notifier_block *nb, - unsigned long event, void *ptr) -{ - struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv, - uplink_priv.netdevice_nb); - struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); - struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - - if (!mlx5e_tc_tun_device_to_offload(priv, netdev) && - !(is_vlan_dev(netdev) && 
vlan_dev_real_dev(netdev) == rpriv->netdev)) - return NOTIFY_OK; - - switch (event) { - case NETDEV_REGISTER: - mlx5e_rep_indr_register_block(rpriv, netdev); - break; - case NETDEV_UNREGISTER: - mlx5e_rep_indr_unregister_block(rpriv, netdev); - break; - } - return NOTIFY_OK; -} - -static void -mlx5e_rep_queue_neigh_update_work(struct mlx5e_priv *priv, - struct mlx5e_neigh_hash_entry *nhe, - struct neighbour *n) -{ - /* Take a reference to ensure the neighbour and mlx5 encap - * entry won't be destructed until we drop the reference in - * delayed work. - */ - neigh_hold(n); - - /* This assignment is valid as long as the the neigh reference - * is taken - */ - nhe->n = n; - - if (!queue_work(priv->wq, &nhe->neigh_update_work)) { - mlx5e_rep_neigh_entry_release(nhe); - neigh_release(n); - } -} - -static struct mlx5e_neigh_hash_entry * -mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv, - struct mlx5e_neigh *m_neigh); - -static int mlx5e_rep_netevent_event(struct notifier_block *nb, - unsigned long event, void *ptr) -{ - struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv, - neigh_update.netevent_nb); - struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; - struct net_device *netdev = rpriv->netdev; - struct mlx5e_priv *priv = netdev_priv(netdev); - struct mlx5e_neigh_hash_entry *nhe = NULL; - struct mlx5e_neigh m_neigh = {}; - struct neigh_parms *p; - struct neighbour *n; - bool found = false; - - switch (event) { - case NETEVENT_NEIGH_UPDATE: - n = ptr; -#if IS_ENABLED(CONFIG_IPV6) - if (n->tbl != ipv6_stub->nd_tbl && n->tbl != &arp_tbl) -#else - if (n->tbl != &arp_tbl) -#endif - return NOTIFY_DONE; - - m_neigh.dev = n->dev; - m_neigh.family = n->ops->family; - memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len); - - rcu_read_lock(); - nhe = mlx5e_rep_neigh_entry_lookup(priv, &m_neigh); - rcu_read_unlock(); - if (!nhe) - return NOTIFY_DONE; - - mlx5e_rep_queue_neigh_update_work(priv, nhe, n); - break; - - case NETEVENT_DELAY_PROBE_TIME_UPDATE: - p = ptr; - - /* We check the device is present since we don't care about - * changes in the default table, we only care about changes - * done per device delay prob time parameter. 
- */ -#if IS_ENABLED(CONFIG_IPV6) - if (!p->dev || (p->tbl != ipv6_stub->nd_tbl && p->tbl != &arp_tbl)) -#else - if (!p->dev || p->tbl != &arp_tbl) -#endif - return NOTIFY_DONE; - - rcu_read_lock(); - list_for_each_entry_rcu(nhe, &neigh_update->neigh_list, - neigh_list) { - if (p->dev == nhe->m_neigh.dev) { - found = true; - break; - } - } - rcu_read_unlock(); - if (!found) - return NOTIFY_DONE; - - neigh_update->min_interval = min_t(unsigned long, - NEIGH_VAR(p, DELAY_PROBE_TIME), - neigh_update->min_interval); - mlx5_fc_update_sampling_interval(priv->mdev, - neigh_update->min_interval); - break; - } - return NOTIFY_DONE; -} - -static const struct rhashtable_params mlx5e_neigh_ht_params = { - .head_offset = offsetof(struct mlx5e_neigh_hash_entry, rhash_node), - .key_offset = offsetof(struct mlx5e_neigh_hash_entry, m_neigh), - .key_len = sizeof(struct mlx5e_neigh), - .automatic_shrinking = true, -}; - -static int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv) -{ - struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; - int err; - - err = rhashtable_init(&neigh_update->neigh_ht, &mlx5e_neigh_ht_params); - if (err) - return err; - - INIT_LIST_HEAD(&neigh_update->neigh_list); - mutex_init(&neigh_update->encap_lock); - INIT_DELAYED_WORK(&neigh_update->neigh_stats_work, - mlx5e_rep_neigh_stats_work); - mlx5e_rep_neigh_update_init_interval(rpriv); - - rpriv->neigh_update.netevent_nb.notifier_call = mlx5e_rep_netevent_event; - err = register_netevent_notifier(&rpriv->neigh_update.netevent_nb); - if (err) - goto out_err; - return 0; - -out_err: - rhashtable_destroy(&neigh_update->neigh_ht); - return err; -} - -static void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv) -{ - struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; - struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); - - unregister_netevent_notifier(&neigh_update->netevent_nb); - - flush_workqueue(priv->wq); /* flush neigh update works */ - - cancel_delayed_work_sync(&rpriv->neigh_update.neigh_stats_work); - - mutex_destroy(&neigh_update->encap_lock); - rhashtable_destroy(&neigh_update->neigh_ht); -} - -static int mlx5e_rep_neigh_entry_insert(struct mlx5e_priv *priv, - struct mlx5e_neigh_hash_entry *nhe) -{ - struct mlx5e_rep_priv *rpriv = priv->ppriv; - int err; - - err = rhashtable_insert_fast(&rpriv->neigh_update.neigh_ht, - &nhe->rhash_node, - mlx5e_neigh_ht_params); - if (err) - return err; - - list_add_rcu(&nhe->neigh_list, &rpriv->neigh_update.neigh_list); - - return err; -} - -static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe) -{ - struct mlx5e_rep_priv *rpriv = nhe->priv->ppriv; - - mutex_lock(&rpriv->neigh_update.encap_lock); - - list_del_rcu(&nhe->neigh_list); - - rhashtable_remove_fast(&rpriv->neigh_update.neigh_ht, - &nhe->rhash_node, - mlx5e_neigh_ht_params); - mutex_unlock(&rpriv->neigh_update.encap_lock); -} - -/* This function must only be called under the representor's encap_lock or - * inside rcu read lock section. - */ -static struct mlx5e_neigh_hash_entry * -mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv, - struct mlx5e_neigh *m_neigh) -{ - struct mlx5e_rep_priv *rpriv = priv->ppriv; - struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; - struct mlx5e_neigh_hash_entry *nhe; - - nhe = rhashtable_lookup_fast(&neigh_update->neigh_ht, m_neigh, - mlx5e_neigh_ht_params); - return nhe && mlx5e_rep_neigh_entry_hold(nhe) ? 
nhe : NULL; -} - -static int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv, - struct mlx5e_encap_entry *e, - struct mlx5e_neigh_hash_entry **nhe) -{ - int err; - - *nhe = kzalloc(sizeof(**nhe), GFP_KERNEL); - if (!*nhe) - return -ENOMEM; - - (*nhe)->priv = priv; - memcpy(&(*nhe)->m_neigh, &e->m_neigh, sizeof(e->m_neigh)); - INIT_WORK(&(*nhe)->neigh_update_work, mlx5e_rep_neigh_update); - spin_lock_init(&(*nhe)->encap_list_lock); - INIT_LIST_HEAD(&(*nhe)->encap_list); - refcount_set(&(*nhe)->refcnt, 1); - - err = mlx5e_rep_neigh_entry_insert(priv, *nhe); - if (err) - goto out_free; - return 0; - -out_free: - kfree(*nhe); - return err; -} - -int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv, - struct mlx5e_encap_entry *e) -{ - struct mlx5e_rep_priv *rpriv = priv->ppriv; - struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; - struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy; - struct mlx5e_neigh_hash_entry *nhe; - int err; - - err = mlx5_tun_entropy_refcount_inc(tun_entropy, e->reformat_type); - if (err) - return err; - - mutex_lock(&rpriv->neigh_update.encap_lock); - nhe = mlx5e_rep_neigh_entry_lookup(priv, &e->m_neigh); - if (!nhe) { - err = mlx5e_rep_neigh_entry_create(priv, e, &nhe); - if (err) { - mutex_unlock(&rpriv->neigh_update.encap_lock); - mlx5_tun_entropy_refcount_dec(tun_entropy, - e->reformat_type); - return err; - } - } - - e->nhe = nhe; - spin_lock(&nhe->encap_list_lock); - list_add_rcu(&e->encap_list, &nhe->encap_list); - spin_unlock(&nhe->encap_list_lock); - - mutex_unlock(&rpriv->neigh_update.encap_lock); - - return 0; -} - -void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv, - struct mlx5e_encap_entry *e) -{ - struct mlx5e_rep_priv *rpriv = priv->ppriv; - struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; - struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy; - - if (!e->nhe) - return; - - spin_lock(&e->nhe->encap_list_lock); - list_del_rcu(&e->encap_list); - spin_unlock(&e->nhe->encap_list_lock); - - mlx5e_rep_neigh_entry_release(e->nhe); - e->nhe = NULL; - mlx5_tun_entropy_refcount_dec(tun_entropy, e->reformat_type); -} - static int mlx5e_rep_open(struct net_device *dev) { struct mlx5e_priv *priv = netdev_priv(dev); @@ -1225,129 +514,6 @@ static int mlx5e_rep_close(struct net_device *dev) return ret; } -static int -mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv, - struct flow_cls_offload *cls_flower, int flags) -{ - switch (cls_flower->command) { - case FLOW_CLS_REPLACE: - return mlx5e_configure_flower(priv->netdev, priv, cls_flower, - flags); - case FLOW_CLS_DESTROY: - return mlx5e_delete_flower(priv->netdev, priv, cls_flower, - flags); - case FLOW_CLS_STATS: - return mlx5e_stats_flower(priv->netdev, priv, cls_flower, - flags); - default: - return -EOPNOTSUPP; - } -} - -static -int mlx5e_rep_setup_tc_cls_matchall(struct mlx5e_priv *priv, - struct tc_cls_matchall_offload *ma) -{ - switch (ma->command) { - case TC_CLSMATCHALL_REPLACE: - return mlx5e_tc_configure_matchall(priv, ma); - case TC_CLSMATCHALL_DESTROY: - return mlx5e_tc_delete_matchall(priv, ma); - case TC_CLSMATCHALL_STATS: - mlx5e_tc_stats_matchall(priv, ma); - return 0; - default: - return -EOPNOTSUPP; - } -} - -static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data, - void *cb_priv) -{ - unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD); - struct mlx5e_priv *priv = cb_priv; - - switch (type) { - case TC_SETUP_CLSFLOWER: - return mlx5e_rep_setup_tc_cls_flower(priv, type_data, flags); - 
case TC_SETUP_CLSMATCHALL: - return mlx5e_rep_setup_tc_cls_matchall(priv, type_data); - default: - return -EOPNOTSUPP; - } -} - -static int mlx5e_rep_setup_ft_cb(enum tc_setup_type type, void *type_data, - void *cb_priv) -{ - struct flow_cls_offload tmp, *f = type_data; - struct mlx5e_priv *priv = cb_priv; - struct mlx5_eswitch *esw; - unsigned long flags; - int err; - - flags = MLX5_TC_FLAG(INGRESS) | - MLX5_TC_FLAG(ESW_OFFLOAD) | - MLX5_TC_FLAG(FT_OFFLOAD); - esw = priv->mdev->priv.eswitch; - - switch (type) { - case TC_SETUP_CLSFLOWER: - memcpy(&tmp, f, sizeof(*f)); - - if (!mlx5_esw_chains_prios_supported(esw)) - return -EOPNOTSUPP; - - /* Re-use tc offload path by moving the ft flow to the - * reserved ft chain. - * - * FT offload can use prio range [0, INT_MAX], so we normalize - * it to range [1, mlx5_esw_chains_get_prio_range(esw)] - * as with tc, where prio 0 isn't supported. - * - * We only support chain 0 of FT offload. - */ - if (tmp.common.prio >= mlx5_esw_chains_get_prio_range(esw)) - return -EOPNOTSUPP; - if (tmp.common.chain_index != 0) - return -EOPNOTSUPP; - - tmp.common.chain_index = mlx5_esw_chains_get_ft_chain(esw); - tmp.common.prio++; - err = mlx5e_rep_setup_tc_cls_flower(priv, &tmp, flags); - memcpy(&f->stats, &tmp.stats, sizeof(f->stats)); - return err; - default: - return -EOPNOTSUPP; - } -} - -static LIST_HEAD(mlx5e_rep_block_tc_cb_list); -static LIST_HEAD(mlx5e_rep_block_ft_cb_list); -static int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, - void *type_data) -{ - struct mlx5e_priv *priv = netdev_priv(dev); - struct flow_block_offload *f = type_data; - - f->unlocked_driver_cb = true; - - switch (type) { - case TC_SETUP_BLOCK: - return flow_block_cb_setup_simple(type_data, - &mlx5e_rep_block_tc_cb_list, - mlx5e_rep_setup_tc_cb, - priv, priv, true); - case TC_SETUP_FT: - return flow_block_cb_setup_simple(type_data, - &mlx5e_rep_block_ft_cb_list, - mlx5e_rep_setup_ft_cb, - priv, priv, true); - default: - return -EOPNOTSUPP; - } -} - bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv) { struct mlx5e_rep_priv *rpriv = priv->ppriv; @@ -1791,31 +957,23 @@ static int mlx5e_init_uplink_rep_tx(struct mlx5e_rep_priv *rpriv) priv = netdev_priv(netdev); uplink_priv = &rpriv->uplink_priv; - mutex_init(&uplink_priv->unready_flows_lock); - INIT_LIST_HEAD(&uplink_priv->unready_flows); - - /* init shared tc flow table */ - err = mlx5e_tc_esw_init(&uplink_priv->tc_ht); + err = mlx5e_rep_tc_init(rpriv); if (err) return err; mlx5_init_port_tun_entropy(&uplink_priv->tun_entropy, priv->mdev); - /* init indirect block notifications */ - INIT_LIST_HEAD(&uplink_priv->tc_indr_block_priv_list); - uplink_priv->netdevice_nb.notifier_call = mlx5e_nic_rep_netdevice_event; - err = register_netdevice_notifier_dev_net(rpriv->netdev, - &uplink_priv->netdevice_nb, - &uplink_priv->netdevice_nn); + err = mlx5e_rep_tc_netdevice_event_register(rpriv); if (err) { - mlx5_core_err(priv->mdev, "Failed to register netdev notifier\n"); - goto tc_esw_cleanup; + mlx5_core_err(priv->mdev, "Failed to register netdev notifier, err: %d\n", + err); + goto tc_rep_cleanup; } return 0; -tc_esw_cleanup: - mlx5e_tc_esw_cleanup(&uplink_priv->tc_ht); +tc_rep_cleanup: + mlx5e_rep_tc_cleanup(rpriv); return err; } @@ -1845,17 +1003,10 @@ destroy_tises: static void mlx5e_cleanup_uplink_rep_tx(struct mlx5e_rep_priv *rpriv) { - struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; - - /* clean indirect TC block notifications */ - unregister_netdevice_notifier_dev_net(rpriv->netdev, - 
&uplink_priv->netdevice_nb, - &uplink_priv->netdevice_nn); + mlx5e_rep_tc_netdevice_event_unregister(rpriv); mlx5e_rep_indr_clean_block_privs(rpriv); - /* delete shared tc flow table */ - mlx5e_tc_esw_cleanup(&rpriv->uplink_priv.tc_ht); - mutex_destroy(&rpriv->uplink_priv.unready_flows_lock); + mlx5e_rep_tc_cleanup(rpriv); } static void mlx5e_cleanup_rep_tx(struct mlx5e_priv *priv) @@ -1897,13 +1048,8 @@ static int uplink_rep_async_event(struct notifier_block *nb, unsigned long event return NOTIFY_OK; } - if (event == MLX5_DEV_EVENT_PORT_AFFINITY) { - struct mlx5e_rep_priv *rpriv = priv->ppriv; - - queue_work(priv->wq, &rpriv->uplink_priv.reoffload_flows_work); - - return NOTIFY_OK; - } + if (event == MLX5_DEV_EVENT_PORT_AFFINITY) + return mlx5e_rep_tc_event_port_affinity(priv); return NOTIFY_DONE; } @@ -1912,7 +1058,6 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv) { struct net_device *netdev = priv->netdev; struct mlx5_core_dev *mdev = priv->mdev; - struct mlx5e_rep_priv *rpriv = priv->ppriv; u16 max_mtu; netdev->min_mtu = ETH_MIN_MTU; @@ -1920,8 +1065,7 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv) netdev->max_mtu = MLX5E_HW2SW_MTU(&priv->channels.params, max_mtu); mlx5e_set_dev_port_mtu(priv); - INIT_WORK(&rpriv->uplink_priv.reoffload_flows_work, - mlx5e_tc_reoffload_flows_work); + mlx5e_rep_tc_enable(priv); mlx5_lag_add(mdev, netdev); priv->events_nb.notifier_call = uplink_rep_async_event; @@ -1933,11 +1077,10 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv) static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; - struct mlx5e_rep_priv *rpriv = priv->ppriv; mlx5e_dcbnl_delete_app(priv); mlx5_notifier_unregister(mdev, &priv->events_nb); - cancel_work_sync(&rpriv->uplink_priv.reoffload_flows_work); + mlx5e_rep_tc_disable(priv); mlx5_lag_remove(mdev); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h index 6a2337900420..93e911baacad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -158,6 +158,22 @@ struct mlx5e_neigh_hash_entry { enum { /* set when the encap entry is successfully offloaded into HW */ MLX5_ENCAP_ENTRY_VALID = BIT(0), + MLX5_REFORMAT_DECAP = BIT(1), +}; + +struct mlx5e_decap_key { + struct ethhdr key; +}; + +struct mlx5e_decap_entry { + struct mlx5e_decap_key key; + struct list_head flows; + struct hlist_node hlist; + refcount_t refcnt; + struct completion res_ready; + int compl_result; + struct mlx5_pkt_reformat *pkt_reformat; + struct rcu_head rcu; }; struct mlx5e_encap_entry { @@ -203,11 +219,6 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); -int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv, - struct mlx5e_encap_entry *e); -void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv, - struct mlx5e_encap_entry *e); - void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv); bool mlx5e_eswitch_rep(struct net_device *netdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index a514685fb560..6b3c82da199c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -42,6 +42,7 @@ #include "en_tc.h" #include "eswitch.h" #include "en_rep.h" +#include "en/rep/tc.h" #include "ipoib/ipoib.h" #include 
"en_accel/ipsec_rxtx.h" #include "en_accel/tls_rxtx.h" @@ -300,7 +301,7 @@ static inline void mlx5e_page_release(struct mlx5e_rq *rq, * put into the Reuse Ring, because there is no way to return * the page to the userspace when the interface goes down. */ - mlx5e_xsk_page_release(rq, dma_info); + xsk_buff_free(dma_info->xsk); else mlx5e_page_release_dynamic(rq, dma_info, recycle); } @@ -385,7 +386,11 @@ static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk) if (rq->umem) { int pages_desired = wqe_bulk << rq->wqe.info.log_num_frags; - if (unlikely(!mlx5e_xsk_pages_enough_umem(rq, pages_desired))) + /* Check in advance that we have enough frames, instead of + * allocating one-by-one, failing and moving frames to the + * Reuse Ring. + */ + if (unlikely(!xsk_buff_can_alloc(rq->umem, pages_desired))) return -ENOMEM; } @@ -480,8 +485,11 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) int err; int i; + /* Check in advance that we have enough frames, instead of allocating + * one-by-one, failing and moving frames to the Reuse Ring. + */ if (rq->umem && - unlikely(!mlx5e_xsk_pages_enough_umem(rq, MLX5_MPWRQ_PAGES_PER_WQE))) { + unlikely(!xsk_buff_can_alloc(rq->umem, MLX5_MPWRQ_PAGES_PER_WQE))) { err = -ENOMEM; goto err; } @@ -1044,12 +1052,24 @@ struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va, return skb; } +static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom, + u32 len, struct xdp_buff *xdp) +{ + xdp->data_hard_start = va; + xdp_set_data_meta_invalid(xdp); + xdp->data = va + headroom; + xdp->data_end = xdp->data + len; + xdp->rxq = &rq->xdp_rxq; + xdp->frame_sz = rq->buff.frame0_sz; +} + struct sk_buff * mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt) { struct mlx5e_dma_info *di = wi->di; u16 rx_headroom = rq->buff.headroom; + struct xdp_buff xdp; struct sk_buff *skb; void *va, *data; bool consumed; @@ -1065,11 +1085,13 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, prefetch(data); rcu_read_lock(); - consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, false); + mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp); + consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt, &xdp); rcu_read_unlock(); if (consumed) return NULL; /* page/packet was consumed by XDP */ + rx_headroom = xdp.data - xdp.data_hard_start; frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt); skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt); if (unlikely(!skb)) @@ -1216,12 +1238,12 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) if (rep->vlan && skb_vlan_tag_present(skb)) skb_vlan_pop(skb); - if (!mlx5e_tc_rep_update_skb(cqe, skb, &tc_priv)) + if (!mlx5e_rep_tc_update_skb(cqe, skb, &tc_priv)) goto free_wqe; napi_gro_receive(rq->cq.napi, skb); - mlx5_tc_rep_post_napi_receive(&tc_priv); + mlx5_rep_tc_post_napi_receive(&tc_priv); free_wqe: mlx5e_free_rx_wqe(rq, wi, true); @@ -1272,12 +1294,12 @@ void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq, mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); - if (!mlx5e_tc_rep_update_skb(cqe, skb, &tc_priv)) + if (!mlx5e_rep_tc_update_skb(cqe, skb, &tc_priv)) goto mpwrq_cqe_out; napi_gro_receive(rq->cq.napi, skb); - mlx5_tc_rep_post_napi_receive(&tc_priv); + mlx5_rep_tc_post_napi_receive(&tc_priv); mpwrq_cqe_out: if (likely(wi->consumed_strides < rq->mpwqe.num_strides)) @@ -1343,6 +1365,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct 
mlx5e_mpw_info *wi, struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx]; u16 rx_headroom = rq->buff.headroom; u32 cqe_bcnt32 = cqe_bcnt; + struct xdp_buff xdp; struct sk_buff *skb; void *va, *data; u32 frag_size; @@ -1364,7 +1387,8 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, prefetch(data); rcu_read_lock(); - consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, false); + mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt32, &xdp); + consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt32, &xdp); rcu_read_unlock(); if (consumed) { if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) @@ -1372,6 +1396,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, return NULL; /* page/packet was consumed by XDP */ } + rx_headroom = xdp.data - xdp.data_hard_start; frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32); skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32); if (unlikely(!skb)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index a050808f2128..cc669ea450ae 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -31,6 +31,7 @@ */ #include <net/flow_dissector.h> +#include <net/flow_offload.h> #include <net/sch_generic.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_gact.h> @@ -45,10 +46,14 @@ #include <net/tc_act/tc_tunnel_key.h> #include <net/tc_act/tc_pedit.h> #include <net/tc_act/tc_csum.h> +#include <net/tc_act/tc_mpls.h> #include <net/arp.h> #include <net/ipv6_stubs.h> +#include <net/bareudp.h> #include "en.h" #include "en_rep.h" +#include "en/rep/tc.h" +#include "en/rep/neigh.h" #include "en_tc.h" #include "eswitch.h" #include "esw/chains.h" @@ -89,6 +94,7 @@ enum { MLX5E_TC_FLOW_FLAG_NOT_READY = MLX5E_TC_FLOW_BASE + 5, MLX5E_TC_FLOW_FLAG_DELETED = MLX5E_TC_FLOW_BASE + 6, MLX5E_TC_FLOW_FLAG_CT = MLX5E_TC_FLOW_BASE + 7, + MLX5E_TC_FLOW_FLAG_L3_TO_L2_DECAP = MLX5E_TC_FLOW_BASE + 8, }; #define MLX5E_TC_MAX_SPLITS 1 @@ -122,6 +128,11 @@ struct mlx5e_tc_flow { u64 cookie; unsigned long flags; struct mlx5_flow_handle *rule[MLX5E_TC_MAX_SPLITS + 1]; + + /* flows sharing the same reformat object - currently mpls decap */ + struct list_head l3_to_l2_reformat; + struct mlx5e_decap_entry *decap_reformat; + /* Flow can be associated with multiple encap IDs. * The number of encaps is bounded by the number of supported * destinations. @@ -153,40 +164,12 @@ struct mlx5e_tc_flow_parse_attr { struct mlx5_flow_spec spec; struct mlx5e_tc_mod_hdr_acts mod_hdr_acts; int mirred_ifindex[MLX5_MAX_FLOW_FWD_VPORTS]; + struct ethhdr eth; }; #define MLX5E_TC_TABLE_NUM_GROUPS 4 #define MLX5E_TC_TABLE_MAX_GROUP_SIZE BIT(16) -struct tunnel_match_key { - struct flow_dissector_key_control enc_control; - struct flow_dissector_key_keyid enc_key_id; - struct flow_dissector_key_ports enc_tp; - struct flow_dissector_key_ip enc_ip; - union { - struct flow_dissector_key_ipv4_addrs enc_ipv4; - struct flow_dissector_key_ipv6_addrs enc_ipv6; - }; - - int filter_ifindex; -}; - -struct tunnel_match_enc_opts { - struct flow_dissector_key_enc_opts key; - struct flow_dissector_key_enc_opts mask; -}; - -/* Tunnel_id mapping is TUNNEL_INFO_BITS + ENC_OPTS_BITS. - * Upper TUNNEL_INFO_BITS for general tunnel info. - * Lower ENC_OPTS_BITS bits for enc_opts. 
- */ -#define TUNNEL_INFO_BITS 6 -#define TUNNEL_INFO_BITS_MASK GENMASK(TUNNEL_INFO_BITS - 1, 0) -#define ENC_OPTS_BITS 2 -#define ENC_OPTS_BITS_MASK GENMASK(ENC_OPTS_BITS - 1, 0) -#define TUNNEL_ID_BITS (TUNNEL_INFO_BITS + ENC_OPTS_BITS) -#define TUNNEL_ID_MASK GENMASK(TUNNEL_ID_BITS - 1, 0) - struct mlx5e_tc_attr_to_reg_mapping mlx5e_tc_attr_to_reg_mappings[] = { [CHAIN_TO_REG] = { .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_0, @@ -1149,6 +1132,11 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, struct netlink_ext_ack *extack, struct net_device **encap_dev, bool *encap_valid); +static int mlx5e_attach_decap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct netlink_ext_ack *extack); +static void mlx5e_detach_decap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow); static struct mlx5_flow_handle * mlx5e_tc_offload_fdb_rules(struct mlx5_eswitch *esw, @@ -1324,6 +1312,12 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, return -EOPNOTSUPP; } + if (flow_flag_test(flow, L3_TO_L2_DECAP)) { + err = mlx5e_attach_decap(priv, flow, extack); + if (err) + return err; + } + for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) { int mirred_ifindex; @@ -1433,6 +1427,9 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv, if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) mlx5_fc_destroy(attr->counter_dev, attr->counter); + + if (flow_flag_test(flow, L3_TO_L2_DECAP)) + mlx5e_detach_decap(priv, flow); } void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv, @@ -1709,6 +1706,17 @@ static void mlx5e_encap_dealloc(struct mlx5e_priv *priv, struct mlx5e_encap_entr kfree_rcu(e, rcu); } +static void mlx5e_decap_dealloc(struct mlx5e_priv *priv, + struct mlx5e_decap_entry *d) +{ + WARN_ON(!list_empty(&d->flows)); + + if (!d->compl_result) + mlx5_packet_reformat_dealloc(priv->mdev, d->pkt_reformat); + + kfree_rcu(d, rcu); +} + void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; @@ -1721,6 +1729,18 @@ void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e) mlx5e_encap_dealloc(priv, e); } +static void mlx5e_decap_put(struct mlx5e_priv *priv, struct mlx5e_decap_entry *d) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + if (!refcount_dec_and_mutex_lock(&d->refcnt, &esw->offloads.decap_tbl_lock)) + return; + hash_del_rcu(&d->hlist); + mutex_unlock(&esw->offloads.decap_tbl_lock); + + mlx5e_decap_dealloc(priv, d); +} + static void mlx5e_detach_encap(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, int out_index) { @@ -1744,6 +1764,29 @@ static void mlx5e_detach_encap(struct mlx5e_priv *priv, mlx5e_encap_dealloc(priv, e); } +static void mlx5e_detach_decap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_decap_entry *d = flow->decap_reformat; + + if (!d) + return; + + mutex_lock(&esw->offloads.decap_tbl_lock); + list_del(&flow->l3_to_l2_reformat); + flow->decap_reformat = NULL; + + if (!refcount_dec_and_test(&d->refcnt)) { + mutex_unlock(&esw->offloads.decap_tbl_lock); + return; + } + hash_del_rcu(&d->hlist); + mutex_unlock(&esw->offloads.decap_tbl_lock); + + mlx5e_decap_dealloc(priv, d); +} + static void __mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow) { struct mlx5_eswitch *esw = flow->priv->mdev->priv.eswitch; @@ -2015,7 +2058,11 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, return err; } - flow->esw_attr->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP; + /* With 
mpls over udp we decapsulate using packet reformat + * object + */ + if (!netif_is_bareudp(filter_dev)) + flow->esw_attr->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP; } if (!needs_mapping && !sets_mapping) @@ -2098,6 +2145,20 @@ static int mlx5e_flower_parse_meta(struct net_device *filter_dev, return 0; } +static bool skip_key_basic(struct net_device *filter_dev, + struct flow_cls_offload *f) +{ + /* When doing mpls over udp decap, the user needs to provide + * MPLS_UC as the protocol in order to be able to match on mpls + * label fields. However, the actual ethertype is IP so we want to + * avoid matching on this, otherwise we'll fail the match. + */ + if (netif_is_bareudp(filter_dev) && f->common.chain_index == 0) + return true; + + return false; +} + static int __parse_cls_flower(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, struct mlx5_flow_spec *spec, @@ -2142,7 +2203,8 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, BIT(FLOW_DISSECTOR_KEY_IP) | BIT(FLOW_DISSECTOR_KEY_CT) | BIT(FLOW_DISSECTOR_KEY_ENC_IP) | - BIT(FLOW_DISSECTOR_KEY_ENC_OPTS))) { + BIT(FLOW_DISSECTOR_KEY_ENC_OPTS) | + BIT(FLOW_DISSECTOR_KEY_MPLS))) { NL_SET_ERR_MSG_MOD(extack, "Unsupported key"); netdev_warn(priv->netdev, "Unsupported key used: 0x%x\n", dissector->used_keys); @@ -2172,7 +2234,8 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, if (err) return err; - if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC)) { + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC) && + !skip_key_basic(filter_dev, f)) { struct flow_match_basic match; flow_rule_match_basic(rule, &match); @@ -2837,10 +2900,12 @@ void dealloc_mod_hdr_actions(struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts) static const struct pedit_headers zero_masks = {}; -static int parse_tc_pedit_action(struct mlx5e_priv *priv, - const struct flow_action_entry *act, int namespace, - struct pedit_headers_action *hdrs, - struct netlink_ext_ack *extack) +static int +parse_pedit_to_modify_hdr(struct mlx5e_priv *priv, + const struct flow_action_entry *act, int namespace, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct pedit_headers_action *hdrs, + struct netlink_ext_ack *extack) { u8 cmd = (act->id == FLOW_ACTION_MANGLE) ? 
0 : 1; int err = -EOPNOTSUPP; @@ -2876,6 +2941,46 @@ out_err: return err; } +static int +parse_pedit_to_reformat(struct mlx5e_priv *priv, + const struct flow_action_entry *act, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct netlink_ext_ack *extack) +{ + u32 mask, val, offset; + u32 *p; + + if (act->id != FLOW_ACTION_MANGLE) + return -EOPNOTSUPP; + + if (act->mangle.htype != FLOW_ACT_MANGLE_HDR_TYPE_ETH) { + NL_SET_ERR_MSG_MOD(extack, "Only Ethernet modification is supported"); + return -EOPNOTSUPP; + } + + mask = ~act->mangle.mask; + val = act->mangle.val; + offset = act->mangle.offset; + p = (u32 *)&parse_attr->eth; + *(p + (offset >> 2)) |= (val & mask); + + return 0; +} + +static int parse_tc_pedit_action(struct mlx5e_priv *priv, + const struct flow_action_entry *act, int namespace, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct pedit_headers_action *hdrs, + struct mlx5e_tc_flow *flow, + struct netlink_ext_ack *extack) +{ + if (flow && flow_flag_test(flow, L3_TO_L2_DECAP)) + return parse_pedit_to_reformat(priv, act, parse_attr, extack); + + return parse_pedit_to_modify_hdr(priv, act, namespace, + parse_attr, hdrs, extack); +} + static int alloc_tc_pedit_action(struct mlx5e_priv *priv, int namespace, struct mlx5e_tc_flow_parse_attr *parse_attr, struct pedit_headers_action *hdrs, @@ -3134,7 +3239,7 @@ static int add_vlan_rewrite_action(struct mlx5e_priv *priv, int namespace, return -EOPNOTSUPP; } - err = parse_tc_pedit_action(priv, &pedit_act, namespace, hdrs, NULL); + err = parse_tc_pedit_action(priv, &pedit_act, namespace, parse_attr, hdrs, NULL, extack); *action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; return err; @@ -3200,7 +3305,7 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, case FLOW_ACTION_MANGLE: case FLOW_ACTION_ADD: err = parse_tc_pedit_action(priv, act, MLX5_FLOW_NAMESPACE_KERNEL, - hdrs, extack); + parse_attr, hdrs, NULL, extack); if (err) return err; @@ -3294,12 +3399,22 @@ static inline int cmp_encap_info(struct encap_key *a, a->tc_tunnel->tunnel_type != b->tc_tunnel->tunnel_type; } +static inline int cmp_decap_info(struct mlx5e_decap_key *a, + struct mlx5e_decap_key *b) +{ + return memcmp(&a->key, &b->key, sizeof(b->key)); +} + static inline int hash_encap_info(struct encap_key *key) { return jhash(key->ip_tun_key, sizeof(*key->ip_tun_key), key->tc_tunnel->tunnel_type); } +static inline int hash_decap_info(struct mlx5e_decap_key *key) +{ + return jhash(&key->key, sizeof(key->key), 0); +} static bool is_merged_eswitch_dev(struct mlx5e_priv *priv, struct net_device *peer_netdev) @@ -3314,13 +3429,16 @@ static bool is_merged_eswitch_dev(struct mlx5e_priv *priv, same_hw_devs(priv, peer_priv)); } - - bool mlx5e_encap_take(struct mlx5e_encap_entry *e) { return refcount_inc_not_zero(&e->refcnt); } +static bool mlx5e_decap_take(struct mlx5e_decap_entry *e) +{ + return refcount_inc_not_zero(&e->refcnt); +} + static struct mlx5e_encap_entry * mlx5e_encap_get(struct mlx5e_priv *priv, struct encap_key *key, uintptr_t hash_key) @@ -3341,6 +3459,24 @@ mlx5e_encap_get(struct mlx5e_priv *priv, struct encap_key *key, return NULL; } +static struct mlx5e_decap_entry * +mlx5e_decap_get(struct mlx5e_priv *priv, struct mlx5e_decap_key *key, + uintptr_t hash_key) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_decap_key r_key; + struct mlx5e_decap_entry *e; + + hash_for_each_possible_rcu(esw->offloads.decap_tbl, e, + hlist, hash_key) { + r_key = e->key; + if (!cmp_decap_info(&r_key, key) && + mlx5e_decap_take(e)) + return e; + } + return NULL; 
+} + static struct ip_tunnel_info *dup_tun_info(const struct ip_tunnel_info *tun_info) { size_t tun_size = sizeof(*tun_info) + tun_info->options_len; @@ -3486,6 +3622,84 @@ out_err_init: return err; } +static int mlx5e_attach_decap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_esw_flow_attr *attr = flow->esw_attr; + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5e_decap_entry *d; + struct mlx5e_decap_key key; + uintptr_t hash_key; + int err; + + parse_attr = attr->parse_attr; + if (sizeof(parse_attr->eth) > MLX5_CAP_ESW(priv->mdev, max_encap_header_size)) { + NL_SET_ERR_MSG_MOD(extack, + "encap header larger than max supported"); + return -EOPNOTSUPP; + } + + key.key = parse_attr->eth; + hash_key = hash_decap_info(&key); + mutex_lock(&esw->offloads.decap_tbl_lock); + d = mlx5e_decap_get(priv, &key, hash_key); + if (d) { + mutex_unlock(&esw->offloads.decap_tbl_lock); + wait_for_completion(&d->res_ready); + mutex_lock(&esw->offloads.decap_tbl_lock); + if (d->compl_result) { + err = -EREMOTEIO; + goto out_free; + } + goto found; + } + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) { + err = -ENOMEM; + goto out_err; + } + + d->key = key; + refcount_set(&d->refcnt, 1); + init_completion(&d->res_ready); + INIT_LIST_HEAD(&d->flows); + hash_add_rcu(esw->offloads.decap_tbl, &d->hlist, hash_key); + mutex_unlock(&esw->offloads.decap_tbl_lock); + + d->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev, + MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2, + sizeof(parse_attr->eth), + &parse_attr->eth, + MLX5_FLOW_NAMESPACE_FDB); + if (IS_ERR(d->pkt_reformat)) { + err = PTR_ERR(d->pkt_reformat); + d->compl_result = err; + } + mutex_lock(&esw->offloads.decap_tbl_lock); + complete_all(&d->res_ready); + if (err) + goto out_free; + +found: + flow->decap_reformat = d; + attr->decap_pkt_reformat = d->pkt_reformat; + list_add(&flow->l3_to_l2_reformat, &d->flows); + mutex_unlock(&esw->offloads.decap_tbl_lock); + return 0; + +out_free: + mutex_unlock(&esw->offloads.decap_tbl_lock); + mlx5e_decap_put(priv, d); + return err; + +out_err: + mutex_unlock(&esw->offloads.decap_tbl_lock); + return err; +} + static int parse_tc_vlan_action(struct mlx5e_priv *priv, const struct flow_action_entry *act, struct mlx5_esw_flow_attr *attr, @@ -3697,7 +3911,8 @@ static int verify_uplink_forwarding(struct mlx5e_priv *priv, static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct flow_action *flow_action, struct mlx5e_tc_flow *flow, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + struct net_device *filter_dev) { struct pedit_headers_action hdrs[2] = {}; struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; @@ -3711,6 +3926,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, bool encap = false, decap = false; u32 action = attr->action; int err, i, if_count = 0; + bool mpls_push = false; if (!flow_action_has_entries(flow_action)) return -EINVAL; @@ -3725,15 +3941,48 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, action |= MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT; break; + case FLOW_ACTION_MPLS_PUSH: + if (!MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, + reformat_l2_to_l3_tunnel) || + act->mpls_push.proto != htons(ETH_P_MPLS_UC)) { + NL_SET_ERR_MSG_MOD(extack, + "mpls push is supported only for mpls_uc protocol"); + return -EOPNOTSUPP; + } + mpls_push = true; + break; + case FLOW_ACTION_MPLS_POP: + /* we only support mpls pop if it is the first action + * 
and the filter net device is bareudp. Subsequent + * actions can be pedit and the last can be mirred + * egress redirect. + */ + if (i) { + NL_SET_ERR_MSG_MOD(extack, + "mpls pop supported only as first action"); + return -EOPNOTSUPP; + } + if (!netif_is_bareudp(filter_dev)) { + NL_SET_ERR_MSG_MOD(extack, + "mpls pop supported only on bareudp devices"); + return -EOPNOTSUPP; + } + + parse_attr->eth.h_proto = act->mpls_pop.proto; + action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + flow_flag_set(flow, L3_TO_L2_DECAP); + break; case FLOW_ACTION_MANGLE: case FLOW_ACTION_ADD: err = parse_tc_pedit_action(priv, act, MLX5_FLOW_NAMESPACE_FDB, - hdrs, extack); + parse_attr, hdrs, flow, extack); if (err) return err; - action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; - attr->split_count = attr->out_count; + if (!flow_flag_test(flow, L3_TO_L2_DECAP)) { + action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + attr->split_count = attr->out_count; + } break; case FLOW_ACTION_CSUM: if (csum_offload_supported(priv, action, @@ -3755,6 +4004,12 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, return -EINVAL; } + if (mpls_push && !netif_is_bareudp(out_dev)) { + NL_SET_ERR_MSG_MOD(extack, + "mpls is supported only through a bareudp device"); + return -EOPNOTSUPP; + } + if (ft_flow && out_dev == priv->netdev) { /* Ignore forward to self rules generated * by adding both mlx5 devs to the flow table @@ -4085,6 +4340,7 @@ mlx5e_alloc_flow(struct mlx5e_priv *priv, int attr_size, INIT_LIST_HEAD(&flow->encaps[out_index].list); INIT_LIST_HEAD(&flow->mod_hdr); INIT_LIST_HEAD(&flow->hairpin); + INIT_LIST_HEAD(&flow->l3_to_l2_reformat); refcount_set(&flow->refcnt, 1); init_completion(&flow->init_done); @@ -4154,7 +4410,7 @@ __mlx5e_add_fdb_flow(struct mlx5e_priv *priv, if (err) goto err_free; - err = parse_tc_fdb_actions(priv, &rule->action, flow, extack); + err = parse_tc_fdb_actions(priv, &rule->action, flow, extack, filter_dev); if (err) goto err_free; @@ -4806,148 +5062,35 @@ void mlx5e_tc_reoffload_flows_work(struct work_struct *work) mutex_unlock(&rpriv->unready_flows_lock); } -#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) -static bool mlx5e_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb, - struct mlx5e_tc_update_priv *tc_priv, - u32 tunnel_id) -{ - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct tunnel_match_enc_opts enc_opts = {}; - struct mlx5_rep_uplink_priv *uplink_priv; - struct mlx5e_rep_priv *uplink_rpriv; - struct metadata_dst *tun_dst; - struct tunnel_match_key key; - u32 tun_id, enc_opts_id; - struct net_device *dev; - int err; - - enc_opts_id = tunnel_id & ENC_OPTS_BITS_MASK; - tun_id = tunnel_id >> ENC_OPTS_BITS; - - if (!tun_id) - return true; - - uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); - uplink_priv = &uplink_rpriv->uplink_priv; - - err = mapping_find(uplink_priv->tunnel_mapping, tun_id, &key); - if (err) { - WARN_ON_ONCE(true); - netdev_dbg(priv->netdev, - "Couldn't find tunnel for tun_id: %d, err: %d\n", - tun_id, err); - return false; - } - - if (enc_opts_id) { - err = mapping_find(uplink_priv->tunnel_enc_opts_mapping, - enc_opts_id, &enc_opts); - if (err) { - netdev_dbg(priv->netdev, - "Couldn't find tunnel (opts) for tun_id: %d, err: %d\n", - enc_opts_id, err); - return false; - } - } - - tun_dst = tun_rx_dst(enc_opts.key.len); - if (!tun_dst) { - WARN_ON_ONCE(true); - return false; - } - - ip_tunnel_key_init(&tun_dst->u.tun_info.key, - key.enc_ipv4.src, key.enc_ipv4.dst, - key.enc_ip.tos, key.enc_ip.ttl, - 0, /* label */ - key.enc_tp.src, key.enc_tp.dst, - 
key32_to_tunnel_id(key.enc_key_id.keyid), - TUNNEL_KEY); - - if (enc_opts.key.len) - ip_tunnel_info_opts_set(&tun_dst->u.tun_info, - enc_opts.key.data, - enc_opts.key.len, - enc_opts.key.dst_opt_type); - - skb_dst_set(skb, (struct dst_entry *)tun_dst); - dev = dev_get_by_index(&init_net, key.filter_ifindex); - if (!dev) { - netdev_dbg(priv->netdev, - "Couldn't find tunnel device with ifindex: %d\n", - key.filter_ifindex); - return false; +static int mlx5e_setup_tc_cls_flower(struct mlx5e_priv *priv, + struct flow_cls_offload *cls_flower, + unsigned long flags) +{ + switch (cls_flower->command) { + case FLOW_CLS_REPLACE: + return mlx5e_configure_flower(priv->netdev, priv, cls_flower, + flags); + case FLOW_CLS_DESTROY: + return mlx5e_delete_flower(priv->netdev, priv, cls_flower, + flags); + case FLOW_CLS_STATS: + return mlx5e_stats_flower(priv->netdev, priv, cls_flower, + flags); + default: + return -EOPNOTSUPP; } - - /* Set tun_dev so we do dev_put() after datapath */ - tc_priv->tun_dev = dev; - - skb->dev = dev; - - return true; } -#endif /* CONFIG_NET_TC_SKB_EXT */ -bool mlx5e_tc_rep_update_skb(struct mlx5_cqe64 *cqe, - struct sk_buff *skb, - struct mlx5e_tc_update_priv *tc_priv) +int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) { -#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - u32 chain = 0, reg_c0, reg_c1, tunnel_id, tuple_id; - struct mlx5_rep_uplink_priv *uplink_priv; - struct mlx5e_rep_priv *uplink_rpriv; - struct tc_skb_ext *tc_skb_ext; - struct mlx5_eswitch *esw; - struct mlx5e_priv *priv; - int tunnel_moffset; - int err; - - reg_c0 = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK); - if (reg_c0 == MLX5_FS_DEFAULT_FLOW_TAG) - reg_c0 = 0; - reg_c1 = be32_to_cpu(cqe->ft_metadata); - - if (!reg_c0) - return true; - - priv = netdev_priv(skb->dev); - esw = priv->mdev->priv.eswitch; - - err = mlx5_eswitch_get_chain_for_tag(esw, reg_c0, &chain); - if (err) { - netdev_dbg(priv->netdev, - "Couldn't find chain for chain tag: %d, err: %d\n", - reg_c0, err); - return false; - } - - if (chain) { - tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT); - if (!tc_skb_ext) { - WARN_ON(1); - return false; - } - - tc_skb_ext->chain = chain; + unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(NIC_OFFLOAD); + struct mlx5e_priv *priv = cb_priv; - tuple_id = reg_c1 & TUPLE_ID_MAX; - - uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); - uplink_priv = &uplink_rpriv->uplink_priv; - if (!mlx5e_tc_ct_restore_flow(uplink_priv, skb, tuple_id)) - return false; + switch (type) { + case TC_SETUP_CLSFLOWER: + return mlx5e_setup_tc_cls_flower(priv, type_data, flags); + default: + return -EOPNOTSUPP; } - - tunnel_moffset = mlx5e_tc_attr_to_reg_mappings[TUNNEL_TO_REG].moffset; - tunnel_id = reg_c1 >> (8 * tunnel_moffset); - return mlx5e_restore_tunnel(priv, skb, tc_priv, tunnel_id); -#endif /* CONFIG_NET_TC_SKB_EXT */ - - return true; -} - -void mlx5_tc_rep_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv) -{ - if (tc_priv->tun_dev) - dev_put(tc_priv->tun_dev); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h index abdcfa4c4e0e..037aa73bf9ab 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h @@ -34,11 +34,41 @@ #define __MLX5_EN_TC_H__ #include <net/pkt_cls.h> +#include "en.h" #define MLX5E_TC_FLOW_ID_MASK 0x0000ffff #ifdef CONFIG_MLX5_ESWITCH +struct tunnel_match_key { + struct flow_dissector_key_control enc_control; + struct 
flow_dissector_key_keyid enc_key_id; + struct flow_dissector_key_ports enc_tp; + struct flow_dissector_key_ip enc_ip; + union { + struct flow_dissector_key_ipv4_addrs enc_ipv4; + struct flow_dissector_key_ipv6_addrs enc_ipv6; + }; + + int filter_ifindex; +}; + +struct tunnel_match_enc_opts { + struct flow_dissector_key_enc_opts key; + struct flow_dissector_key_enc_opts mask; +}; + +/* Tunnel_id mapping is TUNNEL_INFO_BITS + ENC_OPTS_BITS. + * Upper TUNNEL_INFO_BITS for general tunnel info. + * Lower ENC_OPTS_BITS bits for enc_opts. + */ +#define TUNNEL_INFO_BITS 6 +#define TUNNEL_INFO_BITS_MASK GENMASK(TUNNEL_INFO_BITS - 1, 0) +#define ENC_OPTS_BITS 2 +#define ENC_OPTS_BITS_MASK GENMASK(ENC_OPTS_BITS - 1, 0) +#define TUNNEL_ID_BITS (TUNNEL_INFO_BITS + ENC_OPTS_BITS) +#define TUNNEL_ID_MASK GENMASK(TUNNEL_ID_BITS - 1, 0) + enum { MLX5E_TC_FLAG_INGRESS_BIT, MLX5E_TC_FLAG_EGRESS_BIT, @@ -50,9 +80,6 @@ enum { #define MLX5_TC_FLAG(flag) BIT(MLX5E_TC_FLAG_##flag##_BIT) -int mlx5e_tc_nic_init(struct mlx5e_priv *priv); -void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv); - int mlx5e_tc_esw_init(struct rhashtable *tc_ht); void mlx5e_tc_esw_cleanup(struct rhashtable *tc_ht); @@ -119,11 +146,6 @@ struct mlx5e_tc_update_priv { struct net_device *tun_dev; }; -bool mlx5e_tc_rep_update_skb(struct mlx5_cqe64 *cqe, struct sk_buff *skb, - struct mlx5e_tc_update_priv *tc_priv); - -void mlx5_tc_rep_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv); - struct mlx5e_tc_mod_hdr_acts { int num_actions; int max_actions; @@ -148,6 +170,22 @@ void dealloc_mod_hdr_actions(struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts); struct mlx5e_tc_flow; u32 mlx5e_tc_get_flow_tun_id(struct mlx5e_tc_flow *flow); +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + +int mlx5e_tc_nic_init(struct mlx5e_priv *priv); +void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv); + +int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, + void *cb_priv); + +#else /* CONFIG_MLX5_CLS_ACT */ +static inline int mlx5e_tc_nic_init(struct mlx5e_priv *priv) { return 0; } +static inline void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv) {} +static inline int +mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) +{ return -EOPNOTSUPP; } +#endif /* CONFIG_MLX5_CLS_ACT */ + #else /* CONFIG_MLX5_ESWITCH */ static inline int mlx5e_tc_nic_init(struct mlx5e_priv *priv) { return 0; } static inline void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv) {} @@ -156,6 +194,10 @@ static inline int mlx5e_tc_num_filters(struct mlx5e_priv *priv, { return 0; } + +static inline int +mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) +{ return -EOPNOTSUPP; } #endif #endif /* __MLX5_EN_TC_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/chains.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/chains.h index f8c4239846ea..7679ac359e31 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/chains.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/chains.h @@ -6,6 +6,8 @@ #include "eswitch.h" +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + bool mlx5_esw_chains_prios_supported(struct mlx5_eswitch *esw); bool @@ -46,4 +48,21 @@ void mlx5_esw_chains_destroy(struct mlx5_eswitch *esw); int mlx5_eswitch_get_chain_for_tag(struct mlx5_eswitch *esw, u32 tag, u32 *chain); +#else /* CONFIG_MLX5_CLS_ACT */ + +static inline struct mlx5_flow_table * +mlx5_esw_chains_get_table(struct mlx5_eswitch *esw, u32 chain, u32 prio, + u32 level) { return ERR_PTR(-EOPNOTSUPP); } +static inline void +mlx5_esw_chains_put_table(struct 
mlx5_eswitch *esw, u32 chain, u32 prio, + u32 level) {} + +static inline struct mlx5_flow_table * +mlx5_esw_chains_get_tc_end_ft(struct mlx5_eswitch *esw) { return ERR_PTR(-EOPNOTSUPP); } + +static inline int mlx5_esw_chains_create(struct mlx5_eswitch *esw) { return 0; } +static inline void mlx5_esw_chains_destroy(struct mlx5_eswitch *esw) {} + +#endif /* CONFIG_MLX5_CLS_ACT */ + #endif /* __ML5_ESW_CHAINS_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index c5eb4e7754a9..ac79b7c9aeb3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -2262,6 +2262,8 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) hash_init(esw->offloads.encap_tbl); mutex_init(&esw->offloads.mod_hdr.lock); hash_init(esw->offloads.mod_hdr.hlist); + mutex_init(&esw->offloads.decap_tbl_lock); + hash_init(esw->offloads.decap_tbl); atomic64_set(&esw->offloads.num_flows, 0); mutex_init(&esw->state_lock); mutex_init(&esw->mode_lock); @@ -2303,6 +2305,7 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) mutex_destroy(&esw->state_lock); mutex_destroy(&esw->offloads.mod_hdr.lock); mutex_destroy(&esw->offloads.encap_tbl_lock); + mutex_destroy(&esw->offloads.decap_tbl_lock); kfree(esw->vports); kfree(esw); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 4a1c6c78bb14..ccbbea3e0505 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -209,6 +209,8 @@ struct mlx5_esw_offload { struct mutex peer_mutex; struct mutex encap_tbl_lock; /* protects encap_tbl */ DECLARE_HASHTABLE(encap_tbl, 8); + struct mutex decap_tbl_lock; /* protects decap_tbl */ + DECLARE_HASHTABLE(decap_tbl, 8); struct mod_hdr_tbl mod_hdr; DECLARE_HASHTABLE(termtbl_tbl, 8); struct mutex termtbl_mutex; /* protects termtbl hash */ @@ -432,6 +434,7 @@ struct mlx5_esw_flow_attr { struct mlx5_flow_table *fdb; struct mlx5_flow_table *dest_ft; struct mlx5_ct_attr ct_attr; + struct mlx5_pkt_reformat *decap_pkt_reformat; struct mlx5e_tc_flow_parse_attr *parse_attr; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 57ac2ef52e80..554fc64d8ef6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -366,6 +366,10 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, } } } + + if (attr->decap_pkt_reformat) + flow_act.pkt_reformat = attr->decap_pkt_reformat; + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { dest[i].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; dest[i].counter_id = mlx5_fc_id(attr->counter); @@ -1727,7 +1731,9 @@ static int mlx5_esw_offloads_pair(struct mlx5_eswitch *esw, static void mlx5_esw_offloads_unpair(struct mlx5_eswitch *esw) { +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) mlx5e_tc_clean_fdb_peer_flows(esw); +#endif esw_del_fdb_peer_miss_rules(esw); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c index 8809a65ecefb..e042e0924079 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c @@ -144,11 +144,11 @@ static int mlx5_set_entropy(struct mlx5_tun_entropy *tun_entropy, int mlx5_tun_entropy_refcount_inc(struct mlx5_tun_entropy *tun_entropy, int 
reformat_type) { - /* the default is error for unknown (non VXLAN/GRE tunnel types) */ int err = -EOPNOTSUPP; mutex_lock(&tun_entropy->lock); - if (reformat_type == MLX5_REFORMAT_TYPE_L2_TO_VXLAN && + if ((reformat_type == MLX5_REFORMAT_TYPE_L2_TO_VXLAN || + reformat_type == MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL) && tun_entropy->enabled) { /* in case entropy calculation is enabled for all tunneling * types, it is ok for VXLAN, so approve. diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index b286fe158820..51e1b3930c56 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -30,14 +30,14 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, return -EOPNOTSUPP; act = flow_action_first_entry_get(flow_action); - if (act->hw_stats == FLOW_ACTION_HW_STATS_ANY || - act->hw_stats == FLOW_ACTION_HW_STATS_IMMEDIATE) { + if (act->hw_stats & FLOW_ACTION_HW_STATS_DISABLED) { + /* Nothing to do */ + } else if (act->hw_stats & FLOW_ACTION_HW_STATS_IMMEDIATE) { /* Count action is inserted first */ err = mlxsw_sp_acl_rulei_act_count(mlxsw_sp, rulei, extack); if (err) return err; - } else if (act->hw_stats != FLOW_ACTION_HW_STATS_DISABLED && - act->hw_stats != FLOW_ACTION_HW_STATS_DONT_CARE) { + } else { NL_SET_ERR_MSG_MOD(extack, "Unsupported action HW stats type"); return -EOPNOTSUPP; } diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index 1e0c024b0a93..8e4141552423 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -50,7 +50,6 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, xdp->data_end = xdp->data + len; xdp->rxq = &nvchan->xdp_rxq; xdp->frame_sz = PAGE_SIZE; - xdp->handle = 0; memcpy(xdp->data, data, len); diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index acd51b29a476..822b3acf6be7 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -126,7 +126,7 @@ #define ATH8031_PHY_ID 0x004dd074 #define ATH8032_PHY_ID 0x004dd023 #define ATH8035_PHY_ID 0x004dd072 -#define AT803X_PHY_ID_MASK 0xffffffef +#define AT8030_PHY_ID_MASK 0xffffffef MODULE_DESCRIPTION("Qualcomm Atheros AR803x PHY driver"); MODULE_AUTHOR("Matus Ujhelyi"); @@ -967,9 +967,8 @@ static int at803x_cable_test_start(struct phy_device *phydev) static struct phy_driver at803x_driver[] = { { /* Qualcomm Atheros AR8035 */ - .phy_id = ATH8035_PHY_ID, + PHY_ID_MATCH_EXACT(ATH8035_PHY_ID), .name = "Qualcomm Atheros AR8035", - .phy_id_mask = AT803X_PHY_ID_MASK, .flags = PHY_POLL_CABLE_TEST, .probe = at803x_probe, .remove = at803x_remove, @@ -991,7 +990,7 @@ static struct phy_driver at803x_driver[] = { /* Qualcomm Atheros AR8030 */ .phy_id = ATH8030_PHY_ID, .name = "Qualcomm Atheros AR8030", - .phy_id_mask = AT803X_PHY_ID_MASK, + .phy_id_mask = AT8030_PHY_ID_MASK, .probe = at803x_probe, .remove = at803x_remove, .config_init = at803x_config_init, @@ -1005,9 +1004,8 @@ static struct phy_driver at803x_driver[] = { .config_intr = at803x_config_intr, }, { /* Qualcomm Atheros AR8031/AR8033 */ - .phy_id = ATH8031_PHY_ID, + PHY_ID_MATCH_EXACT(ATH8031_PHY_ID), .name = "Qualcomm Atheros AR8031/AR8033", - .phy_id_mask = AT803X_PHY_ID_MASK, .flags = PHY_POLL_CABLE_TEST, .probe = at803x_probe, .remove = at803x_remove, @@ -1055,10 +1053,10 @@ static struct phy_driver at803x_driver[] = { module_phy_driver(at803x_driver); static struct mdio_device_id __maybe_unused atheros_tbl[] 
= { - { ATH8030_PHY_ID, AT803X_PHY_ID_MASK }, - { ATH8031_PHY_ID, AT803X_PHY_ID_MASK }, + { ATH8030_PHY_ID, AT8030_PHY_ID_MASK }, + { PHY_ID_MATCH_EXACT(ATH8031_PHY_ID) }, { PHY_ID_MATCH_EXACT(ATH8032_PHY_ID) }, - { ATH8035_PHY_ID, AT803X_PHY_ID_MASK }, + { PHY_ID_MATCH_EXACT(ATH8035_PHY_ID) }, { PHY_ID_MATCH_EXACT(ATH9331_PHY_ID) }, { } }; diff --git a/drivers/net/phy/dp83869.c b/drivers/net/phy/dp83869.c index 7996a4aea8d2..cfb22a21a2e6 100644 --- a/drivers/net/phy/dp83869.c +++ b/drivers/net/phy/dp83869.c @@ -65,7 +65,9 @@ #define DP83869_RGMII_RX_CLK_DELAY_EN BIT(0) /* STRAP_STS1 bits */ +#define DP83869_STRAP_OP_MODE_MASK GENMASK(2, 0) #define DP83869_STRAP_STS1_RESERVED BIT(11) +#define DP83869_STRAP_MIRROR_ENABLED BIT(12) /* PHYCTRL bits */ #define DP83869_RX_FIFO_SHIFT 12 @@ -160,6 +162,20 @@ static int dp83869_config_port_mirroring(struct phy_device *phydev) DP83869_CFG3_PORT_MIRROR_EN); } +static int dp83869_set_strapped_mode(struct phy_device *phydev) +{ + struct dp83869_private *dp83869 = phydev->priv; + int val; + + val = phy_read_mmd(phydev, DP83869_DEVADDR, DP83869_STRAP_STS1); + if (val < 0) + return val; + + dp83869->mode = val & DP83869_STRAP_OP_MODE_MASK; + + return 0; +} + #ifdef CONFIG_OF_MDIO static int dp83869_of_init(struct phy_device *phydev) { @@ -184,6 +200,10 @@ static int dp83869_of_init(struct phy_device *phydev) if (dp83869->mode < DP83869_RGMII_COPPER_ETHERNET || dp83869->mode > DP83869_SGMII_COPPER_ETHERNET) return -EINVAL; + } else { + ret = dp83869_set_strapped_mode(phydev); + if (ret) + return ret; } if (of_property_read_bool(of_node, "ti,max-output-impedance")) @@ -191,10 +211,18 @@ static int dp83869_of_init(struct phy_device *phydev) else if (of_property_read_bool(of_node, "ti,min-output-impedance")) dp83869->io_impedance = DP83869_IO_MUX_CFG_IO_IMPEDANCE_MIN; - if (of_property_read_bool(of_node, "enet-phy-lane-swap")) + if (of_property_read_bool(of_node, "enet-phy-lane-swap")) { dp83869->port_mirroring = DP83869_PORT_MIRRORING_EN; - else - dp83869->port_mirroring = DP83869_PORT_MIRRORING_DIS; + } else { + /* If the lane swap is not in the DT then check the straps */ + ret = phy_read_mmd(phydev, DP83869_DEVADDR, DP83869_STRAP_STS1); + if (ret < 0) + return ret; + if (ret & DP83869_STRAP_MIRROR_ENABLED) + dp83869->port_mirroring = DP83869_PORT_MIRRORING_EN; + else + dp83869->port_mirroring = DP83869_PORT_MIRRORING_DIS; + } if (of_property_read_u32(of_node, "rx-fifo-depth", &dp83869->rx_fifo_depth)) @@ -209,7 +237,7 @@ static int dp83869_of_init(struct phy_device *phydev) #else static int dp83869_of_init(struct phy_device *phydev) { - return 0; + return dp83869_set_strapped_mode(phydev); } #endif /* CONFIG_OF_MDIO */ diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index a5b415fed11e..3e88fbef2d4a 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -26,6 +26,7 @@ #include <net/netns/generic.h> #include <net/tun_proto.h> #include <net/vxlan.h> +#include <net/nexthop.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ip6_tunnel.h> @@ -78,6 +79,9 @@ struct vxlan_fdb { u16 state; /* see ndm_state */ __be32 vni; u16 flags; /* see ndm_flags and below */ + struct list_head nh_list; + struct nexthop __rcu *nh; + struct vxlan_dev *vdev; }; #define NTF_VXLAN_ADDED_BY_USER 0x100 @@ -174,11 +178,15 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port) */ static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb) { + if (rcu_access_pointer(fdb->nh)) + return NULL; return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, 
list); } static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) { + if (rcu_access_pointer(fdb->nh)) + return NULL; return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); } @@ -251,9 +259,10 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, { unsigned long now = jiffies; struct nda_cacheinfo ci; + bool send_ip, send_eth; struct nlmsghdr *nlh; + struct nexthop *nh; struct ndmsg *ndm; - bool send_ip, send_eth; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) @@ -264,16 +273,21 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, send_eth = send_ip = true; + nh = rcu_dereference_rtnl(fdb->nh); if (type == RTM_GETNEIGH) { - send_ip = !vxlan_addr_any(&rdst->remote_ip); + if (rdst) { + send_ip = !vxlan_addr_any(&rdst->remote_ip); + ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET; + } else if (nh) { + ndm->ndm_family = nexthop_get_family(nh); + } send_eth = !is_zero_ether_addr(fdb->eth_addr); - ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET; } else ndm->ndm_family = AF_BRIDGE; ndm->ndm_state = fdb->state; ndm->ndm_ifindex = vxlan->dev->ifindex; ndm->ndm_flags = fdb->flags; - if (rdst->offloaded) + if (rdst && rdst->offloaded) ndm->ndm_flags |= NTF_OFFLOADED; ndm->ndm_type = RTN_UNICAST; @@ -284,23 +298,30 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) goto nla_put_failure; + if (nh) { + if (nla_put_u32(skb, NDA_NH_ID, nh->id)) + goto nla_put_failure; + } else if (rdst) { + if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, + &rdst->remote_ip)) + goto nla_put_failure; + + if (rdst->remote_port && + rdst->remote_port != vxlan->cfg.dst_port && + nla_put_be16(skb, NDA_PORT, rdst->remote_port)) + goto nla_put_failure; + if (rdst->remote_vni != vxlan->default_dst.remote_vni && + nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) + goto nla_put_failure; + if (rdst->remote_ifindex && + nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) + goto nla_put_failure; + } - if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) - goto nla_put_failure; - - if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port && - nla_put_be16(skb, NDA_PORT, rdst->remote_port)) - goto nla_put_failure; - if (rdst->remote_vni != vxlan->default_dst.remote_vni && - nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) - goto nla_put_failure; if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni && nla_put_u32(skb, NDA_SRC_VNI, be32_to_cpu(fdb->vni))) goto nla_put_failure; - if (rdst->remote_ifindex && - nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) - goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ci.ndm_confirmed = 0; @@ -401,7 +422,7 @@ static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, { int err; - if (swdev_notify) { + if (swdev_notify && rd) { switch (type) { case RTM_NEWNEIGH: err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, @@ -793,8 +814,9 @@ static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); } -static struct vxlan_fdb *vxlan_fdb_alloc(const u8 *mac, __u16 state, - __be32 src_vni, __u16 ndm_flags) +static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac, + __u16 state, __be32 src_vni, + __u16 ndm_flags) { struct vxlan_fdb *f; @@ -805,6 +827,9 @@ static struct vxlan_fdb 
*vxlan_fdb_alloc(const u8 *mac, __u16 state, f->flags = ndm_flags; f->updated = f->used = jiffies; f->vni = src_vni; + f->nh = NULL; + f->vdev = vxlan; + INIT_LIST_HEAD(&f->nh_list); INIT_LIST_HEAD(&f->remotes); memcpy(f->eth_addr, mac, ETH_ALEN); @@ -819,11 +844,78 @@ static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac, vxlan_fdb_head(vxlan, mac, src_vni)); } +static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, + u32 nhid, struct netlink_ext_ack *extack) +{ + struct nexthop *old_nh = rtnl_dereference(fdb->nh); + struct nh_group *nhg; + struct nexthop *nh; + int err = -EINVAL; + + if (old_nh && old_nh->id == nhid) + return 0; + + nh = nexthop_find_by_id(vxlan->net, nhid); + if (!nh) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + goto err_inval; + } + + if (nh) { + if (!nexthop_get(nh)) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + nh = NULL; + goto err_inval; + } + if (!nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop"); + goto err_inval; + } + + if (!nh->is_group || !nh->nh_grp->mpath) { + NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group"); + goto err_inval; + } + + /* check nexthop group family */ + nhg = rtnl_dereference(nh->nh_grp); + switch (vxlan->default_dst.remote_ip.sa.sa_family) { + case AF_INET: + if (!nhg->has_v4) { + err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); + goto err_inval; + } + break; + case AF_INET6: + if (nhg->has_v4) { + err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); + goto err_inval; + } + } + } + + if (old_nh) { + list_del_rcu(&fdb->nh_list); + nexthop_put(old_nh); + } + rcu_assign_pointer(fdb->nh, nh); + list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list); + return 1; + +err_inval: + if (nh) + nexthop_put(nh); + return err; +} + static int vxlan_fdb_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, - struct vxlan_fdb **fdb) + u32 nhid, struct vxlan_fdb **fdb, + struct netlink_ext_ack *extack) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; @@ -834,24 +926,37 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, return -ENOSPC; netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); - f = vxlan_fdb_alloc(mac, state, src_vni, ndm_flags); + f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags); if (!f) return -ENOMEM; - rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); - if (rc < 0) { - kfree(f); - return rc; - } + if (nhid) + rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); + else + rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); + if (rc < 0) + goto errout; *fdb = f; return 0; + +errout: + kfree(f); + return rc; } static void __vxlan_fdb_free(struct vxlan_fdb *f) { struct vxlan_rdst *rd, *nd; + struct nexthop *nh; + + nh = rcu_dereference_raw(f->nh); + if (nh) { + rcu_assign_pointer(f->nh, NULL); + list_del_rcu(&f->nh_list); + nexthop_put(nh); + } list_for_each_entry_safe(rd, nd, &f->remotes, list) { dst_cache_destroy(&rd->dst_cache); @@ -875,12 +980,18 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr); --vxlan->addrcnt; - if (do_notify) - list_for_each_entry(rd, &f->remotes, list) - vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, + if (do_notify) { + if (rcu_access_pointer(f->nh)) + vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH, swdev_notify, NULL); + else + list_for_each_entry(rd, 
&f->remotes, list) + vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, + swdev_notify, NULL); + } hlist_del_rcu(&f->hlist); + f->vdev = NULL; call_rcu(&f->rcu, vxlan_fdb_free); } @@ -897,7 +1008,7 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, __u16 state, __u16 flags, __be16 port, __be32 vni, __u32 ifindex, __u16 ndm_flags, - struct vxlan_fdb *f, + struct vxlan_fdb *f, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { @@ -908,6 +1019,18 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, int rc = 0; int err; + if (nhid && !rcu_access_pointer(f->nh)) { + NL_SET_ERR_MSG(extack, + "Cannot replace an existing non nexthop fdb with a nexthop"); + return -EOPNOTSUPP; + } + + if (nhid && (flags & NLM_F_APPEND)) { + NL_SET_ERR_MSG(extack, + "Cannot append to a nexthop fdb"); + return -EOPNOTSUPP; + } + /* Do not allow an externally learned entry to take over an entry added * by the user. */ @@ -929,10 +1052,17 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, /* Only change unicasts */ if (!(is_multicast_ether_addr(f->eth_addr) || is_zero_ether_addr(f->eth_addr))) { - rc = vxlan_fdb_replace(f, ip, port, vni, - ifindex, &oldrd); + if (nhid) { + rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); + if (rc < 0) + return rc; + } else { + rc = vxlan_fdb_replace(f, ip, port, vni, + ifindex, &oldrd); + } notify |= rc; } else { + NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries"); return -EOPNOTSUPP; } } @@ -962,6 +1092,8 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, return 0; err_notify: + if (nhid) + return err; if ((flags & NLM_F_REPLACE) && rc) *rd = oldrd; else if ((flags & NLM_F_APPEND) && rc) { @@ -975,7 +1107,7 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, - __u32 ifindex, __u16 ndm_flags, + __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { @@ -990,7 +1122,7 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni, - vni, ifindex, fdb_flags, &f); + vni, ifindex, fdb_flags, nhid, &f, extack); if (rc < 0) return rc; @@ -1012,7 +1144,7 @@ static int vxlan_fdb_update(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, - __u32 ifindex, __u16 ndm_flags, + __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { @@ -1028,14 +1160,15 @@ static int vxlan_fdb_update(struct vxlan_dev *vxlan, return vxlan_fdb_update_existing(vxlan, ip, state, flags, port, vni, ifindex, ndm_flags, f, - swdev_notify, extack); + nhid, swdev_notify, extack); } else { if (!(flags & NLM_F_CREATE)) return -ENOENT; return vxlan_fdb_update_create(vxlan, mac, ip, state, flags, port, src_vni, vni, ifindex, - ndm_flags, swdev_notify, extack); + ndm_flags, nhid, swdev_notify, + extack); } } @@ -1049,7 +1182,7 @@ static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, union vxlan_addr *ip, __be16 *port, __be32 *src_vni, - __be32 *vni, u32 *ifindex) + __be32 *vni, u32 *ifindex, u32 *nhid) { struct net *net = dev_net(vxlan->dev); int err; @@ -1109,6 +1242,11 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, *ifindex = 0; } + if (tb[NDA_NH_ID]) + 
*nhid = nla_get_u32(tb[NDA_NH_ID]); + else + *nhid = 0; + return 0; } @@ -1123,7 +1261,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], union vxlan_addr ip; __be16 port; __be32 src_vni, vni; - u32 ifindex; + u32 ifindex, nhid; u32 hash_index; int err; @@ -1133,10 +1271,11 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], return -EINVAL; } - if (tb[NDA_DST] == NULL) + if (!tb || (!tb[NDA_DST] && !tb[NDA_NH_ID])) return -EINVAL; - err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex); + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, + &nhid); if (err) return err; @@ -1148,7 +1287,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, port, src_vni, vni, ifindex, ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER, - true, extack); + nhid, true, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; @@ -1159,8 +1298,8 @@ static int __vxlan_fdb_delete(struct vxlan_dev *vxlan, __be16 port, __be32 src_vni, __be32 vni, u32 ifindex, bool swdev_notify) { - struct vxlan_fdb *f; struct vxlan_rdst *rd = NULL; + struct vxlan_fdb *f; int err = -ENOENT; f = vxlan_find_mac(vxlan, addr, src_vni); @@ -1195,12 +1334,13 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct vxlan_dev *vxlan = netdev_priv(dev); union vxlan_addr ip; __be32 src_vni, vni; - __be16 port; - u32 ifindex; + u32 ifindex, nhid; u32 hash_index; + __be16 port; int err; - err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex); + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, + &nhid); if (err) return err; @@ -1228,6 +1368,17 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { struct vxlan_rdst *rd; + if (rcu_access_pointer(f->nh)) { + err = vxlan_fdb_info(skb, vxlan, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI, NULL); + if (err < 0) + goto out; + continue; + } + list_for_each_entry_rcu(rd, &f->remotes, list) { if (*idx < cb->args[2]) goto skip; @@ -1311,6 +1462,10 @@ static bool vxlan_snoop(struct net_device *dev, if (f->state & (NUD_PERMANENT | NUD_NOARP)) return true; + /* Don't override an fdb with nexthop with a learnt entry */ + if (rcu_access_pointer(f->nh)) + return true; + if (net_ratelimit()) netdev_info(dev, "%pM migrated from %pIS to %pIS\n", @@ -1333,7 +1488,7 @@ static bool vxlan_snoop(struct net_device *dev, vxlan->cfg.dst_port, vni, vxlan->default_dst.remote_vni, - ifindex, NTF_SELF, true, NULL); + ifindex, NTF_SELF, 0, true, NULL); spin_unlock(&vxlan->hash_lock[hash_index]); } @@ -2616,6 +2771,38 @@ tx_error: kfree_skb(skb); } +static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, + struct vxlan_fdb *f, __be32 vni, bool did_rsc) +{ + struct vxlan_rdst nh_rdst; + struct nexthop *nh; + bool do_xmit; + u32 hash; + + memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); + hash = skb_get_hash(skb); + + rcu_read_lock(); + nh = rcu_dereference(f->nh); + if (!nh) { + rcu_read_unlock(); + goto drop; + } + do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); + rcu_read_unlock(); + + if (likely(do_xmit)) + vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc); + else + goto drop; + + return; + +drop: + dev->stats.tx_dropped++; + dev_kfree_skb(skb); +} + /* Transmit local packets over Vxlan * * Outer IP header inherits ECN and DF from inner header. 
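The hunk above adds vxlan_xmit_nh(), which resolves an FDB entry backed by a nexthop group into a single remote chosen per packet instead of replicating to every remote in the entry's list. The condensed sketch below walks the same resolution path using the pieces this series introduces (the f->nh pointer and vxlan_fdb_nh_path_select(), which is defined later in this diff in include/net/vxlan.h); the function name example_xmit_via_nh and its parameters are hypothetical and only illustrate the intended call sequence inside vxlan.c, not code from the patch.

/* Illustrative sketch only: per-packet ECMP selection for an FDB entry
 * whose f->nh points at an fdb nexthop group (installed via NDA_NH_ID).
 */
static void example_xmit_via_nh(struct sk_buff *skb, struct net_device *dev,
				struct vxlan_fdb *f, __be32 vni, bool did_rsc)
{
	struct vxlan_rdst rdst = {};
	struct nexthop *nh;

	rcu_read_lock();
	nh = rcu_dereference(f->nh);	/* nexthop group behind this FDB entry */
	if (nh && vxlan_fdb_nh_path_select(nh, skb_get_hash(skb), &rdst)) {
		rcu_read_unlock();
		/* one remote picked by the flow hash; no per-remote cloning */
		vxlan_xmit_one(skb, dev, vni, &rdst, did_rsc);
		return;
	}
	rcu_read_unlock();

	dev->stats.tx_dropped++;
	dev_kfree_skb(skb);
}

Because selection is keyed on skb_get_hash(), packets of one flow stay pinned to a single tunnel endpoint while different flows spread across the group, which is why the series can bypass the old per-remote skb_clone() loop for nexthop-backed entries, as the vxlan_xmit() hunk below shows.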
@@ -2692,22 +2879,27 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } } - list_for_each_entry_rcu(rdst, &f->remotes, list) { - struct sk_buff *skb1; + if (rcu_access_pointer(f->nh)) { + vxlan_xmit_nh(skb, dev, f, + (vni ? : vxlan->default_dst.remote_vni), did_rsc); + } else { + list_for_each_entry_rcu(rdst, &f->remotes, list) { + struct sk_buff *skb1; - if (!fdst) { - fdst = rdst; - continue; + if (!fdst) { + fdst = rdst; + continue; + } + skb1 = skb_clone(skb, GFP_ATOMIC); + if (skb1) + vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); } - skb1 = skb_clone(skb, GFP_ATOMIC); - if (skb1) - vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); + if (fdst) + vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); + else + kfree_skb(skb); } - if (fdst) - vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); - else - kfree_skb(skb); return NETDEV_TX_OK; } @@ -3615,7 +3807,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, dst->remote_vni, dst->remote_vni, dst->remote_ifindex, - NTF_SELF, &f); + NTF_SELF, 0, &f, extack); if (err) return err; } @@ -4013,7 +4205,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], vxlan->cfg.dst_port, conf.vni, conf.vni, conf.remote_ifindex, - NTF_SELF, true, extack); + NTF_SELF, 0, true, extack); if (err) { spin_unlock_bh(&vxlan->hash_lock[hash_index]); netdev_adjacent_change_abort(dst->remote_dev, @@ -4335,7 +4527,7 @@ vxlan_fdb_external_learn_add(struct net_device *dev, fdb_info->remote_vni, fdb_info->remote_ifindex, NTF_USE | NTF_SELF | NTF_EXT_LEARNED, - false, extack); + 0, false, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; @@ -4410,6 +4602,25 @@ static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = { .notifier_call = vxlan_switchdev_event, }; +static int vxlan_nexthop_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct nexthop *nh = ptr; + struct vxlan_fdb *fdb, *tmp; + + if (!nh || event != NEXTHOP_EVENT_DEL) + return NOTIFY_DONE; + + list_for_each_entry_safe(fdb, tmp, &nh->fdb_list, nh_list) + vxlan_fdb_destroy(fdb->vdev, fdb, false, false); + + return NOTIFY_DONE; +} + +static struct notifier_block vxlan_nexthop_notifier_block __read_mostly = { + .notifier_call = vxlan_nexthop_event, +}; + static __net_init int vxlan_init_net(struct net *net) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); @@ -4421,7 +4632,7 @@ static __net_init int vxlan_init_net(struct net *net) for (h = 0; h < PORT_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vn->sock_list[h]); - return 0; + return register_nexthop_notifier(net, &vxlan_nexthop_notifier_block); } static void vxlan_destroy_tunnels(struct net *net, struct list_head *head) @@ -4454,6 +4665,8 @@ static void __net_exit vxlan_exit_batch_net(struct list_head *net_list) rtnl_lock(); list_for_each_entry(net, net_list, exit_list) + unregister_nexthop_notifier(net, &vxlan_nexthop_notifier_block); + list_for_each_entry(net, net_list, exit_list) vxlan_destroy_tunnels(net, &list); unregister_netdevice_many(&list); diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 272626cc3fc9..c66c545e161a 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -396,6 +396,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, } #define cgroup_bpf_enabled (0) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define 
BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) diff --git a/include/net/bareudp.h b/include/net/bareudp.h index cb03f6f15956..dc65a0d71d9b 100644 --- a/include/net/bareudp.h +++ b/include/net/bareudp.h @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/skbuff.h> +#include <net/rtnetlink.h> struct bareudp_conf { __be16 ethertype; @@ -17,4 +18,10 @@ struct net_device *bareudp_dev_create(struct net *net, const char *name, u8 name_assign_type, struct bareudp_conf *info); +static inline bool netif_is_bareudp(const struct net_device *dev) +{ + return dev->rtnl_link_ops && + !strcmp(dev->rtnl_link_ops->kind, "bareudp"); +} + #endif diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 4001ffb04f0d..95d633785ef9 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -168,10 +168,11 @@ enum flow_action_hw_stats_bit { FLOW_ACTION_HW_STATS_IMMEDIATE_BIT, FLOW_ACTION_HW_STATS_DELAYED_BIT, FLOW_ACTION_HW_STATS_DISABLED_BIT, + + FLOW_ACTION_HW_STATS_NUM_BITS }; enum flow_action_hw_stats { - FLOW_ACTION_HW_STATS_DONT_CARE = 0, FLOW_ACTION_HW_STATS_IMMEDIATE = BIT(FLOW_ACTION_HW_STATS_IMMEDIATE_BIT), FLOW_ACTION_HW_STATS_DELAYED = BIT(FLOW_ACTION_HW_STATS_DELAYED_BIT), @@ -179,6 +180,7 @@ enum flow_action_hw_stats { FLOW_ACTION_HW_STATS_DELAYED, FLOW_ACTION_HW_STATS_DISABLED = BIT(FLOW_ACTION_HW_STATS_DISABLED_BIT), + FLOW_ACTION_HW_STATS_DONT_CARE = BIT(FLOW_ACTION_HW_STATS_NUM_BITS) - 1, }; typedef void (*action_destr)(void *priv); @@ -340,11 +342,12 @@ __flow_action_hw_stats_check(const struct flow_action *action, return false; action_entry = flow_action_first_entry_get(action); - if (action_entry->hw_stats == FLOW_ACTION_HW_STATS_DONT_CARE) - return true; + + /* Zero is not a legal value for hw_stats, catch anyone passing it */ + WARN_ON_ONCE(!action_entry->hw_stats); if (!check_allow_bit && - action_entry->hw_stats != FLOW_ACTION_HW_STATS_ANY) { + ~action_entry->hw_stats & FLOW_ACTION_HW_STATS_ANY) { NL_SET_ERR_MSG_MOD(extack, "Driver supports only default HW stats type \"any\""); return false; } else if (check_allow_bit && diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index fdaf975e3331..3f615a29766e 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -65,6 +65,7 @@ struct fib6_config { struct nl_info fc_nlinfo; struct nlattr *fc_encap; u16 fc_encap_type; + bool fc_is_fdb; }; struct fib6_node { diff --git a/include/net/netns/nexthop.h b/include/net/netns/nexthop.h index c712ee5eebd9..1937476c94a0 100644 --- a/include/net/netns/nexthop.h +++ b/include/net/netns/nexthop.h @@ -14,5 +14,6 @@ struct netns_nexthop { unsigned int seq; /* protected by rtnl_mutex */ u32 last_id_allocated; + struct atomic_notifier_head notifier_chain; }; #endif diff --git a/include/net/nexthop.h b/include/net/nexthop.h index c440ccc861fc..4c951680f6f9 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -10,6 +10,7 @@ #define __LINUX_NEXTHOP_H #include <linux/netdevice.h> +#include <linux/notifier.h> #include <linux/route.h> #include <linux/types.h> #include <net/ip_fib.h> @@ -26,6 +27,7 @@ struct nh_config { u8 nh_family; u8 nh_protocol; u8 nh_blackhole; + u8 nh_fdb; u32 nh_flags; int nh_ifindex; @@ -52,6 +54,7 @@ struct nh_info { u8 family; bool reject_nh; + bool fdb_nh; union { struct fib_nh_common fib_nhc; @@ -80,6 +83,7 @@ struct nexthop { struct rb_node rb_node; /* entry on netns rbtree */ struct list_head fi_list; /* v4 entries using nh */ struct list_head f6i_list; /* v6 entries using nh */ + struct list_head fdb_list; /* fdb 
entries using this nh */ struct list_head grp_list; /* nh group entries using this nh */ struct net *net; @@ -88,6 +92,7 @@ struct nexthop { u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; + bool is_fdb_nh; refcount_t refcnt; struct rcu_head rcu; @@ -98,6 +103,17 @@ struct nexthop { }; }; +enum nexthop_event_type { + NEXTHOP_EVENT_ADD, + NEXTHOP_EVENT_DEL +}; + +int call_nexthop_notifier(struct notifier_block *nb, struct net *net, + enum nexthop_event_type event_type, + struct nexthop *nh); +int register_nexthop_notifier(struct net *net, struct notifier_block *nb); +int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); + /* caller is holding rcu or rtnl; no reference taken to nexthop */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id); void nexthop_free_rcu(struct rcu_head *head); @@ -304,4 +320,32 @@ static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash) int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg); + +static inline int nexthop_get_family(struct nexthop *nh) +{ + struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); + + return nhi->family; +} + +static inline +struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh) +{ + struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); + + return &nhi->fib_nhc; +} + +static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh, + int hash) +{ + struct nh_info *nhi; + struct nexthop *nhp; + + nhp = nexthop_select_path(nh, hash); + if (unlikely(!nhp)) + return NULL; + nhi = rcu_dereference(nhp->nh_info); + return &nhi->fib_nhc; +} #endif diff --git a/include/net/switchdev.h b/include/net/switchdev.h index ae7aeb0d1f9c..db519957e134 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -62,7 +62,6 @@ struct switchdev_attr { #if IS_ENABLED(CONFIG_BRIDGE_MRP) u8 mrp_port_state; /* MRP_PORT_STATE */ u8 mrp_port_role; /* MRP_PORT_ROLE */ - u8 mrp_ring_state; /* MRP_RING_STATE */ #endif } u; }; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 373aadcfea21..3a41627cbdfe 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -7,6 +7,7 @@ #include <net/dst_metadata.h> #include <net/rtnetlink.h> #include <net/switchdev.h> +#include <net/nexthop.h> #define IANA_VXLAN_UDP_PORT 4789 @@ -487,4 +488,28 @@ static inline void vxlan_flag_attr_error(int attrtype, #undef VXLAN_FLAG } +static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh, + int hash, + struct vxlan_rdst *rdst) +{ + struct fib_nh_common *nhc; + + nhc = nexthop_path_fdb_result(nh, hash); + if (unlikely(!nhc)) + return false; + + switch (nhc->nhc_gw_family) { + case AF_INET: + rdst->remote_ip.sin.sin_addr.s_addr = nhc->nhc_gw.ipv4; + rdst->remote_ip.sa.sa_family = AF_INET; + break; + case AF_INET6: + rdst->remote_ip.sin6.sin6_addr = nhc->nhc_gw.ipv6; + rdst->remote_ip.sa.sa_family = AF_INET6; + break; + } + + return true; +} + #endif diff --git a/include/net/xdp.h b/include/net/xdp.h index 3094fccf5a88..90f11760bd12 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -39,7 +39,7 @@ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, - MEM_TYPE_ZERO_COPY, + MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; @@ -54,10 +54,6 @@ struct xdp_mem_info { struct page_pool; -struct zero_copy_allocator { - void (*free)(struct zero_copy_allocator *zca, unsigned long handle); -}; - struct xdp_rxq_info { struct net_device 
*dev; u32 queue_index; @@ -70,7 +66,6 @@ struct xdp_buff { void *data_end; void *data_meta; void *data_hard_start; - unsigned long handle; struct xdp_rxq_info *rxq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; @@ -119,7 +114,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) int metasize; int headroom; - if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) + if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Assure headroom is available for storing info */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index abd72de25fa4..96bfc5f5f24e 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -15,40 +15,15 @@ struct net_device; struct xsk_queue; - -/* Masks for xdp_umem_page flags. - * The low 12-bits of the addr will be 0 since this is the page address, so we - * can use them for flags. - */ -#define XSK_NEXT_PG_CONTIG_SHIFT 0 -#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT) - -struct xdp_umem_page { - void *addr; - dma_addr_t dma; -}; - -struct xdp_umem_fq_reuse { - u32 nentries; - u32 length; - u64 handles[]; -}; - -/* Flags for the umem flags field. - * - * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public - * flags. See inlude/uapi/include/linux/if_xdp.h. - */ -#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1) +struct xdp_buff; struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; - struct xdp_umem_page *pages; - u64 chunk_mask; + struct xsk_buff_pool *pool; u64 size; u32 headroom; - u32 chunk_size_nohr; + u32 chunk_size; struct user_struct *user; refcount_t users; struct work_struct work; @@ -59,28 +34,17 @@ struct xdp_umem { u8 flags; int id; struct net_device *dev; - struct xdp_umem_fq_reuse *fq_reuse; bool zc; spinlock_t xsk_tx_list_lock; struct list_head xsk_tx_list; }; -/* Nodes are linked in the struct xdp_sock map_list field, and used to - * track which maps a certain socket reside in. 
- */ - struct xsk_map { struct bpf_map map; spinlock_t lock; /* Synchronize map updates */ struct xdp_sock *xsk_map[]; }; -struct xsk_map_node { - struct list_head node; - struct xsk_map *map; - struct xdp_sock **map_entry; -}; - struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; @@ -111,32 +75,9 @@ struct xdp_sock { spinlock_t map_list_lock; }; -struct xdp_buff; #ifdef CONFIG_XDP_SOCKETS -int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); -bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); -/* Used from netdev driver */ -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); -void xsk_umem_release_addr(struct xdp_umem *umem); -void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); -bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); -void xsk_umem_consume_tx_done(struct xdp_umem *umem); -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries); -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq); -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); -struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); -void xsk_set_rx_need_wakeup(struct xdp_umem *umem); -void xsk_set_tx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); -bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); -void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry); -int xsk_map_inc(struct xsk_map *map); -void xsk_map_put(struct xsk_map *map); +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); @@ -153,230 +94,13 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, return xs; } -static inline u64 xsk_umem_extract_addr(u64 addr) -{ - return addr & XSK_UNALIGNED_BUF_ADDR_MASK; -} - -static inline u64 xsk_umem_extract_offset(u64 addr) -{ - return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; -} - -static inline u64 xsk_umem_add_offset_to_addr(u64 addr) -{ - return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr); -} - -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - unsigned long page_addr; - - addr = xsk_umem_add_offset_to_addr(addr); - page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr; - - return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK); -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - addr = xsk_umem_add_offset_to_addr(addr); - - return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK); -} - -/* Reuse-queue aware version of FILL queue helpers */ -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (rq->length >= cnt) - return true; - - return xsk_umem_has_addrs(umem, cnt - rq->length); -} - -static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - return xsk_umem_peek_addr(umem, addr); - - *addr = rq->handles[rq->length - 1]; - return addr; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - xsk_umem_release_addr(umem); - else - rq->length--; -} - -static inline 
void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - rq->handles[rq->length++] = addr; -} - -/* Handle the offset appropriately depending on aligned or unaligned mode. - * For unaligned mode, we store the offset in the upper 16-bits of the address. - * For aligned mode, we simply add the offset to the address. - */ -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, - u64 offset) -{ - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) - return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); - else - return address + offset; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return umem->chunk_size_nohr + umem->headroom; -} - #else + static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { return -ENOTSUPP; } -static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) -{ - return false; -} - -static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr(struct xdp_umem *umem) -{ -} - -static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) -{ -} - -static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, - struct xdp_desc *desc) -{ - return false; -} - -static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) -{ -} - -static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) -{ - return NULL; -} - -static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap( - struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq) -{ - return NULL; -} -static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) -{ -} - -static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, - u16 queue_id) -{ - return NULL; -} - -static inline u64 xsk_umem_extract_addr(u64 addr) -{ - return 0; -} - -static inline u64 xsk_umem_extract_offset(u64 addr) -{ - return 0; -} - -static inline u64 xsk_umem_add_offset_to_addr(u64 addr) -{ - return 0; -} - -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - return NULL; -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - return 0; -} - -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ -} - -static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) -{ - return false; -} - -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, - u64 offset) -{ - return 0; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return 0; -} - static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { return -EOPNOTSUPP; @@ -391,6 +115,7 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, { return NULL; } + #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git 
a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h new file mode 100644 index 000000000000..ccf848f7efa4 --- /dev/null +++ b/include/net/xdp_sock_drv.h @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Interface for implementing AF_XDP zero-copy support in drivers. + * Copyright(c) 2020 Intel Corporation. + */ + +#ifndef _LINUX_XDP_SOCK_DRV_H +#define _LINUX_XDP_SOCK_DRV_H + +#include <net/xdp_sock.h> +#include <net/xsk_buff_pool.h> + +#ifdef CONFIG_XDP_SOCKETS + +void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); +bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); +void xsk_umem_consume_tx_done(struct xdp_umem *umem); +struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); +void xsk_set_rx_need_wakeup(struct xdp_umem *umem); +void xsk_set_tx_need_wakeup(struct xdp_umem *umem); +void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); +void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); +bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); + +static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +{ + return XDP_PACKET_HEADROOM + umem->headroom; +} + +static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +{ + return umem->chunk_size; +} + +static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +{ + return xsk_umem_get_chunk_size(umem) - xsk_umem_get_headroom(umem); +} + +static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, + struct xdp_rxq_info *rxq) +{ + xp_set_rxq_info(umem->pool, rxq); +} + +static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, + unsigned long attrs) +{ + xp_dma_unmap(umem->pool, attrs); +} + +static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, + unsigned long attrs) +{ + return xp_dma_map(umem->pool, dev, attrs, umem->pgs, umem->npgs); +} + +static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + return xp_get_dma(xskb); +} + +static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + return xp_get_frame_dma(xskb); +} + +static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +{ + return xp_alloc(umem->pool); +} + +static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +{ + return xp_can_alloc(umem->pool, count); +} + +static inline void xsk_buff_free(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + xp_free(xskb); +} + +static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +{ + return xp_raw_get_dma(umem->pool, addr); +} + +static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +{ + return xp_raw_get_data(umem->pool, addr); +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + xp_dma_sync_for_cpu(xskb); +} + +static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, + dma_addr_t dma, + size_t size) +{ + xp_dma_sync_for_device(umem->pool, dma, size); +} + +#else + +static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) +{ +} + +static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, + struct xdp_desc *desc) +{ + return false; +} + +static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) +{ +} + +static inline struct xdp_umem 
*xdp_get_umem_from_qid(struct net_device *dev, + u16 queue_id) +{ + return NULL; +} + +static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) +{ + return false; +} + +static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +{ + return 0; +} + +static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +{ + return 0; +} + +static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +{ + return 0; +} + +static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, + struct xdp_rxq_info *rxq) +{ +} + +static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, + unsigned long attrs) +{ +} + +static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, + unsigned long attrs) +{ + return 0; +} + +static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) +{ + return 0; +} + +static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) +{ + return 0; +} + +static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +{ + return NULL; +} + +static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +{ + return false; +} + +static inline void xsk_buff_free(struct xdp_buff *xdp) +{ +} + +static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +{ + return 0; +} + +static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +{ + return NULL; +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +{ +} + +static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, + dma_addr_t dma, + size_t size) +{ +} + +#endif /* CONFIG_XDP_SOCKETS */ + +#endif /* _LINUX_XDP_SOCK_DRV_H */ diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h new file mode 100644 index 000000000000..a4ff226505c9 --- /dev/null +++ b/include/net/xsk_buff_pool.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. */ + +#ifndef XSK_BUFF_POOL_H_ +#define XSK_BUFF_POOL_H_ + +#include <linux/if_xdp.h> +#include <linux/types.h> +#include <linux/dma-mapping.h> +#include <net/xdp.h> + +struct xsk_buff_pool; +struct xdp_rxq_info; +struct xsk_queue; +struct xdp_desc; +struct device; +struct page; + +struct xdp_buff_xsk { + struct xdp_buff xdp; + dma_addr_t dma; + dma_addr_t frame_dma; + struct xsk_buff_pool *pool; + bool unaligned; + u64 orig_addr; + struct list_head free_list_node; +}; + +struct xsk_buff_pool { + struct xsk_queue *fq; + struct list_head free_list; + dma_addr_t *dma_pages; + struct xdp_buff_xsk *heads; + u64 chunk_mask; + u64 addrs_cnt; + u32 free_list_cnt; + u32 dma_pages_cnt; + u32 heads_cnt; + u32 free_heads_cnt; + u32 headroom; + u32 chunk_size; + u32 frame_len; + bool cheap_dma; + bool unaligned; + void *addrs; + struct device *dev; + struct xdp_buff_xsk *free_heads[]; +}; + +/* AF_XDP core. */ +struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, + u32 chunk_size, u32 headroom, u64 size, + bool unaligned); +void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq); +void xp_destroy(struct xsk_buff_pool *pool); +void xp_release(struct xdp_buff_xsk *xskb); + +/* AF_XDP, and XDP core. 
*/ +void xp_free(struct xdp_buff_xsk *xskb); + +/* AF_XDP ZC drivers, via xdp_sock_buff.h */ +void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq); +int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, + unsigned long attrs, struct page **pages, u32 nr_pages); +void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs); +struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool); +bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); +static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->dma; +} + +static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->frame_dma; +} + +void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb); +static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) +{ + if (xskb->pool->cheap_dma) + return; + + xp_dma_sync_for_cpu_slow(xskb); +} + +void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size); +static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, + dma_addr_t dma, size_t size) +{ + if (pool->cheap_dma) + return; + + xp_dma_sync_for_device_slow(pool, dma, size); +} + +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + +static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr, u32 len) +{ + bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; + + if (pool->dma_pages_cnt && cross_pg) { + return !(pool->dma_pages[addr >> PAGE_SHIFT] & + XSK_NEXT_PG_CONTIG_MASK); + } + return false; +} + +static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) +{ + return addr & pool->chunk_mask; +} + +static inline u64 xp_unaligned_extract_addr(u64 addr) +{ + return addr & XSK_UNALIGNED_BUF_ADDR_MASK; +} + +static inline u64 xp_unaligned_extract_offset(u64 addr) +{ + return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; +} + +static inline u64 xp_unaligned_add_offset_to_addr(u64 addr) +{ + return xp_unaligned_extract_addr(addr) + + xp_unaligned_extract_offset(addr); +} + +#endif /* XSK_BUFF_POOL_H_ */ diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index b95d65e8c628..b73d3e141323 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -287,7 +287,7 @@ TRACE_EVENT(xdp_devmap_xmit, FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ - FN(ZERO_COPY) + FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ TRACE_DEFINE_ENUM(MEM_TYPE_##x); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9b8a0f63b91..97e1fd19ff58 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index cd144e3099a3..eefcda8ca44e 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -29,6 +29,7 @@ enum { NDA_LINK_NETNSID, NDA_SRC_VNI, NDA_PROTOCOL, /* Originator of entry */ + NDA_NH_ID, __NDA_MAX }; diff --git a/include/uapi/linux/nexthop.h 
b/include/uapi/linux/nexthop.h index 7b61867e9848..2d4a1e784cf0 100644 --- a/include/uapi/linux/nexthop.h +++ b/include/uapi/linux/nexthop.h @@ -49,6 +49,9 @@ enum { NHA_GROUPS, /* flag; only return nexthop groups in dump */ NHA_MASTER, /* u32; only return nexthops with given master dev */ + NHA_FDB, /* flag; nexthop belongs to a bridge fdb */ + /* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */ + __NHA_MAX, }; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 37b2d8620153..375b933010dd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -12,9 +12,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o -ifeq ($(CONFIG_XDP_SOCKETS),y) -obj-$(CONFIG_BPF_SYSCALL) += xskmap.o -endif obj-$(CONFIG_BPF_SYSCALL) += offload.o endif ifeq ($(CONFIG_PERF_EVENTS),y) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 57dfc98289d5..431241c74614 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1978,6 +1978,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2767,6 +2771,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2912,6 +2920,10 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 25b14ee0e26d..d2e27dba4ac6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -393,6 +393,15 @@ static bool type_is_sk_pointer(enum bpf_reg_type type) type == PTR_TO_XDP_SOCK; } +static bool reg_type_not_null(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_MAP_VALUE || + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_BTF_ID; +} + static bool reg_type_may_be_null(enum bpf_reg_type type) { return type == PTR_TO_MAP_VALUE_OR_NULL || @@ -6308,8 +6317,25 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, bool is_jmp32) { - if (__is_pointer_value(false, reg)) - return -1; + if (__is_pointer_value(false, reg)) { + if (!reg_type_not_null(reg->type)) + return -1; + + /* If pointer is valid tests against zero will fail so we can + * use this to direct branch taken. 
+ */ + if (val != 0) + return -1; + + switch (opcode) { + case BPF_JEQ: + return 0; + case BPF_JNE: + return 1; + default: + return -1; + } + } if (is_jmp32) return is_branch32_taken(reg, val, opcode); @@ -6808,7 +6834,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } if (pred >= 0) { - err = mark_chain_precision(env, insn->dst_reg); + /* If we get here with a dst_reg pointer type it is because + * above is_branch_taken() special cased the 0 comparison. + */ + if (!__is_pointer_value(false, dst_reg)) + err = mark_chain_precision(env, insn->dst_reg); if (BPF_SRC(insn->code) == BPF_X && !err) err = mark_chain_precision(env, insn->src_reg); if (err) @@ -7094,7 +7124,11 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); break; case BPF_PROG_TYPE_CGROUP_SKB: @@ -7120,10 +7154,11 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_TRACE_FEXIT: range = tnum_const(0); break; - case BPF_TRACE_ITER: case BPF_TRACE_RAW_TP: case BPF_MODIFY_RETURN: return 0; + case BPF_TRACE_ITER: + break; default: return -ENOTSUPP; } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 30ba7d38941d..bfd4ccd80847 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -160,16 +160,20 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, u32 headroom, u32 tailroom) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); + u32 user_size = kattr->test.data_size_in; void *data; if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); + if (user_size > size) + return ERR_PTR(-EMSGSIZE); + data = kzalloc(size + headroom + tailroom, GFP_USER); if (!data) return ERR_PTR(-ENOMEM); - if (copy_from_user(data + headroom, data_in, size)) { + if (copy_from_user(data + headroom, data_in, user_size)) { kfree(data); return ERR_PTR(-EFAULT); } @@ -486,8 +490,6 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, /* XDP have extra tailroom as (most) drivers use full page */ max_data_sz = 4096 - headroom - tailroom; - if (size > max_data_sz) - return -EINVAL; data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); if (IS_ERR(data)) diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index d7bc09de4c13..528d767eb026 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -37,6 +37,26 @@ static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id) return res; } +static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex) +{ + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + struct net_bridge_port *p; + + p = rtnl_dereference(mrp->p_port); + if (p && p->dev->ifindex == ifindex) + return false; + + p = rtnl_dereference(mrp->s_port); + if (p && p->dev->ifindex == ifindex) + return false; + } + + return true; +} + static struct br_mrp *br_mrp_find_port(struct net_bridge *br, struct net_bridge_port *p) { @@ -203,6 +223,7 @@ out: static void br_mrp_del_impl(struct net_bridge 
*br, struct br_mrp *mrp) { struct net_bridge_port *p; + u8 state; /* Stop sending MRP_Test frames */ cancel_delayed_work_sync(&mrp->test_work); @@ -214,20 +235,24 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) p = rtnl_dereference(mrp->p_port); if (p) { spin_lock_bh(&br->lock); - p->state = BR_STATE_FORWARDING; + state = netif_running(br->dev) ? + BR_STATE_FORWARDING : BR_STATE_DISABLED; + p->state = state; p->flags &= ~BR_MRP_AWARE; spin_unlock_bh(&br->lock); - br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING); + br_mrp_port_switchdev_set_state(p, state); rcu_assign_pointer(mrp->p_port, NULL); } p = rtnl_dereference(mrp->s_port); if (p) { spin_lock_bh(&br->lock); - p->state = BR_STATE_FORWARDING; + state = netif_running(br->dev) ? + BR_STATE_FORWARDING : BR_STATE_DISABLED; + p->state = state; p->flags &= ~BR_MRP_AWARE; spin_unlock_bh(&br->lock); - br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING); + br_mrp_port_switchdev_set_state(p, state); rcu_assign_pointer(mrp->s_port, NULL); } @@ -255,6 +280,11 @@ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) !br_mrp_get_port(br, instance->s_ifindex)) return -EINVAL; + /* It is not possible to have the same port part of multiple rings */ + if (!br_mrp_unique_ifindex(br, instance->p_ifindex) || + !br_mrp_unique_ifindex(br, instance->s_ifindex)) + return -EINVAL; + mrp = kzalloc(sizeof(*mrp), GFP_KERNEL); if (!mrp) return -ENOMEM; diff --git a/net/core/filter.c b/net/core/filter.c index 822d662f97ef..bd2853d23b50 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7049,6 +7049,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; @@ -7060,6 +7062,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index e951b743bed3..e64941c526b1 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -8,6 +8,7 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) { struct flow_rule *rule; + int i; rule = kzalloc(struct_size(rule, action.entries, num_actions), GFP_KERNEL); @@ -15,6 +16,11 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) return NULL; rule->action.num_entries = num_actions; + /* Pre-fill each action hw_stats with DONT_CARE. + * Caller can override this if it wants stats for a given action. 
+ */ + for (i = 0; i < num_actions; i++) + rule->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE; return rule; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b607ea602774..37e4dba62460 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1771,6 +1771,7 @@ static struct neigh_table *neigh_find_table(int family) } const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID }, [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, @@ -1781,6 +1782,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, + [NDA_NH_ID] = { .type = NLA_U32 }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, diff --git a/net/core/xdp.c b/net/core/xdp.c index 490b8f5fa8ee..90f44f382115 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -17,6 +17,7 @@ #include <net/xdp.h> #include <net/xdp_priv.h> /* struct xdp_mem_allocator */ #include <trace/events/xdp.h> +#include <net/xdp_sock_drv.h> #define REG_STATE_NEW 0x0 #define REG_STATE_REGISTERED 0x1 @@ -109,27 +110,6 @@ static void mem_allocator_disconnect(void *allocator) mutex_unlock(&mem_id_lock); } -static void mem_id_disconnect(int id) -{ - struct xdp_mem_allocator *xa; - - mutex_lock(&mem_id_lock); - - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - WARN(1, "Request remove non-existing id(%d), driver bug?", id); - return; - } - - trace_mem_disconnect(xa); - - if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) - call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); - - mutex_unlock(&mem_id_lock); -} - void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; @@ -143,9 +123,6 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) if (id == 0) return; - if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY) - return mem_id_disconnect(id); - if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { rcu_read_lock(); xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); @@ -301,7 +278,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, xdp_rxq->mem.type = type; if (!allocator) { - if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY) + if (type == MEM_TYPE_PAGE_POOL) return -EINVAL; /* Setup time check page_pool req */ return 0; } @@ -358,10 +335,11 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling - * of xdp_frames/pages in those cases. + * of xdp_frames/pages in those cases. This path is never used by the + * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of + * the switch-statement. */ -static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - unsigned long handle) +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) { struct xdp_mem_allocator *xa; struct page *page; @@ -383,36 +361,29 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; - case MEM_TYPE_ZERO_COPY: - /* NB! Only valid from an xdp_buff! 
*/ - rcu_read_lock(); - /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - xa->zc_alloc->free(xa->zc_alloc, handle); - rcu_read_unlock(); default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ + WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); break; } } void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false, 0); + __xdp_return(xdpf->data, &xdpf->mem, false); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true, 0); + __xdp_return(xdpf->data, &xdpf->mem, true); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); + __xdp_return(xdp->data, &xdp->rxq->mem, true); } -EXPORT_SYMBOL_GPL(xdp_return_buff); /* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ void __xdp_release_frame(void *data, struct xdp_mem_info *mem) @@ -493,7 +464,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) xdpf->metasize = metasize; xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; - xdp_return_buff(xdp); + xsk_buff_free(xdp); return xdpf; } EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 3aa4975919d7..9ef54cdcf662 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include "netlink.h" #include "common.h" diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index eeb1137a3f23..31e0b4e88a9d 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -24,7 +24,7 @@ #include <linux/sched/signal.h> #include <linux/net.h> #include <net/devlink.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include <net/flow_offload.h> #include <linux/ethtool_netlink.h> #include <generated/utsrelease.h> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c35a8b2e0499..02aa5cb3a4fd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -756,12 +756,11 @@ do_err: } EXPORT_SYMBOL(inet_accept); - /* * This does both peername and sockname. */ int inet_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); @@ -782,6 +781,11 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? 
BPF_CGROUP_INET4_GETPEERNAME : + BPF_CGROUP_INET4_GETSOCKNAME, + NULL); memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 3957364d556c..c337e73e02dd 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -33,8 +33,20 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_GROUPS] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, + [NHA_FDB] = { .type = NLA_FLAG }, }; +static int call_nexthop_notifiers(struct net *net, + enum fib_event_type event_type, + struct nexthop *nh) +{ + int err; + + err = atomic_notifier_call_chain(&net->nexthop.notifier_chain, + event_type, nh); + return notifier_to_errno(err); +} + static unsigned int nh_dev_hashfn(unsigned int val) { unsigned int mask = NH_DEV_HASHSIZE - 1; @@ -107,6 +119,7 @@ static struct nexthop *nexthop_alloc(void) INIT_LIST_HEAD(&nh->fi_list); INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); + INIT_LIST_HEAD(&nh->fdb_list); } return nh; } @@ -227,6 +240,9 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; + if (nh->is_fdb_nh && nla_put_flag(skb, NHA_FDB)) + goto nla_put_failure; + if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); @@ -241,7 +257,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; goto out; - } else { + } else if (!nh->is_fdb_nh) { const struct net_device *dev; dev = nhi->fib_nhc.nhc_dev; @@ -387,12 +403,35 @@ static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, return true; } +static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + + if (!nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); + return -EINVAL; + } + + nhi = rtnl_dereference(nh->nh_info); + if (*nh_family == AF_UNSPEC) { + *nh_family = nhi->family; + } else if (*nh_family != nhi->family) { + NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); + return -EINVAL; + } + + return 0; +} + static int nh_check_attr_group(struct net *net, struct nlattr *tb[], struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); + u8 nh_family = AF_UNSPEC; struct nexthop_grp *nhg; unsigned int i, j; + u8 nhg_fdb = 0; if (len & (sizeof(struct nexthop_grp) - 1)) { NL_SET_ERR_MSG(extack, @@ -421,6 +460,8 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], } } + if (tb[NHA_FDB]) + nhg_fdb = 1; nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { struct nexthop *nh; @@ -432,11 +473,20 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], } if (!valid_group_nh(nh, len, extack)) return -EINVAL; + + if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) + return -EINVAL; + + if (!nhg_fdb && nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); + return -EINVAL; + } } for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) { if (!tb[i]) continue; - + if (tb[NHA_FDB]) + continue; NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; @@ -495,6 +545,9 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) if (hash > atomic_read(&nhge->upper_bound)) continue; + if (nhge->nh->is_fdb_nh) + return nhge->nh; + /* nexthops always check if it is good and does * not 
rely on a sysctl for this behavior */ @@ -564,6 +617,11 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, { struct nh_info *nhi; + if (nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); + return -EINVAL; + } + /* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared * across multiple fib entries. If the config wants to use source @@ -640,6 +698,12 @@ int fib_check_nexthop(struct nexthop *nh, u8 scope, { int err = 0; + if (nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); + err = -EINVAL; + goto out; + } + if (nh->is_group) { struct nh_group *nhg; @@ -773,6 +837,8 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) bool do_flush = false; struct fib_info *fi; + call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); + list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; @@ -1125,6 +1191,9 @@ static struct nexthop *nexthop_create_group(struct net *net, nh_group_rebalance(nhg); } + if (cfg->nh_fdb) + nh->is_fdb_nh = 1; + rcu_assign_pointer(nh->nh_grp, nhg); return nh; @@ -1152,7 +1221,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; - u32 tb_id = l3mdev_fib_table(cfg->dev); + u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN); int err; err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); @@ -1161,6 +1230,9 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, goto out; } + if (nh->is_fdb_nh) + goto out; + /* sets nh_dev if successful */ err = fib_check_nh(net, fib_nh, tb_id, 0, extack); if (!err) { @@ -1186,6 +1258,7 @@ static int nh_create_ipv6(struct net *net, struct nexthop *nh, .fc_flags = cfg->nh_flags, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, + .fc_is_fdb = cfg->nh_fdb, }; int err; @@ -1227,6 +1300,9 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, nhi->family = cfg->nh_family; nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; + if (cfg->nh_fdb) + nh->is_fdb_nh = 1; + if (cfg->nh_blackhole) { nhi->reject_nh = 1; cfg->nh_ifindex = net->loopback_dev->ifindex; @@ -1248,7 +1324,8 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, } /* add the entry to the device based hash */ - nexthop_devhash_add(net, nhi); + if (!nh->is_fdb_nh) + nexthop_devhash_add(net, nhi); rcu_assign_pointer(nh->nh_info, nhi); @@ -1352,6 +1429,19 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, if (tb[NHA_ID]) cfg->nh_id = nla_get_u32(tb[NHA_ID]); + if (tb[NHA_FDB]) { + if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] || + tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole"); + goto out; + } + if (nhm->nh_flags) { + NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header"); + goto out; + } + cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]); + } + if (tb[NHA_GROUP]) { if (nhm->nh_family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid family for group"); @@ -1375,8 +1465,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, if (tb[NHA_BLACKHOLE]) { if (tb[NHA_GATEWAY] || tb[NHA_OIF] || - tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { - NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif"); + tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) { + NL_SET_ERR_MSG(extack, "Blackhole attribute can not 
be used with gateway, oif, encap or fdb"); goto out; } @@ -1385,26 +1475,28 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, goto out; } - if (!tb[NHA_OIF]) { - NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops"); + if (!cfg->nh_fdb && !tb[NHA_OIF]) { + NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops"); goto out; } - cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); - if (cfg->nh_ifindex) - cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); + if (!cfg->nh_fdb && tb[NHA_OIF]) { + cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); + if (cfg->nh_ifindex) + cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); - if (!cfg->dev) { - NL_SET_ERR_MSG(extack, "Invalid device index"); - goto out; - } else if (!(cfg->dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, "Nexthop device is not up"); - err = -ENETDOWN; - goto out; - } else if (!netif_carrier_ok(cfg->dev)) { - NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); - err = -ENETDOWN; - goto out; + if (!cfg->dev) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + goto out; + } else if (!(cfg->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } else if (!netif_carrier_ok(cfg->dev)) { + NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); + err = -ENETDOWN; + goto out; + } } err = -EINVAL; @@ -1633,7 +1725,7 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx, static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, int *master_idx, bool *group_filter, - struct netlink_callback *cb) + bool *fdb_filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[NHA_MAX + 1]; @@ -1670,6 +1762,9 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, case NHA_GROUPS: *group_filter = true; break; + case NHA_FDB: + *fdb_filter = true; + break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; @@ -1688,17 +1783,17 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, /* rtnl */ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) { + bool group_filter = false, fdb_filter = false; struct nhmsg *nhm = nlmsg_data(cb->nlh); int dev_filter_idx = 0, master_idx = 0; struct net *net = sock_net(skb->sk); struct rb_root *root = &net->nexthop.rb_root; - bool group_filter = false; struct rb_node *node; int idx = 0, s_idx; int err; err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, - &group_filter, cb); + &group_filter, &fdb_filter, cb); if (err < 0) return err; @@ -1783,6 +1878,19 @@ static struct notifier_block nh_netdev_notifier = { .notifier_call = nh_netdev_event, }; +int register_nexthop_notifier(struct net *net, struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&net->nexthop.notifier_chain, nb); +} +EXPORT_SYMBOL(register_nexthop_notifier); + +int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&net->nexthop.notifier_chain, + nb); +} +EXPORT_SYMBOL(unregister_nexthop_notifier); + static void __net_exit nexthop_net_exit(struct net *net) { rtnl_lock(); @@ -1799,6 +1907,7 @@ static int __net_init nexthop_net_init(struct net *net) net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); if (!net->nexthop.devhash) return -ENOMEM; + ATOMIC_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); return 0; } diff --git a/net/ipv6/af_inet6.c 
b/net/ipv6/af_inet6.c index b69496eaf922..0625a97a8894 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -505,9 +505,8 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock); /* * This does both peername and sockname. */ - int inet6_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; @@ -532,9 +531,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; - sin->sin6_port = inet->inet_sport; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? BPF_CGROUP_INET6_GETPEERNAME : + BPF_CGROUP_INET6_GETSOCKNAME, + NULL); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 4703b09808d0..821d96c720b9 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -89,6 +89,11 @@ struct ip6_tnl_net { struct ip6_tnl __rcu *collect_md_tun; }; +static inline int ip6_tnl_mpls_supported(void) +{ + return IS_ENABLED(CONFIG_MPLS); +} + static struct net_device_stats *ip6_get_stats(struct net_device *dev) { struct pcpu_sw_netstats tmp, sum = { 0 }; @@ -718,6 +723,20 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } +static int +mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + __u32 rel_info = ntohl(info); + int err, rel_msg = 0; + u8 rel_type = type; + u8 rel_code = code; + + err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code, + &rel_msg, &rel_info, offset); + return err; +} + static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) @@ -740,6 +759,14 @@ static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, return IP6_ECN_decapsulate(ipv6h, skb); } +static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb) +{ + /* ECN is not supported in AF_MPLS */ + return 0; +} + __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) @@ -901,6 +928,11 @@ static const struct tnl_ptk_info tpi_v4 = { .proto = htons(ETH_P_IP), }; +static const struct tnl_ptk_info tpi_mpls = { + /* no tunnel info required for mplsip6. 
*/ + .proto = htons(ETH_P_MPLS_UC), +}; + static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, const struct tnl_ptk_info *tpi, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, @@ -958,6 +990,12 @@ static int ip6ip6_rcv(struct sk_buff *skb) ip6ip6_dscp_ecn_decapsulate); } +static int mplsip6_rcv(struct sk_buff *skb) +{ + return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls, + mplsip6_dscp_ecn_decapsulate); +} + struct ipv6_tel_txoption { struct ipv6_txoptions ops; __u8 dst_opt[8]; @@ -1232,6 +1270,8 @@ route_lookup: ipv6_push_frag_opts(skb, &opt.ops, &proto); } + skb_set_inner_ipproto(skb, proto); + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); @@ -1253,22 +1293,22 @@ tx_err_dst_release: EXPORT_SYMBOL(ip6_tnl_xmit); static inline int -ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, + u8 protocol) { struct ip6_tnl *t = netdev_priv(dev); + struct ipv6hdr *ipv6h; const struct iphdr *iph; int encap_limit = -1; + __u16 offset; struct flowi6 fl6; - __u8 dsfield; + __u8 dsfield, orig_dsfield; __u32 mtu; u8 tproto; int err; - iph = ip_hdr(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - tproto = READ_ONCE(t->parms.proto); - if (tproto != IPPROTO_IPIP && tproto != 0) + if (tproto != protocol && tproto != 0) return -1; if (t->parms.collect_md) { @@ -1281,129 +1321,100 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) return -1; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; + fl6.flowi6_proto = protocol; fl6.saddr = key->u.ipv6.src; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; dsfield = key->tos; + switch (protocol) { + case IPPROTO_IPIP: + iph = ip_hdr(skb); + orig_dsfield = ipv4_get_dsfield(iph); + break; + case IPPROTO_IPV6: + ipv6h = ipv6_hdr(skb); + orig_dsfield = ipv6_get_dsfield(ipv6h); + break; + default: + orig_dsfield = dsfield; + break; + } } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; + if (protocol == IPPROTO_IPV6) { + offset = ip6_tnl_parse_tlv_enc_lim(skb, + skb_network_header(skb)); + /* ip6_tnl_parse_tlv_enc_lim() might have + * reallocated skb->head + */ + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv4_get_dsfield(iph); - else - dsfield = ip6_tclass(t->parms.flowinfo); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - } - - fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); - dsfield = INET_ECN_encapsulate(dsfield, ipv4_get_dsfield(iph)); - - if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) - return -1; - - skb_set_inner_ipproto(skb, IPPROTO_IPIP); - - err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, - IPPROTO_IPIP); - if (err != 0) { - /* XXX: send ICMP error even if DF is not set. 
*/ - if (err == -EMSGSIZE) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - return -1; - } - - return 0; -} - -static inline int -ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h; - int encap_limit = -1; - __u16 offset; - struct flowi6 fl6; - __u8 dsfield; - __u32 mtu; - u8 tproto; - int err; - - ipv6h = ipv6_hdr(skb); - tproto = READ_ONCE(t->parms.proto); - if ((tproto != IPPROTO_IPV6 && tproto != 0) || - ip6_tnl_addr_conflict(t, ipv6h)) - return -1; - - if (t->parms.collect_md) { - struct ip_tunnel_info *tun_info; - const struct ip_tunnel_key *key; - - tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || - ip_tunnel_info_af(tun_info) != AF_INET6)) - return -1; - key = &tun_info->key; - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; - fl6.saddr = key->u.ipv6.src; - fl6.daddr = key->u.ipv6.dst; - fl6.flowlabel = key->label; - dsfield = key->tos; - } else { - offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); - /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ - ipv6h = ipv6_hdr(skb); - if (offset > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - - tel = (void *)&skb_network_header(skb)[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); - return -1; + tel = (void *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + encap_limit = tel->encap_limit - 1; } - encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { - encap_limit = t->parms.encap_limit; } memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; + fl6.flowi6_proto = protocol; - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv6_get_dsfield(ipv6h); - else - dsfield = ip6_tclass(t->parms.flowinfo); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else fl6.flowi6_mark = t->parms.fwmark; + switch (protocol) { + case IPPROTO_IPIP: + iph = ip_hdr(skb); + orig_dsfield = ipv4_get_dsfield(iph); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + dsfield = orig_dsfield; + else + dsfield = ip6_tclass(t->parms.flowinfo); + break; + case IPPROTO_IPV6: + ipv6h = ipv6_hdr(skb); + orig_dsfield = ipv6_get_dsfield(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + dsfield = orig_dsfield; + else + dsfield = ip6_tclass(t->parms.flowinfo); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6.flowlabel |= ip6_flowlabel(ipv6h); + break; + default: + orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo); + break; + } } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); - dsfield = INET_ECN_encapsulate(dsfield, ipv6_get_dsfield(ipv6h)); + dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield); if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; - skb_set_inner_ipproto(skb, IPPROTO_IPV6); - err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, - IPPROTO_IPV6); + protocol); if (err != 0) { + /* XXX: send ICMP error even if DF is not set. 
*/ if (err == -EMSGSIZE) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + switch (protocol) { + case IPPROTO_IPIP: + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); + break; + case IPPROTO_IPV6: + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + break; + default: + break; + } return -1; } @@ -1415,6 +1426,7 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net_device_stats *stats = &t->dev->stats; + u8 ipproto; int ret; if (!pskb_inet_may_pull(skb)) @@ -1422,15 +1434,21 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) switch (skb->protocol) { case htons(ETH_P_IP): - ret = ip4ip6_tnl_xmit(skb, dev); + ipproto = IPPROTO_IPIP; break; case htons(ETH_P_IPV6): - ret = ip6ip6_tnl_xmit(skb, dev); + if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb))) + goto tx_err; + ipproto = IPPROTO_IPV6; + break; + case htons(ETH_P_MPLS_UC): + ipproto = IPPROTO_MPLS; break; default: goto tx_err; } + ret = ipxip6_tnl_xmit(skb, dev, ipproto); if (ret < 0) goto tx_err; @@ -2218,6 +2236,12 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .priority = 1, }; +static struct xfrm6_tunnel mplsip6_handler __read_mostly = { + .handler = mplsip6_rcv, + .err_handler = mplsip6_err, + .priority = 1, +}; + static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); @@ -2332,6 +2356,15 @@ static int __init ip6_tunnel_init(void) pr_err("%s: can't register ip6ip6\n", __func__); goto out_ip6ip6; } + + if (ip6_tnl_mpls_supported()) { + err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS); + if (err < 0) { + pr_err("%s: can't register mplsip6\n", __func__); + goto out_mplsip6; + } + } + err = rtnl_link_register(&ip6_link_ops); if (err < 0) goto rtnl_link_failed; @@ -2339,6 +2372,9 @@ static int __init ip6_tunnel_init(void) return 0; rtnl_link_failed: + if (ip6_tnl_mpls_supported()) + xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS); +out_mplsip6: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); @@ -2361,6 +2397,9 @@ static void __exit ip6_tunnel_cleanup(void) if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) pr_info("%s: can't deregister ip6ip6\n", __func__); + if (ip6_tnl_mpls_supported() && + xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS)) + pr_info("%s: can't deregister mplsip6\n", __func__); unregister_pernet_device(&ip6_tnl_net_ops); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a52ec1b86432..82cbb46a2a4f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3421,6 +3421,11 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, #ifdef CONFIG_IPV6_ROUTER_PREF fib6_nh->last_probe = jiffies; #endif + if (cfg->fc_is_fdb) { + fib6_nh->fib_nh_gw6 = cfg->fc_gateway; + fib6_nh->fib_nh_gw_family = AF_INET6; + return 0; + } err = -ENODEV; if (cfg->fc_ifindex) { diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 21e7b95ddbfa..06c02ebe6b9b 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -21,8 +21,14 @@ static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly; static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly; +static struct xfrm6_tunnel __rcu *tunnelmpls6_handlers __read_mostly; static DEFINE_MUTEX(tunnel6_mutex); +static inline int xfrm6_tunnel_mpls_supported(void) +{ + return IS_ENABLED(CONFIG_MPLS); +} + int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) { struct xfrm6_tunnel 
__rcu **pprev; @@ -32,8 +38,21 @@ int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) mutex_lock(&tunnel6_mutex); - for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - (t = rcu_dereference_protected(*pprev, + switch (family) { + case AF_INET6: + pprev = &tunnel6_handlers; + break; + case AF_INET: + pprev = &tunnel46_handlers; + break; + case AF_MPLS: + pprev = &tunnelmpls6_handlers; + break; + default: + goto err; + } + + for (; (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel6_mutex))) != NULL; pprev = &t->next) { if (t->priority > priority) @@ -62,8 +81,21 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) mutex_lock(&tunnel6_mutex); - for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - (t = rcu_dereference_protected(*pprev, + switch (family) { + case AF_INET6: + pprev = &tunnel6_handlers; + break; + case AF_INET: + pprev = &tunnel46_handlers; + break; + case AF_MPLS: + pprev = &tunnelmpls6_handlers; + break; + default: + goto err; + } + + for (; (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel6_mutex))) != NULL; pprev = &t->next) { if (t == handler) { @@ -73,6 +105,7 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) } } +err: mutex_unlock(&tunnel6_mutex); synchronize_net(); @@ -86,6 +119,24 @@ EXPORT_SYMBOL(xfrm6_tunnel_deregister); handler != NULL; \ handler = rcu_dereference(handler->next)) \ +static int tunnelmpls6_rcv(struct sk_buff *skb) +{ + struct xfrm6_tunnel *handler; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto drop; + + for_each_tunnel_rcu(tunnelmpls6_handlers, handler) + if (!handler->handler(skb)) + return 0; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} + static int tunnel6_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; @@ -146,6 +197,18 @@ static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return -ENOENT; } +static int tunnelmpls6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_tunnel *handler; + + for_each_tunnel_rcu(tunnelmpls6_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + return 0; + + return -ENOENT; +} + static const struct inet6_protocol tunnel6_protocol = { .handler = tunnel6_rcv, .err_handler = tunnel6_err, @@ -158,6 +221,12 @@ static const struct inet6_protocol tunnel46_protocol = { .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; +static const struct inet6_protocol tunnelmpls6_protocol = { + .handler = tunnelmpls6_rcv, + .err_handler = tunnelmpls6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + static int __init tunnel6_init(void) { if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) { @@ -169,6 +238,13 @@ static int __init tunnel6_init(void) inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); return -EAGAIN; } + if (xfrm6_tunnel_mpls_supported() && + inet6_add_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) { + pr_err("%s: can't add protocol\n", __func__); + inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); + inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP); + return -EAGAIN; + } return 0; } @@ -178,6 +254,9 @@ static void __exit tunnel6_fini(void) pr_err("%s: can't remove protocol\n", __func__); if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6)) pr_err("%s: can't remove protocol\n", __func__); + if (xfrm6_tunnel_mpls_supported() && + 
inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) + pr_err("%s: can't remove protocol\n", __func__); } module_init(tunnel6_init); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index a42e4ed5ab0e..fd30ea61336e 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1593,7 +1593,8 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, dev->type == ARPHRD_IPGRE || dev->type == ARPHRD_IP6GRE || dev->type == ARPHRD_SIT || - dev->type == ARPHRD_TUNNEL) { + dev->type == ARPHRD_TUNNEL || + dev->type == ARPHRD_TUNNEL6) { mdev = mpls_add_dev(dev); if (IS_ERR(mdev)) return notifier_from_errno(PTR_ERR(mdev)); diff --git a/net/psample/psample.c b/net/psample/psample.c index 34a74043840b..a042261a45c5 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -209,6 +209,7 @@ void psample_group_put(struct psample_group *group) } EXPORT_SYMBOL_GPL(psample_group_put); +#ifdef CONFIG_INET static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { @@ -352,12 +353,15 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) return sum; } +#endif void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, u32 trunc_size, int in_ifindex, int out_ifindex, u32 sample_rate) { +#ifdef CONFIG_INET struct ip_tunnel_info *tun_info; +#endif struct sk_buff *nl_skb; int data_len; int meta_len; @@ -371,9 +375,11 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, nla_total_size(sizeof(u32)) + /* group_num */ nla_total_size(sizeof(u32)); /* seq */ +#ifdef CONFIG_INET tun_info = skb_tunnel_info(skb); if (tun_info) meta_len += psample_tunnel_meta_len(tun_info); +#endif data_len = min(skb->len, trunc_size); if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE) @@ -429,11 +435,13 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, goto error; } +#ifdef CONFIG_INET if (tun_info) { ret = psample_ip_tun_to_nlattr(nl_skb, tun_info); if (unlikely(ret < 0)) goto error; } +#endif genlmsg_end(nl_skb, data); genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 71e2bdafb2ce..30cdc4315f42 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o +obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 37ace3bc0d48..19e59d1a5e9f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -179,37 +179,6 @@ void xdp_umem_clear_dev(struct xdp_umem *umem) umem->zc = false; } -static void xdp_umem_unmap_pages(struct xdp_umem *umem) -{ - unsigned int i; - - for (i = 0; i < umem->npgs; i++) - if (PageHighMem(umem->pgs[i])) - vunmap(umem->pages[i].addr); -} - -static int xdp_umem_map_pages(struct xdp_umem *umem) -{ - unsigned int i; - void *addr; - - for (i = 0; i < umem->npgs; i++) { - if (PageHighMem(umem->pgs[i])) - addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL); - else - addr = page_address(umem->pgs[i]); - - if (!addr) { - xdp_umem_unmap_pages(umem); - return -ENOMEM; - } - - umem->pages[i].addr = addr; - } - - return 0; -} - static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); @@ -244,14 +213,9 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->cq = 
NULL; } - xsk_reuseq_destroy(umem); - - xdp_umem_unmap_pages(umem); + xp_destroy(umem->pool); xdp_umem_unpin_pages(umem); - kvfree(umem->pages); - umem->pages = NULL; - xdp_umem_unaccount_pages(umem); kfree(umem); } @@ -385,11 +349,9 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; - umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK - : ~((u64)chunk_size - 1); umem->size = size; umem->headroom = headroom; - umem->chunk_size_nohr = chunk_size - headroom; + umem->chunk_size = chunk_size; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; umem->user = NULL; @@ -407,18 +369,13 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (err) goto out_account; - umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages), - GFP_KERNEL_ACCOUNT); - if (!umem->pages) { + umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size, + headroom, size, unaligned_chunks); + if (!umem->pool) { err = -ENOMEM; goto out_pin; } - - err = xdp_umem_map_pages(umem); - if (!err) - return 0; - - kvfree(umem->pages); + return 0; out_pin: xdp_umem_unpin_pages(umem); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index a63a9fb251f5..32067fe98f65 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -6,7 +6,7 @@ #ifndef XDP_UMEM_H_ #define XDP_UMEM_H_ -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u16 queue_id, u16 flags); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 45ffd67b367d..b6c0f08bd80d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -22,7 +22,7 @@ #include <linux/net.h> #include <linux/netdevice.h> #include <linux/rculist.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include <net/xdp.h> #include "xsk_queue.h" @@ -39,24 +39,6 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) READ_ONCE(xs->umem->fq); } -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return xskq_cons_has_entries(umem->fq, cnt); -} -EXPORT_SYMBOL(xsk_umem_has_addrs); - -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return xskq_cons_peek_addr(umem->fq, addr, umem); -} -EXPORT_SYMBOL(xsk_umem_peek_addr); - -void xsk_umem_release_addr(struct xdp_umem *umem) -{ - xskq_cons_release(umem->fq); -} -EXPORT_SYMBOL(xsk_umem_release_addr); - void xsk_set_rx_need_wakeup(struct xdp_umem *umem) { if (umem->need_wakeup & XDP_WAKEUP_RX) @@ -117,76 +99,82 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) } EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); -/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for - * each page. This is only required in copy mode. 
- */ -static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, - u32 len, u32 metalen) +void xp_release(struct xdp_buff_xsk *xskb) { - void *to_buf = xdp_umem_get_data(umem, addr); - - addr = xsk_umem_add_offset_to_addr(addr); - if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) { - void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; - u64 page_start = addr & ~(PAGE_SIZE - 1); - u64 first_len = PAGE_SIZE - (addr - page_start); - - memcpy(to_buf, from_buf, first_len); - memcpy(next_pg_addr, from_buf + first_len, - len + metalen - first_len); + xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; +} - return; - } +static u64 xp_get_handle(struct xdp_buff_xsk *xskb) +{ + u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; - memcpy(to_buf, from_buf, len + metalen); + offset += xskb->pool->headroom; + if (!xskb->pool->unaligned) + return xskb->orig_addr + offset; + return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - u64 offset = xs->umem->headroom; - u64 addr, memcpy_addr; - void *from_buf; - u32 metalen; + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + u64 addr; int err; - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { + addr = xp_get_handle(xskb); + err = xskq_prod_reserve_desc(xs->rx, addr, len); + if (err) { xs->rx_dropped++; - return -ENOSPC; + return err; } - if (unlikely(xdp_data_meta_unsupported(xdp))) { - from_buf = xdp->data; - metalen = 0; - } else { - from_buf = xdp->data_meta; - metalen = xdp->data - xdp->data_meta; - } + xp_release(xskb); + return 0; +} - memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen); +static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) +{ + void *from_buf, *to_buf; + u32 metalen; - offset += metalen; - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - err = xskq_prod_reserve_desc(xs->rx, addr, len); - if (!err) { - xskq_cons_release(xs->umem->fq); - xdp_return_buff(xdp); - return 0; + if (unlikely(xdp_data_meta_unsupported(from))) { + from_buf = from->data; + to_buf = to->data; + metalen = 0; + } else { + from_buf = from->data_meta; + metalen = from->data - from->data_meta; + to_buf = to->data - metalen; } - xs->rx_dropped++; - return err; + memcpy(to_buf, from_buf, len + metalen); } -static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, + bool explicit_free) { - int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len); + struct xdp_buff *xsk_xdp; + int err; - if (err) + if (len > xsk_umem_get_rx_frame_size(xs->umem)) { xs->rx_dropped++; + return -ENOSPC; + } - return err; + xsk_xdp = xsk_buff_alloc(xs->umem); + if (!xsk_xdp) { + xs->rx_dropped++; + return -ENOSPC; + } + + xsk_copy_xdp(xsk_xdp, xdp, len); + err = __xsk_rcv_zc(xs, xsk_xdp, len); + if (err) { + xsk_buff_free(xsk_xdp); + return err; + } + if (explicit_free) + xdp_return_buff(xdp); + return 0; } static bool xsk_is_bound(struct xdp_sock *xs) @@ -199,7 +187,8 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, + bool 
explicit_free) { u32 len; @@ -211,8 +200,9 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) len = xdp->data_end - xdp->data; - return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ? - __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); + return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? + __xsk_rcv_zc(xs, xdp, len) : + __xsk_rcv(xs, xdp, len, explicit_free); } static void xsk_flush(struct xdp_sock *xs) @@ -224,46 +214,11 @@ static void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { - u32 metalen = xdp->data - xdp->data_meta; - u32 len = xdp->data_end - xdp->data; - u64 offset = xs->umem->headroom; - void *buffer; - u64 addr; int err; spin_lock_bh(&xs->rx_lock); - - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) { - err = -EINVAL; - goto out_unlock; - } - - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { - err = -ENOSPC; - goto out_drop; - } - - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - buffer = xdp_umem_get_data(xs->umem, addr); - memcpy(buffer, xdp->data_meta, len + metalen); - - addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); - err = xskq_prod_reserve_desc(xs->rx, addr, len); - if (err) - goto out_drop; - - xskq_cons_release(xs->umem->fq); - xskq_prod_submit(xs->rx); - - spin_unlock_bh(&xs->rx_lock); - - xs->sk.sk_data_ready(&xs->sk); - return 0; - -out_drop: - xs->rx_dropped++; -out_unlock: + err = xsk_rcv(xs, xdp, false); + xsk_flush(xs); spin_unlock_bh(&xs->rx_lock); return err; } @@ -273,7 +228,7 @@ int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); int err; - err = xsk_rcv(xs, xdp); + err = xsk_rcv(xs, xdp, true); if (err) return err; @@ -404,7 +359,7 @@ static int xsk_generic_xmit(struct sock *sk) skb_put(skb, len); addr = desc.addr; - buffer = xdp_umem_get_data(xs->umem, addr); + buffer = xsk_buff_raw_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed @@ -629,24 +584,6 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd) return sock; } -/* Check if umem pages are contiguous. - * If zero-copy mode, use the DMA address to do the page contiguity check - * For all other modes we use addr (kernel virtual address) - * Store the result in the low bits of addr. - */ -static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags) -{ - struct xdp_umem_page *pgs = umem->pages; - int i, is_contig; - - for (i = 0; i < umem->npgs - 1; i++) { - is_contig = (flags & XDP_ZEROCOPY) ? - (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) : - (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr); - pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT; - } -} - static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; @@ -729,23 +666,14 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } else { /* This xsk has its own umem. 
*/ - xskq_set_umem(xs->umem->fq, xs->umem->size, - xs->umem->chunk_mask); - xskq_set_umem(xs->umem->cq, xs->umem->size, - xs->umem->chunk_mask); - err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); if (err) goto out_unlock; - - xsk_check_page_contiguity(xs->umem, flags); } xs->dev = dev; xs->zc = xs->umem->zc; xs->queue_id = qid; - xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask); - xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask); xdp_add_sk_umem(xs->umem, xs); out_unlock: @@ -860,6 +788,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : &xs->umem->cq; err = xsk_init_queue(entries, q, true); + if (optname == XDP_UMEM_FILL_RING) + xp_set_fq(xs->umem->pool, *q); mutex_unlock(&xs->mutex); return err; } diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index 4cfd106bdb53..455ddd480f3d 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -4,6 +4,20 @@ #ifndef XSK_H_ #define XSK_H_ +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + +/* Flags for the umem flags field. + * + * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public + * flags. See inlude/uapi/include/linux/if_xdp.h. + */ +#define XDP_UMEM_USES_NEED_WAKEUP BIT(1) + struct xdp_ring_offset_v1 { __u64 producer; __u64 consumer; @@ -17,9 +31,25 @@ struct xdp_mmap_offsets_v1 { struct xdp_ring_offset_v1 cr; }; +/* Nodes are linked in the struct xdp_sock map_list field, and used to + * track which maps a certain socket reside in. + */ + +struct xsk_map_node { + struct list_head node; + struct xsk_map *map; + struct xdp_sock **map_entry; +}; + static inline struct xdp_sock *xdp_sk(struct sock *sk) { return (struct xdp_sock *)sk; } +bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + struct xdp_sock **map_entry); +int xsk_map_inc(struct xsk_map *map); +void xsk_map_put(struct xsk_map *map); + #endif /* XSK_H_ */ diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c new file mode 100644 index 000000000000..540ed75e4482 --- /dev/null +++ b/net/xdp/xsk_buff_pool.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <net/xsk_buff_pool.h> +#include <net/xdp_sock.h> +#include <linux/dma-direct.h> +#include <linux/dma-noncoherent.h> +#include <linux/swiotlb.h> + +#include "xsk_queue.h" + +static void xp_addr_unmap(struct xsk_buff_pool *pool) +{ + vunmap(pool->addrs); +} + +static int xp_addr_map(struct xsk_buff_pool *pool, + struct page **pages, u32 nr_pages) +{ + pool->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (!pool->addrs) + return -ENOMEM; + return 0; +} + +void xp_destroy(struct xsk_buff_pool *pool) +{ + if (!pool) + return; + + xp_addr_unmap(pool); + kvfree(pool->heads); + kvfree(pool); +} + +struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, + u32 chunk_size, u32 headroom, u64 size, + bool unaligned) +{ + struct xsk_buff_pool *pool; + struct xdp_buff_xsk *xskb; + int err; + u32 i; + + pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL); + if (!pool) + goto out; + + pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL); + if (!pool->heads) + goto out; + + pool->chunk_mask = ~((u64)chunk_size - 1); + pool->addrs_cnt = size; + pool->heads_cnt = chunks; + 
pool->free_heads_cnt = chunks; + pool->headroom = headroom; + pool->chunk_size = chunk_size; + pool->cheap_dma = true; + pool->unaligned = unaligned; + pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM; + INIT_LIST_HEAD(&pool->free_list); + + for (i = 0; i < pool->free_heads_cnt; i++) { + xskb = &pool->heads[i]; + xskb->pool = pool; + xskb->xdp.frame_sz = chunk_size - headroom; + pool->free_heads[i] = xskb; + } + + err = xp_addr_map(pool, pages, nr_pages); + if (!err) + return pool; + +out: + xp_destroy(pool); + return NULL; +} + +void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq) +{ + pool->fq = fq; +} + +void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) +{ + u32 i; + + for (i = 0; i < pool->heads_cnt; i++) + pool->heads[i].xdp.rxq = rxq; +} +EXPORT_SYMBOL(xp_set_rxq_info); + +void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) +{ + dma_addr_t *dma; + u32 i; + + if (pool->dma_pages_cnt == 0) + return; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + dma = &pool->dma_pages[i]; + if (*dma) { + dma_unmap_page_attrs(pool->dev, *dma, PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); + *dma = 0; + } + } + + kvfree(pool->dma_pages); + pool->dma_pages_cnt = 0; + pool->dev = NULL; +} +EXPORT_SYMBOL(xp_dma_unmap); + +static void xp_check_dma_contiguity(struct xsk_buff_pool *pool) +{ + u32 i; + + for (i = 0; i < pool->dma_pages_cnt - 1; i++) { + if (pool->dma_pages[i] + PAGE_SIZE == pool->dma_pages[i + 1]) + pool->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK; + else + pool->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK; + } +} + +static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool) +{ +#if defined(CONFIG_SWIOTLB) + phys_addr_t paddr; + u32 i; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + paddr = dma_to_phys(pool->dev, pool->dma_pages[i]); + if (is_swiotlb_buffer(paddr)) + return false; + } +#endif + return true; +} + +static bool xp_check_cheap_dma(struct xsk_buff_pool *pool) +{ +#if defined(CONFIG_HAS_DMA) + const struct dma_map_ops *ops = get_dma_ops(pool->dev); + + if (ops) { + return !ops->sync_single_for_cpu && + !ops->sync_single_for_device; + } + + if (!dma_is_direct(ops)) + return false; + + if (!xp_check_swiotlb_dma(pool)) + return false; + + if (!dev_is_dma_coherent(pool->dev)) { +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) + return false; +#endif + } +#endif + return true; +} + +int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, + unsigned long attrs, struct page **pages, u32 nr_pages) +{ + dma_addr_t dma; + u32 i; + + pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages), + GFP_KERNEL); + if (!pool->dma_pages) + return -ENOMEM; + + pool->dev = dev; + pool->dma_pages_cnt = nr_pages; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); + if (dma_mapping_error(dev, dma)) { + xp_dma_unmap(pool, attrs); + return -ENOMEM; + } + pool->dma_pages[i] = dma; + } + + if (pool->unaligned) + xp_check_dma_contiguity(pool); + + pool->dev = dev; + pool->cheap_dma = xp_check_cheap_dma(pool); + return 0; +} +EXPORT_SYMBOL(xp_dma_map); + +static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr) +{ + return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size); +} + +static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_unaligned_extract_addr(*addr); + if (*addr >= 
pool->addrs_cnt || + *addr + pool->chunk_size > pool->addrs_cnt || + xp_addr_crosses_non_contig_pg(pool, *addr)) + return false; + return true; +} + +static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_aligned_extract_addr(pool, *addr); + return *addr < pool->addrs_cnt; +} + +static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + u64 addr; + bool ok; + + if (pool->free_heads_cnt == 0) + return NULL; + + xskb = pool->free_heads[--pool->free_heads_cnt]; + + for (;;) { + if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { + xp_release(xskb); + return NULL; + } + + ok = pool->unaligned ? xp_check_unaligned(pool, &addr) : + xp_check_aligned(pool, &addr); + if (!ok) { + pool->fq->invalid_descs++; + xskq_cons_release(pool->fq); + continue; + } + break; + } + xskq_cons_release(pool->fq); + + xskb->orig_addr = addr; + xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; + if (pool->dma_pages_cnt) { + xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); + xskb->dma = xskb->frame_dma + pool->headroom + + XDP_PACKET_HEADROOM; + } + return xskb; +} + +struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + + if (!pool->free_list_cnt) { + xskb = __xp_alloc(pool); + if (!xskb) + return NULL; + } else { + pool->free_list_cnt--; + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, + free_list_node); + list_del(&xskb->free_list_node); + } + + xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; + xskb->xdp.data_meta = xskb->xdp.data; + + if (!pool->cheap_dma) { + dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, + pool->frame_len, + DMA_BIDIRECTIONAL); + } + return &xskb->xdp; +} +EXPORT_SYMBOL(xp_alloc); + +bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count) +{ + if (pool->free_list_cnt >= count) + return true; + return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt); +} +EXPORT_SYMBOL(xp_can_alloc); + +void xp_free(struct xdp_buff_xsk *xskb) +{ + xskb->pool->free_list_cnt++; + list_add(&xskb->free_list_node, &xskb->pool->free_list); +} +EXPORT_SYMBOL(xp_free); + +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; + return pool->addrs + addr; +} +EXPORT_SYMBOL(xp_raw_get_data); + +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? 
xp_unaligned_add_offset_to_addr(addr) : addr; + return (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); +} +EXPORT_SYMBOL(xp_raw_get_dma); + +void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb) +{ + dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0, + xskb->pool->frame_len, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow); + +void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size) +{ + dma_sync_single_range_for_device(pool->dev, dma, 0, + size, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_device_slow); diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index f59791ba43a0..0163b26aaf63 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) du.id = umem->id; du.size = umem->size; du.num_pages = umem->npgs; - du.chunk_size = umem->chunk_size_nohr + umem->headroom; + du.chunk_size = umem->chunk_size; du.headroom = umem->headroom; du.ifindex = umem->dev ? umem->dev->ifindex : 0; du.queue_id = umem->queue_id; diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 57fb81bd593c..6cf9586e5027 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -6,18 +6,10 @@ #include <linux/log2.h> #include <linux/slab.h> #include <linux/overflow.h> +#include <net/xdp_sock_drv.h> #include "xsk_queue.h" -void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask) -{ - if (!q) - return; - - q->umem_size = umem_size; - q->chunk_mask = chunk_mask; -} - static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) { struct xdp_umem_ring *umem_ring; @@ -63,56 +55,3 @@ void xskq_destroy(struct xsk_queue *q) page_frag_free(q->ring); kfree(q); } - -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) -{ - struct xdp_umem_fq_reuse *newq; - - /* Check for overflow */ - if (nentries > (u32)roundup_pow_of_two(nentries)) - return NULL; - nentries = roundup_pow_of_two(nentries); - - newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL); - if (!newq) - return NULL; - memset(newq, 0, offsetof(typeof(*newq), handles)); - - newq->nentries = nentries; - return newq; -} -EXPORT_SYMBOL_GPL(xsk_reuseq_prepare); - -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq) -{ - struct xdp_umem_fq_reuse *oldq = umem->fq_reuse; - - if (!oldq) { - umem->fq_reuse = newq; - return NULL; - } - - if (newq->nentries < oldq->length) - return newq; - - memcpy(newq->handles, oldq->handles, - array_size(oldq->length, sizeof(u64))); - newq->length = oldq->length; - - umem->fq_reuse = newq; - return oldq; -} -EXPORT_SYMBOL_GPL(xsk_reuseq_swap); - -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) -{ - kvfree(rq); -} -EXPORT_SYMBOL_GPL(xsk_reuseq_free); - -void xsk_reuseq_destroy(struct xdp_umem *umem) -{ - xsk_reuseq_free(umem->fq_reuse); - umem->fq_reuse = NULL; -} diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 648733ec24ac..5b5d24d2dd37 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -9,6 +9,9 @@ #include <linux/types.h> #include <linux/if_xdp.h> #include <net/xdp_sock.h> +#include <net/xsk_buff_pool.h> + +#include "xsk.h" struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; @@ -29,8 +32,6 @@ struct xdp_umem_ring { }; struct xsk_queue { - u64 chunk_mask; - u64 umem_size; u32 ring_mask; u32 nentries; u32 cached_prod; @@ -103,98 +104,73 @@ struct xsk_queue { /* Functions that read and validate content from 
consumer rings. */ -static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem, - u64 addr, - u64 length) +static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) { - bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE; - bool next_pg_contig = - (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr & - XSK_NEXT_PG_CONTIG_MASK; - - return cross_pg && !next_pg_contig; -} + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; -static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, - u64 addr, - u64 length, - struct xdp_umem *umem) -{ - u64 base_addr = xsk_umem_extract_addr(addr); + if (q->cached_cons != q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; - addr = xsk_umem_add_offset_to_addr(addr); - if (base_addr >= q->umem_size || addr >= q->umem_size || - xskq_cons_crosses_non_contig_pg(umem, addr, length)) { - q->invalid_descs++; - return false; + *addr = ring->desc[idx]; + return true; } - return true; + return false; } -static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) +static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) { - if (addr >= q->umem_size) { - q->invalid_descs++; + u64 chunk, chunk_end; + + chunk = xp_aligned_extract_addr(pool, desc->addr); + chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len); + if (chunk != chunk_end) + return false; + + if (chunk >= pool->addrs_cnt) return false; - } + if (desc->options) + return false; return true; } -static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) +static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) { - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - while (q->cached_cons != q->cached_prod) { - u32 idx = q->cached_cons & q->ring_mask; + u64 addr, base_addr; - *addr = ring->desc[idx] & q->chunk_mask; + base_addr = xp_unaligned_extract_addr(desc->addr); + addr = xp_unaligned_add_offset_to_addr(desc->addr); - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (xskq_cons_is_valid_unaligned(q, *addr, - umem->chunk_size_nohr, - umem)) - return true; - goto out; - } + if (desc->len > pool->chunk_size) + return false; - if (xskq_cons_is_valid_addr(q, *addr)) - return true; + if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt || + xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) + return false; -out: - q->cached_cons++; - } + if (desc->options) + return false; + return true; +} - return false; +static inline bool xp_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + return pool->unaligned ? 
xp_unaligned_validate_desc(pool, desc) : + xp_aligned_validate_desc(pool, desc); } static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xdp_umem *umem) { - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem)) - return false; - - if (d->len > umem->chunk_size_nohr || d->options) { - q->invalid_descs++; - return false; - } - - return true; - } - - if (!xskq_cons_is_valid_addr(q, d->addr)) - return false; - - if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) || - d->options) { + if (!xp_validate_desc(umem->pool, d)) { q->invalid_descs++; return false; } - return true; } @@ -250,12 +226,11 @@ static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) return entries >= cnt; } -static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) +static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); - return xskq_cons_read_addr(q, addr, umem); + return xskq_cons_read_addr_unchecked(q, addr); } static inline bool xskq_cons_peek_desc(struct xsk_queue *q, @@ -379,11 +354,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } -void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); -/* Executed by the core when the entire UMEM gets freed */ -void xsk_reuseq_destroy(struct xdp_umem *umem); - #endif /* _LINUX_XSK_QUEUE_H */ diff --git a/kernel/bpf/xskmap.c b/net/xdp/xskmap.c index 2cc5c8f4c800..1dc7208c71ba 100644 --- a/kernel/bpf/xskmap.c +++ b/net/xdp/xskmap.c @@ -9,6 +9,8 @@ #include <linux/slab.h> #include <linux/sched.h> +#include "xsk.h" + int xsk_map_inc(struct xsk_map *map) { bpf_map_inc(&map->map); diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index 23837f2ed458..034800c4d1e6 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -50,3 +50,4 @@ xdp_rxq_info xdp_sample_pkts xdp_tx_iptunnel xdpsock +testfile.img diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 424f6fe7ce38..8403e4762306 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -63,14 +63,14 @@ TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o fds_example-objs := fds_example.o sockex1-objs := sockex1_user.o sockex2-objs := sockex2_user.o -sockex3-objs := bpf_load.o sockex3_user.o -tracex1-objs := bpf_load.o tracex1_user.o $(TRACE_HELPERS) -tracex2-objs := bpf_load.o tracex2_user.o -tracex3-objs := bpf_load.o tracex3_user.o -tracex4-objs := bpf_load.o tracex4_user.o -tracex5-objs := bpf_load.o tracex5_user.o $(TRACE_HELPERS) -tracex6-objs := bpf_load.o tracex6_user.o -tracex7-objs := bpf_load.o tracex7_user.o +sockex3-objs := sockex3_user.o +tracex1-objs := tracex1_user.o $(TRACE_HELPERS) +tracex2-objs := tracex2_user.o +tracex3-objs := tracex3_user.o +tracex4-objs := tracex4_user.o +tracex5-objs := tracex5_user.o $(TRACE_HELPERS) +tracex6-objs := tracex6_user.o +tracex7-objs := tracex7_user.o test_probe_write_user-objs := bpf_load.o test_probe_write_user_user.o trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS) lathist-objs := bpf_load.o lathist_user.o diff --git a/samples/bpf/sampleip_kern.c b/samples/bpf/sampleip_kern.c index e504dc308371..f24806ac24e7 100644 --- a/samples/bpf/sampleip_kern.c +++ b/samples/bpf/sampleip_kern.c @@ 
-13,12 +13,12 @@ #define MAX_IPS 8192 -struct bpf_map_def SEC("maps") ip_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u64), - .value_size = sizeof(u32), - .max_entries = MAX_IPS, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u64); + __type(value, u32); + __uint(max_entries, MAX_IPS); +} ip_map SEC(".maps"); SEC("perf_event") int do_sample(struct bpf_perf_event_data *ctx) diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c index 4372d2da2f9e..921c505bb567 100644 --- a/samples/bpf/sampleip_user.c +++ b/samples/bpf/sampleip_user.c @@ -18,9 +18,6 @@ #include "perf-sys.h" #include "trace_helpers.h" -#define __must_check -#include <linux/err.h> - #define DEFAULT_FREQ 99 #define DEFAULT_SECS 5 #define MAX_IPS 8192 @@ -57,7 +54,7 @@ static int sampling_start(int freq, struct bpf_program *prog, return 1; } links[i] = bpf_program__attach_perf_event(prog, pmu_fd); - if (IS_ERR(links[i])) { + if (libbpf_get_error(links[i])) { fprintf(stderr, "ERROR: Attach perf event\n"); links[i] = NULL; close(pmu_fd); @@ -182,7 +179,7 @@ int main(int argc, char **argv) snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); obj = bpf_object__open_file(filename, NULL); - if (IS_ERR(obj)) { + if (libbpf_get_error(obj)) { fprintf(stderr, "ERROR: opening BPF object file failed\n"); obj = NULL; goto cleanup; diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c index 779a5249c418..cab9cca0b8eb 100644 --- a/samples/bpf/sockex3_kern.c +++ b/samples/bpf/sockex3_kern.c @@ -19,12 +19,12 @@ #define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F -struct bpf_map_def SEC("maps") jmp_table = { - .type = BPF_MAP_TYPE_PROG_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u32), - .max_entries = 8, -}; +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 8); +} jmp_table SEC(".maps"); #define PARSE_VLAN 1 #define PARSE_MPLS 2 @@ -92,12 +92,12 @@ struct globals { struct flow_key_record flow; }; -struct bpf_map_def SEC("maps") percpu_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(__u32), - .value_size = sizeof(struct globals), - .max_entries = 32, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, struct globals); + __uint(max_entries, 32); +} percpu_map SEC(".maps"); /* user poor man's per_cpu until native support is ready */ static struct globals *this_cpu_globals(void) @@ -113,12 +113,12 @@ struct pair { __u64 bytes; }; -struct bpf_map_def SEC("maps") hash_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct flow_key_record), - .value_size = sizeof(struct pair), - .max_entries = 1024, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct flow_key_record); + __type(value, struct pair); + __uint(max_entries, 1024); +} hash_map SEC(".maps"); static void update_stats(struct __sk_buff *skb, struct globals *g) { diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c index bbb1cd0666a9..4dbee7427d47 100644 --- a/samples/bpf/sockex3_user.c +++ b/samples/bpf/sockex3_user.c @@ -1,18 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 #include <stdio.h> #include <assert.h> -#include <linux/bpf.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include "sock_example.h" #include <unistd.h> #include <arpa/inet.h> #include <sys/resource.h> -#define PARSE_IP 3 -#define PARSE_IP_PROG_FD (prog_fd[0]) -#define PROG_ARRAY_FD (map_fd[0]) - struct flow_key_record { __be32 src; __be32 
dst; @@ -30,31 +25,55 @@ struct pair { int main(int argc, char **argv) { + int i, sock, key, fd, main_prog_fd, jmp_table_fd, hash_map_fd; struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; + const char *title; FILE *f; - int i, sock, err, id, key = PARSE_IP; - struct bpf_prog_info info = {}; - uint32_t info_len = sizeof(info); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); setrlimit(RLIMIT_MEMLOCK, &r); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + jmp_table_fd = bpf_object__find_map_fd_by_name(obj, "jmp_table"); + hash_map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map"); + if (jmp_table_fd < 0 || hash_map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; } - /* Test fd array lookup which returns the id of the bpf_prog */ - err = bpf_obj_get_info_by_fd(PARSE_IP_PROG_FD, &info, &info_len); - assert(!err); - err = bpf_map_lookup_elem(PROG_ARRAY_FD, &key, &id); - assert(!err); - assert(id == info.id); + bpf_object__for_each_program(prog, obj) { + fd = bpf_program__fd(prog); + + title = bpf_program__title(prog, false); + if (sscanf(title, "socket/%d", &key) != 1) { + fprintf(stderr, "ERROR: finding prog failed\n"); + goto cleanup; + } + + if (key == 0) + main_prog_fd = fd; + else + bpf_map_update_elem(jmp_table_fd, &key, &fd, BPF_ANY); + } sock = open_raw_sock("lo"); - assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd[4], + /* attach BPF program to socket */ + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &main_prog_fd, sizeof(__u32)) == 0); if (argc > 1) @@ -69,8 +88,8 @@ int main(int argc, char **argv) sleep(1); printf("IP src.port -> dst.port bytes packets\n"); - while (bpf_map_get_next_key(map_fd[2], &key, &next_key) == 0) { - bpf_map_lookup_elem(map_fd[2], &next_key, &value); + while (bpf_map_get_next_key(hash_map_fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(hash_map_fd, &next_key, &value); printf("%s.%05d -> %s.%05d %12lld %12lld\n", inet_ntoa((struct in_addr){htonl(next_key.src)}), next_key.port16[0], @@ -80,5 +99,8 @@ int main(int argc, char **argv) key = next_key; } } + +cleanup: + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/trace_common.h b/samples/bpf/trace_common.h new file mode 100644 index 000000000000..8cb5400aed1f --- /dev/null +++ b/samples/bpf/trace_common.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __TRACE_COMMON_H +#define __TRACE_COMMON_H + +#ifdef __x86_64__ +#define SYSCALL(SYS) "__x64_" __stringify(SYS) +#elif defined(__s390x__) +#define SYSCALL(SYS) "__s390x_" __stringify(SYS) +#else +#define SYSCALL(SYS) __stringify(SYS) +#endif + +#endif diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c index da1d69e20645..7d3c66fb3f88 100644 --- a/samples/bpf/trace_event_kern.c +++ b/samples/bpf/trace_event_kern.c @@ -18,19 +18,19 @@ struct key_t { u32 userstack; }; -struct bpf_map_def SEC("maps") counts = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct key_t), - .value_size = sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct key_t); + __type(value, u64); + __uint(max_entries, 
10000); +} counts SEC(".maps"); -struct bpf_map_def SEC("maps") stackmap = { - .type = BPF_MAP_TYPE_STACK_TRACE, - .key_size = sizeof(u32), - .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(u32)); + __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); + __uint(max_entries, 10000); +} stackmap SEC(".maps"); #define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) #define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK) diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c index b6cd358d0418..ac1ba368195c 100644 --- a/samples/bpf/trace_event_user.c +++ b/samples/bpf/trace_event_user.c @@ -16,9 +16,6 @@ #include "perf-sys.h" #include "trace_helpers.h" -#define __must_check -#include <linux/err.h> - #define SAMPLE_FREQ 50 static int pid; @@ -159,7 +156,7 @@ static void test_perf_event_all_cpu(struct perf_event_attr *attr) goto all_cpu_err; } links[i] = bpf_program__attach_perf_event(prog, pmu_fd); - if (IS_ERR(links[i])) { + if (libbpf_get_error(links[i])) { printf("bpf_program__attach_perf_event failed\n"); links[i] = NULL; close(pmu_fd); @@ -198,7 +195,7 @@ static void test_perf_event_task(struct perf_event_attr *attr) goto err; } link = bpf_program__attach_perf_event(prog, pmu_fd); - if (IS_ERR(link)) { + if (libbpf_get_error(link)) { printf("bpf_program__attach_perf_event failed\n"); link = NULL; close(pmu_fd); @@ -314,7 +311,7 @@ int main(int argc, char **argv) } obj = bpf_object__open_file(filename, NULL); - if (IS_ERR(obj)) { + if (libbpf_get_error(obj)) { printf("opening BPF object file failed\n"); obj = NULL; goto cleanup; diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c index 55fddbd08702..9d4adb7fd834 100644 --- a/samples/bpf/tracex1_user.c +++ b/samples/bpf/tracex1_user.c @@ -1,21 +1,41 @@ // SPDX-License-Identifier: GPL-2.0 #include <stdio.h> -#include <linux/bpf.h> #include <unistd.h> -#include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include "trace_helpers.h" int main(int ac, char **argv) { - FILE *f; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; + FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; } f = popen("taskset 1 ping -c5 localhost", "r"); @@ -23,5 +43,8 @@ int main(int ac, char **argv) read_trace_pipe(); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c index d865bb309bcb..5bc696bac27d 100644 --- a/samples/bpf/tracex2_kern.c +++ b/samples/bpf/tracex2_kern.c @@ -10,13 +10,14 @@ #include <uapi/linux/bpf.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> +#include "trace_common.h" -struct bpf_map_def SEC("maps") my_map = { - .type = 
BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(long), - .max_entries = 1024, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, long); + __uint(max_entries, 1024); +} my_map SEC(".maps"); /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe * example will no longer be meaningful @@ -70,14 +71,14 @@ struct hist_key { u64 index; }; -struct bpf_map_def SEC("maps") my_hist_map = { - .type = BPF_MAP_TYPE_PERCPU_HASH, - .key_size = sizeof(struct hist_key), - .value_size = sizeof(long), - .max_entries = 1024, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(struct hist_key)); + __uint(value_size, sizeof(long)); + __uint(max_entries, 1024); +} my_hist_map SEC(".maps"); -SEC("kprobe/sys_write") +SEC("kprobe/" SYSCALL(sys_write)) int bpf_prog3(struct pt_regs *ctx) { long write_size = PT_REGS_PARM3(ctx); diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c index c9544a4ce61a..3e36b3e4e3ef 100644 --- a/samples/bpf/tracex2_user.c +++ b/samples/bpf/tracex2_user.c @@ -3,17 +3,19 @@ #include <unistd.h> #include <stdlib.h> #include <signal.h> -#include <linux/bpf.h> #include <string.h> #include <sys/resource.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include "bpf_util.h" #define MAX_INDEX 64 #define MAX_STARS 38 +/* my_map, my_hist_map */ +static int map_fd[2]; + static void stars(char *str, long val, long max, int width) { int i; @@ -115,18 +117,39 @@ static void int_exit(int sig) int main(int ac, char **argv) { struct rlimit r = {1024*1024, RLIM_INFINITY}; - char filename[256]; long key, next_key, value; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; + char filename[256]; + int i, j = 0; FILE *f; - int i; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); if (setrlimit(RLIMIT_MEMLOCK, &r)) { perror("setrlimit(RLIMIT_MEMLOCK)"); return 1; } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "my_map"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "my_hist_map"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + signal(SIGINT, int_exit); signal(SIGTERM, int_exit); @@ -138,9 +161,14 @@ int main(int ac, char **argv) f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r"); (void) f; - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + bpf_object__for_each_program(prog, obj) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; } for (i = 0; i < 5; i++) { @@ -156,5 +184,10 @@ int main(int ac, char **argv) } print_hist(map_fd[1]); +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c index fe21c14feb8d..659613c19a82 100644 --- a/samples/bpf/tracex3_kern.c +++ b/samples/bpf/tracex3_kern.c @@ -11,12 +11,12 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> -struct bpf_map_def SEC("maps") my_map = 
{ - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(u64), - .max_entries = 4096, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, u64); + __uint(max_entries, 4096); +} my_map SEC(".maps"); /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe * example will no longer be meaningful @@ -42,12 +42,12 @@ static unsigned int log2l(unsigned long long n) #define SLOTS 100 -struct bpf_map_def SEC("maps") lat_map = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = SLOTS, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, SLOTS); +} lat_map SEC(".maps"); SEC("kprobe/blk_account_io_completion") int bpf_prog2(struct pt_regs *ctx) diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c index cf8fedc773f2..70e987775c15 100644 --- a/samples/bpf/tracex3_user.c +++ b/samples/bpf/tracex3_user.c @@ -7,11 +7,10 @@ #include <unistd.h> #include <stdbool.h> #include <string.h> -#include <linux/bpf.h> #include <sys/resource.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include "bpf_util.h" #define SLOTS 100 @@ -109,20 +108,11 @@ static void print_hist(int fd) int main(int ac, char **argv) { struct rlimit r = {1024*1024, RLIM_INFINITY}; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; - int i; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } + int map_fd, i, j = 0; for (i = 1; i < ac; i++) { if (strcmp(argv[i], "-a") == 0) { @@ -137,6 +127,40 @@ int main(int ac, char **argv) } } + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "lat_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; + } + printf(" heatmap of IO latency\n"); if (text_only) printf(" %s", sym[num_colors - 1]); @@ -153,9 +177,14 @@ int main(int ac, char **argv) for (i = 0; ; i++) { if (i % 20 == 0) print_banner(); - print_hist(map_fd[1]); + print_hist(map_fd); sleep(2); } +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c index b1bb9df88f8e..eb0f8fdd14bf 100644 --- a/samples/bpf/tracex4_kern.c +++ b/samples/bpf/tracex4_kern.c @@ -15,12 +15,12 @@ struct pair { u64 ip; }; -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(struct pair), - .max_entries = 1000000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + 
__type(key, long); + __type(value, struct pair); + __uint(max_entries, 1000000); +} my_map SEC(".maps"); /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe * example will no longer be meaningful diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c index ec52203fce39..e8faf8f184ae 100644 --- a/samples/bpf/tracex4_user.c +++ b/samples/bpf/tracex4_user.c @@ -8,11 +8,10 @@ #include <stdbool.h> #include <string.h> #include <time.h> -#include <linux/bpf.h> #include <sys/resource.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> struct pair { long long val; @@ -36,8 +35,8 @@ static void print_old_objects(int fd) key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */ key = -1; - while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) { - bpf_map_lookup_elem(map_fd[0], &next_key, &v); + while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(fd, &next_key, &v); key = next_key; if (val - v.val < 1000000000ll) /* object was allocated more then 1 sec ago */ @@ -50,25 +49,55 @@ static void print_old_objects(int fd) int main(int ac, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; - int i; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + int map_fd, i, j = 0; if (setrlimit(RLIMIT_MEMLOCK, &r)) { perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); return 1; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; } for (i = 0; ; i++) { - print_old_objects(map_fd[1]); + print_old_objects(map_fd); sleep(1); } +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c index 481790fde864..32b49e8ab6bd 100644 --- a/samples/bpf/tracex5_kern.c +++ b/samples/bpf/tracex5_kern.c @@ -15,16 +15,16 @@ #define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F -struct bpf_map_def SEC("maps") progs = { - .type = BPF_MAP_TYPE_PROG_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u32), +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); #ifdef __mips__ - .max_entries = 6000, /* MIPS n64 syscalls start at 5000 */ + __uint(max_entries, 6000); /* MIPS n64 syscalls start at 5000 */ #else - .max_entries = 1024, + __uint(max_entries, 1024); #endif -}; +} progs SEC(".maps"); SEC("kprobe/__seccomp_filter") int bpf_prog1(struct pt_regs *ctx) diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c index c2317b39e0d2..98dad57a96c4 100644 --- a/samples/bpf/tracex5_user.c +++ b/samples/bpf/tracex5_user.c @@ -1,15 +1,21 @@ // SPDX-License-Identifier: 
GPL-2.0 #include <stdio.h> -#include <linux/bpf.h> +#include <stdlib.h> #include <unistd.h> #include <linux/filter.h> #include <linux/seccomp.h> #include <sys/prctl.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include <sys/resource.h> #include "trace_helpers.h" +#ifdef __mips__ +#define MAX_ENTRIES 6000 /* MIPS n64 syscalls start at 5000 */ +#else +#define MAX_ENTRIES 1024 +#endif + /* install fake seccomp program to enable seccomp code path inside the kernel, * so that our kprobe attached to seccomp_phase1() can be triggered */ @@ -28,16 +34,57 @@ static void install_accept_all_seccomp(void) int main(int ac, char **argv) { - FILE *f; - char filename[256]; struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; + int key, fd, progs_fd; + char filename[256]; + const char *title; + FILE *f; - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); setrlimit(RLIMIT_MEMLOCK, &r); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + + progs_fd = bpf_object__find_map_fd_by_name(obj, "progs"); + if (progs_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + title = bpf_program__title(prog, false); + /* register only syscalls to PROG_ARRAY */ + if (sscanf(title, "kprobe/%d", &key) != 1) + continue; + + fd = bpf_program__fd(prog); + bpf_map_update_elem(progs_fd, &key, &fd, BPF_ANY); } install_accept_all_seccomp(); @@ -47,5 +94,8 @@ int main(int ac, char **argv) read_trace_pipe(); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c index 96c234efa852..acad5712d8b4 100644 --- a/samples/bpf/tracex6_kern.c +++ b/samples/bpf/tracex6_kern.c @@ -3,24 +3,26 @@ #include <uapi/linux/bpf.h> #include <bpf/bpf_helpers.h> -struct bpf_map_def SEC("maps") counters = { - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u32), - .max_entries = 64, -}; -struct bpf_map_def SEC("maps") values = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(int), - .value_size = sizeof(u64), - .max_entries = 64, -}; -struct bpf_map_def SEC("maps") values2 = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(int), - .value_size = sizeof(struct bpf_perf_event_value), - .max_entries = 64, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 64); +} counters SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, u64); + __uint(max_entries, 64); +} values SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, struct bpf_perf_event_value); + 
__uint(max_entries, 64); +} values2 SEC(".maps"); SEC("kprobe/htab_map_get_next_key") int bpf_prog1(struct pt_regs *ctx) diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c index 4bb3c830adb2..33df9784775d 100644 --- a/samples/bpf/tracex6_user.c +++ b/samples/bpf/tracex6_user.c @@ -4,7 +4,6 @@ #include <assert.h> #include <fcntl.h> #include <linux/perf_event.h> -#include <linux/bpf.h> #include <sched.h> #include <stdio.h> #include <stdlib.h> @@ -15,12 +14,15 @@ #include <sys/wait.h> #include <unistd.h> -#include "bpf_load.h" #include <bpf/bpf.h> +#include <bpf/libbpf.h> #include "perf-sys.h" #define SAMPLE_PERIOD 0x7fffffffffffffffULL +/* counters, values, values2 */ +static int map_fd[3]; + static void check_on_cpu(int cpu, struct perf_event_attr *attr) { struct bpf_perf_event_value value2; @@ -174,16 +176,51 @@ static void test_bpf_perf_event(void) int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; + int i = 0; + + setrlimit(RLIMIT_MEMLOCK, &r); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } - setrlimit(RLIMIT_MEMLOCK, &r); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counters"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "values"); + map_fd[2] = bpf_object__find_map_fd_by_name(obj, "values2"); + if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; } test_bpf_perf_event(); + +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c index ea6dae78f0df..fdcd6580dd73 100644 --- a/samples/bpf/tracex7_user.c +++ b/samples/bpf/tracex7_user.c @@ -1,28 +1,51 @@ #define _GNU_SOURCE #include <stdio.h> -#include <linux/bpf.h> #include <unistd.h> -#include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> int main(int argc, char **argv) { - FILE *f; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; char command[256]; - int ret; + int ret = 0; + FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; 
+ goto cleanup; } snprintf(command, 256, "mount %s tmpmnt/", argv[1]); f = popen(command, "r"); ret = pclose(f); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return ret ? 0 : 1; } diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 9b8f21abeac4..f3468168982e 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -19,9 +19,6 @@ static const char *__doc__ = #include <time.h> #include <linux/limits.h> -#define __must_check -#include <linux/err.h> - #include <arpa/inet.h> #include <linux/if_link.h> @@ -622,7 +619,7 @@ static struct bpf_link * attach_tp(struct bpf_object *obj, } link = bpf_program__attach_tracepoint(prog, tp_category, tp_name); - if (IS_ERR(link)) + if (libbpf_get_error(link)) exit(EXIT_FAIL_BPF); return link; diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index e4d9da654e84..a226aee3574f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -29,8 +29,8 @@ CGROUP COMMANDS | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } | *ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** | | **bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** | -| **sendmsg4** | **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | -| **getsockopt** | **setsockopt** } +| **getpeername4** | **getpeername6** | **getsockname4** | **getsockname6** | **sendmsg4** | +| **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | **getsockopt** | **setsockopt** } | *ATTACH_FLAGS* := { **multi** | **override** } DESCRIPTION @@ -101,7 +101,11 @@ DESCRIPTION an unconnected udp6 socket (since 5.2); **sysctl** sysctl access (since 5.2); **getsockopt** call to getsockopt (since 5.3); - **setsockopt** call to setsockopt (since 5.3). + **setsockopt** call to setsockopt (since 5.3); + **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8); + **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8); + **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8); + **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8). 
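The four new attach types above all take BPF_PROG_TYPE_CGROUP_SOCK_ADDR programs, which run on the getpeername(2)/getsockname(2) paths and may rewrite the address that is copied back to user space. A minimal BPF-side sketch of such a program (illustrative only: the program name, port and address below are made up and are not part of this patch):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    char _license[] SEC("license") = "GPL";

    SEC("cgroup/getpeername4")
    int rewrite_peername4(struct bpf_sock_addr *ctx)
    {
    	/* Present 127.0.0.1:4040 as the peer address, whatever backend the
    	 * socket is really connected to; returning 1 lets getpeername(2)
    	 * complete with the rewritten address.
    	 */
    	ctx->user_ip4 = bpf_htonl(0x7f000001);	/* 127.0.0.1 */
    	ctx->user_port = bpf_htons(4040);	/* kept in network byte order */
    	return 1;
    }

Once loaded and pinned, a program like this can be attached with the new types listed above, e.g. bpftool cgroup attach *CGROUP* **getpeername4** **pinned** *FILE*; the connect_force_port selftest changes further below exercise exactly this pattern.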
**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* Detach *PROG* from the cgroup *CGROUP* and attach type diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 5948e9d89c8d..2b254959d488 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -41,7 +41,8 @@ PROG COMMANDS | **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** | | **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** | | **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | -| **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | +| **cgroup/connect4** | **cgroup/connect6** | **cgroup/getpeername4** | **cgroup/getpeername6** | +| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | | **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** | | **cgroup/getsockopt** | **cgroup/setsockopt** | | **struct_ops** | **fentry** | **fexit** | **freplace** diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 9f0f20e73b87..25b25aca1112 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -472,6 +472,8 @@ _bpftool() lwt_seg6local sockops sk_skb sk_msg \ lirc_mode2 cgroup/bind4 cgroup/bind6 \ cgroup/connect4 cgroup/connect6 \ + cgroup/getpeername4 cgroup/getpeername6 \ + cgroup/getsockname4 cgroup/getsockname6 \ cgroup/sendmsg4 cgroup/sendmsg6 \ cgroup/recvmsg4 cgroup/recvmsg6 \ cgroup/post_bind4 cgroup/post_bind6 \ @@ -966,9 +968,10 @@ _bpftool() ;; attach|detach) local ATTACH_TYPES='ingress egress sock_create sock_ops \ - device bind4 bind6 post_bind4 post_bind6 connect4 \ - connect6 sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl \ - getsockopt setsockopt' + device bind4 bind6 post_bind4 post_bind6 connect4 connect6 \ + getpeername4 getpeername6 getsockname4 getsockname6 \ + sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl getsockopt \ + setsockopt' local ATTACH_FLAGS='multi override' local PROG_TYPE='id pinned tag name' case $prev in @@ -977,9 +980,9 @@ _bpftool() return 0 ;; ingress|egress|sock_create|sock_ops|device|bind4|bind6|\ - post_bind4|post_bind6|connect4|connect6|sendmsg4|\ - sendmsg6|recvmsg4|recvmsg6|sysctl|getsockopt|\ - setsockopt) + post_bind4|post_bind6|connect4|connect6|getpeername4|\ + getpeername6|getsockname4|getsockname6|sendmsg4|sendmsg6|\ + recvmsg4|recvmsg6|sysctl|getsockopt|setsockopt) COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \ "$cur" ) ) return 0 diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 1693c802bb20..27931db421d8 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -25,9 +25,10 @@ " ATTACH_TYPE := { ingress | egress | sock_create |\n" \ " sock_ops | device | bind4 | bind6 |\n" \ " post_bind4 | post_bind6 | connect4 |\n" \ - " connect6 | sendmsg4 | sendmsg6 |\n" \ - " recvmsg4 | recvmsg6 | sysctl |\n" \ - " getsockopt | setsockopt }" + " connect6 | getpeername4 | getpeername6 |\n" \ + " getsockname4 | getsockname6 | sendmsg4 |\n" \ + " sendmsg6 | recvmsg4 | recvmsg6 |\n" \ + " sysctl | getsockopt | setsockopt }" static unsigned int query_flags; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index f89ac70ef973..5cdf0bc049bd 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -100,6 +100,10 @@ static const char * const 
attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_CGROUP_INET6_CONNECT] = "connect6", [BPF_CGROUP_INET4_POST_BIND] = "post_bind4", [BPF_CGROUP_INET6_POST_BIND] = "post_bind6", + [BPF_CGROUP_INET4_GETPEERNAME] = "getpeername4", + [BPF_CGROUP_INET6_GETPEERNAME] = "getpeername6", + [BPF_CGROUP_INET4_GETSOCKNAME] = "getsockname4", + [BPF_CGROUP_INET6_GETSOCKNAME] = "getsockname6", [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4", [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6", [BPF_CGROUP_SYSCTL] = "sysctl", diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index b6e5ba568f98..245f941fdbcf 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -2012,8 +2012,10 @@ static int do_help(int argc, char **argv) " sk_reuseport | flow_dissector | cgroup/sysctl |\n" " cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n" " cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n" - " cgroup/sendmsg4 | cgroup/sendmsg6 | cgroup/recvmsg4 |\n" - " cgroup/recvmsg6 | cgroup/getsockopt | cgroup/setsockopt |\n" + " cgroup/getpeername4 | cgroup/getpeername6 |\n" + " cgroup/getsockname4 | cgroup/getsockname6 | cgroup/sendmsg4 |\n" + " cgroup/sendmsg6 | cgroup/recvmsg4 | cgroup/recvmsg6 |\n" + " cgroup/getsockopt | cgroup/setsockopt |\n" " struct_ops | fentry | fexit | freplace }\n" " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" " flow_dissector }\n" diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 146c742f1d49..97e1fd19ff58 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -73,7 +73,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[]; /* Arbitrary size */ + __u8 data[0]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; @@ -2015,8 +2019,8 @@ union bpf_attr { * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is - * only possible to shrink the packet as of this writing, - * therefore *delta* must be a negative integer. + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. * * A call to this helper is susceptible to change the underlying * packet buffer. 
Therefore, at load time, all checks on pointers diff --git a/tools/lib/bpf/hashmap.c b/tools/lib/bpf/hashmap.c index cffb96202e0d..a405dad068f5 100644 --- a/tools/lib/bpf/hashmap.c +++ b/tools/lib/bpf/hashmap.c @@ -60,7 +60,7 @@ struct hashmap *hashmap__new(hashmap_hash_fn hash_fn, void hashmap__clear(struct hashmap *map) { struct hashmap_entry *cur, *tmp; - int bkt; + size_t bkt; hashmap__for_each_entry_safe(map, cur, tmp, bkt) { free(cur); @@ -100,8 +100,7 @@ static int hashmap_grow(struct hashmap *map) struct hashmap_entry **new_buckets; struct hashmap_entry *cur, *tmp; size_t new_cap_bits, new_cap; - size_t h; - int bkt; + size_t h, bkt; new_cap_bits = map->cap_bits + 1; if (new_cap_bits < HASHMAP_MIN_CAP_BITS) diff --git a/tools/lib/bpf/hashmap.h b/tools/lib/bpf/hashmap.h index bae8879cdf58..e823b35e7371 100644 --- a/tools/lib/bpf/hashmap.h +++ b/tools/lib/bpf/hashmap.h @@ -15,7 +15,6 @@ #else #include <bits/reg.h> #endif -#include "libbpf_internal.h" static inline size_t hash_bits(size_t h, int bits) { diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 292257995487..fa04cbe547ed 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6705,6 +6705,14 @@ static const struct bpf_sec_def section_defs[] = { BPF_CGROUP_UDP4_RECVMSG), BPF_EAPROG_SEC("cgroup/recvmsg6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_RECVMSG), + BPF_EAPROG_SEC("cgroup/getpeername4", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET4_GETPEERNAME), + BPF_EAPROG_SEC("cgroup/getpeername6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET6_GETPEERNAME), + BPF_EAPROG_SEC("cgroup/getsockname4", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET4_GETSOCKNAME), + BPF_EAPROG_SEC("cgroup/getsockname6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET6_GETSOCKNAME), BPF_EAPROG_SEC("cgroup/sysctl", BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_CGROUP_SYSCTL), BPF_EAPROG_SEC("cgroup/getsockopt", BPF_PROG_TYPE_CGROUP_SOCKOPT, diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 0f67f1b470b0..e885d351595f 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -1,6 +1,8 @@ ================== BPF Selftest Notes ================== +General instructions on running selftests can be found in +`Documentation/bpf/bpf_devel_QA.rst`_. Additional information about selftest failures are documented here. 
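All of the tracex4..tracex7 user-space conversions above replace bpf_load.h with the same libbpf sequence: bpf_object__open_file(), bpf_object__load(), lookups by name, bpf_program__attach(), and bpf_link__destroy()/bpf_object__close() on the way out. Condensed into a self-contained sketch (the object, map and program names here are placeholders, not taken from the patch):

    // SPDX-License-Identifier: GPL-2.0
    #include <stdio.h>
    #include <bpf/bpf.h>
    #include <bpf/libbpf.h>

    int main(void)
    {
    	struct bpf_link *link = NULL;
    	struct bpf_program *prog;
    	struct bpf_object *obj;
    	int map_fd, err = 1;

    	obj = bpf_object__open_file("example_kern.o", NULL);	/* placeholder object */
    	if (libbpf_get_error(obj)) {
    		fprintf(stderr, "ERROR: opening BPF object file failed\n");
    		return 1;
    	}

    	if (bpf_object__load(obj)) {
    		fprintf(stderr, "ERROR: loading BPF object file failed\n");
    		goto cleanup;
    	}

    	map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");	/* placeholder map */
    	if (map_fd < 0) {
    		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
    		goto cleanup;
    	}

    	prog = bpf_object__find_program_by_name(obj, "bpf_prog1");	/* placeholder prog */
    	if (!prog) {
    		fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
    		goto cleanup;
    	}

    	link = bpf_program__attach(prog);
    	if (libbpf_get_error(link)) {
    		fprintf(stderr, "ERROR: bpf_program__attach failed\n");
    		link = NULL;
    		goto cleanup;
    	}

    	/* ... use map_fd with bpf_map_lookup_elem()/bpf_map_update_elem() ... */
    	err = 0;

    cleanup:
    	bpf_link__destroy(link);	/* link may be NULL here, as in tracex5/tracex7 above */
    	bpf_object__close(obj);
    	return err;
    }

The error handling mirrors the converted samples: a link is created per attached program, torn down on cleanup, and the object is closed last.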
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 60e3ae5d4e48..2118e23ac07a 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -25,6 +25,7 @@ CONFIG_XDP_SOCKETS=y CONFIG_FTRACE_SYSCALLS=y CONFIG_IPV6_TUNNEL=y CONFIG_IPV6_GRE=y +CONFIG_IPV6_SEG6_BPF=y CONFIG_NET_FOU=m CONFIG_NET_FOU_IP_TUNNELS=y CONFIG_IPV6_FOU=m @@ -37,3 +38,4 @@ CONFIG_IPV6_SIT=m CONFIG_BPF_JIT=y CONFIG_BPF_LSM=y CONFIG_SECURITY=y +CONFIG_LIRC=y diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 999a775484c1..e36dd1a1780d 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -5,6 +5,8 @@ #include <string.h> #include <unistd.h> +#include <arpa/inet.h> + #include <sys/epoll.h> #include <linux/err.h> @@ -35,7 +37,7 @@ struct ipv6_packet pkt_v6 = { .tcp.doff = 5, }; -int start_server(int family, int type) +int start_server_with_port(int family, int type, __u16 port) { struct sockaddr_storage addr = {}; socklen_t len; @@ -45,11 +47,13 @@ int start_server(int family, int type) struct sockaddr_in *sin = (void *)&addr; sin->sin_family = AF_INET; + sin->sin_port = htons(port); len = sizeof(*sin); } else { struct sockaddr_in6 *sin6 = (void *)&addr; sin6->sin6_family = AF_INET6; + sin6->sin6_port = htons(port); len = sizeof(*sin6); } @@ -76,6 +80,11 @@ int start_server(int family, int type) return fd; } +int start_server(int family, int type) +{ + return start_server_with_port(family, type, 0); +} + static const struct timeval timeo_sec = { .tv_sec = 3 }; static const size_t timeo_optlen = sizeof(timeo_sec); diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 86914e6e7b53..6a8009605670 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -34,6 +34,7 @@ struct ipv6_packet { extern struct ipv6_packet pkt_v6; int start_server(int family, int type); +int start_server_with_port(int family, int type, __u16 port); int connect_to_fd(int family, int type, int server_fd); int connect_fd_to_fd(int client_fd, int server_fd); int connect_wait(int client_fd); diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/prog_tests/align.c index 0262f7b374f9..c548aded6585 100644 --- a/tools/testing/selftests/bpf/test_align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -1,24 +1,5 @@ -#include <asm/types.h> -#include <linux/types.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <errno.h> -#include <string.h> -#include <stddef.h> -#include <stdbool.h> - -#include <linux/unistd.h> -#include <linux/filter.h> -#include <linux/bpf_perf_event.h> -#include <linux/bpf.h> - -#include <bpf/bpf.h> - -#include "../../../include/linux/filter.h" -#include "bpf_rlimit.h" -#include "bpf_util.h" +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> #define MAX_INSNS 512 #define MAX_MATCHES 16 @@ -359,15 +340,15 @@ static struct bpf_align_test tests[] = { * is still (4n), fixed offset is not changed. * Also, we create a new reg->id. */ - {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc))"}, + {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (18) * which is 20. 
Then the variable offset is (4n), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"}, - {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"}, + {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"}, + {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"}, }, }, { @@ -410,15 +391,15 @@ static struct bpf_align_test tests[] = { /* Adding 14 makes R6 be (4n+2) */ {9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, /* Packet pointer has (4n+2) offset */ - {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, - {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, + {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, /* Newly read value in R6 was shifted left by 2, so has * known alignment of 4. */ @@ -426,15 +407,15 @@ static struct bpf_align_test tests[] = { /* Added (4n) to packet pointer's (4n+2) var_off, giving * another (4n+2). */ - {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"}, - {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"}, + {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, + {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"}, + {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, }, }, { @@ -469,16 +450,16 @@ static struct bpf_align_test tests[] = { .matches = { {4, "R5_w=pkt_end(id=0,off=0,imm=0)"}, /* (ptr - ptr) << 2 == unknown, (4n) */ - {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"}, + {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"}, /* (4n) + 14 == (4n+2). We blow our bounds, because * the add could overflow. 
*/ - {7, "R5_w=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"}, + {7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>=0 */ - {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {9, "R5=inv(id=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, /* packet pointer + nonnegative (4n+2) */ - {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, - {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. * We checked the bounds, but it might have been able * to overflow if the packet pointer started in the @@ -486,7 +467,7 @@ static struct bpf_align_test tests[] = { * So we did not get a 'range' on R6, and the access * attempt will fail. */ - {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, } }, { @@ -528,7 +509,7 @@ static struct bpf_align_test tests[] = { /* New unknown value in R7 is (4n) */ {11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* Subtracting it from R6 blows our unsigned bounds */ - {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,var_off=(0x2; 0xfffffffffffffffc))"}, + {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>= 0 */ {14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"}, /* At the time the word size load is performed from R5, @@ -537,7 +518,8 @@ static struct bpf_align_test tests[] = { * the total offset is 4-byte aligned and meets the * load's requirements. */ - {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"}, + }, }, { @@ -579,18 +561,18 @@ static struct bpf_align_test tests[] = { /* Adding 14 makes R6 be (4n+2) */ {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"}, /* Subtracting from packet pointer overflows ubounds */ - {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c))"}, + {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"}, /* New unknown value in R7 is (4n), >= 76 */ {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"}, /* Adding it to packet pointer gives nice bounds again */ - {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"}, + {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. 
*/ - {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"}, + {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, }, }, }; @@ -669,51 +651,16 @@ static int do_test_single(struct bpf_align_test *test) return ret; } -static int do_test(unsigned int from, unsigned int to) +void test_align(void) { - int all_pass = 0; - int all_fail = 0; unsigned int i; - for (i = from; i < to; i++) { + for (i = 0; i < ARRAY_SIZE(tests); i++) { struct bpf_align_test *test = &tests[i]; - int fail; - printf("Test %3d: %s ... ", - i, test->descr); - fail = do_test_single(test); - if (fail) { - all_fail++; - printf("FAIL\n"); - } else { - all_pass++; - printf("PASS\n"); - } - } - printf("Results: %d pass %d fail\n", - all_pass, all_fail); - return all_fail ? EXIT_FAILURE : EXIT_SUCCESS; -} + if (!test__start_subtest(test->descr)) + continue; -int main(int argc, char **argv) -{ - unsigned int from = 0, to = ARRAY_SIZE(tests); - - if (argc == 3) { - unsigned int l = atoi(argv[argc - 2]); - unsigned int u = atoi(argv[argc - 1]); - - if (l < to && u < to) { - from = l; - to = u + 1; - } - } else if (argc == 2) { - unsigned int t = atoi(argv[argc - 1]); - - if (t < to) { - from = t; - to = t + 1; - } + CHECK_FAIL(do_test_single(test)); } - return do_test(from, to); } diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c index 47fbb20cb6a6..17bbf76812ca 100644 --- a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c +++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c @@ -4,7 +4,8 @@ #include "cgroup_helpers.h" #include "network_helpers.h" -static int verify_port(int family, int fd, int expected) +static int verify_ports(int family, int fd, + __u16 expected_local, __u16 expected_peer) { struct sockaddr_storage addr; socklen_t len = sizeof(addr); @@ -20,9 +21,25 @@ static int verify_port(int family, int fd, int expected) else port = ((struct sockaddr_in6 *)&addr)->sin6_port; - if (ntohs(port) != expected) { - log_err("Unexpected port %d, expected %d", ntohs(port), - expected); + if (ntohs(port) != expected_local) { + log_err("Unexpected local port %d, expected %d", ntohs(port), + expected_local); + return -1; + } + + if (getpeername(fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get peer addr"); + return -1; + } + + if (family == AF_INET) + port = ((struct sockaddr_in *)&addr)->sin_port; + else + port = ((struct sockaddr_in6 *)&addr)->sin6_port; + + if (ntohs(port) != expected_peer) { + log_err("Unexpected peer port %d, expected %d", ntohs(port), + expected_peer); return -1; } @@ -31,33 +48,67 @@ static int verify_port(int family, int fd, int expected) static int run_test(int cgroup_fd, int server_fd, int family, int type) { + bool v4 = family == AF_INET; + __u16 expected_local_port = v4 ? 22222 : 22223; + __u16 expected_peer_port = 60000; struct bpf_prog_load_attr attr = { - .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + .file = v4 ? 
"./connect_force_port4.o" : + "./connect_force_port6.o", }; + struct bpf_program *prog; struct bpf_object *obj; - int expected_port; - int prog_fd; - int err; - int fd; - - if (family == AF_INET) { - attr.file = "./connect_force_port4.o"; - attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT; - expected_port = 22222; - } else { - attr.file = "./connect_force_port6.o"; - attr.expected_attach_type = BPF_CGROUP_INET6_CONNECT; - expected_port = 22223; - } + int xlate_fd, fd, err; + __u32 duration = 0; - err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); + err = bpf_prog_load_xattr(&attr, &obj, &xlate_fd); if (err) { log_err("Failed to load BPF object"); return -1; } - err = bpf_prog_attach(prog_fd, cgroup_fd, attr.expected_attach_type, - 0); + prog = bpf_object__find_program_by_title(obj, v4 ? + "cgroup/connect4" : + "cgroup/connect6"); + if (CHECK(!prog, "find_prog", "connect prog not found\n")) { + err = -EIO; + goto close_bpf_object; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ? + BPF_CGROUP_INET4_CONNECT : + BPF_CGROUP_INET6_CONNECT, 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + prog = bpf_object__find_program_by_title(obj, v4 ? + "cgroup/getpeername4" : + "cgroup/getpeername6"); + if (CHECK(!prog, "find_prog", "getpeername prog not found\n")) { + err = -EIO; + goto close_bpf_object; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ? + BPF_CGROUP_INET4_GETPEERNAME : + BPF_CGROUP_INET6_GETPEERNAME, 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + prog = bpf_object__find_program_by_title(obj, v4 ? + "cgroup/getsockname4" : + "cgroup/getsockname6"); + if (CHECK(!prog, "find_prog", "getsockname prog not found\n")) { + err = -EIO; + goto close_bpf_object; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ? 
+ BPF_CGROUP_INET4_GETSOCKNAME : + BPF_CGROUP_INET6_GETSOCKNAME, 0); if (err) { log_err("Failed to attach BPF program"); goto close_bpf_object; @@ -69,8 +120,8 @@ static int run_test(int cgroup_fd, int server_fd, int family, int type) goto close_bpf_object; } - err = verify_port(family, fd, expected_port); - + err = verify_ports(family, fd, expected_local_port, + expected_peer_port); close(fd); close_bpf_object: @@ -86,25 +137,25 @@ void test_connect_force_port(void) if (CHECK_FAIL(cgroup_fd < 0)) return; - server_fd = start_server(AF_INET, SOCK_STREAM); + server_fd = start_server_with_port(AF_INET, SOCK_STREAM, 60123); if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM)); close(server_fd); - server_fd = start_server(AF_INET6, SOCK_STREAM); + server_fd = start_server_with_port(AF_INET6, SOCK_STREAM, 60124); if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM)); close(server_fd); - server_fd = start_server(AF_INET, SOCK_DGRAM); + server_fd = start_server_with_port(AF_INET, SOCK_DGRAM, 60123); if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM)); close(server_fd); - server_fd = start_server(AF_INET6, SOCK_DGRAM); + server_fd = start_server_with_port(AF_INET6, SOCK_DGRAM, 60124); if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM)); diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c index 4867cd3445c8..b57bd6fef208 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c @@ -1,11 +1,27 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__bpf_map #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__bpf_map { + struct bpf_iter_meta *meta; + struct bpf_map *map; +} __attribute__((preserve_access_index)); + SEC("iter/bpf_map") int dump_bpf_map(struct bpf_iter__bpf_map *ctx) { diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c index ab9e2650e021..c8e9ca74c87b 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c @@ -1,9 +1,25 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__ipv6_route bpf_iter__ipv6_route___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__ipv6_route #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__ipv6_route { + struct bpf_iter_meta *meta; + struct fib6_info *rt; +} __attribute__((preserve_access_index)); + char _license[] SEC("license") = "GPL"; extern bool 
CONFIG_IPV6_SUBTREES __kconfig __weak; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c index 6b40a233d4e0..e7b8753eac0b 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c @@ -1,6 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__netlink bpf_iter__netlink___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__netlink #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> @@ -9,6 +14,17 @@ char _license[] SEC("license") = "GPL"; #define sk_rmem_alloc sk_backlog.rmem_alloc #define sk_refcnt __sk_common.skc_refcnt +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__netlink { + struct bpf_iter_meta *meta; + struct netlink_sock *sk; +} __attribute__((preserve_access_index)); + static inline struct inode *SOCK_INODE(struct socket *socket) { return &container_of(socket, struct socket_alloc, socket)->vfs_inode; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c index 90f9011c57ca..ee754021f98e 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c @@ -1,11 +1,27 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task bpf_iter__task___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task { + struct bpf_iter_meta *meta; + struct task_struct *task; +} __attribute__((preserve_access_index)); + SEC("iter/task") int dump_task(struct bpf_iter__task *ctx) { diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c index c6ced38f0880..0f0ec3db20ba 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c @@ -1,11 +1,29 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task_file bpf_iter__task_file___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task_file #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task_file { + struct bpf_iter_meta *meta; + struct task_struct *task; + __u32 fd; + struct file *file; +} __attribute__((preserve_access_index)); + SEC("iter/task_file") int dump_task_file(struct bpf_iter__task_file *ctx) { diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c index 636a00fa074d..13c2c90c835f 100644 --- 
a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c @@ -1,10 +1,25 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task bpf_iter__task___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task #include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task { + struct bpf_iter_meta *meta; + struct task_struct *task; +} __attribute__((preserve_access_index)); + SEC("iter/task") int dump_task(struct bpf_iter__task *ctx) { diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c index b18dc0471d07..0aa71b333cf3 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c @@ -1,10 +1,25 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__bpf_map #include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__bpf_map { + struct bpf_iter_meta *meta; + struct bpf_map *map; +} __attribute__((preserve_access_index)); + __u32 map1_id = 0, map2_id = 0; __u32 map1_accessed = 0, map2_accessed = 0; __u64 map1_seqnum = 0, map2_seqnum1 = 0, map2_seqnum2 = 0; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h index bdd51cf14b54..dee1339e6905 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h @@ -1,11 +1,27 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task bpf_iter__task___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task #include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; int count = 0; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task { + struct bpf_iter_meta *meta; + struct task_struct *task; +} __attribute__((preserve_access_index)); + SEC("iter/task") int dump_task(struct bpf_iter__task *ctx) { diff --git a/tools/testing/selftests/bpf/progs/connect_force_port4.c b/tools/testing/selftests/bpf/progs/connect_force_port4.c index 1b8eb34b2db0..7396308677a3 100644 --- a/tools/testing/selftests/bpf/progs/connect_force_port4.c +++ b/tools/testing/selftests/bpf/progs/connect_force_port4.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <string.h> +#include <stdbool.h> #include <linux/bpf.h> #include <linux/in.h> @@ -12,17 +13,71 @@ char _license[] SEC("license") = "GPL"; int _version SEC("version") = 1; +struct svc_addr { + __be32 addr; + __be16 port; +}; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct svc_addr); +} 
service_mapping SEC(".maps"); + SEC("cgroup/connect4") -int _connect4(struct bpf_sock_addr *ctx) +int connect4(struct bpf_sock_addr *ctx) { struct sockaddr_in sa = {}; + struct svc_addr *orig; + /* Force local address to 127.0.0.1:22222. */ sa.sin_family = AF_INET; sa.sin_port = bpf_htons(22222); - sa.sin_addr.s_addr = bpf_htonl(0x7f000001); /* 127.0.0.1 */ + sa.sin_addr.s_addr = bpf_htonl(0x7f000001); if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) return 0; + /* Rewire service 1.2.3.4:60000 to backend 127.0.0.1:60123. */ + if (ctx->user_port == bpf_htons(60000)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!orig) + return 0; + + orig->addr = ctx->user_ip4; + orig->port = ctx->user_port; + + ctx->user_ip4 = bpf_htonl(0x7f000001); + ctx->user_port = bpf_htons(60123); + } + return 1; +} + +SEC("cgroup/getsockname4") +int getsockname4(struct bpf_sock_addr *ctx) +{ + /* Expose local server as 1.2.3.4:60000 to client. */ + if (ctx->user_port == bpf_htons(60123)) { + ctx->user_ip4 = bpf_htonl(0x01020304); + ctx->user_port = bpf_htons(60000); + } + return 1; +} + +SEC("cgroup/getpeername4") +int getpeername4(struct bpf_sock_addr *ctx) +{ + struct svc_addr *orig; + + /* Expose service 1.2.3.4:60000 as peer instead of backend. */ + if (ctx->user_port == bpf_htons(60123)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0); + if (orig) { + ctx->user_ip4 = orig->addr; + ctx->user_port = orig->port; + } + } return 1; } diff --git a/tools/testing/selftests/bpf/progs/connect_force_port6.c b/tools/testing/selftests/bpf/progs/connect_force_port6.c index ae6f7d750b4c..c1a2b555e9ad 100644 --- a/tools/testing/selftests/bpf/progs/connect_force_port6.c +++ b/tools/testing/selftests/bpf/progs/connect_force_port6.c @@ -12,17 +12,83 @@ char _license[] SEC("license") = "GPL"; int _version SEC("version") = 1; +struct svc_addr { + __be32 addr[4]; + __be16 port; +}; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct svc_addr); +} service_mapping SEC(".maps"); + SEC("cgroup/connect6") -int _connect6(struct bpf_sock_addr *ctx) +int connect6(struct bpf_sock_addr *ctx) { struct sockaddr_in6 sa = {}; + struct svc_addr *orig; + /* Force local address to [::1]:22223. */ sa.sin6_family = AF_INET6; sa.sin6_port = bpf_htons(22223); - sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); /* ::1 */ + sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) return 0; + /* Rewire service [fc00::1]:60000 to backend [::1]:60124. */ + if (ctx->user_port == bpf_htons(60000)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!orig) + return 0; + + orig->addr[0] = ctx->user_ip6[0]; + orig->addr[1] = ctx->user_ip6[1]; + orig->addr[2] = ctx->user_ip6[2]; + orig->addr[3] = ctx->user_ip6[3]; + orig->port = ctx->user_port; + + ctx->user_ip6[0] = 0; + ctx->user_ip6[1] = 0; + ctx->user_ip6[2] = 0; + ctx->user_ip6[3] = bpf_htonl(1); + ctx->user_port = bpf_htons(60124); + } + return 1; +} + +SEC("cgroup/getsockname6") +int getsockname6(struct bpf_sock_addr *ctx) +{ + /* Expose local server as [fc00::1]:60000 to client. 
*/ + if (ctx->user_port == bpf_htons(60124)) { + ctx->user_ip6[0] = bpf_htonl(0xfc000000); + ctx->user_ip6[1] = 0; + ctx->user_ip6[2] = 0; + ctx->user_ip6[3] = bpf_htonl(1); + ctx->user_port = bpf_htons(60000); + } + return 1; +} + +SEC("cgroup/getpeername6") +int getpeername6(struct bpf_sock_addr *ctx) +{ + struct svc_addr *orig; + + /* Expose service [fc00::1]:60000 as peer instead of backend. */ + if (ctx->user_port == bpf_htons(60124)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0); + if (orig) { + ctx->user_ip6[0] = orig->addr[0]; + ctx->user_ip6[1] = orig->addr[1]; + ctx->user_ip6[2] = orig->addr[2]; + ctx->user_ip6[3] = orig->addr[3]; + ctx->user_port = orig->port; + } + } return 1; } diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c index d2b38fa6a5b0..e83d0b48d80c 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c +++ b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c @@ -73,6 +73,7 @@ int bpf_sk_lookup_test0(struct __sk_buff *skb) tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); + bpf_printk("sk=%d\n", sk ? 1 : 0); if (sk) bpf_sk_release(sk); return sk ? TC_ACT_OK : TC_ACT_UNSPEC; diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h index 9b4d3a68a91a..a443d3637db3 100644 --- a/tools/testing/selftests/bpf/test_sockmap_kern.h +++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h @@ -110,8 +110,6 @@ int bpf_prog2(struct __sk_buff *skb) flags = *f; } - bpf_printk("sk_skb2: redirect(%iB) flags=%i\n", - len, flags); #ifdef SOCKMAP return bpf_sk_redirect_map(skb, &sock_map, ret, flags); #else @@ -143,8 +141,6 @@ int bpf_sockmap(struct bpf_sock_ops *skops) err = bpf_sock_hash_update(skops, &sock_map, &ret, BPF_NOEXIST); #endif - bpf_printk("passive(%i -> %i) map ctx update err: %d\n", - lport, bpf_ntohl(rport), err); } break; case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: @@ -160,8 +156,6 @@ int bpf_sockmap(struct bpf_sock_ops *skops) err = bpf_sock_hash_update(skops, &sock_map, &ret, BPF_NOEXIST); #endif - bpf_printk("active(%i -> %i) map ctx update err: %d\n", - lport, bpf_ntohl(rport), err); } break; default: @@ -199,72 +193,6 @@ int bpf_prog4(struct sk_msg_md *msg) } SEC("sk_msg2") -int bpf_prog5(struct sk_msg_md *msg) -{ - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; - int *start, *end, *start_push, *end_push, *start_pop, *pop; - int *bytes, len1, len2 = 0, len3, len4; - int err1 = -1, err2 = -1; - - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - err1 = bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - err2 = bpf_msg_cork_bytes(msg, *bytes); - len1 = (__u64)msg->data_end - (__u64)msg->data; - start = bpf_map_lookup_elem(&sock_bytes, &zero); - end = bpf_map_lookup_elem(&sock_bytes, &one); - if (start && end) { - int err; - - bpf_printk("sk_msg2: pull(%i:%i)\n", - start ? *start : 0, end ? 
*end : 0); - err = bpf_msg_pull_data(msg, *start, *end, 0); - if (err) - bpf_printk("sk_msg2: pull_data err %i\n", - err); - len2 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length update %i->%i\n", - len1, len2); - } - - start_push = bpf_map_lookup_elem(&sock_bytes, &two); - end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) { - int err; - - bpf_printk("sk_msg2: push(%i:%i)\n", - start_push ? *start_push : 0, - end_push ? *end_push : 0); - err = bpf_msg_push_data(msg, *start_push, *end_push, 0); - if (err) - bpf_printk("sk_msg2: push_data err %i\n", err); - len3 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length push_update %i->%i\n", - len2 ? len2 : len1, len3); - } - start_pop = bpf_map_lookup_elem(&sock_bytes, &four); - pop = bpf_map_lookup_elem(&sock_bytes, &five); - if (start_pop && pop) { - int err; - - bpf_printk("sk_msg2: pop(%i@%i)\n", - start_pop, pop); - err = bpf_msg_pop_data(msg, *start_pop, *pop, 0); - if (err) - bpf_printk("sk_msg2: pop_data err %i\n", err); - len4 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length pop_data %i->%i\n", - len1 ? len1 : 0, len4); - } - - bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n", - len1, err1, err2); - return SK_PASS; -} - -SEC("sk_msg3") int bpf_prog6(struct sk_msg_md *msg) { int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0; @@ -305,86 +233,7 @@ int bpf_prog6(struct sk_msg_md *msg) #endif } -SEC("sk_msg4") -int bpf_prog7(struct sk_msg_md *msg) -{ - int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f; - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; - int len1, len2 = 0, len3, len4; - int err1 = 0, err2 = 0, key = 0; - __u64 flags = 0; - - int err; - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - err1 = bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - err2 = bpf_msg_cork_bytes(msg, *bytes); - len1 = (__u64)msg->data_end - (__u64)msg->data; - - start = bpf_map_lookup_elem(&sock_bytes, &zero); - end = bpf_map_lookup_elem(&sock_bytes, &one); - if (start && end) { - bpf_printk("sk_msg2: pull(%i:%i)\n", - start ? *start : 0, end ? *end : 0); - err = bpf_msg_pull_data(msg, *start, *end, 0); - if (err) - bpf_printk("sk_msg2: pull_data err %i\n", - err); - len2 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length update %i->%i\n", - len1, len2); - } - - start_push = bpf_map_lookup_elem(&sock_bytes, &two); - end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) { - bpf_printk("sk_msg4: push(%i:%i)\n", - start_push ? *start_push : 0, - end_push ? *end_push : 0); - err = bpf_msg_push_data(msg, *start_push, *end_push, 0); - if (err) - bpf_printk("sk_msg4: push_data err %i\n", - err); - len3 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg4: length push_update %i->%i\n", - len2 ? len2 : len1, len3); - } - - start_pop = bpf_map_lookup_elem(&sock_bytes, &four); - pop = bpf_map_lookup_elem(&sock_bytes, &five); - if (start_pop && pop) { - int err; - - bpf_printk("sk_msg4: pop(%i@%i)\n", - start_pop, pop); - err = bpf_msg_pop_data(msg, *start_pop, *pop, 0); - if (err) - bpf_printk("sk_msg4: pop_data err %i\n", err); - len4 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg4: length pop_data %i->%i\n", - len1 ? 
len1 : 0, len4); - } - - - f = bpf_map_lookup_elem(&sock_redir_flags, &zero); - if (f && *f) { - key = 2; - flags = *f; - } - bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n", - len1, flags, err1 ? err1 : err2); -#ifdef SOCKMAP - err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); -#else - err = bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags); -#endif - bpf_printk("sk_msg3: err %i\n", err); - return err; -} - -SEC("sk_msg5") +SEC("sk_msg3") int bpf_prog8(struct sk_msg_md *msg) { void *data_end = (void *)(long) msg->data_end; @@ -401,7 +250,7 @@ int bpf_prog8(struct sk_msg_md *msg) } return SK_PASS; } -SEC("sk_msg6") +SEC("sk_msg4") int bpf_prog9(struct sk_msg_md *msg) { void *data_end = (void *)(long) msg->data_end; @@ -419,7 +268,7 @@ int bpf_prog9(struct sk_msg_md *msg) return SK_PASS; } -SEC("sk_msg7") +SEC("sk_msg5") int bpf_prog10(struct sk_msg_md *msg) { int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop; @@ -443,7 +292,6 @@ int bpf_prog10(struct sk_msg_md *msg) pop = bpf_map_lookup_elem(&sock_bytes, &five); if (start_pop && pop) bpf_msg_pop_data(msg, *start_pop, *pop, 0); - bpf_printk("return sk drop\n"); return SK_DROP; } diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 779e11da979c..c80643828b82 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -54,7 +54,7 @@ static void running_handler(int a); #define S1_PORT 10000 #define S2_PORT 10001 -#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o" +#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o" #define BPF_SOCKHASH_FILENAME "test_sockhash_kern.o" #define CG_PATH "/sockmap" @@ -68,9 +68,7 @@ struct bpf_map *maps[8]; int prog_fd[11]; int txmsg_pass; -int txmsg_noisy; int txmsg_redir; -int txmsg_redir_noisy; int txmsg_drop; int txmsg_apply; int txmsg_cork; @@ -89,15 +87,13 @@ static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, {"cgroup", required_argument, NULL, 'c' }, {"rate", required_argument, NULL, 'r' }, - {"verbose", no_argument, NULL, 'v' }, + {"verbose", optional_argument, NULL, 'v' }, {"iov_count", required_argument, NULL, 'i' }, {"length", required_argument, NULL, 'l' }, {"test", required_argument, NULL, 't' }, {"data_test", no_argument, NULL, 'd' }, {"txmsg", no_argument, &txmsg_pass, 1 }, - {"txmsg_noisy", no_argument, &txmsg_noisy, 1 }, {"txmsg_redir", no_argument, &txmsg_redir, 1 }, - {"txmsg_redir_noisy", no_argument, &txmsg_redir_noisy, 1}, {"txmsg_drop", no_argument, &txmsg_drop, 1 }, {"txmsg_apply", required_argument, NULL, 'a'}, {"txmsg_cork", required_argument, NULL, 'k'}, @@ -111,9 +107,104 @@ static const struct option long_options[] = { {"txmsg_skb", no_argument, &txmsg_skb, 1 }, {"ktls", no_argument, &ktls, 1 }, {"peek", no_argument, &peek_flag, 1 }, + {"whitelist", required_argument, NULL, 'n' }, + {"blacklist", required_argument, NULL, 'b' }, {0, 0, NULL, 0 } }; +struct test_env { + const char *type; + const char *subtest; + const char *prepend; + + int test_num; + int subtest_num; + + int succ_cnt; + int fail_cnt; + int fail_last; +}; + +struct test_env env; + +struct sockmap_options { + int verbose; + bool base; + bool sendpage; + bool data_test; + bool drop_expected; + int iov_count; + int iov_length; + int rate; + char *map; + char *whitelist; + char *blacklist; + char *prepend; +}; + +struct _test { + char *title; + void (*tester)(int cg_fd, struct sockmap_options *opt); +}; + +static void test_start(void) +{ + 
env.subtest_num++; +} + +static void test_fail(void) +{ + env.fail_cnt++; +} + +static void test_pass(void) +{ + env.succ_cnt++; +} + +static void test_reset(void) +{ + txmsg_start = txmsg_end = 0; + txmsg_start_pop = txmsg_pop = 0; + txmsg_start_push = txmsg_end_push = 0; + txmsg_pass = txmsg_drop = txmsg_redir = 0; + txmsg_apply = txmsg_cork = 0; + txmsg_ingress = txmsg_skb = 0; +} + +static int test_start_subtest(const struct _test *t, struct sockmap_options *o) +{ + env.type = o->map; + env.subtest = t->title; + env.prepend = o->prepend; + env.test_num++; + env.subtest_num = 0; + env.fail_last = env.fail_cnt; + test_reset(); + return 0; +} + +static void test_end_subtest(void) +{ + int error = env.fail_cnt - env.fail_last; + int type = strcmp(env.type, BPF_SOCKMAP_FILENAME); + + if (!error) + test_pass(); + + fprintf(stdout, "#%2d/%2d %8s:%s:%s:%s\n", + env.test_num, env.subtest_num, + !type ? "sockmap" : "sockhash", + env.prepend ? : "", + env.subtest, error ? "FAIL" : "OK"); +} + +static void test_print_results(void) +{ + fprintf(stdout, "Pass: %d Fail: %d\n", + env.succ_cnt, env.fail_cnt); +} + static void usage(char *argv[]) { int i; @@ -296,7 +387,7 @@ static int sockmap_init_sockets(int verbose) return errno; } - if (verbose) { + if (verbose > 1) { printf("connected sockets: c1 <-> p1, c2 <-> p2\n"); printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n", c1, s1, c2, s2); @@ -311,17 +402,6 @@ struct msg_stats { struct timespec end; }; -struct sockmap_options { - int verbose; - bool base; - bool sendpage; - bool data_test; - bool drop_expected; - int iov_count; - int iov_length; - int rate; -}; - static int msg_loop_sendpage(int fd, int iov_length, int cnt, struct msg_stats *s, struct sockmap_options *opt) @@ -345,14 +425,18 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt, clock_gettime(CLOCK_MONOTONIC, &s->start); for (i = 0; i < cnt; i++) { - int sent = sendfile(fd, fp, NULL, iov_length); + int sent; + + errno = 0; + sent = sendfile(fd, fp, NULL, iov_length); if (!drop && sent < 0) { - perror("send loop error"); + perror("sendpage loop error"); fclose(file); return sent; } else if (drop && sent >= 0) { - printf("sendpage loop error expected: %i\n", sent); + printf("sendpage loop error expected: %i errno %i\n", + sent, errno); fclose(file); return -EIO; } @@ -464,13 +548,18 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, if (tx) { clock_gettime(CLOCK_MONOTONIC, &s->start); for (i = 0; i < cnt; i++) { - int sent = sendmsg(fd, &msg, flags); + int sent; + + errno = 0; + sent = sendmsg(fd, &msg, flags); if (!drop && sent < 0) { - perror("send loop error"); + perror("sendmsg loop error"); goto out_errno; } else if (drop && sent >= 0) { - printf("send loop error expected: %i\n", sent); + fprintf(stderr, + "sendmsg loop error expected: %i errno %i\n", + sent, errno); errno = -EIO; goto out_errno; } @@ -497,9 +586,10 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, * paths. 
*/ total_bytes = (float)iov_count * (float)iov_length * (float)cnt; - txmsg_pop_total = txmsg_pop; if (txmsg_apply) - txmsg_pop_total *= (total_bytes / txmsg_apply); + txmsg_pop_total = txmsg_pop * (total_bytes / txmsg_apply); + else + txmsg_pop_total = txmsg_pop * cnt; total_bytes -= txmsg_pop_total; err = clock_gettime(CLOCK_MONOTONIC, &s->start); if (err < 0) @@ -633,14 +723,18 @@ static int sendmsg_test(struct sockmap_options *opt) rxpid = fork(); if (rxpid == 0) { + iov_buf -= (txmsg_pop - txmsg_start_pop + 1); if (opt->drop_expected) - exit(0); + _exit(0); + + if (!iov_buf) /* zero bytes sent case */ + _exit(0); if (opt->sendpage) iov_count = 1; err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false, opt); - if (opt->verbose) + if (opt->verbose > 1) fprintf(stderr, "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", iov_count, iov_buf, cnt, err); @@ -648,7 +742,7 @@ static int sendmsg_test(struct sockmap_options *opt) sent_Bps = sentBps(s); recvd_Bps = recvdBps(s); } - if (opt->verbose) + if (opt->verbose > 1) fprintf(stdout, "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s %s\n", s.bytes_sent, sent_Bps, sent_Bps/giga, @@ -678,7 +772,7 @@ static int sendmsg_test(struct sockmap_options *opt) sent_Bps = sentBps(s); recvd_Bps = recvdBps(s); } - if (opt->verbose) + if (opt->verbose > 1) fprintf(stdout, "tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n", s.bytes_sent, sent_Bps, sent_Bps/giga, @@ -694,14 +788,14 @@ static int sendmsg_test(struct sockmap_options *opt) if (WIFEXITED(rx_status)) { err = WEXITSTATUS(rx_status); if (err) { - fprintf(stderr, "rx thread exited with err %d. ", err); + fprintf(stderr, "rx thread exited with err %d.\n", err); goto out; } } if (WIFEXITED(tx_status)) { err = WEXITSTATUS(tx_status); if (err) - fprintf(stderr, "tx thread exited with err %d. 
", err); + fprintf(stderr, "tx thread exited with err %d.\n", err); } out: return err; @@ -783,6 +877,7 @@ static int forever_ping_pong(int rate, struct sockmap_options *opt) } enum { + SELFTESTS, PING_PONG, SENDMSG, BASE, @@ -834,19 +929,14 @@ run: /* Attach txmsg program to sockmap */ if (txmsg_pass) tx_prog_fd = prog_fd[3]; - else if (txmsg_noisy) - tx_prog_fd = prog_fd[4]; else if (txmsg_redir) + tx_prog_fd = prog_fd[4]; + else if (txmsg_apply) tx_prog_fd = prog_fd[5]; - else if (txmsg_redir_noisy) + else if (txmsg_cork) tx_prog_fd = prog_fd[6]; else if (txmsg_drop) - tx_prog_fd = prog_fd[9]; - /* apply and cork must be last */ - else if (txmsg_apply) tx_prog_fd = prog_fd[7]; - else if (txmsg_cork) - tx_prog_fd = prog_fd[8]; else tx_prog_fd = 0; @@ -870,7 +960,7 @@ run: goto out; } - if (txmsg_redir || txmsg_redir_noisy) + if (txmsg_redir) redir_fd = c2; else redir_fd = c1; @@ -1112,12 +1202,8 @@ static void test_options(char *options) if (txmsg_pass) strncat(options, "pass,", OPTSTRING); - if (txmsg_noisy) - strncat(options, "pass_noisy,", OPTSTRING); if (txmsg_redir) strncat(options, "redir,", OPTSTRING); - if (txmsg_redir_noisy) - strncat(options, "redir_noisy,", OPTSTRING); if (txmsg_drop) strncat(options, "drop,", OPTSTRING); if (txmsg_apply) { @@ -1168,416 +1254,283 @@ static int __test_exec(int cgrp, int test, struct sockmap_options *opt) test_options(options); - fprintf(stdout, - "[TEST %i]: (%i, %i, %i, %s, %s): ", - test_cnt, opt->rate, opt->iov_count, opt->iov_length, - test_to_str(test), options); - fflush(stdout); + if (opt->verbose) { + fprintf(stdout, + " [TEST %i]: (%i, %i, %i, %s, %s): ", + test_cnt, opt->rate, opt->iov_count, opt->iov_length, + test_to_str(test), options); + fflush(stdout); + } err = run_options(opt, cgrp, test); - fprintf(stdout, "%s\n", !err ? "PASS" : "FAILED"); + if (opt->verbose) + fprintf(stdout, " %s\n", !err ? "PASS" : "FAILED"); test_cnt++; !err ? 
passed++ : failed++; free(options); return err; } -static int test_exec(int cgrp, struct sockmap_options *opt) -{ - int err = __test_exec(cgrp, SENDMSG, opt); - - if (err) - goto out; - - err = __test_exec(cgrp, SENDPAGE, opt); -out: - return err; -} - -static int test_loop(int cgrp) -{ - struct sockmap_options opt; - - int err, i, l, r; - - opt.verbose = 0; - opt.base = false; - opt.sendpage = false; - opt.data_test = false; - opt.drop_expected = false; - opt.iov_count = 0; - opt.iov_length = 0; - opt.rate = 0; - - r = 1; - for (i = 1; i < 100; i += 33) { - for (l = 1; l < 100; l += 33) { - opt.rate = r; - opt.iov_count = i; - opt.iov_length = l; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } - } - sched_yield(); -out: - return err; -} - -static int test_txmsg(int cgrp) +static void test_exec(int cgrp, struct sockmap_options *opt) { + int type = strcmp(opt->map, BPF_SOCKMAP_FILENAME); int err; - txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0; - txmsg_apply = txmsg_cork = 0; - txmsg_ingress = txmsg_skb = 0; - - txmsg_pass = 1; - err = test_loop(cgrp); - txmsg_pass = 0; - if (err) - goto out; - - txmsg_redir = 1; - err = test_loop(cgrp); - txmsg_redir = 0; - if (err) - goto out; - - txmsg_drop = 1; - err = test_loop(cgrp); - txmsg_drop = 0; - if (err) - goto out; - - txmsg_redir = 1; - txmsg_ingress = 1; - err = test_loop(cgrp); - txmsg_redir = 0; - txmsg_ingress = 0; - if (err) - goto out; -out: - txmsg_pass = 0; - txmsg_redir = 0; - txmsg_drop = 0; - return err; + if (type == 0) { + test_start(); + err = __test_exec(cgrp, SENDMSG, opt); + if (err) + test_fail(); + } else { + test_start(); + err = __test_exec(cgrp, SENDPAGE, opt); + if (err) + test_fail(); + } } -static int test_send(struct sockmap_options *opt, int cgrp) +static void test_send_one(struct sockmap_options *opt, int cgrp) { - int err; - opt->iov_length = 1; opt->iov_count = 1; opt->rate = 1; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); opt->iov_length = 1; opt->iov_count = 1024; opt->rate = 1; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); opt->iov_length = 1024; opt->iov_count = 1; opt->rate = 1; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); - opt->iov_length = 1; +} + +static void test_send_many(struct sockmap_options *opt, int cgrp) +{ + opt->iov_length = 3; opt->iov_count = 1; opt->rate = 512; - err = test_exec(cgrp, opt); - if (err) - goto out; - - opt->iov_length = 256; - opt->iov_count = 1024; - opt->rate = 2; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); opt->rate = 100; opt->iov_count = 1; opt->iov_length = 5; - err = test_exec(cgrp, opt); - if (err) - goto out; -out: - sched_yield(); - return err; + test_exec(cgrp, opt); } -static int test_mixed(int cgrp) +static void test_send_large(struct sockmap_options *opt, int cgrp) { - struct sockmap_options opt = {0}; - int err; + opt->iov_length = 256; + opt->iov_count = 1024; + opt->rate = 2; + test_exec(cgrp, opt); +} - txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0; - txmsg_apply = txmsg_cork = 0; - txmsg_start = txmsg_end = 0; - txmsg_start_push = txmsg_end_push = 0; - txmsg_start_pop = txmsg_pop = 0; +static void test_send(struct sockmap_options *opt, int cgrp) +{ + test_send_one(opt, cgrp); + test_send_many(opt, cgrp); + test_send_large(opt, cgrp); + sched_yield(); +} +static void test_txmsg_pass(int cgrp, struct sockmap_options *opt) +{ /* Test small and large iov_count values with pass/redir/apply/cork 
*/ txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; + test_send(opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 0; - txmsg_cork = 1; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_redir(int cgrp, struct sockmap_options *opt) +{ + txmsg_redir = 1; + test_send(opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1; - txmsg_cork = 1; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_drop(int cgrp, struct sockmap_options *opt) +{ + txmsg_drop = 1; + test_send(opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1024; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_ingress_redir(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = txmsg_drop = 0; + txmsg_ingress = txmsg_redir = 1; + test_send(opt, cgrp); +} +/* Test cork with hung data. This tests poor usage patterns where + * cork can leave data on the ring if user program is buggy and + * doesn't flush them somehow. They do take some time however + * because they wait for a timeout. Test pass, redir and cork with + * apply logic. Use cork size of 4097 with send_large to avoid + * aligning cork size with send size. + */ +static void test_txmsg_cork_hangs(int cgrp, struct sockmap_options *opt) +{ txmsg_pass = 1; txmsg_redir = 0; + txmsg_cork = 4097; + txmsg_apply = 4097; + test_send_large(opt, cgrp); + + txmsg_pass = 0; + txmsg_redir = 1; txmsg_apply = 0; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_cork = 4097; + test_send_large(opt, cgrp); - txmsg_pass = 1; + txmsg_pass = 0; + txmsg_redir = 1; + txmsg_apply = 4097; + txmsg_cork = 4097; + test_send_large(opt, cgrp); +} + +static void test_txmsg_pull(int cgrp, struct sockmap_options *opt) +{ + /* Test basic start/end */ + txmsg_start = 1; + txmsg_end = 2; + test_send(opt, cgrp); + + /* Test >4k pull */ + txmsg_start = 4096; + txmsg_end = 9182; + test_send_large(opt, cgrp); + + /* Test pull + redirect */ txmsg_redir = 0; - txmsg_apply = 1024; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_start = 1; + txmsg_end = 2; + test_send(opt, cgrp); - txmsg_pass = 1; + /* Test pull + cork */ txmsg_redir = 0; - txmsg_cork = 4096; - txmsg_apply = 4096; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_cork = 512; + txmsg_start = 1; + txmsg_end = 2; + test_send_many(opt, cgrp); - txmsg_pass = 0; + /* Test pull + cork + redirect */ txmsg_redir = 1; - txmsg_apply = 1; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_cork = 512; + txmsg_start = 1; + txmsg_end = 2; + test_send_many(opt, cgrp); +} - txmsg_pass = 0; - txmsg_redir = 1; - txmsg_apply = 0; - txmsg_cork = 1; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_pop(int cgrp, struct sockmap_options *opt) +{ + /* Test basic pop */ + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(opt, cgrp); - txmsg_pass = 0; - txmsg_redir = 1; - txmsg_apply = 1024; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; + /* Test pop with >4k */ + txmsg_start_pop = 4096; + txmsg_pop = 4096; + test_send_large(opt, cgrp); - txmsg_pass = 0; + /* Test pop + redirect */ txmsg_redir = 1; - txmsg_apply = 0; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(opt, cgrp); - txmsg_pass = 0; - txmsg_redir = 
1; - txmsg_apply = 1024; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + /* Test pop + cork */ + txmsg_redir = 0; + txmsg_cork = 512; + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(opt, cgrp); - txmsg_pass = 0; + /* Test pop + redirect + cork */ txmsg_redir = 1; - txmsg_cork = 4096; - txmsg_apply = 4096; - err = test_send(&opt, cgrp); - if (err) - goto out; -out: - return err; + txmsg_cork = 4; + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(opt, cgrp); } -static int test_start_end(int cgrp) +static void test_txmsg_push(int cgrp, struct sockmap_options *opt) { - struct sockmap_options opt = {0}; - int err, i; + /* Test basic push */ + txmsg_start_push = 1; + txmsg_end_push = 1; + test_send(opt, cgrp); - /* Test basic start/end with lots of iov_count and iov_lengths */ - txmsg_start = 1; - txmsg_end = 2; + /* Test push 4kB >4k */ + txmsg_start_push = 4096; + txmsg_end_push = 4096; + test_send_large(opt, cgrp); + + /* Test push + redirect */ + txmsg_redir = 1; txmsg_start_push = 1; txmsg_end_push = 2; - txmsg_start_pop = 1; - txmsg_pop = 1; - err = test_txmsg(cgrp); - if (err) - goto out; + test_send_many(opt, cgrp); - /* Cut a byte of pushed data but leave reamining in place */ - txmsg_start = 1; - txmsg_end = 2; + /* Test push + cork */ + txmsg_redir = 0; + txmsg_cork = 512; txmsg_start_push = 1; - txmsg_end_push = 3; - txmsg_start_pop = 1; - txmsg_pop = 1; - err = test_txmsg(cgrp); - if (err) - goto out; - - /* Test start/end with cork */ - opt.rate = 16; - opt.iov_count = 1; - opt.iov_length = 100; - txmsg_cork = 1600; - - txmsg_start_pop = 0; - txmsg_pop = 0; - - for (i = 99; i <= 1600; i += 500) { - txmsg_start = 0; - txmsg_end = i; - txmsg_start_push = 0; - txmsg_end_push = i; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } - - /* Test pop data in middle of cork */ - for (i = 99; i <= 1600; i += 500) { - txmsg_start_pop = 10; - txmsg_pop = i; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } - txmsg_start_pop = 0; - txmsg_pop = 0; - - /* Test start/end with cork but pull data in middle */ - for (i = 199; i <= 1600; i += 500) { - txmsg_start = 100; - txmsg_end = i; - txmsg_start_push = 100; - txmsg_end_push = i; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } - - /* Test start/end with cork pulling last sg entry */ - txmsg_start = 1500; - txmsg_end = 1600; - txmsg_start_push = 1500; - txmsg_end_push = 1600; - err = test_exec(cgrp, &opt); - if (err) - goto out; + txmsg_end_push = 2; + test_send_many(opt, cgrp); +} - /* Test pop with cork pulling last sg entry */ - txmsg_start_pop = 1500; - txmsg_pop = 1600; - err = test_exec(cgrp, &opt); - if (err) - goto out; - txmsg_start_pop = 0; - txmsg_pop = 0; - - /* Test start/end pull of single byte in last page */ - txmsg_start = 1111; - txmsg_end = 1112; - txmsg_start_push = 1111; - txmsg_end_push = 1112; - err = test_exec(cgrp, &opt); - if (err) - goto out; +static void test_txmsg_push_pop(int cgrp, struct sockmap_options *opt) +{ + txmsg_start_push = 1; + txmsg_end_push = 10; + txmsg_start_pop = 5; + txmsg_pop = 4; + test_send_large(opt, cgrp); +} - /* Test pop of single byte in last page */ - txmsg_start_pop = 1111; - txmsg_pop = 1112; - err = test_exec(cgrp, &opt); - if (err) - goto out; +static void test_txmsg_apply(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 1; + txmsg_cork = 0; + test_send_one(opt, cgrp); - /* Test start/end with end < start */ - txmsg_start = 1111; - txmsg_end = 0; - txmsg_start_push = 1111; - 
txmsg_end_push = 0; - err = test_exec(cgrp, &opt); - if (err) - goto out; + txmsg_pass = 0; + txmsg_redir = 1; + txmsg_apply = 1; + txmsg_cork = 0; + test_send_one(opt, cgrp); - /* Test start/end with end > data */ - txmsg_start = 0; - txmsg_end = 1601; - txmsg_start_push = 0; - txmsg_end_push = 1601; - err = test_exec(cgrp, &opt); - if (err) - goto out; + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 1024; + txmsg_cork = 0; + test_send_large(opt, cgrp); - /* Test start/end with start > data */ - txmsg_start = 1601; - txmsg_end = 1600; - txmsg_start_push = 1601; - txmsg_end_push = 1600; - err = test_exec(cgrp, &opt); - if (err) - goto out; + txmsg_pass = 0; + txmsg_redir = 1; + txmsg_apply = 1024; + txmsg_cork = 0; + test_send_large(opt, cgrp); +} - /* Test pop with start > data */ - txmsg_start_pop = 1601; - txmsg_pop = 1; - err = test_exec(cgrp, &opt); - if (err) - goto out; +static void test_txmsg_cork(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 0; + txmsg_cork = 1; + test_send(opt, cgrp); - /* Test pop with pop range > data */ - txmsg_start_pop = 1599; - txmsg_pop = 10; - err = test_exec(cgrp, &opt); -out: - txmsg_start = 0; - txmsg_end = 0; - sched_yield(); - return err; + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 1; + txmsg_cork = 1; + test_send(opt, cgrp); } char *map_names[] = { @@ -1662,73 +1615,116 @@ static int populate_progs(char *bpf_file) return 0; } -static int __test_suite(int cg_fd, char *bpf_file) +struct _test test[] = { + {"txmsg test passthrough", test_txmsg_pass}, + {"txmsg test redirect", test_txmsg_redir}, + {"txmsg test drop", test_txmsg_drop}, + {"txmsg test ingress redirect", test_txmsg_ingress_redir}, + {"txmsg test apply", test_txmsg_apply}, + {"txmsg test cork", test_txmsg_cork}, + {"txmsg test hanging corks", test_txmsg_cork_hangs}, + {"txmsg test push_data", test_txmsg_push}, + {"txmsg test pull-data", test_txmsg_pull}, + {"txmsg test pop-data", test_txmsg_pop}, + {"txmsg test push/pop data", test_txmsg_push_pop}, +}; + +static int check_whitelist(struct _test *t, struct sockmap_options *opt) { - int err, cleanup = cg_fd; + char *entry, *ptr; + + if (!opt->whitelist) + return 0; + ptr = strdup(opt->whitelist); + if (!ptr) + return -ENOMEM; + entry = strtok(ptr, ","); + while (entry) { + if ((opt->prepend && strstr(opt->prepend, entry) != 0) || + strstr(opt->map, entry) != 0 || + strstr(t->title, entry) != 0) + return 0; + entry = strtok(NULL, ","); + } + return -EINVAL; +} - err = populate_progs(bpf_file); +static int check_blacklist(struct _test *t, struct sockmap_options *opt) +{ + char *entry, *ptr; + + if (!opt->blacklist) + return -EINVAL; + ptr = strdup(opt->blacklist); + if (!ptr) + return -ENOMEM; + entry = strtok(ptr, ","); + while (entry) { + if ((opt->prepend && strstr(opt->prepend, entry) != 0) || + strstr(opt->map, entry) != 0 || + strstr(t->title, entry) != 0) + return 0; + entry = strtok(NULL, ","); + } + return -EINVAL; +} + +static int __test_selftests(int cg_fd, struct sockmap_options *opt) +{ + int i, err; + + err = populate_progs(opt->map); if (err < 0) { fprintf(stderr, "ERROR: (%i) load bpf failed\n", err); return err; } - if (cg_fd < 0) { - if (setup_cgroup_environment()) { - fprintf(stderr, "ERROR: cgroup env failed\n"); - return -EINVAL; - } + /* Tests basic commands and APIs */ + for (i = 0; i < sizeof(test)/sizeof(struct _test); i++) { + struct _test t = test[i]; - cg_fd = create_and_get_cgroup(CG_PATH); - if (cg_fd < 0) { - fprintf(stderr, - "ERROR: (%i) open cg path 
failed: %s\n", - cg_fd, optarg); - return cg_fd; - } + if (check_whitelist(&t, opt) != 0) + continue; + if (check_blacklist(&t, opt) == 0) + continue; - if (join_cgroup(CG_PATH)) { - fprintf(stderr, "ERROR: failed to join cgroup\n"); - return -EINVAL; - } + test_start_subtest(&t, opt); + t.tester(cg_fd, opt); + test_end_subtest(); } - /* Tests basic commands and APIs with range of iov values */ - txmsg_start = txmsg_end = txmsg_start_push = txmsg_end_push = 0; - err = test_txmsg(cg_fd); - if (err) - goto out; + return err; +} - /* Tests interesting combinations of APIs used together */ - err = test_mixed(cg_fd); - if (err) - goto out; +static void test_selftests_sockmap(int cg_fd, struct sockmap_options *opt) +{ + opt->map = BPF_SOCKMAP_FILENAME; + __test_selftests(cg_fd, opt); +} - /* Tests pull_data API using start/end API */ - err = test_start_end(cg_fd); - if (err) - goto out; +static void test_selftests_sockhash(int cg_fd, struct sockmap_options *opt) +{ + opt->map = BPF_SOCKHASH_FILENAME; + __test_selftests(cg_fd, opt); +} -out: - printf("Summary: %i PASSED %i FAILED\n", passed, failed); - if (cleanup < 0) { - cleanup_cgroup_environment(); - close(cg_fd); - } - return err; +static void test_selftests_ktls(int cg_fd, struct sockmap_options *opt) +{ + opt->map = BPF_SOCKHASH_FILENAME; + opt->prepend = "ktls"; + ktls = 1; + __test_selftests(cg_fd, opt); + ktls = 0; } -static int test_suite(int cg_fd) +static int test_selftest(int cg_fd, struct sockmap_options *opt) { - int err; - err = __test_suite(cg_fd, BPF_SOCKMAP_FILENAME); - if (err) - goto out; - err = __test_suite(cg_fd, BPF_SOCKHASH_FILENAME); -out: - if (cg_fd > -1) - close(cg_fd); - return err; + test_selftests_sockmap(cg_fd, opt); + test_selftests_sockhash(cg_fd, opt); + test_selftests_ktls(cg_fd, opt); + test_print_results(); + return 0; } int main(int argc, char **argv) @@ -1737,12 +1733,10 @@ int main(int argc, char **argv) struct sockmap_options options = {0}; int opt, longindex, err, cg_fd = 0; char *bpf_file = BPF_SOCKMAP_FILENAME; - int test = PING_PONG; + int test = SELFTESTS; + bool cg_created = 0; - if (argc < 2) - return test_suite(-1); - - while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:", + while ((opt = getopt_long(argc, argv, ":dhv:c:r:i:l:t:p:q:n:b:", long_options, &longindex)) != -1) { switch (opt) { case 's': @@ -1783,6 +1777,8 @@ int main(int argc, char **argv) break; case 'v': options.verbose = 1; + if (optarg) + options.verbose = atoi(optarg); break; case 'i': iov_count = atoi(optarg); @@ -1809,6 +1805,15 @@ int main(int argc, char **argv) return -1; } break; + case 'n': + options.whitelist = strdup(optarg); + if (!options.whitelist) + return -ENOMEM; + break; + case 'b': + options.blacklist = strdup(optarg); + if (!options.blacklist) + return -ENOMEM; case 0: break; case 'h': @@ -1818,13 +1823,30 @@ int main(int argc, char **argv) } } - if (argc <= 3 && cg_fd) - return test_suite(cg_fd); - if (!cg_fd) { - fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n", - argv[0]); - return -1; + if (setup_cgroup_environment()) { + fprintf(stderr, "ERROR: cgroup env failed\n"); + return -EINVAL; + } + + cg_fd = create_and_get_cgroup(CG_PATH); + if (cg_fd < 0) { + fprintf(stderr, + "ERROR: (%i) open cg path failed: %s\n", + cg_fd, strerror(errno)); + return cg_fd; + } + + if (join_cgroup(CG_PATH)) { + fprintf(stderr, "ERROR: failed to join cgroup\n"); + return -EINVAL; + } + cg_created = 1; + } + + if (test == SELFTESTS) { + err = test_selftest(cg_fd, &options); + goto out; } err = 
populate_progs(bpf_file); @@ -1843,6 +1865,13 @@ int main(int argc, char **argv) options.rate = rate; err = run_options(&options, cg_fd, test); +out: + if (options.whitelist) + free(options.whitelist); + if (options.blacklist) + free(options.blacklist); + if (cg_created) + cleanup_cgroup_environment(); close(cg_fd); return err; } diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index 604b46151736..056e0273bf12 100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c @@ -821,3 +821,36 @@ .result = REJECT, .errstr = "invalid mem access", }, +{ + "reference tracking: branch tracking valid pointer null comparison", + .insns = { + BPF_SK_LOOKUP(sk_lookup_tcp), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, +}, +{ + "reference tracking: branch tracking valid pointer value comparison", + .insns = { + BPF_SK_LOOKUP(sk_lookup_tcp), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 4), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 1234, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .errstr = "Unreleased reference", + .result = REJECT, +}, diff --git a/tools/testing/selftests/bpf/verifier/value_or_null.c b/tools/testing/selftests/bpf/verifier/value_or_null.c index 860d4a71cd83..3ecb70a3d939 100644 --- a/tools/testing/selftests/bpf/verifier/value_or_null.c +++ b/tools/testing/selftests/bpf/verifier/value_or_null.c @@ -150,3 +150,22 @@ .result_unpriv = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, +{ + "map lookup and null branch prediction", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2), + BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_10, 10), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 4 }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 50d822face36..51f8e9afe6ae 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -19,8 +19,8 @@ ret=0 ksft_skip=4 # all tests in this script. 
Can be overridden with -t option -IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode" -IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode" +IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode ipv4_fdb_grp_fcnal" +IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode ipv6_fdb_grp_fcnal" ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" TESTS="${ALL_TESTS}" @@ -146,6 +146,7 @@ setup() create_ns remote IP="ip -netns me" + BRIDGE="bridge -netns me" set -e $IP li add veth1 type veth peer name veth2 $IP li set veth1 up @@ -280,6 +281,161 @@ stop_ip_monitor() return $rc } +check_nexthop_fdb_support() +{ + $IP nexthop help 2>&1 | grep -q fdb + if [ $? -ne 0 ]; then + echo "SKIP: iproute2 too old, missing fdb nexthop support" + return $ksft_skip + fi +} + +ipv6_fdb_grp_fcnal() +{ + local rc + + echo + echo "IPv6 fdb groups functional" + echo "--------------------------" + + check_nexthop_fdb_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + # create group with multiple nexthops + run_cmd "$IP nexthop add id 61 via 2001:db8:91::2 fdb" + run_cmd "$IP nexthop add id 62 via 2001:db8:91::3 fdb" + run_cmd "$IP nexthop add id 102 group 61/62 fdb" + check_nexthop "id 102" "id 102 group 61/62 fdb" + log_test $? 0 "Fdb Nexthop group with multiple nexthops" + + ## get nexthop group + run_cmd "$IP nexthop get id 102" + check_nexthop "id 102" "id 102 group 61/62 fdb" + log_test $? 0 "Get Fdb nexthop group by id" + + # fdb nexthop group can only contain fdb nexthops + run_cmd "$IP nexthop add id 63 via 2001:db8:91::4" + run_cmd "$IP nexthop add id 64 via 2001:db8:91::5" + run_cmd "$IP nexthop add id 103 group 63/64 fdb" + log_test $? 2 "Fdb Nexthop group with non-fdb nexthops" + + # Non fdb nexthop group can not contain fdb nexthops + run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 fdb" + run_cmd "$IP nexthop add id 66 via 2001:db8:91::6 fdb" + run_cmd "$IP nexthop add id 104 group 65/66" + log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops" + + # fdb nexthop cannot have blackhole + run_cmd "$IP nexthop add id 67 blackhole fdb" + log_test $? 2 "Fdb Nexthop with blackhole" + + # fdb nexthop with oif + run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 dev veth1 fdb" + log_test $? 2 "Fdb Nexthop with oif" + + # fdb nexthop with onlink + run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 onlink fdb" + log_test $? 2 "Fdb Nexthop with onlink" + + # fdb nexthop with encap + run_cmd "$IP nexthop add id 69 encap mpls 101 via 2001:db8:91::8 dev veth1 fdb" + log_test $? 2 "Fdb Nexthop with encap" + + run_cmd "$IP link add name vx10 type vxlan id 1010 local 2001:db8:91::9 remote 2001:db8:91::10 dstport 4789 nolearning noudpcsum tos inherit ttl 100" + run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" + log_test $? 0 "Fdb mac add with nexthop group" + + ## fdb nexthops can only reference nexthop groups and not nexthops + run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 61 self" + log_test $? 255 "Fdb mac add with nexthop" + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 66" + log_test $? 2 "Route add with fdb nexthop" + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 103" + log_test $? 2 "Route add with fdb nexthop group" + + run_cmd "$IP nexthop del id 102" + log_test $? 
0 "Fdb nexthop delete" + + $IP link del dev vx10 +} + +ipv4_fdb_grp_fcnal() +{ + local rc + + echo + echo "IPv4 fdb groups functional" + echo "--------------------------" + + check_nexthop_fdb_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + # create group with multiple nexthops + run_cmd "$IP nexthop add id 12 via 172.16.1.2 fdb" + run_cmd "$IP nexthop add id 13 via 172.16.1.3 fdb" + run_cmd "$IP nexthop add id 102 group 12/13 fdb" + check_nexthop "id 102" "id 102 group 12/13 fdb" + log_test $? 0 "Fdb Nexthop group with multiple nexthops" + + # get nexthop group + run_cmd "$IP nexthop get id 102" + check_nexthop "id 102" "id 102 group 12/13 fdb" + log_test $? 0 "Get Fdb nexthop group by id" + + # fdb nexthop group can only contain fdb nexthops + run_cmd "$IP nexthop add id 14 via 172.16.1.2" + run_cmd "$IP nexthop add id 15 via 172.16.1.3" + run_cmd "$IP nexthop add id 103 group 14/15 fdb" + log_test $? 2 "Fdb Nexthop group with non-fdb nexthops" + + # Non fdb nexthop group can not contain fdb nexthops + run_cmd "$IP nexthop add id 16 via 172.16.1.2 fdb" + run_cmd "$IP nexthop add id 17 via 172.16.1.3 fdb" + run_cmd "$IP nexthop add id 104 group 14/15" + log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops" + + # fdb nexthop cannot have blackhole + run_cmd "$IP nexthop add id 18 blackhole fdb" + log_test $? 2 "Fdb Nexthop with blackhole" + + # fdb nexthop with oif + run_cmd "$IP nexthop add id 16 via 172.16.1.2 dev veth1 fdb" + log_test $? 2 "Fdb Nexthop with oif" + + # fdb nexthop with onlink + run_cmd "$IP nexthop add id 16 via 172.16.1.2 onlink fdb" + log_test $? 2 "Fdb Nexthop with onlink" + + # fdb nexthop with encap + run_cmd "$IP nexthop add id 17 encap mpls 101 via 172.16.1.2 dev veth1 fdb" + log_test $? 2 "Fdb Nexthop with encap" + + run_cmd "$IP link add name vx10 type vxlan id 1010 local 10.0.0.1 remote 10.0.0.2 dstport 4789 nolearning noudpcsum tos inherit ttl 100" + run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" + log_test $? 0 "Fdb mac add with nexthop group" + + # fdb nexthops can only reference nexthop groups and not nexthops + run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 12 self" + log_test $? 255 "Fdb mac add with nexthop" + + run_cmd "$IP ro add 172.16.0.0/22 nhid 15" + log_test $? 2 "Route add with fdb nexthop" + + run_cmd "$IP ro add 172.16.0.0/22 nhid 103" + log_test $? 2 "Route add with fdb nexthop group" + + run_cmd "$IP nexthop del id 102" + log_test $? 0 "Fdb nexthop delete" + + $IP link del dev vx10 +} + ################################################################################ # basic operations (add, delete, replace) on nexthops and nexthop groups # |