From b3c9e65eb227269ed72a115ba22f4f51b4e62b4d Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 4 Sep 2024 13:37:25 +0000
Subject: [PATCH 01/36] net: hsr: remove seqnr_lock

syzbot found a new splat [1].

Instead of adding yet another spin_lock_bh(&hsr->seqnr_lock) /
spin_unlock_bh(&hsr->seqnr_lock) pair, remove seqnr_lock and use
atomic_t for hsr->sequence_nr and hsr->sup_sequence_nr.

This also avoids a race in hsr_fill_info().

Also remove interlink_sequence_nr, which is unused.

[1]
WARNING: CPU: 1 PID: 9723 at net/hsr/hsr_forward.c:602 handle_std_frame+0x247/0x2c0 net/hsr/hsr_forward.c:602
Modules linked in:
CPU: 1 UID: 0 PID: 9723 Comm: syz.0.1657 Not tainted 6.11.0-rc6-syzkaller-00026-g88fac17500f4 #0
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014
RIP: 0010:handle_std_frame+0x247/0x2c0 net/hsr/hsr_forward.c:602
Code: 49 8d bd b0 01 00 00 be ff ff ff ff e8 e2 58 25 00 31 ff 89 c5 89 c6 e8 47 53 a8 f6 85 ed 0f 85 5a ff ff ff e8 fa 50 a8 f6 90 <0f> 0b 90 e9 4c ff ff ff e8 cc e7 06 f7 e9 8f fe ff ff e8 52 e8 06
RSP: 0018:ffffc90000598598 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffffc90000598670 RCX: ffffffff8ae2c919
RDX: ffff888024e94880 RSI: ffffffff8ae2c926 RDI: 0000000000000005
RBP: 0000000000000000 R08: 0000000000000005 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000003
R13: ffff8880627a8cc0 R14: 0000000000000000 R15: ffff888012b03c3a
FS: 0000000000000000(0000) GS:ffff88802b700000(0063) knlGS:00000000f5696b40
CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033
CR2: 0000000020010000 CR3: 00000000768b4000 CR4: 0000000000350ef0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 hsr_fill_frame_info+0x2c8/0x360 net/hsr/hsr_forward.c:630
 fill_frame_info net/hsr/hsr_forward.c:700 [inline]
 hsr_forward_skb+0x7df/0x25c0 net/hsr/hsr_forward.c:715
 hsr_handle_frame+0x603/0x850 net/hsr/hsr_slave.c:70
 __netif_receive_skb_core.constprop.0+0xa3d/0x4330 net/core/dev.c:5555
 __netif_receive_skb_list_core+0x357/0x950 net/core/dev.c:5737
 __netif_receive_skb_list net/core/dev.c:5804 [inline]
 netif_receive_skb_list_internal+0x753/0xda0 net/core/dev.c:5896
 gro_normal_list include/net/gro.h:515 [inline]
 gro_normal_list include/net/gro.h:511 [inline]
 napi_complete_done+0x23f/0x9a0 net/core/dev.c:6247
 gro_cell_poll+0x162/0x210 net/core/gro_cells.c:66
 __napi_poll.constprop.0+0xb7/0x550 net/core/dev.c:6772
 napi_poll net/core/dev.c:6841 [inline]
 net_rx_action+0xa92/0x1010 net/core/dev.c:6963
 handle_softirqs+0x216/0x8f0 kernel/softirq.c:554
 do_softirq kernel/softirq.c:455 [inline]
 do_softirq+0xb2/0xf0 kernel/softirq.c:442

Fixes: 06afd2c31d33 ("hsr: Synchronize sending frames to have always incremented outgoing seq nr.")
Fixes: f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)")
Reported-by: syzbot
Signed-off-by: Eric Dumazet
Cc: Sebastian Andrzej Siewior
Reviewed-by: Simon Horman
Signed-off-by: David S.
Miller --- net/hsr/hsr_device.c | 35 ++++++++++------------------------- net/hsr/hsr_forward.c | 4 +--- net/hsr/hsr_main.h | 6 ++---- net/hsr/hsr_netlink.c | 2 +- 4 files changed, 14 insertions(+), 33 deletions(-) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index e4cc6b78dcfc..ac56784c327c 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -231,9 +231,7 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb->dev = master->dev; skb_reset_mac_header(skb); skb_reset_mac_len(skb); - spin_lock_bh(&hsr->seqnr_lock); hsr_forward_skb(skb, master); - spin_unlock_bh(&hsr->seqnr_lock); } else { dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); @@ -314,14 +312,10 @@ static void send_hsr_supervision_frame(struct hsr_port *port, set_hsr_stag_HSR_ver(hsr_stag, hsr->prot_version); /* From HSRv1 on we have separate supervision sequence numbers. */ - spin_lock_bh(&hsr->seqnr_lock); - if (hsr->prot_version > 0) { - hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); - hsr->sup_sequence_nr++; - } else { - hsr_stag->sequence_nr = htons(hsr->sequence_nr); - hsr->sequence_nr++; - } + if (hsr->prot_version > 0) + hsr_stag->sequence_nr = htons(atomic_inc_return(&hsr->sup_sequence_nr)); + else + hsr_stag->sequence_nr = htons(atomic_inc_return(&hsr->sequence_nr)); hsr_stag->tlv.HSR_TLV_type = type; /* TODO: Why 12 in HSRv0? */ @@ -343,13 +337,11 @@ static void send_hsr_supervision_frame(struct hsr_port *port, ether_addr_copy(hsr_sp->macaddress_A, hsr->macaddress_redbox); } - if (skb_put_padto(skb, ETH_ZLEN)) { - spin_unlock_bh(&hsr->seqnr_lock); + if (skb_put_padto(skb, ETH_ZLEN)) return; - } hsr_forward_skb(skb, port); - spin_unlock_bh(&hsr->seqnr_lock); + return; } @@ -374,9 +366,7 @@ static void send_prp_supervision_frame(struct hsr_port *master, set_hsr_stag_HSR_ver(hsr_stag, (hsr->prot_version ? 1 : 0)); /* From HSRv1 on we have separate supervision sequence numbers. 
*/ - spin_lock_bh(&hsr->seqnr_lock); - hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); - hsr->sup_sequence_nr++; + hsr_stag->sequence_nr = htons(atomic_inc_return(&hsr->sup_sequence_nr)); hsr_stag->tlv.HSR_TLV_type = PRP_TLV_LIFE_CHECK_DD; hsr_stag->tlv.HSR_TLV_length = sizeof(struct hsr_sup_payload); @@ -384,13 +374,10 @@ static void send_prp_supervision_frame(struct hsr_port *master, hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr); - if (skb_put_padto(skb, ETH_ZLEN)) { - spin_unlock_bh(&hsr->seqnr_lock); + if (skb_put_padto(skb, ETH_ZLEN)) return; - } hsr_forward_skb(skb, master); - spin_unlock_bh(&hsr->seqnr_lock); } /* Announce (supervision frame) timer function @@ -621,11 +608,9 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], if (res < 0) return res; - spin_lock_init(&hsr->seqnr_lock); /* Overflow soon to find bugs easier: */ - hsr->sequence_nr = HSR_SEQNR_START; - hsr->sup_sequence_nr = HSR_SUP_SEQNR_START; - hsr->interlink_sequence_nr = HSR_SEQNR_START; + atomic_set(&hsr->sequence_nr, HSR_SEQNR_START); + atomic_set(&hsr->sup_sequence_nr, HSR_SUP_SEQNR_START); timer_setup(&hsr->announce_timer, hsr_announce, 0); timer_setup(&hsr->prune_timer, hsr_prune_nodes, 0); diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index b38060246e62..6f63c8a775c4 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -599,9 +599,7 @@ static void handle_std_frame(struct sk_buff *skb, if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) { /* Sequence nr for the master/interlink node */ - lockdep_assert_held(&hsr->seqnr_lock); - frame->sequence_nr = hsr->sequence_nr; - hsr->sequence_nr++; + frame->sequence_nr = atomic_inc_return(&hsr->sequence_nr); } } diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index ab1f8d35d9dc..6f7bbf01f3e4 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -202,11 +202,9 @@ struct hsr_priv { struct timer_list prune_timer; struct timer_list prune_proxy_timer; int announce_count; - u16 sequence_nr; - u16 interlink_sequence_nr; /* Interlink port seq_nr */ - u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ + atomic_t sequence_nr; + atomic_t sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ enum hsr_version prot_version; /* Indicate if HSRv0, HSRv1 or PRPv1 */ - spinlock_t seqnr_lock; /* locking for sequence_nr */ spinlock_t list_lock; /* locking for node list */ struct hsr_proto_ops *proto_ops; #define PRP_LAN_ID 0x5 /* 0x1010 for A and 0x1011 for B. 
Bit 0 is set
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index f6ff0b61e08a..8aea4ff5f49e 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -163,7 +163,7 @@ static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev)
 
 	if (nla_put(skb, IFLA_HSR_SUPERVISION_ADDR, ETH_ALEN,
 		    hsr->sup_multicast_addr) ||
-	    nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr->sequence_nr))
+	    nla_put_u16(skb, IFLA_HSR_SEQ_NR, atomic_read(&hsr->sequence_nr)))
 		goto nla_put_failure;
 	if (hsr->prot_version == PRP_V1)
 		proto = HSR_PROTOCOL_PRP;

From 9debb703e14939dfafa5d403f27c4feb2e9f6501 Mon Sep 17 00:00:00 2001
From: Martyna Szapar-Mudlaw
Date: Wed, 26 Jun 2024 11:43:42 +0200
Subject: [PATCH 02/36] ice: Fix lldp packets dropping after changing the number of channels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the VSI setup refactor in commit 6624e780a577 ("ice: split
ice_vsi_setup into smaller functions"), the ice_cfg_sw_lldp function,
which removes the Rx rule directing LLDP packets to the VSI, was moved
from ice_vsi_release to ice_vsi_decfg. ice_vsi_decfg is used in more
cases than just VSI release, resulting in unnecessary removal of the
switch rule that handles Rx LLDP packets. This leads to LLDP packets
being dropped after changing the number of channels via ethtool.

This patch moves the ice_cfg_sw_lldp call that removes the Rx LLDP
switch rule back to ice_vsi_release.

Fixes: 6624e780a577 ("ice: split ice_vsi_setup into smaller functions")
Reported-by: Matěj Grégr
Closes: https://lore.kernel.org/intel-wired-lan/1be45a76-90af-4813-824f-8398b69745a9@netx.as/T/#u
Reviewed-by: Przemek Kitszel
Signed-off-by: Martyna Szapar-Mudlaw
Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen
---
 drivers/net/ethernet/intel/ice/ice_lib.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index 737c00b02dd0..2405e5ed9128 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -2413,13 +2413,6 @@ void ice_vsi_decfg(struct ice_vsi *vsi)
 	struct ice_pf *pf = vsi->back;
 	int err;
 
-	/* The Rx rule will only exist to remove if the LLDP FW
-	 * engine is currently stopped
-	 */
-	if (!ice_is_safe_mode(pf) && vsi->type == ICE_VSI_PF &&
-	    !test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags))
-		ice_cfg_sw_lldp(vsi, false, false);
-
 	ice_rm_vsi_lan_cfg(vsi->port_info, vsi->idx);
 	err = ice_rm_vsi_rdma_cfg(vsi->port_info, vsi->idx);
 	if (err)
@@ -2764,6 +2757,14 @@ int ice_vsi_release(struct ice_vsi *vsi)
 		ice_rss_clean(vsi);
 
 	ice_vsi_close(vsi);
+
+	/* The Rx rule will only exist to remove if the LLDP FW
+	 * engine is currently stopped
+	 */
+	if (!ice_is_safe_mode(pf) && vsi->type == ICE_VSI_PF &&
+	    !test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags))
+		ice_cfg_sw_lldp(vsi, false, false);
+
 	ice_vsi_decfg(vsi);
 
 	/* retain SW VSI data structure since it is needed to unregister and

From e843cf7b34fe2e0c1afc55e1f3057375c9b77a14 Mon Sep 17 00:00:00 2001
From: Jacob Keller
Date: Wed, 31 Jul 2024 09:55:55 -0700
Subject: [PATCH 03/36] ice: fix accounting for filters shared by multiple VSIs

When adding a switch filter (such as a MAC or VLAN filter), it is
expected that the driver will detect the case where the filter already
exists, and return -EEXIST. This is used by calling code such as
ice_vc_add_mac_addr and ice_vsi_add_vlan to avoid incrementing the
accounting fields such as vsi->num_vlan or vf->num_mac.
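For illustration, a minimal, self-contained sketch of this accounting
pattern (hypothetical helper and type names, not the actual ice driver
code):

#include <errno.h>

struct example_vf {
	int num_mac;
};

/* Stand-in for the real switch-programming helper: returns 0 when a new
 * filter is installed and -EEXIST when an identical filter already
 * exists (for instance because another VSI shares it).
 */
static int example_switch_add_filter(struct example_vf *vf,
				     const unsigned char *mac)
{
	(void)vf;
	(void)mac;
	return 0;	/* placeholder; the real helper programs the switch */
}

/* Only bump the accounting counter when a filter was genuinely added. */
static int example_add_mac(struct example_vf *vf, const unsigned char *mac)
{
	int err = example_switch_add_filter(vf, mac);

	if (err == -EEXIST)
		return 0;	/* filter is shared; do not count it twice */
	if (err)
		return err;

	vf->num_mac++;
	return 0;
}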
This logic works correctly for the case where only a single VSI has added a given switch filter. When a second VSI adds the same switch filter, the driver converts the existing filter from an ICE_FWD_TO_VSI filter into an ICE_FWD_TO_VSI_LIST filter. This saves switch resources, by ensuring that multiple VSIs can re-use the same filter. The ice_add_update_vsi_list() function is responsible for doing this conversion. When first converting a filter from the FWD_TO_VSI into FWD_TO_VSI_LIST, it checks if the VSI being added is the same as the existing rule's VSI. In such a case it returns -EEXIST. However, when the switch rule has already been converted to a FWD_TO_VSI_LIST, the logic is different. Adding a new VSI in this case just requires extending the VSI list entry. The logic for checking if the rule already exists in this case returns 0 instead of -EEXIST. This breaks the accounting logic mentioned above, so the counters for how many MAC and VLAN filters exist for a given VF or VSI no longer accurately reflect the actual count. This breaks other code which relies on these counts. In typical usage this primarily affects such filters generally shared by multiple VSIs such as VLAN 0, or broadcast and multicast MAC addresses. Fix this by correctly reporting -EEXIST in the case of adding the same VSI to a switch rule already converted to ICE_FWD_TO_VSI_LIST. Fixes: 9daf8208dd4d ("ice: Add support for switch filter programming") Signed-off-by: Jacob Keller Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_switch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index fe8847184cb1..0160f0bae8d6 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -3194,7 +3194,7 @@ ice_add_update_vsi_list(struct ice_hw *hw, /* A rule already exists with the new VSI being added */ if (test_bit(vsi_handle, m_entry->vsi_list_info->vsi_map)) - return 0; + return -EEXIST; /* Update the previously created VSI list set with * the new VSI ID passed in From e6501fc38a7590fc014d7bb5c406974d32c0530f Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Fri, 30 Aug 2024 15:44:11 +0200 Subject: [PATCH 04/36] ice: stop calling pci_disable_device() as we use pcim Our driver uses devres to manage resources, in particular we call pcim_enable_device(), what also means we express the intent to get automatic pci_disable_device() call at driver removal. Manual calls to pci_disable_device() misuse the API. Recent commit (see "Fixes" tag) has changed the removal action from conditional (silent ignore of double call to pci_disable_device()) to unconditional, but able to catch unwanted redundant calls; see cited "Fixes" commit for details. Since that, unloading the driver yields following warn+splat: [70633.628490] ice 0000:af:00.7: disabling already-disabled device [70633.628512] WARNING: CPU: 52 PID: 33890 at drivers/pci/pci.c:2250 pci_disable_device+0xf4/0x100 ... [70633.628744] ? 
pci_disable_device+0xf4/0x100
[70633.628752] release_nodes+0x4a/0x70
[70633.628759] devres_release_all+0x8b/0xc0
[70633.628768] device_unbind_cleanup+0xe/0x70
[70633.628774] device_release_driver_internal+0x208/0x250
[70633.628781] driver_detach+0x47/0x90
[70633.628786] bus_remove_driver+0x80/0x100
[70633.628791] pci_unregister_driver+0x2a/0xb0
[70633.628799] ice_module_exit+0x11/0x3a [ice]

Note that this is the only Intel ethernet driver that needs such a fix.

Fixes: f748a07a0b64 ("PCI: Remove legacy pcim_release()")
Reviewed-by: Larysa Zaremba
Reviewed-by: Philipp Stanner
Signed-off-by: Przemek Kitszel
Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen
---
 drivers/net/ethernet/intel/ice/ice_main.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index c7db88b517da..ea780d468579 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -5363,7 +5363,6 @@ err_load:
 	ice_deinit(pf);
 err_init:
 	ice_adapter_put(pdev);
-	pci_disable_device(pdev);
 	return err;
 }
 
@@ -5470,7 +5469,6 @@ static void ice_remove(struct pci_dev *pdev)
 	ice_set_wake(pf);
 	ice_adapter_put(pdev);
-	pci_disable_device(pdev);
 }
 
 /**

From d2940002b0aa42898de815a1453b29d440292386 Mon Sep 17 00:00:00 2001
From: Michal Schmidt
Date: Wed, 4 Sep 2024 11:39:22 +0200
Subject: [PATCH 05/36] ice: fix VSI lists confusion when adding VLANs

The description of function ice_find_vsi_list_entry says:
  Search VSI list map with VSI count 1

However, since the blamed commit (see Fixes below), the function no
longer checks vsi_count. This causes a problem in ice_add_vlan_internal,
where the decision to share VSI lists between filter rules relies on the
vsi_count of the found existing VSI list being 1.

The reproducing steps:
1. Have a PF and two VFs. There will be a filter rule for VLAN 0,
   referring to a VSI list containing VSIs: 0 (PF), 2 (VF#0), 3 (VF#1).
2. Add VLAN 1234 to VF#0. ice will make the wrong decision to share the
   VSI list with the new rule. The wrong behavior may not be immediately
   apparent, but it can be observed with debug prints.
3. Add VLAN 1234 to VF#1. ice will unshare the VSI list for the VLAN
   1234 rule. Due to the earlier bad decision, the newly created VSI
   list will contain VSIs 0 (PF) and 3 (VF#1), instead of the expected
   2 (VF#0) and 3 (VF#1).
4. Try pinging a network peer over the VLAN interface on VF#0. This
   fails.

Reproducer script at:
https://gitlab.com/mschmidt2/repro/-/blob/master/RHEL-46814/test-vlan-vsi-list-confusion.sh

Commented debug trace:
https://gitlab.com/mschmidt2/repro/-/blob/master/RHEL-46814/ice-vlan-vsi-lists-debug.txt

Patch adding the debug prints:
https://gitlab.com/mschmidt2/linux/-/commit/f8a8814623944a45091a77c6094c40bfe726bfdb
(Unsafe, by the way. Lacks rule_lock when dumping in ice_remove_vlan.)

Michal Swiatkowski added to the explanation that the bug is caused by
reusing a VSI list created for VLAN 0. All created VFs' VSIs are added
to the VLAN 0 filter. When a non-zero VLAN is created on a VF which is
already in VLAN 0 (the normal case), the VSI list from VLAN 0 is reused.
This leads to a problem because all VFs (VSIs, to be specific) that are
subscribed to VLAN 0 will now receive the new VLAN-tagged traffic. This
is one bug; another is the bug described above. Removing filters from
one VF will remove the VLAN filter from the previous VF. This happens
when a VF is reset.
Example: - creation of 3 VFs - we have VSI list (used for VLAN 0) [0 (pf), 2 (vf1), 3 (vf2), 4 (vf3)] - we are adding VLAN 100 on VF1, we are reusing the previous list because 2 is there - VLAN traffic works fine, but VLAN 100 tagged traffic can be received on all VSIs from the list (for example broadcast or unicast) - trust is turning on VF2, VF2 is resetting, all filters from VF2 are removed; the VLAN 100 filter is also removed because 3 is on the list - VLAN traffic to VF1 isn't working anymore, there is a need to recreate VLAN interface to readd VLAN filter One thing I'm not certain about is the implications for the LAG feature, which is another caller of ice_find_vsi_list_entry. I don't have a LAG-capable card at hand to test. Fixes: 23ccae5ce15f ("ice: changes to the interface with the HW and FW for SRIOV_VF+LAG") Reviewed-by: Michal Swiatkowski Signed-off-by: Michal Schmidt Reviewed-by: Dave Ertman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_switch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index 0160f0bae8d6..79d91e95358c 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -3264,7 +3264,7 @@ ice_find_vsi_list_entry(struct ice_hw *hw, u8 recp_id, u16 vsi_handle, list_head = &sw->recp_list[recp_id].filt_rules; list_for_each_entry(list_itr, list_head, list_entry) { - if (list_itr->vsi_list_info) { + if (list_itr->vsi_count == 1 && list_itr->vsi_list_info) { map_info = list_itr->vsi_list_info; if (test_bit(vsi_handle, map_info->vsi_map)) { *vsi_list_id = map_info->vsi_list_id; From 27717f8b17c098c4373ddb8fe89e1a1899c7779d Mon Sep 17 00:00:00 2001 From: Sriram Yagnaraman Date: Thu, 22 Aug 2024 09:42:07 +0200 Subject: [PATCH 06/36] igb: Always call igb_xdp_ring_update_tail() under Tx lock Always call igb_xdp_ring_update_tail() under __netif_tx_lock, add a comment and lockdep assert to indicate that. This is needed to share the same TX ring between XDP, XSK and slow paths. Furthermore, the current XDP implementation is racy on tail updates. Fixes: 9cbc948b5a20 ("igb: add XDP support") Signed-off-by: Sriram Yagnaraman [Kurt: Add lockdep assert and fixes tag] Signed-off-by: Kurt Kanzenbach Acked-by: Maciej Fijalkowski Tested-by: George Kuruvinakunnel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_main.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 9dc7c60838ed..1ef4cb871452 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -33,6 +33,7 @@ #include #include #include +#include #ifdef CONFIG_IGB_DCA #include #endif @@ -2914,8 +2915,11 @@ static int igb_xdp(struct net_device *dev, struct netdev_bpf *xdp) } } +/* This function assumes __netif_tx_lock is held by the caller. */ static void igb_xdp_ring_update_tail(struct igb_ring *ring) { + lockdep_assert_held(&txring_txq(ring)->_xmit_lock); + /* Force memory writes to complete before letting h/w know there * are new descriptors to fetch. 
*/ @@ -3000,11 +3004,11 @@ static int igb_xdp_xmit(struct net_device *dev, int n, nxmit++; } - __netif_tx_unlock(nq); - if (unlikely(flags & XDP_XMIT_FLUSH)) igb_xdp_ring_update_tail(tx_ring); + __netif_tx_unlock(nq); + return nxmit; } @@ -8864,12 +8868,14 @@ static void igb_put_rx_buffer(struct igb_ring *rx_ring, static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) { + unsigned int total_bytes = 0, total_packets = 0; struct igb_adapter *adapter = q_vector->adapter; struct igb_ring *rx_ring = q_vector->rx.ring; - struct sk_buff *skb = rx_ring->skb; - unsigned int total_bytes = 0, total_packets = 0; u16 cleaned_count = igb_desc_unused(rx_ring); + struct sk_buff *skb = rx_ring->skb; + int cpu = smp_processor_id(); unsigned int xdp_xmit = 0; + struct netdev_queue *nq; struct xdp_buff xdp; u32 frame_sz = 0; int rx_buf_pgcnt; @@ -8997,7 +9003,10 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) if (xdp_xmit & IGB_XDP_TX) { struct igb_ring *tx_ring = igb_xdp_tx_queue_mapping(adapter); + nq = txring_txq(tx_ring); + __netif_tx_lock(nq, cpu); igb_xdp_ring_update_tail(tx_ring); + __netif_tx_unlock(nq); } u64_stats_update_begin(&rx_ring->rx_syncp); From 7472d157cb8014103105433bcc0705af2e6f7184 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Thu, 15 Aug 2024 11:02:34 +0300 Subject: [PATCH 07/36] net/mlx5: Update the list of the PCI supported devices Add the upcoming ConnectX-9 device ID to the table of supported PCI device IDs. Fixes: f908a35b2218 ("net/mlx5: Update the list of the PCI supported devices") Signed-off-by: Maher Sanalla Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 5b7e6f4b5c7e..2ec33c4a2a3a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -2217,6 +2217,7 @@ static const struct pci_device_id mlx5_core_pci_table[] = { { PCI_VDEVICE(MELLANOX, 0x101f) }, /* ConnectX-6 LX */ { PCI_VDEVICE(MELLANOX, 0x1021) }, /* ConnectX-7 */ { PCI_VDEVICE(MELLANOX, 0x1023) }, /* ConnectX-8 */ + { PCI_VDEVICE(MELLANOX, 0x1025) }, /* ConnectX-9 */ { PCI_VDEVICE(MELLANOX, 0xa2d2) }, /* BlueField integrated ConnectX-5 network controller */ { PCI_VDEVICE(MELLANOX, 0xa2d3), MLX5_PCI_DEV_IS_VF}, /* BlueField integrated ConnectX-5 network controller VF */ { PCI_VDEVICE(MELLANOX, 0xa2d6) }, /* BlueField-2 integrated ConnectX-6 Dx network controller */ From 7617d62cba4a8a3ff3ed3fda0171c43f135c142e Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 11 Aug 2024 13:56:13 +0300 Subject: [PATCH 08/36] net/mlx5e: Add missing link modes to ptys2ethtool_map Add MLX5E_1000BASE_T and MLX5E_100BASE_TX to the legacy modes in ptys2legacy_ethtool_table, since they were missing. 
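Not part of the mlx5 patch itself: a generic, hedged sketch (hypothetical
enum and function names, not mlx5 code) of the idea behind such a table,
where each driver-private link mode is mapped onto the ethtool link-mode
bits it should advertise. A mode missing from the table simply never shows
up in ethtool output, which is the kind of gap this patch closes.

#include <linux/ethtool.h>
#include <linux/linkmode.h>
#include <linux/bits.h>

/* Hypothetical device link-mode IDs; the real driver has its own enum. */
enum example_link_mode {
	EXAMPLE_100BASE_TX,
	EXAMPLE_1000BASE_T,
	EXAMPLE_LINK_MODES_NUM,
};

/* One ethtool link-mode bit per device link mode. */
static const int example_mode_to_ethtool[EXAMPLE_LINK_MODES_NUM] = {
	[EXAMPLE_100BASE_TX] = ETHTOOL_LINK_MODE_100baseT_Full_BIT,
	[EXAMPLE_1000BASE_T] = ETHTOOL_LINK_MODE_1000baseT_Full_BIT,
};

/* Translate a device "supported modes" bitmap into an ethtool mask. */
static void example_modes_to_ethtool(unsigned long dev_modes,
				     unsigned long *ethtool_mask)
{
	int i;

	for (i = 0; i < EXAMPLE_LINK_MODES_NUM; i++)
		if (dev_modes & BIT(i))
			linkmode_set_bit(example_mode_to_ethtool[i],
					 ethtool_mask);
}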
Fixes: 665bc53969d7 ("net/mlx5e: Use new ethtool get/set link ksettings API") Signed-off-by: Shahar Shitrit Reviewed-by: Tariq Toukan Reviewed-by: Carolina Jubran Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 36845872ae94..1e829b97eaac 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -139,6 +139,10 @@ void mlx5e_build_ptys2ethtool_map(void) ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT); MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_LR4, legacy, ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100BASE_TX, legacy, + ETHTOOL_LINK_MODE_100baseT_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_1000BASE_T, legacy, + ETHTOOL_LINK_MODE_1000baseT_Full_BIT); MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_T, legacy, ETHTOOL_LINK_MODE_10000baseT_Full_BIT); MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_CR, legacy, From 80bf474242b21d64a514fd2bb65faa7a17ca8d8d Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 11 Aug 2024 13:58:04 +0300 Subject: [PATCH 09/36] net/mlx5e: Add missing link mode to ptys2ext_ethtool_map Add MLX5E_400GAUI_8_400GBASE_CR8 to the extended modes in ptys2ext_ethtool_table, since it was missing. Fixes: 6a897372417e ("net/mlx5: ethtool, Add ethtool support for 50Gbps per lane link modes") Signed-off-by: Shahar Shitrit Reviewed-by: Tariq Toukan Reviewed-by: Carolina Jubran Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 1e829b97eaac..1cf3c54d343e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -208,6 +208,12 @@ void mlx5e_build_ptys2ethtool_map(void) ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT, ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT, ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_400GAUI_8_400GBASE_CR8, ext, + ETHTOOL_LINK_MODE_400000baseKR8_Full_BIT, + ETHTOOL_LINK_MODE_400000baseSR8_Full_BIT, + ETHTOOL_LINK_MODE_400000baseLR8_ER8_FR8_Full_BIT, + ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT, + ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT); MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GAUI_1_100GBASE_CR_KR, ext, ETHTOOL_LINK_MODE_100000baseKR_Full_BIT, ETHTOOL_LINK_MODE_100000baseSR_Full_BIT, From c88146abe4d0f8cf659b2b8883fdc33936d2e3b8 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Mon, 2 Sep 2024 11:46:14 +0300 Subject: [PATCH 10/36] net/mlx5: Explicitly set scheduling element and TSAR type Ensure the scheduling element type and TSAR type are explicitly initialized in the QoS rate group creation. This prevents potential issues due to default values. 
Fixes: 1ae258f8b343 ("net/mlx5: E-switch, Introduce rate limiting groups API") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 20146a2dc7f4..997c412a81af 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -421,6 +421,7 @@ __esw_qos_create_rate_group(struct mlx5_eswitch *esw, struct netlink_ext_ack *ex { u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; struct mlx5_esw_rate_group *group; + __be32 *attr; u32 divider; int err; @@ -428,6 +429,12 @@ __esw_qos_create_rate_group(struct mlx5_eswitch *esw, struct netlink_ext_ack *ex if (!group) return ERR_PTR(-ENOMEM); + MLX5_SET(scheduling_context, tsar_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR); + + attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes); + *attr = cpu_to_be32(TSAR_ELEMENT_TSAR_TYPE_DWRR << 16); + MLX5_SET(scheduling_context, tsar_ctx, parent_element_id, esw->qos.root_tsar_ix); err = mlx5_create_scheduling_element_cmd(esw->dev, From 452ef7f86036392005940de54228d42ca0044192 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Mon, 5 Aug 2024 10:03:20 +0300 Subject: [PATCH 11/36] net/mlx5: Add missing masks and QoS bit masks for scheduling elements Add the missing masks for supported element types and Transmit Scheduling Arbiter (TSAR) types in scheduling elements. Also, add the corresponding bit masks for these types in the QoS capabilities of a NIC scheduler. Fixes: 214baf22870c ("net/mlx5e: Support HTB offload") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index cab228cf51c6..cfdf984a95a8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1027,7 +1027,8 @@ struct mlx5_ifc_qos_cap_bits { u8 max_tsar_bw_share[0x20]; - u8 reserved_at_100[0x20]; + u8 nic_element_type[0x10]; + u8 nic_tsar_type[0x10]; u8 reserved_at_120[0x3]; u8 log_meter_aso_granularity[0x5]; @@ -3966,6 +3967,7 @@ enum { ELEMENT_TYPE_CAP_MASK_VPORT = 1 << 1, ELEMENT_TYPE_CAP_MASK_VPORT_TC = 1 << 2, ELEMENT_TYPE_CAP_MASK_PARA_VPORT_TC = 1 << 3, + ELEMENT_TYPE_CAP_MASK_QUEUE_GROUP = 1 << 4, }; struct mlx5_ifc_scheduling_context_bits { @@ -4675,6 +4677,12 @@ enum { TSAR_ELEMENT_TSAR_TYPE_ETS = 0x2, }; +enum { + TSAR_TYPE_CAP_MASK_DWRR = 1 << 0, + TSAR_TYPE_CAP_MASK_ROUND_ROBIN = 1 << 1, + TSAR_TYPE_CAP_MASK_ETS = 1 << 2, +}; + struct mlx5_ifc_tsar_element_bits { u8 reserved_at_0[0x8]; u8 tsar_type[0x8]; From 861cd9b9cb62feb244b8d77e68fd6ddedbbf66e9 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Mon, 5 Aug 2024 13:13:03 +0300 Subject: [PATCH 12/36] net/mlx5: Verify support for scheduling element and TSAR type Before creating a scheduling element in a NIC or E-Switch scheduler, ensure that the requested element type is supported. If the element is of type Transmit Scheduling Arbiter (TSAR), also verify that the specific TSAR type is supported. 
Fixes: 214baf22870c ("net/mlx5e: Support HTB offload") Fixes: 85c5f7c9200e ("net/mlx5: E-switch, Create QoS on demand") Fixes: 0fe132eac38c ("net/mlx5: E-switch, Allow to add vports to rate groups") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 44 ++++++++++--------- drivers/net/ethernet/mellanox/mlx5/core/qos.c | 7 +++ 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 997c412a81af..02a3563f51ad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -312,6 +312,25 @@ static int esw_qos_set_group_max_rate(struct mlx5_eswitch *esw, return err; } +static bool esw_qos_element_type_supported(struct mlx5_core_dev *dev, int type) +{ + switch (type) { + case SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR: + return MLX5_CAP_QOS(dev, esw_element_type) & + ELEMENT_TYPE_CAP_MASK_TSAR; + case SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT: + return MLX5_CAP_QOS(dev, esw_element_type) & + ELEMENT_TYPE_CAP_MASK_VPORT; + case SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC: + return MLX5_CAP_QOS(dev, esw_element_type) & + ELEMENT_TYPE_CAP_MASK_VPORT_TC; + case SCHEDULING_CONTEXT_ELEMENT_TYPE_PARA_VPORT_TC: + return MLX5_CAP_QOS(dev, esw_element_type) & + ELEMENT_TYPE_CAP_MASK_PARA_VPORT_TC; + } + return false; +} + static int esw_qos_vport_create_sched_element(struct mlx5_eswitch *esw, struct mlx5_vport *vport, u32 max_rate, u32 bw_share) @@ -323,6 +342,9 @@ static int esw_qos_vport_create_sched_element(struct mlx5_eswitch *esw, void *vport_elem; int err; + if (!esw_qos_element_type_supported(dev, SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT)) + return -EOPNOTSUPP; + parent_tsar_ix = group ? 
group->tsar_ix : esw->qos.root_tsar_ix; MLX5_SET(scheduling_context, sched_ctx, element_type, SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT); @@ -533,25 +555,6 @@ static int esw_qos_destroy_rate_group(struct mlx5_eswitch *esw, return err; } -static bool esw_qos_element_type_supported(struct mlx5_core_dev *dev, int type) -{ - switch (type) { - case SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR: - return MLX5_CAP_QOS(dev, esw_element_type) & - ELEMENT_TYPE_CAP_MASK_TSAR; - case SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT: - return MLX5_CAP_QOS(dev, esw_element_type) & - ELEMENT_TYPE_CAP_MASK_VPORT; - case SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC: - return MLX5_CAP_QOS(dev, esw_element_type) & - ELEMENT_TYPE_CAP_MASK_VPORT_TC; - case SCHEDULING_CONTEXT_ELEMENT_TYPE_PARA_VPORT_TC: - return MLX5_CAP_QOS(dev, esw_element_type) & - ELEMENT_TYPE_CAP_MASK_PARA_VPORT_TC; - } - return false; -} - static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack) { u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; @@ -562,7 +565,8 @@ static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *exta if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, esw_scheduling)) return -EOPNOTSUPP; - if (!esw_qos_element_type_supported(dev, SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR)) + if (!esw_qos_element_type_supported(dev, SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR) || + !(MLX5_CAP_QOS(dev, esw_tsar_type) & TSAR_TYPE_CAP_MASK_DWRR)) return -EOPNOTSUPP; MLX5_SET(scheduling_context, tsar_ctx, element_type, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/qos.c index 8bce730b5c5b..db2bd3ad63ba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qos.c @@ -28,6 +28,9 @@ int mlx5_qos_create_leaf_node(struct mlx5_core_dev *mdev, u32 parent_id, { u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0}; + if (!(MLX5_CAP_QOS(mdev, nic_element_type) & ELEMENT_TYPE_CAP_MASK_QUEUE_GROUP)) + return -EOPNOTSUPP; + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_id); MLX5_SET(scheduling_context, sched_ctx, element_type, SCHEDULING_CONTEXT_ELEMENT_TYPE_QUEUE_GROUP); @@ -44,6 +47,10 @@ int mlx5_qos_create_inner_node(struct mlx5_core_dev *mdev, u32 parent_id, u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0}; void *attr; + if (!(MLX5_CAP_QOS(mdev, nic_element_type) & ELEMENT_TYPE_CAP_MASK_TSAR) || + !(MLX5_CAP_QOS(mdev, nic_tsar_type) & TSAR_TYPE_CAP_MASK_DWRR)) + return -EOPNOTSUPP; + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_id); MLX5_SET(scheduling_context, sched_ctx, element_type, SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR); From b1d305abef4640af1b4f1b4774d513cd81b10cfc Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Fri, 30 Aug 2024 08:39:27 -0400 Subject: [PATCH 13/36] net/mlx5: Fix bridge mode operations when there are no VFs Currently, trying to set the bridge mode attribute when numvfs=0 leads to a crash: bridge link set dev eth2 hwmode vepa [ 168.967392] BUG: kernel NULL pointer dereference, address: 0000000000000030 [...] [ 168.969989] RIP: 0010:mlx5_add_flow_rules+0x1f/0x300 [mlx5_core] [...] 
[ 168.976037] Call Trace: [ 168.976188] [ 168.978620] _mlx5_eswitch_set_vepa_locked+0x113/0x230 [mlx5_core] [ 168.979074] mlx5_eswitch_set_vepa+0x7f/0xa0 [mlx5_core] [ 168.979471] rtnl_bridge_setlink+0xe9/0x1f0 [ 168.979714] rtnetlink_rcv_msg+0x159/0x400 [ 168.980451] netlink_rcv_skb+0x54/0x100 [ 168.980675] netlink_unicast+0x241/0x360 [ 168.980918] netlink_sendmsg+0x1f6/0x430 [ 168.981162] ____sys_sendmsg+0x3bb/0x3f0 [ 168.982155] ___sys_sendmsg+0x88/0xd0 [ 168.985036] __sys_sendmsg+0x59/0xa0 [ 168.985477] do_syscall_64+0x79/0x150 [ 168.987273] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 168.987773] RIP: 0033:0x7f8f7950f917 (esw->fdb_table.legacy.vepa_fdb is null) The bridge mode is only relevant when there are multiple functions per port. Therefore, prevent setting and getting this setting when there are no VFs. Note that after this change, there are no settings to change on the PF interface using `bridge link` when there are no VFs, so the interface no longer appears in the `bridge link` output. Fixes: 4b89251de024 ("net/mlx5: Support ndo bridge_setlink and getlink") Signed-off-by: Benjamin Poirier Reviewed-by: Cosmin Ratiu Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c index 255bc8b749f9..8587cd572da5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c @@ -319,7 +319,7 @@ int mlx5_eswitch_set_vepa(struct mlx5_eswitch *esw, u8 setting) return -EPERM; mutex_lock(&esw->state_lock); - if (esw->mode != MLX5_ESWITCH_LEGACY) { + if (esw->mode != MLX5_ESWITCH_LEGACY || !mlx5_esw_is_fdb_created(esw)) { err = -EOPNOTSUPP; goto out; } @@ -339,7 +339,7 @@ int mlx5_eswitch_get_vepa(struct mlx5_eswitch *esw, u8 *setting) if (!mlx5_esw_allowed(esw)) return -EPERM; - if (esw->mode != MLX5_ESWITCH_LEGACY) + if (esw->mode != MLX5_ESWITCH_LEGACY || !mlx5_esw_is_fdb_created(esw)) return -EOPNOTSUPP; *setting = esw->fdb_table.legacy.vepa_uplink_rule ? 1 : 0; From 4c8002277167125078e6b9b90137bdf443ebaa08 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 6 Sep 2024 15:28:39 +0500 Subject: [PATCH 14/36] fou: fix initialization of grc The grc must be initialize first. There can be a condition where if fou is NULL, goto out will be executed and grc would be used uninitialized. Fixes: 7e4196935069 ("fou: Fix null-ptr-deref in GRO.") Signed-off-by: Muhammad Usama Anjum Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20240906102839.202798-1-usama.anjum@collabora.com Signed-off-by: Jakub Kicinski --- net/ipv4/fou_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c index 78b869b31492..3e30745e2c09 100644 --- a/net/ipv4/fou_core.c +++ b/net/ipv4/fou_core.c @@ -336,11 +336,11 @@ static struct sk_buff *gue_gro_receive(struct sock *sk, struct gro_remcsum grc; u8 proto; + skb_gro_remcsum_init(&grc); + if (!fou) goto out; - skb_gro_remcsum_init(&grc); - off = skb_gro_offset(skb); len = off + sizeof(*guehdr); From 019aba04f08c2102b35ce7fee9d4628d349f56c0 Mon Sep 17 00:00:00 2001 From: Naveen Mamindlapalli Date: Fri, 6 Sep 2024 10:28:38 +0530 Subject: [PATCH 15/36] octeontx2-af: Modify SMQ flush sequence to drop packets The current implementation of SMQ flush sequence waits for the packets in the TM pipeline to be transmitted out of the link. 
This sequence doesn't succeed in HW when there is any issue with link such as lack of link credits, link down or any other traffic that is fully occupying the link bandwidth (QoS). This patch modifies the SMQ flush sequence to drop the packets after TL1 level (SQM) instead of polling for the packets to be sent out of RPM/CGX link. Fixes: 5d9b976d4480 ("octeontx2-af: Support fixed transmit scheduler topology") Signed-off-by: Naveen Mamindlapalli Reviewed-by: Sunil Kovvuri Goutham Link: https://patch.msgid.link/20240906045838.1620308-1-naveenm@marvell.com Signed-off-by: Paolo Abeni --- .../net/ethernet/marvell/octeontx2/af/rvu.h | 3 +- .../ethernet/marvell/octeontx2/af/rvu_nix.c | 59 +++++++++++++++---- 2 files changed, 48 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 03ee93fd9e94..db2db0738ee4 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -319,6 +319,7 @@ struct nix_mark_format { /* smq(flush) to tl1 cir/pir info */ struct nix_smq_tree_ctx { + u16 schq; u64 cir_off; u64 cir_val; u64 pir_off; @@ -328,8 +329,6 @@ struct nix_smq_tree_ctx { /* smq flush context */ struct nix_smq_flush_ctx { int smq; - u16 tl1_schq; - u16 tl2_schq; struct nix_smq_tree_ctx smq_tree_ctx[NIX_TXSCH_LVL_CNT]; }; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 222f9e00b836..82832a24fbd8 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -2259,14 +2259,13 @@ static void nix_smq_flush_fill_ctx(struct rvu *rvu, int blkaddr, int smq, schq = smq; for (lvl = NIX_TXSCH_LVL_SMQ; lvl <= NIX_TXSCH_LVL_TL1; lvl++) { smq_tree_ctx = &smq_flush_ctx->smq_tree_ctx[lvl]; + smq_tree_ctx->schq = schq; if (lvl == NIX_TXSCH_LVL_TL1) { - smq_flush_ctx->tl1_schq = schq; smq_tree_ctx->cir_off = NIX_AF_TL1X_CIR(schq); smq_tree_ctx->pir_off = 0; smq_tree_ctx->pir_val = 0; parent_off = 0; } else if (lvl == NIX_TXSCH_LVL_TL2) { - smq_flush_ctx->tl2_schq = schq; smq_tree_ctx->cir_off = NIX_AF_TL2X_CIR(schq); smq_tree_ctx->pir_off = NIX_AF_TL2X_PIR(schq); parent_off = NIX_AF_TL2X_PARENT(schq); @@ -2301,8 +2300,8 @@ static void nix_smq_flush_enadis_xoff(struct rvu *rvu, int blkaddr, { struct nix_txsch *txsch; struct nix_hw *nix_hw; + int tl2, tl2_schq; u64 regoff; - int tl2; nix_hw = get_nix_hw(rvu->hw, blkaddr); if (!nix_hw) @@ -2310,16 +2309,17 @@ static void nix_smq_flush_enadis_xoff(struct rvu *rvu, int blkaddr, /* loop through all TL2s with matching PF_FUNC */ txsch = &nix_hw->txsch[NIX_TXSCH_LVL_TL2]; + tl2_schq = smq_flush_ctx->smq_tree_ctx[NIX_TXSCH_LVL_TL2].schq; for (tl2 = 0; tl2 < txsch->schq.max; tl2++) { /* skip the smq(flush) TL2 */ - if (tl2 == smq_flush_ctx->tl2_schq) + if (tl2 == tl2_schq) continue; /* skip unused TL2s */ if (TXSCH_MAP_FLAGS(txsch->pfvf_map[tl2]) & NIX_TXSCHQ_FREE) continue; /* skip if PF_FUNC doesn't match */ if ((TXSCH_MAP_FUNC(txsch->pfvf_map[tl2]) & ~RVU_PFVF_FUNC_MASK) != - (TXSCH_MAP_FUNC(txsch->pfvf_map[smq_flush_ctx->tl2_schq] & + (TXSCH_MAP_FUNC(txsch->pfvf_map[tl2_schq] & ~RVU_PFVF_FUNC_MASK))) continue; /* enable/disable XOFF */ @@ -2361,10 +2361,12 @@ static int nix_smq_flush(struct rvu *rvu, int blkaddr, int smq, u16 pcifunc, int nixlf) { struct nix_smq_flush_ctx *smq_flush_ctx; + int err, restore_tx_en = 0, i; int pf = rvu_get_pf(pcifunc); u8 cgx_id = 0, lmac_id = 0; - int err, 
restore_tx_en = 0; - u64 cfg; + u16 tl2_tl3_link_schq; + u8 link, link_level; + u64 cfg, bmap = 0; if (!is_rvu_otx2(rvu)) { /* Skip SMQ flush if pkt count is zero */ @@ -2388,16 +2390,38 @@ static int nix_smq_flush(struct rvu *rvu, int blkaddr, nix_smq_flush_enadis_xoff(rvu, blkaddr, smq_flush_ctx, true); nix_smq_flush_enadis_rate(rvu, blkaddr, smq_flush_ctx, false); - cfg = rvu_read64(rvu, blkaddr, NIX_AF_SMQX_CFG(smq)); - /* Do SMQ flush and set enqueue xoff */ - cfg |= BIT_ULL(50) | BIT_ULL(49); - rvu_write64(rvu, blkaddr, NIX_AF_SMQX_CFG(smq), cfg); - /* Disable backpressure from physical link, * otherwise SMQ flush may stall. */ rvu_cgx_enadis_rx_bp(rvu, pf, false); + link_level = rvu_read64(rvu, blkaddr, NIX_AF_PSE_CHANNEL_LEVEL) & 0x01 ? + NIX_TXSCH_LVL_TL3 : NIX_TXSCH_LVL_TL2; + tl2_tl3_link_schq = smq_flush_ctx->smq_tree_ctx[link_level].schq; + link = smq_flush_ctx->smq_tree_ctx[NIX_TXSCH_LVL_TL1].schq; + + /* SMQ set enqueue xoff */ + cfg = rvu_read64(rvu, blkaddr, NIX_AF_SMQX_CFG(smq)); + cfg |= BIT_ULL(50); + rvu_write64(rvu, blkaddr, NIX_AF_SMQX_CFG(smq), cfg); + + /* Clear all NIX_AF_TL3_TL2_LINK_CFG[ENA] for the TL3/TL2 queue */ + for (i = 0; i < (rvu->hw->cgx_links + rvu->hw->lbk_links); i++) { + cfg = rvu_read64(rvu, blkaddr, + NIX_AF_TL3_TL2X_LINKX_CFG(tl2_tl3_link_schq, link)); + if (!(cfg & BIT_ULL(12))) + continue; + bmap |= (1 << i); + cfg &= ~BIT_ULL(12); + rvu_write64(rvu, blkaddr, + NIX_AF_TL3_TL2X_LINKX_CFG(tl2_tl3_link_schq, link), cfg); + } + + /* Do SMQ flush and set enqueue xoff */ + cfg = rvu_read64(rvu, blkaddr, NIX_AF_SMQX_CFG(smq)); + cfg |= BIT_ULL(50) | BIT_ULL(49); + rvu_write64(rvu, blkaddr, NIX_AF_SMQX_CFG(smq), cfg); + /* Wait for flush to complete */ err = rvu_poll_reg(rvu, blkaddr, NIX_AF_SMQX_CFG(smq), BIT_ULL(49), true); @@ -2406,6 +2430,17 @@ static int nix_smq_flush(struct rvu *rvu, int blkaddr, "NIXLF%d: SMQ%d flush failed, txlink might be busy\n", nixlf, smq); + /* Set NIX_AF_TL3_TL2_LINKX_CFG[ENA] for the TL3/TL2 queue */ + for (i = 0; i < (rvu->hw->cgx_links + rvu->hw->lbk_links); i++) { + if (!(bmap & (1 << i))) + continue; + cfg = rvu_read64(rvu, blkaddr, + NIX_AF_TL3_TL2X_LINKX_CFG(tl2_tl3_link_schq, link)); + cfg |= BIT_ULL(12); + rvu_write64(rvu, blkaddr, + NIX_AF_TL3_TL2X_LINKX_CFG(tl2_tl3_link_schq, link), cfg); + } + /* clear XOFF on TL2s */ nix_smq_flush_enadis_rate(rvu, blkaddr, smq_flush_ctx, true); nix_smq_flush_enadis_xoff(rvu, blkaddr, smq_flush_ctx, false); From fef2843bb49f414d1523ca007d088071dee0e055 Mon Sep 17 00:00:00 2001 From: Jacky Chou Date: Fri, 6 Sep 2024 14:28:31 +0800 Subject: [PATCH 16/36] net: ftgmac100: Enable TX interrupt to avoid TX timeout Currently, the driver only enables RX interrupt to handle RX packets and TX resources. Sometimes there is not RX traffic, so the TX resource needs to wait for RX interrupt to free. This situation will toggle the TX timeout watchdog when the MAC TX ring has no more resources to transmit packets. Therefore, enable TX interrupt to release TX resources at any time. When I am verifying iperf3 over UDP, the network hangs. Like the log below. 
root# iperf3 -c 192.168.100.100 -i1 -t10 -u -b0 Connecting to host 192.168.100.100, port 5201 [ 4] local 192.168.100.101 port 35773 connected to 192.168.100.100 port 5201 [ ID] Interval Transfer Bandwidth Total Datagrams [ 4] 0.00-20.42 sec 160 KBytes 64.2 Kbits/sec 20 [ 4] 20.42-20.42 sec 0.00 Bytes 0.00 bits/sec 0 [ 4] 20.42-20.42 sec 0.00 Bytes 0.00 bits/sec 0 [ 4] 20.42-20.42 sec 0.00 Bytes 0.00 bits/sec 0 [ 4] 20.42-20.42 sec 0.00 Bytes 0.00 bits/sec 0 [ 4] 20.42-20.42 sec 0.00 Bytes 0.00 bits/sec 0 [ 4] 20.42-20.42 sec 0.00 Bytes 0.00 bits/sec 0 [ 4] 20.42-20.42 sec 0.00 Bytes 0.00 bits/sec 0 [ 4] 20.42-20.42 sec 0.00 Bytes 0.00 bits/sec 0 [ 4] 20.42-20.42 sec 0.00 Bytes 0.00 bits/sec 0 - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bandwidth Jitter Lost/Total Datagrams [ 4] 0.00-20.42 sec 160 KBytes 64.2 Kbits/sec 0.000 ms 0/20 (0%) [ 4] Sent 20 datagrams iperf3: error - the server has terminated The network topology is FTGMAC connects directly to a PC. UDP does not need to wait for ACK, unlike TCP. Therefore, FTGMAC needs to enable TX interrupt to release TX resources instead of waiting for the RX interrupt. Fixes: 10cbd6407609 ("ftgmac100: Rework NAPI & interrupts handling") Signed-off-by: Jacky Chou Link: https://patch.msgid.link/20240906062831.2243399-1-jacky_chou@aspeedtech.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/faraday/ftgmac100.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/faraday/ftgmac100.h b/drivers/net/ethernet/faraday/ftgmac100.h index 63b3e02fab16..4968f6f0bdbc 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.h +++ b/drivers/net/ethernet/faraday/ftgmac100.h @@ -84,7 +84,7 @@ FTGMAC100_INT_RPKT_BUF) /* All the interrupts we care about */ -#define FTGMAC100_INT_ALL (FTGMAC100_INT_RPKT_BUF | \ +#define FTGMAC100_INT_ALL (FTGMAC100_INT_RXTX | \ FTGMAC100_INT_BAD) /* From dc4547fbba874718af76e5c28c815fcef5c13c6c Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Fri, 6 Sep 2024 20:31:35 +0800 Subject: [PATCH 17/36] Revert "virtio_net: rx remove premapped failover code" This reverts commit defd28aa5acb0fd7c15adc6bc40a8ac277d04dea. Recover the code to disable premapped mode. Signed-off-by: Xuan Zhuo Acked-by: Michael S. 
Tsirkin Tested-by: Takero Funaki Link: https://patch.msgid.link/20240906123137.108741-2-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 89 +++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 37 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index c6af18948092..6fa8aab18484 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -356,6 +356,9 @@ struct receive_queue { struct xdp_rxq_info xsk_rxq_info; struct xdp_buff **xsk_buffs; + + /* Do dma by self */ + bool do_dma; }; /* This structure can contain rss message with maximum settings for indirection table and keysize @@ -885,7 +888,7 @@ static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx) void *buf; buf = virtqueue_get_buf_ctx(rq->vq, len, ctx); - if (buf) + if (buf && rq->do_dma) virtnet_rq_unmap(rq, buf, *len); return buf; @@ -898,6 +901,11 @@ static void virtnet_rq_init_one_sg(struct receive_queue *rq, void *buf, u32 len) u32 offset; void *head; + if (!rq->do_dma) { + sg_init_one(rq->sg, buf, len); + return; + } + head = page_address(rq->alloc_frag.page); offset = buf - head; @@ -923,42 +931,44 @@ static void *virtnet_rq_alloc(struct receive_queue *rq, u32 size, gfp_t gfp) head = page_address(alloc_frag->page); - dma = head; + if (rq->do_dma) { + dma = head; - /* new pages */ - if (!alloc_frag->offset) { - if (rq->last_dma) { - /* Now, the new page is allocated, the last dma - * will not be used. So the dma can be unmapped - * if the ref is 0. + /* new pages */ + if (!alloc_frag->offset) { + if (rq->last_dma) { + /* Now, the new page is allocated, the last dma + * will not be used. So the dma can be unmapped + * if the ref is 0. + */ + virtnet_rq_unmap(rq, rq->last_dma, 0); + rq->last_dma = NULL; + } + + dma->len = alloc_frag->size - sizeof(*dma); + + addr = virtqueue_dma_map_single_attrs(rq->vq, dma + 1, + dma->len, DMA_FROM_DEVICE, 0); + if (virtqueue_dma_mapping_error(rq->vq, addr)) + return NULL; + + dma->addr = addr; + dma->need_sync = virtqueue_dma_need_sync(rq->vq, addr); + + /* Add a reference to dma to prevent the entire dma from + * being released during error handling. This reference + * will be freed after the pages are no longer used. */ - virtnet_rq_unmap(rq, rq->last_dma, 0); - rq->last_dma = NULL; + get_page(alloc_frag->page); + dma->ref = 1; + alloc_frag->offset = sizeof(*dma); + + rq->last_dma = dma; } - dma->len = alloc_frag->size - sizeof(*dma); - - addr = virtqueue_dma_map_single_attrs(rq->vq, dma + 1, - dma->len, DMA_FROM_DEVICE, 0); - if (virtqueue_dma_mapping_error(rq->vq, addr)) - return NULL; - - dma->addr = addr; - dma->need_sync = virtqueue_dma_need_sync(rq->vq, addr); - - /* Add a reference to dma to prevent the entire dma from - * being released during error handling. This reference - * will be freed after the pages are no longer used. 
- */ - get_page(alloc_frag->page); - dma->ref = 1; - alloc_frag->offset = sizeof(*dma); - - rq->last_dma = dma; + ++dma->ref; } - ++dma->ref; - buf = head + alloc_frag->offset; get_page(alloc_frag->page); @@ -975,9 +985,12 @@ static void virtnet_rq_set_premapped(struct virtnet_info *vi) if (!vi->mergeable_rx_bufs && vi->big_packets) return; - for (i = 0; i < vi->max_queue_pairs; i++) - /* error should never happen */ - BUG_ON(virtqueue_set_dma_premapped(vi->rq[i].vq)); + for (i = 0; i < vi->max_queue_pairs; i++) { + if (virtqueue_set_dma_premapped(vi->rq[i].vq)) + continue; + + vi->rq[i].do_dma = true; + } } static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) @@ -2430,7 +2443,8 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq, err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); if (err < 0) { - virtnet_rq_unmap(rq, buf, 0); + if (rq->do_dma) + virtnet_rq_unmap(rq, buf, 0); put_page(virt_to_head_page(buf)); } @@ -2544,7 +2558,8 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, ctx = mergeable_len_to_ctx(len + room, headroom); err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); if (err < 0) { - virtnet_rq_unmap(rq, buf, 0); + if (rq->do_dma) + virtnet_rq_unmap(rq, buf, 0); put_page(virt_to_head_page(buf)); } @@ -5892,7 +5907,7 @@ static void free_receive_page_frags(struct virtnet_info *vi) int i; for (i = 0; i < vi->max_queue_pairs; i++) if (vi->rq[i].alloc_frag.page) { - if (vi->rq[i].last_dma) + if (vi->rq[i].do_dma && vi->rq[i].last_dma) virtnet_rq_unmap(&vi->rq[i], vi->rq[i].last_dma, 0); put_page(vi->rq[i].alloc_frag.page); } From 38eef112a8e547b8c207b2a521ad4b077d792100 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Fri, 6 Sep 2024 20:31:36 +0800 Subject: [PATCH 18/36] Revert "virtio_net: big mode skip the unmap check" This reverts commit a377ae542d8d0a20a3173da3bbba72e045bea7a9. Signed-off-by: Xuan Zhuo Acked-by: Michael S. Tsirkin Tested-by: Takero Funaki Link: https://patch.msgid.link/20240906123137.108741-3-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 6fa8aab18484..1cf80648f82a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1006,7 +1006,7 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) return; } - if (!vi->big_packets || vi->mergeable_rx_bufs) + if (rq->do_dma) virtnet_rq_unmap(rq, buf, 0); virtnet_rq_free_buf(vi, rq, buf); @@ -2716,7 +2716,7 @@ static int virtnet_receive_packets(struct virtnet_info *vi, } } else { while (packets < budget && - (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) { + (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) { receive_buf(vi, rq, buf, len, NULL, xdp_xmit, stats); packets++; } From 111fc9f517cb293c4213673733b980123c3b0209 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Fri, 6 Sep 2024 20:31:37 +0800 Subject: [PATCH 19/36] virtio_net: disable premapped mode by default Now, the premapped mode encounters some problem. http://lore.kernel.org/all/8b20cc28-45a9-4643-8e87-ba164a540c0a@oracle.com So we disable the premapped mode by default. We can re-enable it in the future. Fixes: f9dac92ba908 ("virtio_ring: enable premapped mode whatever use_dma_api") Reported-by: "Si-Wei Liu" Closes: http://lore.kernel.org/all/8b20cc28-45a9-4643-8e87-ba164a540c0a@oracle.com Signed-off-by: Xuan Zhuo Acked-by: Michael S. 
Tsirkin Tested-by: Takero Funaki Link: https://patch.msgid.link/20240906123137.108741-4-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 1cf80648f82a..5a1c1ec5a64b 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -977,22 +977,6 @@ static void *virtnet_rq_alloc(struct receive_queue *rq, u32 size, gfp_t gfp) return buf; } -static void virtnet_rq_set_premapped(struct virtnet_info *vi) -{ - int i; - - /* disable for big mode */ - if (!vi->mergeable_rx_bufs && vi->big_packets) - return; - - for (i = 0; i < vi->max_queue_pairs; i++) { - if (virtqueue_set_dma_premapped(vi->rq[i].vq)) - continue; - - vi->rq[i].do_dma = true; - } -} - static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) { struct virtnet_info *vi = vq->vdev->priv; @@ -6105,8 +6089,6 @@ static int init_vqs(struct virtnet_info *vi) if (ret) goto err_free; - virtnet_rq_set_premapped(vi); - cpus_read_lock(); virtnet_set_affinity(vi); cpus_read_unlock(); From 3f62ea572b3e8e3f10c39a9cb4f04ca9ae5f2952 Mon Sep 17 00:00:00 2001 From: Tomas Paukrt Date: Fri, 6 Sep 2024 12:52:40 +0200 Subject: [PATCH 20/36] net: phy: dp83822: Fix NULL pointer dereference on DP83825 devices The probe() function is only used for DP83822 and DP83826 PHY, leaving the private data pointer uninitialized for the DP83825 models which causes a NULL pointer dereference in the recently introduced/changed functions dp8382x_config_init() and dp83822_set_wol(). Add the dp8382x_probe() function, so all PHY models will have a valid private data pointer to fix this issue and also prevent similar issues in the future. Fixes: 9ef9ecfa9e9f ("net: phy: dp8382x: keep WOL settings across suspends") Signed-off-by: Tomas Paukrt Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/66w.ZbGt.65Ljx42yHo5.1csjxu@seznam.cz Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83822.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index efeb643c1373..fc247f479257 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -271,8 +271,7 @@ static int dp83822_config_intr(struct phy_device *phydev) DP83822_ENERGY_DET_INT_EN | DP83822_LINK_QUAL_INT_EN); - /* Private data pointer is NULL on DP83825 */ - if (!dp83822 || !dp83822->fx_enabled) + if (!dp83822->fx_enabled) misr_status |= DP83822_ANEG_COMPLETE_INT_EN | DP83822_DUP_MODE_CHANGE_INT_EN | DP83822_SPEED_CHANGED_INT_EN; @@ -292,8 +291,7 @@ static int dp83822_config_intr(struct phy_device *phydev) DP83822_PAGE_RX_INT_EN | DP83822_EEE_ERROR_CHANGE_INT_EN); - /* Private data pointer is NULL on DP83825 */ - if (!dp83822 || !dp83822->fx_enabled) + if (!dp83822->fx_enabled) misr_status |= DP83822_ANEG_ERR_INT_EN | DP83822_WOL_PKT_INT_EN; @@ -691,10 +689,9 @@ static int dp83822_read_straps(struct phy_device *phydev) return 0; } -static int dp83822_probe(struct phy_device *phydev) +static int dp8382x_probe(struct phy_device *phydev) { struct dp83822_private *dp83822; - int ret; dp83822 = devm_kzalloc(&phydev->mdio.dev, sizeof(*dp83822), GFP_KERNEL); @@ -703,6 +700,20 @@ static int dp83822_probe(struct phy_device *phydev) phydev->priv = dp83822; + return 0; +} + +static int dp83822_probe(struct phy_device *phydev) +{ + struct dp83822_private *dp83822; + int ret; + + ret = dp8382x_probe(phydev); + if (ret) + return ret; 
+ + dp83822 = phydev->priv; + ret = dp83822_read_straps(phydev); if (ret) return ret; @@ -717,14 +728,11 @@ static int dp83822_probe(struct phy_device *phydev) static int dp83826_probe(struct phy_device *phydev) { - struct dp83822_private *dp83822; + int ret; - dp83822 = devm_kzalloc(&phydev->mdio.dev, sizeof(*dp83822), - GFP_KERNEL); - if (!dp83822) - return -ENOMEM; - - phydev->priv = dp83822; + ret = dp8382x_probe(phydev); + if (ret) + return ret; dp83826_of_init(phydev); @@ -795,6 +803,7 @@ static int dp83822_resume(struct phy_device *phydev) PHY_ID_MATCH_MODEL(_id), \ .name = (_name), \ /* PHY_BASIC_FEATURES */ \ + .probe = dp8382x_probe, \ .soft_reset = dp83822_phy_reset, \ .config_init = dp8382x_config_init, \ .get_wol = dp83822_get_wol, \ From e8a63d473b49011a68a748aea1c8aefa046ebacf Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Fri, 6 Sep 2024 17:07:43 -0400 Subject: [PATCH 21/36] selftests: net: csum: Fix checksums for packets with non-zero padding Padding is not included in UDP and TCP checksums. Therefore, reduce the length of the checksummed data to include only the data in the IP payload. This fixes spurious reported checksum failures like rx: pkt: sport=33000 len=26 csum=0xc850 verify=0xf9fe pkt: bad csum Technically it is possible for there to be trailing bytes after the UDP data but before the Ethernet padding (e.g. if sizeof(ip) + sizeof(udp) + udp.len < ip.len). However, we don't generate such packets. Fixes: 91a7de85600d ("selftests/net: add csum offload test") Signed-off-by: Sean Anderson Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20240906210743.627413-1-sean.anderson@linux.dev Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/csum.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/lib/csum.c b/tools/testing/selftests/net/lib/csum.c index b9f3fc3c3426..e0a34e5e8dd5 100644 --- a/tools/testing/selftests/net/lib/csum.c +++ b/tools/testing/selftests/net/lib/csum.c @@ -654,10 +654,16 @@ static int recv_verify_packet_ipv4(void *nh, int len) { struct iphdr *iph = nh; uint16_t proto = cfg_encap ? IPPROTO_UDP : cfg_proto; + uint16_t ip_len; if (len < sizeof(*iph) || iph->protocol != proto) return -1; + ip_len = ntohs(iph->tot_len); + if (ip_len > len || ip_len < sizeof(*iph)) + return -1; + + len = ip_len; iph_addr_p = &iph->saddr; if (proto == IPPROTO_TCP) return recv_verify_packet_tcp(iph + 1, len - sizeof(*iph)); @@ -669,16 +675,22 @@ static int recv_verify_packet_ipv6(void *nh, int len) { struct ipv6hdr *ip6h = nh; uint16_t proto = cfg_encap ? IPPROTO_UDP : cfg_proto; + uint16_t ip_len; if (len < sizeof(*ip6h) || ip6h->nexthdr != proto) return -1; + ip_len = ntohs(ip6h->payload_len); + if (ip_len > len - sizeof(*ip6h)) + return -1; + + len = ip_len; iph_addr_p = &ip6h->saddr; if (proto == IPPROTO_TCP) - return recv_verify_packet_tcp(ip6h + 1, len - sizeof(*ip6h)); + return recv_verify_packet_tcp(ip6h + 1, len); else - return recv_verify_packet_udp(ip6h + 1, len - sizeof(*ip6h)); + return recv_verify_packet_udp(ip6h + 1, len); } /* return whether auxdata includes TP_STATUS_CSUM_VALID */ From 2f9caba9b2f68639047d67d7cbfa98f7f2ac3180 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Mon, 9 Sep 2024 09:21:52 +0800 Subject: [PATCH 22/36] dt-bindings: net: tja11xx: fix the broken binding As Rob pointed in another mail thread [1], the binding of tja11xx PHY is completely broken, the schema cannot catch the error in the DTS. 
A compatible string is needed if we want to add a custom property. So extract the known PHY IDs from the tja11xx PHY drivers and convert them into a list of supported compatible strings to fix the broken binding issue. Fixes: 52b2fe4535ad ("dt-bindings: net: tja11xx: add nxp,refclk_in property") Link: https://lore.kernel.org/31058f49-bac5-49a9-a422-c43b121bf049@kernel.org # [1] Signed-off-by: Wei Fang Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20240909012152.431647-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- .../devicetree/bindings/net/nxp,tja11xx.yaml | 62 ++++++++++++++----- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/Documentation/devicetree/bindings/net/nxp,tja11xx.yaml b/Documentation/devicetree/bindings/net/nxp,tja11xx.yaml index 85bfa45f5122..a754a61adc2d 100644 --- a/Documentation/devicetree/bindings/net/nxp,tja11xx.yaml +++ b/Documentation/devicetree/bindings/net/nxp,tja11xx.yaml @@ -14,8 +14,53 @@ maintainers: description: Bindings for NXP TJA11xx automotive PHYs +properties: + compatible: + enum: + - ethernet-phy-id0180.dc40 + - ethernet-phy-id0180.dc41 + - ethernet-phy-id0180.dc48 + - ethernet-phy-id0180.dd00 + - ethernet-phy-id0180.dd01 + - ethernet-phy-id0180.dd02 + - ethernet-phy-id0180.dc80 + - ethernet-phy-id0180.dc82 + - ethernet-phy-id001b.b010 + - ethernet-phy-id001b.b013 + - ethernet-phy-id001b.b030 + - ethernet-phy-id001b.b031 + allOf: - $ref: ethernet-phy.yaml# + - if: + properties: + compatible: + contains: + enum: + - ethernet-phy-id0180.dc40 + - ethernet-phy-id0180.dc41 + - ethernet-phy-id0180.dc48 + - ethernet-phy-id0180.dd00 + - ethernet-phy-id0180.dd01 + - ethernet-phy-id0180.dd02 + + then: + properties: + nxp,rmii-refclk-in: + type: boolean + description: | + The REF_CLK is provided for both transmitted and received data + in RMII mode. This clock signal is provided by the PHY and is + typically derived from an external 25MHz crystal. Alternatively, + a 50MHz clock signal generated by an external oscillator can be + connected to pin REF_CLK. A third option is to connect a 25MHz + clock to pin CLK_IN_OUT. So, the REF_CLK should be configured + as input or output according to the actual circuit connection. + If present, indicates that the REF_CLK will be configured as + interface reference clock input when RMII mode enabled. + If not present, the REF_CLK will be configured as interface + reference clock output when RMII mode enabled. + Only supported on TJA1100 and TJA1101. patternProperties: "^ethernet-phy@[0-9a-f]+$": @@ -32,22 +77,6 @@ patternProperties: description: The ID number for the child PHY. Should be +1 of parent PHY. - nxp,rmii-refclk-in: - type: boolean - description: | - The REF_CLK is provided for both transmitted and received data - in RMII mode. This clock signal is provided by the PHY and is - typically derived from an external 25MHz crystal. Alternatively, - a 50MHz clock signal generated by an external oscillator can be - connected to pin REF_CLK. A third option is to connect a 25MHz - clock to pin CLK_IN_OUT. So, the REF_CLK should be configured - as input or output according to the actual circuit connection. - If present, indicates that the REF_CLK will be configured as - interface reference clock input when RMII mode enabled. - If not present, the REF_CLK will be configured as interface - reference clock output when RMII mode enabled. - Only supported on TJA1100 and TJA1101.
- required: - reg @@ -60,6 +89,7 @@ examples: #size-cells = <0>; tja1101_phy0: ethernet-phy@4 { + compatible = "ethernet-phy-id0180.dc40"; reg = <0x4>; nxp,rmii-refclk-in; }; From 330dadacc59c2290e0fae47736ccd26b74aa1fd9 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Mon, 9 Sep 2024 13:43:36 +0200 Subject: [PATCH 23/36] MAINTAINERS: Add ethtool pse-pd to PSE NETWORK DRIVER Add net/ethtool/pse-pd.c to PSE NETWORK DRIVER to receive emails concerning modifications to the ethtool part. Reviewed-by: Oleksij Rempel Signed-off-by: Kory Maincent Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240909114336.362174-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 511b92392f6e..93f413a3665f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18407,6 +18407,7 @@ L: netdev@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/pse-pd/ F: drivers/net/pse-pd/ +F: net/ethtool/pse-pd.c PSTORE FILESYSTEM M: Kees Cook From 8b26ff7af8c32cb4148b3e147c52f9e4c695209c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 5 Sep 2024 12:54:46 +0200 Subject: [PATCH 24/36] netfilter: nft_socket: fix sk refcount leaks We must put the 'sk' reference before returning. Fixes: 039b1f4f24ec ("netfilter: nft_socket: fix erroneous socket assignment") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_socket.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index f30163e2ca62..765ffd6e06bc 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -110,13 +110,13 @@ static void nft_socket_eval(const struct nft_expr *expr, *dest = READ_ONCE(sk->sk_mark); } else { regs->verdict.code = NFT_BREAK; - return; + goto out_put_sk; } break; case NFT_SOCKET_WILDCARD: if (!sk_fullsock(sk)) { regs->verdict.code = NFT_BREAK; - return; + goto out_put_sk; } nft_socket_wildcard(pkt, regs, sk, dest); break; @@ -124,7 +124,7 @@ static void nft_socket_eval(const struct nft_expr *expr, case NFT_SOCKET_CGROUPV2: if (!nft_sock_get_eval_cgroupv2(dest, sk, pkt, priv->level)) { regs->verdict.code = NFT_BREAK; - return; + goto out_put_sk; } break; #endif @@ -133,6 +133,7 @@ static void nft_socket_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; } +out_put_sk: if (sk != skb->sk) sock_gen_put(sk); } From 7f3287db654395f9c5ddd246325ff7889f550286 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 7 Sep 2024 16:07:49 +0200 Subject: [PATCH 25/36] netfilter: nft_socket: make cgroupsv2 matching work with namespaces When running in a container environment, /sys/fs/cgroup/ might not be the real root node of the sk-attached cgroup. Example: In container: % stat /sys//fs/cgroup/ Device: 0,21 Inode: 2214 .. % stat /sys/fs/cgroup/foo Device: 0,21 Inode: 2264 .. The expectation would be for: nft add rule .. socket cgroupv2 level 1 "foo" counter to match traffic from a process that got added to "foo" via "echo $pid > /sys/fs/cgroup/foo/cgroup.procs". However, 'level 3' is needed to make this work. Seen from the initial namespace, the complete hierarchy is: % stat /sys/fs/cgroup/system.slice/docker-.../foo Device: 0,21 Inode: 2264 .. i.e. the hierarchy is / (level 0) -> system.slice (1) -> docker-1... (2) -> foo (3) ... but the container doesn't know that its "/" is the "docker-1.." cgroup.
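In a container that uses a cgroup namespace, this can also be seen by comparing the process' own view of its cgroup with the view from the initial namespace (a rough illustration only; the slice and scope names are made up):

In container:
% cat /proc/self/cgroup
0::/foo

From the initial namespace, for the same process:
% cat /proc/$PID/cgroup
0::/system.slice/docker-1...scope/foo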
Current code will retrieve the 'system.slice' cgroup node and store its kn->id in the destination register, so compare with 2264 ("foo" cgroup id) will not match. Fetch "/" cgroup from ->init() and add its level to the level we try to extract. cgroup root-level is 0 for the init-namespace or the level of the ancestor that is exposed as the cgroup root inside the container. In the above case, cgrp->level of "/" resolved in the container is 2 (docker-1...scope/) and request for 'level 1' will get adjusted to fetch the actual level (3). v2: use CONFIG_SOCK_CGROUP_DATA, eval function depends on it. (kernel test robot) Cc: cgroups@vger.kernel.org Fixes: e0bb96db96f8 ("netfilter: nft_socket: add support for cgroupsv2") Reported-by: Nadia Pinaeva Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_socket.c | 41 +++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 765ffd6e06bc..12cdff640492 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -9,7 +9,8 @@ struct nft_socket { enum nft_socket_keys key:8; - u8 level; + u8 level; /* cgroupv2 level to extract */ + u8 level_user; /* cgroupv2 level provided by userspace */ u8 len; union { u8 dreg; @@ -53,6 +54,28 @@ nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo memcpy(dest, &cgid, sizeof(u64)); return true; } + +/* process context only, uses current->nsproxy. */ +static noinline int nft_socket_cgroup_subtree_level(void) +{ + struct cgroup *cgrp = cgroup_get_from_path("/"); + int level; + + if (!cgrp) + return -ENOENT; + + level = cgrp->level; + + cgroup_put(cgrp); + + if (WARN_ON_ONCE(level > 255)) + return -ERANGE; + + if (WARN_ON_ONCE(level < 0)) + return -EINVAL; + + return level; +} #endif static struct sock *nft_socket_do_lookup(const struct nft_pktinfo *pkt) @@ -174,9 +197,10 @@ static int nft_socket_init(const struct nft_ctx *ctx, case NFT_SOCKET_MARK: len = sizeof(u32); break; -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_SOCK_CGROUP_DATA case NFT_SOCKET_CGROUPV2: { unsigned int level; + int err; if (!tb[NFTA_SOCKET_LEVEL]) return -EINVAL; @@ -185,6 +209,17 @@ static int nft_socket_init(const struct nft_ctx *ctx, if (level > 255) return -EOPNOTSUPP; + err = nft_socket_cgroup_subtree_level(); + if (err < 0) + return err; + + priv->level_user = level; + + level += err; + /* Implies a giant cgroup tree */ + if (WARN_ON_ONCE(level > 255)) + return -EOPNOTSUPP; + priv->level = level; len = sizeof(u64); break; @@ -209,7 +244,7 @@ static int nft_socket_dump(struct sk_buff *skb, if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg)) return -1; if (priv->key == NFT_SOCKET_CGROUPV2 && - nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level))) + nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level_user))) return -1; return 0; } From 49ac6f05ace5bb0070c68a0193aa05d3c25d4c83 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 10 Sep 2024 21:06:36 +0200 Subject: [PATCH 26/36] selftests: mptcp: join: restrict fullmesh endp on 1st sf A new endpoint using the IP of the initial subflow has been recently added to increase the code coverage. But it breaks the test when using old kernels not having commit 86e39e04482b ("mptcp: keep track of local endpoint still available for each msk"), e.g. on v5.15. 
Similar to commit d4c81bbb8600 ("selftests: mptcp: join: support local endpoint being tracked or not"), it is possible to add the new endpoint conditionally, by checking if "mptcp_pm_subflow_check_next" is present in kallsyms: this is not directly linked to the commit introducing this symbol but for the parent one which is linked anyway. So we can know in advance what will be the expected behaviour, and add the new endpoint only when it makes sense to do so. Fixes: 4878f9f8421f ("selftests: mptcp: join: validate fullmesh endp on 1st sf") Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240910-net-selftests-mptcp-fix-install-v1-1-8f124aa9156d@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index a4762c49a878..cde041c93906 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3064,7 +3064,9 @@ fullmesh_tests() pm_nl_set_limits $ns1 1 3 pm_nl_set_limits $ns2 1 3 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal - pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,fullmesh + if mptcp_lib_kallsyms_has "mptcp_pm_subflow_check_next$"; then + pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,fullmesh + fi fullmesh=1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 From 1a5a2d19e827447e6a4d768504866e6820819cee Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 10 Sep 2024 21:06:37 +0200 Subject: [PATCH 27/36] selftests: mptcp: include lib.sh file The lib.sh file from the parent directory is used by the MPTCP selftests and it needs to be present when running the tests. This file then needs to be listed in the Makefile to be included when exporting or installing the tests, e.g. with: make -C tools/testing/selftests \ TARGETS=net/mptcp \ install INSTALL_PATH=$KSFT_INSTALL_PATH cd $KSFT_INSTALL_PATH ./run_kselftest.sh -c net/mptcp Fixes: f265d3119a29 ("selftests: mptcp: lib: use setup/cleanup_ns helpers") Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240910-net-selftests-mptcp-fix-install-v1-2-8f124aa9156d@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index 7b936a926859..68924053a1e6 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -11,6 +11,8 @@ TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt mptcp_inq TEST_FILES := mptcp_lib.sh settings +TEST_INCLUDES := ../lib.sh + EXTRA_CLEAN := *.pcap include ../../lib.mk From c66c08e51b55c30ae333d6027e4dfda209710d46 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 10 Sep 2024 21:06:38 +0200 Subject: [PATCH 28/36] selftests: mptcp: include net_helper.sh file Similar to the previous commit, the net_helper.sh file from the parent directory is used by the MPTCP selftests and it needs to be present when running the tests. This file then needs to be listed in the Makefile to be included when exporting or installing the tests, e.g. 
with: make -C tools/testing/selftests \ TARGETS=net/mptcp \ install INSTALL_PATH=$KSFT_INSTALL_PATH cd $KSFT_INSTALL_PATH ./run_kselftest.sh -c net/mptcp Fixes: 1af3bc912eac ("selftests: mptcp: lib: use wait_local_port_listen helper") Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240910-net-selftests-mptcp-fix-install-v1-3-8f124aa9156d@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index 68924053a1e6..5d796622e730 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -11,7 +11,7 @@ TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt mptcp_inq TEST_FILES := mptcp_lib.sh settings -TEST_INCLUDES := ../lib.sh +TEST_INCLUDES := ../lib.sh ../net_helper.sh EXTRA_CLEAN := *.pcap From a7789fd4caaf96ecfed5e28c4cddb927e6bebadb Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Sun, 8 Sep 2024 04:03:41 +0900 Subject: [PATCH 29/36] net: hsr: prevent NULL pointer dereference in hsr_proxy_announce() In the function hsr_proxy_announce(), added in the previous commit 5f703ce5c981 ("net: hsr: Send supervisory frames to HSR network with ProxyNodeTable data"), the return value of hsr_port_get_hsr() is not checked for NULL, which causes a NULL pointer dereference. To solve this, add a check for whether the return value of hsr_port_get_hsr() is NULL. Reported-by: syzbot+02a42d9b1bd395cbcab4@syzkaller.appspotmail.com Fixes: 5f703ce5c981 ("net: hsr: Send supervisory frames to HSR network with ProxyNodeTable data") Signed-off-by: Jeongjun Park Reviewed-by: Simon Horman Acked-by: Lukasz Majewski Link: https://patch.msgid.link/20240907190341.162289-1-aha310510@gmail.com Signed-off-by: Jakub Kicinski --- net/hsr/hsr_device.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index ac56784c327c..049e22bdaafb 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -414,6 +414,9 @@ static void hsr_proxy_announce(struct timer_list *t) * of SAN nodes stored in ProxyNodeTable. */ interlink = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK); + if (!interlink) + goto done; + list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) { if (hsr_addr_is_redbox(hsr, node->macaddress_A)) continue; @@ -428,6 +431,7 @@ static void hsr_proxy_announce(struct timer_list *t) mod_timer(&hsr->announce_proxy_timer, jiffies + interval); } +done: rcu_read_unlock(); } From 70654f4c212e83898feced125d91ebb3695950d8 Mon Sep 17 00:00:00 2001 From: Xiaoliang Yang Date: Fri, 6 Sep 2024 17:35:50 +0800 Subject: [PATCH 30/36] net: dsa: felix: ignore pending status of TAS module when it's disabled The TAS module cannot be configured while it is running in the pending status; we need to disable the module and configure it again. However, the pending status is not cleared after the module is disabled, so a tc-taprio set always returns busy even though the module is disabled. For example, if a user uses tc-taprio to configure Qbv with a future base-time, the TAS module will run in the pending status and there is no way to reconfigure Qbv; it always returns busy. In fact, the TAS module can be reconfigured while it is disabled, so there is no need to check the pending status when the TAS module is disabled. After this patch, the user can delete the tc-taprio configuration to disable Qbv and then reconfigure it again.
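A minimal illustration of the resulting workflow (not part of this patch; the interface name and schedule values below are made up):

# install a schedule with a base-time in the future -> TAS stays pending
tc qdisc replace dev swp0 parent root handle 100 taprio \
        num_tc 2 map 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 \
        queues 1@0 1@1 \
        base-time $(( ($(date +%s) + 60) * 1000000000 )) \
        sched-entry S 01 300000 sched-entry S 02 300000 \
        flags 0x2
# before this change, any further taprio replace returned -EBUSY;
# now the schedule can be deleted (TAS disabled) and applied again
tc qdisc del dev swp0 parent root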
Fixes: de143c0e274b ("net: dsa: felix: Configure Time-Aware Scheduler via taprio offload") Signed-off-by: Xiaoliang Yang Link: https://patch.msgid.link/20240906093550.29985-1-xiaoliang.yang_1@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix_vsc9959.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index ba37a566da39..ecfa73725d25 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1474,10 +1474,13 @@ static int vsc9959_qos_port_tas_set(struct ocelot *ocelot, int port, /* Hardware errata - Admin config could not be overwritten if * config is pending, need reset the TAS module */ - val = ocelot_read(ocelot, QSYS_PARAM_STATUS_REG_8); - if (val & QSYS_PARAM_STATUS_REG_8_CONFIG_PENDING) { - ret = -EBUSY; - goto err_reset_tc; + val = ocelot_read_rix(ocelot, QSYS_TAG_CONFIG, port); + if (val & QSYS_TAG_CONFIG_ENABLE) { + val = ocelot_read(ocelot, QSYS_PARAM_STATUS_REG_8); + if (val & QSYS_PARAM_STATUS_REG_8_CONFIG_PENDING) { + ret = -EBUSY; + goto err_reset_tc; + } } ocelot_rmw_rix(ocelot, From 077ee7e6b13a2b6668196ed01a22023549e19381 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Tue, 10 Sep 2024 17:56:29 +0800 Subject: [PATCH 31/36] net: libwx: fix number of Rx and Tx descriptors The number of transmit and receive descriptors must be a multiple of 128 due to a hardware limitation. If it is set to a multiple of 8 instead of a multiple of 128, the queues can easily hang. Cc: stable@vger.kernel.org Fixes: 883b5984a5d2 ("net: wangxun: add ethtool_ops for ring parameters") Signed-off-by: Jiawen Wu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240910095629.570674-1-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_type.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_type.h b/drivers/net/ethernet/wangxun/libwx/wx_type.h index 1d57b047817b..b54bffda027b 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_type.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_type.h @@ -426,9 +426,9 @@ enum WX_MSCA_CMD_value { #define WX_MIN_RXD 128 #define WX_MIN_TXD 128 -/* Number of Transmit and Receive Descriptors must be a multiple of 8 */ -#define WX_REQ_RX_DESCRIPTOR_MULTIPLE 8 -#define WX_REQ_TX_DESCRIPTOR_MULTIPLE 8 +/* Number of Transmit and Receive Descriptors must be a multiple of 128 */ +#define WX_REQ_RX_DESCRIPTOR_MULTIPLE 128 +#define WX_REQ_TX_DESCRIPTOR_MULTIPLE 128 #define WX_MAX_JUMBO_FRAME_SIZE 9432 /* max payload 9414 */ #define VMDQ_P(p) p From b4cd80b0338945a94972ac3ed54f8338d2da2076 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Tue, 10 Sep 2024 17:58:56 +0800 Subject: [PATCH 32/36] mptcp: pm: Fix uaf in __timer_delete_sync There are two paths that access mptcp_pm_del_add_timer(), resulting in a race condition:

CPU1                            CPU2
====                            ====
net_rx_action
napi_poll                       netlink_sendmsg
__napi_poll                     netlink_unicast
process_backlog                 netlink_unicast_kernel
__netif_receive_skb             genl_rcv
__netif_receive_skb_one_core    netlink_rcv_skb
NF_HOOK                         genl_rcv_msg
ip_local_deliver_finish         genl_family_rcv_msg
ip_protocol_deliver_rcu         genl_family_rcv_msg_doit
tcp_v4_rcv                      mptcp_pm_nl_flush_addrs_doit
tcp_v4_do_rcv                   mptcp_nl_remove_addrs_list
tcp_rcv_established             mptcp_pm_remove_addrs_and_subflows
tcp_data_queue                  remove_anno_list_by_saddr
mptcp_incoming_options          mptcp_pm_del_add_timer
mptcp_pm_del_add_timer          kfree(entry)

In
remove_anno_list_by_saddr() (running on CPU2), the entry is released after leaving the critical section protected by "pm.lock", which leads to a use-after-free in mptcp_pm_del_add_timer() (running on CPU1). Keep a reference to the add_timer inside the lock, and call sk_stop_timer_sync() with this reference instead of "entry->add_timer". Move list_del(&entry->list) into mptcp_pm_del_add_timer(), inside the pm lock, and do not directly access any members of the entry outside the pm lock, which avoids similar "entry->x" use-after-free issues. Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout") Cc: stable@vger.kernel.org Reported-and-tested-by: syzbot+f3a31fb909db9b2a5c4d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f3a31fb909db9b2a5c4d Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Edward Adam Davis Acked-by: Paolo Abeni Link: https://patch.msgid.link/tencent_7142963A37944B4A74EF76CD66EA3C253609@qq.com Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index f891bc714668..ad935d34c973 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -334,15 +334,21 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, { struct mptcp_pm_add_entry *entry; struct sock *sk = (struct sock *)msk; + struct timer_list *add_timer = NULL; spin_lock_bh(&msk->pm.lock); entry = mptcp_lookup_anno_list_by_saddr(msk, addr); - if (entry && (!check_id || entry->addr.id == addr->id)) + if (entry && (!check_id || entry->addr.id == addr->id)) { entry->retrans_times = ADD_ADDR_RETRANS_MAX; + add_timer = &entry->add_timer; + } + if (!check_id && entry) + list_del(&entry->list); spin_unlock_bh(&msk->pm.lock); - if (entry && (!check_id || entry->addr.id == addr->id)) - sk_stop_timer_sync(sk, &entry->add_timer); + /* no lock, because sk_stop_timer_sync() is calling del_timer_sync() */ + if (add_timer) + sk_stop_timer_sync(sk, add_timer); return entry; } @@ -1462,7 +1468,6 @@ static bool remove_anno_list_by_saddr(struct mptcp_sock *msk, entry = mptcp_pm_del_add_timer(msk, addr, false); if (entry) { - list_del(&entry->list); kfree(entry); return true; } From cbd7ec083413c6a2e0c326d49e24ec7d12c7a9e0 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 10 Sep 2024 10:31:44 -0400 Subject: [PATCH 33/36] net: dpaa: Pad packets to ETH_ZLEN When sending packets under 60 bytes, up to three bytes of the buffer following the data may be leaked. Avoid this by extending all packets to ETH_ZLEN, ensuring nothing is leaked in the padding.
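As a rough worked example (the 32-bit word granularity is taken from the code comment added below): the 53-byte frame produced by the reproducer below (14-byte Ethernet header + 20-byte IP header + 8-byte ICMP header + 11 bytes of payload) is read as fourteen 32-bit words, i.e. 56 bytes, so the 3 bytes following the skb data would be transmitted unless they are zeroed.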
This bug can be reproduced by running $ ping -s 11 destination Fixes: 9ad1a3749333 ("dpaa_eth: add support for DPAA Ethernet") Suggested-by: Eric Dumazet Signed-off-by: Sean Anderson Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20240910143144.1439910-1-sean.anderson@linux.dev Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index cfe6b57b1da0..4a55e521c17e 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2272,12 +2272,12 @@ static netdev_tx_t dpaa_start_xmit(struct sk_buff *skb, struct net_device *net_dev) { const int queue_mapping = skb_get_queue_mapping(skb); - bool nonlinear = skb_is_nonlinear(skb); struct rtnl_link_stats64 *percpu_stats; struct dpaa_percpu_priv *percpu_priv; struct netdev_queue *txq; struct dpaa_priv *priv; struct qm_fd fd; + bool nonlinear; int offset = 0; int err = 0; @@ -2287,6 +2287,13 @@ dpaa_start_xmit(struct sk_buff *skb, struct net_device *net_dev) qm_fd_clear_fd(&fd); + /* Packet data is always read as 32-bit words, so zero out any part of + * the skb which might be sent if we have to pad the packet + */ + if (__skb_put_padto(skb, ETH_ZLEN, false)) + goto enomem; + + nonlinear = skb_is_nonlinear(skb); if (!nonlinear) { /* We're going to store the skb backpointer at the beginning * of the data buffer, so we need a privately owned skb From 09a45a5553792bbf20beba0a1ac90b4692324d06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Wed, 11 Sep 2024 09:10:02 +0000 Subject: [PATCH 34/36] netlink: specs: mptcp: fix port endianness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MPTCP port attribute is in host endianness, but was documented as big-endian in the ynl specification. Below are two examples from net/mptcp/pm_netlink.c showing that the attribute is converted to/from host endianness for use with netlink. Import from netlink: addr->port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])) Export to netlink: nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_PORT, ntohs(addr->port)) Where addr->port is defined as __be16. No functional change intended. Fixes: bc8aeb2045e2 ("Documentation: netlink: add a YAML spec for mptcp") Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Davide Caratti Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240911091003.1112179-1-ast@fiberby.net Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index af525ed29792..30d8342cacc8 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -109,7 +109,6 @@ attribute-sets: - name: port type: u16 - byte-order: big-endian - name: flags type: u32 From 6513eb3d3191574b58859ef2d6dc26c0277c6f81 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 10 Sep 2024 17:35:35 -0400 Subject: [PATCH 35/36] net: tighten bad gso csum offset check in virtio_net_hdr The referenced commit drops bad input, but has false positives. Tighten the check to avoid these. The check detects illegal checksum offload requests, which produce csum_start/csum_off beyond end of packet after segmentation. But it is based on two incorrect assumptions: 1. 
virtio_net_hdr_to_skb with VIRTIO_NET_HDR_GSO_TCP[46] implies GSO. True in callers that inject into the tx path, such as tap. But false in callers that inject into rx, like virtio-net. Here, the flags indicate GRO, and CHECKSUM_UNNECESSARY or CHECKSUM_NONE without VIRTIO_NET_HDR_F_NEEDS_CSUM is normal. 2. TSO requires checksum offload, i.e., ip_summed == CHECKSUM_PARTIAL. False, as tcp[46]_gso_segment will fix up csum_start and offset for all other ip_summed by calling __tcp_v4_send_check. Because of 2, we can limit the scope of the fix to virtio_net_hdr that do try to set these fields, with a bogus value. Link: https://lore.kernel.org/netdev/20240909094527.GA3048202@port70.net/ Fixes: 89add40066f9 ("net: drop bad gso csum_start and offset in virtio_net_hdr") Signed-off-by: Willem de Bruijn Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20240910213553.839926-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/virtio_net.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 6c395a2600e8..276ca543ef44 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -173,7 +173,8 @@ retry: break; case SKB_GSO_TCPV4: case SKB_GSO_TCPV6: - if (skb->csum_offset != offsetof(struct tcphdr, check)) + if (skb->ip_summed == CHECKSUM_PARTIAL && + skb->csum_offset != offsetof(struct tcphdr, check)) return -EINVAL; break; } From 3e705251d998c9688be0e7e0526c250fec24d233 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 11 Sep 2024 17:37:30 +0200 Subject: [PATCH 36/36] net: netfilter: move nf flowtable bpf initialization in nf_flow_table_module_init() Move nf flowtable bpf initialization in nf_flow_table module load routine since nf_flow_table_bpf is part of nf_flow_table module and not nf_flow_table_inet one. This patch allows to avoid the following kernel warning running the reproducer below: $modprobe nf_flow_table_inet $rmmod nf_flow_table_inet $modprobe nf_flow_table_inet modprobe: ERROR: could not insert 'nf_flow_table_inet': Invalid argument [ 184.081501] ------------[ cut here ]------------ [ 184.081527] WARNING: CPU: 0 PID: 1362 at kernel/bpf/btf.c:8206 btf_populate_kfunc_set+0x23c/0x330 [ 184.081550] CPU: 0 UID: 0 PID: 1362 Comm: modprobe Kdump: loaded Not tainted 6.11.0-0.rc5.22.el10.x86_64 #1 [ 184.081553] Hardware name: Red Hat OpenStack Compute, BIOS 1.14.0-1.module+el8.4.0+8855+a9e237a9 04/01/2014 [ 184.081554] RIP: 0010:btf_populate_kfunc_set+0x23c/0x330 [ 184.081558] RSP: 0018:ff22cfb38071fc90 EFLAGS: 00010202 [ 184.081559] RAX: 0000000000000001 RBX: 0000000000000001 RCX: 0000000000000000 [ 184.081560] RDX: 000000000000006e RSI: ffffffff95c00000 RDI: ff13805543436350 [ 184.081561] RBP: ffffffffc0e22180 R08: ff13805543410808 R09: 000000000001ec00 [ 184.081562] R10: ff13805541c8113c R11: 0000000000000010 R12: ff13805541b83c00 [ 184.081563] R13: ff13805543410800 R14: 0000000000000001 R15: ffffffffc0e2259a [ 184.081564] FS: 00007fa436c46740(0000) GS:ff1380557ba00000(0000) knlGS:0000000000000000 [ 184.081569] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 184.081570] CR2: 000055e7b3187000 CR3: 0000000100c48003 CR4: 0000000000771ef0 [ 184.081571] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 184.081572] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 184.081572] PKRU: 55555554 [ 184.081574] Call Trace: [ 184.081575] [ 184.081578] ? 
show_trace_log_lvl+0x1b0/0x2f0 [ 184.081580] ? show_trace_log_lvl+0x1b0/0x2f0 [ 184.081582] ? __register_btf_kfunc_id_set+0x199/0x200 [ 184.081585] ? btf_populate_kfunc_set+0x23c/0x330 [ 184.081586] ? __warn.cold+0x93/0xed [ 184.081590] ? btf_populate_kfunc_set+0x23c/0x330 [ 184.081592] ? report_bug+0xff/0x140 [ 184.081594] ? handle_bug+0x3a/0x70 [ 184.081596] ? exc_invalid_op+0x17/0x70 [ 184.081597] ? asm_exc_invalid_op+0x1a/0x20 [ 184.081601] ? btf_populate_kfunc_set+0x23c/0x330 [ 184.081602] __register_btf_kfunc_id_set+0x199/0x200 [ 184.081605] ? __pfx_nf_flow_inet_module_init+0x10/0x10 [nf_flow_table_inet] [ 184.081607] do_one_initcall+0x58/0x300 [ 184.081611] do_init_module+0x60/0x230 [ 184.081614] __do_sys_init_module+0x17a/0x1b0 [ 184.081617] do_syscall_64+0x7d/0x160 [ 184.081620] ? __count_memcg_events+0x58/0xf0 [ 184.081623] ? handle_mm_fault+0x234/0x350 [ 184.081626] ? do_user_addr_fault+0x347/0x640 [ 184.081630] ? clear_bhb_loop+0x25/0x80 [ 184.081633] ? clear_bhb_loop+0x25/0x80 [ 184.081634] ? clear_bhb_loop+0x25/0x80 [ 184.081637] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 184.081639] RIP: 0033:0x7fa43652e4ce [ 184.081647] RSP: 002b:00007ffe8213be18 EFLAGS: 00000246 ORIG_RAX: 00000000000000af [ 184.081649] RAX: ffffffffffffffda RBX: 000055e7b3176c20 RCX: 00007fa43652e4ce [ 184.081650] RDX: 000055e7737fde79 RSI: 0000000000003990 RDI: 000055e7b3185380 [ 184.081651] RBP: 000055e7737fde79 R08: 0000000000000007 R09: 000055e7b3179bd0 [ 184.081651] R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000040000 [ 184.081652] R13: 000055e7b3176fa0 R14: 0000000000000000 R15: 000055e7b3179b80 Fixes: 391bb6594fd3 ("netfilter: Add bpf_xdp_flow_lookup kfunc") Signed-off-by: Lorenzo Bianconi Acked-by: Florian Westphal Acked-by: Pablo Neira Ayuso Link: https://patch.msgid.link/20240911-nf-flowtable-bpf-modprob-fix-v1-1-f9fc075aafc3@kernel.org Signed-off-by: Paolo Abeni --- net/netfilter/nf_flow_table_core.c | 6 ++++++ net/netfilter/nf_flow_table_inet.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 5c1ff07eaee0..df72b0376970 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -670,8 +670,14 @@ static int __init nf_flow_table_module_init(void) if (ret) goto out_offload; + ret = nf_flow_register_bpf(); + if (ret) + goto out_bpf; + return 0; +out_bpf: + nf_flow_table_offload_exit(); out_offload: unregister_pernet_subsys(&nf_flow_table_net_ops); return ret; diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 8b541a080342..b0f199171932 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -101,7 +101,7 @@ static int __init nf_flow_inet_module_init(void) nft_register_flowtable_type(&flowtable_ipv6); nft_register_flowtable_type(&flowtable_inet); - return nf_flow_register_bpf(); + return 0; } static void __exit nf_flow_inet_module_exit(void)