From 22600596b6756b166fd052d5facb66287e6f0bad Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 9 Oct 2024 14:47:13 -0400 Subject: ipv4: give an IPv4 dev to blackhole_netdev After commit 8d7017fd621d ("blackhole_netdev: use blackhole_netdev to invalidate dst entries"), blackhole_netdev was introduced to invalidate dst cache entries on the TX path whenever the cache times out or is flushed. When two UDP sockets (sk1 and sk2) send messages to the same destination simultaneously, they are using the same dst cache. If the dst cache is invalidated on one path (sk2) while the other (sk1) is still transmitting, sk1 may try to use the invalid dst entry. CPU1 CPU2 udp_sendmsg(sk1) udp_sendmsg(sk2) udp_send_skb() ip_output() <--- dst timeout or flushed dst_dev_put() ip_finish_output2() ip_neigh_for_gw() This results in a scenario where ip_neigh_for_gw() returns -EINVAL because blackhole_dev lacks an in_dev, which is needed to initialize the neigh in arp_constructor(). This error is then propagated back to userspace, breaking the UDP application. The patch fixes this issue by assigning an in_dev to blackhole_dev for IPv4, similar to what was done for IPv6 in commit e5f80fcf869a ("ipv6: give an IPv6 dev to blackhole_netdev"). This ensures that even when the dst entry is invalidated with blackhole_dev, it will not fail to create the neigh entry. As devinet_init() is called ealier than blackhole_netdev_init() in system booting, it can not assign the in_dev to blackhole_dev in devinet_init(). As Paolo suggested, add a separate late_initcall() in devinet.c to ensure inet_blackhole_dev_init() is called after blackhole_netdev_init(). Fixes: 8d7017fd621d ("blackhole_netdev: use blackhole_netdev to invalidate dst entries") Signed-off-by: Xin Long Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/3000792d45ca44e16c785ebe2b092e610e5b3df1.1728499633.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/devinet.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ab76744383cf..7cf5f7d0d0de 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -298,17 +298,19 @@ static struct in_device *inetdev_init(struct net_device *dev) /* Account for reference dev->ip_ptr (below) */ refcount_set(&in_dev->refcnt, 1); - err = devinet_sysctl_register(in_dev); - if (err) { - in_dev->dead = 1; - neigh_parms_release(&arp_tbl, in_dev->arp_parms); - in_dev_put(in_dev); - in_dev = NULL; - goto out; + if (dev != blackhole_netdev) { + err = devinet_sysctl_register(in_dev); + if (err) { + in_dev->dead = 1; + neigh_parms_release(&arp_tbl, in_dev->arp_parms); + in_dev_put(in_dev); + in_dev = NULL; + goto out; + } + ip_mc_init_dev(in_dev); + if (dev->flags & IFF_UP) + ip_mc_up(in_dev); } - ip_mc_init_dev(in_dev); - if (dev->flags & IFF_UP) - ip_mc_up(in_dev); /* we can receive as soon as ip_ptr is set -- do this last */ rcu_assign_pointer(dev->ip_ptr, in_dev); @@ -347,6 +349,19 @@ static void inetdev_destroy(struct in_device *in_dev) in_dev_put(in_dev); } +static int __init inet_blackhole_dev_init(void) +{ + int err = 0; + + rtnl_lock(); + if (!inetdev_init(blackhole_netdev)) + err = -ENOMEM; + rtnl_unlock(); + + return err; +} +late_initcall(inet_blackhole_dev_init); + int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) { const struct in_ifaddr *ifa; -- cgit From 8a6be4bd6fb319cee63d228e37c8dda5fd1eb74a Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Wed, 9 Oct 2024 14:49:56 +0200 Subject: net: sparx5: fix source port register when mirroring When port mirroring is added to a port, the bit position of the source port, needs to be written to the register ANA_AC_PROBE_PORT_CFG. This register is replicated for n_ports > 32, and therefore we need to derive the correct register from the port number. Before this patch, we wrongly calculate the register from portno / BITS_PER_BYTE, where the divisor ought to be 32, causing any port >=8 to be written to the wrong register. We fix this, by using do_div(), where the dividend is the register, the remainder is the bit position and the divisor is now 32. Fixes: 4e50d72b3b95 ("net: sparx5: add port mirroring implementation") Signed-off-by: Daniel Machon Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241009-mirroring-fix-v1-1-9ec962301989@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/sparx5_mirror.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_mirror.c b/drivers/net/ethernet/microchip/sparx5/sparx5_mirror.c index 15db423be4aa..459a53676ae9 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_mirror.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_mirror.c @@ -31,10 +31,10 @@ static u64 sparx5_mirror_port_get(struct sparx5 *sparx5, u32 idx) /* Add port to mirror (only front ports) */ static void sparx5_mirror_port_add(struct sparx5 *sparx5, u32 idx, u32 portno) { - u32 val, reg = portno; + u64 reg = portno; + u32 val; - reg = portno / BITS_PER_BYTE; - val = BIT(portno % BITS_PER_BYTE); + val = BIT(do_div(reg, 32)); if (reg == 0) return spx5_rmw(val, val, sparx5, ANA_AC_PROBE_PORT_CFG(idx)); @@ -45,10 +45,10 @@ static void sparx5_mirror_port_add(struct sparx5 *sparx5, u32 idx, u32 portno) /* Delete port from mirror (only front ports) */ static void sparx5_mirror_port_del(struct sparx5 *sparx5, u32 idx, u32 portno) { - u32 val, reg = portno; + u64 reg = portno; + u32 val; - reg = portno / BITS_PER_BYTE; - val = BIT(portno % BITS_PER_BYTE); + val = BIT(do_div(reg, 32)); if (reg == 0) return spx5_rmw(0, val, sparx5, ANA_AC_PROBE_PORT_CFG(idx)); -- cgit From 412950d5746f7aa139e14fe95338694c1f09b595 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 10 Oct 2024 17:20:53 +0800 Subject: net: enetc: remove xdp_drops statistic from enetc_xdp_drop() The xdp_drops statistic indicates the number of XDP frames dropped in the Rx direction. However, enetc_xdp_drop() is also used in XDP_TX and XDP_REDIRECT actions. If frame loss occurs in these two actions, the frames loss count should not be included in xdp_drops, because there are already xdp_tx_drops and xdp_redirect_failures to count the frame loss of these two actions, so it's better to remove xdp_drops statistic from enetc_xdp_drop() and increase xdp_drops in XDP_DROP action. Fixes: 7ed2bc80074e ("net: enetc: add support for XDP_TX") Cc: stable@vger.kernel.org Signed-off-by: Wei Fang Reviewed-by: Maciej Fijalkowski Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241010092056.298128-2-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 032d8eadd003..56e59721ec7d 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1521,7 +1521,6 @@ static void enetc_xdp_drop(struct enetc_bdr *rx_ring, int rx_ring_first, &rx_ring->rx_swbd[rx_ring_first]); enetc_bdr_idx_inc(rx_ring, &rx_ring_first); } - rx_ring->stats.xdp_drops++; } static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, @@ -1586,6 +1585,7 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, fallthrough; case XDP_DROP: enetc_xdp_drop(rx_ring, orig_i, i); + rx_ring->stats.xdp_drops++; break; case XDP_PASS: rxbd = orig_rxbd; -- cgit From c728a95ccf2a8ba544facfc30a4418d4c68c39f0 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 10 Oct 2024 17:20:54 +0800 Subject: net: enetc: block concurrent XDP transmissions during ring reconfiguration When testing the XDP_REDIRECT function on the LS1028A platform, we found a very reproducible issue that the Tx frames can no longer be sent out even if XDP_REDIRECT is turned off. Specifically, if there is a lot of traffic on Rx direction, when XDP_REDIRECT is turned on, the console may display some warnings like "timeout for tx ring #6 clear", and all redirected frames will be dropped, the detailed log is as follows. root@ls1028ardb:~# ./xdp-bench redirect eno0 eno2 Redirecting from eno0 (ifindex 3; driver fsl_enetc) to eno2 (ifindex 4; driver fsl_enetc) [203.849809] fsl_enetc 0000:00:00.2 eno2: timeout for tx ring #5 clear [204.006051] fsl_enetc 0000:00:00.2 eno2: timeout for tx ring #6 clear [204.161944] fsl_enetc 0000:00:00.2 eno2: timeout for tx ring #7 clear eno0->eno2 1420505 rx/s 1420590 err,drop/s 0 xmit/s xmit eno0->eno2 0 xmit/s 1420590 drop/s 0 drv_err/s 15.71 bulk-avg eno0->eno2 1420484 rx/s 1420485 err,drop/s 0 xmit/s xmit eno0->eno2 0 xmit/s 1420485 drop/s 0 drv_err/s 15.71 bulk-avg By analyzing the XDP_REDIRECT implementation of enetc driver, the driver will reconfigure Tx and Rx BD rings when a bpf program is installed or uninstalled, but there is no mechanisms to block the redirected frames when enetc driver reconfigures rings. Similarly, XDP_TX verdicts on received frames can also lead to frames being enqueued in the Tx rings. Because XDP ignores the state set by the netif_tx_wake_queue() API, so introduce the ENETC_TX_DOWN flag to suppress transmission of XDP frames. Fixes: c33bfaf91c4c ("net: enetc: set up XDP program under enetc_reconfigure()") Cc: stable@vger.kernel.org Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241010092056.298128-3-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 14 ++++++++++++++ drivers/net/ethernet/freescale/enetc/enetc.h | 1 + 2 files changed, 15 insertions(+) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 56e59721ec7d..482c44ed9d46 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -902,6 +902,7 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) if (unlikely(tx_frm_cnt && netif_carrier_ok(ndev) && __netif_subqueue_stopped(ndev, tx_ring->index) && + !test_bit(ENETC_TX_DOWN, &priv->flags) && (enetc_bd_unused(tx_ring) >= ENETC_TXBDS_MAX_NEEDED))) { netif_wake_subqueue(ndev, tx_ring->index); } @@ -1377,6 +1378,9 @@ int enetc_xdp_xmit(struct net_device *ndev, int num_frames, int xdp_tx_bd_cnt, i, k; int xdp_tx_frm_cnt = 0; + if (unlikely(test_bit(ENETC_TX_DOWN, &priv->flags))) + return -ENETDOWN; + enetc_lock_mdio(); tx_ring = priv->xdp_tx_ring[smp_processor_id()]; @@ -1602,6 +1606,12 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, break; case XDP_TX: tx_ring = priv->xdp_tx_ring[rx_ring->index]; + if (unlikely(test_bit(ENETC_TX_DOWN, &priv->flags))) { + enetc_xdp_drop(rx_ring, orig_i, i); + tx_ring->stats.xdp_tx_drops++; + break; + } + xdp_tx_bd_cnt = enetc_rx_swbd_to_xdp_tx_swbd(xdp_tx_arr, rx_ring, orig_i, i); @@ -2463,6 +2473,8 @@ void enetc_start(struct net_device *ndev) enetc_enable_bdrs(priv); netif_tx_start_all_queues(ndev); + + clear_bit(ENETC_TX_DOWN, &priv->flags); } EXPORT_SYMBOL_GPL(enetc_start); @@ -2520,6 +2532,8 @@ void enetc_stop(struct net_device *ndev) struct enetc_ndev_priv *priv = netdev_priv(ndev); int i; + set_bit(ENETC_TX_DOWN, &priv->flags); + netif_tx_stop_all_queues(ndev); enetc_disable_bdrs(priv); diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index 97524dfa234c..fb7d98d57783 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -325,6 +325,7 @@ enum enetc_active_offloads { enum enetc_flags_bit { ENETC_TX_ONESTEP_TSTAMP_IN_PROGRESS = 0, + ENETC_TX_DOWN, }; /* interrupt coalescing modes */ -- cgit From 0a93f2ca4be6c4616d371f18a3fabad2df7f8d55 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 10 Oct 2024 17:20:55 +0800 Subject: net: enetc: disable Tx BD rings after they are empty The Tx BD rings are disabled first in enetc_stop() and the driver waits for them to become empty. This operation is not safe while the ring is actively transmitting frames, and will cause the ring to not be empty and hardware exception. As described in the NETC block guide, software should only disable an active Tx ring after all pending ring entries have been consumed (i.e. when PI = CI). Disabling a transmit ring that is actively processing BDs risks a HW-SW race hazard whereby a hardware resource becomes assigned to work on one or more ring entries only to have those entries be removed due to the ring becoming disabled. When testing XDP_REDIRECT feautre, although all frames were blocked from being put into Tx rings during ring reconfiguration, the similar warning log was still encountered: fsl_enetc 0000:00:00.2 eno2: timeout for tx ring #6 clear fsl_enetc 0000:00:00.2 eno2: timeout for tx ring #7 clear The reason is that when there are still unsent frames in the Tx ring, disabling the Tx ring causes the remaining frames to be unable to be sent out. And the Tx ring cannot be restored, which means that even if the xdp program is uninstalled, the Tx frames cannot be sent out anymore. Therefore, correct the operation order in enect_start() and enect_stop(). Fixes: ff58fda09096 ("net: enetc: prioritize ability to go down over packet processing") Cc: stable@vger.kernel.org Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241010092056.298128-4-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 36 ++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 482c44ed9d46..52da10f62430 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -2233,18 +2233,24 @@ static void enetc_enable_rxbdr(struct enetc_hw *hw, struct enetc_bdr *rx_ring) enetc_rxbdr_wr(hw, idx, ENETC_RBMR, rbmr); } -static void enetc_enable_bdrs(struct enetc_ndev_priv *priv) +static void enetc_enable_rx_bdrs(struct enetc_ndev_priv *priv) { struct enetc_hw *hw = &priv->si->hw; int i; - for (i = 0; i < priv->num_tx_rings; i++) - enetc_enable_txbdr(hw, priv->tx_ring[i]); - for (i = 0; i < priv->num_rx_rings; i++) enetc_enable_rxbdr(hw, priv->rx_ring[i]); } +static void enetc_enable_tx_bdrs(struct enetc_ndev_priv *priv) +{ + struct enetc_hw *hw = &priv->si->hw; + int i; + + for (i = 0; i < priv->num_tx_rings; i++) + enetc_enable_txbdr(hw, priv->tx_ring[i]); +} + static void enetc_disable_rxbdr(struct enetc_hw *hw, struct enetc_bdr *rx_ring) { int idx = rx_ring->index; @@ -2261,18 +2267,24 @@ static void enetc_disable_txbdr(struct enetc_hw *hw, struct enetc_bdr *rx_ring) enetc_txbdr_wr(hw, idx, ENETC_TBMR, 0); } -static void enetc_disable_bdrs(struct enetc_ndev_priv *priv) +static void enetc_disable_rx_bdrs(struct enetc_ndev_priv *priv) { struct enetc_hw *hw = &priv->si->hw; int i; - for (i = 0; i < priv->num_tx_rings; i++) - enetc_disable_txbdr(hw, priv->tx_ring[i]); - for (i = 0; i < priv->num_rx_rings; i++) enetc_disable_rxbdr(hw, priv->rx_ring[i]); } +static void enetc_disable_tx_bdrs(struct enetc_ndev_priv *priv) +{ + struct enetc_hw *hw = &priv->si->hw; + int i; + + for (i = 0; i < priv->num_tx_rings; i++) + enetc_disable_txbdr(hw, priv->tx_ring[i]); +} + static void enetc_wait_txbdr(struct enetc_hw *hw, struct enetc_bdr *tx_ring) { int delay = 8, timeout = 100; @@ -2462,6 +2474,8 @@ void enetc_start(struct net_device *ndev) enetc_setup_interrupts(priv); + enetc_enable_tx_bdrs(priv); + for (i = 0; i < priv->bdr_int_num; i++) { int irq = pci_irq_vector(priv->si->pdev, ENETC_BDR_INT_BASE_IDX + i); @@ -2470,7 +2484,7 @@ void enetc_start(struct net_device *ndev) enable_irq(irq); } - enetc_enable_bdrs(priv); + enetc_enable_rx_bdrs(priv); netif_tx_start_all_queues(ndev); @@ -2536,7 +2550,7 @@ void enetc_stop(struct net_device *ndev) netif_tx_stop_all_queues(ndev); - enetc_disable_bdrs(priv); + enetc_disable_rx_bdrs(priv); for (i = 0; i < priv->bdr_int_num; i++) { int irq = pci_irq_vector(priv->si->pdev, @@ -2549,6 +2563,8 @@ void enetc_stop(struct net_device *ndev) enetc_wait_bdrs(priv); + enetc_disable_tx_bdrs(priv); + enetc_clear_interrupts(priv); } EXPORT_SYMBOL_GPL(enetc_stop); -- cgit From 6b58fadd44aafbbd6af5f0b965063e1fd2063992 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 10 Oct 2024 17:20:56 +0800 Subject: net: enetc: disable NAPI after all rings are disabled When running "xdp-bench tx eno0" to test the XDP_TX feature of ENETC on LS1028A, it was found that if the command was re-run multiple times, Rx could not receive the frames, and the result of xdp-bench showed that the rx rate was 0. root@ls1028ardb:~# ./xdp-bench tx eno0 Hairpinning (XDP_TX) packets on eno0 (ifindex 3; driver fsl_enetc) Summary 2046 rx/s 0 err,drop/s Summary 0 rx/s 0 err,drop/s Summary 0 rx/s 0 err,drop/s Summary 0 rx/s 0 err,drop/s By observing the Rx PIR and CIR registers, CIR is always 0x7FF and PIR is always 0x7FE, which means that the Rx ring is full and can no longer accommodate other Rx frames. Therefore, the problem is caused by the Rx BD ring not being cleaned up. Further analysis of the code revealed that the Rx BD ring will only be cleaned if the "cleaned_cnt > xdp_tx_in_flight" condition is met. Therefore, some debug logs were added to the driver and the current values of cleaned_cnt and xdp_tx_in_flight were printed when the Rx BD ring was full. The logs are as follows. [ 178.762419] [XDP TX] >> cleaned_cnt:1728, xdp_tx_in_flight:2140 [ 178.771387] [XDP TX] >> cleaned_cnt:1941, xdp_tx_in_flight:2110 [ 178.776058] [XDP TX] >> cleaned_cnt:1792, xdp_tx_in_flight:2110 From the results, the max value of xdp_tx_in_flight has reached 2140. However, the size of the Rx BD ring is only 2048. So xdp_tx_in_flight did not drop to 0 after enetc_stop() is called and the driver does not clear it. The root cause is that NAPI is disabled too aggressively, without having waited for the pending XDP_TX frames to be transmitted, and their buffers recycled, so that xdp_tx_in_flight cannot naturally drop to 0. Later, enetc_free_tx_ring() does free those stale, unsent XDP_TX packets, but it is not coded up to also reset xdp_tx_in_flight, hence the manifestation of the bug. One option would be to cover this extra condition in enetc_free_tx_ring(), but now that the ENETC_TX_DOWN exists, we have created a window at the beginning of enetc_stop() where NAPI can still be scheduled, but any concurrent enqueue will be blocked. Therefore, enetc_wait_bdrs() and enetc_disable_tx_bdrs() can be called with NAPI still scheduled, and it is guaranteed that this will not wait indefinitely, but instead give us an indication that the pending TX frames have orderly dropped to zero. Only then should we call napi_disable(). This way, enetc_free_tx_ring() becomes entirely redundant and can be dropped as part of subsequent cleanup. The change also refactors enetc_start() so that it looks like the mirror opposite procedure of enetc_stop(). Fixes: ff58fda09096 ("net: enetc: prioritize ability to go down over packet processing") Cc: stable@vger.kernel.org Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Link: https://patch.msgid.link/20241010092056.298128-5-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 52da10f62430..c09370eab319 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -2474,8 +2474,6 @@ void enetc_start(struct net_device *ndev) enetc_setup_interrupts(priv); - enetc_enable_tx_bdrs(priv); - for (i = 0; i < priv->bdr_int_num; i++) { int irq = pci_irq_vector(priv->si->pdev, ENETC_BDR_INT_BASE_IDX + i); @@ -2484,6 +2482,8 @@ void enetc_start(struct net_device *ndev) enable_irq(irq); } + enetc_enable_tx_bdrs(priv); + enetc_enable_rx_bdrs(priv); netif_tx_start_all_queues(ndev); @@ -2552,6 +2552,10 @@ void enetc_stop(struct net_device *ndev) enetc_disable_rx_bdrs(priv); + enetc_wait_bdrs(priv); + + enetc_disable_tx_bdrs(priv); + for (i = 0; i < priv->bdr_int_num; i++) { int irq = pci_irq_vector(priv->si->pdev, ENETC_BDR_INT_BASE_IDX + i); @@ -2561,10 +2565,6 @@ void enetc_stop(struct net_device *ndev) napi_disable(&priv->int_vector[i]->napi); } - enetc_wait_bdrs(priv); - - enetc_disable_tx_bdrs(priv); - enetc_clear_interrupts(priv); } EXPORT_SYMBOL_GPL(enetc_stop); -- cgit From 1d7b2ce43d2c22a21dadaf689cb36a69570346a6 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Fri, 11 Oct 2024 11:01:03 +0800 Subject: net: enetc: add missing static descriptor and inline keyword Fix the build warnings when CONFIG_FSL_ENETC_MDIO is not enabled. The detailed warnings are shown as follows. include/linux/fsl/enetc_mdio.h:62:18: warning: no previous prototype for function 'enetc_hw_alloc' [-Wmissing-prototypes] 62 | struct enetc_hw *enetc_hw_alloc(struct device *dev, void __iomem *port_regs) | ^ include/linux/fsl/enetc_mdio.h:62:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 62 | struct enetc_hw *enetc_hw_alloc(struct device *dev, void __iomem *port_regs) | ^ | static 8 warnings generated. Fixes: 6517798dd343 ("enetc: Make MDIO accessors more generic and export to include/linux/fsl") Cc: stable@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410102136.jQHZOcS4-lkp@intel.com/ Signed-off-by: Wei Fang Reviewed-by: Claudiu Manoil Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241011030103.392362-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/fsl/enetc_mdio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/fsl/enetc_mdio.h b/include/linux/fsl/enetc_mdio.h index df25fffdc0ae..623ccfcbf39c 100644 --- a/include/linux/fsl/enetc_mdio.h +++ b/include/linux/fsl/enetc_mdio.h @@ -59,7 +59,8 @@ static inline int enetc_mdio_read_c45(struct mii_bus *bus, int phy_id, static inline int enetc_mdio_write_c45(struct mii_bus *bus, int phy_id, int devad, int regnum, u16 value) { return -EINVAL; } -struct enetc_hw *enetc_hw_alloc(struct device *dev, void __iomem *port_regs) +static inline struct enetc_hw *enetc_hw_alloc(struct device *dev, + void __iomem *port_regs) { return ERR_PTR(-EINVAL); } #endif -- cgit From 6ea8a1c28fd36179fc66e088060b11515c8508b7 Mon Sep 17 00:00:00 2001 From: Alessandro Zanni Date: Thu, 10 Oct 2024 21:44:17 +0200 Subject: selftests: net/rds: add module not found This fix solves this error, when calling kselftest with targets "net/rds": The error was found by running tests manually with the command: make kselftest TARGETS="net/rds" The patch also specifies to import ip() function from the utils module. Signed-off-by: Alessandro Zanni Reviewed-by: Allison Henderson Link: https://patch.msgid.link/20241010194421.48198-1-alessandro.zanni87@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/rds/test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/rds/test.py b/tools/testing/selftests/net/rds/test.py index e6bb109bcead..4a7178d11193 100755 --- a/tools/testing/selftests/net/rds/test.py +++ b/tools/testing/selftests/net/rds/test.py @@ -14,8 +14,11 @@ import sys import atexit from pwd import getpwuid from os import stat -from lib.py import ip +# Allow utils module to be imported from different directory +this_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(os.path.join(this_dir, "../")) +from lib.py.utils import ip libc = ctypes.cdll.LoadLibrary('libc.so.6') setns = libc.setns -- cgit From 174714f0e505070a16be6fbede30d32b81df789f Mon Sep 17 00:00:00 2001 From: Alessandro Zanni Date: Thu, 10 Oct 2024 20:30:30 +0200 Subject: selftests: drivers: net: fix name not defined This fix solves this error, when calling kselftest with targets "drivers/net": File "tools/testing/selftests/net/lib/py/nsim.py", line 64, in __init__ if e.errno == errno.ENOSPC: NameError: name 'errno' is not defined The error was found by running tests manually with the command: make kselftest TARGETS="drivers/net" The module errno makes available standard error system symbols. Reviewed-by: Petr Machata Signed-off-by: Alessandro Zanni Link: https://patch.msgid.link/20241010183034.24739-1-alessandro.zanni87@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/nsim.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/lib/py/nsim.py b/tools/testing/selftests/net/lib/py/nsim.py index f571a8b3139b..1a8cbe9acc48 100644 --- a/tools/testing/selftests/net/lib/py/nsim.py +++ b/tools/testing/selftests/net/lib/py/nsim.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 +import errno import json import os import random -- cgit From 0b84db5d8f258d4b212c05ea0772ee47612d6cfb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 11 Oct 2024 12:33:03 -0700 Subject: MAINTAINERS: add Andrew Lunn as a co-maintainer of all networking drivers Andrew has been a pillar of the community for as long as I remember. Focusing on embedded networking, co-maintaining Ethernet PHYs and DSA code, but also actively reviewing MAC and integrated NIC drivers. Elevate Andrew to the status of co-maintainer of all netdev drivers. Acked-by: Andrew Lunn Reviewed-by: Eric Dumazet Acked-by: Paolo Abeni Link: https://patch.msgid.link/20241011193303.2461769-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index e336dab6fdd1..44d599651690 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16092,6 +16092,7 @@ F: include/uapi/linux/net_dropmon.h F: net/core/drop_monitor.c NETWORKING DRIVERS +M: Andrew Lunn M: "David S. Miller" M: Eric Dumazet M: Jakub Kicinski -- cgit From d8794ac20a299b647ba9958f6d657051fc51a540 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Wed, 9 Oct 2024 15:23:01 +0800 Subject: posix-clock: Fix missing timespec64 check in pc_clock_settime() As Andrew pointed out, it will make sense that the PTP core checked timespec64 struct's tv_sec and tv_nsec range before calling ptp->info->settime64(). As the man manual of clock_settime() said, if tp.tv_sec is negative or tp.tv_nsec is outside the range [0..999,999,999], it should return EINVAL, which include dynamic clocks which handles PTP clock, and the condition is consistent with timespec64_valid(). As Thomas suggested, timespec64_valid() only check the timespec is valid, but not ensure that the time is in a valid range, so check it ahead using timespec64_valid_strict() in pc_clock_settime() and return -EINVAL if not valid. There are some drivers that use tp->tv_sec and tp->tv_nsec directly to write registers without validity checks and assume that the higher layer has checked it, which is dangerous and will benefit from this, such as hclge_ptp_settime(), igb_ptp_settime_i210(), _rcar_gen4_ptp_settime(), and some drivers can remove the checks of itself. Cc: stable@vger.kernel.org Fixes: 0606f422b453 ("posix clocks: Introduce dynamic clocks") Acked-by: Richard Cochran Suggested-by: Andrew Lunn Suggested-by: Thomas Gleixner Signed-off-by: Jinjie Ruan Link: https://patch.msgid.link/20241009072302.1754567-2-ruanjinjie@huawei.com Signed-off-by: Jakub Kicinski --- kernel/time/posix-clock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index c2f3d0c490d5..316a4e8c97d3 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -318,6 +318,9 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) goto out; } + if (!timespec64_valid_strict(ts)) + return -EINVAL; + if (cd.clk->ops.clock_settime) err = cd.clk->ops.clock_settime(cd.clk, ts); else -- cgit From ea531dc66e27dcc5216bee1a76d1c052ab9eb5b6 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Wed, 9 Oct 2024 15:23:02 +0800 Subject: net: lan743x: Remove duplicate check Since timespec64_valid() has been checked in higher layer pc_clock_settime(), the duplicate check in lan743x_ptpci_settime64() can be removed. Acked-by: Richard Cochran Signed-off-by: Jinjie Ruan Link: https://patch.msgid.link/20241009072302.1754567-3-ruanjinjie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan743x_ptp.c | 35 +++++++++++----------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_ptp.c b/drivers/net/ethernet/microchip/lan743x_ptp.c index dcea6652d56d..4a777b449ecd 100644 --- a/drivers/net/ethernet/microchip/lan743x_ptp.c +++ b/drivers/net/ethernet/microchip/lan743x_ptp.c @@ -401,28 +401,21 @@ static int lan743x_ptpci_settime64(struct ptp_clock_info *ptpci, u32 nano_seconds = 0; u32 seconds = 0; - if (ts) { - if (ts->tv_sec > 0xFFFFFFFFLL || - ts->tv_sec < 0) { - netif_warn(adapter, drv, adapter->netdev, - "ts->tv_sec out of range, %lld\n", - ts->tv_sec); - return -ERANGE; - } - if (ts->tv_nsec >= 1000000000L || - ts->tv_nsec < 0) { - netif_warn(adapter, drv, adapter->netdev, - "ts->tv_nsec out of range, %ld\n", - ts->tv_nsec); - return -ERANGE; - } - seconds = ts->tv_sec; - nano_seconds = ts->tv_nsec; - lan743x_ptp_clock_set(adapter, seconds, nano_seconds, 0); - } else { - netif_warn(adapter, drv, adapter->netdev, "ts == NULL\n"); - return -EINVAL; + if (ts->tv_sec > 0xFFFFFFFFLL) { + netif_warn(adapter, drv, adapter->netdev, + "ts->tv_sec out of range, %lld\n", + ts->tv_sec); + return -ERANGE; + } + if (ts->tv_nsec < 0) { + netif_warn(adapter, drv, adapter->netdev, + "ts->tv_nsec out of range, %ld\n", + ts->tv_nsec); + return -ERANGE; } + seconds = ts->tv_sec; + nano_seconds = ts->tv_nsec; + lan743x_ptp_clock_set(adapter, seconds, nano_seconds, 0); return 0; } -- cgit From 25c12b459db8365fee84b63f3dd7910f70627f29 Mon Sep 17 00:00:00 2001 From: Kai Shen Date: Thu, 10 Oct 2024 11:56:24 +0000 Subject: net/smc: Fix memory leak when using percpu refs This patch adds missing percpu_ref_exit when releasing percpu refs. When releasing percpu refs, percpu_ref_exit should be called. Otherwise, memory leak happens. Fixes: 79a22238b4f2 ("net/smc: Use percpu ref for wr tx reference") Signed-off-by: Kai Shen Reviewed-by: Dust Li Reviewed-by: Wenjia Zhang Link: https://patch.msgid.link/20241010115624.7769-1-KaiShen@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/smc/smc_wr.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 0021065a600a..994c0cd4fddb 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -648,8 +648,10 @@ void smc_wr_free_link(struct smc_link *lnk) smc_wr_tx_wait_no_pending_sends(lnk); percpu_ref_kill(&lnk->wr_reg_refs); wait_for_completion(&lnk->reg_ref_comp); + percpu_ref_exit(&lnk->wr_reg_refs); percpu_ref_kill(&lnk->wr_tx_refs); wait_for_completion(&lnk->tx_ref_comp); + percpu_ref_exit(&lnk->wr_tx_refs); if (lnk->wr_rx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, @@ -912,11 +914,13 @@ int smc_wr_create_link(struct smc_link *lnk) init_waitqueue_head(&lnk->wr_reg_wait); rc = percpu_ref_init(&lnk->wr_reg_refs, smcr_wr_reg_refs_free, 0, GFP_KERNEL); if (rc) - goto dma_unmap; + goto cancel_ref; init_completion(&lnk->reg_ref_comp); init_waitqueue_head(&lnk->wr_rx_empty_wait); return rc; +cancel_ref: + percpu_ref_exit(&lnk->wr_tx_refs); dma_unmap: if (lnk->wr_rx_v2_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, -- cgit From b62f4c186c70aa235fef2da68d07325d85ca3ade Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 10 Oct 2024 15:19:14 +0200 Subject: net: usb: usbnet: fix race in probe failure The same bug as in the disconnect code path also exists in the case of a failure late during the probe process. The flag must also be set. Signed-off-by: Oliver Neukum Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Link: https://patch.msgid.link/20241010131934.1499695-1-oneukum@suse.com Signed-off-by: Paolo Abeni --- drivers/net/usb/usbnet.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 2506aa8c603e..ee1b5fd7b491 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1870,6 +1870,7 @@ out1: * may trigger an error resubmitting itself and, worse, * schedule a timer. So we kill it all just in case. */ + usbnet_mark_going_away(dev); cancel_work_sync(&dev->kevent); del_timer_sync(&dev->delay); free_netdev(net); -- cgit From 1cff6ff302f5703a627f9ee1d99131161ea2683e Mon Sep 17 00:00:00 2001 From: Paritosh Dixit Date: Thu, 10 Oct 2024 10:29:08 -0400 Subject: net: stmmac: dwmac-tegra: Fix link bring-up sequence The Tegra MGBE driver sometimes fails to initialize, reporting the following error, and as a result, it is unable to acquire an IP address with DHCP: tegra-mgbe 6800000.ethernet: timeout waiting for link to become ready As per the recommendation from the Tegra hardware design team, fix this issue by: - clearing the PHY_RDY bit before setting the CDR_RESET bit and then setting PHY_RDY bit before clearing CDR_RESET bit. This ensures valid data is present at UPHY RX inputs before starting the CDR lock. - adding the required delays when bringing up the UPHY lane. Note we need to use delays here because there is no alternative, such as polling, for these cases. Using the usleep_range() instead of ndelay() as sleeping is preferred over busy wait loop. Without this change we would see link failures on boot sometimes as often as 1 in 5 boots. With this fix we have not observed any failures in over 1000 boots. Fixes: d8ca113724e7 ("net: stmmac: tegra: Add MGBE support") Signed-off-by: Paritosh Dixit Link: https://patch.msgid.link/20241010142908.602712-1-paritoshd@nvidia.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c index 362f85136c3e..6fdd94c8919e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c @@ -127,10 +127,12 @@ static int mgbe_uphy_lane_bringup_serdes_up(struct net_device *ndev, void *mgbe_ value &= ~XPCS_WRAP_UPHY_RX_CONTROL_AUX_RX_IDDQ; writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL); + usleep_range(10, 20); /* 50ns min delay needed as per HW design */ value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL); value &= ~XPCS_WRAP_UPHY_RX_CONTROL_RX_SLEEP; writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL); + usleep_range(10, 20); /* 500ns min delay needed as per HW design */ value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL); value |= XPCS_WRAP_UPHY_RX_CONTROL_RX_CAL_EN; writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL); @@ -143,22 +145,30 @@ static int mgbe_uphy_lane_bringup_serdes_up(struct net_device *ndev, void *mgbe_ return err; } + usleep_range(10, 20); /* 50ns min delay needed as per HW design */ value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL); value |= XPCS_WRAP_UPHY_RX_CONTROL_RX_DATA_EN; writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL); value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL); - value |= XPCS_WRAP_UPHY_RX_CONTROL_RX_CDR_RESET; + value &= ~XPCS_WRAP_UPHY_RX_CONTROL_RX_PCS_PHY_RDY; writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL); + usleep_range(10, 20); /* 50ns min delay needed as per HW design */ value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL); - value &= ~XPCS_WRAP_UPHY_RX_CONTROL_RX_CDR_RESET; + value |= XPCS_WRAP_UPHY_RX_CONTROL_RX_CDR_RESET; writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL); + usleep_range(10, 20); /* 50ns min delay needed as per HW design */ value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL); value |= XPCS_WRAP_UPHY_RX_CONTROL_RX_PCS_PHY_RDY; writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL); + msleep(30); /* 30ms delay needed as per HW design */ + value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL); + value &= ~XPCS_WRAP_UPHY_RX_CONTROL_RX_CDR_RESET; + writel(value, mgbe->xpcs + XPCS_WRAP_UPHY_RX_CONTROL); + err = readl_poll_timeout(mgbe->xpcs + XPCS_WRAP_IRQ_STATUS, value, value & XPCS_WRAP_IRQ_STATUS_PCS_LINK_STS, 500, 500 * 2000); -- cgit From 637c4f6fe40befa04f19c38b5d15429cbb9191d9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 10 Oct 2024 16:45:19 +0100 Subject: octeontx2-af: Fix potential integer overflows on integer shifts The left shift int 32 bit integer constants 1 is evaluated using 32 bit arithmetic and then assigned to a 64 bit unsigned integer. In the case where the shift is 32 or more this can lead to an overflow. Avoid this by shifting using the BIT_ULL macro instead. Fixes: 019aba04f08c ("octeontx2-af: Modify SMQ flush sequence to drop packets") Signed-off-by: Colin Ian King Reviewed-by: Dan Carpenter Link: https://patch.msgid.link/20241010154519.768785-1-colin.i.king@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 82832a24fbd8..da69350c6f76 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -2411,7 +2411,7 @@ static int nix_smq_flush(struct rvu *rvu, int blkaddr, NIX_AF_TL3_TL2X_LINKX_CFG(tl2_tl3_link_schq, link)); if (!(cfg & BIT_ULL(12))) continue; - bmap |= (1 << i); + bmap |= BIT_ULL(i); cfg &= ~BIT_ULL(12); rvu_write64(rvu, blkaddr, NIX_AF_TL3_TL2X_LINKX_CFG(tl2_tl3_link_schq, link), cfg); @@ -2432,7 +2432,7 @@ static int nix_smq_flush(struct rvu *rvu, int blkaddr, /* Set NIX_AF_TL3_TL2_LINKX_CFG[ENA] for the TL3/TL2 queue */ for (i = 0; i < (rvu->hw->cgx_links + rvu->hw->lbk_links); i++) { - if (!(bmap & (1 << i))) + if (!(bmap & BIT_ULL(i))) continue; cfg = rvu_read64(rvu, blkaddr, NIX_AF_TL3_TL2X_LINKX_CFG(tl2_tl3_link_schq, link)); -- cgit From cf58aefb1332db322060cad4a330d5f9292b0f41 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 11 Oct 2024 17:16:37 +0200 Subject: macsec: don't increment counters for an unrelated SA On RX, we shouldn't be incrementing the stats for an arbitrary SA in case the actual SA hasn't been set up. Those counters are intended to track packets for their respective AN when the SA isn't currently configured. Due to the way MACsec is implemented, we don't keep counters unless the SA is configured, so we can't track those packets, and those counters will remain at 0. The RXSC's stats keeps track of those packets without telling us which AN they belonged to. We could add counters for non-existent SAs, and then find a way to integrate them in the dump to userspace, but I don't think it's worth the effort. Fixes: 91ec9bd57f35 ("macsec: Fix traffic counters/statistics") Reported-by: Paolo Abeni Signed-off-by: Sabrina Dubroca Link: https://patch.msgid.link/f5ac92aaa5b89343232615f4c03f9f95042c6aa0.1728657709.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 12d1b205f6d1..26034f80d4a4 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -154,19 +154,6 @@ static struct macsec_rx_sa *macsec_rxsa_get(struct macsec_rx_sa __rcu *ptr) return sa; } -static struct macsec_rx_sa *macsec_active_rxsa_get(struct macsec_rx_sc *rx_sc) -{ - struct macsec_rx_sa *sa = NULL; - int an; - - for (an = 0; an < MACSEC_NUM_AN; an++) { - sa = macsec_rxsa_get(rx_sc->sa[an]); - if (sa) - break; - } - return sa; -} - static void free_rx_sc_rcu(struct rcu_head *head) { struct macsec_rx_sc *rx_sc = container_of(head, struct macsec_rx_sc, rcu_head); @@ -1208,15 +1195,12 @@ static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) /* If validateFrames is Strict or the C bit in the * SecTAG is set, discard */ - struct macsec_rx_sa *active_rx_sa = macsec_active_rxsa_get(rx_sc); if (hdr->tci_an & MACSEC_TCI_C || secy->validate_frames == MACSEC_VALIDATE_STRICT) { u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsNotUsingSA++; u64_stats_update_end(&rxsc_stats->syncp); DEV_STATS_INC(secy->netdev, rx_errors); - if (active_rx_sa) - this_cpu_inc(active_rx_sa->stats->InPktsNotUsingSA); goto drop_nosa; } @@ -1226,8 +1210,6 @@ static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsUnusedSA++; u64_stats_update_end(&rxsc_stats->syncp); - if (active_rx_sa) - this_cpu_inc(active_rx_sa->stats->InPktsUnusedSA); goto deliver; } -- cgit From a1494d532e28598bde7a5544892ef9c7dbfafa93 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 12 Oct 2024 09:42:30 +0000 Subject: netdevsim: use cond_resched() in nsim_dev_trap_report_work() I am still seeing many syzbot reports hinting that syzbot might fool nsim_dev_trap_report_work() with hundreds of ports [1] Lets use cond_resched(), and system_unbound_wq instead of implicit system_wq. [1] INFO: task syz-executor:20633 blocked for more than 143 seconds. Not tainted 6.12.0-rc2-syzkaller-00205-g1d227fcc7222 #0 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:syz-executor state:D stack:25856 pid:20633 tgid:20633 ppid:1 flags:0x00004006 ... NMI backtrace for cpu 1 CPU: 1 UID: 0 PID: 16760 Comm: kworker/1:0 Not tainted 6.12.0-rc2-syzkaller-00205-g1d227fcc7222 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Workqueue: events nsim_dev_trap_report_work RIP: 0010:__sanitizer_cov_trace_pc+0x0/0x70 kernel/kcov.c:210 Code: 89 fb e8 23 00 00 00 48 8b 3d 04 fb 9c 0c 48 89 de 5b e9 c3 c7 5d 00 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1e fa 48 8b 04 24 65 48 8b 0c 25 c0 d7 03 00 65 8b 15 60 f0 RSP: 0018:ffffc90000a187e8 EFLAGS: 00000246 RAX: 0000000000000100 RBX: ffffc90000a188e0 RCX: ffff888027d3bc00 RDX: ffff888027d3bc00 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff88804a2e6000 R08: ffffffff8a4bc495 R09: ffffffff89da3577 R10: 0000000000000004 R11: ffffffff8a4bc2b0 R12: dffffc0000000000 R13: ffff88806573b503 R14: dffffc0000000000 R15: ffff8880663cca00 FS: 0000000000000000(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc90a747f98 CR3: 000000000e734000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 000000000000002b DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Call Trace: __local_bh_enable_ip+0x1bb/0x200 kernel/softirq.c:382 spin_unlock_bh include/linux/spinlock.h:396 [inline] nsim_dev_trap_report drivers/net/netdevsim/dev.c:820 [inline] nsim_dev_trap_report_work+0x75d/0xaa0 drivers/net/netdevsim/dev.c:850 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa63/0x1850 kernel/workqueue.c:3310 worker_thread+0x870/0xd30 kernel/workqueue.c:3391 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Fixes: ba5e1272142d ("netdevsim: avoid potential loop in nsim_dev_trap_report_work()") Reported-by: syzbot+d383dc9579a76f56c251@syzkaller.appspotmail.com Reported-by: syzbot+c596faae21a68bf7afd0@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Cc: Jiri Pirko Link: https://patch.msgid.link/20241012094230.3893510-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/dev.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 92a7a36b93ac..3e0b61202f0c 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -836,7 +836,8 @@ static void nsim_dev_trap_report_work(struct work_struct *work) nsim_dev = nsim_trap_data->nsim_dev; if (!devl_trylock(priv_to_devlink(nsim_dev))) { - schedule_delayed_work(&nsim_dev->trap_data->trap_report_dw, 1); + queue_delayed_work(system_unbound_wq, + &nsim_dev->trap_data->trap_report_dw, 1); return; } @@ -848,11 +849,12 @@ static void nsim_dev_trap_report_work(struct work_struct *work) continue; nsim_dev_trap_report(nsim_dev_port); + cond_resched(); } devl_unlock(priv_to_devlink(nsim_dev)); - - schedule_delayed_work(&nsim_dev->trap_data->trap_report_dw, - msecs_to_jiffies(NSIM_TRAP_REPORT_INTERVAL_MS)); + queue_delayed_work(system_unbound_wq, + &nsim_dev->trap_data->trap_report_dw, + msecs_to_jiffies(NSIM_TRAP_REPORT_INTERVAL_MS)); } static int nsim_dev_traps_init(struct devlink *devlink) @@ -907,8 +909,9 @@ static int nsim_dev_traps_init(struct devlink *devlink) INIT_DELAYED_WORK(&nsim_dev->trap_data->trap_report_dw, nsim_dev_trap_report_work); - schedule_delayed_work(&nsim_dev->trap_data->trap_report_dw, - msecs_to_jiffies(NSIM_TRAP_REPORT_INTERVAL_MS)); + queue_delayed_work(system_unbound_wq, + &nsim_dev->trap_data->trap_report_dw, + msecs_to_jiffies(NSIM_TRAP_REPORT_INTERVAL_MS)); return 0; -- cgit From cf57b5d7a2aad456719152ecd12007fe031628a3 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Sat, 12 Oct 2024 19:04:34 +0800 Subject: net: ethernet: aeroflex: fix potential memory leak in greth_start_xmit_gbit() The greth_start_xmit_gbit() returns NETDEV_TX_OK without freeing skb in case of skb->len being too long, add dev_kfree_skb() to fix it. Fixes: d4c41139df6e ("net: Add Aeroflex Gaisler 10/100/1G Ethernet MAC driver") Signed-off-by: Wang Hai Reviewed-by: Gerhard Engleder Link: https://patch.msgid.link/20241012110434.49265-1-wanghai38@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/aeroflex/greth.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aeroflex/greth.c b/drivers/net/ethernet/aeroflex/greth.c index 27af7746d645..adf6f67c5fcb 100644 --- a/drivers/net/ethernet/aeroflex/greth.c +++ b/drivers/net/ethernet/aeroflex/greth.c @@ -484,7 +484,7 @@ greth_start_xmit_gbit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb->len > MAX_FRAME_SIZE)) { dev->stats.tx_errors++; - goto out; + goto len_error; } /* Save skb pointer. */ @@ -575,6 +575,7 @@ frag_map_error: map_error: if (net_ratelimit()) dev_warn(greth->dev, "Could not create TX DMA mapping\n"); +len_error: dev_kfree_skb(skb); out: return err; -- cgit From d0c3601f2c4e12e7689b0f46ebc17525250ea8c3 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Sun, 13 Oct 2024 07:29:16 +0200 Subject: net: macb: Avoid 20s boot delay by skipping MDIO bus registration for fixed-link PHY A boot delay was introduced by commit 79540d133ed6 ("net: macb: Fix handling of fixed-link node"). This delay was caused by the call to `mdiobus_register()` in cases where a fixed-link PHY was present. The MDIO bus registration triggered unnecessary PHY address scans, leading to a 20-second delay due to attempts to detect Clause 45 (C45) compatible PHYs, despite no MDIO bus being attached. The commit 79540d133ed6 ("net: macb: Fix handling of fixed-link node") was originally introduced to fix a regression caused by commit 7897b071ac3b4 ("net: macb: convert to phylink"), which caused the driver to misinterpret fixed-link nodes as PHY nodes. This resulted in warnings like: mdio_bus f0028000.ethernet-ffffffff: fixed-link has invalid PHY address mdio_bus f0028000.ethernet-ffffffff: scan phy fixed-link at address 0 ... mdio_bus f0028000.ethernet-ffffffff: scan phy fixed-link at address 31 This patch reworks the logic to avoid registering and allocation of the MDIO bus when: - The device tree contains a fixed-link node. - There is no "mdio" child node in the device tree. If a child node named "mdio" exists, the MDIO bus will be registered to support PHYs attached to the MACB's MDIO bus. Otherwise, with only a fixed-link, the MDIO bus is skipped. Tested on a sama5d35 based system with a ksz8863 switch attached to macb0. Fixes: 79540d133ed6 ("net: macb: Fix handling of fixed-link node") Signed-off-by: Oleksij Rempel Cc: stable@vger.kernel.org Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241013052916.3115142-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cadence/macb_main.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index f06babec04a0..56901280ba04 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -930,9 +930,6 @@ static int macb_mdiobus_register(struct macb *bp) return ret; } - if (of_phy_is_fixed_link(np)) - return mdiobus_register(bp->mii_bus); - /* Only create the PHY from the device tree if at least one PHY is * described. Otherwise scan the entire MDIO bus. We do this to support * old device tree that did not follow the best practices and did not @@ -953,8 +950,19 @@ static int macb_mdiobus_register(struct macb *bp) static int macb_mii_init(struct macb *bp) { + struct device_node *child, *np = bp->pdev->dev.of_node; int err = -ENXIO; + /* With fixed-link, we don't need to register the MDIO bus, + * except if we have a child named "mdio" in the device tree. + * In that case, some devices may be attached to the MACB's MDIO bus. + */ + child = of_get_child_by_name(np, "mdio"); + if (child) + of_node_put(child); + else if (of_phy_is_fixed_link(np)) + return macb_mii_probe(bp->dev); + /* Enable management port */ macb_writel(bp, NCR, MACB_BIT(MPE)); -- cgit From 82ac39ebd6db0c9f7a97a934bda1e3e101a9d201 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 14 Oct 2024 19:53:21 +0800 Subject: net/smc: Fix searching in list of known pnetids in smc_pnet_add_pnetid pnetid of pi (not newly allocated pe) should be compared Fixes: e888a2e8337c ("net/smc: introduce list of pnetids for Ethernet devices") Reviewed-by: D. Wythe Reviewed-by: Wen Gu Signed-off-by: Li RongQing Reviewed-by: Simon Horman Reviewed-by: Gerd Bayer Link: https://patch.msgid.link/20241014115321.33234-1-lirongqing@baidu.com Signed-off-by: Jakub Kicinski --- net/smc/smc_pnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 1dd362326c0a..a04aa0e882f8 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -753,7 +753,7 @@ static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid) write_lock(&sn->pnetids_ndev.lock); list_for_each_entry(pi, &sn->pnetids_ndev.list, list) { - if (smc_pnet_match(pnetid, pe->pnetid)) { + if (smc_pnet_match(pnetid, pi->pnetid)) { refcount_inc(&pi->refcnt); kfree(pe); goto unlock; -- cgit From 3d041393ea8c815f773020fb4a995331a69c0139 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 14 Oct 2024 16:06:00 +0200 Subject: mptcp: prevent MPC handshake on port-based signal endpoints Syzkaller reported a lockdep splat: ============================================ WARNING: possible recursive locking detected 6.11.0-rc6-syzkaller-00019-g67784a74e258 #0 Not tainted -------------------------------------------- syz-executor364/5113 is trying to acquire lock: ffff8880449f1958 (k-slock-AF_INET){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] ffff8880449f1958 (k-slock-AF_INET){+.-.}-{2:2}, at: sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328 but task is already holding lock: ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(k-slock-AF_INET); lock(k-slock-AF_INET); *** DEADLOCK *** May be due to missing lock nesting notation 7 locks held by syz-executor364/5113: #0: ffff8880449f0e18 (sk_lock-AF_INET){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1607 [inline] #0: ffff8880449f0e18 (sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_sendmsg+0x153/0x1b10 net/mptcp/protocol.c:1806 #1: ffff88803fe39ad8 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1607 [inline] #1: ffff88803fe39ad8 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_sendmsg_fastopen+0x11f/0x530 net/mptcp/protocol.c:1727 #2: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:326 [inline] #2: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] #2: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: __ip_queue_xmit+0x5f/0x1b80 net/ipv4/ip_output.c:470 #3: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:326 [inline] #3: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] #3: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1390 net/ipv4/ip_output.c:228 #4: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: local_lock_acquire include/linux/local_lock_internal.h:29 [inline] #4: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: process_backlog+0x33b/0x15b0 net/core/dev.c:6104 #5: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:326 [inline] #5: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] #5: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: ip_local_deliver_finish+0x230/0x5f0 net/ipv4/ip_input.c:232 #6: ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] #6: ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328 stack backtrace: CPU: 0 UID: 0 PID: 5113 Comm: syz-executor364 Not tainted 6.11.0-rc6-syzkaller-00019-g67784a74e258 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:93 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:119 check_deadlock kernel/locking/lockdep.c:3061 [inline] validate_chain+0x15d3/0x5900 kernel/locking/lockdep.c:3855 __lock_acquire+0x137a/0x2040 kernel/locking/lockdep.c:5142 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5759 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:351 [inline] sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328 mptcp_sk_clone_init+0x32/0x13c0 net/mptcp/protocol.c:3279 subflow_syn_recv_sock+0x931/0x1920 net/mptcp/subflow.c:874 tcp_check_req+0xfe4/0x1a20 net/ipv4/tcp_minisocks.c:853 tcp_v4_rcv+0x1c3e/0x37f0 net/ipv4/tcp_ipv4.c:2267 ip_protocol_deliver_rcu+0x22e/0x440 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x341/0x5f0 net/ipv4/ip_input.c:233 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 __netif_receive_skb_one_core net/core/dev.c:5661 [inline] __netif_receive_skb+0x2bf/0x650 net/core/dev.c:5775 process_backlog+0x662/0x15b0 net/core/dev.c:6108 __napi_poll+0xcb/0x490 net/core/dev.c:6772 napi_poll net/core/dev.c:6841 [inline] net_rx_action+0x89b/0x1240 net/core/dev.c:6963 handle_softirqs+0x2c4/0x970 kernel/softirq.c:554 do_softirq+0x11b/0x1e0 kernel/softirq.c:455 __local_bh_enable_ip+0x1bb/0x200 kernel/softirq.c:382 local_bh_enable include/linux/bottom_half.h:33 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:908 [inline] __dev_queue_xmit+0x1763/0x3e90 net/core/dev.c:4450 dev_queue_xmit include/linux/netdevice.h:3105 [inline] neigh_hh_output include/net/neighbour.h:526 [inline] neigh_output include/net/neighbour.h:540 [inline] ip_finish_output2+0xd41/0x1390 net/ipv4/ip_output.c:235 ip_local_out net/ipv4/ip_output.c:129 [inline] __ip_queue_xmit+0x118c/0x1b80 net/ipv4/ip_output.c:535 __tcp_transmit_skb+0x2544/0x3b30 net/ipv4/tcp_output.c:1466 tcp_rcv_synsent_state_process net/ipv4/tcp_input.c:6542 [inline] tcp_rcv_state_process+0x2c32/0x4570 net/ipv4/tcp_input.c:6729 tcp_v4_do_rcv+0x77d/0xc70 net/ipv4/tcp_ipv4.c:1934 sk_backlog_rcv include/net/sock.h:1111 [inline] __release_sock+0x214/0x350 net/core/sock.c:3004 release_sock+0x61/0x1f0 net/core/sock.c:3558 mptcp_sendmsg_fastopen+0x1ad/0x530 net/mptcp/protocol.c:1733 mptcp_sendmsg+0x1884/0x1b10 net/mptcp/protocol.c:1812 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x1a6/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2597 ___sys_sendmsg net/socket.c:2651 [inline] __sys_sendmmsg+0x3b2/0x740 net/socket.c:2737 __do_sys_sendmmsg net/socket.c:2766 [inline] __se_sys_sendmmsg net/socket.c:2763 [inline] __x64_sys_sendmmsg+0xa0/0xb0 net/socket.c:2763 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f04fb13a6b9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 01 1a 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffd651f42d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f04fb13a6b9 RDX: 0000000000000001 RSI: 0000000020000d00 RDI: 0000000000000004 RBP: 00007ffd651f4310 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000020000080 R11: 0000000000000246 R12: 00000000000f4240 R13: 00007f04fb187449 R14: 00007ffd651f42f4 R15: 00007ffd651f4300 As noted by Cong Wang, the splat is false positive, but the code path leading to the report is an unexpected one: a client is attempting an MPC handshake towards the in-kernel listener created by the in-kernel PM for a port based signal endpoint. Such connection will be never accepted; many of them can make the listener queue full and preventing the creation of MPJ subflow via such listener - its intended role. Explicitly detect this scenario at initial-syn time and drop the incoming MPC request. Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port") Cc: stable@vger.kernel.org Reported-by: syzbot+f4aacdfef2c6a6529c3e@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f4aacdfef2c6a6529c3e Cc: Cong Wang Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241014-net-mptcp-mpc-port-endp-v2-1-7faea8e6b6ae@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/mib.c | 1 + net/mptcp/mib.h | 1 + net/mptcp/pm_netlink.c | 1 + net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 11 +++++++++++ 5 files changed, 15 insertions(+) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index ad88bd3c58df..19eb9292bd60 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -17,6 +17,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK), SNMP_MIB_ITEM("MPCapableSYNTXDrop", MPTCP_MIB_MPCAPABLEACTIVEDROP), SNMP_MIB_ITEM("MPCapableSYNTXDisabled", MPTCP_MIB_MPCAPABLEACTIVEDISABLED), + SNMP_MIB_ITEM("MPCapableEndpAttempt", MPTCP_MIB_MPCAPABLEENDPATTEMPT), SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT), SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 3206cdda8bb1..128282982843 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -12,6 +12,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */ MPTCP_MIB_MPCAPABLEACTIVEDROP, /* Client-side fallback due to a MPC drop */ MPTCP_MIB_MPCAPABLEACTIVEDISABLED, /* Client-side disabled due to past issues */ + MPTCP_MIB_MPCAPABLEENDPATTEMPT, /* Prohibited MPC to port-based endp */ MPTCP_MIB_TOKENFALLBACKINIT, /* Could not init/allocate token */ MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */ MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */ diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index f6f0a38a0750..1a78998fe1f4 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1121,6 +1121,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, */ inet_sk_state_store(newsk, TCP_LISTEN); lock_sock(ssk); + WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true); err = __inet_listen_sk(ssk, backlog); if (!err) mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 74417aae08d0..568a72702b08 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -535,6 +535,7 @@ struct mptcp_subflow_context { __unused : 8; bool data_avail; bool scheduled; + bool pm_listener; /* a listener managed by the kernel PM? */ u32 remote_nonce; u64 thmac; u32 local_nonce; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 25dde81bcb75..6170f2fff71e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -132,6 +132,13 @@ static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason) } } +static int subflow_reset_req_endp(struct request_sock *req, struct sk_buff *skb) +{ + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEENDPATTEMPT); + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); + return -EPERM; +} + /* Init mptcp request socket. * * Returns an error code if a JOIN has failed and a TCP reset @@ -165,6 +172,8 @@ static int subflow_check_req(struct request_sock *req, if (opt_mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); + if (unlikely(listener->pm_listener)) + return subflow_reset_req_endp(req, skb); if (opt_mp_join) return 0; } else if (opt_mp_join) { @@ -172,6 +181,8 @@ static int subflow_check_req(struct request_sock *req, if (mp_opt.backup) SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX); + } else if (unlikely(listener->pm_listener)) { + return subflow_reset_req_endp(req, skb); } if (opt_mp_capable && listener->request_mptcp) { -- cgit From 5afca7e996c42aed1b4a42d4712817601ba42aff Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 14 Oct 2024 16:06:01 +0200 Subject: selftests: mptcp: join: test for prohibited MPC to port-based endp Explicitly verify that MPC connection attempts towards a port-based signal endpoint fail with a reset. Note that this new test is a bit different from the other ones, not using 'run_tests'. It is then needed to add the capture capability, and the picking the right port which have been extracted into three new helpers. The info about the capture can also be printed from a single point, which simplifies the exit paths in do_transfer(). The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port") Cc: stable@vger.kernel.org Co-developed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241014-net-mptcp-mpc-port-endp-v2-2-7faea8e6b6ae@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 115 +++++++++++++++++------- 1 file changed, 85 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index e8d0a01b4144..c07e2bd3a315 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -23,6 +23,7 @@ tmpfile="" cout="" err="" capout="" +cappid="" ns1="" ns2="" iptables="iptables" @@ -887,40 +888,62 @@ check_cestab() fi } -do_transfer() +cond_start_capture() { - local listener_ns="$1" - local connector_ns="$2" - local cl_proto="$3" - local srv_proto="$4" - local connect_addr="$5" - - local port=$((10000 + MPTCP_LIB_TEST_COUNTER - 1)) - local cappid - local FAILING_LINKS=${FAILING_LINKS:-""} - local fastclose=${fastclose:-""} - local speed=${speed:-"fast"} + local ns="$1" - :> "$cout" - :> "$sout" :> "$capout" if $capture; then - local capuser - if [ -z $SUDO_USER ] ; then + local capuser capfile + if [ -z $SUDO_USER ]; then capuser="" else capuser="-Z $SUDO_USER" fi - capfile=$(printf "mp_join-%02u-%s.pcap" "$MPTCP_LIB_TEST_COUNTER" "${listener_ns}") + capfile=$(printf "mp_join-%02u-%s.pcap" "$MPTCP_LIB_TEST_COUNTER" "$ns") echo "Capturing traffic for test $MPTCP_LIB_TEST_COUNTER into $capfile" - ip netns exec ${listener_ns} tcpdump -i any -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 & + ip netns exec "$ns" tcpdump -i any -s 65535 -B 32768 $capuser -w "$capfile" > "$capout" 2>&1 & cappid=$! sleep 1 fi +} + +cond_stop_capture() +{ + if $capture; then + sleep 1 + kill $cappid + cat "$capout" + fi +} + +get_port() +{ + echo "$((10000 + MPTCP_LIB_TEST_COUNTER - 1))" +} + +do_transfer() +{ + local listener_ns="$1" + local connector_ns="$2" + local cl_proto="$3" + local srv_proto="$4" + local connect_addr="$5" + local port + + local FAILING_LINKS=${FAILING_LINKS:-""} + local fastclose=${fastclose:-""} + local speed=${speed:-"fast"} + port=$(get_port) + + :> "$cout" + :> "$sout" + + cond_start_capture ${listener_ns} NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ nstat -n @@ -1007,10 +1030,7 @@ do_transfer() wait $spid local rets=$? - if $capture; then - sleep 1 - kill $cappid - fi + cond_stop_capture NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ nstat | grep Tcp > /tmp/${listener_ns}.out @@ -1026,7 +1046,6 @@ do_transfer() ip netns exec ${connector_ns} ss -Menita 1>&2 -o "dport = :$port" cat /tmp/${connector_ns}.out - cat "$capout" return 1 fi @@ -1043,13 +1062,7 @@ do_transfer() fi rets=$? - if [ $retc -eq 0 ] && [ $rets -eq 0 ];then - cat "$capout" - return 0 - fi - - cat "$capout" - return 1 + [ $retc -eq 0 ] && [ $rets -eq 0 ] } make_file() @@ -2873,6 +2886,32 @@ verify_listener_events() fail_test } +chk_mpc_endp_attempt() +{ + local retl=$1 + local attempts=$2 + + print_check "Connect" + + if [ ${retl} = 124 ]; then + fail_test "timeout on connect" + elif [ ${retl} = 0 ]; then + fail_test "unexpected successful connect" + else + print_ok + + print_check "Attempts" + count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPCapableEndpAttempt") + if [ -z "$count" ]; then + print_skip + elif [ "$count" != "$attempts" ]; then + fail_test "got ${count} MPC attempt[s] on port-based endpoint, expected ${attempts}" + else + print_ok + fi + fi +} + add_addr_ports_tests() { # signal address with port @@ -2963,6 +3002,22 @@ add_addr_ports_tests() chk_join_nr 2 2 2 chk_add_nr 2 2 2 fi + + if reset "port-based signal endpoint must not accept mpc"; then + local port retl count + port=$(get_port) + + cond_start_capture ${ns1} + pm_nl_add_endpoint ${ns1} 10.0.2.1 flags signal port ${port} + mptcp_lib_wait_local_port_listen ${ns1} ${port} + + timeout 1 ip netns exec ${ns2} \ + ./mptcp_connect -t ${timeout_poll} -p $port -s MPTCP 10.0.2.1 >/dev/null 2>&1 + retl=$? + cond_stop_capture + + chk_mpc_endp_attempt ${retl} 1 + fi } syncookies_tests() -- cgit From 99714e37e8333bbc22496fe80f241d5b35380e83 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Mon, 14 Oct 2024 22:37:04 +0800 Subject: net: xilinx: axienet: fix potential memory leak in axienet_start_xmit() The axienet_start_xmit() returns NETDEV_TX_OK without freeing skb in case of dma_map_single() fails, add dev_kfree_skb_any() to fix it. Fixes: 71791dc8bdea ("net: axienet: Check for DMA mapping errors") Signed-off-by: Wang Hai Reviewed-by: Radhey Shyam Pandey Link: https://patch.msgid.link/20241014143704.31938-1-wanghai38@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index fc35fcb22d94..d940853acc0b 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1051,6 +1051,7 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev) if (net_ratelimit()) netdev_err(ndev, "TX DMA mapping error\n"); ndev->stats.tx_dropped++; + dev_kfree_skb_any(skb); return NETDEV_TX_OK; } desc_set_phys_addr(lp, phys, cur_p); @@ -1071,6 +1072,7 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev) ndev->stats.tx_dropped++; axienet_free_tx_chain(lp, orig_tail_ptr, ii + 1, true, NULL, 0); + dev_kfree_skb_any(skb); return NETDEV_TX_OK; } desc_set_phys_addr(lp, phys, cur_p); -- cgit From c186b7a7f2387d9e09ad408420570be025b187c5 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Mon, 14 Oct 2024 22:42:50 +0800 Subject: net: ethernet: rtsn: fix potential memory leak in rtsn_start_xmit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rtsn_start_xmit() returns NETDEV_TX_OK without freeing skb in case of skb->len being too long, add dev_kfree_skb_any() to fix it. Fixes: b0d3969d2b4d ("net: ethernet: rtsn: Add support for Renesas Ethernet-TSN") Signed-off-by: Wang Hai Reviewed-by: Niklas Söderlund Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014144250.38802-1-wanghai38@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rtsn.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/renesas/rtsn.c b/drivers/net/ethernet/renesas/rtsn.c index f9f63c61d792..6b3f7fca8d15 100644 --- a/drivers/net/ethernet/renesas/rtsn.c +++ b/drivers/net/ethernet/renesas/rtsn.c @@ -1057,6 +1057,7 @@ static netdev_tx_t rtsn_start_xmit(struct sk_buff *skb, struct net_device *ndev) if (skb->len >= TX_DS) { priv->stats.tx_dropped++; priv->stats.tx_errors++; + dev_kfree_skb_any(skb); goto out; } -- cgit From c401ed1c709948e57945485088413e1bb5e94bd1 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Mon, 14 Oct 2024 22:51:15 +0800 Subject: net: systemport: fix potential memory leak in bcm_sysport_xmit() The bcm_sysport_xmit() returns NETDEV_TX_OK without freeing skb in case of dma_map_single() fails, add dev_kfree_skb() to fix it. Fixes: 80105befdb4b ("net: systemport: add Broadcom SYSTEMPORT Ethernet MAC driver") Signed-off-by: Wang Hai Link: https://patch.msgid.link/20241014145115.44977-1-wanghai38@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bcmsysport.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index c9faa8540859..0a68b526e4a8 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1359,6 +1359,7 @@ static netdev_tx_t bcm_sysport_xmit(struct sk_buff *skb, netif_err(priv, tx_err, dev, "DMA map failed at %p (len=%d)\n", skb->data, skb_len); ret = NETDEV_TX_OK; + dev_kfree_skb_any(skb); goto out; } -- cgit From fed07d3eb8a8d9fcc0e455175a89bc6445d6faed Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Mon, 14 Oct 2024 22:59:01 +0800 Subject: net: bcmasp: fix potential memory leak in bcmasp_xmit() The bcmasp_xmit() returns NETDEV_TX_OK without freeing skb in case of mapping fails, add dev_kfree_skb() to fix it. Fixes: 490cb412007d ("net: bcmasp: Add support for ASP2.0 Ethernet controller") Signed-off-by: Wang Hai Acked-by: Florian Fainelli Link: https://patch.msgid.link/20241014145901.48940-1-wanghai38@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c index 82768b0e9026..9ea16ef4139d 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c @@ -322,6 +322,7 @@ static netdev_tx_t bcmasp_xmit(struct sk_buff *skb, struct net_device *dev) } /* Rewind so we do not have a hole */ spb_index = intf->tx_spb_index; + dev_kfree_skb(skb); return NETDEV_TX_OK; } -- cgit From e8c526f2bdf1845bedaf6a478816a3d06fa78b8f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 15:33:12 -0700 Subject: tcp/dccp: Don't use timer_pending() in reqsk_queue_unlink(). Martin KaFai Lau reported use-after-free [0] in reqsk_timer_handler(). """ We are seeing a use-after-free from a bpf prog attached to trace_tcp_retransmit_synack. The program passes the req->sk to the bpf_sk_storage_get_tracing kernel helper which does check for null before using it. """ The commit 83fccfc3940c ("inet: fix potential deadlock in reqsk_queue_unlink()") added timer_pending() in reqsk_queue_unlink() not to call del_timer_sync() from reqsk_timer_handler(), but it introduced a small race window. Before the timer is called, expire_timers() calls detach_timer(timer, true) to clear timer->entry.pprev and marks it as not pending. If reqsk_queue_unlink() checks timer_pending() just after expire_timers() calls detach_timer(), TCP will miss del_timer_sync(); the reqsk timer will continue running and send multiple SYN+ACKs until it expires. The reported UAF could happen if req->sk is close()d earlier than the timer expiration, which is 63s by default. The scenario would be 1. inet_csk_complete_hashdance() calls inet_csk_reqsk_queue_drop(), but del_timer_sync() is missed 2. reqsk timer is executed and scheduled again 3. req->sk is accept()ed and reqsk_put() decrements rsk_refcnt, but reqsk timer still has another one, and inet_csk_accept() does not clear req->sk for non-TFO sockets 4. sk is close()d 5. reqsk timer is executed again, and BPF touches req->sk Let's not use timer_pending() by passing the caller context to __inet_csk_reqsk_queue_drop(). Note that reqsk timer is pinned, so the issue does not happen in most use cases. [1] [0] BUG: KFENCE: use-after-free read in bpf_sk_storage_get_tracing+0x2e/0x1b0 Use-after-free read at 0x00000000a891fb3a (in kfence-#1): bpf_sk_storage_get_tracing+0x2e/0x1b0 bpf_prog_5ea3e95db6da0438_tcp_retransmit_synack+0x1d20/0x1dda bpf_trace_run2+0x4c/0xc0 tcp_rtx_synack+0xf9/0x100 reqsk_timer_handler+0xda/0x3d0 run_timer_softirq+0x292/0x8a0 irq_exit_rcu+0xf5/0x320 sysvec_apic_timer_interrupt+0x6d/0x80 asm_sysvec_apic_timer_interrupt+0x16/0x20 intel_idle_irq+0x5a/0xa0 cpuidle_enter_state+0x94/0x273 cpu_startup_entry+0x15e/0x260 start_secondary+0x8a/0x90 secondary_startup_64_no_verify+0xfa/0xfb kfence-#1: 0x00000000a72cc7b6-0x00000000d97616d9, size=2376, cache=TCPv6 allocated by task 0 on cpu 9 at 260507.901592s: sk_prot_alloc+0x35/0x140 sk_clone_lock+0x1f/0x3f0 inet_csk_clone_lock+0x15/0x160 tcp_create_openreq_child+0x1f/0x410 tcp_v6_syn_recv_sock+0x1da/0x700 tcp_check_req+0x1fb/0x510 tcp_v6_rcv+0x98b/0x1420 ipv6_list_rcv+0x2258/0x26e0 napi_complete_done+0x5b1/0x2990 mlx5e_napi_poll+0x2ae/0x8d0 net_rx_action+0x13e/0x590 irq_exit_rcu+0xf5/0x320 common_interrupt+0x80/0x90 asm_common_interrupt+0x22/0x40 cpuidle_enter_state+0xfb/0x273 cpu_startup_entry+0x15e/0x260 start_secondary+0x8a/0x90 secondary_startup_64_no_verify+0xfa/0xfb freed by task 0 on cpu 9 at 260507.927527s: rcu_core_si+0x4ff/0xf10 irq_exit_rcu+0xf5/0x320 sysvec_apic_timer_interrupt+0x6d/0x80 asm_sysvec_apic_timer_interrupt+0x16/0x20 cpuidle_enter_state+0xfb/0x273 cpu_startup_entry+0x15e/0x260 start_secondary+0x8a/0x90 secondary_startup_64_no_verify+0xfa/0xfb Fixes: 83fccfc3940c ("inet: fix potential deadlock in reqsk_queue_unlink()") Reported-by: Martin KaFai Lau Closes: https://lore.kernel.org/netdev/eb6684d0-ffd9-4bdc-9196-33f690c25824@linux.dev/ Link: https://lore.kernel.org/netdev/b55e2ca0-42f2-4b7c-b445-6ffd87ca74a0@linux.dev/ [1] Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Martin KaFai Lau Link: https://patch.msgid.link/20241014223312.4254-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 2c5632d4fddb..2b698f8419fe 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1045,21 +1045,31 @@ static bool reqsk_queue_unlink(struct request_sock *req) found = __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); } - if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) - reqsk_put(req); + return found; } -bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) +static bool __inet_csk_reqsk_queue_drop(struct sock *sk, + struct request_sock *req, + bool from_timer) { bool unlinked = reqsk_queue_unlink(req); + if (!from_timer && timer_delete_sync(&req->rsk_timer)) + reqsk_put(req); + if (unlinked) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); reqsk_put(req); } + return unlinked; } + +bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) +{ + return __inet_csk_reqsk_queue_drop(sk, req, false); +} EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) @@ -1152,7 +1162,7 @@ static void reqsk_timer_handler(struct timer_list *t) if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { /* delete timer */ - inet_csk_reqsk_queue_drop(sk_listener, nreq); + __inet_csk_reqsk_queue_drop(sk_listener, nreq, true); goto no_ownership; } @@ -1178,7 +1188,8 @@ no_ownership: } drop: - inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); + __inet_csk_reqsk_queue_drop(sk_listener, oreq, true); + reqsk_put(req); } static bool reqsk_queue_hash_req(struct request_sock *req, -- cgit From 1833d8a26f057128fd63e126b4428203ece84684 Mon Sep 17 00:00:00 2001 From: Peter Rashleigh Date: Mon, 14 Oct 2024 13:43:42 -0700 Subject: net: dsa: mv88e6xxx: Fix the max_vid definition for the MV88E6361 According to the Marvell datasheet the 88E6361 has two VTU pages (4k VIDs per page) so the max_vid should be 8191, not 4095. In the current implementation mv88e6xxx_vtu_walk() gives unexpected results because of this error. I verified that mv88e6xxx_vtu_walk() works correctly on the MV88E6361 with this patch in place. Fixes: 12899f299803 ("net: dsa: mv88e6xxx: enable support for 88E6361 switch") Signed-off-by: Peter Rashleigh Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241014204342.5852-1-peter@rashleigh.ca Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 5b4e2ce5470d..284270a4ade1 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -6347,7 +6347,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .invalid_port_mask = BIT(1) | BIT(2) | BIT(8), .num_internal_phys = 5, .internal_phys_offset = 3, - .max_vid = 4095, + .max_vid = 8191, .max_sid = 63, .port_base_addr = 0x0, .phy_base_addr = 0x0, -- cgit From 56440d7ec28d60f8da3bfa09062b3368ff9b16db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Oct 2024 17:12:17 +0000 Subject: genetlink: hold RCU in genlmsg_mcast() While running net selftests with CONFIG_PROVE_RCU_LIST=y I saw one lockdep splat [1]. genlmsg_mcast() uses for_each_net_rcu(), and must therefore hold RCU. Instead of letting all callers guard genlmsg_multicast_allns() with a rcu_read_lock()/rcu_read_unlock() pair, do it in genlmsg_mcast(). This also means the @flags parameter is useless, we need to always use GFP_ATOMIC. [1] [10882.424136] ============================= [10882.424166] WARNING: suspicious RCU usage [10882.424309] 6.12.0-rc2-virtme #1156 Not tainted [10882.424400] ----------------------------- [10882.424423] net/netlink/genetlink.c:1940 RCU-list traversed in non-reader section!! [10882.424469] other info that might help us debug this: [10882.424500] rcu_scheduler_active = 2, debug_locks = 1 [10882.424744] 2 locks held by ip/15677: [10882.424791] #0: ffffffffb6b491b0 (cb_lock){++++}-{3:3}, at: genl_rcv (net/netlink/genetlink.c:1219) [10882.426334] #1: ffffffffb6b49248 (genl_mutex){+.+.}-{3:3}, at: genl_rcv_msg (net/netlink/genetlink.c:61 net/netlink/genetlink.c:57 net/netlink/genetlink.c:1209) [10882.426465] stack backtrace: [10882.426805] CPU: 14 UID: 0 PID: 15677 Comm: ip Not tainted 6.12.0-rc2-virtme #1156 [10882.426919] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [10882.427046] Call Trace: [10882.427131] [10882.427244] dump_stack_lvl (lib/dump_stack.c:123) [10882.427335] lockdep_rcu_suspicious (kernel/locking/lockdep.c:6822) [10882.427387] genlmsg_multicast_allns (net/netlink/genetlink.c:1940 (discriminator 7) net/netlink/genetlink.c:1977 (discriminator 7)) [10882.427436] l2tp_tunnel_notify.constprop.0 (net/l2tp/l2tp_netlink.c:119) l2tp_netlink [10882.427683] l2tp_nl_cmd_tunnel_create (net/l2tp/l2tp_netlink.c:253) l2tp_netlink [10882.427748] genl_family_rcv_msg_doit (net/netlink/genetlink.c:1115) [10882.427834] genl_rcv_msg (net/netlink/genetlink.c:1195 net/netlink/genetlink.c:1210) [10882.427877] ? __pfx_l2tp_nl_cmd_tunnel_create (net/l2tp/l2tp_netlink.c:186) l2tp_netlink [10882.427927] ? __pfx_genl_rcv_msg (net/netlink/genetlink.c:1201) [10882.427959] netlink_rcv_skb (net/netlink/af_netlink.c:2551) [10882.428069] genl_rcv (net/netlink/genetlink.c:1220) [10882.428095] netlink_unicast (net/netlink/af_netlink.c:1332 net/netlink/af_netlink.c:1357) [10882.428140] netlink_sendmsg (net/netlink/af_netlink.c:1901) [10882.428210] ____sys_sendmsg (net/socket.c:729 (discriminator 1) net/socket.c:744 (discriminator 1) net/socket.c:2607 (discriminator 1)) Fixes: 33f72e6f0c67 ("l2tp : multicast notification to the registered listeners") Signed-off-by: Eric Dumazet Cc: James Chapman Cc: Tom Parkin Cc: Johannes Berg Link: https://patch.msgid.link/20241011171217.3166614-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/target/target_core_user.c | 2 +- include/net/genetlink.h | 3 +-- net/l2tp/l2tp_netlink.c | 4 ++-- net/netlink/genetlink.c | 28 ++++++++++++++-------------- net/wireless/nl80211.c | 8 ++------ 5 files changed, 20 insertions(+), 25 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 7eb94894bd68..717931267bda 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -2130,7 +2130,7 @@ static int tcmu_netlink_event_send(struct tcmu_dev *udev, } ret = genlmsg_multicast_allns(&tcmu_genl_family, skb, 0, - TCMU_MCGRP_CONFIG, GFP_KERNEL); + TCMU_MCGRP_CONFIG); /* Wait during an add as the listener may not be up yet */ if (ret == 0 || diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 9ab49bfeae78..c1d91f1d20f6 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -531,13 +531,12 @@ static inline int genlmsg_multicast(const struct genl_family *family, * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array - * @flags: allocation flags * * This function must hold the RTNL or rcu_read_lock(). */ int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, - unsigned int group, gfp_t flags); + unsigned int group); /** * genlmsg_unicast - unicast a netlink message diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 284f1dec1b56..59457c0c14aa 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -116,7 +116,7 @@ static int l2tp_tunnel_notify(struct genl_family *family, NLM_F_ACK, tunnel, cmd); if (ret >= 0) { - ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + ret = genlmsg_multicast_allns(family, msg, 0, 0); /* We don't care if no one is listening */ if (ret == -ESRCH) ret = 0; @@ -144,7 +144,7 @@ static int l2tp_session_notify(struct genl_family *family, NLM_F_ACK, session, cmd); if (ret >= 0) { - ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + ret = genlmsg_multicast_allns(family, msg, 0, 0); /* We don't care if no one is listening */ if (ret == -ESRCH) ret = 0; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index feb54c63a116..07ad65774fe2 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1501,15 +1501,11 @@ static int genl_ctrl_event(int event, const struct genl_family *family, if (IS_ERR(msg)) return PTR_ERR(msg); - if (!family->netnsok) { + if (!family->netnsok) genlmsg_multicast_netns(&genl_ctrl, &init_net, msg, 0, 0, GFP_KERNEL); - } else { - rcu_read_lock(); - genlmsg_multicast_allns(&genl_ctrl, msg, 0, - 0, GFP_ATOMIC); - rcu_read_unlock(); - } + else + genlmsg_multicast_allns(&genl_ctrl, msg, 0, 0); return 0; } @@ -1929,23 +1925,23 @@ problem: core_initcall(genl_init); -static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, - gfp_t flags) +static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group) { struct sk_buff *tmp; struct net *net, *prev = NULL; bool delivered = false; int err; + rcu_read_lock(); for_each_net_rcu(net) { if (prev) { - tmp = skb_clone(skb, flags); + tmp = skb_clone(skb, GFP_ATOMIC); if (!tmp) { err = -ENOMEM; goto error; } err = nlmsg_multicast(prev->genl_sock, tmp, - portid, group, flags); + portid, group, GFP_ATOMIC); if (!err) delivered = true; else if (err != -ESRCH) @@ -1954,27 +1950,31 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, prev = net; } + err = nlmsg_multicast(prev->genl_sock, skb, portid, group, GFP_ATOMIC); + + rcu_read_unlock(); - err = nlmsg_multicast(prev->genl_sock, skb, portid, group, flags); if (!err) delivered = true; else if (err != -ESRCH) return err; return delivered ? 0 : -ESRCH; error: + rcu_read_unlock(); + kfree_skb(skb); return err; } int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, - unsigned int group, gfp_t flags) + unsigned int group) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; - return genlmsg_mcast(skb, portid, group, flags); + return genlmsg_mcast(skb, portid, group); } EXPORT_SYMBOL(genlmsg_multicast_allns); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9ab777e0bd4d..d7d099f7118a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -17986,10 +17986,8 @@ void nl80211_common_reg_change_event(enum nl80211_commands cmd_id, genlmsg_end(msg, hdr); - rcu_read_lock(); genlmsg_multicast_allns(&nl80211_fam, msg, 0, - NL80211_MCGRP_REGULATORY, GFP_ATOMIC); - rcu_read_unlock(); + NL80211_MCGRP_REGULATORY); return; @@ -18722,10 +18720,8 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy, genlmsg_end(msg, hdr); - rcu_read_lock(); genlmsg_multicast_allns(&nl80211_fam, msg, 0, - NL80211_MCGRP_REGULATORY, GFP_ATOMIC); - rcu_read_unlock(); + NL80211_MCGRP_REGULATORY); return; -- cgit From d96016a764f6aa5c7528c3d3f9cb472ef7266951 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 11 Oct 2024 14:17:30 +0200 Subject: udp: Compute L4 checksum as usual when not segmenting the skb If: 1) the user requested USO, but 2) there is not enough payload for GSO to kick in, and 3) the egress device doesn't offer checksum offload, then we want to compute the L4 checksum in software early on. In the case when we are not taking the GSO path, but it has been requested, the software checksum fallback in skb_segment doesn't get a chance to compute the full checksum, if the egress device can't do it. As a result we end up sending UDP datagrams with only a partial checksum filled in, which the peer will discard. Fixes: 10154dbded6d ("udp: Allow GSO transmit from devices with no checksum offload") Reported-by: Ivan Babrou Signed-off-by: Jakub Sitnicki Acked-by: Willem de Bruijn Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20241011-uso-swcsum-fixup-v2-1-6e1ddc199af9@cloudflare.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 4 +++- net/ipv6/udp.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8accbf4cb295..2849b273b131 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -951,8 +951,10 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, cork->gso_size); + + /* Don't checksum the payload, skb will get segmented */ + goto csum_partial; } - goto csum_partial; } if (is_udplite) /* UDP-Lite */ diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 52dfbb2ff1a8..0cef8ae5d1ea 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1266,8 +1266,10 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, cork->gso_size); + + /* Don't checksum the payload, skb will get segmented */ + goto csum_partial; } - goto csum_partial; } if (is_udplite) -- cgit From 6ed97afd75cc5cac34b1c15a930ab2a6b7c6ff0f Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 12 Oct 2024 22:35:22 +0200 Subject: dt-bindings: net: brcm,unimac-mdio: Add bcm6846-mdio The MDIO block in the BCM6846 is not identical to any of the previous versions, but has extended registers not present in the other variants. For this reason we need to use a new compatible especially for this SoC. Suggested-by: Florian Fainelli Link: https://lore.kernel.org/linux-devicetree/b542b2e8-115c-4234-a464-e73aa6bece5c@broadcom.com/ Signed-off-by: Linus Walleij Acked-by: Rob Herring (Arm) Link: https://patch.msgid.link/20241012-bcm6846-mdio-v1-1-c703ca83e962@linaro.org Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml b/Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml index 23dfe0838dca..63bee5b542f5 100644 --- a/Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml +++ b/Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml @@ -26,6 +26,7 @@ properties: - brcm,asp-v2.1-mdio - brcm,asp-v2.2-mdio - brcm,unimac-mdio + - brcm,bcm6846-mdio reg: minItems: 1 -- cgit From 906b77ca91c7e9833b4e47bedb6bec76be71d497 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 12 Oct 2024 22:35:23 +0200 Subject: net: phy: mdio-bcm-unimac: Add BCM6846 support Add Unimac mdio compatible string for the special BCM6846 variant. This variant has a few extra registers compared to other versions. Suggested-by: Florian Fainelli Link: https://lore.kernel.org/linux-devicetree/b542b2e8-115c-4234-a464-e73aa6bece5c@broadcom.com/ Signed-off-by: Linus Walleij Link: https://patch.msgid.link/20241012-bcm6846-mdio-v1-2-c703ca83e962@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/mdio/mdio-bcm-unimac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/mdio/mdio-bcm-unimac.c b/drivers/net/mdio/mdio-bcm-unimac.c index f40eb50bb978..b7bc70586ee0 100644 --- a/drivers/net/mdio/mdio-bcm-unimac.c +++ b/drivers/net/mdio/mdio-bcm-unimac.c @@ -337,6 +337,7 @@ static const struct of_device_id unimac_mdio_ids[] = { { .compatible = "brcm,asp-v2.2-mdio", }, { .compatible = "brcm,asp-v2.1-mdio", }, { .compatible = "brcm,asp-v2.0-mdio", }, + { .compatible = "brcm,bcm6846-mdio", }, { .compatible = "brcm,genet-mdio-v5", }, { .compatible = "brcm,genet-mdio-v4", }, { .compatible = "brcm,genet-mdio-v3", }, -- cgit From 217a3d98d1e9891a8b1438a27dfbc64ddf01f691 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Mon, 14 Oct 2024 20:19:22 +0800 Subject: net: microchip: vcap api: Fix memory leaks in vcap_api_encode_rule_test() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit a3c1e45156ad ("net: microchip: vcap: Fix use-after-free error in kunit test") fixed the use-after-free error, but introduced below memory leaks by removing necessary vcap_free_rule(), add it to fix it. unreferenced object 0xffffff80ca58b700 (size 192): comm "kunit_try_catch", pid 1215, jiffies 4294898264 hex dump (first 32 bytes): 00 12 7a 00 05 00 00 00 0a 00 00 00 64 00 00 00 ..z.........d... 00 00 00 00 00 00 00 00 00 04 0b cc 80 ff ff ff ................ backtrace (crc 9c09c3fe): [<0000000052a0be73>] kmemleak_alloc+0x34/0x40 [<0000000043605459>] __kmalloc_cache_noprof+0x26c/0x2f4 [<0000000040a01b8d>] vcap_alloc_rule+0x3cc/0x9c4 [<000000003fe86110>] vcap_api_encode_rule_test+0x1ac/0x16b0 [<00000000b3595fc4>] kunit_try_run_case+0x13c/0x3ac [<0000000010f5d2bf>] kunit_generic_run_threadfn_adapter+0x80/0xec [<00000000c5d82c9a>] kthread+0x2e8/0x374 [<00000000f4287308>] ret_from_fork+0x10/0x20 unreferenced object 0xffffff80cc0b0400 (size 64): comm "kunit_try_catch", pid 1215, jiffies 4294898265 hex dump (first 32 bytes): 80 04 0b cc 80 ff ff ff 18 b7 58 ca 80 ff ff ff ..........X..... 39 00 00 00 02 00 00 00 06 05 04 03 02 01 ff ff 9............... backtrace (crc daf014e9): [<0000000052a0be73>] kmemleak_alloc+0x34/0x40 [<0000000043605459>] __kmalloc_cache_noprof+0x26c/0x2f4 [<000000000ff63fd4>] vcap_rule_add_key+0x2cc/0x528 [<00000000dfdb1e81>] vcap_api_encode_rule_test+0x224/0x16b0 [<00000000b3595fc4>] kunit_try_run_case+0x13c/0x3ac [<0000000010f5d2bf>] kunit_generic_run_threadfn_adapter+0x80/0xec [<00000000c5d82c9a>] kthread+0x2e8/0x374 [<00000000f4287308>] ret_from_fork+0x10/0x20 unreferenced object 0xffffff80cc0b0700 (size 64): comm "kunit_try_catch", pid 1215, jiffies 4294898265 hex dump (first 32 bytes): 80 07 0b cc 80 ff ff ff 28 b7 58 ca 80 ff ff ff ........(.X..... 3c 00 00 00 00 00 00 00 01 2f 03 b3 ec ff ff ff <......../...... backtrace (crc 8d877792): [<0000000052a0be73>] kmemleak_alloc+0x34/0x40 [<0000000043605459>] __kmalloc_cache_noprof+0x26c/0x2f4 [<000000006eadfab7>] vcap_rule_add_action+0x2d0/0x52c [<00000000323475d1>] vcap_api_encode_rule_test+0x4d4/0x16b0 [<00000000b3595fc4>] kunit_try_run_case+0x13c/0x3ac [<0000000010f5d2bf>] kunit_generic_run_threadfn_adapter+0x80/0xec [<00000000c5d82c9a>] kthread+0x2e8/0x374 [<00000000f4287308>] ret_from_fork+0x10/0x20 unreferenced object 0xffffff80cc0b0900 (size 64): comm "kunit_try_catch", pid 1215, jiffies 4294898266 hex dump (first 32 bytes): 80 09 0b cc 80 ff ff ff 80 06 0b cc 80 ff ff ff ................ 7d 00 00 00 01 00 00 00 00 00 00 00 ff 00 00 00 }............... backtrace (crc 34181e56): [<0000000052a0be73>] kmemleak_alloc+0x34/0x40 [<0000000043605459>] __kmalloc_cache_noprof+0x26c/0x2f4 [<000000000ff63fd4>] vcap_rule_add_key+0x2cc/0x528 [<00000000991e3564>] vcap_val_rule+0xcf0/0x13e8 [<00000000fc9868e5>] vcap_api_encode_rule_test+0x678/0x16b0 [<00000000b3595fc4>] kunit_try_run_case+0x13c/0x3ac [<0000000010f5d2bf>] kunit_generic_run_threadfn_adapter+0x80/0xec [<00000000c5d82c9a>] kthread+0x2e8/0x374 [<00000000f4287308>] ret_from_fork+0x10/0x20 unreferenced object 0xffffff80cc0b0980 (size 64): comm "kunit_try_catch", pid 1215, jiffies 4294898266 hex dump (first 32 bytes): 18 b7 58 ca 80 ff ff ff 00 09 0b cc 80 ff ff ff ..X............. 67 00 00 00 00 00 00 00 01 01 74 88 c0 ff ff ff g.........t..... backtrace (crc 275fd9be): [<0000000052a0be73>] kmemleak_alloc+0x34/0x40 [<0000000043605459>] __kmalloc_cache_noprof+0x26c/0x2f4 [<000000000ff63fd4>] vcap_rule_add_key+0x2cc/0x528 [<000000001396a1a2>] test_add_def_fields+0xb0/0x100 [<000000006e7621f0>] vcap_val_rule+0xa98/0x13e8 [<00000000fc9868e5>] vcap_api_encode_rule_test+0x678/0x16b0 [<00000000b3595fc4>] kunit_try_run_case+0x13c/0x3ac [<0000000010f5d2bf>] kunit_generic_run_threadfn_adapter+0x80/0xec [<00000000c5d82c9a>] kthread+0x2e8/0x374 [<00000000f4287308>] ret_from_fork+0x10/0x20 ...... Cc: stable@vger.kernel.org Fixes: a3c1e45156ad ("net: microchip: vcap: Fix use-after-free error in kunit test") Reviewed-by: Simon Horman Reviewed-by: Jens Emil Schulz Østergaard Signed-off-by: Jinjie Ruan Link: https://patch.msgid.link/20241014121922.1280583-1-ruanjinjie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/vcap/vcap_api_kunit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/microchip/vcap/vcap_api_kunit.c b/drivers/net/ethernet/microchip/vcap/vcap_api_kunit.c index f2a5a36fdacd..7251121ab196 100644 --- a/drivers/net/ethernet/microchip/vcap/vcap_api_kunit.c +++ b/drivers/net/ethernet/microchip/vcap/vcap_api_kunit.c @@ -1444,6 +1444,8 @@ static void vcap_api_encode_rule_test(struct kunit *test) ret = vcap_del_rule(&test_vctrl, &test_netdev, id); KUNIT_EXPECT_EQ(test, 0, ret); + + vcap_free_rule(rule); } static void vcap_api_set_rule_counter_test(struct kunit *test) -- cgit From 126e799602f45e9ce1ded03ee9eadda68bf470e0 Mon Sep 17 00:00:00 2001 From: Niklas Söderlund Date: Mon, 14 Oct 2024 14:43:43 +0200 Subject: net: ravb: Only advertise Rx/Tx timestamps if hardware supports it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent work moving the reporting of Rx software timestamps to the core [1] highlighted an issue where hardware time stamping was advertised for the platforms where it is not supported. Fix this by covering advertising support for hardware timestamps only if the hardware supports it. Due to the Tx implementation in RAVB software Tx timestamping is also only considered if the hardware supports hardware timestamps. This should be addressed in future, but this fix only reflects what the driver currently implements. 1. Commit 277901ee3a26 ("ravb: Remove setting of RX software timestamp") Fixes: 7e09a052dc4e ("ravb: Exclude gPTP feature support for RZ/G2L") Signed-off-by: Niklas Söderlund Reviewed-by: Paul Barker Tested-by: Paul Barker Reviewed-by: Sergey Shtylyov Link: https://patch.msgid.link/20241014124343.3875285-1-niklas.soderlund+renesas@ragnatech.se Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/ravb_main.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index d2a6518532f3..907af4651c55 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1750,20 +1750,19 @@ static int ravb_get_ts_info(struct net_device *ndev, struct ravb_private *priv = netdev_priv(ndev); const struct ravb_hw_info *hw_info = priv->info; - info->so_timestamping = - SOF_TIMESTAMPING_TX_SOFTWARE | - SOF_TIMESTAMPING_TX_HARDWARE | - SOF_TIMESTAMPING_RX_HARDWARE | - SOF_TIMESTAMPING_RAW_HARDWARE; - info->tx_types = (1 << HWTSTAMP_TX_OFF) | (1 << HWTSTAMP_TX_ON); - info->rx_filters = - (1 << HWTSTAMP_FILTER_NONE) | - (1 << HWTSTAMP_FILTER_PTP_V2_L2_EVENT) | - (1 << HWTSTAMP_FILTER_ALL); - if (hw_info->gptp || hw_info->ccc_gac) + if (hw_info->gptp || hw_info->ccc_gac) { + info->so_timestamping = + SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE; + info->tx_types = (1 << HWTSTAMP_TX_OFF) | (1 << HWTSTAMP_TX_ON); + info->rx_filters = + (1 << HWTSTAMP_FILTER_NONE) | + (1 << HWTSTAMP_FILTER_PTP_V2_L2_EVENT) | + (1 << HWTSTAMP_FILTER_ALL); info->phc_index = ptp_clock_index(priv->ptp.clock); - else - info->phc_index = 0; + } return 0; } -- cgit From 11d06f0aaef89f4cad68b92510bd9decff2d7b87 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 14 Oct 2024 18:30:41 +0300 Subject: net: dsa: vsc73xx: fix reception from VLAN-unaware bridges Similar to the situation described for sja1105 in commit 1f9fc48fd302 ("net: dsa: sja1105: fix reception from VLAN-unaware bridges"), the vsc73xx driver uses tag_8021q and doesn't need the ds->untag_bridge_pvid request. In fact, this option breaks packet reception. The ds->untag_bridge_pvid option strips VLANs from packets received on VLAN-unaware bridge ports. But those VLANs should already be stripped by tag_vsc73xx_8021q.c as part of vsc73xx_rcv() - they are not VLANs in VLAN-unaware mode, but DSA tags. Thus, dsa_software_vlan_untag() tries to untag a VLAN that doesn't exist, corrupting the packet. Fixes: 93e4649efa96 ("net: dsa: provide a software untagging function on RX for VLAN-aware bridges") Tested-by: Pawel Dembicki Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Linus Walleij Link: https://patch.msgid.link/20241014153041.1110364-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/vitesse-vsc73xx-core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c index e4b98fd51643..f18aa321053d 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-core.c +++ b/drivers/net/dsa/vitesse-vsc73xx-core.c @@ -851,7 +851,6 @@ static int vsc73xx_setup(struct dsa_switch *ds) dev_info(vsc->dev, "set up the switch\n"); - ds->untag_bridge_pvid = true; ds->max_num_bridges = DSA_TAG_8021Q_MAX_NUM_BRIDGES; ds->fdb_isolation = true; -- cgit From 4678adf94da4a9e9683817b246b58ce15fb81782 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 14 Oct 2024 21:03:11 +0200 Subject: vmxnet3: Fix packet corruption in vmxnet3_xdp_xmit_frame Andrew and Nikolay reported connectivity issues with Cilium's service load-balancing in case of vmxnet3. If a BPF program for native XDP adds an encapsulation header such as IPIP and transmits the packet out the same interface, then in case of vmxnet3 a corrupted packet is being sent and subsequently dropped on the path. vmxnet3_xdp_xmit_frame() which is called e.g. via vmxnet3_run_xdp() through vmxnet3_xdp_xmit_back() calculates an incorrect DMA address: page = virt_to_page(xdpf->data); tbi->dma_addr = page_pool_get_dma_addr(page) + VMXNET3_XDP_HEADROOM; dma_sync_single_for_device(&adapter->pdev->dev, tbi->dma_addr, buf_size, DMA_TO_DEVICE); The above assumes a fixed offset (VMXNET3_XDP_HEADROOM), but the XDP BPF program could have moved xdp->data. While the passed buf_size is correct (xdpf->len), the dma_addr needs to have a dynamic offset which can be calculated as xdpf->data - (void *)xdpf, that is, xdp->data - xdp->data_hard_start. Fixes: 54f00cce1178 ("vmxnet3: Add XDP support.") Reported-by: Andrew Sauber Reported-by: Nikolay Nikolaev Signed-off-by: Daniel Borkmann Tested-by: Nikolay Nikolaev Acked-by: Anton Protopopov Cc: William Tu Cc: Ronak Doshi Link: https://patch.msgid.link/a0888656d7f09028f9984498cc698bb5364d89fc.1728931137.git.daniel@iogearbox.net Signed-off-by: Paolo Abeni --- drivers/net/vmxnet3/vmxnet3_xdp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/vmxnet3/vmxnet3_xdp.c b/drivers/net/vmxnet3/vmxnet3_xdp.c index a6c787454a1a..1341374a4588 100644 --- a/drivers/net/vmxnet3/vmxnet3_xdp.c +++ b/drivers/net/vmxnet3/vmxnet3_xdp.c @@ -148,7 +148,7 @@ vmxnet3_xdp_xmit_frame(struct vmxnet3_adapter *adapter, } else { /* XDP buffer from page pool */ page = virt_to_page(xdpf->data); tbi->dma_addr = page_pool_get_dma_addr(page) + - VMXNET3_XDP_HEADROOM; + (xdpf->data - (void *)xdpf); dma_sync_single_for_device(&adapter->pdev->dev, tbi->dma_addr, buf_size, DMA_TO_DEVICE); -- cgit From 88806efc034a9830f483963326b99930ad519af1 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 15 Oct 2024 10:17:55 +0200 Subject: net: ethernet: mtk_eth_soc: fix memory corruption during fq dma init The loop responsible for allocating up to MTK_FQ_DMA_LENGTH buffers must only touch as many descriptors, otherwise it ends up corrupting unrelated memory. Fix the loop iteration count accordingly. Fixes: c57e55819443 ("net: ethernet: mtk_eth_soc: handle dma buffer size soc specific") Signed-off-by: Felix Fietkau Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241015081755.31060-1-nbd@nbd.name Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 16ca427cf4c3..ed7313c10a05 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -1171,7 +1171,7 @@ static int mtk_init_fq_dma(struct mtk_eth *eth) if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr))) return -ENOMEM; - for (i = 0; i < cnt; i++) { + for (i = 0; i < len; i++) { struct mtk_tx_dma_v2 *txd; txd = eth->scratch_ring + (j * MTK_FQ_DMA_LENGTH + i) * soc->tx.desc_size; -- cgit From 7decd1f5904a489d3ccdcf131972f94645681689 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 15 Oct 2024 10:38:47 +0200 Subject: mptcp: pm: fix UaF read in mptcp_pm_nl_rm_addr_or_subflow Syzkaller reported this splat: ================================================================== BUG: KASAN: slab-use-after-free in mptcp_pm_nl_rm_addr_or_subflow+0xb44/0xcc0 net/mptcp/pm_netlink.c:881 Read of size 4 at addr ffff8880569ac858 by task syz.1.2799/14662 CPU: 0 UID: 0 PID: 14662 Comm: syz.1.2799 Not tainted 6.12.0-rc2-syzkaller-00307-g36c254515dc6 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:377 [inline] print_report+0xc3/0x620 mm/kasan/report.c:488 kasan_report+0xd9/0x110 mm/kasan/report.c:601 mptcp_pm_nl_rm_addr_or_subflow+0xb44/0xcc0 net/mptcp/pm_netlink.c:881 mptcp_pm_nl_rm_subflow_received net/mptcp/pm_netlink.c:914 [inline] mptcp_nl_remove_id_zero_address+0x305/0x4a0 net/mptcp/pm_netlink.c:1572 mptcp_pm_nl_del_addr_doit+0x5c9/0x770 net/mptcp/pm_netlink.c:1603 genl_family_rcv_msg_doit+0x202/0x2f0 net/netlink/genetlink.c:1115 genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0x565/0x800 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x165/0x410 net/netlink/af_netlink.c:2551 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1331 [inline] netlink_unicast+0x53c/0x7f0 net/netlink/af_netlink.c:1357 netlink_sendmsg+0x8b8/0xd70 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:729 [inline] __sock_sendmsg net/socket.c:744 [inline] ____sys_sendmsg+0x9ae/0xb40 net/socket.c:2607 ___sys_sendmsg+0x135/0x1e0 net/socket.c:2661 __sys_sendmsg+0x117/0x1f0 net/socket.c:2690 do_syscall_32_irqs_on arch/x86/entry/common.c:165 [inline] __do_fast_syscall_32+0x73/0x120 arch/x86/entry/common.c:386 do_fast_syscall_32+0x32/0x80 arch/x86/entry/common.c:411 entry_SYSENTER_compat_after_hwframe+0x84/0x8e RIP: 0023:0xf7fe4579 Code: b8 01 10 06 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 00 00 00 00 00 00 00 00 00 00 00 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 8d b4 26 00 00 00 00 8d b4 26 00 00 00 00 RSP: 002b:00000000f574556c EFLAGS: 00000296 ORIG_RAX: 0000000000000172 RAX: ffffffffffffffda RBX: 000000000000000b RCX: 0000000020000140 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000296 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Allocated by task 5387: kasan_save_stack+0x33/0x60 mm/kasan/common.c:47 kasan_save_track+0x14/0x30 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0xaa/0xb0 mm/kasan/common.c:394 kmalloc_noprof include/linux/slab.h:878 [inline] kzalloc_noprof include/linux/slab.h:1014 [inline] subflow_create_ctx+0x87/0x2a0 net/mptcp/subflow.c:1803 subflow_ulp_init+0xc3/0x4d0 net/mptcp/subflow.c:1956 __tcp_set_ulp net/ipv4/tcp_ulp.c:146 [inline] tcp_set_ulp+0x326/0x7f0 net/ipv4/tcp_ulp.c:167 mptcp_subflow_create_socket+0x4ae/0x10a0 net/mptcp/subflow.c:1764 __mptcp_subflow_connect+0x3cc/0x1490 net/mptcp/subflow.c:1592 mptcp_pm_create_subflow_or_signal_addr+0xbda/0x23a0 net/mptcp/pm_netlink.c:642 mptcp_pm_nl_fully_established net/mptcp/pm_netlink.c:650 [inline] mptcp_pm_nl_work+0x3a1/0x4f0 net/mptcp/pm_netlink.c:943 mptcp_worker+0x15a/0x1240 net/mptcp/protocol.c:2777 process_one_work+0x958/0x1b30 kernel/workqueue.c:3229 process_scheduled_works kernel/workqueue.c:3310 [inline] worker_thread+0x6c8/0xf00 kernel/workqueue.c:3391 kthread+0x2c1/0x3a0 kernel/kthread.c:389 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Freed by task 113: kasan_save_stack+0x33/0x60 mm/kasan/common.c:47 kasan_save_track+0x14/0x30 mm/kasan/common.c:68 kasan_save_free_info+0x3b/0x60 mm/kasan/generic.c:579 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x51/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:230 [inline] slab_free_hook mm/slub.c:2342 [inline] slab_free mm/slub.c:4579 [inline] kfree+0x14f/0x4b0 mm/slub.c:4727 kvfree+0x47/0x50 mm/util.c:701 kvfree_rcu_list+0xf5/0x2c0 kernel/rcu/tree.c:3423 kvfree_rcu_drain_ready kernel/rcu/tree.c:3563 [inline] kfree_rcu_monitor+0x503/0x8b0 kernel/rcu/tree.c:3632 kfree_rcu_shrink_scan+0x245/0x3a0 kernel/rcu/tree.c:3966 do_shrink_slab+0x44f/0x11c0 mm/shrinker.c:435 shrink_slab+0x32b/0x12a0 mm/shrinker.c:662 shrink_one+0x47e/0x7b0 mm/vmscan.c:4818 shrink_many mm/vmscan.c:4879 [inline] lru_gen_shrink_node mm/vmscan.c:4957 [inline] shrink_node+0x2452/0x39d0 mm/vmscan.c:5937 kswapd_shrink_node mm/vmscan.c:6765 [inline] balance_pgdat+0xc19/0x18f0 mm/vmscan.c:6957 kswapd+0x5ea/0xbf0 mm/vmscan.c:7226 kthread+0x2c1/0x3a0 kernel/kthread.c:389 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Last potentially related work creation: kasan_save_stack+0x33/0x60 mm/kasan/common.c:47 __kasan_record_aux_stack+0xba/0xd0 mm/kasan/generic.c:541 kvfree_call_rcu+0x74/0xbe0 kernel/rcu/tree.c:3810 subflow_ulp_release+0x2ae/0x350 net/mptcp/subflow.c:2009 tcp_cleanup_ulp+0x7c/0x130 net/ipv4/tcp_ulp.c:124 tcp_v4_destroy_sock+0x1c5/0x6a0 net/ipv4/tcp_ipv4.c:2541 inet_csk_destroy_sock+0x1a3/0x440 net/ipv4/inet_connection_sock.c:1293 tcp_done+0x252/0x350 net/ipv4/tcp.c:4870 tcp_rcv_state_process+0x379b/0x4f30 net/ipv4/tcp_input.c:6933 tcp_v4_do_rcv+0x1ad/0xa90 net/ipv4/tcp_ipv4.c:1938 sk_backlog_rcv include/net/sock.h:1115 [inline] __release_sock+0x31b/0x400 net/core/sock.c:3072 __tcp_close+0x4f3/0xff0 net/ipv4/tcp.c:3142 __mptcp_close_ssk+0x331/0x14d0 net/mptcp/protocol.c:2489 mptcp_close_ssk net/mptcp/protocol.c:2543 [inline] mptcp_close_ssk+0x150/0x220 net/mptcp/protocol.c:2526 mptcp_pm_nl_rm_addr_or_subflow+0x2be/0xcc0 net/mptcp/pm_netlink.c:878 mptcp_pm_nl_rm_subflow_received net/mptcp/pm_netlink.c:914 [inline] mptcp_nl_remove_id_zero_address+0x305/0x4a0 net/mptcp/pm_netlink.c:1572 mptcp_pm_nl_del_addr_doit+0x5c9/0x770 net/mptcp/pm_netlink.c:1603 genl_family_rcv_msg_doit+0x202/0x2f0 net/netlink/genetlink.c:1115 genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0x565/0x800 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x165/0x410 net/netlink/af_netlink.c:2551 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1331 [inline] netlink_unicast+0x53c/0x7f0 net/netlink/af_netlink.c:1357 netlink_sendmsg+0x8b8/0xd70 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:729 [inline] __sock_sendmsg net/socket.c:744 [inline] ____sys_sendmsg+0x9ae/0xb40 net/socket.c:2607 ___sys_sendmsg+0x135/0x1e0 net/socket.c:2661 __sys_sendmsg+0x117/0x1f0 net/socket.c:2690 do_syscall_32_irqs_on arch/x86/entry/common.c:165 [inline] __do_fast_syscall_32+0x73/0x120 arch/x86/entry/common.c:386 do_fast_syscall_32+0x32/0x80 arch/x86/entry/common.c:411 entry_SYSENTER_compat_after_hwframe+0x84/0x8e The buggy address belongs to the object at ffff8880569ac800 which belongs to the cache kmalloc-512 of size 512 The buggy address is located 88 bytes inside of freed 512-byte region [ffff8880569ac800, ffff8880569aca00) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x569ac head: order:2 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0x4fff00000000040(head|node=1|zone=1|lastcpupid=0x7ff) page_type: f5(slab) raw: 04fff00000000040 ffff88801ac42c80 dead000000000100 dead000000000122 raw: 0000000000000000 0000000080100010 00000001f5000000 0000000000000000 head: 04fff00000000040 ffff88801ac42c80 dead000000000100 dead000000000122 head: 0000000000000000 0000000080100010 00000001f5000000 0000000000000000 head: 04fff00000000002 ffffea00015a6b01 ffffffffffffffff 0000000000000000 head: 0000000000000004 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 2, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 10238, tgid 10238 (kworker/u32:6), ts 597403252405, free_ts 597177952947 set_page_owner include/linux/page_owner.h:32 [inline] post_alloc_hook+0x2d1/0x350 mm/page_alloc.c:1537 prep_new_page mm/page_alloc.c:1545 [inline] get_page_from_freelist+0x101e/0x3070 mm/page_alloc.c:3457 __alloc_pages_noprof+0x223/0x25a0 mm/page_alloc.c:4733 alloc_pages_mpol_noprof+0x2c9/0x610 mm/mempolicy.c:2265 alloc_slab_page mm/slub.c:2412 [inline] allocate_slab mm/slub.c:2578 [inline] new_slab+0x2ba/0x3f0 mm/slub.c:2631 ___slab_alloc+0xd1d/0x16f0 mm/slub.c:3818 __slab_alloc.constprop.0+0x56/0xb0 mm/slub.c:3908 __slab_alloc_node mm/slub.c:3961 [inline] slab_alloc_node mm/slub.c:4122 [inline] __kmalloc_cache_noprof+0x2c5/0x310 mm/slub.c:4290 kmalloc_noprof include/linux/slab.h:878 [inline] kzalloc_noprof include/linux/slab.h:1014 [inline] mld_add_delrec net/ipv6/mcast.c:743 [inline] igmp6_leave_group net/ipv6/mcast.c:2625 [inline] igmp6_group_dropped+0x4ab/0xe40 net/ipv6/mcast.c:723 __ipv6_dev_mc_dec+0x281/0x360 net/ipv6/mcast.c:979 addrconf_leave_solict net/ipv6/addrconf.c:2253 [inline] __ipv6_ifa_notify+0x3f6/0xc30 net/ipv6/addrconf.c:6283 addrconf_ifdown.isra.0+0xef9/0x1a20 net/ipv6/addrconf.c:3982 addrconf_notify+0x220/0x19c0 net/ipv6/addrconf.c:3781 notifier_call_chain+0xb9/0x410 kernel/notifier.c:93 call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:1996 call_netdevice_notifiers_extack net/core/dev.c:2034 [inline] call_netdevice_notifiers net/core/dev.c:2048 [inline] dev_close_many+0x333/0x6a0 net/core/dev.c:1589 page last free pid 13136 tgid 13136 stack trace: reset_page_owner include/linux/page_owner.h:25 [inline] free_pages_prepare mm/page_alloc.c:1108 [inline] free_unref_page+0x5f4/0xdc0 mm/page_alloc.c:2638 stack_depot_save_flags+0x2da/0x900 lib/stackdepot.c:666 kasan_save_stack+0x42/0x60 mm/kasan/common.c:48 kasan_save_track+0x14/0x30 mm/kasan/common.c:68 unpoison_slab_object mm/kasan/common.c:319 [inline] __kasan_slab_alloc+0x89/0x90 mm/kasan/common.c:345 kasan_slab_alloc include/linux/kasan.h:247 [inline] slab_post_alloc_hook mm/slub.c:4085 [inline] slab_alloc_node mm/slub.c:4134 [inline] kmem_cache_alloc_noprof+0x121/0x2f0 mm/slub.c:4141 skb_clone+0x190/0x3f0 net/core/skbuff.c:2084 do_one_broadcast net/netlink/af_netlink.c:1462 [inline] netlink_broadcast_filtered+0xb11/0xef0 net/netlink/af_netlink.c:1540 netlink_broadcast+0x39/0x50 net/netlink/af_netlink.c:1564 uevent_net_broadcast_untagged lib/kobject_uevent.c:331 [inline] kobject_uevent_net_broadcast lib/kobject_uevent.c:410 [inline] kobject_uevent_env+0xacd/0x1670 lib/kobject_uevent.c:608 device_del+0x623/0x9f0 drivers/base/core.c:3882 snd_card_disconnect.part.0+0x58a/0x7c0 sound/core/init.c:546 snd_card_disconnect+0x1f/0x30 sound/core/init.c:495 snd_usx2y_disconnect+0xe9/0x1f0 sound/usb/usx2y/usbusx2y.c:417 usb_unbind_interface+0x1e8/0x970 drivers/usb/core/driver.c:461 device_remove drivers/base/dd.c:569 [inline] device_remove+0x122/0x170 drivers/base/dd.c:561 That's because 'subflow' is used just after 'mptcp_close_ssk(subflow)', which will initiate the release of its memory. Even if it is very likely the release and the re-utilisation will be done later on, it is of course better to avoid any issues and read the content of 'subflow' before closing it. Fixes: 1c1f72137598 ("mptcp: pm: only decrement add_addr_accepted for MPJ req") Cc: stable@vger.kernel.org Reported-by: syzbot+3c8b7a8e7df6a2a226ca@syzkaller.appspotmail.com Closes: https://lore.kernel.org/670d7337.050a0220.4cbc0.004f.GAE@google.com Signed-off-by: Matthieu Baerts (NGI0) Acked-by: Paolo Abeni Link: https://patch.msgid.link/20241015-net-mptcp-uaf-pm-rm-v1-1-c4ee5d987a64@kernel.org Signed-off-by: Paolo Abeni --- net/mptcp/pm_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 1a78998fe1f4..db586a5b3866 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -873,12 +873,12 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, i, rm_id, id, remote_id, msk->mpc_endpoint_id); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); + removed |= subflow->request_join; /* the following takes care of updating the subflows counter */ mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); - removed |= subflow->request_join; if (rm_type == MPTCP_MIB_RMSUBFLOW) __MPTCP_INC_STATS(sock_net(sk), rm_type); } -- cgit From 65b4eb9f3d1e037100b157e023f4d3d988aafd29 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Tue, 15 Oct 2024 12:32:01 +0300 Subject: net/mlx5: HWS, removed wrong access to a number of rules variable Removed wrong access to the num_of_rules field of the matcher. This is a usual u32 variable, but the access was as if it was atomic. This fixes the following CI warnings: mlx5hws_bwc.c:708:17: warning: large atomic operation may incur significant performance penalty; the access size (4 bytes) exceeds the max lock-free size (0 bytes) [-Watomic-alignment] Fixes: 510f9f61a112 ("net/mlx5: HWS, added API and enabled HWS support") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202409291101.6NdtMFVC-lkp@intel.com/ Signed-off-by: Yevgeny Kliteynik Reviewed-by: Itamar Gozlan Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_bwc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_bwc.c index bd52b05db367..8f3a6f9d703d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_bwc.c @@ -691,7 +691,6 @@ static int hws_bwc_matcher_move(struct mlx5hws_bwc_matcher *bwc_matcher) static int hws_bwc_matcher_rehash_size(struct mlx5hws_bwc_matcher *bwc_matcher) { - u32 num_of_rules; int ret; /* If the current matcher size is already at its max size, we can't @@ -705,8 +704,7 @@ hws_bwc_matcher_rehash_size(struct mlx5hws_bwc_matcher *bwc_matcher) * Need to check again if we really need rehash. * If the reason for rehash was size, but not any more - skip rehash. */ - num_of_rules = __atomic_load_n(&bwc_matcher->num_of_rules, __ATOMIC_RELAXED); - if (!hws_bwc_matcher_rehash_size_needed(bwc_matcher, num_of_rules)) + if (!hws_bwc_matcher_rehash_size_needed(bwc_matcher, bwc_matcher->num_of_rules)) return 0; /* Now we're done all the checking - do the rehash: -- cgit From 5aa2184e29081665f915594bc6de9b7fee6e4883 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Tue, 15 Oct 2024 12:32:02 +0300 Subject: net/mlx5: HWS, fixed double free in error flow of definer layout Fix error flow bug that could lead to double free of a buffer during a failure to calculate a suitable definer layout. Fixes: 74a778b4a63f ("net/mlx5: HWS, added definers handling") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Itamar Gozlan Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- .../net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_definer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_definer.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_definer.c index d566d2ddf424..3f4c58bada37 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_definer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_definer.c @@ -1925,7 +1925,7 @@ mlx5hws_definer_calc_layout(struct mlx5hws_context *ctx, ret = hws_definer_conv_match_params_to_hl(ctx, mt, match_hl); if (ret) { mlx5hws_err(ctx, "Failed to convert items to header layout\n"); - goto free_fc; + goto free_match_hl; } /* Find the match definer layout for header layout match union */ @@ -1946,7 +1946,7 @@ mlx5hws_definer_calc_layout(struct mlx5hws_context *ctx, free_fc: kfree(mt->fc); - +free_match_hl: kfree(match_hl); return ret; } -- cgit From 45bcbd49224ac5aec5e1239de9060e431ca2acd9 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 15 Oct 2024 12:32:03 +0300 Subject: net/mlx5: HWS, don't destroy more bwc queue locks than allocated hws_send_queues_bwc_locks_destroy destroyed more queue locks than allocated, leading to memory corruption (occasionally) and warnings such as DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock)) in __mutex_destroy because sometimes, the 'mutex' being destroyed was random memory. The severity of this problem is proportional to the number of queues configured because the code overreaches beyond the end of the bwc_send_queue_locks array by 2x its length. Fix that by using the correct number of bwc queues. Fixes: 2ca62599aa0b ("net/mlx5: HWS, added send engine and context handling") Signed-off-by: Cosmin Ratiu Signed-off-by: Yevgeny Kliteynik Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_send.c index 0c7989184c30..e101dc46d99e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_send.c @@ -941,7 +941,7 @@ static void __hws_send_queues_close(struct mlx5hws_context *ctx, u16 queues) static void hws_send_queues_bwc_locks_destroy(struct mlx5hws_context *ctx) { - int bwc_queues = ctx->queues - 1; + int bwc_queues = mlx5hws_bwc_queues(ctx); int i; if (!mlx5hws_context_bwc_supported(ctx)) -- cgit From 9addffa3435973e016f066e13f950be5eed73c06 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 15 Oct 2024 12:32:04 +0300 Subject: net/mlx5: HWS, use lock classes for bwc locks The HWS BWC API uses one lock per queue and usually acquires one of them, except when doing changes which require locking all queues in order. Naturally, lockdep isn't too happy about acquiring the same lock class multiple times, so inform it that each queue lock is a different class to avoid false positives. Fixes: 2ca62599aa0b ("net/mlx5: HWS, added send engine and context handling") Signed-off-by: Cosmin Ratiu Signed-off-by: Yevgeny Kliteynik Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- .../mlx5/core/steering/hws/mlx5hws_context.h | 1 + .../mellanox/mlx5/core/steering/hws/mlx5hws_send.c | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_context.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_context.h index e5a7ce604334..8ab548aa402b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_context.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_context.h @@ -46,6 +46,7 @@ struct mlx5hws_context { struct mlx5hws_send_engine *send_queue; size_t queues; struct mutex *bwc_send_queue_locks; /* protect BWC queues */ + struct lock_class_key *bwc_lock_class_keys; struct list_head tbl_list; struct mlx5hws_context_debug_info debug_info; struct xarray peer_ctx_xa; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_send.c index e101dc46d99e..6d443e6ee8d9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws_send.c @@ -947,8 +947,12 @@ static void hws_send_queues_bwc_locks_destroy(struct mlx5hws_context *ctx) if (!mlx5hws_context_bwc_supported(ctx)) return; - for (i = 0; i < bwc_queues; i++) + for (i = 0; i < bwc_queues; i++) { mutex_destroy(&ctx->bwc_send_queue_locks[i]); + lockdep_unregister_key(ctx->bwc_lock_class_keys + i); + } + + kfree(ctx->bwc_lock_class_keys); kfree(ctx->bwc_send_queue_locks); } @@ -977,10 +981,22 @@ static int hws_bwc_send_queues_init(struct mlx5hws_context *ctx) if (!ctx->bwc_send_queue_locks) return -ENOMEM; - for (i = 0; i < bwc_queues; i++) + ctx->bwc_lock_class_keys = kcalloc(bwc_queues, + sizeof(*ctx->bwc_lock_class_keys), + GFP_KERNEL); + if (!ctx->bwc_lock_class_keys) + goto err_lock_class_keys; + + for (i = 0; i < bwc_queues; i++) { mutex_init(&ctx->bwc_send_queue_locks[i]); + lockdep_register_key(ctx->bwc_lock_class_keys + i); + } return 0; + +err_lock_class_keys: + kfree(ctx->bwc_send_queue_locks); + return -ENOMEM; } int mlx5hws_send_queues_open(struct mlx5hws_context *ctx, -- cgit From d4f25be27e3ef7e23998fbd3dd4bff0602de7ae5 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Tue, 15 Oct 2024 12:32:05 +0300 Subject: net/mlx5: Check for invalid vector index on EQ creation Currently, mlx5 driver does not enforce vector index to be lower than the maximum number of supported completion vectors when requesting a new completion EQ. Thus, mlx5_comp_eqn_get() fails when trying to acquire an IRQ with an improper vector index. To prevent the case above, enforce that vector index value is valid and lower than maximum in mlx5_comp_eqn_get() before handling the request. Fixes: f14c1a14e632 ("net/mlx5: Allocate completion EQs dynamically") Signed-off-by: Maher Sanalla Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 2505f90c0b39..68cb86b37e56 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -1061,6 +1061,12 @@ int mlx5_comp_eqn_get(struct mlx5_core_dev *dev, u16 vecidx, int *eqn) struct mlx5_eq_comp *eq; int ret = 0; + if (vecidx >= table->max_comp_eqs) { + mlx5_core_dbg(dev, "Requested vector index %u should be less than %u", + vecidx, table->max_comp_eqs); + return -EINVAL; + } + mutex_lock(&table->comp_lock); eq = xa_load(&table->comp_eqs, vecidx); if (eq) { -- cgit From d62b14045c6511a7b2d4948d1a83a4e592deeb05 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 15 Oct 2024 12:32:06 +0300 Subject: net/mlx5: Fix command bitmask initialization Command bitmask have a dedicated bit for MANAGE_PAGES command, this bit isn't Initialize during command bitmask Initialization, only during MANAGE_PAGES. In addition, mlx5_cmd_trigger_completions() is trying to trigger completion for MANAGE_PAGES command as well. Hence, in case health error occurred before any MANAGE_PAGES command have been invoke (for example, during mlx5_enable_hca()), mlx5_cmd_trigger_completions() will try to trigger completion for MANAGE_PAGES command, which will result in null-ptr-deref error.[1] Fix it by Initialize command bitmask correctly. While at it, re-write the code for better understanding. [1] BUG: KASAN: null-ptr-deref in mlx5_cmd_trigger_completions+0x1db/0x600 [mlx5_core] Write of size 4 at addr 0000000000000214 by task kworker/u96:2/12078 CPU: 10 PID: 12078 Comm: kworker/u96:2 Not tainted 6.9.0-rc2_for_upstream_debug_2024_04_07_19_01 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: mlx5_health0000:08:00.0 mlx5_fw_fatal_reporter_err_work [mlx5_core] Call Trace: dump_stack_lvl+0x7e/0xc0 kasan_report+0xb9/0xf0 kasan_check_range+0xec/0x190 mlx5_cmd_trigger_completions+0x1db/0x600 [mlx5_core] mlx5_cmd_flush+0x94/0x240 [mlx5_core] enter_error_state+0x6c/0xd0 [mlx5_core] mlx5_fw_fatal_reporter_err_work+0xf3/0x480 [mlx5_core] process_one_work+0x787/0x1490 ? lockdep_hardirqs_on_prepare+0x400/0x400 ? pwq_dec_nr_in_flight+0xda0/0xda0 ? assign_work+0x168/0x240 worker_thread+0x586/0xd30 ? rescuer_thread+0xae0/0xae0 kthread+0x2df/0x3b0 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x2d/0x70 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork_asm+0x11/0x20 Fixes: 9b98d395b85d ("net/mlx5: Start health poll at earlier stage of driver load") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index a64d96effb9e..6bd8a18e3af3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1765,6 +1765,10 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force } } +#define MLX5_MAX_MANAGE_PAGES_CMD_ENT 1 +#define MLX5_CMD_MASK ((1UL << (cmd->vars.max_reg_cmds + \ + MLX5_MAX_MANAGE_PAGES_CMD_ENT)) - 1) + static void mlx5_cmd_trigger_completions(struct mlx5_core_dev *dev) { struct mlx5_cmd *cmd = &dev->cmd; @@ -1776,7 +1780,7 @@ static void mlx5_cmd_trigger_completions(struct mlx5_core_dev *dev) /* wait for pending handlers to complete */ mlx5_eq_synchronize_cmd_irq(dev); spin_lock_irqsave(&dev->cmd.alloc_lock, flags); - vector = ~dev->cmd.vars.bitmask & ((1ul << (1 << dev->cmd.vars.log_sz)) - 1); + vector = ~dev->cmd.vars.bitmask & MLX5_CMD_MASK; if (!vector) goto no_trig; @@ -2361,7 +2365,7 @@ int mlx5_cmd_enable(struct mlx5_core_dev *dev) cmd->state = MLX5_CMDIF_STATE_DOWN; cmd->vars.max_reg_cmds = (1 << cmd->vars.log_sz) - 1; - cmd->vars.bitmask = (1UL << cmd->vars.max_reg_cmds) - 1; + cmd->vars.bitmask = MLX5_CMD_MASK; sema_init(&cmd->vars.sem, cmd->vars.max_reg_cmds); sema_init(&cmd->vars.pages_sem, 1); -- cgit From 1da9cfd6c41c2e6bbe624d0568644e1521c33e12 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 15 Oct 2024 12:32:07 +0300 Subject: net/mlx5: Unregister notifier on eswitch init failure It otherwise remains registered and a subsequent attempt at eswitch enabling might trigger warnings of the sort: [ 682.589148] ------------[ cut here ]------------ [ 682.590204] notifier callback eswitch_vport_event [mlx5_core] already registered [ 682.590256] WARNING: CPU: 13 PID: 2660 at kernel/notifier.c:31 notifier_chain_register+0x3e/0x90 [...snipped] [ 682.610052] Call Trace: [ 682.610369] [ 682.610663] ? __warn+0x7c/0x110 [ 682.611050] ? notifier_chain_register+0x3e/0x90 [ 682.611556] ? report_bug+0x148/0x170 [ 682.611977] ? handle_bug+0x36/0x70 [ 682.612384] ? exc_invalid_op+0x13/0x60 [ 682.612817] ? asm_exc_invalid_op+0x16/0x20 [ 682.613284] ? notifier_chain_register+0x3e/0x90 [ 682.613789] atomic_notifier_chain_register+0x25/0x40 [ 682.614322] mlx5_eswitch_enable_locked+0x1d4/0x3b0 [mlx5_core] [ 682.614965] mlx5_eswitch_enable+0xc9/0x100 [mlx5_core] [ 682.615551] mlx5_device_enable_sriov+0x25/0x340 [mlx5_core] [ 682.616170] mlx5_core_sriov_configure+0x50/0x170 [mlx5_core] [ 682.616789] sriov_numvfs_store+0xb0/0x1b0 [ 682.617248] kernfs_fop_write_iter+0x117/0x1a0 [ 682.617734] vfs_write+0x231/0x3f0 [ 682.618138] ksys_write+0x63/0xe0 [ 682.618536] do_syscall_64+0x4c/0x100 [ 682.618958] entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: 7624e58a8b3a ("net/mlx5: E-switch, register event handler before arming the event") Signed-off-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 17f78091ad30..7aef30dbd82d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1489,7 +1489,7 @@ int mlx5_eswitch_enable_locked(struct mlx5_eswitch *esw, int num_vfs) } if (err) - goto abort; + goto err_esw_enable; esw->fdb_table.flags |= MLX5_ESW_FDB_CREATED; @@ -1503,7 +1503,8 @@ int mlx5_eswitch_enable_locked(struct mlx5_eswitch *esw, int num_vfs) return 0; -abort: +err_esw_enable: + mlx5_eq_notifier_unregister(esw->dev, &esw->nb); mlx5_esw_acls_ns_cleanup(esw); return err; } -- cgit From 4dbc1d1a9f39c3711ad2a40addca04d07d9ab5d0 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 15 Oct 2024 12:32:08 +0300 Subject: net/mlx5e: Don't call cleanup on profile rollback failure When profile rollback fails in mlx5e_netdev_change_profile, the netdev profile var is left set to NULL. Avoid a crash when unloading the driver by not calling profile->cleanup in such a case. This was encountered while testing, with the original trigger that the wq rescuer thread creation got interrupted (presumably due to Ctrl+C-ing modprobe), which gets converted to ENOMEM (-12) by mlx5e_priv_init, the profile rollback also fails for the same reason (signal still active) so the profile is left as NULL, leading to a crash later in _mlx5e_remove. [ 732.473932] mlx5_core 0000:08:00.1: E-Switch: Unload vfs: mode(OFFLOADS), nvfs(2), necvfs(0), active vports(2) [ 734.525513] workqueue: Failed to create a rescuer kthread for wq "mlx5e": -EINTR [ 734.557372] mlx5_core 0000:08:00.1: mlx5e_netdev_init_profile:6235:(pid 6086): mlx5e_priv_init failed, err=-12 [ 734.559187] mlx5_core 0000:08:00.1 eth3: mlx5e_netdev_change_profile: new profile init failed, -12 [ 734.560153] workqueue: Failed to create a rescuer kthread for wq "mlx5e": -EINTR [ 734.589378] mlx5_core 0000:08:00.1: mlx5e_netdev_init_profile:6235:(pid 6086): mlx5e_priv_init failed, err=-12 [ 734.591136] mlx5_core 0000:08:00.1 eth3: mlx5e_netdev_change_profile: failed to rollback to orig profile, -12 [ 745.537492] BUG: kernel NULL pointer dereference, address: 0000000000000008 [ 745.538222] #PF: supervisor read access in kernel mode [ 745.551290] Call Trace: [ 745.551590] [ 745.551866] ? __die+0x20/0x60 [ 745.552218] ? page_fault_oops+0x150/0x400 [ 745.555307] ? exc_page_fault+0x79/0x240 [ 745.555729] ? asm_exc_page_fault+0x22/0x30 [ 745.556166] ? mlx5e_remove+0x6b/0xb0 [mlx5_core] [ 745.556698] auxiliary_bus_remove+0x18/0x30 [ 745.557134] device_release_driver_internal+0x1df/0x240 [ 745.557654] bus_remove_device+0xd7/0x140 [ 745.558075] device_del+0x15b/0x3c0 [ 745.558456] mlx5_rescan_drivers_locked.part.0+0xb1/0x2f0 [mlx5_core] [ 745.559112] mlx5_unregister_device+0x34/0x50 [mlx5_core] [ 745.559686] mlx5_uninit_one+0x46/0xf0 [mlx5_core] [ 745.560203] remove_one+0x4e/0xd0 [mlx5_core] [ 745.560694] pci_device_remove+0x39/0xa0 [ 745.561112] device_release_driver_internal+0x1df/0x240 [ 745.561631] driver_detach+0x47/0x90 [ 745.562022] bus_remove_driver+0x84/0x100 [ 745.562444] pci_unregister_driver+0x3b/0x90 [ 745.562890] mlx5_cleanup+0xc/0x1b [mlx5_core] [ 745.563415] __x64_sys_delete_module+0x14d/0x2f0 [ 745.563886] ? kmem_cache_free+0x1b0/0x460 [ 745.564313] ? lockdep_hardirqs_on_prepare+0xe2/0x190 [ 745.564825] do_syscall_64+0x6d/0x140 [ 745.565223] entry_SYSCALL_64_after_hwframe+0x4b/0x53 [ 745.565725] RIP: 0033:0x7f1579b1288b Fixes: 3ef14e463f6e ("net/mlx5e: Separate between netdev objects and mlx5e profiles initialization") Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index a5659c0c4236..e601324a690a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -6509,7 +6509,9 @@ static void _mlx5e_remove(struct auxiliary_device *adev) mlx5e_dcbnl_delete_app(priv); unregister_netdev(priv->netdev); _mlx5e_suspend(adev, false); - priv->profile->cleanup(priv); + /* Avoid cleanup if profile rollback failed. */ + if (priv->profile) + priv->profile->cleanup(priv); mlx5e_destroy_netdev(priv); mlx5e_devlink_port_unregister(mlx5e_dev); mlx5e_destroy_devlink(mlx5e_dev); -- cgit