From 989cd9fd1ffe1a964429325f9092ea8f0db3f953 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Tue, 19 Dec 2023 18:25:16 +0200 Subject: wifi: p54: fix GCC format truncation warning with wiphy->fw_version GCC 13.2 warns: drivers/net/wireless/intersil/p54/fwio.c:128:34: warning: '%s' directive output may be truncated writing up to 39 bytes into a region of size 32 [-Wformat-truncation=] drivers/net/wireless/intersil/p54/fwio.c:128:33: note: directive argument in the range [0, 16777215] drivers/net/wireless/intersil/p54/fwio.c:128:33: note: directive argument in the range [0, 255] drivers/net/wireless/intersil/p54/fwio.c:127:17: note: 'snprintf' output between 7 and 52 bytes into a destination of size 32 The issue here is that wiphy->fw_version is 32 bytes and in theory the string we try to place there can be 39 bytes. wiphy->fw_version is used for providing the firmware version to user space via ethtool, so not really important. fw_version in theory can be 24 bytes but in practise it's shorter, so even if print only 19 bytes via ethtool there should not be any practical difference. I did consider removing fw_var from the string altogether or making the maximum length for fw_version 19 bytes, but chose this approach as it was the least intrusive. Compile tested only. Signed-off-by: Kalle Valo Acked-by: Christian Lamparter # Tested with Dell 1450 USB Signed-off-by: Kalle Valo Link: https://msgid.link/20231219162516.898205-1-kvalo@kernel.org --- drivers/net/wireless/intersil/p54/fwio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intersil/p54/fwio.c b/drivers/net/wireless/intersil/p54/fwio.c index b52cce38115d..c4fe70e05b9b 100644 --- a/drivers/net/wireless/intersil/p54/fwio.c +++ b/drivers/net/wireless/intersil/p54/fwio.c @@ -125,7 +125,7 @@ int p54_parse_firmware(struct ieee80211_hw *dev, const struct firmware *fw) "FW rev %s - Softmac protocol %x.%x\n", fw_version, priv->fw_var >> 8, priv->fw_var & 0xff); snprintf(dev->wiphy->fw_version, sizeof(dev->wiphy->fw_version), - "%s - %x.%x", fw_version, + "%.19s - %x.%x", fw_version, priv->fw_var >> 8, priv->fw_var & 0xff); } -- cgit From 556857aa1d0855aba02b1c63bc52b91ec63fc2cc Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 15 Jan 2024 11:18:05 +0100 Subject: wifi: ath11k: rely on mac80211 debugfs handling for vif mac80211 started to delete debugfs entries in certain cases, causing a ath11k to crash when it tried to delete the entries later. Fix this by relying on mac80211 to delete the entries when appropriate and adding them from the vif_add_debugfs handler. Fixes: 0a3d898ee9a8 ("wifi: mac80211: add/remove driver debugfs entries as appropriate") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218364 Signed-off-by: Benjamin Berg Acked-by: Jeff Johnson Signed-off-by: Kalle Valo Link: https://msgid.link/20240115101805.1277949-1-benjamin@sipsolutions.net --- drivers/net/wireless/ath/ath11k/core.h | 4 ---- drivers/net/wireless/ath/ath11k/debugfs.c | 25 ++++++++++--------------- drivers/net/wireless/ath/ath11k/debugfs.h | 12 ++---------- drivers/net/wireless/ath/ath11k/mac.c | 12 +----------- 4 files changed, 13 insertions(+), 40 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/core.h b/drivers/net/wireless/ath/ath11k/core.h index f12b606e2d2e..667d55e26156 100644 --- a/drivers/net/wireless/ath/ath11k/core.h +++ b/drivers/net/wireless/ath/ath11k/core.h @@ -368,10 +368,6 @@ struct ath11k_vif { struct ieee80211_chanctx_conf chanctx; struct ath11k_arp_ns_offload arp_ns_offload; struct ath11k_rekey_data rekey_data; - -#ifdef CONFIG_ATH11K_DEBUGFS - struct dentry *debugfs_twt; -#endif /* CONFIG_ATH11K_DEBUGFS */ }; struct ath11k_vif_iter { diff --git a/drivers/net/wireless/ath/ath11k/debugfs.c b/drivers/net/wireless/ath/ath11k/debugfs.c index be76e7d1c436..0796f4d92b47 100644 --- a/drivers/net/wireless/ath/ath11k/debugfs.c +++ b/drivers/net/wireless/ath/ath11k/debugfs.c @@ -1893,35 +1893,30 @@ static const struct file_operations ath11k_fops_twt_resume_dialog = { .open = simple_open }; -void ath11k_debugfs_add_interface(struct ath11k_vif *arvif) +void ath11k_debugfs_op_vif_add(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) { + struct ath11k_vif *arvif = ath11k_vif_to_arvif(vif); struct ath11k_base *ab = arvif->ar->ab; + struct dentry *debugfs_twt; if (arvif->vif->type != NL80211_IFTYPE_AP && !(arvif->vif->type == NL80211_IFTYPE_STATION && test_bit(WMI_TLV_SERVICE_STA_TWT, ab->wmi_ab.svc_map))) return; - arvif->debugfs_twt = debugfs_create_dir("twt", - arvif->vif->debugfs_dir); - debugfs_create_file("add_dialog", 0200, arvif->debugfs_twt, + debugfs_twt = debugfs_create_dir("twt", + arvif->vif->debugfs_dir); + debugfs_create_file("add_dialog", 0200, debugfs_twt, arvif, &ath11k_fops_twt_add_dialog); - debugfs_create_file("del_dialog", 0200, arvif->debugfs_twt, + debugfs_create_file("del_dialog", 0200, debugfs_twt, arvif, &ath11k_fops_twt_del_dialog); - debugfs_create_file("pause_dialog", 0200, arvif->debugfs_twt, + debugfs_create_file("pause_dialog", 0200, debugfs_twt, arvif, &ath11k_fops_twt_pause_dialog); - debugfs_create_file("resume_dialog", 0200, arvif->debugfs_twt, + debugfs_create_file("resume_dialog", 0200, debugfs_twt, arvif, &ath11k_fops_twt_resume_dialog); } -void ath11k_debugfs_remove_interface(struct ath11k_vif *arvif) -{ - if (!arvif->debugfs_twt) - return; - - debugfs_remove_recursive(arvif->debugfs_twt); - arvif->debugfs_twt = NULL; -} diff --git a/drivers/net/wireless/ath/ath11k/debugfs.h b/drivers/net/wireless/ath/ath11k/debugfs.h index 3af0169f6cf2..6f630b42e95c 100644 --- a/drivers/net/wireless/ath/ath11k/debugfs.h +++ b/drivers/net/wireless/ath/ath11k/debugfs.h @@ -306,8 +306,8 @@ static inline int ath11k_debugfs_rx_filter(struct ath11k *ar) return ar->debug.rx_filter; } -void ath11k_debugfs_add_interface(struct ath11k_vif *arvif); -void ath11k_debugfs_remove_interface(struct ath11k_vif *arvif); +void ath11k_debugfs_op_vif_add(struct ieee80211_hw *hw, + struct ieee80211_vif *vif); void ath11k_debugfs_add_dbring_entry(struct ath11k *ar, enum wmi_direct_buffer_module id, enum ath11k_dbg_dbr_event event, @@ -386,14 +386,6 @@ static inline int ath11k_debugfs_get_fw_stats(struct ath11k *ar, return 0; } -static inline void ath11k_debugfs_add_interface(struct ath11k_vif *arvif) -{ -} - -static inline void ath11k_debugfs_remove_interface(struct ath11k_vif *arvif) -{ -} - static inline void ath11k_debugfs_add_dbring_entry(struct ath11k *ar, enum wmi_direct_buffer_module id, diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 7f7b39817773..71c6dab1aedb 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -6750,13 +6750,6 @@ static int ath11k_mac_op_add_interface(struct ieee80211_hw *hw, goto err; } - /* In the case of hardware recovery, debugfs files are - * not deleted since ieee80211_ops.remove_interface() is - * not invoked. In such cases, try to delete the files. - * These will be re-created later. - */ - ath11k_debugfs_remove_interface(arvif); - memset(arvif, 0, sizeof(*arvif)); arvif->ar = ar; @@ -6933,8 +6926,6 @@ static int ath11k_mac_op_add_interface(struct ieee80211_hw *hw, ath11k_dp_vdev_tx_attach(ar, arvif); - ath11k_debugfs_add_interface(arvif); - if (vif->type != NL80211_IFTYPE_MONITOR && test_bit(ATH11K_FLAG_MONITOR_CONF_ENABLED, &ar->monitor_flags)) { ret = ath11k_mac_monitor_vdev_create(ar); @@ -7050,8 +7041,6 @@ err_vdev_del: /* Recalc txpower for remaining vdev */ ath11k_mac_txpower_recalc(ar); - ath11k_debugfs_remove_interface(arvif); - /* TODO: recal traffic pause state based on the available vdevs */ mutex_unlock(&ar->conf_mutex); @@ -9149,6 +9138,7 @@ static const struct ieee80211_ops ath11k_ops = { #endif #ifdef CONFIG_ATH11K_DEBUGFS + .vif_add_debugfs = ath11k_debugfs_op_vif_add, .sta_add_debugfs = ath11k_debugfs_sta_op_add, #endif -- cgit From a6e4f85d3820d00694ed10f581f4c650445dbcda Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Tue, 16 Jan 2024 14:22:57 +0000 Subject: wifi: cfg80211: fix missing interfaces when dumping The nl80211_dump_interface() supports resumption in case nl80211_send_iface() doesn't have the resources to complete its work. The logic would store the progress as iteration offsets for rdev and wdev loops. However the logic did not properly handle resumption for non-last rdev. Assuming a system with 2 rdevs, with 2 wdevs each, this could happen: dump(cb=[0, 0]): if_start=cb[1] (=0) send rdev0.wdev0 -> ok send rdev0.wdev1 -> yield cb[1] = 1 dump(cb=[0, 1]): if_start=cb[1] (=1) send rdev0.wdev1 -> ok // since if_start=1 the rdev0.wdev0 got skipped // through if_idx < if_start send rdev1.wdev1 -> ok The if_start needs to be reset back to 0 upon wdev loop end. The problem is actually hard to hit on a desktop, and even on most routers. The prerequisites for this manifesting was: - more than 1 wiphy - a few handful of interfaces - dump without rdev or wdev filter I was seeing this with 4 wiphys 9 interfaces each. It'd miss 6 interfaces from the last wiphy reported to userspace. Signed-off-by: Michal Kazior Link: https://msgid.link/20240116142340.89678-1-kazikcz@gmail.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1cbbb11ea503..fbf95b7ff6b4 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4008,6 +4008,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * } wiphy_unlock(&rdev->wiphy); + if_start = 0; wp_idx++; } out: -- cgit From 26490da5a71da9064e58f0d4ce82756c26ef9eb1 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 18 Jan 2024 09:25:45 +0100 Subject: wifi: cfg80211/mac80211: remove dependency on non-existing option Commit ffbd0c8c1e7f ("wifi: mac80211: add an element parsing unit test") and commit 730eeb17bbdd ("wifi: cfg80211: add first kunit tests, for element defrag") add new configs that depend on !KERNEL_6_2, but the config option KERNEL_6_2 does not exist in the tree. This dependency is used for handling backporting to restrict the option to certain kernels but this really should not be carried around the mainline kernel tree. Clean up this needless dependency on the non-existing option KERNEL_6_2. Link: https://lore.kernel.org/lkml/CAKXUXMyfrM6amOR7Ysim3WNQ-Ckf9HJDqRhAoYmLXujo1UV+yA@mail.gmail.com/ Signed-off-by: Lukas Bulwahn Signed-off-by: Johannes Berg --- net/mac80211/Kconfig | 1 - net/wireless/Kconfig | 1 - 2 files changed, 2 deletions(-) diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index cb0291decf2e..13438cc0a6b1 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -62,7 +62,6 @@ config MAC80211_KUNIT_TEST depends on KUNIT depends on MAC80211 default KUNIT_ALL_TESTS - depends on !KERNEL_6_2 help Enable this option to test mac80211 internals with kunit. diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index a9ac85e09af3..10345388ad13 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -206,7 +206,6 @@ config CFG80211_KUNIT_TEST depends on KUNIT depends on CFG80211 default KUNIT_ALL_TESTS - depends on !KERNEL_6_2 help Enable this option to test cfg80211 functions with kunit. -- cgit From b01a74b3ca6fd51b62c67733ba7c3280fa6c5d26 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 11 Jan 2024 18:17:44 +0200 Subject: wifi: mac80211: fix potential sta-link leak When a station is allocated, links are added but not set to valid yet (e.g. during connection to an AP MLD), we might remove the station without ever marking links valid, and leak them. Fix that. Fixes: cb71f1d136a6 ("wifi: mac80211: add sta link addition/removal") Signed-off-by: Johannes Berg Reviewed-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://msgid.link/20240111181514.6573998beaf8.I09ac2e1d41c80f82a5a616b8bd1d9d8dd709a6a6@changeid Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 0ba613dd1cc4..c33decbb97f2 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -404,7 +404,10 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) int i; for (i = 0; i < ARRAY_SIZE(sta->link); i++) { - if (!(sta->sta.valid_links & BIT(i))) + struct link_sta_info *link_sta; + + link_sta = rcu_access_pointer(sta->link[i]); + if (!link_sta) continue; sta_remove_link(sta, i, false); -- cgit From cf4a0d840ecc72fcf16198d5e9c505ab7d5a5e4d Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Thu, 11 Jan 2024 15:07:25 +0200 Subject: wifi: iwlwifi: fix a memory corruption iwl_fw_ini_trigger_tlv::data is a pointer to a __le32, which means that if we copy to iwl_fw_ini_trigger_tlv::data + offset while offset is in bytes, we'll write past the buffer. Cc: stable@vger.kernel.org Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218233 Fixes: cf29c5b66b9f ("iwlwifi: dbg_ini: implement time point handling") Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://msgid.link/20240111150610.2d2b8b870194.I14ed76505a5cf87304e0c9cc05cc0ae85ed3bf91@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c b/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c index b658cf228fbe..9160d81a871e 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #include #include "iwl-drv.h" @@ -1096,7 +1096,7 @@ static int iwl_dbg_tlv_override_trig_node(struct iwl_fw_runtime *fwrt, node_trig = (void *)node_tlv->data; } - memcpy(node_trig->data + offset, trig->data, trig_data_len); + memcpy((u8 *)node_trig->data + offset, trig->data, trig_data_len); node_tlv->length = cpu_to_le32(size); if (policy & IWL_FW_INI_APPLY_POLICY_OVERRIDE_CFG) { -- cgit From bcbc84af1183c8cf3d1ca9b78540c2185cd85e7f Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Thu, 4 Jan 2024 19:10:59 +0100 Subject: wifi: mac80211: fix race condition on enabling fast-xmit fast-xmit must only be enabled after the sta has been uploaded to the driver, otherwise it could end up passing the not-yet-uploaded sta via drv_tx calls to the driver, leading to potential crashes because of uninitialized drv_priv data. Add a missing sta->uploaded check and re-check fast xmit after inserting a sta. Signed-off-by: Felix Fietkau Link: https://msgid.link/20240104181059.84032-1-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 2 ++ net/mac80211/tx.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index c33decbb97f2..bcf3f727fc6d 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -913,6 +913,8 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_accept_plinks_update(sdata); + ieee80211_check_fast_xmit(sta); + return 0; out_remove: if (sta->sta.valid_links) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index ed4fdf655343..4b2823e36a37 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3048,7 +3048,7 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) sdata->vif.type == NL80211_IFTYPE_STATION) goto out; - if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED) || !sta->uploaded) goto out; if (test_sta_flag(sta, WLAN_STA_PS_STA) || -- cgit From dbc153fd3c142909e564bb256da087e13fbf239c Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 18 Jan 2024 12:32:10 +0800 Subject: net/smc: fix illegal rmb_desc access in SMC-D connection dump A crash was found when dumping SMC-D connections. It can be reproduced by following steps: - run nginx/wrk test: smc_run nginx smc_run wrk -t 16 -c 1000 -d -H 'Connection: Close' - continuously dump SMC-D connections in parallel: watch -n 1 'smcss -D' BUG: kernel NULL pointer dereference, address: 0000000000000030 CPU: 2 PID: 7204 Comm: smcss Kdump: loaded Tainted: G E 6.7.0+ #55 RIP: 0010:__smc_diag_dump.constprop.0+0x5e5/0x620 [smc_diag] Call Trace: ? __die+0x24/0x70 ? page_fault_oops+0x66/0x150 ? exc_page_fault+0x69/0x140 ? asm_exc_page_fault+0x26/0x30 ? __smc_diag_dump.constprop.0+0x5e5/0x620 [smc_diag] ? __kmalloc_node_track_caller+0x35d/0x430 ? __alloc_skb+0x77/0x170 smc_diag_dump_proto+0xd0/0xf0 [smc_diag] smc_diag_dump+0x26/0x60 [smc_diag] netlink_dump+0x19f/0x320 __netlink_dump_start+0x1dc/0x300 smc_diag_handler_dump+0x6a/0x80 [smc_diag] ? __pfx_smc_diag_dump+0x10/0x10 [smc_diag] sock_diag_rcv_msg+0x121/0x140 ? __pfx_sock_diag_rcv_msg+0x10/0x10 netlink_rcv_skb+0x5a/0x110 sock_diag_rcv+0x28/0x40 netlink_unicast+0x22a/0x330 netlink_sendmsg+0x1f8/0x420 __sock_sendmsg+0xb0/0xc0 ____sys_sendmsg+0x24e/0x300 ? copy_msghdr_from_user+0x62/0x80 ___sys_sendmsg+0x7c/0xd0 ? __do_fault+0x34/0x160 ? do_read_fault+0x5f/0x100 ? do_fault+0xb0/0x110 ? __handle_mm_fault+0x2b0/0x6c0 __sys_sendmsg+0x4d/0x80 do_syscall_64+0x69/0x180 entry_SYSCALL_64_after_hwframe+0x6e/0x76 It is possible that the connection is in process of being established when we dump it. Assumed that the connection has been registered in a link group by smc_conn_create() but the rmb_desc has not yet been initialized by smc_buf_create(), thus causing the illegal access to conn->rmb_desc. So fix it by checking before dump. Fixes: 4b1b7d3b30a6 ("net/smc: add SMC-D diag support") Signed-off-by: Wen Gu Reviewed-by: Dust Li Reviewed-by: Wenjia Zhang Signed-off-by: David S. Miller --- net/smc/smc_diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 52f7c4f1e767..5a33908015f3 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -164,7 +164,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, } if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && - !list_empty(&smc->conn.lgr->list)) { + !list_empty(&smc->conn.lgr->list) && smc->conn.rmb_desc) { struct smc_connection *conn = &smc->conn; struct smcd_diag_dmbinfo dinfo; struct smcd_dev *smcd = conn->lgr->smcd; -- cgit From b01f15a7571b7aa222458bc9bf26ab59bd84e384 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Wed, 17 Jan 2024 19:12:32 -0500 Subject: selftests: bonding: Increase timeout to 1200s When tests are run by runner.sh, bond_options.sh gets killed before it can complete: make -C tools/testing/selftests run_tests TARGETS="drivers/net/bonding" [...] # timeout set to 120 # selftests: drivers/net/bonding: bond_options.sh # TEST: prio (active-backup miimon primary_reselect 0) [ OK ] # TEST: prio (active-backup miimon primary_reselect 1) [ OK ] # TEST: prio (active-backup miimon primary_reselect 2) [ OK ] # TEST: prio (active-backup arp_ip_target primary_reselect 0) [ OK ] # TEST: prio (active-backup arp_ip_target primary_reselect 1) [ OK ] # TEST: prio (active-backup arp_ip_target primary_reselect 2) [ OK ] # not ok 7 selftests: drivers/net/bonding: bond_options.sh # TIMEOUT 120 seconds This test includes many sleep statements, at least some of which are related to timers in the operation of the bonding driver itself. Increase the test timeout to allow the test to complete. I ran the test in slightly different VMs (including one without HW virtualization support) and got runtimes of 13m39.760s, 13m31.238s, and 13m2.956s. Use a ~1.5x "safety factor" and set the timeout to 1200s. Fixes: 42a8d4aaea84 ("selftests: bonding: add bonding prio option test") Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20240116104402.1203850a@kernel.org/#t Suggested-by: Jakub Kicinski Signed-off-by: Benjamin Poirier Reviewed-by: Hangbin Liu Link: https://lore.kernel.org/r/20240118001233.304759-1-bpoirier@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/bonding/settings | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/bonding/settings b/tools/testing/selftests/drivers/net/bonding/settings index 6091b45d226b..79b65bdf05db 100644 --- a/tools/testing/selftests/drivers/net/bonding/settings +++ b/tools/testing/selftests/drivers/net/bonding/settings @@ -1 +1 @@ -timeout=120 +timeout=1200 -- cgit From 198bc90e0e734e5f98c3d2833e8390cac3df61b2 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Thu, 18 Jan 2024 09:20:19 +0800 Subject: tcp: make sure init the accept_queue's spinlocks once When I run syz's reproduction C program locally, it causes the following issue: pvqspinlock: lock 0xffff9d181cd5c660 has corrupted value 0x0! WARNING: CPU: 19 PID: 21160 at __pv_queued_spin_unlock_slowpath (kernel/locking/qspinlock_paravirt.h:508) Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:__pv_queued_spin_unlock_slowpath (kernel/locking/qspinlock_paravirt.h:508) Code: 73 56 3a ff 90 c3 cc cc cc cc 8b 05 bb 1f 48 01 85 c0 74 05 c3 cc cc cc cc 8b 17 48 89 fe 48 c7 c7 30 20 ce 8f e8 ad 56 42 ff <0f> 0b c3 cc cc cc cc 0f 0b 0f 1f 40 00 90 90 90 90 90 90 90 90 90 RSP: 0018:ffffa8d200604cb8 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff9d1ef60e0908 RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff9d1ef60e0900 RBP: ffff9d181cd5c280 R08: 0000000000000000 R09: 00000000ffff7fff R10: ffffa8d200604b68 R11: ffffffff907dcdc8 R12: 0000000000000000 R13: ffff9d181cd5c660 R14: ffff9d1813a3f330 R15: 0000000000001000 FS: 00007fa110184640(0000) GS:ffff9d1ef60c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000000 CR3: 000000011f65e000 CR4: 00000000000006f0 Call Trace: _raw_spin_unlock (kernel/locking/spinlock.c:186) inet_csk_reqsk_queue_add (net/ipv4/inet_connection_sock.c:1321) inet_csk_complete_hashdance (net/ipv4/inet_connection_sock.c:1358) tcp_check_req (net/ipv4/tcp_minisocks.c:868) tcp_v4_rcv (net/ipv4/tcp_ipv4.c:2260) ip_protocol_deliver_rcu (net/ipv4/ip_input.c:205) ip_local_deliver_finish (net/ipv4/ip_input.c:234) __netif_receive_skb_one_core (net/core/dev.c:5529) process_backlog (./include/linux/rcupdate.h:779) __napi_poll (net/core/dev.c:6533) net_rx_action (net/core/dev.c:6604) __do_softirq (./arch/x86/include/asm/jump_label.h:27) do_softirq (kernel/softirq.c:454 kernel/softirq.c:441) __local_bh_enable_ip (kernel/softirq.c:381) __dev_queue_xmit (net/core/dev.c:4374) ip_finish_output2 (./include/net/neighbour.h:540 net/ipv4/ip_output.c:235) __ip_queue_xmit (net/ipv4/ip_output.c:535) __tcp_transmit_skb (net/ipv4/tcp_output.c:1462) tcp_rcv_synsent_state_process (net/ipv4/tcp_input.c:6469) tcp_rcv_state_process (net/ipv4/tcp_input.c:6657) tcp_v4_do_rcv (net/ipv4/tcp_ipv4.c:1929) __release_sock (./include/net/sock.h:1121 net/core/sock.c:2968) release_sock (net/core/sock.c:3536) inet_wait_for_connect (net/ipv4/af_inet.c:609) __inet_stream_connect (net/ipv4/af_inet.c:702) inet_stream_connect (net/ipv4/af_inet.c:748) __sys_connect (./include/linux/file.h:45 net/socket.c:2064) __x64_sys_connect (net/socket.c:2073 net/socket.c:2070 net/socket.c:2070) do_syscall_64 (arch/x86/entry/common.c:51 arch/x86/entry/common.c:82) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) RIP: 0033:0x7fa10ff05a3d Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ab a3 0e 00 f7 d8 64 89 01 48 RSP: 002b:00007fa110183de8 EFLAGS: 00000202 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 0000000020000054 RCX: 00007fa10ff05a3d RDX: 000000000000001c RSI: 0000000020000040 RDI: 0000000000000003 RBP: 00007fa110183e20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000202 R12: 00007fa110184640 R13: 0000000000000000 R14: 00007fa10fe8b060 R15: 00007fff73e23b20 The issue triggering process is analyzed as follows: Thread A Thread B tcp_v4_rcv //receive ack TCP packet inet_shutdown tcp_check_req tcp_disconnect //disconnect sock ... tcp_set_state(sk, TCP_CLOSE) inet_csk_complete_hashdance ... inet_csk_reqsk_queue_add inet_listen //start listen spin_lock(&queue->rskq_lock) inet_csk_listen_start ... reqsk_queue_alloc ... spin_lock_init spin_unlock(&queue->rskq_lock) //warning When the socket receives the ACK packet during the three-way handshake, it will hold spinlock. And then the user actively shutdowns the socket and listens to the socket immediately, the spinlock will be initialized. When the socket is going to release the spinlock, a warning is generated. Also the same issue to fastopenq.lock. Move init spinlock to inet_create and inet_accept to make sure init the accept_queue's spinlocks once. Fixes: fff1f3001cc5 ("tcp: add a spinlock to protect struct request_sock_queue") Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path") Reported-by: Ming Shu Signed-off-by: Zhengchao Shao Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240118012019.1751966-1-shaozhengchao@huawei.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 8 ++++++++ net/core/request_sock.c | 3 --- net/ipv4/af_inet.c | 3 +++ net/ipv4/inet_connection_sock.c | 4 ++++ 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index d0a2f827d5f2..9ab4bf704e86 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -357,4 +357,12 @@ static inline bool inet_csk_has_ulp(const struct sock *sk) return inet_test_bit(IS_ICSK, sk) && !!inet_csk(sk)->icsk_ulp_ops; } +static inline void inet_init_csk_locks(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + spin_lock_init(&icsk->icsk_accept_queue.rskq_lock); + spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock); +} + #endif /* _INET_CONNECTION_SOCK_H */ diff --git a/net/core/request_sock.c b/net/core/request_sock.c index f35c2e998406..63de5c635842 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -33,9 +33,6 @@ void reqsk_queue_alloc(struct request_sock_queue *queue) { - spin_lock_init(&queue->rskq_lock); - - spin_lock_init(&queue->fastopenq.lock); queue->fastopenq.rskq_rst_head = NULL; queue->fastopenq.rskq_rst_tail = NULL; queue->fastopenq.qlen = 0; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 835f4f9d98d2..4e635dd3d3c8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -330,6 +330,9 @@ lookup_protocol: if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; + if (INET_PROTOSW_ICSK & answer_flags) + inet_init_csk_locks(sk); + inet = inet_sk(sk); inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8e2eb1793685..459af1f89739 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -727,6 +727,10 @@ out: } if (req) reqsk_put(req); + + if (newsk) + inet_init_csk_locks(newsk); + return newsk; out_err: newsk = NULL; -- cgit From 3c1069fa42872f95cf3c6fedf80723d391e12d57 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 17 Jan 2024 15:45:11 -0800 Subject: bnxt_en: Wait for FLR to complete during probe The first message to firmware may fail if the device is undergoing FLR. The driver has some recovery logic for this failure scenario but we must wait 100 msec for FLR to complete before proceeding. Otherwise the recovery will always fail. Fixes: ba02629ff6cb ("bnxt_en: log firmware status on firmware init failure") Reviewed-by: Damodharam Ammepalli Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240117234515.226944-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 0aacd3c6ed5c..0866aba35d9b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -13232,6 +13232,11 @@ static int bnxt_fw_init_one_p1(struct bnxt *bp) bp->fw_cap = 0; rc = bnxt_hwrm_ver_get(bp); + /* FW may be unresponsive after FLR. FLR must complete within 100 msec + * so wait before continuing with recovery. + */ + if (rc) + msleep(100); bnxt_try_map_fw_health_reg(bp); if (rc) { rc = bnxt_try_recover_fw(bp); -- cgit From 2ad8e57338ac7b1e149d458669a95132e2460096 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 17 Jan 2024 15:45:12 -0800 Subject: bnxt_en: Fix memory leak in bnxt_hwrm_get_rings() bnxt_hwrm_get_rings() can abort and return error when there are not enough ring resources. It aborts without releasing the HWRM DMA buffer, causing a dma_pool_destroy warning when the driver is unloaded: bnxt_en 0000:99:00.0: dma_pool_destroy bnxt_hwrm, 000000005b089ba8 busy Fixes: f1e50b276d37 ("bnxt_en: Fix trimming of P5 RX and TX rings") Reviewed-by: Somnath Kotur Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240117234515.226944-3-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 0866aba35d9b..9fdc90bfce38 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6926,7 +6926,7 @@ static int bnxt_hwrm_get_rings(struct bnxt *bp) if (cp < (rx + tx)) { rc = __bnxt_trim_rings(bp, &rx, &tx, cp, false); if (rc) - return rc; + goto get_rings_exit; if (bp->flags & BNXT_FLAG_AGG_RINGS) rx <<= 1; hw_resc->resv_rx_rings = rx; @@ -6938,8 +6938,9 @@ static int bnxt_hwrm_get_rings(struct bnxt *bp) hw_resc->resv_cp_rings = cp; hw_resc->resv_stat_ctxs = stats; } +get_rings_exit: hwrm_req_drop(bp, req); - return 0; + return rc; } int __bnxt_hwrm_get_tx_rings(struct bnxt *bp, u16 fid, int *tx_rings) -- cgit From 523384a6aa095d3f3d9ee8b1a4e289d4311cd2d9 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 17 Jan 2024 15:45:13 -0800 Subject: bnxt_en: Fix RSS table entries calculation for P5_PLUS chips The existing formula used in the driver to calculate the number of RSS table entries is to round up the number of RX rings to the next integer multiples of 64 (e.g. 64, 128, 192, ..). This is incorrect. The valid values supported by the chip are 64, 128, 256, 512 only (power of 2 starting from 64). When the number of RX rings is greater than 128, the entry size will likely be wrong. Firmware will round down the invalid value (e.g. 192 rounded down to 128) provided by the driver, causing some RSS rings to not receive any packets. We already have an existing function bnxt_calc_nr_ring_pages() to do this calculation. Use it in bnxt_get_nr_rss_ctxs() to calculate the number of RSS contexts correctly for P5_PLUS chips. Reviewed-by: Andy Gospodarek Reviewed-by: Pavan Chebbi Fixes: 7b3af4f75b81 ("bnxt_en: Add RSS support for 57500 chips.") Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240117234515.226944-4-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 17 ++++++++++++----- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 3 ++- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 9fdc90bfce38..3d090d4403df 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5935,8 +5935,12 @@ static u16 bnxt_get_max_rss_ring(struct bnxt *bp) int bnxt_get_nr_rss_ctxs(struct bnxt *bp, int rx_rings) { - if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) - return DIV_ROUND_UP(rx_rings, BNXT_RSS_TABLE_ENTRIES_P5); + if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) { + if (!rx_rings) + return 0; + return bnxt_calc_nr_ring_pages(rx_rings - 1, + BNXT_RSS_TABLE_ENTRIES_P5); + } if (BNXT_CHIP_TYPE_NITRO_A0(bp)) return 2; return 1; @@ -7001,10 +7005,11 @@ __bnxt_hwrm_reserve_pf_rings(struct bnxt *bp, int tx_rings, int rx_rings, req->num_rx_rings = cpu_to_le16(rx_rings); if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) { + u16 rss_ctx = bnxt_get_nr_rss_ctxs(bp, ring_grps); + req->num_cmpl_rings = cpu_to_le16(tx_rings + ring_grps); req->num_msix = cpu_to_le16(cp_rings); - req->num_rsscos_ctxs = - cpu_to_le16(DIV_ROUND_UP(ring_grps, 64)); + req->num_rsscos_ctxs = cpu_to_le16(rss_ctx); } else { req->num_cmpl_rings = cpu_to_le16(cp_rings); req->num_hw_ring_grps = cpu_to_le16(ring_grps); @@ -7051,8 +7056,10 @@ __bnxt_hwrm_reserve_vf_rings(struct bnxt *bp, int tx_rings, int rx_rings, req->num_tx_rings = cpu_to_le16(tx_rings); req->num_rx_rings = cpu_to_le16(rx_rings); if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) { + u16 rss_ctx = bnxt_get_nr_rss_ctxs(bp, ring_grps); + req->num_cmpl_rings = cpu_to_le16(tx_rings + ring_grps); - req->num_rsscos_ctxs = cpu_to_le16(DIV_ROUND_UP(ring_grps, 64)); + req->num_rsscos_ctxs = cpu_to_le16(rss_ctx); } else { req->num_cmpl_rings = cpu_to_le16(cp_rings); req->num_hw_ring_grps = cpu_to_le16(ring_grps); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 27b983c0a8a9..1f6e0cd84f2e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -1574,7 +1574,8 @@ u32 bnxt_get_rxfh_indir_size(struct net_device *dev) struct bnxt *bp = netdev_priv(dev); if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) - return ALIGN(bp->rx_nr_rings, BNXT_RSS_TABLE_ENTRIES_P5); + return bnxt_get_nr_rss_ctxs(bp, bp->rx_nr_rings) * + BNXT_RSS_TABLE_ENTRIES_P5; return HW_HASH_INDEX_SIZE; } -- cgit From c20f482129a582455f02eb9a6dcb2a4215274599 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 17 Jan 2024 15:45:14 -0800 Subject: bnxt_en: Prevent kernel warning when running offline self test We call bnxt_half_open_nic() to setup the chip partially to run loopback tests. The rings and buffers are initialized normally so that we can transmit and receive packets in loopback mode. That means page pool buffers are allocated for the aggregation ring just like the normal case. NAPI is not needed because we are just polling for the loopback packets. When we're done with the loopback tests, we call bnxt_half_close_nic() to clean up. When freeing the page pools, we hit a WARN_ON() in page_pool_unlink_napi() because the NAPI state linked to the page pool is uninitialized. The simplest way to avoid this warning is just to initialize the NAPIs during half open and delete the NAPIs during half close. Trying to skip the page pool initialization or skip linking of NAPI during half open will be more complicated. This fix avoids this warning: WARNING: CPU: 4 PID: 46967 at net/core/page_pool.c:946 page_pool_unlink_napi+0x1f/0x30 CPU: 4 PID: 46967 Comm: ethtool Tainted: G S W 6.7.0-rc5+ #22 Hardware name: Dell Inc. PowerEdge R750/06V45N, BIOS 1.3.8 08/31/2021 RIP: 0010:page_pool_unlink_napi+0x1f/0x30 Code: 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 48 8b 47 18 48 85 c0 74 1b 48 8b 50 10 83 e2 01 74 08 8b 40 34 83 f8 ff 74 02 <0f> 0b 48 c7 47 18 00 00 00 00 c3 cc cc cc cc 66 90 90 90 90 90 90 RSP: 0018:ffa000003d0dfbe8 EFLAGS: 00010246 RAX: ff110003607ce640 RBX: ff110010baf5d000 RCX: 0000000000000008 RDX: 0000000000000000 RSI: ff110001e5e522c0 RDI: ff110010baf5d000 RBP: ff11000145539b40 R08: 0000000000000001 R09: ffffffffc063f641 R10: ff110001361eddb8 R11: 000000000040000f R12: 0000000000000001 R13: 000000000000001c R14: ff1100014553a080 R15: 0000000000003fc0 FS: 00007f9301c4f740(0000) GS:ff1100103fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f91344fa8f0 CR3: 00000003527cc005 CR4: 0000000000771ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __warn+0x81/0x140 ? page_pool_unlink_napi+0x1f/0x30 ? report_bug+0x102/0x200 ? handle_bug+0x44/0x70 ? exc_invalid_op+0x13/0x60 ? asm_exc_invalid_op+0x16/0x20 ? bnxt_free_ring.isra.123+0xb1/0xd0 [bnxt_en] ? page_pool_unlink_napi+0x1f/0x30 page_pool_destroy+0x3e/0x150 bnxt_free_mem+0x441/0x5e0 [bnxt_en] bnxt_half_close_nic+0x2a/0x40 [bnxt_en] bnxt_self_test+0x21d/0x450 [bnxt_en] __dev_ethtool+0xeda/0x2e30 ? native_queued_spin_lock_slowpath+0x17f/0x2b0 ? __link_object+0xa1/0x160 ? _raw_spin_unlock_irqrestore+0x23/0x40 ? __create_object+0x5f/0x90 ? __kmem_cache_alloc_node+0x317/0x3c0 ? dev_ethtool+0x59/0x170 dev_ethtool+0xa7/0x170 dev_ioctl+0xc3/0x530 sock_do_ioctl+0xa8/0xf0 sock_ioctl+0x270/0x310 __x64_sys_ioctl+0x8c/0xc0 do_syscall_64+0x3e/0xf0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Fixes: 294e39e0d034 ("bnxt: hook NAPIs to page pools") Reviewed-by: Andy Gospodarek Reviewed-by: Ajit Khaparde Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240117234515.226944-5-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 3d090d4403df..0f5004872a46 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -11572,10 +11572,12 @@ int bnxt_half_open_nic(struct bnxt *bp) netdev_err(bp->dev, "bnxt_alloc_mem err: %x\n", rc); goto half_open_err; } + bnxt_init_napi(bp); set_bit(BNXT_STATE_HALF_OPEN, &bp->state); rc = bnxt_init_nic(bp, true); if (rc) { clear_bit(BNXT_STATE_HALF_OPEN, &bp->state); + bnxt_del_napi(bp); netdev_err(bp->dev, "bnxt_init_nic err: %x\n", rc); goto half_open_err; } @@ -11594,6 +11596,7 @@ half_open_err: void bnxt_half_close_nic(struct bnxt *bp) { bnxt_hwrm_resource_free(bp, false, true); + bnxt_del_napi(bp); bnxt_free_skbs(bp); bnxt_free_mem(bp, true); clear_bit(BNXT_STATE_HALF_OPEN, &bp->state); -- cgit From 467739baf63646d4a5033f7f8a9306669ea55326 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 17 Jan 2024 15:45:15 -0800 Subject: bnxt_en: Fix possible crash after creating sw mqprio TCs The driver relies on netdev_get_num_tc() to get the number of HW offloaded mqprio TCs to allocate and free TX rings. This won't work and can potentially crash the system if software mqprio or taprio TCs have been setup. netdev_get_num_tc() will return the number of software TCs and it may cause the driver to allocate or free more TX rings that it should. Fix it by adding a bp->num_tc field to store the number of HW offload mqprio TCs for the device. Use bp->num_tc instead of netdev_get_num_tc(). This fixes a crash like this: BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 42b8404067 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 120 PID: 8661 Comm: ifconfig Kdump: loaded Tainted: G OE 5.18.16 #1 Hardware name: Lenovo ThinkSystem SR650 V3/SB27A92818, BIOS ESE114N-2.12 04/25/2023 RIP: 0010:bnxt_hwrm_cp_ring_alloc_p5+0x10/0x90 [bnxt_en] Code: 41 5c 41 5d 41 5e c3 cc cc cc cc 41 8b 44 24 08 66 89 03 eb c6 e8 b0 f1 7d db 0f 1f 44 00 00 41 56 41 55 41 54 55 48 89 fd 53 <48> 8b 06 48 89 f3 48 81 c6 28 01 00 00 0f b6 96 13 ff ff ff 44 8b RSP: 0018:ff65907660d1fa88 EFLAGS: 00010202 RAX: 0000000000000010 RBX: ff4dde1d907e4980 RCX: f400000000000000 RDX: 0000000000000010 RSI: 0000000000000000 RDI: ff4dde1d907e4980 RBP: ff4dde1d907e4980 R08: 000000000000000f R09: 0000000000000000 R10: ff4dde5f02671800 R11: 0000000000000008 R12: 0000000088888889 R13: 0500000000000000 R14: 00f0000000000000 R15: ff4dde5f02671800 FS: 00007f4b126b5740(0000) GS:ff4dde9bff600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000416f9c6002 CR4: 0000000000771ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: bnxt_hwrm_ring_alloc+0x204/0x770 [bnxt_en] bnxt_init_chip+0x4d/0x680 [bnxt_en] ? bnxt_poll+0x1a0/0x1a0 [bnxt_en] __bnxt_open_nic+0xd2/0x740 [bnxt_en] bnxt_open+0x10b/0x220 [bnxt_en] ? raw_notifier_call_chain+0x41/0x60 __dev_open+0xf3/0x1b0 __dev_change_flags+0x1db/0x250 dev_change_flags+0x21/0x60 devinet_ioctl+0x590/0x720 ? avc_has_extended_perms+0x1b7/0x420 ? _copy_from_user+0x3a/0x60 inet_ioctl+0x189/0x1c0 ? wp_page_copy+0x45a/0x6e0 sock_do_ioctl+0x42/0xf0 ? ioctl_has_perm.constprop.0.isra.0+0xbd/0x120 sock_ioctl+0x1ce/0x2e0 __x64_sys_ioctl+0x87/0xc0 do_syscall_64+0x59/0x90 ? syscall_exit_work+0x103/0x130 ? syscall_exit_to_user_mode+0x12/0x30 ? do_syscall_64+0x69/0x90 ? exc_page_fault+0x62/0x150 Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.") Reviewed-by: Damodharam Ammepalli Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240117234515.226944-6-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 19 ++++++++++++------- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 4 ++-- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 2 +- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 0f5004872a46..39845d556baf 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3817,7 +3817,7 @@ static int bnxt_alloc_cp_rings(struct bnxt *bp) { bool sh = !!(bp->flags & BNXT_FLAG_SHARED_RINGS); int i, j, rc, ulp_base_vec, ulp_msix; - int tcs = netdev_get_num_tc(bp->dev); + int tcs = bp->num_tc; if (!tcs) tcs = 1; @@ -9946,7 +9946,7 @@ static int __bnxt_num_tx_to_cp(struct bnxt *bp, int tx, int tx_sets, int tx_xdp) int bnxt_num_tx_to_cp(struct bnxt *bp, int tx) { - int tcs = netdev_get_num_tc(bp->dev); + int tcs = bp->num_tc; if (!tcs) tcs = 1; @@ -9955,7 +9955,7 @@ int bnxt_num_tx_to_cp(struct bnxt *bp, int tx) static int bnxt_num_cp_to_tx(struct bnxt *bp, int tx_cp) { - int tcs = netdev_get_num_tc(bp->dev); + int tcs = bp->num_tc; return (tx_cp - bp->tx_nr_rings_xdp) * tcs + bp->tx_nr_rings_xdp; @@ -9985,7 +9985,7 @@ static void bnxt_setup_msix(struct bnxt *bp) struct net_device *dev = bp->dev; int tcs, i; - tcs = netdev_get_num_tc(dev); + tcs = bp->num_tc; if (tcs) { int i, off, count; @@ -10017,8 +10017,10 @@ static void bnxt_setup_inta(struct bnxt *bp) { const int len = sizeof(bp->irq_tbl[0].name); - if (netdev_get_num_tc(bp->dev)) + if (bp->num_tc) { netdev_reset_tc(bp->dev); + bp->num_tc = 0; + } snprintf(bp->irq_tbl[0].name, len, "%s-%s-%d", bp->dev->name, "TxRx", 0); @@ -10244,8 +10246,8 @@ static void bnxt_clear_int_mode(struct bnxt *bp) int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init) { - int tcs = netdev_get_num_tc(bp->dev); bool irq_cleared = false; + int tcs = bp->num_tc; int rc; if (!bnxt_need_reserve_rings(bp)) @@ -10271,6 +10273,7 @@ int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init) bp->tx_nr_rings - bp->tx_nr_rings_xdp)) { netdev_err(bp->dev, "tx ring reservation failure\n"); netdev_reset_tc(bp->dev); + bp->num_tc = 0; if (bp->tx_nr_rings_xdp) bp->tx_nr_rings_per_tc = bp->tx_nr_rings_xdp; else @@ -13800,7 +13803,7 @@ int bnxt_setup_mq_tc(struct net_device *dev, u8 tc) return -EINVAL; } - if (netdev_get_num_tc(dev) == tc) + if (bp->num_tc == tc) return 0; if (bp->flags & BNXT_FLAG_SHARED_RINGS) @@ -13818,9 +13821,11 @@ int bnxt_setup_mq_tc(struct net_device *dev, u8 tc) if (tc) { bp->tx_nr_rings = bp->tx_nr_rings_per_tc * tc; netdev_set_num_tc(dev, tc); + bp->num_tc = tc; } else { bp->tx_nr_rings = bp->tx_nr_rings_per_tc; netdev_reset_tc(dev); + bp->num_tc = 0; } bp->tx_nr_rings += bp->tx_nr_rings_xdp; tx_cp = bnxt_num_tx_to_cp(bp, bp->tx_nr_rings); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index b8ef1717cb65..47338b48ca20 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2225,6 +2225,7 @@ struct bnxt { u8 tc_to_qidx[BNXT_MAX_QUEUE]; u8 q_ids[BNXT_MAX_QUEUE]; u8 max_q; + u8 num_tc; unsigned int current_interval; #define BNXT_TIMER_INTERVAL HZ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c index 63e067038385..0dbb880a7aa0 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c @@ -228,7 +228,7 @@ static int bnxt_queue_remap(struct bnxt *bp, unsigned int lltc_mask) } } if (bp->ieee_ets) { - int tc = netdev_get_num_tc(bp->dev); + int tc = bp->num_tc; if (!tc) tc = 1; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 1f6e0cd84f2e..dc4ca706b0e2 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -884,7 +884,7 @@ static void bnxt_get_channels(struct net_device *dev, if (max_tx_sch_inputs) max_tx_rings = min_t(int, max_tx_rings, max_tx_sch_inputs); - tcs = netdev_get_num_tc(dev); + tcs = bp->num_tc; tx_grps = max(tcs, 1); if (bp->tx_nr_rings_xdp) tx_grps++; @@ -944,7 +944,7 @@ static int bnxt_set_channels(struct net_device *dev, if (channel->combined_count) sh = true; - tcs = netdev_get_num_tc(dev); + tcs = bp->num_tc; req_tx_rings = sh ? channel->combined_count : channel->tx_count; req_rx_rings = sh ? channel->combined_count : channel->rx_count; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index c2b25fc623ec..4079538bc310 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -407,7 +407,7 @@ static int bnxt_xdp_set(struct bnxt *bp, struct bpf_prog *prog) if (prog) tx_xdp = bp->rx_nr_rings; - tc = netdev_get_num_tc(dev); + tc = bp->num_tc; if (!tc) tc = 1; rc = bnxt_check_rings(bp, bp->tx_nr_rings_per_tc, bp->rx_nr_rings, -- cgit From 6c21660fe221a15c789dee2bc2fd95516bc5aeaf Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Thu, 18 Jan 2024 21:03:06 +0800 Subject: vlan: skip nested type that is not IFLA_VLAN_QOS_MAPPING In the vlan_changelink function, a loop is used to parse the nested attributes IFLA_VLAN_EGRESS_QOS and IFLA_VLAN_INGRESS_QOS in order to obtain the struct ifla_vlan_qos_mapping. These two nested attributes are checked in the vlan_validate_qos_map function, which calls nla_validate_nested_deprecated with the vlan_map_policy. However, this deprecated validator applies a LIBERAL strictness, allowing the presence of an attribute with the type IFLA_VLAN_QOS_UNSPEC. Consequently, the loop in vlan_changelink may parse an attribute of type IFLA_VLAN_QOS_UNSPEC and believe it carries a payload of struct ifla_vlan_qos_mapping, which is not necessarily true. To address this issue and ensure compatibility, this patch introduces two type checks that skip attributes whose type is not IFLA_VLAN_QOS_MAPPING. Fixes: 07b5b17e157b ("[VLAN]: Use rtnl_link API") Signed-off-by: Lin Ma Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240118130306.1644001-1-linma@zju.edu.cn Signed-off-by: Jakub Kicinski --- net/8021q/vlan_netlink.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 214532173536..a3b68243fd4b 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -118,12 +118,16 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], } if (data[IFLA_VLAN_INGRESS_QOS]) { nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) { + if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) + continue; m = nla_data(attr); vlan_dev_set_ingress_priority(dev, m->to, m->from); } } if (data[IFLA_VLAN_EGRESS_QOS]) { nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) { + if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) + continue; m = nla_data(attr); err = vlan_dev_set_egress_priority(dev, m->from, m->to); if (err) -- cgit From dad555c816a50c6a6a8a86be1f9177673918c647 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Jan 2024 18:36:25 +0000 Subject: llc: make llc_ui_sendmsg() more robust against bonding changes syzbot was able to trick llc_ui_sendmsg(), allocating an skb with no headroom, but subsequently trying to push 14 bytes of Ethernet header [1] Like some others, llc_ui_sendmsg() releases the socket lock before calling sock_alloc_send_skb(). Then it acquires it again, but does not redo all the sanity checks that were performed. This fix: - Uses LL_RESERVED_SPACE() to reserve space. - Check all conditions again after socket lock is held again. - Do not account Ethernet header for mtu limitation. [1] skbuff: skb_under_panic: text:ffff800088baa334 len:1514 put:14 head:ffff0000c9c37000 data:ffff0000c9c36ff2 tail:0x5dc end:0x6c0 dev:bond0 kernel BUG at net/core/skbuff.c:193 ! Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 6875 Comm: syz-executor.0 Not tainted 6.7.0-rc8-syzkaller-00101-g0802e17d9aca-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : skb_panic net/core/skbuff.c:189 [inline] pc : skb_under_panic+0x13c/0x140 net/core/skbuff.c:203 lr : skb_panic net/core/skbuff.c:189 [inline] lr : skb_under_panic+0x13c/0x140 net/core/skbuff.c:203 sp : ffff800096f97000 x29: ffff800096f97010 x28: ffff80008cc8d668 x27: dfff800000000000 x26: ffff0000cb970c90 x25: 00000000000005dc x24: ffff0000c9c36ff2 x23: ffff0000c9c37000 x22: 00000000000005ea x21: 00000000000006c0 x20: 000000000000000e x19: ffff800088baa334 x18: 1fffe000368261ce x17: ffff80008e4ed000 x16: ffff80008a8310f8 x15: 0000000000000001 x14: 1ffff00012df2d58 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000001 x10: 0000000000ff0100 x9 : e28a51f1087e8400 x8 : e28a51f1087e8400 x7 : ffff80008028f8d0 x6 : 0000000000000000 x5 : 0000000000000001 x4 : 0000000000000001 x3 : ffff800082b78714 x2 : 0000000000000001 x1 : 0000000100000000 x0 : 0000000000000089 Call trace: skb_panic net/core/skbuff.c:189 [inline] skb_under_panic+0x13c/0x140 net/core/skbuff.c:203 skb_push+0xf0/0x108 net/core/skbuff.c:2451 eth_header+0x44/0x1f8 net/ethernet/eth.c:83 dev_hard_header include/linux/netdevice.h:3188 [inline] llc_mac_hdr_init+0x110/0x17c net/llc/llc_output.c:33 llc_sap_action_send_xid_c+0x170/0x344 net/llc/llc_s_ac.c:85 llc_exec_sap_trans_actions net/llc/llc_sap.c:153 [inline] llc_sap_next_state net/llc/llc_sap.c:182 [inline] llc_sap_state_process+0x1ec/0x774 net/llc/llc_sap.c:209 llc_build_and_send_xid_pkt+0x12c/0x1c0 net/llc/llc_sap.c:270 llc_ui_sendmsg+0x7bc/0xb1c net/llc/af_llc.c:997 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] sock_sendmsg+0x194/0x274 net/socket.c:767 splice_to_socket+0x7cc/0xd58 fs/splice.c:881 do_splice_from fs/splice.c:933 [inline] direct_splice_actor+0xe4/0x1c0 fs/splice.c:1142 splice_direct_to_actor+0x2a0/0x7e4 fs/splice.c:1088 do_splice_direct+0x20c/0x348 fs/splice.c:1194 do_sendfile+0x4bc/0xc70 fs/read_write.c:1254 __do_sys_sendfile64 fs/read_write.c:1322 [inline] __se_sys_sendfile64 fs/read_write.c:1308 [inline] __arm64_sys_sendfile64+0x160/0x3b4 fs/read_write.c:1308 __invoke_syscall arch/arm64/kernel/syscall.c:37 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:51 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:136 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:155 el0_svc+0x54/0x158 arch/arm64/kernel/entry-common.c:678 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:696 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:595 Code: aa1803e6 aa1903e7 a90023f5 94792f6a (d4210000) Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-and-tested-by: syzbot+2a7024e9502df538e8ef@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240118183625.4007013-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/llc/af_llc.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 9b06c380866b..20551cfb7da6 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -928,14 +928,15 @@ copy_uaddr: */ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { + DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); - DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); int flags = msg->msg_flags; int noblock = flags & MSG_DONTWAIT; + int rc = -EINVAL, copied = 0, hdrlen, hh_len; struct sk_buff *skb = NULL; + struct net_device *dev; size_t size = 0; - int rc = -EINVAL, copied = 0, hdrlen; dprintk("%s: sending from %02X to %02X\n", __func__, llc->laddr.lsap, llc->daddr.lsap); @@ -955,22 +956,29 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (rc) goto out; } - hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr); + dev = llc->dev; + hh_len = LL_RESERVED_SPACE(dev); + hdrlen = llc_ui_header_len(sk, addr); size = hdrlen + len; - if (size > llc->dev->mtu) - size = llc->dev->mtu; + size = min_t(size_t, size, READ_ONCE(dev->mtu)); copied = size - hdrlen; rc = -EINVAL; if (copied < 0) goto out; release_sock(sk); - skb = sock_alloc_send_skb(sk, size, noblock, &rc); + skb = sock_alloc_send_skb(sk, hh_len + size, noblock, &rc); lock_sock(sk); if (!skb) goto out; - skb->dev = llc->dev; + if (sock_flag(sk, SOCK_ZAPPED) || + llc->dev != dev || + hdrlen != llc_ui_header_len(sk, addr) || + hh_len != LL_RESERVED_SPACE(dev) || + size > READ_ONCE(dev->mtu)) + goto out; + skb->dev = dev; skb->protocol = llc_proto_type(addr->sllc_arphrd); - skb_reserve(skb, hdrlen); + skb_reserve(skb, hh_len + hdrlen); rc = memcpy_from_msg(skb_put(skb, copied), msg, copied); if (rc) goto out; -- cgit From e3f9bed9bee261e3347131764e42aeedf1ffea61 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 18 Jan 2024 17:55:15 -0800 Subject: llc: Drop support for ETH_P_TR_802_2. syzbot reported an uninit-value bug below. [0] llc supports ETH_P_802_2 (0x0004) and used to support ETH_P_TR_802_2 (0x0011), and syzbot abused the latter to trigger the bug. write$tun(r0, &(0x7f0000000040)={@val={0x0, 0x11}, @val, @mpls={[], @llc={@snap={0xaa, 0x1, ')', "90e5dd"}}}}, 0x16) llc_conn_handler() initialises local variables {saddr,daddr}.mac based on skb in llc_pdu_decode_sa()/llc_pdu_decode_da() and passes them to __llc_lookup(). However, the initialisation is done only when skb->protocol is htons(ETH_P_802_2), otherwise, __llc_lookup_established() and __llc_lookup_listener() will read garbage. The missing initialisation existed prior to commit 211ed865108e ("net: delete all instances of special processing for token ring"). It removed the part to kick out the token ring stuff but forgot to close the door allowing ETH_P_TR_802_2 packets to sneak into llc_rcv(). Let's remove llc_tr_packet_type and complete the deprecation. [0]: BUG: KMSAN: uninit-value in __llc_lookup_established+0xe9d/0xf90 __llc_lookup_established+0xe9d/0xf90 __llc_lookup net/llc/llc_conn.c:611 [inline] llc_conn_handler+0x4bd/0x1360 net/llc/llc_conn.c:791 llc_rcv+0xfbb/0x14a0 net/llc/llc_input.c:206 __netif_receive_skb_one_core net/core/dev.c:5527 [inline] __netif_receive_skb+0x1a6/0x5a0 net/core/dev.c:5641 netif_receive_skb_internal net/core/dev.c:5727 [inline] netif_receive_skb+0x58/0x660 net/core/dev.c:5786 tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1555 tun_get_user+0x53af/0x66d0 drivers/net/tun.c:2002 tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:2020 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x8ef/0x1490 fs/read_write.c:584 ksys_write+0x20f/0x4c0 fs/read_write.c:637 __do_sys_write fs/read_write.c:649 [inline] __se_sys_write fs/read_write.c:646 [inline] __x64_sys_write+0x93/0xd0 fs/read_write.c:646 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Local variable daddr created at: llc_conn_handler+0x53/0x1360 net/llc/llc_conn.c:783 llc_rcv+0xfbb/0x14a0 net/llc/llc_input.c:206 CPU: 1 PID: 5004 Comm: syz-executor994 Not tainted 6.6.0-syzkaller-14500-g1c41041124bd #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 Fixes: 211ed865108e ("net: delete all instances of special processing for token ring") Reported-by: syzbot+b5ad66046b913bc04c6f@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b5ad66046b913bc04c6f Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240119015515.61898-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/llc_pdu.h | 6 ++---- net/llc/llc_core.c | 7 ------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/include/net/llc_pdu.h b/include/net/llc_pdu.h index 7e73f8e5e497..1d55ba7c45be 100644 --- a/include/net/llc_pdu.h +++ b/include/net/llc_pdu.h @@ -262,8 +262,7 @@ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type, */ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) { - if (skb->protocol == htons(ETH_P_802_2)) - memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); + memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); } /** @@ -275,8 +274,7 @@ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) */ static inline void llc_pdu_decode_da(struct sk_buff *skb, u8 *da) { - if (skb->protocol == htons(ETH_P_802_2)) - memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); + memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); } /** diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c index 6e387aadffce..4f16d9c88350 100644 --- a/net/llc/llc_core.c +++ b/net/llc/llc_core.c @@ -135,22 +135,15 @@ static struct packet_type llc_packet_type __read_mostly = { .func = llc_rcv, }; -static struct packet_type llc_tr_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_TR_802_2), - .func = llc_rcv, -}; - static int __init llc_init(void) { dev_add_pack(&llc_packet_type); - dev_add_pack(&llc_tr_packet_type); return 0; } static void __exit llc_exit(void) { dev_remove_pack(&llc_packet_type); - dev_remove_pack(&llc_tr_packet_type); } module_init(llc_init); -- cgit From a54d51fb2dfb846aedf3751af501e9688db447f5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Jan 2024 20:17:49 +0000 Subject: udp: fix busy polling Generic sk_busy_loop_end() only looks at sk->sk_receive_queue for presence of packets. Problem is that for UDP sockets after blamed commit, some packets could be present in another queue: udp_sk(sk)->reader_queue In some cases, a busy poller could spin until timeout expiration, even if some packets are available in udp_sk(sk)->reader_queue. v3: - make sk_busy_loop_end() nicer (Willem) v2: - add a READ_ONCE(sk->sk_family) in sk_is_inet() to avoid KCSAN splats. - add a sk_is_inet() check in sk_is_udp() (Willem feedback) - add a sk_is_inet() check in sk_is_tcp(). Fixes: 2276f58ac589 ("udp: use a separate rx queue for packet reception") Signed-off-by: Eric Dumazet Reviewed-by: Paolo Abeni Reviewed-by: Willem de Bruijn Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/skmsg.h | 6 ------ include/net/inet_sock.h | 5 ----- include/net/sock.h | 18 +++++++++++++++++- net/core/sock.c | 11 +++++++++-- 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 888a4b217829..e65ec3fd2799 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -505,12 +505,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return !!psock->saved_data_ready; } -static inline bool sk_is_udp(const struct sock *sk) -{ - return sk->sk_type == SOCK_DGRAM && - sk->sk_protocol == IPPROTO_UDP; -} - #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index aa86453f6b9b..d94c242eb3ed 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -307,11 +307,6 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) #define inet_assign_bit(nr, sk, val) \ assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) -static inline bool sk_is_inet(struct sock *sk) -{ - return sk->sk_family == AF_INET || sk->sk_family == AF_INET6; -} - /** * sk_to_full_sk - Access to a full socket * @sk: pointer to a socket diff --git a/include/net/sock.h b/include/net/sock.h index a7f815c7cfdf..54ca8dcbfb43 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2765,9 +2765,25 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) &skb_shinfo(skb)->tskey); } +static inline bool sk_is_inet(const struct sock *sk) +{ + int family = READ_ONCE(sk->sk_family); + + return family == AF_INET || family == AF_INET6; +} + static inline bool sk_is_tcp(const struct sock *sk) { - return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP; + return sk_is_inet(sk) && + sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP; +} + +static inline bool sk_is_udp(const struct sock *sk) +{ + return sk_is_inet(sk) && + sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP; } static inline bool sk_is_stream_unix(const struct sock *sk) diff --git a/net/core/sock.c b/net/core/sock.c index 158dbdebce6a..0a7f46c37f0c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -107,6 +107,7 @@ #include #include #include +#include #include #include #include @@ -4144,8 +4145,14 @@ bool sk_busy_loop_end(void *p, unsigned long start_time) { struct sock *sk = p; - return !skb_queue_empty_lockless(&sk->sk_receive_queue) || - sk_busy_loop_timeout(sk, start_time); + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) + return true; + + if (sk_is_udp(sk) && + !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) + return true; + + return sk_busy_loop_timeout(sk, start_time); } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ -- cgit From 359724fa3ab79fbe9f42c6263cddc2afae32eef3 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Thu, 18 Jan 2024 21:50:40 +0100 Subject: idpf: distinguish vports by the dev_port attribute idpf registers multiple netdevs (virtual ports) for one PCI function, but it does not provide a way for userspace to distinguish them with sysfs attributes. Per Documentation/ABI/testing/sysfs-class-net, it is a bug not to set dev_port for independent ports on the same PCI bus, device and function. Without dev_port set, systemd-udevd's default naming policy attempts to assign the same name ("ens2f0") to all four idpf netdevs on my test system and obviously fails, leaving three of them with the initial eth name. With this patch, systemd-udevd is able to assign unique names to the netdevs (e.g. "ens2f0", "ens2f0d1", "ens2f0d2", "ens2f0d3"). The Intel-provided out-of-tree idpf driver already sets dev_port. In this patch I chose to do it in the same place in the idpf_cfg_netdev function. Fixes: 0fe45467a104 ("idpf: add create vport and netdev configuration") Signed-off-by: Michal Schmidt Reviewed-by: Jesse Brandeburg Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/idpf/idpf_lib.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index 5fea2fd957eb..58179bd733ff 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -783,6 +783,8 @@ static int idpf_cfg_netdev(struct idpf_vport *vport) /* setup watchdog timeout value to be 5 second */ netdev->watchdog_timeo = 5 * HZ; + netdev->dev_port = idx; + /* configure default MTU size */ netdev->min_mtu = ETH_MIN_MTU; netdev->max_mtu = vport->max_mtu; -- cgit From d09486a04f5da0a812c26217213b89a3b1acf836 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 18 Jan 2024 16:58:59 -0800 Subject: net: fix removing a namespace with conflicting altnames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark reports a BUG() when a net namespace is removed. kernel BUG at net/core/dev.c:11520! Physical interfaces moved outside of init_net get "refunded" to init_net when that namespace disappears. The main interface name may get overwritten in the process if it would have conflicted. We need to also discard all conflicting altnames. Recent fixes addressed ensuring that altnames get moved with the main interface, which surfaced this problem. Reported-by: Марк Коренберг Link: https://lore.kernel.org/all/CAEmTpZFZ4Sv3KwqFOY2WKDHeZYdi0O7N5H1nTvcGp=SAEavtDg@mail.gmail.com/ Fixes: 7663d522099e ("net: check for altname conflicts when changing netdev's netns") Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Reviewed-by: Jiri Pirko Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/core/dev.c | 9 +++++++++ net/core/dev.h | 3 +++ 2 files changed, 12 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index f01a9b858347..cb2dab0feee0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11551,6 +11551,7 @@ static struct pernet_operations __net_initdata netdev_net_ops = { static void __net_exit default_device_exit_net(struct net *net) { + struct netdev_name_node *name_node, *tmp; struct net_device *dev, *aux; /* * Push all migratable network devices back to the @@ -11573,6 +11574,14 @@ static void __net_exit default_device_exit_net(struct net *net) snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); if (netdev_name_in_use(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); + + netdev_for_each_altname_safe(dev, name_node, tmp) + if (netdev_name_in_use(&init_net, name_node->name)) { + netdev_name_node_del(name_node); + synchronize_rcu(); + __netdev_name_node_alt_destroy(name_node); + } + err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", diff --git a/net/core/dev.h b/net/core/dev.h index cf93e188785b..7480b4c84298 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -63,6 +63,9 @@ int dev_change_name(struct net_device *dev, const char *newname); #define netdev_for_each_altname(dev, namenode) \ list_for_each_entry((namenode), &(dev)->name_node->list, list) +#define netdev_for_each_altname_safe(dev, namenode, next) \ + list_for_each_entry_safe((namenode), (next), &(dev)->name_node->list, \ + list) int netdev_name_node_alt_create(struct net_device *dev, const char *name); int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); -- cgit From 5744ba05e7c4bff8fec133dd0f9e51ddffba92f5 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Fri, 19 Jan 2024 18:22:35 +0800 Subject: tun: fix missing dropped counter in tun_xdp_act The commit 8ae1aff0b331 ("tuntap: split out XDP logic") includes dropped counter for XDP_DROP, XDP_ABORTED, and invalid XDP actions. Unfortunately, that commit missed the dropped counter when error occurs during XDP_TX and XDP_REDIRECT actions. This patch fixes this issue. Fixes: 8ae1aff0b331 ("tuntap: split out XDP logic") Signed-off-by: Yunjian Wang Reviewed-by: Willem de Bruijn Acked-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index afa5497f7c35..237fef557ba5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1630,13 +1630,17 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, switch (act) { case XDP_REDIRECT: err = xdp_do_redirect(tun->dev, xdp, xdp_prog); - if (err) + if (err) { + dev_core_stats_rx_dropped_inc(tun->dev); return err; + } break; case XDP_TX: err = tun_xdp_tx(tun->dev, xdp); - if (err < 0) + if (err < 0) { + dev_core_stats_rx_dropped_inc(tun->dev); return err; + } break; case XDP_PASS: break; -- cgit From f1084c427f55d573fcd5688d9ba7b31b78019716 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Fri, 19 Jan 2024 18:22:56 +0800 Subject: tun: add missing rx stats accounting in tun_xdp_act The TUN can be used as vhost-net backend, and it is necessary to count the packets transmitted from TUN to vhost-net/virtio-net. However, there are some places in the receive path that were not taken into account when using XDP. It would be beneficial to also include new accounting for successfully received bytes using dev_sw_netstats_rx_add. Fixes: 761876c857cb ("tap: XDP support") Signed-off-by: Yunjian Wang Reviewed-by: Willem de Bruijn Acked-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 237fef557ba5..4a4f8c8e79fa 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1634,6 +1634,7 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, dev_core_stats_rx_dropped_inc(tun->dev); return err; } + dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_TX: err = tun_xdp_tx(tun->dev, xdp); @@ -1641,6 +1642,7 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, dev_core_stats_rx_dropped_inc(tun->dev); return err; } + dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_PASS: break; -- cgit From b6a11a7fc4d6337f7ea720b9287d1b9749c4eae0 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Fri, 19 Jan 2024 14:43:01 +0100 Subject: dpll: fix broken error path in dpll_pin_alloc(..) If pin type is not expected, or pin properities failed to allocate memory, the unwind error path shall not destroy pin's xarrays, which were not yet initialized. Add new goto label and use it to fix broken error path. Reviewed-by: Jiri Pirko Signed-off-by: Arkadiusz Kubalewski Signed-off-by: David S. Miller --- drivers/dpll/dpll_core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c index 1eca8cc271f8..c08772ee9fd6 100644 --- a/drivers/dpll/dpll_core.c +++ b/drivers/dpll/dpll_core.c @@ -441,7 +441,7 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module, if (WARN_ON(prop->type < DPLL_PIN_TYPE_MUX || prop->type > DPLL_PIN_TYPE_MAX)) { ret = -EINVAL; - goto err; + goto err_pin_prop; } pin->prop = prop; refcount_set(&pin->refcount, 1); @@ -450,11 +450,12 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module, ret = xa_alloc_cyclic(&dpll_pin_xa, &pin->id, pin, xa_limit_32b, &dpll_pin_xa_id, GFP_KERNEL); if (ret) - goto err; + goto err_xa_alloc; return pin; -err: +err_xa_alloc: xa_destroy(&pin->dpll_refs); xa_destroy(&pin->parent_refs); +err_pin_prop: kfree(pin); return ERR_PTR(ret); } -- cgit From 830ead5fb0c5855ce4d70ba2ed4a673b5f1e7d9b Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Fri, 19 Jan 2024 14:43:02 +0100 Subject: dpll: fix pin dump crash for rebound module When a kernel module is unbound but the pin resources were not entirely freed (other kernel module instance of the same PCI device have had kept the reference to that pin), and kernel module is again bound, the pin properties would not be updated (the properties are only assigned when memory for the pin is allocated), prop pointer still points to the kernel module memory of the kernel module which was deallocated on the unbind. If the pin dump is invoked in this state, the result is a kernel crash. Prevent the crash by storing persistent pin properties in dpll subsystem, copy the content from the kernel module when pin is allocated, instead of using memory of the kernel module. Fixes: 9431063ad323 ("dpll: core: Add DPLL framework base functions") Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions") Reviewed-by: Jan Glaza Reviewed-by: Przemek Kitszel Signed-off-by: Arkadiusz Kubalewski Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/dpll/dpll_core.c | 55 +++++++++++++++++++++++++++++++++++++++++++-- drivers/dpll/dpll_core.h | 4 ++-- drivers/dpll/dpll_netlink.c | 28 +++++++++++------------ 3 files changed, 69 insertions(+), 18 deletions(-) diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c index c08772ee9fd6..cb62696467d1 100644 --- a/drivers/dpll/dpll_core.c +++ b/drivers/dpll/dpll_core.c @@ -425,6 +425,53 @@ void dpll_device_unregister(struct dpll_device *dpll, } EXPORT_SYMBOL_GPL(dpll_device_unregister); +static void dpll_pin_prop_free(struct dpll_pin_properties *prop) +{ + kfree(prop->package_label); + kfree(prop->panel_label); + kfree(prop->board_label); + kfree(prop->freq_supported); +} + +static int dpll_pin_prop_dup(const struct dpll_pin_properties *src, + struct dpll_pin_properties *dst) +{ + memcpy(dst, src, sizeof(*dst)); + if (src->freq_supported && src->freq_supported_num) { + size_t freq_size = src->freq_supported_num * + sizeof(*src->freq_supported); + dst->freq_supported = kmemdup(src->freq_supported, + freq_size, GFP_KERNEL); + if (!src->freq_supported) + return -ENOMEM; + } + if (src->board_label) { + dst->board_label = kstrdup(src->board_label, GFP_KERNEL); + if (!dst->board_label) + goto err_board_label; + } + if (src->panel_label) { + dst->panel_label = kstrdup(src->panel_label, GFP_KERNEL); + if (!dst->panel_label) + goto err_panel_label; + } + if (src->package_label) { + dst->package_label = kstrdup(src->package_label, GFP_KERNEL); + if (!dst->package_label) + goto err_package_label; + } + + return 0; + +err_package_label: + kfree(dst->panel_label); +err_panel_label: + kfree(dst->board_label); +err_board_label: + kfree(dst->freq_supported); + return -ENOMEM; +} + static struct dpll_pin * dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module, const struct dpll_pin_properties *prop) @@ -443,7 +490,9 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module, ret = -EINVAL; goto err_pin_prop; } - pin->prop = prop; + ret = dpll_pin_prop_dup(prop, &pin->prop); + if (ret) + goto err_pin_prop; refcount_set(&pin->refcount, 1); xa_init_flags(&pin->dpll_refs, XA_FLAGS_ALLOC); xa_init_flags(&pin->parent_refs, XA_FLAGS_ALLOC); @@ -455,6 +504,7 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module, err_xa_alloc: xa_destroy(&pin->dpll_refs); xa_destroy(&pin->parent_refs); + dpll_pin_prop_free(&pin->prop); err_pin_prop: kfree(pin); return ERR_PTR(ret); @@ -515,6 +565,7 @@ void dpll_pin_put(struct dpll_pin *pin) xa_destroy(&pin->dpll_refs); xa_destroy(&pin->parent_refs); xa_erase(&dpll_pin_xa, pin->id); + dpll_pin_prop_free(&pin->prop); kfree(pin); } mutex_unlock(&dpll_lock); @@ -637,7 +688,7 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin, unsigned long i, stop; int ret; - if (WARN_ON(parent->prop->type != DPLL_PIN_TYPE_MUX)) + if (WARN_ON(parent->prop.type != DPLL_PIN_TYPE_MUX)) return -EINVAL; if (WARN_ON(!ops) || diff --git a/drivers/dpll/dpll_core.h b/drivers/dpll/dpll_core.h index 5585873c5c1b..717f715015c7 100644 --- a/drivers/dpll/dpll_core.h +++ b/drivers/dpll/dpll_core.h @@ -44,7 +44,7 @@ struct dpll_device { * @module: module of creator * @dpll_refs: hold referencees to dplls pin was registered with * @parent_refs: hold references to parent pins pin was registered with - * @prop: pointer to pin properties given by registerer + * @prop: pin properties copied from the registerer * @rclk_dev_name: holds name of device when pin can recover clock from it * @refcount: refcount **/ @@ -55,7 +55,7 @@ struct dpll_pin { struct module *module; struct xarray dpll_refs; struct xarray parent_refs; - const struct dpll_pin_properties *prop; + struct dpll_pin_properties prop; refcount_t refcount; }; diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index 3370dbddb86b..30f5be020862 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -303,17 +303,17 @@ dpll_msg_add_pin_freq(struct sk_buff *msg, struct dpll_pin *pin, if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY, sizeof(freq), &freq, DPLL_A_PIN_PAD)) return -EMSGSIZE; - for (fs = 0; fs < pin->prop->freq_supported_num; fs++) { + for (fs = 0; fs < pin->prop.freq_supported_num; fs++) { nest = nla_nest_start(msg, DPLL_A_PIN_FREQUENCY_SUPPORTED); if (!nest) return -EMSGSIZE; - freq = pin->prop->freq_supported[fs].min; + freq = pin->prop.freq_supported[fs].min; if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MIN, sizeof(freq), &freq, DPLL_A_PIN_PAD)) { nla_nest_cancel(msg, nest); return -EMSGSIZE; } - freq = pin->prop->freq_supported[fs].max; + freq = pin->prop.freq_supported[fs].max; if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MAX, sizeof(freq), &freq, DPLL_A_PIN_PAD)) { nla_nest_cancel(msg, nest); @@ -329,9 +329,9 @@ static bool dpll_pin_is_freq_supported(struct dpll_pin *pin, u32 freq) { int fs; - for (fs = 0; fs < pin->prop->freq_supported_num; fs++) - if (freq >= pin->prop->freq_supported[fs].min && - freq <= pin->prop->freq_supported[fs].max) + for (fs = 0; fs < pin->prop.freq_supported_num; fs++) + if (freq >= pin->prop.freq_supported[fs].min && + freq <= pin->prop.freq_supported[fs].max) return true; return false; } @@ -421,7 +421,7 @@ static int dpll_cmd_pin_get_one(struct sk_buff *msg, struct dpll_pin *pin, struct netlink_ext_ack *extack) { - const struct dpll_pin_properties *prop = pin->prop; + const struct dpll_pin_properties *prop = &pin->prop; struct dpll_pin_ref *ref; int ret; @@ -717,7 +717,7 @@ dpll_pin_on_pin_state_set(struct dpll_pin *pin, u32 parent_idx, int ret; if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE & - pin->prop->capabilities)) { + pin->prop.capabilities)) { NL_SET_ERR_MSG(extack, "state changing is not allowed"); return -EOPNOTSUPP; } @@ -753,7 +753,7 @@ dpll_pin_state_set(struct dpll_device *dpll, struct dpll_pin *pin, int ret; if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE & - pin->prop->capabilities)) { + pin->prop.capabilities)) { NL_SET_ERR_MSG(extack, "state changing is not allowed"); return -EOPNOTSUPP; } @@ -780,7 +780,7 @@ dpll_pin_prio_set(struct dpll_device *dpll, struct dpll_pin *pin, int ret; if (!(DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE & - pin->prop->capabilities)) { + pin->prop.capabilities)) { NL_SET_ERR_MSG(extack, "prio changing is not allowed"); return -EOPNOTSUPP; } @@ -808,7 +808,7 @@ dpll_pin_direction_set(struct dpll_pin *pin, struct dpll_device *dpll, int ret; if (!(DPLL_PIN_CAPABILITIES_DIRECTION_CAN_CHANGE & - pin->prop->capabilities)) { + pin->prop.capabilities)) { NL_SET_ERR_MSG(extack, "direction changing is not allowed"); return -EOPNOTSUPP; } @@ -838,8 +838,8 @@ dpll_pin_phase_adj_set(struct dpll_pin *pin, struct nlattr *phase_adj_attr, int ret; phase_adj = nla_get_s32(phase_adj_attr); - if (phase_adj > pin->prop->phase_range.max || - phase_adj < pin->prop->phase_range.min) { + if (phase_adj > pin->prop.phase_range.max || + phase_adj < pin->prop.phase_range.min) { NL_SET_ERR_MSG_ATTR(extack, phase_adj_attr, "phase adjust value not supported"); return -EINVAL; @@ -1023,7 +1023,7 @@ dpll_pin_find(u64 clock_id, struct nlattr *mod_name_attr, unsigned long i; xa_for_each_marked(&dpll_pin_xa, i, pin, DPLL_REGISTERED) { - prop = pin->prop; + prop = &pin->prop; cid_match = clock_id ? pin->clock_id == clock_id : true; mod_match = mod_name_attr && module_name(pin->module) ? !nla_strcmp(mod_name_attr, -- cgit From db2ec3c94667eaeecc6a74d96594fab6baf80fdc Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Fri, 19 Jan 2024 14:43:03 +0100 Subject: dpll: fix userspace availability of pins If parent pin was unregistered but child pin was not, the userspace would see the "zombie" pins - the ones that were registered with a parent pin (dpll_pin_on_pin_register(..)). Technically those are not available - as there is no dpll device in the system. Do not dump those pins and prevent userspace from any interaction with them. Provide a unified function to determine if the pin is available and use it before acting/responding for user requests. Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions") Reviewed-by: Jan Glaza Reviewed-by: Jiri Pirko Signed-off-by: Arkadiusz Kubalewski Signed-off-by: David S. Miller --- drivers/dpll/dpll_netlink.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index 30f5be020862..314bb3775465 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -553,6 +553,24 @@ __dpll_device_change_ntf(struct dpll_device *dpll) return dpll_device_event_send(DPLL_CMD_DEVICE_CHANGE_NTF, dpll); } +static bool dpll_pin_available(struct dpll_pin *pin) +{ + struct dpll_pin_ref *par_ref; + unsigned long i; + + if (!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED)) + return false; + xa_for_each(&pin->parent_refs, i, par_ref) + if (xa_get_mark(&dpll_pin_xa, par_ref->pin->id, + DPLL_REGISTERED)) + return true; + xa_for_each(&pin->dpll_refs, i, par_ref) + if (xa_get_mark(&dpll_device_xa, par_ref->dpll->id, + DPLL_REGISTERED)) + return true; + return false; +} + /** * dpll_device_change_ntf - notify that the dpll device has been changed * @dpll: registered dpll pointer @@ -579,7 +597,7 @@ dpll_pin_event_send(enum dpll_cmd event, struct dpll_pin *pin) int ret = -ENOMEM; void *hdr; - if (WARN_ON(!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED))) + if (!dpll_pin_available(pin)) return -ENODEV; msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); @@ -1130,6 +1148,10 @@ int dpll_nl_pin_id_get_doit(struct sk_buff *skb, struct genl_info *info) } pin = dpll_pin_find_from_nlattr(info); if (!IS_ERR(pin)) { + if (!dpll_pin_available(pin)) { + nlmsg_free(msg); + return -ENODEV; + } ret = dpll_msg_add_pin_handle(msg, pin); if (ret) { nlmsg_free(msg); @@ -1179,6 +1201,8 @@ int dpll_nl_pin_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) xa_for_each_marked_start(&dpll_pin_xa, i, pin, DPLL_REGISTERED, ctx->idx) { + if (!dpll_pin_available(pin)) + continue; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &dpll_nl_family, NLM_F_MULTI, @@ -1441,7 +1465,8 @@ int dpll_pin_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, } info->user_ptr[0] = xa_load(&dpll_pin_xa, nla_get_u32(info->attrs[DPLL_A_PIN_ID])); - if (!info->user_ptr[0]) { + if (!info->user_ptr[0] || + !dpll_pin_available(info->user_ptr[0])) { NL_SET_ERR_MSG(info->extack, "pin not found"); ret = -ENODEV; goto unlock_dev; -- cgit From 7dc5b18ff71bd6f948810ab8a08b6a6ff8b315c5 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Fri, 19 Jan 2024 14:43:04 +0100 Subject: dpll: fix register pin with unregistered parent pin In case of multiple kernel module instances using the same dpll device: if only one registers dpll device, then only that one can register directly connected pins with a dpll device. When unregistered parent is responsible for determining if the muxed pin can be registered with it or not, the drivers need to be loaded in serialized order to work correctly - first the driver instance which registers the direct pins needs to be loaded, then the other instances could register muxed type pins. Allow registration of a pin with a parent even if the parent was not yet registered, thus allow ability for unserialized driver instance load order. Do not WARN_ON notification for unregistered pin, which can be invoked for described case, instead just return error. Fixes: 9431063ad323 ("dpll: core: Add DPLL framework base functions") Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions") Reviewed-by: Jan Glaza Reviewed-by: Jiri Pirko Signed-off-by: Arkadiusz Kubalewski Signed-off-by: David S. Miller --- drivers/dpll/dpll_core.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c index cb62696467d1..5152bd1b0daf 100644 --- a/drivers/dpll/dpll_core.c +++ b/drivers/dpll/dpll_core.c @@ -29,8 +29,6 @@ static u32 dpll_pin_xa_id; WARN_ON_ONCE(!xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED)) #define ASSERT_DPLL_NOT_REGISTERED(d) \ WARN_ON_ONCE(xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED)) -#define ASSERT_PIN_REGISTERED(p) \ - WARN_ON_ONCE(!xa_get_mark(&dpll_pin_xa, (p)->id, DPLL_REGISTERED)) struct dpll_device_registration { struct list_head list; @@ -616,8 +614,6 @@ dpll_pin_register(struct dpll_device *dpll, struct dpll_pin *pin, WARN_ON(!ops->state_on_dpll_get) || WARN_ON(!ops->direction_get)) return -EINVAL; - if (ASSERT_DPLL_REGISTERED(dpll)) - return -EINVAL; mutex_lock(&dpll_lock); if (WARN_ON(!(dpll->module == pin->module && @@ -695,8 +691,6 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin, WARN_ON(!ops->state_on_pin_get) || WARN_ON(!ops->direction_get)) return -EINVAL; - if (ASSERT_PIN_REGISTERED(parent)) - return -EINVAL; mutex_lock(&dpll_lock); ret = dpll_xa_ref_pin_add(&pin->parent_refs, parent, ops, priv); -- cgit From aaf632f7ab6dec57bc9329a438f94504fe8034b9 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Fri, 19 Jan 2024 11:47:50 +0100 Subject: net: micrel: Fix PTP frame parsing for lan8814 The HW has the capability to check each frame if it is a PTP frame, which domain it is, which ptp frame type it is, different ip address in the frame. And if one of these checks fail then the frame is not timestamp. Most of these checks were disabled except checking the field minorVersionPTP inside the PTP header. Meaning that once a partner sends a frame compliant to 8021AS which has minorVersionPTP set to 1, then the frame was not timestamp because the HW expected by default a value of 0 in minorVersionPTP. This is exactly the same issue as on lan8841. Fix this issue by removing this check so the userspace can decide on this. Fixes: ece19502834d ("net: phy: micrel: 1588 support for LAN8814 phy") Signed-off-by: Horatiu Vultur Reviewed-by: Maxime Chevallier Reviewed-by: Divya Koppera Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 81c20eb4b54b..dad720138baa 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -120,6 +120,11 @@ */ #define LAN8814_1PPM_FORMAT 17179 +#define PTP_RX_VERSION 0x0248 +#define PTP_TX_VERSION 0x0288 +#define PTP_MAX_VERSION(x) (((x) & GENMASK(7, 0)) << 8) +#define PTP_MIN_VERSION(x) ((x) & GENMASK(7, 0)) + #define PTP_RX_MOD 0x024F #define PTP_RX_MOD_BAD_UDPV4_CHKSUM_FORCE_FCS_DIS_ BIT(3) #define PTP_RX_TIMESTAMP_EN 0x024D @@ -3150,6 +3155,12 @@ static void lan8814_ptp_init(struct phy_device *phydev) lanphy_write_page_reg(phydev, 5, PTP_TX_PARSE_IP_ADDR_EN, 0); lanphy_write_page_reg(phydev, 5, PTP_RX_PARSE_IP_ADDR_EN, 0); + /* Disable checking for minorVersionPTP field */ + lanphy_write_page_reg(phydev, 5, PTP_RX_VERSION, + PTP_MAX_VERSION(0xff) | PTP_MIN_VERSION(0x0)); + lanphy_write_page_reg(phydev, 5, PTP_TX_VERSION, + PTP_MAX_VERSION(0xff) | PTP_MIN_VERSION(0x0)); + skb_queue_head_init(&ptp_priv->tx_queue); skb_queue_head_init(&ptp_priv->rx_queue); INIT_LIST_HEAD(&ptp_priv->rx_ts_list); -- cgit From 13e788deb7348cc88df34bed736c3b3b9927ea52 Mon Sep 17 00:00:00 2001 From: Sharath Srinivasan Date: Fri, 19 Jan 2024 17:48:39 -0800 Subject: net/rds: Fix UBSAN: array-index-out-of-bounds in rds_cmsg_recv Syzcaller UBSAN crash occurs in rds_cmsg_recv(), which reads inc->i_rx_lat_trace[j + 1] with index 4 (3 + 1), but with array size of 4 (RDS_RX_MAX_TRACES). Here 'j' is assigned from rs->rs_rx_trace[i] and in-turn from trace.rx_trace_pos[i] in rds_recv_track_latency(), with both arrays sized 3 (RDS_MSG_RX_DGRAM_TRACE_MAX). So fix the off-by-one bounds check in rds_recv_track_latency() to prevent a potential crash in rds_cmsg_recv(). Found by syzcaller: ================================================================= UBSAN: array-index-out-of-bounds in net/rds/recv.c:585:39 index 4 is out of range for type 'u64 [4]' CPU: 1 PID: 8058 Comm: syz-executor228 Not tainted 6.6.0-gd2f51b3516da #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x136/0x150 lib/dump_stack.c:106 ubsan_epilogue lib/ubsan.c:217 [inline] __ubsan_handle_out_of_bounds+0xd5/0x130 lib/ubsan.c:348 rds_cmsg_recv+0x60d/0x700 net/rds/recv.c:585 rds_recvmsg+0x3fb/0x1610 net/rds/recv.c:716 sock_recvmsg_nosec net/socket.c:1044 [inline] sock_recvmsg+0xe2/0x160 net/socket.c:1066 __sys_recvfrom+0x1b6/0x2f0 net/socket.c:2246 __do_sys_recvfrom net/socket.c:2264 [inline] __se_sys_recvfrom net/socket.c:2260 [inline] __x64_sys_recvfrom+0xe0/0x1b0 net/socket.c:2260 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b ================================================================== Fixes: 3289025aedc0 ("RDS: add receive message trace used by application") Reported-by: Chenyuan Yang Closes: https://lore.kernel.org/linux-rdma/CALGdzuoVdq-wtQ4Az9iottBqC5cv9ZhcE5q8N7LfYFvkRsOVcw@mail.gmail.com/ Signed-off-by: Sharath Srinivasan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/rds/af_rds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 01c4cdfef45d..8435a20968ef 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -419,7 +419,7 @@ static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval, rs->rs_rx_traces = trace.rx_traces; for (i = 0; i < rs->rs_rx_traces; i++) { - if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) { + if (trace.rx_trace_pos[i] >= RDS_MSG_RX_DGRAM_TRACE_MAX) { rs->rs_rx_traces = 0; return -EFAULT; } -- cgit From 7267e8dcad6b2f9fce05a6a06335d7040acbc2b6 Mon Sep 17 00:00:00 2001 From: Salvatore Dipietro Date: Fri, 19 Jan 2024 11:01:33 -0800 Subject: tcp: Add memory barrier to tcp_push() On CPUs with weak memory models, reads and updates performed by tcp_push to the sk variables can get reordered leaving the socket throttled when it should not. The tasklet running tcp_wfree() may also not observe the memory updates in time and will skip flushing any packets throttled by tcp_push(), delaying the sending. This can pathologically cause 40ms extra latency due to bad interactions with delayed acks. Adding a memory barrier in tcp_push removes the bug, similarly to the previous commit bf06200e732d ("tcp: tsq: fix nonagle handling"). smp_mb__after_atomic() is used to not incur in unnecessary overhead on x86 since not affected. Patch has been tested using an AWS c7g.2xlarge instance with Ubuntu 22.04 and Apache Tomcat 9.0.83 running the basic servlet below: import java.io.IOException; import java.io.OutputStreamWriter; import java.io.PrintWriter; import javax.servlet.ServletException; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; public class HelloWorldServlet extends HttpServlet { @Override protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { response.setContentType("text/html;charset=utf-8"); OutputStreamWriter osw = new OutputStreamWriter(response.getOutputStream(),"UTF-8"); String s = "a".repeat(3096); osw.write(s,0,s.length()); osw.flush(); } } Load was applied using wrk2 (https://github.com/kinvolk/wrk2) from an AWS c6i.8xlarge instance. Before the patch an additional 40ms latency from P99.99+ values is observed while, with the patch, the extra latency disappears. No patch and tcp_autocorking=1 ./wrk -t32 -c128 -d40s --latency -R10000 http://172.31.60.173:8080/hello/hello ... 50.000% 0.91ms 75.000% 1.13ms 90.000% 1.46ms 99.000% 1.74ms 99.900% 1.89ms 99.990% 41.95ms <<< 40+ ms extra latency 99.999% 48.32ms 100.000% 48.96ms With patch and tcp_autocorking=1 ./wrk -t32 -c128 -d40s --latency -R10000 http://172.31.60.173:8080/hello/hello ... 50.000% 0.90ms 75.000% 1.13ms 90.000% 1.45ms 99.000% 1.72ms 99.900% 1.83ms 99.990% 2.11ms <<< no 40+ ms extra latency 99.999% 2.53ms 100.000% 2.62ms Patch has been also tested on x86 (m7i.2xlarge instance) which it is not affected by this issue and the patch doesn't introduce any additional delay. Fixes: 7aa5470c2c09 ("tcp: tsq: move tsq_flags close to sk_wmem_alloc") Signed-off-by: Salvatore Dipietro Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240119190133.43698-1-dipiets@amazon.com Signed-off-by: Paolo Abeni --- net/ipv4/tcp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1baa484d2190..a1c6de385cce 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -722,6 +722,7 @@ void tcp_push(struct sock *sk, int flags, int mss_now, if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); + smp_mb__after_atomic(); } /* It is possible TX completion already happened * before we set TSQ_THROTTLED. -- cgit From 97de5a15edf2d22184f5ff588656030bbb7fa358 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 19 Jan 2024 19:16:42 -0800 Subject: selftest: Don't reuse port for SO_INCOMING_CPU test. Jakub reported that ASSERT_EQ(cpu, i) in so_incoming_cpu.c seems to fire somewhat randomly. # # RUN so_incoming_cpu.before_reuseport.test3 ... # # so_incoming_cpu.c:191:test3:Expected cpu (32) == i (0) # # test3: Test terminated by assertion # # FAIL so_incoming_cpu.before_reuseport.test3 # not ok 3 so_incoming_cpu.before_reuseport.test3 When the test failed, not-yet-accepted CLOSE_WAIT sockets received SYN with a "challenging" SEQ number, which was sent from an unexpected CPU that did not create the receiver. The test basically does: 1. for each cpu: 1-1. create a server 1-2. set SO_INCOMING_CPU 2. for each cpu: 2-1. set cpu affinity 2-2. create some clients 2-3. let clients connect() to the server on the same cpu 2-4. close() clients 3. for each server: 3-1. accept() all child sockets 3-2. check if all children have the same SO_INCOMING_CPU with the server The root cause was the close() in 2-4. and net.ipv4.tcp_tw_reuse. In a loop of 2., close() changed the client state to FIN_WAIT_2, and the peer transitioned to CLOSE_WAIT. In another loop of 2., connect() happened to select the same port of the FIN_WAIT_2 socket, and it was reused as the default value of net.ipv4.tcp_tw_reuse is 2. As a result, the new client sent SYN to the CLOSE_WAIT socket from a different CPU, and the receiver's sk_incoming_cpu was overwritten with unexpected CPU ID. Also, the SYN had a different SEQ number, so the CLOSE_WAIT socket responded with Challenge ACK. The new client properly returned RST and effectively killed the CLOSE_WAIT socket. This way, all clients were created successfully, but the error was detected later by 3-2., ASSERT_EQ(cpu, i). To avoid the failure, let's make sure that (i) the number of clients is less than the number of available ports and (ii) such reuse never happens. Fixes: 6df96146b202 ("selftest: Add test for SO_INCOMING_CPU.") Reported-by: Jakub Kicinski Signed-off-by: Kuniyuki Iwashima Tested-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240120031642.67014-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/so_incoming_cpu.c | 68 ++++++++++++++++++++------- 1 file changed, 50 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/net/so_incoming_cpu.c b/tools/testing/selftests/net/so_incoming_cpu.c index a14818164102..e9fa14e10732 100644 --- a/tools/testing/selftests/net/so_incoming_cpu.c +++ b/tools/testing/selftests/net/so_incoming_cpu.c @@ -3,19 +3,16 @@ #define _GNU_SOURCE #include +#include + #include #include #include #include "../kselftest_harness.h" -#define CLIENT_PER_SERVER 32 /* More sockets, more reliable */ -#define NR_SERVER self->nproc -#define NR_CLIENT (CLIENT_PER_SERVER * NR_SERVER) - FIXTURE(so_incoming_cpu) { - int nproc; int *servers; union { struct sockaddr addr; @@ -56,12 +53,47 @@ FIXTURE_VARIANT_ADD(so_incoming_cpu, after_all_listen) .when_to_set = AFTER_ALL_LISTEN, }; +static void write_sysctl(struct __test_metadata *_metadata, + char *filename, char *string) +{ + int fd, len, ret; + + fd = open(filename, O_WRONLY); + ASSERT_NE(fd, -1); + + len = strlen(string); + ret = write(fd, string, len); + ASSERT_EQ(ret, len); +} + +static void setup_netns(struct __test_metadata *_metadata) +{ + ASSERT_EQ(unshare(CLONE_NEWNET), 0); + ASSERT_EQ(system("ip link set lo up"), 0); + + write_sysctl(_metadata, "/proc/sys/net/ipv4/ip_local_port_range", "10000 60001"); + write_sysctl(_metadata, "/proc/sys/net/ipv4/tcp_tw_reuse", "0"); +} + +#define NR_PORT (60001 - 10000 - 1) +#define NR_CLIENT_PER_SERVER_DEFAULT 32 +static int nr_client_per_server, nr_server, nr_client; + FIXTURE_SETUP(so_incoming_cpu) { - self->nproc = get_nprocs(); - ASSERT_LE(2, self->nproc); + setup_netns(_metadata); + + nr_server = get_nprocs(); + ASSERT_LE(2, nr_server); + + if (NR_CLIENT_PER_SERVER_DEFAULT * nr_server < NR_PORT) + nr_client_per_server = NR_CLIENT_PER_SERVER_DEFAULT; + else + nr_client_per_server = NR_PORT / nr_server; + + nr_client = nr_client_per_server * nr_server; - self->servers = malloc(sizeof(int) * NR_SERVER); + self->servers = malloc(sizeof(int) * nr_server); ASSERT_NE(self->servers, NULL); self->in_addr.sin_family = AF_INET; @@ -74,7 +106,7 @@ FIXTURE_TEARDOWN(so_incoming_cpu) { int i; - for (i = 0; i < NR_SERVER; i++) + for (i = 0; i < nr_server; i++) close(self->servers[i]); free(self->servers); @@ -110,10 +142,10 @@ int create_server(struct __test_metadata *_metadata, if (variant->when_to_set == BEFORE_LISTEN) set_so_incoming_cpu(_metadata, fd, cpu); - /* We don't use CLIENT_PER_SERVER here not to block + /* We don't use nr_client_per_server here not to block * this test at connect() if SO_INCOMING_CPU is broken. */ - ret = listen(fd, NR_CLIENT); + ret = listen(fd, nr_client); ASSERT_EQ(ret, 0); if (variant->when_to_set == AFTER_LISTEN) @@ -128,7 +160,7 @@ void create_servers(struct __test_metadata *_metadata, { int i, ret; - for (i = 0; i < NR_SERVER; i++) { + for (i = 0; i < nr_server; i++) { self->servers[i] = create_server(_metadata, self, variant, i); if (i == 0) { @@ -138,7 +170,7 @@ void create_servers(struct __test_metadata *_metadata, } if (variant->when_to_set == AFTER_ALL_LISTEN) { - for (i = 0; i < NR_SERVER; i++) + for (i = 0; i < nr_server; i++) set_so_incoming_cpu(_metadata, self->servers[i], i); } } @@ -149,7 +181,7 @@ void create_clients(struct __test_metadata *_metadata, cpu_set_t cpu_set; int i, j, fd, ret; - for (i = 0; i < NR_SERVER; i++) { + for (i = 0; i < nr_server; i++) { CPU_ZERO(&cpu_set); CPU_SET(i, &cpu_set); @@ -162,7 +194,7 @@ void create_clients(struct __test_metadata *_metadata, ret = sched_setaffinity(0, sizeof(cpu_set), &cpu_set); ASSERT_EQ(ret, 0); - for (j = 0; j < CLIENT_PER_SERVER; j++) { + for (j = 0; j < nr_client_per_server; j++) { fd = socket(AF_INET, SOCK_STREAM, 0); ASSERT_NE(fd, -1); @@ -180,8 +212,8 @@ void verify_incoming_cpu(struct __test_metadata *_metadata, int i, j, fd, cpu, ret, total = 0; socklen_t len = sizeof(int); - for (i = 0; i < NR_SERVER; i++) { - for (j = 0; j < CLIENT_PER_SERVER; j++) { + for (i = 0; i < nr_server; i++) { + for (j = 0; j < nr_client_per_server; j++) { /* If we see -EAGAIN here, SO_INCOMING_CPU is broken */ fd = accept(self->servers[i], &self->addr, &self->addrlen); ASSERT_NE(fd, -1); @@ -195,7 +227,7 @@ void verify_incoming_cpu(struct __test_metadata *_metadata, } } - ASSERT_EQ(total, NR_CLIENT); + ASSERT_EQ(total, nr_client); TH_LOG("SO_INCOMING_CPU is very likely to be " "working correctly with %d sockets.", total); } -- cgit From 234ec0b6034b16869d45128b8cd2dc6ffe596f04 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Mon, 22 Jan 2024 09:18:07 +0800 Subject: netlink: fix potential sleeping issue in mqueue_flush_file I analyze the potential sleeping issue of the following processes: Thread A Thread B ... netlink_create //ref = 1 do_mq_notify ... sock = netlink_getsockbyfilp ... //ref = 2 info->notify_sock = sock; ... ... netlink_sendmsg ... skb = netlink_alloc_large_skb //skb->head is vmalloced ... netlink_unicast ... sk = netlink_getsockbyportid //ref = 3 ... netlink_sendskb ... __netlink_sendskb ... skb_queue_tail //put skb to sk_receive_queue ... sock_put //ref = 2 ... ... ... netlink_release ... deferred_put_nlk_sk //ref = 1 mqueue_flush_file spin_lock remove_notification netlink_sendskb sock_put //ref = 0 sk_free ... __sk_destruct netlink_sock_destruct skb_queue_purge //get skb from sk_receive_queue ... __skb_queue_purge_reason kfree_skb_reason __kfree_skb ... skb_release_all skb_release_head_state netlink_skb_destructor vfree(skb->head) //sleeping while holding spinlock In netlink_sendmsg, if the memory pointed to by skb->head is allocated by vmalloc, and is put to sk_receive_queue queue, also the skb is not freed. When the mqueue executes flush, the sleeping bug will occur. Use vfree_atomic instead of vfree in netlink_skb_destructor to solve the issue. Fixes: c05cdb1b864f ("netlink: allow large data transfers from user-space") Signed-off-by: Zhengchao Shao Link: https://lore.kernel.org/r/20240122011807.2110357-1-shaozhengchao@huawei.com Signed-off-by: Paolo Abeni --- net/netlink/af_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4ed8ffd58ff3..9c962347cf85 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -374,7 +374,7 @@ static void netlink_skb_destructor(struct sk_buff *skb) if (is_vmalloc_addr(skb->head)) { if (!skb->cloned || !atomic_dec_return(&(skb_shinfo(skb)->dataref))) - vfree(skb->head); + vfree_atomic(skb->head); skb->head = NULL; } -- cgit From 435e202d645c197dcfd39d7372eb2a56529b6640 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Mon, 22 Jan 2024 18:20:01 +0800 Subject: ipv6: init the accept_queue's spinlocks in inet6_create In commit 198bc90e0e73("tcp: make sure init the accept_queue's spinlocks once"), the spinlocks of accept_queue are initialized only when socket is created in the inet4 scenario. The locks are not initialized when socket is created in the inet6 scenario. The kernel reports the following error: INFO: trying to register non-static key. The code is fine but needs lockdep annotation, or maybe you didn't initialize this object before use? turning off the locking correctness validator. Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 Call Trace: dump_stack_lvl (lib/dump_stack.c:107) register_lock_class (kernel/locking/lockdep.c:1289) __lock_acquire (kernel/locking/lockdep.c:5015) lock_acquire.part.0 (kernel/locking/lockdep.c:5756) _raw_spin_lock_bh (kernel/locking/spinlock.c:178) inet_csk_listen_stop (net/ipv4/inet_connection_sock.c:1386) tcp_disconnect (net/ipv4/tcp.c:2981) inet_shutdown (net/ipv4/af_inet.c:935) __sys_shutdown (./include/linux/file.h:32 net/socket.c:2438) __x64_sys_shutdown (net/socket.c:2445) do_syscall_64 (arch/x86/entry/common.c:52) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) RIP: 0033:0x7f52ecd05a3d Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ab a3 0e 00 f7 d8 64 89 01 48 RSP: 002b:00007f52ecf5dde8 EFLAGS: 00000293 ORIG_RAX: 0000000000000030 RAX: ffffffffffffffda RBX: 00007f52ecf5e640 RCX: 00007f52ecd05a3d RDX: 00007f52ecc8b188 RSI: 0000000000000000 RDI: 0000000000000004 RBP: 00007f52ecf5de20 R08: 00007ffdae45c69f R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000293 R12: 00007f52ecf5e640 R13: 0000000000000000 R14: 00007f52ecc8b060 R15: 00007ffdae45c6e0 Fixes: 198bc90e0e73 ("tcp: make sure init the accept_queue's spinlocks once") Signed-off-by: Zhengchao Shao Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240122102001.2851701-1-shaozhengchao@huawei.com Signed-off-by: Paolo Abeni --- net/ipv6/af_inet6.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 13a1833a4df5..959bfd9f6344 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -199,6 +199,9 @@ lookup_protocol: if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; + if (INET_PROTOSW_ICSK & answer_flags) + inet_init_csk_locks(sk); + inet = inet_sk(sk); inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); -- cgit From 1732ebc4a26181c8f116c7639db99754b313edc8 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Tue, 23 Jan 2024 02:32:07 +0000 Subject: riscv, bpf: Fix unpredictable kernel crash about RV64 struct_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We encountered a kernel crash triggered by the bpf_tcp_ca testcase as show below: Unable to handle kernel paging request at virtual address ff60000088554500 Oops [#1] ... CPU: 3 PID: 458 Comm: test_progs Tainted: G OE 6.8.0-rc1-kselftest_plain #1 Hardware name: riscv-virtio,qemu (DT) epc : 0xff60000088554500 ra : tcp_ack+0x288/0x1232 epc : ff60000088554500 ra : ffffffff80cc7166 sp : ff2000000117ba50 gp : ffffffff82587b60 tp : ff60000087be0040 t0 : ff60000088554500 t1 : ffffffff801ed24e t2 : 0000000000000000 s0 : ff2000000117bbc0 s1 : 0000000000000500 a0 : ff20000000691000 a1 : 0000000000000018 a2 : 0000000000000001 a3 : ff60000087be03a0 a4 : 0000000000000000 a5 : 0000000000000000 a6 : 0000000000000021 a7 : ffffffff8263f880 s2 : 000000004ac3c13b s3 : 000000004ac3c13a s4 : 0000000000008200 s5 : 0000000000000001 s6 : 0000000000000104 s7 : ff2000000117bb00 s8 : ff600000885544c0 s9 : 0000000000000000 s10: ff60000086ff0b80 s11: 000055557983a9c0 t3 : 0000000000000000 t4 : 000000000000ffc4 t5 : ffffffff8154f170 t6 : 0000000000000030 status: 0000000200000120 badaddr: ff60000088554500 cause: 000000000000000c Code: c796 67d7 0000 0000 0052 0002 c13b 4ac3 0000 0000 (0001) 0000 ---[ end trace 0000000000000000 ]--- The reason is that commit 2cd3e3772e41 ("x86/cfi,bpf: Fix bpf_struct_ops CFI") changes the func_addr of arch_prepare_bpf_trampoline in struct_ops from NULL to non-NULL, while we use func_addr on RV64 to differentiate between struct_ops and regular trampoline. When the struct_ops testcase is triggered, it emits wrong prologue and epilogue, and lead to unpredictable issues. After commit 2cd3e3772e41, we can use BPF_TRAMP_F_INDIRECT to distinguish them as it always be set in struct_ops. Fixes: 2cd3e3772e41 ("x86/cfi,bpf: Fix bpf_struct_ops CFI") Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20240123023207.1917284-1-pulehui@huaweicloud.com --- arch/riscv/net/bpf_jit_comp64.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 58dc64dd94a8..719a97e7edb2 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -795,6 +795,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + bool is_struct_ops = flags & BPF_TRAMP_F_INDIRECT; void *orig_call = func_addr; bool save_ret; u32 insn; @@ -878,7 +879,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, stack_size = round_up(stack_size, 16); - if (func_addr) { + if (!is_struct_ops) { /* For the trampoline called from function entry, * the frame of traced function and the frame of * trampoline need to be considered. @@ -998,7 +999,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, emit_ld(RV_REG_S1, -sreg_off, RV_REG_FP, ctx); - if (func_addr) { + if (!is_struct_ops) { /* trampoline called from function entry */ emit_ld(RV_REG_T0, stack_size - 8, RV_REG_SP, ctx); emit_ld(RV_REG_FP, stack_size - 16, RV_REG_SP, ctx); -- cgit From 3222bc997a24821ea4f96d1a9108dafeadc00cfb Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Thu, 18 Jan 2024 11:18:06 -0800 Subject: Revert "net: macsec: use skb_ensure_writable_head_tail to expand the skb" This reverts commit b34ab3527b9622ca4910df24ff5beed5aa66c6b5. Using skb_ensure_writable_head_tail without a call to skb_unshare causes the MACsec stack to operate on the original skb rather than a copy in the macsec_encrypt path. This causes the buffer to be exceeded in space, and leads to warnings generated by skb_put operations. Opting to revert this change since skb_copy_expand is more efficient than skb_ensure_writable_head_tail followed by a call to skb_unshare. Log: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:2464! invalid opcode: 0000 [#1] SMP KASAN CPU: 21 PID: 61997 Comm: iperf3 Not tainted 6.7.0-rc8_for_upstream_debug_2024_01_07_17_05 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:skb_put+0x113/0x190 Code: 03 0f b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 04 84 d2 75 70 3b 9d bc 00 00 00 77 0e 48 83 c4 08 4c 89 e8 5b 5d 41 5d c3 <0f> 0b 4c 8b 6c 24 20 89 74 24 04 e8 6d b7 f0 fe 8b 74 24 04 48 c7 RSP: 0018:ffff8882694e7278 EFLAGS: 00010202 RAX: 0000000000000025 RBX: 0000000000000100 RCX: 0000000000000001 RDX: 0000000000000000 RSI: 0000000000000010 RDI: ffff88816ae0bad4 RBP: ffff88816ae0ba60 R08: 0000000000000004 R09: 0000000000000004 R10: 0000000000000001 R11: 0000000000000001 R12: ffff88811ba5abfa R13: ffff8882bdecc100 R14: ffff88816ae0ba60 R15: ffff8882bdecc0ae FS: 00007fe54df02740(0000) GS:ffff88881f080000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe54d92e320 CR3: 000000010a345003 CR4: 0000000000370eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? die+0x33/0x90 ? skb_put+0x113/0x190 ? do_trap+0x1b4/0x3b0 ? skb_put+0x113/0x190 ? do_error_trap+0xb6/0x180 ? skb_put+0x113/0x190 ? handle_invalid_op+0x2c/0x30 ? skb_put+0x113/0x190 ? exc_invalid_op+0x2b/0x40 ? asm_exc_invalid_op+0x16/0x20 ? skb_put+0x113/0x190 ? macsec_start_xmit+0x4e9/0x21d0 macsec_start_xmit+0x830/0x21d0 ? get_txsa_from_nl+0x400/0x400 ? lock_downgrade+0x690/0x690 ? dev_queue_xmit_nit+0x78b/0xae0 dev_hard_start_xmit+0x151/0x560 __dev_queue_xmit+0x1580/0x28f0 ? check_chain_key+0x1c5/0x490 ? netdev_core_pick_tx+0x2d0/0x2d0 ? __ip_queue_xmit+0x798/0x1e00 ? lock_downgrade+0x690/0x690 ? mark_held_locks+0x9f/0xe0 ip_finish_output2+0x11e4/0x2050 ? ip_mc_finish_output+0x520/0x520 ? ip_fragment.constprop.0+0x230/0x230 ? __ip_queue_xmit+0x798/0x1e00 __ip_queue_xmit+0x798/0x1e00 ? __skb_clone+0x57a/0x760 __tcp_transmit_skb+0x169d/0x3490 ? lock_downgrade+0x690/0x690 ? __tcp_select_window+0x1320/0x1320 ? mark_held_locks+0x9f/0xe0 ? lockdep_hardirqs_on_prepare+0x286/0x400 ? tcp_small_queue_check.isra.0+0x120/0x3d0 tcp_write_xmit+0x12b6/0x7100 ? skb_page_frag_refill+0x1e8/0x460 __tcp_push_pending_frames+0x92/0x320 tcp_sendmsg_locked+0x1ed4/0x3190 ? tcp_sendmsg_fastopen+0x650/0x650 ? tcp_sendmsg+0x1a/0x40 ? mark_held_locks+0x9f/0xe0 ? lockdep_hardirqs_on_prepare+0x286/0x400 tcp_sendmsg+0x28/0x40 ? inet_send_prepare+0x1b0/0x1b0 __sock_sendmsg+0xc5/0x190 sock_write_iter+0x222/0x380 ? __sock_sendmsg+0x190/0x190 ? kfree+0x96/0x130 vfs_write+0x842/0xbd0 ? kernel_write+0x530/0x530 ? __fget_light+0x51/0x220 ? __fget_light+0x51/0x220 ksys_write+0x172/0x1d0 ? update_socket_protocol+0x10/0x10 ? __x64_sys_read+0xb0/0xb0 ? lockdep_hardirqs_on_prepare+0x286/0x400 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e RIP: 0033:0x7fe54d9018b7 Code: 0f 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 RSP: 002b:00007ffdbd4191d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000025 RCX: 00007fe54d9018b7 RDX: 0000000000000025 RSI: 0000000000d9859c RDI: 0000000000000004 RBP: 0000000000d9859c R08: 0000000000000004 R09: 0000000000000000 R10: 00007fe54d80afe0 R11: 0000000000000246 R12: 0000000000000004 R13: 0000000000000025 R14: 00007fe54e00ec00 R15: 0000000000d982a0 Modules linked in: 8021q garp mrp iptable_raw bonding vfio_pci rdma_ucm ib_umad mlx5_vfio_pci mlx5_ib vfio_pci_core vfio_iommu_type1 ib_uverbs vfio mlx5_core ip_gre nf_tables ipip tunnel4 ib_ipoib ip6_gre gre ip6_tunnel tunnel6 geneve openvswitch nsh xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay rpcrdma ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_cm ib_core zram zsmalloc fuse [last unloaded: ib_uverbs] ---[ end trace 0000000000000000 ]--- Cc: Radu Pirea (NXP OSS) Cc: Sabrina Dubroca Signed-off-by: Rahul Rameshbabu Link: https://lore.kernel.org/r/20240118191811.50271-1-rrameshbabu@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index e34816638569..7f5426285c61 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -607,11 +607,26 @@ static struct sk_buff *macsec_encrypt(struct sk_buff *skb, return ERR_PTR(-EINVAL); } - ret = skb_ensure_writable_head_tail(skb, dev); - if (unlikely(ret < 0)) { - macsec_txsa_put(tx_sa); - kfree_skb(skb); - return ERR_PTR(ret); + if (unlikely(skb_headroom(skb) < MACSEC_NEEDED_HEADROOM || + skb_tailroom(skb) < MACSEC_NEEDED_TAILROOM)) { + struct sk_buff *nskb = skb_copy_expand(skb, + MACSEC_NEEDED_HEADROOM, + MACSEC_NEEDED_TAILROOM, + GFP_ATOMIC); + if (likely(nskb)) { + consume_skb(skb); + skb = nskb; + } else { + macsec_txsa_put(tx_sa); + kfree_skb(skb); + return ERR_PTR(-ENOMEM); + } + } else { + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) { + macsec_txsa_put(tx_sa); + return ERR_PTR(-ENOMEM); + } } unprotected_len = skb->len; -- cgit From 6941f67ad37d5465b75b9ffc498fcf6897a3c00e Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 22 Jan 2024 08:20:28 -0800 Subject: hv_netvsc: Calculate correct ring size when PAGE_SIZE is not 4 Kbytes Current code in netvsc_drv_init() incorrectly assumes that PAGE_SIZE is 4 Kbytes, which is wrong on ARM64 with 16K or 64K page size. As a result, the default VMBus ring buffer size on ARM64 with 64K page size is 8 Mbytes instead of the expected 512 Kbytes. While this doesn't break anything, a typical VM with 8 vCPUs and 8 netvsc channels wastes 120 Mbytes (8 channels * 2 ring buffers/channel * 7.5 Mbytes/ring buffer). Unfortunately, the module parameter specifying the ring buffer size is in units of 4 Kbyte pages. Ideally, it should be in units that are independent of PAGE_SIZE, but backwards compatibility prevents changing that now. Fix this by having netvsc_drv_init() hardcode 4096 instead of using PAGE_SIZE when calculating the ring buffer size in bytes. Also use the VMBUS_RING_SIZE macro to ensure proper alignment when running with page size larger than 4K. Cc: # 5.15.x Fixes: 7aff79e297ee ("Drivers: hv: Enable Hyper-V code to be built on ARM64") Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/20240122162028.348885-1-mhklinux@outlook.com Signed-off-by: Jakub Kicinski --- drivers/net/hyperv/netvsc_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 4406427d4617..273bd8a20122 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -44,7 +44,7 @@ static unsigned int ring_size __ro_after_init = 128; module_param(ring_size, uint, 0444); -MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)"); +MODULE_PARM_DESC(ring_size, "Ring buffer size (# of 4K pages)"); unsigned int netvsc_ring_bytes __ro_after_init; static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | @@ -2807,7 +2807,7 @@ static int __init netvsc_drv_init(void) pr_info("Increased ring_size to %u (min allowed)\n", ring_size); } - netvsc_ring_bytes = ring_size * PAGE_SIZE; + netvsc_ring_bytes = VMBUS_RING_SIZE(ring_size * 4096); register_netdevice_notifier(&netvsc_netdev_notifier); -- cgit From 04fe7c5029cbdbcdb28917f09a958d939a8f19f7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 22 Jan 2024 12:35:28 -0800 Subject: selftests: fill in some missing configs for net We are missing a lot of config options from net selftests, it seems: tun/tap: CONFIG_TUN, CONFIG_MACVLAN, CONFIG_MACVTAP fib_tests: CONFIG_NET_SCH_FQ_CODEL l2tp: CONFIG_L2TP, CONFIG_L2TP_V3, CONFIG_L2TP_IP, CONFIG_L2TP_ETH sctp-vrf: CONFIG_INET_DIAG txtimestamp: CONFIG_NET_CLS_U32 vxlan_mdb: CONFIG_BRIDGE_VLAN_FILTERING gre_gso: CONFIG_NET_IPGRE_DEMUX, CONFIG_IP_GRE, CONFIG_IPV6_GRE srv6_end_dt*_l3vpn: CONFIG_IPV6_SEG6_LWTUNNEL ip_local_port_range: CONFIG_MPTCP fib_test: CONFIG_NET_CLS_BASIC rtnetlink: CONFIG_MACSEC, CONFIG_NET_SCH_HTB, CONFIG_XFRM_INTERFACE CONFIG_NET_IPGRE, CONFIG_BONDING fib_nexthops: CONFIG_MPLS, CONFIG_MPLS_ROUTING vxlan_mdb: CONFIG_NET_ACT_GACT tls: CONFIG_TLS, CONFIG_CRYPTO_CHACHA20POLY1305 psample: CONFIG_PSAMPLE fcnal: CONFIG_TCP_MD5SIG Try to add them in a semi-alphabetical order. Fixes: 62199e3f1658 ("selftests: net: Add VXLAN MDB test") Fixes: c12e0d5f267d ("self-tests: introduce self-tests for RPS default mask") Fixes: 122db5e3634b ("selftests/net: add MPTCP coverage for IP_LOCAL_PORT_RANGE") Link: https://lore.kernel.org/r/20240122203528.672004-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/config | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 8da562a9ae87..19ff75051660 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -1,5 +1,6 @@ CONFIG_USER_NS=y CONFIG_NET_NS=y +CONFIG_BONDING=m CONFIG_BPF_SYSCALL=y CONFIG_TEST_BPF=m CONFIG_NUMA=y @@ -14,9 +15,13 @@ CONFIG_VETH=y CONFIG_NET_IPVTI=y CONFIG_IPV6_VTI=y CONFIG_DUMMY=y +CONFIG_BRIDGE_VLAN_FILTERING=y CONFIG_BRIDGE=y +CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_VLAN_8021Q=y CONFIG_IFB=y +CONFIG_INET_DIAG=y +CONFIG_IP_GRE=m CONFIG_NETFILTER=y CONFIG_NETFILTER_ADVANCED=y CONFIG_NF_CONNTRACK=m @@ -25,15 +30,36 @@ CONFIG_IP6_NF_IPTABLES=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP6_NF_NAT=m CONFIG_IP_NF_NAT=m +CONFIG_IPV6_GRE=m +CONFIG_IPV6_SEG6_LWTUNNEL=y +CONFIG_L2TP_ETH=m +CONFIG_L2TP_IP=m +CONFIG_L2TP=m +CONFIG_L2TP_V3=y +CONFIG_MACSEC=m +CONFIG_MACVLAN=y +CONFIG_MACVTAP=y +CONFIG_MPLS=y +CONFIG_MPTCP=y CONFIG_NF_TABLES=m CONFIG_NF_TABLES_IPV6=y CONFIG_NF_TABLES_IPV4=y CONFIG_NFT_NAT=m +CONFIG_NET_ACT_GACT=m +CONFIG_NET_CLS_BASIC=m +CONFIG_NET_CLS_U32=m +CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IPGRE=m +CONFIG_NET_SCH_FQ_CODEL=m +CONFIG_NET_SCH_HTB=m CONFIG_NET_SCH_FQ=m CONFIG_NET_SCH_ETF=m CONFIG_NET_SCH_NETEM=y +CONFIG_PSAMPLE=m +CONFIG_TCP_MD5SIG=y CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_KALLSYMS=y +CONFIG_TLS=m CONFIG_TRACEPOINTS=y CONFIG_NET_DROP_MONITOR=m CONFIG_NETDEVSIM=m @@ -48,7 +74,9 @@ CONFIG_BAREUDP=m CONFIG_IPV6_IOAM6_LWTUNNEL=y CONFIG_CRYPTO_SM4_GENERIC=y CONFIG_AMT=m +CONFIG_TUN=y CONFIG_VXLAN=m CONFIG_IP_SCTP=m CONFIG_NETFILTER_XT_MATCH_POLICY=m CONFIG_CRYPTO_ARIA=y +CONFIG_XFRM_INTERFACE=m -- cgit From 32f2a0afa95fae0d1ceec2ff06e0e816939964b8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 22 Jan 2024 15:28:43 +0200 Subject: net/sched: flower: Fix chain template offload When a qdisc is deleted from a net device the stack instructs the underlying driver to remove its flow offload callback from the associated filter block using the 'FLOW_BLOCK_UNBIND' command. The stack then continues to replay the removal of the filters in the block for this driver by iterating over the chains in the block and invoking the 'reoffload' operation of the classifier being used. In turn, the classifier in its 'reoffload' operation prepares and emits a 'FLOW_CLS_DESTROY' command for each filter. However, the stack does not do the same for chain templates and the underlying driver never receives a 'FLOW_CLS_TMPLT_DESTROY' command when a qdisc is deleted. This results in a memory leak [1] which can be reproduced using [2]. Fix by introducing a 'tmplt_reoffload' operation and have the stack invoke it with the appropriate arguments as part of the replay. Implement the operation in the sole classifier that supports chain templates (flower) by emitting the 'FLOW_CLS_TMPLT_{CREATE,DESTROY}' command based on whether a flow offload callback is being bound to a filter block or being unbound from one. As far as I can tell, the issue happens since cited commit which reordered tcf_block_offload_unbind() before tcf_block_flush_all_chains() in __tcf_block_put(). The order cannot be reversed as the filter block is expected to be freed after flushing all the chains. [1] unreferenced object 0xffff888107e28800 (size 2048): comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s) hex dump (first 32 bytes): b1 a6 7c 11 81 88 ff ff e0 5b b3 10 81 88 ff ff ..|......[...... 01 00 00 00 00 00 00 00 e0 aa b0 84 ff ff ff ff ................ backtrace: [] __kmem_cache_alloc_node+0x1e8/0x320 [] __kmalloc+0x4e/0x90 [] mlxsw_sp_acl_ruleset_get+0x34d/0x7a0 [] mlxsw_sp_flower_tmplt_create+0x145/0x180 [] mlxsw_sp_flow_block_cb+0x1ea/0x280 [] tc_setup_cb_call+0x183/0x340 [] fl_tmplt_create+0x3da/0x4c0 [] tc_ctl_chain+0xa15/0x1170 [] rtnetlink_rcv_msg+0x3cc/0xed0 [] netlink_rcv_skb+0x170/0x440 [] netlink_unicast+0x540/0x820 [] netlink_sendmsg+0x8d8/0xda0 [] ____sys_sendmsg+0x30f/0xa80 [] ___sys_sendmsg+0x13a/0x1e0 [] __sys_sendmsg+0x11c/0x1f0 [] do_syscall_64+0x40/0xe0 unreferenced object 0xffff88816d2c0400 (size 1024): comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s) hex dump (first 32 bytes): 40 00 00 00 00 00 00 00 57 f6 38 be 00 00 00 00 @.......W.8..... 10 04 2c 6d 81 88 ff ff 10 04 2c 6d 81 88 ff ff ..,m......,m.... backtrace: [] __kmem_cache_alloc_node+0x1e8/0x320 [] __kmalloc_node+0x51/0x90 [] kvmalloc_node+0xa6/0x1f0 [] bucket_table_alloc.isra.0+0x83/0x460 [] rhashtable_init+0x43b/0x7c0 [] mlxsw_sp_acl_ruleset_get+0x428/0x7a0 [] mlxsw_sp_flower_tmplt_create+0x145/0x180 [] mlxsw_sp_flow_block_cb+0x1ea/0x280 [] tc_setup_cb_call+0x183/0x340 [] fl_tmplt_create+0x3da/0x4c0 [] tc_ctl_chain+0xa15/0x1170 [] rtnetlink_rcv_msg+0x3cc/0xed0 [] netlink_rcv_skb+0x170/0x440 [] netlink_unicast+0x540/0x820 [] netlink_sendmsg+0x8d8/0xda0 [] ____sys_sendmsg+0x30f/0xa80 [2] # tc qdisc add dev swp1 clsact # tc chain add dev swp1 ingress proto ip chain 1 flower dst_ip 0.0.0.0/32 # tc qdisc del dev swp1 clsact # devlink dev reload pci/0000:06:00.0 Fixes: bbf73830cd48 ("net: sched: traverse chains in block with tcf_get_next_chain()") Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/sch_generic.h | 4 ++++ net/sched/cls_api.c | 9 ++++++++- net/sched/cls_flower.c | 23 +++++++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index ba3e1b315de8..934fdb977551 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -375,6 +375,10 @@ struct tcf_proto_ops { struct nlattr **tca, struct netlink_ext_ack *extack); void (*tmplt_destroy)(void *tmplt_priv); + void (*tmplt_reoffload)(struct tcf_chain *chain, + bool add, + flow_setup_cb_t *cb, + void *cb_priv); struct tcf_exts * (*get_exts)(const struct tcf_proto *tp, u32 handle); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 92a12e3d0fe6..ff3d396a65aa 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1560,6 +1560,9 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb, chain_prev = chain, chain = __tcf_get_next_chain(block, chain), tcf_chain_put(chain_prev)) { + if (chain->tmplt_ops && add) + chain->tmplt_ops->tmplt_reoffload(chain, true, cb, + cb_priv); for (tp = __tcf_get_next_proto(chain, NULL); tp; tp_prev = tp, tp = __tcf_get_next_proto(chain, tp), @@ -1575,6 +1578,9 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb, goto err_playback_remove; } } + if (chain->tmplt_ops && !add) + chain->tmplt_ops->tmplt_reoffload(chain, false, cb, + cb_priv); } return 0; @@ -3000,7 +3006,8 @@ static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net, ops = tcf_proto_lookup_ops(name, true, extack); if (IS_ERR(ops)) return PTR_ERR(ops); - if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) { + if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump || + !ops->tmplt_reoffload) { NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier"); module_put(ops->owner); return -EOPNOTSUPP; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index e5314a31f75a..efb9d2811b73 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2721,6 +2721,28 @@ static void fl_tmplt_destroy(void *tmplt_priv) kfree(tmplt); } +static void fl_tmplt_reoffload(struct tcf_chain *chain, bool add, + flow_setup_cb_t *cb, void *cb_priv) +{ + struct fl_flow_tmplt *tmplt = chain->tmplt_priv; + struct flow_cls_offload cls_flower = {}; + + cls_flower.rule = flow_rule_alloc(0); + if (!cls_flower.rule) + return; + + cls_flower.common.chain_index = chain->index; + cls_flower.command = add ? FLOW_CLS_TMPLT_CREATE : + FLOW_CLS_TMPLT_DESTROY; + cls_flower.cookie = (unsigned long) tmplt; + cls_flower.rule->match.dissector = &tmplt->dissector; + cls_flower.rule->match.mask = &tmplt->mask; + cls_flower.rule->match.key = &tmplt->dummy_key; + + cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv); + kfree(cls_flower.rule); +} + static int fl_dump_key_val(struct sk_buff *skb, void *val, int val_type, void *mask, int mask_type, int len) @@ -3628,6 +3650,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .bind_class = fl_bind_class, .tmplt_create = fl_tmplt_create, .tmplt_destroy = fl_tmplt_destroy, + .tmplt_reoffload = fl_tmplt_reoffload, .tmplt_dump = fl_tmplt_dump, .get_exts = fl_get_exts, .owner = THIS_MODULE, -- cgit From 25461ce8b3d28528f2c55f5e737e99d2906eda83 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 15 Dec 2023 19:31:14 -0800 Subject: net/mlx5e: Use the correct lag ports number when creating TISes The cited commit moved the code of mlx5e_create_tises() and changed the loop to create TISes over MLX5_MAX_PORTS constant value, instead of getting the correct lag ports supported by the device, which can cause FW errors on devices with less than MLX5_MAX_PORTS ports. Change that back to mlx5e_get_num_lag_ports(mdev). Also IPoIB interfaces create there own TISes, they don't use the eth TISes, pass a flag to indicate that. This fixes the following errors that might appear in kernel log: mlx5_cmd_out_err:808:(pid 650): CREATE_TIS(0x912) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x595b5d), err(-22) mlx5e_create_mdev_resources:174:(pid 650): alloc tises failed, -22 Fixes: b25bd37c859f ("net/mlx5: Move TISes from priv to mdev HW resources") Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_common.c | 21 +++++++++++++-------- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 2 +- include/linux/mlx5/driver.h | 1 + 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 0bfe1ca8a364..55c6ace0acd5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1124,7 +1124,7 @@ static inline bool mlx5_tx_swp_supported(struct mlx5_core_dev *mdev) extern const struct ethtool_ops mlx5e_ethtool_ops; int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey); -int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev); +int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises); void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev); int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb, bool enable_mc_lb); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index 67f546683e85..6ed3a32b7e22 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -95,7 +95,7 @@ static void mlx5e_destroy_tises(struct mlx5_core_dev *mdev, u32 tisn[MLX5_MAX_PO { int tc, i; - for (i = 0; i < MLX5_MAX_PORTS; i++) + for (i = 0; i < mlx5e_get_num_lag_ports(mdev); i++) for (tc = 0; tc < MLX5_MAX_NUM_TC; tc++) mlx5e_destroy_tis(mdev, tisn[i][tc]); } @@ -110,7 +110,7 @@ static int mlx5e_create_tises(struct mlx5_core_dev *mdev, u32 tisn[MLX5_MAX_PORT int tc, i; int err; - for (i = 0; i < MLX5_MAX_PORTS; i++) { + for (i = 0; i < mlx5e_get_num_lag_ports(mdev); i++) { for (tc = 0; tc < MLX5_MAX_NUM_TC; tc++) { u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {}; void *tisc; @@ -140,7 +140,7 @@ err_close_tises: return err; } -int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev) +int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises) { struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs; int err; @@ -169,11 +169,15 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev) goto err_destroy_mkey; } - err = mlx5e_create_tises(mdev, res->tisn); - if (err) { - mlx5_core_err(mdev, "alloc tises failed, %d\n", err); - goto err_destroy_bfreg; + if (create_tises) { + err = mlx5e_create_tises(mdev, res->tisn); + if (err) { + mlx5_core_err(mdev, "alloc tises failed, %d\n", err); + goto err_destroy_bfreg; + } + res->tisn_valid = true; } + INIT_LIST_HEAD(&res->td.tirs_list); mutex_init(&res->td.list_lock); @@ -203,7 +207,8 @@ void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev) mlx5_crypto_dek_cleanup(mdev->mlx5e_res.dek_priv); mdev->mlx5e_res.dek_priv = NULL; - mlx5e_destroy_tises(mdev, res->tisn); + if (res->tisn_valid) + mlx5e_destroy_tises(mdev, res->tisn); mlx5_free_bfreg(mdev, &res->bfreg); mlx5_core_destroy_mkey(mdev, res->mkey); mlx5_core_dealloc_transport_domain(mdev, res->td.tdn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index b5f1c4ca38ba..c8e8f512803e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5992,7 +5992,7 @@ static int mlx5e_resume(struct auxiliary_device *adev) if (netif_device_present(netdev)) return 0; - err = mlx5e_create_mdev_resources(mdev); + err = mlx5e_create_mdev_resources(mdev, true); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 58845121954c..d77be1b4dd9c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -783,7 +783,7 @@ static int mlx5_rdma_setup_rn(struct ib_device *ibdev, u32 port_num, } /* This should only be called once per mdev */ - err = mlx5e_create_mdev_resources(mdev); + err = mlx5e_create_mdev_resources(mdev, false); if (err) goto destroy_ht; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8c55ff351e5f..41f03b352401 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -681,6 +681,7 @@ struct mlx5e_resources { struct mlx5_sq_bfreg bfreg; #define MLX5_MAX_NUM_TC 8 u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC]; + bool tisn_valid; } hw_objs; struct net_device *uplink_netdev; struct mutex uplink_netdev_lock; -- cgit From cfbc3608a8c69b48bf238bd68f768192f0238e0d Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 19 Dec 2023 14:46:20 +0200 Subject: net/mlx5: Fix query of sd_group field The sd_group field moved in the HW spec from the MPIR register to the vport context. Align the query accordingly. Fixes: f5e956329960 ("net/mlx5: Expose Management PCIe Index Register (MPIR)") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 21 +++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 10 +++++++--- include/linux/mlx5/vport.h | 1 + 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 21753f327868..1005bb6935b6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -440,6 +440,27 @@ out: } EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_system_image_guid); +int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group) +{ + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + u32 *out; + int err; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_query_nic_vport_context(mdev, 0, out); + if (err) + goto out; + + *sd_group = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.sd_group); +out: + kvfree(out); + return err; +} + int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid) { u32 *out; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index bf5320b28b8b..37230253f9f1 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4036,8 +4036,13 @@ struct mlx5_ifc_nic_vport_context_bits { u8 affiliation_criteria[0x4]; u8 affiliated_vhca_id[0x10]; - u8 reserved_at_60[0xd0]; + u8 reserved_at_60[0xa0]; + u8 reserved_at_100[0x1]; + u8 sd_group[0x3]; + u8 reserved_at_104[0x1c]; + + u8 reserved_at_120[0x10]; u8 mtu[0x10]; u8 system_image_guid[0x40]; @@ -10122,8 +10127,7 @@ struct mlx5_ifc_mpir_reg_bits { u8 reserved_at_20[0x20]; u8 local_port[0x8]; - u8 reserved_at_28[0x15]; - u8 sd_group[0x3]; + u8 reserved_at_28[0x18]; u8 reserved_at_60[0x20]; }; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index fbb9bf447889..c36cc6d82926 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -72,6 +72,7 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu); int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu); int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, u64 *system_image_guid); +int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group); int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, u16 vport, u64 node_guid); -- cgit From 3876638b2c7ebb2c9d181de1191db0de8cac143a Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Wed, 22 Nov 2023 18:32:11 -0800 Subject: net/mlx5e: Fix operation precedence bug in port timestamping napi_poll context Indirection (*) is of lower precedence than postfix increment (++). Logic in napi_poll context would cause an out-of-bound read by first increment the pointer address by byte address space and then dereference the value. Rather, the intended logic was to dereference first and then increment the underlying value. Fixes: 92214be5979c ("net/mlx5e: Update doorbell for port timestamping CQ before the software counter") Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index c206cc0a8483..078f56a3cbb2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -213,7 +213,7 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, mlx5e_ptpsq_mark_ts_cqes_undelivered(ptpsq, hwtstamp); out: napi_consume_skb(skb, budget); - md_buff[*md_buff_sz++] = metadata_id; + md_buff[(*md_buff_sz)++] = metadata_id; if (unlikely(mlx5e_ptp_metadata_map_unhealthy(&ptpsq->metadata_map)) && !test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) queue_work(ptpsq->txqsq.priv->wq, &ptpsq->report_unhealthy_work); -- cgit From c20767fd45e82d64352db82d4fc8d281a43e4783 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 5 Nov 2023 17:09:46 +0200 Subject: net/mlx5e: Fix inconsistent hairpin RQT sizes The processing of traffic in hairpin queues occurs in HW/FW and does not involve the cpus, hence the upper bound on max num channels does not apply to them. Using this bound for the hairpin RQT max_table_size is wrong. It could be too small, and cause the error below [1]. As the RQT size provided on init does not get modified later, use the same value for both actual and max table sizes. [1] mlx5_core 0000:08:00.1: mlx5_cmd_out_err:805:(pid 1200): CREATE_RQT(0x916) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x538faf), err(-22) Fixes: 74a8dadac17e ("net/mlx5e: Preparations for supporting larger number of channels") Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 30932c9c9a8f..047b465fc6a5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -761,7 +761,7 @@ static int mlx5e_hairpin_create_indirect_rqt(struct mlx5e_hairpin *hp) err = mlx5e_rss_params_indir_init(&indir, mdev, mlx5e_rqt_size(mdev, hp->num_channels), - mlx5e_rqt_size(mdev, priv->max_nch)); + mlx5e_rqt_size(mdev, hp->num_channels)); if (err) return err; -- cgit From d76fdd31f953ac5046555171620f2562715e9b71 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Nov 2023 11:10:22 +0100 Subject: net/mlx5e: Fix peer flow lists handling The cited change refactored mlx5e_tc_del_fdb_peer_flow() to only clear DUP flag when list of peer flows has become empty. However, if any concurrent user holds a reference to a peer flow (for example, the neighbor update workqueue task is updating peer flow's parent encap entry concurrently), then the flow will not be removed from the peer list and, consecutively, DUP flag will remain set. Since mlx5e_tc_del_fdb_peers_flow() calls mlx5e_tc_del_fdb_peer_flow() for every possible peer index the algorithm will try to remove the flow from eswitch instances that it has never peered with causing either NULL pointer dereference when trying to remove the flow peer list head of peer_index that was never initialized or a warning if the list debug config is enabled[0]. Fix the issue by always removing the peer flow from the list even when not releasing the last reference to it. [0]: [ 3102.985806] ------------[ cut here ]------------ [ 3102.986223] list_del corruption, ffff888139110698->next is NULL [ 3102.986757] WARNING: CPU: 2 PID: 22109 at lib/list_debug.c:53 __list_del_entry_valid_or_report+0x4f/0xc0 [ 3102.987561] Modules linked in: act_ct nf_flow_table bonding act_tunnel_key act_mirred act_skbedit vxlan cls_matchall nfnetlink_cttimeout act_gact cls_flower sch_ingress mlx5_vdpa vringh vhost_iotlb vdpa openvswitch nsh xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcg ss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5_core [last unloaded: bonding] [ 3102.991113] CPU: 2 PID: 22109 Comm: revalidator28 Not tainted 6.6.0-rc6+ #3 [ 3102.991695] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 3102.992605] RIP: 0010:__list_del_entry_valid_or_report+0x4f/0xc0 [ 3102.993122] Code: 39 c2 74 56 48 8b 32 48 39 fe 75 62 48 8b 51 08 48 39 f2 75 73 b8 01 00 00 00 c3 48 89 fe 48 c7 c7 48 fd 0a 82 e8 41 0b ad ff <0f> 0b 31 c0 c3 48 89 fe 48 c7 c7 70 fd 0a 82 e8 2d 0b ad ff 0f 0b [ 3102.994615] RSP: 0018:ffff8881383e7710 EFLAGS: 00010286 [ 3102.995078] RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000000 [ 3102.995670] RDX: 0000000000000001 RSI: ffff88885f89b640 RDI: ffff88885f89b640 [ 3102.997188] DEL flow 00000000be367878 on port 0 [ 3102.998594] RBP: dead000000000122 R08: 0000000000000000 R09: c0000000ffffdfff [ 3102.999604] R10: 0000000000000008 R11: ffff8881383e7598 R12: dead000000000100 [ 3103.000198] R13: 0000000000000002 R14: ffff888139110000 R15: ffff888101901240 [ 3103.000790] FS: 00007f424cde4700(0000) GS:ffff88885f880000(0000) knlGS:0000000000000000 [ 3103.001486] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3103.001986] CR2: 00007fd42e8dcb70 CR3: 000000011e68a003 CR4: 0000000000370ea0 [ 3103.002596] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 3103.003190] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 3103.003787] Call Trace: [ 3103.004055] [ 3103.004297] ? __warn+0x7d/0x130 [ 3103.004623] ? __list_del_entry_valid_or_report+0x4f/0xc0 [ 3103.005094] ? report_bug+0xf1/0x1c0 [ 3103.005439] ? console_unlock+0x4a/0xd0 [ 3103.005806] ? handle_bug+0x3f/0x70 [ 3103.006149] ? exc_invalid_op+0x13/0x60 [ 3103.006531] ? asm_exc_invalid_op+0x16/0x20 [ 3103.007430] ? __list_del_entry_valid_or_report+0x4f/0xc0 [ 3103.007910] mlx5e_tc_del_fdb_peers_flow+0xcf/0x240 [mlx5_core] [ 3103.008463] mlx5e_tc_del_flow+0x46/0x270 [mlx5_core] [ 3103.008944] mlx5e_flow_put+0x26/0x50 [mlx5_core] [ 3103.009401] mlx5e_delete_flower+0x25f/0x380 [mlx5_core] [ 3103.009901] tc_setup_cb_destroy+0xab/0x180 [ 3103.010292] fl_hw_destroy_filter+0x99/0xc0 [cls_flower] [ 3103.010779] __fl_delete+0x2d4/0x2f0 [cls_flower] [ 3103.011207] fl_delete+0x36/0x80 [cls_flower] [ 3103.011614] tc_del_tfilter+0x56f/0x750 [ 3103.011982] rtnetlink_rcv_msg+0xff/0x3a0 [ 3103.012362] ? netlink_ack+0x1c7/0x4e0 [ 3103.012719] ? rtnl_calcit.isra.44+0x130/0x130 [ 3103.013134] netlink_rcv_skb+0x54/0x100 [ 3103.013533] netlink_unicast+0x1ca/0x2b0 [ 3103.013902] netlink_sendmsg+0x361/0x4d0 [ 3103.014269] __sock_sendmsg+0x38/0x60 [ 3103.014643] ____sys_sendmsg+0x1f2/0x200 [ 3103.015018] ? copy_msghdr_from_user+0x72/0xa0 [ 3103.015265] ___sys_sendmsg+0x87/0xd0 [ 3103.016608] ? copy_msghdr_from_user+0x72/0xa0 [ 3103.017014] ? ___sys_recvmsg+0x9b/0xd0 [ 3103.017381] ? ttwu_do_activate.isra.137+0x58/0x180 [ 3103.017821] ? wake_up_q+0x49/0x90 [ 3103.018157] ? futex_wake+0x137/0x160 [ 3103.018521] ? __sys_sendmsg+0x51/0x90 [ 3103.018882] __sys_sendmsg+0x51/0x90 [ 3103.019230] ? exit_to_user_mode_prepare+0x56/0x130 [ 3103.019670] do_syscall_64+0x3c/0x80 [ 3103.020017] entry_SYSCALL_64_after_hwframe+0x46/0xb0 [ 3103.020469] RIP: 0033:0x7f4254811ef4 [ 3103.020816] Code: 89 f3 48 83 ec 10 48 89 7c 24 08 48 89 14 24 e8 42 eb ff ff 48 8b 14 24 41 89 c0 48 89 de 48 8b 7c 24 08 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 30 44 89 c7 48 89 04 24 e8 78 eb ff ff 48 8b [ 3103.022290] RSP: 002b:00007f424cdd9480 EFLAGS: 00000293 ORIG_RAX: 000000000000002e [ 3103.022970] RAX: ffffffffffffffda RBX: 00007f424cdd9510 RCX: 00007f4254811ef4 [ 3103.023564] RDX: 0000000000000000 RSI: 00007f424cdd9510 RDI: 0000000000000012 [ 3103.024158] RBP: 00007f424cdda238 R08: 0000000000000000 R09: 00007f41d801a4b0 [ 3103.024748] R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000001 [ 3103.025341] R13: 00007f424cdd9510 R14: 00007f424cdda240 R15: 00007f424cdd99a0 [ 3103.025931] [ 3103.026182] ---[ end trace 0000000000000000 ]--- [ 3103.027033] ------------[ cut here ]------------ Fixes: 9be6c21fdcf8 ("net/mlx5e: Handle offloads flows per peer") Signed-off-by: Vlad Buslov Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 047b465fc6a5..9fb2c057bd78 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -2014,9 +2014,10 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow, list_for_each_entry_safe(peer_flow, tmp, &flow->peer_flows, peer_flows) { if (peer_index != mlx5_get_dev_index(peer_flow->priv->mdev)) continue; + + list_del(&peer_flow->peer_flows); if (refcount_dec_and_test(&peer_flow->refcnt)) { mlx5e_tc_del_fdb_flow(peer_flow->priv, peer_flow); - list_del(&peer_flow->peer_flows); kfree(peer_flow); } } -- cgit From cc8091587779cfaddb6b29c9e9edb9079a282cad Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 31 Dec 2023 15:19:50 +0200 Subject: net/mlx5: Fix a WARN upon a callback command failure The below WARN [1] is reported once a callback command failed. As a callback runs under an interrupt context, needs to use the IRQ save/restore variant. [1] DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context()) WARNING: CPU: 15 PID: 0 at kernel/locking/lockdep.c:4353 lockdep_hardirqs_on_prepare+0x11b/0x180 Modules linked in: vhost_net vhost tap mlx5_vfio_pci vfio_pci vfio_pci_core vfio_iommu_type1 vfio mlx5_vdpa vringh vhost_iotlb vdpa nfnetlink_cttimeout openvswitch nsh ip6table_mangle ip6table_nat ip6table_filter ip6_tables iptable_mangle xt_conntrackxt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_umad ib_ipoib ib_cm mlx5_ib ib_uverbs ib_core fuse mlx5_core CPU: 15 PID: 0 Comm: swapper/15 Tainted: G W 6.7.0-rc4+ #1587 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:lockdep_hardirqs_on_prepare+0x11b/0x180 Code: 00 5b c3 c3 e8 e6 0d 58 00 85 c0 74 d6 8b 15 f0 c3 76 01 85 d2 75 cc 48 c7 c6 04 a5 3b 82 48 c7 c7 f1 e9 39 82 e8 95 12 f9 ff <0f> 0b 5b c3 e8 bc 0d 58 00 85 c0 74 ac 8b 3d c6 c3 76 01 85 ff 75 RSP: 0018:ffffc900003ecd18 EFLAGS: 00010086 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000027 RDX: 0000000000000000 RSI: ffff88885fbdb880 RDI: ffff88885fbdb888 RBP: 00000000ffffff87 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000000 R11: 284e4f5f4e524157 R12: 00000000002c9aa1 R13: ffff88810aace980 R14: ffff88810aace9b8 R15: 0000000000000003 FS: 0000000000000000(0000) GS:ffff88885fbc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f731436f4c8 CR3: 000000010aae6001 CR4: 0000000000372eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? __warn+0x81/0x170 ? lockdep_hardirqs_on_prepare+0x11b/0x180 ? report_bug+0xf8/0x1c0 ? handle_bug+0x3f/0x70 ? exc_invalid_op+0x13/0x60 ? asm_exc_invalid_op+0x16/0x20 ? lockdep_hardirqs_on_prepare+0x11b/0x180 ? lockdep_hardirqs_on_prepare+0x11b/0x180 trace_hardirqs_on+0x4a/0xa0 raw_spin_unlock_irq+0x24/0x30 cmd_status_err+0xc0/0x1a0 [mlx5_core] cmd_status_err+0x1a0/0x1a0 [mlx5_core] mlx5_cmd_exec_cb_handler+0x24/0x40 [mlx5_core] mlx5_cmd_comp_handler+0x129/0x4b0 [mlx5_core] cmd_comp_notifier+0x1a/0x20 [mlx5_core] notifier_call_chain+0x3e/0xe0 atomic_notifier_call_chain+0x5f/0x130 mlx5_eq_async_int+0xe7/0x200 [mlx5_core] notifier_call_chain+0x3e/0xe0 atomic_notifier_call_chain+0x5f/0x130 irq_int_handler+0x11/0x20 [mlx5_core] __handle_irq_event_percpu+0x99/0x220 ? tick_irq_enter+0x5d/0x80 handle_irq_event_percpu+0xf/0x40 handle_irq_event+0x3a/0x60 handle_edge_irq+0xa2/0x1c0 __common_interrupt+0x55/0x140 common_interrupt+0x7d/0xa0 asm_common_interrupt+0x22/0x40 RIP: 0010:default_idle+0x13/0x20 Code: c0 08 00 00 00 4d 29 c8 4c 01 c7 4c 29 c2 e9 72 ff ff ff cc cc cc cc 8b 05 ea 08 25 01 85 c0 7e 07 0f 00 2d 7f b0 26 00 fb f4 c3 90 66 2e 0f 1f 84 00 00 00 00 00 65 48 8b 04 25 80 d0 02 00 RSP: 0018:ffffc9000010fec8 EFLAGS: 00000242 RAX: 0000000000000001 RBX: 000000000000000f RCX: 4000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff811c410c RBP: ffffffff829478c0 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 ? do_idle+0x1ec/0x210 default_idle_call+0x6c/0x90 do_idle+0x1ec/0x210 cpu_startup_entry+0x26/0x30 start_secondary+0x11b/0x150 secondary_startup_64_no_verify+0x165/0x16b irq event stamp: 833284 hardirqs last enabled at (833283): [] do_idle+0x1ec/0x210 hardirqs last disabled at (833284): [] common_interrupt+0xf/0xa0 softirqs last enabled at (833224): [] __do_softirq+0x2bf/0x40e softirqs last disabled at (833177): [] irq_exit_rcu+0x7f/0xa0 Fixes: 34f46ae0d4b3 ("net/mlx5: Add command failures data to debugfs") Signed-off-by: Yishai Hadas Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index a7b1f9686c09..4957412ff1f6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1923,6 +1923,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status, { const char *namep = mlx5_command_str(opcode); struct mlx5_cmd_stats *stats; + unsigned long flags; if (!err || !(strcmp(namep, "unknown command opcode"))) return; @@ -1930,7 +1931,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status, stats = xa_load(&dev->cmd.stats, opcode); if (!stats) return; - spin_lock_irq(&stats->lock); + spin_lock_irqsave(&stats->lock, flags); stats->failed++; if (err < 0) stats->last_failed_errno = -err; @@ -1939,7 +1940,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status, stats->last_failed_mbox_status = status; stats->last_failed_syndrome = syndrome; } - spin_unlock_irq(&stats->lock); + spin_unlock_irqrestore(&stats->lock, flags); } /* preserve -EREMOTEIO for outbox.status != OK, otherwise return err as is */ -- cgit From ec7cc38ef9f83553102e84c82536971a81630739 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sat, 30 Dec 2023 22:40:37 +0200 Subject: net/mlx5: Bridge, fix multicast packets sent to uplink To enable multicast packets which are offloaded in bridge multicast offload mode to be sent also to uplink, FTE bit uplink_hairpin_en should be set. Add this bit to FTE for the bridge multicast offload rules. Fixes: 18c2916cee12 ("net/mlx5: Bridge, snoop igmp/mld packets") Signed-off-by: Moshe Shemesh Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c | 3 +++ drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 2 ++ include/linux/mlx5/fs.h | 1 + include/linux/mlx5/mlx5_ifc.h | 2 +- 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c index a7ed87e9d842..22dd30cf8033 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c @@ -83,6 +83,7 @@ mlx5_esw_bridge_mdb_flow_create(u16 esw_owner_vhca_id, struct mlx5_esw_bridge_md i++; } + rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN; rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; dmac_v = MLX5_ADDR_OF(fte_match_param, rule_spec->match_value, outer_headers.dmac_47_16); ether_addr_copy(dmac_v, entry->key.addr); @@ -587,6 +588,7 @@ mlx5_esw_bridge_mcast_vlan_flow_create(u16 vlan_proto, struct mlx5_esw_bridge_po if (!rule_spec) return ERR_PTR(-ENOMEM); + rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN; rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; @@ -662,6 +664,7 @@ mlx5_esw_bridge_mcast_fwd_flow_create(struct mlx5_esw_bridge_port *port) dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID; dest.vport.vhca_id = port->esw_owner_vhca_id; } + rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN; handle = mlx5_add_flow_rules(port->mcast.ft, rule_spec, &flow_act, &dest, 1); kvfree(rule_spec); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 1616a6144f7b..9b8599c200e2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -566,6 +566,8 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, fte->flow_context.flow_tag); MLX5_SET(flow_context, in_flow_context, flow_source, fte->flow_context.flow_source); + MLX5_SET(flow_context, in_flow_context, uplink_hairpin_en, + !!(fte->flow_context.flags & FLOW_CONTEXT_UPLINK_HAIRPIN_EN)); MLX5_SET(flow_context, in_flow_context, extended_destination, extended_dest); diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 6f7725238abc..3fb428ce7d1c 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -132,6 +132,7 @@ struct mlx5_flow_handle; enum { FLOW_CONTEXT_HAS_TAG = BIT(0), + FLOW_CONTEXT_UPLINK_HAIRPIN_EN = BIT(1), }; struct mlx5_flow_context { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 37230253f9f1..c726f90ab752 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3576,7 +3576,7 @@ struct mlx5_ifc_flow_context_bits { u8 action[0x10]; u8 extended_destination[0x1]; - u8 reserved_at_81[0x1]; + u8 uplink_hairpin_en[0x1]; u8 flow_source[0x2]; u8 encrypt_decrypt_type[0x4]; u8 destination_list_size[0x18]; -- cgit From 5665954293f13642f9c052ead83c1e9d8cff186f Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Dec 2023 11:24:08 +0200 Subject: net/mlx5: DR, Use the right GVMI number for drop action When FW provides ICM addresses for drop RX/TX, the provided capability is 64 bits that contain its GVMI as well as the ICM address itself. In case of TX DROP this GVMI is different from the GVMI that the domain is operating on. This patch fixes the action to use these GVMI IDs, as provided by FW. Fixes: 9db810ed2d37 ("net/mlx5: DR, Expose steering action functionality") Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c index 6f9790e97fed..95517c4aca0f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -788,6 +788,7 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher, switch (action_type) { case DR_ACTION_TYP_DROP: attr.final_icm_addr = nic_dmn->drop_icm_addr; + attr.hit_gvmi = nic_dmn->drop_icm_addr >> 48; break; case DR_ACTION_TYP_FT: dest_action = action; -- cgit From 5b2a2523eeea5f03d39a9d1ff1bad2e9f8eb98d2 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Dec 2023 13:20:36 +0200 Subject: net/mlx5: DR, Can't go to uplink vport on RX rule Go-To-Vport action on RX is not allowed when the vport is uplink. In such case, the packet should be dropped. Fixes: 9db810ed2d37 ("net/mlx5: DR, Expose steering action functionality") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Erez Shitrit Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/steering/dr_action.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c index 95517c4aca0f..2ebb61ef3ea9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -874,11 +874,17 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher, action->sampler->tx_icm_addr; break; case DR_ACTION_TYP_VPORT: - attr.hit_gvmi = action->vport->caps->vhca_gvmi; - dest_action = action; - attr.final_icm_addr = rx_rule ? - action->vport->caps->icm_address_rx : - action->vport->caps->icm_address_tx; + if (unlikely(rx_rule && action->vport->caps->num == MLX5_VPORT_UPLINK)) { + /* can't go to uplink on RX rule - dropping instead */ + attr.final_icm_addr = nic_dmn->drop_icm_addr; + attr.hit_gvmi = nic_dmn->drop_icm_addr >> 48; + } else { + attr.hit_gvmi = action->vport->caps->vhca_gvmi; + dest_action = action; + attr.final_icm_addr = rx_rule ? + action->vport->caps->icm_address_rx : + action->vport->caps->icm_address_tx; + } break; case DR_ACTION_TYP_POP_VLAN: if (!rx_rule && !(dmn->ste_ctx->actions_caps & -- cgit From 20cbf8cbb827094197f3b17db60d71449415db1e Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 28 Nov 2023 14:01:54 -0800 Subject: net/mlx5: Use mlx5 device constant for selecting CQ period mode for ASO mlx5 devices have specific constants for choosing the CQ period mode. These constants do not have to match the constants used by the kernel software API for DIM period mode selection. Fixes: cdd04f4d4d71 ("net/mlx5: Add support to create SQ and CQ for ASO") Signed-off-by: Rahul Rameshbabu Reviewed-by: Jianbo Liu Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c index 40c7be124041..58bd749b5e4d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c @@ -98,7 +98,7 @@ static int create_aso_cq(struct mlx5_aso_cq *cq, void *cqc_data) mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas)); - MLX5_SET(cqc, cqc, cq_period_mode, DIM_CQ_PERIOD_MODE_START_FROM_EQE); + MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - -- cgit From 20f5468a7988dedd94a57ba8acd65ebda6a59723 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 12 Dec 2023 13:52:55 +0200 Subject: net/mlx5e: Allow software parsing when IPsec crypto is enabled All ConnectX devices have software parsing capability enabled, but it is more correct to set allow_swp only if capability exists, which for IPsec means that crypto offload is supported. Fixes: 2451da081a34 ("net/mlx5: Unify device IPsec capabilities check") Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/params.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index 284253b79266..5d213a9886f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -1064,8 +1064,8 @@ void mlx5e_build_sq_param(struct mlx5_core_dev *mdev, void *wq = MLX5_ADDR_OF(sqc, sqc, wq); bool allow_swp; - allow_swp = - mlx5_geneve_tx_allowed(mdev) || !!mlx5_ipsec_device_caps(mdev); + allow_swp = mlx5_geneve_tx_allowed(mdev) || + (mlx5_ipsec_device_caps(mdev) & MLX5_IPSEC_CAP_CRYPTO); mlx5e_build_sq_param_common(mdev, param); MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size); MLX5_SET(sqc, sqc, allow_swp, allow_swp); -- cgit From 315a597f9bcfe7fe9980985031413457bee95510 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 26 Nov 2023 11:08:10 +0200 Subject: net/mlx5e: Ignore IPsec replay window values on sender side XFRM stack doesn't prevent from users to configure replay window in TX side and strongswan sets replay_window to be 1. It causes to failures in validation logic when trying to offload the SA. Replay window is not relevant in TX side and should be ignored. Fixes: cded6d80129b ("net/mlx5e: Store replay window in XFRM attributes") Signed-off-by: Aya Levin Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index 161c5190c236..05612d9c6080 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -336,12 +336,17 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry, /* iv len */ aes_gcm->icv_len = x->aead->alg_icv_len; + attrs->dir = x->xso.dir; + /* esn */ if (x->props.flags & XFRM_STATE_ESN) { attrs->replay_esn.trigger = true; attrs->replay_esn.esn = sa_entry->esn_state.esn; attrs->replay_esn.esn_msb = sa_entry->esn_state.esn_msb; attrs->replay_esn.overlap = sa_entry->esn_state.overlap; + if (attrs->dir == XFRM_DEV_OFFLOAD_OUT) + goto skip_replay_window; + switch (x->replay_esn->replay_window) { case 32: attrs->replay_esn.replay_window = @@ -365,7 +370,7 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry, } } - attrs->dir = x->xso.dir; +skip_replay_window: /* spi */ attrs->spi = be32_to_cpu(x->id.spi); @@ -501,7 +506,8 @@ static int mlx5e_xfrm_validate_state(struct mlx5_core_dev *mdev, return -EINVAL; } - if (x->replay_esn && x->replay_esn->replay_window != 32 && + if (x->replay_esn && x->xso.dir == XFRM_DEV_OFFLOAD_IN && + x->replay_esn->replay_window != 32 && x->replay_esn->replay_window != 64 && x->replay_esn->replay_window != 128 && x->replay_esn->replay_window != 256) { -- cgit From 3c6d5189246f590e4e1f167991558bdb72a4738b Mon Sep 17 00:00:00 2001 From: Zhipeng Lu Date: Wed, 17 Jan 2024 15:17:36 +0800 Subject: net/mlx5e: fix a double-free in arfs_create_groups When `in` allocated by kvzalloc fails, arfs_create_groups will free ft->g and return an error. However, arfs_create_table, the only caller of arfs_create_groups, will hold this error and call to mlx5e_destroy_flow_table, in which the ft->g will be freed again. Fixes: 1cabe6b0965e ("net/mlx5e: Create aRFS flow tables") Signed-off-by: Zhipeng Lu Reviewed-by: Simon Horman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 26 +++++++++++++---------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c index bb7f86c993e5..e66f486faafe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c @@ -254,11 +254,13 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft, ft->g = kcalloc(MLX5E_ARFS_NUM_GROUPS, sizeof(*ft->g), GFP_KERNEL); - in = kvzalloc(inlen, GFP_KERNEL); - if (!in || !ft->g) { - kfree(ft->g); - kvfree(in); + if (!ft->g) return -ENOMEM; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free_g; } mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); @@ -278,7 +280,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft, break; default: err = -EINVAL; - goto out; + goto err_free_in; } switch (type) { @@ -300,7 +302,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft, break; default: err = -EINVAL; - goto out; + goto err_free_in; } MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); @@ -309,7 +311,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft, MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) - goto err; + goto err_clean_group; ft->num_groups++; memset(in, 0, inlen); @@ -318,18 +320,20 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft, MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) - goto err; + goto err_clean_group; ft->num_groups++; kvfree(in); return 0; -err: +err_clean_group: err = PTR_ERR(ft->g[ft->num_groups]); ft->g[ft->num_groups] = NULL; -out: +err_free_in: kvfree(in); - +err_free_g: + kfree(ft->g); + ft->g = NULL; return err; } -- cgit From aef855df7e1bbd5aa4484851561211500b22707e Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Tue, 28 Nov 2023 17:29:01 +0800 Subject: net/mlx5e: fix a potential double-free in fs_any_create_groups When kcalloc() for ft->g succeeds but kvzalloc() for in fails, fs_any_create_groups() will free ft->g. However, its caller fs_any_create_table() will free ft->g again through calling mlx5e_destroy_flow_table(), which will lead to a double-free. Fix this by setting ft->g to NULL in fs_any_create_groups(). Fixes: 0f575c20bf06 ("net/mlx5e: Introduce Flow Steering ANY API") Signed-off-by: Dinghao Liu Reviewed-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c b/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c index e1283531e0b8..671adbad0a40 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c @@ -436,6 +436,7 @@ static int fs_any_create_groups(struct mlx5e_flow_table *ft) in = kvzalloc(inlen, GFP_KERNEL); if (!in || !ft->g) { kfree(ft->g); + ft->g = NULL; kvfree(in); return -ENOMEM; } -- cgit From b253d87fd78bf8d3e7efc5d149147765f044e89d Mon Sep 17 00:00:00 2001 From: George Guo Date: Tue, 26 Dec 2023 17:42:42 +0800 Subject: netfilter: nf_tables: cleanup documentation - Correct comments for nlpid, family, udlen and udata in struct nft_table, and afinfo is no longer a member of enum nft_set_class. - Add comment for data in struct nft_set_elem. - Add comment for flags in struct nft_ctx. - Add comments for timeout in struct nft_set_iter, and flags is not a member of struct nft_set_iter, remove the comment for it. - Add comments for commit, abort, estimate and gc_init in struct nft_set_ops. - Add comments for pending_update, num_exprs, exprs and catchall_list in struct nft_set. - Add comment for ext_len in struct nft_set_ext_tmpl. - Add comment for inner_ops in struct nft_expr_type. - Add comments for clone, destroy_clone, reduce, gc, offload, offload_action, offload_stats in struct nft_expr_ops. - Add comments for blob_gen_0, blob_gen_1, bound, genmask, udlen, udata, blob_next in struct nft_chain. - Add comment for flags in struct nft_base_chain. - Add comments for udlen, udata in struct nft_object. - Add comment for type in struct nft_object_ops. - Add comment for hook_list in struct nft_flowtable, and remove comments for dev_name and ops which are not members of struct nft_flowtable. Signed-off-by: George Guo Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 49 +++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index b157c5cafd14..4e1ea18eb5f0 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -205,6 +205,7 @@ static inline void nft_data_copy(u32 *dst, const struct nft_data *src, * @nla: netlink attributes * @portid: netlink portID of the original message * @seq: netlink sequence number + * @flags: modifiers to new request * @family: protocol family * @level: depth of the chains * @report: notify via unicast netlink message @@ -282,6 +283,7 @@ struct nft_elem_priv { }; * * @key: element key * @key_end: closing element key + * @data: element data * @priv: element private data and extensions */ struct nft_set_elem { @@ -325,10 +327,10 @@ struct nft_set_iter { * @dtype: data type * @dlen: data length * @objtype: object type - * @flags: flags * @size: number of set elements * @policy: set policy * @gc_int: garbage collector interval + * @timeout: element timeout * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @expr: set must support for expressions @@ -351,9 +353,9 @@ struct nft_set_desc { /** * enum nft_set_class - performance class * - * @NFT_LOOKUP_O_1: constant, O(1) - * @NFT_LOOKUP_O_LOG_N: logarithmic, O(log N) - * @NFT_LOOKUP_O_N: linear, O(N) + * @NFT_SET_CLASS_O_1: constant, O(1) + * @NFT_SET_CLASS_O_LOG_N: logarithmic, O(log N) + * @NFT_SET_CLASS_O_N: linear, O(N) */ enum nft_set_class { NFT_SET_CLASS_O_1, @@ -422,9 +424,13 @@ struct nft_set_ext; * @remove: remove element from set * @walk: iterate over all set elements * @get: get set elements + * @commit: commit set elements + * @abort: abort set elements * @privsize: function to return size of set private data + * @estimate: estimate the required memory size and the lookup complexity class * @init: initialize private data of new set instance * @destroy: destroy private data of set instance + * @gc_init: initialize garbage collection * @elemsize: element private size * * Operations lookup, update and delete have simpler interfaces, are faster @@ -540,13 +546,16 @@ struct nft_set_elem_expr { * @policy: set parameterization (see enum nft_set_policies) * @udlen: user data length * @udata: user data - * @expr: stateful expression + * @pending_update: list of pending update set element * @ops: set ops * @flags: set flags * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length + * @num_exprs: numbers of exprs + * @exprs: stateful expression + * @catchall_list: list of catch-all set element * @data: private set data */ struct nft_set { @@ -692,6 +701,7 @@ extern const struct nft_set_ext_type nft_set_ext_types[]; * * @len: length of extension area * @offset: offsets of individual extension types + * @ext_len: length of the expected extension(used to sanity check) */ struct nft_set_ext_tmpl { u16 len; @@ -840,6 +850,7 @@ struct nft_expr_ops; * @select_ops: function to select nft_expr_ops * @release_ops: release nft_expr_ops * @ops: default ops, used when no select_ops functions is present + * @inner_ops: inner ops, used for inner packet operation * @list: used internally * @name: Identifier * @owner: module reference @@ -881,14 +892,22 @@ struct nft_offload_ctx; * struct nft_expr_ops - nf_tables expression operations * * @eval: Expression evaluation function + * @clone: Expression clone function * @size: full expression size, including private data size * @init: initialization function * @activate: activate expression in the next generation * @deactivate: deactivate expression in next generation * @destroy: destruction function, called after synchronize_rcu + * @destroy_clone: destruction clone function * @dump: function to dump parameters - * @type: expression type * @validate: validate expression, called during loop detection + * @reduce: reduce expression + * @gc: garbage collection expression + * @offload: hardware offload expression + * @offload_action: function to report true/false to allocate one slot or not in the flow + * offload array + * @offload_stats: function to synchronize hardware stats via updating the counter expression + * @type: expression type * @data: extra data to attach to this expression operation */ struct nft_expr_ops { @@ -1041,14 +1060,21 @@ struct nft_rule_blob { /** * struct nft_chain - nf_tables chain * + * @blob_gen_0: rule blob pointer to the current generation + * @blob_gen_1: rule blob pointer to the future generation * @rules: list of rules in the chain * @list: used internally * @rhlhead: used internally * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain - * @flags: bitmask of enum nft_chain_flags + * @flags: bitmask of enum NFTA_CHAIN_FLAGS + * @bound: bind or not + * @genmask: generation mask * @name: name of the chain + * @udlen: user data length + * @udata: user data in the chain + * @blob_next: rule blob pointer to the next in the chain */ struct nft_chain { struct nft_rule_blob __rcu *blob_gen_0; @@ -1146,6 +1172,7 @@ struct nft_hook { * @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family) * @type: chain type * @policy: default policy + * @flags: indicate the base chain disabled or not * @stats: per-cpu chain stats * @chain: the chain * @flow_block: flow block (for hardware offload) @@ -1274,11 +1301,13 @@ struct nft_object_hash_key { * struct nft_object - nf_tables stateful object * * @list: table stateful object list node - * @key: keys that identify this object * @rhlhead: nft_objname_ht node + * @key: keys that identify this object * @genmask: generation mask * @use: number of references to this stateful object * @handle: unique object handle + * @udlen: length of user data + * @udata: user data * @ops: object operations * @data: object data, layout depends on type */ @@ -1344,6 +1373,7 @@ struct nft_object_type { * @destroy: release existing stateful object * @dump: netlink dump stateful object * @update: update stateful object + * @type: pointer to object type */ struct nft_object_ops { void (*eval)(struct nft_object *obj, @@ -1379,9 +1409,8 @@ void nft_unregister_obj(struct nft_object_type *obj_type); * @genmask: generation mask * @use: number of references to this flow table * @handle: unique object handle - * @dev_name: array of device names + * @hook_list: hook list for hooks per net_device in flowtables * @data: rhashtable and garbage collector - * @ops: array of hooks */ struct nft_flowtable { struct list_head list; -- cgit From 01acb2e8666a6529697141a6017edbf206921913 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 18 Jan 2024 10:56:26 +0100 Subject: netfilter: nft_chain_filter: handle NETDEV_UNREGISTER for inet/ingress basechain Remove netdevice from inet/ingress basechain in case NETDEV_UNREGISTER event is reported, otherwise a stale reference to netdevice remains in the hook list. Fixes: 60a3815da702 ("netfilter: add inet ingress support") Cc: stable@vger.kernel.org Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_chain_filter.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 680fe557686e..274b6f7e6bb5 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -357,9 +357,10 @@ static int nf_tables_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nft_base_chain *basechain; struct nftables_pernet *nft_net; - struct nft_table *table; struct nft_chain *chain, *nr; + struct nft_table *table; struct nft_ctx ctx = { .net = dev_net(dev), }; @@ -371,7 +372,8 @@ static int nf_tables_netdev_event(struct notifier_block *this, nft_net = nft_pernet(ctx.net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { - if (table->family != NFPROTO_NETDEV) + if (table->family != NFPROTO_NETDEV && + table->family != NFPROTO_INET) continue; ctx.family = table->family; @@ -380,6 +382,11 @@ static int nf_tables_netdev_event(struct notifier_block *this, if (!nft_is_base_chain(chain)) continue; + basechain = nft_base_chain(chain); + if (table->family == NFPROTO_INET && + basechain->ops.hooknum != NF_INET_INGRESS) + continue; + ctx.chain = chain; nft_netdev_event(event, dev, &ctx); } -- cgit From c9d9eb9c53d37cdebbad56b91e40baf42d5a97aa Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 19 Jan 2024 13:11:32 +0100 Subject: netfilter: nft_limit: reject configurations that cause integer overflow Reject bogus configs where internal token counter wraps around. This only occurs with very very large requests, such as 17gbyte/s. Its better to reject this rather than having incorrect ratelimit. Fixes: d2168e849ebf ("netfilter: nft_limit: add per-byte limiting") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_limit.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 79039afde34e..cefa25e0dbb0 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -58,17 +58,19 @@ static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost) static int nft_limit_init(struct nft_limit_priv *priv, const struct nlattr * const tb[], bool pkts) { + u64 unit, tokens, rate_with_burst; bool invert = false; - u64 unit, tokens; if (tb[NFTA_LIMIT_RATE] == NULL || tb[NFTA_LIMIT_UNIT] == NULL) return -EINVAL; priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); + if (priv->rate == 0) + return -EINVAL; + unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); - priv->nsecs = unit * NSEC_PER_SEC; - if (priv->rate == 0 || priv->nsecs < unit) + if (check_mul_overflow(unit, NSEC_PER_SEC, &priv->nsecs)) return -EOVERFLOW; if (tb[NFTA_LIMIT_BURST]) @@ -77,18 +79,25 @@ static int nft_limit_init(struct nft_limit_priv *priv, if (pkts && priv->burst == 0) priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT; - if (priv->rate + priv->burst < priv->rate) + if (check_add_overflow(priv->rate, priv->burst, &rate_with_burst)) return -EOVERFLOW; if (pkts) { - tokens = div64_u64(priv->nsecs, priv->rate) * priv->burst; + u64 tmp = div64_u64(priv->nsecs, priv->rate); + + if (check_mul_overflow(tmp, priv->burst, &tokens)) + return -EOVERFLOW; } else { + u64 tmp; + /* The token bucket size limits the number of tokens can be * accumulated. tokens_max specifies the bucket size. * tokens_max = unit * (rate + burst) / rate. */ - tokens = div64_u64(priv->nsecs * (priv->rate + priv->burst), - priv->rate); + if (check_mul_overflow(priv->nsecs, rate_with_burst, &tmp)) + return -EOVERFLOW; + + tokens = div64_u64(tmp, priv->rate); } if (tb[NFTA_LIMIT_FLAGS]) { -- cgit From b462579b2b86a8f5230543cadd3a4836be27baf7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 19 Jan 2024 13:34:32 +0100 Subject: netfilter: nf_tables: restrict anonymous set and map names to 16 bytes nftables has two types of sets/maps, one where userspace defines the name, and anonymous sets/maps, where userspace defines a template name. For the latter, kernel requires presence of exactly one "%d". nftables uses "__set%d" and "__map%d" for this. The kernel will expand the format specifier and replaces it with the smallest unused number. As-is, userspace could define a template name that allows to move the set name past the 256 bytes upperlimit (post-expansion). I don't see how this could be a problem, but I would prefer if userspace cannot do this, so add a limit of 16 bytes for the '%d' template name. 16 bytes is the old total upper limit for set names that existed when nf_tables was merged initially. Fixes: 387454901bd6 ("netfilter: nf_tables: Allow set names of up to 255 chars") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4b55533ce5ca..02f45424644b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -24,6 +24,7 @@ #include #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) +#define NFT_SET_MAX_ANONLEN 16 unsigned int nf_tables_net_id __read_mostly; @@ -4413,6 +4414,9 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, if (p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; + if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN) + return -EINVAL; + inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (inuse == NULL) return -ENOMEM; -- cgit From f342de4e2f33e0e39165d8639387aa6c19dff660 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 20 Jan 2024 22:50:04 +0100 Subject: netfilter: nf_tables: reject QUEUE/DROP verdict parameters This reverts commit e0abdadcc6e1. core.c:nf_hook_slow assumes that the upper 16 bits of NF_DROP verdicts contain a valid errno, i.e. -EPERM, -EHOSTUNREACH or similar, or 0. Due to the reverted commit, its possible to provide a positive value, e.g. NF_ACCEPT (1), which results in use-after-free. Its not clear to me why this commit was made. NF_QUEUE is not used by nftables; "queue" rules in nftables will result in use of "nft_queue" expression. If we later need to allow specifiying errno values from userspace (do not know why), this has to call NF_DROP_GETERR and check that "err <= 0" holds true. Fixes: e0abdadcc6e1 ("netfilter: nf_tables: accept QUEUE/DROP verdict parameters") Cc: stable@vger.kernel.org Reported-by: Notselwyn Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 02f45424644b..c537104411e7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10992,16 +10992,10 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, data->verdict.code = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE])); switch (data->verdict.code) { - default: - switch (data->verdict.code & NF_VERDICT_MASK) { - case NF_ACCEPT: - case NF_DROP: - case NF_QUEUE: - break; - default: - return -EINVAL; - } - fallthrough; + case NF_ACCEPT: + case NF_DROP: + case NF_QUEUE: + break; case NFT_CONTINUE: case NFT_BREAK: case NFT_RETURN: @@ -11036,6 +11030,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, data->verdict.chain = chain; break; + default: + return -EINVAL; } desc->len = sizeof(data->verdict); -- cgit From d0009effa8862c20a13af4cb7475d9771b905693 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 23 Jan 2024 16:38:25 +0100 Subject: netfilter: nf_tables: validate NFPROTO_* family Several expressions explicitly refer to NF_INET_* hook definitions from expr->ops->validate, however, family is not validated. Bail out with EOPNOTSUPP in case they are used from unsupported families. Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables") Fixes: a3c90f7a2323 ("netfilter: nf_tables: flow offload expression") Fixes: 2fa841938c64 ("netfilter: nf_tables: introduce routing expression") Fixes: 554ced0a6e29 ("netfilter: nf_tables: add support for native socket matching") Fixes: ad49d86e07a4 ("netfilter: nf_tables: Add synproxy support") Fixes: 4ed8eb6570a4 ("netfilter: nf_tables: Add native tproxy support") Fixes: 6c47260250fc ("netfilter: nf_tables: add xfrm expression") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 12 ++++++++++++ net/netfilter/nft_flow_offload.c | 5 +++++ net/netfilter/nft_nat.c | 5 +++++ net/netfilter/nft_rt.c | 5 +++++ net/netfilter/nft_socket.c | 5 +++++ net/netfilter/nft_synproxy.c | 7 +++++-- net/netfilter/nft_tproxy.c | 5 +++++ net/netfilter/nft_xfrm.c | 5 +++++ 8 files changed, 47 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 5284cd2ad532..f0eeda97bfcd 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -350,6 +350,12 @@ static int nft_target_validate(const struct nft_ctx *ctx, unsigned int hook_mask = 0; int ret; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_BRIDGE && + ctx->family != NFPROTO_ARP) + return -EOPNOTSUPP; + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); @@ -595,6 +601,12 @@ static int nft_match_validate(const struct nft_ctx *ctx, unsigned int hook_mask = 0; int ret; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_BRIDGE && + ctx->family != NFPROTO_ARP) + return -EOPNOTSUPP; + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index ab3362c483b4..397351fa4d5f 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -384,6 +384,11 @@ static int nft_flow_offload_validate(const struct nft_ctx *ctx, { unsigned int hook_mask = (1 << NF_INET_FORWARD); + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + return nft_chain_validate_hooks(ctx->chain, hook_mask); } diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 583885ce7232..808f5802c270 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -143,6 +143,11 @@ static int nft_nat_validate(const struct nft_ctx *ctx, struct nft_nat *priv = nft_expr_priv(expr); int err; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); if (err < 0) return err; diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 35a2c28caa60..24d977138572 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -166,6 +166,11 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp const struct nft_rt *priv = nft_expr_priv(expr); unsigned int hooks; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + switch (priv->key) { case NFT_RT_NEXTHOP4: case NFT_RT_NEXTHOP6: diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 9ed85be79452..f30163e2ca62 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -242,6 +242,11 @@ static int nft_socket_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 13da882669a4..1d737f89dfc1 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -186,7 +186,6 @@ static int nft_synproxy_do_init(const struct nft_ctx *ctx, break; #endif case NFPROTO_INET: - case NFPROTO_BRIDGE: err = nf_synproxy_ipv4_init(snet, ctx->net); if (err) goto nf_ct_failure; @@ -219,7 +218,6 @@ static void nft_synproxy_do_destroy(const struct nft_ctx *ctx) break; #endif case NFPROTO_INET: - case NFPROTO_BRIDGE: nf_synproxy_ipv4_fini(snet, ctx->net); nf_synproxy_ipv6_fini(snet, ctx->net); break; @@ -253,6 +251,11 @@ static int nft_synproxy_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD)); } diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index ae15cd693f0e..71412adb73d4 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -316,6 +316,11 @@ static int nft_tproxy_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING); } diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index 452f8587adda..1c866757db55 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -235,6 +235,11 @@ static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *e const struct nft_xfrm *priv = nft_expr_priv(expr); unsigned int hooks; + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + switch (priv->dir) { case XFRM_POLICY_IN: hooks = (1 << NF_INET_FORWARD) | -- cgit From a5f5eee282a0aae80227697e1d9c811b1726d31d Mon Sep 17 00:00:00 2001 From: Bernd Edlinger Date: Mon, 22 Jan 2024 19:19:09 +0100 Subject: net: stmmac: Wait a bit for the reset to take effect otherwise the synopsys_id value may be read out wrong, because the GMAC_VERSION register might still be in reset state, for at least 1 us after the reset is de-asserted. Add a wait for 10 us before continuing to be on the safe side. > From what have you got that delay value? Just try and error, with very old linux versions and old gcc versions the synopsys_id was read out correctly most of the time (but not always), with recent linux versions and recnet gcc versions it was read out wrongly most of the time, but again not always. I don't have access to the VHDL code in question, so I cannot tell why it takes so long to get the correct values, I also do not have more than a few hardware samples, so I cannot tell how long this timeout must be in worst case. Experimentally I can tell that the register is read several times as zero immediately after the reset is de-asserted, also adding several no-ops is not enough, adding a printk is enough, also udelay(1) seems to be enough but I tried that not very often, and I have not access to many hardware samples to be 100% sure about the necessary delay. And since the udelay here is only executed once per device instance, it seems acceptable to delay the boot for 10 us. BTW: my hardware's synopsys id is 0x37. Fixes: c5e4ddbdfa11 ("net: stmmac: Add support for optional reset control") Signed-off-by: Bernd Edlinger Reviewed-by: Jiri Pirko Reviewed-by: Serge Semin Link: https://lore.kernel.org/r/AS8P193MB1285A810BD78C111E7F6AA34E4752@AS8P193MB1285.EURP193.PROD.OUTLOOK.COM Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index a0e46369ae15..b334eb16da23 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7542,6 +7542,9 @@ int stmmac_dvr_probe(struct device *device, dev_err(priv->device, "unable to bring out of ahb reset: %pe\n", ERR_PTR(ret)); + /* Wait a bit for the reset to take effect */ + udelay(10); + /* Init MAC and get the capabilities */ ret = stmmac_hw_init(priv); if (ret) -- cgit From 9f538b415db862e74b8c5d3abbccfc1b2b6caa38 Mon Sep 17 00:00:00 2001 From: Jenishkumar Maheshbhai Patel Date: Thu, 18 Jan 2024 19:59:14 -0800 Subject: net: mvpp2: clear BM pool before initialization Register value persist after booting the kernel using kexec which results in kernel panic. Thus clear the BM pool registers before initialisation to fix the issue. Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") Signed-off-by: Jenishkumar Maheshbhai Patel Reviewed-by: Maxime Chevallier Link: https://lore.kernel.org/r/20240119035914.2595665-1-jpatel2@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 27 ++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 820b1fabe297..23adf53c2aa1 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -614,12 +614,38 @@ static void mvpp23_bm_set_8pool_mode(struct mvpp2 *priv) mvpp2_write(priv, MVPP22_BM_POOL_BASE_ADDR_HIGH_REG, val); } +/* Cleanup pool before actual initialization in the OS */ +static void mvpp2_bm_pool_cleanup(struct mvpp2 *priv, int pool_id) +{ + unsigned int thread = mvpp2_cpu_to_thread(priv, get_cpu()); + u32 val; + int i; + + /* Drain the BM from all possible residues left by firmware */ + for (i = 0; i < MVPP2_BM_POOL_SIZE_MAX; i++) + mvpp2_thread_read(priv, thread, MVPP2_BM_PHY_ALLOC_REG(pool_id)); + + put_cpu(); + + /* Stop the BM pool */ + val = mvpp2_read(priv, MVPP2_BM_POOL_CTRL_REG(pool_id)); + val |= MVPP2_BM_STOP_MASK; + mvpp2_write(priv, MVPP2_BM_POOL_CTRL_REG(pool_id), val); +} + static int mvpp2_bm_init(struct device *dev, struct mvpp2 *priv) { enum dma_data_direction dma_dir = DMA_FROM_DEVICE; int i, err, poolnum = MVPP2_BM_POOLS_NUM; struct mvpp2_port *port; + if (priv->percpu_pools) + poolnum = mvpp2_get_nrxqs(priv) * 2; + + /* Clean up the pool state in case it contains stale state */ + for (i = 0; i < poolnum; i++) + mvpp2_bm_pool_cleanup(priv, i); + if (priv->percpu_pools) { for (i = 0; i < priv->port_count; i++) { port = priv->port_list[i]; @@ -629,7 +655,6 @@ static int mvpp2_bm_init(struct device *dev, struct mvpp2 *priv) } } - poolnum = mvpp2_get_nrxqs(priv) * 2; for (i = 0; i < poolnum; i++) { /* the pool in use */ int pn = i / (poolnum / 2); -- cgit From 0719b5338a0cbe80d1637a5fb03d8141b5bfc7a1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 22 Jan 2024 11:58:15 -0800 Subject: selftests: net: fix rps_default_mask with >32 CPUs If there is more than 32 cpus the bitmask will start to contain commas, leading to: ./rps_default_mask.sh: line 36: [: 00000000,00000000: integer expression expected Remove the commas, bash doesn't interpret leading zeroes as oct so that should be good enough. Switch to bash, Simon reports that not all shells support this type of substitution. Fixes: c12e0d5f267d ("self-tests: introduce self-tests for RPS default mask") Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240122195815.638997-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/rps_default_mask.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/rps_default_mask.sh b/tools/testing/selftests/net/rps_default_mask.sh index a26c5624429f..4287a8529890 100755 --- a/tools/testing/selftests/net/rps_default_mask.sh +++ b/tools/testing/selftests/net/rps_default_mask.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 readonly ksft_skip=4 @@ -33,6 +33,10 @@ chk_rps() { rps_mask=$($cmd /sys/class/net/$dev_name/queues/rx-0/rps_cpus) printf "%-60s" "$msg" + + # In case there is more than 32 CPUs we need to remove commas from masks + rps_mask=${rps_mask//,} + expected_rps_mask=${expected_rps_mask//,} if [ $rps_mask -eq $expected_rps_mask ]; then echo "[ ok ]" else -- cgit From 0879020a7817e7ce636372c016b4528f541c9f4d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 22 Jan 2024 22:05:29 -0800 Subject: selftests: netdevsim: fix the udp_tunnel_nic test This test is missing a whole bunch of checks for interface renaming and one ifup. Presumably it was only used on a system with renaming disabled and NetworkManager running. Fixes: 91f430b2c49d ("selftests: net: add a test for UDP tunnel info infra") Acked-by: Paolo Abeni Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240123060529.1033912-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh index 4855ef597a15..f98435c502f6 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh @@ -270,6 +270,7 @@ for port in 0 1; do echo 1 > $NSIM_DEV_SYS/new_port fi NSIM_NETDEV=`get_netdev_name old_netdevs` + ifconfig $NSIM_NETDEV up msg="new NIC device created" exp0=( 0 0 0 0 ) @@ -431,6 +432,7 @@ for port in 0 1; do fi echo $port > $NSIM_DEV_SYS/new_port + NSIM_NETDEV=`get_netdev_name old_netdevs` ifconfig $NSIM_NETDEV up overflow_table0 "overflow NIC table" @@ -488,6 +490,7 @@ for port in 0 1; do fi echo $port > $NSIM_DEV_SYS/new_port + NSIM_NETDEV=`get_netdev_name old_netdevs` ifconfig $NSIM_NETDEV up overflow_table0 "overflow NIC table" @@ -544,6 +547,7 @@ for port in 0 1; do fi echo $port > $NSIM_DEV_SYS/new_port + NSIM_NETDEV=`get_netdev_name old_netdevs` ifconfig $NSIM_NETDEV up overflow_table0 "destroy NIC" @@ -573,6 +577,7 @@ for port in 0 1; do fi echo $port > $NSIM_DEV_SYS/new_port + NSIM_NETDEV=`get_netdev_name old_netdevs` ifconfig $NSIM_NETDEV up msg="create VxLANs v6" @@ -633,6 +638,7 @@ for port in 0 1; do fi echo $port > $NSIM_DEV_SYS/new_port + NSIM_NETDEV=`get_netdev_name old_netdevs` ifconfig $NSIM_NETDEV up echo 110 > $NSIM_DEV_DFS/ports/$port/udp_ports_inject_error @@ -688,6 +694,7 @@ for port in 0 1; do fi echo $port > $NSIM_DEV_SYS/new_port + NSIM_NETDEV=`get_netdev_name old_netdevs` ifconfig $NSIM_NETDEV up msg="create VxLANs v6" @@ -747,6 +754,7 @@ for port in 0 1; do fi echo $port > $NSIM_DEV_SYS/new_port + NSIM_NETDEV=`get_netdev_name old_netdevs` ifconfig $NSIM_NETDEV up msg="create VxLANs v6" @@ -877,6 +885,7 @@ msg="re-add a port" echo 2 > $NSIM_DEV_SYS/del_port echo 2 > $NSIM_DEV_SYS/new_port +NSIM_NETDEV=`get_netdev_name old_netdevs` check_tables msg="replace VxLAN in overflow table" -- cgit From f5e414167be768b0373891d301478351f757ec65 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 23 Jan 2024 11:03:22 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for 8390 W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to all the good old 8390 modules and drivers. Signed-off-by: Breno Leitao CC: geert@linux-m68k.org Link: https://lore.kernel.org/r/20240123190332.677489-2-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/8390/8390.c | 1 + drivers/net/ethernet/8390/8390p.c | 1 + drivers/net/ethernet/8390/apne.c | 1 + drivers/net/ethernet/8390/hydra.c | 1 + drivers/net/ethernet/8390/stnic.c | 1 + drivers/net/ethernet/8390/zorro8390.c | 1 + 6 files changed, 6 insertions(+) diff --git a/drivers/net/ethernet/8390/8390.c b/drivers/net/ethernet/8390/8390.c index 0e0aa4016858..c5636245f1ca 100644 --- a/drivers/net/ethernet/8390/8390.c +++ b/drivers/net/ethernet/8390/8390.c @@ -100,4 +100,5 @@ static void __exit ns8390_module_exit(void) module_init(ns8390_module_init); module_exit(ns8390_module_exit); #endif /* MODULE */ +MODULE_DESCRIPTION("National Semiconductor 8390 core driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/8390/8390p.c b/drivers/net/ethernet/8390/8390p.c index 6834742057b3..6d429b11e9c6 100644 --- a/drivers/net/ethernet/8390/8390p.c +++ b/drivers/net/ethernet/8390/8390p.c @@ -102,4 +102,5 @@ static void __exit NS8390p_cleanup_module(void) module_init(NS8390p_init_module); module_exit(NS8390p_cleanup_module); +MODULE_DESCRIPTION("National Semiconductor 8390 core for ISA driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/8390/apne.c b/drivers/net/ethernet/8390/apne.c index a09f383dd249..828edca8d30c 100644 --- a/drivers/net/ethernet/8390/apne.c +++ b/drivers/net/ethernet/8390/apne.c @@ -610,4 +610,5 @@ static int init_pcmcia(void) return 1; } +MODULE_DESCRIPTION("National Semiconductor 8390 Amiga PCMCIA ethernet driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/8390/hydra.c b/drivers/net/ethernet/8390/hydra.c index 24f49a8ff903..fd9dcdc356e6 100644 --- a/drivers/net/ethernet/8390/hydra.c +++ b/drivers/net/ethernet/8390/hydra.c @@ -270,4 +270,5 @@ static void __exit hydra_cleanup_module(void) module_init(hydra_init_module); module_exit(hydra_cleanup_module); +MODULE_DESCRIPTION("Zorro-II Hydra 8390 ethernet driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/8390/stnic.c b/drivers/net/ethernet/8390/stnic.c index 265976e3b64a..6cc0e190aa79 100644 --- a/drivers/net/ethernet/8390/stnic.c +++ b/drivers/net/ethernet/8390/stnic.c @@ -296,4 +296,5 @@ static void __exit stnic_cleanup(void) module_init(stnic_probe); module_exit(stnic_cleanup); +MODULE_DESCRIPTION("National Semiconductor DP83902AV ethernet driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/8390/zorro8390.c b/drivers/net/ethernet/8390/zorro8390.c index d70390e9d03d..c24dd4fe7a10 100644 --- a/drivers/net/ethernet/8390/zorro8390.c +++ b/drivers/net/ethernet/8390/zorro8390.c @@ -443,4 +443,5 @@ static void __exit zorro8390_cleanup_module(void) module_init(zorro8390_init_module); module_exit(zorro8390_cleanup_module); +MODULE_DESCRIPTION("Zorro NS8390-based ethernet driver"); MODULE_LICENSE("GPL"); -- cgit From 39535d7ff6c1e5bda0a3f3c87250bab63e910969 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 23 Jan 2024 11:03:23 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for Broadcom bgmac W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the Broadcom iProc GBit driver. Signed-off-by: Breno Leitao Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20240123190332.677489-3-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bcm4908_enet.c | 1 + drivers/net/ethernet/broadcom/bgmac-bcma-mdio.c | 1 + drivers/net/ethernet/broadcom/bgmac-bcma.c | 1 + drivers/net/ethernet/broadcom/bgmac-platform.c | 1 + drivers/net/ethernet/broadcom/bgmac.c | 1 + 5 files changed, 5 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bcm4908_enet.c b/drivers/net/ethernet/broadcom/bcm4908_enet.c index 3e7c8671cd11..72df1bb10172 100644 --- a/drivers/net/ethernet/broadcom/bcm4908_enet.c +++ b/drivers/net/ethernet/broadcom/bcm4908_enet.c @@ -793,5 +793,6 @@ static struct platform_driver bcm4908_enet_driver = { }; module_platform_driver(bcm4908_enet_driver); +MODULE_DESCRIPTION("Broadcom BCM4908 Gigabit Ethernet driver"); MODULE_LICENSE("GPL v2"); MODULE_DEVICE_TABLE(of, bcm4908_enet_of_match); diff --git a/drivers/net/ethernet/broadcom/bgmac-bcma-mdio.c b/drivers/net/ethernet/broadcom/bgmac-bcma-mdio.c index 9b83d5361699..50b8e97a811d 100644 --- a/drivers/net/ethernet/broadcom/bgmac-bcma-mdio.c +++ b/drivers/net/ethernet/broadcom/bgmac-bcma-mdio.c @@ -260,4 +260,5 @@ void bcma_mdio_mii_unregister(struct mii_bus *mii_bus) EXPORT_SYMBOL_GPL(bcma_mdio_mii_unregister); MODULE_AUTHOR("Rafał Miłecki"); +MODULE_DESCRIPTION("Broadcom iProc GBit BCMA MDIO helpers"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/broadcom/bgmac-bcma.c b/drivers/net/ethernet/broadcom/bgmac-bcma.c index 6e4f36aaf5db..36f9bad28e6a 100644 --- a/drivers/net/ethernet/broadcom/bgmac-bcma.c +++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c @@ -362,4 +362,5 @@ module_init(bgmac_init) module_exit(bgmac_exit) MODULE_AUTHOR("Rafał Miłecki"); +MODULE_DESCRIPTION("Broadcom iProc GBit BCMA interface driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c b/drivers/net/ethernet/broadcom/bgmac-platform.c index 0b21fd5bd457..77425c7a32db 100644 --- a/drivers/net/ethernet/broadcom/bgmac-platform.c +++ b/drivers/net/ethernet/broadcom/bgmac-platform.c @@ -298,4 +298,5 @@ static struct platform_driver bgmac_enet_driver = { }; module_platform_driver(bgmac_enet_driver); +MODULE_DESCRIPTION("Broadcom iProc GBit platform interface driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index 448a1b90de5e..6ffdc4229407 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -1626,4 +1626,5 @@ int bgmac_enet_resume(struct bgmac *bgmac) EXPORT_SYMBOL_GPL(bgmac_enet_resume); MODULE_AUTHOR("Rafał Miłecki"); +MODULE_DESCRIPTION("Broadcom iProc GBit driver"); MODULE_LICENSE("GPL"); -- cgit From bb567fbbbbb41d61b685e224098806a90df8cef2 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 23 Jan 2024 11:03:24 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for liquidio W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the Cavium Liquidio. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240123190332.677489-4-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cavium/liquidio/lio_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c index 9cc6303c82ff..f38d31bfab1b 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_core.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c @@ -27,6 +27,7 @@ #include "octeon_network.h" MODULE_AUTHOR("Cavium Networks, "); +MODULE_DESCRIPTION("Cavium LiquidIO Intelligent Server Adapter Core"); MODULE_LICENSE("GPL"); /* OOM task polling interval */ -- cgit From 53c83e2d36484f8df87792bb5368653bdb441014 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 23 Jan 2024 11:03:25 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for ep93xxx_eth W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the Cirrus EP93xx ethernet driver. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240123190332.677489-5-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cirrus/ep93xx_eth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/cirrus/ep93xx_eth.c b/drivers/net/ethernet/cirrus/ep93xx_eth.c index 1c2a540db13d..1f495cfd7959 100644 --- a/drivers/net/ethernet/cirrus/ep93xx_eth.c +++ b/drivers/net/ethernet/cirrus/ep93xx_eth.c @@ -868,5 +868,6 @@ static struct platform_driver ep93xx_eth_driver = { module_platform_driver(ep93xx_eth_driver); +MODULE_DESCRIPTION("Cirrus EP93xx Ethernet driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:ep93xx-eth"); -- cgit From 27881ca8c8e1b6179ac41dc787cbfc3cb4362ff8 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 23 Jan 2024 11:03:26 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for nps_enet W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the EZchip NPS ethernet driver. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240123190332.677489-6-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ezchip/nps_enet.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/ezchip/nps_enet.c b/drivers/net/ethernet/ezchip/nps_enet.c index 07c2b701b5fa..9ebe751c1df0 100644 --- a/drivers/net/ethernet/ezchip/nps_enet.c +++ b/drivers/net/ethernet/ezchip/nps_enet.c @@ -661,4 +661,5 @@ static struct platform_driver nps_enet_driver = { module_platform_driver(nps_enet_driver); MODULE_AUTHOR("EZchip Semiconductor"); +MODULE_DESCRIPTION("EZchip NPS Ethernet driver"); MODULE_LICENSE("GPL v2"); -- cgit From 07c42d237567d47225499d0586b9b90a432a7b58 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 23 Jan 2024 11:03:27 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for enetc W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the NXP ENETC Ethernet driver. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240123190332.677489-7-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index cffbf27c4656..bfdbdab443ae 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -3216,4 +3216,5 @@ void enetc_pci_remove(struct pci_dev *pdev) } EXPORT_SYMBOL_GPL(enetc_pci_remove); +MODULE_DESCRIPTION("NXP ENETC Ethernet driver"); MODULE_LICENSE("Dual BSD/GPL"); -- cgit From 2e87576488552aab742391b6c442beedffd31abe Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 23 Jan 2024 11:03:28 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for fec W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the FEC (MPC8xx) Ethernet controller. Signed-off-by: Breno Leitao Reviewed-by: Wei Fang Link: https://lore.kernel.org/r/20240123190332.677489-8-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index d42594f32275..4b0259e9269a 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -4769,4 +4769,5 @@ static struct platform_driver fec_driver = { module_platform_driver(fec_driver); +MODULE_DESCRIPTION("NXP Fast Ethernet Controller (FEC) driver"); MODULE_LICENSE("GPL"); -- cgit From 8183c470c17602275ec1e3525d010f6c9cd383e9 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 23 Jan 2024 11:03:29 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for fsl_pq_mdio W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the Freescale PQ MDIO driver. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240123190332.677489-9-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fsl_pq_mdio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/freescale/fsl_pq_mdio.c b/drivers/net/ethernet/freescale/fsl_pq_mdio.c index 70dd982a5edc..026f7270a54d 100644 --- a/drivers/net/ethernet/freescale/fsl_pq_mdio.c +++ b/drivers/net/ethernet/freescale/fsl_pq_mdio.c @@ -531,4 +531,5 @@ static struct platform_driver fsl_pq_mdio_driver = { module_platform_driver(fsl_pq_mdio_driver); +MODULE_DESCRIPTION("Freescale PQ MDIO helpers"); MODULE_LICENSE("GPL"); -- cgit From 07d1e0ce874377a88c13bb56a336d6c544367837 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 23 Jan 2024 11:03:30 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for litex W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the LiteX Liteeth Ethernet device. Signed-off-by: Breno Leitao Acked-by: Gabriel Somlo Link: https://lore.kernel.org/r/20240123190332.677489-10-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/litex/litex_liteeth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/litex/litex_liteeth.c b/drivers/net/ethernet/litex/litex_liteeth.c index 5182fe737c37..ff54fbe41bcc 100644 --- a/drivers/net/ethernet/litex/litex_liteeth.c +++ b/drivers/net/ethernet/litex/litex_liteeth.c @@ -318,4 +318,5 @@ static struct platform_driver liteeth_driver = { module_platform_driver(liteeth_driver); MODULE_AUTHOR("Joel Stanley "); +MODULE_DESCRIPTION("LiteX Liteeth Ethernet driver"); MODULE_LICENSE("GPL"); -- cgit From bdc6734115d7f66d8bea155454d4ce9259821660 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 23 Jan 2024 11:03:31 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for rvu_mbox W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the Marvel RVU mbox driver. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240123190332.677489-11-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/mbox.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.c b/drivers/net/ethernet/marvell/octeontx2/af/mbox.c index 9690ac01f02c..b92264d0a77e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.c @@ -413,4 +413,5 @@ const char *otx2_mbox_id2name(u16 id) EXPORT_SYMBOL(otx2_mbox_id2name); MODULE_AUTHOR("Marvell."); +MODULE_DESCRIPTION("Marvell RVU NIC Mbox helpers"); MODULE_LICENSE("GPL v2"); -- cgit From 269009893146c495f41e9572dd9319e787c2eba9 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 24 Jan 2024 20:15:52 +0100 Subject: xsk: recycle buffer in case Rx queue was full Add missing xsk_buff_free() call when __xsk_rcv_zc() failed to produce descriptor to XSK Rx queue. Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") Acked-by: Magnus Karlsson Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20240124191602.566724-2-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9f13aa3353e3..1eadfac03cc4 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -167,8 +167,10 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) contd = XDP_PKT_CONTD; err = __xsk_rcv_zc(xs, xskb, len, contd); - if (err || likely(!frags)) - goto out; + if (err) + goto err; + if (likely(!frags)) + return 0; xskb_list = &xskb->pool->xskb_list; list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { @@ -177,11 +179,13 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) len = pos->xdp.data_end - pos->xdp.data; err = __xsk_rcv_zc(xs, pos, len, contd); if (err) - return err; + goto err; list_del(&pos->xskb_list_node); } -out: + return 0; +err: + xsk_buff_free(xdp); return err; } -- cgit From f7f6aa8e24383fbb11ac55942e66da9660110f80 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 24 Jan 2024 20:15:53 +0100 Subject: xsk: make xsk_buff_pool responsible for clearing xdp_buff::flags XDP multi-buffer support introduced XDP_FLAGS_HAS_FRAGS flag that is used by drivers to notify data path whether xdp_buff contains fragments or not. Data path looks up mentioned flag on first buffer that occupies the linear part of xdp_buff, so drivers only modify it there. This is sufficient for SKB and XDP_DRV modes as usually xdp_buff is allocated on stack or it resides within struct representing driver's queue and fragments are carried via skb_frag_t structs. IOW, we are dealing with only one xdp_buff. ZC mode though relies on list of xdp_buff structs that is carried via xsk_buff_pool::xskb_list, so ZC data path has to make sure that fragments do *not* have XDP_FLAGS_HAS_FRAGS set. Otherwise, xsk_buff_free() could misbehave if it would be executed against xdp_buff that carries a frag with XDP_FLAGS_HAS_FRAGS flag set. Such scenario can take place when within supplied XDP program bpf_xdp_adjust_tail() is used with negative offset that would in turn release the tail fragment from multi-buffer frame. Calling xsk_buff_free() on tail fragment with XDP_FLAGS_HAS_FRAGS would result in releasing all the nodes from xskb_list that were produced by driver before XDP program execution, which is not what is intended - only tail fragment should be deleted from xskb_list and then it should be put onto xsk_buff_pool::free_list. Such multi-buffer frame will never make it up to user space, so from AF_XDP application POV there would be no traffic running, however due to free_list getting constantly new nodes, driver will be able to feed HW Rx queue with recycled buffers. Bottom line is that instead of traffic being redirected to user space, it would be continuously dropped. To fix this, let us clear the mentioned flag on xsk_buff_pool side during xdp_buff initialization, which is what should have been done right from the start of XSK multi-buffer support. Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support") Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support") Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20240124191602.566724-3-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 1 - drivers/net/ethernet/intel/ice/ice_xsk.c | 1 - include/net/xdp_sock_drv.h | 1 + net/xdp/xsk_buff_pool.c | 1 + 4 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index af7d5fa6cdc1..82aca0d16a3e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -498,7 +498,6 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) xdp_res = i40e_run_xdp_zc(rx_ring, first, xdp_prog); i40e_handle_xdp_result_zc(rx_ring, first, rx_desc, &rx_packets, &rx_bytes, xdp_res, &failure); - first->flags = 0; next_to_clean = next_to_process; if (failure) break; diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 5d1ae8e4058a..d9073a618ad6 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -895,7 +895,6 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) if (!first) { first = xdp; - xdp_buff_clear_frags_flag(first); } else if (ice_add_xsk_frag(rx_ring, first, xdp, size)) { break; } diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 526c1e7f505e..9819e2af0378 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -164,6 +164,7 @@ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; xdp->data_meta = xdp->data; xdp->data_end = xdp->data + size; + xdp->flags = 0; } static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 28711cc44ced..ce60ecd48a4d 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -555,6 +555,7 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; xskb->xdp.data_meta = xskb->xdp.data; + xskb->xdp.flags = 0; if (pool->dma_need_sync) { dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, -- cgit From c5114710c8ce86b8317e9b448f4fd15c711c2a82 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 24 Jan 2024 20:15:54 +0100 Subject: xsk: fix usage of multi-buffer BPF helpers for ZC XDP Currently when packet is shrunk via bpf_xdp_adjust_tail() and memory type is set to MEM_TYPE_XSK_BUFF_POOL, null ptr dereference happens: [1136314.192256] BUG: kernel NULL pointer dereference, address: 0000000000000034 [1136314.203943] #PF: supervisor read access in kernel mode [1136314.213768] #PF: error_code(0x0000) - not-present page [1136314.223550] PGD 0 P4D 0 [1136314.230684] Oops: 0000 [#1] PREEMPT SMP NOPTI [1136314.239621] CPU: 8 PID: 54203 Comm: xdpsock Not tainted 6.6.0+ #257 [1136314.250469] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019 [1136314.265615] RIP: 0010:__xdp_return+0x6c/0x210 [1136314.274653] Code: ad 00 48 8b 47 08 49 89 f8 a8 01 0f 85 9b 01 00 00 0f 1f 44 00 00 f0 41 ff 48 34 75 32 4c 89 c7 e9 79 cd 80 ff 83 fe 03 75 17 41 34 01 0f 85 02 01 00 00 48 89 cf e9 22 cc 1e 00 e9 3d d2 86 [1136314.302907] RSP: 0018:ffffc900089f8db0 EFLAGS: 00010246 [1136314.312967] RAX: ffffc9003168aed0 RBX: ffff8881c3300000 RCX: 0000000000000000 [1136314.324953] RDX: 0000000000000000 RSI: 0000000000000003 RDI: ffffc9003168c000 [1136314.336929] RBP: 0000000000000ae0 R08: 0000000000000002 R09: 0000000000010000 [1136314.348844] R10: ffffc9000e495000 R11: 0000000000000040 R12: 0000000000000001 [1136314.360706] R13: 0000000000000524 R14: ffffc9003168aec0 R15: 0000000000000001 [1136314.373298] FS: 00007f8df8bbcb80(0000) GS:ffff8897e0e00000(0000) knlGS:0000000000000000 [1136314.386105] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [1136314.396532] CR2: 0000000000000034 CR3: 00000001aa912002 CR4: 00000000007706f0 [1136314.408377] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [1136314.420173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [1136314.431890] PKRU: 55555554 [1136314.439143] Call Trace: [1136314.446058] [1136314.452465] ? __die+0x20/0x70 [1136314.459881] ? page_fault_oops+0x15b/0x440 [1136314.468305] ? exc_page_fault+0x6a/0x150 [1136314.476491] ? asm_exc_page_fault+0x22/0x30 [1136314.484927] ? __xdp_return+0x6c/0x210 [1136314.492863] bpf_xdp_adjust_tail+0x155/0x1d0 [1136314.501269] bpf_prog_ccc47ae29d3b6570_xdp_sock_prog+0x15/0x60 [1136314.511263] ice_clean_rx_irq_zc+0x206/0xc60 [ice] [1136314.520222] ? ice_xmit_zc+0x6e/0x150 [ice] [1136314.528506] ice_napi_poll+0x467/0x670 [ice] [1136314.536858] ? ttwu_do_activate.constprop.0+0x8f/0x1a0 [1136314.546010] __napi_poll+0x29/0x1b0 [1136314.553462] net_rx_action+0x133/0x270 [1136314.561619] __do_softirq+0xbe/0x28e [1136314.569303] do_softirq+0x3f/0x60 This comes from __xdp_return() call with xdp_buff argument passed as NULL which is supposed to be consumed by xsk_buff_free() call. To address this properly, in ZC case, a node that represents the frag being removed has to be pulled out of xskb_list. Introduce appropriate xsk helpers to do such node operation and use them accordingly within bpf_xdp_adjust_tail(). Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") Acked-by: Magnus Karlsson # For the xsk header part Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20240124191602.566724-4-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock_drv.h | 26 ++++++++++++++++++++++++++ net/core/filter.c | 42 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 9819e2af0378..c9aec9ab6191 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -159,6 +159,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) return ret; } +static inline void xsk_buff_del_tail(struct xdp_buff *tail) +{ + struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); + + list_del(&xskb->xskb_list_node); +} + +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *frag; + + frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, + xskb_list_node); + return &frag->xdp; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; @@ -351,6 +368,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) return NULL; } +static inline void xsk_buff_del_tail(struct xdp_buff *tail) +{ +} + +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) +{ + return NULL; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { } diff --git a/net/core/filter.c b/net/core/filter.c index 24061f29c9dd..36fb5ae8af69 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -83,6 +83,7 @@ #include #include #include +#include #include "dev.h" @@ -4096,6 +4097,40 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) return 0; } +static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, + struct xdp_mem_info *mem_info, bool release) +{ + struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); + + if (release) { + xsk_buff_del_tail(zc_frag); + __xdp_return(NULL, mem_info, false, zc_frag); + } else { + zc_frag->data_end -= shrink; + } +} + +static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, + int shrink) +{ + struct xdp_mem_info *mem_info = &xdp->rxq->mem; + bool release = skb_frag_size(frag) == shrink; + + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { + bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release); + goto out; + } + + if (release) { + struct page *page = skb_frag_page(frag); + + __xdp_return(page_address(page), mem_info, false, NULL); + } + +out: + return release; +} + static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); @@ -4110,12 +4145,7 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) len_free += shrink; offset -= shrink; - - if (skb_frag_size(frag) == shrink) { - struct page *page = skb_frag_page(frag); - - __xdp_return(page_address(page), &xdp->rxq->mem, - false, NULL); + if (bpf_xdp_shrink_data(xdp, frag, shrink)) { n_frags_free++; } else { skb_frag_size_sub(frag, shrink); -- cgit From ad2047cf5d9313200e308612aed516548873d124 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 24 Jan 2024 20:15:55 +0100 Subject: ice: work on pre-XDP prog frag count Fix an OOM panic in XDP_DRV mode when a XDP program shrinks a multi-buffer packet by 4k bytes and then redirects it to an AF_XDP socket. Since support for handling multi-buffer frames was added to XDP, usage of bpf_xdp_adjust_tail() helper within XDP program can free the page that given fragment occupies and in turn decrease the fragment count within skb_shared_info that is embedded in xdp_buff struct. In current ice driver codebase, it can become problematic when page recycling logic decides not to reuse the page. In such case, __page_frag_cache_drain() is used with ice_rx_buf::pagecnt_bias that was not adjusted after refcount of page was changed by XDP prog which in turn does not drain the refcount to 0 and page is never freed. To address this, let us store the count of frags before the XDP program was executed on Rx ring struct. This will be used to compare with current frag count from skb_shared_info embedded in xdp_buff. A smaller value in the latter indicates that XDP prog freed frag(s). Then, for given delta decrement pagecnt_bias for XDP_DROP verdict. While at it, let us also handle the EOP frag within ice_set_rx_bufs_act() to make our life easier, so all of the adjustments needed to be applied against freed frags are performed in the single place. Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side") Acked-by: Magnus Karlsson Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20240124191602.566724-5-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/ice/ice_txrx.c | 14 +++++++----- drivers/net/ethernet/intel/ice/ice_txrx.h | 1 + drivers/net/ethernet/intel/ice/ice_txrx_lib.h | 31 +++++++++++++++++++-------- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 74d13cc5a3a7..0c9b4aa8a049 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -603,9 +603,7 @@ out_failure: ret = ICE_XDP_CONSUMED; } exit: - rx_buf->act = ret; - if (unlikely(xdp_buff_has_frags(xdp))) - ice_set_rx_bufs_act(xdp, rx_ring, ret); + ice_set_rx_bufs_act(xdp, rx_ring, ret); } /** @@ -893,14 +891,17 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, } if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) { - if (unlikely(xdp_buff_has_frags(xdp))) - ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED); + ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED); return -ENOMEM; } __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page, rx_buf->page_offset, size); sinfo->xdp_frags_size += size; + /* remember frag count before XDP prog execution; bpf_xdp_adjust_tail() + * can pop off frags but driver has to handle it on its own + */ + rx_ring->nr_frags = sinfo->nr_frags; if (page_is_pfmemalloc(rx_buf->page)) xdp_buff_set_frag_pfmemalloc(xdp); @@ -1251,6 +1252,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) xdp->data = NULL; rx_ring->first_desc = ntc; + rx_ring->nr_frags = 0; continue; construct_skb: if (likely(ice_ring_uses_build_skb(rx_ring))) @@ -1266,10 +1268,12 @@ construct_skb: ICE_XDP_CONSUMED); xdp->data = NULL; rx_ring->first_desc = ntc; + rx_ring->nr_frags = 0; break; } xdp->data = NULL; rx_ring->first_desc = ntc; + rx_ring->nr_frags = 0; stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S); if (unlikely(ice_test_staterr(rx_desc->wb.status_error0, diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index b3379ff73674..af955b0e5dc5 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -358,6 +358,7 @@ struct ice_rx_ring { struct ice_tx_ring *xdp_ring; struct ice_rx_ring *next; /* pointer to next ring in q_vector */ struct xsk_buff_pool *xsk_pool; + u32 nr_frags; dma_addr_t dma; /* physical address of ring */ u16 rx_buf_len; u8 dcb_tc; /* Traffic class of ring */ diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h index 762047508619..afcead4baef4 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h @@ -12,26 +12,39 @@ * act: action to store onto Rx buffers related to XDP buffer parts * * Set action that should be taken before putting Rx buffer from first frag - * to one before last. Last one is handled by caller of this function as it - * is the EOP frag that is currently being processed. This function is - * supposed to be called only when XDP buffer contains frags. + * to the last. */ static inline void ice_set_rx_bufs_act(struct xdp_buff *xdp, const struct ice_rx_ring *rx_ring, const unsigned int act) { - const struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); - u32 first = rx_ring->first_desc; - u32 nr_frags = sinfo->nr_frags; + u32 sinfo_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags; + u32 nr_frags = rx_ring->nr_frags + 1; + u32 idx = rx_ring->first_desc; u32 cnt = rx_ring->count; struct ice_rx_buf *buf; for (int i = 0; i < nr_frags; i++) { - buf = &rx_ring->rx_buf[first]; + buf = &rx_ring->rx_buf[idx]; buf->act = act; - if (++first == cnt) - first = 0; + if (++idx == cnt) + idx = 0; + } + + /* adjust pagecnt_bias on frags freed by XDP prog */ + if (sinfo_frags < rx_ring->nr_frags && act == ICE_XDP_CONSUMED) { + u32 delta = rx_ring->nr_frags - sinfo_frags; + + while (delta) { + if (idx == 0) + idx = cnt - 1; + else + idx--; + buf = &rx_ring->rx_buf[idx]; + buf->pagecnt_bias--; + delta--; + } } } -- cgit From 83014323c642b8faa2d64a5f303b41c019322478 Mon Sep 17 00:00:00 2001 From: Tirthendu Sarkar Date: Wed, 24 Jan 2024 20:15:56 +0100 Subject: i40e: handle multi-buffer packets that are shrunk by xdp prog XDP programs can shrink packets by calling the bpf_xdp_adjust_tail() helper function. For multi-buffer packets this may lead to reduction of frag count stored in skb_shared_info area of the xdp_buff struct. This results in issues with the current handling of XDP_PASS and XDP_DROP cases. For XDP_PASS, currently skb is being built using frag count of xdp_buffer before it was processed by XDP prog and thus will result in an inconsistent skb when frag count gets reduced by XDP prog. To fix this, get correct frag count while building the skb instead of using pre-obtained frag count. For XDP_DROP, current page recycling logic will not reuse the page but instead will adjust the pagecnt_bias so that the page can be freed. This again results in inconsistent behavior as the page refcnt has already been changed by the helper while freeing the frag(s) as part of shrinking the packet. To fix this, only adjust pagecnt_bias for buffers that are stillpart of the packet post-xdp prog run. Fixes: e213ced19bef ("i40e: add support for XDP multi-buffer Rx") Reported-by: Maciej Fijalkowski Signed-off-by: Tirthendu Sarkar Link: https://lore.kernel.org/r/20240124191602.566724-6-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 40 +++++++++++++++++------------ 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 971ba3322038..1f0a0f13a334 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2087,7 +2087,8 @@ static void i40e_put_rx_buffer(struct i40e_ring *rx_ring, static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res, struct xdp_buff *xdp) { - u32 next = rx_ring->next_to_clean; + u32 nr_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags; + u32 next = rx_ring->next_to_clean, i = 0; struct i40e_rx_buffer *rx_buffer; xdp->flags = 0; @@ -2100,10 +2101,10 @@ static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res, if (!rx_buffer->page) continue; - if (xdp_res == I40E_XDP_CONSUMED) - rx_buffer->pagecnt_bias++; - else + if (xdp_res != I40E_XDP_CONSUMED) i40e_rx_buffer_flip(rx_buffer, xdp->frame_sz); + else if (i++ <= nr_frags) + rx_buffer->pagecnt_bias++; /* EOP buffer will be put in i40e_clean_rx_irq() */ if (next == rx_ring->next_to_process) @@ -2117,20 +2118,20 @@ static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res, * i40e_construct_skb - Allocate skb and populate it * @rx_ring: rx descriptor ring to transact packets on * @xdp: xdp_buff pointing to the data - * @nr_frags: number of buffers for the packet * * This function allocates an skb. It then populates it with the page * data from the current receive descriptor, taking care to set up the * skb correctly. */ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, - struct xdp_buff *xdp, - u32 nr_frags) + struct xdp_buff *xdp) { unsigned int size = xdp->data_end - xdp->data; struct i40e_rx_buffer *rx_buffer; + struct skb_shared_info *sinfo; unsigned int headlen; struct sk_buff *skb; + u32 nr_frags = 0; /* prefetch first cache line of first page */ net_prefetch(xdp->data); @@ -2168,6 +2169,10 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, memcpy(__skb_put(skb, headlen), xdp->data, ALIGN(headlen, sizeof(long))); + if (unlikely(xdp_buff_has_frags(xdp))) { + sinfo = xdp_get_shared_info_from_buff(xdp); + nr_frags = sinfo->nr_frags; + } rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); /* update all of the pointers */ size -= headlen; @@ -2187,9 +2192,8 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, } if (unlikely(xdp_buff_has_frags(xdp))) { - struct skb_shared_info *sinfo, *skinfo = skb_shinfo(skb); + struct skb_shared_info *skinfo = skb_shinfo(skb); - sinfo = xdp_get_shared_info_from_buff(xdp); memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0], sizeof(skb_frag_t) * nr_frags); @@ -2212,17 +2216,17 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, * i40e_build_skb - Build skb around an existing buffer * @rx_ring: Rx descriptor ring to transact packets on * @xdp: xdp_buff pointing to the data - * @nr_frags: number of buffers for the packet * * This function builds an skb around an existing Rx buffer, taking care * to set up the skb correctly and avoid any memcpy overhead. */ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, - struct xdp_buff *xdp, - u32 nr_frags) + struct xdp_buff *xdp) { unsigned int metasize = xdp->data - xdp->data_meta; + struct skb_shared_info *sinfo; struct sk_buff *skb; + u32 nr_frags; /* Prefetch first cache line of first page. If xdp->data_meta * is unused, this points exactly as xdp->data, otherwise we @@ -2231,6 +2235,11 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, */ net_prefetch(xdp->data_meta); + if (unlikely(xdp_buff_has_frags(xdp))) { + sinfo = xdp_get_shared_info_from_buff(xdp); + nr_frags = sinfo->nr_frags; + } + /* build an skb around the page buffer */ skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz); if (unlikely(!skb)) @@ -2243,9 +2252,6 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, skb_metadata_set(skb, metasize); if (unlikely(xdp_buff_has_frags(xdp))) { - struct skb_shared_info *sinfo; - - sinfo = xdp_get_shared_info_from_buff(xdp); xdp_update_skb_shared_info(skb, nr_frags, sinfo->xdp_frags_size, nr_frags * xdp->frame_sz, @@ -2589,9 +2595,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget, total_rx_bytes += size; } else { if (ring_uses_build_skb(rx_ring)) - skb = i40e_build_skb(rx_ring, xdp, nfrags); + skb = i40e_build_skb(rx_ring, xdp); else - skb = i40e_construct_skb(rx_ring, xdp, nfrags); + skb = i40e_construct_skb(rx_ring, xdp); /* drop if we failed to retrieve a buffer */ if (!skb) { -- cgit From 2ee788c06493d02ee85855414cca39825e768aaf Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 24 Jan 2024 20:15:57 +0100 Subject: ice: remove redundant xdp_rxq_info registration xdp_rxq_info struct can be registered by drivers via two functions - xdp_rxq_info_reg() and __xdp_rxq_info_reg(). The latter one allows drivers that support XDP multi-buffer to set up xdp_rxq_info::frag_size which in turn will make it possible to grow the packet via bpf_xdp_adjust_tail() BPF helper. Currently, ice registers xdp_rxq_info in two spots: 1) ice_setup_rx_ring() // via xdp_rxq_info_reg(), BUG 2) ice_vsi_cfg_rxq() // via __xdp_rxq_info_reg(), OK Cited commit under fixes tag took care of setting up frag_size and updated registration scheme in 2) but it did not help as 1) is called before 2) and as shown above it uses old registration function. This means that 2) sees that xdp_rxq_info is already registered and never calls __xdp_rxq_info_reg() which leaves us with xdp_rxq_info::frag_size being set to 0. To fix this misbehavior, simply remove xdp_rxq_info_reg() call from ice_setup_rx_ring(). Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side") Acked-by: Magnus Karlsson Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20240124191602.566724-7-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/ice/ice_txrx.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 0c9b4aa8a049..97d41d6ebf1f 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -513,11 +513,6 @@ int ice_setup_rx_ring(struct ice_rx_ring *rx_ring) if (ice_is_xdp_ena_vsi(rx_ring->vsi)) WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog); - if (rx_ring->vsi->type == ICE_VSI_PF && - !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) - if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, - rx_ring->q_index, rx_ring->q_vector->napi.napi_id)) - goto err; return 0; err: -- cgit From 290779905d09d5fdf6caa4f58ddefc3f4db0c0a9 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 24 Jan 2024 20:15:58 +0100 Subject: intel: xsk: initialize skb_frag_t::bv_offset in ZC drivers Ice and i40e ZC drivers currently set offset of a frag within skb_shared_info to 0, which is incorrect. xdp_buffs that come from xsk_buff_pool always have 256 bytes of a headroom, so they need to be taken into account to retrieve xdp_buff::data via skb_frag_address(). Otherwise, bpf_xdp_frags_increase_tail() would be starting its job from xdp_buff::data_hard_start which would result in overwriting existing payload. Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support") Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support") Acked-by: Magnus Karlsson Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20240124191602.566724-8-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 3 ++- drivers/net/ethernet/intel/ice/ice_xsk.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 82aca0d16a3e..11500003af0d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -414,7 +414,8 @@ i40e_add_xsk_frag(struct i40e_ring *rx_ring, struct xdp_buff *first, } __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, - virt_to_page(xdp->data_hard_start), 0, size); + virt_to_page(xdp->data_hard_start), + XDP_PACKET_HEADROOM, size); sinfo->xdp_frags_size += size; xsk_buff_add_frag(xdp); diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index d9073a618ad6..8b81a1677045 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -825,7 +825,8 @@ ice_add_xsk_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *first, } __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, - virt_to_page(xdp->data_hard_start), 0, size); + virt_to_page(xdp->data_hard_start), + XDP_PACKET_HEADROOM, size); sinfo->xdp_frags_size += size; xsk_buff_add_frag(xdp); -- cgit From 3de38c87174225487fc93befeea7d380db80aef6 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 24 Jan 2024 20:15:59 +0100 Subject: ice: update xdp_rxq_info::frag_size for ZC enabled Rx queue Now that ice driver correctly sets up frag_size in xdp_rxq_info, let us make it work for ZC multi-buffer as well. ice_rx_ring::rx_buf_len for ZC is being set via xsk_pool_get_rx_frame_size() and this needs to be propagated up to xdp_rxq_info. Use a bigger hammer and instead of unregistering only xdp_rxq_info's memory model, unregister it altogether and register it again and have xdp_rxq_info with correct frag_size value. Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support") Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20240124191602.566724-9-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/ice/ice_base.c | 37 +++++++++++++++++++------------ 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 533b923cae2d..7ac847718882 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -547,19 +547,27 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) ring->rx_buf_len = ring->vsi->rx_buf_len; if (ring->vsi->type == ICE_VSI_PF) { - if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) - /* coverity[check_return] */ - __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, - ring->q_index, - ring->q_vector->napi.napi_id, - ring->vsi->rx_buf_len); + if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { + err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, + ring->q_index, + ring->q_vector->napi.napi_id, + ring->rx_buf_len); + if (err) + return err; + } ring->xsk_pool = ice_xsk_pool(ring); if (ring->xsk_pool) { - xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); + xdp_rxq_info_unreg(&ring->xdp_rxq); ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool); + err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, + ring->q_index, + ring->q_vector->napi.napi_id, + ring->rx_buf_len); + if (err) + return err; err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, MEM_TYPE_XSK_BUFF_POOL, NULL); @@ -571,13 +579,14 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", ring->q_index); } else { - if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) - /* coverity[check_return] */ - __xdp_rxq_info_reg(&ring->xdp_rxq, - ring->netdev, - ring->q_index, - ring->q_vector->napi.napi_id, - ring->vsi->rx_buf_len); + if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { + err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, + ring->q_index, + ring->q_vector->napi.napi_id, + ring->rx_buf_len); + if (err) + return err; + } err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, MEM_TYPE_PAGE_SHARED, -- cgit From fbadd83a612c3b7aad2987893faca6bd24aaebb3 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 24 Jan 2024 20:16:00 +0100 Subject: xdp: reflect tail increase for MEM_TYPE_XSK_BUFF_POOL XSK ZC Rx path calculates the size of data that will be posted to XSK Rx queue via subtracting xdp_buff::data_end from xdp_buff::data. In bpf_xdp_frags_increase_tail(), when underlying memory type of xdp_rxq_info is MEM_TYPE_XSK_BUFF_POOL, add offset to data_end in tail fragment, so that later on user space will be able to take into account the amount of bytes added by XDP program. Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20240124191602.566724-10-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 36fb5ae8af69..ef3e78b6a39c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4093,6 +4093,8 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); skb_frag_size_add(frag, offset); sinfo->xdp_frags_size += offset; + if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) + xsk_buff_get_tail(xdp)->data_end += offset; return 0; } -- cgit From a045d2f2d03d23e7db6772dd83e0ba2705dfad93 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 24 Jan 2024 20:16:01 +0100 Subject: i40e: set xdp_rxq_info::frag_size i40e support XDP multi-buffer so it is supposed to use __xdp_rxq_info_reg() instead of xdp_rxq_info_reg() and set the frag_size. It can not be simply converted at existing callsite because rx_buf_len could be un-initialized, so let us register xdp_rxq_info within i40e_configure_rx_ring(), which happen to be called with already initialized rx_buf_len value. Commit 5180ff1364bc ("i40e: use int for i40e_status") converted 'err' to int, so two variables to deal with return codes are not needed within i40e_configure_rx_ring(). Remove 'ret' and use 'err' to handle status from xdp_rxq_info registration. Fixes: e213ced19bef ("i40e: add support for XDP multi-buffer Rx") Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20240124191602.566724-11-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/i40e/i40e_main.c | 40 +++++++++++++++++------------ drivers/net/ethernet/intel/i40e/i40e_txrx.c | 9 ------- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index ae8f9f135725..d3b00d8ed39a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3588,40 +3588,48 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) struct i40e_hmc_obj_rxq rx_ctx; int err = 0; bool ok; - int ret; bitmap_zero(ring->state, __I40E_RING_STATE_NBITS); /* clear the context structure first */ memset(&rx_ctx, 0, sizeof(rx_ctx)); - if (ring->vsi->type == I40E_VSI_MAIN) - xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); + ring->rx_buf_len = vsi->rx_buf_len; + + /* XDP RX-queue info only needed for RX rings exposed to XDP */ + if (ring->vsi->type != I40E_VSI_MAIN) + goto skip; + + if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { + err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, + ring->queue_index, + ring->q_vector->napi.napi_id, + ring->rx_buf_len); + if (err) + return err; + } ring->xsk_pool = i40e_xsk_pool(ring); if (ring->xsk_pool) { - ring->rx_buf_len = - xsk_pool_get_rx_frame_size(ring->xsk_pool); - ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool); + err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, MEM_TYPE_XSK_BUFF_POOL, NULL); - if (ret) - return ret; + if (err) + return err; dev_info(&vsi->back->pdev->dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", ring->queue_index); } else { - ring->rx_buf_len = vsi->rx_buf_len; - if (ring->vsi->type == I40E_VSI_MAIN) { - ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, - MEM_TYPE_PAGE_SHARED, - NULL); - if (ret) - return ret; - } + err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_PAGE_SHARED, + NULL); + if (err) + return err; } +skip: xdp_init_buff(&ring->xdp, i40e_rx_pg_size(ring) / 2, &ring->xdp_rxq); rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len, diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 1f0a0f13a334..0d7177083708 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1548,7 +1548,6 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring) int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) { struct device *dev = rx_ring->dev; - int err; u64_stats_init(&rx_ring->syncp); @@ -1569,14 +1568,6 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) rx_ring->next_to_process = 0; rx_ring->next_to_use = 0; - /* XDP RX-queue info only needed for RX rings exposed to XDP */ - if (rx_ring->vsi->type == I40E_VSI_MAIN) { - err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, - rx_ring->queue_index, rx_ring->q_vector->napi.napi_id); - if (err < 0) - return err; - } - rx_ring->xdp_prog = rx_ring->vsi->xdp_prog; rx_ring->rx_bi = -- cgit From 0cbb08707c932b3f004bc1a8ec6200ef572c1f5f Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 24 Jan 2024 20:16:02 +0100 Subject: i40e: update xdp_rxq_info::frag_size for ZC enabled Rx queue Now that i40e driver correctly sets up frag_size in xdp_rxq_info, let us make it work for ZC multi-buffer as well. i40e_ring::rx_buf_len for ZC is being set via xsk_pool_get_rx_frame_size() and this needs to be propagated up to xdp_rxq_info. Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support") Acked-by: Magnus Karlsson Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20240124191602.566724-12-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/i40e/i40e_main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index d3b00d8ed39a..6e7fd473abfd 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3611,7 +3611,14 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) ring->xsk_pool = i40e_xsk_pool(ring); if (ring->xsk_pool) { + xdp_rxq_info_unreg(&ring->xdp_rxq); ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool); + err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, + ring->queue_index, + ring->q_vector->napi.napi_id, + ring->rx_buf_len); + if (err) + return err; err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, MEM_TYPE_XSK_BUFF_POOL, NULL); -- cgit From f6cc4b6a3ae53df425771000e9c9540cce9b7bb1 Mon Sep 17 00:00:00 2001 From: Zhipeng Lu Date: Tue, 23 Jan 2024 01:24:42 +0800 Subject: fjes: fix memleaks in fjes_hw_setup In fjes_hw_setup, it allocates several memory and delay the deallocation to the fjes_hw_exit in fjes_probe through the following call chain: fjes_probe |-> fjes_hw_init |-> fjes_hw_setup |-> fjes_hw_exit However, when fjes_hw_setup fails, fjes_hw_exit won't be called and thus all the resources allocated in fjes_hw_setup will be leaked. In this patch, we free those resources in fjes_hw_setup and prevents such leaks. Fixes: 2fcbca687702 ("fjes: platform_driver's .probe and .remove routine") Signed-off-by: Zhipeng Lu Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240122172445.3841883-1-alexious@zju.edu.cn Signed-off-by: Jakub Kicinski --- drivers/net/fjes/fjes_hw.c | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/drivers/net/fjes/fjes_hw.c b/drivers/net/fjes/fjes_hw.c index 704e949484d0..b9b5554ea862 100644 --- a/drivers/net/fjes/fjes_hw.c +++ b/drivers/net/fjes/fjes_hw.c @@ -221,21 +221,25 @@ static int fjes_hw_setup(struct fjes_hw *hw) mem_size = FJES_DEV_REQ_BUF_SIZE(hw->max_epid); hw->hw_info.req_buf = kzalloc(mem_size, GFP_KERNEL); - if (!(hw->hw_info.req_buf)) - return -ENOMEM; + if (!(hw->hw_info.req_buf)) { + result = -ENOMEM; + goto free_ep_info; + } hw->hw_info.req_buf_size = mem_size; mem_size = FJES_DEV_RES_BUF_SIZE(hw->max_epid); hw->hw_info.res_buf = kzalloc(mem_size, GFP_KERNEL); - if (!(hw->hw_info.res_buf)) - return -ENOMEM; + if (!(hw->hw_info.res_buf)) { + result = -ENOMEM; + goto free_req_buf; + } hw->hw_info.res_buf_size = mem_size; result = fjes_hw_alloc_shared_status_region(hw); if (result) - return result; + goto free_res_buf; hw->hw_info.buffer_share_bit = 0; hw->hw_info.buffer_unshare_reserve_bit = 0; @@ -246,11 +250,11 @@ static int fjes_hw_setup(struct fjes_hw *hw) result = fjes_hw_alloc_epbuf(&buf_pair->tx); if (result) - return result; + goto free_epbuf; result = fjes_hw_alloc_epbuf(&buf_pair->rx); if (result) - return result; + goto free_epbuf; spin_lock_irqsave(&hw->rx_status_lock, flags); fjes_hw_setup_epbuf(&buf_pair->tx, mac, @@ -273,6 +277,25 @@ static int fjes_hw_setup(struct fjes_hw *hw) fjes_hw_init_command_registers(hw, ¶m); return 0; + +free_epbuf: + for (epidx = 0; epidx < hw->max_epid ; epidx++) { + if (epidx == hw->my_epid) + continue; + fjes_hw_free_epbuf(&hw->ep_shm_info[epidx].tx); + fjes_hw_free_epbuf(&hw->ep_shm_info[epidx].rx); + } + fjes_hw_free_shared_status_region(hw); +free_res_buf: + kfree(hw->hw_info.res_buf); + hw->hw_info.res_buf = NULL; +free_req_buf: + kfree(hw->hw_info.req_buf); + hw->hw_info.req_buf = NULL; +free_ep_info: + kfree(hw->ep_shm_info); + hw->ep_shm_info = NULL; + return result; } static void fjes_hw_cleanup(struct fjes_hw *hw) -- cgit From a2933a8759a62269754e54733d993b19de870e84 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 23 Jan 2024 15:59:17 +0800 Subject: selftests: bonding: do not test arp/ns target with mode balance-alb/tlb The prio_arp/ns tests hard code the mode to active-backup. At the same time, The balance-alb/tlb modes do not support arp/ns target. So remove the prio_arp/ns tests from the loop and only test active-backup mode. Fixes: 481b56e0391e ("selftests: bonding: re-format bond option tests") Reported-by: Jay Vosburgh Closes: https://lore.kernel.org/netdev/17415.1705965957@famine/ Signed-off-by: Hangbin Liu Acked-by: Jay Vosburgh Link: https://lore.kernel.org/r/20240123075917.1576360-1-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/bonding/bond_options.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh index c54d1697f439..d508486cc0bd 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh @@ -162,7 +162,7 @@ prio_arp() local mode=$1 for primary_reselect in 0 1 2; do - prio_test "mode active-backup arp_interval 100 arp_ip_target ${g_ip4} primary eth1 primary_reselect $primary_reselect" + prio_test "mode $mode arp_interval 100 arp_ip_target ${g_ip4} primary eth1 primary_reselect $primary_reselect" log_test "prio" "$mode arp_ip_target primary_reselect $primary_reselect" done } @@ -178,7 +178,7 @@ prio_ns() fi for primary_reselect in 0 1 2; do - prio_test "mode active-backup arp_interval 100 ns_ip6_target ${g_ip6} primary eth1 primary_reselect $primary_reselect" + prio_test "mode $mode arp_interval 100 ns_ip6_target ${g_ip6} primary eth1 primary_reselect $primary_reselect" log_test "prio" "$mode ns_ip6_target primary_reselect $primary_reselect" done } @@ -194,9 +194,9 @@ prio() for mode in $modes; do prio_miimon $mode - prio_arp $mode - prio_ns $mode done + prio_arp "active-backup" + prio_ns "active-backup" } arp_validate_test() -- cgit From 5e344807735023cd3a67c37a1852b849caa42620 Mon Sep 17 00:00:00 2001 From: Shenwei Wang Date: Tue, 23 Jan 2024 10:51:41 -0600 Subject: net: fec: fix the unhandled context fault from smmu When repeatedly changing the interface link speed using the command below: ethtool -s eth0 speed 100 duplex full ethtool -s eth0 speed 1000 duplex full The following errors may sometimes be reported by the ARM SMMU driver: [ 5395.035364] fec 5b040000.ethernet eth0: Link is Down [ 5395.039255] arm-smmu 51400000.iommu: Unhandled context fault: fsr=0x402, iova=0x00000000, fsynr=0x100001, cbfrsynra=0x852, cb=2 [ 5398.108460] fec 5b040000.ethernet eth0: Link is Up - 100Mbps/Full - flow control off It is identified that the FEC driver does not properly stop the TX queue during the link speed transitions, and this results in the invalid virtual I/O address translations from the SMMU and causes the context faults. Fixes: dbc64a8ea231 ("net: fec: move calls to quiesce/resume packet processing out of fec_restart()") Signed-off-by: Shenwei Wang Link: https://lore.kernel.org/r/20240123165141.2008104-1-shenwei.wang@nxp.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/freescale/fec_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 4b0259e9269a..432523b2c789 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -2036,6 +2036,7 @@ static void fec_enet_adjust_link(struct net_device *ndev) /* if any of the above changed restart the FEC */ if (status_change) { + netif_stop_queue(ndev); napi_disable(&fep->napi); netif_tx_lock_bh(ndev); fec_restart(ndev); @@ -2045,6 +2046,7 @@ static void fec_enet_adjust_link(struct net_device *ndev) } } else { if (fep->link) { + netif_stop_queue(ndev); napi_disable(&fep->napi); netif_tx_lock_bh(ndev); fec_stop(ndev); -- cgit From 50bad6f797d4d501c5ef416a6f92e1912ab5aa8b Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Tue, 23 Jan 2024 21:09:17 +0100 Subject: tsnep: Remove FCS for XDP data path The RX data buffer includes the FCS. The FCS is already stripped for the normal data path. But for the XDP data path the FCS is included and acts like additional/useless data. Remove the FCS from the RX data buffer also for XDP. Fixes: 65b28c810035 ("tsnep: Add XDP RX support") Fixes: 3fc2333933fd ("tsnep: Add XDP socket zero-copy RX support") Signed-off-by: Gerhard Engleder Signed-off-by: Paolo Abeni --- drivers/net/ethernet/engleder/tsnep_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c index df40c720e7b2..456e0336f3f6 100644 --- a/drivers/net/ethernet/engleder/tsnep_main.c +++ b/drivers/net/ethernet/engleder/tsnep_main.c @@ -1485,7 +1485,7 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi, xdp_prepare_buff(&xdp, page_address(entry->page), XDP_PACKET_HEADROOM + TSNEP_RX_INLINE_METADATA_SIZE, - length, false); + length - ETH_FCS_LEN, false); consume = tsnep_xdp_run_prog(rx, prog, &xdp, &xdp_status, tx_nq, tx); @@ -1568,7 +1568,7 @@ static int tsnep_rx_poll_zc(struct tsnep_rx *rx, struct napi_struct *napi, prefetch(entry->xdp->data); length = __le32_to_cpu(entry->desc_wb->properties) & TSNEP_DESC_LENGTH_MASK; - xsk_buff_set_size(entry->xdp, length); + xsk_buff_set_size(entry->xdp, length - ETH_FCS_LEN); xsk_buff_dma_sync_for_cpu(entry->xdp, rx->xsk_pool); /* RX metadata with timestamps is in front of actual data, -- cgit From 9a91c05f4bd6f6bdd6b8f90445e0da92e3ac956c Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Tue, 23 Jan 2024 21:09:18 +0100 Subject: tsnep: Fix XDP_RING_NEED_WAKEUP for empty fill ring The fill ring of the XDP socket may contain not enough buffers to completey fill the RX queue during socket creation. In this case the flag XDP_RING_NEED_WAKEUP is not set as this flag is only set if the RX queue is not completely filled during polling. Set XDP_RING_NEED_WAKEUP flag also if RX queue is not completely filled during XDP socket creation. Fixes: 3fc2333933fd ("tsnep: Add XDP socket zero-copy RX support") Signed-off-by: Gerhard Engleder Signed-off-by: Paolo Abeni --- drivers/net/ethernet/engleder/tsnep_main.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c index 456e0336f3f6..9aeff2b37a61 100644 --- a/drivers/net/ethernet/engleder/tsnep_main.c +++ b/drivers/net/ethernet/engleder/tsnep_main.c @@ -1762,6 +1762,19 @@ static void tsnep_rx_reopen_xsk(struct tsnep_rx *rx) allocated--; } } + + /* set need wakeup flag immediately if ring is not filled completely, + * first polling would be too late as need wakeup signalisation would + * be delayed for an indefinite time + */ + if (xsk_uses_need_wakeup(rx->xsk_pool)) { + int desc_available = tsnep_rx_desc_available(rx); + + if (desc_available) + xsk_set_rx_need_wakeup(rx->xsk_pool); + else + xsk_clear_rx_need_wakeup(rx->xsk_pool); + } } static bool tsnep_pending(struct tsnep_queue *queue) -- cgit