From 3a5913183aa1b14148c723bda030e6102ad73008 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sun, 9 Oct 2022 22:16:43 +0300 Subject: xfrm: fix "disable_policy" on ipv4 early demux The commit in the "Fixes" tag tried to avoid a case where policy check is ignored due to dst caching in next hops. However, when the traffic is locally consumed, the dst may be cached in a local TCP or UDP socket as part of early demux. In this case the "disable_policy" flag is not checked as ip_route_input_noref() was only called before caching, and thus, packets after the initial packet in a flow will be dropped if not matching policies. Fix by checking the "disable_policy" flag also when a valid dst is already available. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216557 Reported-by: Monil Patel Fixes: e6175a2ed1f1 ("xfrm: fix "disable_policy" flag use when arriving from different devices") Signed-off-by: Eyal Birger ---- v2: use dev instead of skb->dev Signed-off-by: Steffen Klassert --- net/ipv4/ip_input.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 1b512390b3cf..e880ce77322a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -366,6 +366,11 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, iph->tos, dev); if (unlikely(err)) goto drop_error; + } else { + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY)) + IPCB(skb)->flags |= IPSKB_NOPOLICY; } #ifdef CONFIG_IP_ROUTE_CLASSID -- cgit From d83f7040e18489265b4b121f33f99b02e52dabda Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Tue, 11 Oct 2022 11:01:37 +0300 Subject: xfrm: lwtunnel: squelch kernel warning in case XFRM encap type is not available Ido reported that a kernel warning [1] can be triggered from user space when the kernel is compiled with CONFIG_MODULES=y and CONFIG_XFRM=n when adding an xfrm encap type route, e.g: $ ip route add 198.51.100.0/24 dev dummy1 encap xfrm if_id 1 Error: 
lwt encapsulation type not supported. The reason for the warning is that the LWT infrastructure has an autoloading feature which is meant only for encap types that don't use a net device, which is not the case in xfrm encap. Mute this warning for xfrm encap as there's no encap module to autoload in this case. [1] WARNING: CPU: 3 PID: 2746262 at net/core/lwtunnel.c:57 lwtunnel_valid_encap_type+0x4f/0x120 [...] Call Trace: rtm_to_fib_config+0x211/0x350 inet_rtm_newroute+0x3a/0xa0 rtnetlink_rcv_msg+0x154/0x3c0 netlink_rcv_skb+0x49/0xf0 netlink_unicast+0x22f/0x350 netlink_sendmsg+0x208/0x440 ____sys_sendmsg+0x21f/0x250 ___sys_sendmsg+0x83/0xd0 __sys_sendmsg+0x54/0xa0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Reported-by: Ido Schimmel Fixes: 2c2493b9da91 ("xfrm: lwtunnel: add lwtunnel support for xfrm interfaces in collect_md mode") Signed-off-by: Eyal Birger Tested-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Steffen Klassert --- net/core/lwtunnel.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 6fac2f0ef074..711cd3b4347a 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -48,9 +48,11 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "RPL"; case LWTUNNEL_ENCAP_IOAM6: return "IOAM6"; + case LWTUNNEL_ENCAP_XFRM: + /* module autoload not supported for encap type */ + return NULL; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: - case LWTUNNEL_ENCAP_XFRM: case LWTUNNEL_ENCAP_NONE: case __LWTUNNEL_ENCAP_MAX: /* should not have got here */ -- cgit From 4b549ccce941798703f159b227aa28c716aa78fa Mon Sep 17 00:00:00 2001 From: Christian Langrock Date: Mon, 17 Oct 2022 08:34:47 +0200 Subject: xfrm: replay: Fix ESN wrap around for GSO When using GSO it can happen that the wrong seq_hi is used for the last packets before the wrap around. This can lead to double usage of a sequence number. 
To avoid this, we should serialize this last GSO packet. Fixes: d7dbefc45cf5 ("xfrm: Add xfrm_replay_overflow functions for offloading") Co-developed-by: Steffen Klassert Signed-off-by: Christian Langrock Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 3 +++ net/ipv6/esp6_offload.c | 3 +++ net/xfrm/xfrm_device.c | 15 ++++++++++++++- net/xfrm/xfrm_replay.c | 2 +- 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 170152772d33..3969fa805679 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -314,6 +314,9 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ xo->seq.low += skb_shinfo(skb)->gso_segs; } + if (xo->seq.low < seq) + xo->seq.hi++; + esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32)); ip_hdr(skb)->tot_len = htons(skb->len); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 79d43548279c..242f4295940e 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -346,6 +346,9 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features xo->seq.low += skb_shinfo(skb)->gso_segs; } + if (xo->seq.low < seq) + xo->seq.hi++; + esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); len = skb->len - sizeof(struct ipv6hdr); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 5f5aafd418af..21269e8f2db4 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -97,6 +97,18 @@ static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb) } } +static inline bool xmit_xfrm_check_overflow(struct sk_buff *skb) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + __u32 seq = xo->seq.low; + + seq += skb_shinfo(skb)->gso_segs; + if (unlikely(seq < xo->seq.low)) + return true; + + return false; +} + struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) { int err; @@ -134,7 +146,8 @@ struct sk_buff 
*validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur return skb; } - if (skb_is_gso(skb) && unlikely(x->xso.dev != dev)) { + if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) || + unlikely(xmit_xfrm_check_overflow(skb)))) { struct sk_buff *segs; /* Packet got rerouted, fixup features and segment it. */ diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 9f4d42eb090f..ce56d659c55a 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -714,7 +714,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff oseq += skb_shinfo(skb)->gso_segs; } - if (unlikely(oseq < replay_esn->oseq)) { + if (unlikely(xo->seq.low < replay_esn->oseq)) { XFRM_SKB_CB(skb)->seq.output.hi = ++oseq_hi; xo->seq.hi = oseq_hi; replay_esn->oseq_hi = oseq_hi; -- cgit From 7f57f8165cb6d2c206e2b9ada53b9e2d6d8af42f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 25 Oct 2022 14:06:48 +0800 Subject: af_key: Fix send_acquire race with pfkey_register The function pfkey_send_acquire may race with pfkey_register (which could even be in a different name space). This may result in a buffer overrun. Allocating the maximum amount of memory that could be used prevents this. 
Reported-by: syzbot+1e9af9185d8850e2c2fa@syzkaller.appspotmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Herbert Xu Reviewed-by: Sabrina Dubroca Reviewed-by: Eric Dumazet Signed-off-by: Steffen Klassert --- net/key/af_key.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index c85df5b958d2..213287814328 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2905,7 +2905,7 @@ static int count_ah_combs(const struct xfrm_tmpl *t) break; if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg) && aalg->available) + if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } return sz + sizeof(struct sadb_prop); @@ -2923,7 +2923,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!ealg->pfkey_supported) continue; - if (!(ealg_tmpl_set(t, ealg) && ealg->available)) + if (!(ealg_tmpl_set(t, ealg))) continue; for (k = 1; ; k++) { @@ -2934,16 +2934,17 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg) && aalg->available) + if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } } return sz + sizeof(struct sadb_prop); } -static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) +static int dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; + int sz = 0; int i; p = skb_put(skb, sizeof(struct sadb_prop)); @@ -2971,13 +2972,17 @@ static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; + sz += sizeof(*c); } } + + return sz + sizeof(*p); } -static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) +static int dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; + int sz = 0; int i, k; p = skb_put(skb, sizeof(struct sadb_prop)); @@ -3019,8 +3024,11 
@@ static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; + sz += sizeof(*c); } } + + return sz + sizeof(*p); } static int key_notify_policy_expire(struct xfrm_policy *xp, const struct km_event *c) @@ -3150,6 +3158,7 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int ctx_size = 0; + int alg_size = 0; sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) @@ -3161,16 +3170,16 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct sizeof(struct sadb_x_policy); if (x->id.proto == IPPROTO_AH) - size += count_ah_combs(t); + alg_size = count_ah_combs(t); else if (x->id.proto == IPPROTO_ESP) - size += count_esp_combs(t); + alg_size = count_esp_combs(t); if ((xfrm_ctx = x->security)) { ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); size += sizeof(struct sadb_x_sec_ctx) + ctx_size; } - skb = alloc_skb(size + 16, GFP_ATOMIC); + skb = alloc_skb(size + alg_size + 16, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; @@ -3224,10 +3233,13 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct pol->sadb_x_policy_priority = xp->priority; /* Set sadb_comb's. */ + alg_size = 0; if (x->id.proto == IPPROTO_AH) - dump_ah_combs(skb, t); + alg_size = dump_ah_combs(skb, t); else if (x->id.proto == IPPROTO_ESP) - dump_esp_combs(skb, t); + alg_size = dump_esp_combs(skb, t); + + hdr->sadb_msg_len += alg_size / 8; /* security context */ if (xfrm_ctx) { -- cgit From 22b29557aef3c9d673c887911b504c6d47009de4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 15 Nov 2022 14:10:44 -0800 Subject: selftests: mptcp: gives slow test-case more time On slow or busy VM, some test-cases still fail because the data transfer completes before the endpoint manipulation actually took effect. 
Address the issue by artificially increasing the runtime for the relevant test-cases. Fixes: ef360019db40 ("selftests: mptcp: signal addresses testcases") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/309 Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index f3dd5f2a0272..2eeaf4aca644 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -2152,7 +2152,7 @@ remove_tests() pm_nl_set_limits $ns2 1 3 pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow - run_tests $ns1 $ns2 10.0.1.1 0 -1 -2 slow + run_tests $ns1 $ns2 10.0.1.1 0 -1 -2 speed_10 chk_join_nr 3 3 3 chk_add_nr 1 1 chk_rm_nr 2 2 @@ -2165,7 +2165,7 @@ remove_tests() pm_nl_add_endpoint $ns1 10.0.3.1 flags signal pm_nl_add_endpoint $ns1 10.0.4.1 flags signal pm_nl_set_limits $ns2 3 3 - run_tests $ns1 $ns2 10.0.1.1 0 -3 0 slow + run_tests $ns1 $ns2 10.0.1.1 0 -3 0 speed_10 chk_join_nr 3 3 3 chk_add_nr 3 3 chk_rm_nr 3 3 invert @@ -2178,7 +2178,7 @@ remove_tests() pm_nl_add_endpoint $ns1 10.0.3.1 flags signal pm_nl_add_endpoint $ns1 10.0.14.1 flags signal pm_nl_set_limits $ns2 3 3 - run_tests $ns1 $ns2 10.0.1.1 0 -3 0 slow + run_tests $ns1 $ns2 10.0.1.1 0 -3 0 speed_10 chk_join_nr 1 1 1 chk_add_nr 3 3 chk_rm_nr 3 1 invert -- cgit From 7e68d31020f18f8d695d5f143fc16cdaa96166cb Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Tue, 15 Nov 2022 14:10:45 -0800 Subject: selftests: mptcp: run mptcp_sockopt from a new netns Not running it from a new netns causes issues if some MPTCP settings are modified, e.g. 
if MPTCP is disabled from the sysctl knob, if multiple addresses are available and added to the MPTCP path-manager, etc. In these cases, the created connection will not behave as expected, e.g. unable to create an MPTCP socket, more than one subflow is seen, etc. A new "sandbox" net namespace is now created and used to run mptcp_sockopt from this controlled environment. Fixes: ce9979129a0b ("selftests: mptcp: add mptcp getsockopt test cases") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_sockopt.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index 0879da915014..80d36f7cfee8 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -35,8 +35,9 @@ init() ns1="ns1-$rndh" ns2="ns2-$rndh" + ns_sbox="ns_sbox-$rndh" - for netns in "$ns1" "$ns2";do + for netns in "$ns1" "$ns2" "$ns_sbox";do ip netns add $netns || exit $ksft_skip ip -net $netns link set lo up ip netns exec $netns sysctl -q net.mptcp.enabled=1 @@ -73,7 +74,7 @@ init() cleanup() { - for netns in "$ns1" "$ns2"; do + for netns in "$ns1" "$ns2" "$ns_sbox"; do ip netns del $netns done rm -f "$cin" "$cout" @@ -243,7 +244,7 @@ do_mptcp_sockopt_tests() { local lret=0 - ./mptcp_sockopt + ip netns exec "$ns_sbox" ./mptcp_sockopt lret=$? if [ $lret -ne 0 ]; then @@ -252,7 +253,7 @@ do_mptcp_sockopt_tests() return fi - ./mptcp_sockopt -6 + ip netns exec "$ns_sbox" ./mptcp_sockopt -6 lret=$? 
if [ $lret -ne 0 ]; then -- cgit From 3de88b95c4d436d78afc0266a0bed76c35ddeb62 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Tue, 15 Nov 2022 14:10:46 -0800 Subject: selftests: mptcp: fix mibit vs mbit mix up The estimated time was supposing the rate was expressed in mibit (bit * 1024^2) but it is in mbit (bit * 1000^2). This makes the threshold higher but in a more realistic way to avoid false positives reported by CI instances. Before this patch, the thresholds were at 7561/4005ms and now they are at 7906/4178ms. While at it, also fix a typo in the linked comment, spotted by Mat. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/310 Fixes: 1a418cb8e888 ("mptcp: simult flow self-tests") Suggested-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/simult_flows.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index ffa13a957a36..40aeb5a71a2a 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -247,9 +247,10 @@ run_test() tc -n $ns2 qdisc add dev ns2eth1 root netem rate ${rate1}mbit $delay1 tc -n $ns2 qdisc add dev ns2eth2 root netem rate ${rate2}mbit $delay2 - # time is measured in ms, account for transfer size, affegated link speed + # time is measured in ms, account for transfer size, aggregated link speed # and header overhead (10%) - local time=$((size * 8 * 1000 * 10 / (( $rate1 + $rate2) * 1024 *1024 * 9) )) + # ms byte -> bit 10% mbit -> kbit -> bit 10% + local time=$((1000 * size * 8 * 10 / ((rate1 + rate2) * 1000 * 1000 * 9) )) # mptcp_connect will do some sleeps to allow the mp_join handshake # completion (see mptcp_connect): 200ms on each side, add some slack -- cgit From 733d4bbf9514890eb53ebe75827bf1fb4fd25ebe Mon Sep 17 
00:00:00 2001 From: Leon Romanovsky Date: Tue, 15 Nov 2022 19:34:39 +0200 Subject: net: liquidio: simplify if expression Fix the warning reported by kbuild: cocci warnings: (new ones prefixed by >>) >> drivers/net/ethernet/cavium/liquidio/lio_main.c:1797:54-56: WARNING !A || A && B is equivalent to !A || B drivers/net/ethernet/cavium/liquidio/lio_main.c:1827:54-56: WARNING !A || A && B is equivalent to !A || B Fixes: 8979f428a4af ("net: liquidio: release resources when liquidio driver open failed") Reported-by: kernel test robot Signed-off-by: Leon Romanovsky Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/liquidio/lio_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index 75771825c3f9..98793b2ac2c7 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -1794,7 +1794,7 @@ static int liquidio_open(struct net_device *netdev) ifstate_set(lio, LIO_IFSTATE_RUNNING); - if (!OCTEON_CN23XX_PF(oct) || (OCTEON_CN23XX_PF(oct) && !oct->msix_on)) { + if (!OCTEON_CN23XX_PF(oct) || !oct->msix_on) { ret = setup_tx_poll_fn(netdev); if (ret) goto err_poll; @@ -1824,7 +1824,7 @@ static int liquidio_open(struct net_device *netdev) return 0; err_rx_ctrl: - if (!OCTEON_CN23XX_PF(oct) || (OCTEON_CN23XX_PF(oct) && !oct->msix_on)) + if (!OCTEON_CN23XX_PF(oct) || !oct->msix_on) cleanup_tx_poll_fn(netdev); err_poll: if (lio->ptp_clock) { -- cgit From 8207f253a097fe15c93d85ac15ebb73c5e39e1e1 Mon Sep 17 00:00:00 2001 From: Thomas Zeitlhofer Date: Tue, 15 Nov 2022 23:09:41 +0100 Subject: net: neigh: decrement the family specific qlen Commit 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit per-device") introduced the length counter qlen in struct neigh_parms. 
There are separate neigh_parms instances for IPv4/ARP and IPv6/ND, and while the family specific qlen is incremented in pneigh_enqueue(), the mentioned commit always decrements the IPv4/ARP specific qlen, regardless of the currently processed family, in pneigh_queue_purge() and neigh_proxy_process(). As a result, with IPv6/ND, the family specific qlen is only incremented (and never decremented) until it exceeds PROXY_QLEN, and then, according to the check in pneigh_enqueue(), neighbor solicitations are not answered anymore. As an example, this is noted when using the subnet-router anycast address to access a Linux router. After a certain amount of time (in the observed case, qlen exceeded PROXY_QLEN after two days), the Linux router stops answering neighbor solicitations for its subnet-router anycast address and effectively becomes unreachable. Another result with IPv6/ND is that the IPv4/ARP specific qlen is decremented more often than incremented. This leads to negative qlen values, as a signed integer has been used for the length counter qlen, and potentially to an integer overflow. Fix this by introducing the helper function neigh_parms_qlen_dec(), which decrements the family specific qlen. Thereby, make use of the existing helper function neigh_get_dev_parms_rcu(), whose definition therefore needs to be placed earlier in neighbour.c. Take the family member from struct neigh_table to determine the currently processed family and appropriately call neigh_parms_qlen_dec() from pneigh_queue_purge() and neigh_proxy_process(). Additionally, use an unsigned integer for the length counter qlen. Fixes: 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit per-device") Signed-off-by: Thomas Zeitlhofer Signed-off-by: David S. 
Miller --- include/net/neighbour.h | 2 +- net/core/neighbour.c | 58 +++++++++++++++++++++++++------------------------ 2 files changed, 31 insertions(+), 29 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 20745cf7ae1a..2f2a6023fb0e 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -83,7 +83,7 @@ struct neigh_parms { struct rcu_head rcu_head; int reachable_time; - int qlen; + u32 qlen; int data[NEIGH_VAR_DATA_MAX]; DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX); }; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a77a85e357e0..952a54763358 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -307,7 +307,31 @@ static int neigh_del_timer(struct neighbour *n) return 0; } -static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) +static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, + int family) +{ + switch (family) { + case AF_INET: + return __in_dev_arp_parms_get_rcu(dev); + case AF_INET6: + return __in6_dev_nd_parms_get_rcu(dev); + } + return NULL; +} + +static void neigh_parms_qlen_dec(struct net_device *dev, int family) +{ + struct neigh_parms *p; + + rcu_read_lock(); + p = neigh_get_dev_parms_rcu(dev, family); + if (p) + p->qlen--; + rcu_read_unlock(); +} + +static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, + int family) { struct sk_buff_head tmp; unsigned long flags; @@ -321,13 +345,7 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) struct net_device *dev = skb->dev; if (net == NULL || net_eq(dev_net(dev), net)) { - struct in_device *in_dev; - - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); - if (in_dev) - in_dev->arp_parms->qlen--; - rcu_read_unlock(); + neigh_parms_qlen_dec(dev, family); __skb_unlink(skb, list); __skb_queue_tail(&tmp, skb); } @@ -409,7 +427,8 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, 
skip_perm); pneigh_ifdown_and_unlock(tbl, dev); - pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL); + pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, + tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) del_timer_sync(&tbl->proxy_timer); return 0; @@ -1621,13 +1640,8 @@ static void neigh_proxy_process(struct timer_list *t) if (tdif <= 0) { struct net_device *dev = skb->dev; - struct in_device *in_dev; - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); - if (in_dev) - in_dev->arp_parms->qlen--; - rcu_read_unlock(); + neigh_parms_qlen_dec(dev, tbl->family); __skb_unlink(skb, &tbl->proxy_queue); if (tbl->proxy_redo && netif_running(dev)) { @@ -1821,7 +1835,7 @@ int neigh_table_clear(int index, struct neigh_table *tbl) cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); - pneigh_queue_purge(&tbl->proxy_queue, NULL); + pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); @@ -3539,18 +3553,6 @@ static int proc_unres_qlen(struct ctl_table *ctl, int write, return ret; } -static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, - int family) -{ - switch (family) { - case AF_INET: - return __in_dev_arp_parms_get_rcu(dev); - case AF_INET6: - return __in6_dev_nd_parms_get_rcu(dev); - } - return NULL; -} - static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, int index) { -- cgit From 40b9d1ab63f5c4f3cb69450044d07b45e5af72e1 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Tue, 15 Nov 2022 17:19:14 -0800 Subject: ipvlan: hold lower dev to avoid possible use-after-free Recently syzkaller discovered the issue of disappearing lower device (NETDEV_UNREGISTER) while the virtual device (like macvlan) is still having it as a lower device. So it's just a matter of time similar discovery will be made for IPvlan device setup. 
So fixing it preemptively. Also while at it, add a refcount tracker. Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.") Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller --- drivers/net/ipvlan/ipvlan.h | 1 + drivers/net/ipvlan/ipvlan_main.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index de94921cbef9..025e0c19ec25 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -98,6 +98,7 @@ struct ipvl_port { struct sk_buff_head backlog; int count; struct ida ida; + netdevice_tracker dev_tracker; }; struct ipvl_skb_cb { diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 54c94a69c2bb..796a38f9d7b2 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -83,6 +83,7 @@ static int ipvlan_port_create(struct net_device *dev) if (err) goto err; + netdev_hold(dev, &port->dev_tracker, GFP_KERNEL); return 0; err: @@ -95,6 +96,7 @@ static void ipvlan_port_destroy(struct net_device *dev) struct ipvl_port *port = ipvlan_port_get_rtnl(dev); struct sk_buff *skb; + netdev_put(dev, &port->dev_tracker); if (port->mode == IPVLAN_MODE_L3S) ipvlan_l3s_unregister(port); netdev_rx_handler_unregister(dev); -- cgit From 302e57f809be8b678d1ab0b2634504d5d51a166d Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Thu, 17 Nov 2022 10:45:03 +0800 Subject: selftests/net: fix missing xdp_dummy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit afef88e65554 ("selftests/bpf: Store BPF object files with .bpf.o extension"), we should use xdp_dummy.bpf.o instead of xdp_dummy.o. In addition, use the BPF_FILE variable to save the BPF object file name, which can be better identified and modified. Fixes: afef88e65554 ("selftests/bpf: Store BPF object files with .bpf.o extension") Signed-off-by: Wang Yufen Cc: Daniel Müller Signed-off-by: David S. 
Miller --- tools/testing/selftests/net/udpgro.sh | 8 +++++--- tools/testing/selftests/net/udpgro_bench.sh | 8 +++++--- tools/testing/selftests/net/udpgro_frglist.sh | 8 +++++--- tools/testing/selftests/net/udpgro_fwd.sh | 3 ++- tools/testing/selftests/net/veth.sh | 11 ++++++----- 5 files changed, 23 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh index 6a443ca3cd3a..0c743752669a 100755 --- a/tools/testing/selftests/net/udpgro.sh +++ b/tools/testing/selftests/net/udpgro.sh @@ -5,6 +5,8 @@ readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" +BPF_FILE="../bpf/xdp_dummy.bpf.o" + # set global exit status, but never reset nonzero one. check_err() { @@ -34,7 +36,7 @@ cfg_veth() { ip -netns "${PEER_NS}" addr add dev veth1 192.168.1.1/24 ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad ip -netns "${PEER_NS}" link set dev veth1 up - ip -n "${PEER_NS}" link set veth1 xdp object ../bpf/xdp_dummy.o section xdp + ip -n "${PEER_NS}" link set veth1 xdp object ${BPF_FILE} section xdp } run_one() { @@ -195,8 +197,8 @@ run_all() { return $ret } -if [ ! -f ../bpf/xdp_dummy.o ]; then - echo "Missing xdp_dummy helper. Build bpf selftest first" +if [ ! -f ${BPF_FILE} ]; then + echo "Missing ${BPF_FILE}. 
Build bpf selftest first" exit -1 fi diff --git a/tools/testing/selftests/net/udpgro_bench.sh b/tools/testing/selftests/net/udpgro_bench.sh index 8a1109a545db..894972877e8b 100755 --- a/tools/testing/selftests/net/udpgro_bench.sh +++ b/tools/testing/selftests/net/udpgro_bench.sh @@ -5,6 +5,8 @@ readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" +BPF_FILE="../bpf/xdp_dummy.bpf.o" + cleanup() { local -r jobs="$(jobs -p)" local -r ns="$(ip netns list|grep $PEER_NS)" @@ -34,7 +36,7 @@ run_one() { ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad ip -netns "${PEER_NS}" link set dev veth1 up - ip -n "${PEER_NS}" link set veth1 xdp object ../bpf/xdp_dummy.o section xdp + ip -n "${PEER_NS}" link set veth1 xdp object ${BPF_FILE} section xdp ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r & ip netns exec "${PEER_NS}" ./udpgso_bench_rx -t ${rx_args} -r & @@ -80,8 +82,8 @@ run_all() { run_udp "${ipv6_args}" } -if [ ! -f ../bpf/xdp_dummy.o ]; then - echo "Missing xdp_dummy helper. Build bpf selftest first" +if [ ! -f ${BPF_FILE} ]; then + echo "Missing ${BPF_FILE}. 
Build bpf selftest first" exit -1 fi diff --git a/tools/testing/selftests/net/udpgro_frglist.sh b/tools/testing/selftests/net/udpgro_frglist.sh index 7fe85ba51075..c9c4b9d65839 100755 --- a/tools/testing/selftests/net/udpgro_frglist.sh +++ b/tools/testing/selftests/net/udpgro_frglist.sh @@ -5,6 +5,8 @@ readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" +BPF_FILE="../bpf/xdp_dummy.bpf.o" + cleanup() { local -r jobs="$(jobs -p)" local -r ns="$(ip netns list|grep $PEER_NS)" @@ -36,7 +38,7 @@ run_one() { ip netns exec "${PEER_NS}" ethtool -K veth1 rx-gro-list on - ip -n "${PEER_NS}" link set veth1 xdp object ../bpf/xdp_dummy.o section xdp + ip -n "${PEER_NS}" link set veth1 xdp object ${BPF_FILE} section xdp tc -n "${PEER_NS}" qdisc add dev veth1 clsact tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file ../bpf/nat6to4.o section schedcls/ingress6/nat_6 direct-action tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file ../bpf/nat6to4.o section schedcls/egress4/snat4 direct-action @@ -81,8 +83,8 @@ run_all() { run_udp "${ipv6_args}" } -if [ ! -f ../bpf/xdp_dummy.o ]; then - echo "Missing xdp_dummy helper. Build bpf selftest first" +if [ ! -f ${BPF_FILE} ]; then + echo "Missing ${BPF_FILE}. 
Build bpf selftest first" exit -1 fi diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh index 1bcd82e1f662..c079565add39 100755 --- a/tools/testing/selftests/net/udpgro_fwd.sh +++ b/tools/testing/selftests/net/udpgro_fwd.sh @@ -1,6 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +BPF_FILE="../bpf/xdp_dummy.bpf.o" readonly BASE="ns-$(mktemp -u XXXXXX)" readonly SRC=2 readonly DST=1 @@ -46,7 +47,7 @@ create_ns() { ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24 ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad done - ip -n $NS_DST link set veth$DST xdp object ../bpf/xdp_dummy.o section xdp 2>/dev/null + ip -n $NS_DST link set veth$DST xdp object ${BPF_FILE} section xdp 2>/dev/null } create_vxlan_endpoint() { diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh index 430895d1a2b6..2d073595c620 100755 --- a/tools/testing/selftests/net/veth.sh +++ b/tools/testing/selftests/net/veth.sh @@ -1,6 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 +BPF_FILE="../bpf/xdp_dummy.bpf.o" readonly STATS="$(mktemp -p /tmp ns-XXXXXX)" readonly BASE=`basename $STATS` readonly SRC=2 @@ -216,8 +217,8 @@ while getopts "hs:" option; do esac done -if [ ! -f ../bpf/xdp_dummy.o ]; then - echo "Missing xdp_dummy helper. Build bpf selftest first" +if [ ! -f ${BPF_FILE} ]; then + echo "Missing ${BPF_FILE}. Build bpf selftest first" exit 1 fi @@ -288,14 +289,14 @@ if [ $CPUS -gt 1 ]; then ip netns exec $NS_DST ethtool -L veth$DST rx 1 tx 2 2>/dev/null ip netns exec $NS_SRC ethtool -L veth$SRC rx 1 tx 2 2>/dev/null printf "%-60s" "bad setting: XDP with RX nr less than TX" - ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o \ + ip -n $NS_DST link set dev veth$DST xdp object ${BPF_FILE} \ section xdp 2>/dev/null &&\ echo "fail - set operation successful ?!?" 
|| echo " ok " # the following tests will run with multiple channels active ip netns exec $NS_SRC ethtool -L veth$SRC rx 2 ip netns exec $NS_DST ethtool -L veth$DST rx 2 - ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o \ + ip -n $NS_DST link set dev veth$DST xdp object ${BPF_FILE} \ section xdp 2>/dev/null printf "%-60s" "bad setting: reducing RX nr below peer TX with XDP set" ip netns exec $NS_DST ethtool -L veth$DST rx 1 2>/dev/null &&\ @@ -311,7 +312,7 @@ if [ $CPUS -gt 2 ]; then chk_channels "setting invalid channels nr" $DST 2 2 fi -ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o section xdp 2>/dev/null +ip -n $NS_DST link set dev veth$DST xdp object ${BPF_FILE} section xdp 2>/dev/null chk_gro_flag "with xdp attached - gro flag" $DST on chk_gro_flag " - peer gro flag" $SRC off chk_tso_flag " - tso flag" $SRC off -- cgit From 3bcd6c7eaa53b56c3f584da46a1f7652e759d0e5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 16 Nov 2022 14:02:28 +0000 Subject: rxrpc: Fix race between conn bundle lookup and bundle removal [ZDI-CAN-15975] After rxrpc_unbundle_conn() has removed a connection from a bundle, it checks to see if there are any conns with available channels and, if not, removes and attempts to destroy the bundle. Whilst it does check after grabbing client_bundles_lock that there are no connections attached, this races with rxrpc_look_up_bundle() retrieving the bundle, but not attaching a connection for the connection to be attached later. There is therefore a window in which the bundle can get destroyed before we manage to attach a new connection to it. Fix this by adding an "active" counter to struct rxrpc_bundle: (1) rxrpc_connect_call() obtains an active count by prepping/looking up a bundle and ditches it before returning. (2) If, during rxrpc_connect_call(), a connection is added to the bundle, this obtains an active count, which is held until the connection is discarded. 
(3) rxrpc_deactivate_bundle() is created to drop an active count on a bundle and destroy it when the active count reaches 0. The active count is checked inside client_bundles_lock() to prevent a race with rxrpc_look_up_bundle(). (4) rxrpc_unbundle_conn() then calls rxrpc_deactivate_bundle(). Fixes: 245500d853e9 ("rxrpc: Rewrite the client connection manager") Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-15975 Signed-off-by: David Howells Tested-by: zdi-disclosures@trendmicro.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. Miller --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/conn_client.c | 38 +++++++++++++++++++++++--------------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 1ad0ec5afb50..8499ceb7719c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -399,6 +399,7 @@ enum rxrpc_conn_proto_state { struct rxrpc_bundle { struct rxrpc_conn_parameters params; refcount_t ref; + atomic_t active; /* Number of active users */ unsigned int debug_id; bool try_upgrade; /* True if the bundle is attempting upgrade */ bool alloc_conn; /* True if someone's getting a conn */ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 3c9eeb5b750c..bdb335cb2d05 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -40,6 +40,8 @@ __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ; DEFINE_IDR(rxrpc_client_conn_ids); static DEFINE_SPINLOCK(rxrpc_conn_id_lock); +static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); + /* * Get a connection ID and epoch for a client connection from the global pool. * The connection struct pointer is then recorded in the idr radix tree. 
The @@ -123,6 +125,7 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp, bundle->params = *cp; rxrpc_get_peer(bundle->params.peer); refcount_set(&bundle->ref, 1); + atomic_set(&bundle->active, 1); spin_lock_init(&bundle->channel_lock); INIT_LIST_HEAD(&bundle->waiting_calls); } @@ -149,7 +152,7 @@ void rxrpc_put_bundle(struct rxrpc_bundle *bundle) dead = __refcount_dec_and_test(&bundle->ref, &r); - _debug("PUT B=%x %d", d, r); + _debug("PUT B=%x %d", d, r - 1); if (dead) rxrpc_free_bundle(bundle); } @@ -338,6 +341,7 @@ found_bundle_free: rxrpc_free_bundle(candidate); found_bundle: rxrpc_get_bundle(bundle); + atomic_inc(&bundle->active); spin_unlock(&local->client_bundles_lock); _leave(" = %u [found]", bundle->debug_id); return bundle; @@ -435,6 +439,7 @@ static void rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, gfp_t gfp) if (old) trace_rxrpc_client(old, -1, rxrpc_client_replace); candidate->bundle_shift = shift; + atomic_inc(&bundle->active); bundle->conns[i] = candidate; for (j = 0; j < RXRPC_MAXCALLS; j++) set_bit(shift + j, &bundle->avail_chans); @@ -725,6 +730,7 @@ granted_channel: smp_rmb(); out_put_bundle: + rxrpc_deactivate_bundle(bundle); rxrpc_put_bundle(bundle); out: _leave(" = %d", ret); @@ -900,9 +906,8 @@ out: static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) { struct rxrpc_bundle *bundle = conn->bundle; - struct rxrpc_local *local = bundle->params.local; unsigned int bindex; - bool need_drop = false, need_put = false; + bool need_drop = false; int i; _enter("C=%x", conn->debug_id); @@ -921,15 +926,22 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) } spin_unlock(&bundle->channel_lock); - /* If there are no more connections, remove the bundle */ - if (!bundle->avail_chans) { - _debug("maybe unbundle"); - spin_lock(&local->client_bundles_lock); + if (need_drop) { + rxrpc_deactivate_bundle(bundle); + rxrpc_put_connection(conn); + } +} - for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) - 
if (bundle->conns[i]) - break; - if (i == ARRAY_SIZE(bundle->conns) && !bundle->params.exclusive) { +/* + * Drop the active count on a bundle. + */ +static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle) +{ + struct rxrpc_local *local = bundle->params.local; + bool need_put = false; + + if (atomic_dec_and_lock(&bundle->active, &local->client_bundles_lock)) { + if (!bundle->params.exclusive) { _debug("erase bundle"); rb_erase(&bundle->local_node, &local->client_bundles); need_put = true; @@ -939,10 +951,6 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) if (need_put) rxrpc_put_bundle(bundle); } - - if (need_drop) - rxrpc_put_connection(conn); - _leave(""); } /* -- cgit From 24deec6b9e4a051635f75777844ffc184644fec9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 16 Nov 2022 12:06:53 +0200 Subject: net: dsa: sja1105: disallow C45 transactions on the BASE-TX MDIO bus You'd think people know that the internal 100BASE-TX PHY on the SJA1110 responds only to clause 22 MDIO transactions, but they don't :) When a clause 45 transaction is attempted, sja1105_base_tx_mdio_read() and sja1105_base_tx_mdio_write() don't expect "reg" to contain bit 30 set (MII_ADDR_C45) and pack this value into the SPI transaction buffer. 
But the field in the SPI buffer has a width smaller than 30 bits, so we see this confusing message from the packing() API rather than a proper rejection of C45 transactions: Call trace: dump_stack+0x1c/0x38 sja1105_pack+0xbc/0xc0 [sja1105] sja1105_xfer+0x114/0x2b0 [sja1105] sja1105_xfer_u32+0x44/0xf4 [sja1105] sja1105_base_tx_mdio_read+0x44/0x7c [sja1105] mdiobus_read+0x44/0x80 get_phy_c45_ids+0x70/0x234 get_phy_device+0x68/0x15c fwnode_mdiobus_register_phy+0x74/0x240 of_mdiobus_register+0x13c/0x380 sja1105_mdiobus_register+0x368/0x490 [sja1105] sja1105_setup+0x94/0x119c [sja1105] Cannot store 401d2405 inside bits 24-4 (would truncate) Fixes: 5a8f09748ee7 ("net: dsa: sja1105: register the MDIO buses for 100base-T1 and 100base-TX") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_mdio.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/dsa/sja1105/sja1105_mdio.c b/drivers/net/dsa/sja1105/sja1105_mdio.c index 215dd17ca790..4059fcc8c832 100644 --- a/drivers/net/dsa/sja1105/sja1105_mdio.c +++ b/drivers/net/dsa/sja1105/sja1105_mdio.c @@ -256,6 +256,9 @@ static int sja1105_base_tx_mdio_read(struct mii_bus *bus, int phy, int reg) u32 tmp; int rc; + if (reg & MII_ADDR_C45) + return -EOPNOTSUPP; + rc = sja1105_xfer_u32(priv, SPI_READ, regs->mdio_100base_tx + reg, &tmp, NULL); if (rc < 0) @@ -272,6 +275,9 @@ static int sja1105_base_tx_mdio_write(struct mii_bus *bus, int phy, int reg, const struct sja1105_regs *regs = priv->info->regs; u32 tmp = val; + if (reg & MII_ADDR_C45) + return -EOPNOTSUPP; + return sja1105_xfer_u32(priv, SPI_WRITE, regs->mdio_100base_tx + reg, &tmp, NULL); } -- cgit From 0ad6bded175e829c2ca261529c9dce39a32a042d Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Wed, 16 Nov 2022 21:02:49 +0800 Subject: nfc/nci: fix race with opening and closing Previously we leverage NCI_UNREG and the lock inside nci_close_device to prevent the race condition between opening a device 
and closing a device. However, it still has a problem because a failed opening command will erase the NCI_UNREG flag and allow another opening command to bypass the status checking. This fix corrects that by making sure the NCI_UNREG is held. Reported-by: syzbot+43475bf3cfbd6e41f5b7@syzkaller.appspotmail.com Fixes: 48b71a9e66c2 ("NFC: add NCI_UNREG flag to eliminate the race") Signed-off-by: Lin Ma Signed-off-by: David S. Miller --- net/nfc/nci/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 6a193cce2a75..4ffdf2f45c44 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -542,7 +542,7 @@ static int nci_open_device(struct nci_dev *ndev) skb_queue_purge(&ndev->tx_q); ndev->ops->close(ndev); - ndev->flags = 0; + ndev->flags &= BIT(NCI_UNREG); } done: -- cgit From 2360f9b8c4e81d242d4cbf99d630a2fffa681fab Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Thu, 17 Nov 2022 14:55:27 +0800 Subject: net: pch_gbe: fix potential memleak in pch_gbe_tx_queue() In pch_gbe_xmit_frame(), NETDEV_TX_OK will be returned whether pch_gbe_tx_queue() sends data successfully or not, so pch_gbe_tx_queue() needs to free skb before returning. But pch_gbe_tx_queue() returns without freeing skb in case dma_map_single() fails. Add dev_kfree_skb_any() to fix it. Fixes: 77555ee72282 ("net: Add Gigabit Ethernet driver of Topcliff PCH") Signed-off-by: Wang Hai Signed-off-by: David S.
Miller --- drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c index 3f2c30184752..c9ae47128a07 100644 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c @@ -1143,6 +1143,7 @@ static void pch_gbe_tx_queue(struct pch_gbe_adapter *adapter, buffer_info->dma = 0; buffer_info->time_stamp = 0; tx_ring->next_to_use = ring_num; + dev_kfree_skb_any(skb); return; } buffer_info->mapped = true; -- cgit From 52d1aa8b8249ff477aaa38b6f74a8ced780d079c Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 9 Nov 2022 12:39:07 -0700 Subject: netfilter: conntrack: Fix data-races around ct mark nf_conn:mark can be read from and written to in parallel. Use READ_ONCE()/WRITE_ONCE() for reads and writes to prevent unwanted compiler optimizations. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Daniel Xu Signed-off-by: Pablo Neira Ayuso --- net/core/flow_dissector.c | 2 +- net/ipv4/netfilter/ipt_CLUSTERIP.c | 4 ++-- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 24 ++++++++++++++---------- net/netfilter/nf_conntrack_standalone.c | 2 +- net/netfilter/nft_ct.c | 6 +++--- net/netfilter/xt_connmark.c | 18 ++++++++++-------- net/openvswitch/conntrack.c | 8 ++++---- net/sched/act_connmark.c | 4 ++-- net/sched/act_ct.c | 8 ++++---- net/sched/act_ctinfo.c | 6 +++--- 11 files changed, 45 insertions(+), 39 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 25cd35f5922e..007730412947 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -296,7 +296,7 @@ skb_flow_dissect_ct(const struct sk_buff *skb, key->ct_zone = ct->zone.id; #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) - key->ct_mark = ct->mark; + key->ct_mark = READ_ONCE(ct->mark); #endif cl = nf_ct_labels_find(ct); diff --git 
a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index f8e176c77d1c..b3cc416ed292 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -435,7 +435,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) switch (ctinfo) { case IP_CT_NEW: - ct->mark = hash; + WRITE_ONCE(ct->mark, hash); break; case IP_CT_RELATED: case IP_CT_RELATED_REPLY: @@ -452,7 +452,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) #ifdef DEBUG nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif - pr_debug("hash=%u ct_hash=%u ", hash, ct->mark); + pr_debug("hash=%u ct_hash=%u ", hash, READ_ONCE(ct->mark)); if (!clusterip_responsible(cipinfo->config, hash)) { pr_debug("not responsible\n"); return NF_DROP; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f97bda06d2a9..2692139ce417 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1781,7 +1781,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } #ifdef CONFIG_NF_CONNTRACK_MARK - ct->mark = exp->master->mark; + ct->mark = READ_ONCE(exp->master->mark); #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK ct->secmark = exp->master->secmark; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 7562b215b932..d71150a40fb0 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -328,9 +328,9 @@ nla_put_failure: } #ifdef CONFIG_NF_CONNTRACK_MARK -static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_mark(struct sk_buff *skb, u32 mark) { - if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark))) + if (nla_put_be32(skb, CTA_MARK, htonl(mark))) goto nla_put_failure; return 0; @@ -543,7 +543,7 @@ static int ctnetlink_dump_extinfo(struct sk_buff *skb, static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) { if 
(ctnetlink_dump_status(skb, ct) < 0 || - ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_mark(skb, READ_ONCE(ct->mark)) < 0 || ctnetlink_dump_secctx(skb, ct) < 0 || ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || @@ -722,6 +722,7 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) struct sk_buff *skb; unsigned int type; unsigned int flags = 0, group; + u32 mark; int err; if (events & (1 << IPCT_DESTROY)) { @@ -826,8 +827,9 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) } #ifdef CONFIG_NF_CONNTRACK_MARK - if ((events & (1 << IPCT_MARK) || ct->mark) - && ctnetlink_dump_mark(skb, ct) < 0) + mark = READ_ONCE(ct->mark); + if ((events & (1 << IPCT_MARK) || mark) && + ctnetlink_dump_mark(skb, mark) < 0) goto nla_put_failure; #endif nlmsg_end(skb, nlh); @@ -1154,7 +1156,7 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) } #ifdef CONFIG_NF_CONNTRACK_MARK - if ((ct->mark & filter->mark.mask) != filter->mark.val) + if ((READ_ONCE(ct->mark) & filter->mark.mask) != filter->mark.val) goto ignore_entry; #endif status = (u32)READ_ONCE(ct->status); @@ -2002,9 +2004,9 @@ static void ctnetlink_change_mark(struct nf_conn *ct, mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK])); mark = ntohl(nla_get_be32(cda[CTA_MARK])); - newmark = (ct->mark & mask) ^ mark; - if (newmark != ct->mark) - ct->mark = newmark; + newmark = (READ_ONCE(ct->mark) & mask) ^ mark; + if (newmark != READ_ONCE(ct->mark)) + WRITE_ONCE(ct->mark, newmark); } #endif @@ -2669,6 +2671,7 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) { const struct nf_conntrack_zone *zone; struct nlattr *nest_parms; + u32 mark; zone = nf_ct_zone(ct); @@ -2730,7 +2733,8 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_MARK - if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0) + mark = READ_ONCE(ct->mark); + if (mark && 
ctnetlink_dump_mark(skb, mark) < 0) goto nla_put_failure; #endif if (ctnetlink_dump_labels(skb, ct) < 0) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 4ffe84c5a82c..bca839ab1ae8 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -366,7 +366,7 @@ static int ct_seq_show(struct seq_file *s, void *v) goto release; #if defined(CONFIG_NF_CONNTRACK_MARK) - seq_printf(s, "mark=%u ", ct->mark); + seq_printf(s, "mark=%u ", READ_ONCE(ct->mark)); #endif ct_show_secctx(s, ct); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index a3f01f209a53..641dc21f92b4 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -98,7 +98,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: - *dest = ct->mark; + *dest = READ_ONCE(ct->mark); return; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK @@ -297,8 +297,8 @@ static void nft_ct_set_eval(const struct nft_expr *expr, switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: - if (ct->mark != value) { - ct->mark = value; + if (READ_ONCE(ct->mark) != value) { + WRITE_ONCE(ct->mark, value); nf_conntrack_event_cache(IPCT_MARK, ct); } break; diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index e5ebc0810675..ad3c033db64e 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -30,6 +30,7 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) u_int32_t new_targetmark; struct nf_conn *ct; u_int32_t newmark; + u_int32_t oldmark; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) @@ -37,14 +38,15 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) switch (info->mode) { case XT_CONNMARK_SET: - newmark = (ct->mark & ~info->ctmask) ^ info->ctmark; + oldmark = READ_ONCE(ct->mark); + newmark = (oldmark & ~info->ctmask) ^ info->ctmark; if (info->shift_dir == 
D_SHIFT_RIGHT) newmark >>= info->shift_bits; else newmark <<= info->shift_bits; - if (ct->mark != newmark) { - ct->mark = newmark; + if (READ_ONCE(ct->mark) != newmark) { + WRITE_ONCE(ct->mark, newmark); nf_conntrack_event_cache(IPCT_MARK, ct); } break; @@ -55,15 +57,15 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) else new_targetmark <<= info->shift_bits; - newmark = (ct->mark & ~info->ctmask) ^ + newmark = (READ_ONCE(ct->mark) & ~info->ctmask) ^ new_targetmark; - if (ct->mark != newmark) { - ct->mark = newmark; + if (READ_ONCE(ct->mark) != newmark) { + WRITE_ONCE(ct->mark, newmark); nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_RESTORE: - new_targetmark = (ct->mark & info->ctmask); + new_targetmark = (READ_ONCE(ct->mark) & info->ctmask); if (info->shift_dir == D_SHIFT_RIGHT) new_targetmark >>= info->shift_bits; else @@ -126,7 +128,7 @@ connmark_mt(const struct sk_buff *skb, struct xt_action_param *par) if (ct == NULL) return false; - return ((ct->mark & info->mask) == info->mark) ^ info->invert; + return ((READ_ONCE(ct->mark) & info->mask) == info->mark) ^ info->invert; } static int connmark_mt_check(const struct xt_mtchk_param *par) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index c7b10234cf7c..c8eaf4234b2e 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -152,7 +152,7 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) static u32 ovs_ct_get_mark(const struct nf_conn *ct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) - return ct ? ct->mark : 0; + return ct ? 
READ_ONCE(ct->mark) : 0; #else return 0; #endif @@ -340,9 +340,9 @@ static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key, #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) u32 new_mark; - new_mark = ct_mark | (ct->mark & ~(mask)); - if (ct->mark != new_mark) { - ct->mark = new_mark; + new_mark = ct_mark | (READ_ONCE(ct->mark) & ~(mask)); + if (READ_ONCE(ct->mark) != new_mark) { + WRITE_ONCE(ct->mark, new_mark); if (nf_ct_is_confirmed(ct)) nf_conntrack_event_cache(IPCT_MARK, ct); key->ct.mark = new_mark; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 66b143bb04ac..d41002e4613f 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -61,7 +61,7 @@ static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, c = nf_ct_get(skb, &ctinfo); if (c) { - skb->mark = c->mark; + skb->mark = READ_ONCE(c->mark); /* using overlimits stats to count how many packets marked */ ca->tcf_qstats.overlimits++; goto out; @@ -81,7 +81,7 @@ static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, c = nf_ct_tuplehash_to_ctrack(thash); /* using overlimits stats to count how many packets marked */ ca->tcf_qstats.overlimits++; - skb->mark = c->mark; + skb->mark = READ_ONCE(c->mark); nf_ct_put(c); out: diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index b38d91d6b249..4c7f7861ea96 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -178,7 +178,7 @@ static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct, entry = tcf_ct_flow_table_flow_action_get_next(action); entry->id = FLOW_ACTION_CT_METADATA; #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) - entry->ct_metadata.mark = ct->mark; + entry->ct_metadata.mark = READ_ONCE(ct->mark); #endif ctinfo = dir == IP_CT_DIR_ORIGINAL ? 
IP_CT_ESTABLISHED : IP_CT_ESTABLISHED_REPLY; @@ -936,9 +936,9 @@ static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask) if (!mask) return; - new_mark = mark | (ct->mark & ~(mask)); - if (ct->mark != new_mark) { - ct->mark = new_mark; + new_mark = mark | (READ_ONCE(ct->mark) & ~(mask)); + if (READ_ONCE(ct->mark) != new_mark) { + WRITE_ONCE(ct->mark, new_mark); if (nf_ct_is_confirmed(ct)) nf_conntrack_event_cache(IPCT_MARK, ct); } diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index d4102f0a9abd..eaa02f098d1c 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -32,7 +32,7 @@ static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, { u8 dscp, newdscp; - newdscp = (((ct->mark & cp->dscpmask) >> cp->dscpmaskshift) << 2) & + newdscp = (((READ_ONCE(ct->mark) & cp->dscpmask) >> cp->dscpmaskshift) << 2) & ~INET_ECN_MASK; switch (proto) { @@ -72,7 +72,7 @@ static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca, struct sk_buff *skb) { ca->stats_cpmark_set++; - skb->mark = ct->mark & cp->cpmarkmask; + skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask; } static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, @@ -130,7 +130,7 @@ static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, } if (cp->mode & CTINFO_MODE_DSCP) - if (!cp->dscpstatemask || (ct->mark & cp->dscpstatemask)) + if (!cp->dscpstatemask || (READ_ONCE(ct->mark) & cp->dscpstatemask)) tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto); if (cp->mode & CTINFO_MODE_CPMARK) -- cgit From 33c7aba0b4ffd6d7cdab862a034eb582a5120a38 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 14 Nov 2022 11:31:54 +0100 Subject: netfilter: nf_tables: do not set up extensions for end interval Elements with an end interval flag set on do not store extensions. The global set definition is currently setting on the timeout and stateful expression for end interval elements. 
This leads to skipping end interval elements from the set->ops->walk() path as the expired check bogusly reports true. Moreover, do not set up stateful expressions for elements with end interval flag set on since this is never used. Fixes: 65038428b2c6 ("netfilter: nf_tables: allow to specify stateful expression in set definition") Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e7152d599d73..7a09421f19e1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5958,7 +5958,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, &timeout); if (err) return err; - } else if (set->flags & NFT_SET_TIMEOUT) { + } else if (set->flags & NFT_SET_TIMEOUT && + !(flags & NFT_SET_ELEM_INTERVAL_END)) { timeout = set->timeout; } @@ -6024,7 +6025,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -EOPNOTSUPP; goto err_set_elem_expr; } - } else if (set->num_exprs > 0) { + } else if (set->num_exprs > 0 && + !(flags & NFT_SET_ELEM_INTERVAL_END)) { err = nft_set_elem_expr_clone(ctx, set, expr_array); if (err < 0) goto err_set_elem_expr_clone; -- cgit From c678669d6b13b77de3b99b97526aaf23c3088d0a Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Tue, 8 Nov 2022 10:35:34 +0100 Subject: iavf: Fix a crash during reset task Recent commit aa626da947e9 ("iavf: Detach device during reset task") removed netif_tx_stop_all_queues() with an assumption that Tx queues are already stopped by netif_device_detach() in the beginning of reset task. This assumption is incorrect because during reset task a potential link event can start Tx queues again. Revert this change to fix this issue. Reproducer: 1. Run some Tx traffic (e.g. iperf3) over iavf interface 2. 
Switch MTU of this interface in a loop [root@host ~]# cat repro.sh IF=enp2s0f0v0 iperf3 -c 192.168.0.1 -t 600 --logfile /dev/null & sleep 2 while :; do for i in 1280 1500 2000 900 ; do ip link set $IF mtu $i sleep 2 done done [root@host ~]# ./repro.sh Result: [ 306.199917] iavf 0000:02:02.0 enp2s0f0v0: NIC Link is Up Speed is 40 Gbps Full Duplex [ 308.205944] iavf 0000:02:02.0 enp2s0f0v0: NIC Link is Up Speed is 40 Gbps Full Duplex [ 310.103223] BUG: kernel NULL pointer dereference, address: 0000000000000008 [ 310.110179] #PF: supervisor write access in kernel mode [ 310.115396] #PF: error_code(0x0002) - not-present page [ 310.120526] PGD 0 P4D 0 [ 310.123057] Oops: 0002 [#1] PREEMPT SMP NOPTI [ 310.127408] CPU: 24 PID: 183 Comm: kworker/u64:9 Kdump: loaded Not tainted 6.1.0-rc3+ #2 [ 310.135485] Hardware name: Abacus electric, s.r.o. - servis@abacus.cz Super Server/H12SSW-iN, BIOS 2.4 04/13/2022 [ 310.145728] Workqueue: iavf iavf_reset_task [iavf] [ 310.150520] RIP: 0010:iavf_xmit_frame_ring+0xd1/0xf70 [iavf] [ 310.156180] Code: d0 0f 86 da 00 00 00 83 e8 01 0f b7 fa 29 f8 01 c8 39 c6 0f 8f a0 08 00 00 48 8b 45 20 48 8d 14 92 bf 01 00 00 00 4c 8d 3c d0 <49> 89 5f 08 8b 43 70 66 41 89 7f 14 41 89 47 10 f6 83 82 00 00 00 [ 310.174918] RSP: 0018:ffffbb5f0082caa0 EFLAGS: 00010293 [ 310.180137] RAX: 0000000000000000 RBX: ffff92345471a6e8 RCX: 0000000000000200 [ 310.187259] RDX: 0000000000000000 RSI: 000000000000000d RDI: 0000000000000001 [ 310.194385] RBP: ffff92341d249000 R08: ffff92434987fcac R09: 0000000000000001 [ 310.201509] R10: 0000000011f683b9 R11: 0000000011f50641 R12: 0000000000000008 [ 310.208631] R13: ffff923447500000 R14: 0000000000000000 R15: 0000000000000000 [ 310.215756] FS: 0000000000000000(0000) GS:ffff92434ee00000(0000) knlGS:0000000000000000 [ 310.223835] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 310.229572] CR2: 0000000000000008 CR3: 0000000fbc210004 CR4: 0000000000770ee0 [ 310.236696] PKRU: 55555554 [ 310.239399] Call Trace: [ 310.241844] 
[ 310.243855] ? dst_alloc+0x5b/0xb0 [ 310.247260] dev_hard_start_xmit+0x9e/0x1f0 [ 310.251439] sch_direct_xmit+0xa0/0x370 [ 310.255276] __qdisc_run+0x13e/0x580 [ 310.258848] __dev_queue_xmit+0x431/0xd00 [ 310.262851] ? selinux_ip_postroute+0x147/0x3f0 [ 310.267377] ip_finish_output2+0x26c/0x540 Fixes: aa626da947e9 ("iavf: Detach device during reset task") Cc: Jacob Keller Cc: Patryk Piotrowski Cc: SlawomirX Laba Signed-off-by: Ivan Vecera Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 3fc572341781..5abcd66e7c7a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -3033,6 +3033,7 @@ continue_reset: if (running) { netif_carrier_off(netdev); + netif_tx_stop_all_queues(netdev); adapter->link_up = false; iavf_napi_disable_all(adapter); } -- cgit From 08f1c147b7265245d67321585c68a27e990e0c4b Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Tue, 8 Nov 2022 11:25:02 +0100 Subject: iavf: Do not restart Tx queues after reset task failure After commit aa626da947e9 ("iavf: Detach device during reset task") the device is detached during reset task and re-attached at its end. The problem occurs when reset task fails because Tx queues are restarted during device re-attach and this leads later to a crash. To resolve this issue, properly close the net device in case of failure in reset task to avoid restarting of tx queues at the end. Also replace the hacky manipulation with IFF_UP flag by device close that clears properly both IFF_UP and __LINK_STATE_START flags. In this case iavf_close() does not do anything because the adapter state is already __IAVF_DOWN. Reproducer: 1) Run some Tx traffic (e.g.
iperf3) over iavf interface 2) Set VF trusted / untrusted in loop [root@host ~]# cat repro.sh PF=enp65s0f0 IF=${PF}v0 ip link set up $IF ip addr add 192.168.0.2/24 dev $IF sleep 1 iperf3 -c 192.168.0.1 -t 600 --logfile /dev/null & sleep 2 while :; do ip link set $PF vf 0 trust on ip link set $PF vf 0 trust off done [root@host ~]# ./repro.sh Result: [ 2006.650969] iavf 0000:41:01.0: Failed to init adminq: -53 [ 2006.675662] ice 0000:41:00.0: VF 0 is now trusted [ 2006.689997] iavf 0000:41:01.0: Reset task did not complete, VF disabled [ 2006.696611] iavf 0000:41:01.0: failed to allocate resources during reinit [ 2006.703209] ice 0000:41:00.0: VF 0 is now untrusted [ 2006.737011] ice 0000:41:00.0: VF 0 is now trusted [ 2006.764536] ice 0000:41:00.0: VF 0 is now untrusted [ 2006.768919] BUG: kernel NULL pointer dereference, address: 0000000000000b4a [ 2006.776358] #PF: supervisor read access in kernel mode [ 2006.781488] #PF: error_code(0x0000) - not-present page [ 2006.786620] PGD 0 P4D 0 [ 2006.789152] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 2006.792903] ice 0000:41:00.0: VF 0 is now trusted [ 2006.793501] CPU: 4 PID: 0 Comm: swapper/4 Kdump: loaded Not tainted 6.1.0-rc3+ #2 [ 2006.805668] Hardware name: Abacus electric, s.r.o. 
- servis@abacus.cz Super Server/H12SSW-iN, BIOS 2.4 04/13/2022 [ 2006.815915] RIP: 0010:iavf_xmit_frame_ring+0x96/0xf70 [iavf] [ 2006.821028] ice 0000:41:00.0: VF 0 is now untrusted [ 2006.821572] Code: 48 83 c1 04 48 c1 e1 04 48 01 f9 48 83 c0 10 6b 50 f8 55 c1 ea 14 45 8d 64 14 01 48 39 c8 75 eb 41 83 fc 07 0f 8f e9 08 00 00 <0f> b7 45 4a 0f b7 55 48 41 8d 74 24 05 31 c9 66 39 d0 0f 86 da 00 [ 2006.845181] RSP: 0018:ffffb253004bc9e8 EFLAGS: 00010293 [ 2006.850397] RAX: ffff9d154de45b00 RBX: ffff9d15497d52e8 RCX: ffff9d154de45b00 [ 2006.856327] ice 0000:41:00.0: VF 0 is now trusted [ 2006.857523] RDX: 0000000000000000 RSI: 00000000000005a8 RDI: ffff9d154de45ac0 [ 2006.857525] RBP: 0000000000000b00 R08: ffff9d159cb010ac R09: 0000000000000001 [ 2006.857526] R10: ffff9d154de45940 R11: 0000000000000000 R12: 0000000000000002 [ 2006.883600] R13: ffff9d1770838dc0 R14: 0000000000000000 R15: ffffffffc07b8380 [ 2006.885840] ice 0000:41:00.0: VF 0 is now untrusted [ 2006.890725] FS: 0000000000000000(0000) GS:ffff9d248e900000(0000) knlGS:0000000000000000 [ 2006.890727] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2006.909419] CR2: 0000000000000b4a CR3: 0000000c39c10002 CR4: 0000000000770ee0 [ 2006.916543] PKRU: 55555554 [ 2006.918254] ice 0000:41:00.0: VF 0 is now trusted [ 2006.919248] Call Trace: [ 2006.919250] [ 2006.919252] dev_hard_start_xmit+0x9e/0x1f0 [ 2006.932587] sch_direct_xmit+0xa0/0x370 [ 2006.936424] __dev_queue_xmit+0x7af/0xd00 [ 2006.940429] ip_finish_output2+0x26c/0x540 [ 2006.944519] ip_output+0x71/0x110 [ 2006.947831] ? __ip_finish_output+0x2b0/0x2b0 [ 2006.952180] __ip_queue_xmit+0x16d/0x400 [ 2006.952721] ice 0000:41:00.0: VF 0 is now untrusted [ 2006.956098] __tcp_transmit_skb+0xa96/0xbf0 [ 2006.965148] __tcp_retransmit_skb+0x174/0x860 [ 2006.969499] ? cubictcp_cwnd_event+0x40/0x40 [ 2006.973769] tcp_retransmit_skb+0x14/0xb0 ... 
Fixes: aa626da947e9 ("iavf: Detach device during reset task") Cc: Jacob Keller Cc: Patryk Piotrowski Cc: SlawomirX Laba Signed-off-by: Ivan Vecera Reviewed-by: Jacob Keller Reviewed-by: Leon Romanovsky Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 5abcd66e7c7a..b66f8fa1d83b 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2921,7 +2921,6 @@ static void iavf_disable_vf(struct iavf_adapter *adapter) iavf_free_queues(adapter); memset(adapter->vf_res, 0, IAVF_VIRTCHNL_VF_RESOURCE_SIZE); iavf_shutdown_adminq(&adapter->hw); - adapter->netdev->flags &= ~IFF_UP; adapter->flags &= ~IAVF_FLAG_RESET_PENDING; iavf_change_state(adapter, __IAVF_DOWN); wake_up(&adapter->down_waitqueue); @@ -3021,6 +3020,11 @@ static void iavf_reset_task(struct work_struct *work) iavf_disable_vf(adapter); mutex_unlock(&adapter->client_lock); mutex_unlock(&adapter->crit_lock); + if (netif_running(netdev)) { + rtnl_lock(); + dev_close(netdev); + rtnl_unlock(); + } return; /* Do not attempt to reinit. It's dead, Jim. */ } @@ -3173,6 +3177,16 @@ reset_err: mutex_unlock(&adapter->client_lock); mutex_unlock(&adapter->crit_lock); + + if (netif_running(netdev)) { + /* Close device to ensure that Tx queues will not be started + * during netif_device_attach() at the end of the reset task. 
+ */ + rtnl_lock(); + dev_close(netdev); + rtnl_unlock(); + } + dev_err(&adapter->pdev->dev, "failed to allocate resources during reinit\n"); reset_finish: rtnl_lock(); -- cgit From bb861c14f1b8cb9cbf03a132db7f22ec4e692b91 Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Thu, 10 Nov 2022 15:14:44 +0100 Subject: iavf: remove INITIAL_MAC_SET to allow gARP to work properly IAVF_FLAG_INITIAL_MAC_SET prevents waiting on iavf_is_mac_set_handled() the first time the MAC is set. This breaks gratuitous ARP because the MAC address has not been updated yet when the gARP packet is sent out. Current behaviour: $ echo 1 > /sys/class/net/ens4f0/device/sriov_numvfs iavf 0000:88:02.0: MAC address: ee:04:19:14:ec:ea $ ip addr add 192.168.1.1/24 dev ens4f0v0 $ ip link set dev ens4f0v0 up $ echo 1 > /proc/sys/net/ipv4/conf/ens4f0v0/arp_notify $ ip link set ens4f0v0 addr 00:11:22:33:44:55 07:23:41.676611 ee:04:19:14:ec:ea > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.168.1.1 tell 192.168.1.1, length 28 With IAVF_FLAG_INITIAL_MAC_SET removed: $ echo 1 > /sys/class/net/ens4f0/device/sriov_numvfs iavf 0000:88:02.0: MAC address: 3e:8a:16:a2:37:6d $ ip addr add 192.168.1.1/24 dev ens4f0v0 $ ip link set dev ens4f0v0 up $ echo 1 > /proc/sys/net/ipv4/conf/ens4f0v0/arp_notify $ ip link set ens4f0v0 addr 00:11:22:33:44:55 07:28:01.836608 00:11:22:33:44:55 > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.168.1.1 tell 192.168.1.1, length 28 Fixes: 35a2443d0910 ("iavf: Add waiting for response from PF in set mac") Signed-off-by: Stefan Assmann Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 1 - drivers/net/ethernet/intel/iavf/iavf_main.c | 8 -------- 2 files changed, 9 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 3f6187c16424..0d1bab4ac1b0 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ 
b/drivers/net/ethernet/intel/iavf/iavf.h @@ -298,7 +298,6 @@ struct iavf_adapter { #define IAVF_FLAG_QUEUES_DISABLED BIT(17) #define IAVF_FLAG_SETUP_NETDEV_FEATURES BIT(18) #define IAVF_FLAG_REINIT_MSIX_NEEDED BIT(20) -#define IAVF_FLAG_INITIAL_MAC_SET BIT(23) /* duplicates for common code */ #define IAVF_FLAG_DCB_ENABLED 0 /* flags for admin queue service task */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index b66f8fa1d83b..801f5b7b8119 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1087,12 +1087,6 @@ static int iavf_set_mac(struct net_device *netdev, void *p) if (ret) return ret; - /* If this is an initial set MAC during VF spawn do not wait */ - if (adapter->flags & IAVF_FLAG_INITIAL_MAC_SET) { - adapter->flags &= ~IAVF_FLAG_INITIAL_MAC_SET; - return 0; - } - ret = wait_event_interruptible_timeout(adapter->vc_waitqueue, iavf_is_mac_set_handled(netdev, addr->sa_data), msecs_to_jiffies(2500)); @@ -2605,8 +2599,6 @@ static void iavf_init_config_adapter(struct iavf_adapter *adapter) ether_addr_copy(netdev->perm_addr, adapter->hw.mac.addr); } - adapter->flags |= IAVF_FLAG_INITIAL_MAC_SET; - adapter->tx_desc_count = IAVF_DEFAULT_TXD; adapter->rx_desc_count = IAVF_DEFAULT_RXD; err = iavf_init_interrupt_scheme(adapter); -- cgit From a8417330f8a57275ed934293e832982b6d882713 Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Thu, 3 Nov 2022 14:00:03 +0100 Subject: iavf: Fix race condition between iavf_shutdown and iavf_remove Fix a deadlock introduced by commit 974578017fc1 ("iavf: Add waiting so the port is initialized in remove") due to race condition between iavf_shutdown and iavf_remove, where iavf_remove stucks forever in while loop since iavf_shutdown already set __IAVF_REMOVE adapter state. Fix this by checking if the __IAVF_IN_REMOVE_TASK has already been set and return if so. 
Fixes: 974578017fc1 ("iavf: Add waiting so the port is initialized in remove") Signed-off-by: Slawomir Laba Signed-off-by: Mateusz Palczewski Tested-by: Marek Szlosek Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 801f5b7b8119..d7465296f650 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -5042,23 +5042,21 @@ static int __maybe_unused iavf_resume(struct device *dev_d) static void iavf_remove(struct pci_dev *pdev) { struct iavf_adapter *adapter = iavf_pdev_to_adapter(pdev); - struct net_device *netdev = adapter->netdev; struct iavf_fdir_fltr *fdir, *fdirtmp; struct iavf_vlan_filter *vlf, *vlftmp; + struct iavf_cloud_filter *cf, *cftmp; struct iavf_adv_rss *rss, *rsstmp; struct iavf_mac_filter *f, *ftmp; - struct iavf_cloud_filter *cf, *cftmp; - struct iavf_hw *hw = &adapter->hw; + struct net_device *netdev; + struct iavf_hw *hw; int err; - /* When reboot/shutdown is in progress no need to do anything - * as the adapter is already REMOVE state that was set during - * iavf_shutdown() callback. - */ - if (adapter->state == __IAVF_REMOVE) + netdev = adapter->netdev; + hw = &adapter->hw; + + if (test_and_set_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section)) return; - set_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section); /* Wait until port initialization is complete. * There are flows where register/unregister netdev may race. */ -- cgit From f70074140524c59a0935947b06dd6cb6e1ea642d Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Thu, 17 Nov 2022 19:13:56 +0800 Subject: net: ethernet: mtk_eth_soc: fix error handling in mtk_open() If mtk_start_dma() fails, invoke phylink_disconnect_phy() to perform cleanup. phylink_disconnect_phy() contains the put_device action. 
If phylink_disconnect_phy is not performed, the Kref of netdev will leak. Fixes: b8fc9f30821e ("net: ethernet: mediatek: Add basic PHYLINK support") Signed-off-by: Liu Jian Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20221117111356.161547-1-liujian56@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 7cd381530aa4..1d1f2342e3ec 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -2996,8 +2996,10 @@ static int mtk_open(struct net_device *dev) int i; err = mtk_start_dma(eth); - if (err) + if (err) { + phylink_disconnect_phy(mac->phylink); return err; + } for (i = 0; i < ARRAY_SIZE(eth->ppe); i++) mtk_ppe_start(eth->ppe[i]); -- cgit From 594c61ffc77de0a197934aa0f1df9285c68801c6 Mon Sep 17 00:00:00 2001 From: Peter Kosyh Date: Thu, 17 Nov 2022 18:28:06 +0300 Subject: net/mlx4: Check retval of mlx4_bitmap_init If mlx4_bitmap_init fails, mlx4_bitmap_alloc_range will dereference the NULL pointer (bitmap->table). Make sure, that mlx4_bitmap_alloc_range called in no error case. Found by Linux Verification Center (linuxtesting.org) with SVACE. 
Fixes: d57febe1a478 ("net/mlx4: Add A0 hybrid steering") Reviewed-by: Tariq Toukan Signed-off-by: Peter Kosyh Link: https://lore.kernel.org/r/20221117152806.278072-1-pkosyh@yandex.ru Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx4/qp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index b149e601f673..48cfaa7eaf50 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -697,7 +697,8 @@ static int mlx4_create_zones(struct mlx4_dev *dev, err = mlx4_bitmap_init(*bitmap + k, 1, MLX4_QP_TABLE_RAW_ETH_SIZE - 1, 0, 0); - mlx4_bitmap_alloc_range(*bitmap + k, 1, 1, 0); + if (!err) + mlx4_bitmap_alloc_range(*bitmap + k, 1, 1, 0); } if (err) -- cgit From cbe867685386af1f0a2648f5279f6e4c74bfd17f Mon Sep 17 00:00:00 2001 From: Hui Tang Date: Thu, 17 Nov 2022 16:40:32 +0800 Subject: net: mvpp2: fix possible invalid pointer dereference If devm_ioremap_resource() fails in mvpp2_get_sram(), priv->cm3_base is left holding an error pointer, which causes an invalid pointer dereference when priv->cm3_base is used later.
Fixes: e54ad1e01c00 ("net: mvpp2: add CM3 SRAM memory map") Signed-off-by: Hui Tang Link: https://lore.kernel.org/r/20221117084032.101144-1-tanghui20@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index eb0fb8128096..b399bdb1ca36 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -7350,6 +7350,7 @@ static int mvpp2_get_sram(struct platform_device *pdev, struct mvpp2 *priv) { struct resource *res; + void __iomem *base; res = platform_get_resource(pdev, IORESOURCE_MEM, 2); if (!res) { @@ -7360,9 +7361,12 @@ static int mvpp2_get_sram(struct platform_device *pdev, return 0; } - priv->cm3_base = devm_ioremap_resource(&pdev->dev, res); + base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(base)) + return PTR_ERR(base); - return PTR_ERR_OR_ZERO(priv->cm3_base); + priv->cm3_base = base; + return 0; } static int mvpp2_probe(struct platform_device *pdev) -- cgit From 62a7311fb96c61d281da9852dbee4712fc8c3277 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 17 Nov 2022 16:50:38 +0800 Subject: net/qla3xxx: fix potential memleak in ql3xxx_send() The ql3xxx_send() returns NETDEV_TX_OK without freeing skb in error handling case, add dev_kfree_skb_any() to fix it. 
Fixes: bd36b0ac5d06 ("qla3xxx: Add support for Qlogic 4032 chip.") Signed-off-by: Zhang Changzhong Link: https://lore.kernel.org/r/1668675039-21138-1-git-send-email-zhangchangzhong@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qla3xxx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c index 76072f8c3d2f..0d57ffcedf0c 100644 --- a/drivers/net/ethernet/qlogic/qla3xxx.c +++ b/drivers/net/ethernet/qlogic/qla3xxx.c @@ -2471,6 +2471,7 @@ static netdev_tx_t ql3xxx_send(struct sk_buff *skb, skb_shinfo(skb)->nr_frags); if (tx_cb->seg_count == -1) { netdev_err(ndev, "%s: invalid segment count!\n", __func__); + dev_kfree_skb_any(skb); return NETDEV_TX_OK; } -- cgit From d66608803aa2ffb9e475623343f69996305771ae Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 17 Nov 2022 20:46:58 +0800 Subject: octeontx2-af: debugsfs: fix pci device refcount leak As comment of pci_get_domain_bus_and_slot() says, it returns a pci device with refcount increment, when finish using it, the caller must decrement the reference count by calling pci_dev_put(). So before returning from rvu_dbg_rvu_pf_cgx_map_display() or cgx_print_dmac_flt(), pci_dev_put() is called to avoid refcount leak. 
Fixes: dbc52debf95f ("octeontx2-af: Debugfs support for DMAC filters") Fixes: e2fb37303865 ("octeontx2-af: Display CGX, NIX and PF map in debugfs.") Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20221117124658.162409-1-yangyingliang@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c index a1970ebedf95..f66dde2b0f92 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c @@ -880,6 +880,8 @@ static int rvu_dbg_rvu_pf_cgx_map_display(struct seq_file *filp, void *unused) sprintf(lmac, "LMAC%d", lmac_id); seq_printf(filp, "%s\t0x%x\t\tNIX%d\t\t%s\t%s\n", dev_name(&pdev->dev), pcifunc, blkid, cgx, lmac); + + pci_dev_put(pdev); } return 0; } @@ -2566,6 +2568,7 @@ static int cgx_print_dmac_flt(struct seq_file *s, int lmac_id) } } + pci_dev_put(pdev); return 0; } -- cgit From 5619537284f1017e9f6c7500b02b859b3830a06d Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 17 Nov 2022 21:51:48 +0800 Subject: net: pch_gbe: fix pci device refcount leak while module exiting As comment of pci_get_domain_bus_and_slot() says, it returns a pci device with refcount increment, when finish using it, the caller must decrement the reference count by calling pci_dev_put(). In pch_gbe_probe(), pci_get_domain_bus_and_slot() is called, so in error path in probe() and remove() function, pci_dev_put() should be called to avoid refcount leak. Compile tested only. 
Fixes: 1a0bdadb4e36 ("net/pch_gbe: supports eg20t ptp clock") Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20221117135148.301014-1-yangyingliang@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c index c9ae47128a07..28b7cec485ef 100644 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c @@ -2460,6 +2460,7 @@ static void pch_gbe_remove(struct pci_dev *pdev) unregister_netdev(netdev); pch_gbe_phy_hw_reset(&adapter->hw); + pci_dev_put(adapter->ptp_pdev); free_netdev(netdev); } @@ -2534,7 +2535,7 @@ static int pch_gbe_probe(struct pci_dev *pdev, /* setup the private structure */ ret = pch_gbe_sw_init(adapter); if (ret) - goto err_free_netdev; + goto err_put_dev; /* Initialize PHY */ ret = pch_gbe_init_phy(adapter); @@ -2592,6 +2593,8 @@ static int pch_gbe_probe(struct pci_dev *pdev, err_free_adapter: pch_gbe_phy_hw_reset(&adapter->hw); +err_put_dev: + pci_dev_put(adapter->ptp_pdev); err_free_netdev: free_netdev(netdev); return ret; -- cgit From 4abd9600b9d15d3d92a9ac25cf200422a4c415ee Mon Sep 17 00:00:00 2001 From: Diana Wang Date: Thu, 17 Nov 2022 16:37:43 +0100 Subject: nfp: fill splittable of devlink_port_attrs correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The error is reflected in that it shows wrong splittable status of port when executing "devlink port show". The reason which leads the error is that the assigned operation of splittable is just a simple negation operation of split and it does not consider port lanes quantity. A splittable port should have several lanes that can be split(lanes quantity > 1). If without the judgement, it will show wrong message for some firmware, such as 2x25G, 2x10G. 
Fixes: a0f49b548652 ("devlink: Add a new devlink port split ability attribute and pass to netlink") Signed-off-by: Diana Wang Reviewed-by: Louis Peens Reviewed-by: Niklas Söderlund Signed-off-by: Simon Horman Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/nfp_devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c index 405786c00334..cb08d7bf9524 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c @@ -341,7 +341,7 @@ int nfp_devlink_port_register(struct nfp_app *app, struct nfp_port *port) return ret; attrs.split = eth_port.is_split; - attrs.splittable = !attrs.split; + attrs.splittable = eth_port.port_lanes > 1 && !attrs.split; attrs.lanes = eth_port.port_lanes; attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; attrs.phys.port_number = eth_port.label_port; -- cgit From 0873016d46f6dfafd1bdf4d9b935b3331b226f7c Mon Sep 17 00:00:00 2001 From: Jaco Coetzee Date: Thu, 17 Nov 2022 16:37:44 +0100 Subject: nfp: add port from netdev validation for EEPROM access Setting of the port flag `NFP_PORT_CHANGED`, introduced to ensure the correct reading of EEPROM data, causes a fatal kernel NULL pointer dereference in cases where the target netdev type cannot be determined. Add validation of port struct pointer before attempting to set the `NFP_PORT_CHANGED` flag. Return that operation is not supported if the netdev type cannot be determined. 
Fixes: 4ae97cae07e1 ("nfp: ethtool: fix the display error of `ethtool -m DEVNAME`") Signed-off-by: Jaco Coetzee Reviewed-by: Louis Peens Signed-off-by: Simon Horman Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c index 1775997f9c69..991059d6cb32 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c @@ -1432,6 +1432,9 @@ nfp_port_get_module_info(struct net_device *netdev, u8 data; port = nfp_port_from_netdev(netdev); + if (!port) + return -EOPNOTSUPP; + /* update port state to get latest interface */ set_bit(NFP_PORT_CHANGED, &port->flags); eth_port = nfp_port_get_eth_port(port); -- cgit From 4d633d1b468b6eb107a81b2fd10b9debddca3d47 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 18 Nov 2022 11:43:53 +0800 Subject: bonding: fix ICMPv6 header handling when receiving IPv6 messages Currently, we get icmp6hdr via function icmp6_hdr(), which needs the skb transport header to be set first. But there is no rule to ask driver set transport header before netif_receive_skb() and bond_handle_frame(). So we will not able to get correct icmp6hdr on some drivers. Fix this by using skb_header_pointer to get the IPv6 and ICMPV6 headers. 
Reported-by: Liang Li Fixes: 4e24be018eb9 ("bonding: add new parameter ns_targets") Suggested-by: Eric Dumazet Signed-off-by: Hangbin Liu Reviewed-by: Eric Dumazet Acked-by: Jay Vosburgh Link: https://lore.kernel.org/r/20221118034353.1736727-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e84c49bf4d0c..f298b9b3eb77 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3231,16 +3231,23 @@ static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { struct slave *curr_active_slave, *curr_arp_slave; - struct icmp6hdr *hdr = icmp6_hdr(skb); struct in6_addr *saddr, *daddr; + struct { + struct ipv6hdr ip6; + struct icmp6hdr icmp6; + } *combined, _combined; if (skb->pkt_type == PACKET_OTHERHOST || - skb->pkt_type == PACKET_LOOPBACK || - hdr->icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT) + skb->pkt_type == PACKET_LOOPBACK) + goto out; + + combined = skb_header_pointer(skb, 0, sizeof(_combined), &_combined); + if (!combined || combined->ip6.nexthdr != NEXTHDR_ICMP || + combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT) goto out; - saddr = &ipv6_hdr(skb)->saddr; - daddr = &ipv6_hdr(skb)->daddr; + saddr = &combined->ip6.saddr; + daddr = &combined->ip6.saddr; slave_dbg(bond->dev, slave->dev, "%s: %s/%d av %d sv %d sip %pI6c tip %pI6c\n", __func__, slave->dev->name, bond_slave_state(slave), -- cgit From 7cef6b73fba96abef731a53501924fc3c4a0f947 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 18 Nov 2022 09:12:49 +0800 Subject: macsec: Fix invalid error code set 'ret' is defined twice in macsec_changelink(), when it is set in macsec_is_offloaded case, it will be invalid before return. 
Fixes: 3cf3227a21d1 ("net: macsec: hardware offloading infrastructure") Signed-off-by: YueHaibing Reviewed-by: Saeed Mahameed Reviewed-by: Antoine Tenart Link: https://lore.kernel.org/r/20221118011249.48112-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 85376d2f24ca..f41f67b583db 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -3835,7 +3835,6 @@ static int macsec_changelink(struct net_device *dev, struct nlattr *tb[], if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; - int ret; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { -- cgit From e204ead35401af5e120f653a133d54ee2595627e Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Thu, 17 Nov 2022 19:37:12 +0800 Subject: nfc: nfcmrvl: Fix potential memory leak in nfcmrvl_i2c_nci_send() nfcmrvl_i2c_nci_send() will be called by nfcmrvl_nci_send(), and skb should be freed in nfcmrvl_i2c_nci_send(). However, nfcmrvl_nci_send() won't free the skb when it failed for the test_bit(). Free the skb when test_bit() failed. Fixes: b5b3e23e4cac ("NFC: nfcmrvl: add i2c driver") Signed-off-by: Shang XiaoJing Suggested-by: Pavel Machek Signed-off-by: David S. 
Miller --- drivers/nfc/nfcmrvl/i2c.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nfc/nfcmrvl/i2c.c b/drivers/nfc/nfcmrvl/i2c.c index 24436c9e54c9..97600826af69 100644 --- a/drivers/nfc/nfcmrvl/i2c.c +++ b/drivers/nfc/nfcmrvl/i2c.c @@ -112,8 +112,10 @@ static int nfcmrvl_i2c_nci_send(struct nfcmrvl_private *priv, struct nfcmrvl_i2c_drv_data *drv_data = priv->drv_data; int ret; - if (test_bit(NFCMRVL_PHY_ERROR, &priv->flags)) + if (test_bit(NFCMRVL_PHY_ERROR, &priv->flags)) { + kfree_skb(skb); return -EREMOTEIO; + } ret = i2c_master_send(drv_data->i2c, skb->data, skb->len); -- cgit From 614761e1119c994a7f19e4c9f37b1d2d7fe7306e Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Thu, 17 Nov 2022 19:37:13 +0800 Subject: nfc: nxp-nci: Fix potential memory leak in nxp_nci_send() nxp_nci_send() won't free the skb when it failed for the check before write(). As the result, the skb will memleak. Free the skb when the check failed. Fixes: dece45855a8b ("NFC: nxp-nci: Add support for NXP NCI chips") Signed-off-by: Shang XiaoJing Suggested-by: Pavel Machek Signed-off-by: David S. 
Miller --- drivers/nfc/nxp-nci/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/nfc/nxp-nci/core.c b/drivers/nfc/nxp-nci/core.c index 580cb6ecffee..66b198663387 100644 --- a/drivers/nfc/nxp-nci/core.c +++ b/drivers/nfc/nxp-nci/core.c @@ -73,11 +73,15 @@ static int nxp_nci_send(struct nci_dev *ndev, struct sk_buff *skb) struct nxp_nci_info *info = nci_get_drvdata(ndev); int r; - if (!info->phy_ops->write) + if (!info->phy_ops->write) { + kfree_skb(skb); return -EOPNOTSUPP; + } - if (info->mode != NXP_NCI_MODE_NCI) + if (info->mode != NXP_NCI_MODE_NCI) { + kfree_skb(skb); return -EINVAL; + } r = info->phy_ops->write(info->phy_id, skb); if (r < 0) { -- cgit From 60dcb5ff55e5c5da259a0dcc4c24c842de1abc9d Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Thu, 17 Nov 2022 19:37:14 +0800 Subject: nfc: s3fwrn5: Fix potential memory leak in s3fwrn5_nci_send() s3fwrn5_nci_send() won't free the skb when it failed for the check before s3fwrn5_write(). As the result, the skb will memleak. Free the skb when the check failed. Fixes: c04c674fadeb ("nfc: s3fwrn5: Add driver for Samsung S3FWRN5 NFC Chip") Signed-off-by: Shang XiaoJing Suggested-by: Pavel Machek Signed-off-by: David S. 
Miller --- drivers/nfc/s3fwrn5/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nfc/s3fwrn5/core.c b/drivers/nfc/s3fwrn5/core.c index 0270e05b68df..aec356880adf 100644 --- a/drivers/nfc/s3fwrn5/core.c +++ b/drivers/nfc/s3fwrn5/core.c @@ -105,6 +105,7 @@ static int s3fwrn5_nci_send(struct nci_dev *ndev, struct sk_buff *skb) mutex_lock(&info->mutex); if (s3fwrn5_get_mode(info) != S3FWRN5_MODE_NCI) { + kfree_skb(skb); mutex_unlock(&info->mutex); return -EINVAL; } -- cgit From 19d04a947db53e0d99e60aeb914d2148f61ab5f9 Mon Sep 17 00:00:00 2001 From: Lu Wei Date: Thu, 17 Nov 2022 23:07:22 +0800 Subject: net: microchip: sparx5: Fix return value in sparx5_tc_setup_qdisc_ets() Function sparx5_tc_setup_qdisc_ets() always returns a negative value because it returns -EOPNOTSUPP at the end. This patch returns the result of sparx5_tc_ets_add() and sparx5_tc_ets_del() directly. Fixes: 211225428d65 ("net: microchip: sparx5: add support for offloading ets qdisc") Signed-off-by: Lu Wei Reviewed-by: Daniel Machon Signed-off-by: David S.
Miller --- drivers/net/ethernet/microchip/sparx5/sparx5_tc.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_tc.c b/drivers/net/ethernet/microchip/sparx5/sparx5_tc.c index e05429c751ee..dc2c3756e3a2 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_tc.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_tc.c @@ -90,13 +90,10 @@ static int sparx5_tc_setup_qdisc_ets(struct net_device *ndev, } } - sparx5_tc_ets_add(port, params); - break; + return sparx5_tc_ets_add(port, params); case TC_ETS_DESTROY: - sparx5_tc_ets_del(port); - - break; + return sparx5_tc_ets_del(port); case TC_ETS_GRAFT: return -EOPNOTSUPP; -- cgit From bd5e1e42826f18147afb0ba07e6a815f52cf8bcb Mon Sep 17 00:00:00 2001 From: Daniel Díaz Date: Thu, 17 Nov 2022 21:44:21 -0600 Subject: selftests/net: Find nettest in current directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `nettest` binary, built from `selftests/net/nettest.c`, was expected to be found in the path during test execution of `fcnal-test.sh` and `pmtu.sh`, leading to tests getting skipped when the binary is not installed in the system, as can be seen in these logs found in the wild [1]: # TEST: vti4: PMTU exceptions [SKIP] [ 350.600250] IPv6: ADDRCONF(NETDEV_CHANGE): veth_b: link becomes ready [ 350.607421] IPv6: ADDRCONF(NETDEV_CHANGE): veth_a: link becomes ready # 'nettest' command not found; skipping tests # xfrm6udp not supported # TEST: vti6: PMTU exceptions (ESP-in-UDP) [SKIP] [ 351.605102] IPv6: ADDRCONF(NETDEV_CHANGE): veth_b: link becomes ready [ 351.612243] IPv6: ADDRCONF(NETDEV_CHANGE): veth_a: link becomes ready # 'nettest' command not found; skipping tests # xfrm4udp not supported The `unicast_extensions.sh` tests also rely on `nettest`, but it runs fine there because it looks for the binary in the current working directory [2]: The same mechanism that works for the Unicast extensions tests is here 
copied over to the PMTU and functional tests. [1] https://lkft.validation.linaro.org/scheduler/job/5839508#L6221 [2] https://lkft.validation.linaro.org/scheduler/job/5839508#L7958 Signed-off-by: Daniel Díaz Signed-off-by: David S. Miller --- tools/testing/selftests/net/fcnal-test.sh | 11 +++++++---- tools/testing/selftests/net/pmtu.sh | 10 ++++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 31c3b6ebd388..21ca91473c09 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -4196,10 +4196,13 @@ elif [ "$TESTS" = "ipv6" ]; then TESTS="$TESTS_IPV6" fi -which nettest >/dev/null -if [ $? -ne 0 ]; then - echo "'nettest' command not found; skipping tests" - exit $ksft_skip +# nettest can be run from PATH or from same directory as this selftest +if ! which nettest >/dev/null; then + PATH=$PWD:$PATH + if ! which nettest >/dev/null; then + echo "'nettest' command not found; skipping tests" + exit $ksft_skip + fi fi declare -i nfail=0 diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 736e358dc549..dfe3d287f01d 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -686,10 +686,12 @@ setup_xfrm() { } setup_nettest_xfrm() { - which nettest >/dev/null - if [ $? -ne 0 ]; then - echo "'nettest' command not found; skipping tests" - return 1 + if ! which nettest >/dev/null; then + PATH=$PWD:$PATH + if ! 
which nettest >/dev/null; then + echo "'nettest' command not found; skipping tests" + return 1 + fi fi [ ${1} -eq 6 ] && proto="-6" || proto="" -- cgit From 764f8485890d4988a2083f5dea90cfe0116b25f6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 20:21:52 -0800 Subject: ipv4/fib: Replace zero-length array with DECLARE_FLEX_ARRAY() helper Zero-length arrays are deprecated[1] and are being replaced with flexible array members in support of the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy(), correctly instrument array indexing with UBSAN_BOUNDS, and to globally enable -fstrict-flex-arrays=3. Replace zero-length array with flexible-array member in struct key_vector. This results in no differences in binary output. [1] https://github.com/KSPP/linux/issues/78 Cc: Jakub Kicinski Cc: "David S. Miller" Cc: Hideaki YOSHIFUJI Cc: David Ahern Cc: Eric Dumazet Cc: Paolo Abeni Cc: "Gustavo A. R. Silva" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 452ff177e4da..c88bf856c443 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -126,7 +126,7 @@ struct key_vector { /* This list pointer if valid if (pos | bits) == 0 (LEAF) */ struct hlist_head leaf; /* This array is valid if (pos | bits) > 0 (TNODE) */ - struct key_vector __rcu *tnode[0]; + DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode); }; }; -- cgit From badbda1a01860c80c6ab60f329ef46c713653a27 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 18 Nov 2022 18:07:54 +0300 Subject: octeontx2-af: cn10k: mcs: Fix copy and paste bug in mcs_bbe_intr_handler() This code accidentally uses the RX macro twice instead of the RX and TX. 
Fixes: 6c635f78c474 ("octeontx2-af: cn10k: mcs: Handle MCS block interrupts") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/mcs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c index 4a343f853b28..c0bedf402da9 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mcs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c @@ -951,7 +951,7 @@ static void mcs_bbe_intr_handler(struct mcs *mcs, u64 intr, enum mcs_direction d else event.intr_mask = (dir == MCS_RX) ? MCS_BBE_RX_PLFIFO_OVERFLOW_INT : - MCS_BBE_RX_PLFIFO_OVERFLOW_INT; + MCS_BBE_TX_PLFIFO_OVERFLOW_INT; /* Notify the lmac_id info which ran into BBE fatal error */ event.lmac_id = i & 0x3ULL; -- cgit From c7aa1a76d4a0a3c401025b60c401412bbb60f8c6 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Wed, 28 Sep 2022 14:26:50 -0400 Subject: netfilter: ipset: regression in ip_set_hash_ip.c This patch introduced a regression: commit 48596a8ddc46 ("netfilter: ipset: Fix adding an IPv4 range containing more than 2^31 addresses") The variable e.ip is passed to the adtfn() function which finally adds the ip address to the set. The patch above refactored the for loop and moved e.ip = htonl(ip) to the end of the for loop. What this means is that if the value of "ip" changes between the first assignment of e.ip and the for loop, then e.ip is pointing to a different ip address than "ip". Test case: $ ipset create jdtest_tmp hash:ip family inet hashsize 2048 maxelem 100000 $ ipset add jdtest_tmp 10.0.1.1/31 ipset v6.21.1: Element cannot be added to the set: it's already added The value of ip gets updated inside the "else if (tb[IPSET_ATTR_CIDR])" block but e.ip is still pointing to the old value.
Fixes: 48596a8ddc46 ("netfilter: ipset: Fix adding an IPv4 range containing more than 2^31 addresses") Reviewed-by: Joshua Hunt Signed-off-by: Vishwanath Pai Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_ip.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index dd30c03d5a23..75d556d71652 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -151,18 +151,16 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (((u64)ip_to - ip + 1) >> (32 - h->netmask) > IPSET_MAX_RANGE) return -ERANGE; - if (retried) { + if (retried) ip = ntohl(h->next.ip); - e.ip = htonl(ip); - } for (; ip <= ip_to;) { + e.ip = htonl(ip); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ip += hosts; - e.ip = htonl(ip); - if (e.ip == 0) + if (ip == 0) return 0; ret = 0; -- cgit From 394164f9d5a3020a7fd719d228386d48d544ec67 Mon Sep 17 00:00:00 2001 From: Roy Novich Date: Sun, 24 Jul 2022 09:49:07 +0300 Subject: net/mlx5: Do not query pci info while pci disabled The driver should not interact with PCI while PCI is disabled. Trying to do so may result in being unable to get vital signs during PCI reset, driver gets timed out and fails to recover. 
Fixes: fad1783a6d66 ("net/mlx5: Print more info on pci error handlers") Signed-off-by: Roy Novich Reviewed-by: Moshe Shemesh Reviewed-by: Aya Levin Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 283c4cc28944..e58775a7d955 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1798,7 +1798,8 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, res = state == pci_channel_io_perm_failure ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; - mlx5_pci_trace(dev, "Exit, result = %d, %s\n", res, result2str(res)); + mlx5_core_info(dev, "%s Device state = %d pci_status: %d. Exit, result = %d, %s\n", + __func__, dev->state, dev->pci_status, res, result2str(res)); return res; } @@ -1837,7 +1838,8 @@ static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev) struct mlx5_core_dev *dev = pci_get_drvdata(pdev); int err; - mlx5_pci_trace(dev, "Enter\n"); + mlx5_core_info(dev, "%s Device state = %d pci_status: %d. Enter\n", + __func__, dev->state, dev->pci_status); err = mlx5_pci_enable_device(dev); if (err) { @@ -1859,7 +1861,8 @@ static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev) res = PCI_ERS_RESULT_RECOVERED; out: - mlx5_pci_trace(dev, "Exit, err = %d, result = %d, %s\n", err, res, result2str(res)); + mlx5_core_info(dev, "%s Device state = %d pci_status: %d. Exit, err = %d, result = %d, %s\n", + __func__, dev->state, dev->pci_status, err, res, result2str(res)); return res; } -- cgit From 61db3d7b99a367416e489ccf764cc5f9b00d62a1 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 20 Oct 2022 12:25:59 +0300 Subject: net/mlx5: Fix FW tracer timestamp calculation Fix a bug in calculation of FW tracer timestamp. 
Decreasing one in the calculation should affect only bits 52_7 and not affect bits 6_0 of the timestamp, otherwise bits 6_0 are always set in this calculation. Fixes: 70dd6fdb8987 ("net/mlx5: FW tracer, parse traces and kernel tracing support") Signed-off-by: Moshe Shemesh Reviewed-by: Feras Daoud Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c index 978a2bb8e122..21831386b26e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c @@ -638,7 +638,7 @@ static void mlx5_tracer_handle_timestamp_trace(struct mlx5_fw_tracer *tracer, trace_timestamp = (timestamp_event.timestamp & MASK_52_7) | (str_frmt->timestamp & MASK_6_0); else - trace_timestamp = ((timestamp_event.timestamp & MASK_52_7) - 1) | + trace_timestamp = ((timestamp_event.timestamp - 1) & MASK_52_7) | (str_frmt->timestamp & MASK_6_0); mlx5_tracer_print_trace(str_frmt, dev, trace_timestamp); -- cgit From 4f57332d6a551185ba729617f04455e83fbe4e41 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 4 Aug 2022 12:38:41 +0300 Subject: net/mlx5: SF: Fix probing active SFs during driver probe phase When SF devices and SF port representors are located on different functions, unloading and reloading of SF parent driver doesn't recreate the existing SF present in the device. Fix it by querying SFs and probing active SFs during driver probe phase.
Fixes: 90d010b8634b ("net/mlx5: SF, Add auxiliary device support") Signed-off-by: Shay Drory Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/sf/dev/dev.c | 88 ++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c index 7da012ff0d41..8e2abbab05f0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c @@ -18,6 +18,10 @@ struct mlx5_sf_dev_table { phys_addr_t base_address; u64 sf_bar_length; struct notifier_block nb; + struct mutex table_lock; /* Serializes sf life cycle and vhca state change handler */ + struct workqueue_struct *active_wq; + struct work_struct work; + u8 stop_active_wq:1; struct mlx5_core_dev *dev; }; @@ -168,6 +172,7 @@ mlx5_sf_dev_state_change_handler(struct notifier_block *nb, unsigned long event_ return 0; sf_index = event->function_id - base_id; + mutex_lock(&table->table_lock); sf_dev = xa_load(&table->devices, sf_index); switch (event->new_vhca_state) { case MLX5_VHCA_STATE_INVALID: @@ -191,6 +196,7 @@ mlx5_sf_dev_state_change_handler(struct notifier_block *nb, unsigned long event_ default: break; } + mutex_unlock(&table->table_lock); return 0; } @@ -215,6 +221,78 @@ static int mlx5_sf_dev_vhca_arm_all(struct mlx5_sf_dev_table *table) return 0; } +static void mlx5_sf_dev_add_active_work(struct work_struct *work) +{ + struct mlx5_sf_dev_table *table = container_of(work, struct mlx5_sf_dev_table, work); + u32 out[MLX5_ST_SZ_DW(query_vhca_state_out)] = {}; + struct mlx5_core_dev *dev = table->dev; + u16 max_functions; + u16 function_id; + u16 sw_func_id; + int err = 0; + u8 state; + int i; + + max_functions = mlx5_sf_max_functions(dev); + function_id = MLX5_CAP_GEN(dev, sf_base_id); + for (i = 0; i < max_functions; i++, function_id++) { + if (table->stop_active_wq) + return; + err = 
mlx5_cmd_query_vhca_state(dev, function_id, out, sizeof(out)); + if (err) + /* A failure of specific vhca doesn't mean others will + * fail as well. + */ + continue; + state = MLX5_GET(query_vhca_state_out, out, vhca_state_context.vhca_state); + if (state != MLX5_VHCA_STATE_ACTIVE) + continue; + + sw_func_id = MLX5_GET(query_vhca_state_out, out, vhca_state_context.sw_function_id); + mutex_lock(&table->table_lock); + /* Don't probe device which is already probe */ + if (!xa_load(&table->devices, i)) + mlx5_sf_dev_add(dev, i, function_id, sw_func_id); + /* There is a race where SF got inactive after the query + * above. e.g.: the query returns that the state of the + * SF is active, and after that the eswitch manager set it to + * inactive. + * This case cannot be managed in SW, since the probing of the + * SF is on one system, and the inactivation is on a different + * system. + * If the inactive is done after the SF perform init_hca(), + * the SF will fully probe and then removed. If it was + * done before init_hca(), the SF probe will fail. + */ + mutex_unlock(&table->table_lock); + } +} + +/* In case SFs are generated externally, probe active SFs */ +static int mlx5_sf_dev_queue_active_work(struct mlx5_sf_dev_table *table) +{ + if (MLX5_CAP_GEN(table->dev, eswitch_manager)) + return 0; /* the table is local */ + + /* Use a workqueue to probe active SFs, which are in large + * quantity and may take up to minutes to probe. 
+ */ + table->active_wq = create_singlethread_workqueue("mlx5_active_sf"); + if (!table->active_wq) + return -ENOMEM; + INIT_WORK(&table->work, &mlx5_sf_dev_add_active_work); + queue_work(table->active_wq, &table->work); + return 0; +} + +static void mlx5_sf_dev_destroy_active_work(struct mlx5_sf_dev_table *table) +{ + if (table->active_wq) { + table->stop_active_wq = true; + destroy_workqueue(table->active_wq); + } +} + void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev) { struct mlx5_sf_dev_table *table; @@ -240,11 +318,17 @@ void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev) table->base_address = pci_resource_start(dev->pdev, 2); table->max_sfs = max_sfs; xa_init(&table->devices); + mutex_init(&table->table_lock); dev->priv.sf_dev_table = table; err = mlx5_vhca_event_notifier_register(dev, &table->nb); if (err) goto vhca_err; + + err = mlx5_sf_dev_queue_active_work(table); + if (err) + goto add_active_err; + err = mlx5_sf_dev_vhca_arm_all(table); if (err) goto arm_err; @@ -252,6 +336,8 @@ void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev) return; arm_err: + mlx5_sf_dev_destroy_active_work(table); +add_active_err: mlx5_vhca_event_notifier_unregister(dev, &table->nb); vhca_err: table->max_sfs = 0; @@ -279,7 +365,9 @@ void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev) if (!table) return; + mlx5_sf_dev_destroy_active_work(table); mlx5_vhca_event_notifier_unregister(dev, &table->nb); + mutex_destroy(&table->table_lock); /* Now that event handler is not running, it is safe to destroy * the sf device without race. -- cgit From 870c2481174b839e7159555127bc8b5a5d0699ba Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 31 May 2022 09:14:03 +0300 Subject: net/mlx5: cmdif, Print info on any firmware cmd failure to tracepoint While moving to new CMD API (quiet API), some pre-existing flows may call the new API function that in case of error, returns the error instead of printing it as previously done. 
For such flows we bring back the print but to tracepoint this time for sys admins to have the ability to check for errors especially for commands using the new quiet API. Tracepoint output example: devlink-1333 [001] ..... 822.746922: mlx5_cmd: ACCESS_REG(0x805) op_mod(0x0) failed, status bad resource(0x5), syndrome (0xb06e1f), err(-22) Fixes: f23519e542e5 ("net/mlx5: cmdif, Add new api for command execution") Signed-off-by: Moshe Shemesh Reviewed-by: Shay Drory Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 41 +++++++++++--------- .../mellanox/mlx5/core/diag/cmd_tracepoint.h | 45 ++++++++++++++++++++++ include/linux/mlx5/driver.h | 1 + 3 files changed, 68 insertions(+), 19 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/cmd_tracepoint.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 2e0d59ca62b5..df3e284ca5c6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -45,6 +45,8 @@ #include "mlx5_core.h" #include "lib/eq.h" #include "lib/tout.h" +#define CREATE_TRACE_POINTS +#include "diag/cmd_tracepoint.h" enum { CMD_IF_REV = 5, @@ -785,27 +787,14 @@ EXPORT_SYMBOL(mlx5_cmd_out_err); static void cmd_status_print(struct mlx5_core_dev *dev, void *in, void *out) { u16 opcode, op_mod; - u32 syndrome; - u8 status; u16 uid; - int err; - - syndrome = MLX5_GET(mbox_out, out, syndrome); - status = MLX5_GET(mbox_out, out, status); opcode = MLX5_GET(mbox_in, in, opcode); op_mod = MLX5_GET(mbox_in, in, op_mod); uid = MLX5_GET(mbox_in, in, uid); - err = cmd_status_to_err(status); - if (!uid && opcode != MLX5_CMD_OP_DESTROY_MKEY) mlx5_cmd_out_err(dev, opcode, op_mod, out); - else - mlx5_core_dbg(dev, - "%s(0x%x) op_mod(0x%x) uid(%d) failed, status %s(0x%x), syndrome (0x%x), err(%d)\n", - mlx5_command_str(opcode), opcode, op_mod, uid, - cmd_status_str(status), status, 
syndrome, err); } int mlx5_cmd_check(struct mlx5_core_dev *dev, int err, void *in, void *out) @@ -1892,6 +1881,16 @@ out_in: return err; } +static void mlx5_cmd_err_trace(struct mlx5_core_dev *dev, u16 opcode, u16 op_mod, void *out) +{ + u32 syndrome = MLX5_GET(mbox_out, out, syndrome); + u8 status = MLX5_GET(mbox_out, out, status); + + trace_mlx5_cmd(mlx5_command_str(opcode), opcode, op_mod, + cmd_status_str(status), status, syndrome, + cmd_status_to_err(status)); +} + static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status, u32 syndrome, int err) { @@ -1914,7 +1913,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status, } /* preserve -EREMOTEIO for outbox.status != OK, otherwise return err as is */ -static int cmd_status_err(struct mlx5_core_dev *dev, int err, u16 opcode, void *out) +static int cmd_status_err(struct mlx5_core_dev *dev, int err, u16 opcode, u16 op_mod, void *out) { u32 syndrome = MLX5_GET(mbox_out, out, syndrome); u8 status = MLX5_GET(mbox_out, out, status); @@ -1922,8 +1921,10 @@ static int cmd_status_err(struct mlx5_core_dev *dev, int err, u16 opcode, void * if (err == -EREMOTEIO) /* -EREMOTEIO is preserved */ err = -EIO; - if (!err && status != MLX5_CMD_STAT_OK) + if (!err && status != MLX5_CMD_STAT_OK) { err = -EREMOTEIO; + mlx5_cmd_err_trace(dev, opcode, op_mod, out); + } cmd_status_log(dev, opcode, status, syndrome, err); return err; @@ -1951,9 +1952,9 @@ int mlx5_cmd_do(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int { int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, false); u16 opcode = MLX5_GET(mbox_in, in, opcode); + u16 op_mod = MLX5_GET(mbox_in, in, op_mod); - err = cmd_status_err(dev, err, opcode, out); - return err; + return cmd_status_err(dev, err, opcode, op_mod, out); } EXPORT_SYMBOL(mlx5_cmd_do); @@ -1997,8 +1998,9 @@ int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, { int err = cmd_exec(dev, in, in_size, out, out_size, NULL, 
NULL, true); u16 opcode = MLX5_GET(mbox_in, in, opcode); + u16 op_mod = MLX5_GET(mbox_in, in, op_mod); - err = cmd_status_err(dev, err, opcode, out); + err = cmd_status_err(dev, err, opcode, op_mod, out); return mlx5_cmd_check(dev, err, in, out); } EXPORT_SYMBOL(mlx5_cmd_exec_polling); @@ -2034,7 +2036,7 @@ static void mlx5_cmd_exec_cb_handler(int status, void *_work) struct mlx5_async_ctx *ctx; ctx = work->ctx; - status = cmd_status_err(ctx->dev, status, work->opcode, work->out); + status = cmd_status_err(ctx->dev, status, work->opcode, work->op_mod, work->out); work->user_callback(status, work); if (atomic_dec_and_test(&ctx->num_inflight)) complete(&ctx->inflight_done); @@ -2049,6 +2051,7 @@ int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size, work->ctx = ctx; work->user_callback = callback; work->opcode = MLX5_GET(mbox_in, in, opcode); + work->op_mod = MLX5_GET(mbox_in, in, op_mod); work->out = out; if (WARN_ON(!atomic_inc_not_zero(&ctx->num_inflight))) return -EIO; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/cmd_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/cmd_tracepoint.h new file mode 100644 index 000000000000..406ebe17405f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/cmd_tracepoint.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mlx5 + +#if !defined(_MLX5_CMD_TP_H_) || defined(TRACE_HEADER_MULTI_READ) +#define _MLX5_CMD_TP_H_ + +#include +#include + +TRACE_EVENT(mlx5_cmd, + TP_PROTO(const char *command_str, u16 opcode, u16 op_mod, + const char *status_str, u8 status, u32 syndrome, int err), + TP_ARGS(command_str, opcode, op_mod, status_str, status, syndrome, err), + TP_STRUCT__entry(__string(command_str, command_str) + __field(u16, opcode) + __field(u16, op_mod) + __string(status_str, status_str) + __field(u8, status) + __field(u32, syndrome) + __field(int, err) + ), + TP_fast_assign(__assign_str(command_str, command_str); + __entry->opcode = opcode; + __entry->op_mod = op_mod; + __assign_str(status_str, status_str); + __entry->status = status; + __entry->syndrome = syndrome; + __entry->err = err; + ), + TP_printk("%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x), err(%d)", + __get_str(command_str), __entry->opcode, __entry->op_mod, + __get_str(status_str), __entry->status, __entry->syndrome, + __entry->err) +); + +#endif /* _MLX5_CMD_TP_H_ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ./diag +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE cmd_tracepoint +#include diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index af2ceb4160bc..06cbad166225 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -981,6 +981,7 @@ struct mlx5_async_work { struct mlx5_async_ctx *ctx; mlx5_async_cbk_t user_callback; u16 opcode; /* cmd opcode */ + u16 op_mod; /* cmd op_mod */ void *out; /* pointer to the cmd output buffer */ }; -- cgit From aaf2e65cac7f2e1ae729c2fbc849091df9699f96 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 17 Nov 2022 09:07:20 +0200 Subject: net/mlx5: Fix handling of entry refcount when command is not issued to FW In case command interface is down, or the command is not allowed, driver did not increment the entry refcount, but might have decrement as 
part of forced completion handling. Fix that by always incrementing and decrementing the refcount to make it symmetric for all flows. Fixes: 50b2412b7e78 ("net/mlx5: Avoid possible free of command entry while timeout comp handler") Signed-off-by: Eran Ben Elisha Signed-off-by: Moshe Shemesh Reported-by: Jack Wang Tested-by: Jack Wang Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index df3e284ca5c6..74bd05e5dda2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1005,6 +1005,7 @@ static void cmd_work_handler(struct work_struct *work) cmd_ent_get(ent); set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state); + cmd_ent_get(ent); /* for the _real_ FW event on completion */ /* Skip sending command to fw if internal error */ if (mlx5_cmd_is_down(dev) || !opcode_allowed(&dev->cmd, ent->op)) { ent->ret = -ENXIO; @@ -1012,7 +1013,6 @@ static void cmd_work_handler(struct work_struct *work) return; } - cmd_ent_get(ent); /* for the _real_ FW event on completion */ /* ring doorbell after the descriptor is valid */ mlx5_core_dbg(dev, "writing 0x%x to command doorbell\n", 1 << ent->idx); wmb(); @@ -1661,8 +1661,8 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force cmd_ent_put(ent); /* timeout work was canceled */ if (!forced || /* Real FW completion */ - pci_channel_offline(dev->pdev) || /* FW is inaccessible */ - dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + mlx5_cmd_is_down(dev) || /* No real FW completion is expected */ + !opcode_allowed(cmd, ent->op)) cmd_ent_put(ent); ent->ts2 = ktime_get_ns(); -- cgit From 0d4e8ed139d871fcb2844dd71075997753baeec8 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 15 Aug 2022 11:25:26 +0300 Subject: net/mlx5: Lag, avoid lockdep warnings ldev->lock is
used to serialize lag change operations. Since multiport eswtich functionality was added, we now change the mode dynamically. However, acquiring ldev->lock is not allowed as it could possibly lead to a deadlock as reported by the lockdep mechanism. [ 836.154963] WARNING: possible circular locking dependency detected [ 836.155850] 5.19.0-rc5_net_56b7df2 #1 Not tainted [ 836.156549] ------------------------------------------------------ [ 836.157418] handler1/12198 is trying to acquire lock: [ 836.158178] ffff888187d52b58 (&ldev->lock){+.+.}-{3:3}, at: mlx5_lag_do_mirred+0x3b/0x70 [mlx5_core] [ 836.159575] [ 836.159575] but task is already holding lock: [ 836.160474] ffff8881d4de2930 (&block->cb_lock){++++}-{3:3}, at: tc_setup_cb_add+0x5b/0x200 [ 836.161669] which lock already depends on the new lock. [ 836.162905] [ 836.162905] the existing dependency chain (in reverse order) is: [ 836.164008] -> #3 (&block->cb_lock){++++}-{3:3}: [ 836.164946] down_write+0x25/0x60 [ 836.165548] tcf_block_get_ext+0x1c6/0x5d0 [ 836.166253] ingress_init+0x74/0xa0 [sch_ingress] [ 836.167028] qdisc_create.constprop.0+0x130/0x5e0 [ 836.167805] tc_modify_qdisc+0x481/0x9f0 [ 836.168490] rtnetlink_rcv_msg+0x16e/0x5a0 [ 836.169189] netlink_rcv_skb+0x4e/0xf0 [ 836.169861] netlink_unicast+0x190/0x250 [ 836.170543] netlink_sendmsg+0x243/0x4b0 [ 836.171226] sock_sendmsg+0x33/0x40 [ 836.171860] ____sys_sendmsg+0x1d1/0x1f0 [ 836.172535] ___sys_sendmsg+0xab/0xf0 [ 836.173183] __sys_sendmsg+0x51/0x90 [ 836.173836] do_syscall_64+0x3d/0x90 [ 836.174471] entry_SYSCALL_64_after_hwframe+0x46/0xb0 [ 836.175282] [ 836.175282] -> #2 (rtnl_mutex){+.+.}-{3:3}: [ 836.176190] __mutex_lock+0x6b/0xf80 [ 836.176830] register_netdevice_notifier+0x21/0x120 [ 836.177631] rtnetlink_init+0x2d/0x1e9 [ 836.178289] netlink_proto_init+0x163/0x179 [ 836.178994] do_one_initcall+0x63/0x300 [ 836.179672] kernel_init_freeable+0x2cb/0x31b [ 836.180403] kernel_init+0x17/0x140 [ 836.181035] ret_from_fork+0x1f/0x30 [ 836.181687] -> 
#1 (pernet_ops_rwsem){+.+.}-{3:3}: [ 836.182628] down_write+0x25/0x60 [ 836.183235] unregister_netdevice_notifier+0x1c/0xb0 [ 836.184029] mlx5_ib_roce_cleanup+0x94/0x120 [mlx5_ib] [ 836.184855] __mlx5_ib_remove+0x35/0x60 [mlx5_ib] [ 836.185637] mlx5_eswitch_unregister_vport_reps+0x22f/0x440 [mlx5_core] [ 836.186698] auxiliary_bus_remove+0x18/0x30 [ 836.187409] device_release_driver_internal+0x1f6/0x270 [ 836.188253] bus_remove_device+0xef/0x160 [ 836.188939] device_del+0x18b/0x3f0 [ 836.189562] mlx5_rescan_drivers_locked+0xd6/0x2d0 [mlx5_core] [ 836.190516] mlx5_lag_remove_devices+0x69/0xe0 [mlx5_core] [ 836.191414] mlx5_do_bond_work+0x441/0x620 [mlx5_core] [ 836.192278] process_one_work+0x25c/0x590 [ 836.192963] worker_thread+0x4f/0x3d0 [ 836.193609] kthread+0xcb/0xf0 [ 836.194189] ret_from_fork+0x1f/0x30 [ 836.194826] -> #0 (&ldev->lock){+.+.}-{3:3}: [ 836.195734] __lock_acquire+0x15b8/0x2a10 [ 836.196426] lock_acquire+0xce/0x2d0 [ 836.197057] __mutex_lock+0x6b/0xf80 [ 836.197708] mlx5_lag_do_mirred+0x3b/0x70 [mlx5_core] [ 836.198575] tc_act_parse_mirred+0x25b/0x800 [mlx5_core] [ 836.199467] parse_tc_actions+0x168/0x5a0 [mlx5_core] [ 836.200340] __mlx5e_add_fdb_flow+0x263/0x480 [mlx5_core] [ 836.201241] mlx5e_configure_flower+0x8a0/0x1820 [mlx5_core] [ 836.202187] tc_setup_cb_add+0xd7/0x200 [ 836.202856] fl_hw_replace_filter+0x14c/0x1f0 [cls_flower] [ 836.203739] fl_change+0xbbe/0x1730 [cls_flower] [ 836.204501] tc_new_tfilter+0x407/0xd90 [ 836.205168] rtnetlink_rcv_msg+0x406/0x5a0 [ 836.205877] netlink_rcv_skb+0x4e/0xf0 [ 836.206535] netlink_unicast+0x190/0x250 [ 836.207217] netlink_sendmsg+0x243/0x4b0 [ 836.207915] sock_sendmsg+0x33/0x40 [ 836.208538] ____sys_sendmsg+0x1d1/0x1f0 [ 836.209219] ___sys_sendmsg+0xab/0xf0 [ 836.209878] __sys_sendmsg+0x51/0x90 [ 836.210510] do_syscall_64+0x3d/0x90 [ 836.211137] entry_SYSCALL_64_after_hwframe+0x46/0xb0 [ 836.211954] other info that might help us debug this: [ 836.213174] Chain exists of: [ 836.213174] &ldev->lock --> 
rtnl_mutex --> &block->cb_lock 836.214650] Possible unsafe locking scenario: [ 836.214650] [ 836.215574] CPU0 CPU1 [ 836.216255] ---- ---- [ 836.216943] lock(&block->cb_lock); [ 836.217518] lock(rtnl_mutex); [ 836.218348] lock(&block->cb_lock); [ 836.219212] lock(&ldev->lock); [ 836.219758] [ 836.219758] *** DEADLOCK *** [ 836.219758] [ 836.220747] 2 locks held by handler1/12198: [ 836.221390] #0: ffff8881d4de2930 (&block->cb_lock){++++}-{3:3}, at: tc_setup_cb_add+0x5b/0x200 [ 836.222646] #1: ffff88810c9a92c0 (&esw->mode_lock){++++}-{3:3}, at: mlx5_esw_hold+0x39/0x50 [mlx5_core] [ 836.224063] stack backtrace: [ 836.224799] CPU: 6 PID: 12198 Comm: handler1 Not tainted 5.19.0-rc5_net_56b7df2 #1 [ 836.225923] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 836.227476] Call Trace: [ 836.227929] [ 836.228332] dump_stack_lvl+0x57/0x7d [ 836.228924] check_noncircular+0x104/0x120 [ 836.229562] __lock_acquire+0x15b8/0x2a10 [ 836.230201] lock_acquire+0xce/0x2d0 [ 836.230776] ? mlx5_lag_do_mirred+0x3b/0x70 [mlx5_core] [ 836.231614] ? find_held_lock+0x2b/0x80 [ 836.232221] __mutex_lock+0x6b/0xf80 [ 836.232799] ? mlx5_lag_do_mirred+0x3b/0x70 [mlx5_core] [ 836.233636] ? mlx5_lag_do_mirred+0x3b/0x70 [mlx5_core] [ 836.234451] ? xa_load+0xc3/0x190 [ 836.234995] mlx5_lag_do_mirred+0x3b/0x70 [mlx5_core] [ 836.235803] tc_act_parse_mirred+0x25b/0x800 [mlx5_core] [ 836.236636] ? tc_act_can_offload_mirred+0x135/0x210 [mlx5_core] [ 836.237550] parse_tc_actions+0x168/0x5a0 [mlx5_core] [ 836.238364] __mlx5e_add_fdb_flow+0x263/0x480 [mlx5_core] [ 836.239202] mlx5e_configure_flower+0x8a0/0x1820 [mlx5_core] [ 836.240076] ? lock_acquire+0xce/0x2d0 [ 836.240668] ? tc_setup_cb_add+0x5b/0x200 [ 836.241294] tc_setup_cb_add+0xd7/0x200 [ 836.241917] fl_hw_replace_filter+0x14c/0x1f0 [cls_flower] [ 836.242709] fl_change+0xbbe/0x1730 [cls_flower] [ 836.243408] tc_new_tfilter+0x407/0xd90 [ 836.244043] ? 
tc_del_tfilter+0x880/0x880 [ 836.244672] rtnetlink_rcv_msg+0x406/0x5a0 [ 836.245310] ? netlink_deliver_tap+0x7a/0x4b0 [ 836.245991] ? if_nlmsg_stats_size+0x2b0/0x2b0 [ 836.246675] netlink_rcv_skb+0x4e/0xf0 [ 836.258046] netlink_unicast+0x190/0x250 [ 836.258669] netlink_sendmsg+0x243/0x4b0 [ 836.259288] sock_sendmsg+0x33/0x40 [ 836.259857] ____sys_sendmsg+0x1d1/0x1f0 [ 836.260473] ___sys_sendmsg+0xab/0xf0 [ 836.261064] ? lock_acquire+0xce/0x2d0 [ 836.261669] ? find_held_lock+0x2b/0x80 [ 836.262272] ? __fget_files+0xb9/0x190 [ 836.262871] ? __fget_files+0xd3/0x190 [ 836.263462] __sys_sendmsg+0x51/0x90 [ 836.264064] do_syscall_64+0x3d/0x90 [ 836.264652] entry_SYSCALL_64_after_hwframe+0x46/0xb0 [ 836.265425] RIP: 0033:0x7fdbe5e2677d [ 836.266012] Code: 28 89 54 24 1c 48 89 74 24 10 89 7c 24 08 e8 ba ee ff ff 8b 54 24 1c 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 48 89 44 24 08 e8 ee ee ff ff 48 [ 836.268485] RSP: 002b:00007fdbe48a75a0 EFLAGS: 00000293 ORIG_RAX: 000000000000002e [ 836.269598] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fdbe5e2677d [ 836.270576] RDX: 0000000000000000 RSI: 00007fdbe48a7640 RDI: 000000000000003c [ 836.271565] RBP: 00007fdbe48a8368 R08: 0000000000000000 R09: 0000000000000000 [ 836.272546] R10: 00007fdbe48a84b0 R11: 0000000000000293 R12: 0000557bd17dc860 [ 836.273527] R13: 0000000000000000 R14: 0000557bd17dc860 R15: 00007fdbe48a7640 [ 836.274521] To avoid using mode holding ldev->lock in the configure flow, we queue a work to the lag workqueue and cease wait on a completion object. In addition, we remove the lock from mlx5_lag_do_mirred() since it is not really protecting anything. It should be noted that an actual deadlock has not been observed. 
Signed-off-by: Eli Cohen Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 3 +- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h | 14 ++- .../net/ethernet/mellanox/mlx5/core/lag/mpesw.c | 100 +++++++++++++-------- .../net/ethernet/mellanox/mlx5/core/lag/mpesw.h | 1 - 4 files changed, 78 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index a9f4ede4a9bf..be1307a63e6d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -228,9 +228,8 @@ static void mlx5_ldev_free(struct kref *ref) if (ldev->nb.notifier_call) unregister_netdevice_notifier_net(&init_net, &ldev->nb); mlx5_lag_mp_cleanup(ldev); - mlx5_lag_mpesw_cleanup(ldev); - cancel_work_sync(&ldev->mpesw_work); destroy_workqueue(ldev->wq); + mlx5_lag_mpesw_cleanup(ldev); mutex_destroy(&ldev->lock); kfree(ldev); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index ce2ce8ccbd70..f30ac2de639f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -50,6 +50,19 @@ struct lag_tracker { enum netdev_lag_hash hash_type; }; +enum mpesw_op { + MLX5_MPESW_OP_ENABLE, + MLX5_MPESW_OP_DISABLE, +}; + +struct mlx5_mpesw_work_st { + struct work_struct work; + struct mlx5_lag *lag; + enum mpesw_op op; + struct completion comp; + int result; +}; + /* LAG data of a ConnectX card. * It serves both its phys functions. 
*/ @@ -66,7 +79,6 @@ struct mlx5_lag { struct lag_tracker tracker; struct workqueue_struct *wq; struct delayed_work bond_work; - struct work_struct mpesw_work; struct notifier_block nb; struct lag_mp lag_mp; struct mlx5_lag_port_sel port_sel; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c index f643202b29c6..c17e8f1ec914 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c @@ -7,63 +7,95 @@ #include "eswitch.h" #include "lib/mlx5.h" -void mlx5_mpesw_work(struct work_struct *work) +static int add_mpesw_rule(struct mlx5_lag *ldev) { - struct mlx5_lag *ldev = container_of(work, struct mlx5_lag, mpesw_work); + struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; + int err; - mutex_lock(&ldev->lock); - mlx5_disable_lag(ldev); - mutex_unlock(&ldev->lock); -} + if (atomic_add_return(1, &ldev->lag_mpesw.mpesw_rule_count) != 1) + return 0; -static void mlx5_lag_disable_mpesw(struct mlx5_core_dev *dev) -{ - struct mlx5_lag *ldev = dev->priv.lag; + if (ldev->mode != MLX5_LAG_MODE_NONE) { + err = -EINVAL; + goto out_err; + } - if (!queue_work(ldev->wq, &ldev->mpesw_work)) - mlx5_core_warn(dev, "failed to queue work\n"); + err = mlx5_activate_lag(ldev, NULL, MLX5_LAG_MODE_MPESW, false); + if (err) { + mlx5_core_warn(dev, "Failed to create LAG in MPESW mode (%d)\n", err); + goto out_err; + } + + return 0; + +out_err: + atomic_dec(&ldev->lag_mpesw.mpesw_rule_count); + return err; } -void mlx5_lag_del_mpesw_rule(struct mlx5_core_dev *dev) +static void del_mpesw_rule(struct mlx5_lag *ldev) { - struct mlx5_lag *ldev = dev->priv.lag; + if (!atomic_dec_return(&ldev->lag_mpesw.mpesw_rule_count) && + ldev->mode == MLX5_LAG_MODE_MPESW) + mlx5_disable_lag(ldev); +} - if (!ldev) - return; +static void mlx5_mpesw_work(struct work_struct *work) +{ + struct mlx5_mpesw_work_st *mpesww = container_of(work, struct mlx5_mpesw_work_st, work); + struct 
mlx5_lag *ldev = mpesww->lag; mutex_lock(&ldev->lock); - if (!atomic_dec_return(&ldev->lag_mpesw.mpesw_rule_count) && - ldev->mode == MLX5_LAG_MODE_MPESW) - mlx5_lag_disable_mpesw(dev); + if (mpesww->op == MLX5_MPESW_OP_ENABLE) + mpesww->result = add_mpesw_rule(ldev); + else if (mpesww->op == MLX5_MPESW_OP_DISABLE) + del_mpesw_rule(ldev); mutex_unlock(&ldev->lock); + + complete(&mpesww->comp); } -int mlx5_lag_add_mpesw_rule(struct mlx5_core_dev *dev) +static int mlx5_lag_mpesw_queue_work(struct mlx5_core_dev *dev, + enum mpesw_op op) { struct mlx5_lag *ldev = dev->priv.lag; + struct mlx5_mpesw_work_st *work; int err = 0; if (!ldev) return 0; - mutex_lock(&ldev->lock); - if (atomic_add_return(1, &ldev->lag_mpesw.mpesw_rule_count) != 1) - goto out; + work = kzalloc(sizeof(*work), GFP_KERNEL); + if (!work) + return -ENOMEM; - if (ldev->mode != MLX5_LAG_MODE_NONE) { + INIT_WORK(&work->work, mlx5_mpesw_work); + init_completion(&work->comp); + work->op = op; + work->lag = ldev; + + if (!queue_work(ldev->wq, &work->work)) { + mlx5_core_warn(dev, "failed to queue mpesw work\n"); err = -EINVAL; goto out; } - - err = mlx5_activate_lag(ldev, NULL, MLX5_LAG_MODE_MPESW, false); - if (err) - mlx5_core_warn(dev, "Failed to create LAG in MPESW mode (%d)\n", err); - + wait_for_completion(&work->comp); + err = work->result; out: - mutex_unlock(&ldev->lock); + kfree(work); return err; } +void mlx5_lag_del_mpesw_rule(struct mlx5_core_dev *dev) +{ + mlx5_lag_mpesw_queue_work(dev, MLX5_MPESW_OP_DISABLE); +} + +int mlx5_lag_add_mpesw_rule(struct mlx5_core_dev *dev) +{ + return mlx5_lag_mpesw_queue_work(dev, MLX5_MPESW_OP_ENABLE); +} + int mlx5_lag_do_mirred(struct mlx5_core_dev *mdev, struct net_device *out_dev) { struct mlx5_lag *ldev = mdev->priv.lag; @@ -71,12 +103,9 @@ int mlx5_lag_do_mirred(struct mlx5_core_dev *mdev, struct net_device *out_dev) if (!netif_is_bond_master(out_dev) || !ldev) return 0; - mutex_lock(&ldev->lock); - if (ldev->mode == MLX5_LAG_MODE_MPESW) { - 
mutex_unlock(&ldev->lock); + if (ldev->mode == MLX5_LAG_MODE_MPESW) return -EOPNOTSUPP; - } - mutex_unlock(&ldev->lock); + return 0; } @@ -90,11 +119,10 @@ bool mlx5_lag_mpesw_is_activated(struct mlx5_core_dev *dev) void mlx5_lag_mpesw_init(struct mlx5_lag *ldev) { - INIT_WORK(&ldev->mpesw_work, mlx5_mpesw_work); atomic_set(&ldev->lag_mpesw.mpesw_rule_count, 0); } void mlx5_lag_mpesw_cleanup(struct mlx5_lag *ldev) { - cancel_delayed_work_sync(&ldev->bond_work); + WARN_ON(atomic_read(&ldev->lag_mpesw.mpesw_rule_count)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h index be4abcb8fcd5..88e8daffcf92 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h @@ -12,7 +12,6 @@ struct lag_mpesw { atomic_t mpesw_rule_count; }; -void mlx5_mpesw_work(struct work_struct *work); int mlx5_lag_do_mirred(struct mlx5_core_dev *mdev, struct net_device *out_dev); bool mlx5_lag_mpesw_is_activated(struct mlx5_core_dev *dev); #if IS_ENABLED(CONFIG_MLX5_ESWITCH) -- cgit From 6d942e40448931be9371f1ba8cb592778807ce18 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Wed, 16 Nov 2022 11:10:15 +0200 Subject: net/mlx5: E-Switch, Set correctly vport destination The cited commit moved from using reformat_id integer to packet_reformat pointer which introduced the possibility of a null pointer dereference. When setting the packet reformat flag, the pkt_reformat pointer must exist, so checking MLX5_ESW_DEST_ENCAP is not enough; we need to make sure the pkt_reformat is valid and check for MLX5_ESW_DEST_ENCAP_VALID. If the dest encap valid flag does not exist, then pkt_reformat can be either an invalid address or null. Also, to make sure we don't try to access invalid pkt_reformat set it to null when invalidated and invalidate it before calling add flow code as it's logically more correct and to be safe.
Fixes: 2b688ea5efde ("net/mlx5: Add flow steering actions to fs_cmd shim layer") Signed-off-by: Roi Dayan Reviewed-by: Chris Mi Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c | 10 ++++++---- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c index 5aff97914367..5b6a79d2034e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c @@ -224,15 +224,16 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv, list_for_each_entry(flow, flow_list, tmp_list) { if (!mlx5e_is_offloaded_flow(flow) || flow_flag_test(flow, SLOW)) continue; - spec = &flow->attr->parse_attr->spec; - - /* update from encap rule to slow path rule */ - rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec); attr = mlx5e_tc_get_encap_attr(flow); esw_attr = attr->esw_attr; /* mark the flow's encap dest as non-valid */ esw_attr->dests[flow->tmp_entry_index].flags &= ~MLX5_ESW_DEST_ENCAP_VALID; + esw_attr->dests[flow->tmp_entry_index].pkt_reformat = NULL; + + /* update from encap rule to slow path rule */ + spec = &flow->attr->parse_attr->spec; + rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec); if (IS_ERR(rule)) { err = PTR_ERR(rule); @@ -251,6 +252,7 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv, /* we know that the encap is valid */ e->flags &= ~MLX5_ENCAP_ENTRY_VALID; mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat); + e->pkt_reformat = NULL; } static void mlx5e_take_tmp_flow(struct mlx5e_tc_flow *flow, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 728ca9f2bb9d..3fda75fe168c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -433,7 +433,7 @@ esw_setup_vport_dest(struct mlx5_flow_destination *dest, struct mlx5_flow_act *f mlx5_lag_mpesw_is_activated(esw->dev)) dest[dest_idx].type = MLX5_FLOW_DESTINATION_TYPE_UPLINK; } - if (esw_attr->dests[attr_idx].flags & MLX5_ESW_DEST_ENCAP) { + if (esw_attr->dests[attr_idx].flags & MLX5_ESW_DEST_ENCAP_VALID) { if (pkt_reformat) { flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; flow_act->pkt_reformat = esw_attr->dests[attr_idx].pkt_reformat; -- cgit From e1ad07b9227f9cbaf4bd2b6ec00b84c303657593 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sat, 29 Oct 2022 09:03:48 +0300 Subject: net/mlx5: Fix sync reset event handler error flow When sync reset now event handling fails on mlx5_pci_link_toggle() then no reset was done. However, since mlx5_cmd_fast_teardown_hca() was already done, the firmware function is closed and the driver is left without firmware functionality. Fix it by setting device error state and reopen the firmware resources. Reopening is done by the thread that was called for devlink reload fw_activate as it already holds the devlink lock. 
Fixes: 5ec697446f46 ("net/mlx5: Add support for devlink reload action fw activate") Signed-off-by: Moshe Shemesh Reviewed-by: Aya Levin Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c index 9d908a0ccfef..1e46f9afa40e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c @@ -9,7 +9,8 @@ enum { MLX5_FW_RESET_FLAGS_RESET_REQUESTED, MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, MLX5_FW_RESET_FLAGS_PENDING_COMP, - MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS + MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, + MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED }; struct mlx5_fw_reset { @@ -406,7 +407,7 @@ static void mlx5_sync_reset_now_event(struct work_struct *work) err = mlx5_pci_link_toggle(dev); if (err) { mlx5_core_warn(dev, "mlx5_pci_link_toggle failed, no reset done, err %d\n", err); - goto done; + set_bit(MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, &fw_reset->reset_flags); } mlx5_enter_error_state(dev, true); @@ -482,6 +483,10 @@ int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev) goto out; } err = fw_reset->ret; + if (test_and_clear_bit(MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, &fw_reset->reset_flags)) { + mlx5_unload_one_devl_locked(dev); + mlx5_load_one_devl_locked(dev, false); + } out: clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags); return err; -- cgit From 3e874cb1e0a376b0e682f82c1612ae89ab7867d7 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 27 Oct 2022 08:29:06 +0300 Subject: net/mlx5e: Fix missing alignment in size of MTT/KLM entries In the cited patch, an alignment required by the HW spec was mistakenly dropped. 
Bring it back to fix error completions like the below: mlx5_core 0000:00:08.0 eth2: Error cqe on cqn 0x40b, ci 0x0, qn 0x104f, opcode 0xd, syndrome 0x2, vendor syndrome 0x68 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000030: 00 00 00 00 86 00 68 02 25 00 10 4f 00 00 bb d2 WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0x0, len: 192 00000000: 00 00 00 25 00 10 4f 0c 00 00 00 00 00 18 2e 00 00000010: 90 00 00 00 00 02 00 00 00 00 00 00 20 00 00 00 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000040: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000050: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000060: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000070: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000080: 08 00 00 00 48 6a 00 02 08 00 00 00 0e 10 00 02 00000090: 08 00 00 00 0c db 00 02 08 00 00 00 0e 82 00 02 000000a0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 000000b0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 Fixes: 9f123f740428 ("net/mlx5e: Improve MTT/KSM alignment") Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index e3a4f01bcceb..5e41dfdf79c8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -206,10 +206,11 @@ static void mlx5e_disable_blocking_events(struct mlx5e_priv *priv) static u16 mlx5e_mpwrq_umr_octowords(u32 entries, enum mlx5e_mpwrq_umr_mode umr_mode) { u8 umr_entry_size = mlx5e_mpwrq_umr_entry_size(umr_mode); + u32 sz; - WARN_ON_ONCE(entries * umr_entry_size % MLX5_OCTWORD); + sz = 
ALIGN(entries * umr_entry_size, MLX5_UMR_MTT_ALIGNMENT); - return entries * umr_entry_size / MLX5_OCTWORD; + return sz / MLX5_OCTWORD; } static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq, -- cgit From f377422044b2093c835e5f3717f8c8c58da1db1f Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Thu, 17 Nov 2022 07:45:45 +0200 Subject: net/mlx5e: Offload rule only when all encaps are valid The cited commit adds a for loop to support multiple encapsulations. But it only checks if the last encap is valid. Fix it by setting slow path flag when one of the encap is invalid. Fixes: f493f15534ec ("net/mlx5e: Move flow attr reformat action bit to per dest flags") Signed-off-by: Chris Mi Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c | 6 ++---- .../net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h | 3 +-- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 17 ++++++----------- 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c index 5b6a79d2034e..ff73d25bc6eb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c @@ -764,8 +764,7 @@ int mlx5e_attach_encap(struct mlx5e_priv *priv, struct net_device *mirred_dev, int out_index, struct netlink_ext_ack *extack, - struct net_device **encap_dev, - bool *encap_valid) + struct net_device **encap_dev) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct mlx5e_tc_flow_parse_attr *parse_attr; @@ -880,9 +879,8 @@ attach_flow: if (e->flags & MLX5_ENCAP_ENTRY_VALID) { attr->esw_attr->dests[out_index].pkt_reformat = e->pkt_reformat; attr->esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_ENCAP_VALID; - *encap_valid = true; } else { - *encap_valid = false; + flow_flag_set(flow, SLOW); } mutex_unlock(&esw->offloads.encap_tbl_lock); diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h index d542b8476491..8ad273dde40e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h @@ -17,8 +17,7 @@ int mlx5e_attach_encap(struct mlx5e_priv *priv, struct net_device *mirred_dev, int out_index, struct netlink_ext_ack *extack, - struct net_device **encap_dev, - bool *encap_valid); + struct net_device **encap_dev); int mlx5e_attach_decap(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 5a6aa61ec82a..bd9936af4582 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1634,7 +1634,6 @@ set_encap_dests(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, struct mlx5_flow_attr *attr, struct netlink_ext_ack *extack, - bool *encap_valid, bool *vf_tun) { struct mlx5e_tc_flow_parse_attr *parse_attr; @@ -1651,7 +1650,6 @@ set_encap_dests(struct mlx5e_priv *priv, parse_attr = attr->parse_attr; esw_attr = attr->esw_attr; *vf_tun = false; - *encap_valid = true; for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) { struct net_device *out_dev; @@ -1668,7 +1666,7 @@ set_encap_dests(struct mlx5e_priv *priv, goto out; } err = mlx5e_attach_encap(priv, flow, attr, out_dev, out_index, - extack, &encap_dev, encap_valid); + extack, &encap_dev); dev_put(out_dev); if (err) goto out; @@ -1732,8 +1730,8 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, struct mlx5e_tc_flow_parse_attr *parse_attr; struct mlx5_flow_attr *attr = flow->attr; struct mlx5_esw_flow_attr *esw_attr; - bool vf_tun, encap_valid; u32 max_prio, max_chain; + bool vf_tun; int err = 0; parse_attr = attr->parse_attr; @@ -1823,7 +1821,7 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, esw_attr->int_port = int_port; } - err 
= set_encap_dests(priv, flow, attr, extack, &encap_valid, &vf_tun); + err = set_encap_dests(priv, flow, attr, extack, &vf_tun); if (err) goto err_out; @@ -1853,7 +1851,7 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, * (1) there's no error * (2) there's an encap action and we don't have valid neigh */ - if (!encap_valid || flow_flag_test(flow, SLOW)) + if (flow_flag_test(flow, SLOW)) flow->rule[0] = mlx5e_tc_offload_to_slow_path(esw, flow, &parse_attr->spec); else flow->rule[0] = mlx5e_tc_offload_fdb_rules(esw, flow, &parse_attr->spec, attr); @@ -3759,7 +3757,7 @@ alloc_flow_post_acts(struct mlx5e_tc_flow *flow, struct netlink_ext_ack *extack) struct mlx5e_post_act *post_act = get_post_action(flow->priv); struct mlx5_flow_attr *attr, *next_attr = NULL; struct mlx5e_post_act_handle *handle; - bool vf_tun, encap_valid = true; + bool vf_tun; int err; /* This is going in reverse order as needed. @@ -3781,13 +3779,10 @@ alloc_flow_post_acts(struct mlx5e_tc_flow *flow, struct netlink_ext_ack *extack) if (list_is_last(&attr->list, &flow->attrs)) break; - err = set_encap_dests(flow->priv, flow, attr, extack, &encap_valid, &vf_tun); + err = set_encap_dests(flow->priv, flow, attr, extack, &vf_tun); if (err) goto out_free; - if (!encap_valid) - flow_flag_set(flow, SLOW); - err = actions_prepare_mod_hdr_actions(flow->priv, flow, attr, extack); if (err) goto out_free; -- cgit From 11abca031ee34d8d50876e899cb2875d8fac01df Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 14 Nov 2022 11:56:11 +0200 Subject: net/mlx5e: Remove leftovers from old XSK queues enumeration Before the cited commit, for N channels, a dedicated set of N queues was created to support XSK, in indices [N, 2N-1], doubling the number of queues. In addition, changing the number of channels was prohibited, as it would shift the indices. Remove these two leftovers, as we moved XSK to a new queueing scheme, starting from index 0. 
Fixes: 3db4c85cde7a ("net/mlx5e: xsk: Use queue indices starting from 0 for XSK queues") Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 24aa25da482b..1728e197558d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -35,7 +35,6 @@ #include "en.h" #include "en/port.h" #include "en/params.h" -#include "en/xsk/pool.h" #include "en/ptp.h" #include "lib/clock.h" #include "en/fs_ethtool.h" @@ -412,15 +411,8 @@ void mlx5e_ethtool_get_channels(struct mlx5e_priv *priv, struct ethtool_channels *ch) { mutex_lock(&priv->state_lock); - ch->max_combined = priv->max_nch; ch->combined_count = priv->channels.params.num_channels; - if (priv->xsk.refcnt) { - /* The upper half are XSK queues. */ - ch->max_combined *= 2; - ch->combined_count *= 2; - } - mutex_unlock(&priv->state_lock); } @@ -454,16 +446,6 @@ int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv, mutex_lock(&priv->state_lock); - /* Don't allow changing the number of channels if there is an active - * XSK, because the numeration of the XSK and regular RQs will change. - */ - if (priv->xsk.refcnt) { - err = -EINVAL; - netdev_err(priv->netdev, "%s: AF_XDP is active, cannot change the number of channels\n", - __func__); - goto out; - } - /* Don't allow changing the number of channels if HTB offload is active, * because the numeration of the QoS SQs will change, while per-queue * qdiscs are attached. 
-- cgit From d20a56b0eb006096a023a59efccb27a277b38344 Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Sun, 30 Oct 2022 11:43:24 +0200 Subject: net/mlx5e: Fix MACsec SA initialization routine Currently as part of MACsec SA initialization routine extended packet number (EPN) object attribute is always being set without checking if EPN is actually enabled, the above could lead to a NULL dereference. Fix by adding such a check. Fixes: 4411a6c0abd3 ("net/mlx5e: Support MACsec offload extended packet number (EPN)") Signed-off-by: Emeel Hakim Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c index 2ef36cb9555a..8f8a735a4501 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -368,15 +368,15 @@ static int mlx5e_macsec_init_sa(struct macsec_context *ctx, obj_attrs.aso_pdn = macsec->aso.pdn; obj_attrs.epn_state = sa->epn_state; - if (is_tx) { - obj_attrs.ssci = cpu_to_be32((__force u32)ctx->sa.tx_sa->ssci); - key = &ctx->sa.tx_sa->key; - } else { - obj_attrs.ssci = cpu_to_be32((__force u32)ctx->sa.rx_sa->ssci); - key = &ctx->sa.rx_sa->key; + key = (is_tx) ? &ctx->sa.tx_sa->key : &ctx->sa.rx_sa->key; + + if (sa->epn_state.epn_enabled) { + obj_attrs.ssci = (is_tx) ? 
cpu_to_be32((__force u32)ctx->sa.tx_sa->ssci) : + cpu_to_be32((__force u32)ctx->sa.rx_sa->ssci); + + memcpy(&obj_attrs.salt, &key->salt, sizeof(key->salt)); } - memcpy(&obj_attrs.salt, &key->salt, sizeof(key->salt)); obj_attrs.replay_window = ctx->secy->replay_window; obj_attrs.replay_protect = ctx->secy->replay_protect; -- cgit From 94ffd6e0c7dbcffbcded79e283aefbee3499af96 Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Sun, 30 Oct 2022 11:52:42 +0200 Subject: net/mlx5e: Fix MACsec update SecY Currently updating SecY destroys and re-creates RX SA objects, the re-created RX SA objects are not identical to the destroyed objects and they disagree on the encryption enabled property which holds the value false after recreation, this value is not supported with offload which leads to no traffic after an update. Fix by recreating identical objects. Fixes: 5a39816a75e5 ("net/mlx5e: Add MACsec offload SecY support") Signed-off-by: Emeel Hakim Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c index 8f8a735a4501..4f96c69c6cc4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -1155,7 +1155,7 @@ static int macsec_upd_secy_hw_address(struct macsec_context *ctx, continue; if (rx_sa->active) { - err = mlx5e_macsec_init_sa(ctx, rx_sa, false, false); + err = mlx5e_macsec_init_sa(ctx, rx_sa, true, false); if (err) goto out; } -- cgit From 8514e325ef016e3fdabaa015ed1adaa6e6d8722a Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Sun, 30 Oct 2022 11:19:52 +0200 Subject: net/mlx5e: Fix possible race condition in macsec extended packet number update routine Currently extended packet number (EPN) update routine is accessing macsec object without holding
the general macsec lock hence facing a possible race condition when an EPN update occurs while updating or deleting the SA. Fix by holding the general macsec lock before accessing the object. Fixes: 4411a6c0abd3 ("net/mlx5e: Support MACsec offload extended packet number (EPN)") Signed-off-by: Emeel Hakim Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c index 4f96c69c6cc4..3dc6c987b8da 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -1536,6 +1536,8 @@ static void macsec_async_event(struct work_struct *work) async_work = container_of(work, struct mlx5e_macsec_async_work, work); macsec = async_work->macsec; + mutex_lock(&macsec->lock); + mdev = async_work->mdev; obj_id = async_work->obj_id; macsec_sa = get_macsec_tx_sa_from_obj_id(macsec, obj_id); @@ -1557,6 +1559,7 @@ static void macsec_async_event(struct work_struct *work) out_async_work: kfree(async_work); + mutex_unlock(&macsec->lock); } static int macsec_obj_change_event(struct notifier_block *nb, unsigned long event, void *data) -- cgit From 1f0dd412e34e177621769866bef347f0b22364df Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 18 Nov 2022 10:36:35 +0000 Subject: net: phy: at803x: fix error return code in at803x_probe() Fix to return a negative error code from the ccr read error handling case instead of 0, as done elsewhere in this function. 
Fixes: 3265f4218878 ("net: phy: at803x: add fiber support") Signed-off-by: Wei Yongjun Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20221118103635.254256-1-weiyongjun@huaweicloud.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/at803x.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index 349b7b1dbbf2..d49965907561 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -870,8 +870,10 @@ static int at803x_probe(struct phy_device *phydev) .wolopts = 0, }; - if (ccr < 0) + if (ccr < 0) { + ret = ccr; goto err; + } mode_cfg = ccr & AT803X_MODE_CFG_MASK; switch (mode_cfg) { -- cgit From 0e5d56c64afcd6fd2d132ea972605b66f8a7d3c4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 18 Nov 2022 16:45:00 -0500 Subject: tipc: set con sock in tipc_conn_alloc A crash was reported by Wei Chen: BUG: kernel NULL pointer dereference, address: 0000000000000018 RIP: 0010:tipc_conn_close+0x12/0x100 Call Trace: tipc_topsrv_exit_net+0x139/0x320 ops_exit_list.isra.9+0x49/0x80 cleanup_net+0x31a/0x540 process_one_work+0x3fa/0x9f0 worker_thread+0x42/0x5c0 It was caused by !con->sock in tipc_conn_close(). In tipc_topsrv_accept(), con is allocated in conn_idr then its sock is set: con = tipc_conn_alloc(); ... <----[1] con->sock = newsock; If tipc_conn_close() is called at any time during [1], the null-pointer-deref is triggered by con->sock->sk because con->sock is not yet set. This patch fixes it by moving the con->sock setting to tipc_conn_alloc() under s->idr_lock. So that con->sock can never be NULL when getting the con from s->conn_idr. It will also be safer to move con->server and flag CF_CONNECTED setting under s->idr_lock, as they should all be set before tipc_conn_alloc() is called.
Fixes: c5fa7b3cf3cb ("tipc: introduce new TIPC server infrastructure") Reported-by: Wei Chen Signed-off-by: Xin Long Acked-by: Jon Maloy Signed-off-by: Jakub Kicinski --- net/tipc/topsrv.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index d92ec92f0b71..b0f9aa521670 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -176,7 +176,7 @@ static void tipc_conn_close(struct tipc_conn *con) conn_put(con); } -static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s) +static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s, struct socket *sock) { struct tipc_conn *con; int ret; @@ -202,10 +202,11 @@ static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s) } con->conid = ret; s->idr_in_use++; - spin_unlock_bh(&s->idr_lock); set_bit(CF_CONNECTED, &con->flags); con->server = s; + con->sock = sock; + spin_unlock_bh(&s->idr_lock); return con; } @@ -467,7 +468,7 @@ static void tipc_topsrv_accept(struct work_struct *work) ret = kernel_accept(lsock, &newsock, O_NONBLOCK); if (ret < 0) return; - con = tipc_conn_alloc(srv); + con = tipc_conn_alloc(srv, newsock); if (IS_ERR(con)) { ret = PTR_ERR(con); sock_release(newsock); @@ -479,7 +480,6 @@ static void tipc_topsrv_accept(struct work_struct *work) newsk->sk_data_ready = tipc_conn_data_ready; newsk->sk_write_space = tipc_conn_write_space; newsk->sk_user_data = con; - con->sock = newsock; write_unlock_bh(&newsk->sk_callback_lock); /* Wake up receive process in case of 'SYN+' message */ @@ -577,12 +577,11 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, sub.filter = filter; *(u64 *)&sub.usr_handle = (u64)port; - con = tipc_conn_alloc(tipc_topsrv(net)); + con = tipc_conn_alloc(tipc_topsrv(net), NULL); if (IS_ERR(con)) return false; *conid = con->conid; - con->sock = NULL; rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub); if (rc >= 0) return true; -- cgit From a7b42969d63f47320853a802efd879fbdc4e010e Mon Sep 
17 00:00:00 2001 From: Xin Long Date: Fri, 18 Nov 2022 16:45:01 -0500 Subject: tipc: add an extra conn_get in tipc_conn_alloc One extra conn_get() is needed in tipc_conn_alloc(), as after tipc_conn_alloc() is called, tipc_conn_close() may free this con before dereferencing it in tipc_topsrv_accept(): tipc_conn_alloc(); newsk = newsock->sk; <---- tipc_conn_close(); write_lock_bh(&sk->sk_callback_lock); newsk->sk_data_ready = tipc_conn_data_ready; Then a use-after-free (UAF) issue can be triggered: BUG: KASAN: use-after-free in tipc_topsrv_accept+0x1e7/0x370 [tipc] Call Trace: dump_stack_lvl+0x33/0x46 print_report+0x178/0x4b0 kasan_report+0x8c/0x100 kasan_check_range+0x179/0x1e0 tipc_topsrv_accept+0x1e7/0x370 [tipc] process_one_work+0x6a3/0x1030 worker_thread+0x8a/0xdf0 This patch fixes it by taking a reference on the con in tipc_conn_alloc(), then releasing it in tipc_topsrv_accept() after all accesses are done. Note that when doing this in tipc_topsrv_kern_subscr(), as tipc_conn_rcv_sub() returns 0 or -1 only, we don't need to check for "> 0".
Fixes: c5fa7b3cf3cb ("tipc: introduce new TIPC server infrastructure") Signed-off-by: Xin Long Acked-by: Jon Maloy Signed-off-by: Jakub Kicinski --- net/tipc/topsrv.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index b0f9aa521670..e3b427a70398 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -206,6 +206,7 @@ static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s, struct socket *s set_bit(CF_CONNECTED, &con->flags); con->server = s; con->sock = sock; + conn_get(con); spin_unlock_bh(&s->idr_lock); return con; @@ -484,6 +485,7 @@ static void tipc_topsrv_accept(struct work_struct *work) /* Wake up receive process in case of 'SYN+' message */ newsk->sk_data_ready(newsk); + conn_put(con); } } @@ -583,10 +585,11 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, *conid = con->conid; rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub); - if (rc >= 0) - return true; + if (rc) + conn_put(con); + conn_put(con); - return false; + return !rc; } void tipc_topsrv_kern_unsubscr(struct net *net, int conid) -- cgit From cd0f6421162201e4b22ce757a1966729323185eb Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 19 Nov 2022 15:28:32 +0800 Subject: tipc: check skb_linearize() return value in tipc_disc_rcv() If skb_linearize() fails in tipc_disc_rcv(), we need to free the skb instead of handle it. 
Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address hash values") Signed-off-by: YueHaibing Acked-by: Jon Maloy Link: https://lore.kernel.org/r/20221119072832.7896-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/tipc/discover.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/tipc/discover.c b/net/tipc/discover.c index e8630707901e..e8dcdf267c0c 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -211,7 +211,10 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, u32 self; int err; - skb_linearize(skb); + if (skb_linearize(skb)) { + kfree_skb(skb); + return; + } hdr = buf_msg(skb); if (caps & TIPC_NODE_ID128) -- cgit From 30f158740984f9949765f6112456d62d2ca6deba Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 18 Nov 2022 14:27:29 -0800 Subject: ice: fix handling of burst Tx timestamps Commit 1229b33973c7 ("ice: Add low latency Tx timestamp read") refactored PTP timestamping logic to use a threaded IRQ instead of a separate kthread. This implementation introduced ice_misc_intr_thread_fn and redefined the ice_ptp_process_ts function interface to return a value of whether or not the timestamp processing was complete. ice_misc_intr_thread_fn would take the return value from ice_ptp_process_ts and convert it into either IRQ_HANDLED if there were no more timestamps to be processed, or IRQ_WAKE_THREAD if the thread should continue processing. This is not correct, as the kernel does not re-schedule threaded IRQ functions automatically. IRQ_WAKE_THREAD can only be used by the main IRQ function. This results in the ice_ptp_process_ts function (and in turn the ice_ptp_tx_tstamp function) from only being called exactly once per interrupt. If an application sends a burst of Tx timestamps without waiting for a response, the interrupt will trigger for the first timestamp. However, later timestamps may not have arrived yet. This can result in dropped or discarded timestamps. 
Worse, on E822 hardware this results in the interrupt logic getting stuck such that no future interrupts will be triggered. The result is complete loss of Tx timestamp functionality. Fix this by modifying the ice_misc_intr_thread_fn to perform its own polling of the ice_ptp_process_ts function. We sleep for a few microseconds between attempts to avoid wasting significant CPU time. The value was chosen to allow time for the Tx timestamps to complete without wasting so much time that we overrun application wait budgets in the worst case. The ice_ptp_process_ts function also currently returns false in the event that the Tx tracker is not initialized. This would result in the threaded IRQ handler never exiting if it gets started while the tracker is not initialized. Fix the function to appropriately return true when the tracker is not initialized. Note that this will not reproduce with default ptp4l behavior, as the program always synchronously waits for a timestamp response before sending another timestamp request. 
Reported-by: Siddaraju DH Fixes: 1229b33973c7 ("ice: Add low latency Tx timestamp read") Signed-off-by: Jacob Keller Tested-by: Gurucharan G (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20221118222729.1565317-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_main.c | 12 ++++++------ drivers/net/ethernet/intel/ice/ice_ptp.c | 20 ++++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 0f6718719453..ca2898467dcb 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -3145,15 +3145,15 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data) */ static irqreturn_t ice_misc_intr_thread_fn(int __always_unused irq, void *data) { - irqreturn_t ret = IRQ_HANDLED; struct ice_pf *pf = data; - bool irq_handled; - irq_handled = ice_ptp_process_ts(pf); - if (!irq_handled) - ret = IRQ_WAKE_THREAD; + if (ice_is_reset_in_progress(pf->state)) + return IRQ_HANDLED; - return ret; + while (!ice_ptp_process_ts(pf)) + usleep_range(50, 100); + + return IRQ_HANDLED; } /** diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 011b727ab190..0f668468d141 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -614,11 +614,14 @@ static u64 ice_ptp_extend_40b_ts(struct ice_pf *pf, u64 in_tstamp) * 2) extend the 40b timestamp value to get a 64bit timestamp * 3) send that timestamp to the stack * - * After looping, if we still have waiting SKBs, return true. This may cause us - * effectively poll even when not strictly necessary. We do this because it's - * possible a new timestamp was requested around the same time as the interrupt. - * In some cases hardware might not interrupt us again when the timestamp is - * captured. 
+ * Returns true if all timestamps were handled, and false if any slots remain + * without a timestamp. + * + * After looping, if we still have waiting SKBs, return false. This may cause + * us effectively poll even when not strictly necessary. We do this because + * it's possible a new timestamp was requested around the same time as the + * interrupt. In some cases hardware might not interrupt us again when the + * timestamp is captured. * * Note that we only take the tracking lock when clearing the bit and when * checking if we need to re-queue this task. The only place where bits can be @@ -641,7 +644,7 @@ static bool ice_ptp_tx_tstamp(struct ice_ptp_tx *tx) u8 idx; if (!tx->init) - return false; + return true; ptp_port = container_of(tx, struct ice_ptp_port, tx); pf = ptp_port_to_pf(ptp_port); @@ -2381,10 +2384,7 @@ s8 ice_ptp_request_ts(struct ice_ptp_tx *tx, struct sk_buff *skb) */ bool ice_ptp_process_ts(struct ice_pf *pf) { - if (pf->ptp.port.tx.init) - return ice_ptp_tx_tstamp(&pf->ptp.port.tx); - - return false; + return ice_ptp_tx_tstamp(&pf->ptp.port.tx); } static void ice_ptp_periodic_work(struct kthread_work *work) -- cgit From b97df039a68b2f3e848e238df5d5d06343ea497b Mon Sep 17 00:00:00 2001 From: Thomas Jarosch Date: Wed, 2 Nov 2022 11:18:48 +0100 Subject: xfrm: Fix oops in __xfrm_state_delete() Kernel 5.14 added a new "byseq" index to speed up xfrm_state lookups by sequence number in commit fe9f1d8779cb ("xfrm: add state hashtable keyed by seq") While the patch was thorough, the function pfkey_send_new_mapping() in net/af_key.c also modifies x->km.seq and never added the current xfrm_state to the "byseq" index. This leads to the following kernel Ooops: BUG: kernel NULL pointer dereference, address: 0000000000000000 .. RIP: 0010:__xfrm_state_delete+0xc9/0x1c0 .. Call Trace: xfrm_state_delete+0x1e/0x40 xfrm_del_sa+0xb0/0x110 [xfrm_user] xfrm_user_rcv_msg+0x12d/0x270 [xfrm_user] ? remove_entity_load_avg+0x8a/0xa0 ? 
copy_to_user_state_extra+0x580/0x580 [xfrm_user] netlink_rcv_skb+0x51/0x100 xfrm_netlink_rcv+0x30/0x50 [xfrm_user] netlink_unicast+0x1a6/0x270 netlink_sendmsg+0x22a/0x480 __sys_sendto+0x1a6/0x1c0 ? __audit_syscall_entry+0xd8/0x130 ? __audit_syscall_exit+0x249/0x2b0 __x64_sys_sendto+0x23/0x30 do_syscall_64+0x3a/0x90 entry_SYSCALL_64_after_hwframe+0x61/0xcb Exact location of the crash in __xfrm_state_delete(): if (x->km.seq) hlist_del_rcu(&x->byseq); The hlist_node "byseq" was never populated. The bug only triggers if a new NAT traversal mapping (changed IP or port) is detected in esp_input_done2() / esp6_input_done2(), which in turn indirectly calls pfkey_send_new_mapping() *if* the kernel is compiled with CONFIG_NET_KEY and "af_key" is active. The PF_KEYv2 message SADB_X_NAT_T_NEW_MAPPING is not part of RFC 2367. Various implementations have been examined how they handle the "sadb_msg_seq" header field: - racoon (Android): does not process SADB_X_NAT_T_NEW_MAPPING - strongswan: does not care about sadb_msg_seq - openswan: does not care about sadb_msg_seq There is no standard how PF_KEYv2 sadb_msg_seq should be populated for SADB_X_NAT_T_NEW_MAPPING and it's not used in popular implementations either. Herbert Xu suggested we should just use the current km.seq value as is. This fixes the root cause of the oops since we no longer modify km.seq itself. The update of "km.seq" looks like a copy'n'paste error from pfkey_send_acquire(). SADB_ACQUIRE must indeed assign a unique km.seq number according to RFC 2367. It has been verified that code paths involving pfkey_send_acquire() don't cause the same Oops. PF_KEYv2 SADB_X_NAT_T_NEW_MAPPING support was originally added here: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git commit cbc3488685b20e7b2a98ad387a1a816aada569d8 Author: Derek Atkins AuthorDate: Wed Apr 2 13:21:02 2003 -0800 [IPSEC]: Implement UDP Encapsulation framework. In particular, implement ESPinUDP encapsulation for IPsec Nat Traversal. 
A note on triggering the bug: I was not able to trigger it using VMs. There is one VPN using a high latency link on our production VPN server that triggered it like once a day though. Link: https://github.com/strongswan/strongswan/issues/992 Link: https://lore.kernel.org/netdev/00959f33ee52c4b3b0084d42c430418e502db554.1652340703.git.antony.antony@secunet.com/T/ Link: https://lore.kernel.org/netdev/20221027142455.3975224-1-chenzhihao@meizu.com/T/ Fixes: fe9f1d8779cb ("xfrm: add state hashtable keyed by seq") Reported-by: Roth Mark Reported-by: Zhihao Chen Tested-by: Roth Mark Signed-off-by: Thomas Jarosch Acked-by: Antony Antony Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/key/af_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index 213287814328..95edcbedf6ef 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3394,7 +3394,7 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; - hdr->sadb_msg_seq = x->km.seq = get_acqseq(); + hdr->sadb_msg_seq = x->km.seq; hdr->sadb_msg_pid = 0; /* SA */ -- cgit From 40781bfb836eda57d19c0baa37c7e72590e05fdc Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 3 Nov 2022 17:07:13 +0800 Subject: xfrm: Fix ignored return value in xfrm6_init() When the IPv6 module is initializing in xfrm6_init(), register_pernet_subsys() may fail, but its return value is ignored. If IPv6 initialization fails later and xfrm6_fini() is called, removing the uninitialized list in xfrm6_net_ops will cause a null-ptr-deref: KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 1 PID: 330 Comm: insmod RIP: 0010:unregister_pernet_operations+0xc9/0x450 Call Trace: unregister_pernet_subsys+0x31/0x3e xfrm6_fini+0x16/0x30 [ipv6] ip6_route_init+0xcd/0x128 [ipv6] inet6_init+0x29c/0x602 [ipv6] ...
Fix it by catching the error return value of register_pernet_subsys(). Fixes: 8d068875caca ("xfrm: make gc_thresh configurable in all namespaces") Signed-off-by: Chen Zhongjin Reviewed-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_policy.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 4a4b0e49ec92..ea435eba3053 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -287,9 +287,13 @@ int __init xfrm6_init(void) if (ret) goto out_state; - register_pernet_subsys(&xfrm6_net_ops); + ret = register_pernet_subsys(&xfrm6_net_ops); + if (ret) + goto out_protocol; out: return ret; +out_protocol: + xfrm6_protocol_fini(); out_state: xfrm6_state_fini(); out_policy: -- cgit From e541dd7763fc34aec2f93f652a396cc2e7b92d8d Mon Sep 17 00:00:00 2001 From: Wang ShaoBo Date: Fri, 18 Nov 2022 14:24:47 +0800 Subject: net: wwan: iosm: use ACPI_FREE() but not kfree() in ipc_pcie_read_bios_cfg() acpi_evaluate_dsm() should be coupled with ACPI_FREE() to free the ACPI memory, because we need to track the allocation of acpi_object when ACPI_DBG_TRACK_ALLOCATIONS enabled, so use ACPI_FREE() instead of kfree(). 
Fixes: d38a648d2d6c ("net: wwan: iosm: fix memory leak in ipc_pcie_read_bios_cfg") Signed-off-by: Wang ShaoBo Link: https://lore.kernel.org/r/20221118062447.2324881-1-bobo.shaobowang@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/wwan/iosm/iosm_ipc_pcie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wwan/iosm/iosm_ipc_pcie.c b/drivers/net/wwan/iosm/iosm_ipc_pcie.c index d3d34d1c4704..5bf5a93937c9 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_pcie.c +++ b/drivers/net/wwan/iosm/iosm_ipc_pcie.c @@ -249,7 +249,7 @@ static enum ipc_pcie_sleep_state ipc_pcie_read_bios_cfg(struct device *dev) if (object->integer.value == 3) sleep_state = IPC_PCIE_D3L2; - kfree(object); + ACPI_FREE(object); default_ret: return sleep_state; -- cgit From aad98abd5cb8133507f22654f56bcb443aaa2d89 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 17 Nov 2022 15:50:09 +0800 Subject: sfc: fix potential memleak in __ef100_hard_start_xmit() The __ef100_hard_start_xmit() returns NETDEV_TX_OK without freeing skb in error handling case, add dev_kfree_skb_any() to fix it. 
Fixes: 51b35a454efd ("sfc: skeleton EF100 PF driver") Signed-off-by: Zhang Changzhong Acked-by: Martin Habets Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/1668671409-10909-1-git-send-email-zhangchangzhong@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/sfc/ef100_netdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/sfc/ef100_netdev.c b/drivers/net/ethernet/sfc/ef100_netdev.c index 88fa29572e23..ddcc325ed570 100644 --- a/drivers/net/ethernet/sfc/ef100_netdev.c +++ b/drivers/net/ethernet/sfc/ef100_netdev.c @@ -218,6 +218,7 @@ netdev_tx_t __ef100_hard_start_xmit(struct sk_buff *skb, skb->len, skb->data_len, channel->channel); if (!efx->n_channels || !efx->n_tx_channels || !channel) { netif_stop_queue(net_dev); + dev_kfree_skb_any(skb); goto err; } -- cgit From 4305fe232b8aa59af3761adc9fe6b6aa40913960 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Thu, 17 Nov 2022 20:59:18 +0800 Subject: net: sparx5: fix error handling in sparx5_port_open() If phylink_of_phy_connect() fails, the port should be disabled. If sparx5_serdes_set()/phy_power_on() fails, the port should be disabled and the phylink should be stopped and disconnected. 
Fixes: 946e7fd5053a ("net: sparx5: add port module support") Fixes: f3cad2611a77 ("net: sparx5: add hostmode with phylink support") Signed-off-by: Liu Jian Tested-by: Bjarni Jonasson Reviewed-by: Steen Hegelund Link: https://lore.kernel.org/r/20221117125918.203997-1-liujian56@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microchip/sparx5/sparx5_netdev.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_netdev.c b/drivers/net/ethernet/microchip/sparx5/sparx5_netdev.c index 19516ccad533..d078156581d5 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_netdev.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_netdev.c @@ -104,7 +104,7 @@ static int sparx5_port_open(struct net_device *ndev) err = phylink_of_phy_connect(port->phylink, port->of_node, 0); if (err) { netdev_err(ndev, "Could not attach to PHY\n"); - return err; + goto err_connect; } phylink_start(port->phylink); @@ -116,10 +116,20 @@ static int sparx5_port_open(struct net_device *ndev) err = sparx5_serdes_set(port->sparx5, port, &port->conf); else err = phy_power_on(port->serdes); - if (err) + if (err) { netdev_err(ndev, "%s failed\n", __func__); + goto out_power; + } } + return 0; + +out_power: + phylink_stop(port->phylink); + phylink_disconnect_phy(port->phylink); +err_connect: + sparx5_port_enable(port, false); + return err; } -- cgit From 8427fd100c7b7793650e212a81e42f1cf124613d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 18 Nov 2022 16:33:03 -0500 Subject: net: sched: allow act_ct to be built without NF_NAT In commit f11fe1dae1c4 ("net/sched: Make NET_ACT_CT depends on NF_NAT"), it fixed the build failure when NF_NAT is m and NET_ACT_CT is y by adding depends on NF_NAT for NET_ACT_CT. However, it would also cause NET_ACT_CT cannot be built without NF_NAT, which is not expected. This patch fixes it by changing to use "(!NF_NAT || NF_NAT)" as the depend. 
Fixes: f11fe1dae1c4 ("net/sched: Make NET_ACT_CT depends on NF_NAT") Signed-off-by: Xin Long Link: https://lore.kernel.org/r/b6386f28d1ba34721795fb776a91cbdabb203447.1668807183.git.lucien.xin@gmail.com Signed-off-by: Paolo Abeni --- net/sched/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 1e8ab4749c6c..4662a6ce8a7e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -976,7 +976,7 @@ config NET_ACT_TUNNEL_KEY config NET_ACT_CT tristate "connection tracking tc action" - depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE + depends on NET_CLS_ACT && NF_CONNTRACK && (!NF_NAT || NF_NAT) && NF_FLOW_TABLE help Say Y here to allow sending the packets to conntrack module. -- cgit From 53270fb0fd77fe786d8c07a0793981d797836b93 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 18 Nov 2022 16:24:19 +0800 Subject: NFC: nci: fix memory leak in nci_rx_data_packet() Syzbot reported a memory leak about skb: unreferenced object 0xffff88810e144e00 (size 240): comm "syz-executor284", pid 3701, jiffies 4294952403 (age 12.620s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] __alloc_skb+0x1f9/0x270 net/core/skbuff.c:497 [] alloc_skb include/linux/skbuff.h:1267 [inline] [] virtual_ncidev_write+0x24/0xe0 drivers/nfc/virtual_ncidev.c:116 [] do_loop_readv_writev fs/read_write.c:759 [inline] [] do_loop_readv_writev fs/read_write.c:743 [inline] [] do_iter_write+0x253/0x300 fs/read_write.c:863 [] vfs_writev+0xdd/0x240 fs/read_write.c:934 [] do_writev+0xa6/0x1c0 fs/read_write.c:977 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd In nci_rx_data_packet(), if we don't get a valid conn_info, we will return directly but forget to release the skb. 
Reported-by: syzbot+cdb9a427d1bc08815104@syzkaller.appspotmail.com Fixes: 4aeee6871e8c ("NFC: nci: Add dynamic logical connections support") Signed-off-by: Liu Shixin Link: https://lore.kernel.org/r/20221118082419.239475-1-liushixin2@huawei.com Signed-off-by: Paolo Abeni --- net/nfc/nci/data.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index aa5e712adf07..3d36ea5701f0 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -279,8 +279,10 @@ void nci_rx_data_packet(struct nci_dev *ndev, struct sk_buff *skb) nci_plen(skb->data)); conn_info = nci_get_conn_info_by_conn_id(ndev, nci_conn_id(skb->data)); - if (!conn_info) + if (!conn_info) { + kfree_skb(skb); return; + } /* strip the nci data header */ skb_pull(skb, NCI_DATA_HDR_SIZE); -- cgit From 3637a29ccbb6461b7268c5c5db525935d510afc6 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Sat, 19 Nov 2022 15:02:02 +0800 Subject: bnx2x: fix pci device refcount leak in bnx2x_vf_is_pcie_pending() As comment of pci_get_domain_bus_and_slot() says, it returns a pci device with refcount increment, when finish using it, the caller must decrement the reference count by calling pci_dev_put(). Call pci_dev_put() before returning from bnx2x_vf_is_pcie_pending() to avoid refcount leak. 
Fixes: b56e9670ffa4 ("bnx2x: Prepare device and initialize VF database") Suggested-by: Jakub Kicinski Signed-off-by: Yang Yingliang Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20221119070202.1407648-1-yangyingliang@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index 11d15cd03600..77d4cb4ad782 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -795,16 +795,20 @@ static void bnx2x_vf_enable_traffic(struct bnx2x *bp, struct bnx2x_virtf *vf) static u8 bnx2x_vf_is_pcie_pending(struct bnx2x *bp, u8 abs_vfid) { - struct pci_dev *dev; struct bnx2x_virtf *vf = bnx2x_vf_by_abs_fid(bp, abs_vfid); + struct pci_dev *dev; + bool pending; if (!vf) return false; dev = pci_get_domain_bus_and_slot(vf->domain, vf->bus, vf->devfn); - if (dev) - return bnx2x_is_pcie_pending(dev); - return false; + if (!dev) + return false; + pending = bnx2x_is_pcie_pending(dev); + pci_dev_put(dev); + + return pending; } int bnx2x_vf_flr_clnup_epilog(struct bnx2x *bp, u8 abs_vfid) -- cgit From bb3cfbaf7c6416f3109fdb14f6fc0eb1a50361ad Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Sat, 19 Nov 2022 21:36:16 +0800 Subject: octeontx2-pf: Remove duplicate MACSEC setting Commit 4581dd480c9e ("net: octeontx2-pf: mcs: consider MACSEC setting") has already added "depends on MACSEC || !MACSEC", so remove it. 
Signed-off-by: Zheng Bin Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20221119133616.3583538-1-zhengbin13@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/Kconfig b/drivers/net/ethernet/marvell/octeontx2/Kconfig index 6b4f640163f7..993ac180a5db 100644 --- a/drivers/net/ethernet/marvell/octeontx2/Kconfig +++ b/drivers/net/ethernet/marvell/octeontx2/Kconfig @@ -32,7 +32,6 @@ config OCTEONTX2_PF tristate "Marvell OcteonTX2 NIC Physical Function driver" select OCTEONTX2_MBOX select NET_DEVLINK - depends on MACSEC || !MACSEC depends on (64BIT && COMPILE_TEST) || ARM64 select DIMLIB depends on PCI -- cgit From 2dc4ac91f845b690ddf2ad39172c3698b2769fa2 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Sat, 19 Nov 2022 22:18:25 +0100 Subject: tsnep: Fix rotten packets If PTP synchronisation is done every second, then sporadically the interval is higher than one second: ptp4l[696.582]: master offset -17 s2 freq -1891 path delay 573 ptp4l[697.582]: master offset -22 s2 freq -1901 path delay 573 ptp4l[699.368]: master offset -1 s2 freq -1887 path delay 573 ^^^^^^^ Should be 698.582! This problem is caused by rotten packets, which are received after polling but before interrupts are enabled again. This can be fixed by checking for pending work and rescheduling if necessary after interrupts have been enabled again.
Fixes: 403f69bbdbad ("tsnep: Add TSN endpoint Ethernet MAC driver") Signed-off-by: Gerhard Engleder Link: https://lore.kernel.org/r/20221119211825.81805-1-gerhard@engleder-embedded.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/engleder/tsnep_main.c | 57 +++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c index 48fb391951dd..13d5ff4e0e02 100644 --- a/drivers/net/ethernet/engleder/tsnep_main.c +++ b/drivers/net/ethernet/engleder/tsnep_main.c @@ -542,6 +542,27 @@ static bool tsnep_tx_poll(struct tsnep_tx *tx, int napi_budget) return (budget != 0); } +static bool tsnep_tx_pending(struct tsnep_tx *tx) +{ + unsigned long flags; + struct tsnep_tx_entry *entry; + bool pending = false; + + spin_lock_irqsave(&tx->lock, flags); + + if (tx->read != tx->write) { + entry = &tx->entry[tx->read]; + if ((__le32_to_cpu(entry->desc_wb->properties) & + TSNEP_TX_DESC_OWNER_MASK) == + (entry->properties & TSNEP_TX_DESC_OWNER_MASK)) + pending = true; + } + + spin_unlock_irqrestore(&tx->lock, flags); + + return pending; +} + static int tsnep_tx_open(struct tsnep_adapter *adapter, void __iomem *addr, int queue_index, struct tsnep_tx *tx) { @@ -821,6 +842,19 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi, return done; } +static bool tsnep_rx_pending(struct tsnep_rx *rx) +{ + struct tsnep_rx_entry *entry; + + entry = &rx->entry[rx->read]; + if ((__le32_to_cpu(entry->desc_wb->properties) & + TSNEP_DESC_OWNER_COUNTER_MASK) == + (entry->properties & TSNEP_DESC_OWNER_COUNTER_MASK)) + return true; + + return false; +} + static int tsnep_rx_open(struct tsnep_adapter *adapter, void __iomem *addr, int queue_index, struct tsnep_rx *rx) { @@ -866,6 +900,17 @@ static void tsnep_rx_close(struct tsnep_rx *rx) tsnep_rx_ring_cleanup(rx); } +static bool tsnep_pending(struct tsnep_queue *queue) +{ + if (queue->tx && tsnep_tx_pending(queue->tx)) + 
return true; + + if (queue->rx && tsnep_rx_pending(queue->rx)) + return true; + + return false; +} + static int tsnep_poll(struct napi_struct *napi, int budget) { struct tsnep_queue *queue = container_of(napi, struct tsnep_queue, @@ -886,9 +931,19 @@ static int tsnep_poll(struct napi_struct *napi, int budget) if (!complete) return budget; - if (likely(napi_complete_done(napi, done))) + if (likely(napi_complete_done(napi, done))) { tsnep_enable_irq(queue->adapter, queue->irq_mask); + /* reschedule if work is already pending, prevent rotten packets + * which are transmitted or received after polling but before + * interrupt enable + */ + if (tsnep_pending(queue)) { + tsnep_disable_irq(queue->adapter, queue->irq_mask); + napi_schedule(napi); + } + } + return min(done, budget - 1); } -- cgit From 6a66ce44a51bdfc47721f0c591137df2d4b21247 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 22 Nov 2022 20:18:58 +0100 Subject: netfilter: ipset: restore allowing 64 clashing elements in hash:net,iface The commit 510841da1fcc ("netfilter: ipset: enforce documented limit to prevent allocating huge memory") was too strict and prevented to add up to 64 clashing elements to a hash:net,iface type of set. This patch fixes the issue and now the type behaves as documented. 
Fixes: 510841da1fcc ("netfilter: ipset: enforce documented limit to prevent allocating huge memory") Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 3adc291d9ce1..7499192af586 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -916,7 +916,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, #ifdef IP_SET_HASH_WITH_MULTI if (h->bucketsize >= AHASH_MAX_TUNED) goto set_full; - else if (h->bucketsize < multi) + else if (h->bucketsize <= multi) h->bucketsize += AHASH_INIT_SIZE; #endif if (n->size >= AHASH_MAX(h)) { -- cgit From bcd9e3c1656d0f7dd9743598c65c3ae24efb38d0 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 21 Nov 2022 19:26:15 +0100 Subject: netfilter: flowtable_offload: add missing locking nf_flow_table_block_setup and the driver TC_SETUP_FT call can modify the flow block cb list while they are being traversed elsewhere, causing a crash. 
Add a write lock around the calls to protect readers Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Reported-by: Chad Monroe Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index b04645ced89b..00b522890d77 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -1098,6 +1098,7 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, struct flow_block_cb *block_cb, *next; int err = 0; + down_write(&flowtable->flow_block_lock); switch (cmd) { case FLOW_BLOCK_BIND: list_splice(&bo->cb_list, &flowtable->flow_block.cb_list); @@ -1112,6 +1113,7 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, WARN_ON_ONCE(1); err = -EOPNOTSUPP; } + up_write(&flowtable->flow_block_lock); return err; } @@ -1168,7 +1170,9 @@ static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, extack); + down_write(&flowtable->flow_block_lock); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo); + up_write(&flowtable->flow_block_lock); if (err < 0) return err; -- cgit From 77934dc6db0d2b111a8f2759e9ad2fb67f5cffa5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 17:49:11 -0800 Subject: dccp/tcp: Reset saddr on failure after inet6?_hash_connect(). When connect() is called on a socket bound to the wildcard address, we change the socket's saddr to a local address. If the socket fails to connect() to the destination, we have to reset the saddr. However, when an error occurs after inet6?_hash_connect() in (dccp|tcp)_v[46]_connect(), we forget to reset saddr and leave the socket bound to the address. From the user's point of view, whether saddr is reset or not varies with errno.
Let's fix this inconsistent behaviour. Note that after this patch, the repro [0] will trigger the WARN_ON() in inet_csk_get_port() again, but this patch is not buggy and rather fixes a bug papering over the bhash2's bug for which we need another fix. For the record, the repro causes -EADDRNOTAVAIL in inet_hash6_connect() by this sequence: s1 = socket() s1.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1) s1.bind(('127.0.0.1', 10000)) s1.sendto(b'hello', MSG_FASTOPEN, (('127.0.0.1', 10000))) # or s1.connect(('127.0.0.1', 10000)) s2 = socket() s2.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1) s2.bind(('0.0.0.0', 10000)) s2.connect(('127.0.0.1', 10000)) # -EADDRNOTAVAIL s2.listen(32) # WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2); [0]: https://syzkaller.appspot.com/bug?extid=015d756bbd1f8b5c8f09 Fixes: 3df80d9320bc ("[DCCP]: Introduce DCCPv6") Fixes: 7c657876b63c ("[DCCP]: Initial implementation") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/dccp/ipv4.c | 2 ++ net/dccp/ipv6.c | 2 ++ net/ipv4/tcp_ipv4.c | 2 ++ net/ipv6/tcp_ipv6.c | 2 ++ 4 files changed, 8 insertions(+) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 713b7b8dad7e..40640c26680e 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -157,6 +157,8 @@ failure: * This unhashes the socket and releases the local port, if necessary. 
*/ dccp_set_state(sk, DCCP_CLOSED); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index e57b43006074..626166cb6d7e 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -985,6 +985,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, late_failure: dccp_set_state(sk, DCCP_CLOSED); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); __sk_dst_reset(sk); failure: inet->inet_dport = 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 87d440f47a70..6a3a732b584d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -343,6 +343,8 @@ failure: * if necessary. */ tcp_set_state(sk, TCP_CLOSE); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2a3f9296df1e..81b396e5cf79 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -359,6 +359,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, late_failure: tcp_set_state(sk, TCP_CLOSE); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); failure: inet->inet_dport = 0; sk->sk_route_caps = 0; -- cgit From 8acdad37cd13ce777b0811b2d332314037fcefa8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 17:49:12 -0800 Subject: dccp/tcp: Remove NULL check for prev_saddr in inet_bhash2_update_saddr(). When we call inet_bhash2_update_saddr(), prev_saddr is always non-NULL. Let's remove the unnecessary test. 
Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/ipv4/inet_hashtables.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 033bf3c2538f..d745f962745e 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -877,13 +877,10 @@ int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct soc head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); - if (prev_saddr) { - spin_lock_bh(&prev_saddr->lock); - __sk_del_bind2_node(sk); - inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, - inet_csk(sk)->icsk_bind2_hash); - spin_unlock_bh(&prev_saddr->lock); - } + spin_lock_bh(&prev_saddr->lock); + __sk_del_bind2_node(sk); + inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); + spin_unlock_bh(&prev_saddr->lock); spin_lock_bh(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); -- cgit From 8c5dae4c1a49489499e6708c7dd284370ca36287 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 17:49:13 -0800 Subject: dccp/tcp: Update saddr under bhash's lock. When we call connect() for a socket bound to a wildcard address, we update saddr locklessly. However, it could result in a data race; another thread iterating over bhash might see a corrupted address. Let's update saddr under the bhash bucket's lock. 
Fixes: 3df80d9320bc ("[DCCP]: Introduce DCCPv6") Fixes: 7c657876b63c ("[DCCP]: Initial implementation") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 2 +- net/dccp/ipv4.c | 22 ++++----------------- net/dccp/ipv6.c | 23 ++++------------------ net/ipv4/af_inet.c | 11 +---------- net/ipv4/inet_hashtables.c | 45 +++++++++++++++++++++++++++++++++++++------ net/ipv4/tcp_ipv4.c | 20 ++++--------------- net/ipv6/tcp_ipv6.c | 19 +++--------------- 7 files changed, 56 insertions(+), 86 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 3af1e927247d..ba06e8b52264 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -281,7 +281,7 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in * sk_v6_rcv_saddr (ipv6) changes after it has been binded. The socket's * rcv_saddr field should already have been updated when this is called. 
*/ -int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk); +int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family); void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 40640c26680e..95e376e3b911 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -45,11 +45,10 @@ static unsigned int dccp_v4_pernet_id __read_mostly; int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; - __be32 daddr, nexthop, prev_sk_rcv_saddr; struct inet_sock *inet = inet_sk(sk); struct dccp_sock *dp = dccp_sk(sk); __be16 orig_sport, orig_dport; + __be32 daddr, nexthop; struct flowi4 *fl4; struct rtable *rt; int err; @@ -91,26 +90,13 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) daddr = fl4->daddr; if (inet->inet_saddr == 0) { - if (inet_csk(sk)->icsk_bind2_hash) { - prev_addr_hashbucket = - inet_bhashfn_portaddr(&dccp_hashinfo, sk, - sock_net(sk), - inet->inet_num); - prev_sk_rcv_saddr = sk->sk_rcv_saddr; - } - inet->inet_saddr = fl4->saddr; - } - - sk_rcv_saddr_set(sk, inet->inet_saddr); - - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); + err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); if (err) { - inet->inet_saddr = 0; - sk_rcv_saddr_set(sk, prev_sk_rcv_saddr); ip_rt_put(rt); return err; } + } else { + sk_rcv_saddr_set(sk, inet->inet_saddr); } inet->inet_dport = usin->sin_port; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 626166cb6d7e..94c101ed57a9 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -934,26 +934,11 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, } if (saddr == NULL) { - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; - struct in6_addr 
prev_v6_rcv_saddr; - - if (icsk->icsk_bind2_hash) { - prev_addr_hashbucket = inet_bhashfn_portaddr(&dccp_hashinfo, - sk, sock_net(sk), - inet->inet_num); - prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - } - saddr = &fl6.saddr; - sk->sk_v6_rcv_saddr = *saddr; - - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); - if (err) { - sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr; - goto failure; - } - } + + err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); + if (err) + goto failure; } /* set the source address */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4728087c42a5..0da679411330 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1230,7 +1230,6 @@ EXPORT_SYMBOL(inet_unregister_protosw); static int inet_sk_reselect_saddr(struct sock *sk) { - struct inet_bind_hashbucket *prev_addr_hashbucket; struct inet_sock *inet = inet_sk(sk); __be32 old_saddr = inet->inet_saddr; __be32 daddr = inet->inet_daddr; @@ -1260,16 +1259,8 @@ static int inet_sk_reselect_saddr(struct sock *sk) return 0; } - prev_addr_hashbucket = - inet_bhashfn_portaddr(tcp_or_dccp_get_hashinfo(sk), sk, - sock_net(sk), inet->inet_num); - - inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; - - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); + err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET); if (err) { - inet->inet_saddr = old_saddr; - inet->inet_rcv_saddr = old_saddr; ip_rt_put(rt); return err; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index d745f962745e..18ef370af113 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -858,14 +858,34 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } -int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk) +static void inet_update_saddr(struct sock *sk, void *saddr, int family) +{ + if (family == AF_INET) { + 
inet_sk(sk)->inet_saddr = *(__be32 *)saddr; + sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); + } +#if IS_ENABLED(CONFIG_IPV6) + else { + sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; + } +#endif +} + +int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); - struct inet_bind_hashbucket *head2; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); + int bhash; + + if (!inet_csk(sk)->icsk_bind2_hash) { + /* Not bind()ed before. */ + inet_update_saddr(sk, saddr, family); + return 0; + } /* Allocate a bind2 bucket ahead of time to avoid permanently putting * the bhash2 table in an inconsistent state if a new tb2 bucket @@ -875,14 +895,25 @@ int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct soc if (!new_tb2) return -ENOMEM; + bhash = inet_bhashfn(net, port, hinfo->bhash_size); + head = &hinfo->bhash[bhash]; head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); - spin_lock_bh(&prev_saddr->lock); + /* If we change saddr locklessly, another thread + * iterating over bhash might see corrupted address. 
+ */ + spin_lock_bh(&head->lock); + + spin_lock(&head2->lock); __sk_del_bind2_node(sk); inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); - spin_unlock_bh(&prev_saddr->lock); + spin_unlock(&head2->lock); + + inet_update_saddr(sk, saddr, family); - spin_lock_bh(&head2->lock); + head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); + + spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = new_tb2; @@ -890,7 +921,9 @@ int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct soc } sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; - spin_unlock_bh(&head2->lock); + spin_unlock(&head2->lock); + + spin_unlock_bh(&head->lock); if (tb2 != new_tb2) kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6a3a732b584d..23dd7e9df2d5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -199,15 +199,14 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, /* This will initiate an outgoing connection. 
*/ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct inet_timewait_death_row *tcp_death_row; - __be32 daddr, nexthop, prev_sk_rcv_saddr; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct ip_options_rcu *inet_opt; struct net *net = sock_net(sk); __be16 orig_sport, orig_dport; + __be32 daddr, nexthop; struct flowi4 *fl4; struct rtable *rt; int err; @@ -251,24 +250,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!inet->inet_saddr) { - if (inet_csk(sk)->icsk_bind2_hash) { - prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo, - sk, net, inet->inet_num); - prev_sk_rcv_saddr = sk->sk_rcv_saddr; - } - inet->inet_saddr = fl4->saddr; - } - - sk_rcv_saddr_set(sk, inet->inet_saddr); - - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); + err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); if (err) { - inet->inet_saddr = 0; - sk_rcv_saddr_set(sk, prev_sk_rcv_saddr); ip_rt_put(rt); return err; } + } else { + sk_rcv_saddr_set(sk, inet->inet_saddr); } if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 81b396e5cf79..2f3ca3190d26 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -292,24 +292,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!saddr) { - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; - struct in6_addr prev_v6_rcv_saddr; - - if (icsk->icsk_bind2_hash) { - prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo, - sk, net, inet->inet_num); - prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - } saddr = &fl6.saddr; - sk->sk_v6_rcv_saddr = *saddr; - if (prev_addr_hashbucket) { 
- err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); - if (err) { - sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr; - goto failure; - } - } + err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); + if (err) + goto failure; } /* set the source address */ -- cgit From e0833d1fedb02f038b526ae7dde178a076f56545 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 17:49:14 -0800 Subject: dccp/tcp: Fixup bhash2 bucket when connect() fails. If a socket bound to a wildcard address fails to connect(), we only reset saddr and keep the port. Then, we have to fix up the bhash2 bucket; otherwise, the bucket has an inconsistent address in the list. Also, listen() for such a socket will fire the WARN_ON() in inet_csk_get_port(). [0] Note that when a system runs out of memory, we give up fixing the bucket and unlink sk from bhash and bhash2 by inet_put_port(). [0]: WARNING: CPU: 0 PID: 207 at net/ipv4/inet_connection_sock.c:548 inet_csk_get_port (net/ipv4/inet_connection_sock.c:548 (discriminator 1)) Modules linked in: CPU: 0 PID: 207 Comm: bhash2_prev_rep Not tainted 6.1.0-rc3-00799-gc8421681c845 #63 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.amzn2022.0.1 04/01/2014 RIP: 0010:inet_csk_get_port (net/ipv4/inet_connection_sock.c:548 (discriminator 1)) Code: 74 a7 eb 93 48 8b 54 24 18 0f b7 cb 4c 89 e6 4c 89 ff e8 48 b2 ff ff 49 8b 87 18 04 00 00 e9 32 ff ff ff 0f 0b e9 34 ff ff ff <0f> 0b e9 42 ff ff ff 41 8b 7f 50 41 8b 4f 54 89 fe 81 f6 00 00 ff RSP: 0018:ffffc900003d7e50 EFLAGS: 00010202 RAX: ffff8881047fb500 RBX: 0000000000004e20 RCX: 0000000000000000 RDX: 000000000000000a RSI: 00000000fffffe00 RDI: 00000000ffffffff RBP: ffffffff8324dc00 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000 R13: 0000000000000001 R14: 0000000000004e20 R15: ffff8881054e1280 FS: 00007f8ac04dc740(0000) GS:ffff88842fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 
CR2: 0000000020001540 CR3: 00000001055fa003 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: inet_csk_listen_start (net/ipv4/inet_connection_sock.c:1205) inet_listen (net/ipv4/af_inet.c:228) __sys_listen (net/socket.c:1810) __x64_sys_listen (net/socket.c:1819 net/socket.c:1817 net/socket.c:1817) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) RIP: 0033:0x7f8ac051de5d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 93 af 1b 00 f7 d8 64 89 01 48 RSP: 002b:00007ffc1c177248 EFLAGS: 00000206 ORIG_RAX: 0000000000000032 RAX: ffffffffffffffda RBX: 0000000020001550 RCX: 00007f8ac051de5d RDX: ffffffffffffff80 RSI: 0000000000000000 RDI: 0000000000000004 RBP: 00007ffc1c177270 R08: 0000000000000018 R09: 0000000000000007 R10: 0000000020001540 R11: 0000000000000206 R12: 00007ffc1c177388 R13: 0000000000401169 R14: 0000000000403e18 R15: 00007f8ac0723000 Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Reported-by: syzbot Reported-by: Mat Martineau Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 1 + net/dccp/ipv4.c | 3 +-- net/dccp/ipv6.c | 3 +-- net/dccp/proto.c | 3 +-- net/ipv4/inet_hashtables.c | 38 ++++++++++++++++++++++++++++++++++---- net/ipv4/tcp.c | 3 +-- net/ipv4/tcp_ipv4.c | 3 +-- net/ipv6/tcp_ipv6.c | 3 +-- 8 files changed, 41 insertions(+), 16 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index ba06e8b52264..69174093078f 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -282,6 +282,7 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, 
const struct net *net, in * rcv_saddr field should already have been updated when this is called. */ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family); +void inet_bhash2_reset_saddr(struct sock *sk); void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 95e376e3b911..b780827f5e0a 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -143,8 +143,7 @@ failure: * This unhashes the socket and releases the local port, if necessary. */ dccp_set_state(sk, DCCP_CLOSED); - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 94c101ed57a9..602f3432d80b 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -970,8 +970,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, late_failure: dccp_set_state(sk, DCCP_CLOSED); - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); __sk_dst_reset(sk); failure: inet->inet_dport = 0; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index c548ca3e9b0e..85e35c5e8890 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -279,8 +279,7 @@ int dccp_disconnect(struct sock *sk, int flags) inet->inet_dport = 0; - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 18ef370af113..3cec471a2cd2 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -871,7 +871,7 @@ static void inet_update_saddr(struct sock *sk, void *saddr, int family) #endif } -int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) +static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, 
bool reset) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; @@ -883,7 +883,11 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) if (!inet_csk(sk)->icsk_bind2_hash) { /* Not bind()ed before. */ - inet_update_saddr(sk, saddr, family); + if (reset) + inet_reset_saddr(sk); + else + inet_update_saddr(sk, saddr, family); + return 0; } @@ -892,8 +896,19 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) * allocation fails. */ new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); - if (!new_tb2) + if (!new_tb2) { + if (reset) { + /* The (INADDR_ANY, port) bucket might have already + * been freed, then we cannot fixup icsk_bind2_hash, + * so we give up and unlink sk from bhash/bhash2 not + * to leave inconsistency in bhash2. + */ + inet_put_port(sk); + inet_reset_saddr(sk); + } + return -ENOMEM; + } bhash = inet_bhashfn(net, port, hinfo->bhash_size); head = &hinfo->bhash[bhash]; @@ -909,7 +924,10 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); spin_unlock(&head2->lock); - inet_update_saddr(sk, saddr, family); + if (reset) + inet_reset_saddr(sk); + else + inet_update_saddr(sk, saddr, family); head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); @@ -930,8 +948,20 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) return 0; } + +int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) +{ + return __inet_bhash2_update_saddr(sk, saddr, family, false); +} EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); +void inet_bhash2_reset_saddr(struct sock *sk) +{ + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + __inet_bhash2_update_saddr(sk, NULL, 0, true); +} +EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr); + /* RFC 6056 3.3.4. 
Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 54836a6b81d6..4f2205756cfe 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3114,8 +3114,7 @@ int tcp_disconnect(struct sock *sk, int flags) inet->inet_dport = 0; - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 23dd7e9df2d5..da46357f501b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -331,8 +331,7 @@ failure: * if necessary. */ tcp_set_state(sk, TCP_CLOSE); - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2f3ca3190d26..f0548dbcabd2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -346,8 +346,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, late_failure: tcp_set_state(sk, TCP_CLOSE); - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); failure: inet->inet_dport = 0; sk->sk_route_caps = 0; -- cgit From 3213f808ae21be3891885de2f3a775afafcda987 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Sun, 20 Nov 2022 11:54:05 +0800 Subject: net: ethernet: mtk_eth_soc: fix potential memory leak in mtk_rx_alloc() When dma_map_single() fails in mtk_rx_alloc(), the function returns directly. But the memory allocated for local variable data is not freed, and local variable data has not been attached to ring->data[i] yet, so the memory allocated for local variable data will not be freed outside mtk_rx_alloc() too. Thus memory leak would occur in this scenario. Add skb_free_frag(data) when dma_map_single() failed.
Fixes: 23233e577ef9 ("net: ethernet: mtk_eth_soc: rely on page_pool for single page buffers") Signed-off-by: Ziyang Xuan Acked-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/20221120035405.1464341-1-william.xuanziyang@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 1d1f2342e3ec..bbffd92089bf 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -2378,8 +2378,10 @@ static int mtk_rx_alloc(struct mtk_eth *eth, int ring_no, int rx_flag) data + NET_SKB_PAD + eth->ip_align, ring->buf_size, DMA_FROM_DEVICE); if (unlikely(dma_mapping_error(eth->dma_dev, - dma_addr))) + dma_addr))) { + skb_free_frag(data); return -ENOMEM; + } } rxd->rxd1 = (unsigned int)dma_addr; ring->data[i] = data; -- cgit From 8110437e59616293228cd781c486d8495a61e36a Mon Sep 17 00:00:00 2001 From: Yan Cangang Date: Sun, 20 Nov 2022 13:52:58 +0800 Subject: net: ethernet: mtk_eth_soc: fix resource leak in error path In mtk_probe(), when mtk_ppe_init() or mtk_eth_offload_init() failed, mtk_mdio_cleanup() isn't called. Fix it. 
Fixes: ba37b7caf1ed ("net: ethernet: mtk_eth_soc: add support for initializing the PPE") Fixes: 502e84e2382d ("net: ethernet: mtk_eth_soc: add flow offloading support") Signed-off-by: Yan Cangang Reviewed-by: Leon Romanovsky Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index bbffd92089bf..ae073b431738 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -4147,13 +4147,13 @@ static int mtk_probe(struct platform_device *pdev) eth->soc->offload_version, i); if (!eth->ppe[i]) { err = -ENOMEM; - goto err_free_dev; + goto err_deinit_mdio; } } err = mtk_eth_offload_init(eth); if (err) - goto err_free_dev; + goto err_deinit_mdio; } for (i = 0; i < MTK_MAX_DEVS; i++) { -- cgit From 603ea5e7ffa73c7fac07d8713d97285990695213 Mon Sep 17 00:00:00 2001 From: Yan Cangang Date: Sun, 20 Nov 2022 13:52:59 +0800 Subject: net: ethernet: mtk_eth_soc: fix memory leak in error path In mtk_ppe_init(), when dmam_alloc_coherent() or devm_kzalloc() failed, the rhashtable ppe->l2_flows isn't destroyed. Fix it. In mtk_probe(), when mtk_ppe_init() or mtk_eth_offload_init() or register_netdev() failed, have the same problem. Fix it. 
Fixes: 33fc42de3327 ("net: ethernet: mtk_eth_soc: support creating mac address based offload entries") Signed-off-by: Yan Cangang Reviewed-by: Leon Romanovsky Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 9 +++++---- drivers/net/ethernet/mediatek/mtk_ppe.c | 19 +++++++++++++++++-- drivers/net/ethernet/mediatek/mtk_ppe.h | 1 + 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index ae073b431738..1d36619c5ec9 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -4147,13 +4147,13 @@ static int mtk_probe(struct platform_device *pdev) eth->soc->offload_version, i); if (!eth->ppe[i]) { err = -ENOMEM; - goto err_deinit_mdio; + goto err_deinit_ppe; } } err = mtk_eth_offload_init(eth); if (err) - goto err_deinit_mdio; + goto err_deinit_ppe; } for (i = 0; i < MTK_MAX_DEVS; i++) { @@ -4163,7 +4163,7 @@ static int mtk_probe(struct platform_device *pdev) err = register_netdev(eth->netdev[i]); if (err) { dev_err(eth->dev, "error bringing up device\n"); - goto err_deinit_mdio; + goto err_deinit_ppe; } else netif_info(eth, probe, eth->netdev[i], "mediatek frame engine at 0x%08lx, irq %d\n", @@ -4181,7 +4181,8 @@ static int mtk_probe(struct platform_device *pdev) return 0; -err_deinit_mdio: +err_deinit_ppe: + mtk_ppe_deinit(eth); mtk_mdio_cleanup(eth); err_free_dev: mtk_free_dev(eth); diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.c b/drivers/net/ethernet/mediatek/mtk_ppe.c index 2d8ca99f2467..784ecb2dc9fb 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe.c @@ -737,7 +737,7 @@ struct mtk_ppe *mtk_ppe_init(struct mtk_eth *eth, void __iomem *base, MTK_PPE_ENTRIES * soc->foe_entry_size, &ppe->foe_phys, GFP_KERNEL); if (!foe) - return NULL; + goto err_free_l2_flows; ppe->foe_table = foe; @@ -745,11 +745,26 @@ struct mtk_ppe *mtk_ppe_init(struct 
mtk_eth *eth, void __iomem *base, sizeof(*ppe->foe_flow); ppe->foe_flow = devm_kzalloc(dev, foe_flow_size, GFP_KERNEL); if (!ppe->foe_flow) - return NULL; + goto err_free_l2_flows; mtk_ppe_debugfs_init(ppe, index); return ppe; + +err_free_l2_flows: + rhashtable_destroy(&ppe->l2_flows); + return NULL; +} + +void mtk_ppe_deinit(struct mtk_eth *eth) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(eth->ppe); i++) { + if (!eth->ppe[i]) + return; + rhashtable_destroy(&eth->ppe[i]->l2_flows); + } } static void mtk_ppe_init_foe_table(struct mtk_ppe *ppe) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.h b/drivers/net/ethernet/mediatek/mtk_ppe.h index 0b7a67a958e4..a09c32539bcc 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.h +++ b/drivers/net/ethernet/mediatek/mtk_ppe.h @@ -304,6 +304,7 @@ struct mtk_ppe { struct mtk_ppe *mtk_ppe_init(struct mtk_eth *eth, void __iomem *base, int version, int index); +void mtk_ppe_deinit(struct mtk_eth *eth); void mtk_ppe_start(struct mtk_ppe *ppe); int mtk_ppe_stop(struct mtk_ppe *ppe); -- cgit From 568fe84940ac0e4e0b2cd7751b8b4911f7b9c215 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Sun, 20 Nov 2022 15:28:38 +0800 Subject: ipv4: Fix error return code in fib_table_insert() In fib_table_insert(), if the alias was already inserted but the node does not exist, the error code should be set before returning from the error handling path. Fixes: a6c76c17df02 ("ipv4: Notify route after insertion to the routing table") Signed-off-by: Ziyang Xuan Link: https://lore.kernel.org/r/20221120072838.2167047-1-william.xuanziyang@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_trie.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index c88bf856c443..74d403dbd2b4 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1381,8 +1381,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb, /* The alias was already inserted, so the node must exist. */ l = l ?
l : fib_find_node(t, &tp, key); - if (WARN_ON_ONCE(!l)) + if (WARN_ON_ONCE(!l)) { + err = -ENOENT; goto out_free_new_fa; + } if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) == new_fa) { -- cgit From 1c40cde6b5171d9c8dfc69be00464fd1c75e210b Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Sun, 20 Nov 2022 14:24:38 +0800 Subject: arcnet: fix potential memory leak in com20020_probe() In com20020_probe(), if com20020_config() fails, dev and info will not be freed, which will lead to a memory leak. This patch adds freeing dev and info after com20020_config() fails to fix this bug. Compile tested only. Fixes: 15b99ac17295 ("[PATCH] pcmcia: add return value to _config() functions") Signed-off-by: Wang Hai Signed-off-by: David S. Miller --- drivers/net/arcnet/com20020_cs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/arcnet/com20020_cs.c b/drivers/net/arcnet/com20020_cs.c index 24150c933fcb..dc3253b318da 100644 --- a/drivers/net/arcnet/com20020_cs.c +++ b/drivers/net/arcnet/com20020_cs.c @@ -113,6 +113,7 @@ static int com20020_probe(struct pcmcia_device *p_dev) struct com20020_dev *info; struct net_device *dev; struct arcnet_local *lp; + int ret = -ENOMEM; dev_dbg(&p_dev->dev, "com20020_attach()\n"); @@ -142,12 +143,18 @@ static int com20020_probe(struct pcmcia_device *p_dev) info->dev = dev; p_dev->priv = info; - return com20020_config(p_dev); + ret = com20020_config(p_dev); + if (ret) + goto fail_config; + + return 0; +fail_config: + free_arcdev(dev); fail_alloc_dev: kfree(info); fail_alloc_info: - return -ENOMEM; + return ret; } /* com20020_attach */ static void com20020_detach(struct pcmcia_device *link) -- cgit From bac81f40c2c1484a2bd416b3fbf983f6e76488cd Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Mon, 21 Nov 2022 03:32:26 +0000 Subject: net: dm9051: Fix missing dev_kfree_skb() in dm9051_loop_rx() The dm9051_loop_rx() returns without release skb when dm9051_stop_mrcmd() returns error, free the skb to 
avoid this leak. Fixes: 2dc95a4d30ed ("net: Add dm9051 driver") Signed-off-by: Yuan Can Reviewed-by: Maciej Fijalkowski Signed-off-by: David S. Miller --- drivers/net/ethernet/davicom/dm9051.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/davicom/dm9051.c b/drivers/net/ethernet/davicom/dm9051.c index a523ddda7609..de7105a84747 100644 --- a/drivers/net/ethernet/davicom/dm9051.c +++ b/drivers/net/ethernet/davicom/dm9051.c @@ -798,8 +798,10 @@ static int dm9051_loop_rx(struct board_info *db) } ret = dm9051_stop_mrcmd(db); - if (ret) + if (ret) { + dev_kfree_skb(skb); return ret; + } skb->protocol = eth_type_trans(skb, db->ndev); if (db->ndev->features & NETIF_F_RXCSUM) -- cgit From af295e854a4e3813ffbdef26dbb6a4d6226c3ea1 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 21 Nov 2022 09:54:26 +0100 Subject: l2tp: Don't sleep and disable BH under writer-side sk_callback_lock When holding a reader-writer spin lock we cannot sleep. Calling setup_udp_tunnel_sock() with write lock held violates this rule, because we end up calling percpu_down_read(), which might sleep, as syzbot reports [1]: __might_resched.cold+0x222/0x26b kernel/sched/core.c:9890 percpu_down_read include/linux/percpu-rwsem.h:49 [inline] cpus_read_lock+0x1b/0x140 kernel/cpu.c:310 static_key_slow_inc+0x12/0x20 kernel/jump_label.c:158 udp_tunnel_encap_enable include/net/udp_tunnel.h:187 [inline] setup_udp_tunnel_sock+0x43d/0x550 net/ipv4/udp_tunnel_core.c:81 l2tp_tunnel_register+0xc51/0x1210 net/l2tp/l2tp_core.c:1509 pppol2tp_connect+0xcdc/0x1a10 net/l2tp/l2tp_ppp.c:723 Trim the writer-side critical section for sk_callback_lock down to the minimum, so that it covers only operations on sk_user_data. Also, when grabbing the sk_callback_lock, we always need to disable BH, as Eric points out. 
Failing to do so leads to deadlocks because we acquire sk_callback_lock in softirq context, which can get stuck waiting on us if: 1) it runs on the same CPU, or CPU0 ---- lock(clock-AF_INET6); <Interrupt> lock(clock-AF_INET6); 2) lock ordering leads to priority inversion CPU0: lock(clock-AF_INET6); CPU1: local_irq_disable(); CPU1: lock(&tcp_hashinfo.bhash[i].lock); CPU1: lock(clock-AF_INET6); CPU0: <Interrupt> lock(&tcp_hashinfo.bhash[i].lock); ... as syzbot reports [2,3]. Use the _bh variants for write_(un)lock. [1] https://lore.kernel.org/netdev/0000000000004e78ec05eda79749@google.com/ [2] https://lore.kernel.org/netdev/000000000000e38b6605eda76f98@google.com/ [3] https://lore.kernel.org/netdev/000000000000dfa31e05eda76f75@google.com/ v2: - Check and set sk_user_data while holding sk_callback_lock for both L2TP encapsulation types (IP and UDP) (Tetsuo) Cc: Tom Parkin Cc: Tetsuo Handa Fixes: b68777d54fac ("l2tp: Serialize access to sk_user_data with sk_callback_lock") Reported-by: Eric Dumazet Reported-by: syzbot+703d9e154b3b58277261@syzkaller.appspotmail.com Reported-by: syzbot+50680ced9e98a61f7698@syzkaller.appspotmail.com Reported-by: syzbot+de987172bb74a381879b@syzkaller.appspotmail.com Signed-off-by: Jakub Sitnicki Signed-off-by: David S.
Miller --- net/l2tp/l2tp_core.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 754fdda8a5f5..9a1415fe3fa7 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1474,11 +1474,12 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, } sk = sock->sk; - write_lock(&sk->sk_callback_lock); - + write_lock_bh(&sk->sk_callback_lock); ret = l2tp_validate_socket(sk, net, tunnel->encap); if (ret < 0) - goto err_sock; + goto err_inval_sock; + rcu_assign_sk_user_data(sk, tunnel); + write_unlock_bh(&sk->sk_callback_lock); tunnel->l2tp_net = net; pn = l2tp_pernet(net); @@ -1507,8 +1508,6 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, }; setup_udp_tunnel_sock(net, sock, &udp_cfg); - } else { - rcu_assign_sk_user_data(sk, tunnel); } tunnel->old_sk_destruct = sk->sk_destruct; @@ -1522,16 +1521,18 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, if (tunnel->fd >= 0) sockfd_put(sock); - write_unlock(&sk->sk_callback_lock); return 0; err_sock: + write_lock_bh(&sk->sk_callback_lock); + rcu_assign_sk_user_data(sk, NULL); +err_inval_sock: + write_unlock_bh(&sk->sk_callback_lock); + if (tunnel->fd < 0) sock_release(sock); else sockfd_put(sock); - - write_unlock(&sk->sk_callback_lock); err: return ret; } -- cgit From a487069e11b6527373f7c6f435d8998051d0b5d9 Mon Sep 17 00:00:00 2001 From: Davide Tronchin Date: Mon, 21 Nov 2022 13:54:55 +0100 Subject: net: usb: qmi_wwan: add u-blox 0x1342 composition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add RmNet support for LARA-L6. 
LARA-L6 module can be configured (by AT interface) in three different USB modes: * Default mode (Vendor ID: 0x1546 Product ID: 0x1341) with 4 serial interfaces * RmNet mode (Vendor ID: 0x1546 Product ID: 0x1342) with 4 serial interfaces and 1 RmNet virtual network interface * CDC-ECM mode (Vendor ID: 0x1546 Product ID: 0x1343) with 4 serial interfaces and 1 CDC-ECM virtual network interface In RmNet mode LARA-L6 exposes the following interfaces: If 0: Diagnostic If 1: AT parser If 2: AT parser If 3: AT parser/alternative functions If 4: RMNET interface Signed-off-by: Davide Tronchin Acked-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index afd6faa4c2ec..554d4e2a84a4 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1423,6 +1423,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x0489, 0xe0b4, 0)}, /* Foxconn T77W968 LTE */ {QMI_FIXED_INTF(0x0489, 0xe0b5, 0)}, /* Foxconn T77W968 LTE with eSIM support*/ {QMI_FIXED_INTF(0x2692, 0x9025, 4)}, /* Cellient MPL200 (rebranded Qualcomm 05c6:9025) */ + {QMI_QUIRK_SET_DTR(0x1546, 0x1342, 4)}, /* u-blox LARA-L6 */ /* 4. Gobi 1000 devices */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9212)}, /* Acer Gobi Modem Device */ -- cgit From 748064b54c99418f615aabff5755996cd9816969 Mon Sep 17 00:00:00 2001 From: Santiago Ruano Rincón Date: Mon, 21 Nov 2022 21:53:05 +0100 Subject: net/cdc_ncm: Fix multicast RX support for CDC NCM devices with ZLP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ZLP for DisplayLink ethernet devices was enabled in 6.0: 266c0190aee3 ("net/cdc_ncm: Enable ZLP for DisplayLink ethernet devices"). The related driver_info should be the "same as cdc_ncm_info, but with FLAG_SEND_ZLP". However, set_rx_mode that enables handling multicast traffic was missing in the new cdc_ncm_zlp_info.
usbnet_cdc_update_filter rx mode was introduced in linux 5.9 with: e10dcb1b6ba7 ("net: cdc_ncm: hook into set_rx_mode to admit multicast traffic") Without this hook, multicast, and then IPv6 SLAAC, is broken. Fixes: 266c0190aee3 ("net/cdc_ncm: Enable ZLP for DisplayLink ethernet devices") Signed-off-by: Santiago Ruano Rincón Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ncm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 8d5cbda33f66..0897fdb6254b 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -1915,6 +1915,7 @@ static const struct driver_info cdc_ncm_zlp_info = { .status = cdc_ncm_status, .rx_fixup = cdc_ncm_rx_fixup, .tx_fixup = cdc_ncm_tx_fixup, + .set_rx_mode = usbnet_cdc_update_filter, }; /* Same as cdc_ncm_info, but with FLAG_WWAN */ -- cgit From b6e7c196ac2f90f08955ca4db568a52160876407 Mon Sep 17 00:00:00 2001 From: Nir Levy Date: Mon, 21 Nov 2022 00:06:30 +0200 Subject: Documentation: networking: Update generic_netlink_howto URL The documentation refers to invalid web page under www.linuxfoundation.org The patch refers to a working URL under wiki.linuxfoundation.org Signed-off-by: Nir Levy Link: https://lore.kernel.org/all/20221120220630.7443-1-bhr166@gmail.com/ Signed-off-by: Jakub Kicinski --- Documentation/networking/generic_netlink.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/generic_netlink.rst b/Documentation/networking/generic_netlink.rst index 59e04ccf80c1..d960dbd7e80e 100644 --- a/Documentation/networking/generic_netlink.rst +++ b/Documentation/networking/generic_netlink.rst @@ -6,4 +6,4 @@ Generic Netlink A wiki document on how to use Generic Netlink can be found here: - * http://www.linuxfoundation.org/collaborate/workgroups/networking/generic_netlink_howto + * https://wiki.linuxfoundation.org/networking/generic_netlink_howto -- cgit From c60c152230828825c06e62a8f1ce956d4b659266 Mon Sep 17 00:00:00 
2001 From: Martin Faltesek Date: Mon, 21 Nov 2022 18:42:44 -0600 Subject: nfc: st-nci: fix incorrect validating logic in EVT_TRANSACTION The first validation check for EVT_TRANSACTION has two different checks tied together with logical AND. One is a check for minimum packet length, and the other is for a valid aid_tag. If either condition is true (fails), then an error should be triggered. The fix is to change && to ||. Reported-by: Denis Efremov Reviewed-by: Guenter Roeck Fixes: 5d1ceb7f5e56 ("NFC: st21nfcb: Add HCI transaction event support") Signed-off-by: Martin Faltesek Reviewed-by: Krzysztof Kozlowski Signed-off-by: Jakub Kicinski --- drivers/nfc/st-nci/se.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nfc/st-nci/se.c b/drivers/nfc/st-nci/se.c index 7764b1a4c3cf..589e1dec78e7 100644 --- a/drivers/nfc/st-nci/se.c +++ b/drivers/nfc/st-nci/se.c @@ -326,7 +326,7 @@ static int st_nci_hci_connectivity_event_received(struct nci_dev *ndev, * AID 81 5 to 16 * PARAMETERS 82 0 to 255 */ - if (skb->len < NFC_MIN_AID_LENGTH + 2 && + if (skb->len < NFC_MIN_AID_LENGTH + 2 || skb->data[0] != NFC_EVT_TRANSACTION_AID_TAG) return -EPROTO; -- cgit From 440f2ae9c9f06e26f5dcea697a53717fc61a318c Mon Sep 17 00:00:00 2001 From: Martin Faltesek Date: Mon, 21 Nov 2022 18:42:45 -0600 Subject: nfc: st-nci: fix memory leaks in EVT_TRANSACTION Error path does not free previously allocated memory. Add devm_kfree() to the failure path. 
Reported-by: Denis Efremov Reviewed-by: Guenter Roeck Fixes: 5d1ceb7f5e56 ("NFC: st21nfcb: Add HCI transaction event support") Signed-off-by: Martin Faltesek Reviewed-by: Krzysztof Kozlowski Signed-off-by: Jakub Kicinski --- drivers/nfc/st-nci/se.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nfc/st-nci/se.c b/drivers/nfc/st-nci/se.c index 589e1dec78e7..fc59916ae5ae 100644 --- a/drivers/nfc/st-nci/se.c +++ b/drivers/nfc/st-nci/se.c @@ -339,8 +339,10 @@ static int st_nci_hci_connectivity_event_received(struct nci_dev *ndev, /* Check next byte is PARAMETERS tag (82) */ if (skb->data[transaction->aid_len + 2] != - NFC_EVT_TRANSACTION_PARAMS_TAG) + NFC_EVT_TRANSACTION_PARAMS_TAG) { + devm_kfree(dev, transaction); return -EPROTO; + } transaction->params_len = skb->data[transaction->aid_len + 3]; memcpy(transaction->params, skb->data + -- cgit From 0254f31a7df3bb3b90c2d9dd2d4052f7b95eb287 Mon Sep 17 00:00:00 2001 From: Martin Faltesek Date: Mon, 21 Nov 2022 18:42:46 -0600 Subject: nfc: st-nci: fix incorrect sizing calculations in EVT_TRANSACTION The transaction buffer is allocated by using the size of the packet buf, and subtracting two which seems intended to remove the two tags which are not present in the target structure. This calculation leads to under counting memory because of differences between the packet contents and the target structure. The aid_len field is a u8 in the packet, but a u32 in the structure, resulting in at least 3 bytes always being under counted. Further, the aid data is a variable length field in the packet, but fixed in the structure, so if this field is less than the max, the difference is added to the under counting. To fix, perform validation checks progressively to safely reach the next field, to determine the size of both buffers and verify both tags. Once all validation checks pass, allocate the buffer and copy the data. 
This eliminates freeing memory on the error path, as validation checks are moved ahead of memory allocation. Reported-by: Denis Efremov Reviewed-by: Guenter Roeck Fixes: 5d1ceb7f5e56 ("NFC: st21nfcb: Add HCI transaction event support") Signed-off-by: Martin Faltesek Reviewed-by: Krzysztof Kozlowski Signed-off-by: Jakub Kicinski --- drivers/nfc/st-nci/se.c | 51 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/drivers/nfc/st-nci/se.c b/drivers/nfc/st-nci/se.c index fc59916ae5ae..ec87dd21e054 100644 --- a/drivers/nfc/st-nci/se.c +++ b/drivers/nfc/st-nci/se.c @@ -312,6 +312,8 @@ static int st_nci_hci_connectivity_event_received(struct nci_dev *ndev, int r = 0; struct device *dev = &ndev->nfc_dev->dev; struct nfc_evt_transaction *transaction; + u32 aid_len; + u8 params_len; pr_debug("connectivity gate event: %x\n", event); @@ -325,28 +327,47 @@ static int st_nci_hci_connectivity_event_received(struct nci_dev *ndev, * Description Tag Length * AID 81 5 to 16 * PARAMETERS 82 0 to 255 + * + * The key differences are aid storage length is variably sized + * in the packet, but fixed in nfc_evt_transaction, and that + * the aid_len is u8 in the packet, but u32 in the structure, + * and the tags in the packet are not included in + * nfc_evt_transaction. 
+ * + * size(b): 1 1 5-16 1 1 0-255 + * offset: 0 1 2 aid_len + 2 aid_len + 3 aid_len + 4 + * mem name: aid_tag(M) aid_len aid params_tag(M) params_len params + * example: 0x81 5-16 X 0x82 0-255 X */ - if (skb->len < NFC_MIN_AID_LENGTH + 2 || - skb->data[0] != NFC_EVT_TRANSACTION_AID_TAG) + if (skb->len < 2 || skb->data[0] != NFC_EVT_TRANSACTION_AID_TAG) return -EPROTO; - transaction = devm_kzalloc(dev, skb->len - 2, GFP_KERNEL); - if (!transaction) - return -ENOMEM; + aid_len = skb->data[1]; - transaction->aid_len = skb->data[1]; - memcpy(transaction->aid, &skb->data[2], transaction->aid_len); + if (skb->len < aid_len + 4 || + aid_len > sizeof(transaction->aid)) + return -EPROTO; - /* Check next byte is PARAMETERS tag (82) */ - if (skb->data[transaction->aid_len + 2] != - NFC_EVT_TRANSACTION_PARAMS_TAG) { - devm_kfree(dev, transaction); + params_len = skb->data[aid_len + 3]; + + /* Verify PARAMETERS tag is (82), and final check that there is + * enough space in the packet to read everything. 
+ */ + if (skb->data[aid_len + 2] != NFC_EVT_TRANSACTION_PARAMS_TAG || + skb->len < aid_len + 4 + params_len) return -EPROTO; - } - transaction->params_len = skb->data[transaction->aid_len + 3]; - memcpy(transaction->params, skb->data + - transaction->aid_len + 4, transaction->params_len); + transaction = devm_kzalloc(dev, sizeof(*transaction) + + params_len, GFP_KERNEL); + if (!transaction) + return -ENOMEM; + + transaction->aid_len = aid_len; + transaction->params_len = params_len; + + memcpy(transaction->aid, &skb->data[2], aid_len); + memcpy(transaction->params, &skb->data[aid_len + 4], + params_len); r = nfc_se_transaction(ndev->nfc_dev, host, transaction); break; -- cgit From 9a234a2a085ab9fd2be8d0c1eedfcd10f74b97eb Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Tue, 22 Nov 2022 19:10:31 +0800 Subject: net: marvell: prestera: add missing unregister_netdev() in prestera_port_create() If prestera_port_sfp_bind() fails, unregister_netdev() should be called in error handling path. Compile tested only. 
Fixes: 52323ef75414 ("net: marvell: prestera: add phylink support") Signed-off-by: Zhang Changzhong Reviewed-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/1669115432-36841-1-git-send-email-zhangchangzhong@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/prestera/prestera_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index 24f9d6024745..47796e4d900c 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -746,6 +746,7 @@ static int prestera_port_create(struct prestera_switch *sw, u32 id) return 0; err_sfp_bind: + unregister_netdev(dev); err_register_netdev: prestera_port_list_del(port); err_port_init: -- cgit From 290b5fe096e7dd0aad730d1af4f7f2d9fea43e11 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 22 Nov 2022 15:09:36 +0200 Subject: net: enetc: preserve TX ring priority across reconfiguration In the blamed commit, a rudimentary reallocation procedure for RX buffer descriptors was implemented, for the situation when their format changes between normal (no PTP) and extended (PTP). enetc_hwtstamp_set() calls enetc_close() and enetc_open() in a sequence, and this sequence loses information which was previously configured in the TX BDR Mode Register, specifically via the enetc_set_bdr_prio() call. The TX ring priority is configured by tc-mqprio and tc-taprio, and affects important things for TSN such as the TX time of packets. The issue manifests itself most visibly by the fact that isochron --txtime reports premature packet transmissions when PTP is first enabled on an enetc interface. Save the TX ring priority in a new field in struct enetc_bdr (occupies a 2 byte hole on arm64) in order to make this survive a ring reconfiguration. 
Fixes: 434cebabd3a2 ("enetc: Add dynamic allocation of extended Rx BD rings") Signed-off-by: Vladimir Oltean Reviewed-by: Alexander Lobakin Link: https://lore.kernel.org/r/20221122130936.1704151-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 8 +++++--- drivers/net/ethernet/freescale/enetc/enetc.h | 1 + drivers/net/ethernet/freescale/enetc/enetc_qos.c | 21 +++++++++++++-------- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index f8c06c3f9464..8671591cb750 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -2058,7 +2058,7 @@ static void enetc_setup_txbdr(struct enetc_hw *hw, struct enetc_bdr *tx_ring) /* enable Tx ints by setting pkt thr to 1 */ enetc_txbdr_wr(hw, idx, ENETC_TBICR0, ENETC_TBICR0_ICEN | 0x1); - tbmr = ENETC_TBMR_EN; + tbmr = ENETC_TBMR_EN | ENETC_TBMR_SET_PRIO(tx_ring->prio); if (tx_ring->ndev->features & NETIF_F_HW_VLAN_CTAG_TX) tbmr |= ENETC_TBMR_VIH; @@ -2461,7 +2461,8 @@ int enetc_setup_tc_mqprio(struct net_device *ndev, void *type_data) /* Reset all ring priorities to 0 */ for (i = 0; i < priv->num_tx_rings; i++) { tx_ring = priv->tx_ring[i]; - enetc_set_bdr_prio(hw, tx_ring->index, 0); + tx_ring->prio = 0; + enetc_set_bdr_prio(hw, tx_ring->index, tx_ring->prio); } return 0; @@ -2480,7 +2481,8 @@ int enetc_setup_tc_mqprio(struct net_device *ndev, void *type_data) */ for (i = 0; i < num_tc; i++) { tx_ring = priv->tx_ring[i]; - enetc_set_bdr_prio(hw, tx_ring->index, i); + tx_ring->prio = i; + enetc_set_bdr_prio(hw, tx_ring->index, tx_ring->prio); } /* Reset the number of netdev queues based on the TC count */ diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index 161930a65f61..c6d8cc15c270 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ 
b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -95,6 +95,7 @@ struct enetc_bdr { void __iomem *rcir; }; u16 index; + u16 prio; int bd_count; /* # of BDs */ int next_to_use; int next_to_clean; diff --git a/drivers/net/ethernet/freescale/enetc/enetc_qos.c b/drivers/net/ethernet/freescale/enetc/enetc_qos.c index a842e1999122..fcebb54224c0 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_qos.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_qos.c @@ -137,6 +137,7 @@ int enetc_setup_tc_taprio(struct net_device *ndev, void *type_data) struct tc_taprio_qopt_offload *taprio = type_data; struct enetc_ndev_priv *priv = netdev_priv(ndev); struct enetc_hw *hw = &priv->si->hw; + struct enetc_bdr *tx_ring; int err; int i; @@ -145,16 +146,20 @@ int enetc_setup_tc_taprio(struct net_device *ndev, void *type_data) if (priv->tx_ring[i]->tsd_enable) return -EBUSY; - for (i = 0; i < priv->num_tx_rings; i++) - enetc_set_bdr_prio(hw, priv->tx_ring[i]->index, - taprio->enable ? i : 0); + for (i = 0; i < priv->num_tx_rings; i++) { + tx_ring = priv->tx_ring[i]; + tx_ring->prio = taprio->enable ? i : 0; + enetc_set_bdr_prio(hw, tx_ring->index, tx_ring->prio); + } err = enetc_setup_taprio(ndev, taprio); - - if (err) - for (i = 0; i < priv->num_tx_rings; i++) - enetc_set_bdr_prio(hw, priv->tx_ring[i]->index, - taprio->enable ? 0 : i); + if (err) { + for (i = 0; i < priv->num_tx_rings; i++) { + tx_ring = priv->tx_ring[i]; + tx_ring->prio = taprio->enable ? 0 : i; + enetc_set_bdr_prio(hw, tx_ring->index, tx_ring->prio); + } + } return err; } -- cgit From cd07eadd5147ffdae11b6fd28b77a3872f2a2484 Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Tue, 22 Nov 2022 13:54:49 +0800 Subject: octeontx2-pf: Add check for devm_kcalloc As the devm_kcalloc may return NULL pointer, it should be better to add check for the return value, as same as the others. 
Fixes: e8e095b3b370 ("octeontx2-af: cn10k: Bandwidth profiles config support") Signed-off-by: Jiasheng Jiang Reviewed-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20221122055449.31247-1-jiasheng@iscas.ac.cn Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 7646bb2ec89b..a62c1b322012 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -4985,6 +4985,8 @@ static int nix_setup_ipolicers(struct rvu *rvu, ipolicer->ref_count = devm_kcalloc(rvu->dev, ipolicer->band_prof.max, sizeof(u16), GFP_KERNEL); + if (!ipolicer->ref_count) + return -ENOMEM; } /* Set policer timeunit to 2us ie (19 + 1) * 100 nsec = 2us */ -- cgit From 08e8a949f684e1fbc4b1efd2337d72ec8f3613d9 Mon Sep 17 00:00:00 2001 From: Hanjun Guo Date: Tue, 22 Nov 2022 20:19:40 +0800 Subject: net: wwan: t7xx: Fix the ACPI memory leak The ACPI buffer memory (buffer.pointer) is not used after acpi_evaluate_object() returns, so free it to prevent a memory leak.
Fixes: 13e920d93e37 ("net: wwan: t7xx: Add core components") Signed-off-by: Hanjun Guo Link: https://lore.kernel.org/r/1669119580-28977-1-git-send-email-guohanjun@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/wwan/t7xx/t7xx_modem_ops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wwan/t7xx/t7xx_modem_ops.c b/drivers/net/wwan/t7xx/t7xx_modem_ops.c index 3458af31e864..7d0f5e4f0a78 100644 --- a/drivers/net/wwan/t7xx/t7xx_modem_ops.c +++ b/drivers/net/wwan/t7xx/t7xx_modem_ops.c @@ -165,6 +165,8 @@ static int t7xx_acpi_reset(struct t7xx_pci_dev *t7xx_dev, char *fn_name) return -EFAULT; } + kfree(buffer.pointer); + #endif return 0; } -- cgit From b0686565946368892c2cdf92f102392e24823588 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Tue, 22 Nov 2022 23:00:46 +0800 Subject: virtio_net: Fix probe failed when modprobe virtio_net When doing the following test steps, an error was found: step 1: modprobe virtio_net succeeded # modprobe virtio_net <-- OK step 2: fault injection in register_netdevice() # modprobe -r virtio_net <-- OK # ... FAULT_INJECTION: forcing a failure. name failslab, interval 1, probability 0, space 0, times 0 CPU: 0 PID: 3521 Comm: modprobe Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), Call Trace: ... should_failslab+0xa/0x20 ... dev_set_name+0xc0/0x100 netdev_register_kobject+0xc2/0x340 register_netdevice+0xbb9/0x1320 virtnet_probe+0x1d72/0x2658 [virtio_net] ... virtio_net: probe of virtio0 failed with error -22 step 3: modprobe virtio_net failed # modprobe virtio_net <-- failed virtio_net: probe of virtio0 failed with error -2 The root cause of the problem is that the queues are not disabled on the error handling path when register_netdevice() fails in virtnet_probe(), resulting in an error "-ENOENT" returned in the next modprobe call in setup_vq(). virtio_pci_modern_device uses virtqueues to send or receive messages, and "queue_enable" records whether the queues are available.
In vp_modern_find_vqs(), all queues will be selected and activated, but once queues are enabled there is no way to go back except by a reset. Fix it by resetting the virtio device on the error handling path. This makes error handling follow the same order as normal device cleanup in virtnet_remove() which does: unregister, destroy failover, then reset. And that flow is better tested than error handling so we can be reasonably sure it works well. Fixes: 024655555021 ("virtio_net: fix use after free on allocation failure") Signed-off-by: Li Zetao Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20221122150046.3910638-1-lizetao1@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/virtio_net.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 7106932c6f88..86e52454b5b5 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3949,12 +3949,11 @@ static int virtnet_probe(struct virtio_device *vdev) return 0; free_unregister_netdev: - virtio_reset_device(vdev); - unregister_netdev(dev); free_failover: net_failover_destroy(vi->failover); free_vqs: + virtio_reset_device(vdev); cancel_delayed_work_sync(&vi->refill); free_receive_page_frags(vi); virtnet_del_vqs(vi); -- cgit From 6aae1bcb41c7aefd28bf5d90e36ebdd151c2d8ba Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Wed, 23 Nov 2022 09:16:17 +0800 Subject: net: altera_tse: release phylink resources in tse_shutdown() Call phylink_disconnect_phy() in tse_shutdown() to release the resources occupied by phylink_of_phy_connect() in tse_open().
Fixes: fef2998203e1 ("net: altera: tse: convert to phylink") Signed-off-by: Liu Jian Link: https://lore.kernel.org/r/20221123011617.332302-1-liujian56@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/altera/altera_tse_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c index 7633b227b2ca..711d5b5a4c49 100644 --- a/drivers/net/ethernet/altera/altera_tse_main.c +++ b/drivers/net/ethernet/altera/altera_tse_main.c @@ -990,6 +990,7 @@ static int tse_shutdown(struct net_device *dev) int ret; phylink_stop(priv->phylink); + phylink_disconnect_phy(priv->phylink); netif_stop_queue(dev); napi_disable(&priv->napi); -- cgit From ad17c2a3f11b0f6b122e7842d8f7d9a5fcc7ac63 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Wed, 23 Nov 2022 14:59:19 +0800 Subject: octeontx2-af: Fix reference count issue in rvu_sdp_init() pci_get_device() will decrease the reference count for the *from* parameter. So we don't need to call put_device() to decrease the reference. Let's remove the put_device() in the loop and only decrease the reference count of the returned 'pdev' for the last loop because it will not be passed to pci_get_device() as input parameter. We don't need to check if 'pdev' is NULL because it is already checked inside pci_dev_put(). Also add pci_dev_put() for the error path. 
Fixes: fe1939bb2340 ("octeontx2-af: Add SDP interface support") Signed-off-by: Xiongfeng Wang Reviewed-by: Saeed Mahameed Link: https://lore.kernel.org/r/20221123065919.31499-1-wangxiongfeng2@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/af/rvu_sdp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_sdp.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_sdp.c index b04fb226f708..ae50d56258ec 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_sdp.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_sdp.c @@ -62,15 +62,18 @@ int rvu_sdp_init(struct rvu *rvu) pfvf->sdp_info = devm_kzalloc(rvu->dev, sizeof(struct sdp_node_info), GFP_KERNEL); - if (!pfvf->sdp_info) + if (!pfvf->sdp_info) { + pci_dev_put(pdev); return -ENOMEM; + } dev_info(rvu->dev, "SDP PF number:%d\n", sdp_pf_num[i]); - put_device(&pdev->dev); i++; } + pci_dev_put(pdev); + return 0; } -- cgit From 661e5ebbafd26d9d2e3c749f5cf591e55c7364f5 Mon Sep 17 00:00:00 2001 From: Yu Liao Date: Wed, 23 Nov 2022 16:22:36 +0800 Subject: net: thunderx: Fix the ACPI memory leak The ACPI buffer memory (string.pointer) should be freed as the buffer is not used after returning from bgx_acpi_match_id(), free it to prevent memory leak. 
Fixes: 46b903a01c05 ("net, thunder, bgx: Add support to get MAC address from ACPI.") Signed-off-by: Yu Liao Link: https://lore.kernel.org/r/20221123082237.1220521-1-liaoyu15@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c index 2f6484dc186a..7eb2ddbe9bad 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c @@ -1436,8 +1436,10 @@ static acpi_status bgx_acpi_match_id(acpi_handle handle, u32 lvl, return AE_OK; } - if (strncmp(string.pointer, bgx_sel, 4)) + if (strncmp(string.pointer, bgx_sel, 4)) { + kfree(string.pointer); return AE_OK; + } acpi_walk_namespace(ACPI_TYPE_DEVICE, handle, 1, bgx_acpi_register_phy, NULL, bgx, NULL); -- cgit