From 7b16871f9932d8a371488d2967b033387870a747 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 18 May 2022 15:04:43 -0700
Subject: mptcp: stop using the mptcp_has_another_subflow() helper

The mentioned helper requires the msk socket lock, and the
current callers don't own it nor can't acquire it, so the
access is racy.

All the current callers are really checking for infinite mapping
fallback, and the latter condition is explicitly tracked by
the relevant msk variable: we can safely remove the caller usage
- and the caller itself.

The issue is present since MP_FAIL implementation, but the
fix only applies since the infinite fallback support, ence the
somewhat unexpected fixes tag.

Fixes: 0530020a7c8f ("mptcp: track and update contiguous data status")
Acked-and-tested-by: Geliang Tang <geliang.tang@suse.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/mptcp/pm.c       |  2 +-
 net/mptcp/protocol.h | 13 -------------
 net/mptcp/subflow.c  |  3 +--
 3 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 59fdab2d0c27..8ba51120f35b 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -303,7 +303,7 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
 
 	pr_debug("fail_seq=%llu", fail_seq);
 
-	if (mptcp_has_another_subflow(sk) || !READ_ONCE(msk->allow_infinite_fallback))
+	if (!READ_ONCE(msk->allow_infinite_fallback))
 		return;
 
 	if (!READ_ONCE(subflow->mp_fail_response_expect)) {
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 08b8015cb69f..fb4760ee8d47 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -650,19 +650,6 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
 	inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops;
 }
 
-static inline bool mptcp_has_another_subflow(struct sock *ssk)
-{
-	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk), *tmp;
-	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
-
-	mptcp_for_each_subflow(msk, tmp) {
-		if (tmp != subflow)
-			return true;
-	}
-
-	return false;
-}
-
 void __init mptcp_proto_init(void);
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
 int __init mptcp_proto_v6_init(void);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index aecd98ac5dfe..27273cf091db 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1233,8 +1233,7 @@ fallback:
 	if (!__mptcp_check_fallback(msk)) {
 		/* RFC 8684 section 3.7. */
 		if (subflow->send_mp_fail) {
-			if (mptcp_has_another_subflow(ssk) ||
-			    !READ_ONCE(msk->allow_infinite_fallback)) {
+			if (!READ_ONCE(msk->allow_infinite_fallback)) {
 				ssk->sk_err = EBADMSG;
 				tcp_set_state(ssk, TCP_CLOSE);
 				subflow->reset_transient = 0;
-- 
cgit 


From d42f9e4e2384febf9cb2d19ffa0cfac96189517a Mon Sep 17 00:00:00 2001
From: Mat Martineau <mathew.j.martineau@linux.intel.com>
Date: Wed, 18 May 2022 15:04:44 -0700
Subject: mptcp: Check for orphaned subflow before handling MP_FAIL timer

MP_FAIL timeout (waiting for a peer to respond to an MP_FAIL with
another MP_FAIL) is implemented using the MPTCP socket's sk_timer. That
timer is also used at MPTCP socket close, so it's important to not have
the two timer users interfere with each other.

At MPTCP socket close, all subflows are orphaned before sk_timer is
manipulated. By checking the SOCK_DEAD flag on the subflows, each
subflow can determine if the timer is safe to alter without acquiring
any MPTCP-level lock. This replaces code that was using the
mptcp_data_lock and MPTCP-level socket state checks that did not
correctly protect the timer.

Fixes: 49fa1919d6bc ("mptcp: reset subflow when MP_FAIL doesn't respond")
Reviewed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/mptcp/pm.c      |  7 ++-----
 net/mptcp/subflow.c | 12 ++++--------
 2 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 8ba51120f35b..59a85220edc9 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -312,13 +312,10 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
 		subflow->send_mp_fail = 1;
 		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX);
 		subflow->send_infinite_map = 1;
-	} else if (s && inet_sk_state_load(s) != TCP_CLOSE) {
+	} else if (!sock_flag(sk, SOCK_DEAD)) {
 		pr_debug("MP_FAIL response received");
 
-		mptcp_data_lock(s);
-		if (inet_sk_state_load(s) != TCP_CLOSE)
-			sk_stop_timer(s, &s->sk_timer);
-		mptcp_data_unlock(s);
+		sk_stop_timer(s, &s->sk_timer);
 	}
 }
 
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 27273cf091db..8841e8cd9ad8 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1016,12 +1016,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
 		pr_debug("infinite mapping received");
 		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
 		subflow->map_data_len = 0;
-		if (sk && inet_sk_state_load(sk) != TCP_CLOSE) {
-			mptcp_data_lock(sk);
-			if (inet_sk_state_load(sk) != TCP_CLOSE)
-				sk_stop_timer(sk, &sk->sk_timer);
-			mptcp_data_unlock(sk);
-		}
+		if (!sock_flag(ssk, SOCK_DEAD))
+			sk_stop_timer(sk, &sk->sk_timer);
+
 		return MAPPING_INVALID;
 	}
 
@@ -1241,9 +1238,8 @@ fallback:
 				tcp_send_active_reset(ssk, GFP_ATOMIC);
 				while ((skb = skb_peek(&ssk->sk_receive_queue)))
 					sk_eat_skb(ssk, skb);
-			} else {
+			} else if (!sock_flag(ssk, SOCK_DEAD)) {
 				WRITE_ONCE(subflow->mp_fail_response_expect, true);
-				/* The data lock is acquired in __mptcp_move_skbs() */
 				sk_reset_timer((struct sock *)msk,
 					       &((struct sock *)msk)->sk_timer,
 					       jiffies + TCP_RTO_MAX);
-- 
cgit 


From d9fb797046c596187b97a08ea88b954964cc2d33 Mon Sep 17 00:00:00 2001
From: Mat Martineau <mathew.j.martineau@linux.intel.com>
Date: Wed, 18 May 2022 15:04:45 -0700
Subject: mptcp: Do not traverse the subflow connection list without lock

The MPTCP socket's conn_list (list of subflows) requires the socket lock
to access. The MP_FAIL timeout code added such an access, where it would
check the list of subflows both in timer context and (later) in workqueue
context where the socket lock is held.

Rather than check the list twice, remove the check in the timeout
handler and only depend on the check in the workqueue. Also remove the
MPTCP_FAIL_NO_RESPONSE flag, since mptcp_mp_fail_no_response() has
insignificant overhead and can be checked on each worker run.

Fixes: 49fa1919d6bc ("mptcp: reset subflow when MP_FAIL doesn't respond")
Reported-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/mptcp/protocol.c | 16 +---------------
 net/mptcp/protocol.h |  1 -
 2 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 921d67174e49..17e13396024a 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2190,23 +2190,10 @@ mp_fail_response_expect_subflow(struct mptcp_sock *msk)
 	return ret;
 }
 
-static void mptcp_check_mp_fail_response(struct mptcp_sock *msk)
-{
-	struct mptcp_subflow_context *subflow;
-	struct sock *sk = (struct sock *)msk;
-
-	bh_lock_sock(sk);
-	subflow = mp_fail_response_expect_subflow(msk);
-	if (subflow)
-		__set_bit(MPTCP_FAIL_NO_RESPONSE, &msk->flags);
-	bh_unlock_sock(sk);
-}
-
 static void mptcp_timeout_timer(struct timer_list *t)
 {
 	struct sock *sk = from_timer(sk, t, sk_timer);
 
-	mptcp_check_mp_fail_response(mptcp_sk(sk));
 	mptcp_schedule_work(sk);
 	sock_put(sk);
 }
@@ -2588,8 +2575,7 @@ static void mptcp_worker(struct work_struct *work)
 	if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
 		__mptcp_retrans(sk);
 
-	if (test_and_clear_bit(MPTCP_FAIL_NO_RESPONSE, &msk->flags))
-		mptcp_mp_fail_no_response(msk);
+	mptcp_mp_fail_no_response(msk);
 
 unlock:
 	release_sock(sk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index fb4760ee8d47..200f89f6d62f 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -117,7 +117,6 @@
 #define MPTCP_WORK_EOF		3
 #define MPTCP_FALLBACK_DONE	4
 #define MPTCP_WORK_CLOSE_SUBFLOW 5
-#define MPTCP_FAIL_NO_RESPONSE	6
 
 /* MPTCP socket release cb flags */
 #define MPTCP_PUSH_PENDING	1
-- 
cgit 


From 2ba18161d407c596f256f7c4a6447b8588b9eb55 Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliang.tang@suse.com>
Date: Wed, 18 May 2022 15:04:46 -0700
Subject: selftests: mptcp: add MP_FAIL reset testcase

Add the multiple subflows test case for MP_FAIL, to test the MP_FAIL
reset case. Use the test_linkfail value to make 1024KB test files.

Invoke reset_with_fail() to use 'iptables' and 'tc action pedit' rules
to produce the bit flips to trigger the checksum failures on ns2eth2.
Add delays on ns2eth1 to make sure more data can translate on ns2eth2.

The check_invert flag is enabled in reset_with_fail(), so this test
prints out the inverted bytes, instead of the file mismatch errors.

Invoke pedit_action_pkts() to get the numbers of the packets edited
by the tc pedit actions, and print this numbers to the output.

Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 1a5f211c5902..a4406b7a8064 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -2732,6 +2732,16 @@ fail_tests()
 		chk_join_nr 0 0 0 +1 +0 1 0 1 "$(pedit_action_pkts)"
 		chk_fail_nr 1 -1 invert
 	fi
+
+	# multiple subflows
+	if reset_with_fail "MP_FAIL MP_RST" 2; then
+		tc -n $ns2 qdisc add dev ns2eth1 root netem rate 1mbit delay 5
+		pm_nl_set_limits $ns1 0 1
+		pm_nl_set_limits $ns2 0 1
+		pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow
+		run_tests $ns1 $ns2 10.0.1.1 1024
+		chk_join_nr 1 1 1 1 0 1 1 0 "$(pedit_action_pkts)"
+	fi
 }
 
 userspace_tests()
-- 
cgit