Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--  net/mptcp/protocol.c  476
1 file changed, 314 insertions, 162 deletions
| diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 54613f5b7521..f60f01b14fac 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -22,6 +22,7 @@  #endif  #include <net/mptcp.h>  #include <net/xfrm.h> +#include <asm/ioctls.h>  #include "protocol.h"  #include "mib.h" @@ -46,9 +47,10 @@ struct mptcp_skb_cb {  enum {  	MPTCP_CMSG_TS = BIT(0), +	MPTCP_CMSG_INQ = BIT(1),  }; -static struct percpu_counter mptcp_sockets_allocated; +static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp;  static void __mptcp_destroy_sock(struct sock *sk);  static void __mptcp_check_send_data_fin(struct sock *sk); @@ -738,6 +740,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)  				 MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,  				 delta);  			MPTCP_SKB_CB(skb)->offset += delta; +			MPTCP_SKB_CB(skb)->map_seq += delta;  			__skb_queue_tail(&sk->sk_receive_queue, skb);  		}  		msk->ack_seq = end_seq; @@ -760,7 +763,7 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)  		if (!sock_owned_by_user(sk))  			__mptcp_error_report(sk);  		else -			set_bit(MPTCP_ERROR_REPORT,  &msk->flags); +			__set_bit(MPTCP_ERROR_REPORT,  &msk->cb_flags);  	}  	/* If the moves have caught up with the DATA_FIN sequence number @@ -805,47 +808,38 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)  	mptcp_data_unlock(sk);  } -static bool mptcp_do_flush_join_list(struct mptcp_sock *msk) +static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)  { -	struct mptcp_subflow_context *subflow; -	bool ret = false; +	struct sock *sk = (struct sock *)msk; -	if (likely(list_empty(&msk->join_list))) +	if (sk->sk_state != TCP_ESTABLISHED)  		return false; -	spin_lock_bh(&msk->join_list_lock); -	list_for_each_entry(subflow, &msk->join_list, node) { -		u32 sseq = READ_ONCE(subflow->setsockopt_seq); - -		mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow)); -		if (READ_ONCE(msk->setsockopt_seq) != sseq) -			ret = true; -	} -	list_splice_tail_init(&msk->join_list, &msk->conn_list); -	spin_unlock_bh(&msk->join_list_lock); - -	return ret; -} - -void __mptcp_flush_join_list(struct mptcp_sock *msk) -{ -	if (likely(!mptcp_do_flush_join_list(msk))) -		return; +	/* attach to msk socket only after we are sure we will deal with it +	 * at close time +	 */ +	if (sk->sk_socket && !ssk->sk_socket) +		mptcp_sock_graft(ssk, sk->sk_socket); -	if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags)) -		mptcp_schedule_work((struct sock *)msk); +	mptcp_propagate_sndbuf((struct sock *)msk, ssk); +	mptcp_sockopt_sync_locked(msk, ssk); +	return true;  } -static void mptcp_flush_join_list(struct mptcp_sock *msk) +static void __mptcp_flush_join_list(struct sock *sk)  { -	bool sync_needed = test_and_clear_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags); - -	might_sleep(); +	struct mptcp_subflow_context *tmp, *subflow; +	struct mptcp_sock *msk = mptcp_sk(sk); -	if (!mptcp_do_flush_join_list(msk) && !sync_needed) -		return; +	list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) { +		struct sock *ssk = mptcp_subflow_tcp_sock(subflow); +		bool slow = lock_sock_fast(ssk); -	mptcp_sockopt_sync_all(msk); +		list_move_tail(&subflow->node, &msk->conn_list); +		if (!__mptcp_finish_join(msk, ssk)) +			mptcp_subflow_reset(ssk); +		unlock_sock_fast(ssk, slow); +	}  }  static bool mptcp_timer_pending(struct sock *sk) @@ -972,7 +966,9 @@ static void __mptcp_mem_reclaim_partial(struct sock *sk)  	lockdep_assert_held_once(&sk->sk_lock.slock); -	
__mptcp_rmem_reclaim(sk, reclaimable - 1); +	if (reclaimable > SK_MEM_QUANTUM) +		__mptcp_rmem_reclaim(sk, reclaimable - 1); +  	sk_mem_reclaim_partial(sk);  } @@ -1369,7 +1365,7 @@ out:  struct subflow_send_info {  	struct sock *ssk; -	u64 ratio; +	u64 linger_time;  };  void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow) @@ -1394,20 +1390,24 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)  	return __mptcp_subflow_active(subflow);  } +#define SSK_MODE_ACTIVE	0 +#define SSK_MODE_BACKUP	1 +#define SSK_MODE_MAX	2 +  /* implement the mptcp packet scheduler;   * returns the subflow that will transmit the next DSS   * additionally updates the rtx timeout   */  static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)  { -	struct subflow_send_info send_info[2]; +	struct subflow_send_info send_info[SSK_MODE_MAX];  	struct mptcp_subflow_context *subflow;  	struct sock *sk = (struct sock *)msk; +	u32 pace, burst, wmem;  	int i, nr_active = 0;  	struct sock *ssk; +	u64 linger_time;  	long tout = 0; -	u64 ratio; -	u32 pace;  	sock_owned_by_me(sk); @@ -1426,10 +1426,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)  	}  	/* pick the subflow with the lower wmem/wspace ratio */ -	for (i = 0; i < 2; ++i) { +	for (i = 0; i < SSK_MODE_MAX; ++i) {  		send_info[i].ssk = NULL; -		send_info[i].ratio = -1; +		send_info[i].linger_time = -1;  	} +  	mptcp_for_each_subflow(msk, subflow) {  		trace_mptcp_subflow_get_send(subflow);  		ssk =  mptcp_subflow_tcp_sock(subflow); @@ -1438,34 +1439,51 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)  		tout = max(tout, mptcp_timeout_from_subflow(subflow));  		nr_active += !subflow->backup; -		if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd) -			continue; - -		pace = READ_ONCE(ssk->sk_pacing_rate); -		if (!pace) -			continue; +		pace = subflow->avg_pacing_rate; +		if (unlikely(!pace)) { +			/* init pacing rate from socket */ +			subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate); +			pace = subflow->avg_pacing_rate; +			if (!pace) +				continue; +		} -		ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, -				pace); -		if (ratio < send_info[subflow->backup].ratio) { +		linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace); +		if (linger_time < send_info[subflow->backup].linger_time) {  			send_info[subflow->backup].ssk = ssk; -			send_info[subflow->backup].ratio = ratio; +			send_info[subflow->backup].linger_time = linger_time;  		}  	}  	__mptcp_set_timeout(sk, tout);  	/* pick the best backup if no other subflow is active */  	if (!nr_active) -		send_info[0].ssk = send_info[1].ssk; - -	if (send_info[0].ssk) { -		msk->last_snd = send_info[0].ssk; -		msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE, -				       tcp_sk(msk->last_snd)->snd_wnd); -		return msk->last_snd; -	} +		send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk; + +	/* According to the blest algorithm, to avoid HoL blocking for the +	 * faster flow, we need to: +	 * - estimate the faster flow linger time +	 * - use the above to estimate the amount of byte transferred +	 *   by the faster flow +	 * - check that the amount of queued data is greter than the above, +	 *   otherwise do not use the picked, slower, subflow +	 * We select the subflow with the shorter estimated time to flush +	 * the queued mem, which basically ensure the above. We just need +	 * to check that subflow has a non empty cwin. 
+	 */ +	ssk = send_info[SSK_MODE_ACTIVE].ssk; +	if (!ssk || !sk_stream_memory_free(ssk) || !tcp_sk(ssk)->snd_wnd) +		return NULL; -	return NULL; +	burst = min_t(int, MPTCP_SEND_BURST_SIZE, tcp_sk(ssk)->snd_wnd); +	wmem = READ_ONCE(ssk->sk_wmem_queued); +	subflow = mptcp_subflow_ctx(ssk); +	subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + +					   READ_ONCE(ssk->sk_pacing_rate) * burst, +					   burst + wmem); +	msk->last_snd = ssk; +	msk->snd_burst = burst; +	return ssk;  }  static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info) @@ -1499,11 +1517,10 @@ static void mptcp_update_post_push(struct mptcp_sock *msk,  		msk->snd_nxt = snd_nxt_new;  } -static void mptcp_check_and_set_pending(struct sock *sk) +void mptcp_check_and_set_pending(struct sock *sk)  { -	if (mptcp_send_head(sk) && -	    !test_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) -		set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); +	if (mptcp_send_head(sk)) +		mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING);  }  void __mptcp_push_pending(struct sock *sk, unsigned int flags) @@ -1524,7 +1541,6 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)  			int ret = 0;  			prev_ssk = ssk; -			__mptcp_flush_join_list(msk);  			ssk = mptcp_subflow_get_send(msk);  			/* First check. If the ssk has changed since @@ -1784,8 +1800,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,  		copied += count;  		if (count < data_len) { -			if (!(flags & MSG_PEEK)) +			if (!(flags & MSG_PEEK)) {  				MPTCP_SKB_CB(skb)->offset += count; +				MPTCP_SKB_CB(skb)->map_seq += count; +			}  			break;  		} @@ -1927,7 +1945,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)  	unsigned int moved = 0;  	bool ret, done; -	mptcp_flush_join_list(msk);  	do {  		struct sock *ssk = mptcp_subflow_recv_lookup(msk);  		bool slowpath; @@ -1965,6 +1982,27 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)  	return !skb_queue_empty(&msk->receive_queue);  } +static unsigned int mptcp_inq_hint(const struct sock *sk) +{ +	const struct mptcp_sock *msk = mptcp_sk(sk); +	const struct sk_buff *skb; + +	skb = skb_peek(&msk->receive_queue); +	if (skb) { +		u64 hint_val = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; + +		if (hint_val >= INT_MAX) +			return INT_MAX; + +		return (unsigned int)hint_val; +	} + +	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) +		return 1; + +	return 0; +} +  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,  			 int nonblock, int flags, int *addr_len)  { @@ -1989,6 +2027,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,  	len = min_t(size_t, len, INT_MAX);  	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); +	if (unlikely(msk->recvmsg_inq)) +		cmsg_flags = MPTCP_CMSG_INQ; +  	while (copied < len) {  		int bytes_read; @@ -2062,6 +2103,12 @@ out_err:  	if (cmsg_flags && copied >= 0) {  		if (cmsg_flags & MPTCP_CMSG_TS)  			tcp_recv_timestamp(msg, sk, &tss); + +		if (cmsg_flags & MPTCP_CMSG_INQ) { +			unsigned int inq = mptcp_inq_hint(sk); + +			put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); +		}  	}  	pr_debug("msk=%p rx queue empty=%d:%d copied=%d", @@ -2088,7 +2135,7 @@ static void mptcp_retransmit_timer(struct timer_list *t)  			mptcp_schedule_work(sk);  	} else {  		/* delegate our work to tcp_release_cb() */ -		set_bit(MPTCP_RETRANSMIT, &msk->flags); +		__set_bit(MPTCP_RETRANSMIT, &msk->cb_flags);  	}  	bh_unlock_sock(sk);  	sock_put(sk); @@ -2196,6 +2243,10 @@ bool 
__mptcp_retransmit_pending_data(struct sock *sk)  	return true;  } +/* flags for __mptcp_close_ssk() */ +#define MPTCP_CF_PUSH		BIT(1) +#define MPTCP_CF_FASTCLOSE	BIT(2) +  /* subflow sockets can be either outgoing (connect) or incoming   * (accept).   * @@ -2205,22 +2256,37 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)   * parent socket.   */  static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, -			      struct mptcp_subflow_context *subflow) +			      struct mptcp_subflow_context *subflow, +			      unsigned int flags)  {  	struct mptcp_sock *msk = mptcp_sk(sk); -	bool need_push; +	bool need_push, dispose_it; -	list_del(&subflow->node); +	dispose_it = !msk->subflow || ssk != msk->subflow->sk; +	if (dispose_it) +		list_del(&subflow->node);  	lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); +	if (flags & MPTCP_CF_FASTCLOSE) +		subflow->send_fastclose = 1; + +	need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); +	if (!dispose_it) { +		tcp_disconnect(ssk, 0); +		msk->subflow->state = SS_UNCONNECTED; +		mptcp_subflow_ctx_reset(subflow); +		release_sock(ssk); + +		goto out; +	} +  	/* if we are invoked by the msk cleanup code, the subflow is  	 * already orphaned  	 */  	if (ssk->sk_socket)  		sock_orphan(ssk); -	need_push = __mptcp_retransmit_pending_data(sk);  	subflow->disposable = 1;  	/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops @@ -2240,14 +2306,12 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,  	sock_put(ssk); -	if (ssk == msk->last_snd) -		msk->last_snd = NULL; -  	if (ssk == msk->first)  		msk->first = NULL; -	if (msk->subflow && ssk == msk->subflow->sk) -		mptcp_dispose_initial_subflow(msk); +out: +	if (ssk == msk->last_snd) +		msk->last_snd = NULL;  	if (need_push)  		__mptcp_push_pending(sk, 0); @@ -2258,7 +2322,13 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,  {  	if (sk->sk_state == TCP_ESTABLISHED)  		mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); -	__mptcp_close_ssk(sk, ssk, subflow); + +	/* subflow aborted before reaching the fully_established status +	 * attempt the creation of the next subflow +	 */ +	mptcp_pm_subflow_check_next(mptcp_sk(sk), ssk, subflow); + +	__mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH);  }  static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) @@ -2410,12 +2480,10 @@ static void mptcp_worker(struct work_struct *work)  		goto unlock;  	mptcp_check_data_fin_ack(sk); -	mptcp_flush_join_list(msk);  	mptcp_check_fastclose(msk); -	if (msk->pm.status) -		mptcp_pm_nl_work(msk); +	mptcp_pm_nl_work(msk);  	if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))  		mptcp_check_for_eof(msk); @@ -2449,8 +2517,6 @@ static int __mptcp_init_sock(struct sock *sk)  {  	struct mptcp_sock *msk = mptcp_sk(sk); -	spin_lock_init(&msk->join_list_lock); -  	INIT_LIST_HEAD(&msk->conn_list);  	INIT_LIST_HEAD(&msk->join_list);  	INIT_LIST_HEAD(&msk->rtx_queue); @@ -2476,9 +2542,20 @@ static int __mptcp_init_sock(struct sock *sk)  	return 0;  } -static int mptcp_init_sock(struct sock *sk) +static void mptcp_ca_reset(struct sock *sk)  {  	struct inet_connection_sock *icsk = inet_csk(sk); + +	tcp_assign_congestion_control(sk); +	strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name); + +	/* no need to keep a reference to the ops, the name will suffice */ +	tcp_cleanup_congestion_control(sk); +	icsk->icsk_ca_ops = NULL; +} + +static int mptcp_init_sock(struct sock *sk) +{  	struct net *net = sock_net(sk);  	int ret; @@ -2499,12 +2576,7 @@ static int 
mptcp_init_sock(struct sock *sk)  	/* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will  	 * propagate the correct value  	 */ -	tcp_assign_congestion_control(sk); -	strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name); - -	/* no need to keep a reference to the ops, the name will suffice */ -	tcp_cleanup_congestion_control(sk); -	icsk->icsk_ca_ops = NULL; +	mptcp_ca_reset(sk);  	sk_sockets_allocated_inc(sk);  	sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; @@ -2609,6 +2681,7 @@ static void __mptcp_check_send_data_fin(struct sock *sk)  	 * state now  	 */  	if (__mptcp_check_fallback(msk)) { +		WRITE_ONCE(msk->snd_una, msk->write_seq);  		if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {  			inet_sk_state_store(sk, TCP_CLOSE);  			mptcp_close_wake_up(sk); @@ -2617,7 +2690,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk)  		}  	} -	mptcp_flush_join_list(msk);  	mptcp_for_each_subflow(msk, subflow) {  		struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); @@ -2650,21 +2722,20 @@ static void __mptcp_destroy_sock(struct sock *sk)  	might_sleep(); -	/* be sure to always acquire the join list lock, to sync vs -	 * mptcp_finish_join(). -	 */ -	spin_lock_bh(&msk->join_list_lock); -	list_splice_tail_init(&msk->join_list, &msk->conn_list); -	spin_unlock_bh(&msk->join_list_lock); +	/* join list will be eventually flushed (with rst) at sock lock release time*/  	list_splice_init(&msk->conn_list, &conn_list);  	sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);  	sk_stop_timer(sk, &sk->sk_timer);  	msk->pm.status = 0; +	/* clears msk->subflow, allowing the following loop to close +	 * even the initial subflow +	 */ +	mptcp_dispose_initial_subflow(msk);  	list_for_each_entry_safe(subflow, tmp, &conn_list, node) {  		struct sock *ssk = mptcp_subflow_tcp_sock(subflow); -		__mptcp_close_ssk(sk, ssk, subflow); +		__mptcp_close_ssk(sk, ssk, subflow, 0);  	}  	sk->sk_prot->destroy(sk); @@ -2675,7 +2746,6 @@ static void __mptcp_destroy_sock(struct sock *sk)  	xfrm_sk_free_policy(sk);  	sk_refcnt_debug_release(sk); -	mptcp_dispose_initial_subflow(msk);  	sock_put(sk);  } @@ -2711,6 +2781,9 @@ cleanup:  	sock_hold(sk);  	pr_debug("msk=%p state=%d", sk, sk->sk_state); +	if (mptcp_sk(sk)->token) +		mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); +  	if (sk->sk_state == TCP_CLOSE) {  		__mptcp_destroy_sock(sk);  		do_cancel_work = true; @@ -2721,9 +2794,6 @@ cleanup:  	if (do_cancel_work)  		mptcp_cancel_work(sk); -	if (mptcp_sk(sk)->token) -		mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); -  	sock_put(sk);  } @@ -2755,15 +2825,38 @@ static int mptcp_disconnect(struct sock *sk, int flags)  	struct mptcp_subflow_context *subflow;  	struct mptcp_sock *msk = mptcp_sk(sk); -	mptcp_do_flush_join_list(msk); +	inet_sk_state_store(sk, TCP_CLOSE);  	mptcp_for_each_subflow(msk, subflow) {  		struct sock *ssk = mptcp_subflow_tcp_sock(subflow); -		lock_sock(ssk); -		tcp_disconnect(ssk, flags); -		release_sock(ssk); +		__mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_FASTCLOSE);  	} + +	sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); +	sk_stop_timer(sk, &sk->sk_timer); + +	if (mptcp_sk(sk)->token) +		mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); + +	mptcp_destroy_common(msk); +	msk->last_snd = NULL; +	WRITE_ONCE(msk->flags, 0); +	msk->cb_flags = 0; +	msk->push_pending = 0; +	msk->recovery = false; +	msk->can_ack = false; +	msk->fully_established = false; +	msk->rcv_data_fin = false; +	msk->snd_data_fin_enable = false; +	
msk->rcv_fastclose = false; +	msk->use_64bit_ack = false; +	WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); +	mptcp_pm_data_reset(msk); +	mptcp_ca_reset(sk); + +	sk->sk_shutdown = 0; +	sk_error_report(sk);  	return 0;  } @@ -2903,9 +2996,11 @@ void mptcp_destroy_common(struct mptcp_sock *msk)  	__mptcp_clear_xmit(sk);  	/* move to sk_receive_queue, sk_stream_kill_queues will purge it */ +	mptcp_data_lock(sk);  	skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);  	__skb_queue_purge(&sk->sk_receive_queue);  	skb_rbtree_purge(&msk->out_of_order_queue); +	mptcp_data_unlock(sk);  	/* move all the rx fwd alloc into the sk_mem_reclaim_final in  	 * inet_sock_destruct() will dispose it @@ -2929,7 +3024,7 @@ void __mptcp_data_acked(struct sock *sk)  	if (!sock_owned_by_user(sk))  		__mptcp_clean_una(sk);  	else -		set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags); +		__set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags);  	if (mptcp_pending_data_fin_ack(sk))  		mptcp_schedule_work(sk); @@ -2948,20 +3043,23 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)  		else if (xmit_ssk)  			mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), MPTCP_DELEGATE_SEND);  	} else { -		set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); +		__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);  	}  } +#define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ +				      BIT(MPTCP_RETRANSMIT) | \ +				      BIT(MPTCP_FLUSH_JOIN_LIST)) +  /* processes deferred events and flush wmem */  static void mptcp_release_cb(struct sock *sk) +	__must_hold(&sk->sk_lock.slock)  { -	for (;;) { -		unsigned long flags = 0; +	struct mptcp_sock *msk = mptcp_sk(sk); -		if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) -			flags |= BIT(MPTCP_PUSH_PENDING); -		if (test_and_clear_bit(MPTCP_RETRANSMIT, &mptcp_sk(sk)->flags)) -			flags |= BIT(MPTCP_RETRANSMIT); +	for (;;) { +		unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) | +				      msk->push_pending;  		if (!flags)  			break; @@ -2972,8 +3070,11 @@ static void mptcp_release_cb(struct sock *sk)  		 *    datapath acquires the msk socket spinlock while helding  		 *    the subflow socket lock  		 */ - +		msk->push_pending = 0; +		msk->cb_flags &= ~flags;  		spin_unlock_bh(&sk->sk_lock.slock); +		if (flags & BIT(MPTCP_FLUSH_JOIN_LIST)) +			__mptcp_flush_join_list(sk);  		if (flags & BIT(MPTCP_PUSH_PENDING))  			__mptcp_push_pending(sk, 0);  		if (flags & BIT(MPTCP_RETRANSMIT)) @@ -2986,11 +3087,11 @@ static void mptcp_release_cb(struct sock *sk)  	/* be sure to set the current sk state before tacking actions  	 * depending on sk_state  	 */ -	if (test_and_clear_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags)) +	if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags))  		__mptcp_set_connected(sk); -	if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags)) +	if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))  		__mptcp_clean_una_wakeup(sk); -	if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) +	if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))  		__mptcp_error_report(sk);  	__mptcp_update_rmem(sk); @@ -3032,7 +3133,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk)  		if (!sock_owned_by_user(sk))  			__mptcp_subflow_push_pending(sk, ssk);  		else -			set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); +			__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);  		mptcp_data_unlock(sk);  		mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_SEND);  	} @@ 
-3118,8 +3219,7 @@ bool mptcp_finish_join(struct sock *ssk)  	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);  	struct mptcp_sock *msk = mptcp_sk(subflow->conn);  	struct sock *parent = (void *)msk; -	struct socket *parent_sock; -	bool ret; +	bool ret = true;  	pr_debug("msk=%p, subflow=%p", msk, subflow); @@ -3132,35 +3232,38 @@ bool mptcp_finish_join(struct sock *ssk)  	if (!msk->pm.server_side)  		goto out; -	if (!mptcp_pm_allow_new_subflow(msk)) { -		subflow->reset_reason = MPTCP_RST_EPROHIBIT; -		return false; -	} +	if (!mptcp_pm_allow_new_subflow(msk)) +		goto err_prohibited; + +	if (WARN_ON_ONCE(!list_empty(&subflow->node))) +		goto err_prohibited; -	/* active connections are already on conn_list, and we can't acquire -	 * msk lock here. -	 * use the join list lock as synchronization point and double-check -	 * msk status to avoid racing with __mptcp_destroy_sock() +	/* active connections are already on conn_list. +	 * If we can't acquire msk socket lock here, let the release callback +	 * handle it  	 */ -	spin_lock_bh(&msk->join_list_lock); -	ret = inet_sk_state_load(parent) == TCP_ESTABLISHED; -	if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) { -		list_add_tail(&subflow->node, &msk->join_list); +	mptcp_data_lock(parent); +	if (!sock_owned_by_user(parent)) { +		ret = __mptcp_finish_join(msk, ssk); +		if (ret) { +			sock_hold(ssk); +			list_add_tail(&subflow->node, &msk->conn_list); +		} +	} else {  		sock_hold(ssk); +		list_add_tail(&subflow->node, &msk->join_list); +		__set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags);  	} -	spin_unlock_bh(&msk->join_list_lock); +	mptcp_data_unlock(parent); +  	if (!ret) { +err_prohibited:  		subflow->reset_reason = MPTCP_RST_EPROHIBIT;  		return false;  	} -	/* attach to msk socket only after we are sure he will deal with us -	 * at close time -	 */ -	parent_sock = READ_ONCE(parent->sk_socket); -	if (parent_sock && !ssk->sk_socket) -		mptcp_sock_graft(ssk, parent_sock);  	subflow->map_seq = READ_ONCE(msk->ack_seq); +  out:  	mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);  	return true; @@ -3179,6 +3282,57 @@ static int mptcp_forward_alloc_get(const struct sock *sk)  	return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc;  } +static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) +{ +	const struct sock *sk = (void *)msk; +	u64 delta; + +	if (sk->sk_state == TCP_LISTEN) +		return -EINVAL; + +	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) +		return 0; + +	delta = msk->write_seq - v; +	if (delta > INT_MAX) +		delta = INT_MAX; + +	return (int)delta; +} + +static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ +	struct mptcp_sock *msk = mptcp_sk(sk); +	bool slow; +	int answ; + +	switch (cmd) { +	case SIOCINQ: +		if (sk->sk_state == TCP_LISTEN) +			return -EINVAL; + +		lock_sock(sk); +		__mptcp_move_skbs(msk); +		answ = mptcp_inq_hint(sk); +		release_sock(sk); +		break; +	case SIOCOUTQ: +		slow = lock_sock_fast(sk); +		answ = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una)); +		unlock_sock_fast(sk, slow); +		break; +	case SIOCOUTQNSD: +		slow = lock_sock_fast(sk); +		answ = mptcp_ioctl_outq(msk, msk->snd_nxt); +		unlock_sock_fast(sk, slow); +		break; +	default: +		return -ENOIOCTLCMD; +	} + +	return put_user(answ, (int __user *)arg); +} +  static struct proto mptcp_prot = {  	.name		= "MPTCP",  	.owner		= THIS_MODULE, @@ -3191,6 +3345,7 @@ static struct proto mptcp_prot = {  	.shutdown	= mptcp_shutdown,  	.destroy	= mptcp_destroy,  	.sendmsg	= mptcp_sendmsg, +	.ioctl		= 
mptcp_ioctl,  	.recvmsg	= mptcp_recvmsg,  	.release_cb	= mptcp_release_cb,  	.hash		= mptcp_hash, @@ -3243,9 +3398,20 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,  	struct mptcp_sock *msk = mptcp_sk(sock->sk);  	struct mptcp_subflow_context *subflow;  	struct socket *ssock; -	int err; +	int err = -EINVAL;  	lock_sock(sock->sk); +	if (uaddr) { +		if (addr_len < sizeof(uaddr->sa_family)) +			goto unlock; + +		if (uaddr->sa_family == AF_UNSPEC) { +			err = mptcp_disconnect(sock->sk, flags); +			sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; +			goto unlock; +		} +	} +  	if (sock->state != SS_UNCONNECTED && msk->subflow) {  		/* pending connection or invalid state, let existing subflow  		 * cope with that @@ -3255,10 +3421,8 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,  	}  	ssock = __mptcp_nmpc_socket(msk); -	if (!ssock) { -		err = -EINVAL; +	if (!ssock)  		goto unlock; -	}  	mptcp_token_destroy(msk);  	inet_sk_state_store(sock->sk, TCP_SYN_SENT); @@ -3332,17 +3496,9 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,  	pr_debug("msk=%p", msk); -	lock_sock(sock->sk); -	if (sock->sk->sk_state != TCP_LISTEN) -		goto unlock_fail; -  	ssock = __mptcp_nmpc_socket(msk);  	if (!ssock) -		goto unlock_fail; - -	clear_bit(MPTCP_DATA_READY, &msk->flags); -	sock_hold(ssock->sk); -	release_sock(sock->sk); +		return -EINVAL;  	err = ssock->ops->accept(sock, newsock, flags, kern);  	if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { @@ -3372,7 +3528,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,  		/* set ssk->sk_socket of accept()ed flows to mptcp socket.  		 * This is needed so NOSPACE flag can be set from tcp stack.  		 */ -		mptcp_flush_join_list(msk);  		mptcp_for_each_subflow(msk, subflow) {  			struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -3382,14 +3537,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,  		release_sock(newsk);  	} -	if (inet_csk_listen_poll(ssock->sk)) -		set_bit(MPTCP_DATA_READY, &msk->flags); -	sock_put(ssock->sk);  	return err; - -unlock_fail: -	release_sock(sock->sk); -	return -EINVAL;  }  static __poll_t mptcp_check_readable(struct mptcp_sock *msk) @@ -3435,8 +3583,12 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,  	state = inet_sk_state_load(sk);  	pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); -	if (state == TCP_LISTEN) -		return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : 0; +	if (state == TCP_LISTEN) { +		if (WARN_ON_ONCE(!msk->subflow || !msk->subflow->sk)) +			return 0; + +		return inet_csk_listen_poll(msk->subflow->sk); +	}  	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {  		mask |= mptcp_check_readable(msk); |
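The reworked mptcp_subflow_get_send() above keeps a per-subflow avg_pacing_rate and, once a subflow is picked, folds the current sk_pacing_rate into it weighted by the already-queued bytes (wmem) versus the granted burst, i.e. new_avg = (old_avg * wmem + rate * burst) / (wmem + burst). The stand-alone sketch below only models that weighting with made-up numbers to show how a large backlog damps sudden pacing-rate jumps; the helper name and the values are illustrative, not kernel API.

```c
/* User-space model of the burst-weighted pacing-rate average used by the
 * new scheduler. Formula mirrors the diff; names/values are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t avg_pacing_rate_update(uint64_t avg, uint64_t rate,
				       uint32_t wmem, uint32_t burst)
{
	if (wmem + burst == 0)
		return avg;

	/* new_avg = (old_avg * wmem + rate * burst) / (wmem + burst) */
	return (avg * wmem + rate * burst) / (wmem + burst);
}

int main(void)
{
	uint64_t avg = 1000000;		/* current estimate: ~1 MB/s */
	uint64_t rate = 4000000;	/* subflow now reports ~4 MB/s */

	/* small backlog: the fresh rate dominates the average */
	printf("wmem=16KB  burst=64KB -> %llu\n",
	       (unsigned long long)avg_pacing_rate_update(avg, rate, 16384, 65536));

	/* large backlog: the average moves slowly toward the new rate */
	printf("wmem=512KB burst=64KB -> %llu\n",
	       (unsigned long long)avg_pacing_rate_update(avg, rate, 524288, 65536));
	return 0;
}
```

The heavier the queued backlog relative to the granted burst, the more the historical average dominates, which is what the BLEST-style linger-time comparison in the scheduler relies on.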
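The MPTCP_CMSG_INQ / mptcp_inq_hint() additions mirror TCP's TCP_INQ hint: with the option enabled, recvmsg() attaches a SOL_TCP/TCP_CM_INQ control message carrying the bytes still queued after the read (msk->ack_seq minus the map_seq of the skb at the head of the receive queue, clamped to INT_MAX, with 1 reported once the peer has shut down). A minimal user-space sketch follows; it assumes the companion sockopt change (outside this file) lets TCP_INQ be enabled on an IPPROTO_MPTCP socket, and recv_with_inq() is a hypothetical helper.

```c
/* Sketch: enable TCP_INQ on an MPTCP socket and read the "bytes still
 * queued" hint from the recvmsg() control data. Assumes the companion
 * sockopt patch (not in this file) accepts TCP_INQ for IPPROTO_MPTCP.
 */
#include <netinet/in.h>		/* IPPROTO_TCP (== SOL_TCP) */
#include <linux/tcp.h>		/* TCP_INQ, TCP_CM_INQ */
#include <string.h>
#include <sys/socket.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262
#endif

static int enable_inq(int fd)
{
	int one = 1;

	return setsockopt(fd, IPPROTO_TCP, TCP_INQ, &one, sizeof(one));
}

/* Returns the recvmsg() result; *inq is updated when the hint is present. */
static ssize_t recv_with_inq(int fd, void *buf, size_t len, unsigned int *inq)
{
	char cbuf[CMSG_SPACE(sizeof(*inq))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = cbuf,
		.msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cm;
	ssize_t ret;

	ret = recvmsg(fd, &msg, 0);
	if (ret < 0)
		return ret;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
		if (cm->cmsg_level == IPPROTO_TCP && cm->cmsg_type == TCP_CM_INQ)
			memcpy(inq, CMSG_DATA(cm), sizeof(*inq));

	return ret;
}
```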
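The new mptcp_ioctl()/mptcp_ioctl_outq() pair wires the classic TCP queue ioctls to MPTCP sockets: SIOCINQ reports unread bytes (via mptcp_inq_hint() after moving pending skbs under the socket lock), SIOCOUTQ the not-yet-acked backlog (write_seq - snd_una), and SIOCOUTQNSD the not-yet-sent part (write_seq - snd_nxt). A small sketch of how user space would query them; dump_queues() is a hypothetical helper with error handling trimmed.

```c
#include <linux/sockios.h>	/* SIOCINQ, SIOCOUTQ, SIOCOUTQNSD */
#include <stdio.h>
#include <sys/ioctl.h>

/* Print the MPTCP-level queue sizes for a connected socket. */
static void dump_queues(int fd)
{
	int unread = 0, unacked = 0, unsent = 0;

	if (ioctl(fd, SIOCINQ, &unread) ||	/* rx bytes not yet read */
	    ioctl(fd, SIOCOUTQ, &unacked) ||	/* tx bytes not yet acked */
	    ioctl(fd, SIOCOUTQNSD, &unsent))	/* tx bytes not yet sent */
		perror("ioctl");

	printf("rx unread=%d tx unacked=%d tx unsent=%d\n",
	       unread, unacked, unsent);
}
```

On a listening socket all three report -EINVAL, matching the checks in the diff; during the handshake the output queries return 0.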