Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--  net/mptcp/protocol.c | 187
1 file changed, 140 insertions, 47 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index da4257504fad..f599ad44ed24 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -150,9 +150,15 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
 		 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
 		 to->len, MPTCP_SKB_CB(from)->end_seq);
 	MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
-	kfree_skb_partial(from, fragstolen);
+
+	/* note the fwd memory can reach a negative value after accounting
+	 * for the delta, but the later skb free will restore a non
+	 * negative one
+	 */
 	atomic_add(delta, &sk->sk_rmem_alloc);
 	mptcp_rmem_charge(sk, delta);
+	kfree_skb_partial(from, fragstolen);
+
 	return true;
 }
 
@@ -656,9 +662,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 
 		skb = skb_peek(&ssk->sk_receive_queue);
 		if (!skb) {
-			/* if no data is found, a racing workqueue/recvmsg
-			 * already processed the new data, stop here or we
-			 * can enter an infinite loop
+			/* With racing move_skbs_to_msk() and __mptcp_move_skbs(),
+			 * a different CPU can have already processed the pending
+			 * data, stop here or we can enter an infinite loop
 			 */
 			if (!moved)
 				done = true;
@@ -666,9 +672,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 		}
 
 		if (__mptcp_check_fallback(msk)) {
-			/* if we are running under the workqueue, TCP could have
-			 * collapsed skbs between dummy map creation and now
-			 * be sure to adjust the size
+			/* Under fallback skbs have no MPTCP extension and TCP could
+			 * collapse them between the dummy map creation and the
+			 * current dequeue. Be sure to adjust the map size.
 			 */
 			map_remaining = skb->len;
 			subflow->map_data_len = skb->len;
@@ -1263,7 +1269,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 
 		i = skb_shinfo(skb)->nr_frags;
 		can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
-		if (!can_coalesce && i >= sysctl_max_skb_frags) {
+		if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) {
 			tcp_mark_push(tcp_sk(ssk), skb);
 			goto alloc_skb;
 		}
@@ -1538,8 +1544,9 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 	struct mptcp_sendmsg_info info = {
 				.flags = flags,
 	};
+	bool do_check_data_fin = false;
 	struct mptcp_data_frag *dfrag;
-	int len, copied = 0;
+	int len;
 
 	while ((dfrag = mptcp_send_head(sk))) {
 		info.sent = dfrag->already_sent;
@@ -1574,8 +1581,8 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 				goto out;
 			}
 
+			do_check_data_fin = true;
 			info.sent += ret;
-			copied += ret;
 			len -= ret;
 
 			mptcp_update_post_push(msk, dfrag, ret);
@@ -1591,7 +1598,7 @@ out:
 	/* ensure the rtx timer is running */
 	if (!mptcp_timer_pending(sk))
 		mptcp_reset_timer(sk);
-	if (copied)
+	if (do_check_data_fin)
 		__mptcp_check_send_data_fin(sk);
 }
 
@@ -1670,6 +1677,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct page_frag *pfrag;
+	struct socket *ssock;
 	size_t copied = 0;
 	int ret = 0;
 	long timeo;
@@ -1683,14 +1691,39 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	lock_sock(sk);
 
+	ssock = __mptcp_nmpc_socket(msk);
+	if (unlikely(ssock && inet_sk(ssock->sk)->defer_connect)) {
+		struct sock *ssk = ssock->sk;
+		int copied_syn = 0;
+
+		lock_sock(ssk);
+
+		ret = tcp_sendmsg_fastopen(ssk, msg, &copied_syn, len, NULL);
+		copied += copied_syn;
+		if (ret == -EINPROGRESS && copied_syn > 0) {
+			/* reflect the new state on the MPTCP socket */
+			inet_sk_state_store(sk, inet_sk_state_load(ssk));
+			release_sock(ssk);
+			goto out;
+		} else if (ret) {
+			release_sock(ssk);
+			goto do_error;
+		}
+		release_sock(ssk);
+	}
+
 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 
 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
 		ret = sk_stream_wait_connect(sk, &timeo);
 		if (ret)
-			goto out;
+			goto do_error;
 	}
 
+	ret = -EPIPE;
+	if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)))
+		goto do_error;
+
 	pfrag = sk_page_frag(sk);
 
 	while (msg_data_left(msg)) {
@@ -1699,11 +1732,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		bool dfrag_collapsed;
 		size_t psize, offset;
 
-		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
-			ret = -EPIPE;
-			goto out;
-		}
-
 		/* reuse tail pfrag, if possible, or carve a new one from the
 		 * page allocator
 		 */
@@ -1735,7 +1763,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		if (copy_page_from_iter(dfrag->page, offset, psize,
 					&msg->msg_iter) != psize) {
 			ret = -EFAULT;
-			goto out;
+			goto do_error;
 		}
 
 		/* data successfully copied into the write queue */
@@ -1767,7 +1795,7 @@ wait_for_memory:
 		__mptcp_push_pending(sk, msg->msg_flags);
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret)
-			goto out;
+			goto do_error;
 	}
 
 	if (copied)
@@ -1775,7 +1803,14 @@ wait_for_memory:
 
 out:
 	release_sock(sk);
-	return copied ? : ret;
+	return copied;
+
+do_error:
+	if (copied)
+		goto out;
+
+	copied = sk_stream_error(sk, msg->msg_flags, ret);
+	goto out;
 }
 
 static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
@@ -2278,8 +2313,14 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 
 	lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
 
-	if (flags & MPTCP_CF_FASTCLOSE)
+	if (flags & MPTCP_CF_FASTCLOSE) {
+		/* be sure to force the tcp_disconnect() path,
+		 * to generate the egress reset
+		 */
+		ssk->sk_lingertime = 0;
+		sock_set_flag(ssk, SOCK_LINGER);
 		subflow->send_fastclose = 1;
+	}
 
 	need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
 	if (!dispose_it) {
@@ -2357,7 +2398,7 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk)
 
 	might_sleep();
 
-	list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+	mptcp_for_each_subflow_safe(msk, subflow, tmp) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
 		if (inet_sk_state_load(ssk) != TCP_CLOSE)
@@ -2400,7 +2441,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 
 	mptcp_token_destroy(msk);
 
-	list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+	mptcp_for_each_subflow_safe(msk, subflow, tmp) {
 		struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
 		bool slow;
 
@@ -2412,12 +2453,31 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 		unlock_sock_fast(tcp_sk, slow);
 	}
 
+	/* Mirror the tcp_reset() error propagation */
+	switch (sk->sk_state) {
+	case TCP_SYN_SENT:
+		sk->sk_err = ECONNREFUSED;
+		break;
+	case TCP_CLOSE_WAIT:
+		sk->sk_err = EPIPE;
+		break;
+	case TCP_CLOSE:
+		return;
+	default:
+		sk->sk_err = ECONNRESET;
+	}
+
 	inet_sk_state_store(sk, TCP_CLOSE);
 	sk->sk_shutdown = SHUTDOWN_MASK;
 	smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
 	set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags);
 
-	mptcp_close_wake_up(sk);
+	/* the calling mptcp_worker will properly destroy the socket */
+	if (sock_flag(sk, SOCK_DEAD))
+		return;
+
+	sk->sk_state_change(sk);
+	sk_error_report(sk);
 }
 
 static void __mptcp_retrans(struct sock *sk)
@@ -2523,6 +2583,16 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
 	mptcp_reset_timeout(msk, 0);
 }
 
+static void mptcp_do_fastclose(struct sock *sk)
+{
+	struct mptcp_subflow_context *subflow, *tmp;
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	mptcp_for_each_subflow_safe(msk, subflow, tmp)
+		__mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow),
+				  subflow, MPTCP_CF_FASTCLOSE);
+}
+
 static void mptcp_worker(struct work_struct *work)
 {
 	struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
@@ -2551,11 +2621,15 @@ static void mptcp_worker(struct work_struct *work)
 	 * closed, but we need the msk around to reply to incoming DATA_FIN,
 	 * even if it is orphaned and in FIN_WAIT2 state
 	 */
-	if (sock_flag(sk, SOCK_DEAD) &&
-	    (mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) {
-		inet_sk_state_store(sk, TCP_CLOSE);
-		__mptcp_destroy_sock(sk);
-		goto unlock;
+	if (sock_flag(sk, SOCK_DEAD)) {
+		if (mptcp_check_close_timeout(sk)) {
+			inet_sk_state_store(sk, TCP_CLOSE);
+			mptcp_do_fastclose(sk);
+		}
+		if (sk->sk_state == TCP_CLOSE) {
+			__mptcp_destroy_sock(sk);
+			goto unlock;
+		}
 	}
 
 	if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
@@ -2656,7 +2730,7 @@ static void __mptcp_clear_xmit(struct sock *sk)
 		dfrag_clear(sk, dfrag);
 }
 
-static void mptcp_cancel_work(struct sock *sk)
+void mptcp_cancel_work(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
@@ -2796,13 +2870,24 @@ static void __mptcp_destroy_sock(struct sock *sk)
 	sock_put(sk);
 }
 
-static void mptcp_close(struct sock *sk, long timeout)
+static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
+{
+	/* Concurrent splices from sk_receive_queue into receive_queue will
+	 * always show at least one non-empty queue when checked in this order.
+	 */
+	if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
+	    skb_queue_empty_lockless(&msk->receive_queue))
+		return 0;
+
+	return EPOLLIN | EPOLLRDNORM;
+}
+
+bool __mptcp_close(struct sock *sk, long timeout)
 {
 	struct mptcp_subflow_context *subflow;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	bool do_cancel_work = false;
 
-	lock_sock(sk);
 	sk->sk_shutdown = SHUTDOWN_MASK;
 
 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
@@ -2810,8 +2895,13 @@ static void mptcp_close(struct sock *sk, long timeout)
 		goto cleanup;
 	}
 
-	if (mptcp_close_state(sk))
+	if (mptcp_check_readable(msk)) {
+		/* the msk has read data, do the MPTCP equivalent of TCP reset */
+		inet_sk_state_store(sk, TCP_CLOSE);
+		mptcp_do_fastclose(sk);
+	} else if (mptcp_close_state(sk)) {
 		__mptcp_wr_shutdown(sk);
+	}
 
 	sk_stream_wait_close(sk, timeout);
 
@@ -2844,6 +2934,17 @@ cleanup:
 	} else {
 		mptcp_reset_timeout(msk, 0);
 	}
+
+	return do_cancel_work;
+}
+
+static void mptcp_close(struct sock *sk, long timeout)
+{
+	bool do_cancel_work;
+
+	lock_sock(sk);
+
+	do_cancel_work = __mptcp_close(sk, timeout);
 	release_sock(sk);
 	if (do_cancel_work)
 		mptcp_cancel_work(sk);
@@ -3047,7 +3148,7 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
 	__mptcp_clear_xmit(sk);
 
 	/* join list will be eventually flushed (with rst) at sock lock release time */
-	list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node)
+	mptcp_for_each_subflow_safe(msk, subflow, tmp)
 		__mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags);
 
 	/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
@@ -3519,6 +3620,7 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 
 do_connect:
 	err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
+	inet_sk(sock->sk)->defer_connect = inet_sk(ssock->sk)->defer_connect;
 	sock->state = ssock->state;
 
 	/* on successful connect, the msk state will be moved to established by
@@ -3616,18 +3718,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 	return err;
 }
 
-static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
-{
-	/* Concurrent splices from sk_receive_queue into receive_queue will
-	 * always show at least one non-empty queue when checked in this order.
-	 */
-	if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
-	    skb_queue_empty_lockless(&msk->receive_queue))
-		return 0;
-
-	return EPOLLIN | EPOLLRDNORM;
-}
-
 static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
 {
 	struct sock *sk = (struct sock *)msk;
@@ -3669,13 +3759,16 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
 		mask |= mptcp_check_readable(msk);
 		mask |= mptcp_check_writeable(msk);
+	} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+		/* cf tcp_poll() note about TFO */
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	}
 	if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
 		mask |= EPOLLHUP;
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
 		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
 
-	/* This barrier is coupled with smp_wmb() in tcp_reset() */
+	/* This barrier is coupled with smp_wmb() in __mptcp_error_report() */
 	smp_rmb();
 	if (sk->sk_err)
 		mask |= EPOLLERR;
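The largest functional addition above is the defer_connect path in mptcp_sendmsg(): connect() returns without emitting a SYN, and the first sendmsg() pushes the pending data through tcp_sendmsg_fastopen() on the initial subflow. A minimal user-space sketch of the sequence that would exercise that branch follows; it is an illustration only, assuming the kernel under test accepts TCP_FASTOPEN_CONNECT on MPTCP sockets (that setsockopt handling lives outside protocol.c), and mptcp_tfo_send() is a hypothetical helper name, not part of the patch.

/* Hypothetical example: deferred connect on an MPTCP socket, so the first
 * send() goes through the new defer_connect branch in mptcp_sendmsg().
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262
#endif
#ifndef TCP_FASTOPEN_CONNECT
#define TCP_FASTOPEN_CONNECT 30
#endif

int mptcp_tfo_send(const struct sockaddr_in *dst, const void *buf, size_t len)
{
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

	if (fd < 0)
		return -1;

	/* request a deferred connect: no SYN is sent yet, connect() returns 0 */
	if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_CONNECT, &one, sizeof(one)) < 0 ||
	    connect(fd, (const struct sockaddr *)dst, sizeof(*dst)) < 0)
		goto err;

	/* the first write triggers tcp_sendmsg_fastopen() on the subflow,
	 * carrying (part of) buf in the SYN when a TFO cookie is available
	 */
	if (send(fd, buf, len, 0) < 0)
		goto err;

	return fd;

err:
	close(fd);
	return -1;
}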