diff options
Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r-- | net/mptcp/protocol.c | 330 |
1 files changed, 214 insertions, 116 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f8897a70c11d..1dbc62537259 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -662,9 +662,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, skb = skb_peek(&ssk->sk_receive_queue); if (!skb) { - /* if no data is found, a racing workqueue/recvmsg - * already processed the new data, stop here or we - * can enter an infinite loop + /* With racing move_skbs_to_msk() and __mptcp_move_skbs(), + * a different CPU can have already processed the pending + * data, stop here or we can enter an infinite loop */ if (!moved) done = true; @@ -672,9 +672,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, } if (__mptcp_check_fallback(msk)) { - /* if we are running under the workqueue, TCP could have - * collapsed skbs between dummy map creation and now - * be sure to adjust the size + /* Under fallback skbs have no MPTCP extension and TCP could + * collapse them between the dummy map creation and the + * current dequeue. Be sure to adjust the map size. */ map_remaining = skb->len; subflow->map_data_len = skb->len; @@ -1544,8 +1544,9 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) struct mptcp_sendmsg_info info = { .flags = flags, }; + bool do_check_data_fin = false; struct mptcp_data_frag *dfrag; - int len, copied = 0; + int len; while ((dfrag = mptcp_send_head(sk))) { info.sent = dfrag->already_sent; @@ -1580,8 +1581,8 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) goto out; } + do_check_data_fin = true; info.sent += ret; - copied += ret; len -= ret; mptcp_update_post_push(msk, dfrag, ret); @@ -1597,7 +1598,7 @@ out: /* ensure the rtx timer is running */ if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); - if (copied) + if (do_check_data_fin) __mptcp_check_send_data_fin(sk); } @@ -1672,10 +1673,42 @@ static void mptcp_set_nospace(struct sock *sk) set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags); } +static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msghdr *msg, + size_t len, int *copied_syn) +{ + unsigned int saved_flags = msg->msg_flags; + struct mptcp_sock *msk = mptcp_sk(sk); + int ret; + + lock_sock(ssk); + msg->msg_flags |= MSG_DONTWAIT; + msk->connect_flags = O_NONBLOCK; + msk->is_sendmsg = 1; + ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL); + msk->is_sendmsg = 0; + msg->msg_flags = saved_flags; + release_sock(ssk); + + /* do the blocking bits of inet_stream_connect outside the ssk socket lock */ + if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) { + ret = __inet_stream_connect(sk->sk_socket, msg->msg_name, + msg->msg_namelen, msg->msg_flags, 1); + + /* Keep the same behaviour of plain TCP: zero the copied bytes in + * case of any error, except timeout or signal + */ + if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) + *copied_syn = 0; + } + + return ret; +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; + struct socket *ssock; size_t copied = 0; int ret = 0; long timeo; @@ -1689,14 +1722,30 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (unlikely(ssock && inet_sk(ssock->sk)->defer_connect)) { + int copied_syn = 0; + + ret = mptcp_sendmsg_fastopen(sk, ssock->sk, msg, len, &copied_syn); + copied += copied_syn; + if (ret == -EINPROGRESS && copied_syn > 0) + goto out; + else if (ret) + goto do_error; + } + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { ret = sk_stream_wait_connect(sk, &timeo); if (ret) - goto out; + goto do_error; } + ret = -EPIPE; + if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))) + goto do_error; + pfrag = sk_page_frag(sk); while (msg_data_left(msg)) { @@ -1705,11 +1754,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) bool dfrag_collapsed; size_t psize, offset; - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { - ret = -EPIPE; - goto out; - } - /* reuse tail pfrag, if possible, or carve a new one from the * page allocator */ @@ -1741,7 +1785,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (copy_page_from_iter(dfrag->page, offset, psize, &msg->msg_iter) != psize) { ret = -EFAULT; - goto out; + goto do_error; } /* data successfully copied into the write queue */ @@ -1773,7 +1817,7 @@ wait_for_memory: __mptcp_push_pending(sk, msg->msg_flags); ret = sk_stream_wait_memory(sk, &timeo); if (ret) - goto out; + goto do_error; } if (copied) @@ -1781,7 +1825,14 @@ wait_for_memory: out: release_sock(sk); - return copied ? : ret; + return copied; + +do_error: + if (copied) + goto out; + + copied = sk_stream_error(sk, msg->msg_flags, ret); + goto out; } static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, @@ -2284,8 +2335,14 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); - if (flags & MPTCP_CF_FASTCLOSE) + if (flags & MPTCP_CF_FASTCLOSE) { + /* be sure to force the tcp_disconnect() path, + * to generate the egress reset + */ + ssk->sk_lingertime = 0; + sock_set_flag(ssk, SOCK_LINGER); subflow->send_fastclose = 1; + } need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { @@ -2297,12 +2354,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, goto out; } - /* if we are invoked by the msk cleanup code, the subflow is - * already orphaned - */ - if (ssk->sk_socket) - sock_orphan(ssk); - + sock_orphan(ssk); subflow->disposable = 1; /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops @@ -2363,7 +2415,7 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk) might_sleep(); - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (inet_sk_state_load(ssk) != TCP_CLOSE) @@ -2406,7 +2458,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) mptcp_token_destroy(msk); - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); bool slow; @@ -2418,12 +2470,31 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) unlock_sock_fast(tcp_sk, slow); } + /* Mirror the tcp_reset() error propagation */ + switch (sk->sk_state) { + case TCP_SYN_SENT: + sk->sk_err = ECONNREFUSED; + break; + case TCP_CLOSE_WAIT: + sk->sk_err = EPIPE; + break; + case TCP_CLOSE: + return; + default: + sk->sk_err = ECONNRESET; + } + inet_sk_state_store(sk, TCP_CLOSE); sk->sk_shutdown = SHUTDOWN_MASK; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); - mptcp_close_wake_up(sk); + /* the calling mptcp_worker will properly destroy the socket */ + if (sock_flag(sk, SOCK_DEAD)) + return; + + sk->sk_state_change(sk); + sk_error_report(sk); } static void __mptcp_retrans(struct sock *sk) @@ -2529,6 +2600,16 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) mptcp_reset_timeout(msk, 0); } +static void mptcp_do_fastclose(struct sock *sk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_sock *msk = mptcp_sk(sk); + + mptcp_for_each_subflow_safe(msk, subflow, tmp) + __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), + subflow, MPTCP_CF_FASTCLOSE); +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -2557,11 +2638,15 @@ static void mptcp_worker(struct work_struct *work) * closed, but we need the msk around to reply to incoming DATA_FIN, * even if it is orphaned and in FIN_WAIT2 state */ - if (sock_flag(sk, SOCK_DEAD) && - (mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) { - inet_sk_state_store(sk, TCP_CLOSE); - __mptcp_destroy_sock(sk); - goto unlock; + if (sock_flag(sk, SOCK_DEAD)) { + if (mptcp_check_close_timeout(sk)) { + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + } + if (sk->sk_state == TCP_CLOSE) { + __mptcp_destroy_sock(sk); + goto unlock; + } } if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) @@ -2802,6 +2887,18 @@ static void __mptcp_destroy_sock(struct sock *sk) sock_put(sk); } +static __poll_t mptcp_check_readable(struct mptcp_sock *msk) +{ + /* Concurrent splices from sk_receive_queue into receive_queue will + * always show at least one non-empty queue when checked in this order. + */ + if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && + skb_queue_empty_lockless(&msk->receive_queue)) + return 0; + + return EPOLLIN | EPOLLRDNORM; +} + bool __mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; @@ -2815,8 +2912,13 @@ bool __mptcp_close(struct sock *sk, long timeout) goto cleanup; } - if (mptcp_close_state(sk)) + if (mptcp_check_readable(msk)) { + /* the msk has read data, do the MPTCP equivalent of TCP reset */ + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + } else if (mptcp_close_state(sk)) { __mptcp_wr_shutdown(sk); + } sk_stream_wait_close(sk, timeout); @@ -2833,7 +2935,11 @@ cleanup: if (ssk == msk->first) subflow->fail_tout = 0; - sock_orphan(ssk); + /* detach from the parent socket, but allow data_ready to + * push incoming data into the mptcp stack, to properly ack it + */ + ssk->sk_socket = NULL; + ssk->sk_wq = NULL; unlock_sock_fast(ssk, slow); } sock_orphan(sk); @@ -2867,7 +2973,7 @@ static void mptcp_close(struct sock *sk, long timeout) sock_put(sk); } -static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) +void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); @@ -3063,7 +3169,7 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) __mptcp_clear_xmit(sk); /* join list will be eventually flushed (with rst) at sock lock release time */ - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) + mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ @@ -3422,10 +3528,73 @@ static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return put_user(answ, (int __user *)arg); } +static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) +{ + subflow->request_mptcp = 0; + __mptcp_do_fallback(msk); +} + +static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *ssock; + int err = -EINVAL; + + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) + return -EINVAL; + + mptcp_token_destroy(msk); + inet_sk_state_store(sk, TCP_SYN_SENT); + subflow = mptcp_subflow_ctx(ssock->sk); +#ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. + */ + if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) + mptcp_subflow_early_fallback(msk, subflow); +#endif + if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) { + MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT); + mptcp_subflow_early_fallback(msk, subflow); + } + if (likely(!__mptcp_check_fallback(msk))) + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE); + + /* if reaching here via the fastopen/sendmsg path, the caller already + * acquired the subflow socket lock, too. + */ + if (msk->is_sendmsg) + err = __inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags, 1); + else + err = inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags); + inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect; + + /* on successful connect, the msk state will be moved to established by + * subflow_finish_connect() + */ + if (unlikely(err && err != -EINPROGRESS)) { + inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); + return err; + } + + mptcp_copy_inaddrs(sk, ssock->sk); + + /* unblocking connect, mptcp-level inet_stream_connect will error out + * without changing the socket state, update it here. + */ + if (err == -EINPROGRESS) + sk->sk_socket->state = ssock->state; + return err; +} + static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, .init = mptcp_init_sock, + .connect = mptcp_connect, .disconnect = mptcp_disconnect, .close = mptcp_close, .accept = mptcp_accept, @@ -3477,77 +3646,16 @@ unlock: return err; } -static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, - struct mptcp_subflow_context *subflow) -{ - subflow->request_mptcp = 0; - __mptcp_do_fallback(msk); -} - static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { - struct mptcp_sock *msk = mptcp_sk(sock->sk); - struct mptcp_subflow_context *subflow; - struct socket *ssock; - int err = -EINVAL; + int ret; lock_sock(sock->sk); - if (uaddr) { - if (addr_len < sizeof(uaddr->sa_family)) - goto unlock; - - if (uaddr->sa_family == AF_UNSPEC) { - err = mptcp_disconnect(sock->sk, flags); - sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; - goto unlock; - } - } - - if (sock->state != SS_UNCONNECTED && msk->subflow) { - /* pending connection or invalid state, let existing subflow - * cope with that - */ - ssock = msk->subflow; - goto do_connect; - } - - ssock = __mptcp_nmpc_socket(msk); - if (!ssock) - goto unlock; - - mptcp_token_destroy(msk); - inet_sk_state_store(sock->sk, TCP_SYN_SENT); - subflow = mptcp_subflow_ctx(ssock->sk); -#ifdef CONFIG_TCP_MD5SIG - /* no MPTCP if MD5SIG is enabled on this socket or we may run out of - * TCP option space. - */ - if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) - mptcp_subflow_early_fallback(msk, subflow); -#endif - if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) { - MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT); - mptcp_subflow_early_fallback(msk, subflow); - } - if (likely(!__mptcp_check_fallback(msk))) - MPTCP_INC_STATS(sock_net(sock->sk), MPTCP_MIB_MPCAPABLEACTIVE); - -do_connect: - err = ssock->ops->connect(ssock, uaddr, addr_len, flags); - sock->state = ssock->state; - - /* on successful connect, the msk state will be moved to established by - * subflow_finish_connect() - */ - if (!err || err == -EINPROGRESS) - mptcp_copy_inaddrs(sock->sk, ssock->sk); - else - inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); - -unlock: + mptcp_sk(sock->sk)->connect_flags = flags; + ret = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); release_sock(sock->sk); - return err; + return ret; } static int mptcp_listen(struct socket *sock, int backlog) @@ -3613,7 +3721,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (mptcp_is_fully_established(newsk)) mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL); - mptcp_copy_inaddrs(newsk, msk->first); mptcp_rcv_space_init(msk, msk->first); mptcp_propagate_sndbuf(newsk, msk->first); @@ -3632,18 +3739,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, return err; } -static __poll_t mptcp_check_readable(struct mptcp_sock *msk) -{ - /* Concurrent splices from sk_receive_queue into receive_queue will - * always show at least one non-empty queue when checked in this order. - */ - if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && - skb_queue_empty_lockless(&msk->receive_queue)) - return 0; - - return EPOLLIN | EPOLLRDNORM; -} - static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; @@ -3685,13 +3780,16 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); mask |= mptcp_check_writeable(msk); + } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { + /* cf tcp_poll() note about TFO */ + mask |= EPOLLOUT | EPOLLWRNORM; } if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - /* This barrier is coupled with smp_wmb() in tcp_reset() */ + /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); if (sk->sk_err) mask |= EPOLLERR; |