Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r-- | net/mptcp/protocol.c | 263
1 file changed, 191 insertions, 72 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9936e33ac351..14b253d10ccf 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -144,12 +144,29 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
 			     unsigned int offset, size_t copy_len)
 {
 	struct sock *sk = (struct sock *)msk;
+	struct sk_buff *tail;
 
 	__skb_unlink(skb, &ssk->sk_receive_queue);
-	skb_set_owner_r(skb, sk);
-	__skb_queue_tail(&sk->sk_receive_queue, skb);
 
+	skb_ext_reset(skb);
+	skb_orphan(skb);
 	msk->ack_seq += copy_len;
+
+	tail = skb_peek_tail(&sk->sk_receive_queue);
+	if (offset == 0 && tail) {
+		bool fragstolen;
+		int delta;
+
+		if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+			kfree_skb_partial(skb, fragstolen);
+			atomic_add(delta, &sk->sk_rmem_alloc);
+			sk_mem_charge(sk, delta);
+			return;
+		}
+	}
+
+	skb_set_owner_r(skb, sk);
+	__skb_queue_tail(&sk->sk_receive_queue, skb);
 	MPTCP_SKB_CB(skb)->offset = offset;
 }
 
@@ -367,8 +384,10 @@ static void mptcp_stop_timer(struct sock *sk)
 
 static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
 {
+	const struct sock *sk = (const struct sock *)msk;
+
 	if (!msk->cached_ext)
-		msk->cached_ext = __skb_ext_alloc();
+		msk->cached_ext = __skb_ext_alloc(sk->sk_allocation);
 
 	return !!msk->cached_ext;
 }
@@ -510,20 +529,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 	 * fooled into a warning if we don't init here
 	 */
 	pfrag = sk_page_frag(sk);
-	while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) ||
-	       !mptcp_ext_cache_refill(msk)) {
-		ret = sk_stream_wait_memory(ssk, timeo);
-		if (ret)
-			return ret;
-
-		/* if sk_stream_wait_memory() sleeps snd_una can change
-		 * significantly, refresh the rtx queue
-		 */
-		mptcp_clean_una(sk);
-
-		if (unlikely(__mptcp_needs_tcp_fallback(msk)))
-			return 0;
-	}
 	if (!retransmission) {
 		write_seq = &msk->write_seq;
 		page = pfrag->page;
@@ -590,7 +595,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 	 * access the skb after the sendpages call
 	 */
 	ret = do_tcp_sendpages(ssk, page, offset, psize,
-			       msg->msg_flags | MSG_SENDPAGE_NOTLAST);
+			       msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT);
 	if (ret <= 0)
 		return ret;
 
@@ -653,6 +658,15 @@ out:
 	return ret;
 }
 
+static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
+{
+	clear_bit(MPTCP_SEND_SPACE, &msk->flags);
+	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
+
+	/* enables sk->write_space() callbacks */
+	set_bit(SOCK_NOSPACE, &sock->flags);
+}
+
 static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 {
 	struct mptcp_subflow_context *subflow;
@@ -660,19 +674,17 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 
 	sock_owned_by_me((const struct sock *)msk);
 
+	if (!mptcp_ext_cache_refill(msk))
+		return NULL;
+
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
 		if (!sk_stream_memory_free(ssk)) {
 			struct socket *sock = ssk->sk_socket;
 
-			if (sock) {
-				clear_bit(MPTCP_SEND_SPACE, &msk->flags);
-				smp_mb__after_atomic();
-
-				/* enables sk->write_space() callbacks */
-				set_bit(SOCK_NOSPACE, &sock->flags);
-			}
+			if (sock)
+				mptcp_nospace(msk, sock);
 
 			return NULL;
 		}
@@ -698,22 +710,19 @@ static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
 		return;
 
 	sock = READ_ONCE(ssk->sk_socket);
-
-	if (sock) {
-		clear_bit(MPTCP_SEND_SPACE, &msk->flags);
-		smp_mb__after_atomic();
-		/* set NOSPACE only after clearing SEND_SPACE flag */
-		set_bit(SOCK_NOSPACE, &sock->flags);
-	}
+	if (sock)
+		mptcp_nospace(msk, sock);
 }
 
 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	int mss_now = 0, size_goal = 0, ret = 0;
 	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct page_frag *pfrag;
 	struct socket *ssock;
 	size_t copied = 0;
 	struct sock *ssk;
+	bool tx_ok;
 	long timeo;
 
 	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
@@ -738,11 +747,29 @@ fallback:
 		return ret >= 0 ? ret + copied : (copied ? copied : ret);
 	}
 
+	pfrag = sk_page_frag(sk);
+
+restart:
 	mptcp_clean_una(sk);
 
+wait_for_sndbuf:
 	__mptcp_flush_join_list(msk);
 	ssk = mptcp_subflow_get_send(msk);
-	while (!sk_stream_memory_free(sk) || !ssk) {
+	while (!sk_stream_memory_free(sk) ||
+	       !ssk ||
+	       !mptcp_page_frag_refill(ssk, pfrag)) {
+		if (ssk) {
+			/* make sure retransmit timer is
+			 * running before we wait for memory.
+			 *
+			 * The retransmit timer might be needed
+			 * to make the peer send an up-to-date
+			 * MPTCP Ack.
+			 */
+			mptcp_set_timeout(sk, ssk);
+			if (!mptcp_timer_pending(sk))
+				mptcp_reset_timer(sk);
+		}
+
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret)
 			goto out;
@@ -759,11 +786,18 @@ fallback:
 	pr_debug("conn_list->subflow=%p", ssk);
 
 	lock_sock(ssk);
-	while (msg_data_left(msg)) {
+	tx_ok = msg_data_left(msg);
+	while (tx_ok) {
 		ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
 					 &size_goal);
-		if (ret < 0)
+		if (ret < 0) {
+			if (ret == -EAGAIN && timeo > 0) {
+				mptcp_set_timeout(sk, ssk);
+				release_sock(ssk);
+				goto restart;
+			}
 			break;
+		}
 		if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) {
 			/* Can happen for passive sockets:
 			 * 3WHS negotiated MPTCP, but first packet after is
@@ -777,6 +811,50 @@ fallback:
 		}
 
 		copied += ret;
+
+		tx_ok = msg_data_left(msg);
+		if (!tx_ok)
+			break;
+
+		if (!sk_stream_memory_free(ssk) ||
+		    !mptcp_page_frag_refill(ssk, pfrag) ||
+		    !mptcp_ext_cache_refill(msk)) {
+			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			tcp_push(ssk, msg->msg_flags, mss_now,
+				 tcp_sk(ssk)->nonagle, size_goal);
+			mptcp_set_timeout(sk, ssk);
+			release_sock(ssk);
+			goto restart;
+		}
+
+		/* memory is charged to mptcp level socket as well, i.e.
+		 * if msg is very large, mptcp socket may run out of buffer
+		 * space. mptcp_clean_una() will release data that has
+		 * been acked at mptcp level in the mean time, so there is
+		 * a good chance we can continue sending data right away.
+		 *
+		 * Normally, when the tcp subflow can accept more data, then
+		 * so can the MPTCP socket. However, we need to cope with
+		 * peers that might lag behind in their MPTCP-level
+		 * acknowledgements, i.e. data might have been acked at
+		 * tcp level only. So, we must also check the MPTCP socket
+		 * limits before we send more data.
+		 */
+		if (unlikely(!sk_stream_memory_free(sk))) {
+			tcp_push(ssk, msg->msg_flags, mss_now,
+				 tcp_sk(ssk)->nonagle, size_goal);
+			mptcp_clean_una(sk);
+			if (!sk_stream_memory_free(sk)) {
+				/* can't send more for now, need to wait for
+				 * MPTCP-level ACKs from peer.
+				 *
+				 * Wakeup will happen via mptcp_clean_una().
+				 */
+				mptcp_set_timeout(sk, ssk);
+				release_sock(ssk);
+				goto wait_for_sndbuf;
+			}
+		}
 	}
 
 	mptcp_set_timeout(sk, ssk);
@@ -954,7 +1032,8 @@ fallback:
 
 		pr_debug("block timeout %ld", timeo);
 		mptcp_wait_data(sk, &timeo);
-		if (unlikely(__mptcp_tcp_fallback(msk)))
+		ssock = __mptcp_tcp_fallback(msk);
+		if (unlikely(ssock))
 			goto fallback;
 	}
 
@@ -1094,7 +1173,7 @@ static void mptcp_worker(struct work_struct *work)
 {
 	struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
 	struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
-	int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0;
+	int orig_len, orig_offset, mss_now = 0, size_goal = 0;
 	struct mptcp_data_frag *dfrag;
 	u64 orig_write_seq;
 	size_t copied = 0;
@@ -1116,6 +1195,9 @@ static void mptcp_worker(struct work_struct *work)
 	if (!dfrag)
 		goto unlock;
 
+	if (!mptcp_ext_cache_refill(msk))
+		goto reset_unlock;
+
 	ssk = mptcp_subflow_get_retrans(msk);
 	if (!ssk)
 		goto reset_unlock;
@@ -1127,8 +1209,8 @@ static void mptcp_worker(struct work_struct *work)
 	orig_offset = dfrag->offset;
 	orig_write_seq = dfrag->data_seq;
 	while (dfrag->data_len > 0) {
-		ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, &mss_now,
-					 &size_goal);
+		int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo,
+					     &mss_now, &size_goal);
 		if (ret < 0)
 			break;
 
@@ -1136,6 +1218,9 @@ static void mptcp_worker(struct work_struct *work)
 		copied += ret;
 		dfrag->data_len -= ret;
 		dfrag->offset += ret;
+
+		if (!mptcp_ext_cache_refill(msk))
+			break;
 	}
 	if (copied)
 		tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
@@ -1262,11 +1347,14 @@ static void mptcp_close(struct sock *sk, long timeout)
 
 	lock_sock(sk);
 
-	mptcp_token_destroy(msk->token);
 	inet_sk_state_store(sk, TCP_CLOSE);
 
-	__mptcp_flush_join_list(msk);
-
+	/* be sure to always acquire the join list lock, to sync vs
+	 * mptcp_finish_join().
+	 */
+	spin_lock_bh(&msk->join_list_lock);
+	list_splice_tail_init(&msk->join_list, &msk->conn_list);
+	spin_unlock_bh(&msk->join_list_lock);
 	list_splice_init(&msk->conn_list, &conn_list);
 
 	data_fin_tx_seq = msk->write_seq;
@@ -1316,11 +1404,12 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
 
 static int mptcp_disconnect(struct sock *sk, int flags)
 {
-	lock_sock(sk);
-	__mptcp_clear_xmit(sk);
-	release_sock(sk);
-	mptcp_cancel_work(sk);
-	return tcp_disconnect(sk, flags);
+	/* Should never be called.
+	 * inet_stream_connect() calls ->disconnect, but that
+	 * refers to the subflow socket, not the mptcp one.
+	 */
+	WARN_ON_ONCE(1);
+	return 0;
 }
 
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -1332,7 +1421,9 @@ static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
 }
 #endif
 
-struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req)
+struct sock *mptcp_sk_clone(const struct sock *sk,
+			    const struct mptcp_options_received *mp_opt,
+			    struct request_sock *req)
 {
 	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
 	struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
@@ -1355,26 +1446,30 @@ struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req)
 	msk->subflow = NULL;
 
 	if (unlikely(mptcp_token_new_accept(subflow_req->token, nsk))) {
+		nsk->sk_state = TCP_CLOSE;
 		bh_unlock_sock(nsk);
 
 		/* we can't call into mptcp_close() here - possible BH context
-		 * free the sock directly
+		 * free the sock directly.
+		 * sk_clone_lock() sets nsk refcnt to two, hence call sk_free()
+		 * too.
 		 */
-		nsk->sk_prot->destroy(nsk);
+		sk_common_release(nsk);
 		sk_free(nsk);
 		return NULL;
 	}
 
 	msk->write_seq = subflow_req->idsn + 1;
 	atomic64_set(&msk->snd_una, msk->write_seq);
-	if (subflow_req->remote_key_valid) {
+	if (mp_opt->mp_capable) {
 		msk->can_ack = true;
-		msk->remote_key = subflow_req->remote_key;
+		msk->remote_key = mp_opt->sndr_key;
 		mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
 		ack_seq++;
 		msk->ack_seq = ack_seq;
 	}
 
+	sock_reset_flag(nsk, SOCK_RCU_FREE);
 	/* will be fully established after successful MPC subflow creation */
 	inet_sk_state_store(nsk, TCP_SYN_RECV);
 	bh_unlock_sock(nsk);
@@ -1431,6 +1526,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
 		newsk = new_mptcp_sock;
 		mptcp_copy_inaddrs(newsk, ssk);
 		list_add(&subflow->node, &msk->conn_list);
+		inet_sk_state_store(newsk, TCP_ESTABLISHED);
 
 		bh_unlock_sock(new_mptcp_sock);
 
@@ -1448,6 +1544,7 @@ static void mptcp_destroy(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
+	mptcp_token_destroy(msk->token);
 	if (msk->cached_ext)
 		__skb_ext_put(msk->cached_ext);
 
@@ -1614,27 +1711,30 @@ bool mptcp_finish_join(struct sock *sk)
 	if (!msk->pm.server_side)
 		return true;
 
-	/* passive connection, attach to msk socket */
+	if (!mptcp_pm_allow_new_subflow(msk))
+		return false;
+
+	/* active connections are already on conn_list, and we can't acquire
+	 * msk lock here.
+	 * use the join list lock as synchronization point and double-check
+	 * msk status to avoid racing with mptcp_close()
+	 */
+	spin_lock_bh(&msk->join_list_lock);
+	ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
+	if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node)))
+		list_add_tail(&subflow->node, &msk->join_list);
+	spin_unlock_bh(&msk->join_list_lock);
+	if (!ret)
+		return false;
+
+	/* attach to msk socket only after we are sure he will deal with us
+	 * at close time
+	 */
 	parent_sock = READ_ONCE(parent->sk_socket);
 	if (parent_sock && !sk->sk_socket)
 		mptcp_sock_graft(sk, parent_sock);
-
-	ret = mptcp_pm_allow_new_subflow(msk);
-	if (ret) {
-		/* active connections are already on conn_list */
-		spin_lock_bh(&msk->join_list_lock);
-		if (!WARN_ON_ONCE(!list_empty(&subflow->node)))
-			list_add_tail(&subflow->node, &msk->join_list);
-		spin_unlock_bh(&msk->join_list_lock);
-	}
-	return ret;
-}
-
-bool mptcp_sk_is_subflow(const struct sock *sk)
-{
-	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
-
-	return subflow->mp_join == 1;
+	subflow->map_seq = msk->ack_seq;
+	return true;
 }
 
 static bool mptcp_memory_free(const struct sock *sk, int wake)
@@ -1701,6 +1801,14 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	int err;
 
 	lock_sock(sock->sk);
+	if (sock->state != SS_UNCONNECTED && msk->subflow) {
+		/* pending connection or invalid state, let existing subflow
+		 * cope with that
+		 */
+		ssock = msk->subflow;
+		goto do_connect;
+	}
+
 	ssock = __mptcp_socket_create(msk, TCP_SYN_SENT);
 	if (IS_ERR(ssock)) {
 		err = PTR_ERR(ssock);
@@ -1715,9 +1823,17 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0;
 #endif
 
+do_connect:
 	err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
-	inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
-	mptcp_copy_inaddrs(sock->sk, ssock->sk);
+	sock->state = ssock->state;
+
+	/* on successful connect, the msk state will be moved to established by
+	 * subflow_finish_connect()
+	 */
+	if (!err || err == EINPROGRESS)
+		mptcp_copy_inaddrs(sock->sk, ssock->sk);
+	else
+		inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
 
 unlock:
 	release_sock(sock->sk);
@@ -1775,6 +1891,8 @@ static int mptcp_listen(struct socket *sock, int backlog)
 		goto unlock;
 	}
 
+	sock_set_flag(sock->sk, SOCK_RCU_FREE);
+
 	err = ssock->ops->listen(ssock, backlog);
 	inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
 	if (!err)
@@ -1996,6 +2114,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
 #ifdef CONFIG_COMPAT
+	.compat_ioctl	   = inet6_compat_ioctl,
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
 #endif
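
The first hunk (__mptcp_move_skb()) stops unconditionally queueing every subflow skb on the msk receive queue: in-order data (offset == 0) is first offered to the tail skb via skb_try_coalesce(), and only when merging fails is a new queue node linked in. The sketch below is a user-space model of that pattern, not the kernel API; the types and names (struct buf, rx_queue, try_coalesce) are invented for illustration. It shows the payoff: many small in-order chunks collapse into one queue node, with the memory charge bumped by the merged delta rather than a full per-buffer charge.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy stand-ins for sk_buff and the msk receive queue (hypothetical). */
struct buf {
	struct buf *next;
	size_t len, cap;
	char data[];
};

struct rx_queue {
	struct buf *head, *tail;
	size_t rmem;		/* models sk_rmem_alloc accounting */
};

/* Mirrors the skb_try_coalesce() fast path: only merge data that is
 * in-order (offset == 0 in the kernel code) and fits the tail's tailroom.
 */
static bool try_coalesce(struct rx_queue *q, const char *data, size_t len)
{
	struct buf *tail = q->tail;

	if (!tail || tail->len + len > tail->cap)
		return false;

	memcpy(tail->data + tail->len, data, len);
	tail->len += len;
	q->rmem += len;		/* like atomic_add(delta, &sk->sk_rmem_alloc) */
	return true;
}

static void enqueue(struct rx_queue *q, const char *data, size_t len)
{
	struct buf *b;

	if (try_coalesce(q, data, len))
		return;		/* merged: no new queue node needed */

	b = malloc(sizeof(*b) + 4096);	/* demo assumes len <= 4096 */
	if (!b)
		exit(1);
	b->next = NULL;
	b->cap = 4096;
	b->len = len;
	memcpy(b->data, data, len);
	if (q->tail)
		q->tail->next = b;
	else
		q->head = b;
	q->tail = b;
	q->rmem += len;
}

int main(void)
{
	struct rx_queue q = { 0 };
	int i, nodes = 0;

	for (i = 0; i < 100; i++)
		enqueue(&q, "0123456789", 10);	/* 100 tiny in-order chunks */
	for (struct buf *b = q.head; b; b = b->next)
		nodes++;
	printf("%d queue node(s) for %zu bytes\n", nodes, q.rmem);
	return 0;
}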
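mptcp_nospace(), factored out of its two former call sites above, must clear MPTCP_SEND_SPACE before setting SOCK_NOSPACE, with a barrier in between, so a concurrent write_space callback either sees NOSPACE and issues the wakeup or the sender re-checks memory after arming the flags. Below is a hedged user-space model of that handshake, with C11 atomics and a condition variable standing in for the kernel's bitops and socket wait queue; all names are illustrative, not kernel symbols.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool send_space = true;
static atomic_bool nospace;
static atomic_int free_bytes;	/* models sk_stream_memory_free() */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;

/* models MPTCP-level ack processing plus the write_space callback */
static void *acker(void *arg)
{
	(void)arg;
	atomic_store(&free_bytes, 4096);	/* acks freed write memory */
	if (atomic_load(&nospace)) {		/* callback sees NOSPACE... */
		atomic_store(&send_space, true);
		pthread_mutex_lock(&lock);
		pthread_cond_signal(&wake);	/* ...and wakes the sender */
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	/* sender found no memory: arm the flags, then wait */
	atomic_store(&send_space, false);	   /* clear_bit(MPTCP_SEND_SPACE) */
	atomic_thread_fence(memory_order_seq_cst); /* smp_mb__after_atomic() */
	atomic_store(&nospace, true);		   /* set_bit(SOCK_NOSPACE) */

	pthread_create(&t, NULL, acker, NULL);

	/* re-check the predicates before sleeping, as the kernel does */
	pthread_mutex_lock(&lock);
	while (!atomic_load(&send_space) && atomic_load(&free_bytes) == 0)
		pthread_cond_wait(&wake, &lock);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	printf("sender resumed with %d bytes free\n", atomic_load(&free_bytes));
	return 0;
}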
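The long comment added in the sendmsg hunk describes two-level accounting: outgoing data must fit both the subflow's TCP send buffer, freed by TCP-level acks, and the MPTCP-level limits, freed only when the peer acks at the MPTCP level, which may lag. The toy model below, with invented names and numbers, reproduces the wait_for_sndbuf situation: the subflow could still accept data while the MPTCP level is full.

#include <stdbool.h>
#include <stdio.h>

struct level { int used, limit; };

static bool memory_free(const struct level *l)
{
	return l->used < l->limit;
}

int main(void)
{
	struct level tcp = { .used = 0, .limit = 64 };
	struct level mptcp = { .used = 0, .limit = 96 };
	int sent = 0, chunk;

	for (chunk = 0; chunk < 16; chunk++) {
		if (!memory_free(&tcp)) {
			tcp.used = 0;	/* TCP-level acks free subflow memory */
			continue;
		}
		if (!memory_free(&mptcp)) {
			/* subflow could take more, but the peer's MPTCP-level
			 * acks lag: this is the wait_for_sndbuf case.
			 */
			printf("blocked at MPTCP level after %d bytes\n", sent);
			break;
		}
		tcp.used += 8;
		mptcp.used += 8;
		sent += 8;
	}
	return 0;
}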
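Finally, the mptcp_close()/mptcp_finish_join() hunks close a race by making both paths take join_list_lock, with the joining side re-checking the parent socket state under the lock before linking itself in. A user-space sketch of the same discipline follows (a pthread mutex instead of a bh spinlock, made-up list types): once close() has flipped the state and spliced the list, no joiner can append, so no subflow is ever left stranded on join_list.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

enum state { ESTABLISHED, CLOSED };

struct node { struct node *next; };

static pthread_mutex_t join_list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *join_list;
static struct node *conn_list;
static enum state sk_state = ESTABLISHED;

/* like mptcp_finish_join(): only enqueue while the msk is still alive */
static int finish_join(struct node *n)
{
	int ok;

	pthread_mutex_lock(&join_list_lock);
	ok = (sk_state == ESTABLISHED);	/* double-check under the lock */
	if (ok) {
		n->next = join_list;
		join_list = n;
	}
	pthread_mutex_unlock(&join_list_lock);
	return ok;			/* caller drops the subflow on failure */
}

/* like mptcp_close(): always take the lock, then splice join_list away */
static void do_close(void)
{
	pthread_mutex_lock(&join_list_lock);
	sk_state = CLOSED;
	while (join_list) {		/* list_splice_tail_init() equivalent */
		struct node *n = join_list;

		join_list = n->next;
		n->next = conn_list;
		conn_list = n;
	}
	pthread_mutex_unlock(&join_list_lock);
	/* every joined subflow is now on conn_list and will be torn down */
}

static void *joiner(void *arg)
{
	struct node *n = malloc(sizeof(*n));

	(void)arg;
	if (n && !finish_join(n))
		free(n);		/* close() won: drop the subflow */
	return NULL;
}

int main(void)
{
	pthread_t t[8];
	int i, conns = 0;

	for (i = 0; i < 8; i++)
		pthread_create(&t[i], NULL, joiner, NULL);
	do_close();
	for (i = 0; i < 8; i++)
		pthread_join(t[i], NULL);

	for (struct node *n = conn_list; n; n = n->next)
		conns++;
	printf("%d subflow(s) spliced, join_list empty: %s\n",
	       conns, join_list ? "no" : "yes");
	return 0;
}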