Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r-- | net/mptcp/protocol.c | 820
1 file changed, 608 insertions(+), 212 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 4b7794835fea..2540d82742ac 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -60,7 +60,7 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
 /* Returns end sequence number of the receiver's advertised window */
 static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
 {
-	return atomic64_read(&msk->wnd_end);
+	return READ_ONCE(msk->wnd_end);
 }
 
 static bool mptcp_is_tcpsk(struct sock *sk)
@@ -348,17 +348,22 @@ static void mptcp_close_wake_up(struct sock *sk)
 		sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
 }
 
-static void mptcp_check_data_fin_ack(struct sock *sk)
+static bool mptcp_pending_data_fin_ack(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
-	if (__mptcp_check_fallback(msk))
-		return;
+	return !__mptcp_check_fallback(msk) &&
+	       ((1 << sk->sk_state) &
+		(TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
+	       msk->write_seq == READ_ONCE(msk->snd_una);
+}
+
+static void mptcp_check_data_fin_ack(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
 
 	/* Look for an acknowledged DATA_FIN */
-	if (((1 << sk->sk_state) &
-	     (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
-	    msk->write_seq == atomic64_read(&msk->snd_una)) {
+	if (mptcp_pending_data_fin_ack(sk)) {
 		mptcp_stop_timer(sk);
 
 		WRITE_ONCE(msk->snd_data_fin_enable, 0);
@@ -419,31 +424,57 @@ static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
 	return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
 }
 
-static void mptcp_send_ack(struct mptcp_sock *msk, bool force)
+static bool tcp_can_send_ack(const struct sock *ssk)
+{
+	return !((1 << inet_sk_state_load(ssk)) &
+		 (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE));
+}
+
+static void mptcp_send_ack(struct mptcp_sock *msk)
 {
 	struct mptcp_subflow_context *subflow;
-	struct sock *pick = NULL;
 
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
-		if (force) {
-			lock_sock(ssk);
+		lock_sock(ssk);
+		if (tcp_can_send_ack(ssk))
 			tcp_send_ack(ssk);
-			release_sock(ssk);
-			continue;
-		}
-
-		/* if the hintes ssk is still active, use it */
-		pick = ssk;
-		if (ssk == msk->ack_hint)
-			break;
+		release_sock(ssk);
 	}
-
-	if (!force && pick) {
-		lock_sock(pick);
-		tcp_cleanup_rbuf(pick, 1);
-		release_sock(pick);
+}
+
+static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk)
+{
+	int ret;
+
+	lock_sock(ssk);
+	ret = tcp_can_send_ack(ssk);
+	if (ret)
+		tcp_cleanup_rbuf(ssk, 1);
+	release_sock(ssk);
+	return ret;
+}
+
+static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
+{
+	struct sock *ack_hint = READ_ONCE(msk->ack_hint);
+	struct mptcp_subflow_context *subflow;
+
+	/* if the hinted ssk is still active, try to use it */
+	if (likely(ack_hint)) {
+		mptcp_for_each_subflow(msk, subflow) {
+			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+			if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk))
+				return;
+		}
 	}
+
+	/* otherwise pick the first active subflow */
+	mptcp_for_each_subflow(msk, subflow)
+		if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow)))
+			return;
 }
 
 static bool mptcp_check_data_fin(struct sock *sk)
@@ -494,7 +525,7 @@ static bool mptcp_check_data_fin(struct sock *sk)
 		ret = true;
 		mptcp_set_timeout(sk, NULL);
-		mptcp_send_ack(msk, true);
+		mptcp_send_ack(msk);
 		mptcp_close_wake_up(sk);
 	}
 	return ret;
 }
@@ -588,13 +619,13 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 			break;
 		}
 	} while (more_data_avail);
-	msk->ack_hint = ssk;
+	WRITE_ONCE(msk->ack_hint, ssk);
 
 	*bytes += moved;
 	return done;
 }
 
-static bool mptcp_ofo_queue(struct mptcp_sock *msk)
+static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
 {
 	struct sock *sk = (struct sock *)msk;
 	struct sk_buff *skb, *tail;
@@ -640,34 +671,27 @@ static bool mptcp_ofo_queue(struct mptcp_sock *msk)
 /* In most cases we will be able to lock the mptcp socket.  If its already
  * owned, we need to defer to the work queue to avoid ABBA deadlock.
  */
-static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
+static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
 {
 	struct sock *sk = (struct sock *)msk;
 	unsigned int moved = 0;
 
-	if (READ_ONCE(sk->sk_lock.owned))
-		return false;
+	if (inet_sk_state_load(sk) == TCP_CLOSE)
+		return;
 
-	if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock)))
-		return false;
+	mptcp_data_lock(sk);
 
-	/* must re-check after taking the lock */
-	if (!READ_ONCE(sk->sk_lock.owned)) {
-		__mptcp_move_skbs_from_subflow(msk, ssk, &moved);
-		mptcp_ofo_queue(msk);
+	__mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+	__mptcp_ofo_queue(msk);
 
-		/* If the moves have caught up with the DATA_FIN sequence number
-		 * it's time to ack the DATA_FIN and change socket state, but
-		 * this is not a good place to change state. Let the workqueue
-		 * do it.
-		 */
-		if (mptcp_pending_data_fin(sk, NULL))
-			mptcp_schedule_work(sk);
-	}
-
-	spin_unlock_bh(&sk->sk_lock.slock);
-
-	return moved > 0;
+	/* If the moves have caught up with the DATA_FIN sequence number
+	 * it's time to ack the DATA_FIN and change socket state, but
+	 * this is not a good place to change state. Let the workqueue
	 * do it.
+	 */
+	if (mptcp_pending_data_fin(sk, NULL))
+		mptcp_schedule_work(sk);
+	mptcp_data_unlock(sk);
 }
 
 void mptcp_data_ready(struct sock *sk, struct sock *ssk)
@@ -677,6 +701,13 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
 	int sk_rbuf, ssk_rbuf;
 	bool wake;
 
+	/* The peer can send data while we are shutting down this
+	 * subflow at msk destruction time, but we must avoid enqueuing
+	 * more data to the msk receive queue
+	 */
+	if (unlikely(subflow->disposable))
+		return;
+
 	/* move_skbs_to_msk below can legitly clear the data_avail flag,
 	 * but we will need later to properly woke the reader, cache its
 	 * value
@@ -745,16 +776,6 @@ bool mptcp_schedule_work(struct sock *sk)
 	return false;
 }
 
-void mptcp_data_acked(struct sock *sk)
-{
-	mptcp_reset_timer(sk);
-
-	if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) ||
-	     mptcp_send_head(sk) ||
-	     (inet_sk_state_load(sk) != TCP_ESTABLISHED)))
-		mptcp_schedule_work(sk);
-}
-
 void mptcp_subflow_eof(struct sock *sk)
 {
 	if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags))
@@ -799,16 +820,6 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk)
 		mptcp_close_wake_up(sk);
 }
 
-static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
-{
-	const struct sock *sk = (const struct sock *)msk;
-
-	if (!msk->cached_ext)
-		msk->cached_ext = __skb_ext_alloc(sk->sk_allocation);
-
-	return !!msk->cached_ext;
-}
-
 static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
 {
 	struct mptcp_subflow_context *subflow;
@@ -847,6 +858,121 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
 		df->data_seq + df->data_len == msk->write_seq;
 }
 
+static int mptcp_wmem_with_overhead(struct sock *sk, int size)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	int ret, skbs;
+
+	ret = size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
+	skbs = (msk->tx_pending_data + size) / msk->size_goal_cache;
+	if (skbs < msk->skb_tx_cache.qlen)
+		return ret;
+
+	return ret + (skbs - msk->skb_tx_cache.qlen) * SKB_TRUESIZE(MAX_TCP_HEADER);
+}
+
+static void __mptcp_wmem_reserve(struct sock *sk, int size)
+{
+	int amount = mptcp_wmem_with_overhead(sk, size);
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	WARN_ON_ONCE(msk->wmem_reserved);
+	if (amount <= sk->sk_forward_alloc)
+		goto reserve;
+
+	/* under memory pressure try to reserve at most a single page
+	 * otherwise try to reserve the full estimate and fallback
+	 * to a single page before entering the error path
+	 */
+	if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
+	    !sk_wmem_schedule(sk, amount)) {
+		if (amount <= PAGE_SIZE)
+			goto nomem;
+
+		amount = PAGE_SIZE;
+		if (!sk_wmem_schedule(sk, amount))
+			goto nomem;
+	}
+
+reserve:
+	msk->wmem_reserved = amount;
+	sk->sk_forward_alloc -= amount;
+	return;
+
+nomem:
+	/* we will wait for memory on next allocation */
+	msk->wmem_reserved = -1;
+}
+
+static void __mptcp_update_wmem(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	if (!msk->wmem_reserved)
+		return;
+
+	if (msk->wmem_reserved < 0)
+		msk->wmem_reserved = 0;
+	if (msk->wmem_reserved > 0) {
+		sk->sk_forward_alloc += msk->wmem_reserved;
+		msk->wmem_reserved = 0;
+	}
+}
+
+static bool mptcp_wmem_alloc(struct sock *sk, int size)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	/* check for pre-existing error condition */
+	if (msk->wmem_reserved < 0)
+		return false;
+
+	if (msk->wmem_reserved >= size)
+		goto account;
+
+	mptcp_data_lock(sk);
+	if (!sk_wmem_schedule(sk, size)) {
+		mptcp_data_unlock(sk);
+		return false;
+	}
+
+	sk->sk_forward_alloc -= size;
+	msk->wmem_reserved += size;
+	mptcp_data_unlock(sk);
+
+account:
+	msk->wmem_reserved -= size;
+	return true;
+}
+
+static void mptcp_wmem_uncharge(struct sock *sk, int size)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	if (msk->wmem_reserved < 0)
+		msk->wmem_reserved = 0;
+	msk->wmem_reserved += size;
+}
+
+static void mptcp_mem_reclaim_partial(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	/* if we are experiencing a transient allocation error,
+	 * the forward allocation memory has been already
+	 * released
+	 */
+	if (msk->wmem_reserved < 0)
+		return;
+
+	mptcp_data_lock(sk);
+	sk->sk_forward_alloc += msk->wmem_reserved;
+	sk_mem_reclaim_partial(sk);
+	msk->wmem_reserved = sk->sk_forward_alloc;
+	sk->sk_forward_alloc = 0;
+	mptcp_data_unlock(sk);
+}
+
 static void dfrag_uncharge(struct sock *sk, int len)
 {
 	sk_mem_uncharge(sk, len);
@@ -862,7 +988,7 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
 	put_page(dfrag->page);
 }
 
-static void mptcp_clean_una(struct sock *sk)
+static void __mptcp_clean_una(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_data_frag *dtmp, *dfrag;
@@ -873,10 +999,9 @@ static void mptcp_clean_una(struct sock *sk)
 	 * plain TCP
 	 */
 	if (__mptcp_check_fallback(msk))
-		atomic64_set(&msk->snd_una, msk->snd_nxt);
-
-	snd_una = atomic64_read(&msk->snd_una);
+		msk->snd_una = READ_ONCE(msk->snd_nxt);
 
+	snd_una = msk->snd_una;
 	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
 		if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
			break;
@@ -904,36 +1029,34 @@ static void mptcp_clean_una(struct sock *sk)
 	}
 
 out:
-	if (cleaned)
-		sk_mem_reclaim_partial(sk);
-}
-
-static void mptcp_clean_una_wakeup(struct sock *sk)
-{
-	struct mptcp_sock *msk = mptcp_sk(sk);
+	if (cleaned) {
+		if (tcp_under_memory_pressure(sk)) {
+			__mptcp_update_wmem(sk);
+			sk_mem_reclaim_partial(sk);
+		}
 
-	mptcp_clean_una(sk);
+		if (sk_stream_is_writeable(sk)) {
+			/* pairs with memory barrier in mptcp_poll */
+			smp_mb();
+			if (test_and_clear_bit(MPTCP_NOSPACE, &msk->flags))
+				sk_stream_write_space(sk);
+		}
+	}
 
-	/* Only wake up writers if a subflow is ready */
-	if (sk_stream_is_writeable(sk)) {
-		clear_bit(MPTCP_NOSPACE, &msk->flags);
-		sk_stream_write_space(sk);
+	if (snd_una == READ_ONCE(msk->snd_nxt)) {
+		if (msk->timer_ival)
+			mptcp_stop_timer(sk);
+	} else {
+		mptcp_reset_timer(sk);
 	}
 }
 
-/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
- * data
- */
-static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+static void mptcp_enter_memory_pressure(struct sock *sk)
 {
 	struct mptcp_subflow_context *subflow;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	bool first = true;
 
-	if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
-					pfrag, sk->sk_allocation)))
-		return true;
-
 	sk_stream_moderate_sndbuf(sk);
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -943,6 +1066,18 @@ static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
 		sk_stream_moderate_sndbuf(ssk);
 		first = false;
 	}
+}
+
+/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
+ * data
+ */
+static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+	if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
+					pfrag, sk->sk_allocation)))
+		return true;
+
+	mptcp_enter_memory_pressure(sk);
 	return false;
 }
 
@@ -989,6 +1124,128 @@ static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq,
 	return avail_size;
 }
 
+static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp)
+{
+	struct skb_ext *mpext = __skb_ext_alloc(gfp);
+
+	if (!mpext)
+		return false;
+	__skb_ext_set(skb, SKB_EXT_MPTCP, mpext);
+	return true;
+}
+
+static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
+{
+	struct sk_buff *skb;
+
+	skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
+	if (likely(skb)) {
+		if (likely(__mptcp_add_ext(skb, gfp))) {
+			skb_reserve(skb, MAX_TCP_HEADER);
+			skb->reserved_tailroom = skb->end - skb->tail;
+			return skb;
+		}
+		__kfree_skb(skb);
+	} else {
+		mptcp_enter_memory_pressure(sk);
+	}
+	return NULL;
+}
+
+static bool mptcp_tx_cache_refill(struct sock *sk, int size,
+				  struct sk_buff_head *skbs, int *total_ts)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct sk_buff *skb;
+	int space_needed;
+
+	if (unlikely(tcp_under_memory_pressure(sk))) {
+		mptcp_mem_reclaim_partial(sk);
+
+		/* under pressure pre-allocate at most a single skb */
+		if (msk->skb_tx_cache.qlen)
+			return true;
+		space_needed = msk->size_goal_cache;
+	} else {
+		space_needed = msk->tx_pending_data + size -
+			       msk->skb_tx_cache.qlen * msk->size_goal_cache;
+	}
+
+	while (space_needed > 0) {
+		skb = __mptcp_do_alloc_tx_skb(sk, sk->sk_allocation);
+		if (unlikely(!skb)) {
+			/* under memory pressure, try to pass the caller a
+			 * single skb to allow forward progress
+			 */
+			while (skbs->qlen > 1) {
+				skb = __skb_dequeue_tail(skbs);
+				__kfree_skb(skb);
+			}
+			return skbs->qlen > 0;
+		}
+
+		*total_ts += skb->truesize;
+		__skb_queue_tail(skbs, skb);
+		space_needed -= msk->size_goal_cache;
+	}
+	return true;
+}
+
+static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct sk_buff *skb;
+
+	if (ssk->sk_tx_skb_cache) {
+		skb = ssk->sk_tx_skb_cache;
+		if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) &&
+			     !__mptcp_add_ext(skb, gfp)))
+			return false;
+		return true;
+	}
+
+	skb = skb_peek(&msk->skb_tx_cache);
+	if (skb) {
+		if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
+			skb = __skb_dequeue(&msk->skb_tx_cache);
+			if (WARN_ON_ONCE(!skb))
+				return false;
+
+			mptcp_wmem_uncharge(sk, skb->truesize);
+			ssk->sk_tx_skb_cache = skb;
+			return true;
+		}
+
+		/* over memory limit, no point to try to allocate a new skb */
+		return false;
+	}
+
+	skb = __mptcp_do_alloc_tx_skb(sk, gfp);
+	if (!skb)
+		return false;
+
+	if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
+		ssk->sk_tx_skb_cache = skb;
+		return true;
+	}
+	kfree_skb(skb);
+	return false;
+}
+
+static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk)
+{
+	return !ssk->sk_tx_skb_cache &&
+	       !skb_peek(&mptcp_sk(sk)->skb_tx_cache) &&
+	       tcp_under_memory_pressure(sk);
+}
+
+static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk)
+{
+	if (unlikely(mptcp_must_reclaim_memory(sk, ssk)))
+		mptcp_mem_reclaim_partial(sk);
+	return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation);
+}
+
 static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 			      struct mptcp_data_frag *dfrag,
 			      struct mptcp_sendmsg_info *info)
@@ -1000,7 +1257,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 	struct sk_buff *skb, *tail;
 	bool can_collapse = false;
 	int avail_size;
-	size_t ret;
+	size_t ret = 0;
 
 	pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d",
 		 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
@@ -1008,6 +1265,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 	/* compute send limit */
 	info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
 	avail_size = info->size_goal;
+	msk->size_goal_cache = info->size_goal;
 	skb = tcp_write_queue_tail(ssk);
 	if (skb) {
 		/* Limit the write to the size available in the
@@ -1028,10 +1286,12 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 	/* Zero window and all data acked? Probe. */
 	avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
 	if (avail_size == 0) {
-		if (skb || atomic64_read(&msk->snd_una) != msk->snd_nxt)
+		u64 snd_una = READ_ONCE(msk->snd_una);
+
+		if (skb || snd_una != msk->snd_nxt)
 			return 0;
 		zero_window_probe = true;
-		data_seq = atomic64_read(&msk->snd_una) - 1;
+		data_seq = snd_una - 1;
 		avail_size = 1;
 	}
 
@@ -1056,8 +1316,11 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 		goto out;
 	}
 
-	mpext = __skb_ext_set(tail, SKB_EXT_MPTCP, msk->cached_ext);
-	msk->cached_ext = NULL;
+	mpext = skb_ext_find(tail, SKB_EXT_MPTCP);
+	if (WARN_ON_ONCE(!mpext)) {
+		/* should never reach here, stream corrupted */
+		return -EINVAL;
+	}
 
 	memset(mpext, 0, sizeof(*mpext));
 	mpext->data_seq = data_seq;
@@ -1081,31 +1344,6 @@ out:
 	return ret;
 }
 
-static void mptcp_nospace(struct mptcp_sock *msk)
-{
-	struct mptcp_subflow_context *subflow;
-
-	set_bit(MPTCP_NOSPACE, &msk->flags);
-	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
-
-	mptcp_for_each_subflow(msk, subflow) {
-		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-		bool ssk_writeable = sk_stream_is_writeable(ssk);
-		struct socket *sock = READ_ONCE(ssk->sk_socket);
-
-		if (ssk_writeable || !sock)
-			continue;
-
-		/* enables ssk->write_space() callbacks */
-		set_bit(SOCK_NOSPACE, &sock->flags);
-	}
-
-	/* mptcp_data_acked() could run just before we set the NOSPACE bit,
-	 * so explicitly check for snd_una value
-	 */
-	mptcp_clean_una((struct sock *)msk);
-}
-
 #define MPTCP_SEND_BURST_SIZE		((1 << 16) - \
 					 sizeof(struct tcphdr) - \
 					 MAX_TCP_OPTION_SPACE - \
@@ -1130,9 +1368,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 	sock_owned_by_me((struct sock *)msk);
 
 	*sndbuf = 0;
-	if (!mptcp_ext_cache_refill(msk))
-		return NULL;
-
 	if (__mptcp_check_fallback(msk)) {
 		if (!msk->first)
 			return NULL;
@@ -1241,6 +1476,15 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags)
 			if (ssk != prev_ssk || !prev_ssk)
 				lock_sock(ssk);
 
+			/* keep it simple and always provide a new skb for the
+			 * subflow, even if we will not use it when collapsing
+			 * on the pending one
+			 */
+			if (!mptcp_alloc_tx_skb(sk, ssk)) {
+				mptcp_push_release(sk, ssk, &info);
+				goto out;
+			}
+
 			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
 			if (ret <= 0) {
 				mptcp_push_release(sk, ssk, &info);
@@ -1251,6 +1495,7 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags)
 			dfrag->already_sent += ret;
 			msk->snd_nxt += ret;
 			msk->snd_burst -= ret;
+			msk->tx_pending_data -= ret;
 			copied += ret;
 			len -= ret;
 		}
@@ -1270,6 +1515,63 @@ out:
 	}
 }
 
+static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_sendmsg_info info;
+	struct mptcp_data_frag *dfrag;
+	int len, copied = 0;
+
+	info.flags = 0;
+	while ((dfrag = mptcp_send_head(sk))) {
+		info.sent = dfrag->already_sent;
+		info.limit = dfrag->data_len;
+		len = dfrag->data_len - dfrag->already_sent;
+		while (len > 0) {
+			int ret = 0;
+
+			/* do auto tuning */
+			if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
+			    ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf))
+				WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
+
+			if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) {
+				__mptcp_update_wmem(sk);
+				sk_mem_reclaim_partial(sk);
+			}
+			if (!__mptcp_alloc_tx_skb(sk, ssk, GFP_ATOMIC))
+				goto out;
+
+			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+			if (ret <= 0)
+				goto out;
+
+			info.sent += ret;
+			dfrag->already_sent += ret;
+			msk->snd_nxt += ret;
+			msk->snd_burst -= ret;
+			msk->tx_pending_data -= ret;
+			copied += ret;
+			len -= ret;
+		}
+		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+	}
+
+out:
+	/* __mptcp_alloc_tx_skb could have released some wmem and we are
+	 * not going to flush it via release_sock()
+	 */
+	__mptcp_update_wmem(sk);
+	if (copied) {
+		mptcp_set_timeout(sk, ssk);
+		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
+			 info.size_goal);
+		if (msk->snd_data_fin_enable &&
+		    msk->snd_nxt + 1 == msk->write_seq)
+			mptcp_schedule_work(sk);
+	}
+}
+
 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1281,7 +1583,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
 		return -EOPNOTSUPP;
 
-	lock_sock(sk);
+	mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, len));
 
 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 
@@ -1292,11 +1594,11 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	}
 
 	pfrag = sk_page_frag(sk);
-	mptcp_clean_una(sk);
 
 	while (msg_data_left(msg)) {
+		int total_ts, frag_truesize = 0;
 		struct mptcp_data_frag *dfrag;
-		int frag_truesize = 0;
+		struct sk_buff_head skbs;
 		bool dfrag_collapsed;
 		size_t psize, offset;
 
@@ -1311,11 +1613,9 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		dfrag = mptcp_pending_tail(sk);
 		dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
 		if (!dfrag_collapsed) {
-			if (!sk_stream_memory_free(sk)) {
-				mptcp_push_pending(sk, msg->msg_flags);
-				if (!sk_stream_memory_free(sk))
-					goto wait_for_memory;
-			}
+			if (!sk_stream_memory_free(sk))
+				goto wait_for_memory;
+
 			if (!mptcp_page_frag_refill(sk, pfrag))
 				goto wait_for_memory;
 
@@ -1330,11 +1630,20 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		offset = dfrag->offset + dfrag->data_len;
 		psize = pfrag->size - offset;
 		psize = min_t(size_t, psize, msg_data_left(msg));
-		if (!sk_wmem_schedule(sk, psize + frag_truesize))
+		total_ts = psize + frag_truesize;
+		__skb_queue_head_init(&skbs);
+		if (!mptcp_tx_cache_refill(sk, psize, &skbs, &total_ts))
 			goto wait_for_memory;
 
+		if (!mptcp_wmem_alloc(sk, total_ts)) {
+			__skb_queue_purge(&skbs);
+			goto wait_for_memory;
+		}
+
+		skb_queue_splice_tail(&skbs, &msk->skb_tx_cache);
 		if (copy_page_from_iter(dfrag->page, offset, psize,
 					&msg->msg_iter) != psize) {
+			mptcp_wmem_uncharge(sk, psize + frag_truesize);
 			ret = -EFAULT;
 			goto out;
 		}
@@ -1350,7 +1659,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		 * Note: we charge such data both to sk and ssk
 		 */
 		sk_wmem_queued_add(sk, frag_truesize);
-		sk->sk_forward_alloc -= frag_truesize;
 		if (!dfrag_collapsed) {
 			get_page(dfrag->page);
 			list_add_tail(&dfrag->list, &msk->rtx_queue);
@@ -1361,21 +1669,20 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			 dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
 			 !dfrag_collapsed);
 
-		if (!mptcp_ext_cache_refill(msk))
-			goto wait_for_memory;
 		continue;
 
wait_for_memory:
-		mptcp_nospace(msk);
-		if (mptcp_timer_pending(sk))
-			mptcp_reset_timer(sk);
+		set_bit(MPTCP_NOSPACE, &msk->flags);
+		mptcp_push_pending(sk, msg->msg_flags);
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret)
 			goto out;
 	}
 
-	if (copied)
+	if (copied) {
+		msk->tx_pending_data += copied;
 		mptcp_push_pending(sk, msg->msg_flags);
+	}
 
out:
 	release_sock(sk);
@@ -1401,11 +1708,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
 				struct msghdr *msg,
 				size_t len)
 {
-	struct sock *sk = (struct sock *)msk;
 	struct sk_buff *skb;
 	int copied = 0;
 
-	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+	while ((skb = skb_peek(&msk->receive_queue)) != NULL) {
 		u32 offset = MPTCP_SKB_CB(skb)->offset;
 		u32 data_len = skb->len - offset;
 		u32 count = min_t(size_t, len - copied, data_len);
@@ -1425,7 +1731,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
 			break;
 		}
 
-		__skb_unlink(skb, &sk->sk_receive_queue);
+		/* we will bulk release the skb memory later */
+		skb->destructor = NULL;
+		msk->rmem_released += skb->truesize;
+		__skb_unlink(skb, &msk->receive_queue);
 		__kfree_skb(skb);
 
 		if (copied >= len)
@@ -1533,25 +1842,47 @@ new_measure:
 	msk->rcvq_space.time = mstamp;
 }
 
+static void __mptcp_update_rmem(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	if (!msk->rmem_released)
+		return;
+
+	atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
+	sk_mem_uncharge(sk, msk->rmem_released);
+	msk->rmem_released = 0;
+}
+
+static void __mptcp_splice_receive_queue(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue);
+}
+
 static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
 {
+	struct sock *sk = (struct sock *)msk;
 	unsigned int moved = 0;
-	bool done;
-
-	/* avoid looping forever below on racing close */
-	if (((struct sock *)msk)->sk_state == TCP_CLOSE)
-		return false;
+	bool ret, done;
 
 	__mptcp_flush_join_list(msk);
 	do {
 		struct sock *ssk = mptcp_subflow_recv_lookup(msk);
 		bool slowpath;
 
-		if (!ssk)
+		/* we can have data pending in the subflows only if the msk
+		 * receive buffer was full at subflow_data_ready() time,
+		 * that is an unlikely slow path.
+		 */
+		if (likely(!ssk))
 			break;
 
 		slowpath = lock_sock_fast(ssk);
+		mptcp_data_lock(sk);
 		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+		mptcp_data_unlock(sk);
 		if (moved && rcv) {
 			WRITE_ONCE(msk->rmem_pending, min(rcv, moved));
 			tcp_cleanup_rbuf(ssk, 1);
@@ -1560,11 +1891,19 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
 		unlock_sock_fast(ssk, slowpath);
 	} while (!done);
 
-	if (mptcp_ofo_queue(msk) || moved > 0) {
-		mptcp_check_data_fin((struct sock *)msk);
-		return true;
+	/* acquire the data lock only if some input data is pending */
+	ret = moved > 0;
+	if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) ||
+	    !skb_queue_empty_lockless(&sk->sk_receive_queue)) {
+		mptcp_data_lock(sk);
+		__mptcp_update_rmem(sk);
+		ret |= __mptcp_ofo_queue(msk);
+		__mptcp_splice_receive_queue(sk);
+		mptcp_data_unlock(sk);
 	}
-	return false;
+	if (ret)
+		mptcp_check_data_fin((struct sock *)msk);
+	return !skb_queue_empty(&msk->receive_queue);
 }
 
 static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
@@ -1578,14 +1917,18 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
 		return -EOPNOTSUPP;
 
-	lock_sock(sk);
+	mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
+	if (unlikely(sk->sk_state == TCP_LISTEN)) {
+		copied = -ENOTCONN;
+		goto out_err;
+	}
+
 	timeo = sock_rcvtimeo(sk, nonblock);
 
 	len = min_t(size_t, len, INT_MAX);
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
-	__mptcp_flush_join_list(msk);
 
-	for (;;) {
+	while (copied < len) {
 		int bytes_read, old_space;
 
 		bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
@@ -1597,14 +1940,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 		copied += bytes_read;
 
-		if (skb_queue_empty(&sk->sk_receive_queue) &&
+		if (skb_queue_empty(&msk->receive_queue) &&
 		    __mptcp_move_skbs(msk, len - copied))
 			continue;
 
 		/* be sure to advertise window change */
 		old_space = READ_ONCE(msk->old_wspace);
 		if ((tcp_space(sk) - old_space) >= old_space)
-			mptcp_send_ack(msk, false);
+			mptcp_cleanup_rbuf(msk);
 
 		/* only the master socket status is relevant here. The exit
 		 * conditions mirror closely tcp_recvmsg()
@@ -1628,8 +1971,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
 			mptcp_check_for_eof(msk);
 
-		if (sk->sk_shutdown & RCV_SHUTDOWN)
+		if (sk->sk_shutdown & RCV_SHUTDOWN) {
+			/* race breaker: the shutdown could be after the
+			 * previous receive queue check
+			 */
+			if (__mptcp_move_skbs(msk, len - copied))
+				continue;
 			break;
+		}
 
 		if (sk->sk_state == TCP_CLOSE) {
 			copied = -ENOTCONN;
@@ -1651,7 +2000,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		mptcp_wait_data(sk, &timeo);
 	}
 
-	if (skb_queue_empty(&sk->sk_receive_queue)) {
+	if (skb_queue_empty_lockless(&sk->sk_receive_queue) &&
+	    skb_queue_empty(&msk->receive_queue)) {
 		/* entire backlog drained, clear DATA_READY. */
 		clear_bit(MPTCP_DATA_READY, &msk->flags);
 
@@ -1667,7 +2017,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
out_err:
 	pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d",
 		 msk, test_bit(MPTCP_DATA_READY, &msk->flags),
-		 skb_queue_empty(&sk->sk_receive_queue), copied);
+		 skb_queue_empty_lockless(&sk->sk_receive_queue), copied);
 	mptcp_rcv_space_adjust(msk, copied);
 
 	release_sock(sk);
@@ -1678,12 +2028,8 @@ static void mptcp_retransmit_handler(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
-	if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->snd_nxt)) {
-		mptcp_stop_timer(sk);
-	} else {
-		set_bit(MPTCP_WORK_RTX, &msk->flags);
-		mptcp_schedule_work(sk);
-	}
+	set_bit(MPTCP_WORK_RTX, &msk->flags);
+	mptcp_schedule_work(sk);
 }
 
 static void mptcp_retransmit_timer(struct timer_list *t)
@@ -1710,6 +2056,7 @@ static void mptcp_timeout_timer(struct timer_list *t)
 	struct sock *sk = from_timer(sk, t, sk_timer);
 
 	mptcp_schedule_work(sk);
+	sock_put(sk);
 }
 
 /* Find an idle subflow.  Return NULL if there is unacked data at tcp
@@ -1779,6 +2126,8 @@ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 		sock_orphan(ssk);
 	}
 
+	subflow->disposable = 1;
+
 	/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
 	 * the ssk has been already destroyed, we just need to release the
 	 * reference owned by msk;
@@ -1786,8 +2135,7 @@ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 	if (!inet_csk(ssk)->icsk_ulp_ops) {
 		kfree_rcu(subflow, rcu);
 	} else {
-		/* otherwise ask tcp do dispose of ssk and subflow ctx */
-		subflow->disposable = 1;
+		/* otherwise tcp will dispose of the ssk and subflow ctx */
 		__tcp_close(ssk, 0);
 
 		/* close acquired an extra ref */
@@ -1883,21 +2231,18 @@ static void mptcp_worker(struct work_struct *work)
 	if (unlikely(state == TCP_CLOSE))
 		goto unlock;
 
-	mptcp_clean_una_wakeup(sk);
 	mptcp_check_data_fin_ack(sk);
 	__mptcp_flush_join_list(msk);
 	if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
 		__mptcp_close_subflow(msk);
 
-	if (mptcp_send_head(sk))
-		mptcp_push_pending(sk, 0);
-
 	if (msk->pm.status)
 		pm_work(msk);
 
 	if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
 		mptcp_check_for_eof(msk);
 
+	__mptcp_check_send_data_fin(sk);
 	mptcp_check_data_fin(sk);
 
 	/* if the msk data is completely acked, or the socket timedout,
@@ -1919,9 +2264,6 @@ static void mptcp_worker(struct work_struct *work)
 	if (!dfrag)
 		goto unlock;
 
-	if (!mptcp_ext_cache_refill(msk))
-		goto reset_unlock;
-
 	ssk = mptcp_subflow_get_retrans(msk);
 	if (!ssk)
 		goto reset_unlock;
@@ -1932,6 +2274,9 @@ static void mptcp_worker(struct work_struct *work)
 	info.sent = 0;
 	info.limit = dfrag->already_sent;
 	while (info.sent < dfrag->already_sent) {
+		if (!mptcp_alloc_tx_skb(sk, ssk))
+			break;
+
 		ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
 		if (ret <= 0)
 			break;
@@ -1939,9 +2284,6 @@ static void mptcp_worker(struct work_struct *work)
 		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
 		copied += ret;
 		info.sent += ret;
-
-		if (!mptcp_ext_cache_refill(msk))
-			break;
 	}
 	if (copied)
 		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
@@ -1969,8 +2311,14 @@ static int __mptcp_init_sock(struct sock *sk)
 	INIT_LIST_HEAD(&msk->join_list);
 	INIT_LIST_HEAD(&msk->rtx_queue);
 	INIT_WORK(&msk->work, mptcp_worker);
+	__skb_queue_head_init(&msk->receive_queue);
+	__skb_queue_head_init(&msk->skb_tx_cache);
 	msk->out_of_order_queue = RB_ROOT;
 	msk->first_pending = NULL;
+	msk->wmem_reserved = 0;
+	msk->rmem_released = 0;
+	msk->tx_pending_data = 0;
+	msk->size_goal_cache = TCP_BASE_MSS;
 
 	msk->ack_hint = NULL;
 	msk->first = NULL;
@@ -2014,12 +2362,15 @@ static void __mptcp_clear_xmit(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_data_frag *dtmp, *dfrag;
-
-	sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+	struct sk_buff *skb;
 
 	WRITE_ONCE(msk->first_pending, NULL);
 	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
 		dfrag_clear(sk, dfrag);
+	while ((skb = __skb_dequeue(&msk->skb_tx_cache)) != NULL) {
+		sk->sk_forward_alloc += skb->truesize;
+		kfree_skb(skb);
+	}
 }
 
 static void mptcp_cancel_work(struct sock *sk)
@@ -2154,7 +2505,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
 	spin_unlock_bh(&msk->join_list_lock);
 	list_splice_init(&msk->conn_list, &conn_list);
 
-	__mptcp_clear_xmit(sk);
+	sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
 	sk_stop_timer(sk, &sk->sk_timer);
 	msk->pm.status = 0;
 
@@ -2165,6 +2516,8 @@ static void __mptcp_destroy_sock(struct sock *sk)
 
 	sk->sk_prot->destroy(sk);
 
+	WARN_ON_ONCE(msk->wmem_reserved);
+	WARN_ON_ONCE(msk->rmem_released);
 	sk_stream_kill_queues(sk);
 	xfrm_sk_free_policy(sk);
 	sk_refcnt_debug_release(sk);
@@ -2294,8 +2647,8 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
 	msk->write_seq = subflow_req->idsn + 1;
 	msk->snd_nxt = msk->write_seq;
-	atomic64_set(&msk->snd_una, msk->write_seq);
-	atomic64_set(&msk->wnd_end, msk->snd_nxt + req->rsk_rcv_wnd);
+	msk->snd_una = msk->write_seq;
+	msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
 
 	if (mp_opt->mp_capable) {
 		msk->can_ack = true;
@@ -2331,7 +2684,7 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
 	if (msk->rcvq_space.space == 0)
 		msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
 
-	atomic64_set(&msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
+	WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
 }
 
 static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
@@ -2382,6 +2735,13 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
 
 void mptcp_destroy_common(struct mptcp_sock *msk)
 {
+	struct sock *sk = (struct sock *)msk;
+
+	__mptcp_clear_xmit(sk);
+
+	/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
+	skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
+
 	skb_rbtree_purge(&msk->out_of_order_queue);
 	mptcp_token_destroy(msk);
 	mptcp_pm_free_anno_list(msk);
@@ -2391,9 +2751,6 @@ static void mptcp_destroy(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
-	if (msk->cached_ext)
-		__skb_ext_put(msk->cached_ext);
-
 	mptcp_destroy_common(msk);
 	sk_sockets_allocated_dec(sk);
 }
@@ -2508,15 +2865,58 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,
 	return -EOPNOTSUPP;
 }
 
+void __mptcp_data_acked(struct sock *sk)
+{
+	if (!sock_owned_by_user(sk))
+		__mptcp_clean_una(sk);
+	else
+		set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags);
+
+	if (mptcp_pending_data_fin_ack(sk))
+		mptcp_schedule_work(sk);
+}
+
+void __mptcp_wnd_updated(struct sock *sk, struct sock *ssk)
+{
+	if (!mptcp_send_head(sk))
+		return;
+
+	if (!sock_owned_by_user(sk))
+		__mptcp_subflow_push_pending(sk, ssk);
+	else
+		set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+}
+
 #define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)
 
-/* this is very alike tcp_release_cb() but we must handle differently a
- * different set of events
- */
+/* processes deferred events and flushes wmem */
 static void mptcp_release_cb(struct sock *sk)
 {
 	unsigned long flags, nflags;
 
+	/* push_pending may touch wmem_reserved, do it before the later
+	 * cleanup
+	 */
+	if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags))
+		__mptcp_clean_una(sk);
+	if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) {
+		/* mptcp_push_pending() acquires the subflow socket lock
+		 *
+		 * 1) can't be invoked in atomic scope
+		 * 2) must avoid ABBA deadlock with msk socket spinlock: the RX
+		 *    datapath acquires the msk socket spinlock while holding
+		 *    the subflow socket lock
+		 */
+
+		spin_unlock_bh(&sk->sk_lock.slock);
+		mptcp_push_pending(sk, 0);
+		spin_lock_bh(&sk->sk_lock.slock);
+	}
+
+	/* clear any wmem reservation and errors */
+	__mptcp_update_wmem(sk);
+	__mptcp_update_rmem(sk);
+
 	do {
 		flags = sk->sk_tsq_flags;
 		if (!(flags & MPTCP_DEFERRED_ALL))
@@ -2587,7 +2987,7 @@ void mptcp_finish_connect(struct sock *ssk)
 	WRITE_ONCE(msk->ack_seq, ack_seq);
 	WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
 	WRITE_ONCE(msk->can_ack, 1);
-	atomic64_set(&msk->snd_una, msk->write_seq);
+	WRITE_ONCE(msk->snd_una, msk->write_seq);
 
 	mptcp_pm_new_connection(msk, 0);
 
@@ -2816,6 +3216,17 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 		bool slowpath;
 
 		slowpath = lock_sock_fast(newsk);
+
+		/* PM/worker can now acquire the first subflow socket
+		 * lock without racing with listener queue cleanup,
+		 * we can notify it, if needed.
+		 */
+		subflow = mptcp_subflow_ctx(msk->first);
+		list_add(&subflow->node, &msk->conn_list);
+		sock_hold(msk->first);
+		if (mptcp_is_fully_established(newsk))
+			mptcp_pm_fully_established(msk);
+
 		mptcp_copy_inaddrs(newsk, msk->first);
 		mptcp_rcv_space_init(msk, msk->first);
@@ -2848,24 +3259,9 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
 	       0;
 }
 
-static bool __mptcp_check_writeable(struct mptcp_sock *msk)
-{
-	struct sock *sk = (struct sock *)msk;
-	bool mptcp_writable;
-
-	mptcp_clean_una(sk);
-	mptcp_writable = sk_stream_is_writeable(sk);
-	if (!mptcp_writable)
-		mptcp_nospace(msk);
-
-	return mptcp_writable;
-}
-
 static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
 {
 	struct sock *sk = (struct sock *)msk;
-	__poll_t ret = 0;
-	bool slow;
 
 	if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
 		return 0;
@@ -2873,12 +3269,12 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
 	if (sk_stream_is_writeable(sk))
 		return EPOLLOUT | EPOLLWRNORM;
 
-	slow = lock_sock_fast(sk);
-	if (__mptcp_check_writeable(msk))
-		ret = EPOLLOUT | EPOLLWRNORM;
+	set_bit(MPTCP_NOSPACE, &msk->flags);
+	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
+	if (sk_stream_is_writeable(sk))
+		return EPOLLOUT | EPOLLWRNORM;
 
-	unlock_sock_fast(sk, slow);
-	return ret;
+	return 0;
 }
 
 static __poll_t mptcp_poll(struct file *file, struct socket *sock,
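Much of the churn above follows one pattern: work that used to run only under the owned msk socket lock (cleaning acked data, pushing pending frames, releasing rmem) can now also be triggered from softirq context, so the new __mptcp_data_acked() and __mptcp_wnd_updated() either perform it immediately when the lock is free or record a flag bit that mptcp_release_cb() drains when the owner releases the lock. The userspace sketch below is an illustration only of that "do it now or defer to release time" contract, modeled with a pthread mutex and an atomic flag word; fake_sock, data_acked() and release_sock_like() are hypothetical names for this sketch, not kernel APIs.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum { DEFER_CLEAN_UNA = 1 << 0, DEFER_PUSH_PENDING = 1 << 1 };

struct fake_sock {
	pthread_mutex_t owner_lock;	/* stands in for the owned socket lock */
	atomic_uint deferred;		/* stands in for the msk->flags bits */
};

static void clean_una(struct fake_sock *sk)   { (void)sk; printf("clean snd_una\n"); }
static void push_pending(struct fake_sock *sk) { (void)sk; printf("push pending data\n"); }

/* softirq-side caller: run the action now if the lock is free,
 * otherwise leave a note for the lock owner (cf. mptcp_release_cb)
 */
static void data_acked(struct fake_sock *sk)
{
	if (pthread_mutex_trylock(&sk->owner_lock) == 0) {
		clean_una(sk);
		pthread_mutex_unlock(&sk->owner_lock);
	} else {
		atomic_fetch_or(&sk->deferred, DEFER_CLEAN_UNA);
	}
}

/* process-context caller: on unlock, drain whatever was deferred */
static void release_sock_like(struct fake_sock *sk)
{
	unsigned int flags = atomic_exchange(&sk->deferred, 0);

	if (flags & DEFER_CLEAN_UNA)
		clean_una(sk);
	if (flags & DEFER_PUSH_PENDING)
		push_pending(sk);
	pthread_mutex_unlock(&sk->owner_lock);
}

int main(void)
{
	struct fake_sock sk = { .owner_lock = PTHREAD_MUTEX_INITIALIZER };

	pthread_mutex_lock(&sk.owner_lock);	/* user "owns" the socket */
	data_acked(&sk);			/* lock busy: gets deferred */
	release_sock_like(&sk);			/* deferred work runs here */

	data_acked(&sk);			/* lock free: runs inline */
	return 0;
}

The same shape appears in the diff's release_cb hunk: softirq callers never sleep on the owned lock, and each deferred bit is consumed exactly once at unlock time. The kernel version additionally drops the socket spinlock around mptcp_push_pending() to respect the msk/subflow lock ordering described in the comment there.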