Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--  net/mptcp/protocol.c  820
1 file changed, 608 insertions(+), 212 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 4b7794835fea..2540d82742ac 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -60,7 +60,7 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
/* Returns end sequence number of the receiver's advertised window */
static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
{
- return atomic64_read(&msk->wnd_end);
+ return READ_ONCE(msk->wnd_end);
}
static bool mptcp_is_tcpsk(struct sock *sk)
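
Throughout this series, 64-bit fields such as wnd_end and snd_una stop being atomic64_t: they are written only while the relevant msk lock is held and read locklessly via READ_ONCE()/WRITE_ONCE(). A minimal userspace sketch of that single-writer-under-lock pattern, using C11 relaxed atomics in place of the kernel macros (the struct and function names below are illustrative, not taken from the patch):

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct conn {
	pthread_mutex_t lock;     /* stands in for the msk-level lock */
	_Atomic uint64_t wnd_end; /* plain u64 + READ_ONCE/WRITE_ONCE in the kernel */
};

/* writer side: only ever called with conn->lock held */
static void conn_update_wnd_end(struct conn *c, uint64_t new_end)
{
	atomic_store_explicit(&c->wnd_end, new_end, memory_order_relaxed);
}

/* reader side: lockless, may observe a slightly stale value */
static uint64_t conn_wnd_end(struct conn *c)
{
	return atomic_load_explicit(&c->wnd_end, memory_order_relaxed);
}

int main(void)
{
	struct conn c;

	pthread_mutex_init(&c.lock, NULL);
	atomic_init(&c.wnd_end, 0);

	pthread_mutex_lock(&c.lock);
	conn_update_wnd_end(&c, 1000);
	pthread_mutex_unlock(&c.lock);

	printf("wnd_end=%llu\n", (unsigned long long)conn_wnd_end(&c));
	pthread_mutex_destroy(&c.lock);
	return 0;
}
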
@@ -348,17 +348,22 @@ static void mptcp_close_wake_up(struct sock *sk)
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
}
-static void mptcp_check_data_fin_ack(struct sock *sk)
+static bool mptcp_pending_data_fin_ack(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- if (__mptcp_check_fallback(msk))
- return;
+ return !__mptcp_check_fallback(msk) &&
+ ((1 << sk->sk_state) &
+ (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
+ msk->write_seq == READ_ONCE(msk->snd_una);
+}
+
+static void mptcp_check_data_fin_ack(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
/* Look for an acknowledged DATA_FIN */
- if (((1 << sk->sk_state) &
- (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
- msk->write_seq == atomic64_read(&msk->snd_una)) {
+ if (mptcp_pending_data_fin_ack(sk)) {
mptcp_stop_timer(sk);
WRITE_ONCE(msk->snd_data_fin_enable, 0);
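
The new mptcp_pending_data_fin_ack() helper keeps the usual TCP idiom of checking several socket states at once: since TCPF_FOO is defined as 1 << TCP_FOO, the expression (1 << sk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK) tests membership in a three-state set with a single AND, and the DATA_FIN counts as acked once write_seq has caught up with snd_una. A tiny standalone illustration of the idiom (the state values mirror include/net/tcp_states.h, everything else is made up):

#include <stdbool.h>
#include <stdio.h>

enum { TCP_FIN_WAIT1 = 4, TCP_CLOSE_WAIT = 8, TCP_LAST_ACK = 9, TCP_CLOSING = 11 };

#define TCPF(state)	(1 << (state))

static bool awaiting_data_fin_ack(int state, unsigned long long write_seq,
				  unsigned long long snd_una)
{
	return (TCPF(state) & (TCPF(TCP_FIN_WAIT1) | TCPF(TCP_CLOSING) |
			       TCPF(TCP_LAST_ACK))) &&
	       write_seq == snd_una;
}

int main(void)
{
	printf("%d\n", awaiting_data_fin_ack(TCP_FIN_WAIT1, 100, 100));  /* 1 */
	printf("%d\n", awaiting_data_fin_ack(TCP_CLOSE_WAIT, 100, 100)); /* 0 */
	return 0;
}
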
@@ -419,31 +424,57 @@ static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
}
-static void mptcp_send_ack(struct mptcp_sock *msk, bool force)
+static bool tcp_can_send_ack(const struct sock *ssk)
+{
+ return !((1 << inet_sk_state_load(ssk)) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE));
+}
+
+static void mptcp_send_ack(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
- struct sock *pick = NULL;
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- if (force) {
- lock_sock(ssk);
+ lock_sock(ssk);
+ if (tcp_can_send_ack(ssk))
tcp_send_ack(ssk);
- release_sock(ssk);
- continue;
- }
-
- /* if the hintes ssk is still active, use it */
- pick = ssk;
- if (ssk == msk->ack_hint)
- break;
+ release_sock(ssk);
}
- if (!force && pick) {
- lock_sock(pick);
- tcp_cleanup_rbuf(pick, 1);
- release_sock(pick);
+}
+
+static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk)
+{
+ int ret;
+
+ lock_sock(ssk);
+ ret = tcp_can_send_ack(ssk);
+ if (ret)
+ tcp_cleanup_rbuf(ssk, 1);
+ release_sock(ssk);
+ return ret;
+}
+
+static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
+{
+ struct sock *ack_hint = READ_ONCE(msk->ack_hint);
+ struct mptcp_subflow_context *subflow;
+
+ /* if the hinted ssk is still active, try to use it */
+ if (likely(ack_hint)) {
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk))
+ return;
+ }
}
+
+ /* otherwise pick the first active subflow */
+ mptcp_for_each_subflow(msk, subflow)
+ if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow)))
+ return;
}
static bool mptcp_check_data_fin(struct sock *sk)
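
mptcp_cleanup_rbuf() prefers the subflow cached in msk->ack_hint, which __mptcp_move_skbs_from_subflow() sets to the last subflow that delivered data, and only scans for the first usable subflow when the hint does not pan out. A compact userspace model of that hint-with-fallback selection is below; the kernel code additionally revalidates that the hint is still on the subflow list, which the sketch glosses over, and every name here is illustrative.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct subflow {
	int id;
	bool can_ack;          /* models tcp_can_send_ack() */
	struct subflow *next;
};

/* try the hinted entry first, then fall back to the first usable one */
static struct subflow *pick_ack_subflow(struct subflow *head, struct subflow *hint)
{
	struct subflow *sf;

	if (hint && hint->can_ack)
		return hint;

	for (sf = head; sf; sf = sf->next)
		if (sf->can_ack)
			return sf;
	return NULL;
}

int main(void)
{
	struct subflow c = { .id = 3, .can_ack = true };
	struct subflow b = { .id = 2, .can_ack = false, .next = &c };
	struct subflow a = { .id = 1, .can_ack = true, .next = &b };

	printf("picked %d\n", pick_ack_subflow(&a, &b)->id); /* hint unusable -> 1 */
	return 0;
}
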
@@ -494,7 +525,7 @@ static bool mptcp_check_data_fin(struct sock *sk)
ret = true;
mptcp_set_timeout(sk, NULL);
- mptcp_send_ack(msk, true);
+ mptcp_send_ack(msk);
mptcp_close_wake_up(sk);
}
return ret;
@@ -588,13 +619,13 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
break;
}
} while (more_data_avail);
- msk->ack_hint = ssk;
+ WRITE_ONCE(msk->ack_hint, ssk);
*bytes += moved;
return done;
}
-static bool mptcp_ofo_queue(struct mptcp_sock *msk)
+static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
struct sk_buff *skb, *tail;
@@ -640,34 +671,27 @@ static bool mptcp_ofo_queue(struct mptcp_sock *msk)
/* In most cases we will be able to lock the mptcp socket. If it's already
* owned, we need to defer to the work queue to avoid ABBA deadlock.
*/
-static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
+static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
struct sock *sk = (struct sock *)msk;
unsigned int moved = 0;
- if (READ_ONCE(sk->sk_lock.owned))
- return false;
+ if (inet_sk_state_load(sk) == TCP_CLOSE)
+ return;
- if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock)))
- return false;
+ mptcp_data_lock(sk);
- /* must re-check after taking the lock */
- if (!READ_ONCE(sk->sk_lock.owned)) {
- __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
- mptcp_ofo_queue(msk);
+ __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ __mptcp_ofo_queue(msk);
- /* If the moves have caught up with the DATA_FIN sequence number
- * it's time to ack the DATA_FIN and change socket state, but
- * this is not a good place to change state. Let the workqueue
- * do it.
- */
- if (mptcp_pending_data_fin(sk, NULL))
- mptcp_schedule_work(sk);
- }
-
- spin_unlock_bh(&sk->sk_lock.slock);
-
- return moved > 0;
+ /* If the moves have caught up with the DATA_FIN sequence number
+ * it's time to ack the DATA_FIN and change socket state, but
+ * this is not a good place to change state. Let the workqueue
+ * do it.
+ */
+ if (mptcp_pending_data_fin(sk, NULL))
+ mptcp_schedule_work(sk);
+ mptcp_data_unlock(sk);
}
void mptcp_data_ready(struct sock *sk, struct sock *ssk)
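
move_skbs_to_msk() used to trylock the msk socket spinlock and give up when the socket was owned; it now takes mptcp_data_lock() unconditionally, which is safe because this path always nests the msk data spinlock inside the subflow socket lock, while the opposite direction (see the comment added to mptcp_release_cb() later in this diff) drops the spinlock before touching subflow locks. As a reminder of why a fixed nesting order avoids ABBA deadlocks, here is a purely didactic two-mutex example, not kernel code:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t subflow_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t msk_data_lock = PTHREAD_MUTEX_INITIALIZER;

/* Both paths take subflow_lock before msk_data_lock.  If one of them
 * acquired the locks in the opposite order, two threads could each hold
 * one lock and wait forever for the other: the classic ABBA deadlock.
 */
static void *rx_path(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&subflow_lock);
	pthread_mutex_lock(&msk_data_lock);
	puts("rx: moved skbs to the msk queue");
	pthread_mutex_unlock(&msk_data_lock);
	pthread_mutex_unlock(&subflow_lock);
	return NULL;
}

static void *tx_path(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&subflow_lock);
	pthread_mutex_lock(&msk_data_lock);
	puts("tx: updated send state");
	pthread_mutex_unlock(&msk_data_lock);
	pthread_mutex_unlock(&subflow_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, rx_path, NULL);
	pthread_create(&b, NULL, tx_path, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}
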
@@ -677,6 +701,13 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
int sk_rbuf, ssk_rbuf;
bool wake;
+ /* The peer can send data while we are shutting down this
+ * subflow at msk destruction time, but we must avoid enqueuing
+ * more data to the msk receive queue
+ */
+ if (unlikely(subflow->disposable))
+ return;
+
/* move_skbs_to_msk below can legitimately clear the data_avail flag,
* but we will need it later to properly wake the reader; cache its
* value
@@ -745,16 +776,6 @@ bool mptcp_schedule_work(struct sock *sk)
return false;
}
-void mptcp_data_acked(struct sock *sk)
-{
- mptcp_reset_timer(sk);
-
- if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) ||
- mptcp_send_head(sk) ||
- (inet_sk_state_load(sk) != TCP_ESTABLISHED)))
- mptcp_schedule_work(sk);
-}
-
void mptcp_subflow_eof(struct sock *sk)
{
if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags))
@@ -799,16 +820,6 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk)
mptcp_close_wake_up(sk);
}
-static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
-{
- const struct sock *sk = (const struct sock *)msk;
-
- if (!msk->cached_ext)
- msk->cached_ext = __skb_ext_alloc(sk->sk_allocation);
-
- return !!msk->cached_ext;
-}
-
static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
@@ -847,6 +858,121 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
df->data_seq + df->data_len == msk->write_seq;
}
+static int mptcp_wmem_with_overhead(struct sock *sk, int size)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ int ret, skbs;
+
+ ret = size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
+ skbs = (msk->tx_pending_data + size) / msk->size_goal_cache;
+ if (skbs < msk->skb_tx_cache.qlen)
+ return ret;
+
+ return ret + (skbs - msk->skb_tx_cache.qlen) * SKB_TRUESIZE(MAX_TCP_HEADER);
+}
+
+static void __mptcp_wmem_reserve(struct sock *sk, int size)
+{
+ int amount = mptcp_wmem_with_overhead(sk, size);
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ WARN_ON_ONCE(msk->wmem_reserved);
+ if (amount <= sk->sk_forward_alloc)
+ goto reserve;
+
+ /* under memory pressure try to reserve at most a single page
+ * otherwise try to reserve the full estimate and fall back
+ * to a single page before entering the error path
+ */
+ if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
+ !sk_wmem_schedule(sk, amount)) {
+ if (amount <= PAGE_SIZE)
+ goto nomem;
+
+ amount = PAGE_SIZE;
+ if (!sk_wmem_schedule(sk, amount))
+ goto nomem;
+ }
+
+reserve:
+ msk->wmem_reserved = amount;
+ sk->sk_forward_alloc -= amount;
+ return;
+
+nomem:
+ /* we will wait for memory on next allocation */
+ msk->wmem_reserved = -1;
+}
+
+static void __mptcp_update_wmem(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (!msk->wmem_reserved)
+ return;
+
+ if (msk->wmem_reserved < 0)
+ msk->wmem_reserved = 0;
+ if (msk->wmem_reserved > 0) {
+ sk->sk_forward_alloc += msk->wmem_reserved;
+ msk->wmem_reserved = 0;
+ }
+}
+
+static bool mptcp_wmem_alloc(struct sock *sk, int size)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ /* check for pre-existing error condition */
+ if (msk->wmem_reserved < 0)
+ return false;
+
+ if (msk->wmem_reserved >= size)
+ goto account;
+
+ mptcp_data_lock(sk);
+ if (!sk_wmem_schedule(sk, size)) {
+ mptcp_data_unlock(sk);
+ return false;
+ }
+
+ sk->sk_forward_alloc -= size;
+ msk->wmem_reserved += size;
+ mptcp_data_unlock(sk);
+
+account:
+ msk->wmem_reserved -= size;
+ return true;
+}
+
+static void mptcp_wmem_uncharge(struct sock *sk, int size)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (msk->wmem_reserved < 0)
+ msk->wmem_reserved = 0;
+ msk->wmem_reserved += size;
+}
+
+static void mptcp_mem_reclaim_partial(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ /* if we are experiencing a transient allocation error,
+ * the forward allocation memory has been already
+ * released
+ */
+ if (msk->wmem_reserved < 0)
+ return;
+
+ mptcp_data_lock(sk);
+ sk->sk_forward_alloc += msk->wmem_reserved;
+ sk_mem_reclaim_partial(sk);
+ msk->wmem_reserved = sk->sk_forward_alloc;
+ sk->sk_forward_alloc = 0;
+ mptcp_data_unlock(sk);
+}
+
static void dfrag_uncharge(struct sock *sk, int len)
{
sk_mem_uncharge(sk, len);
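
The helpers added above build a per-socket forward-allocation cache: __mptcp_wmem_reserve() moves an estimate of the memory needed by the whole sendmsg() call from sk_forward_alloc into msk->wmem_reserved while the msk lock is being taken, mptcp_wmem_alloc() and mptcp_wmem_uncharge() then charge and refund against that local reservation (dropping to a locked slow path only when it runs dry), and __mptcp_update_wmem() hands the leftover back. The userspace model below is a simplified sketch of that accounting, with a mutex standing in for the msk data lock; the names and numbers are illustrative and the memory-pressure handling is omitted.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct msk {
	pthread_mutex_t data_lock;   /* models mptcp_data_lock() */
	int forward_alloc;           /* models sk->sk_forward_alloc */
	int wmem_reserved;           /* per-sendmsg reservation, owner only */
};

/* grab as much as possible up front, under the lock, once per sendmsg() */
static void wmem_reserve(struct msk *m, int amount)
{
	pthread_mutex_lock(&m->data_lock);
	if (amount > m->forward_alloc)
		amount = m->forward_alloc; /* real code would try sk_wmem_schedule() */
	m->forward_alloc -= amount;
	m->wmem_reserved = amount;
	pthread_mutex_unlock(&m->data_lock);
}

/* fast path: consume from the reservation without touching the lock */
static bool wmem_alloc(struct msk *m, int size)
{
	if (m->wmem_reserved >= size) {
		m->wmem_reserved -= size;
		return true;
	}
	/* slow path: top up under the lock */
	pthread_mutex_lock(&m->data_lock);
	if (m->forward_alloc < size) {
		pthread_mutex_unlock(&m->data_lock);
		return false;
	}
	m->forward_alloc -= size;
	pthread_mutex_unlock(&m->data_lock);
	return true;
}

/* refunds go back to the local reservation, not to forward_alloc */
static void wmem_uncharge(struct msk *m, int size)
{
	m->wmem_reserved += size;
}

/* flush the leftover reservation back when releasing the socket */
static void update_wmem(struct msk *m)
{
	pthread_mutex_lock(&m->data_lock);
	m->forward_alloc += m->wmem_reserved;
	m->wmem_reserved = 0;
	pthread_mutex_unlock(&m->data_lock);
}

int main(void)
{
	struct msk m = { .forward_alloc = 4096 };

	pthread_mutex_init(&m.data_lock, NULL);
	wmem_reserve(&m, 3000);
	printf("alloc 1000 -> %d\n", wmem_alloc(&m, 1000));
	wmem_uncharge(&m, 200);
	update_wmem(&m);
	printf("forward_alloc back to %d\n", m.forward_alloc);
	pthread_mutex_destroy(&m.data_lock);
	return 0;
}
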
@@ -862,7 +988,7 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
put_page(dfrag->page);
}
-static void mptcp_clean_una(struct sock *sk)
+static void __mptcp_clean_una(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
@@ -873,10 +999,9 @@ static void mptcp_clean_una(struct sock *sk)
* plain TCP
*/
if (__mptcp_check_fallback(msk))
- atomic64_set(&msk->snd_una, msk->snd_nxt);
-
- snd_una = atomic64_read(&msk->snd_una);
+ msk->snd_una = READ_ONCE(msk->snd_nxt);
+ snd_una = msk->snd_una;
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
break;
@@ -904,36 +1029,34 @@ static void mptcp_clean_una(struct sock *sk)
}
out:
- if (cleaned)
- sk_mem_reclaim_partial(sk);
-}
-
-static void mptcp_clean_una_wakeup(struct sock *sk)
-{
- struct mptcp_sock *msk = mptcp_sk(sk);
+ if (cleaned) {
+ if (tcp_under_memory_pressure(sk)) {
+ __mptcp_update_wmem(sk);
+ sk_mem_reclaim_partial(sk);
+ }
- mptcp_clean_una(sk);
+ if (sk_stream_is_writeable(sk)) {
+ /* pairs with memory barrier in mptcp_poll */
+ smp_mb();
+ if (test_and_clear_bit(MPTCP_NOSPACE, &msk->flags))
+ sk_stream_write_space(sk);
+ }
+ }
- /* Only wake up writers if a subflow is ready */
- if (sk_stream_is_writeable(sk)) {
- clear_bit(MPTCP_NOSPACE, &msk->flags);
- sk_stream_write_space(sk);
+ if (snd_una == READ_ONCE(msk->snd_nxt)) {
+ if (msk->timer_ival)
+ mptcp_stop_timer(sk);
+ } else {
+ mptcp_reset_timer(sk);
}
}
-/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
- * data
- */
-static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+static void mptcp_enter_memory_pressure(struct sock *sk)
{
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
bool first = true;
- if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
- pfrag, sk->sk_allocation)))
- return true;
-
sk_stream_moderate_sndbuf(sk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -943,6 +1066,18 @@ static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
sk_stream_moderate_sndbuf(ssk);
first = false;
}
+}
+
+/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
+ * data
+ */
+static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+ if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
+ pfrag, sk->sk_allocation)))
+ return true;
+
+ mptcp_enter_memory_pressure(sk);
return false;
}
@@ -989,6 +1124,128 @@ static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq,
return avail_size;
}
+static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp)
+{
+ struct skb_ext *mpext = __skb_ext_alloc(gfp);
+
+ if (!mpext)
+ return false;
+ __skb_ext_set(skb, SKB_EXT_MPTCP, mpext);
+ return true;
+}
+
+static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
+{
+ struct sk_buff *skb;
+
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
+ if (likely(skb)) {
+ if (likely(__mptcp_add_ext(skb, gfp))) {
+ skb_reserve(skb, MAX_TCP_HEADER);
+ skb->reserved_tailroom = skb->end - skb->tail;
+ return skb;
+ }
+ __kfree_skb(skb);
+ } else {
+ mptcp_enter_memory_pressure(sk);
+ }
+ return NULL;
+}
+
+static bool mptcp_tx_cache_refill(struct sock *sk, int size,
+ struct sk_buff_head *skbs, int *total_ts)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sk_buff *skb;
+ int space_needed;
+
+ if (unlikely(tcp_under_memory_pressure(sk))) {
+ mptcp_mem_reclaim_partial(sk);
+
+ /* under pressure pre-allocate at most a single skb */
+ if (msk->skb_tx_cache.qlen)
+ return true;
+ space_needed = msk->size_goal_cache;
+ } else {
+ space_needed = msk->tx_pending_data + size -
+ msk->skb_tx_cache.qlen * msk->size_goal_cache;
+ }
+
+ while (space_needed > 0) {
+ skb = __mptcp_do_alloc_tx_skb(sk, sk->sk_allocation);
+ if (unlikely(!skb)) {
+ /* under memory pressure, try to pass the caller a
+ * single skb to allow forward progress
+ */
+ while (skbs->qlen > 1) {
+ skb = __skb_dequeue_tail(skbs);
+ __kfree_skb(skb);
+ }
+ return skbs->qlen > 0;
+ }
+
+ *total_ts += skb->truesize;
+ __skb_queue_tail(skbs, skb);
+ space_needed -= msk->size_goal_cache;
+ }
+ return true;
+}
+
+static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sk_buff *skb;
+
+ if (ssk->sk_tx_skb_cache) {
+ skb = ssk->sk_tx_skb_cache;
+ if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) &&
+ !__mptcp_add_ext(skb, gfp)))
+ return false;
+ return true;
+ }
+
+ skb = skb_peek(&msk->skb_tx_cache);
+ if (skb) {
+ if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
+ skb = __skb_dequeue(&msk->skb_tx_cache);
+ if (WARN_ON_ONCE(!skb))
+ return false;
+
+ mptcp_wmem_uncharge(sk, skb->truesize);
+ ssk->sk_tx_skb_cache = skb;
+ return true;
+ }
+
+ /* over memory limit, no point to try to allocate a new skb */
+ return false;
+ }
+
+ skb = __mptcp_do_alloc_tx_skb(sk, gfp);
+ if (!skb)
+ return false;
+
+ if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
+ ssk->sk_tx_skb_cache = skb;
+ return true;
+ }
+ kfree_skb(skb);
+ return false;
+}
+
+static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk)
+{
+ return !ssk->sk_tx_skb_cache &&
+ !skb_peek(&mptcp_sk(sk)->skb_tx_cache) &&
+ tcp_under_memory_pressure(sk);
+}
+
+static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk)
+{
+ if (unlikely(mptcp_must_reclaim_memory(sk, ssk)))
+ mptcp_mem_reclaim_partial(sk);
+ return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation);
+}
+
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct mptcp_data_frag *dfrag,
struct mptcp_sendmsg_info *info)
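
mptcp_tx_cache_refill() sizes msk->skb_tx_cache from the amount of data still to be sent: roughly one pre-allocated skb per size_goal_cache bytes of tx_pending_data plus the new payload, minus what the cache already covers, and at most a single skb when under memory pressure. The point is to do the sleeping allocations in sendmsg() context so that the transmit paths, which may run with the socket spinlock held, can take skbs from the cache instead of allocating atomically. A small standalone sketch of just the sizing computation follows; the helper name and the numbers are illustrative.

#include <stdio.h>

/* how many skbs to pre-allocate so the TX path does not have to
 * allocate while running in a context that cannot sleep
 */
static int tx_skbs_needed(int tx_pending_data, int new_bytes,
			  int cached_skbs, int size_goal, int under_pressure)
{
	int space_needed, skbs = 0;

	if (under_pressure)
		return cached_skbs ? 0 : 1;   /* keep at most one skb around */

	space_needed = tx_pending_data + new_bytes - cached_skbs * size_goal;
	while (space_needed > 0) {
		skbs++;
		space_needed -= size_goal;
	}
	return skbs;
}

int main(void)
{
	/* 64KB pending + 16KB new, 2 skbs cached, 16KB size goal -> 3 more skbs */
	printf("%d\n", tx_skbs_needed(64 * 1024, 16 * 1024, 2, 16 * 1024, 0));
	printf("%d\n", tx_skbs_needed(64 * 1024, 16 * 1024, 2, 16 * 1024, 1));
	return 0;
}
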
@@ -1000,7 +1257,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct sk_buff *skb, *tail;
bool can_collapse = false;
int avail_size;
- size_t ret;
+ size_t ret = 0;
pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d",
msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
@@ -1008,6 +1265,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
/* compute send limit */
info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
avail_size = info->size_goal;
+ msk->size_goal_cache = info->size_goal;
skb = tcp_write_queue_tail(ssk);
if (skb) {
/* Limit the write to the size available in the
@@ -1028,10 +1286,12 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
/* Zero window and all data acked? Probe. */
avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
if (avail_size == 0) {
- if (skb || atomic64_read(&msk->snd_una) != msk->snd_nxt)
+ u64 snd_una = READ_ONCE(msk->snd_una);
+
+ if (skb || snd_una != msk->snd_nxt)
return 0;
zero_window_probe = true;
- data_seq = atomic64_read(&msk->snd_una) - 1;
+ data_seq = snd_una - 1;
avail_size = 1;
}
@@ -1056,8 +1316,11 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
goto out;
}
- mpext = __skb_ext_set(tail, SKB_EXT_MPTCP, msk->cached_ext);
- msk->cached_ext = NULL;
+ mpext = skb_ext_find(tail, SKB_EXT_MPTCP);
+ if (WARN_ON_ONCE(!mpext)) {
+ /* should never reach here, stream corrupted */
+ return -EINVAL;
+ }
memset(mpext, 0, sizeof(*mpext));
mpext->data_seq = data_seq;
@@ -1081,31 +1344,6 @@ out:
return ret;
}
-static void mptcp_nospace(struct mptcp_sock *msk)
-{
- struct mptcp_subflow_context *subflow;
-
- set_bit(MPTCP_NOSPACE, &msk->flags);
- smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
-
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- bool ssk_writeable = sk_stream_is_writeable(ssk);
- struct socket *sock = READ_ONCE(ssk->sk_socket);
-
- if (ssk_writeable || !sock)
- continue;
-
- /* enables ssk->write_space() callbacks */
- set_bit(SOCK_NOSPACE, &sock->flags);
- }
-
- /* mptcp_data_acked() could run just before we set the NOSPACE bit,
- * so explicitly check for snd_una value
- */
- mptcp_clean_una((struct sock *)msk);
-}
-
#define MPTCP_SEND_BURST_SIZE ((1 << 16) - \
sizeof(struct tcphdr) - \
MAX_TCP_OPTION_SPACE - \
@@ -1130,9 +1368,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
sock_owned_by_me((struct sock *)msk);
*sndbuf = 0;
- if (!mptcp_ext_cache_refill(msk))
- return NULL;
-
if (__mptcp_check_fallback(msk)) {
if (!msk->first)
return NULL;
@@ -1241,6 +1476,15 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags)
if (ssk != prev_ssk || !prev_ssk)
lock_sock(ssk);
+ /* keep it simple and always provide a new skb for the
+ * subflow, even if we will not use it when collapsing
+ * on the pending one
+ */
+ if (!mptcp_alloc_tx_skb(sk, ssk)) {
+ mptcp_push_release(sk, ssk, &info);
+ goto out;
+ }
+
ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
if (ret <= 0) {
mptcp_push_release(sk, ssk, &info);
@@ -1251,6 +1495,7 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags)
dfrag->already_sent += ret;
msk->snd_nxt += ret;
msk->snd_burst -= ret;
+ msk->tx_pending_data -= ret;
copied += ret;
len -= ret;
}
@@ -1270,6 +1515,63 @@ out:
}
}
+static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_sendmsg_info info;
+ struct mptcp_data_frag *dfrag;
+ int len, copied = 0;
+
+ info.flags = 0;
+ while ((dfrag = mptcp_send_head(sk))) {
+ info.sent = dfrag->already_sent;
+ info.limit = dfrag->data_len;
+ len = dfrag->data_len - dfrag->already_sent;
+ while (len > 0) {
+ int ret = 0;
+
+ /* do auto tuning */
+ if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
+ ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf))
+ WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
+
+ if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) {
+ __mptcp_update_wmem(sk);
+ sk_mem_reclaim_partial(sk);
+ }
+ if (!__mptcp_alloc_tx_skb(sk, ssk, GFP_ATOMIC))
+ goto out;
+
+ ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+ if (ret <= 0)
+ goto out;
+
+ info.sent += ret;
+ dfrag->already_sent += ret;
+ msk->snd_nxt += ret;
+ msk->snd_burst -= ret;
+ msk->tx_pending_data -= ret;
+ copied += ret;
+ len -= ret;
+ }
+ WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+ }
+
+out:
+ /* __mptcp_alloc_tx_skb could have released some wmem and we are
+ * not going to flush it via release_sock()
+ */
+ __mptcp_update_wmem(sk);
+ if (copied) {
+ mptcp_set_timeout(sk, ssk);
+ tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
+ info.size_goal);
+ if (msk->snd_data_fin_enable &&
+ msk->snd_nxt + 1 == msk->write_seq)
+ mptcp_schedule_work(sk);
+ }
+}
+
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1281,7 +1583,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
return -EOPNOTSUPP;
- lock_sock(sk);
+ mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, len));
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
@@ -1292,11 +1594,11 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
pfrag = sk_page_frag(sk);
- mptcp_clean_una(sk);
while (msg_data_left(msg)) {
+ int total_ts, frag_truesize = 0;
struct mptcp_data_frag *dfrag;
- int frag_truesize = 0;
+ struct sk_buff_head skbs;
bool dfrag_collapsed;
size_t psize, offset;
@@ -1311,11 +1613,9 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
dfrag = mptcp_pending_tail(sk);
dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
if (!dfrag_collapsed) {
- if (!sk_stream_memory_free(sk)) {
- mptcp_push_pending(sk, msg->msg_flags);
- if (!sk_stream_memory_free(sk))
- goto wait_for_memory;
- }
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_memory;
+
if (!mptcp_page_frag_refill(sk, pfrag))
goto wait_for_memory;
@@ -1330,11 +1630,20 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
offset = dfrag->offset + dfrag->data_len;
psize = pfrag->size - offset;
psize = min_t(size_t, psize, msg_data_left(msg));
- if (!sk_wmem_schedule(sk, psize + frag_truesize))
+ total_ts = psize + frag_truesize;
+ __skb_queue_head_init(&skbs);
+ if (!mptcp_tx_cache_refill(sk, psize, &skbs, &total_ts))
goto wait_for_memory;
+ if (!mptcp_wmem_alloc(sk, total_ts)) {
+ __skb_queue_purge(&skbs);
+ goto wait_for_memory;
+ }
+
+ skb_queue_splice_tail(&skbs, &msk->skb_tx_cache);
if (copy_page_from_iter(dfrag->page, offset, psize,
&msg->msg_iter) != psize) {
+ mptcp_wmem_uncharge(sk, psize + frag_truesize);
ret = -EFAULT;
goto out;
}
@@ -1350,7 +1659,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
* Note: we charge such data both to sk and ssk
*/
sk_wmem_queued_add(sk, frag_truesize);
- sk->sk_forward_alloc -= frag_truesize;
if (!dfrag_collapsed) {
get_page(dfrag->page);
list_add_tail(&dfrag->list, &msk->rtx_queue);
@@ -1361,21 +1669,20 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
!dfrag_collapsed);
- if (!mptcp_ext_cache_refill(msk))
- goto wait_for_memory;
continue;
wait_for_memory:
- mptcp_nospace(msk);
- if (mptcp_timer_pending(sk))
- mptcp_reset_timer(sk);
+ set_bit(MPTCP_NOSPACE, &msk->flags);
+ mptcp_push_pending(sk, msg->msg_flags);
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
goto out;
}
- if (copied)
+ if (copied) {
+ msk->tx_pending_data += copied;
mptcp_push_pending(sk, msg->msg_flags);
+ }
out:
release_sock(sk);
@@ -1401,11 +1708,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
struct msghdr *msg,
size_t len)
{
- struct sock *sk = (struct sock *)msk;
struct sk_buff *skb;
int copied = 0;
- while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ while ((skb = skb_peek(&msk->receive_queue)) != NULL) {
u32 offset = MPTCP_SKB_CB(skb)->offset;
u32 data_len = skb->len - offset;
u32 count = min_t(size_t, len - copied, data_len);
@@ -1425,7 +1731,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
break;
}
- __skb_unlink(skb, &sk->sk_receive_queue);
+ /* we will bulk release the skb memory later */
+ skb->destructor = NULL;
+ msk->rmem_released += skb->truesize;
+ __skb_unlink(skb, &msk->receive_queue);
__kfree_skb(skb);
if (copied >= len)
@@ -1533,25 +1842,47 @@ new_measure:
msk->rcvq_space.time = mstamp;
}
+static void __mptcp_update_rmem(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (!msk->rmem_released)
+ return;
+
+ atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
+ sk_mem_uncharge(sk, msk->rmem_released);
+ msk->rmem_released = 0;
+}
+
+static void __mptcp_splice_receive_queue(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue);
+}
+
static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
{
+ struct sock *sk = (struct sock *)msk;
unsigned int moved = 0;
- bool done;
-
- /* avoid looping forever below on racing close */
- if (((struct sock *)msk)->sk_state == TCP_CLOSE)
- return false;
+ bool ret, done;
__mptcp_flush_join_list(msk);
do {
struct sock *ssk = mptcp_subflow_recv_lookup(msk);
bool slowpath;
- if (!ssk)
+ /* we can have data pending in the subflows only if the msk
+ * receive buffer was full at subflow_data_ready() time,
+ * that is an unlikely slow path.
+ */
+ if (likely(!ssk))
break;
slowpath = lock_sock_fast(ssk);
+ mptcp_data_lock(sk);
done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ mptcp_data_unlock(sk);
if (moved && rcv) {
WRITE_ONCE(msk->rmem_pending, min(rcv, moved));
tcp_cleanup_rbuf(ssk, 1);
@@ -1560,11 +1891,19 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
unlock_sock_fast(ssk, slowpath);
} while (!done);
- if (mptcp_ofo_queue(msk) || moved > 0) {
- mptcp_check_data_fin((struct sock *)msk);
- return true;
+ /* acquire the data lock only if some input data is pending */
+ ret = moved > 0;
+ if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) ||
+ !skb_queue_empty_lockless(&sk->sk_receive_queue)) {
+ mptcp_data_lock(sk);
+ __mptcp_update_rmem(sk);
+ ret |= __mptcp_ofo_queue(msk);
+ __mptcp_splice_receive_queue(sk);
+ mptcp_data_unlock(sk);
}
- return false;
+ if (ret)
+ mptcp_check_data_fin((struct sock *)msk);
+ return !skb_queue_empty(&msk->receive_queue);
}
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
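
The recvmsg() rework introduces a private msk->receive_queue: skbs land in sk_receive_queue under the msk data spinlock, get spliced into the private queue when the reader takes the socket lock, and are then consumed without the spinlock, with their truesize accumulated in msk->rmem_released so that __mptcp_update_rmem() can uncharge the receive memory in one shot. The sketch below is a simplified userspace model of that splice-and-bulk-release pattern; the queue representation and field names are illustrative.

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct pkt {
	int truesize;
	struct pkt *next;
};

struct rx {
	pthread_mutex_t data_lock;   /* protects shared_q and rmem_alloc */
	struct pkt *shared_q;        /* models sk_receive_queue */
	struct pkt *private_q;       /* models msk->receive_queue, owner only */
	int rmem_alloc;              /* models sk_rmem_alloc */
	int rmem_released;           /* deferred uncharge, owner only */
};

/* splice everything queued so far into the owner-private queue */
static void splice_receive_queue(struct rx *r)
{
	pthread_mutex_lock(&r->data_lock);
	/* keep it simple: the model only splices onto an empty private queue */
	r->private_q = r->shared_q;
	r->shared_q = NULL;
	pthread_mutex_unlock(&r->data_lock);
}

/* consume packets lock-free, accumulating the memory to release later */
static int drain_private_queue(struct rx *r)
{
	int copied = 0;

	while (r->private_q) {
		struct pkt *p = r->private_q;

		r->private_q = p->next;
		r->rmem_released += p->truesize;
		copied += p->truesize;
	}
	return copied;
}

/* give the accumulated memory back in one shot, under the lock */
static void update_rmem(struct rx *r)
{
	pthread_mutex_lock(&r->data_lock);
	r->rmem_alloc -= r->rmem_released;
	r->rmem_released = 0;
	pthread_mutex_unlock(&r->data_lock);
}

int main(void)
{
	struct pkt b = { .truesize = 1500 };
	struct pkt a = { .truesize = 1500, .next = &b };
	struct rx r = { .shared_q = &a, .rmem_alloc = 3000 };

	pthread_mutex_init(&r.data_lock, NULL);
	splice_receive_queue(&r);
	printf("copied %d\n", drain_private_queue(&r));
	update_rmem(&r);
	printf("rmem_alloc %d\n", r.rmem_alloc);   /* back to 0 */
	pthread_mutex_destroy(&r.data_lock);
	return 0;
}
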
@@ -1578,14 +1917,18 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
return -EOPNOTSUPP;
- lock_sock(sk);
+ mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
+ if (unlikely(sk->sk_state == TCP_LISTEN)) {
+ copied = -ENOTCONN;
+ goto out_err;
+ }
+
timeo = sock_rcvtimeo(sk, nonblock);
len = min_t(size_t, len, INT_MAX);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
- __mptcp_flush_join_list(msk);
- for (;;) {
+ while (copied < len) {
int bytes_read, old_space;
bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
@@ -1597,14 +1940,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
copied += bytes_read;
- if (skb_queue_empty(&sk->sk_receive_queue) &&
+ if (skb_queue_empty(&msk->receive_queue) &&
__mptcp_move_skbs(msk, len - copied))
continue;
/* be sure to advertise window change */
old_space = READ_ONCE(msk->old_wspace);
if ((tcp_space(sk) - old_space) >= old_space)
- mptcp_send_ack(msk, false);
+ mptcp_cleanup_rbuf(msk);
/* only the master socket status is relevant here. The exit
* conditions mirror closely tcp_recvmsg()
@@ -1628,8 +1971,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
mptcp_check_for_eof(msk);
- if (sk->sk_shutdown & RCV_SHUTDOWN)
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ /* race breaker: the shutdown could be after the
+ * previous receive queue check
+ */
+ if (__mptcp_move_skbs(msk, len - copied))
+ continue;
break;
+ }
if (sk->sk_state == TCP_CLOSE) {
copied = -ENOTCONN;
@@ -1651,7 +2000,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
mptcp_wait_data(sk, &timeo);
}
- if (skb_queue_empty(&sk->sk_receive_queue)) {
+ if (skb_queue_empty_lockless(&sk->sk_receive_queue) &&
+ skb_queue_empty(&msk->receive_queue)) {
/* entire backlog drained, clear DATA_READY. */
clear_bit(MPTCP_DATA_READY, &msk->flags);
@@ -1667,7 +2017,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
out_err:
pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d",
msk, test_bit(MPTCP_DATA_READY, &msk->flags),
- skb_queue_empty(&sk->sk_receive_queue), copied);
+ skb_queue_empty_lockless(&sk->sk_receive_queue), copied);
mptcp_rcv_space_adjust(msk, copied);
release_sock(sk);
@@ -1678,12 +2028,8 @@ static void mptcp_retransmit_handler(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->snd_nxt)) {
- mptcp_stop_timer(sk);
- } else {
- set_bit(MPTCP_WORK_RTX, &msk->flags);
- mptcp_schedule_work(sk);
- }
+ set_bit(MPTCP_WORK_RTX, &msk->flags);
+ mptcp_schedule_work(sk);
}
static void mptcp_retransmit_timer(struct timer_list *t)
@@ -1710,6 +2056,7 @@ static void mptcp_timeout_timer(struct timer_list *t)
struct sock *sk = from_timer(sk, t, sk_timer);
mptcp_schedule_work(sk);
+ sock_put(sk);
}
/* Find an idle subflow. Return NULL if there is unacked data at tcp
@@ -1779,6 +2126,8 @@ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
sock_orphan(ssk);
}
+ subflow->disposable = 1;
+
/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
* the ssk has been already destroyed, we just need to release the
* reference owned by msk;
@@ -1786,8 +2135,7 @@ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
if (!inet_csk(ssk)->icsk_ulp_ops) {
kfree_rcu(subflow, rcu);
} else {
- /* otherwise ask tcp do dispose of ssk and subflow ctx */
- subflow->disposable = 1;
+ /* otherwise tcp will dispose of the ssk and subflow ctx */
__tcp_close(ssk, 0);
/* close acquired an extra ref */
@@ -1883,21 +2231,18 @@ static void mptcp_worker(struct work_struct *work)
if (unlikely(state == TCP_CLOSE))
goto unlock;
- mptcp_clean_una_wakeup(sk);
mptcp_check_data_fin_ack(sk);
__mptcp_flush_join_list(msk);
if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
__mptcp_close_subflow(msk);
- if (mptcp_send_head(sk))
- mptcp_push_pending(sk, 0);
-
if (msk->pm.status)
pm_work(msk);
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
mptcp_check_for_eof(msk);
+ __mptcp_check_send_data_fin(sk);
mptcp_check_data_fin(sk);
/* if the msk data is completely acked, or the socket timed out,
@@ -1919,9 +2264,6 @@ static void mptcp_worker(struct work_struct *work)
if (!dfrag)
goto unlock;
- if (!mptcp_ext_cache_refill(msk))
- goto reset_unlock;
-
ssk = mptcp_subflow_get_retrans(msk);
if (!ssk)
goto reset_unlock;
@@ -1932,6 +2274,9 @@ static void mptcp_worker(struct work_struct *work)
info.sent = 0;
info.limit = dfrag->already_sent;
while (info.sent < dfrag->already_sent) {
+ if (!mptcp_alloc_tx_skb(sk, ssk))
+ break;
+
ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
if (ret <= 0)
break;
@@ -1939,9 +2284,6 @@ static void mptcp_worker(struct work_struct *work)
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
copied += ret;
info.sent += ret;
-
- if (!mptcp_ext_cache_refill(msk))
- break;
}
if (copied)
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
@@ -1969,8 +2311,14 @@ static int __mptcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&msk->join_list);
INIT_LIST_HEAD(&msk->rtx_queue);
INIT_WORK(&msk->work, mptcp_worker);
+ __skb_queue_head_init(&msk->receive_queue);
+ __skb_queue_head_init(&msk->skb_tx_cache);
msk->out_of_order_queue = RB_ROOT;
msk->first_pending = NULL;
+ msk->wmem_reserved = 0;
+ msk->rmem_released = 0;
+ msk->tx_pending_data = 0;
+ msk->size_goal_cache = TCP_BASE_MSS;
msk->ack_hint = NULL;
msk->first = NULL;
@@ -2014,12 +2362,15 @@ static void __mptcp_clear_xmit(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
-
- sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+ struct sk_buff *skb;
WRITE_ONCE(msk->first_pending, NULL);
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
dfrag_clear(sk, dfrag);
+ while ((skb = __skb_dequeue(&msk->skb_tx_cache)) != NULL) {
+ sk->sk_forward_alloc += skb->truesize;
+ kfree_skb(skb);
+ }
}
static void mptcp_cancel_work(struct sock *sk)
@@ -2154,7 +2505,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
spin_unlock_bh(&msk->join_list_lock);
list_splice_init(&msk->conn_list, &conn_list);
- __mptcp_clear_xmit(sk);
+ sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
sk_stop_timer(sk, &sk->sk_timer);
msk->pm.status = 0;
@@ -2165,6 +2516,8 @@ static void __mptcp_destroy_sock(struct sock *sk)
sk->sk_prot->destroy(sk);
+ WARN_ON_ONCE(msk->wmem_reserved);
+ WARN_ON_ONCE(msk->rmem_released);
sk_stream_kill_queues(sk);
xfrm_sk_free_policy(sk);
sk_refcnt_debug_release(sk);
@@ -2294,8 +2647,8 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
msk->write_seq = subflow_req->idsn + 1;
msk->snd_nxt = msk->write_seq;
- atomic64_set(&msk->snd_una, msk->write_seq);
- atomic64_set(&msk->wnd_end, msk->snd_nxt + req->rsk_rcv_wnd);
+ msk->snd_una = msk->write_seq;
+ msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
if (mp_opt->mp_capable) {
msk->can_ack = true;
@@ -2331,7 +2684,7 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
if (msk->rcvq_space.space == 0)
msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
- atomic64_set(&msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
+ WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
}
static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
@@ -2382,6 +2735,13 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
void mptcp_destroy_common(struct mptcp_sock *msk)
{
+ struct sock *sk = (struct sock *)msk;
+
+ __mptcp_clear_xmit(sk);
+
+ /* move to sk_receive_queue, sk_stream_kill_queues will purge it */
+ skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
+
skb_rbtree_purge(&msk->out_of_order_queue);
mptcp_token_destroy(msk);
mptcp_pm_free_anno_list(msk);
@@ -2391,9 +2751,6 @@ static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- if (msk->cached_ext)
- __skb_ext_put(msk->cached_ext);
-
mptcp_destroy_common(msk);
sk_sockets_allocated_dec(sk);
}
@@ -2508,15 +2865,58 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,
return -EOPNOTSUPP;
}
+void __mptcp_data_acked(struct sock *sk)
+{
+ if (!sock_owned_by_user(sk))
+ __mptcp_clean_una(sk);
+ else
+ set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags);
+
+ if (mptcp_pending_data_fin_ack(sk))
+ mptcp_schedule_work(sk);
+}
+
+void __mptcp_wnd_updated(struct sock *sk, struct sock *ssk)
+{
+ if (!mptcp_send_head(sk))
+ return;
+
+ if (!sock_owned_by_user(sk))
+ __mptcp_subflow_push_pending(sk, ssk);
+ else
+ set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+}
+
#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)
-/* this is very alike tcp_release_cb() but we must handle differently a
- * different set of events
- */
+/* processes deferred events and flush wmem */
static void mptcp_release_cb(struct sock *sk)
{
unsigned long flags, nflags;
+ /* push_pending may touch wmem_reserved, do it before the later
+ * cleanup
+ */
+ if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags))
+ __mptcp_clean_una(sk);
+ if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) {
+ /* mptcp_push_pending() acquires the subflow socket lock
+ *
+ * 1) can't be invoked in atomic scope
+ * 2) must avoid ABBA deadlock with msk socket spinlock: the RX
+ * datapath acquires the msk socket spinlock while holding
+ * the subflow socket lock
+ */
+
+ spin_unlock_bh(&sk->sk_lock.slock);
+ mptcp_push_pending(sk, 0);
+ spin_lock_bh(&sk->sk_lock.slock);
+ }
+
+ /* clear any wmem reservation and errors */
+ __mptcp_update_wmem(sk);
+ __mptcp_update_rmem(sk);
+
do {
flags = sk->sk_tsq_flags;
if (!(flags & MPTCP_DEFERRED_ALL))
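
mptcp_release_cb() is the msk counterpart of tcp_release_cb(): work that cannot run while the socket is owned, such as __mptcp_clean_una() requested by __mptcp_data_acked() or mptcp_push_pending() requested by __mptcp_wnd_updated(), is recorded as a flag bit and replayed when the owner releases the lock. A stripped-down model of that defer-by-flag pattern, using C11 atomics in place of the kernel bitops (the flag names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum {
	FLAG_CLEAN_UNA    = 1 << 0,  /* models MPTCP_CLEAN_UNA */
	FLAG_PUSH_PENDING = 1 << 1,  /* models MPTCP_PUSH_PENDING */
};

static _Atomic unsigned int deferred_flags;

/* called from a context that cannot do the work right now */
static void defer(unsigned int flag)
{
	atomic_fetch_or(&deferred_flags, flag);
}

/* test_and_clear_bit() equivalent */
static bool test_and_clear(unsigned int flag)
{
	return atomic_fetch_and(&deferred_flags, ~flag) & flag;
}

/* runs when the socket owner releases the lock */
static void release_cb(void)
{
	if (test_and_clear(FLAG_CLEAN_UNA))
		puts("clean una");
	if (test_and_clear(FLAG_PUSH_PENDING))
		puts("push pending");
}

int main(void)
{
	defer(FLAG_PUSH_PENDING);
	defer(FLAG_CLEAN_UNA);
	release_cb();      /* replays both deferred actions */
	release_cb();      /* nothing left to do */
	return 0;
}
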
@@ -2587,7 +2987,7 @@ void mptcp_finish_connect(struct sock *ssk)
WRITE_ONCE(msk->ack_seq, ack_seq);
WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
WRITE_ONCE(msk->can_ack, 1);
- atomic64_set(&msk->snd_una, msk->write_seq);
+ WRITE_ONCE(msk->snd_una, msk->write_seq);
mptcp_pm_new_connection(msk, 0);
@@ -2816,6 +3216,17 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
bool slowpath;
slowpath = lock_sock_fast(newsk);
+
+ /* PM/worker can now acquire the first subflow socket
+ * lock without racing with listener queue cleanup,
+ * we can notify it, if needed.
+ */
+ subflow = mptcp_subflow_ctx(msk->first);
+ list_add(&subflow->node, &msk->conn_list);
+ sock_hold(msk->first);
+ if (mptcp_is_fully_established(newsk))
+ mptcp_pm_fully_established(msk);
+
mptcp_copy_inaddrs(newsk, msk->first);
mptcp_rcv_space_init(msk, msk->first);
@@ -2848,24 +3259,9 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
0;
}
-static bool __mptcp_check_writeable(struct mptcp_sock *msk)
-{
- struct sock *sk = (struct sock *)msk;
- bool mptcp_writable;
-
- mptcp_clean_una(sk);
- mptcp_writable = sk_stream_is_writeable(sk);
- if (!mptcp_writable)
- mptcp_nospace(msk);
-
- return mptcp_writable;
-}
-
static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
- __poll_t ret = 0;
- bool slow;
if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
return 0;
@@ -2873,12 +3269,12 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
if (sk_stream_is_writeable(sk))
return EPOLLOUT | EPOLLWRNORM;
- slow = lock_sock_fast(sk);
- if (__mptcp_check_writeable(msk))
- ret = EPOLLOUT | EPOLLWRNORM;
+ set_bit(MPTCP_NOSPACE, &msk->flags);
+ smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
+ if (sk_stream_is_writeable(sk))
+ return EPOLLOUT | EPOLLWRNORM;
- unlock_sock_fast(sk, slow);
- return ret;
+ return 0;
}
static __poll_t mptcp_poll(struct file *file, struct socket *sock,
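
The new mptcp_check_writeable() pairs with the wakeup logic added to __mptcp_clean_una() earlier in this diff: the poller sets MPTCP_NOSPACE, issues a barrier and re-checks writeability, while the ACK side first frees space, issues the pairing barrier and only then test-and-clears the flag to wake the waiter, so at least one of the two is guaranteed to notice the other. Below is a minimal C11 model of that handshake; the thread bodies and variable names are illustrative and the busy-wait stands in for sk_stream_wait_memory().

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool nospace;     /* models the MPTCP_NOSPACE bit */
static atomic_int  free_space;  /* models sk_stream_is_writeable() */
static atomic_bool woken;

/* poll/sendmsg side: publish the flag before re-checking for space */
static void *waiter(void *arg)
{
	(void)arg;
	atomic_store(&nospace, true);
	/* pairs with the barrier on the cleaner side (smp_mb() in the kernel) */
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load(&free_space) > 0) {
		puts("waiter: space already available, no sleep needed");
		return NULL;
	}
	while (!atomic_load(&woken))
		;               /* would normally sleep in sk_stream_wait_memory() */
	puts("waiter: woken up");
	return NULL;
}

/* ACK/clean-una side: free space first, then test-and-clear the flag */
static void *cleaner(void *arg)
{
	(void)arg;
	atomic_store(&free_space, 4096);
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_exchange(&nospace, false))
		atomic_store(&woken, true);   /* sk_stream_write_space() */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, waiter, NULL);
	pthread_create(&b, NULL, cleaner, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}
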