aboutsummaryrefslogtreecommitdiff
path: root/net/mptcp/protocol.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--net/mptcp/protocol.c146
1 files changed, 85 insertions, 61 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index e252539b1e19..a0b8356cd8c5 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -121,8 +121,6 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk)
ret = __mptcp_socket_create(msk);
if (ret)
return ERR_PTR(ret);
-
- mptcp_sockopt_sync(msk, msk->first);
}
return msk->first;
@@ -863,9 +861,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
/* Wake-up the reader only for in-sequence data */
mptcp_data_lock(sk);
- if (move_skbs_to_msk(msk, ssk))
+ if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
sk->sk_data_ready(sk);
-
mptcp_data_unlock(sk);
}
@@ -893,6 +890,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
mptcp_sockopt_sync_locked(msk, ssk);
mptcp_subflow_joined(msk, ssk);
mptcp_stop_tout_timer(sk);
+ __mptcp_propagate_sndbuf(sk, ssk);
return true;
}
@@ -1079,15 +1077,16 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
struct mptcp_sock *msk = mptcp_sk(sk);
bool first = true;
- sk_stream_moderate_sndbuf(sk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (first)
tcp_enter_memory_pressure(ssk);
sk_stream_moderate_sndbuf(ssk);
+
first = false;
}
+ __mptcp_sync_sndbuf(sk);
}
/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
@@ -1268,7 +1267,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
* queue management operation, to avoid breaking the ext <->
* SSN association set here
*/
- mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+ mpext = mptcp_get_ext(skb);
if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) {
TCP_SKB_CB(skb)->eor = 1;
goto alloc_skb;
@@ -1290,7 +1289,7 @@ alloc_skb:
i = skb_shinfo(skb)->nr_frags;
reuse_skb = false;
- mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+ mpext = mptcp_get_ext(skb);
}
/* Zero window and all data acked? Probe. */
@@ -1298,7 +1297,7 @@ alloc_skb:
if (copy == 0) {
u64 snd_una = READ_ONCE(msk->snd_una);
- if (snd_una != msk->snd_nxt) {
+ if (snd_una != msk->snd_nxt || tcp_write_queue_tail(ssk)) {
tcp_remove_empty_skb(ssk);
return 0;
}
@@ -1306,11 +1305,6 @@ alloc_skb:
zero_window_probe = true;
data_seq = snd_una - 1;
copy = 1;
-
- /* all mptcp-level data is acked, no skbs should be present into the
- * ssk write queue
- */
- WARN_ON_ONCE(reuse_skb);
}
copy = min_t(size_t, copy, info->limit - info->sent);
@@ -1339,7 +1333,6 @@ alloc_skb:
if (reuse_skb) {
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
mpext->data_len += copy;
- WARN_ON_ONCE(zero_window_probe);
goto out;
}
@@ -1767,6 +1760,18 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
return ret;
}
+static int do_copy_data_nocache(struct sock *sk, int copy,
+ struct iov_iter *from, char *to)
+{
+ if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
+ if (!copy_from_iter_full_nocache(to, copy, from))
+ return -EFAULT;
+ } else if (!copy_from_iter_full(to, copy, from)) {
+ return -EFAULT;
+ }
+ return 0;
+}
+
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1840,11 +1845,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (!sk_wmem_schedule(sk, total_ts))
goto wait_for_memory;
- if (copy_page_from_iter(dfrag->page, offset, psize,
- &msg->msg_iter) != psize) {
- ret = -EFAULT;
+ ret = do_copy_data_nocache(sk, psize, &msg->msg_iter,
+ page_address(dfrag->page) + offset);
+ if (ret)
goto do_error;
- }
/* data successfully copied into the write queue */
sk_forward_alloc_add(sk, -total_ts);
@@ -1928,6 +1932,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
if (!(flags & MSG_PEEK)) {
MPTCP_SKB_CB(skb)->offset += count;
MPTCP_SKB_CB(skb)->map_seq += count;
+ msk->bytes_consumed += count;
}
break;
}
@@ -1938,6 +1943,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize);
__skb_unlink(skb, &msk->receive_queue);
__kfree_skb(skb);
+ msk->bytes_consumed += count;
}
if (copied >= len)
@@ -2354,6 +2360,26 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
#define MPTCP_CF_PUSH BIT(1)
#define MPTCP_CF_FASTCLOSE BIT(2)
+/* be sure to send a reset only if the caller asked for it, also
+ * clean completely the subflow status when the subflow reaches
+ * TCP_CLOSE state
+ */
+static void __mptcp_subflow_disconnect(struct sock *ssk,
+ struct mptcp_subflow_context *subflow,
+ unsigned int flags)
+{
+ if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
+ (flags & MPTCP_CF_FASTCLOSE)) {
+ /* The MPTCP code never wait on the subflow sockets, TCP-level
+ * disconnect should never fail
+ */
+ WARN_ON_ONCE(tcp_disconnect(ssk, 0));
+ mptcp_subflow_ctx_reset(subflow);
+ } else {
+ tcp_shutdown(ssk, SEND_SHUTDOWN);
+ }
+}
+
/* subflow sockets can be either outgoing (connect) or incoming
* (accept).
*
@@ -2377,8 +2403,8 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
if (msk->in_accept_queue && msk->first == ssk &&
(sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) {
/* ensure later check in mptcp_worker() will dispose the msk */
- mptcp_set_close_tout(sk, tcp_jiffies32 - (TCP_TIMEWAIT_LEN + 1));
sock_set_flag(sk, SOCK_DEAD);
+ mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1));
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
mptcp_subflow_drop_ctx(ssk);
goto out_release;
@@ -2391,7 +2417,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) {
- /* be sure to force the tcp_disconnect() path,
+ /* be sure to force the tcp_close path
* to generate the egress reset
*/
ssk->sk_lingertime = 0;
@@ -2401,11 +2427,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
if (!dispose_it) {
- /* The MPTCP code never wait on the subflow sockets, TCP-level
- * disconnect should never fail
- */
- WARN_ON_ONCE(tcp_disconnect(ssk, 0));
- mptcp_subflow_ctx_reset(subflow);
+ __mptcp_subflow_disconnect(ssk, subflow, flags);
release_sock(ssk);
goto out;
@@ -2438,6 +2460,7 @@ out_release:
WRITE_ONCE(msk->first, NULL);
out:
+ __mptcp_sync_sndbuf(sk);
if (need_push)
__mptcp_push_pending(sk, 0);
@@ -2467,7 +2490,7 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
/* subflow aborted before reaching the fully_established status
* attempt the creation of the next subflow
*/
- mptcp_pm_subflow_check_next(mptcp_sk(sk), ssk, subflow);
+ mptcp_pm_subflow_check_next(mptcp_sk(sk), subflow);
__mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH);
}
@@ -2506,7 +2529,7 @@ static bool mptcp_close_tout_expired(const struct sock *sk)
return false;
return time_after32(tcp_jiffies32,
- inet_csk(sk)->icsk_mtup.probe_timestamp + TCP_TIMEWAIT_LEN);
+ inet_csk(sk)->icsk_mtup.probe_timestamp + mptcp_close_timeout(sk));
}
static void mptcp_check_fastclose(struct mptcp_sock *msk)
@@ -2649,7 +2672,7 @@ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout)
return;
close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies +
- TCP_TIMEWAIT_LEN;
+ mptcp_close_timeout(sk);
/* the close timeout takes precedence on the fail one, and here at least one of
* them is active
@@ -2745,6 +2768,7 @@ static void __mptcp_init_sock(struct sock *sk)
msk->rmem_fwd_alloc = 0;
WRITE_ONCE(msk->rmem_released, 0);
msk->timer_ival = TCP_RTO_MIN;
+ msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
WRITE_ONCE(msk->first, NULL);
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
@@ -2954,16 +2978,9 @@ void __mptcp_unaccepted_force_close(struct sock *sk)
__mptcp_destroy_sock(sk);
}
-static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
+static __poll_t mptcp_check_readable(struct sock *sk)
{
- /* Concurrent splices from sk_receive_queue into receive_queue will
- * always show at least one non-empty queue when checked in this order.
- */
- if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
- skb_queue_empty_lockless(&msk->receive_queue))
- return 0;
-
- return EPOLLIN | EPOLLRDNORM;
+ return mptcp_epollin_ready(sk) ? EPOLLIN | EPOLLRDNORM : 0;
}
static void mptcp_check_listen_stop(struct sock *sk)
@@ -3001,7 +3018,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
goto cleanup;
}
- if (mptcp_check_readable(msk) || timeout < 0) {
+ if (mptcp_data_avail(msk) || timeout < 0) {
/* If the msk has read data, or the caller explicitly ask it,
* do the MPTCP equivalent of TCP reset, aka MPTCP fastclose
*/
@@ -3098,12 +3115,6 @@ static int mptcp_disconnect(struct sock *sk, int flags)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- /* Deny disconnect if other threads are blocked in sk_wait_event()
- * or inet_wait_for_connect().
- */
- if (sk->sk_wait_pending)
- return -EBUSY;
-
/* We are on the fastopen error path. We can't call straight into the
* subflows cleanup code due to lock nesting (we are already under
* msk->firstsocket lock).
@@ -3134,6 +3145,7 @@ static int mptcp_disconnect(struct sock *sk, int flags)
msk->snd_data_fin_enable = false;
msk->rcv_fastclose = false;
msk->use_64bit_ack = false;
+ msk->bytes_consumed = 0;
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
mptcp_pm_data_reset(msk);
mptcp_ca_reset(sk);
@@ -3173,7 +3185,6 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
#endif
- nsk->sk_wait_pending = 0;
__mptcp_init_sock(nsk);
msk = mptcp_sk(nsk);
@@ -3216,7 +3227,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
* uses the correct data
*/
mptcp_copy_inaddrs(nsk, ssk);
- mptcp_propagate_sndbuf(nsk, ssk);
+ __mptcp_propagate_sndbuf(nsk, ssk);
mptcp_rcv_space_init(msk, ssk);
bh_unlock_sock(nsk);
@@ -3394,6 +3405,8 @@ static void mptcp_release_cb(struct sock *sk)
__mptcp_set_connected(sk);
if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
__mptcp_error_report(sk);
+ if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags))
+ __mptcp_sync_sndbuf(sk);
}
__mptcp_update_rmem(sk);
@@ -3425,24 +3438,29 @@ static void schedule_3rdack_retransmission(struct sock *ssk)
sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout);
}
-void mptcp_subflow_process_delegated(struct sock *ssk)
+void mptcp_subflow_process_delegated(struct sock *ssk, long status)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct sock *sk = subflow->conn;
- if (test_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status)) {
+ if (status & BIT(MPTCP_DELEGATE_SEND)) {
mptcp_data_lock(sk);
if (!sock_owned_by_user(sk))
__mptcp_subflow_push_pending(sk, ssk, true);
else
__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk);
- mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_SEND);
}
- if (test_bit(MPTCP_DELEGATE_ACK, &subflow->delegated_status)) {
- schedule_3rdack_retransmission(ssk);
- mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_ACK);
+ if (status & BIT(MPTCP_DELEGATE_SNDBUF)) {
+ mptcp_data_lock(sk);
+ if (!sock_owned_by_user(sk))
+ __mptcp_sync_sndbuf(sk);
+ else
+ __set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags);
+ mptcp_data_unlock(sk);
}
+ if (status & BIT(MPTCP_DELEGATE_ACK))
+ schedule_3rdack_retransmission(ssk);
}
static int mptcp_hash(struct sock *sk)
@@ -3525,6 +3543,7 @@ bool mptcp_finish_join(struct sock *ssk)
/* active subflow, already present inside the conn_list */
if (!list_empty(&subflow->node)) {
mptcp_subflow_joined(msk, ssk);
+ mptcp_propagate_sndbuf(parent, ssk);
return true;
}
@@ -3909,7 +3928,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
- mask |= mptcp_check_readable(msk);
+ mask |= mptcp_check_readable(sk);
if (shutdown & SEND_SHUTDOWN)
mask |= EPOLLOUT | EPOLLWRNORM;
else
@@ -3947,6 +3966,7 @@ static const struct proto_ops mptcp_stream_ops = {
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
+ .set_rcvlowat = mptcp_set_rcvlowat,
};
static struct inet_protosw mptcp_protosw = {
@@ -3968,14 +3988,17 @@ static int mptcp_napi_poll(struct napi_struct *napi, int budget)
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bh_lock_sock_nested(ssk);
- if (!sock_owned_by_user(ssk) &&
- mptcp_subflow_has_delegated_action(subflow))
- mptcp_subflow_process_delegated(ssk);
- /* ... elsewhere tcp_release_cb_override already processed
- * the action or will do at next release_sock().
- * In both case must dequeue the subflow here - on the same
- * CPU that scheduled it.
- */
+ if (!sock_owned_by_user(ssk)) {
+ mptcp_subflow_process_delegated(ssk, xchg(&subflow->delegated_status, 0));
+ } else {
+ /* tcp_release_cb_override already processed
+ * the action or will do at next release_sock().
+ * In both case must dequeue the subflow here - on the same
+ * CPU that scheduled it.
+ */
+ smp_wmb();
+ clear_bit(MPTCP_DELEGATE_SCHEDULED, &subflow->delegated_status);
+ }
bh_unlock_sock(ssk);
sock_put(ssk);
@@ -4045,6 +4068,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = inet6_compat_ioctl,
#endif
+ .set_rcvlowat = mptcp_set_rcvlowat,
};
static struct proto mptcp_v6_prot;