aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <[email protected]>2023-08-06 08:24:56 +0100
committerDavid S. Miller <[email protected]>2023-08-06 08:24:56 +0100
commit16fd753995f740fb968edaf5fd57ffb96020102e (patch)
treecaae5595891ec6a1e0d7d138186032e3d909eb96
parent81083076a007d3af3f2216ad9be1374de0687d49 (diff)
parent6e97ba552b8d3dd074a28b8600740b8bed42267b (diff)
Merge branch 'tcp-options-lockless'
Eric Dumazet says: ==================== tcp: set few options locklessly This series is avoiding the socket lock for six TCP options. They are not heavily used, but this exercise can give ideas for other parts of TCP/IP stack :) ==================== Signed-off-by: David S. Miller <[email protected]>
-rw-r--r--include/linux/tcp.h2
-rw-r--r--net/ipv4/tcp.c90
-rw-r--r--net/ipv4/tcp_input.c4
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/ipv4/tcp_timer.c48
5 files changed, 68 insertions, 78 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index d16abdb3541a..3c5efeeb024f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -564,6 +564,6 @@ void __tcp_sock_set_nodelay(struct sock *sk, bool on);
void tcp_sock_set_nodelay(struct sock *sk);
void tcp_sock_set_quickack(struct sock *sk, int val);
int tcp_sock_set_syncnt(struct sock *sk, int val);
-void tcp_sock_set_user_timeout(struct sock *sk, u32 val);
+int tcp_sock_set_user_timeout(struct sock *sk, int val);
#endif /* _LINUX_TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index aca5620cf3ba..4fbc7ff8c53c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2865,7 +2865,7 @@ adjudge_to_death:
if (sk->sk_state == TCP_FIN_WAIT2) {
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->linger2 < 0) {
+ if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_ATOMIC);
__NET_INC_STATS(sock_net(sk),
@@ -3291,18 +3291,21 @@ int tcp_sock_set_syncnt(struct sock *sk, int val)
if (val < 1 || val > MAX_TCP_SYNCNT)
return -EINVAL;
- lock_sock(sk);
WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val);
- release_sock(sk);
return 0;
}
EXPORT_SYMBOL(tcp_sock_set_syncnt);
-void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
+int tcp_sock_set_user_timeout(struct sock *sk, int val)
{
- lock_sock(sk);
+ /* Cap the max time in ms TCP will retry or probe the window
+ * before giving up and aborting (ETIMEDOUT) a connection.
+ */
+ if (val < 0)
+ return -EINVAL;
+
WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val);
- release_sock(sk);
+ return 0;
}
EXPORT_SYMBOL(tcp_sock_set_user_timeout);
@@ -3345,9 +3348,7 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val)
if (val < 1 || val > MAX_TCP_KEEPINTVL)
return -EINVAL;
- lock_sock(sk);
WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ);
- release_sock(sk);
return 0;
}
EXPORT_SYMBOL(tcp_sock_set_keepintvl);
@@ -3357,10 +3358,8 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val)
if (val < 1 || val > MAX_TCP_KEEPCNT)
return -EINVAL;
- lock_sock(sk);
/* Paired with READ_ONCE() in keepalive_probes() */
WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val);
- release_sock(sk);
return 0;
}
EXPORT_SYMBOL(tcp_sock_set_keepcnt);
@@ -3462,6 +3461,32 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
+ /* Handle options that can be set without locking the socket. */
+ switch (optname) {
+ case TCP_SYNCNT:
+ return tcp_sock_set_syncnt(sk, val);
+ case TCP_USER_TIMEOUT:
+ return tcp_sock_set_user_timeout(sk, val);
+ case TCP_KEEPINTVL:
+ return tcp_sock_set_keepintvl(sk, val);
+ case TCP_KEEPCNT:
+ return tcp_sock_set_keepcnt(sk, val);
+ case TCP_LINGER2:
+ if (val < 0)
+ WRITE_ONCE(tp->linger2, -1);
+ else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
+ WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
+ else
+ WRITE_ONCE(tp->linger2, val * HZ);
+ return 0;
+ case TCP_DEFER_ACCEPT:
+ /* Translate value in seconds to number of retransmits */
+ WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
+ secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+ TCP_RTO_MAX / HZ));
+ return 0;
+ }
+
sockopt_lock_sock(sk);
switch (optname) {
@@ -3557,25 +3582,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
case TCP_KEEPIDLE:
err = tcp_sock_set_keepidle_locked(sk, val);
break;
- case TCP_KEEPINTVL:
- if (val < 1 || val > MAX_TCP_KEEPINTVL)
- err = -EINVAL;
- else
- WRITE_ONCE(tp->keepalive_intvl, val * HZ);
- break;
- case TCP_KEEPCNT:
- if (val < 1 || val > MAX_TCP_KEEPCNT)
- err = -EINVAL;
- else
- WRITE_ONCE(tp->keepalive_probes, val);
- break;
- case TCP_SYNCNT:
- if (val < 1 || val > MAX_TCP_SYNCNT)
- err = -EINVAL;
- else
- WRITE_ONCE(icsk->icsk_syn_retries, val);
- break;
-
case TCP_SAVE_SYN:
/* 0: disable, 1: enable, 2: start from ether_header */
if (val < 0 || val > 2)
@@ -3584,22 +3590,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
tp->save_syn = val;
break;
- case TCP_LINGER2:
- if (val < 0)
- WRITE_ONCE(tp->linger2, -1);
- else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
- WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
- else
- WRITE_ONCE(tp->linger2, val * HZ);
- break;
-
- case TCP_DEFER_ACCEPT:
- /* Translate value in seconds to number of retransmits */
- WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
- secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
- TCP_RTO_MAX / HZ));
- break;
-
case TCP_WINDOW_CLAMP:
err = tcp_set_window_clamp(sk, val);
break;
@@ -3614,16 +3604,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
break;
#endif
- case TCP_USER_TIMEOUT:
- /* Cap the max time in ms TCP will retry or probe the window
- * before giving up and aborting (ETIMEDOUT) a connection.
- */
- if (val < 0)
- err = -EINVAL;
- else
- WRITE_ONCE(icsk->icsk_user_timeout, val);
- break;
-
case TCP_FASTOPEN:
if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
TCPF_LISTEN))) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2995802126e4..8e96ebe373d7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6324,7 +6324,7 @@ consume:
if (fastopen_fail)
return -1;
if (sk->sk_write_pending ||
- icsk->icsk_accept_queue.rskq_defer_accept ||
+ READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) ||
inet_csk_in_pingpong_mode(sk)) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
@@ -6624,7 +6624,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
break;
}
- if (tp->linger2 < 0) {
+ if (READ_ONCE(tp->linger2) < 0) {
tcp_done(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ddba3b67f7c8..13ee12983c42 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -792,7 +792,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
return sk;
/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
- if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+ if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) &&
TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
inet_rsk(req)->acked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 470f581eedd4..d45c96c7f5a4 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -26,14 +26,15 @@
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- u32 elapsed, start_ts;
+ u32 elapsed, start_ts, user_timeout;
s32 remaining;
start_ts = tcp_sk(sk)->retrans_stamp;
- if (!icsk->icsk_user_timeout)
+ user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+ if (!user_timeout)
return icsk->icsk_rto;
elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
- remaining = icsk->icsk_user_timeout - elapsed;
+ remaining = user_timeout - elapsed;
if (remaining <= 0)
return 1; /* user timeout has passed; fire ASAP */
@@ -43,16 +44,17 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- u32 remaining;
+ u32 remaining, user_timeout;
s32 elapsed;
- if (!icsk->icsk_user_timeout || !icsk->icsk_probes_tstamp)
+ user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+ if (!user_timeout || !icsk->icsk_probes_tstamp)
return when;
elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
if (unlikely(elapsed < 0))
elapsed = 0;
- remaining = msecs_to_jiffies(icsk->icsk_user_timeout) - elapsed;
+ remaining = msecs_to_jiffies(user_timeout) - elapsed;
remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN);
return min_t(u32, remaining, when);
@@ -239,7 +241,8 @@ static int tcp_write_timeout(struct sock *sk)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits)
__dst_negative_advice(sk);
- retry_until = icsk->icsk_syn_retries ? :
+ /* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */
+ retry_until = READ_ONCE(icsk->icsk_syn_retries) ? :
READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
max_retransmits = retry_until;
@@ -269,7 +272,7 @@ static int tcp_write_timeout(struct sock *sk)
}
if (!expired)
expired = retransmits_timed_out(sk, retry_until,
- icsk->icsk_user_timeout);
+ READ_ONCE(icsk->icsk_user_timeout));
tcp_fastopen_active_detect_blackhole(sk, expired);
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
@@ -383,13 +386,16 @@ static void tcp_probe_timer(struct sock *sk)
* corresponding system limit. We also implement similar policy when
* we use RTO to probe window in tcp_retransmit_timer().
*/
- if (!icsk->icsk_probes_tstamp)
+ if (!icsk->icsk_probes_tstamp) {
icsk->icsk_probes_tstamp = tcp_jiffies32;
- else if (icsk->icsk_user_timeout &&
- (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
- msecs_to_jiffies(icsk->icsk_user_timeout))
- goto abort;
+ } else {
+ u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+ if (user_timeout &&
+ (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
+ msecs_to_jiffies(user_timeout))
+ goto abort;
+ }
max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
if (sock_flag(sk, SOCK_DEAD)) {
const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
@@ -421,8 +427,10 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
req->rsk_ops->syn_ack_timeout(req);
- /* add one more retry for fastopen */
- max_retries = icsk->icsk_syn_retries ? :
+ /* Add one more retry for fastopen.
+ * Paired with WRITE_ONCE() in tcp_sock_set_syncnt()
+ */
+ max_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1;
if (req->num_timeout >= max_retries) {
@@ -706,7 +714,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
tcp_mstamp_refresh(tp);
if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
- if (tp->linger2 >= 0) {
+ if (READ_ONCE(tp->linger2) >= 0) {
const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
if (tmo > 0) {
@@ -731,13 +739,15 @@ static void tcp_keepalive_timer (struct timer_list *t)
elapsed = keepalive_time_elapsed(tp);
if (elapsed >= keepalive_time_when(tp)) {
+ u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+
/* If the TCP_USER_TIMEOUT option is enabled, use that
* to determine when to timeout instead.
*/
- if ((icsk->icsk_user_timeout != 0 &&
- elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) &&
+ if ((user_timeout != 0 &&
+ elapsed >= msecs_to_jiffies(user_timeout) &&
icsk->icsk_probes_out > 0) ||
- (icsk->icsk_user_timeout == 0 &&
+ (user_timeout == 0 &&
icsk->icsk_probes_out >= keepalive_probes(tp))) {
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_write_err(sk);