From 86de5921a3d5dd246df661e09bdd0a6131b39ae3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Nov 2018 05:53:59 -0800 Subject: tcp: defer SACK compression after DupThresh Jean-Louis reported a TCP regression and bisected to recent SACK compression. After a loss episode (receiver not able to keep up and dropping packets because its backlog is full), linux TCP stack is sending a single SACK (DUPACK). Sender waits a full RTO timer before recovering losses. While RFC 6675 says in section 5, "Algorithm Details", (2) If DupAcks < DupThresh but IsLost (HighACK + 1) returns true -- indicating at least three segments have arrived above the current cumulative acknowledgment point, which is taken to indicate loss -- go to step (4). ... (4) Invoke fast retransmit and enter loss recovery as follows: there are old TCP stacks not implementing this strategy, and still counting the dupacks before starting fast retransmit. While these stacks probably perform poorly when receivers implement LRO/GRO, we should be a little more gentle to them. This patch makes sure we do not enable SACK compression unless 3 dupacks have been sent since last rcv_nxt update. Ideally we should even rearm the timer to send one or two more DUPACK if no more packets are coming, but that will be work aiming for linux-4.21. Many thanks to Jean-Louis for bisecting the issue, providing packet captures and testing this patch. Fixes: 5d9f4262b7ea ("tcp: add SACK compression") Reported-by: Jean-Louis Dupond Tested-by: Jean-Louis Dupond Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_timer.c') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 676020663ce8..5f8b6d3cd855 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -740,7 +740,7 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { - if (tp->compressed_ack) + if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) tcp_send_ack(sk); } else { if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, -- cgit From 9efdda4e3abed13f0903b7b6e4d4c2102019440a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 24 Nov 2018 09:12:24 -0800 Subject: tcp: address problems caused by EDT misshaps When a qdisc setup including pacing FQ is dismantled and recreated, some TCP packets are sent earlier than instructed by TCP stack. TCP can be fooled when ACK comes back, because the following operation can return a negative value. tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; Some paths in TCP stack were not dealing properly with this, this patch addresses four of them. Fixes: ab408b6dc744 ("tcp: switch tcp and sch_fq to new earliest departure time model") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 16 ++++++++++------ net/ipv4/tcp_timer.c | 10 ++++++---- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'net/ipv4/tcp_timer.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1e37c1388189..a9d9555a973f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -579,10 +579,12 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; u32 delta_us; - if (!delta) - delta = 1; - delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); - tcp_rcv_rtt_update(tp, delta_us, 0); + if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { + if (!delta) + delta = 1; + delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + tcp_rcv_rtt_update(tp, delta_us, 0); + } } } @@ -2910,9 +2912,11 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) { u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; - u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); - seq_rtt_us = ca_rtt_us = delta_us; + if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { + seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + ca_rtt_us = seq_rtt_us; + } } rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ if (seq_rtt_us < 0) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 5f8b6d3cd855..091c53925e4d 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -40,15 +40,17 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 elapsed, start_ts; + s32 remaining; start_ts = tcp_retransmit_stamp(sk); if (!icsk->icsk_user_timeout || !start_ts) return icsk->icsk_rto; elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; - if (elapsed >= icsk->icsk_user_timeout) + remaining = icsk->icsk_user_timeout - elapsed; + if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ - else - return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(icsk->icsk_user_timeout - elapsed)); + + return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining)); } /** @@ -209,7 +211,7 @@ static bool retransmits_timed_out(struct sock *sk, (boundary - linear_backoff_thresh) * TCP_RTO_MAX; timeout = jiffies_to_msecs(timeout); } - return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout; + return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0; } /* A write timeout has occurred. Process the after effects. */ -- cgit From 3976535af0cb9fe34a55f2ffb8d7e6b39a2f8188 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 28 Nov 2018 16:06:43 -0800 Subject: tcp: fix off-by-one bug on aborting window-probing socket Previously there is an off-by-one bug on determining when to abort a stalled window-probing socket. This patch fixes that so it is consistent with tcp_write_timeout(). Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_timer.c') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 091c53925e4d..25efdae4368a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -378,7 +378,7 @@ static void tcp_probe_timer(struct sock *sk) return; } - if (icsk->icsk_probes_out > max_probes) { + if (icsk->icsk_probes_out >= max_probes) { abort: tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. */ -- cgit From e1561fe2dd69dc5dddd69bd73aa65355bdfb048b Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 28 Nov 2018 16:06:45 -0800 Subject: tcp: fix SNMP TCP timeout under-estimation Previously the SNMP TCPTIMEOUTS counter has inconsistent accounting: 1. It counts all SYN and SYN-ACK timeouts 2. It counts timeouts in other states except recurring timeouts and timeouts after fast recovery or disorder state. Such selective accounting makes analysis difficult and complicated. For example the monitoring system needs to collect many other SNMP counters to infer the total amount of timeout events. This patch makes TCPTIMEOUTS counter simply counts all the retransmit timeout (SYN or data or FIN). Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_timer.c') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 25efdae4368a..f87dbc78b6bc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -484,11 +484,12 @@ void tcp_retransmit_timer(struct sock *sk) goto out_reset_timer; } + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS); if (tcp_write_timeout(sk)) goto out; if (icsk->icsk_retransmits == 0) { - int mib_idx; + int mib_idx = 0; if (icsk->icsk_ca_state == TCP_CA_Recovery) { if (tcp_is_sack(tp)) @@ -503,10 +504,9 @@ void tcp_retransmit_timer(struct sock *sk) mib_idx = LINUX_MIB_TCPSACKFAILURES; else mib_idx = LINUX_MIB_TCPRENOFAILURES; - } else { - mib_idx = LINUX_MIB_TCPTIMEOUTS; } - __NET_INC_STATS(sock_net(sk), mib_idx); + if (mib_idx) + __NET_INC_STATS(sock_net(sk), mib_idx); } tcp_enter_loss(sk); -- cgit