From 57dde7f70de34d4251f291c9eac7ad920aaf56b2 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:33 -0800 Subject: tcp: add reordering timer in RACK loss detection This patch makes RACK install a reordering timer when it suspects some packets might be lost, but wants to delay the decision a little bit to accomodate reordering. It does not create a new timer but instead repurposes the existing RTO timer, because both are meant to retransmit packets. Specifically it arms a timer ICSK_TIME_REO_TIMEOUT when the RACK timing check fails. The wait time is set to RACK.RTT + RACK.reo_wnd - (NOW - Packet.xmit_time) + fudge This translates to expecting a packet (Packet) should take (RACK.RTT + RACK.reo_wnd + fudge) to deliver after it was sent. When there are multiple packets that need a timer, we use one timer with the maximum timeout. Therefore the timer conservatively uses the maximum window to expire N packets by one timeout, instead of N timeouts to expire N packets sent at different times. The fudge factor is 2 jiffies to ensure when the timer fires, all the suspected packets would exceed the deadline and be marked lost by tcp_rack_detect_loss(). It has to be at least 1 jiffy because the clock may tick between calling icsk_reset_xmit_timer(timeout) and actually hang the timer. The next jiffy is to lower-bound the timeout to 2 jiffies when reo_wnd is < 1ms. When the reordering timer fires (tcp_rack_reo_timeout): If we aren't in Recovery we'll enter fast recovery and force fast retransmit. This is very similar to the early retransmit (RFC5827) except RACK is not constrained to only enter recovery for small outstanding flights. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1d5331a1b1dc..0ba9026cb70d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2960,7 +2960,8 @@ begin_fwd: if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); - if (skb == tcp_write_queue_head(sk)) + if (skb == tcp_write_queue_head(sk) && + icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); -- cgit From 840a3cbe89694fad75578856976f180e852e69aa Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:38 -0800 Subject: tcp: remove forward retransmit feature Forward retransmit is an esoteric feature in RFC3517 (condition(3) in the NextSeg()). Basically if a packet is not considered lost by the current criteria (# of dupacks etc), but the congestion window has room for more packets, then retransmit this packet. However it actually conflicts with the rest of recovery design. For example, when reordering is detected we want to be conservative in retransmitting packets but forward-retransmit feature would break that to force more retransmission. Also the implementation is fairly complicated inside the retransmission logic inducing extra iterations in the write queue. With RACK losses are being detected timely and this heuristic is no longer necessary. There this patch removes the feature. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 - net/ipv4/tcp_input.c | 5 ----- net/ipv4/tcp_output.c | 61 +++------------------------------------------------ 3 files changed, 3 insertions(+), 64 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 970d5f00589f..8e5f4c15d0e5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -307,7 +307,6 @@ struct tcp_sock { */ int lost_cnt_hint; - u32 retransmit_high; /* L-bits may be on up to this seqno */ u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9469ce384d3b..a041a92348ee 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -916,10 +916,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) tp->retransmit_skb_hint = skb; - - if (!tp->lost_out || - after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high)) - tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } /* Sum the number of packets on the wire we have marked as lost. @@ -1983,7 +1979,6 @@ void tcp_enter_loss(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } } tcp_verify_left_out(tp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0ba9026cb70d..6327e4d368a4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2831,36 +2831,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) return err; } -/* Check if we forward retransmits are possible in the current - * window/congestion state. - */ -static bool tcp_can_forward_retransmit(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - const struct tcp_sock *tp = tcp_sk(sk); - - /* Forward retransmissions are possible only during Recovery. */ - if (icsk->icsk_ca_state != TCP_CA_Recovery) - return false; - - /* No forward retransmissions in Reno are possible. */ - if (tcp_is_reno(tp)) - return false; - - /* Yeah, we have to make difficult choice between forward transmission - * and retransmission... Both ways have their merits... - * - * For now we do not retransmit anything, while we have some new - * segments to send. In the other cases, follow rule 3 for - * NextSeg() specified in RFC3517. - */ - - if (tcp_may_send_now(sk)) - return false; - - return true; -} - /* This gets called after a retransmit timeout, and the initially * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either @@ -2875,24 +2845,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; struct sk_buff *hole = NULL; - u32 max_segs, last_lost; + u32 max_segs; int mib_idx; - int fwd_rexmitting = 0; if (!tp->packets_out) return; - if (!tp->lost_out) - tp->retransmit_high = tp->snd_una; - if (tp->retransmit_skb_hint) { skb = tp->retransmit_skb_hint; - last_lost = TCP_SKB_CB(skb)->end_seq; - if (after(last_lost, tp->retransmit_high)) - last_lost = tp->retransmit_high; } else { skb = tcp_write_queue_head(sk); - last_lost = tp->snd_una; } max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); @@ -2915,31 +2877,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ segs = min_t(int, segs, max_segs); - if (fwd_rexmitting) { -begin_fwd: - if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) - break; - mib_idx = LINUX_MIB_TCPFORWARDRETRANS; - - } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { - tp->retransmit_high = last_lost; - if (!tcp_can_forward_retransmit(sk)) - break; - /* Backtrack if necessary to non-L'ed skb */ - if (hole) { - skb = hole; - hole = NULL; - } - fwd_rexmitting = 1; - goto begin_fwd; - + if (tp->retrans_out >= tp->lost_out) { + break; } else if (!(sacked & TCPCB_LOST)) { if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) hole = skb; continue; } else { - last_lost = TCP_SKB_CB(skb)->end_seq; if (icsk->icsk_ca_state != TCP_CA_Loss) mib_idx = LINUX_MIB_TCPFASTRETRANS; else -- cgit From bec41a11dd3dc8c54f766b4f494140ca92ba7c10 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:39 -0800 Subject: tcp: remove early retransmit This patch removes the support of RFC5827 early retransmit (i.e., fast recovery on small inflight with <3 dupacks) because it is subsumed by the new RACK loss detection. More specifically when RACK receives DUPACKs, it'll arm a reordering timer to start fast recovery after a quarter of (min)RTT, hence it covers the early retransmit except RACK does not limit itself to specific inflight or dupack numbers. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 19 +++-------- include/linux/tcp.h | 3 +- include/net/tcp.h | 19 ----------- net/ipv4/inet_diag.c | 1 - net/ipv4/tcp.c | 3 -- net/ipv4/tcp_input.c | 60 ++-------------------------------- net/ipv4/tcp_ipv4.c | 1 - net/ipv4/tcp_metrics.c | 1 - net/ipv4/tcp_minisocks.c | 1 - net/ipv4/tcp_output.c | 11 +++---- net/ipv4/tcp_timer.c | 3 -- net/ipv6/tcp_ipv6.c | 1 - 12 files changed, 12 insertions(+), 111 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 7dd65c9cf707..7de2cf79e16f 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -246,21 +246,12 @@ tcp_dsack - BOOLEAN Allows TCP to send "duplicate" SACKs. tcp_early_retrans - INTEGER - Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold - for triggering fast retransmit when the amount of outstanding data is - small and when no previously unsent data can be transmitted (such - that limited transmit could be used). Also controls the use of - Tail loss probe (TLP) that converts RTOs occurring due to tail - losses into fast recovery (draft-dukkipati-tcpm-tcp-loss-probe-01). + Tail loss probe (TLP) converts RTOs occurring due to tail + losses into fast recovery (draft-ietf-tcpm-rack). Note that + TLP requires RACK to function properly (see tcp_recovery below) Possible values: - 0 disables ER - 1 enables ER - 2 enables ER but delays fast recovery and fast retransmit - by a fourth of RTT. This mitigates connection falsely - recovers when network has a small degree of reordering - (less than 3 packets). - 3 enables delayed ER and TLP. - 4 enables TLP only. + 0 disables TLP + 3 or 4 enables TLP Default: 3 tcp_ecn - INTEGER diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 8e5f4c15d0e5..4733368f953a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -224,8 +224,7 @@ struct tcp_sock { repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; - u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ - syn_data:1, /* SYN includes data */ + u8 syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 423438dd6fe9..c55d65f74f7f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -565,7 +565,6 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb); /* tcp_input.c */ -void tcp_resume_early_retransmit(struct sock *sk); void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_reset(struct sock *sk); @@ -1037,24 +1036,6 @@ static inline void tcp_enable_fack(struct tcp_sock *tp) tp->rx_opt.sack_ok |= TCP_FACK_ENABLED; } -/* TCP early-retransmit (ER) is similar to but more conservative than - * the thin-dupack feature. Enable ER only if thin-dupack is disabled. - */ -static inline void tcp_enable_early_retrans(struct tcp_sock *tp) -{ - struct net *net = sock_net((struct sock *)tp); - - tp->do_early_retrans = sysctl_tcp_early_retrans && - sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack && - !(sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) && - net->ipv4.sysctl_tcp_reordering == 3; -} - -static inline void tcp_disable_early_retrans(struct tcp_sock *tp) -{ - tp->do_early_retrans = 0; -} - static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index d216e40623d3..3828b3a805cd 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -215,7 +215,6 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c8d46c140b4a..d9023e8ed53e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -406,7 +406,6 @@ void tcp_init_sock(struct sock *sk) tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; - tcp_enable_early_retrans(tp); tcp_assign_congestion_control(sk); tp->tsoffset = 0; @@ -2477,8 +2476,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EINVAL; else { tp->thin_dupack = val; - if (tp->thin_dupack) - tcp_disable_early_retrans(tp); } break; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a041a92348ee..79c819077a59 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -904,8 +904,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric, tcp_disable_fack(tp); } - if (metric > 0) - tcp_disable_early_retrans(tp); tp->rack.reord = 1; } @@ -2054,30 +2052,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } -static bool tcp_pause_early_retransmit(struct sock *sk, int flag) -{ - struct tcp_sock *tp = tcp_sk(sk); - unsigned long delay; - - /* Delay early retransmit and entering fast recovery for - * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples - * available, or RTO is scheduled to fire first. - */ - if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || - (flag & FLAG_ECE) || !tp->srtt_us) - return false; - - delay = max(usecs_to_jiffies(tp->srtt_us >> 5), - msecs_to_jiffies(2)); - - if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) - return false; - - inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay, - TCP_RTO_MAX); - return true; -} - /* Linux NewReno/SACK/FACK/ECN state machine. * -------------------------------------- * @@ -2221,16 +2195,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) tcp_is_sack(tp) && !tcp_send_head(sk)) return true; - /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious - * retransmissions due to small network reorderings, we implement - * Mitigation A.3 in the RFC and delay the retransmission for a short - * interval if appropriate. - */ - if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && - (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) && - !tcp_may_send_now(sk)) - return !tcp_pause_early_retransmit(sk, flag); - return false; } @@ -3050,8 +3014,7 @@ void tcp_rearm_rto(struct sock *sk) } else { u32 rto = inet_csk(sk)->icsk_rto; /* Offset the time elapsed after installing regular RTO */ - if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || - icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || + if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); const u32 rto_time_stamp = @@ -3068,24 +3031,6 @@ void tcp_rearm_rto(struct sock *sk) } } -/* This function is called when the delayed ER timer fires. TCP enters - * fast recovery and performs fast-retransmit. - */ -void tcp_resume_early_retransmit(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tcp_rearm_rto(sk); - - /* Stop if ER is disabled after the delayed ER timer is scheduled */ - if (!tp->do_early_retrans) - return; - - tcp_enter_recovery(sk, false); - tcp_update_scoreboard(sk, 1); - tcp_xmit_retransmit_queue(sk); -} - /* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { @@ -3651,8 +3596,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) skb_mstamp_get(&sack_state.ack_time); - if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) + if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); if (after(ack, prior_snd_una)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ebf3e0c4967a..63214136cf1c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2229,7 +2229,6 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) int state; if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index ba8f02d0f283..b9ed0d50aead 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -522,7 +522,6 @@ void tcp_init_metrics(struct sock *sk) val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val && tp->reordering != val) { tcp_disable_fack(tp); - tcp_disable_early_retrans(tp); tp->reordering = val; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 06fde26a82b7..bdb443471c39 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -468,7 +468,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->sacked_out = 0; newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - tcp_enable_early_retrans(newtp); newtp->tlp_high_seq = 0; newtp->lsndtime = treq->snt_synack.stamp_jiffies; newsk->sk_txhash = treq->txhash; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6327e4d368a4..9a1a1494b9dd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -76,10 +76,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; tp->packets_out += tcp_skb_pcount(skb); - if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); - } NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, tcp_skb_pcount(skb)); @@ -2289,8 +2287,6 @@ bool tcp_schedule_loss_probe(struct sock *sk) u32 timeout, tlp_time_stamp, rto_time_stamp; u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); - if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) - return false; /* No consecutive loss probes. */ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { tcp_rearm_rto(sk); @@ -2309,8 +2305,9 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if (sysctl_tcp_early_retrans < 3 || !tp->packets_out || - !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) + if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) || + !tp->packets_out || !tcp_is_sack(tp) || + icsk->icsk_ca_state != TCP_CA_Open) return false; if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 953c02a8566e..40d893556e67 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -566,9 +566,6 @@ void tcp_write_timer_handler(struct sock *sk) case ICSK_TIME_REO_TIMEOUT: tcp_rack_reo_timeout(sk); break; - case ICSK_TIME_EARLY_RETRANS: - tcp_resume_early_retransmit(sk); - break; case ICSK_TIME_LOSS_PROBE: tcp_send_loss_probe(sk); break; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f52c3742b404..fc14e04028bf 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1745,7 +1745,6 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) srcp = ntohs(inet->inet_sport); if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; -- cgit From 065263f40f0972d5f1cd294bb0242bd5aa5f06b2 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 23 Jan 2017 10:59:20 -0800 Subject: net/tcp-fastopen: refactor cookie check logic Refactor the cookie check logic in tcp_send_syn_data() into a function. This function will be called else where in later changes. Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_fastopen.c | 21 +++++++++++++++++++++ net/ipv4/tcp_output.c | 16 ++-------------- 3 files changed, 25 insertions(+), 14 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index c55d65f74f7f..de67541d7adf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1493,6 +1493,8 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct tcp_fastopen_cookie *foc, struct dst_entry *dst); void tcp_fastopen_init_key_once(bool publish); +bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, + struct tcp_fastopen_cookie *cookie); #define TCP_FASTOPEN_KEY_LENGTH 16 /* Fastopen key context */ diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f51919535ca7..f90e09e1ff4c 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -325,3 +325,24 @@ fastopen: *foc = valid_foc; return NULL; } + +bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, + struct tcp_fastopen_cookie *cookie) +{ + unsigned long last_syn_loss = 0; + int syn_loss = 0; + + tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss); + + /* Recurring FO SYN losses: no cookie or data in SYN */ + if (syn_loss > 1 && + time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { + cookie->len = -1; + return false; + } + if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { + cookie->len = -1; + return true; + } + return cookie->len > 0; +} diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9a1a1494b9dd..671c69535671 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3267,23 +3267,11 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; - int syn_loss = 0, space, err = 0; - unsigned long last_syn_loss = 0; + int space, err = 0; struct sk_buff *syn_data; tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ - tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, - &syn_loss, &last_syn_loss); - /* Recurring FO SYN losses: revert to regular handshake temporarily */ - if (syn_loss > 1 && - time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { - fo->cookie.len = -1; - goto fallback; - } - - if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) - fo->cookie.len = -1; - else if (fo->cookie.len <= 0) + if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie)) goto fallback; /* MSS for SYN-data is based on cached MSS and bounded by PMTU and -- cgit From 678550c651aee051d66112933c87894f129b9355 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 27 Jan 2017 16:24:39 -0800 Subject: tcp: include locally failed retries in retransmission stats Currently the retransmission stats are not incremented if the retransmit fails locally. But we always increment the other packet counters that track total packet/bytes sent. Awkwardly while we don't count these failed retransmits in RETRANSSEGS, we do count them in FAILEDRETRANS. If the qdisc is dropping many packets this could under-estimate TCP retransmission rate substantially from both SNMP or per-socket TCP_INFO stats. This patch changes this by always incrementing retransmission stats on retransmission attempts and failures. Another motivation is to properly track retransmists in SCM_TIMESTAMPING_OPT_STATS. Since SCM_TSTAMP_SCHED collection is triggered in tcp_transmit_skb(), If tp->total_retrans is incremented after the function, we'll always mis-count by the amount of the latest retransmission. Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 671c69535671..7b2d8762f15f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2771,6 +2771,13 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) tcp_ecn_clear_syn(sk, skb); + /* Update global and local TCP statistics. */ + segs = tcp_skb_pcount(skb); + TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + tp->total_retrans += segs; + /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom * beyond what csum_start can cover. @@ -2788,14 +2795,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) } if (likely(!err)) { - segs = tcp_skb_pcount(skb); - TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; - /* Update global TCP statistics. */ - TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) - __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); - tp->total_retrans += segs; + } else if (err != -EBUSY) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } return err; } @@ -2818,8 +2820,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (!tp->retrans_stamp) tp->retrans_stamp = tcp_skb_timestamp(skb); - } else if (err != -EBUSY) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } if (tp->undo_retrans < 0) -- cgit From 3541f9e8bdebce02458882b66b638d7302c1f616 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Feb 2017 08:04:56 -0800 Subject: tcp: add tcp_mss_clamp() helper Small cleanup factorizing code doing the TCP_MAXSEG clamping. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 9 +++++++++ net/ipv4/tcp_ipv4.c | 5 +---- net/ipv4/tcp_minisocks.c | 7 ++----- net/ipv4/tcp_output.c | 14 ++++---------- net/ipv6/tcp_ipv6.c | 5 +---- 5 files changed, 17 insertions(+), 23 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f88f4649ba6f..cfc2d9506ce8 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -445,4 +445,13 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp) struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk); +static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) +{ + /* We use READ_ONCE() here because socket might not be locked. + * This happens for listeners. + */ + u16 user_mss = READ_ONCE(tp->rx_opt.user_mss); + + return (user_mss && user_mss < mss) ? user_mss : mss; +} #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8c9e9aa17d66..8c124d4ef4b7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1324,10 +1324,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, tcp_ca_openreq_child(newsk, dst); tcp_sync_mss(newsk, dst_mtu(dst)); - newtp->advmss = dst_metric_advmss(dst); - if (tcp_sk(sk)->rx_opt.user_mss && - tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) - newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index bdb443471c39..dff7d2aaf861 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -360,15 +360,12 @@ void tcp_openreq_init_rwin(struct request_sock *req, { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk_listener); - u16 user_mss = READ_ONCE(tp->rx_opt.user_mss); int full_space = tcp_full_space(sk_listener); - int mss = dst_metric_advmss(dst); u32 window_clamp; __u8 rcv_wscale; + int mss; - if (user_mss && user_mss < mss) - mss = user_mss; - + mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6d5bab8a3ea6..956bea9a5394 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3062,7 +3062,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct sk_buff *skb; int tcp_header_size; struct tcphdr *th; - u16 user_mss; int mss; skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); @@ -3092,10 +3091,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, } skb_dst_set(skb, dst); - mss = dst_metric_advmss(dst); - user_mss = READ_ONCE(tp->rx_opt.user_mss); - if (user_mss && user_mss < mss) - mss = user_mss; + mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES @@ -3201,9 +3197,7 @@ static void tcp_connect_init(struct sock *sk) if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); - tp->advmss = dst_metric_advmss(dst); - if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) - tp->advmss = tp->rx_opt.user_mss; + tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); tcp_initialize_rcv_mss(sk); @@ -3280,8 +3274,8 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) * user-MSS. Reserve maximum option space for middleboxes that add * private TCP options. The cost is reduced data space in SYN :( */ - if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) - tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; + tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp); + space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 64834ec5ab73..6b9fc63fd4d2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1147,10 +1147,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * tcp_ca_openreq_child(newsk, dst); tcp_sync_mss(newsk, dst_mtu(dst)); - newtp->advmss = dst_metric_advmss(dst); - if (tcp_sk(sk)->rx_opt.user_mss && - tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) - newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); -- cgit From 38ab52e8e1e2dd65f2d349f82553335813b638d2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Feb 2017 20:40:08 -0800 Subject: tcp: clear pfmemalloc on outgoing skb Josef Bacik diagnosed following problem : I was seeing random disconnects while testing NBD over loopback. This turned out to be because NBD sets pfmemalloc on it's socket, however the receiving side is a user space application so does not have pfmemalloc set on its socket. This means that sk_filter_trim_cap will simply drop this packet, under the assumption that the other side will simply retransmit. Well we do retransmit, and then the packet is just dropped again for the same reason. It seems the better way to address this problem is to clear pfmemalloc in the TCP transmit path. pfmemalloc strict control really makes sense on the receive path. Signed-off-by: Eric Dumazet Acked-by: Josef Bacik Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 956bea9a5394..ccf1ef4dcba4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -964,6 +964,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, */ skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1); + /* If we had to use memory reserve to allocate this skb, + * this might cause drops if packet is looped back : + * Other socket might not have SOCK_MEMALLOC. + * Packets not looped back do not care about pfmemalloc. + */ + skb->pfmemalloc = 0; + skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); -- cgit From c3a2e8370534f810cac6050169db0ed3e0f94f0b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 6 Feb 2017 23:14:14 +0200 Subject: tcp: replace dst_confirm with sk_dst_confirm When same struct dst_entry can be used for many different neighbours we can not use it for pending confirmations. Use the new sk_dst_confirm() helper to propagate the indication from received packets to sock_confirm_neigh(). Reported-by: YueHaibing Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.") Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.") Tested-by: YueHaibing Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 12 +++--------- net/ipv4/tcp_metrics.c | 7 ++----- net/ipv4/tcp_output.c | 2 ++ 3 files changed, 7 insertions(+), 14 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 27c95acbb52f..2c0ff327b6df 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3644,11 +3644,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { - struct dst_entry *dst = __sk_dst_get(sk); - if (dst) - dst_confirm(dst); - } + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) + sk_dst_confirm(sk); if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); @@ -5995,7 +5992,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; case TCP_FIN_WAIT1: { - struct dst_entry *dst; int tmo; /* If we enter the TCP_FIN_WAIT1 state and we are a @@ -6022,9 +6018,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_FIN_WAIT2); sk->sk_shutdown |= SEND_SHUTDOWN; - dst = __sk_dst_get(sk); - if (dst) - dst_confirm(dst); + sk_dst_confirm(sk); if (!sock_flag(sk, SOCK_DEAD)) { /* Wake up lingering close() */ diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index b9ed0d50aead..0f46e5fe31ad 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -375,12 +375,10 @@ void tcp_update_metrics(struct sock *sk) u32 val; int m; + sk_dst_confirm(sk); if (sysctl_tcp_nometrics_save || !dst) return; - if (dst->flags & DST_HOST) - dst_confirm(dst); - rcu_read_lock(); if (icsk->icsk_backoff || !tp->srtt_us) { /* This session failed to estimate rtt. Why? @@ -493,11 +491,10 @@ void tcp_init_metrics(struct sock *sk) struct tcp_metrics_block *tm; u32 val, crtt = 0; /* cached RTT scaled by 8 */ + sk_dst_confirm(sk); if (!dst) goto reset; - dst_confirm(dst); - rcu_read_lock(); tm = tcp_get_metrics(sk, dst, true); if (!tm) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ccf1ef4dcba4..61f272a99a49 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -980,6 +980,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_set_hash_from_sk(skb, sk); atomic_add(skb->truesize, &sk->sk_wmem_alloc); + skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm); + /* Build TCP header and checksum it. */ th = (struct tcphdr *)skb->data; th->source = inet->inet_sport; -- cgit From a4ecb15a2432880353d0de4a35204ab8b65b8751 Mon Sep 17 00:00:00 2001 From: "Cui, Cheng" Date: Fri, 17 Feb 2017 17:04:55 +0000 Subject: tcp: accommodate sequence number to a peer's shrunk receive window caused by precision loss in window scaling Prevent sending out a left-shifted sequence number from a Linux sender in response to a peer's shrunk receive-window caused by losing least significant bits in window-scaling. Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Signed-off-by: Cheng Cui Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 61f272a99a49..22548b5f05cb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -83,7 +83,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) tcp_skb_pcount(skb)); } -/* SND.NXT, if window was not shrunk. +/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one + * window scaling factor due to loss of precision. * If window has been shrunk, what should we make? It is not clear at all. * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( * Anything in between SND.UNA...SND.UNA+SND.WND also can be already @@ -93,7 +94,9 @@ static inline __u32 tcp_acceptable_seq(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - if (!before(tcp_wnd_end(tp), tp->snd_nxt)) + if (!before(tcp_wnd_end(tp), tp->snd_nxt) || + (tp->rx_opt.wscale_ok && + ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale)))) return tp->snd_nxt; else return tcp_wnd_end(tp); -- cgit