From 1946e672c173559155a3e210fe95dbf8b7b8ddf7 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 28 Dec 2016 17:52:32 +0800 Subject: ipv4: Namespaceify tcp_tw_recycle and tcp_max_tw_buckets knob Different namespace application might require fast recycling TIME-WAIT sockets independently of the host. Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6c790754ae3e..c61480249835 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6363,7 +6363,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, * timewait bucket, so that all the necessary checks * are made in the function processing timewait state. */ - if (tcp_death_row.sysctl_tw_recycle) { + if (net->ipv4.tcp_death_row.sysctl_tw_recycle) { bool strict; dst = af_ops->route_req(sk, &fl, req, &strict); -- cgit From fee83d097b1620530f23bf6063f4ea251ba9c8c7 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 28 Dec 2016 17:52:33 +0800 Subject: ipv4: Namespaceify tcp_max_syn_backlog knob Different namespace application might require different maximal number of remembered connection requests. Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/request_sock.h | 4 +--- net/core/request_sock.c | 2 -- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp.c | 2 -- net/ipv4/tcp_input.c | 4 ++-- net/ipv4/tcp_ipv4.c | 7 +++++-- 7 files changed, 16 insertions(+), 18 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index fffd38453985..8e3f5b6f26d5 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -122,6 +122,7 @@ struct netns_ipv4 { unsigned int sysctl_tcp_notsent_lowat; int sysctl_tcp_tw_reuse; struct inet_timewait_death_row tcp_death_row; + int sysctl_max_syn_backlog; int sysctl_igmp_max_memberships; int sysctl_igmp_max_msf; diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 6ebe13eb1c4c..a12a5d25b27e 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -1,7 +1,7 @@ /* * NET Generic infrastructure for Network protocols. * - * Definitions for request_sock + * Definitions for request_sock * * Authors: Arnaldo Carvalho de Melo * @@ -123,8 +123,6 @@ static inline void reqsk_put(struct request_sock *req) reqsk_free(req); } -extern int sysctl_max_syn_backlog; - /* * For a TCP Fast Open listener - * lock - protects the access to all the reqsk, which is co-owned by diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 5d26056b6d8f..9b8727c67b58 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -34,8 +34,6 @@ * and it will increase in proportion to the memory of machine. * Note : Dont forget somaxconn that may limit backlog too. 
*/ -int sysctl_max_syn_backlog = 256; -EXPORT_SYMBOL(sysctl_max_syn_backlog); void reqsk_queue_alloc(struct request_sock_queue *queue) { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 66f8f1b1dc78..134d8e191366 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -323,13 +323,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_max_syn_backlog", - .data = &sysctl_max_syn_backlog, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, @@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_max_syn_backlog", + .data = &init_net.ipv4.sysctl_max_syn_backlog, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7f0d81c090ce..2e3807d8eba8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3378,9 +3378,7 @@ void __init tcp_init(void) cnt = tcp_hashinfo.ehash_mask + 1; - sysctl_tcp_max_orphans = cnt / 2; - sysctl_max_syn_backlog = max(128, cnt / 256); tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c61480249835..ec6d84363024 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6377,8 +6377,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } /* Kill the following clause, if you dislike this way. */ else if (!net->ipv4.sysctl_tcp_syncookies && - (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (sysctl_max_syn_backlog >> 2)) && + (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (net->ipv4.sysctl_max_syn_backlog >> 2)) && !tcp_peer_is_proven(req, dst, false, tmp_opt.saw_tstamp)) { /* Without syncookies last quarter of diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 56b5f49e3f97..7e4be4f361f3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2419,7 +2419,7 @@ static void __net_exit tcp_sk_exit(struct net *net) static int __net_init tcp_sk_init(struct net *net) { - int res, cpu; + int res, cpu, cnt; net->ipv4.tcp_sk = alloc_percpu(struct sock *); if (!net->ipv4.tcp_sk) @@ -2458,10 +2458,13 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; net->ipv4.sysctl_tcp_tw_reuse = 0; + cnt = tcp_hashinfo.ehash_mask + 1; net->ipv4.tcp_death_row.sysctl_tw_recycle = 0; - net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (tcp_hashinfo.ehash_mask + 1) / 2; + net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2; net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; + net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256); + return 0; fail: tcp_sk_exit(net); -- cgit From e636f8b0104d6622aaaed6aa5ef17dfbf165bc51 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:31 -0800 Subject: tcp: new helper for RACK to detect loss Create a new helper tcp_rack_detect_loss to prepare the upcoming RACK reordering timer patch. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 3 +-- net/ipv4/tcp_input.c | 12 ++++++++---- net/ipv4/tcp_recovery.c | 22 +++++++++++++--------- 3 files changed, 22 insertions(+), 15 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 1da0aa724929..51183bba3835 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1863,8 +1863,7 @@ extern int sysctl_tcp_recovery; /* Use TCP RACK to detect (some) tail and retransmit losses */ #define TCP_RACK_LOST_RETRANS 0x1 -extern int tcp_rack_mark_lost(struct sock *sk); - +extern void tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, const struct skb_mstamp *xmit_time, u8 sacked); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ec6d84363024..bb24b93e64bc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2865,10 +2865,14 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } /* Use RACK to detect loss */ - if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS && - tcp_rack_mark_lost(sk)) { - flag |= FLAG_LOST_RETRANS; - *ack_flag |= FLAG_LOST_RETRANS; + if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) { + u32 prior_retrans = tp->retrans_out; + + tcp_rack_mark_lost(sk); + if (prior_retrans > tp->retrans_out) { + flag |= FLAG_LOST_RETRANS; + *ack_flag |= FLAG_LOST_RETRANS; + } } /* E. Process state. */ diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index f38dba5aed7a..7ea0377229c0 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -32,17 +32,11 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) * The current version is only used after recovery starts but can be * easily extended to detect the first loss. */ -int tcp_rack_mark_lost(struct sock *sk) +static void tcp_rack_detect_loss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - u32 reo_wnd, prior_retrans = tp->retrans_out; - - if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced) - return 0; - - /* Reset the advanced flag to avoid unnecessary queue scanning */ - tp->rack.advanced = 0; + u32 reo_wnd; /* To be more reordering resilient, allow min_rtt/4 settling delay * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed @@ -82,7 +76,17 @@ int tcp_rack_mark_lost(struct sock *sk) break; } } - return prior_retrans - tp->retrans_out; +} + +void tcp_rack_mark_lost(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced) + return; + /* Reset the advanced flag to avoid unnecessary queue scanning */ + tp->rack.advanced = 0; + tcp_rack_detect_loss(sk); } /* Record the most recently (re)sent time among the (s)acked packets */ -- cgit From deed7be78f512d003c6290da0a781479b31b3d74 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:32 -0800 Subject: tcp: record most recent RTT in RACK loss detection Record the most recent RTT in RACK. It is often identical to the "ca_rtt_us" values in tcp_clean_rtx_queue. But when the packet has been retransmitted, RACK choses to believe the ACK is for the (latest) retransmitted packet if the RTT is over minimum RTT. This requires passing the arrival time of the most recent ACK to RACK routines. The timestamp is now recorded in the "ack_time" in tcp_sacktag_state during the ACK processing. This patch does not change the RACK algorithm itself. It only adds the RTT variable to prepare the next main patch. 
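To illustrate the rule described above, here is a minimal, self-contained user-space C sketch (an assumption-laden model, not the kernel implementation): it records the RTT of the most recently (s)acked packet and skips a retransmitted packet whose measured RTT is below the connection's minimum RTT, since such an ACK is probably for the original transmission. The struct, function name and plain microsecond timestamps are illustrative; the actual change to tcp_rack_advance() and struct tcp_rack appears in the diff below.

/*
 * Standalone sketch of the RTT-recording rule (illustrative names and a
 * plain microsecond clock; not the kernel's skb_mstamp/tcp_sock types).
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct rack_state {
	uint64_t xmit_time_us;	/* (re)sent time of the latest (s)acked skb */
	uint32_t rtt_us;	/* RTT associated with that skb */
	bool	 advanced;
};

static void rack_advance(struct rack_state *rack, bool retransmitted,
			 uint64_t xmit_time_us, uint64_t ack_time_us,
			 uint32_t min_rtt_us)
{
	uint32_t rtt_us;

	/* Only advance if this (s)acked skb was sent later than the one
	 * currently recorded.
	 */
	if (rack->xmit_time_us && xmit_time_us <= rack->xmit_time_us)
		return;

	rtt_us = (uint32_t)(ack_time_us - xmit_time_us);
	/* A retransmission acked in less than min RTT is ambiguous: the
	 * ACK is likely for the original transmission, so skip it.
	 */
	if (retransmitted && rtt_us < min_rtt_us)
		return;

	rack->rtt_us = rtt_us;
	rack->xmit_time_us = xmit_time_us;
	rack->advanced = true;
}

int main(void)
{
	struct rack_state rack = { 0 };

	/* Retransmission acked 3ms after (re)sending with a 40ms min RTT:
	 * treated as an ACK of the original, so RACK state is not advanced.
	 */
	rack_advance(&rack, true, 1000000, 1003000, 40000);
	printf("advanced=%d\n", rack.advanced);

	/* Normal packet acked 45ms after sending: RACK records the RTT. */
	rack_advance(&rack, false, 2000000, 2045000, 40000);
	printf("advanced=%d rtt_us=%u\n", rack.advanced, (unsigned)rack.rtt_us);
	return 0;
}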
Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/net/tcp.h | 7 ++++--- net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++-------------- net/ipv4/tcp_recovery.c | 41 +++++++++++++++++++++++------------------ 4 files changed, 50 insertions(+), 35 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fc5848dad7a4..1255c592719c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -207,6 +207,7 @@ struct tcp_sock { /* Information of the most recently (s)acked skb */ struct tcp_rack { struct skb_mstamp mstamp; /* (Re)sent time of the skb */ + u32 rtt_us; /* Associated RTT */ u8 advanced; /* mstamp advanced since last lost marking */ u8 reord; /* reordering detected */ } rack; diff --git a/include/net/tcp.h b/include/net/tcp.h index 51183bba3835..1439107658c2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1863,9 +1863,10 @@ extern int sysctl_tcp_recovery; /* Use TCP RACK to detect (some) tail and retransmit losses */ #define TCP_RACK_LOST_RETRANS 0x1 -extern void tcp_rack_mark_lost(struct sock *sk); -extern void tcp_rack_advance(struct tcp_sock *tp, - const struct skb_mstamp *xmit_time, u8 sacked); +extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now); +extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, + const struct skb_mstamp *xmit_time, + const struct skb_mstamp *ack_time); /* * Save and compile IPv4 options, return a pointer to it diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bb24b93e64bc..8ccd171999bf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1135,6 +1135,7 @@ struct tcp_sacktag_state { */ struct skb_mstamp first_sackt; struct skb_mstamp last_sackt; + struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */ struct rate_sample *rate; int flag; }; @@ -1217,7 +1218,7 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { - tcp_rack_advance(tp, xmit_time, sacked); + tcp_rack_advance(tp, sacked, xmit_time, &state->ack_time); if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, @@ -2813,7 +2814,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) * tcp_xmit_retransmit_queue(). 
*/ static void tcp_fastretrans_alert(struct sock *sk, const int acked, - bool is_dupack, int *ack_flag, int *rexmit) + bool is_dupack, int *ack_flag, int *rexmit, + const struct skb_mstamp *ack_time) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2868,7 +2870,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) { u32 prior_retrans = tp->retrans_out; - tcp_rack_mark_lost(sk); + tcp_rack_mark_lost(sk, ack_time); if (prior_retrans > tp->retrans_out) { flag |= FLAG_LOST_RETRANS; *ack_flag |= FLAG_LOST_RETRANS; @@ -3105,11 +3107,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_snd_una, int *acked, - struct tcp_sacktag_state *sack, - struct skb_mstamp *now) + struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); struct skb_mstamp first_ackt, last_ackt; + struct skb_mstamp *now = &sack->ack_time; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; @@ -3169,7 +3171,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } else if (tcp_is_sack(tp)) { tp->delivered += acked_pcount; if (!tcp_skb_spurious_retrans(tp, skb)) - tcp_rack_advance(tp, &skb->skb_mstamp, sacked); + tcp_rack_advance(tp, sacked, + &skb->skb_mstamp, + &sack->ack_time); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3599,7 +3603,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ - struct skb_mstamp now; sack_state.first_sackt.v64 = 0; sack_state.rate = &rs; @@ -3625,7 +3628,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; - skb_mstamp_get(&now); + skb_mstamp_get(&sack_state.ack_time); if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) @@ -3693,11 +3696,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state, &now); + &sack_state); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, + &sack_state.ack_time); } if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -3712,15 +3716,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_schedule_loss_probe(sk); delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ - tcp_rate_gen(sk, delivered, lost, &now, &rs); - tcp_cong_control(sk, ack, delivered, flag, &rs); + tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time, + sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, + &sack_state.ack_time); /* If this ack opens up a zero window, clear backoff. 
It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3741,9 +3747,11 @@ old_ack: * If data was DSACKed, see if we can undo a cwnd reduction. */ if (TCP_SKB_CB(skb)->sacked) { + skb_mstamp_get(&sack_state.ack_time); flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, + &sack_state.ack_time); tcp_xmit_recovery(sk, rexmit); } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 7ea0377229c0..557363cde58a 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -32,7 +32,7 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) * The current version is only used after recovery starts but can be * easily extended to detect the first loss. */ -static void tcp_rack_detect_loss(struct sock *sk) +static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -62,13 +62,14 @@ static void tcp_rack_detect_loss(struct sock *sk) continue; if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) { - - if (skb_mstamp_us_delta(&tp->rack.mstamp, - &skb->skb_mstamp) <= reo_wnd) - continue; - - /* skb is lost if packet sent later is sacked */ - tcp_rack_mark_skb_lost(sk, skb); + /* Step 3 in draft-cheng-tcpm-rack-00.txt: + * A packet is lost if its elapsed time is beyond + * the recent RTT plus the reordering window. + */ + if (skb_mstamp_us_delta(now, &skb->skb_mstamp) > + tp->rack.rtt_us + reo_wnd) { + tcp_rack_mark_skb_lost(sk, skb); + } } else if (!(scb->sacked & TCPCB_RETRANS)) { /* Original data are sent sequentially so stop early * b/c the rest are all sent after rack_sent @@ -78,7 +79,7 @@ static void tcp_rack_detect_loss(struct sock *sk) } } -void tcp_rack_mark_lost(struct sock *sk) +void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) { struct tcp_sock *tp = tcp_sk(sk); @@ -86,20 +87,25 @@ void tcp_rack_mark_lost(struct sock *sk) return; /* Reset the advanced flag to avoid unnecessary queue scanning */ tp->rack.advanced = 0; - tcp_rack_detect_loss(sk); + tcp_rack_detect_loss(sk, now); } -/* Record the most recently (re)sent time among the (s)acked packets */ -void tcp_rack_advance(struct tcp_sock *tp, - const struct skb_mstamp *xmit_time, u8 sacked) +/* Record the most recently (re)sent time among the (s)acked packets + * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from + * draft-cheng-tcpm-rack-00.txt + */ +void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, + const struct skb_mstamp *xmit_time, + const struct skb_mstamp *ack_time) { + u32 rtt_us; + if (tp->rack.mstamp.v64 && !skb_mstamp_after(xmit_time, &tp->rack.mstamp)) return; + rtt_us = skb_mstamp_us_delta(ack_time, xmit_time); if (sacked & TCPCB_RETRANS) { - struct skb_mstamp now; - /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior * retransmission) was sacked. @@ -110,11 +116,10 @@ void tcp_rack_advance(struct tcp_sock *tp, * so it's at least one RTT (i.e., retransmission is at least * an RTT later). 
*/ - skb_mstamp_get(&now); - if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp)) + if (rtt_us < tcp_min_rtt(tp)) return; } - + tp->rack.rtt_us = rtt_us; tp->rack.mstamp = *xmit_time; tp->rack.advanced = 1; } -- cgit From 57dde7f70de34d4251f291c9eac7ad920aaf56b2 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:33 -0800 Subject: tcp: add reordering timer in RACK loss detection This patch makes RACK install a reordering timer when it suspects some packets might be lost, but wants to delay the decision a little bit to accomodate reordering. It does not create a new timer but instead repurposes the existing RTO timer, because both are meant to retransmit packets. Specifically it arms a timer ICSK_TIME_REO_TIMEOUT when the RACK timing check fails. The wait time is set to RACK.RTT + RACK.reo_wnd - (NOW - Packet.xmit_time) + fudge This translates to expecting a packet (Packet) should take (RACK.RTT + RACK.reo_wnd + fudge) to deliver after it was sent. When there are multiple packets that need a timer, we use one timer with the maximum timeout. Therefore the timer conservatively uses the maximum window to expire N packets by one timeout, instead of N timeouts to expire N packets sent at different times. The fudge factor is 2 jiffies to ensure when the timer fires, all the suspected packets would exceed the deadline and be marked lost by tcp_rack_detect_loss(). It has to be at least 1 jiffy because the clock may tick between calling icsk_reset_xmit_timer(timeout) and actually hang the timer. The next jiffy is to lower-bound the timeout to 2 jiffies when reo_wnd is < 1ms. When the reordering timer fires (tcp_rack_reo_timeout): If we aren't in Recovery we'll enter fast recovery and force fast retransmit. This is very similar to the early retransmit (RFC5827) except RACK is not constrained to only enter recovery for small outstanding flights. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/inet_connection_sock.h | 4 ++- include/net/tcp.h | 4 +++ net/ipv4/inet_diag.c | 1 + net/ipv4/tcp_input.c | 6 ++-- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 3 +- net/ipv4/tcp_recovery.c | 57 +++++++++++++++++++++++++++++++++----- net/ipv4/tcp_timer.c | 3 ++ net/ipv6/tcp_ipv6.c | 1 + 9 files changed, 68 insertions(+), 12 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 85ee3879499e..84b2edde09b1 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -144,6 +144,7 @@ struct inet_connection_sock { #define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ #define ICSK_TIME_EARLY_RETRANS 4 /* Early retransmit timer */ #define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */ +#define ICSK_TIME_REO_TIMEOUT 6 /* Reordering timer */ static inline struct inet_connection_sock *inet_csk(const struct sock *sk) { @@ -234,7 +235,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, } if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || - what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE) { + what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE || + what == ICSK_TIME_REO_TIMEOUT) { icsk->icsk_pending = what; icsk->icsk_timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); diff --git a/include/net/tcp.h b/include/net/tcp.h index 1439107658c2..64fcdeb3358b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -143,6 +143,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes * for local resources. */ +#define TCP_REO_TIMEOUT_MIN (2000) /* Min RACK reordering timeout in usec */ #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ @@ -397,6 +398,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, int tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); +void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag); void tcp_clear_retrans(struct tcp_sock *tp); void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); @@ -541,6 +543,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); void tcp_retransmit_timer(struct sock *sk); void tcp_xmit_retransmit_queue(struct sock *); void tcp_simple_retransmit(struct sock *); +void tcp_enter_recovery(struct sock *sk, bool ece_ack); int tcp_trim_head(struct sock *, struct sk_buff *, u32); int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t); @@ -1867,6 +1870,7 @@ extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, const struct skb_mstamp *xmit_time, const struct skb_mstamp *ack_time); +extern void tcp_rack_reo_timeout(struct sock *sk); /* * Save and compile IPv4 options, return a pointer to it diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 4dea33e5f295..d216e40623d3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -216,6 +216,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, if (icsk->icsk_pending == ICSK_TIME_RETRANS || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == 
ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8ccd171999bf..be1191829963 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2522,8 +2522,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tcp_ecn_queue_cwr(tp); } -static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, - int flag) +void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; @@ -2691,7 +2690,7 @@ void tcp_simple_retransmit(struct sock *sk) } EXPORT_SYMBOL(tcp_simple_retransmit); -static void tcp_enter_recovery(struct sock *sk, bool ece_ack) +void tcp_enter_recovery(struct sock *sk, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); int mib_idx; @@ -3031,6 +3030,7 @@ void tcp_rearm_rto(struct sock *sk) u32 rto = inet_csk(sk)->icsk_rto; /* Offset the time elapsed after installing regular RTO */ if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); const u32 rto_time_stamp = diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 56d756ecfb59..ebf3e0c4967a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2230,6 +2230,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) if (icsk->icsk_pending == ICSK_TIME_RETRANS || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1d5331a1b1dc..0ba9026cb70d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2960,7 +2960,8 @@ begin_fwd: if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); - if (skb == tcp_write_queue_head(sk)) + if (skb == tcp_write_queue_head(sk) && + icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 557363cde58a..eb39b1b6d1dc 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -32,19 +32,18 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) * The current version is only used after recovery starts but can be * easily extended to detect the first loss. */ -static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now) +static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, + u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; u32 reo_wnd; + *reo_timeout = 0; /* To be more reordering resilient, allow min_rtt/4 settling delay * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed * RTT because reordering is often a path property and less related * to queuing or delayed ACKs. - * - * TODO: measure and adapt to the observed reordering delay, and - * use a timer to retransmit like the delayed early retransmit. */ reo_wnd = 1000; if (tp->rack.reord && tcp_min_rtt(tp) != ~0U) @@ -66,10 +65,23 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now) * A packet is lost if its elapsed time is beyond * the recent RTT plus the reordering window. 
*/ - if (skb_mstamp_us_delta(now, &skb->skb_mstamp) > - tp->rack.rtt_us + reo_wnd) { + u32 elapsed = skb_mstamp_us_delta(now, + &skb->skb_mstamp); + s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; + + if (remaining < 0) { tcp_rack_mark_skb_lost(sk, skb); + continue; } + + /* Skip ones marked lost but not yet retransmitted */ + if ((scb->sacked & TCPCB_LOST) && + !(scb->sacked & TCPCB_SACKED_RETRANS)) + continue; + + /* Record maximum wait time (+1 to avoid 0) */ + *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); + } else if (!(scb->sacked & TCPCB_RETRANS)) { /* Original data are sent sequentially so stop early * b/c the rest are all sent after rack_sent @@ -82,12 +94,19 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now) void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) { struct tcp_sock *tp = tcp_sk(sk); + u32 timeout; if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced) return; + /* Reset the advanced flag to avoid unnecessary queue scanning */ tp->rack.advanced = 0; - tcp_rack_detect_loss(sk, now); + tcp_rack_detect_loss(sk, now, &timeout); + if (timeout) { + timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, + timeout, inet_csk(sk)->icsk_rto); + } } /* Record the most recently (re)sent time among the (s)acked packets @@ -123,3 +142,27 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, tp->rack.mstamp = *xmit_time; tp->rack.advanced = 1; } + +/* We have waited long enough to accommodate reordering. Mark the expired + * packets lost and retransmit them. + */ +void tcp_rack_reo_timeout(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct skb_mstamp now; + u32 timeout, prior_inflight; + + skb_mstamp_get(&now); + prior_inflight = tcp_packets_in_flight(tp); + tcp_rack_detect_loss(sk, &now, &timeout); + if (prior_inflight != tcp_packets_in_flight(tp)) { + if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { + tcp_enter_recovery(sk, false); + if (!inet_csk(sk)->icsk_ca_ops->cong_control) + tcp_cwnd_reduction(sk, 1, 0); + } + tcp_xmit_retransmit_queue(sk); + } + if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS) + tcp_rearm_rto(sk); +} diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 29a9bd5f1225..953c02a8566e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -563,6 +563,9 @@ void tcp_write_timer_handler(struct sock *sk) event = icsk->icsk_pending; switch (event) { + case ICSK_TIME_REO_TIMEOUT: + tcp_rack_reo_timeout(sk); + break; case ICSK_TIME_EARLY_RETRANS: tcp_resume_early_retransmit(sk); break; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 228965dca3c5..f52c3742b404 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1746,6 +1746,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) if (icsk->icsk_pending == ICSK_TIME_RETRANS || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; -- cgit From 1d0833df594390876647c54c2c88069d29059665 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:34 -0800 Subject: tcp: use sequence to break TS ties for RACK loss detection The packets inside a jumbo skb (e.g., TSO) share the same skb timestamp, even though they are sent sequentially on the wire. Since RACK is based on time, it can not detect some packets inside the same skb are lost. 
However, we can leverage the packet sequence numbers as extended timestamps to detect losses. Therefore, when RACK timestamp is identical to skb's timestamp (i.e., one of the packets of the skb is acked or sacked), we use the sequence numbers of the acked and unacked packets to break ties. We can use the same sequence logic to advance RACK xmit time as well to detect more losses and avoid timeout. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/net/tcp.h | 2 +- net/ipv4/tcp_input.c | 5 +++-- net/ipv4/tcp_recovery.c | 17 ++++++++++++++--- 4 files changed, 19 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1255c592719c..970d5f00589f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -208,6 +208,7 @@ struct tcp_sock { struct tcp_rack { struct skb_mstamp mstamp; /* (Re)sent time of the skb */ u32 rtt_us; /* Associated RTT */ + u32 end_seq; /* Ending TCP sequence of the skb */ u8 advanced; /* mstamp advanced since last lost marking */ u8 reord; /* reordering detected */ } rack; diff --git a/include/net/tcp.h b/include/net/tcp.h index 64fcdeb3358b..5fb1e75a32a9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1867,7 +1867,7 @@ extern int sysctl_tcp_recovery; #define TCP_RACK_LOST_RETRANS 0x1 extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now); -extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, +extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, const struct skb_mstamp *xmit_time, const struct skb_mstamp *ack_time); extern void tcp_rack_reo_timeout(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index be1191829963..e42ca11c0326 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1218,7 +1218,8 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { - tcp_rack_advance(tp, sacked, xmit_time, &state->ack_time); + tcp_rack_advance(tp, sacked, end_seq, + xmit_time, &state->ack_time); if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, @@ -3171,7 +3172,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } else if (tcp_is_sack(tp)) { tp->delivered += acked_pcount; if (!tcp_skb_spurious_retrans(tp, skb)) - tcp_rack_advance(tp, sacked, + tcp_rack_advance(tp, sacked, scb->end_seq, &skb->skb_mstamp, &sack->ack_time); } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index eb39b1b6d1dc..1e330a2f913d 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -16,6 +16,14 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) } } +static bool tcp_rack_sent_after(const struct skb_mstamp *t1, + const struct skb_mstamp *t2, + u32 seq1, u32 seq2) +{ + return skb_mstamp_after(t1, t2) || + (t1->v64 == t2->v64 && after(seq1, seq2)); +} + /* Marks a packet lost, if some packet sent later has been (s)acked. 
* The underlying idea is similar to the traditional dupthresh and FACK * but they look at different metrics: @@ -60,7 +68,8 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, scb->sacked & TCPCB_SACKED_ACKED) continue; - if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) { + if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp, + tp->rack.end_seq, scb->end_seq)) { /* Step 3 in draft-cheng-tcpm-rack-00.txt: * A packet is lost if its elapsed time is beyond * the recent RTT plus the reordering window. @@ -113,14 +122,15 @@ void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from * draft-cheng-tcpm-rack-00.txt */ -void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, +void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, const struct skb_mstamp *xmit_time, const struct skb_mstamp *ack_time) { u32 rtt_us; if (tp->rack.mstamp.v64 && - !skb_mstamp_after(xmit_time, &tp->rack.mstamp)) + !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp, + end_seq, tp->rack.end_seq)) return; rtt_us = skb_mstamp_us_delta(ack_time, xmit_time); @@ -140,6 +150,7 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, } tp->rack.rtt_us = rtt_us; tp->rack.mstamp = *xmit_time; + tp->rack.end_seq = end_seq; tp->rack.advanced = 1; } -- cgit From 98e36d449cc681f1bb2ce2230243f7f977a7da1b Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:35 -0800 Subject: tcp: check undo conditions before detecting losses Currently RACK would mark loss before the undo operations in TCP loss recovery. This could incorrectly identify real losses as spurious. For example a sender first experiences a delay spike and then eventually some packets were lost due to buffer overrun. In this case, the sender should perform fast recovery b/c not all the packets were lost. But the sender may first trigger a (spurious) RTO and reset cwnd to 1. The following ACKs may used to mark real losses by tcp_rack_mark_lost. Then in tcp_process_loss this ACK could trigger F-RTO undo condition and unmark real losses and revert the cwnd reduction. If there are no more ACKs coming back, eventually the sender would timeout again instead of performing fast recovery. The patch fixes this incorrect process by always performing the undo checks before detecting losses. Fixes: 4f41b1c58a32 ("tcp: use RACK to detect losses") Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e42ca11c0326..9c98dc874825 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2801,6 +2801,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) return false; } +static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag, + const struct skb_mstamp *ack_time) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Use RACK to detect loss */ + if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) { + u32 prior_retrans = tp->retrans_out; + + tcp_rack_mark_lost(sk, ack_time); + if (prior_retrans > tp->retrans_out) + *ack_flag |= FLAG_LOST_RETRANS; + } +} + /* Process an event, which can update packets-in-flight not trivially. 
* Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2866,17 +2881,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } } - /* Use RACK to detect loss */ - if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) { - u32 prior_retrans = tp->retrans_out; - - tcp_rack_mark_lost(sk, ack_time); - if (prior_retrans > tp->retrans_out) { - flag |= FLAG_LOST_RETRANS; - *ack_flag |= FLAG_LOST_RETRANS; - } - } - /* E. Process state. */ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: @@ -2894,11 +2898,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_try_keep_open(sk); return; } + tcp_rack_identify_loss(sk, ack_flag, ack_time); break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack, rexmit); - if (icsk->icsk_ca_state != TCP_CA_Open && - !(flag & FLAG_LOST_RETRANS)) + tcp_rack_identify_loss(sk, ack_flag, ack_time); + if (!(icsk->icsk_ca_state == TCP_CA_Open || + (*ack_flag & FLAG_LOST_RETRANS))) return; /* Change state if cwnd is undone or retransmits are lost */ default: @@ -2912,6 +2918,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); + tcp_rack_identify_loss(sk, ack_flag, ack_time); if (!tcp_time_to_recover(sk, flag)) { tcp_try_to_open(sk, flag); return; -- cgit From a0370b3f3f2cfb8b424b04c0545414abaa53f5ee Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:36 -0800 Subject: tcp: enable RACK loss detection to trigger recovery This patch changes two things: 1. Start fast recovery with RACK in addition to other heuristics (e.g., DUPACK threshold, FACK). Prior to this change RACK is enabled to detect losses only after the recovery has started by other algorithms. 2. Disable TCP early retransmit. RACK subsumes the early retransmit with the new reordering timer feature. A latter patch in this series removes the early retransmit code. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 11 ++++------- net/ipv4/tcp_input.c | 29 +++++++++++++++++++++-------- net/ipv4/tcp_recovery.c | 16 ++++++++++------ 3 files changed, 35 insertions(+), 21 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 5fb1e75a32a9..423438dd6fe9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -262,6 +262,9 @@ extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_early_retrans; +extern int sysctl_tcp_recovery; +#define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ + extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_challenge_ack_limit; extern int sysctl_tcp_min_tso_segs; @@ -1043,6 +1046,7 @@ static inline void tcp_enable_early_retrans(struct tcp_sock *tp) tp->do_early_retrans = sysctl_tcp_early_retrans && sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack && + !(sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) && net->ipv4.sysctl_tcp_reordering == 3; } @@ -1859,13 +1863,6 @@ void tcp_v4_init(void); void tcp_init(void); /* tcp_recovery.c */ - -/* Flags to enable various loss recovery features. 
See below */ -extern int sysctl_tcp_recovery; - -/* Use TCP RACK to detect (some) tail and retransmit losses */ -#define TCP_RACK_LOST_RETRANS 0x1 - extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, const struct skb_mstamp *xmit_time, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9c98dc874825..4ad75b8c4fee 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2129,10 +2129,25 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * F.e. after RTO, when all the queue is considered as lost, * lost_out = packets_out and in_flight = retrans_out. * - * Essentially, we have now two algorithms counting + * Essentially, we have now a few algorithms detecting * lost packets. * - * FACK: It is the simplest heuristics. As soon as we decided + * If the receiver supports SACK: + * + * RFC6675/3517: It is the conventional algorithm. A packet is + * considered lost if the number of higher sequence packets + * SACKed is greater than or equal the DUPACK thoreshold + * (reordering). This is implemented in tcp_mark_head_lost and + * tcp_update_scoreboard. + * + * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm + * (2017-) that checks timing instead of counting DUPACKs. + * Essentially a packet is considered lost if it's not S/ACKed + * after RTT + reordering_window, where both metrics are + * dynamically measured and adjusted. This is implemented in + * tcp_rack_mark_lost. + * + * FACK: it is the simplest heuristics. As soon as we decided * that something is lost, we decide that _all_ not SACKed * packets until the most forward SACK are lost. I.e. * lost_out = fackets_out - sacked_out and left_out = fackets_out. @@ -2141,16 +2156,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * takes place. We use FACK by default until reordering * is suspected on the path to this destination. * - * NewReno: when Recovery is entered, we assume that one segment + * If the receiver does not support SACK: + * + * NewReno (RFC6582): in Recovery we assume that one segment * is lost (classic Reno). While we are in Recovery and * a partial ACK arrives, we assume that one more packet * is lost (NewReno). This heuristics are the same in NewReno * and SACK. * - * Imagine, that's all! Forget about all this shamanism about CWND inflation - * deflation etc. CWND is real congestion window, never inflated, changes - * only according to classic VJ rules. - * * Really tricky (and requiring careful tuning) part of algorithm * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). 
* The first determines the moment _when_ we should reduce CWND and, @@ -2807,7 +2820,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag, struct tcp_sock *tp = tcp_sk(sk); /* Use RACK to detect loss */ - if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) { + if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { u32 prior_retrans = tp->retrans_out; tcp_rack_mark_lost(sk, ack_time); diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 1e330a2f913d..4ecb38ae8504 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -1,7 +1,7 @@ #include #include -int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; +int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION; static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { @@ -24,7 +24,9 @@ static bool tcp_rack_sent_after(const struct skb_mstamp *t1, (t1->v64 == t2->v64 && after(seq1, seq2)); } -/* Marks a packet lost, if some packet sent later has been (s)acked. +/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): + * + * Marks a packet lost, if some packet sent later has been (s)acked. * The underlying idea is similar to the traditional dupthresh and FACK * but they look at different metrics: * @@ -37,8 +39,10 @@ static bool tcp_rack_sent_after(const struct skb_mstamp *t1, * is being more resilient to reordering by simply allowing some * "settling delay", instead of tweaking the dupthresh. * - * The current version is only used after recovery starts but can be - * easily extended to detect the first loss. + * When tcp_rack_detect_loss() detects some packets are lost and we + * are not already in the CA_Recovery state, either tcp_rack_reo_timeout() + * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will + * make us enter the CA_Recovery state. */ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, u32 *reo_timeout) @@ -54,7 +58,7 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, * to queuing or delayed ACKs. */ reo_wnd = 1000; - if (tp->rack.reord && tcp_min_rtt(tp) != ~0U) + if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); tcp_for_write_queue(skb, sk) { @@ -105,7 +109,7 @@ void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) struct tcp_sock *tp = tcp_sk(sk); u32 timeout; - if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced) + if (!tp->rack.advanced) return; /* Reset the advanced flag to avoid unnecessary queue scanning */ -- cgit From 89fe18e44f7ee5ab1c90d0dff5835acee7751427 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:37 -0800 Subject: tcp: extend F-RTO to catch more spurious timeouts Current F-RTO reverts cwnd reset whenever a never-retransmitted packet was (s)acked. The timeout can be declared spurious because the packets acknoledged with this ACK was transmitted before the timeout, so clearly not all the packets are lost to reset the cwnd. This nice detection does not really depend F-RTO internals. This patch applies the detection universally. On Google servers this change detected 20% more spurious timeouts. Suggested-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4ad75b8c4fee..9469ce384d3b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1939,7 +1939,6 @@ void tcp_enter_loss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sk_buff *skb; - bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ bool mark_lost; @@ -2000,13 +1999,15 @@ void tcp_enter_loss(struct sock *sk) tp->high_seq = tp->snd_nxt; tcp_ecn_queue_cwr(tp); - /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous - * loss recovery is underway except recurring timeout(s) on - * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing + /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO + * if a previous recovery is underway, otherwise it may incorrectly + * call a timeout spurious if some previously retransmitted packets + * are s/acked (sec 3.2). We do not apply that retriction since + * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS + * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO + * on PTMU discovery to avoid sending new data. */ - tp->frto = sysctl_tcp_frto && - (new_recovery || icsk->icsk_retransmits) && - !inet_csk(sk)->icsk_mtup.probe_size; + tp->frto = sysctl_tcp_frto && !inet_csk(sk)->icsk_mtup.probe_size; } /* If ACK arrived pointing to a remembered SACK, it means that our @@ -2740,14 +2741,18 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, tcp_try_undo_loss(sk, false)) return; - if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ - /* Step 3.b. A timeout is spurious if not all data are - * lost, i.e., never-retransmitted data are (s)acked. - */ - if ((flag & FLAG_ORIG_SACK_ACKED) && - tcp_try_undo_loss(sk, true)) - return; + /* The ACK (s)acks some never-retransmitted data meaning not all + * the data packets before the timeout were lost. Therefore we + * undo the congestion window and state. This is essentially + * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since + * a retransmitted skb is permantly marked, we can apply such an + * operation even if F-RTO was not used. + */ + if ((flag & FLAG_ORIG_SACK_ACKED) && + tcp_try_undo_loss(sk, tp->undo_marker)) + return; + if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ if (after(tp->snd_nxt, tp->high_seq)) { if (flag & FLAG_DATA_SACKED || is_dupack) tp->frto = 0; /* Step 3.a. loss was real */ -- cgit From 840a3cbe89694fad75578856976f180e852e69aa Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:38 -0800 Subject: tcp: remove forward retransmit feature Forward retransmit is an esoteric feature in RFC3517 (condition(3) in the NextSeg()). Basically if a packet is not considered lost by the current criteria (# of dupacks etc), but the congestion window has room for more packets, then retransmit this packet. However it actually conflicts with the rest of recovery design. For example, when reordering is detected we want to be conservative in retransmitting packets but forward-retransmit feature would break that to force more retransmission. Also the implementation is fairly complicated inside the retransmission logic inducing extra iterations in the write queue. With RACK losses are being detected timely and this heuristic is no longer necessary. 
There this patch removes the feature. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 - net/ipv4/tcp_input.c | 5 ----- net/ipv4/tcp_output.c | 61 +++------------------------------------------------ 3 files changed, 3 insertions(+), 64 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 970d5f00589f..8e5f4c15d0e5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -307,7 +307,6 @@ struct tcp_sock { */ int lost_cnt_hint; - u32 retransmit_high; /* L-bits may be on up to this seqno */ u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9469ce384d3b..a041a92348ee 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -916,10 +916,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) tp->retransmit_skb_hint = skb; - - if (!tp->lost_out || - after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high)) - tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } /* Sum the number of packets on the wire we have marked as lost. @@ -1983,7 +1979,6 @@ void tcp_enter_loss(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } } tcp_verify_left_out(tp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0ba9026cb70d..6327e4d368a4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2831,36 +2831,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) return err; } -/* Check if we forward retransmits are possible in the current - * window/congestion state. - */ -static bool tcp_can_forward_retransmit(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - const struct tcp_sock *tp = tcp_sk(sk); - - /* Forward retransmissions are possible only during Recovery. */ - if (icsk->icsk_ca_state != TCP_CA_Recovery) - return false; - - /* No forward retransmissions in Reno are possible. */ - if (tcp_is_reno(tp)) - return false; - - /* Yeah, we have to make difficult choice between forward transmission - * and retransmission... Both ways have their merits... - * - * For now we do not retransmit anything, while we have some new - * segments to send. In the other cases, follow rule 3 for - * NextSeg() specified in RFC3517. - */ - - if (tcp_may_send_now(sk)) - return false; - - return true; -} - /* This gets called after a retransmit timeout, and the initially * retransmitted data is acknowledged. 
It tries to continue * resending the rest of the retransmit queue, until either @@ -2875,24 +2845,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; struct sk_buff *hole = NULL; - u32 max_segs, last_lost; + u32 max_segs; int mib_idx; - int fwd_rexmitting = 0; if (!tp->packets_out) return; - if (!tp->lost_out) - tp->retransmit_high = tp->snd_una; - if (tp->retransmit_skb_hint) { skb = tp->retransmit_skb_hint; - last_lost = TCP_SKB_CB(skb)->end_seq; - if (after(last_lost, tp->retransmit_high)) - last_lost = tp->retransmit_high; } else { skb = tcp_write_queue_head(sk); - last_lost = tp->snd_una; } max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); @@ -2915,31 +2877,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ segs = min_t(int, segs, max_segs); - if (fwd_rexmitting) { -begin_fwd: - if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) - break; - mib_idx = LINUX_MIB_TCPFORWARDRETRANS; - - } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { - tp->retransmit_high = last_lost; - if (!tcp_can_forward_retransmit(sk)) - break; - /* Backtrack if necessary to non-L'ed skb */ - if (hole) { - skb = hole; - hole = NULL; - } - fwd_rexmitting = 1; - goto begin_fwd; - + if (tp->retrans_out >= tp->lost_out) { + break; } else if (!(sacked & TCPCB_LOST)) { if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) hole = skb; continue; } else { - last_lost = TCP_SKB_CB(skb)->end_seq; if (icsk->icsk_ca_state != TCP_CA_Loss) mib_idx = LINUX_MIB_TCPFASTRETRANS; else -- cgit From bec41a11dd3dc8c54f766b4f494140ca92ba7c10 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:39 -0800 Subject: tcp: remove early retransmit This patch removes the support of RFC5827 early retransmit (i.e., fast recovery on small inflight with <3 dupacks) because it is subsumed by the new RACK loss detection. More specifically when RACK receives DUPACKs, it'll arm a reordering timer to start fast recovery after a quarter of (min)RTT, hence it covers the early retransmit except RACK does not limit itself to specific inflight or dupack numbers. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 19 +++-------- include/linux/tcp.h | 3 +- include/net/tcp.h | 19 ----------- net/ipv4/inet_diag.c | 1 - net/ipv4/tcp.c | 3 -- net/ipv4/tcp_input.c | 60 ++-------------------------------- net/ipv4/tcp_ipv4.c | 1 - net/ipv4/tcp_metrics.c | 1 - net/ipv4/tcp_minisocks.c | 1 - net/ipv4/tcp_output.c | 11 +++---- net/ipv4/tcp_timer.c | 3 -- net/ipv6/tcp_ipv6.c | 1 - 12 files changed, 12 insertions(+), 111 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 7dd65c9cf707..7de2cf79e16f 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -246,21 +246,12 @@ tcp_dsack - BOOLEAN Allows TCP to send "duplicate" SACKs. tcp_early_retrans - INTEGER - Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold - for triggering fast retransmit when the amount of outstanding data is - small and when no previously unsent data can be transmitted (such - that limited transmit could be used). Also controls the use of - Tail loss probe (TLP) that converts RTOs occurring due to tail - losses into fast recovery (draft-dukkipati-tcpm-tcp-loss-probe-01). 
+ Tail loss probe (TLP) converts RTOs occurring due to tail + losses into fast recovery (draft-ietf-tcpm-rack). Note that + TLP requires RACK to function properly (see tcp_recovery below) Possible values: - 0 disables ER - 1 enables ER - 2 enables ER but delays fast recovery and fast retransmit - by a fourth of RTT. This mitigates connection falsely - recovers when network has a small degree of reordering - (less than 3 packets). - 3 enables delayed ER and TLP. - 4 enables TLP only. + 0 disables TLP + 3 or 4 enables TLP Default: 3 tcp_ecn - INTEGER diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 8e5f4c15d0e5..4733368f953a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -224,8 +224,7 @@ struct tcp_sock { repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; - u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ - syn_data:1, /* SYN includes data */ + u8 syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 423438dd6fe9..c55d65f74f7f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -565,7 +565,6 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb); /* tcp_input.c */ -void tcp_resume_early_retransmit(struct sock *sk); void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_reset(struct sock *sk); @@ -1037,24 +1036,6 @@ static inline void tcp_enable_fack(struct tcp_sock *tp) tp->rx_opt.sack_ok |= TCP_FACK_ENABLED; } -/* TCP early-retransmit (ER) is similar to but more conservative than - * the thin-dupack feature. Enable ER only if thin-dupack is disabled. 
- */ -static inline void tcp_enable_early_retrans(struct tcp_sock *tp) -{ - struct net *net = sock_net((struct sock *)tp); - - tp->do_early_retrans = sysctl_tcp_early_retrans && - sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack && - !(sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) && - net->ipv4.sysctl_tcp_reordering == 3; -} - -static inline void tcp_disable_early_retrans(struct tcp_sock *tp) -{ - tp->do_early_retrans = 0; -} - static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index d216e40623d3..3828b3a805cd 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -215,7 +215,6 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c8d46c140b4a..d9023e8ed53e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -406,7 +406,6 @@ void tcp_init_sock(struct sock *sk) tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; - tcp_enable_early_retrans(tp); tcp_assign_congestion_control(sk); tp->tsoffset = 0; @@ -2477,8 +2476,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EINVAL; else { tp->thin_dupack = val; - if (tp->thin_dupack) - tcp_disable_early_retrans(tp); } break; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a041a92348ee..79c819077a59 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -904,8 +904,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric, tcp_disable_fack(tp); } - if (metric > 0) - tcp_disable_early_retrans(tp); tp->rack.reord = 1; } @@ -2054,30 +2052,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } -static bool tcp_pause_early_retransmit(struct sock *sk, int flag) -{ - struct tcp_sock *tp = tcp_sk(sk); - unsigned long delay; - - /* Delay early retransmit and entering fast recovery for - * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples - * available, or RTO is scheduled to fire first. - */ - if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || - (flag & FLAG_ECE) || !tp->srtt_us) - return false; - - delay = max(usecs_to_jiffies(tp->srtt_us >> 5), - msecs_to_jiffies(2)); - - if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) - return false; - - inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay, - TCP_RTO_MAX); - return true; -} - /* Linux NewReno/SACK/FACK/ECN state machine. * -------------------------------------- * @@ -2221,16 +2195,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) tcp_is_sack(tp) && !tcp_send_head(sk)) return true; - /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious - * retransmissions due to small network reorderings, we implement - * Mitigation A.3 in the RFC and delay the retransmission for a short - * interval if appropriate. 
- */ - if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && - (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) && - !tcp_may_send_now(sk)) - return !tcp_pause_early_retransmit(sk, flag); - return false; } @@ -3050,8 +3014,7 @@ void tcp_rearm_rto(struct sock *sk) } else { u32 rto = inet_csk(sk)->icsk_rto; /* Offset the time elapsed after installing regular RTO */ - if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || - icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || + if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); const u32 rto_time_stamp = @@ -3068,24 +3031,6 @@ void tcp_rearm_rto(struct sock *sk) } } -/* This function is called when the delayed ER timer fires. TCP enters - * fast recovery and performs fast-retransmit. - */ -void tcp_resume_early_retransmit(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tcp_rearm_rto(sk); - - /* Stop if ER is disabled after the delayed ER timer is scheduled */ - if (!tp->do_early_retrans) - return; - - tcp_enter_recovery(sk, false); - tcp_update_scoreboard(sk, 1); - tcp_xmit_retransmit_queue(sk); -} - /* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { @@ -3651,8 +3596,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) skb_mstamp_get(&sack_state.ack_time); - if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) + if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); if (after(ack, prior_snd_una)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ebf3e0c4967a..63214136cf1c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2229,7 +2229,6 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) int state; if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index ba8f02d0f283..b9ed0d50aead 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -522,7 +522,6 @@ void tcp_init_metrics(struct sock *sk) val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val && tp->reordering != val) { tcp_disable_fack(tp); - tcp_disable_early_retrans(tp); tp->reordering = val; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 06fde26a82b7..bdb443471c39 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -468,7 +468,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->sacked_out = 0; newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - tcp_enable_early_retrans(newtp); newtp->tlp_high_seq = 0; newtp->lsndtime = treq->snt_synack.stamp_jiffies; newsk->sk_txhash = treq->txhash; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6327e4d368a4..9a1a1494b9dd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -76,10 +76,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; tp->packets_out += tcp_skb_pcount(skb); - if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); - } NET_ADD_STATS(sock_net(sk), 
LINUX_MIB_TCPORIGDATASENT, tcp_skb_pcount(skb)); @@ -2289,8 +2287,6 @@ bool tcp_schedule_loss_probe(struct sock *sk) u32 timeout, tlp_time_stamp, rto_time_stamp; u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); - if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) - return false; /* No consecutive loss probes. */ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { tcp_rearm_rto(sk); @@ -2309,8 +2305,9 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if (sysctl_tcp_early_retrans < 3 || !tp->packets_out || - !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) + if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) || + !tp->packets_out || !tcp_is_sack(tp) || + icsk->icsk_ca_state != TCP_CA_Open) return false; if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 953c02a8566e..40d893556e67 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -566,9 +566,6 @@ void tcp_write_timer_handler(struct sock *sk) case ICSK_TIME_REO_TIMEOUT: tcp_rack_reo_timeout(sk); break; - case ICSK_TIME_EARLY_RETRANS: - tcp_resume_early_retransmit(sk); - break; case ICSK_TIME_LOSS_PROBE: tcp_send_loss_probe(sk); break; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f52c3742b404..fc14e04028bf 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1745,7 +1745,6 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) srcp = ntohs(inet->inet_sport); if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; -- cgit From ac229dca7e4e582114e1ec9765fda0915aa58468 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:40 -0800 Subject: tcp: remove RFC4653 NCR This patch removes the (partial) implementation of the aggressive limited transmit in RFC4653 TCP Non-Congestion Robustness (NCR). NCR is a mitigation to the problem created by the dynamic DUPACK threshold. With the current adaptive DUPACK threshold (tp->reordering) could cause timeouts by preventing fast recovery. For example, if the last packet of a cwnd burst was reordered, the threshold will be set to the size of cwnd. But if next application burst is smaller than threshold and has drops instead of reorderings, the sender would not trigger fast recovery but instead resorts to a timeout recovery. NCR mitigates this issue by checking the number of DUPACKs against the current flight size additionally. The techniqueue is similar to the early retransmit RFC. With RACK loss detection, this mitigation is not needed, because RACK does not use DUPACK threshold to detect losses. RACK arms a reordering timer to fire at most a quarter RTT later to start fast recovery. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 79c819077a59..87315ab1ab1a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2161,8 +2161,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) static bool tcp_time_to_recover(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - __u32 packets_out; - int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; /* Trick#1: The loss is proven. */ if (tp->lost_out) @@ -2172,19 +2170,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) if (tcp_dupack_heuristics(tp) > tp->reordering) return true; - /* Trick#4: It is still not OK... But will it be useful to delay - * recovery more? - */ - packets_out = tp->packets_out; - if (packets_out <= tp->reordering && - tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) && - !tcp_may_send_now(sk)) { - /* We have nothing to send. This connection is limited - * either by receiver window or by application. - */ - return true; - } - /* If a thin stream is detected, retransmit after first * received dupack. Employ only if SACK is supported in order * to avoid possible corner-case series of spurious retransmissions -- cgit From 4a7f6009441144783e5925551c72e3f2e1b0839b Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:41 -0800 Subject: tcp: remove thin_dupack feature Thin stream DUPACK is to start fast recovery on only one DUPACK provided the connection is a thin stream (i.e., low inflight). But this older feature is now subsumed with RACK. If a connection receives only a single DUPACK, RACK would arm a reordering timer and soon starts fast recovery instead of timeout if no further ACKs are received. The socket option (THIN_DUPACK) is kept as a nop for compatibility. Note that this patch does not change another thin-stream feature which enables linear RTO. Although it might be good to generalize that in the future (i.e., linear RTO for the first say 3 retries). Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 12 ------------ include/linux/tcp.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 7 ------- net/ipv4/tcp.c | 6 ++---- net/ipv4/tcp_input.c | 13 ------------- 5 files changed, 3 insertions(+), 37 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 7de2cf79e16f..aa1bb49f1dc6 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -703,18 +703,6 @@ tcp_thin_linear_timeouts - BOOLEAN Documentation/networking/tcp-thin.txt Default: 0 -tcp_thin_dupack - BOOLEAN - Enable dynamic triggering of retransmissions after one dupACK - for thin streams. If set, a check is performed upon reception - of a dupACK to determine if the stream is thin (less than 4 - packets in flight). As long as the stream is found to be thin, - data is retransmitted on the first received dupACK. This - improves retransmission latency for non-aggressive thin - streams, often found to be time-dependent. - For more information on thin streams, see - Documentation/networking/tcp-thin.txt - Default: 0 - tcp_limit_output_bytes - INTEGER Controls TCP Small Queue limit per tcp socket. 
TCP bulk sender tends to increase packets in flight until it diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4733368f953a..6c22332afb75 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -220,7 +220,7 @@ struct tcp_sock { unused:5; u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ - thin_dupack : 1,/* Fast retransmit on first dupack */ + unused1 : 1, repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0f2d37e8e983..c8d283615c6f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -536,13 +536,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_thin_dupack", - .data = &sysctl_tcp_thin_dupack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_early_retrans", .data = &sysctl_tcp_early_retrans, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d9023e8ed53e..aba6ea76338e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2474,9 +2474,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_THIN_DUPACK: if (val < 0 || val > 1) err = -EINVAL; - else { - tp->thin_dupack = val; - } break; case TCP_REPAIR: @@ -2966,8 +2963,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; + case TCP_THIN_DUPACK: - val = tp->thin_dupack; + val = 0; break; case TCP_REPAIR: diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 87315ab1ab1a..39ebc20ca1b2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -95,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; - -int sysctl_tcp_thin_dupack __read_mostly; - int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_early_retrans __read_mostly = 3; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; @@ -2170,16 +2167,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) if (tcp_dupack_heuristics(tp) > tp->reordering) return true; - /* If a thin stream is detected, retransmit after first - * received dupack. Employ only if SACK is supported in order - * to avoid possible corner-case series of spurious retransmissions - * Use only if there are no unsent data. - */ - if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && - tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && - tcp_is_sack(tp) && !tcp_send_head(sk)) - return true; - return false; } -- cgit From 94bdc9785a1136cef6a982b042719783978e8a26 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:42 -0800 Subject: tcp: disable fack by default This patch disables FACK by default as RACK is the successor of FACK (inspired by the insights behind FACK). FACK[1] in Linux works as follows: a packet P is deemed lost, if packet Q of higher sequence is s/acked and P and Q are distant by at least dupthresh number of packets in sequence space. FACK is more aggressive than the IETF recommened recovery for SACK (RFC3517 A Conservative Selective Acknowledgment (SACK)-based Loss Recovery Algorithm for TCP), because a single SACK may trigger fast recovery. This obviously won't work well with reordering so FACK is dynamically disabled upon detecting reordering. RACK supersedes FACK by using time distance instead of sequence distance. 
On reordering, RACK waits for a quarter of RTT receiving a single SACK before starting recovery. (the timer can be made more adaptive in the future by measuring reordering distance in time, but currently RTT/4 seem to work well.) Once the recovery starts, RACK behaves almost like FACK because it reduces the reodering window to 1ms, so it fast retransmits quickly. In addition RACK can detect loss retransmission as it does not care about the packet sequences (being repeated or not), which is extremely useful when the connection is going through a traffic policer. Google server experiments indicate that disabling FACK after enabling RACK has negligible impact on the overall loss recovery performance with more reordering events detected. But we still keep the FACK implementation for backup if RACK has bugs that needs to be disabled. [1] M. Mathis, J. Mahdavi, "Forward Acknowledgment: Refining TCP Congestion Control," In Proceedings of SIGCOMM '96, August 1996. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 39ebc20ca1b2..1a34e9278c07 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,7 +79,7 @@ int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; -int sysctl_tcp_fack __read_mostly = 1; +int sysctl_tcp_fack __read_mostly; int sysctl_tcp_max_reordering __read_mostly = 300; int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; @@ -2114,7 +2114,8 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) * dynamically measured and adjusted. This is implemented in * tcp_rack_mark_lost. * - * FACK: it is the simplest heuristics. As soon as we decided + * FACK (Disabled by default. Subsumbed by RACK): + * It is the simplest heuristics. As soon as we decided * that something is lost, we decide that _all_ not SACKed * packets until the most forward SACK are lost. I.e. * lost_out = fackets_out - sacked_out and left_out = fackets_out. -- cgit From 0e40f4c9593ba2c7c30150ed669da97bd581c0cd Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Tue, 17 Jan 2017 13:37:19 -0500 Subject: tcp: accept RST for rcv_nxt - 1 after receiving a FIN Using a Mac OSX box as a client connecting to a Linux server, we have found that when certain applications (such as 'ab'), are abruptly terminated (via ^C), a FIN is sent followed by a RST packet on tcp connections. The FIN is accepted by the Linux stack but the RST is sent with the same sequence number as the FIN, and Linux responds with a challenge ACK per RFC 5961. The OSX client then sometimes (they are rate-limited) does not reply with any RST as would be expected on a closed socket. This results in sockets accumulating on the Linux server left mostly in the CLOSE_WAIT state, although LAST_ACK and CLOSING are also possible. This sequence of events can tie up a lot of resources on the Linux server since there may be a lot of data in write buffers at the time of the RST. Accepting a RST equal to rcv_nxt - 1, after we have already successfully processed a FIN, has made a significant difference for us in practice, by freeing up unneeded resources in a more expedient fashion. 
A packetdrill test demonstrating the behavior: // testing mac osx rst behavior // Establish a connection 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 0.000 bind(3, ..., ...) = 0 0.000 listen(3, 1) = 0 0.100 < S 0:0(0) win 32768 0.100 > S. 0:0(0) ack 1 0.200 < . 1:1(0) ack 1 win 32768 0.200 accept(3, ..., ...) = 4 // Client closes the connection 0.300 < F. 1:1(0) ack 1 win 32768 // now send rst with same sequence 0.300 < R. 1:1(0) ack 1 win 32768 // make sure we are in TCP_CLOSE 0.400 %{ assert tcpi_state == 7 }% Signed-off-by: Jason Baron Cc: Eric Dumazet Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1a34e9278c07..bfa165cc455a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5199,6 +5199,23 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) return err; } +/* Accept RST for rcv_nxt - 1 after a FIN. + * When tcp connections are abruptly terminated from Mac OSX (via ^C), a + * FIN is sent followed by a RST packet. The RST is sent with the same + * sequence number as the FIN, and thus according to RFC 5961 a challenge + * ACK should be sent. However, Mac OSX rate limits replies to challenge + * ACKs on the closed socket. In addition middleboxes can drop either the + * challenge ACK or a subsequent RST. + */ +static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) && + (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK | + TCPF_CLOSING)); +} + /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ @@ -5237,20 +5254,25 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, LINUX_MIB_TCPACKSKIPPEDSEQ, &tp->last_oow_ack_time)) tcp_send_dupack(sk, skb); + } else if (tcp_reset_check(sk, skb)) { + tcp_reset(sk); } goto discard; } /* Step 2: check RST bit */ if (th->rst) { - /* RFC 5961 3.2 (extend to match against SACK too if available): - * If seq num matches RCV.NXT or the right-most SACK block, + /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a + * FIN and SACK too if available): + * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or + * the right-most SACK block, * then * RESET the connection * else * Send a challenge ACK */ - if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt || + tcp_reset_check(sk, skb)) { rst_seq_match = true; } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { struct tcp_sack_block *sp = &tp->selective_acks[0]; -- cgit From 60b1af3300724d211bb0b420c1fbe6bf5b87b013 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Jan 2017 14:57:36 -0800 Subject: tcp: reduce skb overhead in selected places tcp_add_backlog() can use skb_condense() helper to get better gains and less SKB_TRUESIZE() magic. This only happens when socket backlog has to be used. Some attacks involve specially crafted out of order tiny TCP packets, clogging the ofo queue of (many) sockets. Then later, expensive collapse happens, trying to copy all these skbs into single ones. This unfortunately does not work if each skb has no neighbor in TCP sequence order. 
By using skb_condense() if the skb could not be coalesced to a prior one, we defeat these kind of threats, potentially saving 4K per skb (or more, since this is one page fragment). A typical NAPI driver allocates gro packets with GRO_MAX_HEAD bytes in skb->head, meaning the copy done by skb_condense() is limited to about 200 bytes. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_ipv4.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bfa165cc455a..3de6eba378ad 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4507,6 +4507,7 @@ add_sack: end: if (skb) { tcp_grow_window(sk, skb); + skb_condense(skb); skb_set_owner_r(skb, sk); } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f7325b25b06e..a90b4540c11e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1556,8 +1556,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) * It has been noticed pure SACK packets were sometimes dropped * (if cooked by drivers without copybreak feature). */ - if (!skb->data_len) - skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); + skb_condense(skb); if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); -- cgit From c3a2e8370534f810cac6050169db0ed3e0f94f0b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 6 Feb 2017 23:14:14 +0200 Subject: tcp: replace dst_confirm with sk_dst_confirm When same struct dst_entry can be used for many different neighbours we can not use it for pending confirmations. Use the new sk_dst_confirm() helper to propagate the indication from received packets to sock_confirm_neigh(). Reported-by: YueHaibing Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.") Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.") Tested-by: YueHaibing Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 12 +++--------- net/ipv4/tcp_metrics.c | 7 ++----- net/ipv4/tcp_output.c | 2 ++ 3 files changed, 7 insertions(+), 14 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 27c95acbb52f..2c0ff327b6df 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3644,11 +3644,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { - struct dst_entry *dst = __sk_dst_get(sk); - if (dst) - dst_confirm(dst); - } + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) + sk_dst_confirm(sk); if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); @@ -5995,7 +5992,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; case TCP_FIN_WAIT1: { - struct dst_entry *dst; int tmo; /* If we enter the TCP_FIN_WAIT1 state and we are a @@ -6022,9 +6018,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_FIN_WAIT2); sk->sk_shutdown |= SEND_SHUTDOWN; - dst = __sk_dst_get(sk); - if (dst) - dst_confirm(dst); + sk_dst_confirm(sk); if (!sock_flag(sk, SOCK_DEAD)) { /* Wake up lingering close() */ diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index b9ed0d50aead..0f46e5fe31ad 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -375,12 +375,10 @@ void tcp_update_metrics(struct sock *sk) u32 val; int m; + sk_dst_confirm(sk); if (sysctl_tcp_nometrics_save || !dst) return; - if (dst->flags & DST_HOST) - dst_confirm(dst); - rcu_read_lock(); if (icsk->icsk_backoff || !tp->srtt_us) { /* This session failed to estimate rtt. Why? @@ -493,11 +491,10 @@ void tcp_init_metrics(struct sock *sk) struct tcp_metrics_block *tm; u32 val, crtt = 0; /* cached RTT scaled by 8 */ + sk_dst_confirm(sk); if (!dst) goto reset; - dst_confirm(dst); - rcu_read_lock(); tm = tcp_get_metrics(sk, dst, true); if (!tm) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ccf1ef4dcba4..61f272a99a49 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -980,6 +980,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_set_hash_from_sk(skb, sk); atomic_add(skb->truesize, &sk->sk_wmem_alloc); + skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm); + /* Build TCP header and checksum it. */ th = (struct tcphdr *)skb->data; th->source = inet->inet_sport; -- cgit From 449809a66c1d0b1563dee84493e14bf3104d2d7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Mar 2017 08:39:49 -0800 Subject: tcp/dccp: block BH for SYN processing SYN processing really was meant to be handled from BH. When I got rid of BH blocking while processing socket backlog in commit 5413d1babe8f ("net: do not block BH while processing socket backlog"), I forgot that a malicious user could transition to TCP_LISTEN from a state that allowed (SYN) packets to be parked in the socket backlog while socket is owned by the thread doing the listen() call. Sure enough syzkaller found this and reported the bug ;) ================================= [ INFO: inconsistent lock state ] 4.10.0+ #60 Not tainted --------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. 
syz-executor0/5090 [HC0[0]:SC0[0]:HE1:SE1] takes: (&(&hashinfo->ehash_locks[i])->rlock){+.?...}, at: [] spin_lock include/linux/spinlock.h:299 [inline] (&(&hashinfo->ehash_locks[i])->rlock){+.?...}, at: [] inet_ehash_insert+0x240/0xad0 net/ipv4/inet_hashtables.c:407 {IN-SOFTIRQ-W} state was registered at: mark_irqflags kernel/locking/lockdep.c:2923 [inline] __lock_acquire+0xbcf/0x3270 kernel/locking/lockdep.c:3295 lock_acquire+0x241/0x580 kernel/locking/lockdep.c:3753 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:299 [inline] inet_ehash_insert+0x240/0xad0 net/ipv4/inet_hashtables.c:407 reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:753 [inline] inet_csk_reqsk_queue_hash_add+0x1b7/0x2a0 net/ipv4/inet_connection_sock.c:764 tcp_conn_request+0x25cc/0x3310 net/ipv4/tcp_input.c:6399 tcp_v4_conn_request+0x157/0x220 net/ipv4/tcp_ipv4.c:1262 tcp_rcv_state_process+0x802/0x4130 net/ipv4/tcp_input.c:5889 tcp_v4_do_rcv+0x56b/0x940 net/ipv4/tcp_ipv4.c:1433 tcp_v4_rcv+0x2e12/0x3210 net/ipv4/tcp_ipv4.c:1711 ip_local_deliver_finish+0x4ce/0xc40 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:257 [inline] ip_local_deliver+0x1ce/0x710 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:492 [inline] ip_rcv_finish+0xb1d/0x2110 net/ipv4/ip_input.c:396 NF_HOOK include/linux/netfilter.h:257 [inline] ip_rcv+0xd90/0x19c0 net/ipv4/ip_input.c:487 __netif_receive_skb_core+0x1ad1/0x3400 net/core/dev.c:4179 __netif_receive_skb+0x2a/0x170 net/core/dev.c:4217 netif_receive_skb_internal+0x1d6/0x430 net/core/dev.c:4245 napi_skb_finish net/core/dev.c:4602 [inline] napi_gro_receive+0x4e6/0x680 net/core/dev.c:4636 e1000_receive_skb drivers/net/ethernet/intel/e1000/e1000_main.c:4033 [inline] e1000_clean_rx_irq+0x5e0/0x1490 drivers/net/ethernet/intel/e1000/e1000_main.c:4489 e1000_clean+0xb9a/0x2910 drivers/net/ethernet/intel/e1000/e1000_main.c:3834 napi_poll net/core/dev.c:5171 [inline] net_rx_action+0xe70/0x1900 net/core/dev.c:5236 __do_softirq+0x2fb/0xb7d kernel/softirq.c:284 invoke_softirq kernel/softirq.c:364 [inline] irq_exit+0x19e/0x1d0 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:658 [inline] do_IRQ+0x81/0x1a0 arch/x86/kernel/irq.c:250 ret_from_intr+0x0/0x20 native_safe_halt+0x6/0x10 arch/x86/include/asm/irqflags.h:53 arch_safe_halt arch/x86/include/asm/paravirt.h:98 [inline] default_idle+0x8f/0x410 arch/x86/kernel/process.c:271 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:262 default_idle_call+0x36/0x60 kernel/sched/idle.c:96 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x348/0x440 kernel/sched/idle.c:243 cpu_startup_entry+0x18/0x20 kernel/sched/idle.c:345 start_secondary+0x344/0x440 arch/x86/kernel/smpboot.c:272 verify_cpu+0x0/0xfc irq event stamp: 1741 hardirqs last enabled at (1741): [] __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:160 [inline] hardirqs last enabled at (1741): [] _raw_spin_unlock_irqrestore+0xf7/0x1a0 kernel/locking/spinlock.c:191 hardirqs last disabled at (1740): [] __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:108 [inline] hardirqs last disabled at (1740): [] _raw_spin_lock_irqsave+0xa2/0x110 kernel/locking/spinlock.c:159 softirqs last enabled at (1738): [] __do_softirq+0x7cf/0xb7d kernel/softirq.c:310 softirqs last disabled at (1571): [] do_softirq_own_stack+0x1c/0x30 arch/x86/entry/entry_64.S:902 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- 
lock(&(&hashinfo->ehash_locks[i])->rlock); lock(&(&hashinfo->ehash_locks[i])->rlock); *** DEADLOCK *** 1 lock held by syz-executor0/5090: #0: (sk_lock-AF_INET6){+.+.+.}, at: [] lock_sock include/net/sock.h:1460 [inline] #0: (sk_lock-AF_INET6){+.+.+.}, at: [] sock_setsockopt+0x233/0x1e40 net/core/sock.c:683 stack backtrace: CPU: 1 PID: 5090 Comm: syz-executor0 Not tainted 4.10.0+ #60 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:15 [inline] dump_stack+0x292/0x398 lib/dump_stack.c:51 print_usage_bug+0x3ef/0x450 kernel/locking/lockdep.c:2387 valid_state kernel/locking/lockdep.c:2400 [inline] mark_lock_irq kernel/locking/lockdep.c:2602 [inline] mark_lock+0xf30/0x1410 kernel/locking/lockdep.c:3065 mark_irqflags kernel/locking/lockdep.c:2941 [inline] __lock_acquire+0x6dc/0x3270 kernel/locking/lockdep.c:3295 lock_acquire+0x241/0x580 kernel/locking/lockdep.c:3753 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:299 [inline] inet_ehash_insert+0x240/0xad0 net/ipv4/inet_hashtables.c:407 reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:753 [inline] inet_csk_reqsk_queue_hash_add+0x1b7/0x2a0 net/ipv4/inet_connection_sock.c:764 dccp_v6_conn_request+0xada/0x11b0 net/dccp/ipv6.c:380 dccp_rcv_state_process+0x51e/0x1660 net/dccp/input.c:606 dccp_v6_do_rcv+0x213/0x350 net/dccp/ipv6.c:632 sk_backlog_rcv include/net/sock.h:896 [inline] __release_sock+0x127/0x3a0 net/core/sock.c:2052 release_sock+0xa5/0x2b0 net/core/sock.c:2539 sock_setsockopt+0x60f/0x1e40 net/core/sock.c:1016 SYSC_setsockopt net/socket.c:1782 [inline] SyS_setsockopt+0x2fb/0x3a0 net/socket.c:1765 entry_SYSCALL_64_fastpath+0x1f/0xc2 RIP: 0033:0x4458b9 RSP: 002b:00007fe8b26c2b58 EFLAGS: 00000292 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00000000004458b9 RDX: 000000000000001a RSI: 0000000000000001 RDI: 0000000000000006 RBP: 00000000006e2110 R08: 0000000000000010 R09: 0000000000000000 R10: 00000000208c3000 R11: 0000000000000292 R12: 0000000000708000 R13: 0000000020000000 R14: 0000000000001000 R15: 0000000000000000 Fixes: 5413d1babe8f ("net: do not block BH while processing socket backlog") Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/dccp/input.c | 10 ++++++++-- net/ipv4/tcp_input.c | 10 ++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/dccp/input.c b/net/dccp/input.c index 8fedc2d49770..4a05d7876850 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -577,6 +577,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct dccp_sock *dp = dccp_sk(sk); struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); const int old_state = sk->sk_state; + bool acceptable; int queued = 0; /* @@ -603,8 +604,13 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (sk->sk_state == DCCP_LISTEN) { if (dh->dccph_type == DCCP_PKT_REQUEST) { - if (inet_csk(sk)->icsk_af_ops->conn_request(sk, - skb) < 0) + /* It is possible that we process SYN packets from backlog, + * so we need to make sure to disable BH right there. 
+ */ + local_bh_disable(); + acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0; + local_bh_enable(); + if (!acceptable) return 1; consume_skb(skb); return 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2c0ff327b6df..39c393cc0fd3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5886,9 +5886,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (th->syn) { if (th->fin) goto discard; - if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) - return 1; + /* It is possible that we process SYN packets from backlog, + * so we need to make sure to disable BH right there. + */ + local_bh_disable(); + acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; + local_bh_enable(); + if (!acceptable) + return 1; consume_skb(skb); return 0; } -- cgit
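The deadlock pattern addressed by the commit above can be distilled into a minimal, hypothetical sketch (not code from the patch): a spinlock that is also taken from softirq context may only be acquired from process context with bottom halves disabled, which is what the local_bh_disable()/local_bh_enable() pair around conn_request() guarantees. All names below (shared_lock, rx_softirq_path, backlog_process_path) are illustrative placeholders, not kernel symbols.

/* Hypothetical sketch, not taken from the patch above: shows the locking
 * rule behind the local_bh_disable()/local_bh_enable() pair.
 */
#include <linux/spinlock.h>
#include <linux/bottom_half.h>

static DEFINE_SPINLOCK(shared_lock);	/* also taken from softirq context */

/* Softirq path, e.g. packet receive: BH is already disabled here. */
static void rx_softirq_path(void)
{
	spin_lock(&shared_lock);
	/* ... touch shared state ... */
	spin_unlock(&shared_lock);
}

/* Process-context path, e.g. draining the socket backlog after listen(). */
static void backlog_process_path(void)
{
	/* Without disabling BH first, a softirq arriving on this CPU while
	 * we hold shared_lock would spin on it forever -- the inconsistent
	 * IN-SOFTIRQ-W vs. SOFTIRQ-ON-W usage lockdep reports above.
	 */
	local_bh_disable();
	spin_lock(&shared_lock);
	/* ... touch shared state ... */
	spin_unlock(&shared_lock);
	local_bh_enable();
}

This mirrors the splat in simplified form: inet_ehash_insert() takes the ehash bucket lock with a plain spin_lock() because it normally runs from softirq, so the process-context SYN path reached via the socket backlog has to disable BH before calling conn_request(), which is exactly what the patch does in both the DCCP and TCP state machines.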