Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c	280
1 file changed, 157 insertions, 123 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a8f515bb19c4..fdd88c3803a6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly;
 int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 int sysctl_tcp_frto __read_mostly = 2;
+int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
 
 int sysctl_tcp_thin_dupack __read_mostly;
 
@@ -880,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 
 	if (metric > 0)
 		tcp_disable_early_retrans(tp);
+	tp->rack.reord = 1;
 }
 
 /* This must be called before lost_out is incremented */
@@ -905,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
 	}
 }
 
-static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
-					    struct sk_buff *skb)
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
 {
 	tcp_verify_retransmit_hint(tp, skb);
 
@@ -1047,70 +1048,6 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
 	return !before(start_seq, end_seq - tp->max_window);
 }
 
-/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
- * Event "B". Later note: FACK people cheated me again 8), we have to account
- * for reordering! Ugly, but should help.
- *
- * Search retransmitted skbs from write_queue that were sent when snd_nxt was
- * less than what is now known to be received by the other end (derived from
- * highest SACK block). Also calculate the lowest snd_nxt among the remaining
- * retransmitted skbs to avoid some costly processing per ACKs.
- */
-static void tcp_mark_lost_retrans(struct sock *sk, int *flag)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
-	int cnt = 0;
-	u32 new_low_seq = tp->snd_nxt;
-	u32 received_upto = tcp_highest_sack_seq(tp);
-
-	if (!tcp_is_fack(tp) || !tp->retrans_out ||
-	    !after(received_upto, tp->lost_retrans_low) ||
-	    icsk->icsk_ca_state != TCP_CA_Recovery)
-		return;
-
-	tcp_for_write_queue(skb, sk) {
-		u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
-
-		if (skb == tcp_send_head(sk))
-			break;
-		if (cnt == tp->retrans_out)
-			break;
-		if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
-			continue;
-
-		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
-			continue;
-
-		/* TODO: We would like to get rid of tcp_is_fack(tp) only
-		 * constraint here (see above) but figuring out that at
-		 * least tp->reordering SACK blocks reside between ack_seq
-		 * and received_upto is not easy task to do cheaply with
-		 * the available datastructures.
-		 *
-		 * Whether FACK should check here for tp->reordering segs
-		 * in-between one could argue for either way (it would be
-		 * rather simple to implement as we could count fack_count
-		 * during the walk and do tp->fackets_out - fack_count).
-		 */
-		if (after(received_upto, ack_seq)) {
-			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
-			tp->retrans_out -= tcp_skb_pcount(skb);
-			*flag |= FLAG_LOST_RETRANS;
-			tcp_skb_mark_lost_uncond_verify(tp, skb);
-			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
-		} else {
-			if (before(ack_seq, new_low_seq))
-				new_low_seq = ack_seq;
-			cnt += tcp_skb_pcount(skb);
-		}
-	}
-
-	if (tp->retrans_out)
-		tp->lost_retrans_low = new_low_seq;
-}
-
 static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
 			    struct tcp_sack_block_wire *sp, int num_sacks,
 			    u32 prior_snd_una)
@@ -1236,6 +1173,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
 		return sacked;
 
 	if (!(sacked & TCPCB_SACKED_ACKED)) {
+		tcp_rack_advance(tp, xmit_time, sacked);
+
 		if (sacked & TCPCB_SACKED_RETRANS) {
 			/* If the segment is not tagged as lost,
 			 * we do not clear RETRANS, believing
@@ -1837,7 +1776,6 @@ advance_sp:
 	    ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
 		tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
 
-	tcp_mark_lost_retrans(sk, &state->flag);
 	tcp_verify_left_out(tp);
 
 out:
@@ -2314,14 +2252,29 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
+static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
+{
+	return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+	       before(tp->rx_opt.rcv_tsecr, when);
+}
+
+/* skb is spurious retransmitted if the returned timestamp echo
+ * reply is prior to the skb transmission time
+ */
+static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
+				     const struct sk_buff *skb)
+{
+	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
+	       tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
+}
+
 /* Nothing was retransmitted or returned timestamp is less
  * than timestamp of the first retransmission.
  */
 static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
 {
 	return !tp->retrans_stamp ||
-		(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
-		 before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
+	       tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
 }
 
 /* Undo procedures. */
@@ -2853,6 +2806,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		}
 	}
 
+	/* Use RACK to detect loss */
+	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
+	    tcp_rack_mark_lost(sk))
+		flag |= FLAG_LOST_RETRANS;
+
 	/* E. Process state. */
 	switch (icsk->icsk_ca_state) {
 	case TCP_CA_Recovery:
@@ -2915,8 +2873,69 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 	tcp_xmit_retransmit_queue(sk);
 }
 
+/* Kathleen Nichols' algorithm for tracking the minimum value of
+ * a data stream over some fixed time interval. (E.g., the minimum
+ * RTT over the past five minutes.) It uses constant space and constant
+ * time per update yet almost always delivers the same minimum as an
+ * implementation that has to keep all the data in the window.
+ *
+ * The algorithm keeps track of the best, 2nd best & 3rd best min
+ * values, maintaining an invariant that the measurement time of the
+ * n'th best >= n-1'th best. It also makes sure that the three values
+ * are widely separated in the time window since that bounds the worse
+ * case error when that data is monotonically increasing over the window.
+ *
+ * Upon getting a new min, we can forget everything earlier because it
+ * has no value - the new min is <= everything else in the window by
+ * definition and it's the most recent. So we restart fresh on every new min
+ * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
+ * best.
+ */
+static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
+{
+	const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
+	struct rtt_meas *m = tcp_sk(sk)->rtt_min;
+	struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now };
+	u32 elapsed;
+
+	/* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
+	if (unlikely(rttm.rtt <= m[0].rtt))
+		m[0] = m[1] = m[2] = rttm;
+	else if (rttm.rtt <= m[1].rtt)
+		m[1] = m[2] = rttm;
+	else if (rttm.rtt <= m[2].rtt)
+		m[2] = rttm;
+
+	elapsed = now - m[0].ts;
+	if (unlikely(elapsed > wlen)) {
+		/* Passed entire window without a new min so make 2nd choice
+		 * the new min & 3rd choice the new 2nd. So forth and so on.
+		 */
+		m[0] = m[1];
+		m[1] = m[2];
+		m[2] = rttm;
+		if (now - m[0].ts > wlen) {
+			m[0] = m[1];
+			m[1] = rttm;
+			if (now - m[0].ts > wlen)
+				m[0] = rttm;
+		}
+	} else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
+		/* Passed a quarter of the window without a new min so
+		 * take 2nd choice from the 2nd quarter of the window.
+		 */
+		m[2] = m[1] = rttm;
+	} else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
+		/* Passed half the window without a new min so take the 3rd
+		 * choice from the last half of the window.
+		 */
+		m[2] = rttm;
+	}
+}
+
 static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
-				      long seq_rtt_us, long sack_rtt_us)
+				      long seq_rtt_us, long sack_rtt_us,
+				      long ca_rtt_us)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2925,9 +2944,6 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 * Karn's algorithm forbids taking RTT if some retransmitted data
 	 * is acked (RFC6298).
 	 */
-	if (flag & FLAG_RETRANS_DATA_ACKED)
-		seq_rtt_us = -1L;
-
 	if (seq_rtt_us < 0)
 		seq_rtt_us = sack_rtt_us;
 
@@ -2939,11 +2955,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 */
 	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
 	    flag & FLAG_ACKED)
-		seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
-
+		seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
+							  tp->rx_opt.rcv_tsecr);
 	if (seq_rtt_us < 0)
 		return false;
 
+	/* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
+	 * always taken together with ACK, SACK, or TS-opts. Any negative
+	 * values will be skipped with the seq_rtt_us < 0 check above.
+	 */
+	tcp_update_rtt_min(sk, ca_rtt_us);
 	tcp_rtt_estimator(sk, seq_rtt_us);
 	tcp_set_rto(sk);
 
@@ -2953,21 +2974,21 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 }
 
 /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
-static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
+void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-	long seq_rtt_us = -1L;
+	long rtt_us = -1L;
 
-	if (synack_stamp && !tp->total_retrans)
-		seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp);
+	if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) {
+		struct skb_mstamp now;
 
-	/* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
-	 * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
-	 */
-	if (!tp->srtt_us)
-		tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
+		skb_mstamp_get(&now);
+		rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
+	}
+
+	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
 }
 
+
 static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3131,6 +3152,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 		if (sacked & TCPCB_SACKED_ACKED)
 			tp->sacked_out -= acked_pcount;
+		else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
+			tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
 		if (sacked & TCPCB_LOST)
 			tp->lost_out -= acked_pcount;
 
@@ -3169,7 +3192,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		flag |= FLAG_SACK_RENEGING;
 
 	skb_mstamp_get(&now);
-	if (likely(first_ackt.v64)) {
+	if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
 		seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
 		ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
 	}
@@ -3178,7 +3201,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
 	}
 
-	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
+	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
+					ca_rtt_us);
 
 	if (flag & FLAG_ACKED) {
 		tcp_rearm_rto(sk);
@@ -5472,7 +5496,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 }
 
 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
-					 const struct tcphdr *th, unsigned int len)
+					 const struct tcphdr *th)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -5698,15 +5722,14 @@ reset_and_undo:
  *	address independent.
  */
 
-int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
-			  const struct tcphdr *th, unsigned int len)
+int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcphdr *th = tcp_hdr(skb);
 	struct request_sock *req;
 	int queued = 0;
 	bool acceptable;
-	u32 synack_stamp;
 
 	tp->rx_opt.saw_tstamp = 0;
 
@@ -5750,7 +5773,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		goto discard;
 
 	case TCP_SYN_SENT:
-		queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
+		queued = tcp_rcv_synsent_state_process(sk, skb, th);
 		if (queued >= 0)
 			return queued;
 
@@ -5785,15 +5808,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		if (!acceptable)
 			return 1;
 
+		if (!tp->srtt_us)
+			tcp_synack_rtt_meas(sk, req);
+
 		/* Once we leave TCP_SYN_RECV, we no longer need req
 		 * so release it.
 		 */
 		if (req) {
-			synack_stamp = tcp_rsk(req)->snt_synack;
 			tp->total_retrans = req->num_retrans;
 			reqsk_fastopen_remove(sk, req, false);
 		} else {
-			synack_stamp = tp->lsndtime;
 			/* Make sure socket is routed, for correct metrics. */
 			icsk->icsk_af_ops->rebuild_header(sk);
 			tcp_init_congestion_control(sk);
@@ -5816,7 +5840,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
 		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
-		tcp_synack_rtt_meas(sk, synack_stamp);
 
 		if (tp->rx_opt.tstamp_ok)
 			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -6023,11 +6046,11 @@ static void tcp_openreq_init(struct request_sock *req,
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 
-	req->rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
+	req->rsk_rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
 	req->cookie_ts = 0;
 	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
 	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
-	tcp_rsk(req)->snt_synack = tcp_time_stamp;
+	skb_mstamp_get(&tcp_rsk(req)->snt_synack);
 	tcp_rsk(req)->last_oow_ack_time = 0;
 	req->mss = rx_opt->mss_clamp;
 	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
@@ -6043,9 +6066,11 @@ static void tcp_openreq_init(struct request_sock *req,
 }
 
 struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
-				      struct sock *sk_listener)
+				      struct sock *sk_listener,
+				      bool attach_listener)
 {
-	struct request_sock *req = reqsk_alloc(ops, sk_listener);
+	struct request_sock *req = reqsk_alloc(ops, sk_listener,
+					       attach_listener);
 
 	if (req) {
 		struct inet_request_sock *ireq = inet_rsk(req);
@@ -6065,13 +6090,13 @@ EXPORT_SYMBOL(inet_reqsk_alloc);
 /*
  * Return true if a syncookie should be sent
  */
-static bool tcp_syn_flood_action(struct sock *sk,
+static bool tcp_syn_flood_action(const struct sock *sk,
 				 const struct sk_buff *skb,
 				 const char *proto)
 {
+	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
 	const char *msg = "Dropping request";
 	bool want_cookie = false;
-	struct listen_sock *lopt;
 
 #ifdef CONFIG_SYN_COOKIES
 	if (sysctl_tcp_syncookies) {
@@ -6082,12 +6107,12 @@ static bool tcp_syn_flood_action(struct sock *sk,
 #endif
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 
-	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
-	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
-		lopt->synflood_warned = 1;
+	if (!queue->synflood_warned &&
+	    sysctl_tcp_syncookies != 2 &&
+	    xchg(&queue->synflood_warned, 1) == 0)
 		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
 			proto, ntohs(tcp_hdr(skb)->dest), msg);
-	}
+
 	return want_cookie;
 }
 
@@ -6112,16 +6137,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		     const struct tcp_request_sock_ops *af_ops,
 		     struct sock *sk, struct sk_buff *skb)
 {
+	struct tcp_fastopen_cookie foc = { .len = -1 };
+	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
 	struct tcp_options_received tmp_opt;
-	struct request_sock *req;
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct sock *fastopen_sk = NULL;
 	struct dst_entry *dst = NULL;
-	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
-	bool want_cookie = false, fastopen;
+	struct request_sock *req;
+	bool want_cookie = false;
 	struct flowi fl;
-	struct tcp_fastopen_cookie foc = { .len = -1 };
-	int err;
-
 
 	/* TW buckets are converted to open requests without
 	 * limitations, they conserve resources and peer is
@@ -6145,7 +6169,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		goto drop;
 	}
 
-	req = inet_reqsk_alloc(rsk_ops, sk);
+	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
 	if (!req)
 		goto drop;
 
@@ -6228,20 +6252,30 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	}
 
 	tcp_rsk(req)->snt_isn = isn;
+	tcp_rsk(req)->txhash = net_tx_rndhash();
 	tcp_openreq_init_rwin(req, sk, dst);
 
-	fastopen = !want_cookie &&
-		   tcp_try_fastopen(sk, skb, req, &foc, dst);
-	err = af_ops->send_synack(sk, dst, &fl, req,
-				  skb_get_queue_mapping(skb), &foc);
-	if (!fastopen) {
-		if (err || want_cookie)
-			goto drop_and_free;
-
+	if (!want_cookie) {
+		tcp_reqsk_record_syn(sk, req, skb);
+		fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
+	}
+	if (fastopen_sk) {
+		af_ops->send_synack(fastopen_sk, dst, &fl, req,
+				    &foc, false);
+		/* Add the child socket directly into the accept queue */
+		inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
+		sk->sk_data_ready(sk);
+		bh_unlock_sock(fastopen_sk);
+		sock_put(fastopen_sk);
+	} else {
 		tcp_rsk(req)->tfo_listener = false;
-		af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		if (!want_cookie)
+			inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		af_ops->send_synack(sk, dst, &fl, req,
+				    &foc, !want_cookie);
+		if (want_cookie)
+			goto drop_and_free;
 	}
-	tcp_reqsk_record_syn(sk, req, skb);
-
+	reqsk_put(req);
 	return 0;
 
 drop_and_release:
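
The comment block added above tcp_update_rtt_min() describes Kathleen Nichols' windowed min-filter in prose. A minimal stand-alone C sketch of the same three-sample bookkeeping follows; struct min_meas, minrtt_update() and the WLEN constant are illustrative names for this example, not kernel API, and the clock is an abstract counter rather than jiffies.

/* Windowed min-filter sketch: keep the best, 2nd best and 3rd best samples,
 * each no older than the previous one, and promote them as the window moves.
 */
#include <stdint.h>
#include <stdio.h>

struct min_meas {
	uint32_t val;	/* measured value, e.g. RTT */
	uint32_t ts;	/* time the measurement was taken */
};

#define WLEN 300u	/* window length, in the same units as 'now' */

static void minrtt_update(struct min_meas m[3], uint32_t val, uint32_t now)
{
	struct min_meas s = { .val = val, .ts = now };
	uint32_t elapsed;

	/* A new overall min invalidates everything older; same for 2nd/3rd. */
	if (s.val <= m[0].val)
		m[0] = m[1] = m[2] = s;
	else if (s.val <= m[1].val)
		m[1] = m[2] = s;
	else if (s.val <= m[2].val)
		m[2] = s;

	elapsed = now - m[0].ts;
	if (elapsed > WLEN) {
		/* The min aged out of the window: promote the runners-up. */
		m[0] = m[1];
		m[1] = m[2];
		m[2] = s;
		if (now - m[0].ts > WLEN) {
			m[0] = m[1];
			m[1] = s;
			if (now - m[0].ts > WLEN)
				m[0] = s;
		}
	} else if (m[1].ts == m[0].ts && elapsed > WLEN / 4) {
		/* Keep the 2nd choice from the 2nd quarter of the window. */
		m[2] = m[1] = s;
	} else if (m[2].ts == m[1].ts && elapsed > WLEN / 2) {
		/* Keep the 3rd choice from the last half of the window. */
		m[2] = s;
	}
}

int main(void)
{
	struct min_meas m[3] = { { ~0u, 0 }, { ~0u, 0 }, { ~0u, 0 } };
	uint32_t samples[] = { 50, 40, 45, 60, 42, 70 };

	for (uint32_t t = 0; t < 6; t++) {
		minrtt_update(m, samples[t], t * 100);
		printf("t=%u min=%u\n", (unsigned)(t * 100), (unsigned)m[0].val);
	}
	return 0;
}

In the patch, WLEN corresponds to sysctl_tcp_min_rtt_wlen * HZ; keeping the 2nd and 3rd best samples spread across the window is what bounds the error when the measured RTT rises monotonically.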
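
The tcp_rack_advance() and tcp_rack_mark_lost() calls added in tcp_sacktag_one(), tcp_clean_rtx_queue() and tcp_fastretrans_alert() are only call sites; the RACK state machine itself is not part of this file's diff. As a rough, hedged sketch of the underlying idea (remember the transmit time of the most recently delivered segment, and presume an older outstanding retransmission lost once something sent after it has been delivered), with made-up names and user-space types rather than the kernel's structures:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical per-connection RACK state: transmit time and end sequence
 * of the most recently sent segment known to be delivered.
 */
struct rack {
	uint64_t mstamp_us;
	uint32_t end_seq;
};

/* Hypothetical per-segment bookkeeping for the example. */
struct seg {
	uint64_t xmit_us;	/* when this copy of the segment was sent */
	uint32_t end_seq;
	bool delivered;		/* cumulatively or selectively ACKed */
	bool retrans;		/* has been retransmitted */
};

/* Called for every newly delivered segment (cf. tcp_rack_advance()). */
static void rack_advance(struct rack *r, const struct seg *s)
{
	if (s->xmit_us > r->mstamp_us) {
		r->mstamp_us = s->xmit_us;
		r->end_seq = s->end_seq;
	}
}

/* A still-outstanding retransmission is presumed lost once a segment sent
 * later than it, plus a small reordering window, has been delivered
 * (cf. tcp_rack_mark_lost()).
 */
static bool rack_seg_lost(const struct rack *r, const struct seg *s,
			  uint64_t reo_wnd_us)
{
	return !s->delivered && s->retrans &&
	       s->xmit_us + reo_wnd_us < r->mstamp_us;
}

int main(void)
{
	struct rack r = { 0, 0 };
	struct seg rexmit = { .xmit_us = 1000, .end_seq = 2000, .retrans = true };
	struct seg later  = { .xmit_us = 3000, .end_seq = 3000, .delivered = true };

	rack_advance(&r, &later);	/* a SACK arrives for the later segment */
	printf("retransmission lost? %d\n", rack_seg_lost(&r, &rexmit, 1000));
	return 0;
}

Because loss is inferred from delivery order in time rather than from FACK counting, the patch can drop the FACK-only tcp_mark_lost_retrans() heuristic removed above and gate the new path behind sysctl_tcp_recovery's TCP_RACK_LOST_RETRANS bit.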