Diffstat (limited to 'net/ipv4/tcp_input.c')
| -rw-r--r-- | net/ipv4/tcp_input.c | 95 |
1 file changed, 60 insertions(+), 35 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 684f095d196e..a8f515bb19c4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -109,6 +109,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
 #define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
 #define FLAG_ECE		0x40 /* ECE in this ACK				*/
+#define FLAG_LOST_RETRANS	0x80 /* This ACK marks some retransmission lost */
 #define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
 #define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
 #define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
@@ -196,11 +197,13 @@ static void tcp_enter_quickack_mode(struct sock *sk)
  * and the session is not interactive.
  */
-static inline bool tcp_in_quickack_mode(const struct sock *sk)
+static bool tcp_in_quickack_mode(struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct dst_entry *dst = __sk_dst_get(sk);
 
-	return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
+	return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
+		(icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
 }
 
 static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
@@ -750,13 +753,29 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
  * TCP pacing, to smooth the burst on large writes when packets
  * in flight is significantly lower than cwnd (or rwin)
  */
+int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
+int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
+
 static void tcp_update_pacing_rate(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	u64 rate;
 
 	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
-	rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
+	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
+
+	/* current rate is (cwnd * mss) / srtt
+	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
+	 * In Congestion Avoidance phase, set it to 120 % the current rate.
+	 *
+	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
+	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
+	 *	 end of slow start and should slow down.
+	 */
+	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
+		rate *= sysctl_tcp_pacing_ss_ratio;
+	else
+		rate *= sysctl_tcp_pacing_ca_ratio;
 
 	rate *= max(tp->snd_cwnd, tp->packets_out);
 
@@ -1037,7 +1056,7 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
  * highest SACK block). Also calculate the lowest snd_nxt among the remaining
  * retransmitted skbs to avoid some costly processing per ACKs.
  */
-static void tcp_mark_lost_retrans(struct sock *sk)
+static void tcp_mark_lost_retrans(struct sock *sk, int *flag)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -1078,7 +1097,7 @@ static void tcp_mark_lost_retrans(struct sock *sk)
 		if (after(received_upto, ack_seq)) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 			tp->retrans_out -= tcp_skb_pcount(skb);
-
+			*flag |= FLAG_LOST_RETRANS;
 			tcp_skb_mark_lost_uncond_verify(tp, skb);
 			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
 		} else {
@@ -1818,7 +1837,7 @@ advance_sp:
 	    ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
 		tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
 
-	tcp_mark_lost_retrans(sk);
+	tcp_mark_lost_retrans(sk, &state->flag);
 	tcp_verify_left_out(tp);
 
 out:
@@ -1917,14 +1936,13 @@ void tcp_enter_loss(struct sock *sk)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	bool new_recovery = false;
+	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
 	bool is_reneg;			/* is receiver reneging on SACKs? */
 
 	/* Reduce ssthresh if it has not yet been made inside this window. */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
 	    !after(tp->high_seq, tp->snd_una) ||
 	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
-		new_recovery = true;
 		tp->prior_ssthresh = tcp_current_ssthresh(sk);
 		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
 		tcp_ca_event(sk, CA_EVENT_LOSS);
@@ -2475,15 +2493,14 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
 	return false;
 }
 
-/* The cwnd reduction in CWR and Recovery use the PRR algorithm
- * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
+/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
  * It computes the number of packets to send (sndcnt) based on packets newly
  * delivered:
  *   1) If the packets in flight is larger than ssthresh, PRR spreads the
  *	cwnd reductions across a full RTT.
- *   2) If packets in flight is lower than ssthresh (such as due to excess
- *	losses and/or application stalls), do not perform any further cwnd
- *	reductions, but instead slow start up to ssthresh.
+ *   2) Otherwise PRR uses packet conservation to send as much as delivered.
+ *      But when the retransmits are acked without further losses, PRR
+ *      slow starts cwnd up to ssthresh to speed up the recovery.
  */
 static void tcp_init_cwnd_reduction(struct sock *sk)
 {
@@ -2500,7 +2517,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
 }
 
 static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
-			       int fast_rexmit)
+			       int fast_rexmit, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int sndcnt = 0;
@@ -2509,16 +2526,18 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
 				 (tp->packets_out - tp->sacked_out);
 
 	tp->prr_delivered += newly_acked_sacked;
-	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+	if (delta < 0) {
 		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
 			       tp->prior_cwnd - 1;
 		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
-	} else {
+	} else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
+		   !(flag & FLAG_LOST_RETRANS)) {
 		sndcnt = min_t(int, delta,
 			       max_t(int, tp->prr_delivered - tp->prr_out,
 				     newly_acked_sacked) + 1);
+	} else {
+		sndcnt = min(delta, newly_acked_sacked);
 	}
-
 	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
 	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
 }
@@ -2579,7 +2598,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
 	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
 		tcp_try_keep_open(sk);
 	} else {
-		tcp_cwnd_reduction(sk, prior_unsacked, 0);
+		tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
 	}
 }
 
@@ -2589,6 +2608,7 @@ static void tcp_mtup_probe_failed(struct sock *sk)
 
 	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
 	icsk->icsk_mtup.probe_size = 0;
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
 }
 
 static void tcp_mtup_probe_success(struct sock *sk)
@@ -2608,6 +2628,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
 	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
 	icsk->icsk_mtup.probe_size = 0;
 	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
 }
 
 /* Do a simple retransmit without using the backoff mechanisms in
@@ -2676,7 +2697,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 	tp->prior_ssthresh = 0;
 	tcp_init_undo(tp);
 
-	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+	if (!tcp_in_cwnd_reduction(sk)) {
 		if (!ece_ack)
 			tp->prior_ssthresh = tcp_current_ssthresh(sk);
 		tcp_init_cwnd_reduction(sk);
@@ -2736,7 +2757,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 
 /* Undo during fast recovery after partial ACK. */
 static bool tcp_try_undo_partial(struct sock *sk, const int acked,
-				 const int prior_unsacked)
+				 const int prior_unsacked, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2752,7 +2773,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
 		 * mark more packets lost or retransmit more.
 		 */
 		if (tp->retrans_out) {
-			tcp_cwnd_reduction(sk, prior_unsacked, 0);
+			tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
 			return true;
 		}
 
@@ -2839,7 +2860,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 			if (tcp_is_reno(tp) && is_dupack)
 				tcp_add_reno_sack(sk);
 		} else {
-			if (tcp_try_undo_partial(sk, acked, prior_unsacked))
+			if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag))
 				return;
 			/* Partial ACK arrived. Force fast retransmit. */
 			do_lost = tcp_is_reno(tp) ||
@@ -2852,9 +2873,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		break;
 	case TCP_CA_Loss:
 		tcp_process_loss(sk, flag, is_dupack);
-		if (icsk->icsk_ca_state != TCP_CA_Open)
+		if (icsk->icsk_ca_state != TCP_CA_Open &&
+		    !(flag & FLAG_LOST_RETRANS))
 			return;
-		/* Fall through to processing in Open state. */
+		/* Change state if cwnd is undone or retransmits are lost */
 	default:
 		if (tcp_is_reno(tp)) {
 			if (flag & FLAG_SND_UNA_ADVANCED)
@@ -2889,7 +2911,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 
 	if (do_lost)
 		tcp_update_scoreboard(sk, fast_rexmit);
-	tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit);
+	tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag);
 	tcp_xmit_retransmit_queue(sk);
 }
 
@@ -3326,6 +3348,9 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 			tp->pred_flags = 0;
 			tcp_fast_path_check(sk);
 
+			if (tcp_send_head(sk))
+				tcp_slow_start_after_idle_check(sk);
+
 			if (nwin > tp->max_window) {
 				tp->max_window = nwin;
 				tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
@@ -3563,10 +3588,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 				    &sack_state);
 	acked -= tp->packets_out;
 
-	/* Advance cwnd if state allows */
-	if (tcp_may_raise_cwnd(sk, flag))
-		tcp_cong_avoid(sk, ack, acked);
-
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
 		tcp_fastretrans_alert(sk, acked, prior_unsacked,
@@ -3575,6 +3596,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
 
+	/* Advance cwnd if state allows */
+	if (tcp_may_raise_cwnd(sk, flag))
+		tcp_cong_avoid(sk, ack, acked);
+
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
 		struct dst_entry *dst = __sk_dst_get(sk);
 		if (dst)
@@ -3948,7 +3973,6 @@ void tcp_reset(struct sock *sk)
 static void tcp_fin(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	const struct dst_entry *dst;
 
 	inet_csk_schedule_ack(sk);
 
@@ -3960,9 +3984,7 @@ static void tcp_fin(struct sock *sk)
 	case TCP_ESTABLISHED:
 		/* Move to CLOSE_WAIT */
 		tcp_set_state(sk, TCP_CLOSE_WAIT);
-		dst = __sk_dst_get(sk);
-		if (!dst || !dst_metric(dst, RTAX_QUICKACK))
-			inet_csk(sk)->icsk_ack.pingpong = 1;
+		inet_csk(sk)->icsk_ack.pingpong = 1;
 		break;
 
 	case TCP_CLOSE_WAIT:
@@ -5981,14 +6003,17 @@ static void tcp_ecn_create_request(struct request_sock *req,
 	const struct net *net = sock_net(listen_sk);
 	bool th_ecn = th->ece && th->cwr;
 	bool ect, ecn_ok;
+	u32 ecn_ok_dst;
 
 	if (!th_ecn)
 		return;
 
 	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
-	ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
+	ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
+	ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
 
-	if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk))
+	if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
+	    (ecn_ok_dst & DST_FEATURE_ECN_CA))
 		inet_rsk(req)->ecn_ok = 1;
 }
 
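Two of the hunks above carry most of the algorithmic content. The tcp_update_pacing_rate() change paces at a percentage of the current rate (cwnd * mss / srtt): 200% (sysctl_tcp_pacing_ss_ratio) while cwnd is still below half of ssthresh, and 120% (sysctl_tcp_pacing_ca_ratio) otherwise. The standalone C sketch below works through that arithmetic as an illustration only: the function and parameter names are invented here, it scales by cwnd where the kernel uses max(snd_cwnd, packets_out), and the division by srtt happens later in the kernel function, outside the lines shown above.

```c
#include <stdint.h>
#include <stdio.h>

/* Standalone sketch (not kernel code) of the pacing policy described in the
 * tcp_update_pacing_rate() comment above: pace at a percentage of the current
 * rate (cwnd * mss / srtt), 200% while clearly in slow start and 120% once
 * cwnd has reached half of ssthresh. All names here are illustrative.
 */
static uint64_t pacing_rate_bytes_per_sec(uint32_t mss, uint32_t cwnd,
					  uint32_t ssthresh, uint64_t srtt_us)
{
	/* sysctl defaults introduced by the patch: 200 (slow start), 120 (CA) */
	unsigned int ratio = (cwnd < ssthresh / 2) ? 200 : 120;
	uint64_t rate = (uint64_t)mss * cwnd * 1000000;	/* bytes/RTT * usec/sec */

	rate = rate * ratio / 100;		/* apply the pacing ratio */
	return srtt_us ? rate / srtt_us : 0;	/* current rate = cwnd * mss / srtt */
}

int main(void)
{
	/* mss=1448, cwnd=10, srtt=20ms: current rate ~724 kB/s, and
	 * cwnd < ssthresh/2 = 32, so pacing at 200% gives ~1.45 MB/s.
	 */
	printf("%llu bytes/sec\n",
	       (unsigned long long)pacing_rate_bytes_per_sec(1448, 10, 64, 20000));
	return 0;
}
```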
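The tcp_cwnd_reduction() hunk, together with the rewritten PRR comment, chooses between three behaviours on each ACK during CWR/Recovery: proportional reduction while packets in flight still exceed ssthresh (delta < 0), slow start back toward ssthresh when retransmitted data is being acked and no retransmission was newly marked lost (FLAG_RETRANS_DATA_ACKED set, FLAG_LOST_RETRANS clear), and plain packet conservation otherwise. The userspace sketch below mirrors only that decision logic; the struct and function names are hypothetical and no kernel API is used.

```c
#include <stdint.h>
#include <stdio.h>

/* Standalone sketch of the PRR (RFC 6937) send-count decision implemented by
 * the tcp_cwnd_reduction() hunk above. Names are local to this example; only
 * the three branches mirror the patch:
 *   1) pipe above ssthresh          -> proportional rate reduction
 *   2) retransmits acked, none lost -> slow start back up to ssthresh
 *   3) otherwise                    -> packet conservation
 */
struct prr_state {
	uint32_t ssthresh, prior_cwnd;
	uint32_t prr_delivered, prr_out;
};

static int prr_sndcnt(struct prr_state *s, uint32_t pipe,
		      uint32_t newly_acked_sacked,
		      int retrans_acked, int retrans_lost, int fast_rexmit)
{
	int delta = (int)s->ssthresh - (int)pipe;
	int sndcnt;

	s->prr_delivered += newly_acked_sacked;
	if (delta < 0) {
		/* ceil(ssthresh * delivered / prior_cwnd) - already sent */
		uint64_t dividend = (uint64_t)s->ssthresh * s->prr_delivered +
				    s->prior_cwnd - 1;
		sndcnt = (int)(dividend / s->prior_cwnd) - (int)s->prr_out;
	} else if (retrans_acked && !retrans_lost) {
		/* slow start toward ssthresh, but never beyond it */
		int extra = (int)s->prr_delivered - (int)s->prr_out;

		if (extra < (int)newly_acked_sacked)
			extra = (int)newly_acked_sacked;
		sndcnt = delta < extra + 1 ? delta : extra + 1;
	} else {
		/* conservation: send at most what was just delivered */
		sndcnt = delta < (int)newly_acked_sacked ?
			 delta : (int)newly_acked_sacked;
	}
	if (fast_rexmit && sndcnt < 1)
		sndcnt = 1;
	return sndcnt;		/* the new cwnd would be pipe + sndcnt */
}

int main(void)
{
	struct prr_state s = { .ssthresh = 10, .prior_cwnd = 20 };

	/* 18 packets still in flight, 2 newly delivered, fast retransmit on */
	printf("sndcnt = %d\n", prr_sndcnt(&s, 18, 2, 0, 0, 1));
	return 0;
}
```

With ssthresh 10, prior_cwnd 20 and 18 packets still in flight, two newly delivered packets yield sndcnt = 1, so cwnd steps down to 19 rather than collapsing at once.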