Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c	138
1 file changed, 81 insertions(+), 57 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bf4ced9273e8..518f04355fbf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -261,7 +261,8 @@ static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
 		 * cwnd may be very low (even just 1 packet), so we should ACK
 		 * immediately.
 		 */
-		inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+		if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
+			inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
 	}
 }
@@ -437,7 +438,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
 /* 3. Try to fixup all. It is made immediately after connection enters
  *    established state.
  */
-void tcp_init_buffer_space(struct sock *sk)
+static void tcp_init_buffer_space(struct sock *sk)
 {
 	int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -2183,8 +2184,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 }
 
 /* Detect loss in event "A" above by marking head of queue up as lost.
- * For non-SACK(Reno) senders, the first "packets" number of segments
- * are considered lost. For RFC3517 SACK, a segment is considered lost if it
+ * For RFC3517 SACK, a segment is considered lost if it
  * has at least tp->reordering SACKed seqments above it; "packets" refers to
  * the maximum SACKed segments to pass before reaching this limit.
  */
@@ -2192,10 +2192,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	int cnt, oldcnt, lost;
-	unsigned int mss;
+	int cnt;
 	/* Use SACK to deduce losses of new sequences sent during recovery */
-	const u32 loss_high = tcp_is_sack(tp) ?  tp->snd_nxt : tp->high_seq;
+	const u32 loss_high = tp->snd_nxt;
 
 	WARN_ON(packets > tp->packets_out);
 	skb = tp->lost_skb_hint;
@@ -2218,26 +2217,11 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 		if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
 			break;
 
-		oldcnt = cnt;
-		if (tcp_is_reno(tp) ||
-		    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
 			cnt += tcp_skb_pcount(skb);
-		if (cnt > packets) {
-			if (tcp_is_sack(tp) ||
-			    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
-			    (oldcnt >= packets))
-				break;
-
-			mss = tcp_skb_mss(skb);
-			/* If needed, chop off the prefix to mark as lost. */
-			lost = (packets - oldcnt) * mss;
-			if (lost < skb->len &&
-			    tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
-					 lost, mss, GFP_ATOMIC) < 0)
-				break;
-			cnt = packets;
-		}
+		if (cnt > packets)
+			break;
 
 		tcp_skb_mark_lost(tp, skb);
@@ -2849,8 +2833,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 			if (tcp_try_undo_partial(sk, prior_snd_una))
 				return;
 			/* Partial ACK arrived. Force fast retransmit. */
-			do_lost = tcp_is_reno(tp) ||
-				  tcp_force_fast_retransmit(sk);
+			do_lost = tcp_force_fast_retransmit(sk);
 		}
 		if (tcp_try_undo_dsack(sk)) {
 			tcp_try_keep_open(sk);
@@ -3014,7 +2997,7 @@ void tcp_rearm_rto(struct sock *sk)
 			rto = usecs_to_jiffies(max_t(int, delta_us, 1));
 		}
 		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
-				     TCP_RTO_MAX, tcp_rtx_queue_head(sk));
+				     TCP_RTO_MAX);
 	}
 }
@@ -3291,7 +3274,7 @@ static void tcp_ack_probe(struct sock *sk)
 		unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
 
 		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-				     when, TCP_RTO_MAX, NULL);
+				     when, TCP_RTO_MAX);
 	}
 }
@@ -3505,10 +3488,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
 	}
 }
 
-/* This routine deals with acks during a TLP episode.
- * We mark the end of a TLP episode on receiving TLP dupack or when
- * ack is after tlp_high_seq.
- * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
+/* This routine deals with acks during a TLP episode and ends an episode by
+ * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
  */
 static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 {
@@ -3517,7 +3498,10 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 	if (before(ack, tp->tlp_high_seq))
 		return;
 
-	if (flag & FLAG_DSACKING_ACK) {
+	if (!tp->tlp_retrans) {
+		/* TLP of new data has been acknowledged */
+		tp->tlp_high_seq = 0;
+	} else if (flag & FLAG_DSACKING_ACK) {
 		/* This DSACK means original and TLP probe arrived; no loss */
 		tp->tlp_high_seq = 0;
 	} else if (after(ack, tp->tlp_high_seq)) {
@@ -3683,6 +3667,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tcp_in_ack_event(sk, ack_ev_flags);
 	}
 
+	/* This is a deviation from RFC3168 since it states that:
+	 * "When the TCP data sender is ready to set the CWR bit after reducing
+	 * the congestion window, it SHOULD set the CWR bit only on the first
+	 * new data packet that it transmits."
+	 * We accept CWR on pure ACKs to be more robust
+	 * with widely-deployed TCP implementations that do this.
+	 */
+	tcp_ecn_accept_cwr(sk, skb);
+
 	/* We passed data and got it acked, remove any soft error
 	 * log. Something worked...
 	 */
@@ -3926,10 +3919,6 @@ void tcp_parse_options(const struct net *net,
 				 */
 				break;
 #endif
-			case TCPOPT_MPTCP:
-				mptcp_parse_option(skb, ptr, opsize, opt_rx);
-				break;
-
 			case TCPOPT_FASTOPEN:
 				tcp_parse_fastopen_option(
 					opsize - TCPOLEN_FASTOPEN_BASE,
@@ -4327,6 +4316,33 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
 	}
 }
 
+static void tcp_sack_compress_send_ack(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->compressed_ack)
+		return;
+
+	if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
+		__sock_put(sk);
+
+	/* Since we have to send one ack finally,
+	 * substract one from tp->compressed_ack to keep
+	 * LINUX_MIB_TCPACKCOMPRESSED accurate.
+	 */
+	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
+		      tp->compressed_ack - 1);
+
+	tp->compressed_ack = 0;
+	tcp_send_ack(sk);
+}
+
+/* Reasonable amount of sack blocks included in TCP SACK option
+ * The max is 4, but this becomes 3 if TCP timestamps are there.
+ * Given that SACK packets might be lost, be conservative and use 2.
+ */
+#define TCP_SACK_BLOCKS_EXPECTED 2
+
 static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -4339,6 +4355,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
 	for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
 		if (tcp_sack_extend(sp, seq, end_seq)) {
+			if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
+				tcp_sack_compress_send_ack(sk);
 			/* Rotate this_sack to the first one. */
 			for (; this_sack > 0; this_sack--, sp--)
 				swap(*sp, *(sp - 1));
@@ -4348,6 +4366,9 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
 		}
 	}
 
+	if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
+		tcp_sack_compress_send_ack(sk);
+
 	/* Could not find an adjacent existing SACK, build a new one,
 	 * put it at the front, and shift everyone else down.  We
 	 * always know there is at least one SACK present already here.
@@ -4355,8 +4376,6 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
 	 * If the sack array is full, forget about the last one.
 	 */
 	if (this_sack >= TCP_NUM_SACKS) {
-		if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
-			tcp_send_ack(sk);
 		this_sack--;
 		tp->rx_opt.num_sacks--;
 		sp--;
@@ -4564,6 +4583,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
+		sk->sk_data_ready(sk);
 		tcp_drop(sk, skb);
 		return;
 	}
@@ -4597,7 +4617,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
 				 skb, &fragstolen)) {
 coalesce_done:
-		tcp_grow_window(sk, skb);
+		/* For non sack flows, do not grow window to force DUPACK
+		 * and trigger fast retransmit.
+		 */
+		if (tcp_is_sack(tp))
+			tcp_grow_window(sk, skb);
 		kfree_skb_partial(skb, fragstolen);
 		skb = NULL;
 		goto add_sack;
@@ -4681,7 +4705,11 @@ add_sack:
 		tcp_sack_new_ofo_skb(sk, seq, end_seq);
 end:
 	if (skb) {
-		tcp_grow_window(sk, skb);
+		/* For non sack flows, do not grow window to force DUPACK
+		 * and trigger fast retransmit.
+		 */
+		if (tcp_is_sack(tp))
+			tcp_grow_window(sk, skb);
 		skb_condense(skb);
 		skb_set_owner_r(skb, sk);
 	}
@@ -4761,7 +4789,8 @@ void tcp_data_ready(struct sock *sk)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	int avail = tp->rcv_nxt - tp->copied_seq;
 
-	if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE))
+	if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) &&
+	    !sock_flag(sk, SOCK_DONE))
 		return;
 
 	sk->sk_data_ready(sk);
@@ -4783,8 +4812,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 	skb_dst_drop(skb);
 	__skb_pull(skb, tcp_hdr(skb)->doff * 4);
 
-	tcp_ecn_accept_cwr(sk, skb);
-
 	tp->rx_opt.dsack = 0;
 
 	/*  Queue data for delivery to the user.
@@ -4803,6 +4830,7 @@ queue_and_out:
 			sk_forced_mem_schedule(sk, skb->truesize);
 		else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
+			sk->sk_data_ready(sk);
 			goto drop;
 		}
@@ -5275,15 +5303,13 @@ send_now:
 	if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
 		tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
-		if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
-			NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
-				      tp->compressed_ack - TCP_FASTRETRANS_THRESH);
-		tp->compressed_ack = 0;
+		tp->dup_ack_counter = 0;
 	}
-
-	if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH)
+	if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
+		tp->dup_ack_counter++;
 		goto send_now;
-
+	}
+	tp->compressed_ack++;
 	if (hrtimer_is_queued(&tp->compressed_ack_timer))
 		return;
@@ -5296,8 +5322,9 @@ send_now:
 	delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
 		      rtt * (NSEC_PER_USEC >> 3)/20);
 	sock_hold(sk);
-	hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
-		      HRTIMER_MODE_REL_PINNED_SOFT);
+	hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
+			       sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns,
+			       HRTIMER_MODE_REL_PINNED_SOFT);
 }
 
 static inline void tcp_ack_snd_check(struct sock *sk)
@@ -5990,9 +6017,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);
 
-		if (sk_is_mptcp(sk))
-			mptcp_rcv_synsent(sk);
-
 		/* Remember, tcp_poll() does not lock socket!
 		 * Change state from SYN-SENT only after copied_seq
 		 * is initialized. */
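The two hunks in __tcp_ack_snd_check() above split the old compressed_ack counter in two: tp->dup_ack_counter lets the first TCP_FASTRETRANS_THRESH ACKs at a given rcv_nxt go out immediately (so the peer still collects enough DUPACKs for fast retransmit), and only ACKs beyond that budget are absorbed by the compression hrtimer; the new tcp_sack_compress_send_ack() then flushes any held-back ACK whenever a SACK block past TCP_SACK_BLOCKS_EXPECTED would be needed. Below is a minimal userspace sketch of just that send-now-versus-compress decision; struct ack_state and should_send_now() are illustrative names, the hrtimer and SACK-flush paths are not modelled, and none of this is kernel code.

/* Simplified userspace model of the ACK-compression decision changed above:
 * the first TCP_FASTRETRANS_THRESH ACKs for a given rcv_nxt are sent
 * immediately, and only later ACKs are deferred/compressed.  Names such as
 * ack_state and should_send_now() are illustrative, not kernel APIs.
 */
#include <stdbool.h>
#include <stdio.h>

#define TCP_FASTRETRANS_THRESH 3

struct ack_state {
	unsigned int rcv_nxt;			/* next sequence expected */
	unsigned int compressed_ack_rcv_nxt;	/* rcv_nxt the budget refers to */
	unsigned int dup_ack_counter;		/* immediate ACKs sent at this rcv_nxt */
	unsigned int compressed_ack;		/* ACKs absorbed by the (unmodelled) timer */
};

/* Returns true if an ACK should go out immediately, false if it may be
 * held back and compressed.
 */
static bool should_send_now(struct ack_state *st)
{
	if (st->compressed_ack_rcv_nxt != st->rcv_nxt) {
		/* rcv_nxt advanced: restart the per-sequence dupack budget */
		st->compressed_ack_rcv_nxt = st->rcv_nxt;
		st->dup_ack_counter = 0;
	}
	if (st->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
		st->dup_ack_counter++;
		return true;
	}
	/* Beyond the dupack budget: count it as a compressed ACK */
	st->compressed_ack++;
	return false;
}

int main(void)
{
	struct ack_state st = { .rcv_nxt = 1000 };

	for (int i = 0; i < 6; i++)	/* six out-of-order arrivals */
		printf("ooo pkt %d -> %s\n", i,
		       should_send_now(&st) ? "ACK now" : "compress");

	st.rcv_nxt = 2000;		/* in-order data fills the hole */
	bool now = should_send_now(&st);
	printf("after advance -> %s (compressed so far: %u)\n",
	       now ? "ACK now" : "compress", st.compressed_ack);
	return 0;
}

Built with any C99 compiler, this prints three immediate ACKs, three compressed ones, and then an immediate ACK once rcv_nxt advances, which mirrors the intent of the changed comments and counters in the diff; the real kernel decision in __tcp_ack_snd_check() considers more state than this sketch.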