Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--	net/ipv4/tcp_output.c	145
1 file changed, 93 insertions(+), 52 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e3167ad96567..95618d0e78e4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -39,11 +39,13 @@
 #include <net/tcp.h>
 #include <net/mptcp.h>
+#include <net/proto_memory.h>
 
 #include <linux/compiler.h>
 #include <linux/gfp.h>
 #include <linux/module.h>
 #include <linux/static_key.h>
+#include <linux/skbuff_ref.h>
 
 #include <trace/events/tcp.h>
 
@@ -203,16 +205,17 @@ static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
  * This MUST be enforced by all callers.
  */
 void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
-			       __u32 *rcv_wnd, __u32 *window_clamp,
+			       __u32 *rcv_wnd, __u32 *__window_clamp,
 			       int wscale_ok, __u8 *rcv_wscale,
 			       __u32 init_rcv_wnd)
 {
 	unsigned int space = (__space < 0 ? 0 : __space);
+	u32 window_clamp = READ_ONCE(*__window_clamp);
 
 	/* If no clamp set the clamp to the max possible scaled window */
-	if (*window_clamp == 0)
-		(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
-	space = min(*window_clamp, space);
+	if (window_clamp == 0)
+		window_clamp = (U16_MAX << TCP_MAX_WSCALE);
+	space = min(window_clamp, space);
 
 	/* Quantize space offering to a multiple of mss if possible. */
 	if (space > mss)
@@ -229,7 +232,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
 		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
 	else
-		(*rcv_wnd) = min_t(u32, space, U16_MAX);
+		(*rcv_wnd) = space;
 
 	if (init_rcv_wnd)
 		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
@@ -239,12 +242,13 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
 		/* Set window scaling on max possible window */
 		space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
 		space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
-		space = min_t(u32, space, *window_clamp);
+		space = min_t(u32, space, window_clamp);
 		*rcv_wscale = clamp_t(int, ilog2(space) - 15,
 				      0, TCP_MAX_WSCALE);
 	}
 	/* Set the clamp no higher than max representable value */
-	(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
+	WRITE_ONCE(*__window_clamp,
+		   min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp));
 }
 EXPORT_SYMBOL(tcp_select_initial_window);
 
@@ -1499,18 +1503,22 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 }
 
 /* Initialize TSO segments for a packet. */
-static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
+static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
 {
+	int tso_segs;
+
 	if (skb->len <= mss_now) {
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
 		 */
-		tcp_skb_pcount_set(skb, 1);
 		TCP_SKB_CB(skb)->tcp_gso_size = 0;
-	} else {
-		tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
-		TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
+		tcp_skb_pcount_set(skb, 1);
+		return 1;
 	}
+	TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
+	tso_segs = DIV_ROUND_UP(skb->len, mss_now);
+	tcp_skb_pcount_set(skb, tso_segs);
+	return tso_segs;
 }
 
 /* Pcount in the middle of the write queue got changed, we need to do various
@@ -2070,16 +2078,10 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
 /* Can at least one segment of SKB be sent right now, according to the
  * congestion window rules?  If so, return how many segments are allowed.
  */
-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
-					 const struct sk_buff *skb)
+static u32 tcp_cwnd_test(const struct tcp_sock *tp)
 {
 	u32 in_flight, cwnd, halfcwnd;
 
-	/* Don't be strict about the congestion window for the final FIN.  */
-	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
-	    tcp_skb_pcount(skb) == 1)
-		return 1;
-
 	in_flight = tcp_packets_in_flight(tp);
 	cwnd = tcp_snd_cwnd(tp);
 	if (in_flight >= cwnd)
@@ -2100,10 +2102,9 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
 {
 	int tso_segs = tcp_skb_pcount(skb);
 
-	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
-		tcp_set_skb_tso_segs(skb, mss_now);
-		tso_segs = tcp_skb_pcount(skb);
-	}
+	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now))
+		return tcp_set_skb_tso_segs(skb, mss_now);
+
 	return tso_segs;
 }
 
@@ -2403,6 +2404,21 @@ commit:
 	return 0;
 }
 
+/* tcp_mtu_probe() and tcp_grow_skb() can both eat an skb (src) if
+ * all its payload was moved to another one (dst).
+ * Make sure to transfer tcp_flags, eor, and tstamp.
+ */
+static void tcp_eat_one_skb(struct sock *sk,
+			    struct sk_buff *dst,
+			    struct sk_buff *src)
+{
+	TCP_SKB_CB(dst)->tcp_flags |= TCP_SKB_CB(src)->tcp_flags;
+	TCP_SKB_CB(dst)->eor = TCP_SKB_CB(src)->eor;
+	tcp_skb_collapse_tstamp(dst, src);
+	tcp_unlink_write_queue(src, sk);
+	tcp_wmem_free_skb(sk, src);
+}
+
 /* Create a new MTU probe if we are ready.
  * MTU probe is regularly attempting to increase the path MTU by
  * deliberately sending larger packets.  This discovers routing
@@ -2508,16 +2524,7 @@ static int tcp_mtu_probe(struct sock *sk)
 		copy = min_t(int, skb->len, probe_size - len);
 
 		if (skb->len <= copy) {
-			/* We've eaten all the data from this skb.
-			 * Throw it away. */
-			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
-			/* If this is the last SKB we copy and eor is set
-			 * we need to propagate it to the new skb.
-			 */
-			TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
-			tcp_skb_collapse_tstamp(nskb, skb);
-			tcp_unlink_write_queue(skb, sk);
-			tcp_wmem_free_skb(sk, skb);
+			tcp_eat_one_skb(sk, nskb, skb);
 		} else {
 			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
 						   ~(TCPHDR_FIN|TCPHDR_PSH);
@@ -2683,6 +2690,35 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
 		tcp_chrono_set(tp, TCP_CHRONO_BUSY);
 }
 
+/* First skb in the write queue is smaller than ideal packet size.
+ * Check if we can move payload from the second skb in the queue.
+ */
+static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount)
+{
+	struct sk_buff *next_skb = skb->next;
+	unsigned int nlen;
+
+	if (tcp_skb_is_last(sk, skb))
+		return;
+
+	if (!tcp_skb_can_collapse(skb, next_skb))
+		return;
+
+	nlen = min_t(u32, amount, next_skb->len);
+	if (!nlen || !skb_shift(skb, next_skb, nlen))
+		return;
+
+	TCP_SKB_CB(skb)->end_seq += nlen;
+	TCP_SKB_CB(next_skb)->seq += nlen;
+
+	if (!next_skb->len) {
+		/* In case FIN is set, we need to update end_seq */
+		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
+
+		tcp_eat_one_skb(sk, skb, next_skb);
+	}
+}
+
 /* This routine writes packets to the network.  It advances the
  * send_head.  This happens as incoming acks open up the remote
  * window for us.
@@ -2703,10 +2739,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	unsigned int tso_segs, sent_pkts;
-	int cwnd_quota;
+	u32 cwnd_quota, max_segs;
 	int result;
 	bool is_cwnd_limited = false, is_rwnd_limited = false;
-	u32 max_segs;
 
 	sent_pkts = 0;
 
@@ -2724,6 +2759,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	max_segs = tcp_tso_segs(sk, mss_now);
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
+		int missing_bytes;
 
 		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
 			/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
@@ -2737,10 +2773,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		if (tcp_pacing_check(sk))
 			break;
 
-		tso_segs = tcp_init_tso_segs(skb, mss_now);
-		BUG_ON(!tso_segs);
-
-		cwnd_quota = tcp_cwnd_test(tp, skb);
+		cwnd_quota = tcp_cwnd_test(tp);
 		if (!cwnd_quota) {
 			if (push_one == 2)
 				/* Force out a loss probe pkt. */
@@ -2748,6 +2781,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			else
 				break;
 		}
+		cwnd_quota = min(cwnd_quota, max_segs);
+		missing_bytes = cwnd_quota * mss_now - skb->len;
+		if (missing_bytes > 0)
+			tcp_grow_skb(sk, skb, missing_bytes);
+
+		tso_segs = tcp_set_skb_tso_segs(skb, mss_now);
 
 		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
 			is_rwnd_limited = true;
@@ -2769,9 +2808,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		limit = mss_now;
 		if (tso_segs > 1 && !tcp_urg_mode(tp))
 			limit = tcp_mss_split_point(sk, skb, mss_now,
-						    min_t(unsigned int,
-							  cwnd_quota,
-							  max_segs),
+						    cwnd_quota,
 						    nonagle);
 
 		if (skb->len > limit &&
@@ -3387,11 +3424,6 @@ start:
 		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 	}
 
-	/* To avoid taking spuriously low RTT samples based on a timestamp
-	 * for a transmit that never happened, always mark EVER_RETRANS
-	 */
-	TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
-
 	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
 		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
 				  TCP_SKB_CB(skb)->seq, segs, err);
@@ -3401,6 +3433,12 @@ start:
 	} else if (err != -EBUSY) {
 		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
 	}
+
+	/* To avoid taking spuriously low RTT samples based on a timestamp
+	 * for a transmit that never happened, always mark EVER_RETRANS
+	 */
+	TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+
 	return err;
 }
 
@@ -3563,7 +3601,9 @@ void tcp_send_fin(struct sock *sk)
 			return;
 		}
 	} else {
-		skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
+		skb = alloc_skb_fclone(MAX_TCP_HEADER,
+				       sk_gfp_mask(sk, GFP_ATOMIC |
+						       __GFP_NOWARN));
 		if (unlikely(!skb))
 			return;
 
@@ -3583,7 +3623,8 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by RFC 2525, section 2.17.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+			   enum sk_rst_reason reason)
 {
 	struct sk_buff *skb;
 
@@ -3608,7 +3649,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 	/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
 	 * skb here is different to the troublesome skb, so use NULL
 	 */
-	trace_tcp_send_reset(sk, NULL);
+	trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED);
 }
 
 /* Send a crossed SYN-ACK during socket establishment.
@@ -3855,7 +3896,7 @@ static void tcp_connect_init(struct sock *sk)
 	tcp_ca_dst_init(sk, dst);
 
 	if (!tp->window_clamp)
-		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
+		WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW));
 	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	tcp_initialize_rcv_mss(sk);
@@ -3863,7 +3904,7 @@ static void tcp_connect_init(struct sock *sk)
 	/* limit the window selection if the user enforce a smaller rx buffer */
 	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
 	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
-		tp->window_clamp = tcp_full_space(sk);
+		WRITE_ONCE(tp->window_clamp, tcp_full_space(sk));
 
 	rcv_wnd = tcp_rwnd_init_bpf(sk);
 	if (rcv_wnd == 0)
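
The centerpiece of this diff is the new tcp_grow_skb() step in tcp_write_xmit(): the congestion-window quota is clamped to max_segs up front and, when the head skb carries fewer than cwnd_quota * mss_now bytes, the missing payload is shifted in from the next skb so that fewer, fuller TSO packets are built. Below is a minimal userspace sketch of that coalescing step; struct buf and grow_head() are hypothetical stand-ins for sk_buff and skb_shift(), not kernel API, and the real code moves page fragments instead of copying bytes.

#include <stdio.h>
#include <string.h>

/* Stand-in for sk_buff: a flat payload buffer in a singly linked queue. */
struct buf {
	char data[4096];
	int len;
	struct buf *next;
};

/* Move up to "amount" bytes of payload from the next buffer into the
 * head, mirroring what tcp_grow_skb() asks skb_shift() to do.
 */
static void grow_head(struct buf *head, int amount)
{
	struct buf *next = head->next;
	int nlen;

	if (!next || amount <= 0)
		return;
	nlen = amount < next->len ? amount : next->len;
	memcpy(head->data + head->len, next->data, nlen);
	head->len += nlen;
	memmove(next->data, next->data + nlen, next->len - nlen);
	next->len -= nlen;
	if (!next->len)
		head->next = next->next; /* fully drained: "eat" the buffer */
}

int main(void)
{
	struct buf b = { .len = 2000, .next = NULL };
	struct buf a = { .len = 1000, .next = &b };
	int mss = 1460, cwnd_quota = 2;           /* segments cwnd allows */
	int missing = cwnd_quota * mss - a.len;   /* 2920 - 1000 = 1920 */

	if (missing > 0)
		grow_head(&a, missing);
	printf("head=%d next=%d\n", a.len, b.len); /* head=2920 next=80 */
	return 0;
}

The kernel helper is more careful than this sketch: it bails out unless tcp_skb_can_collapse() allows merging, and when the donor skb is fully drained it transfers tcp_flags, eor and timestamps through the new tcp_eat_one_skb() before freeing it.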