Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--	net/ipv4/tcp_output.c	156
1 file changed, 93 insertions(+), 63 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7820f3a7dd70..3be16727f058 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -363,15 +363,17 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
  */
 static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 {
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
 	skb->ip_summed = CHECKSUM_PARTIAL;
 	skb->csum = 0;
 
 	TCP_SKB_CB(skb)->tcp_flags = flags;
 	TCP_SKB_CB(skb)->sacked = 0;
 
-	skb_shinfo(skb)->gso_segs = 1;
-	skb_shinfo(skb)->gso_size = 0;
-	skb_shinfo(skb)->gso_type = 0;
+	shinfo->gso_segs = 1;
+	shinfo->gso_size = 0;
+	shinfo->gso_type = 0;
 
 	TCP_SKB_CB(skb)->seq = seq;
 	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
@@ -406,7 +408,7 @@ struct tcp_out_options {
  * Beware: Something in the Internet is very sensitive to the ordering of
  * TCP options, we learned this through the hard way, so be careful here.
  * Luckily we can at least blame others for their non-compliance but from
- * inter-operatibility perspective it seems that we're somewhat stuck with
+ * inter-operability perspective it seems that we're somewhat stuck with
  * the ordering which we have been using if we want to keep working with
  * those broken things (not that it currently hurts anybody as there isn't
  * particular reason why the ordering would need to be changed).
@@ -679,7 +681,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
  *
  * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
  * needs to be reallocated in a driver.
- * The invariant being skb->truesize substracted from sk->sk_wmem_alloc
+ * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
  *
  * Since transmit from skb destructor is forbidden, we use a tasklet
  * to process all sockets that eventually need to send more skbs.
@@ -696,12 +698,13 @@ static void tcp_tsq_handler(struct sock *sk)
 	if ((1 << sk->sk_state) &
 	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
 	     TCPF_CLOSE_WAIT  | TCPF_LAST_ACK))
-		tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
+		tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
+			       0, GFP_ATOMIC);
 }
 /*
- * One tasklest per cpu tries to send more skbs.
+ * One tasklet per cpu tries to send more skbs.
  * We run in tasklet context but need to disable irqs when
- * transfering tsq->head because tcp_wfree() might
+ * transferring tsq->head because tcp_wfree() might
  * interrupt us (non NAPI drivers)
  */
 static void tcp_tasklet_func(unsigned long data)
@@ -795,7 +798,7 @@ void __init tcp_tasklet_init(void)
 
 /*
  * Write buffer destructor automatically called from kfree_skb.
- * We cant xmit new skbs from this context, as we might already
+ * We can't xmit new skbs from this context, as we might already
  * hold qdisc lock.
  */
 void tcp_wfree(struct sk_buff *skb)
@@ -986,6 +989,8 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
 				 unsigned int mss_now)
 {
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
 	/* Make sure we own this skb before messing gso_size/gso_segs */
 	WARN_ON_ONCE(skb_cloned(skb));
 
@@ -993,13 +998,13 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
 		 */
-		skb_shinfo(skb)->gso_segs = 1;
-		skb_shinfo(skb)->gso_size = 0;
-		skb_shinfo(skb)->gso_type = 0;
+		shinfo->gso_segs = 1;
+		shinfo->gso_size = 0;
+		shinfo->gso_type = 0;
 	} else {
-		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
-		skb_shinfo(skb)->gso_size = mss_now;
-		skb_shinfo(skb)->gso_type = sk->sk_gso_type;
+		shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
+		shinfo->gso_size = mss_now;
+		shinfo->gso_type = sk->sk_gso_type;
 	}
 }
 
@@ -1146,6 +1151,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
  */
 static void __pskb_trim_head(struct sk_buff *skb, int len)
 {
+	struct skb_shared_info *shinfo;
 	int i, k, eat;
 
 	eat = min_t(int, len, skb_headlen(skb));
@@ -1157,23 +1163,24 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
 	}
 	eat = len;
 	k = 0;
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+	shinfo = skb_shinfo(skb);
+	for (i = 0; i < shinfo->nr_frags; i++) {
+		int size = skb_frag_size(&shinfo->frags[i]);
 
 		if (size <= eat) {
 			skb_frag_unref(skb, i);
 			eat -= size;
 		} else {
-			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+			shinfo->frags[k] = shinfo->frags[i];
 			if (eat) {
-				skb_shinfo(skb)->frags[k].page_offset += eat;
-				skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
+				shinfo->frags[k].page_offset += eat;
+				skb_frag_size_sub(&shinfo->frags[k], eat);
 				eat = 0;
 			}
 			k++;
 		}
 	}
-	skb_shinfo(skb)->nr_frags = k;
+	shinfo->nr_frags = k;
 
 	skb_reset_tail_pointer(skb);
 	skb->data_len -= len;
@@ -1378,23 +1385,51 @@ static void tcp_cwnd_validate(struct sock *sk)
 	}
 }
 
-/* Returns the portion of skb which can be sent right away without
- * introducing MSS oddities to segment boundaries. In rare cases where
- * mss_now != mss_cache, we will request caller to create a small skb
- * per input skb which could be mostly avoided here (if desired).
- *
- * We explicitly want to create a request for splitting write queue tail
- * to a small skb for Nagle purposes while avoiding unnecessary modulos,
- * thus all the complexity (cwnd_len is always MSS multiple which we
- * return whenever allowed by the other factors). Basically we need the
- * modulo only when the receiver window alone is the limiting factor or
- * when we would be allowed to send the split-due-to-Nagle skb fully.
+/* Minshall's variant of the Nagle send check. */
+static bool tcp_minshall_check(const struct tcp_sock *tp)
+{
+	return after(tp->snd_sml, tp->snd_una) &&
+		!after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Update snd_sml if this skb is under mss
+ * Note that a TSO packet might end with a sub-mss segment
+ * The test is really :
+ * if ((skb->len % mss) != 0)
+ *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+ * But we can avoid doing the divide again given we already have
+ *  skb_pcount = skb->len / mss_now
  */
-static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
-					unsigned int mss_now, unsigned int max_segs)
+static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
+				const struct sk_buff *skb)
+{
+	if (skb->len < tcp_skb_pcount(skb) * mss_now)
+		tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+}
+
+/* Return false, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized. (provided by caller in %partial bool)
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ *    With Minshall's modification: all sent small packets are ACKed.
+ */
+static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
+			    unsigned int mss_now, int nonagle)
+{
+	return partial &&
+		((nonagle & TCP_NAGLE_CORK) ||
+		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
+}
+/* Returns the portion of skb which can be sent right away */
+static unsigned int tcp_mss_split_point(const struct sock *sk,
+					const struct sk_buff *skb,
+					unsigned int mss_now,
+					unsigned int max_segs,
+					int nonagle)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
-	u32 needed, window, max_len;
+	u32 partial, needed, window, max_len;
 
 	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
 	max_len = mss_now * max_segs;
@@ -1407,7 +1442,15 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b
 	if (max_len <= needed)
 		return max_len;
 
-	return needed - needed % mss_now;
+	partial = needed % mss_now;
+	/* If last segment is not a full MSS, check if Nagle rules allow us
+	 * to include this last segment in this skb.
+	 * Otherwise, we'll split the skb at last MSS boundary
+	 */
+	if (tcp_nagle_check(partial != 0, tp, mss_now, nonagle))
+		return needed - partial;
+
+	return needed;
 }
 
 /* Can at least one segment of SKB be sent right now, according to the
@@ -1447,28 +1490,6 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
 	return tso_segs;
 }
 
-/* Minshall's variant of the Nagle send check. */
-static inline bool tcp_minshall_check(const struct tcp_sock *tp)
-{
-	return after(tp->snd_sml, tp->snd_una) &&
-		!after(tp->snd_sml, tp->snd_nxt);
-}
-
-/* Return false, if packet can be sent now without violation Nagle's rules:
- * 1. It is full sized.
- * 2. Or it contains FIN. (already checked by caller)
- * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
- * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
- *    With Minshall's modification: all sent small packets are ACKed.
- */
-static inline bool tcp_nagle_check(const struct tcp_sock *tp,
-				  const struct sk_buff *skb,
-				  unsigned int mss_now, int nonagle)
-{
-	return skb->len < mss_now &&
-		((nonagle & TCP_NAGLE_CORK) ||
-		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
-}
 
 /* Return true if the Nagle test allows this packet to be
  * sent now.
@@ -1489,7 +1510,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
 	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
 		return true;
 
-	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
+	if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle))
 		return true;
 
 	return false;
@@ -1884,7 +1905,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
 		if (atomic_read(&sk->sk_wmem_alloc) > limit) {
 			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
-			break;
+			/* It is possible TX completion already happened
+			 * before we set TSQ_THROTTLED, so we must
+			 * test again the condition.
+			 * We abuse smp_mb__after_clear_bit() because
+			 * there is no smp_mb__after_set_bit() yet
+			 */
+			smp_mb__after_clear_bit();
+			if (atomic_read(&sk->sk_wmem_alloc) > limit)
+				break;
 		}
 
 		limit = mss_now;
@@ -1892,7 +1921,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			limit = tcp_mss_split_point(sk, skb, mss_now,
 						    min_t(unsigned int,
							  cwnd_quota,
-							  sk->sk_gso_max_segs));
+							  sk->sk_gso_max_segs),
+						    nonagle);
 
 		if (skb->len > limit &&
 		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
@@ -1956,7 +1986,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	/* Schedule a loss probe in 2*RTT for SACK capable connections
 	 * in Open state, that are either limited by cwnd or application.
 	 */
-	if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out ||
+	if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out ||
 	    !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
 		return false;
 
@@ -2756,7 +2786,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 EXPORT_SYMBOL(tcp_make_synack);
 
 /* Do all connect socket setups that can be done AF independent. */
-void tcp_connect_init(struct sock *sk)
+static void tcp_connect_init(struct sock *sk)
 {
	const struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
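
Note on the tcp_minshall_update() hunk above: tcp_set_skb_tso_segs() stores DIV_ROUND_UP(skb->len, mss_now) in gso_segs, so tcp_skb_pcount(skb) * mss_now is always >= skb->len, with equality exactly when the skb is a whole number of MSS-sized segments. That is why skb->len < tcp_skb_pcount(skb) * mss_now detects a sub-MSS tail without repeating the divide. A tiny standalone check of that arithmetic (illustrative values, not kernel code):

#include <assert.h>

/* Same formula as the kernel's DIV_ROUND_UP() in include/linux/kernel.h */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int mss = 1400;

	/* 2 full segments plus a 100-byte tail: pcount = 3, 3 * 1400 = 4200 > 2900 */
	assert(2900u < DIV_ROUND_UP(2900u, mss) * mss);

	/* exactly 2 full segments: pcount = 2, 2 * 1400 = 2800, no sub-MSS tail */
	assert(!(2800u < DIV_ROUND_UP(2800u, mss) * mss));

	return 0;
}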
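The refined tcp_mss_split_point() defers the decision about the sub-MSS tail to tcp_nagle_check(): the tail is chopped off at the last MSS boundary only when TCP_CORK is set, or when Nagle is enabled and a small packet is still unacknowledged (Minshall's rule); otherwise the whole chunk, tail included, is sent. The sketch below re-creates that decision in userspace; the mini_tp struct, the helper names and the sequence numbers in main() are invented for illustration, and the kernel's wraparound-safe after() comparison is simplified to plain integer compares.

#include <stdbool.h>
#include <stdio.h>

#define TCP_NAGLE_CORK 2	/* mirrors the value in include/net/tcp.h */

struct mini_tp {
	unsigned int snd_sml;	/* end seq of the last small packet sent */
	unsigned int snd_una;	/* oldest unacknowledged seq */
	unsigned int snd_nxt;	/* next seq to be sent */
	unsigned int packets_out;
};

/* Minshall: is a small packet still in flight?
 * (the kernel uses after() to cope with sequence wraparound)
 */
static bool minshall_check(const struct mini_tp *tp)
{
	return tp->snd_sml > tp->snd_una && tp->snd_sml <= tp->snd_nxt;
}

/* Same shape as tcp_nagle_check() after the patch: 'partial' says the
 * candidate chunk would end with a sub-MSS segment.
 */
static bool nagle_forbids_partial(bool partial, const struct mini_tp *tp,
				  int nonagle)
{
	return partial &&
	       ((nonagle & TCP_NAGLE_CORK) ||
		(!nonagle && tp->packets_out && minshall_check(tp)));
}

/* Portion of 'needed' bytes that may go out now, mirroring the new
 * tail handling in tcp_mss_split_point().
 */
static unsigned int split_point(unsigned int needed, unsigned int mss,
				const struct mini_tp *tp, int nonagle)
{
	unsigned int partial = needed % mss;

	if (nagle_forbids_partial(partial != 0, tp, nonagle))
		return needed - partial;	/* cut at the last MSS boundary */
	return needed;				/* send the sub-MSS tail too */
}

int main(void)
{
	struct mini_tp tp = { .snd_sml = 0, .snd_una = 1000,
			      .snd_nxt = 1000, .packets_out = 0 };

	/* Nothing small in flight: the 2900-byte chunk goes out whole. */
	printf("%u\n", split_point(2900, 1400, &tp, 0));	/* prints 2900 */

	/* A small packet is still unacked: hold back the 100-byte tail. */
	tp.packets_out = 1;
	tp.snd_sml = 1050;
	tp.snd_nxt = 1100;
	printf("%u\n", split_point(2900, 1400, &tp, 0));	/* prints 2800 */

	return 0;
}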
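The TSQ_THROTTLED hunk closes a race: a TX completion may free write memory and observe a still-clear TSQ_THROTTLED between the first sk_wmem_alloc test and the set_bit(), in which case nobody would ever reschedule the sender. Re-reading sk_wmem_alloc after a full barrier catches that window. Below is a rough userspace analogue of this publish-then-recheck pattern using C11 atomics; the flag and counter names are made up, and atomic_thread_fence() merely stands in for the kernel's smp_mb__after_clear_bit().

#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint wmem_alloc;		/* bytes queued below the socket */
static atomic_bool tsq_throttled;	/* "wake me when memory is freed" */

/* Transmit path: decide whether to stop sending for now. */
static bool xmit_should_stop(unsigned int limit)
{
	if (atomic_load(&wmem_alloc) <= limit)
		return false;

	atomic_store(&tsq_throttled, true);
	/* Full barrier: the flag must be globally visible before we look
	 * at wmem_alloc again, otherwise a completion that already freed
	 * memory (and saw the flag clear) is silently missed.
	 */
	atomic_thread_fence(memory_order_seq_cst);
	return atomic_load(&wmem_alloc) > limit;
}

/* Completion path: free memory, then wake a throttled sender. */
static void tx_completion(unsigned int freed)
{
	atomic_fetch_sub(&wmem_alloc, freed);
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_exchange(&tsq_throttled, false)) {
		/* reschedule the transmit path; the kernel queues a tasklet here */
	}
}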