Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r-- | net/ipv4/tcp_output.c | 388
1 file changed, 211 insertions(+), 177 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d07e34f8e309..730bc44dbad9 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -45,6 +45,21 @@
 
 #include <trace/events/tcp.h>
 
+/* Refresh clocks of a TCP socket,
+ * ensuring monotonically increasing values.
+ */
+void tcp_mstamp_refresh(struct tcp_sock *tp)
+{
+        u64 val = tcp_clock_ns();
+
+        if (val > tp->tcp_clock_cache)
+                tp->tcp_clock_cache = val;
+
+        val = div_u64(val, NSEC_PER_USEC);
+        if (val > tp->tcp_mstamp)
+                tp->tcp_mstamp = val;
+}
+
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                            int push_one, gfp_t gfp);
 
@@ -160,25 +175,23 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 }
 
 /* Account for an ACK we sent. */
-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
+static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
+                                      u32 rcv_nxt)
 {
-        tcp_dec_quickack_mode(sk, pkts);
-        inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
-}
-
+        struct tcp_sock *tp = tcp_sk(sk);
 
-u32 tcp_default_init_rwnd(u32 mss)
-{
-        /* Initial receive window should be twice of TCP_INIT_CWND to
-         * enable proper sending of new unsent data during fast recovery
-         * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
-         * limit when mss is larger than 1460.
-         */
-        u32 init_rwnd = TCP_INIT_CWND * 2;
+        if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) {
+                NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
+                              tp->compressed_ack - TCP_FASTRETRANS_THRESH);
+                tp->compressed_ack = TCP_FASTRETRANS_THRESH;
+                if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
+                        __sock_put(sk);
+        }
 
-        if (mss > 1460)
-                init_rwnd = max((1460 * init_rwnd) / mss, 2U);
-        return init_rwnd;
+        if (unlikely(rcv_nxt != tp->rcv_nxt))
+                return;  /* Special ACK sent by DCTCP to reflect ECN */
+        tcp_dec_quickack_mode(sk, pkts);
+        inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
 
 /* Determine a window scaling and initial window to offer.
@@ -215,26 +228,20 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
 	if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
 		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
 	else
-		(*rcv_wnd) = space;
+		(*rcv_wnd) = min_t(u32, space, U16_MAX);
 
-	(*rcv_wscale) = 0;
+	if (init_rcv_wnd)
+		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
+
+	*rcv_wscale = 0;
 	if (wscale_ok) {
 		/* Set window scaling on max possible window */
 		space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
 		space = max_t(u32, space, sysctl_rmem_max);
 		space = min_t(u32, space, *window_clamp);
-		while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
-			space >>= 1;
-			(*rcv_wscale)++;
-		}
-	}
-
-	if (mss > (1 << *rcv_wscale)) {
-		if (!init_rcv_wnd) /* Use default unless specified otherwise */
-			init_rcv_wnd = tcp_default_init_rwnd(mss);
-		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
+		*rcv_wscale = clamp_t(int, ilog2(space) - 15,
+				      0, TCP_MAX_WSCALE);
 	}
-
 	/* Set the clamp no higher than max representable value */
 	(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
 }
@@ -585,14 +592,16 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
 	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
 
+	*md5 = NULL;
 #ifdef CONFIG_TCP_MD5SIG
-	*md5 = tp->af_specific->md5_lookup(sk, sk);
-	if (*md5) {
-		opts->options |= OPTION_MD5;
-		remaining -= TCPOLEN_MD5SIG_ALIGNED;
+	if (static_key_false(&tcp_md5_needed) &&
+	    rcu_access_pointer(tp->md5sig_info)) {
+		*md5 = tp->af_specific->md5_lookup(sk, sk);
+		if (*md5) {
+			opts->options |= OPTION_MD5;
+			remaining -= TCPOLEN_MD5SIG_ALIGNED;
+		}
 	}
-#else
-	*md5 = NULL;
 #endif
 
 	/* We always get an MSS option. The option bytes which will be seen in
@@ -720,14 +729,16 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 
 	opts->options = 0;
 
+	*md5 = NULL;
 #ifdef CONFIG_TCP_MD5SIG
-	*md5 = tp->af_specific->md5_lookup(sk, sk);
-	if (unlikely(*md5)) {
-		opts->options |= OPTION_MD5;
-		size += TCPOLEN_MD5SIG_ALIGNED;
+	if (static_key_false(&tcp_md5_needed) &&
+	    rcu_access_pointer(tp->md5sig_info)) {
+		*md5 = tp->af_specific->md5_lookup(sk, sk);
+		if (*md5) {
+			opts->options |= OPTION_MD5;
+			size += TCPOLEN_MD5SIG_ALIGNED;
+		}
 	}
-#else
-	*md5 = NULL;
 #endif
 
 	if (likely(tp->rx_opt.tstamp_ok)) {
@@ -772,7 +783,7 @@ struct tsq_tasklet {
 };
 static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
 
-static void tcp_tsq_handler(struct sock *sk)
+static void tcp_tsq_write(struct sock *sk)
 {
 	if ((1 << sk->sk_state) &
 	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
@@ -789,6 +800,16 @@ static void tcp_tsq_handler(struct sock *sk)
 			       0, GFP_ATOMIC);
 	}
 }
+
+static void tcp_tsq_handler(struct sock *sk)
+{
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk))
+		tcp_tsq_write(sk);
+	else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+		sock_hold(sk);
+	bh_unlock_sock(sk);
+}
 /*
  * One tasklet per cpu tries to send more skbs.
  * We run in tasklet context but need to disable irqs when
@@ -816,16 +837,7 @@ static void tcp_tasklet_func(unsigned long data)
 		smp_mb__before_atomic();
 		clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
 
-		if (!sk->sk_lock.owned &&
-		    test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
-			bh_lock_sock(sk);
-			if (!sock_owned_by_user(sk)) {
-				clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
-				tcp_tsq_handler(sk);
-			}
-			bh_unlock_sock(sk);
-		}
-
+		tcp_tsq_handler(sk);
 		sk_free(sk);
 	}
 }
@@ -853,9 +865,10 @@ void tcp_release_cb(struct sock *sk)
 		nflags = flags & ~TCP_DEFERRED_ALL;
 	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
 
-	if (flags & TCPF_TSQ_DEFERRED)
-		tcp_tsq_handler(sk);
-
+	if (flags & TCPF_TSQ_DEFERRED) {
+		tcp_tsq_write(sk);
+		__sock_put(sk);
+	}
 	/* Here begins the tricky part :
 	 * We are called from release_sock() with :
 	 * 1) BH disabled
@@ -929,7 +942,7 @@ void tcp_wfree(struct sk_buff *skb)
 		if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
 			goto out;
 
-		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
 		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
 		if (nval != oval)
 			continue;
@@ -948,75 +961,42 @@ out:
 	sk_free(sk);
 }
 
-/* Note: Called under hard irq.
- * We can not call TCP stack right away.
+/* Note: Called under soft irq.
+ * We can call TCP stack right away, unless socket is owned by user.
  */
 enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
 {
 	struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
 	struct sock *sk = (struct sock *)tp;
-	unsigned long nval, oval;
 
-	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
-		struct tsq_tasklet *tsq;
-		bool empty;
+	tcp_tsq_handler(sk);
+	sock_put(sk);
 
-		if (oval & TSQF_QUEUED)
-			break;
-
-		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
-		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
-		if (nval != oval)
-			continue;
-
-		if (!refcount_inc_not_zero(&sk->sk_wmem_alloc))
-			break;
-		/* queue this socket to tasklet queue */
-		tsq = this_cpu_ptr(&tsq_tasklet);
-		empty = list_empty(&tsq->head);
-		list_add(&tp->tsq_node, &tsq->head);
-		if (empty)
-			tasklet_schedule(&tsq->tasklet);
-		break;
-	}
 	return HRTIMER_NORESTART;
 }
 
-/* BBR congestion control needs pacing.
- * Same remark for SO_MAX_PACING_RATE.
- * sch_fq packet scheduler is efficiently handling pacing,
- * but is not always installed/used.
- * Return true if TCP stack should pace packets itself.
- */
-static bool tcp_needs_internal_pacing(const struct sock *sk)
-{
-	return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
-}
-
-static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
+				      u64 prior_wstamp)
 {
-	u64 len_ns;
-	u32 rate;
+	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (!tcp_needs_internal_pacing(sk))
-		return;
-	rate = sk->sk_pacing_rate;
-	if (!rate || rate == ~0U)
-		return;
+	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
+	if (sk->sk_pacing_status != SK_PACING_NONE) {
+		unsigned long rate = sk->sk_pacing_rate;
 
-	/* Should account for header sizes as sch_fq does,
-	 * but lets make things simple.
-	 */
-	len_ns = (u64)skb->len * NSEC_PER_SEC;
-	do_div(len_ns, rate);
-	hrtimer_start(&tcp_sk(sk)->pacing_timer,
-		      ktime_add_ns(ktime_get(), len_ns),
-		      HRTIMER_MODE_ABS_PINNED);
-}
+		/* Original sch_fq does not pace first 10 MSS
+		 * Note that tp->data_segs_out overflows after 2^32 packets,
+		 * this is a minor annoyance.
+		 */
+		if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
+			u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
+			u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
 
-static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
-{
-	skb->skb_mstamp = tp->tcp_mstamp;
+			/* take into account OS jitter */
+			len_ns -= min_t(u64, len_ns / 2, credit);
+			tp->tcp_wstamp_ns += len_ns;
+		}
+	}
 	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
 }
@@ -1031,8 +1011,8 @@ static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
  * We are working here with either a clone of the original
  * SKB, or a fresh unique copy made by the retransmit engine.
  */
-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
-			    gfp_t gfp_mask)
+static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
+			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_sock *inet;
@@ -1043,6 +1023,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	struct sk_buff *oskb = NULL;
 	struct tcp_md5sig_key *md5;
 	struct tcphdr *th;
+	u64 prior_wstamp;
 	int err;
 
 	BUG_ON(!skb || !tcp_skb_pcount(skb));
@@ -1063,7 +1044,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		if (unlikely(!skb))
 			return -ENOBUFS;
 	}
-	skb->skb_mstamp = tp->tcp_mstamp;
+
+	prior_wstamp = tp->tcp_wstamp_ns;
+	tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
+
+	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
 
 	inet = inet_sk(sk);
 	tcb = TCP_SKB_CB(skb);
@@ -1078,7 +1063,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	/* if no packet is in qdisc/device queue, then allow XPS to select
 	 * another queue. We can be called from tcp_tsq_handler()
-	 * which holds one reference to sk_wmem_alloc.
+	 * which holds one reference to sk.
 	 *
 	 * TODO: Ideally, in-flight pure ACK packets should not matter here.
 	 * One way to get this would be to set skb->truesize = 2 on them.
@@ -1108,7 +1093,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	th->source		= inet->inet_sport;
 	th->dest		= inet->inet_dport;
 	th->seq			= htonl(tcb->seq);
-	th->ack_seq		= htonl(tp->rcv_nxt);
+	th->ack_seq		= htonl(rcv_nxt);
 	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
 					tcb->tcp_flags);
 
@@ -1149,12 +1134,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	icsk->icsk_af_ops->send_check(sk, skb);
 
 	if (likely(tcb->tcp_flags & TCPHDR_ACK))
-		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
+		tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
 
 	if (skb->len != tcp_header_size) {
 		tcp_event_data_sent(tp, sk);
 		tp->data_segs_out += tcp_skb_pcount(skb);
-		tcp_internal_pacing(sk, skb);
+		tp->bytes_sent += skb->len - tcp_header_size;
 	}
 
 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -1166,8 +1151,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
 	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
 
-	/* Our usage of tstamp should remain private */
-	skb->tstamp = 0;
+	/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
 
 	/* Cleanup our debris for IP stacks */
 	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
@@ -1180,12 +1164,19 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		err = net_xmit_eval(err);
 	}
 	if (!err && oskb) {
-		tcp_update_skb_after_send(tp, oskb);
+		tcp_update_skb_after_send(sk, oskb, prior_wstamp);
 		tcp_rate_skb_sent(sk, oskb);
 	}
 	return err;
 }
 
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+			    gfp_t gfp_mask)
+{
+	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
+				  tcp_sk(sk)->rcv_nxt);
+}
+
 /* This routine just queues the buffer for sending.
  *
  * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
@@ -1708,8 +1699,9 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 {
 	u32 bytes, segs;
 
-	bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
-		    sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+	bytes = min_t(unsigned long,
+		      sk->sk_pacing_rate >> sk->sk_pacing_shift,
+		      sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
 
 	/* Goal is to send at least one packet per ms,
 	 * not one big TSO packet every 100 ms.
@@ -1912,24 +1904,27 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
  * This algorithm is from John Heffner.
  */
 static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
-				 bool *is_cwnd_limited, u32 max_segs)
+				 bool *is_cwnd_limited,
+				 bool *is_rwnd_limited,
+				 u32 max_segs)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	u32 age, send_win, cong_win, limit, in_flight;
+	u32 send_win, cong_win, limit, in_flight;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *head;
 	int win_divisor;
-
-	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
-		goto send_now;
+	s64 delta;
 
 	if (icsk->icsk_ca_state >= TCP_CA_Recovery)
 		goto send_now;
 
 	/* Avoid bursty behavior by allowing defer
-	 * only if the last write was recent.
+	 * only if the last write was recent (1 ms).
+	 * Note that tp->tcp_wstamp_ns can be in the future if we have
+	 * packets waiting in a qdisc or device for EDT delivery.
 	 */
-	if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
+	delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
+	if (delta > 0)
 		goto send_now;
 
 	in_flight = tcp_packets_in_flight(tp);
@@ -1976,15 +1971,33 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	head = tcp_rtx_queue_head(sk);
 	if (!head)
 		goto send_now;
-	age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
+	delta = tp->tcp_clock_cache - head->tstamp;
 	/* If next ACK is likely to come too late (half srtt), do not defer */
-	if (age < (tp->srtt_us >> 4))
+	if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
 		goto send_now;
 
-	/* Ok, it looks like it is advisable to defer. */
+	/* Ok, it looks like it is advisable to defer.
+	 * Three cases are tracked :
+	 * 1) We are cwnd-limited
+	 * 2) We are rwnd-limited
+	 * 3) We are application limited.
+	 */
+	if (cong_win < send_win) {
+		if (cong_win <= skb->len) {
+			*is_cwnd_limited = true;
+			return true;
+		}
+	} else {
+		if (send_win <= skb->len) {
+			*is_rwnd_limited = true;
+			return true;
+		}
+	}
 
-	if (cong_win < send_win && cong_win <= skb->len)
-		*is_cwnd_limited = true;
+	/* If this packet won't get more data, do not wait. */
+	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
+	    TCP_SKB_CB(skb)->eor)
+		goto send_now;
 
 	return true;
 
@@ -2182,10 +2195,23 @@ static int tcp_mtu_probe(struct sock *sk)
 	return -1;
 }
 
-static bool tcp_pacing_check(const struct sock *sk)
+static bool tcp_pacing_check(struct sock *sk)
 {
-	return tcp_needs_internal_pacing(sk) &&
-	       hrtimer_active(&tcp_sk(sk)->pacing_timer);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tcp_needs_internal_pacing(sk))
+		return false;
+
+	if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
+		return false;
+
+	if (!hrtimer_is_queued(&tp->pacing_timer)) {
+		hrtimer_start(&tp->pacing_timer,
+			      ns_to_ktime(tp->tcp_wstamp_ns),
+			      HRTIMER_MODE_ABS_PINNED_SOFT);
+		sock_hold(sk);
+	}
+	return true;
 }
 
 /* TCP Small Queues :
@@ -2202,11 +2228,14 @@ static bool tcp_pacing_check(const struct sock *sk)
 static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 				  unsigned int factor)
 {
-	unsigned int limit;
+	unsigned long limit;
 
-	limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
-	limit = min_t(u32, limit,
-		      sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
+	limit = max_t(unsigned long,
+		      2 * skb->truesize,
+		      sk->sk_pacing_rate >> sk->sk_pacing_shift);
+	if (sk->sk_pacing_status == SK_PACING_NONE)
+		limit = min_t(unsigned long, limit,
+			      sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
 	limit <<= factor;
 
 	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
@@ -2314,18 +2343,19 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
+		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+			/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
+			skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
+			list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+			goto repair; /* Skip network transmission */
+		}
+
 		if (tcp_pacing_check(sk))
 			break;
 
 		tso_segs = tcp_init_tso_segs(skb, mss_now);
 		BUG_ON(!tso_segs);
 
-		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
-			/* "skb_mstamp" is used as a start point for the retransmit timer */
-			tcp_update_skb_after_send(tp, skb);
-			goto repair; /* Skip network transmission */
-		}
-
 		cwnd_quota = tcp_cwnd_test(tp, skb);
 		if (!cwnd_quota) {
 			if (push_one == 2)
@@ -2348,7 +2378,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		} else {
 			if (!push_one &&
 			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
-						 max_segs))
+						 &is_rwnd_limited, max_segs))
 				break;
 		}
@@ -2365,8 +2395,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 					  skb, limit, mss_now, gfp)))
 			break;
 
-		if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
-			clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
 		if (tcp_small_queue_check(sk, skb, 0))
 			break;
 
@@ -2449,8 +2477,8 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
 	if (rto_delta_us > 0)
 		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
 
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
-				  TCP_RTO_MAX);
+	tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
+			     TCP_RTO_MAX, NULL);
 	return true;
 }
 
@@ -2488,15 +2516,18 @@ void tcp_send_loss_probe(struct sock *sk)
 		goto rearm_timer;
 	}
 	skb = skb_rb_last(&sk->tcp_rtx_queue);
+	if (unlikely(!skb)) {
+		WARN_ONCE(tp->packets_out,
+			  "invalid inflight: %u state %u cwnd %u mss %d\n",
+			  tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
+		inet_csk(sk)->icsk_pending = 0;
+		return;
+	}
 
 	/* At most one outstanding TLP retransmission. */
 	if (tp->tlp_high_seq)
 		goto rearm_timer;
 
-	/* Retransmit last segment. */
-	if (WARN_ON(!skb))
-		goto rearm_timer;
-
 	if (skb_still_in_host_queue(sk, skb))
 		goto rearm_timer;
 
@@ -2710,9 +2741,8 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *next_skb = skb_rb_next(skb);
-	int skb_size, next_skb_size;
+	int next_skb_size;
 
-	skb_size = skb->len;
 	next_skb_size = next_skb->len;
 
 	BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
@@ -2883,6 +2913,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
 	tp->total_retrans += segs;
+	tp->bytes_retrans += skb->len;
 
 	/* make sure skb->data is aligned on arches that require it
 	 * and check if ack-trimming & collapsing extended the headroom
@@ -2899,7 +2930,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		}
 		tcp_skb_tsorted_restore(skb);
 		if (!err) {
-			tcp_update_skb_after_send(tp, skb);
+			tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
 			tcp_rate_skb_sent(sk, skb);
 		}
 	} else {
@@ -2914,7 +2945,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
 		trace_tcp_retransmit_skb(sk, skb);
 	} else if (err != -EBUSY) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
+		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
 	}
 	return err;
 }
@@ -3014,9 +3045,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		if (skb == rtx_head &&
 		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
-			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-						  inet_csk(sk)->icsk_rto,
-						  TCP_RTO_MAX);
+			tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					     inet_csk(sk)->icsk_rto,
+					     TCP_RTO_MAX,
+					     skb);
 	}
 }
 
@@ -3217,10 +3249,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
-		skb->skb_mstamp = cookie_init_timestamp(req);
+		skb->skb_mstamp_ns = cookie_init_timestamp(req);
 	else
 #endif
-		skb->skb_mstamp = tcp_clock_us();
+		skb->skb_mstamp_ns = tcp_clock_ns();
 
 #ifdef CONFIG_TCP_MD5SIG
 	rcu_read_lock();
@@ -3436,7 +3468,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 
 	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
 
-	syn->skb_mstamp = syn_data->skb_mstamp;
+	syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
 
 	/* Now full SYN+DATA was cloned and sent (or not),
 	 * remove the SYN from the original skb (syn_data)
@@ -3533,8 +3565,6 @@ void tcp_send_delayed_ack(struct sock *sk)
 	int ato = icsk->icsk_ack.ato;
 	unsigned long timeout;
 
-	tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
-
 	if (ato > TCP_DELACK_MIN) {
 		const struct tcp_sock *tp = tcp_sk(sk);
 		int max_ato = HZ / 2;
@@ -3583,7 +3613,7 @@ void tcp_send_delayed_ack(struct sock *sk)
 }
 
 /* This routine sends an ack and also updates the window. */
-void tcp_send_ack(struct sock *sk)
+void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
 {
 	struct sk_buff *buff;
 
@@ -3591,8 +3621,6 @@ void tcp_send_ack(struct sock *sk)
 	if (sk->sk_state == TCP_CLOSE)
 		return;
 
-	tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
-
 	/* We are not putting this on the write queue, so
 	 * tcp_transmit_skb() will set the ownership to this
 	 * sock.
@@ -3618,9 +3646,14 @@ void tcp_send_ack(struct sock *sk)
 	skb_set_tcp_pure_ack(buff);
 
 	/* Send it off, this clears delayed acks for us. */
-	tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
+	__tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
+}
+EXPORT_SYMBOL_GPL(__tcp_send_ack);
+
+void tcp_send_ack(struct sock *sk)
+{
+	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
 }
-EXPORT_SYMBOL_GPL(tcp_send_ack);
 
 /* This routine sends a packet with an out of date sequence
  * number. It assumes the other end will try to ack it.
@@ -3745,9 +3778,10 @@ void tcp_send_probe0(struct sock *sk)
 		icsk->icsk_probes_out = 1;
 		probe_max = TCP_RESOURCE_PROBE_INTERVAL;
 	}
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-				  tcp_probe0_when(sk, probe_max),
-				  TCP_RTO_MAX);
+	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+			     tcp_probe0_when(sk, probe_max),
+			     TCP_RTO_MAX,
+			     NULL);
 }
 
 int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
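Editor's note on the tcp_select_initial_window() hunk: the deleted shift loop and the new clamp_t(int, ilog2(space) - 15, 0, TCP_MAX_WSCALE) expression select the same receive window scale (the smallest shift that fits the window into 16 bits, capped at 14). A user-space sketch can check this equivalence; it is illustrative only, not kernel code: wscale_old()/wscale_new() are made-up names, and __builtin_clz() stands in for the kernel's ilog2().

/* Verify: old shift loop == new ilog2()-based formula */
#include <stdint.h>
#include <stdio.h>

#define U16_MAX         0xffffU
#define TCP_MAX_WSCALE  14      /* limit from RFC 7323 */

static int wscale_old(uint32_t space)
{
        int wscale = 0;

        while (space > U16_MAX && wscale < TCP_MAX_WSCALE) {
                space >>= 1;
                wscale++;
        }
        return wscale;
}

static int wscale_new(uint32_t space)
{
        /* ilog2(space) == index of highest set bit, valid for space >= 1 */
        int wscale = (31 - __builtin_clz(space)) - 15;

        if (wscale < 0)
                wscale = 0;
        if (wscale > TCP_MAX_WSCALE)
                wscale = TCP_MAX_WSCALE;
        return wscale;
}

int main(void)
{
        uint32_t space;

        for (space = 1; space < 0x80000000U; space += 12345)
                if (wscale_old(space) != wscale_new(space)) {
                        printf("mismatch at space=%u\n", space);
                        return 1;
                }
        printf("shift loop and ilog2() formula agree\n");
        return 0;
}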
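The tcp_update_skb_after_send() hunk is the core of the move from hrtimer-per-skb internal pacing to the earliest departure time (EDT) model: instead of arming a timer after each transmit, the socket advances a departure clock (tp->tcp_wstamp_ns) by skb->len * NSEC_PER_SEC / rate, crediting back time that already elapsed, capped at half a pacing slot ("OS jitter"). The standalone sketch below mirrors that arithmetic; edt_advance() and its parameter names are hypothetical, and rate is assumed non-zero, as the kernel code checks before dividing.

/* Sketch (not kernel code) of the EDT pacing arithmetic */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

/* Advance the per-socket departure clock after sending "len" bytes at
 * "rate" bytes/sec. "wstamp_ns" is the clock already pulled up to the
 * current time; "prior_wstamp" is its value before this transmit, so the
 * difference is elapsed time credited back, capped at len_ns / 2 exactly
 * as the hunk's min_t() does.
 */
static uint64_t edt_advance(uint64_t wstamp_ns, uint64_t prior_wstamp,
                            uint32_t len, uint64_t rate)
{
        uint64_t len_ns = (uint64_t)len * NSEC_PER_SEC / rate;
        uint64_t credit = wstamp_ns - prior_wstamp;

        if (credit > len_ns / 2)
                credit = len_ns / 2;
        return wstamp_ns + (len_ns - credit);
}

int main(void)
{
        /* One 1448-byte segment at 10 MB/s: next departure ~144.8 us later */
        uint64_t next = edt_advance(0, 0, 1448, 10 * 1000 * 1000);

        printf("next departure in %llu ns\n", (unsigned long long)next);
        return 0;
}

With the departure clock in place, tcp_pacing_check() in the diff only arms the (now soft-irq) pacing hrtimer when tp->tcp_wstamp_ns is ahead of tp->tcp_clock_cache, and transmission otherwise proceeds immediately.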