author    Linus Torvalds <torvalds@linux-foundation.org>  2017-02-22 10:15:09 -0800
committer Linus Torvalds <torvalds@linux-foundation.org>  2017-02-22 10:15:09 -0800
commit    3051bf36c25d5153051704291782f8d44e744d36 (patch)
tree      72dfc8a1d12675c6f2981d13102df954b678f11b /net/ipv4/tcp_output.c
parent    1e74a2eb1f5cc7f2f2b5aa9c9eeecbcf352220a3 (diff)
parent    005c3490e9db23738d91e02788606c0fe4734723 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
 "Highlights:

   1) Support TX_RING in AF_PACKET TPACKET_V3 mode, from Sowmini Varadhan.
   2) Simplify classifier state on sk_buff in order to shrink it a bit. From Willem de Bruijn.
   3) Introduce SIPHASH and its usage for secure sequence numbers and syncookies. From Jason A. Donenfeld.
   4) Reduce CPU usage for ICMP replies we are going to limit or suppress, from Jesper Dangaard Brouer.
   5) Introduce Shared Memory Communications socket layer, from Ursula Braun.
   6) Add RACK loss detection and allow it to actually trigger fast recovery instead of just assisting after other algorithms have triggered it. From Yuchung Cheng.
   7) Add xmit_more and BQL support to mvneta driver, from Simon Guinot.
   8) skb_cow_data avoidance in esp4 and esp6, from Steffen Klassert.
   9) Export MPLS packet stats via netlink, from Robert Shearman.
  10) Significantly improve inet port bind conflict handling, especially when an application is restarted and changes its setting of reuseport. From Josef Bacik.
  11) Implement TX batching in vhost_net, from Jason Wang.
  12) Extend the dummy device so that VF (virtual function) features, such as configuration, can be more easily tested. From Phil Sutter.
  13) Avoid two atomic ops per page on x86 in bnx2x driver, from Eric Dumazet.
  14) Add new bpf MAP, implementing a longest prefix match trie. From Daniel Mack.
  15) Packet sample offloading support in mlxsw driver, from Yotam Gigi.
  16) Add new aquantia driver, from David VomLehn.
  17) Add bpf tracepoints, from Daniel Borkmann.
  18) Add support for port mirroring to b53 and bcm_sf2 drivers, from Florian Fainelli.
  19) Remove custom busy polling in many drivers; it has been done in the core networking since 4.5. From Eric Dumazet.
  20) Support XDP adjust_head in virtio_net, from John Fastabend.
  21) Fix several major holes in neighbour entry confirmation, from Julian Anastasov.
  22) Add XDP support to bnxt_en driver, from Michael Chan.
  23) VXLAN offloads for enic driver, from Govindarajulu Varadarajan.
  24) Add IPVTAP driver (IP-VLAN based tap driver) from Sainath Grandhi.
  25) Support GRO in IPSEC protocols, from Steffen Klassert"

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1764 commits)
  Revert "ath10k: Search SMBIOS for OEM board file extension"
  net: socket: fix recvmmsg not returning error from sock_error
  bnxt_en: use eth_hw_addr_random()
  bpf: fix unlocking of jited image when module ronx not set
  arch: add ARCH_HAS_SET_MEMORY config
  net: napi_watchdog() can use napi_schedule_irqoff()
  tcp: Revert "tcp: tcp_probe: use spin_lock_bh()"
  net/hsr: use eth_hw_addr_random()
  net: mvpp2: enable building on 64-bit platforms
  net: mvpp2: switch to build_skb() in the RX path
  net: mvpp2: simplify MVPP2_PRS_RI_* definitions
  net: mvpp2: fix indentation of MVPP2_EXT_GLOBAL_CTRL_DEFAULT
  net: mvpp2: remove unused register definitions
  net: mvpp2: simplify mvpp2_bm_bufs_add()
  net: mvpp2: drop useless fields in mvpp2_bm_pool and related code
  net: mvpp2: remove unused 'tx_skb' field of 'struct mvpp2_tx_queue'
  net: mvpp2: release reference to txq_cpu[] entry after unmapping
  net: mvpp2: handle too large value in mvpp2_rx_time_coal_set()
  net: mvpp2: handle too large value handling in mvpp2_rx_pkts_coal_set()
  net: mvpp2: remove useless arguments in mvpp2_rx_{pkts, time}_coal_set
  ...
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c  139
 1 file changed, 38 insertions(+), 101 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8ce50dc3ab8c..22548b5f05cb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -76,16 +76,15 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
tp->packets_out += tcp_skb_pcount(skb);
- if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
- }
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
tcp_skb_pcount(skb));
}
-/* SND.NXT, if window was not shrunk.
+/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
+ * window scaling factor due to loss of precision.
* If window has been shrunk, what should we make? It is not clear at all.
* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
* Anything in between SND.UNA...SND.UNA+SND.WND also can be already
@@ -95,7 +94,9 @@ static inline __u32 tcp_acceptable_seq(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- if (!before(tcp_wnd_end(tp), tp->snd_nxt))
+ if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
+ (tp->rx_opt.wscale_ok &&
+ ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
return tp->snd_nxt;
else
return tcp_wnd_end(tp);
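[Editorial note] For intuition on the new check: with window scaling, the receiver advertises its window in units of 1 << rcv_wscale bytes, so the computed right edge can sit slightly below snd_nxt purely from rounding, without the window having truly shrunk. A standalone sketch with illustrative values (not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t snd_nxt = 100100;	/* next sequence to send */
	uint32_t wnd_end = 100000;	/* right edge, rounded down by scaling */
	unsigned int rcv_wscale = 7;	/* window advertised in 128-byte units */

	/* An apparent shrink smaller than one scaling unit is treated
	 * as precision loss, so snd_nxt is still acceptable. */
	if (snd_nxt - wnd_end < (1u << rcv_wscale))
		printf("shrink of %u < %u bytes: keep using snd_nxt\n",
		       (unsigned)(snd_nxt - wnd_end), 1u << rcv_wscale);
	return 0;
}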
@@ -966,6 +967,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
*/
skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
+ /* If we had to use memory reserve to allocate this skb,
+ * this might cause drops if packet is looped back :
+ * Other socket might not have SOCK_MEMALLOC.
+ * Packets not looped back do not care about pfmemalloc.
+ */
+ skb->pfmemalloc = 0;
+
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -975,6 +983,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_set_hash_from_sk(skb, sk);
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+ skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
+
/* Build TCP header and checksum it. */
th = (struct tcphdr *)skb->data;
th->source = inet->inet_sport;
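[Editorial note] The skb_set_dst_pending_confirm() call ties into the neighbour-confirmation rework in this merge: the socket's sk_dst_pending_confirm flag is copied onto each transmitted skb so the dst's neighbour entry can be confirmed once the packet actually goes out. The helper itself is presumably just a trivial setter along these lines (an assumed sketch, not verified against this tree):

static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val)
{
	/* Mark the skb so the dst's neighbour entry gets confirmed
	 * when this packet is successfully transmitted. */
	skb->dst_pending_confirm = val;
}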
@@ -2289,8 +2299,6 @@ bool tcp_schedule_loss_probe(struct sock *sk)
u32 timeout, tlp_time_stamp, rto_time_stamp;
u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
- if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
- return false;
/* No consecutive loss probes. */
if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
tcp_rearm_rto(sk);
@@ -2309,8 +2317,9 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Schedule a loss probe in 2*RTT for SACK capable connections
* in Open state, that are either limited by cwnd or application.
*/
- if (sysctl_tcp_early_retrans < 3 || !tp->packets_out ||
- !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+ if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) ||
+ !tp->packets_out || !tcp_is_sack(tp) ||
+ icsk->icsk_ca_state != TCP_CA_Open)
return false;
if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
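[Editorial note] The rewritten gate reflects the removal of RFC 5827 early retransmit in favor of RACK: of the tcp_early_retrans values documented at the time (0 = disabled, 1-2 = early retransmit variants, 3 = early retransmit plus tail loss probe, 4 = TLP only), only 3 and 4 still arm a loss probe. A minimal predicate mirroring the condition above (illustrative only):

#include <stdbool.h>

/* Tail loss probes stay enabled only for the two sysctl values that
 * request TLP; the other prerequisites (packets out, SACK, CA_Open)
 * are checked separately, as in the hunk above. */
static bool tlp_sysctl_enabled(int tcp_early_retrans)
{
	return tcp_early_retrans == 3 || tcp_early_retrans == 4;
}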
@@ -2776,6 +2785,13 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
tcp_ecn_clear_syn(sk, skb);
+ /* Update global and local TCP statistics. */
+ segs = tcp_skb_pcount(skb);
+ TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ tp->total_retrans += segs;
+
/* make sure skb->data is aligned on arches that require it
* and check if ack-trimming & collapsing extended the headroom
* beyond what csum_start can cover.
@@ -2793,14 +2809,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
}
if (likely(!err)) {
- segs = tcp_skb_pcount(skb);
-
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
- /* Update global TCP statistics. */
- TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
- tp->total_retrans += segs;
+ } else if (err != -EBUSY) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
return err;
}
@@ -2823,8 +2834,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_skb_timestamp(skb);
- } else if (err != -EBUSY) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
if (tp->undo_retrans < 0)
@@ -2833,36 +2842,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
return err;
}
-/* Check if we forward retransmits are possible in the current
- * window/congestion state.
- */
-static bool tcp_can_forward_retransmit(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- const struct tcp_sock *tp = tcp_sk(sk);
-
- /* Forward retransmissions are possible only during Recovery. */
- if (icsk->icsk_ca_state != TCP_CA_Recovery)
- return false;
-
- /* No forward retransmissions in Reno are possible. */
- if (tcp_is_reno(tp))
- return false;
-
- /* Yeah, we have to make difficult choice between forward transmission
- * and retransmission... Both ways have their merits...
- *
- * For now we do not retransmit anything, while we have some new
- * segments to send. In the other cases, follow rule 3 for
- * NextSeg() specified in RFC3517.
- */
-
- if (tcp_may_send_now(sk))
- return false;
-
- return true;
-}
-
/* This gets called after a retransmit timeout, and the initially
* retransmitted data is acknowledged. It tries to continue
* resending the rest of the retransmit queue, until either
@@ -2877,24 +2856,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
struct sk_buff *hole = NULL;
- u32 max_segs, last_lost;
+ u32 max_segs;
int mib_idx;
- int fwd_rexmitting = 0;
if (!tp->packets_out)
return;
- if (!tp->lost_out)
- tp->retransmit_high = tp->snd_una;
-
if (tp->retransmit_skb_hint) {
skb = tp->retransmit_skb_hint;
- last_lost = TCP_SKB_CB(skb)->end_seq;
- if (after(last_lost, tp->retransmit_high))
- last_lost = tp->retransmit_high;
} else {
skb = tcp_write_queue_head(sk);
- last_lost = tp->snd_una;
}
max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
@@ -2917,31 +2888,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
*/
segs = min_t(int, segs, max_segs);
- if (fwd_rexmitting) {
-begin_fwd:
- if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
- break;
- mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
-
- } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
- tp->retransmit_high = last_lost;
- if (!tcp_can_forward_retransmit(sk))
- break;
- /* Backtrack if necessary to non-L'ed skb */
- if (hole) {
- skb = hole;
- hole = NULL;
- }
- fwd_rexmitting = 1;
- goto begin_fwd;
-
+ if (tp->retrans_out >= tp->lost_out) {
+ break;
} else if (!(sacked & TCPCB_LOST)) {
if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
hole = skb;
continue;
} else {
- last_lost = TCP_SKB_CB(skb)->end_seq;
if (icsk->icsk_ca_state != TCP_CA_Loss)
mib_idx = LINUX_MIB_TCPFASTRETRANS;
else
@@ -2962,7 +2916,8 @@ begin_fwd:
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
- if (skb == tcp_write_queue_head(sk))
+ if (skb == tcp_write_queue_head(sk) &&
+ icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto,
TCP_RTO_MAX);
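[Editorial note] Taken together, these hunks collapse the old forward-retransmit state machine into one rule: with RACK marking losses, the queue walk resends only segments flagged TCPCB_LOST and stops once every marked-lost segment has a retransmission out (retrans_out >= lost_out). A standalone skeleton of that walk, with stub types for illustration (not the kernel loop):

#include <stdbool.h>

struct seg {
	bool lost;		/* analogue of TCPCB_LOST */
	bool already_handled;	/* already SACKed or retransmitted */
	struct seg *next;
};

/* Resend marked-lost segments until every loss has a retransmission
 * out; segments not marked lost are simply skipped. */
static void retransmit_walk(struct seg *head, unsigned int *retrans_out,
			    unsigned int lost_out)
{
	for (struct seg *s = head; s; s = s->next) {
		if (*retrans_out >= lost_out)
			break;			/* nothing lost is left */
		if (!s->lost || s->already_handled)
			continue;
		/* ...actual retransmission would happen here... */
		(*retrans_out)++;
	}
}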
@@ -3119,7 +3074,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct sk_buff *skb;
int tcp_header_size;
struct tcphdr *th;
- u16 user_mss;
int mss;
skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
@@ -3149,10 +3103,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
}
skb_dst_set(skb, dst);
- mss = dst_metric_advmss(dst);
- user_mss = READ_ONCE(tp->rx_opt.user_mss);
- if (user_mss && user_mss < mss)
- mss = user_mss;
+ mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
@@ -3258,9 +3209,7 @@ static void tcp_connect_init(struct sock *sk)
if (!tp->window_clamp)
tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
- tp->advmss = dst_metric_advmss(dst);
- if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
- tp->advmss = tp->rx_opt.user_mss;
+ tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
tcp_initialize_rcv_mss(sk);
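[Editorial note] Both this hunk and the tcp_make_synack() hunk above replace the open-coded user-MSS clamp with tcp_mss_clamp(). Judging from the code it replaces, the helper presumably reduces to the following (an assumed sketch of the include/net/tcp.h definition):

static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss)
{
	/* Honor an application-set TCP_MAXSEG if it is smaller than
	 * the dst-derived advertised MSS. */
	u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);

	return (user_mss && user_mss < mss) ? user_mss : mss;
}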
@@ -3326,31 +3275,19 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int syn_loss = 0, space, err = 0;
- unsigned long last_syn_loss = 0;
+ int space, err = 0;
struct sk_buff *syn_data;
tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
- tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
- &syn_loss, &last_syn_loss);
- /* Recurring FO SYN losses: revert to regular handshake temporarily */
- if (syn_loss > 1 &&
- time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
- fo->cookie.len = -1;
- goto fallback;
- }
-
- if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
- fo->cookie.len = -1;
- else if (fo->cookie.len <= 0)
+ if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
goto fallback;
/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
* user-MSS. Reserve maximum option space for middleboxes that add
* private TCP options. The cost is reduced data space in SYN :(
*/
- if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
- tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
+
space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
MAX_TCP_OPTION_SPACE;
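[Editorial note] The deleted Fast Open logic moves behind tcp_fastopen_cookie_check(), which, reconstructed from the removed lines above, presumably folds the SYN-loss backoff and cookie checks into a single predicate (names as used above, body an assumption):

static bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
				      struct tcp_fastopen_cookie *cookie)
{
	unsigned long last_syn_loss = 0;
	int syn_loss = 0;

	tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);

	/* Recurring SYN-data losses: revert to a regular handshake
	 * for an exponentially growing backoff window. */
	if (syn_loss > 1 &&
	    time_before(jiffies, last_syn_loss + (60 * HZ << syn_loss))) {
		cookie->len = -1;
		return false;
	}
	/* Cookie-less Fast Open still sends data in the SYN. */
	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
		cookie->len = -1;
		return true;
	}
	return cookie->len > 0;
}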