From a70437cc09a11771870e9f6bfc0ba1237161daa8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2020 10:35:43 -0700 Subject: tcp: add hrtimer slack to sack compression Add a sysctl to control hrtimer slack, default of 100 usec. This gives the opportunity to reduce system overhead, and help very short RTT flows. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 83a5d24e13b8..6c05f1ceb538 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2780,6 +2780,7 @@ static int __net_init tcp_sk_init(struct net *net) sizeof(init_net.ipv4.sysctl_tcp_wmem)); } net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; + net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit From 45af29ca761c275e350cca659856bc56f1035ef9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 24 May 2020 11:00:02 -0700 Subject: tcp: allow traceroute -Mtcp for unpriv users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unpriv users can use traceroute over plain UDP sockets, but not TCP ones. $ traceroute -Mtcp 8.8.8.8 You do not have enough privileges to use this traceroute method. $ traceroute -n -Mudp 8.8.8.8 traceroute to 8.8.8.8 (8.8.8.8), 30 hops max, 60 byte packets 1 192.168.86.1 3.631 ms 3.512 ms 3.405 ms 2 10.1.10.1 4.183 ms 4.125 ms 4.072 ms 3 96.120.88.125 20.621 ms 19.462 ms 20.553 ms 4 96.110.177.65 24.271 ms 25.351 ms 25.250 ms 5 69.139.199.197 44.492 ms 43.075 ms 44.346 ms 6 68.86.143.93 27.969 ms 25.184 ms 25.092 ms 7 96.112.146.18 25.323 ms 96.112.146.22 25.583 ms 96.112.146.26 24.502 ms 8 72.14.239.204 24.405 ms 74.125.37.224 16.326 ms 17.194 ms 9 209.85.251.9 18.154 ms 209.85.247.55 14.449 ms 209.85.251.9 26.296 ms^C We can easily support traceroute over TCP, by queueing an error message into socket error queue. Note that applications need to set IP_RECVERR/IPV6_RECVERR option to enable this feature, and that the error message is only queued while in SYN_SNT state. socket(AF_INET6, SOCK_STREAM, IPPROTO_IP) = 3 setsockopt(3, SOL_IPV6, IPV6_RECVERR, [1], 4) = 0 setsockopt(3, SOL_SOCKET, SO_TIMESTAMP_OLD, [1], 4) = 0 setsockopt(3, SOL_IPV6, IPV6_UNICAST_HOPS, [5], 4) = 0 connect(3, {sa_family=AF_INET6, sin6_port=htons(8787), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "2002:a05:6608:297::", &sin6_addr), sin6_scope_id=0}, 28) = -1 EHOSTUNREACH (No route to host) recvmsg(3, {msg_name={sa_family=AF_INET6, sin6_port=htons(8787), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "2002:a05:6608:297::", &sin6_addr), sin6_scope_id=0}, msg_namelen=1024->28, msg_iov=[{iov_base="`\r\337\320\0004\6\1&\7\370\260\200\231\16\27\0\0\0\0\0\0\0\0 \2\n\5f\10\2\227"..., iov_len=1024}], msg_iovlen=1, msg_control=[{cmsg_len=32, cmsg_level=SOL_SOCKET, cmsg_type=SO_TIMESTAMP_OLD, cmsg_data={tv_sec=1590340680, tv_usec=272424}}, {cmsg_len=60, cmsg_level=SOL_IPV6, cmsg_type=IPV6_RECVERR}], msg_controllen=96, msg_flags=MSG_ERRQUEUE}, MSG_ERRQUEUE) = 144 Suggested-by: Maciej Żenczykowski Cc: Willem de Bruijn Reviewed-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 ++ net/ipv6/tcp_ipv6.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6c05f1ceb538..900c6d154cbc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -573,6 +573,8 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (fastopen && !fastopen->sk) break; + ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); + if (!sock_owned_by_user(sk)) { sk->sk_err = err; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 413b3425ac66..01a6f5111a77 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -463,6 +463,8 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (fastopen && !fastopen->sk) break; + ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); + if (!sock_owned_by_user(sk)) { sk->sk_err = err; sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ -- cgit From 239174945dac8cb9613db7755103d5fb6c32241d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 May 2020 20:15:24 -0700 Subject: tcp: tcp_v4_err() icmp skb is named icmp_skb I missed the fact that tcp_v4_err() differs from tcp_v6_err(). After commit 4d1a2d9ec1c1 ("Rename skb to icmp_skb in tcp_v4_err()") the skb argument has been renamed to icmp_skb only in one function. I will in a future patch reconciliate these functions to avoid this kind of confusion. Fixes: 45af29ca761c ("tcp: allow traceroute -Mtcp for unpriv users") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 900c6d154cbc..6789671f0f5a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -573,7 +573,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (fastopen && !fastopen->sk) break; - ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); + ip_icmp_error(sk, icmp_skb, err, th->dest, info, (u8 *)th); if (!sock_owned_by_user(sk)) { sk->sk_err = err; -- cgit From f745664257b62f6ba29f45fd21fbe7193b4da57b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 May 2020 19:48:49 -0700 Subject: tcp: add tcp_ld_RTO_revert() helper RFC 6069 logic has been implemented for IPv4 only so far, right in the middle of tcp_v4_err() and was error prone. Move this code to one helper, to make tcp_v4_err() more readable and to eventually expand RFC 6069 to IPv6 in the future. Also perform sock_owned_by_user() check a bit sooner. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 85 ++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 40 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6789671f0f5a..f32dcadc91b7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -403,6 +403,45 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) } EXPORT_SYMBOL(tcp_req_err); +/* TCP-LD (RFC 6069) logic */ +static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + s32 remaining; + u32 delta_us; + + if (sock_owned_by_user(sk)) + return; + + if (seq != tp->snd_una || !icsk->icsk_retransmits || + !icsk->icsk_backoff) + return; + + skb = tcp_rtx_queue_head(sk); + if (WARN_ON_ONCE(!skb)) + return; + + icsk->icsk_backoff--; + icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; + icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + + tcp_mstamp_refresh(tp); + delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); + remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); + + if (remaining > 0) { + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + remaining, TCP_RTO_MAX); + } else { + /* RTO revert clocked out retransmission. + * Will retransmit now. + */ + tcp_retransmit_timer(sk); + } +} + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -423,17 +462,13 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); - struct inet_connection_sock *icsk; struct tcp_sock *tp; struct inet_sock *inet; const int type = icmp_hdr(icmp_skb)->type; const int code = icmp_hdr(icmp_skb)->code; struct sock *sk; - struct sk_buff *skb; struct request_sock *fastopen; u32 seq, snd_una; - s32 remaining; - u32 delta_us; int err; struct net *net = dev_net(icmp_skb->dev); @@ -476,7 +511,6 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; } - icsk = inet_csk(sk); tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = rcu_dereference(tp->fastopen_rsk); @@ -521,41 +555,12 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } err = icmp_err_convert[code].errno; - /* check if icmp_skb allows revert of backoff - * (see draft-zimmermann-tcp-lcd) */ - if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) - break; - if (seq != tp->snd_una || !icsk->icsk_retransmits || - !icsk->icsk_backoff || fastopen) - break; - - if (sock_owned_by_user(sk)) - break; - - skb = tcp_rtx_queue_head(sk); - if (WARN_ON_ONCE(!skb)) - break; - - icsk->icsk_backoff--; - icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : - TCP_TIMEOUT_INIT; - icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); - - - tcp_mstamp_refresh(tp); - delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); - remaining = icsk->icsk_rto - - usecs_to_jiffies(delta_us); - - if (remaining > 0) { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - remaining, TCP_RTO_MAX); - } else { - /* RTO revert clocked out retransmission. - * Will retransmit now */ - tcp_retransmit_timer(sk); - } - + /* check if this ICMP message allows revert of backoff. + * (see RFC 6069) + */ + if (!fastopen && + (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) + tcp_ld_RTO_revert(sk, seq); break; case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; -- cgit From a12daf13a4492bb7fb26231e52afb381927f938e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 May 2020 19:48:50 -0700 Subject: tcp: rename tcp_v4_err() skb parameter This essentially reverts 4d1a2d9ec1c1 ("Revert Backoff [v3]: Rename skb to icmp_skb in tcp_v4_err()") Now we have tcp_ld_RTO_revert() helper, we can use the usual name for sk_buff parameter, so that tcp_v4_err() and tcp_v6_err() use similar names. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f32dcadc91b7..4eef5b84fff1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -458,23 +458,23 @@ static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) * */ -int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) +int tcp_v4_err(struct sk_buff *skb, u32 info) { - const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; - struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); + const struct iphdr *iph = (const struct iphdr *)skb->data; + struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); struct tcp_sock *tp; struct inet_sock *inet; - const int type = icmp_hdr(icmp_skb)->type; - const int code = icmp_hdr(icmp_skb)->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct sock *sk; struct request_sock *fastopen; u32 seq, snd_una; int err; - struct net *net = dev_net(icmp_skb->dev); + struct net *net = dev_net(skb->dev); sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, ntohs(th->source), - inet_iif(icmp_skb), 0); + inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; @@ -524,7 +524,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) switch (type) { case ICMP_REDIRECT: if (!sock_owned_by_user(sk)) - do_redirect(icmp_skb, sk); + do_redirect(skb, sk); goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ @@ -578,7 +578,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (fastopen && !fastopen->sk) break; - ip_icmp_error(sk, icmp_skb, err, th->dest, info, (u8 *)th); + ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); if (!sock_owned_by_user(sk)) { sk->sk_err = err; -- cgit From d29245692a44d71d5e2e0770463184a693696232 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 May 2020 17:34:58 -0700 Subject: tcp: ipv6: support RFC 6069 (TCP-LD) Make tcp_ld_RTO_revert() helper available to IPv6, and implement RFC 6069 : Quoting this RFC : 3. Connectivity Disruption Indication For Internet Protocol version 6 (IPv6) [RFC2460], the counterpart of the ICMP destination unreachable message of code 0 (net unreachable) and of code 1 (host unreachable) is the ICMPv6 destination unreachable message of code 0 (no route to destination) [RFC4443]. As with IPv4, a router should generate an ICMPv6 destination unreachable message of code 0 in response to a packet that cannot be delivered to its destination address because it lacks a matching entry in its routing table. Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 9 +++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index b681338a8320..66e4b8331850 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -437,6 +437,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); void tcp_req_err(struct sock *sk, u32 seq, bool abort); +void tcp_ld_RTO_revert(struct sock *sk, u32 seq); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4eef5b84fff1..ad6435ba6d72 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -404,7 +404,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) EXPORT_SYMBOL(tcp_req_err); /* TCP-LD (RFC 6069) logic */ -static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) +void tcp_ld_RTO_revert(struct sock *sk, u32 seq) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -441,6 +441,7 @@ static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) tcp_retransmit_timer(sk); } } +EXPORT_SYMBOL(tcp_ld_RTO_revert); /* * This routine is called by the ICMP module when it gets some diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 01a6f5111a77..b7415ca75c2d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -473,6 +473,15 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else sk->sk_err_soft = err; goto out; + case TCP_LISTEN: + break; + default: + /* check if this ICMP message allows revert of backoff. + * (see RFC 6069) + */ + if (!fastopen && type == ICMPV6_DEST_UNREACH && + code == ICMPV6_NOROUTE) + tcp_ld_RTO_revert(sk, seq); } if (!sock_owned_by_user(sk) && np->recverr) { -- cgit