diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/Makefile | 2 | ||||
-rw-r--r-- | net/ipv4/af_inet.c | 5 | ||||
-rw-r--r-- | net/ipv4/bpf_tcp_ca.c | 69 | ||||
-rw-r--r-- | net/ipv4/bpfilter/Makefile | 2 | ||||
-rw-r--r-- | net/ipv4/bpfilter/sockopt.c | 71 | ||||
-rw-r--r-- | net/ipv4/fib_rules.c | 6 | ||||
-rw-r--r-- | net/ipv4/fib_trie.c | 1 | ||||
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 121 | ||||
-rw-r--r-- | net/ipv4/inet_diag.c | 86 | ||||
-rw-r--r-- | net/ipv4/inet_hashtables.c | 125 | ||||
-rw-r--r-- | net/ipv4/inet_timewait_sock.c | 21 | ||||
-rw-r--r-- | net/ipv4/ip_sockglue.c | 51 | ||||
-rw-r--r-- | net/ipv4/ipmr.c | 15 | ||||
-rw-r--r-- | net/ipv4/syncookies.c | 215 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 18 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 104 | ||||
-rw-r--r-- | net/ipv4/tcp_ao.c | 16 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 29 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 4 |
19 files changed, 553 insertions, 408 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index e144a02a6a61..ec36d2ec059e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -16,8 +16,6 @@ obj-y := route.o inetpeer.o protocol.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ metrics.o netlink.o nexthop.o udp_tunnel_stub.o -obj-$(CONFIG_BPFILTER) += bpfilter/ - obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o obj-$(CONFIG_PROC_FS) += proc.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index fb81de10d332..835f4f9d98d2 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1633,6 +1633,7 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) #endif return -EINVAL; } +EXPORT_SYMBOL(inet_recv_error); int inet_gro_complete(struct sk_buff *skb, int nhoff) { @@ -1847,9 +1848,7 @@ static __net_init int inet_init_net(struct net *net) /* * Set defaults for local port range */ - seqlock_init(&net->ipv4.ip_local_ports.lock); - net->ipv4.ip_local_ports.range[0] = 32768; - net->ipv4.ip_local_ports.range[1] = 60999; + net->ipv4.ip_local_ports.range = 60999u << 16 | 32768u; seqlock_init(&net->ipv4.ping_group_range.lock); /* diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 39dcccf0f174..ae8b15e6896f 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -271,6 +271,74 @@ static int bpf_tcp_ca_validate(void *kdata) return tcp_validate_congestion_control(kdata); } +static u32 bpf_tcp_ca_ssthresh(struct sock *sk) +{ + return 0; +} + +static void bpf_tcp_ca_cong_avoid(struct sock *sk, u32 ack, u32 acked) +{ +} + +static void bpf_tcp_ca_set_state(struct sock *sk, u8 new_state) +{ +} + +static void bpf_tcp_ca_cwnd_event(struct sock *sk, enum tcp_ca_event ev) +{ +} + +static void bpf_tcp_ca_in_ack_event(struct sock *sk, u32 flags) +{ +} + +static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *sample) +{ +} + +static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) +{ + return 0; +} + +static void bpf_tcp_ca_cong_control(struct sock *sk, const struct rate_sample *rs) +{ +} + +static u32 bpf_tcp_ca_undo_cwnd(struct sock *sk) +{ + return 0; +} + +static u32 bpf_tcp_ca_sndbuf_expand(struct sock *sk) +{ + return 0; +} + +static void __bpf_tcp_ca_init(struct sock *sk) +{ +} + +static void __bpf_tcp_ca_release(struct sock *sk) +{ +} + +static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .ssthresh = bpf_tcp_ca_ssthresh, + .cong_avoid = bpf_tcp_ca_cong_avoid, + .set_state = bpf_tcp_ca_set_state, + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, + .min_tso_segs = bpf_tcp_ca_min_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, + + .init = __bpf_tcp_ca_init, + .release = __bpf_tcp_ca_release, +}; + struct bpf_struct_ops bpf_tcp_congestion_ops = { .verifier_ops = &bpf_tcp_ca_verifier_ops, .reg = bpf_tcp_ca_reg, @@ -281,6 +349,7 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = { .init = bpf_tcp_ca_init, .validate = bpf_tcp_ca_validate, .name = "tcp_congestion_ops", + .cfi_stubs = &__bpf_ops_tcp_congestion_ops, }; static int __init bpf_tcp_ca_kfunc_init(void) diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile deleted file mode 100644 index 00af5305e05a..000000000000 --- a/net/ipv4/bpfilter/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_BPFILTER) += sockopt.o diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c deleted file mode 100644 index 193bcc2acccc..000000000000 --- a/net/ipv4/bpfilter/sockopt.c +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/init.h> -#include <linux/module.h> -#include <linux/uaccess.h> -#include <linux/bpfilter.h> -#include <uapi/linux/bpf.h> -#include <linux/wait.h> -#include <linux/kmod.h> -#include <linux/fs.h> -#include <linux/file.h> - -struct bpfilter_umh_ops bpfilter_ops; -EXPORT_SYMBOL_GPL(bpfilter_ops); - -static int bpfilter_mbox_request(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen, bool is_set) -{ - int err; - mutex_lock(&bpfilter_ops.lock); - if (!bpfilter_ops.sockopt) { - mutex_unlock(&bpfilter_ops.lock); - request_module("bpfilter"); - mutex_lock(&bpfilter_ops.lock); - - if (!bpfilter_ops.sockopt) { - err = -ENOPROTOOPT; - goto out; - } - } - if (bpfilter_ops.info.tgid && - thread_group_exited(bpfilter_ops.info.tgid)) - umd_cleanup_helper(&bpfilter_ops.info); - - if (!bpfilter_ops.info.tgid) { - err = bpfilter_ops.start(); - if (err) - goto out; - } - err = bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set); -out: - mutex_unlock(&bpfilter_ops.lock); - return err; -} - -int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen) -{ - return bpfilter_mbox_request(sk, optname, optval, optlen, true); -} - -int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, - int __user *optlen) -{ - int len; - - if (get_user(len, optlen)) - return -EFAULT; - - return bpfilter_mbox_request(sk, optname, USER_SOCKPTR(optval), len, - false); -} - -static int __init bpfilter_sockopt_init(void) -{ - mutex_init(&bpfilter_ops.lock); - bpfilter_ops.info.tgid = NULL; - bpfilter_ops.info.driver_name = "bpfilter_umh"; - - return 0; -} -device_initcall(bpfilter_sockopt_init); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 513f475c6a53..5bdd1c016009 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -395,13 +395,13 @@ static int fib_default_rules_init(struct fib_rules_ops *ops) { int err; - err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0); + err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL); if (err < 0) return err; - err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0); + err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN); if (err < 0) return err; - err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0); + err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT); if (err < 0) return err; return 0; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 9bdfdab906fe..3ff35f811765 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -52,6 +52,7 @@ #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> +#include <linux/rcupdate_wait.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/init.h> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 394a498c2823..8e2eb1793685 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -117,37 +117,39 @@ bool inet_rcv_saddr_any(const struct sock *sk) return !sk->sk_rcv_saddr; } -void inet_get_local_port_range(const struct net *net, int *low, int *high) -{ - unsigned int seq; - - do { - seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); - - *low = net->ipv4.ip_local_ports.range[0]; - *high = net->ipv4.ip_local_ports.range[1]; - } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); -} -EXPORT_SYMBOL(inet_get_local_port_range); - -void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high) +/** + * inet_sk_get_local_port_range - fetch ephemeral ports range + * @sk: socket + * @low: pointer to low port + * @high: pointer to high port + * + * Fetch netns port range (/proc/sys/net/ipv4/ip_local_port_range) + * Range can be overridden if socket got IP_LOCAL_PORT_RANGE option. + * Returns true if IP_LOCAL_PORT_RANGE was set on this socket. + */ +bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high) { - const struct inet_sock *inet = inet_sk(sk); - const struct net *net = sock_net(sk); int lo, hi, sk_lo, sk_hi; + bool local_range = false; + u32 sk_range; - inet_get_local_port_range(net, &lo, &hi); + inet_get_local_port_range(sock_net(sk), &lo, &hi); - sk_lo = inet->local_port_range.lo; - sk_hi = inet->local_port_range.hi; + sk_range = READ_ONCE(inet_sk(sk)->local_port_range); + if (unlikely(sk_range)) { + sk_lo = sk_range & 0xffff; + sk_hi = sk_range >> 16; - if (unlikely(lo <= sk_lo && sk_lo <= hi)) - lo = sk_lo; - if (unlikely(lo <= sk_hi && sk_hi <= hi)) - hi = sk_hi; + if (lo <= sk_lo && sk_lo <= hi) + lo = sk_lo; + if (lo <= sk_hi && sk_hi <= hi) + hi = sk_hi; + local_range = true; + } *low = lo; *high = hi; + return local_range; } EXPORT_SYMBOL(inet_sk_get_local_port_range); @@ -157,8 +159,11 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk) if (sk->sk_family == AF_INET6) { int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - return addr_type != IPV6_ADDR_ANY && - addr_type != IPV6_ADDR_MAPPED; + if (addr_type == IPV6_ADDR_ANY) + return false; + + if (addr_type != IPV6_ADDR_MAPPED) + return true; } #endif return sk->sk_rcv_saddr != htonl(INADDR_ANY); @@ -211,18 +216,9 @@ static bool inet_bhash2_conflict(const struct sock *sk, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { - struct inet_timewait_sock *tw2; struct sock *sk2; - sk_for_each_bound_bhash2(sk2, &tb2->owners) { - if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, - reuseport_cb_ok, reuseport_ok)) - return true; - } - - twsk_for_each_bound_bhash2(tw2, &tb2->deathrow) { - sk2 = (struct sock *)tw2; - + sk_for_each_bound(sk2, &tb2->owners) { if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok)) return true; @@ -231,15 +227,20 @@ static bool inet_bhash2_conflict(const struct sock *sk, return false; } +#define sk_for_each_bound_bhash(__sk, __tb2, __tb) \ + hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \ + sk_for_each_bound(sk2, &(__tb2)->owners) + /* This should be called only when the tb and tb2 hashbuckets' locks are held */ static int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, const struct inet_bind2_bucket *tb2, /* may be null */ bool relax, bool reuseport_ok) { - bool reuseport_cb_ok; - struct sock_reuseport *reuseport_cb; kuid_t uid = sock_i_uid((struct sock *)sk); + struct sock_reuseport *reuseport_cb; + bool reuseport_cb_ok; + struct sock *sk2; rcu_read_lock(); reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); @@ -247,32 +248,29 @@ static int inet_csk_bind_conflict(const struct sock *sk, reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); rcu_read_unlock(); - /* - * Unlike other sk lookup places we do not check + /* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if + * ipv4) should have been checked already. We need to do these two + * checks separately because their spinlocks have to be acquired/released + * independently of each other, to prevent possible deadlocks + */ + if (inet_use_bhash2_on_bind(sk)) + return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, + reuseport_cb_ok, reuseport_ok); + + /* Unlike other sk lookup places we do not check * for sk_net here, since _all_ the socks listed * in tb->owners and tb2->owners list belong * to the same net - the one this bucket belongs to. */ + sk_for_each_bound_bhash(sk2, tb2, tb) { + if (!inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok)) + continue; - if (!inet_use_bhash2_on_bind(sk)) { - struct sock *sk2; - - sk_for_each_bound(sk2, &tb->owners) - if (inet_bind_conflict(sk, sk2, uid, relax, - reuseport_cb_ok, reuseport_ok) && - inet_rcv_saddr_equal(sk, sk2, true)) - return true; - - return false; + if (inet_rcv_saddr_equal(sk, sk2, true)) + return true; } - /* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if - * ipv4) should have been checked already. We need to do these two - * checks separately because their spinlocks have to be acquired/released - * independently of each other, to prevent possible deadlocks - */ - return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, - reuseport_ok); + return false; } /* Determine if there is a bind conflict with an existing IPV6_ADDR_ANY (if ipv6) or @@ -455,7 +453,7 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, kuid_t uid = sock_i_uid(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; - if (hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->bhash2)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; @@ -547,7 +545,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) } if (!found_port) { - if (!hlist_empty(&tb->owners)) { + if (!hlist_empty(&tb->bhash2)) { if (sk->sk_reuse == SK_FORCE_REUSE || (tb->fastreuse > 0 && reuse) || sk_reuseport_match(tb, sk)) @@ -567,7 +565,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, - net, head2, port, l3mdev, sk); + net, head2, tb, sk); if (!tb2) goto fail_unlock; bhash2_created = true; @@ -589,11 +587,10 @@ success: fail_unlock: if (ret) { + if (bhash2_created) + inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2); if (bhash_created) inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); - if (bhash2_created) - inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, - tb2); } if (head2_lock_acquired) spin_unlock(&head2->lock); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 7d0e7aaa71e0..8e6b6aa0579e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1077,10 +1077,94 @@ skip_listen_ht: s_i = num = s_num = 0; } +/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets + * with bh disabled. + */ +#define SKARR_SZ 16 + + /* Dump bound but inactive (not listening, connecting, etc.) sockets */ + if (cb->args[0] == 1) { + if (!(idiag_states & TCPF_BOUND_INACTIVE)) + goto skip_bind_ht; + + for (i = s_i; i < hashinfo->bhash_size; i++) { + struct inet_bind_hashbucket *ibb; + struct inet_bind2_bucket *tb2; + struct sock *sk_arr[SKARR_SZ]; + int num_arr[SKARR_SZ]; + int idx, accum, res; + +resume_bind_walk: + num = 0; + accum = 0; + ibb = &hashinfo->bhash2[i]; + + spin_lock_bh(&ibb->lock); + inet_bind_bucket_for_each(tb2, &ibb->chain) { + if (!net_eq(ib2_net(tb2), net)) + continue; + + sk_for_each_bound(sk, &tb2->owners) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_bind; + + if (sk->sk_state != TCP_CLOSE || + !inet->inet_num) + goto next_bind; + + if (r->sdiag_family != AF_UNSPEC && + r->sdiag_family != sk->sk_family) + goto next_bind; + + if (!inet_diag_bc_sk(bc, sk)) + goto next_bind; + + sock_hold(sk); + num_arr[accum] = num; + sk_arr[accum] = sk; + if (++accum == SKARR_SZ) + goto pause_bind_walk; +next_bind: + num++; + } + } +pause_bind_walk: + spin_unlock_bh(&ibb->lock); + + res = 0; + for (idx = 0; idx < accum; idx++) { + if (res >= 0) { + res = inet_sk_diag_fill(sk_arr[idx], + NULL, skb, cb, + r, NLM_F_MULTI, + net_admin); + if (res < 0) + num = num_arr[idx]; + } + sock_put(sk_arr[idx]); + } + if (res < 0) + goto done; + + cond_resched(); + + if (accum == SKARR_SZ) { + s_num = num + 1; + goto resume_bind_walk; + } + + s_num = 0; + } +skip_bind_ht: + cb->args[0] = 2; + s_i = num = s_num = 0; + } + if (!(idiag_states & ~TCPF_LISTEN)) goto out; -#define SKARR_SZ 16 for (i = s_i; i <= hashinfo->ehash_mask; i++) { struct inet_ehash_bucket *head = &hashinfo->ehash[i]; spinlock_t *lock = inet_ehash_lockp(hashinfo, i); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index a532f749e477..93e9193df544 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -76,7 +76,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; - INIT_HLIST_HEAD(&tb->owners); + INIT_HLIST_HEAD(&tb->bhash2); hlist_add_head(&tb->node, &head->chain); } return tb; @@ -87,7 +87,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, */ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) { - if (hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->bhash2)) { __hlist_del(&tb->node); kmem_cache_free(cachep, tb); } @@ -100,47 +100,52 @@ bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net tb->l3mdev == l3mdev; } -static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb, +static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, struct net *net, struct inet_bind_hashbucket *head, - unsigned short port, int l3mdev, + struct inet_bind_bucket *tb, const struct sock *sk) { - write_pnet(&tb->ib_net, net); - tb->l3mdev = l3mdev; - tb->port = port; + write_pnet(&tb2->ib_net, net); + tb2->l3mdev = tb->l3mdev; + tb2->port = tb->port; #if IS_ENABLED(CONFIG_IPV6) - tb->family = sk->sk_family; - if (sk->sk_family == AF_INET6) - tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr; - else + BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED)); + if (sk->sk_family == AF_INET6) { + tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr; + } else { + tb2->addr_type = IPV6_ADDR_MAPPED; + ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr); + } +#else + tb2->rcv_saddr = sk->sk_rcv_saddr; #endif - tb->rcv_saddr = sk->sk_rcv_saddr; - INIT_HLIST_HEAD(&tb->owners); - INIT_HLIST_HEAD(&tb->deathrow); - hlist_add_head(&tb->node, &head->chain); + INIT_HLIST_HEAD(&tb2->owners); + hlist_add_head(&tb2->node, &head->chain); + hlist_add_head(&tb2->bhash_node, &tb->bhash2); } struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - unsigned short port, - int l3mdev, + struct inet_bind_bucket *tb, const struct sock *sk) { - struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); + struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC); - if (tb) - inet_bind2_bucket_init(tb, net, head, port, l3mdev, sk); + if (tb2) + inet_bind2_bucket_init(tb2, net, head, tb, sk); - return tb; + return tb2; } /* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) { - if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) { + if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); + __hlist_del(&tb->bhash_node); kmem_cache_free(cachep, tb); } } @@ -149,18 +154,11 @@ static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family != tb2->family) { - if (sk->sk_family == AF_INET) - return ipv6_addr_v4mapped(&tb2->v6_rcv_saddr) && - tb2->v6_rcv_saddr.s6_addr32[3] == sk->sk_rcv_saddr; - - return ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr) && - sk->sk_v6_rcv_saddr.s6_addr32[3] == tb2->rcv_saddr; - } - if (sk->sk_family == AF_INET6) - return ipv6_addr_equal(&tb2->v6_rcv_saddr, - &sk->sk_v6_rcv_saddr); + return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); + + if (tb2->addr_type != IPV6_ADDR_MAPPED) + return false; #endif return tb2->rcv_saddr == sk->sk_rcv_saddr; } @@ -169,10 +167,9 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port) { inet_sk(sk)->inet_num = port; - sk_add_bind_node(sk, &tb->owners); inet_csk(sk)->icsk_bind_hash = tb; - sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; + sk_add_bind_node(sk, &tb2->owners); } /* @@ -192,21 +189,20 @@ static void __inet_put_port(struct sock *sk) spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; - __sk_del_bind_node(sk); inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); spin_lock(&head2->lock); if (inet_csk(sk)->icsk_bind2_hash) { struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash; - __sk_del_bind2_node(sk); + __sk_del_bind_node(sk); inet_csk(sk)->icsk_bind2_hash = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); } spin_unlock(&head2->lock); + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); spin_unlock(&head->lock); } @@ -275,8 +271,7 @@ bhash2_find: tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child); if (!tb2) { tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, - net, head2, port, - l3mdev, child); + net, head2, tb, child); if (!tb2) goto error; } @@ -836,16 +831,15 @@ bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const return false; #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family != tb->family) { - if (sk->sk_family == AF_INET) - return ipv6_addr_any(&tb->v6_rcv_saddr) || - ipv6_addr_v4mapped_any(&tb->v6_rcv_saddr); + if (tb->addr_type == IPV6_ADDR_ANY) + return true; + if (tb->addr_type != IPV6_ADDR_MAPPED) return false; - } - if (sk->sk_family == AF_INET6) - return ipv6_addr_any(&tb->v6_rcv_saddr); + if (sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + return false; #endif return tb->rcv_saddr == 0; } @@ -942,7 +936,7 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, spin_lock_bh(&head->lock); spin_lock(&head2->lock); - __sk_del_bind2_node(sk); + __sk_del_bind_node(sk); inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); spin_unlock(&head2->lock); @@ -957,10 +951,10 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = new_tb2; - inet_bind2_bucket_init(tb2, net, head2, port, l3mdev, sk); + inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); } - sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; + sk_add_bind_node(sk, &tb2->owners); spin_unlock(&head2->lock); spin_unlock_bh(&head->lock); @@ -1012,7 +1006,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, bool tb_created = false; u32 remaining, offset; int ret, i, low, high; - int l3mdev; + bool local_ports; + int step, l3mdev; u32 index; if (port) { @@ -1024,10 +1019,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, l3mdev = inet_sk_bound_l3mdev(sk); - inet_sk_get_local_port_range(sk, &low, &high); + local_ports = inet_sk_get_local_port_range(sk, &low, &high); + step = local_ports ? 1 : 2; + high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; - if (likely(remaining > 1)) + if (!local_ports && remaining > 1) remaining &= ~1U; get_random_sleepable_once(table_perturb, @@ -1040,10 +1037,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, /* In first pass we try ports of @low parity. * inet_csk_get_port() does the opposite choice. */ - offset &= ~1U; + if (!local_ports) + offset &= ~1U; other_parity_scan: port = low + offset; - for (i = 0; i < remaining; i += 2, port += 2) { + for (i = 0; i < remaining; i += step, port += step) { if (unlikely(port >= high)) port -= remaining; if (inet_is_local_reserved_port(net, port)) @@ -1060,7 +1058,7 @@ other_parity_scan: if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port; - WARN_ON(hlist_empty(&tb->owners)); + WARN_ON(hlist_empty(&tb->bhash2)); if (!check_established(death_row, sk, port, &tw)) goto ok; @@ -1083,10 +1081,11 @@ next_port: cond_resched(); } - offset++; - if ((offset & 1) && remaining > 1) - goto other_parity_scan; - + if (!local_ports) { + offset++; + if ((offset & 1) && remaining > 1) + goto other_parity_scan; + } return -EADDRNOTAVAIL; ok: @@ -1099,7 +1098,7 @@ ok: tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, - head2, port, l3mdev, sk); + head2, tb, sk); if (!tb2) goto error; } @@ -1109,8 +1108,8 @@ ok: * on low contention the randomness is maximal and on high contention * it may be inexistent. */ - i = max_t(int, i, get_random_u32_below(8) * 2); - WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); + i = max_t(int, i, get_random_u32_below(8) * step); + WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step); /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index dd37a5bf6881..5befa4de5b24 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -35,13 +35,11 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, if (!tb) return; - __hlist_del(&tw->tw_bind_node); + __sk_del_bind_node((struct sock *)tw); tw->tw_tb = NULL; - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); - - __hlist_del(&tw->tw_bind2_node); tw->tw_tb2 = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); __sock_put((struct sock *)tw); } @@ -94,18 +92,6 @@ static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, hlist_nulls_add_head_rcu(&tw->tw_node, list); } -static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, - struct hlist_head *list) -{ - hlist_add_head(&tw->tw_bind_node, list); -} - -static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw, - struct hlist_head *list) -{ - hlist_add_head(&tw->tw_bind2_node, list); -} - /* * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it @@ -133,11 +119,10 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); - inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); tw->tw_tb2 = icsk->icsk_bind2_hash; WARN_ON(!icsk->icsk_bind2_hash); - inet_twsk_add_bind2_node(tw, &tw->tw_tb2->deathrow); + sk_add_bind_node((struct sock *)tw, &tw->tw_tb2->owners); spin_unlock(&bhead2->lock); spin_unlock(&bhead->lock); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 2efc53526a38..7aa9dc0e6760 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -47,8 +47,6 @@ #include <linux/errqueue.h> #include <linux/uaccess.h> -#include <linux/bpfilter.h> - /* * SOL_IP control messages. */ @@ -775,7 +773,7 @@ static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; - if (optlen > READ_ONCE(sysctl_optmem_max)) + if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); @@ -811,7 +809,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (optlen < size0) return -EINVAL; - if (optlen > READ_ONCE(sysctl_optmem_max) - 4) + if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); @@ -1055,6 +1053,19 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, case IP_TOS: /* This sets both TOS and Precedence */ ip_sock_set_tos(sk, val); return 0; + case IP_LOCAL_PORT_RANGE: + { + u16 lo = val; + u16 hi = val >> 16; + + if (optlen != sizeof(u32)) + return -EINVAL; + if (lo != 0 && hi != 0 && lo > hi) + return -EINVAL; + + WRITE_ONCE(inet->local_port_range, val); + return 0; + } } err = 0; @@ -1241,7 +1252,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, if (optlen < IP_MSFILTER_SIZE(0)) goto e_inval; - if (optlen > READ_ONCE(sysctl_optmem_max)) { + if (optlen > READ_ONCE(net->core.sysctl_optmem_max)) { err = -ENOBUFS; break; } @@ -1332,20 +1343,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, err = xfrm_user_policy(sk, optname, optval, optlen); break; - case IP_LOCAL_PORT_RANGE: - { - const __u16 lo = val; - const __u16 hi = val >> 16; - - if (optlen != sizeof(__u32)) - goto e_inval; - if (lo != 0 && hi != 0 && lo > hi) - goto e_inval; - - inet->local_port_range.lo = lo; - inet->local_port_range.hi = hi; - break; - } default: err = -ENOPROTOOPT; break; @@ -1412,11 +1409,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, return -ENOPROTOOPT; err = do_ip_setsockopt(sk, level, optname, optval, optlen); -#if IS_ENABLED(CONFIG_BPFILTER_UMH) - if (optname >= BPFILTER_IPT_SO_SET_REPLACE && - optname < BPFILTER_IPT_SET_MAX) - err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen); -#endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_HDRINCL && @@ -1692,6 +1684,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; return 0; } + case IP_LOCAL_PORT_RANGE: + val = READ_ONCE(inet->local_port_range); + goto copyval; } if (needs_rtnl) @@ -1721,9 +1716,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, else err = ip_get_mcast_msfilter(sk, optval, optlen, len); goto out; - case IP_LOCAL_PORT_RANGE: - val = inet->local_port_range.hi << 16 | inet->local_port_range.lo; - break; case IP_PROTOCOL: val = inet_sk(sk)->inet_num; break; @@ -1764,11 +1756,6 @@ int ip_getsockopt(struct sock *sk, int level, err = do_ip_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); -#if IS_ENABLED(CONFIG_BPFILTER_UMH) - if (optname >= BPFILTER_IPT_SO_GET_INFO && - optname < BPFILTER_IPT_GET_MAX) - err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); -#endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9e222a57bc2b..9d6f59531b3a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -253,7 +253,7 @@ static int __net_init ipmr_rules_init(struct net *net) goto err1; } - err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0); + err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT); if (err < 0) goto err2; @@ -1025,6 +1025,10 @@ static int ipmr_cache_report(const struct mr_table *mrt, struct sk_buff *skb; int ret; + mroute_sk = rcu_dereference(mrt->mroute_sk); + if (!mroute_sk) + return -EINVAL; + if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); else @@ -1069,7 +1073,8 @@ static int ipmr_cache_report(const struct mr_table *mrt, msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi; msg->im_vif_hi = vifi >> 8; - skb_dst_set(skb, dst_clone(skb_dst(pkt))); + ipv4_pktinfo_prepare(mroute_sk, pkt); + memcpy(skb->cb, pkt->cb, sizeof(skb->cb)); /* Add our header */ igmp = skb_put(skb, sizeof(struct igmphdr)); igmp->type = assert; @@ -1079,12 +1084,6 @@ static int ipmr_cache_report(const struct mr_table *mrt, skb->transport_header = skb->network_header; } - mroute_sk = rcu_dereference(mrt->mroute_sk); - if (!mroute_sk) { - kfree_skb(skb); - return -EINVAL; - } - igmpmsg_netlink_event(mrt, skb); /* Deliver to mrouted */ diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index d37282c06e3d..61f1c96cfe63 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -189,12 +189,14 @@ __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp) * Check if a ack sequence number is a valid syncookie. * Return the decoded mss if it is, or 0 if not. */ -int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, - u32 cookie) +int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th) { + __u32 cookie = ntohl(th->ack_seq) - 1; __u32 seq = ntohl(th->seq) - 1; - __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, - th->source, th->dest, seq); + __u32 mssind; + + mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, + th->source, th->dest, seq); return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } @@ -202,7 +204,7 @@ EXPORT_SYMBOL_GPL(__cookie_v4_check); struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst, u32 tsoff) + struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; @@ -212,7 +214,6 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, NULL, &own_req); if (child) { refcount_set(&req->rsk_refcnt, 1); - tcp_sk(child)->tsoffset = tsoff; sock_rps_save_rxhash(child, skb); if (rsk_drop_req(req)) { @@ -269,26 +270,46 @@ bool cookie_timestamp_decode(const struct net *net, } EXPORT_SYMBOL(cookie_timestamp_decode); -bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, - const struct net *net, const struct dst_entry *dst) +static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb, + struct request_sock *req) { - bool ecn_ok = tcp_opt->rcv_tsecr & TS_OPT_ECN; + struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_request_sock *treq = tcp_rsk(req); + const struct tcphdr *th = tcp_hdr(skb); - if (!ecn_ok) - return false; + req->num_retrans = 0; - if (READ_ONCE(net->ipv4.sysctl_tcp_ecn)) - return true; + ireq->ir_num = ntohs(th->dest); + ireq->ir_rmt_port = th->source; + ireq->ir_iif = inet_request_bound_dev_if(sk, skb); + ireq->ir_mark = inet_request_mark(sk, skb); + + if (IS_ENABLED(CONFIG_SMC)) + ireq->smc_ok = 0; - return dst_feature(dst, RTAX_FEATURE_ECN); + treq->snt_synack = 0; + treq->tfo_listener = false; + treq->txhash = net_tx_rndhash(); + treq->rcv_isn = ntohl(th->seq) - 1; + treq->snt_isn = ntohl(th->ack_seq) - 1; + treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; + treq->req_usec_ts = false; + +#if IS_ENABLED(CONFIG_MPTCP) + treq->is_mptcp = sk_is_mptcp(sk); + if (treq->is_mptcp) + return mptcp_subflow_init_cookie_req(req, sk, skb); +#endif + + return 0; } -EXPORT_SYMBOL(cookie_ecn_ok); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, - const struct tcp_request_sock_ops *af_ops, - struct sock *sk, - struct sk_buff *skb) + struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *tcp_opt, + int mss, u32 tsoff) { + struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct request_sock *req; @@ -300,126 +321,109 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, if (!req) return NULL; - treq = tcp_rsk(req); + if (cookie_tcp_reqsk_init(sk, skb, req)) { + reqsk_free(req); + return NULL; + } - /* treq->af_specific might be used to perform TCP_MD5 lookup */ - treq->af_specific = af_ops; + ireq = inet_rsk(req); + treq = tcp_rsk(req); - treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; - treq->req_usec_ts = false; + req->mss = mss; + req->ts_recent = tcp_opt->saw_tstamp ? tcp_opt->rcv_tsval : 0; -#if IS_ENABLED(CONFIG_MPTCP) - treq->is_mptcp = sk_is_mptcp(sk); - if (treq->is_mptcp) { - int err = mptcp_subflow_init_cookie_req(req, sk, skb); + ireq->snd_wscale = tcp_opt->snd_wscale; + ireq->tstamp_ok = tcp_opt->saw_tstamp; + ireq->sack_ok = tcp_opt->sack_ok; + ireq->wscale_ok = tcp_opt->wscale_ok; + ireq->ecn_ok = !!(tcp_opt->rcv_tsecr & TS_OPT_ECN); - if (err) { - reqsk_free(req); - return NULL; - } - } -#endif + treq->ts_off = tsoff; return req; } EXPORT_SYMBOL_GPL(cookie_tcp_reqsk_alloc); -/* On input, sk is a listener. - * Output is listener if incoming packet would not create a child - * NULL if memory could not be allocated. - */ -struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) +static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, + struct sk_buff *skb) { - struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct tcp_options_received tcp_opt; - struct inet_request_sock *ireq; - struct tcp_request_sock *treq; - struct tcp_sock *tp = tcp_sk(sk); - const struct tcphdr *th = tcp_hdr(skb); - __u32 cookie = ntohl(th->ack_seq) - 1; - struct sock *ret = sk; - struct request_sock *req; - int full_space, mss; - struct rtable *rt; - __u8 rcv_wscale; - struct flowi4 fl4; u32 tsoff = 0; - int l3index; - - if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) || - !th->ack || th->rst) - goto out; + int mss; if (tcp_synq_no_recent_overflow(sk)) goto out; - mss = __cookie_v4_check(ip_hdr(skb), th, cookie); - if (mss == 0) { - __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); + mss = __cookie_v4_check(ip_hdr(skb), tcp_hdr(skb)); + if (!mss) { + __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED); goto out; } - __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); + __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL); + tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { - tsoff = secure_tcp_ts_off(sock_net(sk), + tsoff = secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); tcp_opt.rcv_tsecr -= tsoff; } - if (!cookie_timestamp_decode(sock_net(sk), &tcp_opt)) + if (!cookie_timestamp_decode(net, &tcp_opt)) goto out; - ret = NULL; - req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, - &tcp_request_sock_ipv4_ops, sk, skb); - if (!req) + return cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb, + &tcp_opt, mss, tsoff); +out: + return ERR_PTR(-EINVAL); +} + +/* On input, sk is a listener. + * Output is listener if incoming packet would not create a child + * NULL if memory could not be allocated. + */ +struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) +{ + struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; + const struct tcphdr *th = tcp_hdr(skb); + struct tcp_sock *tp = tcp_sk(sk); + struct inet_request_sock *ireq; + struct net *net = sock_net(sk); + struct request_sock *req; + struct sock *ret = sk; + struct flowi4 fl4; + struct rtable *rt; + __u8 rcv_wscale; + int full_space; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) || + !th->ack || th->rst) goto out; + req = cookie_tcp_check(net, sk, skb); + if (IS_ERR(req)) + goto out; + if (!req) + goto out_drop; + ireq = inet_rsk(req); - treq = tcp_rsk(req); - treq->rcv_isn = ntohl(th->seq) - 1; - treq->snt_isn = cookie; - treq->ts_off = 0; - treq->txhash = net_tx_rndhash(); - req->mss = mss; - ireq->ir_num = ntohs(th->dest); - ireq->ir_rmt_port = th->source; + sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); - ireq->ir_mark = inet_request_mark(sk, skb); - ireq->snd_wscale = tcp_opt.snd_wscale; - ireq->sack_ok = tcp_opt.sack_ok; - ireq->wscale_ok = tcp_opt.wscale_ok; - ireq->tstamp_ok = tcp_opt.saw_tstamp; - req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; - treq->snt_synack = 0; - treq->tfo_listener = false; - - if (IS_ENABLED(CONFIG_SMC)) - ireq->smc_ok = 0; - - ireq->ir_iif = inet_request_bound_dev_if(sk, skb); - - l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); - tcp_ao_syncookie(sk, skb, treq, AF_INET, l3index); /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) */ - RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(sock_net(sk), skb)); + RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); - if (security_inet_conn_request(sk, skb, req)) { - reqsk_free(req); - goto out; - } + if (security_inet_conn_request(sk, skb, req)) + goto out_free; - req->num_retrans = 0; + tcp_ao_syncookie(sk, skb, req, AF_INET); /* * We need to lookup the route here to get at the correct @@ -433,11 +437,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) opt->srr ? opt->faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi_common(&fl4)); - rt = ip_route_output_key(sock_net(sk), &fl4); - if (IS_ERR(rt)) { - reqsk_free(req); - goto out; - } + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + goto out_free; /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); @@ -453,13 +455,18 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) dst_metric(&rt->dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; - ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst); + ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst); - ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst, tsoff); + ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup * Normal sockets get it right from inet_csk_route_child_sock() */ if (ret) inet_sk(ret)->cork.fl.u.ip4 = fl4; -out: return ret; +out: + return ret; +out_free: + reqsk_free(req); +out_drop: + return NULL; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index f63a545a7374..7e4f16a7dcc1 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -50,26 +50,22 @@ static int tcp_plb_max_cong_thresh = 256; static int sysctl_tcp_low_latency __read_mostly; /* Update system visible IP port range */ -static void set_local_port_range(struct net *net, int range[2]) +static void set_local_port_range(struct net *net, unsigned int low, unsigned int high) { - bool same_parity = !((range[0] ^ range[1]) & 1); + bool same_parity = !((low ^ high) & 1); - write_seqlock_bh(&net->ipv4.ip_local_ports.lock); if (same_parity && !net->ipv4.ip_local_ports.warned) { net->ipv4.ip_local_ports.warned = true; pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n"); } - net->ipv4.ip_local_ports.range[0] = range[0]; - net->ipv4.ip_local_ports.range[1] = range[1]; - write_sequnlock_bh(&net->ipv4.ip_local_ports.lock); + WRITE_ONCE(net->ipv4.ip_local_ports.range, high << 16 | low); } /* Validate changes from /proc interface. */ static int ipv4_local_port_range(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = - container_of(table->data, struct net, ipv4.ip_local_ports.range); + struct net *net = table->data; int ret; int range[2]; struct ctl_table tmp = { @@ -93,7 +89,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, (range[0] < READ_ONCE(net->ipv4.sysctl_ip_prot_sock))) ret = -EINVAL; else - set_local_port_range(net, range); + set_local_port_range(net, range[0], range[1]); } return ret; @@ -733,8 +729,8 @@ static struct ctl_table ipv4_net_table[] = { }, { .procname = "ip_local_port_range", - .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), - .data = &init_net.ipv4.ip_local_ports.range, + .maxlen = 0, + .data = &init_net, .mode = 0644, .proc_handler = ipv4_local_port_range, }, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ff6838ca2e58..1baa484d2190 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1849,7 +1849,6 @@ static int receive_fallback_to_copy(struct sock *sk, { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; - struct iovec iov; int err; zc->length = 0; @@ -1858,8 +1857,8 @@ static int receive_fallback_to_copy(struct sock *sk, if (copy_address != zc->copybuf_address) return -EINVAL; - err = import_single_range(ITER_DEST, (void __user *)copy_address, - inq, &iov, &msg.msg_iter); + err = import_ubuf(ITER_DEST, (void __user *)copy_address, inq, + &msg.msg_iter); if (err) return err; @@ -1886,14 +1885,13 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; - struct iovec iov; int err; if (copy_address != zc->copybuf_address) return -EINVAL; - err = import_single_range(ITER_DEST, (void __user *)copy_address, - copylen, &iov, &msg.msg_iter); + err = import_ubuf(ITER_DEST, (void __user *)copy_address, copylen, + &msg.msg_iter); if (err) return err; err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); @@ -2605,6 +2603,7 @@ void tcp_set_state(struct sock *sk, int state) BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_BOUND_INACTIVE != (int)TCP_BOUND_INACTIVE); BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); /* bpf uapi header bpf.h defines an anonymous enum with values @@ -4585,6 +4584,97 @@ static void __init tcp_init_mem(void) sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } +static void __init tcp_struct_check(void) +{ + /* TX read-mostly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40); + + /* TXRX read-mostly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_wnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, mss_cache); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_cwnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_txrx, 31); + + /* RX read-mostly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rcv_tstamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, retrans_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, advmss); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, urg_data); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, lost); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69); + + /* TX read-write hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, data_segs_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, bytes_sent); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, snd_sml); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_start); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_stat); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, write_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, pushed_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_clock_cache); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_mstamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 113); + + /* TXRX read-write hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, window_clamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, srtt_us); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, packets_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 76); + + /* RX read-write hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, data_segs_in); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_wup); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, max_packets_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, cwnd_usage_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 99); +} + void __init tcp_init(void) { int max_rshare, max_wshare, cnt; @@ -4595,6 +4685,8 @@ void __init tcp_init(void) BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof_field(struct sk_buff, cb)); + tcp_struct_check(); + percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index f8308d3f565e..87db432c6bb4 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -844,18 +844,30 @@ static struct tcp_ao_key *tcp_ao_inbound_lookup(unsigned short int family, } void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, - struct tcp_request_sock *treq, - unsigned short int family, int l3index) + struct request_sock *req, unsigned short int family) { + struct tcp_request_sock *treq = tcp_rsk(req); const struct tcphdr *th = tcp_hdr(skb); const struct tcp_ao_hdr *aoh; struct tcp_ao_key *key; + int l3index; + + /* treq->af_specific is used to perform TCP_AO lookup + * in tcp_create_openreq_child(). + */ +#if IS_ENABLED(CONFIG_IPV6) + if (family == AF_INET6) + treq->af_specific = &tcp_request_sock_ipv6_ops; + else +#endif + treq->af_specific = &tcp_request_sock_ipv4_ops; treq->used_tcp_ao = false; if (tcp_parse_auth_options(th, NULL, &aoh) || !aoh) return; + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), inet_rsk(req)->ir_iif); key = tcp_ao_inbound_lookup(family, sk, skb, -1, aoh->keyid, l3index); if (!key) /* Key not found, continue without TCP-AO */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 701cb87043f2..df7b13f0e5e0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -202,23 +202,17 @@ static void bpf_skops_established(struct sock *sk, int bpf_op, } #endif -static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb, - unsigned int len) +static __cold void tcp_gro_dev_warn(const struct sock *sk, const struct sk_buff *skb, + unsigned int len) { - static bool __once __read_mostly; + struct net_device *dev; - if (!__once) { - struct net_device *dev; - - __once = true; - - rcu_read_lock(); - dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); - if (!dev || len >= dev->mtu) - pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n", - dev ? dev->name : "Unknown driver"); - rcu_read_unlock(); - } + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); + if (!dev || len >= READ_ONCE(dev->mtu)) + pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n", + dev ? dev->name : "Unknown driver"); + rcu_read_unlock(); } /* Adapt the MSS value used to make delayed ack decision to the @@ -250,9 +244,8 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); /* Account for possibly-removed options */ - if (unlikely(len > icsk->icsk_ack.rcv_mss + - MAX_TCP_OPTION_SPACE)) - tcp_gro_dev_warn(sk, skb, len); + DO_ONCE_LITE_IF(len > icsk->icsk_ack.rcv_mss + MAX_TCP_OPTION_SPACE, + tcp_gro_dev_warn, sk, skb, len); /* If the skb has a len of exactly 1*MSS and has the PSH bit * set then it is likely the end of an application write. So * more data may not be arriving soon, and yet the data sender diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 1f9f6c1c196b..d1ad20ce1c8c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -626,7 +626,6 @@ void tcp_retransmit_timer(struct sock *sk) * implemented ftp to mars will work nicely. We will have to fix * the 120 second clamps though! */ - icsk->icsk_backoff++; out_reset_timer: /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is @@ -647,11 +646,12 @@ out_reset_timer: tcp_rto_min(sk), TCP_RTO_MAX); } else if (sk->sk_state != TCP_SYN_SENT || - icsk->icsk_backoff > + tp->total_rto > READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) { /* Use normal (exponential) backoff unless linear timeouts are * activated. */ + icsk->icsk_backoff++; icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |