From fff1f3001cc58b5064a0f1154a7ac09b76f29c44 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:23 -0700 Subject: tcp: add a spinlock to protect struct request_sock_queue struct request_sock_queue fields are currently protected by the listener 'lock' (not a real spinlock) We need to add a private spinlock instead, so that softirq handlers creating children do not have to worry with backlog notion that the listener 'lock' carries. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/request_sock.c | 1 + net/ipv4/inet_connection_sock.c | 21 +++++++-------------- 2 files changed, 8 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/core/request_sock.c b/net/core/request_sock.c index e22cfa4ed25f..8d9fd31d3d06 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -58,6 +58,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, return -ENOMEM; get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); + spin_lock_init(&queue->rskq_lock); spin_lock_init(&queue->syn_wait_lock); spin_lock_init(&queue->fastopenq.lock); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index e1527882a578..0085612b9e49 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -330,10 +330,9 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) if (error) goto out_err; } - req = reqsk_queue_remove(queue); + req = reqsk_queue_remove(queue, sk); newsk = req->sk; - sk_acceptq_removed(sk); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { spin_lock_bh(&queue->fastopenq.lock); @@ -832,11 +831,7 @@ void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - struct request_sock *acc_req; - struct request_sock *req; - - /* make all the listen_opt local to us */ - acc_req = reqsk_queue_yank_acceptq(queue); + struct request_sock *next, *req; /* Following specs, it would be better either to send FIN * (and enter FIN-WAIT-1, it is normal close) @@ -848,11 +843,9 @@ void inet_csk_listen_stop(struct sock *sk) */ reqsk_queue_destroy(queue); - while ((req = acc_req) != NULL) { + while ((req = reqsk_queue_remove(queue, sk)) != NULL) { struct sock *child = req->sk; - acc_req = req->dl_next; - local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); @@ -882,18 +875,18 @@ void inet_csk_listen_stop(struct sock *sk) local_bh_enable(); sock_put(child); - sk_acceptq_removed(sk); reqsk_put(req); } if (queue->fastopenq.rskq_rst_head) { /* Free all the reqs queued in rskq_rst_head. */ spin_lock_bh(&queue->fastopenq.lock); - acc_req = queue->fastopenq.rskq_rst_head; + req = queue->fastopenq.rskq_rst_head; queue->fastopenq.rskq_rst_head = NULL; spin_unlock_bh(&queue->fastopenq.lock); - while ((req = acc_req) != NULL) { - acc_req = req->dl_next; + while (req != NULL) { + next = req->dl_next; reqsk_put(req); + req = next; } } WARN_ON(sk->sk_ack_backlog); -- cgit From aac065c50aba0c534a929aeb687eb68c58e523b8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:24 -0700 Subject: tcp: move qlen/young out of struct listen_sock qlen_inc & young_inc were protected by listener lock, while qlen_dec & young_dec were atomic fields. Everything needs to be atomic for upcoming lockless listener. Also move qlen/young in request_sock_queue as we'll get rid of struct listen_sock eventually. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/request_sock.h | 40 ++++++++++------------------------------ net/core/request_sock.c | 8 ++++---- net/ipv4/inet_connection_sock.c | 6 +++--- net/ipv4/inet_diag.c | 2 +- 4 files changed, 18 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 202e36163ae3..d128e7f89042 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -122,14 +122,7 @@ extern int sysctl_max_syn_backlog; * @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs */ struct listen_sock { - int qlen_inc; /* protected by listener lock */ - int young_inc;/* protected by listener lock */ - - /* following fields can be updated by timer */ - atomic_t qlen_dec; /* qlen = qlen_inc - qlen_dec */ - atomic_t young_dec; - - u32 max_qlen_log ____cacheline_aligned_in_smp; + u32 max_qlen_log; u32 synflood_warned; u32 hash_rnd; u32 nr_table_entries; @@ -179,6 +172,9 @@ struct request_sock_queue { spinlock_t rskq_lock; u8 rskq_defer_accept; + atomic_t qlen; + atomic_t young; + struct request_sock *rskq_accept_head; struct request_sock *rskq_accept_tail; struct listen_sock *listen_opt; @@ -242,41 +238,25 @@ static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue static inline void reqsk_queue_removed(struct request_sock_queue *queue, const struct request_sock *req) { - struct listen_sock *lopt = queue->listen_opt; - if (req->num_timeout == 0) - atomic_inc(&lopt->young_dec); - atomic_inc(&lopt->qlen_dec); + atomic_dec(&queue->young); + atomic_dec(&queue->qlen); } static inline void reqsk_queue_added(struct request_sock_queue *queue) { - struct listen_sock *lopt = queue->listen_opt; - - lopt->young_inc++; - lopt->qlen_inc++; -} - -static inline int listen_sock_qlen(const struct listen_sock *lopt) -{ - return lopt->qlen_inc - atomic_read(&lopt->qlen_dec); -} - -static inline int listen_sock_young(const struct listen_sock *lopt) -{ - return lopt->young_inc - atomic_read(&lopt->young_dec); + atomic_inc(&queue->young); + atomic_inc(&queue->qlen); } static inline int reqsk_queue_len(const struct request_sock_queue *queue) { - const struct listen_sock *lopt = queue->listen_opt; - - return lopt ? listen_sock_qlen(lopt) : 0; + return atomic_read(&queue->qlen); } static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) { - return listen_sock_young(queue->listen_opt); + return atomic_read(&queue->young); } static inline int reqsk_queue_is_full(const struct request_sock_queue *queue) diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 8d9fd31d3d06..5ca624cea04c 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -102,7 +102,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) /* make all the listen_opt local to us */ struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); - if (listen_sock_qlen(lopt) != 0) { + if (reqsk_queue_len(queue) != 0) { unsigned int i; for (i = 0; i < lopt->nr_table_entries; i++) { @@ -116,7 +116,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) * or risk a dead lock. */ spin_unlock_bh(&queue->syn_wait_lock); - atomic_inc(&lopt->qlen_dec); + atomic_dec(&queue->qlen); if (del_timer_sync(&req->rsk_timer)) reqsk_put(req); reqsk_put(req); @@ -126,8 +126,8 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) } } - if (WARN_ON(listen_sock_qlen(lopt) != 0)) - pr_err("qlen %u\n", listen_sock_qlen(lopt)); + if (WARN_ON(reqsk_queue_len(queue) != 0)) + pr_err("qlen %u\n", reqsk_queue_len(queue)); kvfree(lopt); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 0085612b9e49..093ef04e6ebf 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -640,9 +640,9 @@ static void reqsk_timer_handler(unsigned long data) * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ - qlen = listen_sock_qlen(lopt); + qlen = reqsk_queue_len(queue); if (qlen >> (lopt->max_qlen_log - 1)) { - int young = listen_sock_young(lopt) << 1; + int young = reqsk_queue_len_young(queue) << 1; while (thresh > 2) { if (qlen < young) @@ -664,7 +664,7 @@ static void reqsk_timer_handler(unsigned long data) unsigned long timeo; if (req->num_timeout++ == 0) - atomic_inc(&lopt->young_dec); + atomic_dec(&queue->young); timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); mod_timer_pinned(&req->rsk_timer, jiffies + timeo); return; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index c3b1f3a0f4cf..0ac1d68dc8a6 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -753,7 +753,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); lopt = icsk->icsk_accept_queue.listen_opt; - if (!lopt || !listen_sock_qlen(lopt)) + if (!lopt || !reqsk_queue_len(&icsk->icsk_accept_queue)) goto out; if (bc) { -- cgit From 8d2675f1e464aa5cedda63849adecffd8d33fead Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:25 -0700 Subject: tcp: move synflood_warned into struct request_sock_queue long term plan is to remove struct listen_sock when its hash table is no longer there. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/request_sock.h | 2 +- net/ipv4/tcp_input.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index d128e7f89042..273fb7235ce3 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -123,7 +123,6 @@ extern int sysctl_max_syn_backlog; */ struct listen_sock { u32 max_qlen_log; - u32 synflood_warned; u32 hash_rnd; u32 nr_table_entries; struct request_sock *syn_table[0]; @@ -171,6 +170,7 @@ struct fastopen_queue { struct request_sock_queue { spinlock_t rskq_lock; u8 rskq_defer_accept; + u32 synflood_warned; atomic_t qlen; atomic_t young; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e58cbcd2f07e..8b0ce73c2049 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6068,9 +6068,9 @@ static bool tcp_syn_flood_action(const struct sock *sk, const struct sk_buff *skb, const char *proto) { + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; bool want_cookie = false; - struct listen_sock *lopt; #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { @@ -6081,10 +6081,9 @@ static bool tcp_syn_flood_action(const struct sock *sk, #endif NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; - if (!lopt->synflood_warned && + if (!queue->synflood_warned && sysctl_tcp_syncookies != 2 && - xchg(&lopt->synflood_warned, 1) == 0) + xchg(&queue->synflood_warned, 1) == 0) pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", proto, ntohs(tcp_hdr(skb)->dest), msg); -- cgit From 38cb52455c2c3e8b5751350a3fb32e43e82e129a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:26 -0700 Subject: tcp: call sk_mark_napi_id() on the child, not the listener This fixes a typo : We want to store the NAPI id on child socket. Presumably nobody really uses busy polling, on short lived flows. Fixes: 3d97379a67486 ("tcp: move sk_mark_napi_id() at the right place") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 64ece718d66c..2fb0945b9d83 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1411,7 +1411,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(sk, skb); + sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2ae95e1d03e1..e463583c39ee 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1265,7 +1265,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) */ if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(sk, skb); + sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) goto reset; if (opt_skb) -- cgit From ba8e275a457397ab06f3567cf7bef0d78a43ae7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:28 -0700 Subject: tcp: cleanup tcp_v[46]_inbound_md5_hash() We'll soon have to call tcp_v[46]_inbound_md5_hash() twice. Also add const attribute to the socket, as it might be the unlocked listener for SYN packets. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 16 ++++++---------- net/ipv6/tcp_ipv6.c | 10 ++++++---- 2 files changed, 12 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2fb0945b9d83..56f8c6395966 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1112,10 +1112,13 @@ clear_hash_noput: } EXPORT_SYMBOL(tcp_v4_md5_hash_skb); +#endif + /* Called with rcu_read_lock() */ -static bool tcp_v4_inbound_md5_hash(struct sock *sk, +static bool tcp_v4_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb) { +#ifdef CONFIG_TCP_MD5SIG /* * This gets called for each TCP segment that arrives * so we want to be efficient. @@ -1165,8 +1168,9 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, return true; } return false; -} #endif + return false; +} static void tcp_v4_init_req(struct request_sock *req, const struct sock *sk_listener, @@ -1607,16 +1611,8 @@ process: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; -#ifdef CONFIG_TCP_MD5SIG - /* - * We really want to reject the packet as early as possible - * if: - * o We're expecting an MD5'd packet and this is no MD5 tcp option - * o There is an MD5 option and we're not expecting one - */ if (tcp_v4_inbound_md5_hash(sk, skb)) goto discard_and_relse; -#endif nf_reset(skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e463583c39ee..65e797dba504 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -622,8 +622,12 @@ clear_hash_noput: return 1; } -static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +#endif + +static bool tcp_v6_inbound_md5_hash(const struct sock *sk, + const struct sk_buff *skb) { +#ifdef CONFIG_TCP_MD5SIG const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; const struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -660,9 +664,9 @@ static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) &ip6h->daddr, ntohs(th->dest)); return true; } +#endif return false; } -#endif static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, @@ -1408,10 +1412,8 @@ process: tcp_v6_fill_cb(skb, hdr, th); -#ifdef CONFIG_TCP_MD5SIG if (tcp_v6_inbound_md5_hash(sk, skb)) goto discard_and_relse; -#endif if (sk_filter(sk, skb)) goto discard_and_relse; -- cgit From 9cfd08601f49a4536e4407286b5f07b24293e474 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:29 -0700 Subject: tcp: remove BUG_ON() in tcp_check_req() Once listener is lockless, its sk_state can change anytime. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 897e34273ba3..9adf1e2c3170 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -578,8 +578,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool paws_reject = false; - BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); - tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(skb, &tmp_opt, 0, NULL); -- cgit From aa3a0c8ce651b5e16124866b0a10d1b90b9ef022 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:30 -0700 Subject: tcp: get_openreq[46]() changes When request sockets are no longer in a per listener hash table but on regular TCP ehash, we need to access listener uid through req->rsk_listener get_openreq6() also gets a const for its request socket argument. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - net/ipv4/tcp_ipv4.c | 8 ++++---- net/ipv6/tcp_ipv6.c | 7 ++++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 2c7dfe52f473..a26341d2ad67 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1637,7 +1637,6 @@ struct tcp_iter_state { enum tcp_seq_states state; struct sock *syn_wait_sk; int bucket, offset, sbucket, num; - kuid_t uid; loff_t last_pos; }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 56f8c6395966..a33101616215 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1871,7 +1871,6 @@ get_sk: spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); if (reqsk_queue_len(&icsk->icsk_accept_queue)) { start_req: - st->uid = sock_i_uid(sk); st->syn_wait_sk = sk; st->state = TCP_SEQ_STATE_OPENREQ; st->sbucket = 0; @@ -2151,7 +2150,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) EXPORT_SYMBOL(tcp_proc_unregister); static void get_openreq4(const struct request_sock *req, - struct seq_file *f, int i, kuid_t uid) + struct seq_file *f, int i) { const struct inet_request_sock *ireq = inet_rsk(req); long delta = req->rsk_timer.expires - jiffies; @@ -2168,7 +2167,8 @@ static void get_openreq4(const struct request_sock *req, 1, /* timers active (only the expire timer) */ jiffies_delta_to_clock_t(delta), req->num_timeout, - from_kuid_munged(seq_user_ns(f), uid), + from_kuid_munged(seq_user_ns(f), + sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, @@ -2278,7 +2278,7 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) get_tcp4_sock(v, seq, st->num); break; case TCP_SEQ_STATE_OPENREQ: - get_openreq4(v, seq, st->num, st->uid); + get_openreq4(v, seq, st->num); break; } out: diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 65e797dba504..cadb44a2d34e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1635,7 +1635,7 @@ static void tcp_v6_destroy_sock(struct sock *sk) #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, - struct request_sock *req, int i, kuid_t uid) + const struct request_sock *req, int i) { long ttd = req->rsk_timer.expires - jiffies; const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr; @@ -1659,7 +1659,8 @@ static void get_openreq6(struct seq_file *seq, 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->num_timeout, - from_kuid_munged(seq_user_ns(seq), uid), + from_kuid_munged(seq_user_ns(seq), + sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); @@ -1773,7 +1774,7 @@ static int tcp6_seq_show(struct seq_file *seq, void *v) get_tcp6_sock(seq, v, st->num); break; case TCP_SEQ_STATE_OPENREQ: - get_openreq6(seq, v, st->num, st->uid); + get_openreq6(seq, v, st->num); break; } out: -- cgit From 2feda34192a379f8b35a7c6c5826b2f23e884f32 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:31 -0700 Subject: tcp/dccp: remove inet_csk_reqsk_queue_added() timeout argument This is no longer used. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 3 +-- net/ipv4/inet_connection_sock.c | 2 +- net/ipv6/inet6_connection_sock.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index ee54f21a8113..b2e2e30befa9 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -282,8 +282,7 @@ static inline void inet_csk_reqsk_queue_add(struct sock *sk, void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, unsigned long timeout); -static inline void inet_csk_reqsk_queue_added(struct sock *sk, - const unsigned long timeout) +static inline void inet_csk_reqsk_queue_added(struct sock *sk) { reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 093ef04e6ebf..e62f04775c93 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -531,7 +531,7 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, lopt->hash_rnd, lopt->nr_table_entries); reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); - inet_csk_reqsk_queue_added(sk, timeout); + inet_csk_reqsk_queue_added(sk); } EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 163bfef3e5db..ea915aa5e4e2 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -157,7 +157,7 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk, lopt->hash_rnd, lopt->nr_table_entries); reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); - inet_csk_reqsk_queue_added(sk, timeout); + inet_csk_reqsk_queue_added(sk); } EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); -- cgit From 079096f103faca2dd87342cca6f23d4b34da8871 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:32 -0700 Subject: tcp/dccp: install syn_recv requests into ehash table In this patch, we insert request sockets into TCP/DCCP regular ehash table (where ESTABLISHED and TIMEWAIT sockets are) instead of using the per listener hash table. ACK packets find SYN_RECV pseudo sockets without having to find and lock the listener. In nominal conditions, this halves pressure on listener lock. Note that this will allow for SO_REUSEPORT refinements, so that we can select a listener using cpu/numa affinities instead of the prior 'consistent hash', since only SYN packets will apply this selection logic. We will shrink listen_sock in the following patch to ease code review. Signed-off-by: Eric Dumazet Cc: Ying Cai Cc: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 4 -- include/net/inet_hashtables.h | 1 + include/net/request_sock.h | 4 -- include/net/tcp.h | 3 - net/core/request_sock.c | 28 +-------- net/dccp/ipv4.c | 64 +++++++------------- net/dccp/ipv6.c | 72 +++++++---------------- net/ipv4/inet_connection_sock.c | 103 +++++++------------------------- net/ipv4/inet_diag.c | 96 +++--------------------------- net/ipv4/inet_hashtables.c | 14 ++++- net/ipv4/syncookies.c | 4 ++ net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 117 +++++++++++-------------------------- net/ipv6/inet6_connection_sock.c | 67 --------------------- net/ipv6/tcp_ipv6.c | 82 ++++++++++++-------------- 15 files changed, 160 insertions(+), 501 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index b2e2e30befa9..730aa034cd3d 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -258,10 +258,6 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk, struct sock *inet_csk_accept(struct sock *sk, int flags, int *err); -struct request_sock *inet_csk_search_req(struct sock *sk, - const __be16 rport, - const __be32 raddr, - const __be32 laddr); int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, bool relax); int inet_csk_get_port(struct sock *sk, unsigned short snum); diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 3fb778d7c875..6683ada25fef 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -205,6 +205,7 @@ void inet_put_port(struct sock *sk); void inet_hashinfo_init(struct inet_hashinfo *h); +int inet_ehash_insert(struct sock *sk, struct sock *osk); void __inet_hash_nolisten(struct sock *sk, struct sock *osk); void __inet_hash(struct sock *sk, struct sock *osk); void inet_hash(struct sock *sk); diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 97c1ba61ed2d..e1850923c4f5 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -266,8 +266,4 @@ static inline int reqsk_queue_is_full(const struct request_sock_queue *queue) return reqsk_queue_len(queue) >> queue->listen_opt->max_qlen_log; } -void reqsk_queue_hash_req(struct request_sock_queue *queue, - u32 hash, struct request_sock *req, - unsigned long timeout); - #endif /* _REQUEST_SOCK_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index a26341d2ad67..225e9561af35 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1618,7 +1618,6 @@ static inline bool tcp_stream_is_thin(struct tcp_sock *tp) /* /proc */ enum tcp_seq_states { TCP_SEQ_STATE_LISTENING, - TCP_SEQ_STATE_OPENREQ, TCP_SEQ_STATE_ESTABLISHED, }; @@ -1717,8 +1716,6 @@ struct tcp_request_sock_ops { int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, u16 queue_mapping, struct tcp_fastopen_cookie *foc); - void (*queue_hash_add)(struct sock *sk, struct request_sock *req, - const unsigned long timeout); }; #ifdef CONFIG_SYN_COOKIES diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 5ca624cea04c..a4b305d8ca2b 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -99,35 +99,9 @@ static inline struct listen_sock *reqsk_queue_yank_listen_sk( void reqsk_queue_destroy(struct request_sock_queue *queue) { - /* make all the listen_opt local to us */ struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); - if (reqsk_queue_len(queue) != 0) { - unsigned int i; - - for (i = 0; i < lopt->nr_table_entries; i++) { - struct request_sock *req; - - spin_lock_bh(&queue->syn_wait_lock); - while ((req = lopt->syn_table[i]) != NULL) { - lopt->syn_table[i] = req->dl_next; - /* Because of following del_timer_sync(), - * we must release the spinlock here - * or risk a dead lock. - */ - spin_unlock_bh(&queue->syn_wait_lock); - atomic_dec(&queue->qlen); - if (del_timer_sync(&req->rsk_timer)) - reqsk_put(req); - reqsk_put(req); - spin_lock_bh(&queue->syn_wait_lock); - } - spin_unlock_bh(&queue->syn_wait_lock); - } - } - - if (WARN_ON(reqsk_queue_len(queue) != 0)) - pr_err("qlen %u\n", reqsk_queue_len(queue)); + /* cleaning is done by req timers */ kvfree(lopt); } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 5b7818c63cec..8910c9567719 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -444,36 +444,6 @@ put_and_exit: } EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); -static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - const struct iphdr *iph = ip_hdr(skb); - struct sock *nsk; - /* Find possible connection requests. */ - struct request_sock *req = inet_csk_search_req(sk, dh->dccph_sport, - iph->saddr, iph->daddr); - if (req) { - nsk = dccp_check_req(sk, skb, req); - if (!nsk) - reqsk_put(req); - return nsk; - } - nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo, - iph->saddr, dh->dccph_sport, - iph->daddr, dh->dccph_dport, - inet_iif(skb)); - if (nsk != NULL) { - if (nsk->sk_state != DCCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } - - return sk; -} - static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -705,18 +675,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) * NOTE: the check for the packet types is done in * dccp_rcv_state_process */ - if (sk->sk_state == DCCP_LISTEN) { - struct sock *nsk = dccp_v4_hnd_req(sk, skb); - - if (nsk == NULL) - goto discard; - - if (nsk != sk) { - if (dccp_child_process(sk, nsk, skb)) - goto reset; - return 0; - } - } if (dccp_rcv_state_process(sk, skb, dh, skb->len)) goto reset; @@ -724,7 +682,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) reset: dccp_v4_ctl_send_reset(sk, skb); -discard: kfree_skb(skb); return 0; } @@ -868,6 +825,27 @@ static int dccp_v4_rcv(struct sk_buff *skb) goto no_dccp_socket; } + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (sk->sk_state == DCCP_LISTEN) + nsk = dccp_check_req(sk, skb, req); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (dccp_child_process(sk, nsk, skb)) { + dccp_v4_ctl_send_reset(sk, skb); + goto discard_it; + } else { + return 0; + } + } /* * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage * o if MinCsCov = 0, only packets with CsCov = 0 are accepted diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index e8753aa3b7a4..1361a3f45df7 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -290,37 +290,6 @@ static struct request_sock_ops dccp6_request_sock_ops = { .syn_ack_timeout = dccp_syn_ack_timeout, }; -static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - const struct ipv6hdr *iph = ipv6_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - req = inet6_csk_search_req(sk, dh->dccph_sport, &iph->saddr, - &iph->daddr, inet6_iif(skb)); - if (req) { - nsk = dccp_check_req(sk, skb, req); - if (!nsk) - reqsk_put(req); - return nsk; - } - nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo, - &iph->saddr, dh->dccph_sport, - &iph->daddr, ntohs(dh->dccph_dport), - inet6_iif(skb)); - if (nsk != NULL) { - if (nsk->sk_state != DCCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } - - return sk; -} - static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct request_sock *req; @@ -398,7 +367,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (dccp_v6_send_response(sk, req)) goto drop_and_free; - inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); return 0; drop_and_free: @@ -641,24 +610,6 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) * NOTE: the check for the packet types is done in * dccp_rcv_state_process */ - if (sk->sk_state == DCCP_LISTEN) { - struct sock *nsk = dccp_v6_hnd_req(sk, skb); - - if (nsk == NULL) - goto discard; - /* - * Queue it on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket.. - */ - if (nsk != sk) { - if (dccp_child_process(sk, nsk, skb)) - goto reset; - if (opt_skb != NULL) - __kfree_skb(opt_skb); - return 0; - } - } if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) goto reset; @@ -732,6 +683,27 @@ static int dccp_v6_rcv(struct sk_buff *skb) goto no_dccp_socket; } + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (sk->sk_state == DCCP_LISTEN) + nsk = dccp_check_req(sk, skb, req); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (dccp_child_process(sk, nsk, skb)) { + dccp_v6_ctl_send_reset(sk, skb); + goto discard_it; + } else { + return 0; + } + } /* * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage * o if MinCsCov = 0, only packets with CsCov = 0 are accepted diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index e62f04775c93..80904df02187 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -476,65 +476,12 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); -static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, - const u32 rnd, const u32 synq_hsize) -{ - return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); -} - #if IS_ENABLED(CONFIG_IPV6) #define AF_INET_FAMILY(fam) ((fam) == AF_INET) #else #define AF_INET_FAMILY(fam) true #endif -/* Note: this is temporary : - * req sock will no longer be in listener hash table -*/ -struct request_sock *inet_csk_search_req(struct sock *sk, - const __be16 rport, - const __be32 raddr, - const __be32 laddr) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req; - u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries); - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->ir_rmt_port == rport && - ireq->ir_rmt_addr == raddr && - ireq->ir_loc_addr == laddr && - AF_INET_FAMILY(req->rsk_ops->family)) { - atomic_inc(&req->rsk_refcnt); - WARN_ON(req->sk); - break; - } - } - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return req; -} -EXPORT_SYMBOL_GPL(inet_csk_search_req); - -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - unsigned long timeout) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr, - inet_rsk(req)->ir_rmt_port, - lopt->hash_rnd, lopt->nr_table_entries); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); - inet_csk_reqsk_queue_added(sk); -} -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); - /* Only thing we need from tcp.h */ extern int sysctl_tcp_synack_retries; @@ -571,26 +518,20 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); -/* return true if req was found in the syn_table[] */ +/* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock_queue *queue, struct request_sock *req) { - struct listen_sock *lopt = queue->listen_opt; - struct request_sock **prev; - bool found = false; + struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo; + spinlock_t *lock; + bool found; - spin_lock(&queue->syn_wait_lock); + lock = inet_ehash_lockp(hashinfo, req->rsk_hash); - for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; - prev = &(*prev)->dl_next) { - if (*prev == req) { - *prev = req->dl_next; - found = true; - break; - } - } + spin_lock(lock); + found = __sk_nulls_del_node_init_rcu(req_to_sk(req)); + spin_unlock(lock); - spin_unlock(&queue->syn_wait_lock); if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) reqsk_put(req); return found; @@ -616,10 +557,8 @@ static void reqsk_timer_handler(unsigned long data) int max_retries, thresh; u8 defer_accept; - if (sk_listener->sk_state != TCP_LISTEN || !lopt) { - reqsk_put(req); - return; - } + if (sk_listener->sk_state != TCP_LISTEN || !lopt) + goto drop; max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; thresh = max_retries; @@ -669,36 +608,36 @@ static void reqsk_timer_handler(unsigned long data) mod_timer_pinned(&req->rsk_timer, jiffies + timeo); return; } +drop: inet_csk_reqsk_queue_drop(sk_listener, req); reqsk_put(req); } -void reqsk_queue_hash_req(struct request_sock_queue *queue, - u32 hash, struct request_sock *req, - unsigned long timeout) +static void reqsk_queue_hash_req(struct request_sock *req, + unsigned long timeout) { - struct listen_sock *lopt = queue->listen_opt; - req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); mod_timer_pinned(&req->rsk_timer, jiffies + timeout); - req->rsk_hash = hash; + inet_ehash_insert(req_to_sk(req), NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); atomic_set(&req->rsk_refcnt, 2); +} - spin_lock(&queue->syn_wait_lock); - req->dl_next = lopt->syn_table[hash]; - lopt->syn_table[hash] = req; - spin_unlock(&queue->syn_wait_lock); +void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + unsigned long timeout) +{ + reqsk_queue_hash_req(req, timeout); + inet_csk_reqsk_queue_added(sk); } -EXPORT_SYMBOL(reqsk_queue_hash_req); +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); /** * inet_csk_clone_lock - clone an inet socket, and lock its clone diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 0ac1d68dc8a6..ab9f8a66615d 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -730,91 +730,21 @@ static void twsk_build_assert(void) #endif } -static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - const struct nlattr *bc) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct inet_diag_entry entry; - int j, s_j, reqnum, s_reqnum; - struct listen_sock *lopt; - int err = 0; - - s_j = cb->args[3]; - s_reqnum = cb->args[4]; - - if (s_j > 0) - s_j--; - - entry.family = sk->sk_family; - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - - lopt = icsk->icsk_accept_queue.listen_opt; - if (!lopt || !reqsk_queue_len(&icsk->icsk_accept_queue)) - goto out; - - if (bc) { - entry.sport = inet->inet_num; - entry.userlocks = sk->sk_userlocks; - } - - for (j = s_j; j < lopt->nr_table_entries; j++) { - struct request_sock *req, *head = lopt->syn_table[j]; - - reqnum = 0; - for (req = head; req; reqnum++, req = req->dl_next) { - struct inet_request_sock *ireq = inet_rsk(req); - - if (reqnum < s_reqnum) - continue; - if (r->id.idiag_dport != ireq->ir_rmt_port && - r->id.idiag_dport) - continue; - - if (bc) { - /* Note: entry.sport and entry.userlocks are already set */ - entry_fill_addrs(&entry, req_to_sk(req)); - entry.dport = ntohs(ireq->ir_rmt_port); - - if (!inet_diag_bc_run(bc, &entry)) - continue; - } - - err = inet_req_diag_fill(req_to_sk(req), skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->nlh); - if (err < 0) { - cb->args[3] = j + 1; - cb->args[4] = reqnum; - goto out; - } - } - - s_reqnum = 0; - } - -out: - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return err; -} - void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { struct net *net = sock_net(skb->sk); int i, num, s_i, s_num; + u32 idiag_states = r->idiag_states; + if (idiag_states & TCPF_SYN_RECV) + idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; s_num = num = cb->args[2]; if (cb->args[0] == 0) { - if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & TCPF_LISTEN)) goto skip_listen_ht; for (i = s_i; i < INET_LHTABLE_SIZE; i++) { @@ -844,21 +774,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, r->id.idiag_sport) goto next_listen; - if (!(r->idiag_states & TCPF_LISTEN) || - r->id.idiag_dport || + if (r->id.idiag_dport || cb->args[3] > 0) - goto syn_recv; - - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { - spin_unlock_bh(&ilb->lock); - goto done; - } - -syn_recv: - if (!(r->idiag_states & TCPF_SYN_RECV)) goto next_listen; - if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) { + if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { spin_unlock_bh(&ilb->lock); goto done; } @@ -879,7 +799,7 @@ skip_listen_ht: s_i = num = s_num = 0; } - if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & ~TCPF_LISTEN)) goto out; for (i = s_i; i <= hashinfo->ehash_mask; i++) { @@ -906,7 +826,7 @@ skip_listen_ht: goto next_normal; state = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_substate : sk->sk_state; - if (!(r->idiag_states & (1 << state))) + if (!(idiag_states & (1 << state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 56742e995dd3..bed8886a4b6c 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -398,14 +398,18 @@ static u32 inet_sk_port_offset(const struct sock *sk) inet->inet_dport); } -void __inet_hash_nolisten(struct sock *sk, struct sock *osk) +/* insert a socket into ehash, and eventually remove another one + * (The another one can be a SYN_RECV or TIMEWAIT + */ +int inet_ehash_insert(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; struct inet_ehash_bucket *head; spinlock_t *lock; + int ret = 0; - WARN_ON(!sk_unhashed(sk)); + WARN_ON_ONCE(!sk_unhashed(sk)); sk->sk_hash = sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); @@ -419,6 +423,12 @@ void __inet_hash_nolisten(struct sock *sk, struct sock *osk) sk_nulls_del_node_init_rcu(osk); } spin_unlock(lock); + return ret; +} + +void __inet_hash_nolisten(struct sock *sk, struct sock *osk) +{ + inet_ehash_insert(sk, osk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } EXPORT_SYMBOL_GPL(__inet_hash_nolisten); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 6b97b5f6457c..729ceb5f63c6 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -284,6 +284,10 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, } EXPORT_SYMBOL(cookie_ecn_ok); +/* On input, sk is a listener. + * Output is listener if incoming packet would not create a child + * NULL if memory could not be allocated. + */ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8b0ce73c2049..a56912772354 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6241,7 +6241,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_free; tcp_rsk(req)->tfo_listener = false; - af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } tcp_reqsk_record_syn(sk, req, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a33101616215..bfe9d39ee87d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1224,7 +1224,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .route_req = tcp_v4_route_req, .init_seq = tcp_v4_init_sequence, .send_synack = tcp_v4_send_synack, - .queue_hash_add = inet_csk_reqsk_queue_hash_add, }; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -1343,34 +1342,11 @@ put_and_exit: } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); -static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) +static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) { +#ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); - const struct iphdr *iph = ip_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); - if (req) { - nsk = tcp_check_req(sk, skb, req, false); - if (!nsk || nsk == sk) - reqsk_put(req); - return nsk; - } - - nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, - th->source, iph->daddr, th->dest, inet_iif(skb)); - - if (nsk) { - if (nsk->sk_state != TCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } -#ifdef CONFIG_SYN_COOKIES if (!th->syn) sk = cookie_v4_check(sk, skb); #endif @@ -1409,10 +1385,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) goto csum_err; if (sk->sk_state == TCP_LISTEN) { - struct sock *nsk = tcp_v4_hnd_req(sk, skb); + struct sock *nsk = tcp_v4_cookie_check(sk, skb); + if (!nsk) goto discard; - if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); sk_mark_napi_id(nsk, skb); @@ -1603,6 +1579,29 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (sk->sk_state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (tcp_v4_inbound_md5_hash(sk, skb)) + goto discard_and_relse; + if (sk->sk_state == TCP_LISTEN) + nsk = tcp_check_req(sk, skb, req, false); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (tcp_child_process(sk, nsk, skb)) { + tcp_v4_send_reset(nsk, skb); + goto discard_it; + } else { + return 0; + } + } if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; @@ -1830,35 +1829,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) ++st->num; ++st->offset; - if (st->state == TCP_SEQ_STATE_OPENREQ) { - struct request_sock *req = cur; - - icsk = inet_csk(st->syn_wait_sk); - req = req->dl_next; - while (1) { - while (req) { - if (req->rsk_ops->family == st->family) { - cur = req; - goto out; - } - req = req->dl_next; - } - if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) - break; -get_req: - req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; - } - sk = sk_nulls_next(st->syn_wait_sk); - st->state = TCP_SEQ_STATE_LISTENING; - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - } else { - icsk = inet_csk(sk); - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - if (reqsk_queue_len(&icsk->icsk_accept_queue)) - goto start_req; - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - sk = sk_nulls_next(sk); - } + sk = sk_nulls_next(sk); get_sk: sk_nulls_for_each_from(sk, node) { if (!net_eq(sock_net(sk), net)) @@ -1868,15 +1839,6 @@ get_sk: goto out; } icsk = inet_csk(sk); - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - if (reqsk_queue_len(&icsk->icsk_accept_queue)) { -start_req: - st->syn_wait_sk = sk; - st->state = TCP_SEQ_STATE_OPENREQ; - st->sbucket = 0; - goto get_req; - } - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } spin_unlock_bh(&ilb->lock); st->offset = 0; @@ -2008,7 +1970,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq) void *rc = NULL; switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: case TCP_SEQ_STATE_LISTENING: if (st->bucket >= INET_LHTABLE_SIZE) break; @@ -2067,7 +2028,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) } switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: case TCP_SEQ_STATE_LISTENING: rc = listening_get_next(seq, v); if (!rc) { @@ -2092,11 +2052,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) struct tcp_iter_state *st = seq->private; switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: - if (v) { - struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); @@ -2269,18 +2224,12 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) } st = seq->private; - switch (st->state) { - case TCP_SEQ_STATE_LISTENING: - case TCP_SEQ_STATE_ESTABLISHED: - if (sk->sk_state == TCP_TIME_WAIT) - get_timewait4_sock(v, seq, st->num); - else - get_tcp4_sock(v, seq, st->num); - break; - case TCP_SEQ_STATE_OPENREQ: + if (sk->sk_state == TCP_TIME_WAIT) + get_timewait4_sock(v, seq, st->num); + else if (sk->sk_state == TCP_NEW_SYN_RECV) get_openreq4(v, seq, st->num); - break; - } + else + get_tcp4_sock(v, seq, st->num); out: seq_pad(seq, '\n'); return 0; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index ea915aa5e4e2..5d1c7cee2cb2 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -94,73 +94,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, } EXPORT_SYMBOL(inet6_csk_route_req); -/* - * request_sock (formerly open request) hash tables. - */ -static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, - const u32 rnd, const u32 synq_hsize) -{ - u32 c; - - c = jhash_3words((__force u32)raddr->s6_addr32[0], - (__force u32)raddr->s6_addr32[1], - (__force u32)raddr->s6_addr32[2], - rnd); - - c = jhash_2words((__force u32)raddr->s6_addr32[3], - (__force u32)rport, - c); - - return c & (synq_hsize - 1); -} - -struct request_sock *inet6_csk_search_req(struct sock *sk, - const __be16 rport, - const struct in6_addr *raddr, - const struct in6_addr *laddr, - const int iif) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req; - u32 hash = inet6_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries); - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->ir_rmt_port == rport && - req->rsk_ops->family == AF_INET6 && - ipv6_addr_equal(&ireq->ir_v6_rmt_addr, raddr) && - ipv6_addr_equal(&ireq->ir_v6_loc_addr, laddr) && - (!ireq->ir_iif || ireq->ir_iif == iif)) { - atomic_inc(&req->rsk_refcnt); - WARN_ON(req->sk != NULL); - break; - } - } - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return req; -} -EXPORT_SYMBOL_GPL(inet6_csk_search_req); - -void inet6_csk_reqsk_queue_hash_add(struct sock *sk, - struct request_sock *req, - const unsigned long timeout) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr, - inet_rsk(req)->ir_rmt_port, - lopt->hash_rnd, lopt->nr_table_entries); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); - inet_csk_reqsk_queue_added(sk); -} -EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); - void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index cadb44a2d34e..a215614cfb2b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -727,7 +727,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .route_req = tcp_v6_route_req, .init_seq = tcp_v6_init_sequence, .send_synack = tcp_v6_send_synack, - .queue_hash_add = inet6_csk_reqsk_queue_hash_add, }; static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, @@ -938,37 +937,11 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, } -static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) +static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) { +#ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - /* Find possible connection requests. */ - req = inet6_csk_search_req(sk, th->source, - &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb)); - if (req) { - nsk = tcp_check_req(sk, skb, req, false); - if (!nsk || nsk == sk) - reqsk_put(req); - return nsk; - } - nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, - &ipv6_hdr(skb)->saddr, th->source, - &ipv6_hdr(skb)->daddr, ntohs(th->dest), - tcp_v6_iif(skb)); - - if (nsk) { - if (nsk->sk_state != TCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } -#ifdef CONFIG_SYN_COOKIES if (!th->syn) sk = cookie_v6_check(sk, skb); #endif @@ -1258,15 +1231,11 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) goto csum_err; if (sk->sk_state == TCP_LISTEN) { - struct sock *nsk = tcp_v6_hnd_req(sk, skb); + struct sock *nsk = tcp_v6_cookie_check(sk, skb); + if (!nsk) goto discard; - /* - * Queue it on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket.. - */ if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); sk_mark_napi_id(nsk, skb); @@ -1402,6 +1371,33 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (sk->sk_state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + tcp_v6_fill_cb(skb, hdr, th); + if (tcp_v6_inbound_md5_hash(sk, skb)) { + reqsk_put(req); + goto discard_it; + } + if (sk->sk_state == TCP_LISTEN) + nsk = tcp_check_req(sk, skb, req, false); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + tcp_v6_restore_cb(skb); + } else if (tcp_child_process(sk, nsk, skb)) { + tcp_v6_send_reset(nsk, skb); + goto discard_it; + } else { + return 0; + } + } if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; @@ -1765,18 +1761,12 @@ static int tcp6_seq_show(struct seq_file *seq, void *v) } st = seq->private; - switch (st->state) { - case TCP_SEQ_STATE_LISTENING: - case TCP_SEQ_STATE_ESTABLISHED: - if (sk->sk_state == TCP_TIME_WAIT) - get_timewait6_sock(seq, v, st->num); - else - get_tcp6_sock(seq, v, st->num); - break; - case TCP_SEQ_STATE_OPENREQ: + if (sk->sk_state == TCP_TIME_WAIT) + get_timewait6_sock(seq, v, st->num); + else if (sk->sk_state == TCP_NEW_SYN_RECV) get_openreq6(seq, v, st->num); - break; - } + else + get_tcp6_sock(seq, v, st->num); out: return 0; } -- cgit From 81b496b31a4331415b6a644b485a329ec0b45155 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:33 -0700 Subject: tcp/dccp: shrink struct listen_sock We no longer use hash_rnd, nr_table_entries and syn_table[] For a listener with a backlog of 10 millions sockets, this saves 80 MBytes of vmalloced memory. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/request_sock.h | 3 --- net/core/request_sock.c | 14 +++----------- 2 files changed, 3 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index e1850923c4f5..353cb61bb399 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -125,9 +125,6 @@ extern int sysctl_max_syn_backlog; */ struct listen_sock { u32 max_qlen_log; - u32 hash_rnd; - u32 nr_table_entries; - struct request_sock *syn_table[0]; }; /* diff --git a/net/core/request_sock.c b/net/core/request_sock.c index a4b305d8ca2b..124f61c5bfef 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -46,18 +46,11 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); nr_table_entries = max_t(u32, nr_table_entries, 8); nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); - lopt_size += nr_table_entries * sizeof(struct request_sock *); - if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) - lopt = kzalloc(lopt_size, GFP_KERNEL | - __GFP_NOWARN | - __GFP_NORETRY); - if (!lopt) - lopt = vzalloc(lopt_size); + lopt = kzalloc(lopt_size, GFP_KERNEL); if (!lopt) return -ENOMEM; - get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); spin_lock_init(&queue->rskq_lock); spin_lock_init(&queue->syn_wait_lock); @@ -68,7 +61,6 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, queue->fastopenq.max_qlen = 0; queue->rskq_accept_head = NULL; - lopt->nr_table_entries = nr_table_entries; lopt->max_qlen_log = ilog2(nr_table_entries); spin_lock_bh(&queue->syn_wait_lock); @@ -81,7 +73,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, void __reqsk_queue_destroy(struct request_sock_queue *queue) { /* This is an error recovery path only, no locking needed */ - kvfree(queue->listen_opt); + kfree(queue->listen_opt); } static inline struct listen_sock *reqsk_queue_yank_listen_sk( @@ -102,7 +94,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); /* cleaning is done by req timers */ - kvfree(lopt); + kfree(lopt); } /* -- cgit From ca6fb06518836ef9b65dc0aac02ff97704d52a05 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:35 -0700 Subject: tcp: attach SYNACK messages to request sockets instead of listener If a listen backlog is very big (to avoid syncookies), then the listener sk->sk_wmem_alloc is the main source of false sharing, as we need to touch it twice per SYNACK re-transmit and TX completion. (One SYN packet takes listener lock once, but up to 6 SYNACK are generated) By attaching the skb to the request socket, we remove this source of contention. Tested: listen(fd, 10485760); // single listener (no SO_REUSEPORT) 16 RX/TX queue NIC Sustain a SYNFLOOD attack of ~320,000 SYN per second, Sending ~1,400,000 SYNACK per second. Perf profiles now show listener spinlock being next bottleneck. 20.29% [kernel] [k] queued_spin_lock_slowpath 10.06% [kernel] [k] __inet_lookup_established 5.12% [kernel] [k] reqsk_timer_handler 3.22% [kernel] [k] get_next_timer_interrupt 3.00% [kernel] [k] tcp_make_synack 2.77% [kernel] [k] ipt_do_table 2.70% [kernel] [k] run_timer_softirq 2.50% [kernel] [k] ip_finish_output 2.04% [kernel] [k] cascade Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 6 ++++-- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/tcp_fastopen.c | 4 ++-- net/ipv4/tcp_input.c | 23 ++++++++++++----------- net/ipv4/tcp_ipv4.c | 5 +++-- net/ipv4/tcp_output.c | 22 +++++++++++++++------- net/ipv6/tcp_ipv6.c | 5 +++-- net/sched/sch_fq.c | 12 +++++++----- 8 files changed, 47 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 225e9561af35..a6be56d5f0e3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -462,7 +462,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int tcp_connect(struct sock *sk); struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct tcp_fastopen_cookie *foc); + struct tcp_fastopen_cookie *foc, + bool attach_req); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); @@ -1715,7 +1716,8 @@ struct tcp_request_sock_ops { __u32 (*init_seq)(const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, - u16 queue_mapping, struct tcp_fastopen_cookie *foc); + u16 queue_mapping, struct tcp_fastopen_cookie *foc, + bool attach_req); }; #ifdef CONFIG_SYN_COOKIES diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 80904df02187..099e0ea9242a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -628,7 +628,7 @@ static void reqsk_queue_hash_req(struct request_sock *req, * are committed to memory and refcnt initialized. */ smp_wmb(); - atomic_set(&req->rsk_refcnt, 2); + atomic_set(&req->rsk_refcnt, 2 + 1); } void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f69f436fcbcc..410ac481fda0 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -161,13 +161,13 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tp->snd_wnd = ntohs(tcp_hdr(skb)->window); /* Activate the retrans timer so that SYNACK can be retransmitted. - * The request socket is not added to the SYN table of the parent + * The request socket is not added to the ehash * because it's been added to the accept queue directly. */ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT, TCP_RTO_MAX); - atomic_set(&req->rsk_refcnt, 1); + atomic_set(&req->rsk_refcnt, 2); /* Add the child socket directly into the accept queue */ inet_csk_reqsk_queue_add(sk, req, child); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a56912772354..27108757c310 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6120,8 +6120,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct request_sock *req; bool want_cookie = false; struct flowi fl; - int err; - /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is @@ -6230,21 +6228,24 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_openreq_init_rwin(req, sk, dst); - if (!want_cookie) + if (!want_cookie) { fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); - err = af_ops->send_synack(fastopen_sk ?: sk, dst, &fl, req, - skb_get_queue_mapping(skb), &foc); + tcp_reqsk_record_syn(sk, req, skb); + } if (fastopen_sk) { + af_ops->send_synack(fastopen_sk, dst, &fl, req, + skb_get_queue_mapping(skb), &foc, false); sock_put(fastopen_sk); } else { - if (err || want_cookie) - goto drop_and_free; - tcp_rsk(req)->tfo_listener = false; - inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + if (!want_cookie) + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + af_ops->send_synack(sk, dst, &fl, req, + skb_get_queue_mapping(skb), &foc, !want_cookie); + if (want_cookie) + goto drop_and_free; } - tcp_reqsk_record_syn(sk, req, skb); - + reqsk_put(req); return 0; drop_and_release: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bfe9d39ee87d..ac2ea73e9aaf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -822,7 +822,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, u16 queue_mapping, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -833,7 +834,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, foc); + skb = tcp_make_synack(sk, dst, req, foc, attach_req); if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 09bb082ca1a7..55ed3266b05f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2947,7 +2947,8 @@ int tcp_send_synack(struct sock *sk) */ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); @@ -2959,11 +2960,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, u16 user_mss; int mss; - /* sk is a const pointer, because we want to express multiple cpus - * might call us concurrently. - * sock_wmalloc() will change sk->sk_wmem_alloc in an atomic way. - */ - skb = sock_wmalloc((struct sock *)sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -2971,6 +2968,17 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); + if (attach_req) { + skb->destructor = sock_edemux; + sock_hold(req_to_sk(req)); + skb->sk = req_to_sk(req); + } else { + /* sk is a const pointer, because we want to express multiple + * cpu might call us concurrently. + * sk->sk_wmem_alloc in an atomic, we can promote to rw. + */ + skb_set_owner_w(skb, (struct sock *)sk); + } skb_dst_set(skb, dst); mss = dst_metric_advmss(dst); @@ -3510,7 +3518,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) int res; tcp_rsk(req)->txhash = net_tx_rndhash(); - res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); + res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL, true); if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a215614cfb2b..3d18571811c5 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -438,7 +438,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, u16 queue_mapping, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); @@ -451,7 +452,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, IPPROTO_TCP)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, foc); + skb = tcp_make_synack(sk, dst, req, foc, attach_req); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index f377702d4b91..3386cce4751e 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -224,13 +224,15 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) return &q->internal; - /* SYNACK messages are attached to a listener socket. - * 1) They are not part of a 'flow' yet - * 2) We do not want to rate limit them (eg SYNFLOOD attack), + /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket + * 1) request sockets are not full blown, + * they do not contain sk_pacing_rate + * 2) They are not part of a 'flow' yet + * 3) We do not want to rate limit them (eg SYNFLOOD attack), * especially if the listener set SO_MAX_PACING_RATE - * 3) We pretend they are orphaned + * 4) We pretend they are orphaned */ - if (!sk || sk->sk_state == TCP_LISTEN) { + if (!sk || sk->sk_state == TCP_NEW_SYN_RECV) { unsigned long hash = skb_get_hash(skb) & q->orphan_mask; /* By forcing low order bit to 1, we make sure to not -- cgit From 10cbc8f179177c1a6d5f56a46ebddc8f602ce5ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:36 -0700 Subject: tcp/dccp: remove struct listen_sock It is enough to check listener sk_state, no need for an extra condition. max_qlen_log can be moved into struct request_sock_queue We can remove syn_wait_lock and the alignment it enforced. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/request_sock.h | 26 ++++------------------- net/core/request_sock.c | 47 +++-------------------------------------- net/ipv4/inet_connection_sock.c | 14 ++++-------- 3 files changed, 11 insertions(+), 76 deletions(-) (limited to 'net') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 353cb61bb399..a66ab1345373 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -119,14 +119,6 @@ static inline void reqsk_put(struct request_sock *req) extern int sysctl_max_syn_backlog; -/** struct listen_sock - listen state - * - * @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs - */ -struct listen_sock { - u32 max_qlen_log; -}; - /* * For a TCP Fast Open listener - * lock - protects the access to all the reqsk, which is co-owned by @@ -160,36 +152,26 @@ struct fastopen_queue { * @rskq_accept_head - FIFO head of established children * @rskq_accept_tail - FIFO tail of established children * @rskq_defer_accept - User waits for some data after accept() - * @syn_wait_lock - serializer - * - * %syn_wait_lock is necessary only to avoid proc interface having to grab the main - * lock sock while browsing the listening hash (otherwise it's deadlock prone). * */ struct request_sock_queue { spinlock_t rskq_lock; u8 rskq_defer_accept; + u8 max_qlen_log; u32 synflood_warned; - atomic_t qlen; atomic_t young; struct request_sock *rskq_accept_head; struct request_sock *rskq_accept_tail; - struct listen_sock *listen_opt; struct fastopen_queue fastopenq; /* Check max_qlen != 0 to determine * if TFO is enabled. */ - - /* temporary alignment, our goal is to get rid of this lock */ - spinlock_t syn_wait_lock ____cacheline_aligned_in_smp; }; -int reqsk_queue_alloc(struct request_sock_queue *queue, - unsigned int nr_table_entries); +void reqsk_queue_alloc(struct request_sock_queue *queue, + unsigned int nr_table_entries); -void __reqsk_queue_destroy(struct request_sock_queue *queue); -void reqsk_queue_destroy(struct request_sock_queue *queue); void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset); @@ -260,7 +242,7 @@ static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) static inline int reqsk_queue_is_full(const struct request_sock_queue *queue) { - return reqsk_queue_len(queue) >> queue->listen_opt->max_qlen_log; + return reqsk_queue_len(queue) >> queue->max_qlen_log; } #endif /* _REQUEST_SOCK_H */ diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 124f61c5bfef..ecf74189bd3f 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -37,22 +37,14 @@ int sysctl_max_syn_backlog = 256; EXPORT_SYMBOL(sysctl_max_syn_backlog); -int reqsk_queue_alloc(struct request_sock_queue *queue, - unsigned int nr_table_entries) +void reqsk_queue_alloc(struct request_sock_queue *queue, + unsigned int nr_table_entries) { - size_t lopt_size = sizeof(struct listen_sock); - struct listen_sock *lopt = NULL; - nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); nr_table_entries = max_t(u32, nr_table_entries, 8); nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); - lopt = kzalloc(lopt_size, GFP_KERNEL); - if (!lopt) - return -ENOMEM; - spin_lock_init(&queue->rskq_lock); - spin_lock_init(&queue->syn_wait_lock); spin_lock_init(&queue->fastopenq.lock); queue->fastopenq.rskq_rst_head = NULL; @@ -61,40 +53,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, queue->fastopenq.max_qlen = 0; queue->rskq_accept_head = NULL; - lopt->max_qlen_log = ilog2(nr_table_entries); - - spin_lock_bh(&queue->syn_wait_lock); - queue->listen_opt = lopt; - spin_unlock_bh(&queue->syn_wait_lock); - - return 0; -} - -void __reqsk_queue_destroy(struct request_sock_queue *queue) -{ - /* This is an error recovery path only, no locking needed */ - kfree(queue->listen_opt); -} - -static inline struct listen_sock *reqsk_queue_yank_listen_sk( - struct request_sock_queue *queue) -{ - struct listen_sock *lopt; - - spin_lock_bh(&queue->syn_wait_lock); - lopt = queue->listen_opt; - queue->listen_opt = NULL; - spin_unlock_bh(&queue->syn_wait_lock); - - return lopt; -} - -void reqsk_queue_destroy(struct request_sock_queue *queue) -{ - struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); - - /* cleaning is done by req timers */ - kfree(lopt); + queue->max_qlen_log = ilog2(nr_table_entries); } /* diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 099e0ea9242a..775483283fa7 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -552,12 +552,11 @@ static void reqsk_timer_handler(unsigned long data) struct sock *sk_listener = req->rsk_listener; struct inet_connection_sock *icsk = inet_csk(sk_listener); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - struct listen_sock *lopt = queue->listen_opt; int qlen, expire = 0, resend = 0; int max_retries, thresh; u8 defer_accept; - if (sk_listener->sk_state != TCP_LISTEN || !lopt) + if (sk_listener->sk_state != TCP_LISTEN) goto drop; max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; @@ -580,7 +579,7 @@ static void reqsk_timer_handler(unsigned long data) * ones are about to clog our table. */ qlen = reqsk_queue_len(queue); - if (qlen >> (lopt->max_qlen_log - 1)) { + if (qlen >> (queue->max_qlen_log - 1)) { int young = reqsk_queue_len_young(queue) << 1; while (thresh > 2) { @@ -730,12 +729,10 @@ EXPORT_SYMBOL(inet_csk_prepare_forced_close); int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) { - struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); + struct inet_sock *inet = inet_sk(sk); - if (rc != 0) - return rc; + reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); sk->sk_max_ack_backlog = 0; sk->sk_ack_backlog = 0; @@ -757,7 +754,6 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) } sk->sk_state = TCP_CLOSE; - __reqsk_queue_destroy(&icsk->icsk_accept_queue); return -EADDRINUSE; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); @@ -780,8 +776,6 @@ void inet_csk_listen_stop(struct sock *sk) * To be honest, we are not able to make either * of the variants now. --ANK */ - reqsk_queue_destroy(queue); - while ((req = reqsk_queue_remove(queue, sk)) != NULL) { struct sock *child = req->sk; -- cgit From ef547f2ac16bd9d77a780a0e7c70857e69e8f23f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:37 -0700 Subject: tcp: remove max_qlen_log This control variable was set at first listen(fd, backlog) call, but not updated if application tried to increase or decrease backlog. It made sense at the time listener had a non resizeable hash table. Also rounding to powers of two was not very friendly. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 2 +- include/net/request_sock.h | 10 ++-------- net/core/request_sock.c | 8 +------- net/ipv4/inet_connection_sock.c | 4 ++-- 4 files changed, 6 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 730aa034cd3d..3208a65d1c28 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -295,7 +295,7 @@ static inline int inet_csk_reqsk_queue_young(const struct sock *sk) static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) { - return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue); + return inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog; } void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); diff --git a/include/net/request_sock.h b/include/net/request_sock.h index a66ab1345373..bae6936d75c4 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -157,7 +157,7 @@ struct fastopen_queue { struct request_sock_queue { spinlock_t rskq_lock; u8 rskq_defer_accept; - u8 max_qlen_log; + u32 synflood_warned; atomic_t qlen; atomic_t young; @@ -169,8 +169,7 @@ struct request_sock_queue { */ }; -void reqsk_queue_alloc(struct request_sock_queue *queue, - unsigned int nr_table_entries); +void reqsk_queue_alloc(struct request_sock_queue *queue); void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset); @@ -240,9 +239,4 @@ static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) return atomic_read(&queue->young); } -static inline int reqsk_queue_is_full(const struct request_sock_queue *queue) -{ - return reqsk_queue_len(queue) >> queue->max_qlen_log; -} - #endif /* _REQUEST_SOCK_H */ diff --git a/net/core/request_sock.c b/net/core/request_sock.c index ecf74189bd3f..15c853806518 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -37,13 +37,8 @@ int sysctl_max_syn_backlog = 256; EXPORT_SYMBOL(sysctl_max_syn_backlog); -void reqsk_queue_alloc(struct request_sock_queue *queue, - unsigned int nr_table_entries) +void reqsk_queue_alloc(struct request_sock_queue *queue) { - nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); - nr_table_entries = max_t(u32, nr_table_entries, 8); - nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); - spin_lock_init(&queue->rskq_lock); spin_lock_init(&queue->fastopenq.lock); @@ -53,7 +48,6 @@ void reqsk_queue_alloc(struct request_sock_queue *queue, queue->fastopenq.max_qlen = 0; queue->rskq_accept_head = NULL; - queue->max_qlen_log = ilog2(nr_table_entries); } /* diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 775483283fa7..5f6e31a4aeae 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -579,7 +579,7 @@ static void reqsk_timer_handler(unsigned long data) * ones are about to clog our table. */ qlen = reqsk_queue_len(queue); - if (qlen >> (queue->max_qlen_log - 1)) { + if ((qlen << 1) > sk_listener->sk_max_ack_backlog) { int young = reqsk_queue_len_young(queue) << 1; while (thresh > 2) { @@ -732,7 +732,7 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); - reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); + reqsk_queue_alloc(&icsk->icsk_accept_queue); sk->sk_max_ack_backlog = 0; sk->sk_ack_backlog = 0; -- cgit From 92d6f176fdcce1a9c22a59d754c924168fdf2ce4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:38 -0700 Subject: tcp/dccp: add a reschedule point in inet_csk_listen_stop() If a listener with thousands of children in accept queue is dismantled, it can take a while to close all of them. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 5f6e31a4aeae..89eedfbd4ad5 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -809,6 +809,7 @@ void inet_csk_listen_stop(struct sock *sk) sock_put(child); reqsk_put(req); + cond_resched(); } if (queue->fastopenq.rskq_rst_head) { /* Free all the reqs queued in rskq_rst_head. */ -- cgit From e994b2f0fb9229aeff5eea9541320bd7b2ca8714 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:39 -0700 Subject: tcp: do not lock listener to process SYN packets Everything should now be ready to finally allow SYN packets processing without holding listener lock. Tested: 3.5 Mpps SYNFLOOD. Plenty of cpu cycles available. Next bottleneck is the refcount taken on listener, that could be avoided if we remove SLAB_DESTROY_BY_RCU strict semantic for listeners, and use regular RCU. 13.18% [kernel] [k] __inet_lookup_listener 9.61% [kernel] [k] tcp_conn_request 8.16% [kernel] [k] sha_transform 5.30% [kernel] [k] inet_reqsk_alloc 4.22% [kernel] [k] sock_put 3.74% [kernel] [k] tcp_make_synack 2.88% [kernel] [k] ipt_do_table 2.56% [kernel] [k] memcpy_erms 2.53% [kernel] [k] sock_wfree 2.40% [kernel] [k] tcp_v4_rcv 2.08% [kernel] [k] fib_table_lookup 1.84% [kernel] [k] tcp_openreq_init_rwin Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 11 +++++++++-- net/ipv6/tcp_ipv6.c | 11 +++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ac2ea73e9aaf..34310748a365 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1355,7 +1355,7 @@ static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) } /* The socket must have it's spinlock held when we get - * here. + * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. @@ -1619,9 +1619,15 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; - sk_incoming_cpu_update(sk); skb->dev = NULL; + if (sk->sk_state == TCP_LISTEN) { + ret = tcp_v4_do_rcv(sk, skb); + goto put_and_return; + } + + sk_incoming_cpu_update(sk); + bh_lock_sock_nested(sk); tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; @@ -1636,6 +1642,7 @@ process: } bh_unlock_sock(sk); +put_and_return: sock_put(sk); return ret; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3d18571811c5..33334f0c217d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1161,7 +1161,7 @@ out: } /* The socket must have it's spinlock held when we get - * here. + * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. @@ -1415,9 +1415,15 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; - sk_incoming_cpu_update(sk); skb->dev = NULL; + if (sk->sk_state == TCP_LISTEN) { + ret = tcp_v6_do_rcv(sk, skb); + goto put_and_return; + } + + sk_incoming_cpu_update(sk); + bh_lock_sock_nested(sk); tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; @@ -1432,6 +1438,7 @@ process: } bh_unlock_sock(sk); +put_and_return: sock_put(sk); return ret ? -1 : 0; -- cgit