Diffstat (limited to 'net/ipv4/tcp_ipv4.c')
-rw-r--r--  net/ipv4/tcp_ipv4.c  166
1 file changed, 111 insertions, 55 deletions
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0c83780dc9bf..6376ad915765 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -199,16 +199,18 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 /* This will initiate an outgoing connection. */
 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
+	struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+	struct inet_timewait_death_row *tcp_death_row;
+	__be32 daddr, nexthop, prev_sk_rcv_saddr;
 	struct inet_sock *inet = inet_sk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct ip_options_rcu *inet_opt;
+	struct net *net = sock_net(sk);
 	__be16 orig_sport, orig_dport;
-	__be32 daddr, nexthop;
 	struct flowi4 *fl4;
 	struct rtable *rt;
 	int err;
-	struct ip_options_rcu *inet_opt;
-	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;

 	if (addr_len < sizeof(struct sockaddr_in))
 		return -EINVAL;
@@ -234,7 +236,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
 		if (err == -ENETUNREACH)
-			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
 		return err;
 	}

@@ -246,10 +248,29 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (!inet_opt || !inet_opt->opt.srr)
 		daddr = fl4->daddr;

-	if (!inet->inet_saddr)
+	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+
+	if (!inet->inet_saddr) {
+		if (inet_csk(sk)->icsk_bind2_hash) {
+			prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo,
+								     sk, net, inet->inet_num);
+			prev_sk_rcv_saddr = sk->sk_rcv_saddr;
+		}
 		inet->inet_saddr = fl4->saddr;
+	}
+
 	sk_rcv_saddr_set(sk, inet->inet_saddr);

+	if (prev_addr_hashbucket) {
+		err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
+		if (err) {
+			inet->inet_saddr = 0;
+			sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
+			ip_rt_put(rt);
+			return err;
+		}
+	}
+
 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 		/* Reset inherited state */
 		tp->rx_opt.ts_recent	   = 0;
@@ -298,8 +319,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 						  inet->inet_daddr,
 						  inet->inet_sport,
 						  usin->sin_port));
-		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
-						 inet->inet_saddr,
+		tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
 						 inet->inet_daddr);
 	}

@@ -475,9 +495,9 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
 	int err;
 	struct net *net = dev_net(skb->dev);

-	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
-				       th->dest, iph->saddr, ntohs(th->source),
-				       inet_iif(skb), 0);
+	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
+				       iph->daddr, th->dest, iph->saddr,
+				       ntohs(th->source), inet_iif(skb), 0);
 	if (!sk) {
 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 		return -ENOENT;
@@ -740,8 +760,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 		 * Incoming packet is checked with md5 hash with finding key,
 		 * no RST generated if md5 hash doesn't match.
 		 */
-		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
-					     ip_hdr(skb)->saddr,
+		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
+					     NULL, 0, ip_hdr(skb)->saddr,
 					     th->source, ip_hdr(skb)->daddr,
 					     ntohs(th->source), dif, sdif);
 		/* don't send rst if it can't find key */
@@ -1709,6 +1729,7 @@ EXPORT_SYMBOL(tcp_v4_do_rcv);

 int tcp_v4_early_demux(struct sk_buff *skb)
 {
+	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph;
 	const struct tcphdr *th;
 	struct sock *sk;
@@ -1725,7 +1746,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 	if (th->doff < sizeof(struct tcphdr) / 4)
 		return 0;

-	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
+	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
 				       iph->saddr, th->source,
 				       iph->daddr, ntohs(th->dest),
 				       skb->skb_iif, inet_sdif(skb));
@@ -1951,7 +1972,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	th = (const struct tcphdr *)skb->data;
 	iph = ip_hdr(skb);
 lookup:
-	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
+	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
+			       skb, __tcp_hdrlen(th), th->source,
 			       th->dest, sdif, &refcounted);
 	if (!sk)
 		goto no_tcp_socket;
@@ -2133,9 +2155,9 @@ do_time_wait:
 	}
 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
 	case TCP_TW_SYN: {
-		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
-							&tcp_hashinfo, skb,
-							__tcp_hdrlen(th),
+		struct sock *sk2 = inet_lookup_listener(net,
+							net->ipv4.tcp_death_row.hashinfo,
+							skb, __tcp_hdrlen(th),
 							iph->saddr, th->source,
 							iph->daddr, th->dest,
 							inet_iif(skb),
@@ -2285,15 +2307,16 @@ static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
  */
 static void *listening_get_first(struct seq_file *seq)
 {
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct tcp_iter_state *st = seq->private;

 	st->offset = 0;
-	for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
+	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
 		struct inet_listen_hashbucket *ilb2;
 		struct hlist_nulls_node *node;
 		struct sock *sk;

-		ilb2 = &tcp_hashinfo.lhash2[st->bucket];
+		ilb2 = &hinfo->lhash2[st->bucket];
 		if (hlist_nulls_empty(&ilb2->nulls_head))
 			continue;
@@ -2318,6 +2341,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 	struct tcp_iter_state *st = seq->private;
 	struct inet_listen_hashbucket *ilb2;
 	struct hlist_nulls_node *node;
+	struct inet_hashinfo *hinfo;
 	struct sock *sk = cur;

 	++st->num;
@@ -2329,7 +2353,8 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 			return sk;
 	}

-	ilb2 = &tcp_hashinfo.lhash2[st->bucket];
+	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+	ilb2 = &hinfo->lhash2[st->bucket];
 	spin_unlock(&ilb2->lock);
 	++st->bucket;
 	return listening_get_first(seq);
@@ -2351,9 +2376,10 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
 	return rc;
 }

-static inline bool empty_bucket(const struct tcp_iter_state *st)
+static inline bool empty_bucket(struct inet_hashinfo *hinfo,
+				const struct tcp_iter_state *st)
 {
-	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
+	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
 }

 /*
@@ -2362,20 +2388,21 @@ static inline bool empty_bucket(const struct tcp_iter_state *st)
  */
 static void *established_get_first(struct seq_file *seq)
 {
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct tcp_iter_state *st = seq->private;

 	st->offset = 0;
-	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
+	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
 		struct sock *sk;
 		struct hlist_nulls_node *node;
-		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
+		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);

 		/* Lockless fast path for the common case of empty buckets */
-		if (empty_bucket(st))
+		if (empty_bucket(hinfo, st))
 			continue;

 		spin_lock_bh(lock);
-		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
+		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
 			if (seq_sk_match(seq, sk))
 				return sk;
 		}
@@ -2387,9 +2414,10 @@ static void *established_get_first(struct seq_file *seq)

 static void *established_get_next(struct seq_file *seq, void *cur)
 {
-	struct sock *sk = cur;
-	struct hlist_nulls_node *node;
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct tcp_iter_state *st = seq->private;
+	struct hlist_nulls_node *node;
+	struct sock *sk = cur;

 	++st->num;
 	++st->offset;
@@ -2401,7 +2429,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 			return sk;
 	}

-	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
 	++st->bucket;
 	return established_get_first(seq);
 }
@@ -2439,6 +2467,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)

 static void *tcp_seek_last_pos(struct seq_file *seq)
 {
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct tcp_iter_state *st = seq->private;
 	int bucket = st->bucket;
 	int offset = st->offset;
@@ -2447,7 +2476,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq)

 	switch (st->state) {
 	case TCP_SEQ_STATE_LISTENING:
-		if (st->bucket > tcp_hashinfo.lhash2_mask)
+		if (st->bucket > hinfo->lhash2_mask)
 			break;
 		st->state = TCP_SEQ_STATE_LISTENING;
 		rc = listening_get_first(seq);
@@ -2459,7 +2488,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 		fallthrough;
 	case TCP_SEQ_STATE_ESTABLISHED:
-		if (st->bucket > tcp_hashinfo.ehash_mask)
+		if (st->bucket > hinfo->ehash_mask)
 			break;
 		rc = established_get_first(seq);
 		while (offset-- && rc && bucket == st->bucket)
@@ -2527,16 +2556,17 @@ EXPORT_SYMBOL(tcp_seq_next);

 void tcp_seq_stop(struct seq_file *seq, void *v)
 {
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct tcp_iter_state *st = seq->private;

 	switch (st->state) {
 	case TCP_SEQ_STATE_LISTENING:
 		if (v != SEQ_START_TOKEN)
-			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
+			spin_unlock(&hinfo->lhash2[st->bucket].lock);
 		break;
 	case TCP_SEQ_STATE_ESTABLISHED:
 		if (v)
-			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
 		break;
 	}
 }
@@ -2731,6 +2761,7 @@ static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
 						 struct sock *start_sk)
 {
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct bpf_tcp_iter_state *iter = seq->private;
 	struct tcp_iter_state *st = &iter->state;
 	struct hlist_nulls_node *node;
@@ -2750,7 +2781,7 @@ static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
 			expected++;
 		}
 	}
-	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
+	spin_unlock(&hinfo->lhash2[st->bucket].lock);

 	return expected;
 }
@@ -2758,6 +2789,7 @@ static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
 						   struct sock *start_sk)
 {
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct bpf_tcp_iter_state *iter = seq->private;
 	struct tcp_iter_state *st = &iter->state;
 	struct hlist_nulls_node *node;
@@ -2777,13 +2809,14 @@ static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
 			expected++;
 		}
 	}
-	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));

 	return expected;
 }

 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
 {
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct bpf_tcp_iter_state *iter = seq->private;
 	struct tcp_iter_state *st = &iter->state;
 	unsigned int expected;
@@ -2799,7 +2832,7 @@ static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
 		st->offset = 0;
 		st->bucket++;
 		if (st->state == TCP_SEQ_STATE_LISTENING &&
-		    st->bucket > tcp_hashinfo.lhash2_mask) {
+		    st->bucket > hinfo->lhash2_mask) {
 			st->state = TCP_SEQ_STATE_ESTABLISHED;
 			st->bucket = 0;
 		}
@@ -3064,7 +3097,7 @@ struct proto tcp_prot = {
 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
 	.twsk_prot		= &tcp_timewait_sock_ops,
 	.rsk_prot		= &tcp_request_sock_ops,
-	.h.hashinfo		= &tcp_hashinfo,
+	.h.hashinfo		= NULL,
 	.no_autobind		= true,
 	.diag_destroy		= tcp_abort,
 };
@@ -3072,19 +3105,43 @@ EXPORT_SYMBOL(tcp_prot);

 static void __net_exit tcp_sk_exit(struct net *net)
 {
-	struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
-
 	if (net->ipv4.tcp_congestion_control)
 		bpf_module_put(net->ipv4.tcp_congestion_control,
 			       net->ipv4.tcp_congestion_control->owner);
-
-	if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
-		kfree(tcp_death_row);
 }

-static int __net_init tcp_sk_init(struct net *net)
+static void __net_init tcp_set_hashinfo(struct net *net)
 {
-	int cnt;
+	struct inet_hashinfo *hinfo;
+	unsigned int ehash_entries;
+	struct net *old_net;
+
+	if (net_eq(net, &init_net))
+		goto fallback;
+
+	old_net = current->nsproxy->net_ns;
+	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
+	if (!ehash_entries)
+		goto fallback;
+
+	ehash_entries = roundup_pow_of_two(ehash_entries);
+	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
+	if (!hinfo) {
+		pr_warn("Failed to allocate TCP ehash (entries: %u) "
+			"for a netns, fallback to the global one\n",
+			ehash_entries);
+fallback:
+		hinfo = &tcp_hashinfo;
+		ehash_entries = tcp_hashinfo.ehash_mask + 1;
+	}
+
+	net->ipv4.tcp_death_row.hashinfo = hinfo;
+	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
+	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
+}
+
+static int __net_init tcp_sk_init(struct net *net)
+{

 	net->ipv4.sysctl_tcp_ecn = 2;
 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
@@ -3110,15 +3167,9 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_tw_reuse = 2;
 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

-	net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
-	if (!net->ipv4.tcp_death_row)
-		return -ENOMEM;
-	refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
-	cnt = tcp_hashinfo.ehash_mask + 1;
-	net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
-	net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
+	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
+	tcp_set_hashinfo(net);

-	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
 	net->ipv4.sysctl_tcp_sack = 1;
 	net->ipv4.sysctl_tcp_window_scaling = 1;
 	net->ipv4.sysctl_tcp_timestamps = 1;
@@ -3139,8 +3190,10 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
 	/* Default TSQ limit of 16 TSO segments */
 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
-	/* rfc5961 challenge ack rate limiting */
-	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
+
+	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
+	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
+
 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
@@ -3178,10 +3231,13 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
 {
 	struct net *net;

-	inet_twsk_purge(&tcp_hashinfo, AF_INET);
+	tcp_twsk_purge(net_exit_list, AF_INET);

-	list_for_each_entry(net, net_exit_list, exit_list)
+	list_for_each_entry(net, net_exit_list, exit_list) {
+		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
+		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
 		tcp_fastopen_ctx_destroy(net);
+	}
 }

 static struct pernet_operations __net_initdata tcp_sk_ops = {
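
Usage note (not part of the diff): tcp_set_hashinfo() reads sysctl_tcp_child_ehash_entries from the *creating* process's namespace (current->nsproxy->net_ns), so the knob must be set before the child netns is created. A minimal userspace sketch of that flow, assuming the series' companion sysctl is exposed as /proc/sys/net/ipv4/tcp_child_ehash_entries (that file is introduced elsewhere in the series, not in this diff):

/* Sketch: give a child netns its own TCP ehash.  Needs CAP_SYS_ADMIN.
 * Assumes /proc/sys/net/ipv4/tcp_child_ehash_entries exists.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/net/ipv4/tcp_child_ehash_entries", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* tcp_set_hashinfo() rounds the value up to a power of two;
	 * 0 (the default) keeps the global tcp_hashinfo.
	 */
	if (write(fd, "8192", 4) != 4) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);

	/* Any netns this process creates from now on gets its own
	 * 8192-entry ehash; per tcp_set_hashinfo() above, that netns
	 * starts with max_tw_buckets = 4096 and max_syn_backlog =
	 * max(128, 8192 / 128) = 128.
	 */
	if (unshare(CLONE_NEWNET)) {
		perror("unshare");
		return 1;
	}

	execl("/bin/sh", "sh", (char *)NULL);
	perror("execl");
	return 1;
}

Anything that goes through the same setup path (clone(CLONE_NEWNET), ip netns add) would behave the same way, since copy_net_ns() runs tcp_sk_init() for the child while current still points at the parent namespace.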