Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--	net/ipv4/tcp.c	101
1 file changed, 68 insertions(+), 33 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b7796b4cf0a0..3b75836db19b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -292,7 +292,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
 
 long sysctl_tcp_mem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_tcp_mem);
-atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
+atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;	/* Current allocated memory. */
 EXPORT_SYMBOL(tcp_memory_allocated);
 
 #if IS_ENABLED(CONFIG_SMC)
@@ -303,7 +303,7 @@ EXPORT_SYMBOL(tcp_have_smc);
 /*
  * Current number of TCP sockets.
  */
-struct percpu_counter tcp_sockets_allocated;
+struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
 EXPORT_SYMBOL(tcp_sockets_allocated);
 
 /*
@@ -456,7 +456,6 @@ void tcp_init_sock(struct sock *sk)
 	WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
 
 	sk_sockets_allocated_inc(sk);
-	sk->sk_route_forced_caps = NETIF_F_GSO;
 }
 EXPORT_SYMBOL(tcp_init_sock);
 
@@ -546,10 +545,11 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	if (state != TCP_SYN_SENT &&
 	    (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
 		int target = sock_rcvlowat(sk, 0, INT_MAX);
+		u16 urg_data = READ_ONCE(tp->urg_data);
 
-		if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
-		    !sock_flag(sk, SOCK_URGINLINE) &&
-		    tp->urg_data)
+		if (unlikely(urg_data) &&
+		    READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
+		    !sock_flag(sk, SOCK_URGINLINE))
 			target++;
 
 		if (tcp_stream_is_readable(sk, target))
@@ -574,7 +574,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 		} else
 			mask |= EPOLLOUT | EPOLLWRNORM;
 
-		if (tp->urg_data & TCP_URG_VALID)
+		if (urg_data & TCP_URG_VALID)
 			mask |= EPOLLPRI;
 	} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
 		/* Active TCP fastopen socket with defer_connect
@@ -608,7 +608,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 		unlock_sock_fast(sk, slow);
 		break;
 	case SIOCATMARK:
-		answ = tp->urg_data &&
+		answ = READ_ONCE(tp->urg_data) &&
 		       READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
 		break;
 	case SIOCOUTQ:
@@ -1466,7 +1466,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
 		char c = tp->urg_data;
 
 		if (!(flags & MSG_PEEK))
-			tp->urg_data = TCP_URG_READ;
+			WRITE_ONCE(tp->urg_data, TCP_URG_READ);
 
 		/* Read urgent data. */
 		msg->msg_flags |= MSG_OOB;
@@ -1580,6 +1580,36 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
 		tcp_send_ack(sk);
 }
 
+void __sk_defer_free_flush(struct sock *sk)
+{
+	struct llist_node *head;
+	struct sk_buff *skb, *n;
+
+	head = llist_del_all(&sk->defer_list);
+	llist_for_each_entry_safe(skb, n, head, ll_node) {
+		prefetch(n);
+		skb_mark_not_on_list(skb);
+		__kfree_skb(skb);
+	}
+}
+EXPORT_SYMBOL(__sk_defer_free_flush);
+
+static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	__skb_unlink(skb, &sk->sk_receive_queue);
+	if (likely(skb->destructor == sock_rfree)) {
+		sock_rfree(skb);
+		skb->destructor = NULL;
+		skb->sk = NULL;
+		if (!skb_queue_empty(&sk->sk_receive_queue) ||
+		    !llist_empty(&sk->defer_list)) {
+			llist_add(&skb->ll_node, &sk->defer_list);
+			return;
+		}
+	}
+	__kfree_skb(skb);
+}
+
 static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 {
 	struct sk_buff *skb;
@@ -1599,7 +1629,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 		 * splitted a fat GRO packet, while we released socket lock
 		 * in skb_splice_bits()
 		 */
-		sk_eat_skb(sk, skb);
+		tcp_eat_recv_skb(sk, skb);
 	}
 	return NULL;
 }
@@ -1633,7 +1663,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 
 			len = skb->len - offset;
 			/* Stop reading if we hit a patch of urgent data */
-			if (tp->urg_data) {
+			if (unlikely(tp->urg_data)) {
 				u32 urg_offset = tp->urg_seq - seq;
 				if (urg_offset < len)
 					len = urg_offset;
@@ -1665,11 +1695,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 				continue;
 		}
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
-			sk_eat_skb(sk, skb);
+			tcp_eat_recv_skb(sk, skb);
 			++seq;
 			break;
 		}
-		sk_eat_skb(sk, skb);
+		tcp_eat_recv_skb(sk, skb);
 		if (!desc->count)
 			break;
 		WRITE_ONCE(tp->copied_seq, seq);
@@ -1758,6 +1788,9 @@ static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
 {
 	skb_frag_t *frag;
 
+	if (unlikely(offset_skb >= skb->len))
+		return NULL;
+
 	offset_skb -= skb_headlen(skb);
 	if ((int)offset_skb < 0 || skb_has_frag_list(skb))
 		return NULL;
@@ -2326,7 +2359,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
 		u32 offset;
 
 		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
-		if (tp->urg_data && tp->urg_seq == *seq) {
+		if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
 			if (copied)
 				break;
 			if (signal_pending(current)) {
@@ -2369,10 +2402,10 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
 			break;
 
 		if (copied) {
-			if (sk->sk_err ||
+			if (!timeo ||
+			    sk->sk_err ||
 			    sk->sk_state == TCP_CLOSE ||
 			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
-			    !timeo ||
 			    signal_pending(current))
 				break;
 		} else {
@@ -2406,13 +2439,12 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
 			}
 		}
 
-		tcp_cleanup_rbuf(sk, copied);
-
 		if (copied >= target) {
 			/* Do not sleep, just process backlog. */
-			release_sock(sk);
-			lock_sock(sk);
+			__sk_flush_backlog(sk);
 		} else {
+			tcp_cleanup_rbuf(sk, copied);
+			sk_defer_free_flush(sk);
 			sk_wait_data(sk, &timeo, last);
 		}
 
@@ -2432,7 +2464,7 @@ found_ok_skb:
 			used = len;
 
 		/* Do we have urgent data here? */
-		if (tp->urg_data) {
+		if (unlikely(tp->urg_data)) {
 			u32 urg_offset = tp->urg_seq - *seq;
 			if (urg_offset < used) {
 				if (!urg_offset) {
@@ -2466,8 +2498,8 @@ found_ok_skb:
 		tcp_rcv_space_adjust(sk);
 
 skip_copy:
-		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
-			tp->urg_data = 0;
+		if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
+			WRITE_ONCE(tp->urg_data, 0);
 			tcp_fast_path_check(sk);
 		}
 
@@ -2482,14 +2514,14 @@ skip_copy:
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 			goto found_fin_ok;
 		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+			tcp_eat_recv_skb(sk, skb);
 		continue;
 
found_fin_ok:
 		/* Process the FIN. */
 		WRITE_ONCE(*seq, *seq + 1);
 		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+			tcp_eat_recv_skb(sk, skb);
 		break;
 	} while (len > 0);
 
@@ -2531,6 +2563,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
 				 &cmsg_flags);
 	release_sock(sk);
+	sk_defer_free_flush(sk);
 
 	if (cmsg_flags && ret >= 0) {
 		if (cmsg_flags & TCP_CMSG_TS)
@@ -2961,7 +2994,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tcp_clear_xmit_timers(sk);
 	__skb_queue_purge(&sk->sk_receive_queue);
 	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
-	tp->urg_data = 0;
+	WRITE_ONCE(tp->urg_data, 0);
 	tcp_write_queue_purge(sk);
 	tcp_fastopen_active_disable_ofo_check(sk);
 	skb_rbtree_purge(&tp->out_of_order_queue);
@@ -3009,8 +3042,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
 	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
 	__sk_dst_reset(sk);
-	dst_release(sk->sk_rx_dst);
-	sk->sk_rx_dst = NULL;
+	dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL));
 	tcp_saved_syn_free(tp);
 	tp->compressed_ack = 0;
 	tp->segs_in = 0;
@@ -3056,7 +3088,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 		sk->sk_frag.page = NULL;
 		sk->sk_frag.offset = 0;
 	}
-
+	sk_defer_free_flush(sk);
 	sk_error_report(sk);
 	return 0;
 }
@@ -3174,7 +3206,7 @@ static void tcp_enable_tx_delay(void)
  * TCP_CORK can be set together with TCP_NODELAY and it is stronger than
  * TCP_NODELAY.
  */
-static void __tcp_sock_set_cork(struct sock *sk, bool on)
+void __tcp_sock_set_cork(struct sock *sk, bool on)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -3202,7 +3234,7 @@ EXPORT_SYMBOL(tcp_sock_set_cork);
 * However, when TCP_NODELAY is set we make an explicit push, which overrides
 * even TCP_CORK for currently queued segments.
 */
-static void __tcp_sock_set_nodelay(struct sock *sk, bool on)
+void __tcp_sock_set_nodelay(struct sock *sk, bool on)
 {
 	if (on) {
 		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
@@ -3771,10 +3803,12 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	tcp_get_info_chrono_stats(tp, info);
 
 	info->tcpi_segs_out = tp->segs_out;
-	info->tcpi_segs_in = tp->segs_in;
+
+	/* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */
+	info->tcpi_segs_in = READ_ONCE(tp->segs_in);
+	info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);
 
 	info->tcpi_min_rtt = tcp_min_rtt(tp);
-	info->tcpi_data_segs_in = tp->data_segs_in;
 	info->tcpi_data_segs_out = tp->data_segs_out;
 
 	info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
@@ -4183,6 +4217,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
 							  &zc, &len, err);
 		release_sock(sk);
+		sk_defer_free_flush(sk);
 		if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
 			goto zerocopy_rcv_cmsg;
 		switch (len) {
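
Note on the deferred-free path added above: tcp_eat_recv_skb() parks consumed skbs on sk->defer_list via a lockless llist instead of freeing them while the socket lock is held, and callers such as tcp_recvmsg() and the zerocopy getsockopt path flush that list right after release_sock(). The sk_defer_free_flush() helper invoked in those callers is not defined in net/ipv4/tcp.c; assuming it is a header inline that only short-circuits when nothing was deferred, a sketch consistent with how it is used here would be:

/* Sketch only -- not part of this diff; the real wrapper is assumed to live
 * in a shared header alongside the declaration of __sk_defer_free_flush().
 */
static inline void sk_defer_free_flush(struct sock *sk)
{
	/* Common case: nothing was parked on the defer list. */
	if (llist_empty(&sk->defer_list))
		return;
	__sk_defer_free_flush(sk);	/* walk the llist and __kfree_skb() each entry */
}

Keeping the empty-list test inline while the freeing loop stays out of line makes the fast path a single load, and the llist_add()/llist_del_all() pairing lets tcp_eat_recv_skb() queue buffers from under the socket lock without taking any additional lock.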