Diffstat (limited to 'net/core/sock.c')
-rw-r--r--   net/core/sock.c   122
1 file changed, 88 insertions, 34 deletions
diff --git a/net/core/sock.c b/net/core/sock.c
index b5baeb9cb0fb..ac2a404c73eb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -102,6 +102,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/timer.h>
 #include <linux/string.h>
 #include <linux/sockios.h>
@@ -138,10 +139,7 @@
 
 #include <trace/events/sock.h>
 
-#ifdef CONFIG_INET
 #include <net/tcp.h>
-#endif
-
 #include <net/busy_poll.h>
 
 static DEFINE_MUTEX(proto_list_mutex);
@@ -372,14 +370,14 @@ EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 {
 	int ret;
-	unsigned long pflags = current->flags;
+	unsigned int noreclaim_flag;
 
 	/* these should have been dropped before queueing */
 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 
-	current->flags |= PF_MEMALLOC;
+	noreclaim_flag = memalloc_noreclaim_save();
 	ret = sk->sk_backlog_rcv(sk, skb);
-	current_restore_flags(pflags, PF_MEMALLOC);
+	memalloc_noreclaim_restore(noreclaim_flag);
 
 	return ret;
 }
@@ -1040,6 +1038,10 @@ set_rcvbuf:
 #endif
 
 	case SO_MAX_PACING_RATE:
+		if (val != ~0U)
+			cmpxchg(&sk->sk_pacing_status,
+				SK_PACING_NONE,
+				SK_PACING_NEEDED);
 		sk->sk_max_pacing_rate = val;
 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 					 sk->sk_max_pacing_rate);
@@ -1076,6 +1078,18 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred,
 	}
 }
 
+static int groups_to_user(gid_t __user *dst, const struct group_info *src)
+{
+	struct user_namespace *user_ns = current_user_ns();
+	int i;
+
+	for (i = 0; i < src->ngroups; i++)
+		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
+			return -EFAULT;
+
+	return 0;
+}
+
 int sock_getsockopt(struct socket *sock, int level, int optname,
 		    char __user *optval, int __user *optlen)
 {
@@ -1229,6 +1243,27 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		goto lenout;
 	}
 
+	case SO_PEERGROUPS:
+	{
+		int ret, n;
+
+		if (!sk->sk_peer_cred)
+			return -ENODATA;
+
+		n = sk->sk_peer_cred->group_info->ngroups;
+		if (len < n * sizeof(gid_t)) {
+			len = n * sizeof(gid_t);
+			return put_user(len, optlen) ? -EFAULT : -ERANGE;
+		}
+		len = n * sizeof(gid_t);
+
+		ret = groups_to_user((gid_t __user *)optval,
+				     sk->sk_peer_cred->group_info);
+		if (ret)
+			return ret;
+		goto lenout;
+	}
+
 	case SO_PEERNAME:
 	{
 		char address[128];
@@ -1493,7 +1528,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		if (likely(sk->sk_net_refcnt))
 			get_net(net);
 		sock_net_set(sk, net);
-		atomic_set(&sk->sk_wmem_alloc, 1);
+		refcount_set(&sk->sk_wmem_alloc, 1);
 
 		mem_cgroup_sk_alloc(sk);
 		cgroup_sk_alloc(&sk->sk_cgrp_data);
@@ -1517,7 +1552,7 @@ static void __sk_destruct(struct rcu_head *head)
 		sk->sk_destruct(sk);
 
 	filter = rcu_dereference_check(sk->sk_filter,
-				       atomic_read(&sk->sk_wmem_alloc) == 0);
+				       refcount_read(&sk->sk_wmem_alloc) == 0);
 	if (filter) {
 		sk_filter_uncharge(sk, filter);
 		RCU_INIT_POINTER(sk->sk_filter, NULL);
@@ -1567,7 +1602,7 @@ void sk_free(struct sock *sk)
 	 * some packets are still in some tx queue.
 	 * If not null, sock_wfree() will call __sk_free(sk) later
 	 */
-	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
+	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
 		__sk_free(sk);
 }
 EXPORT_SYMBOL(sk_free);
@@ -1624,7 +1659,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		/*
 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
 		 */
-		atomic_set(&newsk->sk_wmem_alloc, 1);
+		refcount_set(&newsk->sk_wmem_alloc, 1);
 		atomic_set(&newsk->sk_omem_alloc, 0);
 		sk_init_common(newsk);
@@ -1673,7 +1708,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		 * (Documentation/RCU/rculist_nulls.txt for details)
 		 */
 		smp_wmb();
-		atomic_set(&newsk->sk_refcnt, 2);
+		refcount_set(&newsk->sk_refcnt, 2);
 
 		/*
 		 * Increment the counter in the same struct proto as the master
@@ -1752,7 +1787,7 @@ void sock_wfree(struct sk_buff *skb)
 		 * Keep a reference on sk_wmem_alloc, this will be released
 		 * after sk_write_space() call
 		 */
-		atomic_sub(len - 1, &sk->sk_wmem_alloc);
+		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
 		sk->sk_write_space(sk);
 		len = 1;
 	}
@@ -1760,7 +1795,7 @@ void sock_wfree(struct sk_buff *skb)
 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
 	 * could not do because of in-flight packets
 	 */
-	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
+	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
 		__sk_free(sk);
 }
 EXPORT_SYMBOL(sock_wfree);
@@ -1772,7 +1807,7 @@ void __sock_wfree(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 
-	if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
+	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
 		__sk_free(sk);
 }
 
@@ -1794,7 +1829,7 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
 	 * is enough to guarantee sk_free() wont free this sock until
 	 * all in-flight packets are completed
 	 */
-	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
 }
 EXPORT_SYMBOL(skb_set_owner_w);
 
@@ -1802,28 +1837,24 @@ EXPORT_SYMBOL(skb_set_owner_w);
  * delay queue. We want to allow the owner socket to send more
  * packets, as if they were already TX completed by a typical driver.
  * But we also want to keep skb->sk set because some packet schedulers
- * rely on it (sch_fq for example). So we set skb->truesize to a small
- * amount (1) and decrease sk_wmem_alloc accordingly.
+ * rely on it (sch_fq for example).
  */
 void skb_orphan_partial(struct sk_buff *skb)
 {
-	/* If this skb is a TCP pure ACK or already went here,
-	 * we have nothing to do. 2 is already a very small truesize.
-	 */
-	if (skb->truesize <= 2)
+	if (skb_is_tcp_pure_ack(skb))
 		return;
 
-	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
-	 * so we do not completely orphan skb, but transfert all
-	 * accounted bytes but one, to avoid unexpected reorders.
-	 */
 	if (skb->destructor == sock_wfree
 #ifdef CONFIG_INET
 	    || skb->destructor == tcp_wfree
 #endif
 		) {
-		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
-		skb->truesize = 1;
+		struct sock *sk = skb->sk;
+
+		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
+			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
+			skb->destructor = sock_efree;
+		}
 	} else {
 		skb_orphan(skb);
 	}
@@ -1881,7 +1912,7 @@ EXPORT_SYMBOL(sock_i_ino);
 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
 			     gfp_t priority)
 {
-	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
 		struct sk_buff *skb = alloc_skb(size, priority);
 		if (skb) {
 			skb_set_owner_w(skb, sk);
@@ -1956,7 +1987,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
 			break;
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
 			break;
 		if (sk->sk_shutdown & SEND_SHUTDOWN)
 			break;
@@ -2078,6 +2109,26 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
 }
 EXPORT_SYMBOL(sock_cmsg_send);
 
+static void sk_enter_memory_pressure(struct sock *sk)
+{
+	if (!sk->sk_prot->enter_memory_pressure)
+		return;
+
+	sk->sk_prot->enter_memory_pressure(sk);
+}
+
+static void sk_leave_memory_pressure(struct sock *sk)
+{
+	if (sk->sk_prot->leave_memory_pressure) {
+		sk->sk_prot->leave_memory_pressure(sk);
+	} else {
+		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
+
+		if (memory_pressure && *memory_pressure)
+			*memory_pressure = 0;
+	}
+}
+
 /* On 32bit arches, an skb frag is limited to 2^15 */
 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
 
@@ -2259,7 +2310,7 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 		if (sk->sk_type == SOCK_STREAM) {
 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
 				return 1;
-		} else if (atomic_read(&sk->sk_wmem_alloc) <
+		} else if (refcount_read(&sk->sk_wmem_alloc) <
 			   prot->sysctl_wmem[0])
 				return 1;
 	}
@@ -2526,7 +2577,7 @@ static void sock_def_write_space(struct sock *sk)
 	/* Do not wake up a writer until he can make "significant"
 	 * progress.  --DaveM
 	 */
-	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
 		wq = rcu_dereference(sk->sk_wq);
 		if (skwq_has_sleeper(wq))
 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
@@ -2636,7 +2687,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	 * (Documentation/RCU/rculist_nulls.txt for details)
 	 */
 	smp_wmb();
-	atomic_set(&sk->sk_refcnt, 1);
+	refcount_set(&sk->sk_refcnt, 1);
 	atomic_set(&sk->sk_drops, 0);
 }
 EXPORT_SYMBOL(sock_init_data);
@@ -2681,9 +2732,12 @@ EXPORT_SYMBOL(release_sock);
  * @sk: socket
 *
 * This version should be used for very small section, where process wont block
- * return false if fast path is taken
+ * return false if fast path is taken:
+ *
 *   sk_lock.slock locked, owned = 0, BH disabled
- * return true if slow path is taken
+ *
+ * return true if slow path is taken:
+ *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
 bool lock_sock_fast(struct sock *sk)
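Context for the SO_MAX_PACING_RATE hunk above: the added cmpxchg() flips sk_pacing_status from SK_PACING_NONE to SK_PACING_NEEDED whenever userspace asks for any finite rate, so only the "unlimited" value (~0U) leaves pacing off. A minimal userspace sketch of how that option is typically driven (the helper name is made up for illustration; SO_MAX_PACING_RATE itself takes a rate in bytes per second):

#include <sys/socket.h>

/* Hypothetical helper: cap a socket's pacing rate, in bytes per second.
 * With this patch, any value other than ~0U also marks the socket as
 * SK_PACING_NEEDED in the kernel.
 */
static int limit_pacing_rate(int fd, unsigned int bytes_per_sec)
{
	return setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
			  &bytes_per_sec, sizeof(bytes_per_sec));
}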
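The SO_PEERGROUPS hunks add a getsockopt() that exposes the peer's supplementary groups alongside the existing SO_PEERCRED data. A rough userspace sketch of the intended two-step call pattern, mirroring the kernel code above: probe with a zero-length buffer, let the kernel report the required size via ERANGE, then retry. get_peer_groups() is a made-up helper, and the SO_PEERGROUPS constant shown is an assumption based on the asm-generic value; it may differ per architecture or be provided by newer libc headers.

#include <errno.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/socket.h>

#ifndef SO_PEERGROUPS
#define SO_PEERGROUPS 59	/* assumption: asm-generic value, Linux >= 4.13 */
#endif

/* Hypothetical helper: return a malloc()ed array of the peer's
 * supplementary groups, or NULL. A too-small buffer fails with ERANGE
 * and the required length is written back through optlen.
 */
static gid_t *get_peer_groups(int fd, int *ngroups)
{
	socklen_t len = 0;
	gid_t *groups;

	if (getsockopt(fd, SOL_SOCKET, SO_PEERGROUPS, NULL, &len) == 0) {
		*ngroups = 0;	/* peer has no supplementary groups */
		return NULL;
	}
	if (errno != ERANGE)
		return NULL;	/* e.g. ENODATA when there are no peer credentials */

	groups = malloc(len);
	if (!groups)
		return NULL;

	if (getsockopt(fd, SOL_SOCKET, SO_PEERGROUPS, groups, &len) != 0) {
		free(groups);
		return NULL;
	}

	*ngroups = len / sizeof(gid_t);
	return groups;
}

As with SO_PEERCRED, this only returns something useful on sockets that carry peer credentials, such as connected AF_UNIX sockets.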