Diffstat (limited to 'net/unix')
-rw-r--r--   net/unix/af_unix.c    213
-rw-r--r--   net/unix/diag.c        47
-rw-r--r--   net/unix/garbage.c      8
-rw-r--r--   net/unix/unix_bpf.c     3
4 files changed, 162 insertions, 109 deletions
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 142f56770b77..0be0dcb07f7b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -126,6 +126,81 @@ static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
  *    hash table is protected with spinlock.
  *    each socket state is protected by separate spinlock.
  */
+#ifdef CONFIG_PROVE_LOCKING
+#define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))
+
+static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
+				  const struct lockdep_map *b)
+{
+	return cmp_ptr(a, b);
+}
+
+static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
+				  const struct lockdep_map *_b)
+{
+	const struct unix_sock *a, *b;
+
+	a = container_of(_a, struct unix_sock, lock.dep_map);
+	b = container_of(_b, struct unix_sock, lock.dep_map);
+
+	if (a->sk.sk_state == TCP_LISTEN) {
+		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
+		 *
+		 *   1. a is TCP_LISTEN.
+		 *   2. b is not a.
+		 *   3. concurrent connect(b -> a) must fail.
+		 *
+		 * Except for 2. & 3., the b's state can be any possible
+		 * value due to concurrent connect() or listen().
+		 *
+		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
+		 * be expressed as lock_cmp_fn.
+		 */
+		switch (b->sk.sk_state) {
+		case TCP_CLOSE:
+		case TCP_ESTABLISHED:
+		case TCP_LISTEN:
+			return -1;
+		default:
+			/* Invalid case. */
+			return 0;
+		}
+	}
+
+	/* Should never happen.  Just to be symmetric. */
+	if (b->sk.sk_state == TCP_LISTEN) {
+		switch (b->sk.sk_state) {
+		case TCP_CLOSE:
+		case TCP_ESTABLISHED:
+			return 1;
+		default:
+			return 0;
+		}
+	}
+
+	/* unix_state_double_lock(): ascending address order. */
+	return cmp_ptr(a, b);
+}
+
+static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
+				  const struct lockdep_map *_b)
+{
+	const struct sock *a, *b;
+
+	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
+	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
+
+	/* unix_collect_skb(): listener -> embryo order. */
+	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
+		return -1;
+
+	/* Should never happen.  Just to be symmetric. */
+	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
+		return 1;
+
+	return 0;
+}
+#endif
 
 static unsigned int unix_unbound_hash(struct sock *sk)
 {
@@ -168,7 +243,7 @@ static void unix_table_double_lock(struct net *net,
 		swap(hash1, hash2);
 
 	spin_lock(&net->unx.table.locks[hash1]);
-	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
+	spin_lock(&net->unx.table.locks[hash2]);
 }
 
 static void unix_table_double_unlock(struct net *net,
@@ -647,8 +722,8 @@ static void unix_release_sock(struct sock *sk, int embrion)
 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 		if (state == TCP_LISTEN)
 			unix_release_sock(skb->sk, 1);
+
 		/* passed fds are erased in the kfree_skb hook	      */
-		UNIXCB(skb).consumed = skb->len;
 		kfree_skb(skb);
 	}
 
@@ -676,14 +751,19 @@ static void unix_release_sock(struct sock *sk, int embrion)
 
 static void init_peercred(struct sock *sk)
 {
+	sk->sk_peer_pid = get_pid(task_tgid(current));
+	sk->sk_peer_cred = get_current_cred();
+}
+
+static void update_peercred(struct sock *sk)
+{
 	const struct cred *old_cred;
 	struct pid *old_pid;
 
 	spin_lock(&sk->sk_peer_lock);
 	old_pid = sk->sk_peer_pid;
 	old_cred = sk->sk_peer_cred;
-	sk->sk_peer_pid  = get_pid(task_tgid(current));
-	sk->sk_peer_cred = get_current_cred();
+	init_peercred(sk);
 	spin_unlock(&sk->sk_peer_lock);
 
 	put_pid(old_pid);
@@ -692,26 +772,12 @@ static void init_peercred(struct sock *sk)
 
 static void copy_peercred(struct sock *sk, struct sock *peersk)
 {
-	const struct cred *old_cred;
-	struct pid *old_pid;
+	lockdep_assert_held(&unix_sk(peersk)->lock);
 
-	if (sk < peersk) {
-		spin_lock(&sk->sk_peer_lock);
-		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
-	} else {
-		spin_lock(&peersk->sk_peer_lock);
-		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
-	}
-	old_pid = sk->sk_peer_pid;
-	old_cred = sk->sk_peer_cred;
-	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
+	spin_lock(&sk->sk_peer_lock);
+	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
-
 	spin_unlock(&sk->sk_peer_lock);
-	spin_unlock(&peersk->sk_peer_lock);
-
-	put_pid(old_pid);
-	put_cred(old_cred);
 }
 
 static int unix_listen(struct socket *sock, int backlog)
@@ -735,7 +801,7 @@ static int unix_listen(struct socket *sock, int backlog)
 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
 
 	/* set credentials so connect can copy them */
-	init_peercred(sk);
+	update_peercred(sk);
 	err = 0;
 
 out_unlock:
@@ -972,12 +1038,15 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
 	sk->sk_write_space	= unix_write_space;
 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
 	sk->sk_destruct		= unix_sock_destructor;
+	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
+
 	u = unix_sk(sk);
 	u->listener = NULL;
 	u->vertex = NULL;
 	u->path.dentry = NULL;
 	u->path.mnt = NULL;
 	spin_lock_init(&u->lock);
+	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
 	mutex_init(&u->iolock); /* single task reading lock */
 	mutex_init(&u->bindlock); /* single task binding lock */
 	init_waitqueue_head(&u->peer_wait);
@@ -1326,11 +1395,12 @@ static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
 		unix_state_lock(sk1);
 		return;
 	}
+
 	if (sk1 > sk2)
 		swap(sk1, sk2);
 
 	unix_state_lock(sk1);
-	unix_state_lock_nested(sk2, U_LOCK_SECOND);
+	unix_state_lock(sk2);
 }
 
 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
@@ -1473,6 +1543,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
 	struct net *net = sock_net(sk);
 	struct sk_buff *skb = NULL;
+	unsigned char state;
 	long timeo;
 	int err;
 
@@ -1523,7 +1594,6 @@ restart:
 		goto out;
 	}
 
-	/* Latch state of peer */
 	unix_state_lock(other);
 
 	/* Apparently VFS overslept socket death. Retry. */
@@ -1553,37 +1623,21 @@ restart:
 		goto restart;
 	}
 
-	/* Latch our state.
-
-	   It is tricky place. We need to grab our state lock and cannot
-	   drop lock on peer. It is dangerous because deadlock is
-	   possible. Connect to self case and simultaneous
-	   attempt to connect are eliminated by checking socket
-	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
-	   check this before attempt to grab lock.
-
-	   Well, and we have to recheck the state after socket locked.
+	/* self connect and simultaneous connect are eliminated
+	 * by rejecting TCP_LISTEN socket to avoid deadlock.
 	 */
-	switch (READ_ONCE(sk->sk_state)) {
-	case TCP_CLOSE:
-		/* This is ok... continue with connect */
-		break;
-	case TCP_ESTABLISHED:
-		/* Socket is already connected */
-		err = -EISCONN;
-		goto out_unlock;
-	default:
-		err = -EINVAL;
+	state = READ_ONCE(sk->sk_state);
+	if (unlikely(state != TCP_CLOSE)) {
+		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
 		goto out_unlock;
 	}
 
-	unix_state_lock_nested(sk, U_LOCK_SECOND);
+	unix_state_lock(sk);
 
-	if (sk->sk_state != TCP_CLOSE) {
+	if (unlikely(sk->sk_state != TCP_CLOSE)) {
+		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
 		unix_state_unlock(sk);
-		unix_state_unlock(other);
-		sock_put(other);
-		goto restart;
+		goto out_unlock;
 	}
 
 	err = security_unix_stream_connect(sk, other, newsk);
@@ -2667,10 +2721,49 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
 
 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
+	struct unix_sock *u = unix_sk(sk);
+	struct sk_buff *skb;
+	int err;
+
 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
 		return -ENOTCONN;
 
-	return unix_read_skb(sk, recv_actor);
+	mutex_lock(&u->iolock);
+	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
+	mutex_unlock(&u->iolock);
+	if (!skb)
+		return err;
+
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
+		bool drop = false;
+
+		unix_state_lock(sk);
+
+		if (sock_flag(sk, SOCK_DEAD)) {
+			unix_state_unlock(sk);
+			kfree_skb(skb);
+			return -ECONNRESET;
+		}
+
+		spin_lock(&sk->sk_receive_queue.lock);
+		if (likely(skb == u->oob_skb)) {
+			WRITE_ONCE(u->oob_skb, NULL);
+			drop = true;
+		}
+		spin_unlock(&sk->sk_receive_queue.lock);
+
+		unix_state_unlock(sk);
+
+		if (drop) {
+			WARN_ON_ONCE(skb_unref(skb));
+			kfree_skb(skb);
+			return -EAGAIN;
+		}
+	}
+#endif
+
+	return recv_actor(sk, skb);
 }
 
 static int unix_stream_read_generic(struct unix_stream_read_state *state,
@@ -2717,9 +2810,8 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 	skip = max(sk_peek_offset(sk, flags), 0);
 
 	do {
-		int chunk;
-		bool drop_skb;
 		struct sk_buff *skb, *last;
+		int chunk;
 
 redo:
 		unix_state_lock(sk);
@@ -2815,11 +2907,7 @@ unlock:
 		}
 
 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
-		skb_get(skb);
 		chunk = state->recv_actor(skb, skip, chunk, state);
-		drop_skb = !unix_skb_len(skb);
-		/* skb is only safe to use if !drop_skb */
-		consume_skb(skb);
 		if (chunk < 0) {
 			if (copied == 0)
 				copied = -EFAULT;
@@ -2828,18 +2916,6 @@ unlock:
 		copied += chunk;
 		size -= chunk;
 
-		if (drop_skb) {
-			/* the skb was touched by a concurrent reader;
-			 * we should not expect anything from this skb
-			 * anymore and assume it invalid - we can be
-			 * sure it was dropped from the socket queue
-			 *
-			 * let's report a short read
-			 */
-			err = 0;
-			break;
-		}
-
		/* Mark read part of skb as used */
 		if (!(flags & MSG_PEEK)) {
 			UNIXCB(skb).consumed += chunk;
@@ -3620,6 +3696,7 @@ static int __net_init unix_net_init(struct net *net)
 
 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
 		spin_lock_init(&net->unx.table.locks[i]);
+		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
 	}
 
diff --git a/net/unix/diag.c b/net/unix/diag.c
index 937edf4afed4..9138af8b465e 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -47,9 +47,7 @@ static int sk_diag_dump_peer(struct sock *sk, struct sk_buff *nlskb)
 
 	peer = unix_peer_get(sk);
 	if (peer) {
-		unix_state_lock(peer);
 		ino = sock_i_ino(peer);
-		unix_state_unlock(peer);
 		sock_put(peer);
 
 		return nla_put_u32(nlskb, UNIX_DIAG_PEER, ino);
@@ -75,20 +73,9 @@ static int sk_diag_dump_icons(struct sock *sk, struct sk_buff *nlskb)
 
 		buf = nla_data(attr);
 		i = 0;
-		skb_queue_walk(&sk->sk_receive_queue, skb) {
-			struct sock *req, *peer;
-
-			req = skb->sk;
-			/*
-			 * The state lock is outer for the same sk's
-			 * queue lock. With the other's queue locked it's
-			 * OK to lock the state.
-			 */
-			unix_state_lock_nested(req, U_LOCK_DIAG);
-			peer = unix_sk(req)->peer;
-			buf[i++] = (peer ? sock_i_ino(peer) : 0);
-			unix_state_unlock(req);
-		}
+		skb_queue_walk(&sk->sk_receive_queue, skb)
+			buf[i++] = sock_i_ino(unix_peer(skb->sk));
+
 		spin_unlock(&sk->sk_receive_queue.lock);
 	}
 
@@ -180,22 +167,6 @@ out_nlmsg_trim:
 	return -EMSGSIZE;
 }
 
-static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req,
-			struct user_namespace *user_ns,
-			u32 portid, u32 seq, u32 flags)
-{
-	int sk_ino;
-
-	unix_state_lock(sk);
-	sk_ino = sock_i_ino(sk);
-	unix_state_unlock(sk);
-
-	if (!sk_ino)
-		return 0;
-
-	return sk_diag_fill(sk, skb, req, user_ns, portid, seq, flags, sk_ino);
-}
-
 static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
@@ -213,14 +184,22 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		num = 0;
 		spin_lock(&net->unx.table.locks[slot]);
 		sk_for_each(sk, &net->unx.table.buckets[slot]) {
+			int sk_ino;
+
 			if (num < s_num)
 				goto next;
+
 			if (!(req->udiag_states & (1 << READ_ONCE(sk->sk_state))))
 				goto next;
-			if (sk_diag_dump(sk, skb, req, sk_user_ns(skb->sk),
+
+			sk_ino = sock_i_ino(sk);
+			if (!sk_ino)
+				goto next;
+
+			if (sk_diag_fill(sk, skb, req, sk_user_ns(skb->sk),
 					 NETLINK_CB(cb->skb).portid,
 					 cb->nlh->nlmsg_seq,
-					 NLM_F_MULTI) < 0) {
+					 NLM_F_MULTI, sk_ino) < 0) {
 				spin_unlock(&net->unx.table.locks[slot]);
 				goto done;
 			}
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 23efb78fe9ef..06d94ad999e9 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -337,11 +337,6 @@ static bool unix_vertex_dead(struct unix_vertex *vertex)
 	return true;
 }
 
-enum unix_recv_queue_lock_class {
-	U_RECVQ_LOCK_NORMAL,
-	U_RECVQ_LOCK_EMBRYO,
-};
-
 static void unix_collect_queue(struct unix_sock *u, struct sk_buff_head *hitlist)
 {
 	skb_queue_splice_init(&u->sk.sk_receive_queue, hitlist);
@@ -375,8 +370,7 @@ static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist
 			skb_queue_walk(queue, skb) {
 				struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue;
 
-				/* listener -> embryo order, the inversion never happens. */
-				spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO);
+				spin_lock(&embryo_queue->lock);
 				unix_collect_queue(unix_sk(skb->sk), hitlist);
 				spin_unlock(&embryo_queue->lock);
 			}
diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
index bd84785bf8d6..bca2d86ba97d 100644
--- a/net/unix/unix_bpf.c
+++ b/net/unix/unix_bpf.c
@@ -54,6 +54,9 @@ static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
 	struct sk_psock *psock;
 	int copied;
 
+	if (flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
 	if (!len)
 		return 0;
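
The lockdep annotation this series switches to works as follows: lock_set_cmp_fn() registers a per-class comparison callback, and lockdep then accepts two locks of the same class being held together whenever the callback reports a well-defined order, instead of relying on spin_lock_nested()/unix_state_lock_nested() subclass annotations, which hide the ordering from lockdep rather than validate it. The sketch below is illustrative only and is not part of the patch; struct foo, foo_lock_cmp_fn() and foo_double_lock() are made-up names, and it assumes CONFIG_PROVE_LOCKING, mirroring the #ifdef used in af_unix.c above.

/* Illustrative sketch, not from the patch: an address-order rule for a
 * hypothetical per-object spinlock, the same pattern used by
 * unix_table_lock_cmp_fn() and unix_state_double_lock().
 */
#include <linux/lockdep.h>
#include <linux/minmax.h>
#include <linux/spinlock.h>

struct foo {			/* hypothetical object with a per-object lock */
	spinlock_t lock;
};

#ifdef CONFIG_PROVE_LOCKING
/* <0: a may be taken before b, >0: the reverse, 0: no valid order.
 * Comparing the lockdep_map addresses works because dep_map sits at the
 * same offset in every struct foo, so their order matches object order.
 */
static int foo_lock_cmp_fn(const struct lockdep_map *a,
			   const struct lockdep_map *b)
{
	return (a > b) - (a < b);
}
#endif

static void foo_init(struct foo *f)
{
	spin_lock_init(&f->lock);
	lock_set_cmp_fn(&f->lock, foo_lock_cmp_fn, NULL);
}

/* Lock two distinct foos.  Plain spin_lock() on the second lock of the
 * same class no longer triggers a deadlock report, because the cmp_fn
 * tells lockdep that ascending address order is a legitimate nesting.
 */
static void foo_double_lock(struct foo *f1, struct foo *f2)
{
	if (f1 > f2)
		swap(f1, f2);

	spin_lock(&f1->lock);
	spin_lock(&f2->lock);
}

Unlike spin_lock_nested(), which simply assigns a different subclass and so never notices an inverted acquisition, a registered cmp_fn keeps both locks in one class and still produces a lockdep report if the declared order is violated.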