From 1ce0bf50ae2233c7115a18c0c623662d177b434c Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 26 Nov 2015 13:55:39 +0800
Subject: net: Generalise wq_has_sleeper helper

The memory barrier in the helper wq_has_sleeper is needed by just
about every user of waitqueue_active.  This patch generalises it
by making it take a wait_queue_head_t directly.  The existing
helper is renamed to skwq_has_sleeper.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/unix')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 955ec152cb71..efb706e1d1c0 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -339,7 +339,7 @@ static void unix_write_space(struct sock *sk)
 	rcu_read_lock();
 	if (unix_writable(sk)) {
 		wq = rcu_dereference(sk->sk_wq);
-		if (wq_has_sleeper(wq))
+		if (skwq_has_sleeper(wq))
 			wake_up_interruptible_sync_poll(&wq->wait,
 				POLLOUT | POLLWRNORM | POLLWRBAND);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
-- 
cgit 


From 77b75f4d8cf105b599beef38724f8171e557919d Mon Sep 17 00:00:00 2001
From: Rainer Weikusat <rweikusat@mobileactivedefense.com>
Date: Thu, 26 Nov 2015 19:23:15 +0000
Subject: unix: use wq_has_sleeper in unix_dgram_recvmsg

The current unix_dgram_recvmsg does a wake up for every received
datagram. This seems wasteful as only SOCK_DGRAM client sockets in an
n:1 association with a server socket will ever wait because of the
associated condition. The patch below changes the function such that the
wake up only happens if wq_has_sleeper indicates that someone actually
wants to be notified. Testing with SOCK_SEQPACKET and SOCK_DGRAM socket
seems to confirm that this is an improvment.

Signed-Off-By: Rainer Weikusat <rweikusat@mobileactivedefense.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/unix')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index efb706e1d1c0..ac011b97097d 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1914,8 +1914,10 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 		goto out_unlock;
 	}
 
-	wake_up_interruptible_sync_poll(&u->peer_wait,
-					POLLOUT | POLLWRNORM | POLLWRBAND);
+	if (wq_has_sleeper(&u->peer_wait))
+		wake_up_interruptible_sync_poll(&u->peer_wait,
+						POLLOUT | POLLWRNORM |
+						POLLWRBAND);
 
 	if (msg->msg_name)
 		unix_copy_addr(msg, skb->sk);
-- 
cgit 


From 64874280889e7c0b2c9266705363627d4c92cf01 Mon Sep 17 00:00:00 2001
From: Rainer Weikusat <rweikusat@mobileactivedefense.com>
Date: Sun, 6 Dec 2015 21:11:38 +0000
Subject: af_unix: fix unix_dgram_recvmsg entry locking

The current unix_dgram_recvsmg code acquires the u->readlock mutex in
order to protect access to the peek offset prior to calling
__skb_recv_datagram for actually receiving data. This implies that a
blocking reader will go to sleep with this mutex held if there's
presently no data to return to userspace. Two non-desirable side effects
of this are that a later non-blocking read call on the same socket will
block on the ->readlock mutex until the earlier blocking call releases it
(or the readers is interrupted) and that later blocking read calls
will wait longer than the effective socket read timeout says they
should: The timeout will only start 'ticking' once such a reader hits
the schedule_timeout in wait_for_more_packets (core.c) while the time it
already had to wait until it could acquire the mutex is unaccounted for.

The patch avoids both by using the __skb_try_recv_datagram and
__skb_wait_for_more packets functions created by the first patch to
implement a unix_dgram_recvmsg read loop which releases the readlock
mutex prior to going to sleep and reacquires it as needed
afterwards. Non-blocking readers will thus immediately return with
-EAGAIN if there's no data available regardless of any concurrent
blocking readers and all blocking readers will end up sleeping via
schedule_timeout, thus honouring the configured socket receive timeout.

Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

(limited to 'net/unix')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 502e572af3fd..1c3c1f3a3ec4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2078,8 +2078,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 	struct scm_cookie scm;
 	struct sock *sk = sock->sk;
 	struct unix_sock *u = unix_sk(sk);
-	int noblock = flags & MSG_DONTWAIT;
-	struct sk_buff *skb;
+	struct sk_buff *skb, *last;
+	long timeo;
 	int err;
 	int peeked, skip;
 
@@ -2087,26 +2087,32 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 	if (flags&MSG_OOB)
 		goto out;
 
-	err = mutex_lock_interruptible(&u->readlock);
-	if (unlikely(err)) {
-		/* recvmsg() in non blocking mode is supposed to return -EAGAIN
-		 * sk_rcvtimeo is not honored by mutex_lock_interruptible()
-		 */
-		err = noblock ? -EAGAIN : -ERESTARTSYS;
-		goto out;
-	}
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 
-	skip = sk_peek_offset(sk, flags);
+	do {
+		mutex_lock(&u->readlock);
 
-	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
-	if (!skb) {
+		skip = sk_peek_offset(sk, flags);
+		skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err,
+					      &last);
+		if (skb)
+			break;
+
+		mutex_unlock(&u->readlock);
+
+		if (err != -EAGAIN)
+			break;
+	} while (timeo &&
+		 !__skb_wait_for_more_packets(sk, &err, &timeo, last));
+
+	if (!skb) { /* implies readlock unlocked */
 		unix_state_lock(sk);
 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
 		    (sk->sk_shutdown & RCV_SHUTDOWN))
 			err = 0;
 		unix_state_unlock(sk);
-		goto out_unlock;
+		goto out;
 	}
 
 	if (wq_has_sleeper(&u->peer_wait))
@@ -2164,7 +2170,6 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 
 out_free:
 	skb_free_datagram(sk, skb);
-out_unlock:
 	mutex_unlock(&u->readlock);
 out:
 	return err;
-- 
cgit 


From 712f4aad406bb1ed67f3f98d04c044191f0ff593 Mon Sep 17 00:00:00 2001
From: willy tarreau <w@1wt.eu>
Date: Sun, 10 Jan 2016 07:54:56 +0100
Subject: unix: properly account for FDs passed over unix sockets

It is possible for a process to allocate and accumulate far more FDs than
the process' limit by sending them over a unix socket then closing them
to keep the process' fd count low.

This change addresses this problem by keeping track of the number of FDs
in flight per user and preventing non-privileged processes from having
more FDs in flight than their configured FD limit.

Reported-by: socketpair@gmail.com
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Mitigates: CVE-2013-4312 (Linux 2.0+)
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sched.h |  1 +
 net/unix/af_unix.c    | 24 ++++++++++++++++++++----
 net/unix/garbage.c    | 13 ++++++++-----
 3 files changed, 29 insertions(+), 9 deletions(-)

(limited to 'net/unix')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index edad7a43edea..fbf25f19b3b5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -830,6 +830,7 @@ struct user_struct {
 	unsigned long mq_bytes;	/* How many bytes can be allocated to mqueue? */
 #endif
 	unsigned long locked_shm; /* How many pages of mlocked shm ? */
+	unsigned long unix_inflight;	/* How many files in flight in unix sockets */
 
 #ifdef CONFIG_KEYS
 	struct key *uid_keyring;	/* UID specific keyring */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index ef05cd9403d4..e3f85bc8b135 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1513,6 +1513,21 @@ static void unix_destruct_scm(struct sk_buff *skb)
 	sock_wfree(skb);
 }
 
+/*
+ * The "user->unix_inflight" variable is protected by the garbage
+ * collection lock, and we just read it locklessly here. If you go
+ * over the limit, there might be a tiny race in actually noticing
+ * it across threads. Tough.
+ */
+static inline bool too_many_unix_fds(struct task_struct *p)
+{
+	struct user_struct *user = current_user();
+
+	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
+		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+	return false;
+}
+
 #define MAX_RECURSION_LEVEL 4
 
 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
@@ -1521,6 +1536,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 	unsigned char max_level = 0;
 	int unix_sock_count = 0;
 
+	if (too_many_unix_fds(current))
+		return -ETOOMANYREFS;
+
 	for (i = scm->fp->count - 1; i >= 0; i--) {
 		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
 
@@ -1542,10 +1560,8 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 	if (!UNIXCB(skb).fp)
 		return -ENOMEM;
 
-	if (unix_sock_count) {
-		for (i = scm->fp->count - 1; i >= 0; i--)
-			unix_inflight(scm->fp->fp[i]);
-	}
+	for (i = scm->fp->count - 1; i >= 0; i--)
+		unix_inflight(scm->fp->fp[i]);
 	return max_level;
 }
 
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index a73a226f2d33..8fcdc2283af5 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -120,11 +120,11 @@ void unix_inflight(struct file *fp)
 {
 	struct sock *s = unix_get_socket(fp);
 
+	spin_lock(&unix_gc_lock);
+
 	if (s) {
 		struct unix_sock *u = unix_sk(s);
 
-		spin_lock(&unix_gc_lock);
-
 		if (atomic_long_inc_return(&u->inflight) == 1) {
 			BUG_ON(!list_empty(&u->link));
 			list_add_tail(&u->link, &gc_inflight_list);
@@ -132,25 +132,28 @@ void unix_inflight(struct file *fp)
 			BUG_ON(list_empty(&u->link));
 		}
 		unix_tot_inflight++;
-		spin_unlock(&unix_gc_lock);
 	}
+	fp->f_cred->user->unix_inflight++;
+	spin_unlock(&unix_gc_lock);
 }
 
 void unix_notinflight(struct file *fp)
 {
 	struct sock *s = unix_get_socket(fp);
 
+	spin_lock(&unix_gc_lock);
+
 	if (s) {
 		struct unix_sock *u = unix_sk(s);
 
-		spin_lock(&unix_gc_lock);
 		BUG_ON(list_empty(&u->link));
 
 		if (atomic_long_dec_and_test(&u->inflight))
 			list_del_init(&u->link);
 		unix_tot_inflight--;
-		spin_unlock(&unix_gc_lock);
 	}
+	fp->f_cred->user->unix_inflight--;
+	spin_unlock(&unix_gc_lock);
 }
 
 static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
-- 
cgit