From 1ce0bf50ae2233c7115a18c0c623662d177b434c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 26 Nov 2015 13:55:39 +0800 Subject: net: Generalise wq_has_sleeper helper The memory barrier in the helper wq_has_sleeper is needed by just about every user of waitqueue_active. This patch generalises it by making it take a wait_queue_head_t directly. The existing helper is renamed to skwq_has_sleeper. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 955ec152cb71..efb706e1d1c0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -339,7 +339,7 @@ static void unix_write_space(struct sock *sk) rcu_read_lock(); if (unix_writable(sk)) { wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); -- cgit v1.2.3 From 77b75f4d8cf105b599beef38724f8171e557919d Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Thu, 26 Nov 2015 19:23:15 +0000 Subject: unix: use wq_has_sleeper in unix_dgram_recvmsg The current unix_dgram_recvmsg does a wake up for every received datagram. This seems wasteful as only SOCK_DGRAM client sockets in an n:1 association with a server socket will ever wait because of the associated condition. The patch below changes the function such that the wake up only happens if wq_has_sleeper indicates that someone actually wants to be notified. Testing with SOCK_SEQPACKET and SOCK_DGRAM socket seems to confirm that this is an improvment. Signed-Off-By: Rainer Weikusat Signed-off-by: David S. Miller --- net/unix/af_unix.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index efb706e1d1c0..ac011b97097d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1914,8 +1914,10 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, goto out_unlock; } - wake_up_interruptible_sync_poll(&u->peer_wait, - POLLOUT | POLLWRNORM | POLLWRBAND); + if (wq_has_sleeper(&u->peer_wait)) + wake_up_interruptible_sync_poll(&u->peer_wait, + POLLOUT | POLLWRNORM | + POLLWRBAND); if (msg->msg_name) unix_copy_addr(msg, skb->sk); -- cgit v1.2.3 From 64874280889e7c0b2c9266705363627d4c92cf01 Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Sun, 6 Dec 2015 21:11:38 +0000 Subject: af_unix: fix unix_dgram_recvmsg entry locking The current unix_dgram_recvsmg code acquires the u->readlock mutex in order to protect access to the peek offset prior to calling __skb_recv_datagram for actually receiving data. This implies that a blocking reader will go to sleep with this mutex held if there's presently no data to return to userspace. Two non-desirable side effects of this are that a later non-blocking read call on the same socket will block on the ->readlock mutex until the earlier blocking call releases it (or the readers is interrupted) and that later blocking read calls will wait longer than the effective socket read timeout says they should: The timeout will only start 'ticking' once such a reader hits the schedule_timeout in wait_for_more_packets (core.c) while the time it already had to wait until it could acquire the mutex is unaccounted for. The patch avoids both by using the __skb_try_recv_datagram and __skb_wait_for_more packets functions created by the first patch to implement a unix_dgram_recvmsg read loop which releases the readlock mutex prior to going to sleep and reacquires it as needed afterwards. Non-blocking readers will thus immediately return with -EAGAIN if there's no data available regardless of any concurrent blocking readers and all blocking readers will end up sleeping via schedule_timeout, thus honouring the configured socket receive timeout. Signed-off-by: Rainer Weikusat Signed-off-by: David S. Miller --- net/unix/af_unix.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 502e572af3fd..1c3c1f3a3ec4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2078,8 +2078,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, struct scm_cookie scm; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); - int noblock = flags & MSG_DONTWAIT; - struct sk_buff *skb; + struct sk_buff *skb, *last; + long timeo; int err; int peeked, skip; @@ -2087,26 +2087,32 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, if (flags&MSG_OOB) goto out; - err = mutex_lock_interruptible(&u->readlock); - if (unlikely(err)) { - /* recvmsg() in non blocking mode is supposed to return -EAGAIN - * sk_rcvtimeo is not honored by mutex_lock_interruptible() - */ - err = noblock ? -EAGAIN : -ERESTARTSYS; - goto out; - } + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - skip = sk_peek_offset(sk, flags); + do { + mutex_lock(&u->readlock); - skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err); - if (!skb) { + skip = sk_peek_offset(sk, flags); + skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err, + &last); + if (skb) + break; + + mutex_unlock(&u->readlock); + + if (err != -EAGAIN) + break; + } while (timeo && + !__skb_wait_for_more_packets(sk, &err, &timeo, last)); + + if (!skb) { /* implies readlock unlocked */ unix_state_lock(sk); /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && (sk->sk_shutdown & RCV_SHUTDOWN)) err = 0; unix_state_unlock(sk); - goto out_unlock; + goto out; } if (wq_has_sleeper(&u->peer_wait)) @@ -2164,7 +2170,6 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, out_free: skb_free_datagram(sk, skb); -out_unlock: mutex_unlock(&u->readlock); out: return err; -- cgit v1.2.3 From 712f4aad406bb1ed67f3f98d04c044191f0ff593 Mon Sep 17 00:00:00 2001 From: willy tarreau Date: Sun, 10 Jan 2016 07:54:56 +0100 Subject: unix: properly account for FDs passed over unix sockets It is possible for a process to allocate and accumulate far more FDs than the process' limit by sending them over a unix socket then closing them to keep the process' fd count low. This change addresses this problem by keeping track of the number of FDs in flight per user and preventing non-privileged processes from having more FDs in flight than their configured FD limit. Reported-by: socketpair@gmail.com Reported-by: Tetsuo Handa Mitigates: CVE-2013-4312 (Linux 2.0+) Suggested-by: Linus Torvalds Acked-by: Hannes Frederic Sowa Signed-off-by: Willy Tarreau Signed-off-by: David S. Miller --- net/unix/af_unix.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ef05cd9403d4..e3f85bc8b135 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1513,6 +1513,21 @@ static void unix_destruct_scm(struct sk_buff *skb) sock_wfree(skb); } +/* + * The "user->unix_inflight" variable is protected by the garbage + * collection lock, and we just read it locklessly here. If you go + * over the limit, there might be a tiny race in actually noticing + * it across threads. Tough. + */ +static inline bool too_many_unix_fds(struct task_struct *p) +{ + struct user_struct *user = current_user(); + + if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE))) + return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); + return false; +} + #define MAX_RECURSION_LEVEL 4 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) @@ -1521,6 +1536,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) unsigned char max_level = 0; int unix_sock_count = 0; + if (too_many_unix_fds(current)) + return -ETOOMANYREFS; + for (i = scm->fp->count - 1; i >= 0; i--) { struct sock *sk = unix_get_socket(scm->fp->fp[i]); @@ -1542,10 +1560,8 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) if (!UNIXCB(skb).fp) return -ENOMEM; - if (unix_sock_count) { - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->fp[i]); - } + for (i = scm->fp->count - 1; i >= 0; i--) + unix_inflight(scm->fp->fp[i]); return max_level; } -- cgit v1.2.3 From fa0dc04df259ba2df3ce1920e9690c7842f8fa4b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 24 Jan 2016 13:53:50 -0800 Subject: af_unix: fix struct pid memory leak Dmitry reported a struct pid leak detected by a syzkaller program. Bug happens in unix_stream_recvmsg() when we break the loop when a signal is pending, without properly releasing scm. Fixes: b3ca9b02b007 ("net: fix multithreaded signal handling in unix recv routines") Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Cc: Rainer Weikusat Signed-off-by: David S. Miller --- net/unix/af_unix.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c5bf5ef2bf89..49d5093eb055 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2339,6 +2339,7 @@ again: if (signal_pending(current)) { err = sock_intr_errno(timeo); + scm_destroy(&scm); goto out; } -- cgit v1.2.3 From 415e3d3e90ce9e18727e8843ae343eda5a58fad6 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 3 Feb 2016 02:11:03 +0100 Subject: unix: correctly track in-flight fds in sending process user_struct The commit referenced in the Fixes tag incorrectly accounted the number of in-flight fds over a unix domain socket to the original opener of the file-descriptor. This allows another process to arbitrary deplete the original file-openers resource limit for the maximum of open files. Instead the sending processes and its struct cred should be credited. To do so, we add a reference counted struct user_struct pointer to the scm_fp_list and use it to account for the number of inflight unix fds. Fixes: 712f4aad406bb1 ("unix: properly account for FDs passed over unix sockets") Reported-by: David Herrmann Cc: David Herrmann Cc: Willy Tarreau Cc: Linus Torvalds Suggested-by: Linus Torvalds Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/unix/af_unix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 49d5093eb055..29be035f9c65 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1496,7 +1496,7 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) UNIXCB(skb).fp = NULL; for (i = scm->fp->count-1; i >= 0; i--) - unix_notinflight(scm->fp->fp[i]); + unix_notinflight(scm->fp->user, scm->fp->fp[i]); } static void unix_destruct_scm(struct sk_buff *skb) @@ -1561,7 +1561,7 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) return -ENOMEM; for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->fp[i]); + unix_inflight(scm->fp->user, scm->fp->fp[i]); return max_level; } -- cgit v1.2.3 From 1b92ee3d03af6643df395300ba7748f19ecdb0c5 Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Mon, 8 Feb 2016 18:47:19 +0000 Subject: af_unix: Don't set err in unix_stream_read_generic unless there was an error The present unix_stream_read_generic contains various code sequences of the form err = -EDISASTER; if () goto out; This has the unfortunate side effect of possibly causing the error code to bleed through to the final out: return copied ? : err; and then to be wrongly returned if no data was copied because the caller didn't supply a data buffer, as demonstrated by the program available at http://pad.lv/1540731 Change it such that err is only set if an error condition was detected. Fixes: 3822b5c2fc62 ("af_unix: Revert 'lock_interruptible' in stream receive code") Reported-by: Joseph Salisbury Signed-off-by: Rainer Weikusat Signed-off-by: David S. Miller --- net/unix/af_unix.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 29be035f9c65..df923caa8389 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2277,13 +2277,15 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) size_t size = state->size; unsigned int last_len; - err = -EINVAL; - if (sk->sk_state != TCP_ESTABLISHED) + if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { + err = -EINVAL; goto out; + } - err = -EOPNOTSUPP; - if (flags & MSG_OOB) + if (unlikely(flags & MSG_OOB)) { + err = -EOPNOTSUPP; goto out; + } target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, noblock); @@ -2329,9 +2331,11 @@ again: goto unlock; unix_state_unlock(sk); - err = -EAGAIN; - if (!timeo) + if (!timeo) { + err = -EAGAIN; break; + } + mutex_unlock(&u->readlock); timeo = unix_stream_data_wait(sk, timeo, last, -- cgit v1.2.3 From a5527dda344fff0514b7989ef7a755729769daa1 Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Thu, 11 Feb 2016 19:37:27 +0000 Subject: af_unix: Guard against other == sk in unix_dgram_sendmsg The unix_dgram_sendmsg routine use the following test if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { to determine if sk and other are in an n:1 association (either established via connect or by using sendto to send messages to an unrelated socket identified by address). This isn't correct as the specified address could have been bound to the sending socket itself or because this socket could have been connected to itself by the time of the unix_peer_get but disconnected before the unix_state_lock(other). In both cases, the if-block would be entered despite other == sk which might either block the sender unintentionally or lead to trying to unlock the same spin lock twice for a non-blocking send. Add a other != sk check to guard against this. Fixes: 7d267278a9ec ("unix: avoid use-after-free in ep_remove_wait_queue") Reported-By: Philipp Hahn Signed-off-by: Rainer Weikusat Tested-by: Philipp Hahn Signed-off-by: David S. Miller --- net/unix/af_unix.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index df923caa8389..c51e2831f498 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1781,7 +1781,12 @@ restart_locked: goto out_unlock; } - if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + /* other == sk && unix_peer(other) != sk if + * - unix_peer(sk) == NULL, destination address bound to sk + * - unix_peer(sk) == sk by time of get but disconnected before lock + */ + if (other != sk && + unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); -- cgit v1.2.3 From 18eceb818dc37bbc783ec7ef7703f270cc6cd281 Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Thu, 18 Feb 2016 12:39:46 +0000 Subject: af_unix: Don't use continue to re-execute unix_stream_read_generic loop The unix_stream_read_generic function tries to use a continue statement to restart the receive loop after waiting for a message. This may not work as intended as the caller might use a recvmsg call to peek at control messages without specifying a message buffer. If this was the case, the continue will cause the function to return without an error and without the credential information if the function had to wait for a message while it had returned with the credentials otherwise. Change to using goto to restart the loop without checking the condition first in this case so that credentials are returned either way. Signed-off-by: Rainer Weikusat Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/unix/af_unix.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c51e2831f498..f75f847e688d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2312,6 +2312,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) bool drop_skb; struct sk_buff *skb, *last; +redo: unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; @@ -2353,7 +2354,7 @@ again: } mutex_lock(&u->readlock); - continue; + goto redo; unlock: unix_state_unlock(sk); break; -- cgit v1.2.3