From f55bb7f9cb82dec2f2e803d7bd0fc5929248e4d8 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Tue, 21 Feb 2012 07:31:51 +0000
Subject: unix: Support peeking offset for datagram and seqpacket sockets

The sk_peek_off manipulations are protected with the unix_sk->readlock mutex.
This mutex is enough since all we need is to syncronize setting the offset
vs reading the queue head. The latter is fully covered with the mentioned lock.

The recently added __skb_recv_datagram's offset is used to pick the skb to
read the data from.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 85d3bb7490aa..3d9481de031f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -530,6 +530,16 @@ static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
 static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
 				  struct msghdr *, size_t, int);
 
+static void unix_set_peek_off(struct sock *sk, int val)
+{
+	struct unix_sock *u = unix_sk(sk);
+
+	mutex_lock(&u->readlock);
+	sk->sk_peek_off = val;
+	mutex_unlock(&u->readlock);
+}
+
+
 static const struct proto_ops unix_stream_ops = {
 	.family =	PF_UNIX,
 	.owner =	THIS_MODULE,
@@ -570,6 +580,7 @@ static const struct proto_ops unix_dgram_ops = {
 	.recvmsg =	unix_dgram_recvmsg,
 	.mmap =		sock_no_mmap,
 	.sendpage =	sock_no_sendpage,
+	.set_peek_off =	unix_set_peek_off,
 };
 
 static const struct proto_ops unix_seqpacket_ops = {
@@ -591,6 +602,7 @@ static const struct proto_ops unix_seqpacket_ops = {
 	.recvmsg =	unix_seqpacket_recvmsg,
 	.mmap =		sock_no_mmap,
 	.sendpage =	sock_no_sendpage,
+	.set_peek_off =	unix_set_peek_off,
 };
 
 static struct proto unix_proto = {
@@ -1756,6 +1768,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 	int noblock = flags & MSG_DONTWAIT;
 	struct sk_buff *skb;
 	int err;
+	int peeked, skip;
 
 	err = -EOPNOTSUPP;
 	if (flags&MSG_OOB)
@@ -1769,7 +1782,9 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 		goto out;
 	}
 
-	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	skip = sk_peek_offset(sk, flags);
+
+	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
 	if (!skb) {
 		unix_state_lock(sk);
 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
@@ -1786,12 +1801,12 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 	if (msg->msg_name)
 		unix_copy_addr(msg, skb->sk);
 
-	if (size > skb->len)
-		size = skb->len;
-	else if (size < skb->len)
+	if (size > skb->len - skip)
+		size = skb->len - skip;
+	else if (size < skb->len - skip)
 		msg->msg_flags |= MSG_TRUNC;
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
+	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
 	if (err)
 		goto out_free;
 
@@ -1808,6 +1823,8 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 	if (!(flags & MSG_PEEK)) {
 		if (UNIXCB(skb).fp)
 			unix_detach_fds(siocb->scm, skb);
+
+		sk_peek_offset_bwd(sk, skb->len);
 	} else {
 		/* It is questionable: on PEEK we could:
 		   - do not return fds - good, but too simple 8)
@@ -1821,6 +1838,9 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 		   clearly however!
 
 		*/
+
+		sk_peek_offset_fwd(sk, size);
+
 		if (UNIXCB(skb).fp)
 			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
 	}
-- 
cgit v1.2.3


From fc0d753641f7b919c7273d9bd21ae6ab45e757f3 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Tue, 21 Feb 2012 07:32:06 +0000
Subject: unix: Support peeking offset for stream sockets

The same here -- we can protect the sk_peek_off manipulations with
the unix_sk->readlock mutex.

The peeking of data from a stream socket is done in the datagram style,
i.e. even if there's enough room for more data in the user buffer, only
the head skb's data is copied in there. This feature is preserved when
peeking data from a given offset -- the data is read till the nearest
skb's boundary.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3d9481de031f..0be4d24f6ae8 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -559,6 +559,7 @@ static const struct proto_ops unix_stream_ops = {
 	.recvmsg =	unix_stream_recvmsg,
 	.mmap =		sock_no_mmap,
 	.sendpage =	sock_no_sendpage,
+	.set_peek_off =	unix_set_peek_off,
 };
 
 static const struct proto_ops unix_dgram_ops = {
@@ -1904,6 +1905,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 	int target;
 	int err = 0;
 	long timeo;
+	int skip;
 
 	err = -EINVAL;
 	if (sk->sk_state != TCP_ESTABLISHED)
@@ -1933,12 +1935,15 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		goto out;
 	}
 
+	skip = sk_peek_offset(sk, flags);
+
 	do {
 		int chunk;
 		struct sk_buff *skb;
 
 		unix_state_lock(sk);
 		skb = skb_peek(&sk->sk_receive_queue);
+again:
 		if (skb == NULL) {
 			unix_sk(sk)->recursion_level = 0;
 			if (copied >= target)
@@ -1973,6 +1978,13 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 			unix_state_unlock(sk);
 			break;
 		}
+
+		if (skip >= skb->len) {
+			skip -= skb->len;
+			skb = skb_peek_next(skb, &sk->sk_receive_queue);
+			goto again;
+		}
+
 		unix_state_unlock(sk);
 
 		if (check_creds) {
@@ -1992,8 +2004,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 			sunaddr = NULL;
 		}
 
-		chunk = min_t(unsigned int, skb->len, size);
-		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
+		chunk = min_t(unsigned int, skb->len - skip, size);
+		if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) {
 			if (copied == 0)
 				copied = -EFAULT;
 			break;
@@ -2005,6 +2017,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		if (!(flags & MSG_PEEK)) {
 			skb_pull(skb, chunk);
 
+			sk_peek_offset_bwd(sk, chunk);
+
 			if (UNIXCB(skb).fp)
 				unix_detach_fds(siocb->scm, skb);
 
@@ -2022,6 +2036,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 			if (UNIXCB(skb).fp)
 				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
 
+			sk_peek_offset_fwd(sk, chunk);
+
 			break;
 		}
 	} while (size);
-- 
cgit v1.2.3


From 9f6f9af7694ede6314bed281eec74d588ba9474f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 21 Feb 2012 23:24:55 +0000
Subject: af_unix: MSG_TRUNC support for dgram sockets

Piergiorgio Beruto expressed the need to fetch size of first datagram in
queue for AF_UNIX sockets and suggested a patch against SIOCINQ ioctl.

I suggested instead to implement MSG_TRUNC support as a recv() input
flag, as already done for RAW, UDP & NETLINK sockets.

len = recv(fd, &byte, 1, MSG_PEEK | MSG_TRUNC);

MSG_TRUNC asks recv() to return the real length of the packet, even when
is was longer than the passed buffer.

There is risk that a userland application used MSG_TRUNC by accident
(since it had no effect on af_unix sockets) and this might break after
this patch.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Tested-by: Piergiorgio Beruto <piergiorgio.beruto@gmail.com>
CC: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 0be4d24f6ae8..8ee85aa79fa7 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1845,7 +1845,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 		if (UNIXCB(skb).fp)
 			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
 	}
-	err = size;
+	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
 
 	scm_recv(sock, msg, siocb->scm, flags);
 
-- 
cgit v1.2.3


From 40ffe67d2e89c7a475421d007becc11a2f88ea3d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 14 Mar 2012 21:54:32 -0400
Subject: switch unix_sock to struct path

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/unix/af_unix.c | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 85d3bb7490aa..ef4b780ef63d 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -293,7 +293,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
 	spin_lock(&unix_table_lock);
 	sk_for_each(s, node,
 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
-		struct dentry *dentry = unix_sk(s)->dentry;
+		struct dentry *dentry = unix_sk(s)->path.dentry;
 
 		if (dentry && dentry->d_inode == i) {
 			sock_hold(s);
@@ -377,8 +377,7 @@ static void unix_sock_destructor(struct sock *sk)
 static int unix_release_sock(struct sock *sk, int embrion)
 {
 	struct unix_sock *u = unix_sk(sk);
-	struct dentry *dentry;
-	struct vfsmount *mnt;
+	struct path path;
 	struct sock *skpair;
 	struct sk_buff *skb;
 	int state;
@@ -389,10 +388,9 @@ static int unix_release_sock(struct sock *sk, int embrion)
 	unix_state_lock(sk);
 	sock_orphan(sk);
 	sk->sk_shutdown = SHUTDOWN_MASK;
-	dentry	     = u->dentry;
-	u->dentry    = NULL;
-	mnt	     = u->mnt;
-	u->mnt	     = NULL;
+	path	     = u->path;
+	u->path.dentry = NULL;
+	u->path.mnt = NULL;
 	state = sk->sk_state;
 	sk->sk_state = TCP_CLOSE;
 	unix_state_unlock(sk);
@@ -425,10 +423,8 @@ static int unix_release_sock(struct sock *sk, int embrion)
 		kfree_skb(skb);
 	}
 
-	if (dentry) {
-		dput(dentry);
-		mntput(mnt);
-	}
+	if (path.dentry)
+		path_put(&path);
 
 	sock_put(sk);
 
@@ -628,8 +624,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
 	sk->sk_destruct		= unix_sock_destructor;
 	u	  = unix_sk(sk);
-	u->dentry = NULL;
-	u->mnt	  = NULL;
+	u->path.dentry = NULL;
+	u->path.mnt = NULL;
 	spin_lock_init(&u->lock);
 	atomic_long_set(&u->inflight, 0);
 	INIT_LIST_HEAD(&u->link);
@@ -789,9 +785,9 @@ static struct sock *unix_find_other(struct net *net,
 		u = unix_find_socket_byname(net, sunname, len, type, hash);
 		if (u) {
 			struct dentry *dentry;
-			dentry = unix_sk(u)->dentry;
+			dentry = unix_sk(u)->path.dentry;
 			if (dentry)
-				touch_atime(unix_sk(u)->mnt, dentry);
+				touch_atime(unix_sk(u)->path.mnt, dentry);
 		} else
 			goto fail;
 	}
@@ -897,8 +893,7 @@ out_mknod_drop_write:
 		list = &unix_socket_table[addr->hash];
 	} else {
 		list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
-		u->dentry = path.dentry;
-		u->mnt    = path.mnt;
+		u->path = path;
 	}
 
 	err = 0;
@@ -1180,9 +1175,9 @@ restart:
 		atomic_inc(&otheru->addr->refcnt);
 		newu->addr = otheru->addr;
 	}
-	if (otheru->dentry) {
-		newu->dentry	= dget(otheru->dentry);
-		newu->mnt	= mntget(otheru->mnt);
+	if (otheru->path.dentry) {
+		path_get(&otheru->path);
+		newu->path = otheru->path;
 	}
 
 	/* Set credentials */
-- 
cgit v1.2.3


From 68ac1234fb949b66941d94dce4157742799fc581 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 15 Mar 2012 08:21:57 -0400
Subject: switch touch_atime to struct path

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/unix/af_unix.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index ef4b780ef63d..081679444a6e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -771,7 +771,7 @@ static struct sock *unix_find_other(struct net *net,
 			goto put_fail;
 
 		if (u->sk_type == type)
-			touch_atime(path.mnt, path.dentry);
+			touch_atime(&path);
 
 		path_put(&path);
 
@@ -787,7 +787,7 @@ static struct sock *unix_find_other(struct net *net,
 			struct dentry *dentry;
 			dentry = unix_sk(u)->path.dentry;
 			if (dentry)
-				touch_atime(unix_sk(u)->path.mnt, dentry);
+				touch_atime(&unix_sk(u)->path);
 		} else
 			goto fail;
 	}
-- 
cgit v1.2.3


From 626cf236608505d376e4799adb4f7eb00a8594af Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hans.verkuil@cisco.com>
Date: Fri, 23 Mar 2012 15:02:27 -0700
Subject: poll: add poll_requested_events() and poll_does_not_wait() functions

In some cases the poll() implementation in a driver has to do different
things depending on the events the caller wants to poll for.  An example
is when a driver needs to start a DMA engine if the caller polls for
POLLIN, but doesn't want to do that if POLLIN is not requested but instead
only POLLOUT or POLLPRI is requested.  This is something that can happen
in the video4linux subsystem among others.

Unfortunately, the current epoll/poll/select implementation doesn't
provide that information reliably.  The poll_table_struct does have it: it
has a key field with the event mask.  But once a poll() call matches one
or more bits of that mask any following poll() calls are passed a NULL
poll_table pointer.

Also, the eventpoll implementation always left the key field at ~0 instead
of using the requested events mask.

This was changed in eventpoll.c so the key field now contains the actual
events that should be polled for as set by the caller.

The solution to the NULL poll_table pointer is to set the qproc field to
NULL in poll_table once poll() matches the events, not the poll_table
pointer itself.  That way drivers can obtain the mask through a new
poll_requested_events inline.

The poll_table_struct can still be NULL since some kernel code calls it
internally (netfs_state_poll() in ./drivers/staging/pohmelfs/netfs.h).  In
that case poll_requested_events() returns ~0 (i.e.  all events).

Very rarely drivers might want to know whether poll_wait will actually
wait.  If another earlier file descriptor in the set already matched the
events the caller wanted to wait for, then the kernel will return from the
select() call without waiting.  This might be useful information in order
to avoid doing expensive work.

A new helper function poll_does_not_wait() is added that drivers can use
to detect this situation.  This is now used in sock_poll_wait() in
include/net/sock.h.  This was the only place in the kernel that needed
this information.

Drivers should no longer access any of the poll_table internals, but use
the poll_requested_events() and poll_does_not_wait() access functions
instead.  In order to enforce that the poll_table fields are now prepended
with an underscore and a comment was added warning against using them
directly.

This required a change in unix_dgram_poll() in unix/af_unix.c which used
the key field to get the requested events.  It's been replaced by a call
to poll_requested_events().

For qproc it was especially important to change its name since the
behavior of that field changes with this patch since this function pointer
can now be NULL when that wasn't possible in the past.

Any driver accessing the qproc or key fields directly will now fail to compile.

Some notes regarding the correctness of this patch: the driver's poll()
function is called with a 'struct poll_table_struct *wait' argument.  This
pointer may or may not be NULL, drivers can never rely on it being one or
the other as that depends on whether or not an earlier file descriptor in
the select()'s fdset matched the requested events.

There are only three things a driver can do with the wait argument:

1) obtain the key field:

	events = wait ? wait->key : ~0;

   This will still work although it should be replaced with the new
   poll_requested_events() function (which does exactly the same).
   This will now even work better, since wait is no longer set to NULL
   unnecessarily.

2) use the qproc callback. This could be deadly since qproc can now be
   NULL. Renaming qproc should prevent this from happening. There are no
   kernel drivers that actually access this callback directly, BTW.

3) test whether wait == NULL to determine whether poll would return without
   waiting. This is no longer sufficient as the correct test is now
   wait == NULL || wait->_qproc == NULL.

   However, the worst that can happen here is a slight performance hit in
   the case where wait != NULL and wait->_qproc == NULL. In that case the
   driver will assume that poll_wait() will actually add the fd to the set
   of waiting file descriptors. Of course, poll_wait() will not do that
   since it tests for wait->_qproc. This will not break anything, though.

   There is only one place in the whole kernel where this happens
   (sock_poll_wait() in include/net/sock.h) and that code will be replaced
   by a call to poll_does_not_wait() in the next patch.

   Note that even if wait->_qproc != NULL drivers cannot rely on poll_wait()
   actually waiting. The next file descriptor from the set might match the
   event mask and thus any possible waits will never happen.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Reviewed-by: Jonathan Corbet <corbet@lwn.net>
Reviewed-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/unix/af_unix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index eb4277c33188..d510353ef431 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2206,7 +2206,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 	}
 
 	/* No write status requested, avoid expensive OUT tests. */
-	if (wait && !(wait->key & (POLLWRBAND | POLLWRNORM | POLLOUT)))
+	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
 		return mask;
 
 	writable = unix_writable(sk);
-- 
cgit v1.2.3