From 5ff9d8a65ce80efb509ce4e8051394e9ed2cd942 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 29 Mar 2013 21:04:39 -0700
Subject: vfs: Lock in place mounts from more privileged users

When creating a less privileged mount namespace or propogating mounts
from a more privileged to a less privileged mount namespace lock the
submounts so they may not be unmounted individually in the child mount
namespace revealing what is under them.

This enforces the reasonable expectation that it is not possible to
see under a mount point.  Most of the time mounts are on empty
directories and revealing that does not matter, however I have seen an
occassionaly sloppy configuration where there were interesting things
concealed under a mount point that probably should not be revealed.

Expirable submounts are not locked because they will eventually
unmount automatically so whatever is under them already needs
to be safe for unprivileged users to access.

From a practical standpoint these restrictions do not appear to be
significant for unprivileged users of the mount namespace.  Recursive
bind mounts and pivot_root continues to work, and mounts that are
created in a mount namespace may be unmounted there.  All of which
means that the common idiom of keeping a directory of interesting
files and using pivot_root to throw everything else away continues to
work just fine.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 7b1ca9ba0b0a..7e16a730559c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -831,6 +831,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY))
 		mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
 
+	/* Don't allow unprivileged users to reveal what is under a mount */
+	if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
+		mnt->mnt.mnt_flags |= MNT_LOCKED;
+
 	atomic_inc(&sb->s_active);
 	mnt->mnt.mnt_sb = sb;
 	mnt->mnt.mnt_root = dget(root);
@@ -1327,6 +1331,8 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
 		goto dput_and_out;
 	if (!check_mnt(mnt))
 		goto dput_and_out;
+	if (mnt->mnt.mnt_flags & MNT_LOCKED)
+		goto dput_and_out;
 
 	retval = do_umount(mnt, flags);
 dput_and_out:
@@ -1381,6 +1387,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 	if (IS_ERR(q))
 		return q;
 
+	q->mnt.mnt_flags &= ~MNT_LOCKED;
 	q->mnt_mountpoint = mnt->mnt_mountpoint;
 
 	p = mnt;
@@ -1696,6 +1703,19 @@ static int do_change_type(struct path *path, int flag)
 	return err;
 }
 
+static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+{
+	struct mount *child;
+	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+		if (!is_subdir(child->mnt_mountpoint, dentry))
+			continue;
+
+		if (child->mnt.mnt_flags & MNT_LOCKED)
+			return true;
+	}
+	return false;
+}
+
 /*
  * do loopback mount.
  */
@@ -1731,6 +1751,9 @@ static int do_loopback(struct path *path, const char *old_name,
 	if (!check_mnt(parent) || !check_mnt(old))
 		goto out2;
 
+	if (!recurse && has_locked_children(old, old_path.dentry))
+		goto out2;
+
 	if (recurse)
 		mnt = copy_tree(old, old_path.dentry, 0);
 	else
@@ -1741,6 +1764,8 @@ static int do_loopback(struct path *path, const char *old_name,
 		goto out2;
 	}
 
+	mnt->mnt.mnt_flags &= ~MNT_LOCKED;
+
 	err = graft_tree(mnt, parent, mp);
 	if (err) {
 		br_write_lock(&vfsmount_lock);
@@ -1853,6 +1878,9 @@ static int do_move_mount(struct path *path, const char *old_name)
 	if (!check_mnt(p) || !check_mnt(old))
 		goto out1;
 
+	if (old->mnt.mnt_flags & MNT_LOCKED)
+		goto out1;
+
 	err = -EINVAL;
 	if (old_path.dentry != old_path.mnt->mnt_root)
 		goto out1;
@@ -2630,6 +2658,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		goto out4;
 	if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
 		goto out4;
+	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
+		goto out4;
 	error = -ENOENT;
 	if (d_unlinked(new.dentry))
 		goto out4;
@@ -2653,6 +2683,10 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	br_write_lock(&vfsmount_lock);
 	detach_mnt(new_mnt, &parent_path);
 	detach_mnt(root_mnt, &root_parent);
+	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
+		new_mnt->mnt.mnt_flags |= MNT_LOCKED;
+		root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
+	}
 	/* mount old root on put_old */
 	attach_mnt(root_mnt, old_mnt, old_mp);
 	/* mount new_root on / */
-- 
cgit v1.2.3


From 4ce5d2b1a8fde84c0eebe70652cf28b9beda6b4e Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 30 Mar 2013 01:35:18 -0700
Subject: vfs: Don't copy mount bind mounts of /proc/<pid>/ns/mnt between
 namespaces

Don't copy bind mounts of /proc/<pid>/ns/mnt between namespaces.
These files hold references to a mount namespace and copying them
between namespaces could result in a reference counting loop.

The current mnt_ns_loop test prevents loops on the assumption that
mounts don't cross between namespaces.  Unfortunately unsharing a
mount namespace and shared substrees can both cause mounts to
propogate between mount namespaces.

Add two flags CL_COPY_UNBINDABLE and CL_COPY_MNT_NS_FILE are added to
control this behavior, and CL_COPY_ALL is redefined as both of them.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c | 46 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 12 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 7e16a730559c..64627f883bf2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1355,14 +1355,11 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)
 
 #endif
 
-static bool mnt_ns_loop(struct path *path)
+static bool is_mnt_ns_file(struct dentry *dentry)
 {
-	/* Could bind mounting the mount namespace inode cause a
-	 * mount namespace loop?
-	 */
-	struct inode *inode = path->dentry->d_inode;
+	/* Is this a proxy for a mount namespace? */
+	struct inode *inode = dentry->d_inode;
 	struct proc_ns *ei;
-	struct mnt_namespace *mnt_ns;
 
 	if (!proc_ns_inode(inode))
 		return false;
@@ -1371,7 +1368,19 @@ static bool mnt_ns_loop(struct path *path)
 	if (ei->ns_ops != &mntns_operations)
 		return false;
 
-	mnt_ns = ei->ns;
+	return true;
+}
+
+static bool mnt_ns_loop(struct dentry *dentry)
+{
+	/* Could bind mounting the mount namespace inode cause a
+	 * mount namespace loop?
+	 */
+	struct mnt_namespace *mnt_ns;
+	if (!is_mnt_ns_file(dentry))
+		return false;
+
+	mnt_ns = get_proc_ns(dentry->d_inode)->ns;
 	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
 }
 
@@ -1380,7 +1389,10 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 {
 	struct mount *res, *p, *q, *r, *parent;
 
-	if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
+	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
+		return ERR_PTR(-EINVAL);
+
+	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
 		return ERR_PTR(-EINVAL);
 
 	res = q = clone_mnt(mnt, dentry, flag);
@@ -1397,7 +1409,13 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 			continue;
 
 		for (s = r; s; s = next_mnt(s, r)) {
-			if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) {
+			if (!(flag & CL_COPY_UNBINDABLE) &&
+			    IS_MNT_UNBINDABLE(s)) {
+				s = skip_mnt_tree(s);
+				continue;
+			}
+			if (!(flag & CL_COPY_MNT_NS_FILE) &&
+			    is_mnt_ns_file(s->mnt.mnt_root)) {
 				s = skip_mnt_tree(s);
 				continue;
 			}
@@ -1733,7 +1751,7 @@ static int do_loopback(struct path *path, const char *old_name,
 		return err;
 
 	err = -EINVAL;
-	if (mnt_ns_loop(&old_path))
+	if (mnt_ns_loop(old_path.dentry))
 		goto out; 
 
 	mp = lock_mount(path);
@@ -1755,7 +1773,7 @@ static int do_loopback(struct path *path, const char *old_name,
 		goto out2;
 
 	if (recurse)
-		mnt = copy_tree(old, old_path.dentry, 0);
+		mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
 	else
 		mnt = clone_mnt(old, old_path.dentry, 0);
 
@@ -2417,7 +2435,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 
 	namespace_lock();
 	/* First pass: copy the tree topology */
-	copy_flags = CL_COPY_ALL | CL_EXPIRE;
+	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
 	if (user_ns != mnt_ns->user_ns)
 		copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
 	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
@@ -2452,6 +2470,10 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 		}
 		p = next_mnt(p, old);
 		q = next_mnt(q, new);
+		if (!q)
+			break;
+		while (p->mnt.mnt_root != q->mnt.mnt_root)
+			p = next_mnt(p, old);
 	}
 	namespace_unlock();
 
-- 
cgit v1.2.3


From e51db73532955dc5eaba4235e62b74b460709d5b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 30 Mar 2013 19:57:41 -0700
Subject: userns: Better restrictions on when proc and sysfs can be mounted

Rely on the fact that another flavor of the filesystem is already
mounted and do not rely on state in the user namespace.

Verify that the mounted filesystem is not covered in any significant
way.  I would love to verify that the previously mounted filesystem
has no mounts on top but there are at least the directories
/proc/sys/fs/binfmt_misc and /sys/fs/cgroup/ that exist explicitly
for other filesystems to mount on top of.

Refactor the test into a function named fs_fully_visible and call that
function from the mount routines of proc and sysfs.  This makes this
test local to the filesystems involved and the results current of when
the mounts take place, removing a weird threading of the user
namespace, the mount namespace and the filesystems themselves.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 64627f883bf2..877e4277f496 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2867,25 +2867,38 @@ bool current_chrooted(void)
 	return chrooted;
 }
 
-void update_mnt_policy(struct user_namespace *userns)
+bool fs_fully_visible(struct file_system_type *type)
 {
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	struct mount *mnt;
+	bool visible = false;
 
-	down_read(&namespace_sem);
+	if (unlikely(!ns))
+		return false;
+
+	namespace_lock();
 	list_for_each_entry(mnt, &ns->list, mnt_list) {
-		switch (mnt->mnt.mnt_sb->s_magic) {
-		case SYSFS_MAGIC:
-			userns->may_mount_sysfs = true;
-			break;
-		case PROC_SUPER_MAGIC:
-			userns->may_mount_proc = true;
-			break;
+		struct mount *child;
+		if (mnt->mnt.mnt_sb->s_type != type)
+			continue;
+
+		/* This mount is not fully visible if there are any child mounts
+		 * that cover anything except for empty directories.
+		 */
+		list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+			struct inode *inode = child->mnt_mountpoint->d_inode;
+			if (!S_ISDIR(inode->i_mode))
+				goto next;
+			if (inode->i_nlink != 2)
+				goto next;
 		}
-		if (userns->may_mount_sysfs && userns->may_mount_proc)
-			break;
+		visible = true;
+		goto found;
+	next:	;
 	}
-	up_read(&namespace_sem);
+found:
+	namespace_unlock();
+	return visible;
 }
 
 static void *mntns_get(struct task_struct *task)
-- 
cgit v1.2.3


From c7b96acf1456ef127fef461fcfedb54b81fecfbb Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 20 Mar 2013 12:49:49 -0700
Subject: userns:  Kill nsown_capable it makes the wrong thing easy

nsown_capable is a special case of ns_capable essentially for just CAP_SETUID and
CAP_SETGID.  For the existing users it doesn't noticably simplify things and
from the suggested patches I have seen it encourages people to do the wrong
thing.  So remove nsown_capable.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 877e4277f496..dc519a1437ee 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2929,8 +2929,8 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns)
 	struct path root;
 
 	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
-	    !nsown_capable(CAP_SYS_CHROOT) ||
-	    !nsown_capable(CAP_SYS_ADMIN))
+	    !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
+	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
 		return -EPERM;
 
 	if (fs->users != 1)
-- 
cgit v1.2.3


From 8033426e6bdb2690d302872ac1e1fadaec1a5581 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 26 Jul 2013 06:23:25 -0400
Subject: vfs: allow umount to handle mountpoints without revalidating them

Christopher reported a regression where he was unable to unmount a NFS
filesystem where the root had gone stale. The problem is that
d_revalidate handles the root of the filesystem differently from other
dentries, but d_weak_revalidate does not. We could simply fix this by
making d_weak_revalidate return success on IS_ROOT dentries, but there
are cases where we do want to revalidate the root of the fs.

A umount is really a special case. We generally aren't interested in
anything but the dentry and vfsmount that's attached at that point. If
the inode turns out to be stale we just don't care since the intent is
to stop using it anyway.

Try to handle this situation better by treating umount as a special
case in the lookup code. Have it resolve the parent using normal
means, and then do a lookup of the final dentry without revalidating
it. In most cases, the final lookup will come out of the dcache, but
the case where there's a trailing symlink or !LAST_NORM entry on the
end complicates things a bit.

Cc: Neil Brown <neilb@suse.de>
Reported-by: Christopher T Vogan <cvogan@us.ibm.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index a45ba4f267fe..ad8ea9bc2518 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1318,7 +1318,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
 	if (!(flags & UMOUNT_NOFOLLOW))
 		lookup_flags |= LOOKUP_FOLLOW;
 
-	retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
+	retval = user_path_umountat(AT_FDCWD, name, lookup_flags, &path);
 	if (retval)
 		goto out;
 	mnt = real_mount(path.mnt);
-- 
cgit v1.2.3


From eed810076685c77dc9a8c5c3593e641c93caed1c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Thu, 5 Sep 2013 14:39:11 +0200
Subject: vfs: check unlinked ancestors before mount

We check submounts before doing d_drop() on a non-empty directory dentry in
NFS (have_submounts()), but we do not exclude a racing mount.  Nor do we
prevent mounts to be added to the disconnected subtree using relative paths
after the d_drop().

This patch fixes these issues by checking for unlinked (unhashed, non-root)
ancestors before proceeding with the mount.  This is done with rename
seqlock taken for write and with ->d_lock grabbed on each ancestor in turn,
including our dentry itself.  This ensures that the only one of
check_submounts_and_drop() or has_unlinked_ancestor() can succeed.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index ad8ea9bc2518..5997887cc64a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -611,6 +611,7 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
 {
 	struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry);
 	struct mountpoint *mp;
+	int ret;
 
 	list_for_each_entry(mp, chain, m_hash) {
 		if (mp->m_dentry == dentry) {
@@ -626,14 +627,12 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
 	if (!mp)
 		return ERR_PTR(-ENOMEM);
 
-	spin_lock(&dentry->d_lock);
-	if (d_unlinked(dentry)) {
-		spin_unlock(&dentry->d_lock);
+	ret = d_set_mounted(dentry);
+	if (ret) {
 		kfree(mp);
-		return ERR_PTR(-ENOENT);
+		return ERR_PTR(ret);
 	}
-	dentry->d_flags |= DCACHE_MOUNTED;
-	spin_unlock(&dentry->d_lock);
+
 	mp->m_dentry = dentry;
 	mp->m_count = 1;
 	list_add(&mp->m_hash, chain);
-- 
cgit v1.2.3


From 197df04c749a07616621b762e699b1fff4102fac Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 8 Sep 2013 14:03:27 -0400
Subject: rename user_path_umountat() to user_path_mountpoint_at()

... and move the extern from linux/namei.h to fs/internal.h,
along with that of vfs_path_lookup().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index fc2b5226278d..25845d1b300b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1321,7 +1321,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
 	if (!(flags & UMOUNT_NOFOLLOW))
 		lookup_flags |= LOOKUP_FOLLOW;
 
-	retval = user_path_umountat(AT_FDCWD, name, lookup_flags, &path);
+	retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
 	if (retval)
 		goto out;
 	mnt = real_mount(path.mnt);
-- 
cgit v1.2.3


From 57f150a58c40cda598c31af8bceb8598f43c3e5f Mon Sep 17 00:00:00 2001
From: Rob Landley <rob@landley.net>
Date: Wed, 11 Sep 2013 14:26:10 -0700
Subject: initmpfs: move rootfs code from fs/ramfs/ to init/

When the rootfs code was a wrapper around ramfs, having them in the same
file made sense.  Now that it can wrap another filesystem type, move it in
with the init code instead.

This also allows a subsequent patch to access rootfstype= command line
arg.

Signed-off-by: Rob Landley <rob@landley.net>
Cc: Jeff Layton <jlayton@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Stephen Warren <swarren@nvidia.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jim Cromie <jim.cromie@gmail.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/namespace.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 25845d1b300b..da5c49483430 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -17,7 +17,7 @@
 #include <linux/security.h>
 #include <linux/idr.h>
 #include <linux/acct.h>		/* acct_auto_close_mnt */
-#include <linux/ramfs.h>	/* init_rootfs */
+#include <linux/init.h>		/* init_rootfs */
 #include <linux/fs_struct.h>	/* get_fs_root et.al. */
 #include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
 #include <linux/uaccess.h>
-- 
cgit v1.2.3