From ef0855d334e1e4af7c3e0c42146a8479ea14a5ab Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 11 Sep 2013 14:20:14 -0700
Subject: mm: mempolicy: turn vma_set_policy() into vma_dup_policy()

Simple cleanup.  Every user of vma_set_policy() does the same work, this
looks a bit annoying imho.  And the new trivial helper which does
mpol_dup() + vma_set_policy() to simplify the callers.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

(limited to 'mm/mmap.c')

diff --git a/mm/mmap.c b/mm/mmap.c
index f9c97d10b873..14f6bb4830f7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2380,7 +2380,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	      unsigned long addr, int new_below)
 {
-	struct mempolicy *pol;
 	struct vm_area_struct *new;
 	int err = -ENOMEM;
 
@@ -2404,12 +2403,9 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
 	}
 
-	pol = mpol_dup(vma_policy(vma));
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
+	err = vma_dup_policy(vma, new);
+	if (err)
 		goto out_free_vma;
-	}
-	vma_set_policy(new, pol);
 
 	if (anon_vma_clone(new, vma))
 		goto out_free_mpol;
@@ -2437,7 +2433,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 		fput(new->vm_file);
 	unlink_anon_vmas(new);
  out_free_mpol:
-	mpol_put(pol);
+	mpol_put(vma_policy(new));
  out_free_vma:
 	kmem_cache_free(vm_area_cachep, new);
  out_err:
@@ -2780,7 +2776,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma, *prev;
 	struct rb_node **rb_link, *rb_parent;
-	struct mempolicy *pol;
 	bool faulted_in_anon_vma = true;
 
 	/*
@@ -2825,10 +2820,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			new_vma->vm_start = addr;
 			new_vma->vm_end = addr + len;
 			new_vma->vm_pgoff = pgoff;
-			pol = mpol_dup(vma_policy(vma));
-			if (IS_ERR(pol))
+			if (vma_dup_policy(vma, new_vma))
 				goto out_free_vma;
-			vma_set_policy(new_vma, pol);
 			INIT_LIST_HEAD(&new_vma->anon_vma_chain);
 			if (anon_vma_clone(new_vma, vma))
 				goto out_free_mempol;
@@ -2843,7 +2836,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	return new_vma;
 
  out_free_mempol:
-	mpol_put(pol);
+	mpol_put(vma_policy(new_vma));
  out_free_vma:
 	kmem_cache_free(vm_area_cachep, new_vma);
 	return NULL;
-- 
cgit v1.2.3


From b2c56e4f7d93be3f33a82ec66f0d0f46713ff5f1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 11 Sep 2013 14:20:18 -0700
Subject: mm: shift VM_GROWS* check from mmap_region() to do_mmap_pgoff()

mmap() doesn't allow the non-anonymous mappings with VM_GROWS* bit set.
In particular this means that mmap_region()->vma_merge(file, vm_flags)
must always fail if "vm_flags & VM_GROWS" is set incorrectly.

So it does not make sense to check VM_GROWS* after we already allocated
the new vma, the only caller, do_mmap_pgoff(), which can pass this flag
can do the check itself.

And this looks a bit more correct, mmap_region() already unmapped the
old mapping at this stage. But if mmap() is going to fail, it should
avoid do_munmap() if possible.

Note: we check VM_GROWS at the end to ensure that do_mmap_pgoff() won't
return EINVAL in the case when it currently returns another error code.

Many thanks to Hugh who nacked the buggy v1.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'mm/mmap.c')

diff --git a/mm/mmap.c b/mm/mmap.c
index 14f6bb4830f7..6cff7ba24a34 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1302,6 +1302,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 
 			if (!file->f_op || !file->f_op->mmap)
 				return -ENODEV;
+			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+				return -EINVAL;
 			break;
 
 		default:
@@ -1310,6 +1312,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	} else {
 		switch (flags & MAP_TYPE) {
 		case MAP_SHARED:
+			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+				return -EINVAL;
 			/*
 			 * Ignore pgoff.
 			 */
@@ -1544,11 +1548,7 @@ munmap_back:
 	vma->vm_pgoff = pgoff;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 
-	error = -EINVAL;	/* when rejecting VM_GROWSDOWN|VM_GROWSUP */
-
 	if (file) {
-		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
-			goto free_vma;
 		if (vm_flags & VM_DENYWRITE) {
 			error = deny_write_access(file);
 			if (error)
@@ -1573,8 +1573,6 @@ munmap_back:
 		pgoff = vma->vm_pgoff;
 		vm_flags = vma->vm_flags;
 	} else if (vm_flags & VM_SHARED) {
-		if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
-			goto free_vma;
 		error = shmem_zero_setup(vma);
 		if (error)
 			goto free_vma;
-- 
cgit v1.2.3


From 077bf22b5cf233863826afbfa4af9b18650a832d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 11 Sep 2013 14:20:19 -0700
Subject: mm: do_mmap_pgoff: cleanup the usage of file_inode()

Simple cleanup.  Move "struct inode *inode" variable into "if (file)"
block to simplify the code and avoid the unnecessary check.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Colin Cross <ccross@android.com>
Cc: David Rientjes <rientjes@google.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'mm/mmap.c')

diff --git a/mm/mmap.c b/mm/mmap.c
index 6cff7ba24a34..1e7a3ea23f1a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1202,7 +1202,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 			unsigned long *populate)
 {
 	struct mm_struct * mm = current->mm;
-	struct inode *inode;
 	vm_flags_t vm_flags;
 
 	*populate = 0;
@@ -1265,9 +1264,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 			return -EAGAIN;
 	}
 
-	inode = file ? file_inode(file) : NULL;
-
 	if (file) {
+		struct inode *inode = file_inode(file);
+
 		switch (flags & MAP_TYPE) {
 		case MAP_SHARED:
 			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
-- 
cgit v1.2.3


From e86867720e617774b560dfbc169b7f3d0d490950 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 11 Sep 2013 14:20:20 -0700
Subject: mm: mmap_region: kill correct_wcount/inode, use allow_write_access()

correct_wcount and inode in mmap_region() just complicate the code.  This
boolean was needed previously, when deny_write_access() was called before
vma_merge(), now we can simply check VM_DENYWRITE and do
allow_write_access() if it is set.

allow_write_access() checks file != NULL, so this is safe even if it was
possible to use VM_DENYWRITE && !file.  Just we need to ensure we use the
same file which was deny_write_access()'ed, so the patch also moves "file
= vma->vm_file" down after allow_write_access().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Colin Cross <ccross@android.com>
Cc: David Rientjes <rientjes@google.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'mm/mmap.c')

diff --git a/mm/mmap.c b/mm/mmap.c
index 1e7a3ea23f1a..13926a5a6901 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1479,11 +1479,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
-	int correct_wcount = 0;
 	int error;
 	struct rb_node **rb_link, *rb_parent;
 	unsigned long charged = 0;
-	struct inode *inode =  file ? file_inode(file) : NULL;
 
 	/* Check against address space limit. */
 	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
@@ -1552,7 +1550,6 @@ munmap_back:
 			error = deny_write_access(file);
 			if (error)
 				goto free_vma;
-			correct_wcount = 1;
 		}
 		vma->vm_file = get_file(file);
 		error = file->f_op->mmap(file, vma);
@@ -1593,11 +1590,10 @@ munmap_back:
 	}
 
 	vma_link(mm, vma, prev, rb_link, rb_parent);
-	file = vma->vm_file;
-
 	/* Once vma denies write, undo our temporary denial count */
-	if (correct_wcount)
-		atomic_inc(&inode->i_writecount);
+	if (vm_flags & VM_DENYWRITE)
+		allow_write_access(file);
+	file = vma->vm_file;
 out:
 	perf_event_mmap(vma);
 
@@ -1616,8 +1612,8 @@ out:
 	return addr;
 
 unmap_and_free_vma:
-	if (correct_wcount)
-		atomic_inc(&inode->i_writecount);
+	if (vm_flags & VM_DENYWRITE)
+		allow_write_access(file);
 	vma->vm_file = NULL;
 	fput(file);
 
-- 
cgit v1.2.3


From d9104d1ca9662498339c0de975b4666c30485f4e Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Wed, 11 Sep 2013 14:22:24 -0700
Subject: mm: track vma changes with VM_SOFTDIRTY bit

Pavel reported that in case if vma area get unmapped and then mapped (or
expanded) in-place, the soft dirty tracker won't be able to recognize this
situation since it works on pte level and ptes are get zapped on unmap,
loosing soft dirty bit of course.

So to resolve this situation we need to track actions on vma level, there
VM_SOFTDIRTY flag comes in.  When new vma area created (or old expanded)
we set this bit, and keep it here until application calls for clearing
soft dirty bit.

Thus when user space application track memory changes now it can detect if
vma area is renewed.

Reported-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Rob Landley <rob@landley.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'mm/mmap.c')

diff --git a/mm/mmap.c b/mm/mmap.c
index 13926a5a6901..51958d192a48 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1609,6 +1609,15 @@ out:
 	if (file)
 		uprobe_mmap(vma);
 
+	/*
+	 * New (or expanded) vma always get soft dirty status.
+	 * Otherwise user-space soft-dirty page tracker won't
+	 * be able to distinguish situation when vma area unmapped,
+	 * then new mapped in-place (which must be aimed as
+	 * a completely new data area).
+	 */
+	vma->vm_flags |= VM_SOFTDIRTY;
+
 	return addr;
 
 unmap_and_free_vma:
@@ -2652,6 +2661,7 @@ out:
 	mm->total_vm += len >> PAGE_SHIFT;
 	if (flags & VM_LOCKED)
 		mm->locked_vm += (len >> PAGE_SHIFT);
+	vma->vm_flags |= VM_SOFTDIRTY;
 	return addr;
 }
 
@@ -2916,7 +2926,7 @@ int install_special_mapping(struct mm_struct *mm,
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 
-	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
+	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 
 	vma->vm_ops = &special_mapping_vmops;
-- 
cgit v1.2.3


From 2d8a17813ec817fa58addd2c92b4ca8cae5bafbb Mon Sep 17 00:00:00 2001
From: Yanchuan Nian <ycnian@gmail.com>
Date: Wed, 11 Sep 2013 14:23:05 -0700
Subject: mm/mmap: remove unnecessary assignment

pgoff is not used after the statement "pgoff = vma->vm_pgoff;", so the
assignment is redundant.

Signed-off-by: Yanchuan Nian <ycnian@gmail.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'mm/mmap.c')

diff --git a/mm/mmap.c b/mm/mmap.c
index 51958d192a48..9d548512ff8a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1566,7 +1566,6 @@ munmap_back:
 		WARN_ON_ONCE(addr != vma->vm_start);
 
 		addr = vma->vm_start;
-		pgoff = vma->vm_pgoff;
 		vm_flags = vma->vm_flags;
 	} else if (vm_flags & VM_SHARED) {
 		error = shmem_zero_setup(vma);
-- 
cgit v1.2.3