From 30c9f3a9fae79517bca595826a19c6855fbb6d32 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 10 Apr 2009 08:43:11 -0700
Subject: Remove internal use of 'write_access' in mm/memory.c

The fault handling routines really want more fine-grained flags than a
single "was it a write fault" boolean - the callers will want to set
flags like "you can return a retry error" etc.

And that's actually how the VM works internally, but right now the
top-level fault handling functions in mm/memory.c all pass just the
'write_access' boolean around.

This switches them over to pass around the FAULT_FLAG_xyzzy 'flags'
variable instead.  The 'write_access' calling convention still exists
for the exported 'handle_mm_fault()' function, but that is next.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index d5d1653d60a6..e6a9700359df 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2496,7 +2496,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
  */
 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		int write_access, pte_t orig_pte)
+		unsigned int flags, pte_t orig_pte)
 {
 	spinlock_t *ptl;
 	struct page *page;
@@ -2572,9 +2572,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	inc_mm_counter(mm, anon_rss);
 	pte = mk_pte(page, vma->vm_page_prot);
-	if (write_access && reuse_swap_page(page)) {
+	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
-		write_access = 0;
+		flags &= ~FAULT_FLAG_WRITE;
 	}
 	flush_icache_page(vma, page);
 	set_pte_at(mm, address, page_table, pte);
@@ -2587,7 +2587,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		try_to_free_swap(page);
 	unlock_page(page);
 
-	if (write_access) {
+	if (flags & FAULT_FLAG_WRITE) {
 		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
 		if (ret & VM_FAULT_ERROR)
 			ret &= VM_FAULT_ERROR;
@@ -2616,7 +2616,7 @@ out_page:
  */
 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		int write_access)
+		unsigned int flags)
 {
 	struct page *page;
 	spinlock_t *ptl;
@@ -2776,7 +2776,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * due to the bad i386 page protection. But it's valid
 	 * for other architectures too.
 	 *
-	 * Note that if write_access is true, we either now have
+	 * Note that if FAULT_FLAG_WRITE is set, we either now have
 	 * an exclusive copy of the page, or this is a shared mapping,
 	 * so we can make it writable and dirty to avoid having to
 	 * handle that later.
@@ -2847,11 +2847,10 @@ unwritable_page:
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		int write_access, pte_t orig_pte)
+		unsigned int flags, pte_t orig_pte)
 {
 	pgoff_t pgoff = (((address & PAGE_MASK)
 			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
 
 	pte_unmap(page_table);
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
@@ -2868,12 +2867,12 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		int write_access, pte_t orig_pte)
+		unsigned int flags, pte_t orig_pte)
 {
-	unsigned int flags = FAULT_FLAG_NONLINEAR |
-				(write_access ? FAULT_FLAG_WRITE : 0);
 	pgoff_t pgoff;
 
+	flags |= FAULT_FLAG_NONLINEAR;
+
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		return 0;
 
@@ -2904,7 +2903,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 static inline int handle_pte_fault(struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long address,
-		pte_t *pte, pmd_t *pmd, int write_access)
+		pte_t *pte, pmd_t *pmd, unsigned int flags)
 {
 	pte_t entry;
 	spinlock_t *ptl;
@@ -2915,30 +2914,30 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 			if (vma->vm_ops) {
 				if (likely(vma->vm_ops->fault))
 					return do_linear_fault(mm, vma, address,
-						pte, pmd, write_access, entry);
+						pte, pmd, flags, entry);
 			}
 			return do_anonymous_page(mm, vma, address,
-						 pte, pmd, write_access);
+						 pte, pmd, flags);
 		}
 		if (pte_file(entry))
 			return do_nonlinear_fault(mm, vma, address,
-					pte, pmd, write_access, entry);
+					pte, pmd, flags, entry);
 		return do_swap_page(mm, vma, address,
-					pte, pmd, write_access, entry);
+					pte, pmd, flags, entry);
 	}
 
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
-	if (write_access) {
+	if (flags & FAULT_FLAG_WRITE) {
 		if (!pte_write(entry))
 			return do_wp_page(mm, vma, address,
 					pte, pmd, ptl, entry);
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
-	if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
+	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
 		update_mmu_cache(vma, address, entry);
 	} else {
 		/*
@@ -2947,7 +2946,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 		 * This still avoids useless tlb flushes for .text page faults
 		 * with threads.
 		 */
-		if (write_access)
+		if (flags & FAULT_FLAG_WRITE)
 			flush_tlb_page(vma, address);
 	}
 unlock:
@@ -2965,13 +2964,14 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
+	unsigned int flags = write_access ? FAULT_FLAG_WRITE : 0;
 
 	__set_current_state(TASK_RUNNING);
 
 	count_vm_event(PGFAULT);
 
 	if (unlikely(is_vm_hugetlb_page(vma)))
-		return hugetlb_fault(mm, vma, address, write_access);
+		return hugetlb_fault(mm, vma, address, flags);
 
 	pgd = pgd_offset(mm, address);
 	pud = pud_alloc(mm, pgd, address);
@@ -2984,7 +2984,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte)
 		return VM_FAULT_OOM;
 
-	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
 }
 
 #ifndef __PAGETABLE_PUD_FOLDED
-- 
cgit v1.2.3


From d06063cc221fdefcab86589e79ddfdb7c0e14b63 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 10 Apr 2009 09:01:23 -0700
Subject: Move FAULT_FLAG_xyz into handle_mm_fault() callers

This allows the callers to now pass down the full set of FAULT_FLAG_xyz
flags to handle_mm_fault().  All callers have been (mechanically)
converted to the new calling convention, there's almost certainly room
for architectures to clean up their code and then add FAULT_FLAG_RETRY
when that support is added.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index e6a9700359df..98bcb90d5957 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1310,8 +1310,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			cond_resched();
 			while (!(page = follow_page(vma, start, foll_flags))) {
 				int ret;
-				ret = handle_mm_fault(mm, vma, start,
-						foll_flags & FOLL_WRITE);
+
+				/* FOLL_WRITE matches FAULT_FLAG_WRITE! */
+				ret = handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE);
 				if (ret & VM_FAULT_ERROR) {
 					if (ret & VM_FAULT_OOM)
 						return i ? i : -ENOMEM;
@@ -2958,13 +2959,12 @@ unlock:
  * By the time we get here, we already hold the mm semaphore
  */
 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, int write_access)
+		unsigned long address, unsigned int flags)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
-	unsigned int flags = write_access ? FAULT_FLAG_WRITE : 0;
 
 	__set_current_state(TASK_RUNNING);
 
-- 
cgit v1.2.3


From d26ed650d9947a786bbda8de9cd914dbeebc1a68 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Tue, 23 Jun 2009 13:52:49 +0100
Subject: mm: don't rely on flags coincidence

Indeed FOLL_WRITE matches FAULT_FLAG_WRITE, matches GUP_FLAGS_WRITE,
and it's tempting to devise a set of Grand Unified Paging flags;
but not today.  So until then, let's rely upon the compiler to spot
the coincidence, "rather than have that subtle dependency and a
comment for it" - as you remarked in another context yesterday.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 98bcb90d5957..50da9511aa77 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1311,8 +1311,10 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			while (!(page = follow_page(vma, start, foll_flags))) {
 				int ret;
 
-				/* FOLL_WRITE matches FAULT_FLAG_WRITE! */
-				ret = handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE);
+				ret = handle_mm_fault(mm, vma, start,
+					(foll_flags & FOLL_WRITE) ?
+					FAULT_FLAG_WRITE : 0);
+
 				if (ret & VM_FAULT_ERROR) {
 					if (ret & VM_FAULT_OOM)
 						return i ? i : -ENOMEM;
-- 
cgit v1.2.3


From a5c9b696ec109bb54d547fdb437a7a0c2d514670 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Tue, 23 Jun 2009 12:36:58 -0700
Subject: mm: pass mm to grab_swap_token

If a kthread happens to use get_user_pages() on an mm (as KSM does),
there's a chance that it will end up trying to read in a swap page, then
oops in grab_swap_token() because the kthread has no mm: GUP passes down
the right mm, so grab_swap_token() ought to be using it.

We have not identified a stronger case than KSM's daemon (not yet in
mainline), but the issue must have come up before, since RHEL has included
a fix for this for years (though a different fix, they just back out of
grab_swap_token if current->mm is unset: which is what we first proposed,
but using the right mm here seems more correct).

Reported-by: Izik Eidus <ieidus@redhat.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 50da9511aa77..f46ac18ba231 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2519,7 +2519,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry);
 	if (!page) {
-		grab_swap_token(); /* Contend for token _before_ read-in */
+		grab_swap_token(mm); /* Contend for token _before_ read-in */
 		page = swapin_readahead(entry,
 					GFP_HIGHUSER_MOVABLE, vma, address);
 		if (!page) {
-- 
cgit v1.2.3


From 9d73777e500929b71dcfed16eec05f6760e345a6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 25 Jun 2009 11:58:55 +0200
Subject: clarify get_user_pages() prototype

Currently the 4th parameter of get_user_pages() is called len, but its
in pages, not bytes. Rename the thing to nr_pages to avoid future
confusion.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index f46ac18ba231..65216194eb8d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1207,8 +1207,8 @@ static inline int use_zero_page(struct vm_area_struct *vma)
 
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		     unsigned long start, int len, int flags,
-		struct page **pages, struct vm_area_struct **vmas)
+		     unsigned long start, int nr_pages, int flags,
+		     struct page **pages, struct vm_area_struct **vmas)
 {
 	int i;
 	unsigned int vm_flags = 0;
@@ -1217,7 +1217,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
 	int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
 
-	if (len <= 0)
+	if (nr_pages <= 0)
 		return 0;
 	/* 
 	 * Require read or write permissions.
@@ -1269,7 +1269,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				vmas[i] = gate_vma;
 			i++;
 			start += PAGE_SIZE;
-			len--;
+			nr_pages--;
 			continue;
 		}
 
@@ -1280,7 +1280,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
 		if (is_vm_hugetlb_page(vma)) {
 			i = follow_hugetlb_page(mm, vma, pages, vmas,
-						&start, &len, i, write);
+						&start, &nr_pages, i, write);
 			continue;
 		}
 
@@ -1357,9 +1357,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				vmas[i] = vma;
 			i++;
 			start += PAGE_SIZE;
-			len--;
-		} while (len && start < vma->vm_end);
-	} while (len);
+			nr_pages--;
+		} while (nr_pages && start < vma->vm_end);
+	} while (nr_pages);
 	return i;
 }
 
@@ -1368,7 +1368,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
  * @tsk:	task_struct of target task
  * @mm:		mm_struct of target mm
  * @start:	starting user address
- * @len:	number of pages from start to pin
+ * @nr_pages:	number of pages from start to pin
  * @write:	whether pages will be written to by the caller
  * @force:	whether to force write access even if user mapping is
  *		readonly. This will result in the page being COWed even
@@ -1380,7 +1380,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
  *		Or NULL if the caller does not require them.
  *
  * Returns number of pages pinned. This may be fewer than the number
- * requested. If len is 0 or negative, returns 0. If no pages
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
  * were pinned, returns -errno. Each page returned must be released
  * with a put_page() call when it is finished with. vmas will only
  * remain valid while mmap_sem is held.
@@ -1414,7 +1414,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
  * See also get_user_pages_fast, for performance critical applications.
  */
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		unsigned long start, int len, int write, int force,
+		unsigned long start, int nr_pages, int write, int force,
 		struct page **pages, struct vm_area_struct **vmas)
 {
 	int flags = 0;
@@ -1424,9 +1424,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	if (force)
 		flags |= GUP_FLAGS_FORCE;
 
-	return __get_user_pages(tsk, mm,
-				start, len, flags,
-				pages, vmas);
+	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
 }
 
 EXPORT_SYMBOL(get_user_pages);
-- 
cgit v1.2.3


From 9e1b32caa525cb236e80e9c671e179bcecccc657 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 22 Jul 2009 15:44:28 +1000
Subject: mm: Pass virtual address to [__]p{te,ud,md}_free_tlb()

mm: Pass virtual address to [__]p{te,ud,md}_free_tlb()

Upcoming paches to support the new 64-bit "BookE" powerpc architecture
will need to have the virtual address corresponding to PTE page when
freeing it, due to the way the HW table walker works.

Basically, the TLB can be loaded with "large" pages that cover the whole
virtual space (well, sort-of, half of it actually) represented by a PTE
page, and which contain an "indirect" bit indicating that this TLB entry
RPN points to an array of PTEs from which the TLB can then create direct
entries. Thus, in order to invalidate those when PTE pages are deleted,
we need the virtual address to pass to tlbilx or tlbivax instructions.

The old trick of sticking it somewhere in the PTE page struct page sucks
too much, the address is almost readily available in all call sites and
almost everybody implemets these as macros, so we may as well add the
argument everywhere. I added it to the pmd and pud variants for consistency.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: David Howells <dhowells@redhat.com> [MN10300 & FRV]
Acked-by: Nick Piggin <npiggin@suse.de>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com> [s390]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 65216194eb8d..aede2ce3aba4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -135,11 +135,12 @@ void pmd_clear_bad(pmd_t *pmd)
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
  */
-static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
+static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+			   unsigned long addr)
 {
 	pgtable_t token = pmd_pgtable(*pmd);
 	pmd_clear(pmd);
-	pte_free_tlb(tlb, token);
+	pte_free_tlb(tlb, token, addr);
 	tlb->mm->nr_ptes--;
 }
 
@@ -157,7 +158,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		free_pte_range(tlb, pmd);
+		free_pte_range(tlb, pmd, addr);
 	} while (pmd++, addr = next, addr != end);
 
 	start &= PUD_MASK;
@@ -173,7 +174,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 
 	pmd = pmd_offset(pud, start);
 	pud_clear(pud);
-	pmd_free_tlb(tlb, pmd);
+	pmd_free_tlb(tlb, pmd, start);
 }
 
 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -206,7 +207,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 
 	pud = pud_offset(pgd, start);
 	pgd_clear(pgd);
-	pud_free_tlb(tlb, pud);
+	pud_free_tlb(tlb, pud, start);
 }
 
 /*
-- 
cgit v1.2.3