From 23aaed6659df9adfabe9c583e67a36b54e21df46 Mon Sep 17 00:00:00 2001
From: Shiraz Hashim
Date: Thu, 5 Feb 2015 12:25:06 -0800
Subject: mm: pagewalk: call pte_hole() for VM_PFNMAP during walk_page_range

walk_page_range() silently skips any vma that has VM_PFNMAP set, which
leads to undesirable behaviour for the caller of walk_page_range().
Userspace applications end up with wrong data: at best the wrong data
merely confuses users (if the application only displays it), and at
worst it gets processes killed (if the application acts on virtual
addresses it has misinterpreted because of the wrong data).

For example, in pagemap_read(), when no callbacks are invoked for a
VM_PFNMAP vma, pagemap_read() may prepare pagemap data for the next
virtual address range at the wrong index. Userspace then eventually
reads wrong pagemap data for the task: for a VM_PFNMAP-marked vma
region, the kernel may report mappings that actually belong to
subsequent vma regions, so userspace may account more pages to the
task than really exist.

In my case I was using procmem and procrank (Android utilities), which
use the pagemap interface to account the RSS pages of a task. Due to
this bug they gave a wrong picture for vmas with VM_PFNMAP set.

Fixes: a9ff785e4437 ("mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas")
Signed-off-by: Shiraz Hashim
Acked-by: Naoya Horiguchi
Cc: [3.10+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/pagewalk.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'mm/pagewalk.c')

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index ad83195521f2..b264bda46e1b 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -199,7 +199,10 @@ int walk_page_range(unsigned long addr, unsigned long end,
 			 */
 			if ((vma->vm_start <= addr) &&
 			    (vma->vm_flags & VM_PFNMAP)) {
-				next = vma->vm_end;
+				if (walk->pte_hole)
+					err = walk->pte_hole(addr, next, walk);
+				if (err)
+					break;
 				pgd = pgd_offset(walk->mm, next);
 				continue;
 			}
--
cgit v1.2.3
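The fix matters because some walk_page_range() users keep an output cursor
that must advance in lockstep with the walked address range. Below is a
minimal sketch of such a pagemap_read()-style consumer (illustration only;
pm_state, pm_pte, pm_hole and buf are hypothetical names, not kernel code).
Every page in the walked range must reach either ->pte_entry() or
->pte_hole(); with this patch a VM_PFNMAP vma is reported through
->pte_hole() instead of being skipped, so the cursor stays in sync:

/* one output record per page; a silently skipped vma would desync 'out' */
struct pm_state {
	u64 *out;			/* next record slot */
};

static int pm_pte(pte_t *pte, unsigned long addr, unsigned long next,
		  struct mm_walk *walk)
{
	struct pm_state *pm = walk->private;

	*pm->out++ = pte_present(*pte);	/* record for this page */
	return 0;
}

static int pm_hole(unsigned long addr, unsigned long next,
		   struct mm_walk *walk)
{
	struct pm_state *pm = walk->private;
	unsigned long n;

	/* emit an "unmapped" record for every page in the hole */
	for (n = (next - addr) >> PAGE_SHIFT; n; n--)
		*pm->out++ = 0;
	return 0;
}

	struct pm_state pm = { .out = buf };
	struct mm_walk pm_walk = {
		.pte_entry = pm_pte,
		.pte_hole  = pm_hole,
		.mm        = mm,
		.private   = &pm,
	};

	down_read(&mm->mmap_sem);
	err = walk_page_range(start, end, &pm_walk);
	up_read(&mm->mmap_sem);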
From 0b1fbfe50006c41014cc25660c0e735d21c34939 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi
Date: Wed, 11 Feb 2015 15:27:34 -0800
Subject: mm/pagewalk: remove pgd_entry() and pud_entry()

Currently no user of the page table walker sets ->pgd_entry() or
->pud_entry(), so checking for them in every loop iteration just
wastes CPU cycles. Remove them to reduce the overhead.

Signed-off-by: Naoya Horiguchi
Acked-by: Kirill A. Shutemov
Cc: Andrea Arcangeli
Cc: Cyrill Gorcunov
Cc: Dave Hansen
Cc: Kirill A. Shutemov
Cc: Pavel Emelyanov
Cc: Benjamin Herrenschmidt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/pagewalk.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'mm/pagewalk.c')

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index b264bda46e1b..b793ef149da2 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -86,9 +86,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 				break;
 			continue;
 		}
-		if (walk->pud_entry)
-			err = walk->pud_entry(pud, addr, next, walk);
-		if (!err && (walk->pmd_entry || walk->pte_entry))
+		if (walk->pmd_entry || walk->pte_entry)
 			err = walk_pmd_range(pud, addr, next, walk);
 		if (err)
 			break;
@@ -237,10 +235,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
 			pgd++;
 			continue;
 		}
-		if (walk->pgd_entry)
-			err = walk->pgd_entry(pgd, addr, next, walk);
-		if (!err &&
-		    (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
+		if (walk->pmd_entry || walk->pte_entry)
 			err = walk_pud_range(pgd, addr, next, walk);
 		if (err)
 			break;
--
cgit v1.2.3
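With ->pgd_entry() and ->pud_entry() gone, the walk depth is decided purely
by which low-level callbacks are present: the pgd and pud levels are
traversed internally, and the walker descends only when ->pmd_entry or
->pte_entry is set. A hedged sketch of a pmd-level-only user (thp_count_pmd
and nr_thp are hypothetical names; the unlocked pmd_trans_huge() test is
racy, which is acceptable only for rough statistics):

static int thp_count_pmd(pmd_t *pmd, unsigned long addr, unsigned long next,
			 struct mm_walk *walk)
{
	unsigned long *nr_thp = walk->private;

	if (pmd_trans_huge(*pmd))	/* racy without the page table lock */
		(*nr_thp)++;
	return 0;			/* keep walking */
}

	unsigned long nr_thp = 0;
	struct mm_walk w = {
		.pmd_entry = thp_count_pmd,	/* no ->pte_entry: ptes never visited */
		.mm        = mm,
		.private   = &nr_thp,
	};

	down_read(&mm->mmap_sem);
	err = walk_page_range(start, end, &w);
	up_read(&mm->mmap_sem);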
From fafaa4264eba49fd10695c193a82760558d093f4 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi
Date: Wed, 11 Feb 2015 15:27:37 -0800
Subject: pagewalk: improve vma handling

The current implementation of the page table walker has a fundamental
problem in vma handling, which started when we tried to handle
vma(VM_HUGETLB): because vma handling is done inside the pgd loop,
accounting for vma boundaries makes the code complicated and bug-prone.
Moreover, from the user's point of view, a caller often wants to check
some vma-related condition to decide whether it really should walk over
a given vma at all.

To solve both problems, this patch moves the vma check outside the pgd
loop and introduces a new callback, ->test_walk().

Signed-off-by: Naoya Horiguchi
Acked-by: Kirill A. Shutemov
Cc: "Kirill A. Shutemov"
Cc: Andrea Arcangeli
Cc: Cyrill Gorcunov
Cc: Dave Hansen
Cc: Pavel Emelyanov
Cc: Benjamin Herrenschmidt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/pagewalk.c | 206 +++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 117 insertions(+), 89 deletions(-)

(limited to 'mm/pagewalk.c')

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index b793ef149da2..d9cc3caae802 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -59,7 +59,7 @@ again:
 			continue;

 		split_huge_page_pmd_mm(walk->mm, addr, pmd);
-		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+		if (pmd_trans_unstable(pmd))
 			goto again;
 		err = walk_pte_range(pmd, addr, next, walk);
 		if (err)
@@ -95,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 	return err;
 }

+static int walk_pgd_range(unsigned long addr, unsigned long end,
+			  struct mm_walk *walk)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	int err = 0;
+
+	pgd = pgd_offset(walk->mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd)) {
+			if (walk->pte_hole)
+				err = walk->pte_hole(addr, next, walk);
+			if (err)
+				break;
+			continue;
+		}
+		if (walk->pmd_entry || walk->pte_entry)
+			err = walk_pud_range(pgd, addr, next, walk);
+		if (err)
+			break;
+	} while (pgd++, addr = next, addr != end);
+
+	return err;
+}
+
 #ifdef CONFIG_HUGETLB_PAGE
 static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
 				       unsigned long end)
@@ -103,10 +129,10 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
 	return boundary < end ? boundary : end;
 }

-static int walk_hugetlb_range(struct vm_area_struct *vma,
-			      unsigned long addr, unsigned long end,
+static int walk_hugetlb_range(unsigned long addr, unsigned long end,
 			      struct mm_walk *walk)
 {
+	struct vm_area_struct *vma = walk->vma;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long next;
 	unsigned long hmask = huge_page_mask(h);
@@ -119,15 +145,14 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
 		if (pte && walk->hugetlb_entry)
 			err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
 		if (err)
-			return err;
+			break;
 	} while (addr = next, addr != end);

-	return 0;
+	return err;
 }

 #else /* CONFIG_HUGETLB_PAGE */
-static int walk_hugetlb_range(struct vm_area_struct *vma,
-			      unsigned long addr, unsigned long end,
+static int walk_hugetlb_range(unsigned long addr, unsigned long end,
 			      struct mm_walk *walk)
 {
 	return 0;
@@ -135,112 +160,115 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,

 #endif /* CONFIG_HUGETLB_PAGE */

+/*
+ * Decide whether we really walk over the current vma on [@start, @end)
+ * or skip it via the returned value. Return 0 if we do walk over the
+ * current vma, and return 1 if we skip the vma. Negative values means
+ * error, where we abort the current walk.
+ *
+ * Default check (only VM_PFNMAP check for now) is used when the caller
+ * doesn't define test_walk() callback.
+ */
+static int walk_page_test(unsigned long start, unsigned long end,
+			  struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	if (walk->test_walk)
+		return walk->test_walk(start, end, walk);
+
+	/*
+	 * Do not walk over vma(VM_PFNMAP), because we have no valid struct
+	 * page backing a VM_PFNMAP range. See also commit a9ff785e4437.
+	 */
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+	return 0;
+}
+
+static int __walk_page_range(unsigned long start, unsigned long end,
+			     struct mm_walk *walk)
+{
+	int err = 0;
+	struct vm_area_struct *vma = walk->vma;
+
+	if (vma && is_vm_hugetlb_page(vma)) {
+		if (walk->hugetlb_entry)
+			err = walk_hugetlb_range(start, end, walk);
+	} else
+		err = walk_pgd_range(start, end, walk);
+
+	return err;
+}

 /**
- * walk_page_range - walk a memory map's page tables with a callback
- * @addr: starting address
- * @end: ending address
- * @walk: set of callbacks to invoke for each level of the tree
- *
- * Recursively walk the page table for the memory area in a VMA,
- * calling supplied callbacks. Callbacks are called in-order (first
- * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
- * etc.). If lower-level callbacks are omitted, walking depth is reduced.
+ * walk_page_range - walk page table with caller specific callbacks
  *
- * Each callback receives an entry pointer and the start and end of the
- * associated range, and a copy of the original mm_walk for access to
- * the ->private or ->mm fields.
+ * Recursively walk the page table tree of the process represented by @walk->mm
+ * within the virtual address range [@start, @end). During walking, we can do
+ * some caller-specific works for each entry, by setting up pmd_entry(),
+ * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
+ * callbacks, the associated entries/pages are just ignored.
+ * The return values of these callbacks are commonly defined like below:
+ *  - 0  : succeeded to handle the current entry, and if you don't reach the
+ *         end address yet, continue to walk.
+ *  - >0 : succeeded to handle the current entry, and return to the caller
+ *         with caller specific value.
+ *  - <0 : failed to handle the current entry, and return to the caller
+ *         with error code.
  *
- * Usually no locks are taken, but splitting transparent huge page may
- * take page table lock. And the bottom level iterator will map PTE
- * directories from highmem if necessary.
+ * Before starting to walk page table, some callers want to check whether
+ * they really want to walk over the current vma, typically by checking
+ * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
+ * purpose.
  *
- * If any callback returns a non-zero value, the walk is aborted and
- * the return value is propagated back to the caller. Otherwise 0 is returned.
+ * struct mm_walk keeps current values of some common data like vma and pmd,
+ * which are useful for the access from callbacks. If you want to pass some
+ * caller-specific data to callbacks, @walk->private should be helpful.
  *
- * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry
- * is !NULL.
+ * Locking:
+ *   Callers of walk_page_range() and walk_page_vma() should hold
+ *   @walk->mm->mmap_sem, because these function traverse vma list and/or
+ *   access to vma's data.
  */
-int walk_page_range(unsigned long addr, unsigned long end,
+int walk_page_range(unsigned long start, unsigned long end,
 		    struct mm_walk *walk)
 {
-	pgd_t *pgd;
-	unsigned long next;
 	int err = 0;
+	unsigned long next;
+	struct vm_area_struct *vma;

-	if (addr >= end)
-		return err;
+	if (start >= end)
+		return -EINVAL;

 	if (!walk->mm)
 		return -EINVAL;

 	VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);

-	pgd = pgd_offset(walk->mm, addr);
+	vma = find_vma(walk->mm, start);
 	do {
-		struct vm_area_struct *vma = NULL;
+		if (!vma) { /* after the last vma */
+			walk->vma = NULL;
+			next = end;
+		} else if (start < vma->vm_start) { /* outside vma */
+			walk->vma = NULL;
+			next = min(end, vma->vm_start);
+		} else { /* inside vma */
+			walk->vma = vma;
+			next = min(end, vma->vm_end);
+			vma = vma->vm_next;

-		next = pgd_addr_end(addr, end);
-
-		/*
-		 * This function was not intended to be vma based.
-		 * But there are vma special cases to be handled:
-		 * - hugetlb vma's
-		 * - VM_PFNMAP vma's
-		 */
-		vma = find_vma(walk->mm, addr);
-		if (vma) {
-			/*
-			 * There are no page structures backing a VM_PFNMAP
-			 * range, so do not allow split_huge_page_pmd().
-			 */
-			if ((vma->vm_start <= addr) &&
-			    (vma->vm_flags & VM_PFNMAP)) {
-				if (walk->pte_hole)
-					err = walk->pte_hole(addr, next, walk);
-				if (err)
-					break;
-				pgd = pgd_offset(walk->mm, next);
+			err = walk_page_test(start, next, walk);
+			if (err > 0)
 				continue;
-			}
-			/*
-			 * Handle hugetlb vma individually because pagetable
-			 * walk for the hugetlb page is dependent on the
-			 * architecture and we can't handled it in the same
-			 * manner as non-huge pages.
-			 */
-			if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
-			    is_vm_hugetlb_page(vma)) {
-				if (vma->vm_end < next)
-					next = vma->vm_end;
-				/*
-				 * Hugepage is very tightly coupled with vma,
-				 * so walk through hugetlb entries within a
-				 * given vma.
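The new ->test_walk() hook gives callers one place for per-vma policy that
previously had to be open-coded inside the pgd loop or repeated in every
entry callback. A minimal sketch (anon_only_test_walk is a hypothetical
name, not from the tree) that restricts a walk to anonymous vmas, using the
return convention documented above:

/* 0: walk this vma, 1: skip it, <0: abort the whole walk */
static int anon_only_test_walk(unsigned long start, unsigned long end,
			       struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;	/* set by the core walker */

	if (vma->vm_file)
		return 1;	/* file-backed: skip */
	return 0;		/* anonymous: walk it */
}

Note that walk->vma is already valid when ->test_walk() runs, so the
callback can inspect whatever vma state it needs.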
-				 */
-				err = walk_hugetlb_range(vma, addr, next, walk);
-				if (err)
-					break;
-				pgd = pgd_offset(walk->mm, next);
-				continue;
-			}
-		}
-
-		if (pgd_none_or_clear_bad(pgd)) {
-			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, walk);
-			if (err)
+			if (err < 0)
 				break;
-			pgd++;
-			continue;
 		}
-		if (walk->pmd_entry || walk->pte_entry)
-			err = walk_pud_range(pgd, addr, next, walk);
+		if (walk->vma || walk->pte_hole)
+			err = __walk_page_range(start, next, walk);
 		if (err)
 			break;
-		pgd++;
-	} while (addr = next, addr < end);
-
+	} while (start = next, start < end);
 	return err;
 }
--
cgit v1.2.3

From 900fc5f197b05253ae9433fb9a066c3f37d08f69 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi
Date: Wed, 11 Feb 2015 15:27:40 -0800
Subject: pagewalk: add walk_page_vma()

Introduce walk_page_vma(), which is useful for callers that want to
walk over a single, given vma. It will be used by later patches.

Signed-off-by: Naoya Horiguchi
Acked-by: Kirill A. Shutemov
Cc: "Kirill A. Shutemov"
Cc: Andrea Arcangeli
Cc: Cyrill Gorcunov
Cc: Dave Hansen
Cc: Pavel Emelyanov
Cc: Benjamin Herrenschmidt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/pagewalk.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'mm/pagewalk.c')

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index d9cc3caae802..4c9a653ba563 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -272,3 +272,21 @@ int walk_page_range(unsigned long start, unsigned long end,
 	} while (start = next, start < end);
 	return err;
 }
+
+int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
+{
+	int err;
+
+	if (!walk->mm)
+		return -EINVAL;
+
+	VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+	VM_BUG_ON(!vma);
+	walk->vma = vma;
+	err = walk_page_test(vma->vm_start, vma->vm_end, walk);
+	if (err > 0)
+		return 0;
+	if (err < 0)
+		return err;
+	return __walk_page_range(vma->vm_start, vma->vm_end, walk);
+}
--
cgit v1.2.3
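Usage is straightforward; a hedged sketch (my_pmd_entry is a placeholder
callback) of walking exactly one vma while holding mmap_sem:

	struct mm_walk w = {
		.pmd_entry = my_pmd_entry,	/* placeholder */
		.mm        = vma->vm_mm,
	};
	int err;

	down_read(&vma->vm_mm->mmap_sem);
	err = walk_page_vma(vma, &w);	/* covers [vma->vm_start, vma->vm_end) */
	up_read(&vma->vm_mm->mmap_sem);

As with walk_page_range(), a positive return from ->test_walk() makes
walk_page_vma() return 0, so a skipped vma is not treated as an error.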
From 48684a65b4e3ff544d62532c1b78962c9677b632 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi
Date: Wed, 11 Feb 2015 15:28:06 -0800
Subject: mm: pagewalk: fix misbehavior of walk_page_range for vma(VM_PFNMAP)

walk_page_range() silently skips any vma that has VM_PFNMAP set, which
leads to undesirable behaviour for the caller of walk_page_range(). For
example, for pagemap_read(), when no callbacks are called for a
VM_PFNMAP vma, pagemap_read() may prepare pagemap data for the next
virtual address range at the wrong index. That can confuse and/or break
userspace applications.

This patch avoids the misbehavior caused by vma(VM_PFNMAP) as follows:

- for pagemap_read(), which has its own ->pte_hole(), call the
  ->pte_hole() over vma(VM_PFNMAP),
- for clear_refs and queue_pages, which have their own ->test_walk(),
  just return 1 and skip vma(VM_PFNMAP). This is no problem because
  these callers are not interested in hole regions,
- for all other callers, just skip vma(VM_PFNMAP) as the default
  behavior.

Signed-off-by: Naoya Horiguchi
Signed-off-by: Shiraz Hashim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/pagewalk.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'mm/pagewalk.c')

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 4c9a653ba563..75c1f2878519 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -35,7 +35,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 	do {
 again:
 		next = pmd_addr_end(addr, end);
-		if (pmd_none(*pmd)) {
+		if (pmd_none(*pmd) || !walk->vma) {
 			if (walk->pte_hole)
 				err = walk->pte_hole(addr, next, walk);
 			if (err)
@@ -165,9 +165,6 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
  * or skip it via the returned value. Return 0 if we do walk over the
  * current vma, and return 1 if we skip the vma. Negative values means
  * error, where we abort the current walk.
- *
- * Default check (only VM_PFNMAP check for now) is used when the caller
- * doesn't define test_walk() callback.
  */
 static int walk_page_test(unsigned long start, unsigned long end,
 			  struct mm_walk *walk)
@@ -178,11 +175,19 @@ static int walk_page_test(unsigned long start, unsigned long end,
 		return walk->test_walk(start, end, walk);

 	/*
-	 * Do not walk over vma(VM_PFNMAP), because we have no valid struct
-	 * page backing a VM_PFNMAP range. See also commit a9ff785e4437.
+	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
+	 * range, so we don't walk over it as we do for normal vmas. However,
+	 * some callers are interested in handling hole range and they don't
+	 * want to just ignore any single address range. Such users certainly
+	 * define their ->pte_hole() callbacks, so let's delegate them to handle
+	 * vma(VM_PFNMAP).
 	 */
-	if (vma->vm_flags & VM_PFNMAP)
-		return 1;
+	if (vma->vm_flags & VM_PFNMAP) {
+		int err = 1;
+		if (walk->pte_hole)
+			err = walk->pte_hole(start, end, walk);
+		return err ? err : 1;
+	}
 	return 0;
 }
--
cgit v1.2.3
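The upshot is that a caller now selects its VM_PFNMAP behaviour through the
callbacks it defines. A sketch of the clear_refs/queue_pages pattern
described above (skip_pfnmap_test_walk is a hypothetical name), which keeps
the skip explicit:

static int skip_pfnmap_test_walk(unsigned long start, unsigned long end,
				 struct mm_walk *walk)
{
	/* not interested in holes: have the core skip vma(VM_PFNMAP) */
	if (walk->vma->vm_flags & VM_PFNMAP)
		return 1;
	return 0;
}

Callers that instead define only ->pte_hole(), such as pagemap, now receive
the whole VM_PFNMAP range through that callback, and a negative return from
it still aborts the walk.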