From bf3f3bc5e734706730c12a323f9b2068052aa1f0 Mon Sep 17 00:00:00 2001
From: Nick Piggin
Date: Tue, 6 Jan 2009 14:38:55 -0800
Subject: mm: don't mark_page_accessed in fault path

Doing a mark_page_accessed at fault-time, then doing SetPageReferenced
at unmap-time if the pte is young, has a number of problems.

mark_page_accessed is supposed to be roughly the equivalent of a young
pte for unmapped references. Unfortunately it doesn't come with any
context: after being called, reclaim doesn't know who or why the page
was touched. So calling mark_page_accessed not only adds extra lru or
PG_referenced manipulations for pages that are already going to have
pte_young ptes anyway, but it also adds references which are difficult
to work with from the context of vma specific references (eg.
MADV_SEQUENTIAL pte_young may not wish to contribute to the page being
referenced).

Then, simply doing SetPageReferenced when zapping a pte and finding it
is young, is not a really good solution either. SetPageReferenced does
not correctly promote the page to the active list, for example. So
after removing mark_page_accessed from the fault path, several
mmap()+touch+munmap() cycles would have a very different result from
several read(2) calls, which is not desirable.

Signed-off-by: Nick Piggin
Acked-by: Johannes Weiner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 7b9db658aca2..5e0e91cc6b67 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -768,7 +768,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				if (pte_dirty(ptent))
 					set_page_dirty(page);
 				if (pte_young(ptent))
-					SetPageReferenced(page);
+					mark_page_accessed(page);
 				file_rss--;
 			}
 			page_remove_rmap(page, vma);
--
cgit v1.2.3

From 4917e5d0499b5ae7b26b56fccaefddf9aec9369c Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Tue, 6 Jan 2009 14:39:17 -0800
Subject: mm: more likely reclaim MADV_SEQUENTIAL mappings

File pages mapped only in sequentially read mappings are perfect
reclaim candidates.

This patch makes these mappings behave like weak references: their
pages will be reclaimed unless they have a strong reference from a
normal mapping as well.

It changes the reclaim and the unmap path where they check if the page
has been referenced. In both cases, accesses through sequentially read
mappings will be ignored.

Benchmark results from KOSAKI Motohiro:

    http://marc.info/?l=linux-mm&m=122485301925098&w=2

Signed-off-by: Johannes Weiner
Signed-off-by: Rik van Riel
Acked-by: KOSAKI Motohiro
Cc: Nick Piggin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 5e0e91cc6b67..99e8d5c7b312 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -767,7 +767,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			else {
 				if (pte_dirty(ptent))
 					set_page_dirty(page);
-				if (pte_young(ptent))
+				if (pte_young(ptent) &&
+				    likely(!VM_SequentialReadHint(vma)))
 					mark_page_accessed(page);
 				file_rss--;
 			}
--
cgit v1.2.3
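
[Editor's note: for readers unfamiliar with the userspace side of the two
patches above, here is a minimal sketch of how a mapping acquires the
VM_SequentialReadHint() behaviour. Error handling is omitted and the file
path is illustrative.]

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <sys/stat.h>
	#include <sys/types.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/var/log/big.log", O_RDONLY); /* illustrative path */
		struct stat st;
		char *map;
		long sum = 0;
		off_t i;

		fstat(fd, &st);
		map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);

		/*
		 * Declare the access pattern.  With 4917e5d0 applied, young
		 * ptes in this vma no longer count as references at unmap
		 * time, so a streaming reader stops dragging file pages
		 * onto the active list.
		 */
		madvise(map, st.st_size, MADV_SEQUENTIAL);

		for (i = 0; i < st.st_size; i++)
			sum += map[i];
		printf("checksum %ld\n", sum);

		munmap(map, st.st_size);
		close(fd);
		return 0;
	}
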
From 38e0edb15bd07c6a0caf0cfe39f8f90bd98601b2 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Tue, 6 Jan 2009 14:39:21 -0800
Subject: mm/apply_to_range: call pte function with lazy updates

Make the pte-level function in apply_to_range be called in lazy mmu
mode, so that any pagetable modifications can be batched.

Signed-off-by: Jeremy Fitzhardinge
Cc: Johannes Weiner
Cc: Nick Piggin
Cc: Venkatesh Pallipadi
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 99e8d5c7b312..b5af358b8b22 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1645,6 +1645,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 
 	BUG_ON(pmd_huge(*pmd));
 
+	arch_enter_lazy_mmu_mode();
+
 	token = pmd_pgtable(*pmd);
 
 	do {
@@ -1653,6 +1655,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
+	arch_leave_lazy_mmu_mode();
+
 	if (mm != &init_mm)
 		pte_unmap_unlock(pte-1, ptl);
 	return err;
--
cgit v1.2.3

From b5934c531849ff4a51ce0f290141efe564290e40 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 6 Jan 2009 14:39:25 -0800
Subject: mm: add_active_or_unevictable into rmap

lru_cache_add_active_or_unevictable() and page_add_new_anon_rmap()
always appear together. Save some symbol table space and some jumping
around by removing lru_cache_add_active_or_unevictable(), folding its
code into page_add_new_anon_rmap(): like how we add file pages to lru
just after adding them to page cache.

Remove the nearby "TODO: is this safe?" comments (yes, it is safe), and
change page_add_new_anon_rmap()'s address BUG_ON to VM_BUG_ON as
originally intended.

Signed-off-by: Hugh Dickins
Acked-by: Rik van Riel
Cc: Lee Schermerhorn
Cc: Nick Piggin
Cc: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index b5af358b8b22..a138c50dc39a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1949,10 +1949,7 @@ gotten:
 	 */
 	ptep_clear_flush_notify(vma, address, page_table);
 	SetPageSwapBacked(new_page);
-	lru_cache_add_active_or_unevictable(new_page, vma);
 	page_add_new_anon_rmap(new_page, vma, address);
-
-//TODO:  is this safe?  do_anonymous_page() does it this way.
 	set_pte_at(mm, address, page_table, entry);
 	update_mmu_cache(vma, address, entry);
 	if (old_page) {
@@ -2448,7 +2445,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto release;
 	inc_mm_counter(mm, anon_rss);
 	SetPageSwapBacked(page);
-	lru_cache_add_active_or_unevictable(page, vma);
 	page_add_new_anon_rmap(page, vma, address);
 	set_pte_at(mm, address, page_table, entry);
 
@@ -2597,7 +2593,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (anon) {
 		inc_mm_counter(mm, anon_rss);
 		SetPageSwapBacked(page);
-		lru_cache_add_active_or_unevictable(page, vma);
 		page_add_new_anon_rmap(page, vma, address);
 	} else {
 		inc_mm_counter(mm, file_rss);
@@ -2607,7 +2602,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			get_page(dirty_page);
 		}
 	}
-//TODO:  is this safe?  do_anonymous_page() does it this way.
 	set_pte_at(mm, address, page_table, entry);
 
 	/* no need to invalidate: a not-present page won't be cached */
--
cgit v1.2.3
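
[Editor's note: this cgit view is limited to mm/memory.c, so the receiving
side of the fold above is not shown. A sketch of what page_add_new_anon_rmap()
in mm/rmap.c presumably looks like after this patch; the helper names are
taken from the 2.6.28-era source, so treat the details as an approximation.]

	void page_add_new_anon_rmap(struct page *page,
		struct vm_area_struct *vma, unsigned long address)
	{
		VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
		atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
		__page_set_anon_rmap(page, vma, address);
		/* folded body of lru_cache_add_active_or_unevictable(): */
		if (page_evictable(page, vma))
			lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
		else
			add_page_to_unevictable_list(page);
	}
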
From cbf84b7add8103b92aaa84928e335df726bfc8da Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 6 Jan 2009 14:39:27 -0800
Subject: mm: further cleanup page_add_new_anon_rmap

Moving lru_cache_add_active_or_unevictable() into
page_add_new_anon_rmap() was good but stupid: we can and should
SetPageSwapBacked() there too; and we know for sure that this
anonymous, swap-backed page is not file cache.

Signed-off-by: Hugh Dickins
Cc: Lee Schermerhorn
Cc: Nick Piggin
Acked-by: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index a138c50dc39a..122d965e820f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1948,7 +1948,6 @@ gotten:
 	 * thread doing COW.
 	 */
 	ptep_clear_flush_notify(vma, address, page_table);
-	SetPageSwapBacked(new_page);
 	page_add_new_anon_rmap(new_page, vma, address);
 	set_pte_at(mm, address, page_table, entry);
 	update_mmu_cache(vma, address, entry);
@@ -2444,7 +2443,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte_none(*page_table))
 		goto release;
 	inc_mm_counter(mm, anon_rss);
-	SetPageSwapBacked(page);
 	page_add_new_anon_rmap(page, vma, address);
 	set_pte_at(mm, address, page_table, entry);
 
@@ -2592,7 +2590,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 	if (anon) {
 		inc_mm_counter(mm, anon_rss);
-		SetPageSwapBacked(page);
 		page_add_new_anon_rmap(page, vma, address);
 	} else {
 		inc_mm_counter(mm, file_rss);
--
cgit v1.2.3

From 878b63ac889df706d01048f2c110e322ad2f996d Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 6 Jan 2009 14:39:32 -0800
Subject: mm: gup persist for write permission

do_wp_page()'s VM_FAULT_WRITE return value tells __get_user_pages()
that COW has been done if necessary, though it may be leaving the pte
without write permission - for the odd case of forced writing to a
readonly vma for ptrace. At present GUP then retries the follow_page()
without asking for write permission, to escape an endless loop when
forced.

But an application may be relying on GUP to guarantee a writable page
which won't be COWed again when written from userspace, whereas a race
here might leave a readonly pte in place? Change the VM_FAULT_WRITE
handling to ask follow_page() for write permission again, except in
that odd case of forced writing to a readonly vma.

Signed-off-by: Hugh Dickins
Cc: Lee Schermerhorn
Cc: Rik van Riel
Cc: Nick Piggin
Cc: KAMEZAWA Hiroyuki
Cc: Robin Holt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 122d965e820f..f594bb65a9f1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1264,9 +1264,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			 * do_wp_page has broken COW when necessary,
 			 * even if maybe_mkwrite decided not to set
 			 * pte_write. We can thus safely do subsequent
-			 * page lookups as if they were reads.
+			 * page lookups as if they were reads. But only
+			 * do so when looping for pte_write is futile:
+			 * in some cases userspace may also be wanting
+			 * to write to the gotten user page, which a
+			 * read fault here might prevent (a readonly
+			 * page might get reCOWed by userspace write).
 			 */
-			if (ret & VM_FAULT_WRITE)
+			if ((ret & VM_FAULT_WRITE) &&
+			    !(vma->vm_flags & VM_WRITE))
 				foll_flags &= ~FOLL_WRITE;
 
 			cond_resched();
--
cgit v1.2.3
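
[Editor's note: the "forced writing to a readonly vma for ptrace" case that
the patch above exempts can be exercised from userspace. A minimal sketch,
with no error checking; it writes a word back unchanged just to trigger the
kernel-side COW.]

	#include <signal.h>
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(void)
	{
		pid_t pid = fork();

		if (pid == 0) {
			ptrace(PTRACE_TRACEME, 0, NULL, NULL);
			raise(SIGSTOP);
			_exit(0);
		}
		waitpid(pid, NULL, 0);

		/*
		 * POKETEXT writes into the child's read-only text mapping:
		 * access_process_vm() calls get_user_pages() with force=1,
		 * do_wp_page() breaks COW and returns VM_FAULT_WRITE, and
		 * since the vma lacks VM_WRITE, GUP drops FOLL_WRITE for
		 * the retry -- exactly the branch the patch preserves.
		 */
		long word = ptrace(PTRACE_PEEKTEXT, pid, (void *)main, NULL);
		ptrace(PTRACE_POKETEXT, pid, (void *)main, (void *)word);

		ptrace(PTRACE_DETACH, pid, NULL, NULL);
		return 0;
	}
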
From ab967d86015a19777955370deebc8262d50fed63 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 6 Jan 2009 14:39:33 -0800
Subject: mm: wp lock page before deciding cow

An application may rely on get_user_pages() to give it pages writable
from userspace and shared with a driver, GUP breaking COW if necessary.
It may mprotect() the pages' writability, off and on, from time to
time.

Normally this works fine (so long as the app does not fork); but just
occasionally, under memory pressure, a readonly pte in a newly writable
area is COWed unnecessarily, breaking the link with the driver: because
do_wp_page() does trylock_page, and falls back to COW whenever that
fails.

For reliable behaviour in the unshared case, when the trylock_page
fails, now unlock pagetable, lock page and relock pagetable, before
deciding whether Copy-On-Write is really necessary.

Reported-by: Zhou Yingchao
Signed-off-by: Hugh Dickins
Cc: Lee Schermerhorn
Cc: Rik van Riel
Cc: Nick Piggin
Cc: KAMEZAWA Hiroyuki
Cc: Robin Holt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index f594bb65a9f1..3922ffcf3dff 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1848,10 +1848,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * not dirty accountable.
 	 */
 	if (PageAnon(old_page)) {
-		if (trylock_page(old_page)) {
-			reuse = can_share_swap_page(old_page);
-			unlock_page(old_page);
+		if (!trylock_page(old_page)) {
+			page_cache_get(old_page);
+			pte_unmap_unlock(page_table, ptl);
+			lock_page(old_page);
+			page_table = pte_offset_map_lock(mm, pmd, address,
+							 &ptl);
+			if (!pte_same(*page_table, orig_pte)) {
+				unlock_page(old_page);
+				page_cache_release(old_page);
+				goto unlock;
+			}
+			page_cache_release(old_page);
 		}
+		reuse = can_share_swap_page(old_page);
+		unlock_page(old_page);
 	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
 					(VM_WRITE|VM_SHARED))) {
 		/*
--
cgit v1.2.3
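
[Editor's note: stripped of the do_wp_page() specifics, the idiom the patch
above introduces is the standard "drop the spinlock, take the sleeping lock,
retake the spinlock, revalidate" dance. Schematically, the revalidation step
is what makes it safe:]

	if (!trylock_page(page)) {
		get_page(page);			/* pin the page across the unlock */
		pte_unmap_unlock(page_table, ptl);
		lock_page(page);		/* may sleep */
		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
		if (!pte_same(*page_table, orig_pte)) {
			/*
			 * The pte changed while we slept: someone else
			 * handled the fault, so back out and retry.
			 */
			unlock_page(page);
			put_page(page);
			goto unlock;
		}
		put_page(page);
	}
	/* page locked AND pte revalidated: safe to decide reuse vs COW */
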
From 7b1fe59793e61f826bef053107b57b23954833bb Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 6 Jan 2009 14:39:34 -0800
Subject: mm: reuse_swap_page replaces can_share_swap_page

A good place to free up old swap is where do_wp_page(), or
do_swap_page(), is about to redirty the page: the data on disk is then
stale and won't be read again; and if we do decide to write the page
out later, using the previous swap location makes an unnecessary disk
seek very likely.

So give can_share_swap_page() the side-effect of
delete_from_swap_cache() when it safely can. And can_share_swap_page()
was always a misleading name, the more so if it has a side-effect:
rename it reuse_swap_page().

Irrelevant cleanup nearby: remove swap_token_default_timeout definition
from swap.h: it's used nowhere.

Signed-off-by: Hugh Dickins
Cc: Lee Schermerhorn
Acked-by: Rik van Riel
Cc: Nick Piggin
Cc: KAMEZAWA Hiroyuki
Cc: Robin Holt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 3922ffcf3dff..8f471edcb985 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1861,7 +1861,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			}
 			page_cache_release(old_page);
 		}
-		reuse = can_share_swap_page(old_page);
+		reuse = reuse_swap_page(old_page);
 		unlock_page(old_page);
 	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
 					(VM_WRITE|VM_SHARED))) {
@@ -2392,7 +2392,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	inc_mm_counter(mm, anon_rss);
 	pte = mk_pte(page, vma->vm_page_prot);
-	if (write_access && can_share_swap_page(page)) {
+	if (write_access && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
 		write_access = 0;
 	}
--
cgit v1.2.3

From a2c43eed8334e878702fca713b212ae2a11d84b9 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 6 Jan 2009 14:39:36 -0800
Subject: mm: try_to_free_swap replaces remove_exclusive_swap_page

remove_exclusive_swap_page(): its problem is in living up to its name.

It doesn't matter if someone else has a reference to the page (raised
page_count); it doesn't matter if the page is mapped into userspace
(raised page_mapcount - though that hints it may be worth keeping the
swap): all that matters is that there be no more references to the swap
(and no writeback in progress).

swapoff (try_to_unuse) has been removing pages from swapcache for
years, with no concern for page count or page mapcount, and we used to
have a comment in lookup_swap_cache() recognizing that: if you go for a
page of swapcache, you'll get the right page, but it could have been
removed from swapcache by the time you get page lock.

So, give up asking for exclusivity: get rid of
remove_exclusive_swap_page(), and remove_exclusive_swap_page_ref() and
remove_exclusive_swap_page_count() which were spawned for the recent
LRU work: replace them by the simpler try_to_free_swap() which just
checks page_swapcount().

Similarly, remove the page_count limitation from free_swap_and_cache(),
but assume that it's worth holding on to the swap if the page is mapped
and swap is nowhere near full.

Add a vm_swap_full() test in free_swap_cache()? It would be consistent,
but I think we probably have enough for now.

Signed-off-by: Hugh Dickins
Cc: Lee Schermerhorn
Cc: Rik van Riel
Cc: Nick Piggin
Cc: KAMEZAWA Hiroyuki
Cc: Robin Holt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 8f471edcb985..1a83fe5339a9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2403,7 +2403,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	swap_free(entry);
 	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
-		remove_exclusive_swap_page(page);
+		try_to_free_swap(page);
 	unlock_page(page);
 
 	if (write_access) {
--
cgit v1.2.3
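
[Editor's note: try_to_free_swap() itself lives in mm/swapfile.c and is
outside this mm/memory.c-limited view. Going by the description above, its
core is presumably no more than the following sketch.]

	int try_to_free_swap(struct page *page)
	{
		VM_BUG_ON(!PageLocked(page));

		if (!PageSwapCache(page))
			return 0;
		if (PageWriteback(page))
			return 0;
		if (page_swapcount(page))	/* swap still referenced elsewhere */
			return 0;

		delete_from_swap_cache(page);
		SetPageDirty(page);		/* data now lives only in memory */
		return 1;
	}
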
From 2bc7273b0e3a509fb598abfc5b9fe50158b830d2 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro
Date: Tue, 6 Jan 2009 14:39:43 -0800
Subject: mm: make maddr __iomem

sparse outputs the following warnings:

    mm/memory.c:2936:8: warning: incorrect type in assignment (different address spaces)
    mm/memory.c:2936:8:    expected void *maddr
    mm/memory.c:2936:8:    got void [noderef] <asn:2>*

Clean it up by giving maddr the __iomem annotation.

Signed-off-by: KOSAKI Motohiro
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 1a83fe5339a9..89339c61f8e5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2966,7 +2966,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 {
 	resource_size_t phys_addr;
 	unsigned long prot = 0;
-	void *maddr;
+	void __iomem *maddr;
 	int offset = addr & (PAGE_SIZE-1);
 
 	if (follow_phys(vma, addr, write, &prot, &phys_addr))
--
cgit v1.2.3
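
[Editor's note: __iomem is purely a sparse annotation - it compiles away,
but lets sparse catch mixing of MMIO cookies with ordinary pointers. A small
sketch of the convention; STATUS_REG is an illustrative register offset, not
anything from the patches here.]

	#include <linux/io.h>

	#define STATUS_REG	0x04	/* illustrative offset */

	static int poke_device(resource_size_t phys_addr)
	{
		void __iomem *regs;
		u32 status;

		regs = ioremap(phys_addr, PAGE_SIZE);
		if (!regs)
			return -ENOMEM;

		status = readl(regs + STATUS_REG);	/* ok: MMIO accessor */
		/*
		 * status = *(u32 *)(regs + STATUS_REG); would compile, but
		 * sparse flags it: cast removes address space of expression.
		 */

		iounmap(regs);
		return status ? 0 : -EIO;
	}
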
From 3dc147414ccad81dc33edb80774b1fed12a38c08 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 6 Jan 2009 14:40:08 -0800
Subject: badpage: replace page_remove_rmap Eeek and BUG

Now that bad pages are kept out of circulation, there is no need for
the infamous page_remove_rmap() BUG() - once that page is freed, its
negative mapcount will issue a "Bad page state" message and the page
won't be freed. Removing the BUG() allows more info, on subsequent
pages, to be gathered.

We do have more info about the page at this point than bad_page() can
know - notably, what the pmd is, which might pinpoint something like
low 64kB corruption - but page_remove_rmap() isn't given the address to
find that.

In practice, there is only one call to page_remove_rmap() which has
ever reported anything, that from zap_pte_range() (usually on exit,
sometimes on munmap). It has all the info, so remove
page_remove_rmap()'s "Eeek" message and leave it all to
zap_pte_range().

mm/memory.c already has a hardly used print_bad_pte() function, showing
some of the appropriate info: extend it to show what we want for the
rmap case: pte info, page info (when there is a page) and vma info to
compare. zap_pte_range() already knows the pmd, but print_bad_pte() is
easier to use if it works that out for itself.

Some of this info is also shown in bad_page()'s "Bad page state"
message. Keep them separate, but adjust them to match each other as far
as possible. Say "Bad page map" in print_bad_pte(), and add a
TAINT_BAD_PAGE there too.

print_bad_pte() shows current->comm unconditionally (though it should
get repeated in the usually irrelevant stack trace): sorry, I misled
Nick Piggin into making it conditional on vm_mm == current->mm, but
current->mm is already NULL in the exit case. Usually current->comm is
good, though exceptionally it may not be that of the mm (when "swapoff"
for example).

Signed-off-by: Hugh Dickins
Cc: Nick Piggin
Cc: Christoph Lameter
Cc: Mel Gorman
Cc: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 52 ++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 40 insertions(+), 12 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 89339c61f8e5..cda04b19f733 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -52,6 +52,9 @@
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/swapops.h>
+#include <linux/elf.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -59,9 +62,6 @@
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
 
-#include <linux/swapops.h>
-#include <linux/elf.h>
-
 #include "internal.h"
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -375,15 +375,41 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
  *
  * The calling function must still handle the error.
  */
-static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
-			  unsigned long vaddr)
-{
-	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
-			"vm_flags = %lx, vaddr = %lx\n",
-		(long long)pte_val(pte),
-		(vma->vm_mm == current->mm ? current->comm : "???"),
-		vma->vm_flags, vaddr);
+static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
+			  pte_t pte, struct page *page)
+{
+	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
+	pud_t *pud = pud_offset(pgd, addr);
+	pmd_t *pmd = pmd_offset(pud, addr);
+	struct address_space *mapping;
+	pgoff_t index;
+
+	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
+	index = linear_page_index(vma, addr);
+
+	printk(KERN_EMERG "Bad page map in process %s  pte:%08llx pmd:%08llx\n",
+		current->comm,
+		(long long)pte_val(pte), (long long)pmd_val(*pmd));
+	if (page) {
+		printk(KERN_EMERG
+		"page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
+		page, (void *)page->flags, page_count(page),
+		page_mapcount(page), page->mapping, page->index);
+	}
+	printk(KERN_EMERG
+		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+	/*
+	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
+	 */
+	if (vma->vm_ops)
+		print_symbol(KERN_EMERG "vma->vm_ops->fault: %s\n",
+				(unsigned long)vma->vm_ops->fault);
+	if (vma->vm_file && vma->vm_file->f_op)
+		print_symbol(KERN_EMERG "vma->vm_file->f_op->mmap: %s\n",
+				(unsigned long)vma->vm_file->f_op->mmap);
 	dump_stack();
+	add_taint(TAINT_BAD_PAGE);
 }
 
 static inline int is_cow_mapping(unsigned int flags)
@@ -773,6 +799,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				file_rss--;
 			}
 			page_remove_rmap(page, vma);
+			if (unlikely(page_mapcount(page) < 0))
+				print_bad_pte(vma, addr, ptent, page);
 			tlb_remove_page(tlb, page);
 			continue;
 		}
@@ -2684,7 +2712,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		/*
 		 * Page table corrupted: show pte and kill process.
 		 */
-		print_bad_pte(vma, orig_pte, address);
+		print_bad_pte(vma, address, orig_pte, NULL);
 		return VM_FAULT_OOM;
 	}
--
cgit v1.2.3
From 22b31eec63e5f2e219a3ee15f456897272bc73e8 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 6 Jan 2009 14:40:09 -0800
Subject: badpage: vm_normal_page use print_bad_pte

print_bad_pte() is so far being called only when zap_pte_range() finds
negative page_mapcount, or there's a fault on a pte_file where it does
not belong. That's weak coverage when we suspect pagetable corruption.

Originally, it was called when vm_normal_page() found an invalid pfn:
but pfn_valid is expensive on some architectures and configurations, so
2.6.24 put that under CONFIG_DEBUG_VM (which doesn't help in the
field), then 2.6.26 replaced it by a VM_BUG_ON (likewise).

Reinstate the print_bad_pte() in vm_normal_page(), but use a cheaper
test than pfn_valid(): memmap_init_zone() (used in bootup and hotplug)
keeps a __read_mostly note of the highest_memmap_pfn, and
vm_normal_page() then checks the pfn against that. We could call this
pfn_plausible() or pfn_sane(), but I doubt we'll need it elsewhere: of
course it's not reliable, but it gives much stronger pagetable
validation on many boxes.

Also use print_bad_pte() when the pte_special bit is found outside a
VM_PFNMAP or VM_MIXEDMAP area, instead of VM_BUG_ON.

Signed-off-by: Hugh Dickins
Cc: Nick Piggin
Cc: Christoph Lameter
Cc: Mel Gorman
Cc: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index cda04b19f733..890095f5f36d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -467,21 +467,18 @@ static inline int is_cow_mapping(unsigned int flags)
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 				pte_t pte)
 {
-	unsigned long pfn;
+	unsigned long pfn = pte_pfn(pte);
 
 	if (HAVE_PTE_SPECIAL) {
-		if (likely(!pte_special(pte))) {
-			VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-			return pte_page(pte);
-		}
-		VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+		if (likely(!pte_special(pte)))
+			goto check_pfn;
+		if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+			print_bad_pte(vma, addr, pte, NULL);
 		return NULL;
 	}
 
 	/* !HAVE_PTE_SPECIAL case follows: */
 
-	pfn = pte_pfn(pte);
-
 	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
 		if (vma->vm_flags & VM_MIXEDMAP) {
 			if (!pfn_valid(pfn))
@@ -497,11 +494,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		}
 	}
 
-	VM_BUG_ON(!pfn_valid(pfn));
+check_pfn:
+	if (unlikely(pfn > highest_memmap_pfn)) {
+		print_bad_pte(vma, addr, pte, NULL);
+		return NULL;
+	}
 
 	/*
 	 * NOTE! We still have PageReserved() pages in the page tables.
-	 *
 	 * eg. VDSO mappings can cause them to exist.
 	 */
 out:
--
cgit v1.2.3
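
[Editor's note: the __read_mostly bookkeeping the description mentions
happens outside mm/memory.c, so it is not visible in this view. The
companion change in mm/page_alloc.c and mm/internal.h presumably amounts to
something like the following sketch.]

	/* mm/internal.h */
	extern unsigned long highest_memmap_pfn;

	/* mm/page_alloc.c */
	unsigned long highest_memmap_pfn __read_mostly;

	/* in memmap_init_zone(), which covers both bootup and hotplug: */
	if (highest_memmap_pfn < end_pfn - 1)
		highest_memmap_pfn = end_pfn - 1;
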
From 2509ef26db4699a5d9fa876e90ddfc107afcab84 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 6 Jan 2009 14:40:10 -0800
Subject: badpage: zap print_bad_pte on swap and file

Complete zap_pte_range()'s coverage of bad pagetable entries by calling
print_bad_pte() on a pte_file in a linear vma and on a bad swap entry.
That needs free_swap_and_cache() to tell it, which will also have shown
one of those "swap_free" errors (but with much less information).

Similar checks in fork's copy_one_pte()? No, that would be more noisy
than helpful: we'll see them when parent and child exec or exit.

Where do_nonlinear_fault() calls print_bad_pte(): omit the
!VM_CAN_NONLINEAR case, which could only be a bug in
sys_remap_file_pages(), not a bad pte.

VM_FAULT_OOM rather than VM_FAULT_SIGBUS? Well, okay, that is
consistent with what happens if do_swap_page() operates on a bad swap
entry; but don't we have patches to be more careful about killing when
VM_FAULT_OOM?

Signed-off-by: Hugh Dickins
Cc: Nick Piggin
Cc: Christoph Lameter
Cc: Mel Gorman
Cc: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 890095f5f36d..b273cc12b15d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -810,8 +810,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		 */
 		if (unlikely(details))
 			continue;
-		if (!pte_file(ptent))
-			free_swap_and_cache(pte_to_swp_entry(ptent));
+		if (pte_file(ptent)) {
+			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
+				print_bad_pte(vma, addr, ptent, NULL);
+		} else if
+		  (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
+			print_bad_pte(vma, addr, ptent, NULL);
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
 
@@ -2707,8 +2711,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		return 0;
 
-	if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
-			!(vma->vm_flags & VM_CAN_NONLINEAR))) {
+	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
 		 */
--
cgit v1.2.3

From edc315fd222497ae4f4b959a9e31ada1e68a4755 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 6 Jan 2009 14:40:11 -0800
Subject: badpage: remove vma from page_remove_rmap

Remove page_remove_rmap()'s vma arg, which was only for the Eeek
message. And remove the BUG_ON(page_mapcount(page) == 0) from
CONFIG_DEBUG_VM's page_dup_rmap(): we're trying to be more resilient
about that than BUGs.

Signed-off-by: Hugh Dickins
Cc: Nick Piggin
Cc: Christoph Lameter
Cc: Mel Gorman
Cc: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index b273cc12b15d..0f9abbaf618c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -798,7 +798,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 					mark_page_accessed(page);
 				file_rss--;
 			}
-			page_remove_rmap(page, vma);
+			page_remove_rmap(page);
 			if (unlikely(page_mapcount(page) < 0))
 				print_bad_pte(vma, addr, ptent, page);
 			tlb_remove_page(tlb, page);
@@ -2023,7 +2023,7 @@ gotten:
 		 * mapcount is visible. So transitively, TLBs to
 		 * old page will be flushed before it can be reused.
 		 */
-		page_remove_rmap(old_page, vma);
+		page_remove_rmap(old_page);
 	}
 
 	/* Free the old page.. */
--
cgit v1.2.3
From d936cf9b39b06c8d2e0d7fb5e7b4f176e18dec69 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 6 Jan 2009 14:40:12 -0800
Subject: badpage: ratelimit print_bad_pte and bad_page

print_bad_pte() and bad_page() might each need ratelimiting -
especially for their dump_stacks, almost never of interest, yet not
quite dispensable. Correlating corruption across neighbouring entries
can be very helpful, so allow a burst of 60 reports before keeping
quiet for the remainder of that minute (or allow a steady drip of one
report per second).

Signed-off-by: Hugh Dickins
Cc: Nick Piggin
Cc: Christoph Lameter
Cc: Mel Gorman
Cc: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 0f9abbaf618c..b12888c1b4e3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -383,6 +383,29 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 	pmd_t *pmd = pmd_offset(pud, addr);
 	struct address_space *mapping;
 	pgoff_t index;
+	static unsigned long resume;
+	static unsigned long nr_shown;
+	static unsigned long nr_unshown;
+
+	/*
+	 * Allow a burst of 60 reports, then keep quiet for that minute;
+	 * or allow a steady drip of one report per second.
+	 */
+	if (nr_shown == 60) {
+		if (time_before(jiffies, resume)) {
+			nr_unshown++;
+			return;
+		}
+		if (nr_unshown) {
+			printk(KERN_EMERG
+				"Bad page map: %lu messages suppressed\n",
+				nr_unshown);
+			nr_unshown = 0;
+		}
+		nr_shown = 0;
+	}
+	if (nr_shown++ == 0)
+		resume = jiffies + 60 * HZ;
 
 	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
 	index = linear_page_index(vma, addr);
--
cgit v1.2.3
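
[Editor's note: include/linux/ratelimit.h already provided a generic helper
for this pattern at the time; the open-coded version above is presumably
kept because it also counts and reports the suppressed messages itself. For
comparison, a roughly equivalent (but less informative) form might be:]

	#include <linux/ratelimit.h>

	/* burst of 60 reports, then quiet for the rest of the minute */
	static DEFINE_RATELIMIT_STATE(bad_pte_rs, 60 * HZ, 60);

	if (!__ratelimit(&bad_pte_rs))
		return;		/* suppressed; no nr_unshown accounting */
	/* ... the actual reporting, as in the patch above ... */
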
From 1e9e63650d6cb88e6d6d2ca6cc3ee276c26de4a3 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 6 Jan 2009 14:40:13 -0800
Subject: badpage: KERN_ALERT BUG instead of KERN_EMERG

bad_page() and the rmap Eeek messages have said KERN_EMERG for a few
years, which I've followed in print_bad_pte(). These are serious system
errors, on a par with BUGs, but they're not quite emergencies, and we
do our best to carry on: say KERN_ALERT "BUG: " like the x86 oops does.

And remove the "Trying to fix it up, but a reboot is needed" line: it's
not untrue, but I hope the KERN_ALERT "BUG: " conveys as much.

Signed-off-by: Hugh Dickins
Cc: Nick Piggin
Cc: Christoph Lameter
Cc: Mel Gorman
Cc: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index b12888c1b4e3..db68af8e0bc4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -397,8 +397,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 			return;
 		}
 		if (nr_unshown) {
-			printk(KERN_EMERG
-				"Bad page map: %lu messages suppressed\n",
+			printk(KERN_ALERT
+				"BUG: Bad page map: %lu messages suppressed\n",
 				nr_unshown);
 			nr_unshown = 0;
 		}
@@ -410,26 +410,27 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
 	index = linear_page_index(vma, addr);
 
-	printk(KERN_EMERG "Bad page map in process %s  pte:%08llx pmd:%08llx\n",
+	printk(KERN_ALERT
+		"BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
 		current->comm,
 		(long long)pte_val(pte), (long long)pmd_val(*pmd));
 	if (page) {
-		printk(KERN_EMERG
+		printk(KERN_ALERT
 		"page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
 		page, (void *)page->flags, page_count(page),
 		page_mapcount(page), page->mapping, page->index);
 	}
-	printk(KERN_EMERG
+	printk(KERN_ALERT
 		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
 		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
 	/*
 	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
 	 */
 	if (vma->vm_ops)
-		print_symbol(KERN_EMERG "vma->vm_ops->fault: %s\n",
+		print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
 				(unsigned long)vma->vm_ops->fault);
 	if (vma->vm_file && vma->vm_file->f_op)
-		print_symbol(KERN_EMERG "vma->vm_file->f_op->mmap: %s\n",
+		print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
 				(unsigned long)vma->vm_file->f_op->mmap);
 	dump_stack();
 	add_taint(TAINT_BAD_PAGE);
--
cgit v1.2.3
From 4779280d1ea4d361af13ae77ba55217fbcd16d4c Mon Sep 17 00:00:00 2001
From: Ying Han
Date: Tue, 6 Jan 2009 14:40:18 -0800
Subject: mm: make get_user_pages() interruptible

The initial implementation of checking TIF_MEMDIE covers the cases of
OOM killing. If the process has been OOM killed, the TIF_MEMDIE is set
and it returns immediately. This patch includes:

1. add the case that the SIGKILL is sent by user processes. The
   process can try to get_user_pages() unlimited memory even if a user
   process has sent a SIGKILL to it (maybe a monitor finds the process
   exceeded its memory limit and tries to kill it). In the old
   implementation, the SIGKILL won't be handled until the
   get_user_pages() returns.

2. change the return value to be ERESTARTSYS. It makes no sense to
   return ENOMEM if get_user_pages() returned because of a SIGKILL
   signal. The general convention for a system call interrupted by a
   signal is ERESTARTSYS, so the new return value is consistent with
   that.

Lee:

An unfortunate side effect of "make-get_user_pages-interruptible" is
that it prevents a SIGKILL'd task from munlock-ing pages that it had
mlocked, resulting in freeing of mlocked pages. Freeing of mlocked
pages, in itself, is not so bad. We just count them now - although I
had hoped to remove this stat and add PG_MLOCKED to the free pages
flags check.

However, consider pages in shared libraries mapped by more than one
task that a task mlocked - e.g., via mlockall(). If the task that
mlocked the pages exits via SIGKILL, these pages would be left mlocked
and unevictable.

Proposed fix: add another GUP flag to ignore sigkill when calling
get_user_pages() from munlock() - similar to KOSAKI Motohiro's
'IGNORE_VMA_PERMISSIONS' flag for the same purpose. We are not actually
allocating memory in this case, which "make-get_user_pages-
interruptible" intends to avoid. We're just munlocking pages that are
already resident and mapped, and we're reusing get_user_pages() to
access those pages.

Maybe we should combine 'IGNORE_VMA_PERMISSIONS' and 'IGNORE_SIGKILL'
into a single flag: GUP_FLAGS_MUNLOCK?

[Lee.Schermerhorn@hp.com: ignore sigkill in get_user_pages during munlock]
Signed-off-by: Paul Menage
Signed-off-by: Ying Han
Reviewed-by: KOSAKI Motohiro
Reviewed-by: Pekka Enberg
Cc: Nick Piggin
Cc: Hugh Dickins
Cc: Oleg Nesterov
Cc: Lee Schermerhorn
Cc: Rohit Seth
Cc: David Rientjes
Signed-off-by: Lee Schermerhorn
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index db68af8e0bc4..3f8fa06b963b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1210,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	int write = !!(flags & GUP_FLAGS_WRITE);
 	int force = !!(flags & GUP_FLAGS_FORCE);
 	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
+	int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
 
 	if (len <= 0)
 		return 0;
@@ -1288,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			struct page *page;
 
 			/*
-			 * If tsk is ooming, cut off its access to large memory
-			 * allocations. It has a pending SIGKILL, but it can't
-			 * be processed until returning to user space.
+			 * If we have a pending SIGKILL, don't keep faulting
+			 * pages and potentially allocating memory, unless
+			 * current is handling munlock--e.g., on exit. In
+			 * that case, we are not allocating memory.  Rather,
+			 * we're only unlocking already resident/mapped pages.
 			 */
-			if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
-				return i ? i : -ENOMEM;
+			if (unlikely(!ignore_sigkill &&
+					fatal_signal_pending(current)))
+				return i ? i : -ERESTARTSYS;
 
 			if (write)
 				foll_flags |= FOLL_WRITE;
--
cgit v1.2.3
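
[Editor's note: the check this last patch adds is an instance of the usual
pattern for long, potentially allocating kernel loops. Schematically -
`have_work` and `units_done` are illustrative placeholders, not identifiers
from the patch:]

	while (have_work) {
		/*
		 * A SIGKILL can't be delivered until we return to
		 * userspace, so bail out instead of faulting in (and
		 * allocating) more pages on behalf of a dying task.
		 */
		if (fatal_signal_pending(current))
			return units_done ? units_done : -ERESTARTSYS;

		cond_resched();
		/* ... fault in / process one unit of work ... */
	}
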