From 5a73633ef01cd8772defa6a3c34a588376a1df4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 7 Aug 2013 13:02:52 +0200 Subject: mm: make generic_access_phys available for modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the next commit this function will be used in the uio subsystem Signed-off-by: Uwe Kleine-König Signed-off-by: Greg Kroah-Hartman --- mm/memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 1ce2e2a734fc..8d9255b69ff0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4066,6 +4066,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, return len; } +EXPORT_SYMBOL_GPL(generic_access_phys); #endif /* -- cgit v1.2.3 From bc4b4448dba660afc8df3790564320302d9709a1 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:28 -0700 Subject: mm: move pgtable related functions to right place pgtable related functions are mostly in pgtable-generic.c. So move remaining functions from memory.c to pgtable-generic.c. Signed-off-by: Joonsoo Kim Cc: Johannes Weiner Cc: Minchan Kim Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index b3c6bf9a398e..c1c6d59b2b03 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -372,30 +372,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ -/* - * If a p?d_bad entry is found while walking page tables, report - * the error, before resetting entry to p?d_none. Usually (but - * very seldom) called out from the p?d_none_or_clear_bad macros. - */ - -void pgd_clear_bad(pgd_t *pgd) -{ - pgd_ERROR(*pgd); - pgd_clear(pgd); -} - -void pud_clear_bad(pud_t *pud) -{ - pud_ERROR(*pud); - pud_clear(pud); -} - -void pmd_clear_bad(pmd_t *pmd) -{ - pmd_ERROR(*pmd); - pmd_clear(pmd); -} - /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. -- cgit v1.2.3 From e632a938d914d271bec26e570d36c755a1e35e4c Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:04 -0700 Subject: mm: migrate: add hugepage migration code to move_pages() Extend move_pages() to handle vma with VM_HUGETLB set. We will be able to migrate hugepage with move_pages(2) after applying the enablement patch which comes later in this series. We avoid getting refcount on tail pages of hugepage, because unlike thp, hugepage is not split and we need not care about races with splitting. And migration of larger (1GB for x86_64) hugepage are not enabled. Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Cc: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index c1c6d59b2b03..2b73dbde2274 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1481,7 +1481,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pud_none(*pud)) goto no_page_table; if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - BUG_ON(flags & FOLL_GET); + if (flags & FOLL_GET) + goto out; page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); goto out; } @@ -1492,8 +1493,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pmd_none(*pmd)) goto no_page_table; if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + if (flags & FOLL_GET) { + /* + * Refcount on tail pages are not well-defined and + * shouldn't be taken. The caller should handle a NULL + * return when trying to follow tail pages. + */ + if (PageHead(page)) + get_page(page); + else { + page = NULL; + goto out; + } + } goto out; } if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) -- cgit v1.2.3 From 519e52473ebe9db5cdef44670d5a97f1fd53d721 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Sep 2013 15:13:42 -0700 Subject: mm: memcg: enable memcg OOM killer only for user faults System calls and kernel faults (uaccess, gup) can handle an out of memory situation gracefully and just return -ENOMEM. Enable the memcg OOM killer only for user faults, where it's really the only option available. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: azurIt Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 2b73dbde2274..a8f9deab8719 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3754,22 +3754,14 @@ unlock: /* * By the time we get here, we already hold the mm semaphore */ -int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, unsigned int flags) +static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - __set_current_state(TASK_RUNNING); - - count_vm_event(PGFAULT); - mem_cgroup_count_vm_event(mm, PGFAULT); - - /* do counter updates before entering really critical section. */ - check_sync_rss_stat(current); - if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); @@ -3850,6 +3842,34 @@ retry: return handle_pte_fault(mm, vma, address, pte, pmd, flags); } +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + int ret; + + __set_current_state(TASK_RUNNING); + + count_vm_event(PGFAULT); + mem_cgroup_count_vm_event(mm, PGFAULT); + + /* do counter updates before entering really critical section. */ + check_sync_rss_stat(current); + + /* + * Enable the memcg OOM handling for faults triggered in user + * space. Kernel faults are handled more gracefully. + */ + if (flags & FAULT_FLAG_USER) + mem_cgroup_enable_oom(); + + ret = __handle_mm_fault(mm, vma, address, flags); + + if (flags & FAULT_FLAG_USER) + mem_cgroup_disable_oom(); + + return ret; +} + #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. -- cgit v1.2.3 From 3812c8c8f3953921ef18544110dafc3505c1ac62 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Sep 2013 15:13:44 -0700 Subject: mm: memcg: do not trap chargers with full callstack on OOM The memcg OOM handling is incredibly fragile and can deadlock. When a task fails to charge memory, it invokes the OOM killer and loops right there in the charge code until it succeeds. Comparably, any other task that enters the charge path at this point will go to a waitqueue right then and there and sleep until the OOM situation is resolved. The problem is that these tasks may hold filesystem locks and the mmap_sem; locks that the selected OOM victim may need to exit. For example, in one reported case, the task invoking the OOM killer was about to charge a page cache page during a write(), which holds the i_mutex. The OOM killer selected a task that was just entering truncate() and trying to acquire the i_mutex: OOM invoking task: mem_cgroup_handle_oom+0x241/0x3b0 mem_cgroup_cache_charge+0xbe/0xe0 add_to_page_cache_locked+0x4c/0x140 add_to_page_cache_lru+0x22/0x50 grab_cache_page_write_begin+0x8b/0xe0 ext3_write_begin+0x88/0x270 generic_file_buffered_write+0x116/0x290 __generic_file_aio_write+0x27c/0x480 generic_file_aio_write+0x76/0xf0 # takes ->i_mutex do_sync_write+0xea/0x130 vfs_write+0xf3/0x1f0 sys_write+0x51/0x90 system_call_fastpath+0x18/0x1d OOM kill victim: do_truncate+0x58/0xa0 # takes i_mutex do_last+0x250/0xa30 path_openat+0xd7/0x440 do_filp_open+0x49/0xa0 do_sys_open+0x106/0x240 sys_open+0x20/0x30 system_call_fastpath+0x18/0x1d The OOM handling task will retry the charge indefinitely while the OOM killed task is not releasing any resources. A similar scenario can happen when the kernel OOM killer for a memcg is disabled and a userspace task is in charge of resolving OOM situations. In this case, ALL tasks that enter the OOM path will be made to sleep on the OOM waitqueue and wait for userspace to free resources or increase the group's limit. But a userspace OOM handler is prone to deadlock itself on the locks held by the waiting tasks. For example one of the sleeping tasks may be stuck in a brk() call with the mmap_sem held for writing but the userspace handler, in order to pick an optimal victim, may need to read files from /proc/, which tries to acquire the same mmap_sem for reading and deadlocks. This patch changes the way tasks behave after detecting a memcg OOM and makes sure nobody loops or sleeps with locks held: 1. When OOMing in a user fault, invoke the OOM killer and restart the fault instead of looping on the charge attempt. This way, the OOM victim can not get stuck on locks the looping task may hold. 2. When OOMing in a user fault but somebody else is handling it (either the kernel OOM killer or a userspace handler), don't go to sleep in the charge context. Instead, remember the OOMing memcg in the task struct and then fully unwind the page fault stack with -ENOMEM. pagefault_out_of_memory() will then call back into the memcg code to check if the -ENOMEM came from the memcg, and then either put the task to sleep on the memcg's OOM waitqueue or just restart the fault. The OOM victim can no longer get stuck on any lock a sleeping task may hold. Debugged by Michal Hocko. Signed-off-by: Johannes Weiner Reported-by: azurIt Acked-by: Michal Hocko Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index a8f9deab8719..5ec6f199e685 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3867,6 +3867,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_USER) mem_cgroup_disable_oom(); + if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) + mem_cgroup_oom_synchronize(); + return ret; } -- cgit v1.2.3 From c02925540ca7019465a43c00f8a3c0186ddace2b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 12 Sep 2013 15:14:05 -0700 Subject: thp: consolidate code between handle_mm_fault() and do_huge_pmd_anonymous_page() do_huge_pmd_anonymous_page() has copy-pasted piece of handle_mm_fault() to handle fallback path. Let's consolidate code back by introducing VM_FAULT_FALLBACK return code. Signed-off-by: Kirill A. Shutemov Acked-by: Hillf Danton Cc: Andrea Arcangeli Cc: Al Viro Cc: Hugh Dickins Cc: Wu Fengguang Cc: Jan Kara Cc: Mel Gorman Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 5ec6f199e685..ca0003947115 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3695,7 +3695,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -int handle_pte_fault(struct mm_struct *mm, +static int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *pte, pmd_t *pmd, unsigned int flags) { @@ -3774,9 +3774,12 @@ retry: if (!pmd) return VM_FAULT_OOM; if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { + int ret = VM_FAULT_FALLBACK; if (!vma->vm_ops) - return do_huge_pmd_anonymous_page(mm, vma, address, - pmd, flags); + ret = do_huge_pmd_anonymous_page(mm, vma, address, + pmd, flags); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; } else { pmd_t orig_pmd = *pmd; int ret; -- cgit v1.2.3