From 62695a84eb8f2e718bf4dfb21700afaa7a08e0ea Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 18 Oct 2008 20:26:09 -0700 Subject: vmscan: move isolate_lru_page() to vmscan.c On large memory systems, the VM can spend way too much time scanning through pages that it cannot (or should not) evict from memory. Not only does it use up CPU time, but it also provokes lock contention and can leave large systems under memory presure in a catatonic state. This patch series improves VM scalability by: 1) putting filesystem backed, swap backed and unevictable pages onto their own LRUs, so the system only scans the pages that it can/should evict from memory 2) switching to two handed clock replacement for the anonymous LRUs, so the number of pages that need to be scanned when the system starts swapping is bound to a reasonable number 3) keeping unevictable pages off the LRU completely, so the VM does not waste CPU time scanning them. ramfs, ramdisk, SHM_LOCKED shared memory segments and mlock()ed VMA pages are keept on the unevictable list. This patch: isolate_lru_page logically belongs to be in vmscan.c than migrate.c. It is tough, because we don't need that function without memory migration so there is a valid argument to have it in migrate.c. However a subsequent patch needs to make use of it in the core mm, so we can happily move it to vmscan.c. Also, make the function a little more generic by not requiring that it adds an isolated page to a given list. Callers can do that. Note that we now have '__isolate_lru_page()', that does something quite different, visible outside of vmscan.c for use with memory controller. Methinks we need to rationalize these names/purposes. --lts [akpm@linux-foundation.org: fix mm/memory_hotplug.c build] Signed-off-by: Nick Piggin Signed-off-by: Rik van Riel Signed-off-by: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 34 +++------------------------------- 1 file changed, 3 insertions(+), 31 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 2a80136b23bb..da73742e52a5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -36,36 +36,6 @@ #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) -/* - * Isolate one page from the LRU lists. If successful put it onto - * the indicated list with elevated page count. - * - * Result: - * -EBUSY: page not on LRU list - * 0: page removed from LRU list and added to the specified list. - */ -int isolate_lru_page(struct page *page, struct list_head *pagelist) -{ - int ret = -EBUSY; - - if (PageLRU(page)) { - struct zone *zone = page_zone(page); - - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && get_page_unless_zero(page)) { - ret = 0; - ClearPageLRU(page); - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - list_add_tail(&page->lru, pagelist); - } - spin_unlock_irq(&zone->lru_lock); - } - return ret; -} - /* * migrate_prep() needs to be called before we start compiling a list of pages * to be migrated using isolate_lru_page(). @@ -914,7 +884,9 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, !migrate_all) goto put_and_set; - err = isolate_lru_page(page, &pagelist); + err = isolate_lru_page(page); + if (!err) + list_add_tail(&page->lru, &pagelist); put_and_set: /* * Either remove the duplicate refcount from -- cgit v1.2.3 From f04e9ebbe4909f9a41efd55149bc353299f4e83b Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Sat, 18 Oct 2008 20:26:19 -0700 Subject: swap: use an array for the LRU pagevecs Turn the pagevecs into an array just like the LRUs. This significantly cleans up the source code and reduces the size of the kernel by about 13kB after all the LRU lists have been created further down in the split VM patch series. Signed-off-by: Rik van Riel Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index da73742e52a5..ad15b5ef2599 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -55,16 +55,7 @@ int migrate_prep(void) static inline void move_to_lru(struct page *page) { - if (PageActive(page)) { - /* - * lru_cache_add_active checks that - * the PG_active bit is off. - */ - ClearPageActive(page); - lru_cache_add_active(page); - } else { - lru_cache_add(page); - } + lru_cache_add_lru(page, page_lru(page)); put_page(page); } -- cgit v1.2.3 From b2e185384f534781fd22f5ce170b2ad26f97df70 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 18 Oct 2008 20:26:30 -0700 Subject: define page_file_cache() function Define page_file_cache() function to answer the question: is page backed by a file? Originally part of Rik van Riel's split-lru patch. Extracted to make available for other, independent reclaim patches. Moved inline function to linux/mm_inline.h where it will be needed by subsequent "split LRU" and "noreclaim" patches. Unfortunately this needs to use a page flag, since the PG_swapbacked state needs to be preserved all the way to the point where the page is last removed from the LRU. Trying to derive the status from other info in the page resulted in wrong VM statistics in earlier split VM patchsets. The total number of page flags in use on a 32 bit machine after this patch is 19. [akpm@linux-foundation.org: fix up out-of-order merge fallout] [hugh@veritas.com: splitlru: shmem_getpage SetPageSwapBacked sooner[ Signed-off-by: Rik van Riel Signed-off-by: Lee Schermerhorn Signed-off-by: MinChan Kim Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index ad15b5ef2599..c07327487111 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -572,6 +572,8 @@ static int move_to_new_page(struct page *newpage, struct page *page) /* Prepare mapping for the new page.*/ newpage->index = page->index; newpage->mapping = page->mapping; + if (PageSwapBacked(page)) + SetPageSwapBacked(newpage); mapping = page_mapping(page); if (!mapping) -- cgit v1.2.3 From 894bc310419ac95f4fa4142dc364401a7e607f65 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Sat, 18 Oct 2008 20:26:39 -0700 Subject: Unevictable LRU Infrastructure When the system contains lots of mlocked or otherwise unevictable pages, the pageout code (kswapd) can spend lots of time scanning over these pages. Worse still, the presence of lots of unevictable pages can confuse kswapd into thinking that more aggressive pageout modes are required, resulting in all kinds of bad behaviour. Infrastructure to manage pages excluded from reclaim--i.e., hidden from vmscan. Based on a patch by Larry Woodman of Red Hat. Reworked to maintain "unevictable" pages on a separate per-zone LRU list, to "hide" them from vmscan. Kosaki Motohiro added the support for the memory controller unevictable lru list. Pages on the unevictable list have both PG_unevictable and PG_lru set. Thus, PG_unevictable is analogous to and mutually exclusive with PG_active--it specifies which LRU list the page is on. The unevictable infrastructure is enabled by a new mm Kconfig option [CONFIG_]UNEVICTABLE_LRU. A new function 'page_evictable(page, vma)' in vmscan.c tests whether or not a page may be evictable. Subsequent patches will add the various !evictable tests. We'll want to keep these tests light-weight for use in shrink_active_list() and, possibly, the fault path. To avoid races between tasks putting pages [back] onto an LRU list and tasks that might be moving the page from non-evictable to evictable state, the new function 'putback_lru_page()' -- inverse to 'isolate_lru_page()' -- tests the "evictability" of a page after placing it on the LRU, before dropping the reference. If the page has become unevictable, putback_lru_page() will redo the 'putback', thus moving the page to the unevictable list. This way, we avoid "stranding" evictable pages on the unevictable list. [akpm@linux-foundation.org: fix fallout from out-of-order merge] [riel@redhat.com: fix UNEVICTABLE_LRU and !PROC_PAGE_MONITOR build] [nishimura@mxp.nes.nec.co.jp: remove redundant mapping check] [kosaki.motohiro@jp.fujitsu.com: unevictable-lru-infrastructure: putback_lru_page()/unevictable page handling rework] [kosaki.motohiro@jp.fujitsu.com: kill unnecessary lock_page() in vmscan.c] [kosaki.motohiro@jp.fujitsu.com: revert migration change of unevictable lru infrastructure] [kosaki.motohiro@jp.fujitsu.com: revert to unevictable-lru-infrastructure-kconfig-fix.patch] [kosaki.motohiro@jp.fujitsu.com: restore patch failure of vmstat-unevictable-and-mlocked-pages-vm-events.patch] Signed-off-by: Lee Schermerhorn Signed-off-by: Rik van Riel Signed-off-by: KOSAKI Motohiro Debugged-by: Benjamin Kidwell Signed-off-by: Daisuke Nishimura Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index c07327487111..b10237d8b459 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -53,14 +53,9 @@ int migrate_prep(void) return 0; } -static inline void move_to_lru(struct page *page) -{ - lru_cache_add_lru(page, page_lru(page)); - put_page(page); -} - /* - * Add isolated pages on the list back to the LRU. + * Add isolated pages on the list back to the LRU under page lock + * to avoid leaking evictable pages back onto unevictable list. * * returns the number of pages put back. */ @@ -72,7 +67,7 @@ int putback_lru_pages(struct list_head *l) list_for_each_entry_safe(page, page2, l, lru) { list_del(&page->lru); - move_to_lru(page); + putback_lru_page(page); count++; } return count; @@ -354,8 +349,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page) SetPageReferenced(newpage); if (PageUptodate(page)) SetPageUptodate(newpage); - if (PageActive(page)) + if (TestClearPageActive(page)) { + VM_BUG_ON(PageUnevictable(page)); SetPageActive(newpage); + } else + unevictable_migrate_page(newpage, page); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) @@ -376,7 +374,6 @@ static void migrate_page_copy(struct page *newpage, struct page *page) #ifdef CONFIG_SWAP ClearPageSwapCache(page); #endif - ClearPageActive(page); ClearPagePrivate(page); set_page_private(page, 0); page->mapping = NULL; @@ -555,6 +552,10 @@ static int fallback_migrate_page(struct address_space *mapping, * * The new page will have replaced the old page if this function * is successful. + * + * Return value: + * < 0 - error code + * == 0 - success */ static int move_to_new_page(struct page *newpage, struct page *page) { @@ -617,9 +618,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!newpage) return -ENOMEM; - if (page_count(page) == 1) + if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ goto move_newpage; + } charge = mem_cgroup_prepare_migration(page, newpage); if (charge == -ENOMEM) { @@ -693,7 +695,6 @@ rcu_unlock: rcu_read_unlock(); unlock: - unlock_page(page); if (rc != -EAGAIN) { @@ -704,17 +705,19 @@ unlock: * restored. */ list_del(&page->lru); - move_to_lru(page); + putback_lru_page(page); } move_newpage: if (!charge) mem_cgroup_end_migration(newpage); + /* * Move the new page to the LRU. If migration was not successful * then this will free the page. */ - move_to_lru(newpage); + putback_lru_page(newpage); + if (result) { if (rc) *result = rc; -- cgit v1.2.3 From b291f000393f5a0b679012b39d79fbc85c018233 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 18 Oct 2008 20:26:44 -0700 Subject: mlock: mlocked pages are unevictable Make sure that mlocked pages also live on the unevictable LRU, so kswapd will not scan them over and over again. This is achieved through various strategies: 1) add yet another page flag--PG_mlocked--to indicate that the page is locked for efficient testing in vmscan and, optionally, fault path. This allows early culling of unevictable pages, preventing them from getting to page_referenced()/try_to_unmap(). Also allows separate accounting of mlock'd pages, as Nick's original patch did. Note: Nick's original mlock patch used a PG_mlocked flag. I had removed this in favor of the PG_unevictable flag + an mlock_count [new page struct member]. I restored the PG_mlocked flag to eliminate the new count field. 2) add the mlock/unevictable infrastructure to mm/mlock.c, with internal APIs in mm/internal.h. This is a rework of Nick's original patch to these files, taking into account that mlocked pages are now kept on unevictable LRU list. 3) update vmscan.c:page_evictable() to check PageMlocked() and, if vma passed in, the vm_flags. Note that the vma will only be passed in for new pages in the fault path; and then only if the "cull unevictable pages in fault path" patch is included. 4) add try_to_unlock() to rmap.c to walk a page's rmap and ClearPageMlocked() if no other vmas have it mlocked. Reuses as much of try_to_unmap() as possible. This effectively replaces the use of one of the lru list links as an mlock count. If this mechanism let's pages in mlocked vmas leak through w/o PG_mlocked set [I don't know that it does], we should catch them later in try_to_unmap(). One hopes this will be rare, as it will be relatively expensive. Original mm/internal.h, mm/rmap.c and mm/mlock.c changes: Signed-off-by: Nick Piggin splitlru: introduce __get_user_pages(): New munlock processing need to GUP_FLAGS_IGNORE_VMA_PERMISSIONS. because current get_user_pages() can't grab PROT_NONE pages theresore it cause PROT_NONE pages can't munlock. [akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch] [akpm@linux-foundation.org: untangle patch interdependencies] [akpm@linux-foundation.org: fix things after out-of-order merging] [hugh@veritas.com: fix page-flags mess] [lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm'] [kosaki.motohiro@jp.fujitsu.com: build fix] [kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments] [kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()] Signed-off-by: KOSAKI Motohiro Signed-off-by: Rik van Riel Signed-off-by: Lee Schermerhorn Cc: Nick Piggin Cc: Dave Hansen Cc: Matt Mackall Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index b10237d8b459..6802a7a3dfec 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -371,6 +371,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page) __set_page_dirty_nobuffers(newpage); } + mlock_migrate_page(newpage, page); + #ifdef CONFIG_SWAP ClearPageSwapCache(page); #endif -- cgit v1.2.3 From e78bbfa8262424417a29349a8064a535053912b9 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Sat, 18 Oct 2008 20:27:15 -0700 Subject: mm: stop returning -ENOENT from sys_move_pages() if nothing got migrated A patchset reworking sys_move_pages(). It removes the possibly large vmalloc by using multiple chunks when migrating large buffers. It also dramatically increases the throughput for large buffers since the lookup in new_page_node() is now limited to a single chunk, causing the quadratic complexity to have a much slower impact. There is no need to use any radix-tree-like structure to improve this lookup. sys_move_pages() duration on a 4-quadcore-opteron 2347HE (1.9Gz), migrating between nodes #2 and #3: length move_pages (us) move_pages+patch (us) 4kB 126 98 40kB 198 168 400kB 963 937 4MB 12503 11930 40MB 246867 11848 Patches #1 and #4 are the important ones: 1) stop returning -ENOENT from sys_move_pages() if nothing got migrated 2) don't vmalloc a huge page_to_node array for do_pages_stat() 3) extract do_pages_move() out of sys_move_pages() 4) rework do_pages_move() to work on page_sized chunks 5) move_pages: no need to set pp->page to ZERO_PAGE(0) by default This patch: There is no point in returning -ENOENT from sys_move_pages() if all pages were already on the right node, while we return 0 if only 1 page was not. Most application don't know where their pages are allocated, so it's not an error to try to migrate them anyway. Just return 0 and let the status array in user-space be checked if the application needs details. It will make the upcoming chunked-move_pages() support much easier. Signed-off-by: Brice Goglin Acked-by: Christoph Lameter Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 6802a7a3dfec..f233519f0453 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -896,11 +896,10 @@ set_status: pp->status = err; } + err = 0; if (!list_empty(&pagelist)) err = migrate_pages(&pagelist, new_page_node, (unsigned long)pm); - else - err = -ENOENT; up_read(&mm->mmap_sem); return err; -- cgit v1.2.3 From 2f007e74bb85b9fc4eab28524052161703300f1a Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Sat, 18 Oct 2008 20:27:16 -0700 Subject: mm: don't vmalloc a huge page_to_node array for do_pages_stat() do_pages_stat() does not need any page_to_node entry for real. Just pass the pointers to the user-space page address array and to the user-space status array, and have do_pages_stat() traverse the former and fill the latter directly. Signed-off-by: Brice Goglin Acked-by: Christoph Lameter Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index f233519f0453..a4c29081ebce 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -906,25 +906,33 @@ set_status: } /* - * Determine the nodes of a list of pages. The addr in the pm array - * must have been set to the virtual address of which we want to determine - * the node number. + * Determine the nodes of an array of pages and store it in an array of status. */ -static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) +static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, + const void __user * __user *pages, + int __user *status) { + unsigned long i; + int err; + down_read(&mm->mmap_sem); - for ( ; pm->node != MAX_NUMNODES; pm++) { + for (i = 0; i < nr_pages; i++) { + const void __user *p; + unsigned long addr; struct vm_area_struct *vma; struct page *page; - int err; err = -EFAULT; - vma = find_vma(mm, pm->addr); + if (get_user(p, pages+i)) + goto out; + addr = (unsigned long) p; + + vma = find_vma(mm, addr); if (!vma) goto set_status; - page = follow_page(vma, pm->addr, 0); + page = follow_page(vma, addr, 0); err = PTR_ERR(page); if (IS_ERR(page)) @@ -937,11 +945,13 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) err = page_to_nid(page); set_status: - pm->status = err; + put_user(err, status+i); } + err = 0; +out: up_read(&mm->mmap_sem); - return 0; + return err; } /* @@ -997,6 +1007,10 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, if (err) goto out2; + if (!nodes) { + err = do_pages_stat(mm, nr_pages, pages, status); + goto out2; + } task_nodes = cpuset_mems_allowed(task); @@ -1045,11 +1059,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, /* End marker */ pm[nr_pages].node = MAX_NUMNODES; - if (nodes) - err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); - else - err = do_pages_stat(mm, pm); - + err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); if (err >= 0) /* Return status information */ for (i = 0; i < nr_pages; i++) -- cgit v1.2.3 From 5e9a0f023bee02bfb94e08590d998660c01f5a49 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Sat, 18 Oct 2008 20:27:17 -0700 Subject: mm: extract do_pages_move() out of sys_move_pages() To prepare the chunking, move the sys_move_pages() code that is used when nodes!=NULL into do_pages_move(). And rename do_move_pages() into do_move_page_to_node_array(). Signed-off-by: Brice Goglin Acked-by: Christoph Lameter Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 152 +++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 86 insertions(+), 66 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index a4c29081ebce..11c6c56ec017 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -826,9 +826,11 @@ static struct page *new_page_node(struct page *p, unsigned long private, * Move a set of pages as indicated in the pm array. The addr * field must be set to the virtual address of the page to be moved * and the node number must contain a valid target node. + * The pm array ends with node = MAX_NUMNODES. */ -static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, - int migrate_all) +static int do_move_page_to_node_array(struct mm_struct *mm, + struct page_to_node *pm, + int migrate_all) { int err; struct page_to_node *pp; @@ -905,6 +907,81 @@ set_status: return err; } +/* + * Migrate an array of page address onto an array of nodes and fill + * the corresponding array of status. + */ +static int do_pages_move(struct mm_struct *mm, struct task_struct *task, + unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) +{ + struct page_to_node *pm = NULL; + nodemask_t task_nodes; + int err = 0; + int i; + + task_nodes = cpuset_mems_allowed(task); + + /* Limit nr_pages so that the multiplication may not overflow */ + if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { + err = -E2BIG; + goto out; + } + + pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); + if (!pm) { + err = -ENOMEM; + goto out; + } + + /* + * Get parameters from user space and initialize the pm + * array. Return various errors if the user did something wrong. + */ + for (i = 0; i < nr_pages; i++) { + const void __user *p; + + err = -EFAULT; + if (get_user(p, pages + i)) + goto out_pm; + + pm[i].addr = (unsigned long)p; + if (nodes) { + int node; + + if (get_user(node, nodes + i)) + goto out_pm; + + err = -ENODEV; + if (!node_state(node, N_HIGH_MEMORY)) + goto out_pm; + + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out_pm; + + pm[i].node = node; + } else + pm[i].node = 0; /* anything to not match MAX_NUMNODES */ + } + /* End marker */ + pm[nr_pages].node = MAX_NUMNODES; + + err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL); + if (err >= 0) + /* Return status information */ + for (i = 0; i < nr_pages; i++) + if (put_user(pm[i].status, status + i)) + err = -EFAULT; + +out_pm: + vfree(pm); +out: + return err; +} + /* * Determine the nodes of an array of pages and store it in an array of status. */ @@ -963,12 +1040,9 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, const int __user *nodes, int __user *status, int flags) { - int err = 0; - int i; struct task_struct *task; - nodemask_t task_nodes; struct mm_struct *mm; - struct page_to_node *pm = NULL; + int err; /* Check flags */ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) @@ -1000,75 +1074,21 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, (current->uid != task->suid) && (current->uid != task->uid) && !capable(CAP_SYS_NICE)) { err = -EPERM; - goto out2; + goto out; } err = security_task_movememory(task); if (err) - goto out2; + goto out; - if (!nodes) { + if (nodes) { + err = do_pages_move(mm, task, nr_pages, pages, nodes, status, + flags); + } else { err = do_pages_stat(mm, nr_pages, pages, status); - goto out2; - } - - task_nodes = cpuset_mems_allowed(task); - - /* Limit nr_pages so that the multiplication may not overflow */ - if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { - err = -E2BIG; - goto out2; } - pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); - if (!pm) { - err = -ENOMEM; - goto out2; - } - - /* - * Get parameters from user space and initialize the pm - * array. Return various errors if the user did something wrong. - */ - for (i = 0; i < nr_pages; i++) { - const void __user *p; - - err = -EFAULT; - if (get_user(p, pages + i)) - goto out; - - pm[i].addr = (unsigned long)p; - if (nodes) { - int node; - - if (get_user(node, nodes + i)) - goto out; - - err = -ENODEV; - if (!node_state(node, N_HIGH_MEMORY)) - goto out; - - err = -EACCES; - if (!node_isset(node, task_nodes)) - goto out; - - pm[i].node = node; - } else - pm[i].node = 0; /* anything to not match MAX_NUMNODES */ - } - /* End marker */ - pm[nr_pages].node = MAX_NUMNODES; - - err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); - if (err >= 0) - /* Return status information */ - for (i = 0; i < nr_pages; i++) - if (put_user(pm[i].status, status + i)) - err = -EFAULT; - out: - vfree(pm); -out2: mmput(mm); return err; } -- cgit v1.2.3 From b7abea9630bc8ffc663a751e46680db25c4cdf8d Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Sat, 18 Oct 2008 20:28:09 -0700 Subject: memcg: make page->mapping NULL before uncharge This patch tries to make page->mapping to be NULL before mem_cgroup_uncharge_cache_page() is called. "page->mapping == NULL" is a good check for "whether the page is still radix-tree or not". This patch also adds BUG_ON() to mem_cgroup_uncharge_cache_page(); Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 11c6c56ec017..6602941bfab0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -330,8 +330,6 @@ static int migrate_page_move_mapping(struct address_space *mapping, __inc_zone_page_state(newpage, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); - if (!PageSwapCache(newpage)) - mem_cgroup_uncharge_cache_page(page); return 0; } @@ -341,6 +339,8 @@ static int migrate_page_move_mapping(struct address_space *mapping, */ static void migrate_page_copy(struct page *newpage, struct page *page) { + int anon; + copy_highpage(newpage, page); if (PageError(page)) @@ -378,8 +378,13 @@ static void migrate_page_copy(struct page *newpage, struct page *page) #endif ClearPagePrivate(page); set_page_private(page, 0); + /* page->mapping contains a flag for PageAnon() */ + anon = PageAnon(page); page->mapping = NULL; + if (!anon) /* This page was removed from radix-tree. */ + mem_cgroup_uncharge_cache_page(page); + /* * If any waiters have accumulated on the new page then * wake them up. -- cgit v1.2.3 From 0aedadf91a70a11c4a3e7c7d99b21e5528af8d5d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 6 Nov 2008 12:53:30 -0800 Subject: mm: move migrate_prep out from under mmap_sem Move the migrate_prep outside the mmap_sem for the following system calls 1. sys_move_pages 2. sys_migrate_pages 3. sys_mbind() It really does not matter when we flush the lru. The system is free to add pages onto the lru even during migration which will make the page migration either skip the page (mbind, migrate_pages) or return a busy state (move_pages). Fixes this lockdep warning (and potential deadlock): Some VM place has mmap_sem -> kevent_wq via lru_add_drain_all() net/core/dev.c::dev_ioctl() has rtnl_lock -> mmap_sem (*) the ioctl has copy_from_user() and it can do page fault. linkwatch_event has kevent_wq -> rtnl_lock Signed-off-by: Christoph Lameter Cc: KOSAKI Motohiro Reported-by: Heiko Carstens Cc: Nick Piggin Cc: Hugh Dickins Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 6602941bfab0..385db89f0c33 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -841,12 +841,12 @@ static int do_move_page_to_node_array(struct mm_struct *mm, struct page_to_node *pp; LIST_HEAD(pagelist); + migrate_prep(); down_read(&mm->mmap_sem); /* * Build a list of pages to migrate */ - migrate_prep(); for (pp = pm; pp->node != MAX_NUMNODES; pp++) { struct vm_area_struct *vma; struct page *page; -- cgit v1.2.3 From 76aac0e9a17742e60d408be1a706e9aaad370891 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:12 +1100 Subject: CRED: Wrap task credential accesses in the core kernel Wrap access to task credentials so that they can be separated more easily from the task_struct during the introduction of COW creds. Change most current->(|e|s|fs)[ug]id to current_(|e|s|fs)[ug]id(). Change some task->e?[ug]id to task_e?[ug]id(). In some places it makes more sense to use RCU directly rather than a convenient wrapper; these will be addressed by later patches. Signed-off-by: David Howells Reviewed-by: James Morris Acked-by: Serge Hallyn Cc: Al Viro Cc: linux-audit@redhat.com Cc: containers@lists.linux-foundation.org Cc: linux-mm@kvack.org Signed-off-by: James Morris --- mm/migrate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 6602941bfab0..6263c24c4afe 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1048,6 +1048,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, struct task_struct *task; struct mm_struct *mm; int err; + uid_t uid, euid; /* Check flags */ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) @@ -1075,8 +1076,10 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, * capabilities, superuser privileges or the same * userid as the target process. */ - if ((current->euid != task->suid) && (current->euid != task->uid) && - (current->uid != task->suid) && (current->uid != task->uid) && + uid = current_uid(); + euid = current_euid(); + if (euid != task->suid && euid != task->uid && + uid != task->suid && uid != task->uid && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; -- cgit v1.2.3 From b6dff3ec5e116e3af6f537d4caedcad6b9e5082a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:16 +1100 Subject: CRED: Separate task security context from task_struct Separate the task security context from task_struct. At this point, the security data is temporarily embedded in the task_struct with two pointers pointing to it. Note that the Alpha arch is altered as it refers to (E)UID and (E)GID in entry.S via asm-offsets. With comment fixes Signed-off-by: Marc Dionne Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- mm/migrate.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 6263c24c4afe..794443da1b4f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1045,10 +1045,10 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, const int __user *nodes, int __user *status, int flags) { + struct cred *cred, *tcred; struct task_struct *task; struct mm_struct *mm; int err; - uid_t uid, euid; /* Check flags */ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) @@ -1076,10 +1076,10 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, * capabilities, superuser privileges or the same * userid as the target process. */ - uid = current_uid(); - euid = current_euid(); - if (euid != task->suid && euid != task->uid && - uid != task->suid && uid != task->uid && + cred = current->cred; + tcred = task->cred; + if (cred->euid != tcred->suid && cred->euid != tcred->uid && + cred->uid != tcred->suid && cred->uid != tcred->uid && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; -- cgit v1.2.3 From c69e8d9c01db2adc503464993c358901c9af9de4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:19 +1100 Subject: CRED: Use RCU to access another task's creds and to release a task's own creds Use RCU to access another task's creds and to release a task's own creds. This means that it will be possible for the credentials of a task to be replaced without another task (a) requiring a full lock to read them, and (b) seeing deallocated memory. Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- mm/migrate.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 794443da1b4f..142284229ce2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1045,7 +1045,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, const int __user *nodes, int __user *status, int flags) { - struct cred *cred, *tcred; + const struct cred *cred = current_cred(), *tcred; struct task_struct *task; struct mm_struct *mm; int err; @@ -1076,14 +1076,16 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, * capabilities, superuser privileges or the same * userid as the target process. */ - cred = current->cred; - tcred = task->cred; + rcu_read_lock(); + tcred = __task_cred(task); if (cred->euid != tcred->suid && cred->euid != tcred->uid && cred->uid != tcred->suid && cred->uid != tcred->uid && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); err = -EPERM; goto out; } + rcu_read_unlock(); err = security_task_movememory(task); if (err) -- cgit v1.2.3 From bda8550deed96687f29992d711a88ea21cff4d26 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 19 Nov 2008 15:36:36 -0800 Subject: migration: fix writepage error Page migration's writeout() has got understandably confused by the nasty AOP_WRITEPAGE_ACTIVATE case: as in normal success, a writepage() error has unlocked the page, so writeout() then needs to relock it. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 385db89f0c33..1e0d6b237f44 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -522,15 +522,12 @@ static int writeout(struct address_space *mapping, struct page *page) remove_migration_ptes(page, page); rc = mapping->a_ops->writepage(page, &wbc); - if (rc < 0) - /* I/O Error writing */ - return -EIO; if (rc != AOP_WRITEPAGE_ACTIVATE) /* unlocked. Relock */ lock_page(page); - return -EAGAIN; + return (rc < 0) ? -EIO : -EAGAIN; } /* -- cgit v1.2.3 From 80bba1290ab5122c60cdb73332b26d288dc8aedd Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Tue, 9 Dec 2008 13:14:23 -0800 Subject: mm: no get_user/put_user while holding mmap_sem in do_pages_stat? Since commit 2f007e74bb85b9fc4eab28524052161703300f1a, do_pages_stat() gets the page address from user-space and puts the corresponding status back while holding the mmap_sem for read. There is no need to hold mmap_sem there while some page-faults may occur. This patch adds a temporary address and status buffer so as to only hold mmap_sem while working on these kernel buffers. This is implemented by extracting do_pages_stat_array() out of do_pages_stat(). Signed-off-by: Brice Goglin Cc: Christoph Lameter Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 13 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 1e0d6b237f44..d8f07667fc80 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -987,25 +987,18 @@ out: /* * Determine the nodes of an array of pages and store it in an array of status. */ -static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, - const void __user * __user *pages, - int __user *status) +static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, + const void __user **pages, int *status) { unsigned long i; - int err; down_read(&mm->mmap_sem); for (i = 0; i < nr_pages; i++) { - const void __user *p; - unsigned long addr; + unsigned long addr = (unsigned long)(*pages); struct vm_area_struct *vma; struct page *page; - - err = -EFAULT; - if (get_user(p, pages+i)) - goto out; - addr = (unsigned long) p; + int err; vma = find_vma(mm, addr); if (!vma) @@ -1024,12 +1017,52 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, err = page_to_nid(page); set_status: - put_user(err, status+i); + *status = err; + + pages++; + status++; + } + + up_read(&mm->mmap_sem); +} + +/* + * Determine the nodes of a user array of pages and store it in + * a user array of status. + */ +static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, + const void __user * __user *pages, + int __user *status) +{ +#define DO_PAGES_STAT_CHUNK_NR 16 + const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; + int chunk_status[DO_PAGES_STAT_CHUNK_NR]; + unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR; + int err; + + for (i = 0; i < nr_pages; i += chunk_nr) { + if (chunk_nr + i > nr_pages) + chunk_nr = nr_pages - i; + + err = copy_from_user(chunk_pages, &pages[i], + chunk_nr * sizeof(*chunk_pages)); + if (err) { + err = -EFAULT; + goto out; + } + + do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); + + err = copy_to_user(&status[i], chunk_status, + chunk_nr * sizeof(*chunk_status)); + if (err) { + err = -EFAULT; + goto out; + } } err = 0; out: - up_read(&mm->mmap_sem); return err; } -- cgit v1.2.3 From c095adbc211f9f4e990eac7d6cb440de35e4f05f Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 16 Dec 2008 16:06:43 +0900 Subject: mm: Don't touch uninitialized variable in do_pages_stat_array() Commit 80bba1290ab5122c60cdb73332b26d288dc8aedd removed one necessary variable initialization. As a result following warning happened: CC mm/migrate.o mm/migrate.c: In function 'sys_move_pages': mm/migrate.c:1001: warning: 'err' may be used uninitialized in this function More unfortunately, if find_vma() failed, kernel read uninitialized memory. Signed-off-by: KOSAKI Motohiro CC: Brice Goglin Cc: Christoph Lameter Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Cc: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index d8f07667fc80..037b0967c1e3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -998,7 +998,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, unsigned long addr = (unsigned long)(*pages); struct vm_area_struct *vma; struct page *page; - int err; + int err = -EFAULT; vma = find_vma(mm, addr); if (!vma) -- cgit v1.2.3