From 5853ff23c2f0f6c87a859e7f882eac3300b329a0 Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan"
Date: Mon, 25 Mar 2013 15:47:38 -0700
Subject: mm: export split_page()

This symbol will be used in the Hyper-V balloon driver to support 2M
allocations.

Signed-off-by: K. Y. Srinivasan
Acked-by: Michal Hocko
Signed-off-by: Andrew Morton
Signed-off-by: Greg Kroah-Hartman
---
 mm/page_alloc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8fcced7823fa..7ff1536f01b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1397,6 +1397,7 @@ void split_page(struct page *page, unsigned int order)
 	for (i = 1; i < (1 << order); i++)
 		set_page_refcounted(page + i);
 }
+EXPORT_SYMBOL_GPL(split_page);

 static int __isolate_free_page(struct page *page, unsigned int order)
 {
-- cgit v1.2.3

From 4b59e6c4730978679b414a8da61514a2518da512 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Mon, 29 Apr 2013 15:06:11 -0700
Subject: mm, show_mem: suppress page counts in non-blockable contexts

On large systems with a lot of memory, walking all RAM to determine page
types may take a half second or even more.

In non-blockable contexts, the page allocator will emit a page allocation
failure warning unless __GFP_NOWARN is specified.  In such contexts, irqs
are typically disabled and such a lengthy delay may even result in NMI
watchdog timeouts.

To fix this, suppress the page walk in such contexts when printing the
page allocation failure warning.

Signed-off-by: David Rientjes
Cc: Mel Gorman
Acked-by: Michal Hocko
Cc: Dave Hansen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7ff1536f01b8..da7a2fe7332e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2002,6 +2002,13 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
 	    debug_guardpage_minorder() > 0)
 		return;

+	/*
+	 * Walking all memory to count page types is very expensive and should
+	 * be inhibited in non-blockable contexts.
+	 */
+	if (!(gfp_mask & __GFP_WAIT))
+		filter |= SHOW_MEM_FILTER_PAGE_COUNT;
+
 	/*
 	 * This documents exceptions given to allocations in certain
 	 * contexts that are allowed to allocate outside current's set
-- cgit v1.2.3
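
The split_page() export above matters because, without it, the symbol is
invisible to loadable modules.  As a rough illustration of the intended
usage pattern -- a hypothetical helper, not code from the Hyper-V balloon
driver itself -- a GPL module can allocate a non-compound 2M block and
split it so the 512 constituent 4K pages can be handed out or freed
individually:

#include <linux/gfp.h>
#include <linux/mm.h>

/*
 * Hypothetical sketch: allocate a 2M block (order 9 on 4K-page systems)
 * without __GFP_COMP, then split it so every constituent page gets its
 * own reference count and can later be freed one at a time with
 * __free_page().  Needs a GPL-compatible module licence, since the
 * symbol is exported with EXPORT_SYMBOL_GPL.
 */
static struct page *alloc_2m_and_split(void)
{
	unsigned int order = get_order(2 * 1024 * 1024);
	struct page *page = alloc_pages(GFP_KERNEL | __GFP_NOWARN, order);

	if (!page)
		return NULL;

	split_page(page, order);	/* now 1 << order independent pages */
	return page;
}
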
From 69afade72a3e13e96a065f757891d384d466123f Mon Sep 17 00:00:00 2001
From: Jiang Liu
Date: Mon, 29 Apr 2013 15:06:21 -0700
Subject: mm: introduce common help functions to deal with reserved/managed pages

The original goal of this patchset is to fix the bug reported by

  https://bugzilla.kernel.org/show_bug.cgi?id=53501

Now it has also been expanded to reduce common code used by memory
initialization.

This is the first part, which applies to v3.9-rc1.  It introduces the
following common helper functions to simplify free_initmem() and
free_initrd_mem() on different architectures:

adjust_managed_page_count():
	will be used to adjust totalram_pages, totalhigh_pages and
	zone->managed_pages when reserving/unreserving a page.

__free_reserved_page():
	free a reserved page into the buddy system without adjusting
	page statistics info

free_reserved_page():
	free a reserved page into the buddy system and adjust page
	statistics info

mark_page_reserved():
	mark a page as reserved and adjust page statistics info

free_reserved_area():
	free a contiguous range of pages by calling free_reserved_page()

free_initmem_default():
	default method to free __init pages.

We have only tested this patchset on x86 platforms, and have done basic
compilation tests using cross-compilers from ftp.kernel.org.  That means
some code may not pass compilation on some architectures.  So any help
testing this patchset is welcome!

There are several other parts still under development:
Part2: introduce free_highmem_page() to simplify freeing highmem pages
Part3: refine code to manage totalram_pages, totalhigh_pages and
	zone->managed_pages
Part4: introduce helper functions to simplify mem_init() and remove the
	global variable num_physpages.

This patch:

Code to deal with reserved/managed pages is duplicated by many
architectures, so introduce common helper functions to reduce the
duplication.  These common helpers will also be used to concentrate the
code that modifies totalram_pages and zone->managed_pages, which makes
that code much clearer.

Signed-off-by: Jiang Liu
Acked-by: Geert Uytterhoeven
Cc: "H. Peter Anvin"
Cc: "James E.J. Bottomley"
Cc: Anatolij Gustschin
Cc: Aurelien Jacquiot
Cc: Benjamin Herrenschmidt
Cc: Catalin Marinas
Cc: Chen Liqin
Cc: Chris Zankel
Cc: David Howells
Cc: David S. Miller
Cc: Eric Biederman
Cc: Fenghua Yu
Cc: Guan Xuetao
Cc: Haavard Skinnemoen
Cc: Hans-Christian Egtvedt
Cc: Heiko Carstens
Cc: Helge Deller
Cc: Hirokazu Takata
Cc: Ingo Molnar
Cc: Ivan Kokshaysky
Cc: James Hogan
Cc: Jeff Dike
Cc: Jiang Liu
Cc: Jiang Liu
Cc: Jonas Bonn
Cc: Koichi Yasutake
Cc: Lennox Wu
Cc: Mark Salter
Cc: Martin Schwidefsky
Cc: Matt Turner
Cc: Max Filippov
Cc: Michal Simek
Cc: Mikael Starvik
Cc: Mike Frysinger
Cc: Paul Mackerras
Cc: Paul Mundt
Cc: Ralf Baechle
Cc: Richard Henderson
Cc: Russell King
Cc: Sam Ravnborg
Cc: Thomas Gleixner
Cc: Tony Luck
Cc: Vineet Gupta
Cc: Will Deacon
Cc: Yoshinori Sato
Cc: Zhang Yanfei
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index da7a2fe7332e..5c660f5ba3d3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5121,6 +5121,26 @@ early_param("movablecore", cmdline_parse_movablecore);

 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */

+unsigned long free_reserved_area(unsigned long start, unsigned long end,
+				 int poison, char *s)
+{
+	unsigned long pages, pos;
+
+	pos = start = PAGE_ALIGN(start);
+	end &= PAGE_MASK;
+	for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) {
+		if (poison)
+			memset((void *)pos, poison, PAGE_SIZE);
+		free_reserved_page(virt_to_page(pos));
+	}
+
+	if (pages && s)
+		pr_info("Freeing %s memory: %ldK (%lx - %lx)\n",
+			s, pages << (PAGE_SHIFT - 10), start, end);
+
+	return pages;
+}
+
 /**
  * set_dma_reserve - set the specified number of pages reserved in the first zone
  * @new_dma_reserve: The number of pages to mark reserved
-- cgit v1.2.3
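
To show how free_reserved_area() above is meant to be consumed, here is a
sketch of a converted free_initmem() -- not taken from any particular
architecture, and assuming the usual __init_begin/__init_end linker-script
symbols from <asm/sections.h> and the declaration of the helper exported
via <linux/mm.h> as the changelog describes:

#include <linux/mm.h>
#include <asm/sections.h>	/* __init_begin, __init_end */

/*
 * Hypothetical converted free_initmem(): hand the whole .init region
 * back to the buddy allocator.  A poison value of 0 skips the memset();
 * a non-zero value would scribble over the pages before freeing them.
 * The helper prints "Freeing unused kernel memory: ...K (...)".
 */
void free_initmem(void)
{
	free_reserved_area((unsigned long)__init_begin,
			   (unsigned long)__init_end,
			   0, "unused kernel");
}
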
From cfa11e08ed39eb28a9eff9a907b20913020c69b5 Mon Sep 17 00:00:00 2001
From: Jiang Liu
Date: Mon, 29 Apr 2013 15:07:00 -0700
Subject: mm: introduce free_highmem_page() helper to free highmem pages into buddy system

The original goal of this patchset is to fix the bug reported by

  https://bugzilla.kernel.org/show_bug.cgi?id=53501

Now it has also been expanded to reduce common code used by memory
initialization.

This is the second part, which applies on top of the previous part at:

  http://marc.info/?l=linux-mm&m=136289696323825&w=2

It introduces a helper function free_highmem_page() to free highmem pages
into the buddy system when initializing the mm subsystem.

Introduction of free_highmem_page() is one step forward to clean up
accesses and modifications of totalhigh_pages, totalram_pages and
zone->managed_pages etc.  I hope we can eventually remove all references
to totalhigh_pages from the arch/ subdirectory.

We have only tested this patchset on x86 platforms, and have done basic
compilation tests using cross-compilers from ftp.kernel.org.  That means
some code may not pass compilation on some architectures.  So any help
testing this patchset is welcome!

There are several other parts still under development:
Part3: refine code to manage totalram_pages, totalhigh_pages and
	zone->managed_pages
Part4: introduce helper functions to simplify mem_init() and remove the
	global variable num_physpages.

This patch:

Introduce helper function free_highmem_page(), which will be used by
architectures with HIGHMEM enabled to free highmem pages into the buddy
system.

Signed-off-by: Jiang Liu
Cc: "David S. Miller"
Cc: "H. Peter Anvin"
Cc: "Suzuki K. Poulose"
Cc: Alexander Graf
Cc: Arnd Bergmann
Cc: Attilio Rao
Cc: Benjamin Herrenschmidt
Cc: Cong Wang
Cc: David Daney
Cc: David Howells
Cc: Geert Uytterhoeven
Cc: Ingo Molnar
Cc: James Hogan
Cc: Jeff Dike
Cc: Jiang Liu
Cc: Jiang Liu
Cc: Konrad Rzeszutek Wilk
Cc: Konstantin Khlebnikov
Cc: Linus Walleij
Cc: Marek Szyprowski
Cc: Mel Gorman
Cc: Michal Nazarewicz
Cc: Michal Simek
Cc: Michel Lespinasse
Cc: Minchan Kim
Cc: Paul Mackerras
Cc: Ralf Baechle
Cc: Richard Weinberger
Cc: Rik van Riel
Cc: Russell King
Cc: Sam Ravnborg
Cc: Stephen Boyd
Cc: Thomas Gleixner
Cc: Yinghai Lu
Reviewed-by: Pekka Enberg
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5c660f5ba3d3..72da11c6804d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5141,6 +5141,15 @@ unsigned long free_reserved_area(unsigned long start, unsigned long end,
 	return pages;
 }

+#ifdef CONFIG_HIGHMEM
+void free_highmem_page(struct page *page)
+{
+	__free_reserved_page(page);
+	totalram_pages++;
+	totalhigh_pages++;
+}
+#endif
+
 /**
  * set_dma_reserve - set the specified number of pages reserved in the first zone
  * @new_dma_reserve: The number of pages to mark reserved
-- cgit v1.2.3
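
On the arch side, the conversions this helper enables boil down to a
per-pfn loop during mem_init().  A minimal sketch -- hypothetical pfn
bounds, glossing over the pfn_valid() and reserved-region checks a real
port has to keep -- looks like this:

#include <linux/init.h>
#include <linux/mm.h>

/*
 * Hypothetical arch mem_init() fragment: give every page in the highmem
 * pfn range to the buddy allocator via the new helper.  A real
 * conversion must skip invalid pfns and pages that have to stay
 * reserved (e.g. memblock reservations).
 */
static void __init free_highpages(unsigned long highstart_pfn,
				  unsigned long highend_pfn)
{
	unsigned long pfn;

	for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
		free_highmem_page(pfn_to_page(pfn));
}
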
From 949f7ec5760b021da3cccc1eaeb0671270e4238f Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Mon, 29 Apr 2013 15:07:48 -0700
Subject: mm, hugetlb: include hugepages in meminfo

Particularly in oom conditions, it's troublesome that hugetlb memory is
not displayed.  All other meminfo that is emitted will not add up to what
is expected, and there is no artifact left in the kernel log to show that
a potentially significant amount of memory is actually allocated as
hugepages which are not available to be reclaimed.

Booting with hugepages=8192 on the command line, this memory is now shown
in oom conditions.  For example, with echo m > /proc/sysrq-trigger:

Node 0 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB
Node 1 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB
Node 2 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB
Node 3 hugepages_total=2048 hugepages_free=2048 hugepages_surp=0 hugepages_size=2048kB

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: David Rientjes
Acked-by: Michal Hocko
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 72da11c6804d..7350986bbf99 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -58,6 +58,7 @@
 #include 
 #include 
 #include 
+#include <linux/hugetlb.h>
 #include 
 #include 

@@ -3113,6 +3114,8 @@ void show_free_areas(unsigned int filter)
 			printk("= %lukB\n", K(total));
 	}

+	hugetlb_show_meminfo();
+
 	printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));

 	show_swap_cache_info();
-- cgit v1.2.3
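
The helper called here is added to mm/hugetlb.c by the companion hugetlb
patch; conceptually it is a nested loop over nodes with memory and the
registered hugepage sizes.  The sketch below is a simplified
reconstruction of that idea, not the verbatim implementation, and assumes
the struct hstate per-node counters and for_each_hstate() iterator from
<linux/hugetlb.h>:

#include <linux/hugetlb.h>
#include <linux/nodemask.h>
#include <linux/printk.h>

/*
 * Simplified sketch of a hugetlb meminfo dump (the real helper is
 * hugetlb_show_meminfo() in mm/hugetlb.c and may differ in detail):
 * one line per node and per hugepage size, matching the format quoted
 * in the changelog above.
 */
static void hugetlb_meminfo_sketch(void)
{
	struct hstate *h;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		for_each_hstate(h)
			pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
				nid,
				h->nr_huge_pages_node[nid],
				h->free_huge_pages_node[nid],
				h->surplus_huge_pages_node[nid],
				huge_page_size(h) >> 10);
}
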
From fed2719e7a8612471bd17113ed326d38df434f17 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Mon, 29 Apr 2013 15:07:57 -0700
Subject: mm: page_alloc: avoid marking zones full prematurely after zone_reclaim()

The following problem was reported against a distribution kernel when
zone_reclaim was enabled, but the same problem applies to the mainline
kernel.  The reproduction case was as follows

1. Run numactl -m +0 dd if=largefile of=/dev/null
   This allocates a large number of clean pages in node 0.

2. numactl -N +0 memhog 0.5*Mg
   This starts a memory-using application in node 0.

The expected behaviour is that the clean pages get reclaimed and the
application uses node 0 for its memory.  The observed behaviour was that
the memory for the memhog application was allocated off-node, since
commits cd38b115d5ad ("mm: page allocator: initialise ZLC for first zone
eligible for zone_reclaim") and 76d3fbf8fbf6 ("mm: page allocator:
reconsider zones for allocation after direct reclaim").

The assumption of those patches was that it was always preferable to
allocate quickly than stall for long periods of time, and they were meant
to take care that the zone was only marked full when necessary, but an
important case was missed.

In the allocator fast path, only the low watermarks are checked.  If the
zone's free pages are between the low and min watermarks then allocations
from the allocator's slow path will succeed.  However, zone_reclaim will
only reclaim SWAP_CLUSTER_MAX or 1<<order pages, which is not necessarily
enough to meet the low watermark, so the zone could be marked full
prematurely.  Only mark the zone full after zone_reclaim() if the min
watermark is being checked or if reclaim failed to free the requested
number of pages.

Reported-by: Hedi Berriche
Tested-by: Hedi Berriche
Reviewed-by: Michal Hocko
Reviewed-by: Wanpeng Li
Signed-off-by: Michal Hocko
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7350986bbf99..b54c5cbf0200 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1942,9 +1942,24 @@ zonelist_scan:
 				continue;
 		default:
 			/* did we reclaim enough */
-			if (!zone_watermark_ok(zone, order, mark,
+			if (zone_watermark_ok(zone, order, mark,
 					classzone_idx, alloc_flags))
+				goto try_this_zone;
+
+			/*
+			 * Failed to reclaim enough to meet watermark.
+			 * Only mark the zone full if checking the min
+			 * watermark or if we failed to reclaim just
+			 * 1<<order pages
-- cgit v1.2.3

From: Russ Anderson
Date: Mon, 29 Apr 2013 15:07:59 -0700
Subject: mm: speedup in __early_pfn_to_nid

When booting on a large memory system, the kernel spends considerable
time in memmap_init_zone() setting up memory zones.  Analysis shows
significant time spent in __early_pfn_to_nid().

The routine memmap_init_zone() checks each PFN to verify the nid is
valid.  __early_pfn_to_nid() sequentially scans the list of pfn ranges to
find the right range and returns the nid.  This does not scale well.  On
a 4 TB (single rack) system there are 308 memory ranges to scan.  The
higher the PFN, the more time is spent sequentially spinning through the
memory ranges.

Since memmap_init_zone() increments pfn, it will almost always be looking
for the same range as the previous pfn, so check that range first.  If it
is in the same range, return that nid.  If not, scan the list as before.

A 4 TB (single rack) UV1 system takes 512 seconds to get through the zone
code.  This performance optimization reduces the time by 189 seconds, a
36% improvement.  A 2 TB (single rack) UV2 system goes from 212.7 seconds
to 99.8 seconds, a 112.9 second (53%) reduction.

[akpm@linux-foundation.org: make the statics __meminitdata]
[akpm@linux-foundation.org: fix comment formatting]
[akpm@linux-foundation.org: fix ia64, per yinghai]
[akpm@linux-foundation.org: add missing semicolon, per Tony]
Signed-off-by: Russ Anderson
Cc: David Rientjes
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: "H. Peter Anvin"
Tested-by: "Luck, Tony"
Cc: Yinghai Lu
Cc: Lin Feng
Cc: KOSAKI Motohiro
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b54c5cbf0200..5a234b64f3ac 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4187,10 +4187,23 @@ int __meminit __early_pfn_to_nid(unsigned long pfn)
 {
 	unsigned long start_pfn, end_pfn;
 	int i, nid;
+	/*
+	 * NOTE: The following SMP-unsafe globals are only used early in boot
+	 * when the kernel is running single-threaded.
+	 */
+	static unsigned long __meminitdata last_start_pfn, last_end_pfn;
+	static int __meminitdata last_nid;
+
+	if (last_start_pfn <= pfn && pfn < last_end_pfn)
+		return last_nid;

 	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
-		if (start_pfn <= pfn && pfn < end_pfn)
+		if (start_pfn <= pfn && pfn < end_pfn) {
+			last_start_pfn = start_pfn;
+			last_end_pfn = end_pfn;
+			last_nid = nid;
 			return nid;
+		}
 	/* This is a memory hole */
 	return -1;
 }
-- cgit v1.2.3
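
Stripped of the kernel specifics, the __early_pfn_to_nid() change is the
classic "remember the last hit" optimization for monotonically increasing
lookups over a sorted range table.  A self-contained, non-kernel rendering
of the same pattern (hypothetical names, and, like the kernel version, not
safe for concurrent callers):

#include <stddef.h>

struct range {
	unsigned long start, end;	/* half-open interval [start, end) */
	int id;
};

/*
 * Generic form of the optimization above: a query that falls in the same
 * range as the previous one returns immediately; otherwise fall back to
 * the linear scan and remember the new hit for next time.
 */
static int range_lookup(const struct range *tbl, size_t n, unsigned long key)
{
	static unsigned long last_start, last_end;
	static int last_id = -1;
	size_t i;

	if (last_id >= 0 && last_start <= key && key < last_end)
		return last_id;

	for (i = 0; i < n; i++) {
		if (tbl[i].start <= key && key < tbl[i].end) {
			last_start = tbl[i].start;
			last_end = tbl[i].end;
			last_id = tbl[i].id;
			return last_id;
		}
	}
	return -1;	/* not found (a "memory hole" in the kernel case) */
}
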
From f9872caf07c1c774034b8bddde7d4a3a7f4a6484 Mon Sep 17 00:00:00 2001
From: Cody P Schafer
Date: Mon, 29 Apr 2013 15:08:01 -0700
Subject: page_alloc: make setup_nr_node_ids() usable for arch init code

powerpc and x86 were open-coding copies of setup_nr_node_ids(), which
page_alloc provides but makes static.  Make it available to the archs in
linux/mm.h.

Signed-off-by: Cody P Schafer
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5a234b64f3ac..98cbdf6e5532 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4749,7 +4749,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 /*
  * Figure out the number of possible node ids.
  */
-static void __init setup_nr_node_ids(void)
+void __init setup_nr_node_ids(void)
 {
 	unsigned int node;
 	unsigned int highest = 0;
@@ -4758,10 +4758,6 @@ static void __init setup_nr_node_ids(void)
 		highest = node;
 	nr_node_ids = highest + 1;
 }
-#else
-static inline void setup_nr_node_ids(void)
-{
-}
 #endif

 /**
-- cgit v1.2.3

From bb3ec6b08396bbd631b6441102dd1c3d89cbc576 Mon Sep 17 00:00:00 2001
From: Ralf Baechle
Date: Wed, 22 May 2013 12:18:47 +0200
Subject: mm: Fix virt_to_page() warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

virt_to_page() is typically implemented as a macro containing a cast so
that it will accept both pointers and unsigned long without causing a
warning.

But MIPS virt_to_page() uses virt_to_phys(), which is a function, so
passing an unsigned long will cause a warning:

    CC      mm/page_alloc.o
  mm/page_alloc.c: In function ‘free_reserved_area’:
  mm/page_alloc.c:5161:3: warning: passing argument 1 of ‘virt_to_phys’ makes pointer from integer without a cast [enabled by default]
  arch/mips/include/asm/io.h:119:100: note: expected ‘const volatile void *’ but argument is of type ‘long unsigned int’

All other users of virt_to_page() in mm/ are passing a void *.

Signed-off-by: Ralf Baechle
Reported-by: Eunbong Song
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: linux-mips@linux-mips.org
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 98cbdf6e5532..378a15bcd649 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5158,7 +5158,7 @@ unsigned long free_reserved_area(unsigned long start, unsigned long end,
 	for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) {
 		if (poison)
 			memset((void *)pos, poison, PAGE_SIZE);
-		free_reserved_page(virt_to_page(pos));
+		free_reserved_page(virt_to_page((void *)pos));
 	}

 	if (pages && s)
-- cgit v1.2.3
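
The warning class fixed above is easy to reproduce outside the kernel: a
macro that merely forwards its argument to a function taking a pointer
will warn when handed an integer, while an explicit cast at the call site
(as in the fix) silences it.  A stand-alone illustration with made-up
names, not kernel code:

/*
 * fake_virt_to_phys() stands in for the MIPS virt_to_phys() function
 * with a pointer parameter; fake_virt_to_page() stands in for a
 * virt_to_page() macro that adds no cast of its own.
 */
static unsigned long fake_virt_to_phys(const volatile void *address)
{
	return (unsigned long)address;
}

#define fake_virt_to_page(kaddr)	fake_virt_to_phys(kaddr)

void demo(unsigned long pos)
{
	fake_virt_to_page((void *)pos);	/* OK: explicit cast, as in the fix */
	/*
	 * fake_virt_to_page(pos); would warn:
	 * "passing argument 1 ... makes pointer from integer without a cast"
	 */
}
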