From 58c2ee4007bea04cc37041fcbd380fadb7b7be82 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Tue, 15 Mar 2011 10:59:02 +0530 Subject: mm: Fix section mismatch for setup_zone_pageset() build_all_zonelists() which is not __meminit, calls setup_zone_pageset(). Signed-off-by: Nikanth Karthikesan Signed-off-by: Jiri Kosina --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7945247b1e53..48c9737ad49a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3448,7 +3448,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pcp->batch = PAGE_SHIFT * 8; } -static __meminit void setup_zone_pageset(struct zone *zone) +static void setup_zone_pageset(struct zone *zone) { int cpu; -- cgit v1.2.3 From 8f389a99b652aab5b42297280bd94d95933ad12f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 May 2011 15:13:32 -0700 Subject: mm: use alloc_bootmem_node_nopanic() on really needed path Stefan found nobootmem does not work on his system that has only 8M of RAM. This causes an early panic: BIOS-provided physical RAM map: BIOS-88: 0000000000000000 - 000000000009f000 (usable) BIOS-88: 0000000000100000 - 0000000000840000 (usable) bootconsole [earlyser0] enabled Notice: NX (Execute Disable) protection missing in CPU or disabled in BIOS! DMI not present or invalid. last_pfn = 0x840 max_arch_pfn = 0x100000 init_memory_mapping: 0000000000000000-0000000000840000 8MB LOWMEM available. mapped low ram: 0 - 00840000 low ram: 0 - 00840000 Zone PFN ranges: DMA 0x00000001 -> 0x00001000 Normal empty Movable zone start PFN for each node early_node_map[2] active PFN ranges 0: 0x00000001 -> 0x0000009f 0: 0x00000100 -> 0x00000840 BUG: Int 6: CR2 (null) EDI c034663c ESI (null) EBP c0329f38 ESP c0329ef4 EBX c0346380 EDX 00000006 ECX ffffffff EAX fffffff4 err (null) EIP c0353191 CS c0320060 flg 00010082 Stack: (null) c030c533 000007cd (null) c030c533 00000001 (null) (null) 00000003 0000083f 00000018 00000002 00000002 c0329f6c c03534d6 (null) (null) 00000100 00000840 (null) c0329f64 00000001 00001000 (null) Pid: 0, comm: swapper Not tainted 2.6.36 #5 Call Trace: [] ? 0xc02e3707 [] 0xc035e6e5 [] ? 0xc0353191 [] 0xc03534d6 [] 0xc034f1cd [] 0xc034a824 [] ? 0xc03513cb [] 0xc0349432 [] 0xc0349066 It turns out that we should ignore the low limit of 16M. Use alloc_bootmem_node_nopanic() in this case. [akpm@linux-foundation.org: less mess] Signed-off-by: Yinghai LU Reported-by: Stefan Hellermann Tested-by: Stefan Hellermann Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: [2.6.34+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9f8a97b9a350..454191a25173 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3564,7 +3564,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) if (!slab_is_available()) { zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, alloc_size); + alloc_bootmem_node_nopanic(pgdat, alloc_size); } else { /* * This case means that a zone whose size was 0 gets new memory @@ -4141,7 +4141,8 @@ static void __init setup_usemap(struct pglist_data *pgdat, unsigned long usemapsize = usemap_size(zonesize); zone->pageblock_flags = NULL; if (usemapsize) - zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); + zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, + usemapsize); } #else static inline void setup_usemap(struct pglist_data *pgdat, @@ -4307,7 +4308,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) size = (end - start) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); if (!map) - map = alloc_bootmem_node(pgdat, size); + map = alloc_bootmem_node_nopanic(pgdat, size); pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); } #ifndef CONFIG_NEED_MULTIPLE_NODES -- cgit v1.2.3 From ee85c2e1454603ebb9f8d87223ac79dcdc87fa32 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 11 May 2011 15:13:34 -0700 Subject: mm: add alloc_pages_exact_nid() Add a alloc_pages_exact_nid() that allocates on a specific node. The naming is quite broken, but fixing that would need a larger renaming action. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: tweak comment] Signed-off-by: Andi Kleen Cc: Michal Hocko Cc: Balbir Singh Cc: KOSAKI Motohiro Cc: Dave Hansen Cc: David Rientjes Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 49 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 12 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 454191a25173..570d944daeb5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2317,6 +2317,21 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); +static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) +{ + if (addr) { + unsigned long alloc_end = addr + (PAGE_SIZE << order); + unsigned long used = addr + PAGE_ALIGN(size); + + split_page(virt_to_page((void *)addr), order); + while (used < alloc_end) { + free_page(used); + used += PAGE_SIZE; + } + } + return (void *)addr; +} + /** * alloc_pages_exact - allocate an exact number physically-contiguous pages. * @size: the number of bytes to allocate @@ -2336,21 +2351,31 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned long addr; addr = __get_free_pages(gfp_mask, order); - if (addr) { - unsigned long alloc_end = addr + (PAGE_SIZE << order); - unsigned long used = addr + PAGE_ALIGN(size); - - split_page(virt_to_page((void *)addr), order); - while (used < alloc_end) { - free_page(used); - used += PAGE_SIZE; - } - } - - return (void *)addr; + return make_alloc_exact(addr, order, size); } EXPORT_SYMBOL(alloc_pages_exact); +/** + * alloc_pages_exact_nid - allocate an exact number of physically-contiguous + * pages on a node. + * @size: the number of bytes to allocate + * @gfp_mask: GFP flags for the allocation + * + * Like alloc_pages_exact(), but try to allocate on node nid first before falling + * back. + * Note this is not alloc_pages_exact_node() which allocates on a specific node, + * but is not exact. + */ +void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +{ + unsigned order = get_order(size); + struct page *p = alloc_pages_node(nid, gfp_mask, order); + if (!p) + return NULL; + return make_alloc_exact((unsigned long)page_address(p), order, size); +} +EXPORT_SYMBOL(alloc_pages_exact_nid); + /** * free_pages_exact - release memory allocated via alloc_pages_exact() * @virt: the value returned by alloc_pages_exact. -- cgit v1.2.3 From b5e6ab589d570ac79cc939517fab05c87a23c262 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 May 2011 13:16:54 -0700 Subject: mm: fix kernel-doc warning in page_alloc.c Fix new kernel-doc warning in mm/page_alloc.c: Warning(mm/page_alloc.c:2370): No description found for parameter 'nid' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 570d944daeb5..3f8bce264df6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2358,6 +2358,7 @@ EXPORT_SYMBOL(alloc_pages_exact); /** * alloc_pages_exact_nid - allocate an exact number of physically-contiguous * pages on a node. + * @nid: the preferred node ID where memory should be allocated * @size: the number of bytes to allocate * @gfp_mask: GFP flags for the allocation * -- cgit v1.2.3 From 268bb0ce3e87872cb9290c322b0d35bce230d88f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 20 May 2011 12:50:29 -0700 Subject: sanitize usage Commit e66eed651fd1 ("list: remove prefetching from regular list iterators") removed the include of prefetch.h from list.h, which uncovered several cases that had apparently relied on that rather obscure header file dependency. So this fixes things up a bit, using grep -L linux/prefetch.h $(git grep -l '[^a-z_]prefetchw*(' -- '*.[ch]') grep -L 'prefetchw*(' $(git grep -l 'linux/prefetch.h' -- '*.[ch]') to guide us in finding files that either need inclusion, or have it despite not needing it. There are more of them around (mostly network drivers), but this gets many core ones. Reported-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3f8bce264df6..d49df7840541 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 7bf02ea22c6cdd09e2d3f1d3c3fe366b834ae9af Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 24 May 2011 17:11:16 -0700 Subject: arch, mm: filter disallowed nodes from arch specific show_mem functions Architectures that implement their own show_mem() function did not pass the filter argument to show_free_areas() to appropriately avoid emitting the state of nodes that are disallowed in the current context. This patch now passes the filter argument to show_free_areas() so those nodes are now avoided. This patch also removes the show_free_areas() wrapper around __show_free_areas() and converts existing callers to pass an empty filter. ia64 emits additional information for each node, so skip_free_areas_zone() must be made global to filter disallowed nodes and it is converted to use a nid argument rather than a zone for this use case. Signed-off-by: David Rientjes Cc: Russell King Cc: Tony Luck Cc: Fenghua Yu Cc: Kyle McMartin Cc: Helge Deller Cc: James Bottomley Cc: "David S. Miller" Cc: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9d5498e2d0f5..b1447522d346 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2473,10 +2473,10 @@ void si_meminfo_node(struct sysinfo *val, int nid) #endif /* - * Determine whether the zone's node should be displayed or not, depending on - * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas(). + * Determine whether the node should be displayed or not, depending on whether + * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). */ -static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone) +bool skip_free_areas_node(unsigned int flags, int nid) { bool ret = false; @@ -2484,8 +2484,7 @@ static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone) goto out; get_mems_allowed(); - ret = !node_isset(zone->zone_pgdat->node_id, - cpuset_current_mems_allowed); + ret = !node_isset(nid, cpuset_current_mems_allowed); put_mems_allowed(); out: return ret; @@ -2500,13 +2499,13 @@ out: * Suppresses nodes that are not allowed by current's cpuset if * SHOW_MEM_FILTER_NODES is passed. */ -void __show_free_areas(unsigned int filter) +void show_free_areas(unsigned int filter) { int cpu; struct zone *zone; for_each_populated_zone(zone) { - if (skip_free_areas_zone(filter, zone)) + if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; show_node(zone); printk("%s per-cpu:\n", zone->name); @@ -2549,7 +2548,7 @@ void __show_free_areas(unsigned int filter) for_each_populated_zone(zone) { int i; - if (skip_free_areas_zone(filter, zone)) + if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; show_node(zone); printk("%s" @@ -2618,7 +2617,7 @@ void __show_free_areas(unsigned int filter) for_each_populated_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; - if (skip_free_areas_zone(filter, zone)) + if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; show_node(zone); printk("%s: ", zone->name); @@ -2639,11 +2638,6 @@ void __show_free_areas(unsigned int filter) show_swap_cache_info(); } -void show_free_areas(void) -{ - __show_free_areas(0); -} - static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) { zoneref->zone = zone; -- cgit v1.2.3 From ac3bbec5ec69b973317677e038de2d1a0c90c18c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 24 May 2011 17:11:21 -0700 Subject: mm: remove unused zone_idx variable from set_migratetype_isolate Signed-off-by: Sergey Senozhatsky Reviewed-by: Christoph Lameter Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b1447522d346..eb19ba8396c3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5502,10 +5502,8 @@ int set_migratetype_isolate(struct page *page) struct memory_isolate_notify arg; int notifier_ret; int ret = -EBUSY; - int zone_idx; zone = page_zone(page); - zone_idx = zone_idx(zone); spin_lock_irqsave(&zone->lock, flags); -- cgit v1.2.3 From 839a4fcc8af7412be2efd11f0bd0504757f79f08 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 24 May 2011 17:11:31 -0700 Subject: mm, mem-hotplug: fix section mismatch. setup_per_zone_inactive_ratio() should be __meminit. Commit bce7394a3e ("page-allocator: reset wmark_min and inactive ratio of zone when hotplug happens") introduced invalid section references. Now, setup_per_zone_inactive_ratio() is marked __init and then it can't be referenced from memory hotplug code. This patch marks it as __meminit and also marks caller as __ref. Signed-off-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Cc: Yasunori Goto Cc: Rik van Riel Cc: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eb19ba8396c3..56d0be36be9d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5094,7 +5094,7 @@ void setup_per_zone_wmarks(void) * 1TB 101 10GB * 10TB 320 32GB */ -void calculate_zone_inactive_ratio(struct zone *zone) +void __meminit calculate_zone_inactive_ratio(struct zone *zone) { unsigned int gb, ratio; @@ -5108,7 +5108,7 @@ void calculate_zone_inactive_ratio(struct zone *zone) zone->inactive_ratio = ratio; } -static void __init setup_per_zone_inactive_ratio(void) +static void __meminit setup_per_zone_inactive_ratio(void) { struct zone *zone; -- cgit v1.2.3 From 1b79acc91115ba47e744b70bb166b77bd94f5855 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 24 May 2011 17:11:32 -0700 Subject: mm, mem-hotplug: recalculate lowmem_reserve when memory hotplug occurs Currently, memory hotplug calls setup_per_zone_wmarks() and calculate_zone_inactive_ratio(), but doesn't call setup_per_zone_lowmem_reserve(). It means the number of reserved pages aren't updated even if memory hot plug occur. This patch fixes it. Signed-off-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Mel Gorman Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 56d0be36be9d..e133cea36932 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5094,7 +5094,7 @@ void setup_per_zone_wmarks(void) * 1TB 101 10GB * 10TB 320 32GB */ -void __meminit calculate_zone_inactive_ratio(struct zone *zone) +static void __meminit calculate_zone_inactive_ratio(struct zone *zone) { unsigned int gb, ratio; @@ -5140,7 +5140,7 @@ static void __meminit setup_per_zone_inactive_ratio(void) * 8192MB: 11584k * 16384MB: 16384k */ -static int __init init_per_zone_wmark_min(void) +int __meminit init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; -- cgit v1.2.3 From a6cccdc36c966e51fd969560d870cfd37afbfa9c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 24 May 2011 17:11:33 -0700 Subject: mm, mem-hotplug: update pcp->stat_threshold when memory hotplug occur Currently, cpu hotplug updates pcp->stat_threshold, but memory hotplug doesn't. There is no reason for this. [akpm@linux-foundation.org: fix CONFIG_SMP=n build] Signed-off-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Mel Gorman Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e133cea36932..77773329aa72 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -5152,6 +5153,7 @@ int __meminit init_per_zone_wmark_min(void) if (min_free_kbytes > 65536) min_free_kbytes = 65536; setup_per_zone_wmarks(); + refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); setup_per_zone_inactive_ratio(); return 0; -- cgit v1.2.3 From c6a140bf164829769499b5e50d380893da39b29e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 24 May 2011 17:11:38 -0700 Subject: mm/compaction: reverse the change that forbade sync migraton with __GFP_NO_KSWAPD It's uncertain this has been beneficial, so it's safer to undo it. All other compaction users would still go in synchronous mode if a first attempt at async compaction failed. Hopefully we don't need to force special behavior for THP (which is the only __GFP_NO_KSWAPD user so far and it's the easier to exercise and to be noticeable). This also make __GFP_NO_KSWAPD return to its original strict semantics specific to bypass kswapd, as THP allocations have khugepaged for the async THP allocations/compactions. Signed-off-by: Andrea Arcangeli Cc: Alex Villacis Lasso Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 77773329aa72..83eaa2eb72f8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2107,7 +2107,7 @@ rebalance: sync_migration); if (page) goto got_pg; - sync_migration = !(gfp_mask & __GFP_NO_KSWAPD); + sync_migration = true; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, -- cgit v1.2.3 From a238ab5b0239575c179f4976064192c3f7409dad Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 24 May 2011 17:12:16 -0700 Subject: mm: break out page allocation warning code This originally started as a simple patch to give vmalloc() some more verbose output on failure on top of the plain page allocator messages. Johannes suggested that it might be nicer to lead with the vmalloc() info _before_ the page allocator messages. But, I do think there's a lot of value in what __alloc_pages_slowpath() does with its filtering and so forth. This patch creates a new function which other allocators can call instead of relying on the internal page allocator warnings. It also gives this function private rate-limiting which separates it from other printk_ratelimit() users. Signed-off-by: Dave Hansen Cc: Johannes Weiner Cc: David Rientjes Cc: Michal Nazarewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 62 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 21 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83eaa2eb72f8..44019da9632e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -1736,6 +1737,45 @@ static inline bool should_suppress_show_mem(void) return ret; } +static DEFINE_RATELIMIT_STATE(nopage_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + +void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) +{ + va_list args; + unsigned int filter = SHOW_MEM_FILTER_NODES; + + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) + return; + + /* + * This documents exceptions given to allocations in certain + * contexts that are allowed to allocate outside current's set + * of allowed nodes. + */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) + if (test_thread_flag(TIF_MEMDIE) || + (current->flags & (PF_MEMALLOC | PF_EXITING))) + filter &= ~SHOW_MEM_FILTER_NODES; + if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) + filter &= ~SHOW_MEM_FILTER_NODES; + + if (fmt) { + printk(KERN_WARNING); + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + } + + pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n", + current->comm, order, gfp_mask); + + dump_stack(); + if (!should_suppress_show_mem()) + show_mem(filter); +} + static inline int should_alloc_retry(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed) @@ -2178,27 +2218,7 @@ rebalance: } nopage: - if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { - unsigned int filter = SHOW_MEM_FILTER_NODES; - - /* - * This documents exceptions given to allocations in certain - * contexts that are allowed to allocate outside current's set - * of allowed nodes. - */ - if (!(gfp_mask & __GFP_NOMEMALLOC)) - if (test_thread_flag(TIF_MEMDIE) || - (current->flags & (PF_MEMALLOC | PF_EXITING))) - filter &= ~SHOW_MEM_FILTER_NODES; - if (in_interrupt() || !wait) - filter &= ~SHOW_MEM_FILTER_NODES; - - pr_warning("%s: page allocation failure. order:%d, mode:0x%x\n", - current->comm, order, gfp_mask); - dump_stack(); - if (!should_suppress_show_mem()) - show_mem(filter); - } + warn_alloc_failed(gfp_mask, order, NULL); return page; got_pg: if (kmemcheck_enabled) -- cgit v1.2.3 From 6d3163ce86dd386b4f7bda80241d7fea2bc0bb1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arve=20Hj=C3=B8nnev=C3=A5g?= Date: Tue, 24 May 2011 17:12:24 -0700 Subject: mm: check if any page in a pageblock is reserved before marking it MIGRATE_RESERVE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a problem where the first pageblock got marked MIGRATE_RESERVE even though it only had a few free pages. eg, On current ARM port, The kernel starts at offset 0x8000 to leave room for boot parameters, and the memory is freed later. This in turn caused no contiguous memory to be reserved and frequent kswapd wakeups that emptied the caches to get more contiguous memory. Unfortunatelly, ARM needs order-2 allocation for pgd (see arm/mm/pgd.c#pgd_alloc()). Therefore the issue is not minor nor easy avoidable. [kosaki.motohiro@jp.fujitsu.com: added some explanation] [kosaki.motohiro@jp.fujitsu.com: add !pfn_valid_within() to check] [minchan.kim@gmail.com: check end_pfn in pageblock_is_reserved] Signed-off-by: John Stultz Signed-off-by: Arve Hjønnevåg Signed-off-by: KOSAKI Motohiro Acked-by: Mel Gorman Acked-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 44019da9632e..01e6b614839d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3328,6 +3328,20 @@ static inline unsigned long wait_table_bits(unsigned long size) #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) +/* + * Check if a pageblock contains reserved pages + */ +static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) + return 1; + } + return 0; +} + /* * Mark a number of pageblocks as MIGRATE_RESERVE. The number * of blocks reserved is based on min_wmark_pages(zone). The memory within @@ -3337,7 +3351,7 @@ static inline unsigned long wait_table_bits(unsigned long size) */ static void setup_zone_migrate_reserve(struct zone *zone) { - unsigned long start_pfn, pfn, end_pfn; + unsigned long start_pfn, pfn, end_pfn, block_end_pfn; struct page *page; unsigned long block_migratetype; int reserve; @@ -3367,7 +3381,8 @@ static void setup_zone_migrate_reserve(struct zone *zone) continue; /* Blocks with reserved pages will never free, skip them. */ - if (PageReserved(page)) + block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); + if (pageblock_is_reserved(pfn, block_end_pfn)) continue; block_migratetype = get_pageblock_migratetype(page); -- cgit v1.2.3 From a197b59ae6e8bee56fcef37ea2482dc08414e2ac Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 24 May 2011 17:12:35 -0700 Subject: mm: fail GFP_DMA allocations when ZONE_DMA is not configured The page allocator will improperly return a page from ZONE_NORMAL even when __GFP_DMA is passed if CONFIG_ZONE_DMA is disabled. The caller expects DMA memory, perhaps for ISA devices with 16-bit address registers, and may get higher memory resulting in undefined behavior. This patch causes the page allocator to return NULL in such circumstances with a warning emitted to the kernel log on the first occurrence. Signed-off-by: David Rientjes Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 01e6b614839d..10a8c6da385f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2247,6 +2247,10 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return NULL; +#ifndef CONFIG_ZONE_DMA + if (WARN_ON_ONCE(gfp_mask & __GFP_DMA)) + return NULL; +#endif /* * Check the zones suitable for the gfp_mask contain at least one -- cgit v1.2.3 From cfa54a0fcfc1017c6f122b6f21aaba36daa07f71 Mon Sep 17 00:00:00 2001 From: Andrew Barry Date: Tue, 24 May 2011 17:12:52 -0700 Subject: mm/page_alloc.c: prevent unending loop in __alloc_pages_slowpath() I believe I found a problem in __alloc_pages_slowpath, which allows a process to get stuck endlessly looping, even when lots of memory is available. Running an I/O and memory intensive stress-test I see a 0-order page allocation with __GFP_IO and __GFP_WAIT, running on a system with very little free memory. Right about the same time that the stress-test gets killed by the OOM-killer, the utility trying to allocate memory gets stuck in __alloc_pages_slowpath even though most of the systems memory was freed by the oom-kill of the stress-test. The utility ends up looping from the rebalance label down through the wait_iff_congested continiously. Because order=0, __alloc_pages_direct_compact skips the call to get_page_from_freelist. Because all of the reclaimable memory on the system has already been reclaimed, __alloc_pages_direct_reclaim skips the call to get_page_from_freelist. Since there is no __GFP_FS flag, the block with __alloc_pages_may_oom is skipped. The loop hits the wait_iff_congested, then jumps back to rebalance without ever trying to get_page_from_freelist. This loop repeats infinitely. The test case is pretty pathological. Running a mix of I/O stress-tests that do a lot of fork() and consume all of the system memory, I can pretty reliably hit this on 600 nodes, in about 12 hours. 32GB/node. Signed-off-by: Andrew Barry Signed-off-by: Minchan Kim Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 10a8c6da385f..2a00f17c3bf4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2106,6 +2106,7 @@ restart: first_zones_zonelist(zonelist, high_zoneidx, NULL, &preferred_zone); +rebalance: /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -2113,7 +2114,6 @@ restart: if (page) goto got_pg; -rebalance: /* Allocate without watermarks if the context allows */ if (alloc_flags & ALLOC_NO_WATERMARKS) { page = __alloc_pages_high_priority(gfp_mask, order, -- cgit v1.2.3 From 246e87a9393448c20873bc5dee64be68ed559e24 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 26 May 2011 16:25:34 -0700 Subject: memcg: fix get_scan_count() for small targets During memory reclaim we determine the number of pages to be scanned per zone as (anon + file) >> priority. Assume scan = (anon + file) >> priority. If scan < SWAP_CLUSTER_MAX, the scan will be skipped for this time and priority gets higher. This has some problems. 1. This increases priority as 1 without any scan. To do scan in this priority, amount of pages should be larger than 512M. If pages>>priority < SWAP_CLUSTER_MAX, it's recorded and scan will be batched, later. (But we lose 1 priority.) If memory size is below 16M, pages >> priority is 0 and no scan in DEF_PRIORITY forever. 2. If zone->all_unreclaimabe==true, it's scanned only when priority==0. So, x86's ZONE_DMA will never be recoverred until the user of pages frees memory by itself. 3. With memcg, the limit of memory can be small. When using small memcg, it gets priority < DEF_PRIORITY-2 very easily and need to call wait_iff_congested(). For doing scan before priorty=9, 64MB of memory should be used. Then, this patch tries to scan SWAP_CLUSTER_MAX of pages in force...when 1. the target is enough small. 2. it's kswapd or memcg reclaim. Then we can avoid rapid priority drop and may be able to recover all_unreclaimable in a small zones. And this patch removes nr_saved_scan. This will allow scanning in this priority even when pages >> priority is very small. Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Ying Han Cc: Balbir Singh Cc: KOSAKI Motohiro Cc: Daisuke Nishimura Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a00f17c3bf4..a4e1db3f1981 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4323,10 +4323,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->zone_pgdat = pgdat; zone_pcp_init(zone); - for_each_lru(l) { + for_each_lru(l) INIT_LIST_HEAD(&zone->lru[l].list); - zone->reclaim_stat.nr_saved_scan[l] = 0; - } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; zone->reclaim_stat.recent_scanned[0] = 0; -- cgit v1.2.3 From 1fa7b6a29c61358cc2ca6f64cef4aa0e1a7ca74c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 2 Jun 2011 06:11:24 +0900 Subject: Revert "mm: fail GFP_DMA allocations when ZONE_DMA is not configured" This reverts commit a197b59ae6e8bee56fcef37ea2482dc08414e2ac. As rmk says: "Commit a197b59ae6e8 (mm: fail GFP_DMA allocations when ZONE_DMA is not configured) is causing regressions on ARM with various drivers which use GFP_DMA. The behaviour up until now has been to silently ignore that flag when CONFIG_ZONE_DMA is not enabled, and to allocate from the normal zone. However, as a result of the above commit, such allocations now fail which causes drivers to fail. These are regressions compared to the previous kernel version." so just revert it. Requested-by: Russell King Acked-by: Andrew Morton Cc: David Rientjes Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a4e1db3f1981..4e8985acdab8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2247,10 +2247,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return NULL; -#ifndef CONFIG_ZONE_DMA - if (WARN_ON_ONCE(gfp_mask & __GFP_DMA)) - return NULL; -#endif /* * Check the zones suitable for the gfp_mask contain at least one -- cgit v1.2.3