From 373d4d099761cb1f637bed488ab3871945882273 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 21 Jan 2013 17:17:39 +1030 Subject: taint: add explicit flag to show whether lock dep is still OK. Fix up all callers as they were before, with one change: an unsigned module taints the kernel, but doesn't turn off lockdep. Signed-off-by: Rusty Russell --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df2022ff0c8a..4c99cb7e276a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -320,7 +320,7 @@ static void bad_page(struct page *page) out: /* Leave bad fields for debug, except PageBuddy could make trouble */ reset_page_mapcount(page); /* remove PageBuddy */ - add_taint(TAINT_BAD_PAGE); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } /* -- cgit v1.2.3 From 8bd75c77b7c6a3954140dd2e20346aef3efe4a35 Mon Sep 17 00:00:00 2001 From: Clark Williams Date: Thu, 7 Feb 2013 09:47:07 -0600 Subject: sched/rt: Move rt specific bits into new header file Move rt scheduler definitions out of include/linux/sched.h into new file include/linux/sched/rt.h Signed-off-by: Clark Williams Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20130207094707.7b9f825f@riff.lan Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df2022ff0c8a..42d18e46f286 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -58,6 +58,7 @@ #include #include #include +#include <linux/sched/rt.h> #include #include -- cgit v1.2.3 From 41a7973447b0b8717f0a214d4328dc31ec2291d7 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Tue, 12 Feb 2013 13:46:24 -0800 Subject: mm: cma: fix accounting of CMA pages placed in high memory The total number of low memory pages is determined as totalram_pages - totalhigh_pages, so without this patch all CMA pageblocks placed in highmem were accounted 
to low memory. Signed-off-by: Marek Szyprowski Acked-by: Kyungmin Park Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df2022ff0c8a..9673d96b1ba7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -773,6 +773,10 @@ void __init init_cma_reserved_pageblock(struct page *page) set_pageblock_migratetype(page, MIGRATE_CMA); __free_pages(page, pageblock_order); totalram_pages += pageblock_nr_pages; +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages += pageblock_nr_pages; +#endif } #endif -- cgit v1.2.3 From 7c45512df987c5619db041b5c9b80d281e26d3db Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 18 Feb 2013 09:58:02 -0800 Subject: mm: fix pageblock bitmap allocation Commit c060f943d092 ("mm: use aligned zone start for pfn_to_bitidx calculation") fixed our calculation of the index into the pageblock bitmap when a !SPARSEMEM zone was not aligned to pageblock_nr_pages. However, the _allocation_ of that bitmap had never taken this alignment requirement into account, so depending on the exact size and alignment of the zone, the use of that index could then access past the allocation, resulting in some very subtle memory corruption. This was reported (and bisected) by Ingo Molnar: one of his random config builds would hang with certain very specific kernel command line options. In the meantime, commit c060f943d092 has been marked for stable, so this fix needs to be back-ported to the stable kernels that backported the commit to use the right alignment. 
Bisected-and-tested-by: Ingo Molnar Acked-by: Mel Gorman Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9673d96b1ba7..6a83cd35cfde 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4420,10 +4420,11 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, * round what is now in bits to nearest long in bits, then return it in * bytes. */ -static unsigned long __init usemap_size(unsigned long zonesize) +static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) { unsigned long usemapsize; + zonesize += zone_start_pfn & (pageblock_nr_pages-1); usemapsize = roundup(zonesize, pageblock_nr_pages); usemapsize = usemapsize >> pageblock_order; usemapsize *= NR_PAGEBLOCK_BITS; @@ -4433,17 +4434,19 @@ static unsigned long __init usemap_size(unsigned long zonesize) } static void __init setup_usemap(struct pglist_data *pgdat, - struct zone *zone, unsigned long zonesize) + struct zone *zone, + unsigned long zone_start_pfn, + unsigned long zonesize) { - unsigned long usemapsize = usemap_size(zonesize); + unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); zone->pageblock_flags = NULL; if (usemapsize) zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, usemapsize); } #else -static inline void setup_usemap(struct pglist_data *pgdat, - struct zone *zone, unsigned long zonesize) {} +static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, + unsigned long zone_start_pfn, unsigned long zonesize) {} #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@ -4594,7 +4597,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, continue; set_pageblock_order(); - setup_usemap(pgdat, zone, size); + setup_usemap(pgdat, zone, zone_start_pfn, size); ret = 
init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY); BUG_ON(ret); -- cgit v1.2.3 From 2a6f512412c7aecd04134721ea392cc496e6c017 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 22 Feb 2013 16:32:09 -0800 Subject: CMA: make putback_lru_pages() call conditional As per documentation and other places calling putback_lru_pages(), putback_lru_pages() is called on error only. Make the CMA code behave consistently. [akpm@linux-foundation.org: remove a test-n-branch in the wrapup code] Signed-off-by: Srinivas Pandruvada Acked-by: Michal Nazarewicz Cc: Marek Szyprowski Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d1107adf174a..804cc62ab72f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5806,9 +5806,11 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, 0, false, MIGRATE_SYNC, MR_CMA); } - - putback_movable_pages(&cc->migratepages); - return ret > 0 ? 0 : ret; + if (ret < 0) { + putback_movable_pages(&cc->migratepages); + return ret; + } + return 0; } /** -- cgit v1.2.3 From 90ae8d670c12156f4328235aca1a528a8bfe6708 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 22 Feb 2013 16:32:22 -0800 Subject: mm/page_alloc.c:__setup_per_zone_wmarks: make min_pages unsigned long `int' is an inappropriate type for a number-of-pages counter. While we're there, use the clamp() macro. 
Acked-by: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Reviewed-by: Michal Hocko Cc: Hugh Dickins Cc: Satoru Moriya Cc: Simon Jeons Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 804cc62ab72f..703944809666 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5247,13 +5247,10 @@ static void __setup_per_zone_wmarks(void) * deltas controls asynch page reclaim, and so should * not be capped for highmem. */ - int min_pages; + unsigned long min_pages; min_pages = zone->present_pages / 1024; - if (min_pages < SWAP_CLUSTER_MAX) - min_pages = SWAP_CLUSTER_MAX; - if (min_pages > 128) - min_pages = 128; + min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); zone->watermark[WMARK_MIN] = min_pages; } else { /* -- cgit v1.2.3 From 34b71f1e04fcba578e719e675b4882eeeb2a1f6f Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 22 Feb 2013 16:33:37 -0800 Subject: page_alloc: add movablemem_map kernel parameter Add functions to parse movablemem_map boot option. Since the option could be specified more than once, all the maps will be stored in the global variable movablemem_map.map array. And also, we keep the array in monotonic increasing order by start_pfn. And merge all overlapped ranges. [akpm@linux-foundation.org: improve comment] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: remove unneeded parens] Signed-off-by: Tang Chen Signed-off-by: Lai Jiangshan Reviewed-by: Wen Congyang Tested-by: Lin Feng Cc: Wu Jianguo Cc: Mel Gorman Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 703944809666..aa1cc5fe9904 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -202,6 +202,9 @@ static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +/* Movable memory ranges, will also be used by memblock subsystem. */ +struct movablemem_map movablemem_map; + static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; static unsigned long __initdata required_kernelcore; @@ -5078,6 +5081,134 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); +/** + * insert_movablemem_map - Insert a memory range in to movablemem_map.map. + * @start_pfn: start pfn of the range + * @end_pfn: end pfn of the range + * + * This function will also merge the overlapped ranges, and sort the array + * by start_pfn in monotonic increasing order. + */ +static void __init insert_movablemem_map(unsigned long start_pfn, + unsigned long end_pfn) +{ + int pos, overlap; + + /* + * pos will be at the 1st overlapped range, or the position + * where the element should be inserted. + */ + for (pos = 0; pos < movablemem_map.nr_map; pos++) + if (start_pfn <= movablemem_map.map[pos].end_pfn) + break; + + /* If there is no overlapped range, just insert the element. */ + if (pos == movablemem_map.nr_map || + end_pfn < movablemem_map.map[pos].start_pfn) { + /* + * If pos is not the end of array, we need to move all + * the rest elements backward. 
+ */ + if (pos < movablemem_map.nr_map) + memmove(&movablemem_map.map[pos+1], + &movablemem_map.map[pos], + sizeof(struct movablemem_entry) * + (movablemem_map.nr_map - pos)); + movablemem_map.map[pos].start_pfn = start_pfn; + movablemem_map.map[pos].end_pfn = end_pfn; + movablemem_map.nr_map++; + return; + } + + /* overlap will be at the last overlapped range */ + for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++) + if (end_pfn < movablemem_map.map[overlap].start_pfn) + break; + + /* + * If there are more ranges overlapped, we need to merge them, + * and move the rest elements forward. + */ + overlap--; + movablemem_map.map[pos].start_pfn = min(start_pfn, + movablemem_map.map[pos].start_pfn); + movablemem_map.map[pos].end_pfn = max(end_pfn, + movablemem_map.map[overlap].end_pfn); + + if (pos != overlap && overlap + 1 != movablemem_map.nr_map) + memmove(&movablemem_map.map[pos+1], + &movablemem_map.map[overlap+1], + sizeof(struct movablemem_entry) * + (movablemem_map.nr_map - overlap - 1)); + + movablemem_map.nr_map -= overlap - pos; +} + +/** + * movablemem_map_add_region - Add a memory range into movablemem_map. + * @start: physical start address of range + * @end: physical end address of range + * + * This function transform the physical address into pfn, and then add the + * range into movablemem_map by calling insert_movablemem_map(). 
+ */ +static void __init movablemem_map_add_region(u64 start, u64 size) +{ + unsigned long start_pfn, end_pfn; + + /* In case size == 0 or start + size overflows */ + if (start + size <= start) + return; + + if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) { + pr_err("movablemem_map: too many entries;" + " ignoring [mem %#010llx-%#010llx]\n", + (unsigned long long) start, + (unsigned long long) (start + size - 1)); + return; + } + + start_pfn = PFN_DOWN(start); + end_pfn = PFN_UP(start + size); + insert_movablemem_map(start_pfn, end_pfn); +} + +/* + * cmdline_parse_movablemem_map - Parse boot option movablemem_map. + * @p: The boot option of the following format: + * movablemem_map=nn[KMG]@ss[KMG] + * + * This option sets the memory range [ss, ss+nn) to be used as movable memory. + * + * Return: 0 on success or -EINVAL on failure. + */ +static int __init cmdline_parse_movablemem_map(char *p) +{ + char *oldp; + u64 start_at, mem_size; + + if (!p) + goto err; + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + goto err; + + if (*p == '@') { + oldp = ++p; + start_at = memparse(p, &p); + if (p == oldp || *p != '\0') + goto err; + + movablemem_map_add_region(start_at, mem_size); + return 0; + } +err: + return -EINVAL; +} +early_param("movablemem_map", cmdline_parse_movablemem_map); + #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ /** -- cgit v1.2.3 From 6981ec31146cf19454c55c130625f6cee89aab95 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 22 Feb 2013 16:33:39 -0800 Subject: page_alloc: introduce zone_movable_limit[] to keep movable limit for nodes Introduce a new array zone_movable_limit[] to store the ZONE_MOVABLE limit from movablemem_map boot option for all nodes. The function sanitize_zone_movable_limit() will find out to which node the ranges in movable_map.map[] belongs, and calculates the low boundary of ZONE_MOVABLE for each node. 
Signed-off-by: Tang Chen Signed-off-by: Liu Jiang Reviewed-by: Wen Congyang Cc: Wu Jianguo Reviewed-by: Lai Jiangshan Tested-by: Lin Feng Cc: Mel Gorman Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index aa1cc5fe9904..0f267d9c73ff 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -210,6 +210,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; +static unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES]; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -4375,6 +4376,77 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); } +/** + * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array. + * + * zone_movable_limit is initialized as 0. This function will try to get + * the first ZONE_MOVABLE pfn of each node from movablemem_map, and + * assigne them to zone_movable_limit. + * zone_movable_limit[nid] == 0 means no limit for the node. + * + * Note: Each range is represented as [start_pfn, end_pfn) + */ +static void __meminit sanitize_zone_movable_limit(void) +{ + int map_pos = 0, i, nid; + unsigned long start_pfn, end_pfn; + + if (!movablemem_map.nr_map) + return; + + /* Iterate all ranges from minimum to maximum */ + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + /* + * If we have found lowest pfn of ZONE_MOVABLE of the node + * specified by user, just go on to check next range. 
+ */ + if (zone_movable_limit[nid]) + continue; + +#ifdef CONFIG_ZONE_DMA + /* Skip DMA memory. */ + if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA]) + start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA]; +#endif + +#ifdef CONFIG_ZONE_DMA32 + /* Skip DMA32 memory. */ + if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA32]) + start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA32]; +#endif + +#ifdef CONFIG_HIGHMEM + /* Skip lowmem if ZONE_MOVABLE is highmem. */ + if (zone_movable_is_highmem() && + start_pfn < arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]) + start_pfn = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]; +#endif + + if (start_pfn >= end_pfn) + continue; + + while (map_pos < movablemem_map.nr_map) { + if (end_pfn <= movablemem_map.map[map_pos].start_pfn) + break; + + if (start_pfn >= movablemem_map.map[map_pos].end_pfn) { + map_pos++; + continue; + } + + /* + * The start_pfn of ZONE_MOVABLE is either the minimum + * pfn specified by movablemem_map, or 0, which means + * the node has no ZONE_MOVABLE. 
+ */ + zone_movable_limit[nid] = max(start_pfn, + movablemem_map.map[map_pos].start_pfn); + + break; + } + } +} + #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, @@ -4392,7 +4464,6 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, return zholes_size[zone_type]; } - #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, @@ -4839,7 +4910,6 @@ static void __init find_zone_movable_pfns_for_nodes(void) goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ - find_usable_zone_for_movable(); usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; restart: @@ -4998,6 +5068,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Find the PFNs that ZONE_MOVABLE begins at in each node */ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); + find_usable_zone_for_movable(); + sanitize_zone_movable_limit(); find_zone_movable_pfns_for_nodes(); /* Print out the zone ranges */ -- cgit v1.2.3 From 42f47e27e761fee07da69e04612ec7dd0d490edd Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 22 Feb 2013 16:33:40 -0800 Subject: page_alloc: make movablemem_map have higher priority If kernelcore or movablecore is specified at the same time with movablemem_map, movablemem_map will have higher priority to be satisfied. This patch will make find_zone_movable_pfns_for_nodes() calculate zone_movable_pfn[] with the limit from zone_movable_limit[]. Signed-off-by: Tang Chen Reviewed-by: Wen Congyang Cc: Wu Jianguo Reviewed-by: Lai Jiangshan Tested-by: Lin Feng Cc: Mel Gorman Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0f267d9c73ff..88b9962c99b3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4905,9 +4905,17 @@ static void __init find_zone_movable_pfns_for_nodes(void) required_kernelcore = max(required_kernelcore, corepages); } - /* If kernelcore was not specified, there is no ZONE_MOVABLE */ - if (!required_kernelcore) + /* + * If neither kernelcore/movablecore nor movablemem_map is specified, + * there is no ZONE_MOVABLE. But if movablemem_map is specified, the + * start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[]. + */ + if (!required_kernelcore) { + if (movablemem_map.nr_map) + memcpy(zone_movable_pfn, zone_movable_limit, + sizeof(zone_movable_pfn)); goto out; + } /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; @@ -4937,10 +4945,24 @@ restart: for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { unsigned long size_pages; + /* + * Find more memory for kernelcore in + * [zone_movable_pfn[nid], zone_movable_limit[nid]). 
+ */ start_pfn = max(start_pfn, zone_movable_pfn[nid]); if (start_pfn >= end_pfn) continue; + if (zone_movable_limit[nid]) { + end_pfn = min(end_pfn, zone_movable_limit[nid]); + /* No range left for kernelcore in this node */ + if (start_pfn >= end_pfn) { + zone_movable_pfn[nid] = + zone_movable_limit[nid]; + break; + } + } + /* Account for what is only usable for kernelcore */ if (start_pfn < usable_startpfn) { unsigned long kernel_pages; @@ -5000,12 +5022,12 @@ restart: if (usable_nodes && required_kernelcore > usable_nodes) goto restart; +out: /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); -out: /* restore the node_state */ node_states[N_MEMORY] = saved_node_state; } -- cgit v1.2.3 From 27168d38fa209073219abedbe6a9de7ba9acbfad Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 22 Feb 2013 16:33:46 -0800 Subject: acpi, memory-hotplug: extend movablemem_map ranges to the end of node When implementing movablemem_map boot option, we introduced an array movablemem_map.map[] to store the memory ranges to be set as ZONE_MOVABLE. Since ZONE_MOVABLE is the last zone of a node, if user didn't specify the whole node memory range, we need to extend it to the node end so that we can use it to prevent memblock from allocating memory in the ranges user didn't specify. We now implement movablemem_map boot option like this: /* * For movablemem_map=nn[KMG]@ss[KMG]: * * SRAT: |_____| |_____| |_________| |_________| ...... * node id: 0 1 1 2 * user specified: |__| |___| * movablemem_map: |___| |_________| |______| ...... * * Using movablemem_map, we can prevent memblock from allocating memory * on ZONE_MOVABLE at boot time. * * NOTE: In this case, SRAT info will be ignored. 
*/ [akpm@linux-foundation.org: clean up code, fix build warning] Signed-off-by: Tang Chen Cc: KOSAKI Motohiro Cc: Jiang Liu Cc: Jianguo Wu Cc: Kamezawa Hiroyuki Cc: Lai Jiangshan Cc: Wu Jianguo Cc: Yasuaki Ishimatsu Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Len Brown Cc: "Brown, Len" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 88b9962c99b3..7ea9a003ad57 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5175,6 +5175,36 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); +/** + * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[]. + * @start_pfn: start pfn of the range to be checked + * @end_pfn: end pfn of the range to be checked (exclusive) + * + * This function checks if a given memory range [start_pfn, end_pfn) overlaps + * the movablemem_map.map[] array. + * + * Return: index of the first overlapped element in movablemem_map.map[] + * or -1 if they don't overlap each other. + */ +int __init movablemem_map_overlap(unsigned long start_pfn, + unsigned long end_pfn) +{ + int overlap; + + if (!movablemem_map.nr_map) + return -1; + + for (overlap = 0; overlap < movablemem_map.nr_map; overlap++) + if (start_pfn < movablemem_map.map[overlap].end_pfn) + break; + + if (overlap == movablemem_map.nr_map || + end_pfn <= movablemem_map.map[overlap].start_pfn) + return -1; + + return overlap; +} + /** * insert_movablemem_map - Insert a memory range in to movablemem_map.map. * @start_pfn: start pfn of the range @@ -5183,8 +5213,8 @@ early_param("movablecore", cmdline_parse_movablecore); * This function will also merge the overlapped ranges, and sort the array * by start_pfn in monotonic increasing order. 
*/ -static void __init insert_movablemem_map(unsigned long start_pfn, - unsigned long end_pfn) +void __init insert_movablemem_map(unsigned long start_pfn, + unsigned long end_pfn) { int pos, overlap; -- cgit v1.2.3 From 01a178a94e8eaec351b29ee49fbb3d1c124cb7fb Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 22 Feb 2013 16:33:49 -0800 Subject: acpi, memory-hotplug: support getting hotplug info from SRAT We now provide an option for users who don't want to specify physical memory address in kernel commandline. /* * For movablemem_map=acpi: * * SRAT: |_____| |_____| |_________| |_________| ...... * node id: 0 1 1 2 * hotpluggable: n y y n * movablemem_map: |_____| |_________| * * Using movablemem_map, we can prevent memblock from allocating memory * on ZONE_MOVABLE at boot time. */ So user just specify movablemem_map=acpi, and the kernel will use hotpluggable info in SRAT to determine which memory ranges should be set as ZONE_MOVABLE. If all the memory ranges in SRAT is hotpluggable, then no memory can be used by kernel. But before parsing SRAT, memblock has already reserve some memory ranges for other purposes, such as for kernel image, and so on. We cannot prevent kernel from using these memory. So we need to exclude these ranges even if these memory is hotpluggable. Furthermore, there could be several memory ranges in the single node which the kernel resides in. We may skip one range that have memory reserved by memblock, but if the rest of memory is too small, then the kernel will fail to boot. So, make the whole node which the kernel resides in un-hotpluggable. Then the kernel has enough memory to use. NOTE: Using this way will cause NUMA performance down because the whole node will be set as ZONE_MOVABLE, and kernel cannot use memory on it. If users don't want to lose NUMA performance, just don't use it. 
[akpm@linux-foundation.org: fix warning] [akpm@linux-foundation.org: use strcmp()] Signed-off-by: Tang Chen Cc: KOSAKI Motohiro Cc: Jiang Liu Cc: Jianguo Wu Cc: Kamezawa Hiroyuki Cc: Lai Jiangshan Cc: Wu Jianguo Cc: Yasuaki Ishimatsu Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Len Brown Cc: "Brown, Len" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7ea9a003ad57..a7381be21320 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -203,7 +203,10 @@ static unsigned long __meminitdata dma_reserve; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* Movable memory ranges, will also be used by memblock subsystem. */ -struct movablemem_map movablemem_map; +struct movablemem_map movablemem_map = { + .acpi = false, + .nr_map = 0, +}; static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; @@ -5314,6 +5317,23 @@ static int __init cmdline_parse_movablemem_map(char *p) if (!p) goto err; + if (!strcmp(p, "acpi")) + movablemem_map.acpi = true; + + /* + * If user decide to use info from BIOS, all the other user specified + * ranges will be ingored. 
+ */ + if (movablemem_map.acpi) { + if (movablemem_map.nr_map) { + memset(movablemem_map.map, 0, + sizeof(struct movablemem_entry) + * movablemem_map.nr_map); + movablemem_map.nr_map = 0; + } + return 0; + } + oldp = p; mem_size = memparse(p, &p); if (p == oldp) -- cgit v1.2.3 From b40da04946aa7b603b2aa4dd479f83b2c9090d96 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 22 Feb 2013 16:33:52 -0800 Subject: mm: use zone->managed_pages instead of zone->present_pages where appropriate Now we have zone->managed_pages for "pages managed by the buddy system in the zone", so replace zone->present_pages with zone->managed_pages if what the user really wants is number of allocatable pages. Signed-off-by: Jiang Liu Cc: Wen Congyang Cc: David Rientjes Cc: Jiang Liu Cc: Maciej Rutecki Cc: Chris Clayton Cc: "Rafael J . Wysocki" Cc: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Jianguo Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a7381be21320..5f73106bd8dd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2808,7 +2808,7 @@ static unsigned int nr_free_zone_pages(int offset) struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); for_each_zone_zonelist(zone, z, zonelist, offset) { - unsigned long size = zone->present_pages; + unsigned long size = zone->managed_pages; unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; @@ -2861,7 +2861,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) val->totalram = pgdat->node_present_pages; val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM - val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; + val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 
NR_FREE_PAGES); #else @@ -3939,7 +3939,7 @@ static int __meminit zone_batchsize(struct zone *zone) * * OK, so we don't know how big the cache is. So guess. */ - batch = zone->present_pages / 1024; + batch = zone->managed_pages / 1024; if (batch * PAGE_SIZE > 512 * 1024) batch = (512 * 1024) / PAGE_SIZE; batch /= 4; /* We effectively *= 4 below */ @@ -4023,7 +4023,7 @@ static void __meminit setup_zone_pageset(struct zone *zone) if (percpu_pagelist_fraction) setup_pagelist_highmark(pcp, - (zone->present_pages / + (zone->managed_pages / percpu_pagelist_fraction)); } } @@ -5435,8 +5435,8 @@ static void calculate_totalreserve_pages(void) /* we treat the high watermark as reserved pages. */ max += high_wmark_pages(zone); - if (max > zone->present_pages) - max = zone->present_pages; + if (max > zone->managed_pages) + max = zone->managed_pages; reserve_pages += max; /* * Lowmem reserves are not available to @@ -5468,7 +5468,7 @@ static void setup_per_zone_lowmem_reserve(void) for_each_online_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long present_pages = zone->present_pages; + unsigned long managed_pages = zone->managed_pages; zone->lowmem_reserve[j] = 0; @@ -5482,9 +5482,9 @@ static void setup_per_zone_lowmem_reserve(void) sysctl_lowmem_reserve_ratio[idx] = 1; lower_zone = pgdat->node_zones + idx; - lower_zone->lowmem_reserve[j] = present_pages / + lower_zone->lowmem_reserve[j] = managed_pages / sysctl_lowmem_reserve_ratio[idx]; - present_pages += lower_zone->present_pages; + managed_pages += lower_zone->managed_pages; } } } @@ -5503,14 +5503,14 @@ static void __setup_per_zone_wmarks(void) /* Calculate total number of !ZONE_HIGHMEM pages */ for_each_zone(zone) { if (!is_highmem(zone)) - lowmem_pages += zone->present_pages; + lowmem_pages += zone->managed_pages; } for_each_zone(zone) { u64 tmp; spin_lock_irqsave(&zone->lock, flags); - tmp = (u64)pages_min * zone->present_pages; + tmp = (u64)pages_min * 
zone->managed_pages; do_div(tmp, lowmem_pages); if (is_highmem(zone)) { /* @@ -5524,7 +5524,7 @@ static void __setup_per_zone_wmarks(void) */ unsigned long min_pages; - min_pages = zone->present_pages / 1024; + min_pages = zone->managed_pages / 1024; min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); zone->watermark[WMARK_MIN] = min_pages; } else { @@ -5586,7 +5586,7 @@ static void __meminit calculate_zone_inactive_ratio(struct zone *zone) unsigned int gb, ratio; /* Zone size in gigabytes */ - gb = zone->present_pages >> (30 - PAGE_SHIFT); + gb = zone->managed_pages >> (30 - PAGE_SHIFT); if (gb) ratio = int_sqrt(10 * gb); else @@ -5672,7 +5672,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, return rc; for_each_zone(zone) - zone->min_unmapped_pages = (zone->present_pages * + zone->min_unmapped_pages = (zone->managed_pages * sysctl_min_unmapped_ratio) / 100; return 0; } @@ -5688,7 +5688,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, return rc; for_each_zone(zone) - zone->min_slab_pages = (zone->present_pages * + zone->min_slab_pages = (zone->managed_pages * sysctl_min_slab_ratio) / 100; return 0; } @@ -5730,7 +5730,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, for_each_populated_zone(zone) { for_each_possible_cpu(cpu) { unsigned long high; - high = zone->present_pages / percpu_pagelist_fraction; + high = zone->managed_pages / percpu_pagelist_fraction; setup_pagelist_highmark( per_cpu_ptr(zone->pageset, cpu), high); } -- cgit v1.2.3 From 306f2e9eed173c5b8f3318e36cf92a7df3c3f6d5 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 22 Feb 2013 16:33:54 -0800 Subject: mm: set zone->present_pages to number of existing pages in the zone Now all users of "number of pages managed by the buddy system" have been converted to use zone->managed_pages, so set zone->present_pages to what it should be: present_pages = spanned_pages - absent_pages; Signed-off-by: Jiang Liu Cc: Wen Congyang 
Cc: David Rientjes Cc: Jiang Liu Cc: Maciej Rutecki Cc: Chris Clayton Cc: "Rafael J . Wysocki" Cc: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Jianguo Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5f73106bd8dd..07fe78d01ffd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4650,7 +4650,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, nr_all_pages += freesize; zone->spanned_pages = size; - zone->present_pages = freesize; + zone->present_pages = realsize; /* * Set an approximate value for lowmem here, it will be adjusted * when the bootmem allocator frees pages into the buddy system. -- cgit v1.2.3 From 194159fbcc0d6ac1351837d3cd7a27a4af0219a6 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 22 Feb 2013 16:33:58 -0800 Subject: mm: remove MIGRATE_ISOLATE check in hotpath Several functions test MIGRATE_ISOLATE and some of those are hotpath but MIGRATE_ISOLATE is used only if we enable CONFIG_MEMORY_ISOLATION(ie, CMA, memory-hotplug and memory-failure) which are not common config option. So let's not add unnecessary overhead and code when we don't enable CONFIG_MEMORY_ISOLATION. 
Signed-off-by: Minchan Kim Cc: KOSAKI Motohiro Acked-by: Michal Nazarewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07fe78d01ffd..e3fb290194c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -673,7 +673,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); - if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { + if (likely(!is_migrate_isolate_page(page))) { __mod_zone_page_state(zone, NR_FREE_PAGES, 1); if (is_migrate_cma(mt)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); @@ -691,7 +691,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order, zone->pages_scanned = 0; __free_one_page(page, zone, order, migratetype); - if (unlikely(migratetype != MIGRATE_ISOLATE)) + if (unlikely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); } @@ -923,7 +923,9 @@ static int fallbacks[MIGRATE_TYPES][4] = { [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, #endif [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ +#ifdef CONFIG_MEMORY_ISOLATION [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ +#endif }; /* @@ -1149,7 +1151,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, list_add_tail(&page->lru, list); if (IS_ENABLED(CONFIG_CMA)) { mt = get_pageblock_migratetype(page); - if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) + if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) mt = migratetype; } set_freepage_migratetype(page, mt); @@ -1333,7 +1335,7 @@ void free_hot_cold_page(struct page *page, int cold) * excessively into the page allocator */ if (migratetype >= 
MIGRATE_PCPTYPES) { - if (unlikely(migratetype == MIGRATE_ISOLATE)) { + if (unlikely(is_migrate_isolate(migratetype))) { free_one_page(zone, page, 0, migratetype); goto out; } @@ -1407,7 +1409,7 @@ static int __isolate_free_page(struct page *page, unsigned int order) zone = page_zone(page); mt = get_pageblock_migratetype(page); - if (mt != MIGRATE_ISOLATE) { + if (!is_migrate_isolate(mt)) { /* Obey watermarks as if the page was being allocated */ watermark = low_wmark_pages(zone) + (1 << order); if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) @@ -1426,7 +1428,7 @@ static int __isolate_free_page(struct page *page, unsigned int order) struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); - if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) + if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } @@ -2904,7 +2906,9 @@ static void show_migration_types(unsigned char type) #ifdef CONFIG_CMA [MIGRATE_CMA] = 'C', #endif +#ifdef CONFIG_MEMORY_ISOLATION [MIGRATE_ISOLATE] = 'I', +#endif }; char tmp[MIGRATE_TYPES + 1]; char *p = tmp; -- cgit v1.2.3 From 21caf2fc1931b485483ddd254b634fa8f0099963 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 22 Feb 2013 16:34:08 -0800 Subject: mm: teach mm by current context info to not do I/O during memory allocation This patch introduces PF_MEMALLOC_NOIO on process flag('flags' field of 'struct task_struct'), so that the flag can be set by one task to avoid doing I/O inside memory allocation in the task's context. 
The patch tries to solve one deadlock problem caused by block device, and the problem may happen at least in the below situations: - during block device runtime resume, if memory allocation with GFP_KERNEL is called inside runtime resume callback of any one of its ancestors(or the block device itself), the deadlock may be triggered inside the memory allocation since it might not complete until the block device becomes active and the involved page I/O finishes. The situation is pointed out first by Alan Stern. It is not a good approach to convert all GFP_KERNEL[1] in the path into GFP_NOIO because several subsystems may be involved(for example, PCI, USB and SCSI may be involved for usb mass storage device, network devices involved too in the iSCSI case) - during block device runtime suspend, because runtime resume needs to wait for completion of concurrent runtime suspend. - during error handling of usb mass storage device, USB bus reset will be put on the device, so there shouldn't be any memory allocation with GFP_KERNEL during USB bus reset, otherwise a deadlock similar to the above may be triggered. Unfortunately, any usb device may include one mass storage interface in theory, so it requires all usb interface drivers to handle the situation. In fact, most usb drivers don't know how to handle bus reset on the device and don't provide .pre_reset() and .post_reset() callback at all, so USB core has to unbind and bind driver for these devices. So it is still not practical to resort to GFP_NOIO for solving the problem. Also the introduced solution can be used by block subsystem or block drivers too, for example, set the PF_MEMALLOC_NOIO flag before doing actual I/O transfer. It is not a good idea to convert all these GFP_KERNEL in the affected path into GFP_NOIO because these functions doing that may be implemented as library and will be called in many other contexts.
In fact, memalloc_noio_flags() can convert some of current static GFP_NOIO allocation into GFP_KERNEL back in other non-affected contexts, at least almost all GFP_NOIO in USB subsystem can be converted into GFP_KERNEL after applying the approach and make allocation with GFP_NOIO only happen in runtime resume/bus reset/block I/O transfer contexts generally. [1], several GFP_KERNEL allocation examples in runtime resume path - pci subsystem acpi_os_allocate <-acpi_ut_allocate <-ACPI_ALLOCATE_ZEROED <-acpi_evaluate_object <-__acpi_bus_set_power <-acpi_bus_set_power <-acpi_pci_set_power_state <-platform_pci_set_power_state <-pci_platform_power_transition <-__pci_complete_power_transition <-pci_set_power_state <-pci_restore_standard_config <-pci_pm_runtime_resume - usb subsystem usb_get_status <-finish_port_resume <-usb_port_resume <-generic_resume <-usb_resume_device <-usb_resume_both <-usb_runtime_resume - some individual usb drivers usblp, uvc, gspca, most of dvb-usb-v2 media drivers, cpia2, az6007, .... That is just what I have found. Unfortunately, this allocation can only be found by human being now, and there should be many not found since any function in the resume path(call tree) may allocate memory with GFP_KERNEL. Signed-off-by: Ming Lei Signed-off-by: Minchan Kim Cc: Alan Stern Cc: Oliver Neukum Cc: Jiri Kosina Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Greg KH Cc: Jens Axboe Cc: "David S. 
Miller" Cc: Eric Dumazet Cc: David Decotigny Cc: Tom Herbert Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e3fb290194c0..3ede25e6686e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2624,10 +2624,17 @@ retry_cpuset: page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); - if (unlikely(!page)) + if (unlikely(!page)) { + /* + * Runtime PM, block IO and its error handling path + * can deadlock because I/O on the device might not + * complete. + */ + gfp_mask = memalloc_noio_flags(gfp_mask); page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + } trace_mm_page_alloc(page, order, gfp_mask, migratetype); -- cgit v1.2.3 From 22b751c3d0376e86a377e3a0aa2ddbbe9d2eefc1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 22 Feb 2013 16:34:59 -0800 Subject: mm: rename page struct field helpers The function names page_xchg_last_nid(), page_last_nid() and reset_page_last_nid() were judged to be inconsistent so rename them to a struct_field_op style pattern. As it looked jarring to have reset_page_mapcount() and page_nid_reset_last() beside each other in memmap_init_zone(), this patch also renames reset_page_mapcount() to page_mapcount_reset(). There are others like init_page_count() but as it is used throughout the arch code a rename would likely cause more conflicts than it is worth. 
[akpm@linux-foundation.org: fix zcache] Signed-off-by: Mel Gorman Suggested-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3ede25e6686e..445718b328b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -295,7 +295,7 @@ static void bad_page(struct page *page) /* Don't complain about poisoned pages */ if (PageHWPoison(page)) { - reset_page_mapcount(page); /* remove PageBuddy */ + page_mapcount_reset(page); /* remove PageBuddy */ return; } @@ -327,7 +327,7 @@ static void bad_page(struct page *page) dump_stack(); out: /* Leave bad fields for debug, except PageBuddy could make trouble */ - reset_page_mapcount(page); /* remove PageBuddy */ + page_mapcount_reset(page); /* remove PageBuddy */ add_taint(TAINT_BAD_PAGE); } @@ -613,7 +613,7 @@ static inline int free_pages_check(struct page *page) bad_page(page); return 1; } - reset_page_last_nid(page); + page_nid_reset_last(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -3894,8 +3894,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, set_page_links(page, zone, nid, pfn); mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); - reset_page_mapcount(page); - reset_page_last_nid(page); + page_mapcount_reset(page); + page_nid_reset_last(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for -- cgit v1.2.3 From 9c620e2bc5aa4256c102ada34e6c76204ed5898b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 22 Feb 2013 16:35:14 -0800 Subject: mm: remove offlining arg to migrate_pages No functional change, but the only purpose of the offlining argument to migrate_pages() etc, was to ensure that __unmap_and_move() could migrate a KSM page for memory hotremove (which took ksm_thread_mutex) but not 
for other callers. Now all cases are safe, remove the arg. Signed-off-by: Hugh Dickins Cc: Rik van Riel Cc: Petr Holasek Cc: Andrea Arcangeli Cc: Izik Eidus Cc: Gerald Schaefer Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 445718b328b6..64c83a8c3220 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6084,10 +6084,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, &cc->migratepages); cc->nr_migratepages -= nr_reclaimed; - ret = migrate_pages(&cc->migratepages, - alloc_migrate_target, - 0, false, MIGRATE_SYNC, - MR_CMA); + ret = migrate_pages(&cc->migratepages, alloc_migrate_target, + 0, MIGRATE_SYNC, MR_CMA); } if (ret < 0) { putback_movable_pages(&cc->migratepages); -- cgit v1.2.3 From 108bcc96ef7047c02cad4d229f04da38186a3f3f Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Fri, 22 Feb 2013 16:35:23 -0800 Subject: mm: add & use zone_end_pfn() and zone_spans_pfn() Add 2 helpers (zone_end_pfn() and zone_spans_pfn()) to reduce code duplication. This also switches to using them in compaction (where an additional variable needed to be renamed), page_alloc, vmstat, memory_hotplug, and kmemleak. Note that in compaction.c I avoid calling zone_end_pfn() repeatedly because I expect at some point the sycronization issues with start_pfn & spanned_pages will need fixing, either by actually using the seqlock or clever memory barrier usage. 
Signed-off-by: Cody P Schafer Cc: David Hansen Cc: Catalin Marinas Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 64c83a8c3220..a3687afc5917 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -250,9 +250,7 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) do { seq = zone_span_seqbegin(zone); - if (pfn >= zone->zone_start_pfn + zone->spanned_pages) - ret = 1; - else if (pfn < zone->zone_start_pfn) + if (!zone_spans_pfn(zone, pfn)) ret = 1; } while (zone_span_seqretry(zone, seq)); @@ -990,9 +988,9 @@ int move_freepages_block(struct zone *zone, struct page *page, end_pfn = start_pfn + pageblock_nr_pages - 1; /* Do not cross zone boundaries */ - if (start_pfn < zone->zone_start_pfn) + if (!zone_spans_pfn(zone, start_pfn)) start_page = page; - if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) + if (!zone_spans_pfn(zone, end_pfn)) return 0; return move_freepages(zone, start_page, end_page, migratetype); @@ -1286,7 +1284,7 @@ void mark_free_pages(struct zone *zone) spin_lock_irqsave(&zone->lock, flags); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); @@ -3798,7 +3796,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) * the block. */ start_pfn = zone->zone_start_pfn; - end_pfn = start_pfn + zone->spanned_pages; + end_pfn = zone_end_pfn(zone); start_pfn = roundup(start_pfn, pageblock_nr_pages); reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; @@ -3912,7 +3910,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * pfn out of zone. 
*/ if ((z->zone_start_pfn <= pfn) - && (pfn < z->zone_start_pfn + z->spanned_pages) + && (pfn < zone_end_pfn(z)) && !(pfn & (pageblock_nr_pages - 1))) set_pageblock_migratetype(page, MIGRATE_MOVABLE); @@ -4713,7 +4711,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) * for the buddy allocator to function correctly. */ start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); - end = pgdat->node_start_pfn + pgdat->node_spanned_pages; + end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); @@ -5928,8 +5926,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); - VM_BUG_ON(pfn < zone->zone_start_pfn); - VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); + VM_BUG_ON(!zone_spans_pfn(zone, pfn)); for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) if (flags & value) @@ -6027,8 +6024,7 @@ bool is_pageblock_removable_nolock(struct page *page) zone = page_zone(page); pfn = page_to_pfn(page); - if (zone->zone_start_pfn > pfn || - zone->zone_start_pfn + zone->spanned_pages <= pfn) + if (!zone_spans_pfn(zone, pfn)) return false; return !has_unmovable_pages(zone, page, 0, true); -- cgit v1.2.3 From d29bb9782d22063892e28716cdb76a87d2876ddb Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Fri, 22 Feb 2013 16:35:25 -0800 Subject: mm/page_alloc: add a VM_BUG in __free_one_page() if the zone is uninitialized. Freeing pages to uninitialized zones is not handled by __free_one_page(), and should never happen when the code is correct. Ran into this while writing some code that dynamically onlines extra zones. 
Signed-off-by: Cody P Schafer Cc: David Hansen Cc: Catalin Marinas Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a3687afc5917..9614aabee8c5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -538,6 +538,8 @@ static inline void __free_one_page(struct page *page, unsigned long uninitialized_var(buddy_idx); struct page *buddy; + VM_BUG_ON(!zone_is_initialized(zone)); + if (unlikely(PageCompound(page))) if (unlikely(destroy_compound_page(page, order))) return; -- cgit v1.2.3 From b5e6a5a2724bc9f0b121062ab730d48731ae83e3 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Fri, 22 Feb 2013 16:35:28 -0800 Subject: mm/page_alloc: add informative debugging message in page_outside_zone_boundaries() Add a debug message which prints when a page is found outside of the boundaries of the zone it should belong to. 
Format is: "page $pfn outside zone [ $start_pfn - $end_pfn ]" [akpm@linux-foundation.org: s/pr_debug/pr_err/] Signed-off-by: Cody P Schafer Cc: David Hansen Cc: Catalin Marinas Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9614aabee8c5..a40b2f1cac2f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -247,13 +247,20 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) int ret = 0; unsigned seq; unsigned long pfn = page_to_pfn(page); + unsigned long sp, start_pfn; do { seq = zone_span_seqbegin(zone); + start_pfn = zone->zone_start_pfn; + sp = zone->spanned_pages; if (!zone_spans_pfn(zone, pfn)) ret = 1; } while (zone_span_seqretry(zone, seq)); + if (ret) + pr_err("page %lu outside zone [ %lu - %lu ]\n", + pfn, start_pfn, start_pfn + sp); + return ret; } -- cgit v1.2.3 From 00ef2d2f84babb9b209f0fc003bc490c6bf1e6ef Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 22 Feb 2013 16:35:36 -0800 Subject: mm: use NUMA_NO_NODE Make a sweep through mm/ and convert code that uses -1 directly to using the more appropriate NUMA_NO_NODE. 
Signed-off-by: David Rientjes Reviewed-by: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a40b2f1cac2f..159f81577774 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3261,7 +3261,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; - int best_node = -1; + int best_node = NUMA_NO_NODE; const struct cpumask *tmp = cpumask_of_node(0); /* Use the local node if we haven't already */ -- cgit v1.2.3 From ebec3862fd6eefe8301aa55ed2e30c685d831842 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Fri, 22 Feb 2013 16:35:43 -0800 Subject: mm: fix return type for functions nr_free_*_pages Currently, the amount of RAM that functions nr_free_*_pages return is held in unsigned int. But in machines with big memory (exceeding 16TB), the amount may be incorrect because of overflow, so fix it. 
Signed-off-by: Zhang Yanfei Cc: Simon Horman Cc: Julian Anastasov Cc: David Miller Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Cc: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 159f81577774..276140654305 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2813,13 +2813,13 @@ void free_pages_exact(void *virt, size_t size) } EXPORT_SYMBOL(free_pages_exact); -static unsigned int nr_free_zone_pages(int offset) +static unsigned long nr_free_zone_pages(int offset) { struct zoneref *z; struct zone *zone; /* Just pick one node, since fallback list is circular */ - unsigned int sum = 0; + unsigned long sum = 0; struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); @@ -2836,7 +2836,7 @@ static unsigned int nr_free_zone_pages(int offset) /* * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL */ -unsigned int nr_free_buffer_pages(void) +unsigned long nr_free_buffer_pages(void) { return nr_free_zone_pages(gfp_zone(GFP_USER)); } @@ -2845,7 +2845,7 @@ EXPORT_SYMBOL_GPL(nr_free_buffer_pages); /* * Amount of free RAM allocatable within all zones */ -unsigned int nr_free_pagecache_pages(void) +unsigned long nr_free_pagecache_pages(void) { return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); } -- cgit v1.2.3 From e0fb58152955142f48ed31c8c0541b53e094da6b Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Fri, 22 Feb 2013 16:35:54 -0800 Subject: mm: accurately document nr_free_*_pages functions with code comments nr_free_zone_pages(), nr_free_buffer_pages() and nr_free_pagecache_pages() are horribly badly named, so accurately document them with code comments in case of the misuse of them. 
[akpm@linux-foundation.org: tweak comments] Reviewed-by: Randy Dunlap Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 276140654305..e9075fdef695 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2813,6 +2813,15 @@ void free_pages_exact(void *virt, size_t size) } EXPORT_SYMBOL(free_pages_exact); +/** + * nr_free_zone_pages - count number of pages beyond high watermark + * @offset: The zone index of the highest zone + * + * nr_free_zone_pages() counts the number of counts pages which are beyond the + * high watermark within all zones at or below a given zone index. For each + * zone, the number of pages is calculated as: + * present_pages - high_pages + */ static unsigned long nr_free_zone_pages(int offset) { struct zoneref *z; @@ -2833,8 +2842,11 @@ static unsigned long nr_free_zone_pages(int offset) return sum; } -/* - * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL +/** + * nr_free_buffer_pages - count number of pages beyond high watermark + * + * nr_free_buffer_pages() counts the number of pages which are beyond the high + * watermark within ZONE_DMA and ZONE_NORMAL. */ unsigned long nr_free_buffer_pages(void) { @@ -2842,8 +2854,11 @@ unsigned long nr_free_buffer_pages(void) } EXPORT_SYMBOL_GPL(nr_free_buffer_pages); -/* - * Amount of free RAM allocatable within all zones +/** + * nr_free_pagecache_pages - count number of pages beyond high watermark + * + * nr_free_pagecache_pages() counts the number of pages which are beyond the + * high watermark within all zones. 
*/ unsigned long nr_free_pagecache_pages(void) { -- cgit v1.2.3 From 20e6926dcbafa1b361f1c29d967688be14b6ca4b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 1 Mar 2013 14:51:27 -0800 Subject: x86, ACPI, mm: Revert movablemem_map support Tim found: WARNING: at arch/x86/kernel/smpboot.c:324 topology_sane.isra.2+0x6f/0x80() Hardware name: S2600CP sched: CPU #1's llc-sibling CPU #0 is not on the same node! [node: 1 != 0]. Ignoring dependency. smpboot: Booting Node 1, Processors #1 Modules linked in: Pid: 0, comm: swapper/1 Not tainted 3.9.0-0-generic #1 Call Trace: set_cpu_sibling_map+0x279/0x449 start_secondary+0x11d/0x1e5 Don Morris reproduced on a HP z620 workstation, and bisected it to commit e8d195525809 ("acpi, memory-hotplug: parse SRAT before memblock is ready") It turns out movable_map has some problems, and it breaks several things 1. numa_init is called several times, NOT just for srat. so those nodes_clear(numa_nodes_parsed) memset(&numa_meminfo, 0, sizeof(numa_meminfo)) can not be just removed. Need to consider sequence is: numaq, srat, amd, dummy. and make fall back path working. 2. simply split acpi_numa_init to early_parse_srat. a. that early_parse_srat is NOT called for ia64, so you break ia64. b. for (i = 0; i < MAX_LOCAL_APIC; i++) set_apicid_to_node(i, NUMA_NO_NODE) still left in numa_init. So it will just clear result from early_parse_srat. it should be moved before that.... c. it breaks ACPI_TABLE_OVERIDE...as the acpi table scan is moved early before override from INITRD is settled. 3. that patch TITLE is total misleading, there is NO x86 in the title, but it changes critical x86 code. It caused x86 guys did not pay attention to find the problem early. Those patches really should be routed via tip/x86/mm. 4. after that commit, following range can not use movable ram: a. real_mode code.... well..funny, legacy Node0 [0,1M) could be hot-removed? b. initrd... it will be freed after booting, so it could be on movable... c. 
crashkernel for kdump...: looks like we can not put kdump kernel above 4G anymore. d. init_mem_mapping: can not put page table high anymore. e. initmem_init: vmemmap can not be high local node anymore. That is not good. If node is hotpluggable, the mem related range like page table and vmemmap could be on that node without problem and should be on that node. We have workaround patch that could fix some problems, but some can not be fixed. So just remove that offending commit and related ones including: f7210e6c4ac7 ("mm/memblock.c: use CONFIG_HAVE_MEMBLOCK_NODE_MAP to protect movablecore_map in memblock_overlaps_region().") 01a178a94e8e ("acpi, memory-hotplug: support getting hotplug info from SRAT") 27168d38fa20 ("acpi, memory-hotplug: extend movablemem_map ranges to the end of node") e8d195525809 ("acpi, memory-hotplug: parse SRAT before memblock is ready") fb06bc8e5f42 ("page_alloc: bootmem limit with movablecore_map") 42f47e27e761 ("page_alloc: make movablemem_map have higher priority") 6981ec31146c ("page_alloc: introduce zone_movable_limit[] to keep movable limit for nodes") 34b71f1e04fc ("page_alloc: add movable_memmap kernel parameter") 4d59a75125d5 ("x86: get pg_data_t's memory from other node") Later we should have patches that will make sure the kernel puts page table and vmemmap on local node ram instead of pushing them down to node0. Also need to find way to put other kernel used ram to local node ram.
Reported-by: Tim Gardner Reported-by: Don Morris Bisected-by: Don Morris Tested-by: Don Morris Signed-off-by: Yinghai Lu Cc: Tony Luck Cc: Thomas Renninger Cc: Tejun Heo Cc: Tang Chen Cc: Yasuaki Ishimatsu Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 285 +------------------------------------------------------- 1 file changed, 5 insertions(+), 280 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0dade3f18f7d..8fcced7823fa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -202,18 +202,11 @@ static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -/* Movable memory ranges, will also be used by memblock subsystem. */ -struct movablemem_map movablemem_map = { - .acpi = false, - .nr_map = 0, -}; - static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; -static unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES]; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -4412,77 +4405,6 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); } -/** - * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array. - * - * zone_movable_limit is initialized as 0. This function will try to get - * the first ZONE_MOVABLE pfn of each node from movablemem_map, and - * assigne them to zone_movable_limit. - * zone_movable_limit[nid] == 0 means no limit for the node. 
- * - * Note: Each range is represented as [start_pfn, end_pfn) - */ -static void __meminit sanitize_zone_movable_limit(void) -{ - int map_pos = 0, i, nid; - unsigned long start_pfn, end_pfn; - - if (!movablemem_map.nr_map) - return; - - /* Iterate all ranges from minimum to maximum */ - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { - /* - * If we have found lowest pfn of ZONE_MOVABLE of the node - * specified by user, just go on to check next range. - */ - if (zone_movable_limit[nid]) - continue; - -#ifdef CONFIG_ZONE_DMA - /* Skip DMA memory. */ - if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA]) - start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA]; -#endif - -#ifdef CONFIG_ZONE_DMA32 - /* Skip DMA32 memory. */ - if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA32]) - start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA32]; -#endif - -#ifdef CONFIG_HIGHMEM - /* Skip lowmem if ZONE_MOVABLE is highmem. */ - if (zone_movable_is_highmem() && - start_pfn < arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]) - start_pfn = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]; -#endif - - if (start_pfn >= end_pfn) - continue; - - while (map_pos < movablemem_map.nr_map) { - if (end_pfn <= movablemem_map.map[map_pos].start_pfn) - break; - - if (start_pfn >= movablemem_map.map[map_pos].end_pfn) { - map_pos++; - continue; - } - - /* - * The start_pfn of ZONE_MOVABLE is either the minimum - * pfn specified by movablemem_map, or 0, which means - * the node has no ZONE_MOVABLE. 
- */ - zone_movable_limit[nid] = max(start_pfn, - movablemem_map.map[map_pos].start_pfn); - - break; - } - } -} - #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, @@ -4500,6 +4422,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, return zholes_size[zone_type]; } + #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, @@ -4941,19 +4864,12 @@ static void __init find_zone_movable_pfns_for_nodes(void) required_kernelcore = max(required_kernelcore, corepages); } - /* - * If neither kernelcore/movablecore nor movablemem_map is specified, - * there is no ZONE_MOVABLE. But if movablemem_map is specified, the - * start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[]. - */ - if (!required_kernelcore) { - if (movablemem_map.nr_map) - memcpy(zone_movable_pfn, zone_movable_limit, - sizeof(zone_movable_pfn)); + /* If kernelcore was not specified, there is no ZONE_MOVABLE */ + if (!required_kernelcore) goto out; - } /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ + find_usable_zone_for_movable(); usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; restart: @@ -4981,24 +4897,10 @@ restart: for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { unsigned long size_pages; - /* - * Find more memory for kernelcore in - * [zone_movable_pfn[nid], zone_movable_limit[nid]). 
- */ start_pfn = max(start_pfn, zone_movable_pfn[nid]); if (start_pfn >= end_pfn) continue; - if (zone_movable_limit[nid]) { - end_pfn = min(end_pfn, zone_movable_limit[nid]); - /* No range left for kernelcore in this node */ - if (start_pfn >= end_pfn) { - zone_movable_pfn[nid] = - zone_movable_limit[nid]; - break; - } - } - /* Account for what is only usable for kernelcore */ if (start_pfn < usable_startpfn) { unsigned long kernel_pages; @@ -5058,12 +4960,12 @@ restart: if (usable_nodes && required_kernelcore > usable_nodes) goto restart; -out: /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); +out: /* restore the node_state */ node_states[N_MEMORY] = saved_node_state; } @@ -5126,8 +5028,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Find the PFNs that ZONE_MOVABLE begins at in each node */ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); - find_usable_zone_for_movable(); - sanitize_zone_movable_limit(); find_zone_movable_pfns_for_nodes(); /* Print out the zone ranges */ @@ -5211,181 +5111,6 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); -/** - * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[]. - * @start_pfn: start pfn of the range to be checked - * @end_pfn: end pfn of the range to be checked (exclusive) - * - * This function checks if a given memory range [start_pfn, end_pfn) overlaps - * the movablemem_map.map[] array. - * - * Return: index of the first overlapped element in movablemem_map.map[] - * or -1 if they don't overlap each other. 
- */ -int __init movablemem_map_overlap(unsigned long start_pfn, - unsigned long end_pfn) -{ - int overlap; - - if (!movablemem_map.nr_map) - return -1; - - for (overlap = 0; overlap < movablemem_map.nr_map; overlap++) - if (start_pfn < movablemem_map.map[overlap].end_pfn) - break; - - if (overlap == movablemem_map.nr_map || - end_pfn <= movablemem_map.map[overlap].start_pfn) - return -1; - - return overlap; -} - -/** - * insert_movablemem_map - Insert a memory range in to movablemem_map.map. - * @start_pfn: start pfn of the range - * @end_pfn: end pfn of the range - * - * This function will also merge the overlapped ranges, and sort the array - * by start_pfn in monotonic increasing order. - */ -void __init insert_movablemem_map(unsigned long start_pfn, - unsigned long end_pfn) -{ - int pos, overlap; - - /* - * pos will be at the 1st overlapped range, or the position - * where the element should be inserted. - */ - for (pos = 0; pos < movablemem_map.nr_map; pos++) - if (start_pfn <= movablemem_map.map[pos].end_pfn) - break; - - /* If there is no overlapped range, just insert the element. */ - if (pos == movablemem_map.nr_map || - end_pfn < movablemem_map.map[pos].start_pfn) { - /* - * If pos is not the end of array, we need to move all - * the rest elements backward. - */ - if (pos < movablemem_map.nr_map) - memmove(&movablemem_map.map[pos+1], - &movablemem_map.map[pos], - sizeof(struct movablemem_entry) * - (movablemem_map.nr_map - pos)); - movablemem_map.map[pos].start_pfn = start_pfn; - movablemem_map.map[pos].end_pfn = end_pfn; - movablemem_map.nr_map++; - return; - } - - /* overlap will be at the last overlapped range */ - for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++) - if (end_pfn < movablemem_map.map[overlap].start_pfn) - break; - - /* - * If there are more ranges overlapped, we need to merge them, - * and move the rest elements forward. 
- */ - overlap--; - movablemem_map.map[pos].start_pfn = min(start_pfn, - movablemem_map.map[pos].start_pfn); - movablemem_map.map[pos].end_pfn = max(end_pfn, - movablemem_map.map[overlap].end_pfn); - - if (pos != overlap && overlap + 1 != movablemem_map.nr_map) - memmove(&movablemem_map.map[pos+1], - &movablemem_map.map[overlap+1], - sizeof(struct movablemem_entry) * - (movablemem_map.nr_map - overlap - 1)); - - movablemem_map.nr_map -= overlap - pos; -} - -/** - * movablemem_map_add_region - Add a memory range into movablemem_map. - * @start: physical start address of range - * @end: physical end address of range - * - * This function transform the physical address into pfn, and then add the - * range into movablemem_map by calling insert_movablemem_map(). - */ -static void __init movablemem_map_add_region(u64 start, u64 size) -{ - unsigned long start_pfn, end_pfn; - - /* In case size == 0 or start + size overflows */ - if (start + size <= start) - return; - - if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) { - pr_err("movablemem_map: too many entries;" - " ignoring [mem %#010llx-%#010llx]\n", - (unsigned long long) start, - (unsigned long long) (start + size - 1)); - return; - } - - start_pfn = PFN_DOWN(start); - end_pfn = PFN_UP(start + size); - insert_movablemem_map(start_pfn, end_pfn); -} - -/* - * cmdline_parse_movablemem_map - Parse boot option movablemem_map. - * @p: The boot option of the following format: - * movablemem_map=nn[KMG]@ss[KMG] - * - * This option sets the memory range [ss, ss+nn) to be used as movable memory. - * - * Return: 0 on success or -EINVAL on failure. - */ -static int __init cmdline_parse_movablemem_map(char *p) -{ - char *oldp; - u64 start_at, mem_size; - - if (!p) - goto err; - - if (!strcmp(p, "acpi")) - movablemem_map.acpi = true; - - /* - * If user decide to use info from BIOS, all the other user specified - * ranges will be ingored. 
- */ - if (movablemem_map.acpi) { - if (movablemem_map.nr_map) { - memset(movablemem_map.map, 0, - sizeof(struct movablemem_entry) - * movablemem_map.nr_map); - movablemem_map.nr_map = 0; - } - return 0; - } - - oldp = p; - mem_size = memparse(p, &p); - if (p == oldp) - goto err; - - if (*p == '@') { - oldp = ++p; - start_at = memparse(p, &p); - if (p == oldp || *p != '\0') - goto err; - - movablemem_map_add_region(start_at, mem_size); - return 0; - } -err: - return -EINVAL; -} -early_param("movablemem_map", cmdline_parse_movablemem_map); - #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ /** -- cgit v1.2.3