From 5dfe8660a3d7f1ee1265c3536433ee53da3f98a3 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 14 Jul 2011 09:46:10 +0200
Subject: bootmem: Replace work_with_active_regions() with for_each_mem_pfn_range()

Callback-based iteration is cumbersome and much less useful than a
for_each_*() iterator.  This patch implements for_each_mem_pfn_range()
which replaces work_with_active_regions().  All current users of
work_with_active_regions() are converted.

This simplifies walking over early_node_map and will allow converting
the internal logic in page_alloc to use the iterator instead of walking
early_node_map directly, which in turn will enable moving node
information to memblock.

The powerpc change is only compile tested.

Signed-off-by: Tejun Heo
Link: http://lkml.kernel.org/r/20110714074610.GD3455@htj.dyndns.org
Cc: Yinghai Lu
Cc: Benjamin Herrenschmidt
Signed-off-by: H. Peter Anvin
---
 include/linux/mm.h | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c70a326b8f26..57e4c9ffdff8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1327,9 +1327,27 @@ int add_from_early_node_map(struct range *range, int az,
 					int nr_range, int nid);
 u64 __init find_memory_core_early(int nid, u64 size, u64 align,
 					u64 goal, u64 limit);
-typedef int (*work_fn_t)(unsigned long, unsigned long, void *);
-extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
 extern void sparse_memory_present_with_active_regions(int nid);
+
+extern void __next_mem_pfn_range(int *idx, int nid,
+				 unsigned long *out_start_pfn,
+				 unsigned long *out_end_pfn, int *out_nid);
+
+/**
+ * for_each_mem_pfn_range - early memory pfn range iterator
+ * @i: an integer used as loop variable
+ * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @p_start: ptr to ulong for start pfn of the range, can be %NULL
+ * @p_end: ptr to ulong for end pfn of the range, can be %NULL
+ * @p_nid: ptr to int for nid of the range, can be %NULL
+ *
+ * Walks over configured memory ranges.  Available after early_node_map is
+ * populated.
+ */
+#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid)		\
+	for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
+	     i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
+
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */

 #if !defined(CONFIG_ARCH_POPULATES_NODE_MAP) && \
--
cgit v1.2.3
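For illustration, this is how the new iterator reads at a call site (a
hedged sketch: count_early_pages() is hypothetical, not part of the
patch, and only assumes early_node_map has been populated):

	/*
	 * Hypothetical example: total up all page frames registered in
	 * early_node_map across every node.  MAX_NUMNODES as the node
	 * selector walks ranges on all nodes; the nid output pointer is
	 * NULL because it is not needed here.
	 */
	static unsigned long __init count_early_pages(void)
	{
		unsigned long start_pfn, end_pfn, nr_pages = 0;
		int i;

		for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL)
			nr_pages += end_pfn - start_pfn;

		return nr_pages;
	}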
From eb40c4c27f1722f058e4713ccfedebac577d5190 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 12 Jul 2011 10:46:35 +0200
Subject: memblock, x86: Replace memblock_x86_find_in_range_node() with generic memblock calls

With the previous changes, the generic NUMA-aware memblock API has
feature parity with memblock_x86_find_in_range_node().  There currently
are two users - x86 setup_node_data() and __alloc_memory_core_early()
in nobootmem.c.

This patch converts the former to use memblock_alloc_nid() and the
latter memblock_find_in_range_node(), and kills
memblock_x86_find_in_range_node() and related functions including
find_memory_core_early() in page_alloc.c.

Signed-off-by: Tejun Heo
Link: http://lkml.kernel.org/r/1310460395-30913-9-git-send-email-tj@kernel.org
Cc: Yinghai Lu
Cc: Benjamin Herrenschmidt
Cc: Thomas Gleixner
Cc: Ingo Molnar
Cc: "H. Peter Anvin"
Signed-off-by: H. Peter Anvin
---
 include/linux/mm.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 57e4c9ffdff8..9ebc65ae6863 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1325,8 +1325,6 @@ extern void free_bootmem_with_active_regions(int nid,
 						unsigned long max_low_pfn);
 int add_from_early_node_map(struct range *range, int az,
 					int nr_range, int nid);
-u64 __init find_memory_core_early(int nid, u64 size, u64 align,
-					u64 goal, u64 limit);
 extern void sparse_memory_present_with_active_regions(int nid);

 extern void __next_mem_pfn_range(int *idx, int nid,
--
cgit v1.2.3

From 7c0caeb866b0f648d91bb75b8bc6f86af95bb033 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 14 Jul 2011 11:43:42 +0200
Subject: memblock: Add optional region->nid

Add an optional region->nid which can be enabled by an arch using
CONFIG_HAVE_MEMBLOCK_NODE_MAP.  When enabled, memblock also carries
NUMA node information and replaces early_node_map[].

Newly added memblocks have MAX_NUMNODES as nid.  The arch can then call
memblock_set_node() to set node information.  memblock takes care of
merging and node-affine allocations w.r.t. node information.

When MEMBLOCK_NODE_MAP is enabled, early_node_map[], related data
structures and the functions that manipulate and iterate it are
disabled.  A memblock version of __next_mem_pfn_range() is provided so
that for_each_mem_pfn_range() behaves the same and its users don't have
to be updated.

-v2: Yinghai spotted a section mismatch caused by a missing
     __init_memblock in memblock_set_node().  Fixed.

Signed-off-by: Tejun Heo
Link: http://lkml.kernel.org/r/20110714094342.GF3455@htj.dyndns.org
Cc: Yinghai Lu
Cc: Benjamin Herrenschmidt
Signed-off-by: H. Peter Anvin
---
 include/linux/mm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9ebc65ae6863..ceb1e4a1a736 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1307,12 +1307,14 @@ extern void free_area_init_node(int nid, unsigned long * zones_size,
  * CONFIG_ARCH_POPULATES_NODE_MAP
  */
 extern void free_area_init_nodes(unsigned long *max_zone_pfn);
+#ifndef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 extern void add_active_range(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
 extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
 extern void remove_all_active_ranges(void);
 void sort_node_map(void);
+#endif
 unsigned long node_map_pfn_alignment(void);
 unsigned long __absent_pages_in_range(int nid,
 				unsigned long start_pfn, unsigned long end_pfn);
--
cgit v1.2.3
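The intended arch-side usage looks roughly like this (a hedged sketch:
the function, base and size values are made up for illustration, and
the three-argument memblock_set_node() is the form this series adds):

	/*
	 * Hypothetical early arch code: register a range with memblock,
	 * then attach NUMA node information to it.  Newly added regions
	 * start out with nid == MAX_NUMNODES until the arch sets it.
	 */
	void __init example_register_node_memory(void)
	{
		phys_addr_t base = 0x100000000ULL;	/* assumed node 1 base */
		phys_addr_t size = 0x40000000ULL;	/* assumed 1G on node 1 */

		memblock_add(base, size);
		memblock_set_node(base, size, 1);	/* range belongs to node 1 */
	}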
From 54c29c635ae91f5d75ced7bffeaa77ba37ca02bb Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka
Date: Tue, 29 Nov 2011 17:05:11 +0100
Subject: mm, x86: Remove debug_pagealloc_enabled

When (no)bootmem finishes its work, it passes pages to the buddy
allocator.  Since debug_pagealloc_enabled is not yet set at that point,
those pages are not protected, which is not what we want with
CONFIG_DEBUG_PAGEALLOC=y.  Fix this by removing debug_pagealloc_enabled.

That variable was introduced by commit 12d6f21e ("x86: do not PSE on
CONFIG_DEBUG_PAGEALLOC=y") to get more CPA (change page attribute) code
testing.  But we now have CONFIG_CPA_DEBUG, which tests CPA.

Signed-off-by: Stanislaw Gruszka
Acked-by: Mel Gorman
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/1322582711-14571-1-git-send-email-sgruszka@redhat.com
Signed-off-by: Ingo Molnar
---
 include/linux/mm.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3dc3a8c2c485..0a22db144753 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1537,23 +1537,13 @@ static inline void vm_stat_account(struct mm_struct *mm,
 #endif /* CONFIG_PROC_FS */

 #ifdef CONFIG_DEBUG_PAGEALLOC
-extern int debug_pagealloc_enabled;
-
 extern void kernel_map_pages(struct page *page, int numpages, int enable);
-
-static inline void enable_debug_pagealloc(void)
-{
-	debug_pagealloc_enabled = 1;
-}
 #ifdef CONFIG_HIBERNATION
 extern bool kernel_page_present(struct page *page);
 #endif /* CONFIG_HIBERNATION */
 #else
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable) {}
-static inline void enable_debug_pagealloc(void)
-{
-}
 #ifdef CONFIG_HIBERNATION
 static inline bool kernel_page_present(struct page *page) { return true; }
 #endif /* CONFIG_HIBERNATION */
--
cgit v1.2.3
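For context, a minimal sketch of the mechanism this relies on (the two
wrappers below are hypothetical, not kernel code): with the flag gone,
kernel_map_pages() takes effect for every page entering or leaving the
buddy allocator whenever CONFIG_DEBUG_PAGEALLOC is set.

	/*
	 * Hypothetical wrappers illustrating the intended behaviour:
	 * freed pages are unmapped from the kernel linear mapping so any
	 * stray access faults immediately, and pages are mapped back on
	 * allocation.
	 */
	static inline void debug_unmap_on_free(struct page *page, int order)
	{
		kernel_map_pages(page, 1 << order, 0);	/* enable == 0: unmap */
	}

	static inline void debug_map_on_alloc(struct page *page, int order)
	{
		kernel_map_pages(page, 1 << order, 1);	/* enable == 1: map */
	}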
From 0ee332c1451869963626bf9cac88f165a90990e1 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 8 Dec 2011 10:22:09 -0800
Subject: memblock: Kill early_node_map[]

Now all ARCH_POPULATES_NODE_MAP archs select HAVE_MEMBLOCK_NODE_MAP -
there's no user of early_node_map[] left.  Kill early_node_map[] and
replace ARCH_POPULATES_NODE_MAP with HAVE_MEMBLOCK_NODE_MAP.  Also,
relocate for_each_mem_pfn_range() and its helper from mm.h to
memblock.h as page_alloc.c would no longer host an alternative
implementation.

This change is ultimately a one-to-one mapping and shouldn't cause any
observable difference; however, after the recent changes, there are
some functions which now would fit memblock.c better than page_alloc.c,
and dependency on HAVE_MEMBLOCK_NODE_MAP instead of HAVE_MEMBLOCK
doesn't make much sense for some of them.  Further cleanups for
functions inside HAVE_MEMBLOCK_NODE_MAP in mm.h would be nice.

-v2: Fix compile bug introduced by mis-spelling
     CONFIG_HAVE_MEMBLOCK_NODE_MAP to CONFIG_MEMBLOCK_HAVE_NODE_MAP in
     mmzone.h.  Reported by Stephen Rothwell.

Signed-off-by: Tejun Heo
Cc: Stephen Rothwell
Cc: Benjamin Herrenschmidt
Cc: Yinghai Lu
Cc: Tony Luck
Cc: Ralf Baechle
Cc: Martin Schwidefsky
Cc: Chen Liqin
Cc: Paul Mundt
Cc: "David S. Miller"
Cc: "H. Peter Anvin"
---
 include/linux/mm.h | 50 +++++++++++---------------------------------------
 1 file changed, 11 insertions(+), 39 deletions(-)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6b365aee8396..c6f49bea52a3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1252,43 +1252,34 @@ static inline void pgtable_page_dtor(struct page *page)
 extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, unsigned long * zones_size,
 		unsigned long zone_start_pfn, unsigned long *zholes_size);
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 /*
- * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its
+ * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its
  * zones, allocate the backing mem_map and account for memory holes in a more
  * architecture independent manner. This is a substitute for creating the
  * zone_sizes[] and zholes_size[] arrays and passing them to
  * free_area_init_node()
  *
  * An architecture is expected to register range of page frames backed by
- * physical memory with add_active_range() before calling
+ * physical memory with memblock_add[_node]() before calling
  * free_area_init_nodes() passing in the PFN each zone ends at. At a basic
  * usage, an architecture is expected to do something like
  *
  * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
  * 							 max_highmem_pfn};
  * for_each_valid_physical_page_range()
- * 	add_active_range(node_id, start_pfn, end_pfn)
+ * 	memblock_add_node(base, size, nid)
  * free_area_init_nodes(max_zone_pfns);
  *
- * If the architecture guarantees that there are no holes in the ranges
- * registered with add_active_range(), free_bootmem_active_regions()
- * will call free_bootmem_node() for each registered physical page range.
- * Similarly sparse_memory_present_with_active_regions() calls
- * memory_present() for each range when SPARSEMEM is enabled.
+ * free_bootmem_with_active_regions() calls free_bootmem_node() for each
+ * registered physical page range.  Similarly
+ * sparse_memory_present_with_active_regions() calls memory_present() for
+ * each range when SPARSEMEM is enabled.
  *
  * See mm/page_alloc.c for more information on each function exposed by
- * CONFIG_ARCH_POPULATES_NODE_MAP
+ * CONFIG_HAVE_MEMBLOCK_NODE_MAP.
  */
 extern void free_area_init_nodes(unsigned long *max_zone_pfn);
-#ifndef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-extern void add_active_range(unsigned int nid, unsigned long start_pfn,
-					unsigned long end_pfn);
-extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
-					unsigned long end_pfn);
-extern void remove_all_active_ranges(void);
-void sort_node_map(void);
-#endif
 unsigned long node_map_pfn_alignment(void);
 unsigned long __absent_pages_in_range(int nid,
 				unsigned long start_pfn, unsigned long end_pfn);
@@ -1303,28 +1294,9 @@ int add_from_early_node_map(struct range *range, int az,
 					int nr_range, int nid);
 extern void sparse_memory_present_with_active_regions(int nid);

-extern void __next_mem_pfn_range(int *idx, int nid,
-				 unsigned long *out_start_pfn,
-				 unsigned long *out_end_pfn, int *out_nid);
-
-/**
- * for_each_mem_pfn_range - early memory pfn range iterator
- * @i: an integer used as loop variable
- * @nid: node selector, %MAX_NUMNODES for all nodes
- * @p_start: ptr to ulong for start pfn of the range, can be %NULL
- * @p_end: ptr to ulong for end pfn of the range, can be %NULL
- * @p_nid: ptr to int for nid of the range, can be %NULL
- *
- * Walks over configured memory ranges.  Available after early_node_map is
- * populated.
- */
-#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid)		\
-	for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
-	     i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
-
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */

-#if !defined(CONFIG_ARCH_POPULATES_NODE_MAP) && \
+#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
     !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID)
 static inline int __early_pfn_to_nid(unsigned long pfn)
 {
--
cgit v1.2.3
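To make the updated comment concrete, here is a hedged sketch of the
expected boot-time sequence on a converted arch (the range-discovery
loop, its helpers and the zone limit variables are hypothetical
placeholders, not code from this patch):

	/*
	 * Hypothetical arch setup following the new convention: register
	 * all RAM with memblock, including node information, then hand
	 * the zone boundary PFNs to free_area_init_nodes().
	 */
	void __init example_paging_init(void)
	{
		unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };
		phys_addr_t base, size;
		int i, nid;

		for (i = 0; i < nr_detected_ranges; i++) {	/* assumed firmware table */
			detected_range(i, &base, &size, &nid);	/* hypothetical helper */
			memblock_add_node(base, size, nid);
		}

		max_zone_pfns[ZONE_DMA]    = max_dma_pfn;	/* assumed arch limits */
		max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
		free_area_init_nodes(max_zone_pfns);
	}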
From 83aeeada7c69f35e5100b27ec354335597a7a488 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Thu, 8 Dec 2011 14:33:54 -0800
Subject: vmscan: use atomic-long for shrinker batching

Use atomic-long operations instead of looping around cmpxchg().

[akpm@linux-foundation.org: massage atomic.h inclusions]
Signed-off-by: Konstantin Khlebnikov
Cc: Dave Chinner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3dc3a8c2c485..4baadd18f4ad 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -10,6 +10,7 @@
 #include <linux/mmzone.h>
 #include <linux/rbtree.h>
 #include <linux/prio_tree.h>
+#include <linux/atomic.h>
 #include <linux/debug_locks.h>
 #include <linux/mm_types.h>
 #include <linux/range.h>
--
cgit v1.2.3

From c0a32fc5a2e470d0b02597b23ad79a317735253e Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka
Date: Tue, 10 Jan 2012 15:07:28 -0800
Subject: mm: more intensive memory corruption debugging

With CONFIG_DEBUG_PAGEALLOC configured, the CPU will generate an
exception on access (read, write) to an unallocated page, which permits
us to catch code which corrupts memory.  However, the kernel tries to
maximise memory usage, hence there are usually few free pages in the
system and buggy code usually corrupts some crucial data.

This patch changes the buddy allocator to keep more free/protected
pages and to interlace free/protected and allocated pages to increase
the probability of catching corruption.

When the kernel is compiled with CONFIG_DEBUG_PAGEALLOC,
debug_guardpage_minorder defines the minimum order used by the page
allocator to grant a request.  The requested size will be returned with
the remaining pages used as guard pages.

The default value of debug_guardpage_minorder is zero: no change from
current behaviour.

[akpm@linux-foundation.org: tweak documentation, s/flg/flag/]
Signed-off-by: Stanislaw Gruszka
Cc: Mel Gorman
Cc: Andrea Arcangeli
Cc: "Rafael J. Wysocki"
Cc: Christoph Lameter
Cc: Pekka Enberg
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mm.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5d9b4c9813bd..5568553a41fd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1618,5 +1618,22 @@ extern void copy_user_huge_page(struct page *dst, struct page *src,
 				unsigned int pages_per_huge_page);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

+#ifdef CONFIG_DEBUG_PAGEALLOC
+extern unsigned int _debug_guardpage_minorder;
+
+static inline unsigned int debug_guardpage_minorder(void)
+{
+	return _debug_guardpage_minorder;
+}
+
+static inline bool page_is_guard(struct page *page)
+{
+	return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline unsigned int debug_guardpage_minorder(void) { return 0; }
+static inline bool page_is_guard(struct page *page) { return false; }
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
--
cgit v1.2.3
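A hedged sketch of how these helpers are meant to be consumed (the two
functions below are illustrative, not the actual page_alloc.c changes):

	/*
	 * Hypothetical consumers: an allocation path rounds small
	 * requests up to the configured minimum order so the remainder
	 * of the block can be kept as protected guard pages, and any
	 * code walking page ranges must treat guard pages as off limits.
	 */
	static inline unsigned int example_request_order(unsigned int order)
	{
		return max(order, debug_guardpage_minorder());
	}

	static inline bool example_page_usable(struct page *page)
	{
		return !page_is_guard(page);	/* guard pages trap all access */
	}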
From 640708a2cff7f81e246243b0073c66e6ece7e53e Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov
Date: Tue, 10 Jan 2012 15:11:23 -0800
Subject: procfs: introduce the /proc/<pid>/map_files/ directory

This one behaves similarly to the /proc/<pid>/fd/ one - it contains
symlinks, one for each mapping backed by a file; the name of a symlink
is "vma->vm_start-vma->vm_end", and the target is the file.  Opening a
symlink results in a file that points to exactly the same inode as the
vma's one.

For example, the ls -l of some arbitrary /proc/<pid>/map_files/:

 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This *helps* the checkpointing process in three ways:

1. When dumping a task's mappings we know the exact file that is mapped
   by a particular region.  We do this by opening the
   /proc/$pid/map_files/$address symlink the way we do with file
   descriptors.

2. This also helps in determining which anonymous shared mappings are
   shared with each other, by comparing their inodes.

3. When restoring a set of processes in which two of them share a
   mapping, we map the memory in the first one, then open its
   /proc/$pid/map_files/$address file and map it in the second task.

Using /proc/$pid/maps for this is quite inconvenient since it requires
repeatedly re-reading and re-parsing this text file, which slows down
the restore procedure significantly.  Also, as pointed out in (3), it
is much easier to use the top-level shared mapping in children via
/proc/$pid/map_files/$address when needed.

[akpm@linux-foundation.org: coding-style fixes]
[gorcunov@openvz.org: make map_files depend on CHECKPOINT_RESTORE]
Signed-off-by: Pavel Emelyanov
Signed-off-by: Cyrill Gorcunov
Reviewed-by: Vasiliy Kulikov
Reviewed-by: "Kirill A. Shutemov"
Cc: Tejun Heo
Cc: Alexey Dobriyan
Cc: Al Viro
Cc: Pavel Machek
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mm.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5568553a41fd..6eba2cc016c9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1482,6 +1482,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }

+/* Look up the first VMA which exactly matches the interval vm_start ... vm_end */
+static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
+				unsigned long vm_start, unsigned long vm_end)
+{
+	struct vm_area_struct *vma = find_vma(mm, vm_start);
+
+	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+		vma = NULL;
+
+	return vma;
+}
+
 #ifdef CONFIG_MMU
 pgprot_t vm_get_page_prot(unsigned long vm_flags);
 #else
--
cgit v1.2.3
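To illustrate the role of the new helper, here is a hedged sketch of
how a map_files-style lookup can use it (the function is hypothetical,
not the procfs code from this patch, and assumes the caller holds
mm->mmap_sem):

	/*
	 * Hypothetical lookup: parse a "start-end" link name, then demand
	 * an exact VMA match so a stale name never resolves to a
	 * different mapping that merely contains the start address.
	 */
	static struct vm_area_struct *example_map_files_lookup(struct mm_struct *mm,
							       const char *name)
	{
		unsigned long start, end;

		if (sscanf(name, "%lx-%lx", &start, &end) != 2)
			return NULL;

		return find_exact_vma(mm, start, end);	/* NULL unless exact */
	}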