From cd099682e4c786c3a866e462b37fcac6e3a44a68 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 22 Feb 2013 16:33:02 -0800 Subject: memory-hotplug: move pgdat_resize_lock into sparse_remove_one_section() In __remove_section(), we locked pgdat_resize_lock when calling sparse_remove_one_section(). This lock will disable irq. But we don't need to lock the whole function. If we do some work to free pagetables in free_section_usemap(), we need to call flush_tlb_all(), which needs irq enabled. Otherwise the WARN_ON_ONCE() in smp_call_function_many() will be triggered. If we lock the whole sparse_remove_one_section(), then we come to this call trace: ------------[ cut here ]------------ WARNING: at kernel/smp.c:461 smp_call_function_many+0xbd/0x260() Hardware name: PRIMEQUEST 1800E ...... Call Trace: smp_call_function_many+0xbd/0x260 smp_call_function+0x3b/0x50 on_each_cpu+0x3b/0xc0 flush_tlb_all+0x1c/0x20 remove_pagetable+0x14e/0x1d0 vmemmap_free+0x18/0x20 sparse_remove_one_section+0xf7/0x100 __remove_section+0xa2/0xb0 __remove_pages+0xa0/0xd0 arch_remove_memory+0x6b/0xc0 remove_memory+0xb8/0xf0 acpi_memory_device_remove+0x53/0x96 acpi_device_remove+0x90/0xb2 __device_release_driver+0x7c/0xf0 device_release_driver+0x2f/0x50 acpi_bus_remove+0x32/0x6d acpi_bus_trim+0x91/0x102 acpi_bus_hot_remove_device+0x88/0x16b acpi_os_execute_deferred+0x27/0x34 process_one_work+0x20e/0x5c0 worker_thread+0x12e/0x370 kthread+0xee/0x100 ret_from_fork+0x7c/0xb0 ---[ end trace 25e85300f542aa01 ]--- Signed-off-by: Tang Chen Signed-off-by: Lai Jiangshan Signed-off-by: Wen Congyang Acked-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Jiang Liu Cc: Jianguo Wu Cc: Wu Jianguo Cc: Yasuaki Ishimatsu Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm/sparse.c') diff --git a/mm/sparse.c b/mm/sparse.c index 6b5fb762e2ca..66f0fd9d7964 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -796,8 +796,10 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) { struct page *memmap = NULL; - unsigned long *usemap = NULL; + unsigned long *usemap = NULL, flags; + struct pglist_data *pgdat = zone->zone_pgdat; + pgdat_resize_lock(pgdat, &flags); if (ms->section_mem_map) { usemap = ms->pageblock_flags; memmap = sparse_decode_mem_map(ms->section_mem_map, @@ -805,6 +807,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) ms->section_mem_map = 0; ms->pageblock_flags = NULL; } + pgdat_resize_unlock(pgdat, &flags); clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); free_section_usemap(memmap, usemap); -- cgit v1.2.3 From 0197518cd3672029618a16a57597946a094ac7a8 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 22 Feb 2013 16:33:08 -0800 Subject: memory-hotplug: remove memmap of sparse-vmemmap Introduce a new API vmemmap_free() to free and remove vmemmap pagetables. Since pagetable implementations are different, each architecture has to provide its own version of vmemmap_free(), just like vmemmap_populate(). Note: vmemmap_free() is not implemented for ia64, ppc, s390, and sparc. [mhocko@suse.cz: fix implicit declaration of remove_pagetable] Signed-off-by: Yasuaki Ishimatsu Signed-off-by: Jianguo Wu Signed-off-by: Wen Congyang Signed-off-by: Tang Chen Cc: KOSAKI Motohiro Cc: Jiang Liu Cc: Kamezawa Hiroyuki Cc: Lai Jiangshan Cc: Wu Jianguo Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/sparse.c') diff --git a/mm/sparse.c b/mm/sparse.c index 66f0fd9d7964..46f6ea47d9ab 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -615,10 +615,11 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, } static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) { - return; /* XXX: Not implemented yet */ + vmemmap_free(memmap, nr_pages); } static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) { + vmemmap_free(memmap, nr_pages); } #else static struct page *__kmalloc_section_memmap(unsigned long nr_pages) -- cgit v1.2.3 From 8a356ce38e134b3b09b439e88dc770f8f5567648 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Fri, 22 Feb 2013 16:33:21 -0800 Subject: memory-hotplug: consider compound pages when free memmap usemap could also be allocated as compound pages, so we should also consider compound pages when freeing memmap. If we don't fix it, there could be problems when we free vmemmap pagetables which are stored in compound pages. The old pagetables will not be freed properly, and when we add the memory again, no new pagetable will be created. The old pagetable entry is then used, and the kernel will panic. 
The call trace is like the following: BUG: unable to handle kernel paging request at ffffea0040000000 IP: [] sparse_add_one_section+0xef/0x166 PGD 7ff7d4067 PUD 78e035067 PMD 78e11d067 PTE 0 Oops: 0002 [#1] SMP Modules linked in: ip6table_filter ip6_tables ebtable_nat ebtables nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_CHECKSUM iptable_mangle iptable_filter ip_tables bridge stp llc sunrpc binfmt_misc dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun uinput iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm crc32c_intel microcode pcspkr sg lpc_ich mfd_core i2c_i801 i2c_core i7core_edac edac_core ioatdma e1000e igb dca ptp pps_core sd_mod crc_t10dif megaraid_sas mptsas mptscsih mptbase scsi_transport_sas scsi_mod CPU 0 Pid: 4, comm: kworker/0:0 Tainted: G W 3.8.0-rc3-phy-hot-remove+ #3 FUJITSU-SV PRIMEQUEST 1800E/SB RIP: 0010:[] [] sparse_add_one_section+0xef/0x166 RSP: 0018:ffff8807bdcb35d8 EFLAGS: 00010006 RAX: 0000000000000000 RBX: 0000000000000200 RCX: 0000000000200000 RDX: ffff88078df01148 RSI: 0000000000000282 RDI: ffffea0040000000 RBP: ffff8807bdcb3618 R08: 4cf05005b019467a R09: 0cd98fa09631467a R10: 0000000000000000 R11: 0000000000030e20 R12: 0000000000008000 R13: ffffea0040000000 R14: ffff88078df66248 R15: ffff88078ea13b10 FS: 0000000000000000(0000) GS:ffff8807c1a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffffea0040000000 CR3: 0000000001c0c000 CR4: 00000000000007f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kworker/0:0 (pid: 4, threadinfo ffff8807bdcb2000, task ffff8807bde18000) Call Trace: __add_pages+0x85/0x120 arch_add_memory+0x71/0xf0 add_memory+0xd6/0x1f0 acpi_memory_device_add+0x170/0x20c acpi_device_probe+0x50/0x18a really_probe+0x6c/0x320 driver_probe_device+0x47/0xa0 __device_attach+0x53/0x60 bus_for_each_drv+0x6c/0xa0 device_attach+0xa8/0xc0 
bus_probe_device+0xb0/0xe0 device_add+0x301/0x570 device_register+0x1e/0x30 acpi_device_register+0x1d8/0x27c acpi_add_single_object+0x1df/0x2b9 acpi_bus_check_add+0x112/0x18f acpi_ns_walk_namespace+0x105/0x255 acpi_walk_namespace+0xcf/0x118 acpi_bus_scan+0x5b/0x7c acpi_bus_add+0x2a/0x2c container_notify_cb+0x112/0x1a9 acpi_ev_notify_dispatch+0x46/0x61 acpi_os_execute_deferred+0x27/0x34 process_one_work+0x20e/0x5c0 worker_thread+0x12e/0x370 kthread+0xee/0x100 ret_from_fork+0x7c/0xb0 Code: 00 00 48 89 df 48 89 45 c8 e8 3e 71 b1 ff 48 89 c2 48 8b 75 c8 b8 ef ff ff ff f6 02 01 75 4b 49 63 cc 31 c0 4c 89 ef 48 c1 e1 06 aa 48 8b 02 48 83 c8 01 48 85 d2 48 89 02 74 29 a8 01 74 25 RIP [] sparse_add_one_section+0xef/0x166 RSP CR2: ffffea0040000000 ---[ end trace e7f94e3a34c442d4 ]--- Kernel panic - not syncing: Fatal exception Signed-off-by: Wen Congyang Signed-off-by: Tang Chen Cc: Jiang Liu Cc: Jianguo Wu Cc: Kamezawa Hiroyuki Cc: Lai Jiangshan Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/sparse.c') diff --git a/mm/sparse.c b/mm/sparse.c index 46f6ea47d9ab..cff97960f1d7 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -698,7 +698,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) /* * Check to see if allocation came from hot-plug-add */ - if (PageSlab(usemap_page)) { + if (PageSlab(usemap_page) || PageCompound(usemap_page)) { kfree(usemap); if (memmap) __kfree_section_memmap(memmap, PAGES_PER_SECTION); -- cgit v1.2.3 From 293c07e31ab5a0b8df8c19b2a9e5c6fa30308849 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Fri, 22 Feb 2013 16:34:02 -0800 Subject: memory-failure: use num_poisoned_pages instead of mce_bad_pages Since MCE is an x86 concept, and this code is in mm/, it would be better to use the name num_poisoned_pages instead of mce_bad_pages. 
[akpm@linux-foundation.org: fix mm/sparse.c] Signed-off-by: Xishi Qiu Signed-off-by: Jiang Liu Suggested-by: Borislav Petkov Reviewed-by: Wanpeng Li Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/sparse.c') diff --git a/mm/sparse.c b/mm/sparse.c index cff97960f1d7..7ca6dc847947 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -783,7 +783,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) for (i = 0; i < PAGES_PER_SECTION; i++) { if (PageHWPoison(&memmap[i])) { - atomic_long_sub(1, &mce_bad_pages); + atomic_long_sub(1, &num_poisoned_pages); ClearPageHWPoison(&memmap[i]); } } -- cgit v1.2.3