From d43006d503ac921c7df4f94d13c17db6f13c9d26 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Wed, 3 Jul 2013 15:01:50 -0700
Subject: mm: vmscan: have kswapd writeback pages based on dirty pages encountered, not priority

Currently kswapd queues dirty pages for writeback if scanning at an
elevated priority but the priority kswapd scans at is not related to the
number of unqueued dirty encountered.  Since commit "mm: vmscan: Flatten
kswapd priority loop", the priority is related to the size of the LRU and
the zone watermark which is no indication as to whether kswapd should
write pages or not.

This patch tracks if an excessive number of unqueued dirty pages are
being encountered at the end of the LRU.  If so, it indicates that dirty
pages are being recycled before flusher threads can clean them and flags
the zone so that kswapd will start writing pages until the zone is
balanced.

Signed-off-by: Mel Gorman
Acked-by: Johannes Weiner
Reviewed-by: Michal Hocko
Cc: KAMEZAWA Hiroyuki
Cc: Rik van Riel
Cc: Jiri Slaby
Cc: Valdis Kletnieks
Tested-by: Zlatko Calusic
Cc: dormando
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5c76737d836b..2aaf72f7e345 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -495,6 +495,10 @@ typedef enum {
 	ZONE_CONGESTED,			/* zone has many dirty pages backed by
 					 * a congested BDI
 					 */
+	ZONE_TAIL_LRU_DIRTY,		/* reclaim scanning has recently found
+					 * many dirty file pages at the tail
+					 * of the LRU.
+					 */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -517,6 +521,11 @@ static inline int zone_is_reclaim_congested(const struct zone *zone)
 {
 	return test_bit(ZONE_CONGESTED, &zone->flags);
 }
 
+static inline int zone_is_reclaim_dirty(const struct zone *zone)
+{
+	return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
--
cgit v1.2.3
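The vmscan.c side of this change is not included in the mmzone.h-limited diff
above.  As a rough sketch only (the helper names below are hypothetical, not
the literal patch), the new flag is meant to be set when every page taken off
the tail of the inactive list is dirty but not yet queued for writeback, and
to be consulted before kswapd writes a dirty page itself:

/*
 * Illustrative sketch, not the upstream implementation.
 */
static void note_dirty_lru_tail(struct zone *zone, unsigned long nr_taken,
				unsigned long nr_unqueued_dirty)
{
	/* Every page taken off the LRU tail was dirty and unqueued. */
	if (nr_unqueued_dirty == nr_taken)
		zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
}

static bool may_write_dirty_page(struct zone *zone)
{
	/* kswapd only writes pages itself once the zone has been flagged. */
	return current_is_kswapd() && zone_is_reclaim_dirty(zone);
}

The flag replaces the old priority heuristic: writing is tied to what reclaim
actually saw at the LRU tail rather than to how hard kswapd happens to be
scanning.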
From 283aba9f9e0e4882bf09bd37a2983379a6fae805 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Wed, 3 Jul 2013 15:01:51 -0700
Subject: mm: vmscan: block kswapd if it is encountering pages under writeback

Historically, kswapd used to congestion_wait() at higher priorities if it
was not making forward progress.  This made no sense as the failure to
make progress could be completely independent of IO.  It was later
replaced by wait_iff_congested() and removed entirely by commit 258401a6
(mm: don't wait on congested zones in balance_pgdat()) as it was
duplicating logic in shrink_inactive_list().

This is problematic.  If kswapd encounters many pages under writeback and
it continues to scan until it reaches the high watermark then it will
quickly skip over the pages under writeback and reclaim clean young pages
or push applications out to swap.

The use of wait_iff_congested() is not suited to kswapd as it will only
stall if the underlying BDI is really congested or a direct reclaimer was
unable to write to the underlying BDI.  kswapd bypasses the BDI congestion
as it sets PF_SWAPWRITE but even if this was taken into account then it
would cause direct reclaimers to stall on writeback which is not desirable.

This patch sets a ZONE_WRITEBACK flag if direct reclaim or kswapd is
encountering too many pages under writeback.  If this flag is set and
kswapd encounters a PageReclaim page under writeback then it'll assume
that the LRU lists are being recycled too quickly before IO can complete
and block waiting for some IO to complete.

Signed-off-by: Mel Gorman
Reviewed-by: Michal Hocko
Acked-by: Rik van Riel
Cc: Johannes Weiner
Cc: KAMEZAWA Hiroyuki
Cc: Jiri Slaby
Cc: Valdis Kletnieks
Tested-by: Zlatko Calusic
Cc: dormando
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2aaf72f7e345..fce64afba042 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -499,6 +499,9 @@ typedef enum {
 					 * many dirty file pages at the tail
 					 * of the LRU.
 					 */
+	ZONE_WRITEBACK,			/* reclaim scanning has recently found
+					 * many pages under writeback
+					 */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -526,6 +529,11 @@ static inline int zone_is_reclaim_dirty(const struct zone *zone)
 {
 	return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags);
 }
 
+static inline int zone_is_reclaim_writeback(const struct zone *zone)
+{
+	return test_bit(ZONE_WRITEBACK, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
--
cgit v1.2.3
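The shrink_page_list() changes that consume ZONE_WRITEBACK are likewise not
part of this header-only diff.  A minimal sketch of the described behaviour
(the helper name is hypothetical) looks roughly like this:

/*
 * Illustrative sketch, not the upstream implementation.
 */
static void throttle_on_writeback(struct zone *zone, struct page *page)
{
	/*
	 * A PageReclaim page still under writeback in a flagged zone means
	 * the LRU is cycling faster than IO completes, so kswapd blocks on
	 * that IO instead of skipping ahead to reclaim clean young pages.
	 */
	if (current_is_kswapd() && PageReclaim(page) &&
	    zone_is_reclaim_writeback(zone))
		wait_on_page_writeback(page);
}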
From 72c3b51bda557ab38d6c5b2af750c27cba15f828 Mon Sep 17 00:00:00 2001
From: Cody P Schafer
Date: Wed, 3 Jul 2013 15:02:08 -0700
Subject: mm: fix comment referring to non-existent size_seqlock, change to span_seqlock

Signed-off-by: Cody P Schafer
Acked-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fce64afba042..9ec7cffcae21 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -733,7 +733,7 @@ typedef struct pglist_data {
 	 * or node_spanned_pages stay constant.  Holding this will also
 	 * guarantee that any pfn_valid() stays that way.
 	 *
-	 * Nests above zone->lock and zone->size_seqlock.
+	 * Nests above zone->lock and zone->span_seqlock
 	 */
 	spinlock_t node_size_lock;
 #endif
--
cgit v1.2.3

From 114d4b79f7e561fd76fb98eba79e18df7cdd60f0 Mon Sep 17 00:00:00 2001
From: Cody P Schafer
Date: Wed, 3 Jul 2013 15:02:09 -0700
Subject: mmzone: note that node_size_lock should be manipulated via pgdat_resize_lock()

Signed-off-by: Cody P Schafer
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9ec7cffcae21..e511f9429f1e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -733,6 +733,9 @@ typedef struct pglist_data {
 	 * or node_spanned_pages stay constant.  Holding this will also
 	 * guarantee that any pfn_valid() stays that way.
 	 *
+	 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
+	 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG.
+	 *
 	 * Nests above zone->lock and zone->span_seqlock
 	 */
 	spinlock_t node_size_lock;
--
cgit v1.2.3

From c3d5f5f0c2bc4eabeaf49f1a21e1aeb965246cd2 Mon Sep 17 00:00:00 2001
From: Jiang Liu
Date: Wed, 3 Jul 2013 15:03:14 -0700
Subject: mm: use a dedicated lock to protect totalram_pages and zone->managed_pages

Currently lock_memory_hotplug()/unlock_memory_hotplug() are used to
protect totalram_pages and zone->managed_pages.  Other than the memory
hotplug driver, totalram_pages and zone->managed_pages may also be
modified at runtime by other drivers, such as Xen balloon, virtio_balloon
etc.  For those cases, memory hotplug lock is a little too heavy, so
introduce a dedicated lock to protect totalram_pages and
zone->managed_pages.

Now we have a simplified locking rules totalram_pages and
zone->managed_pages as:

1) no locking for read accesses because they are unsigned long.
2) no locking for write accesses at boot time in single-threaded context.
3) serialize write accesses at runtime by acquiring the dedicated
   managed_page_count_lock.

Also adjust zone->managed_pages when freeing reserved pages into the
buddy system, to keep totalram_pages and zone->managed_pages in
consistence.

[akpm@linux-foundation.org: don't export adjust_managed_page_count to modules (for now)]
Signed-off-by: Jiang Liu
Cc: Mel Gorman
Cc: Michel Lespinasse
Cc: Rik van Riel
Cc: Minchan Kim
Cc: "H. Peter Anvin"
Cc: "Michael S. Tsirkin"
Cc:
Cc: Arnd Bergmann
Cc: Catalin Marinas
Cc: Chris Metcalf
Cc: David Howells
Cc: Geert Uytterhoeven
Cc: Ingo Molnar
Cc: Jeremy Fitzhardinge
Cc: Jianguo Wu
Cc: Joonsoo Kim
Cc: Kamezawa Hiroyuki
Cc: Konrad Rzeszutek Wilk
Cc: Marek Szyprowski
Cc: Rusty Russell
Cc: Tang Chen
Cc: Tejun Heo
Cc: Thomas Gleixner
Cc: Wen Congyang
Cc: Will Deacon
Cc: Yasuaki Ishimatsu
Cc: Yinghai Lu
Cc: Russell King
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e511f9429f1e..09d381b71fd8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -474,10 +474,16 @@ struct zone {
 	 * frequently read in proximity to zone->lock.  It's good to
 	 * give them a chance of being in the same cacheline.
 	 *
-	 * Write access to present_pages and managed_pages at runtime should
-	 * be protected by lock_memory_hotplug()/unlock_memory_hotplug().
-	 * Any reader who can't tolerant drift of present_pages and
-	 * managed_pages should hold memory hotplug lock to get a stable value.
+	 * Write access to present_pages at runtime should be protected by
+	 * lock_memory_hotplug()/unlock_memory_hotplug().  Any reader who can't
+	 * tolerant drift of present_pages should hold memory hotplug lock to
+	 * get a stable value.
+	 *
+	 * Read access to managed_pages should be safe because it's unsigned
+	 * long.  Write access to zone->managed_pages and totalram_pages are
+	 * protected by managed_page_count_lock at runtime.  Idealy only
+	 * adjust_managed_page_count() should be used instead of directly
+	 * touching zone->managed_pages and totalram_pages.
 	 */
 	unsigned long spanned_pages;
 	unsigned long present_pages;
--
cgit v1.2.3
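The mm/page_alloc.c side that implements locking rule 3 is not part of this
mmzone.h-only diff.  As a sketch of the intent (an approximation, not
necessarily the exact upstream code), a balloon driver giving pages back to
or taking pages from the buddy allocator at runtime updates both counters
under the dedicated lock:

/*
 * Sketch only: shows the locking rule, not the literal patch.
 */
static DEFINE_SPINLOCK(managed_page_count_lock);

void adjust_managed_page_count(struct page *page, long count)
{
	spin_lock(&managed_page_count_lock);
	page_zone(page)->managed_pages += count;
	totalram_pages += count;
	spin_unlock(&managed_page_count_lock);
}

Readers never take the lock; an unsigned long read is atomic enough that a
momentarily stale value is acceptable.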
From 55878e88c59221c3187e1c24ec3b15eb79c374c0 Mon Sep 17 00:00:00 2001
From: Cody P Schafer
Date: Wed, 3 Jul 2013 15:04:44 -0700
Subject: sparsemem: add BUILD_BUG_ON when sizeof mem_section is non-power-of-2

Instead of leaving a hidden trap for the next person who comes along and
wants to add something to mem_section, add a big fat warning about it
needing to be a power-of-2, and insert a BUILD_BUG_ON() in sparse_init()
to catch mistakes.

Right now non-power-of-2 mem_sections cause a number of WARNs at boot
(which don't clearly point to the size of mem_section as an issue), but
the system limps on (temporarily, at least).

This is based upon Dave Hansen's earlier RFC where he ran into the same
issue: "sparsemem: fix boot when SECTIONS_PER_ROOT is not power-of-2"
http://lkml.indiana.edu/hypermail/linux/kernel/1205.2/03077.html

Signed-off-by: Cody P Schafer
Acked-by: Dave Hansen
Cc: Jiang Liu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 09d381b71fd8..ae19af5ec02c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1137,6 +1137,10 @@ struct mem_section {
 	struct page_cgroup *page_cgroup;
 	unsigned long pad;
 #endif
+	/*
+	 * WARNING: mem_section must be a power-of-2 in size for the
+	 * calculation and use of SECTION_ROOT_MASK to make sense.
+	 */
 };
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
--
cgit v1.2.3

From b21fbccd4b8aba805cbc231998ec7bf83616a79e Mon Sep 17 00:00:00 2001
From: Zhang Yanfei
Date: Mon, 8 Jul 2013 16:00:07 -0700
Subject: mm: remove unused functions is_{normal_idx, normal, dma32, dma}

These functions are nowhere used, so remove them.

Signed-off-by: Zhang Yanfei
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 28 ----------------------------
 1 file changed, 28 deletions(-)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ae19af5ec02c..af4a3b77a8de 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -869,11 +869,6 @@ static inline int is_highmem_idx(enum zone_type idx)
 #endif
 }
 
-static inline int is_normal_idx(enum zone_type idx)
-{
-	return (idx == ZONE_NORMAL);
-}
-
 /**
  * is_highmem - helper function to quickly check if a struct zone is a
  *              highmem zone or not.  This is an attempt to keep references
@@ -892,29 +887,6 @@ static inline int is_highmem(struct zone *zone)
 #endif
 }
 
-static inline int is_normal(struct zone *zone)
-{
-	return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
-}
-
-static inline int is_dma32(struct zone *zone)
-{
-#ifdef CONFIG_ZONE_DMA32
-	return zone == zone->zone_pgdat->node_zones + ZONE_DMA32;
-#else
-	return 0;
-#endif
-}
-
-static inline int is_dma(struct zone *zone)
-{
-#ifdef CONFIG_ZONE_DMA
-	return zone == zone->zone_pgdat->node_zones + ZONE_DMA;
-#else
-	return 0;
-#endif
-}
-
 /* These two functions are used to setup the per zone pages min values */
 struct ctl_table;
 int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
--
cgit v1.2.3
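The sparse_init() check described in the sparsemem BUILD_BUG_ON commit
(55878e88) above lives in mm/sparse.c and is therefore not shown in this
mmzone.h-limited series.  A sketch of the intended compile-time guard:

/*
 * Sketch only: approximates the check described in commit 55878e88.
 */
#include <linux/log2.h>

void __init sparse_init(void)
{
	/* SECTION_ROOT_MASK arithmetic assumes a power-of-2 mem_section. */
	BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));

	/* ... the rest of section map initialization is unchanged ... */
}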