From d43006d503ac921c7df4f94d13c17db6f13c9d26 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Wed, 3 Jul 2013 15:01:50 -0700
Subject: mm: vmscan: have kswapd writeback pages based on dirty pages encountered, not priority

Currently kswapd queues dirty pages for writeback if scanning at an
elevated priority but the priority kswapd scans at is not related to the
number of unqueued dirty encountered.  Since commit "mm: vmscan: Flatten
kswapd priority loop", the priority is related to the size of the LRU and
the zone watermark which is no indication as to whether kswapd should
write pages or not.

This patch tracks if an excessive number of unqueued dirty pages are
being encountered at the end of the LRU.  If so, it indicates that dirty
pages are being recycled before flusher threads can clean them and flags
the zone so that kswapd will start writing pages until the zone is
balanced.

Signed-off-by: Mel Gorman
Acked-by: Johannes Weiner
Reviewed-by: Michal Hocko
Cc: KAMEZAWA Hiroyuki
Cc: Rik van Riel
Cc: Jiri Slaby
Cc: Valdis Kletnieks
Tested-by: Zlatko Calusic
Cc: dormando
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5c76737d836b..2aaf72f7e345 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -495,6 +495,10 @@ typedef enum {
 	ZONE_CONGESTED,			/* zone has many dirty pages backed by
 					 * a congested BDI
 					 */
+	ZONE_TAIL_LRU_DIRTY,		/* reclaim scanning has recently found
+					 * many dirty file pages at the tail
+					 * of the LRU.
+					 */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -517,6 +521,11 @@ static inline int zone_is_reclaim_congested(const struct zone *zone)
 {
 	return test_bit(ZONE_CONGESTED, &zone->flags);
 }
 
+static inline int zone_is_reclaim_dirty(const struct zone *zone)
+{
+	return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
--
cgit v1.2.3
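The vmscan.c side of this change is not included in the mmzone.h-limited diff
above.  As a rough sketch only (the helper names below are hypothetical, not
the literal patch), the new flag is meant to be set when every page taken off
the tail of the inactive list is dirty but not yet queued for writeback, and
to be consulted before kswapd writes a dirty page itself:

/*
 * Illustrative sketch, not the upstream implementation.
 */
static void note_dirty_lru_tail(struct zone *zone, unsigned long nr_taken,
				unsigned long nr_unqueued_dirty)
{
	/* Every page taken off the LRU tail was dirty and unqueued. */
	if (nr_unqueued_dirty == nr_taken)
		zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
}

static bool may_write_dirty_page(struct zone *zone)
{
	/* kswapd only writes pages itself once the zone has been flagged. */
	return current_is_kswapd() && zone_is_reclaim_dirty(zone);
}

The flag replaces the old priority heuristic: writing is tied to what reclaim
actually saw at the LRU tail rather than to how hard kswapd happens to be
scanning.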
From 283aba9f9e0e4882bf09bd37a2983379a6fae805 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Wed, 3 Jul 2013 15:01:51 -0700
Subject: mm: vmscan: block kswapd if it is encountering pages under writeback

Historically, kswapd used to congestion_wait() at higher priorities if it
was not making forward progress.  This made no sense as the failure to
make progress could be completely independent of IO.  It was later
replaced by wait_iff_congested() and removed entirely by commit 258401a6
(mm: don't wait on congested zones in balance_pgdat()) as it was
duplicating logic in shrink_inactive_list().

This is problematic.  If kswapd encounters many pages under writeback and
it continues to scan until it reaches the high watermark then it will
quickly skip over the pages under writeback and reclaim clean young pages
or push applications out to swap.

The use of wait_iff_congested() is not suited to kswapd as it will only
stall if the underlying BDI is really congested or a direct reclaimer was
unable to write to the underlying BDI.  kswapd bypasses the BDI congestion
as it sets PF_SWAPWRITE but even if this was taken into account then it
would cause direct reclaimers to stall on writeback which is not desirable.

This patch sets a ZONE_WRITEBACK flag if direct reclaim or kswapd is
encountering too many pages under writeback.  If this flag is set and
kswapd encounters a PageReclaim page under writeback then it'll assume
that the LRU lists are being recycled too quickly before IO can complete
and block waiting for some IO to complete.

Signed-off-by: Mel Gorman
Reviewed-by: Michal Hocko
Acked-by: Rik van Riel
Cc: Johannes Weiner
Cc: KAMEZAWA Hiroyuki
Cc: Jiri Slaby
Cc: Valdis Kletnieks
Tested-by: Zlatko Calusic
Cc: dormando
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2aaf72f7e345..fce64afba042 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -499,6 +499,9 @@ typedef enum {
 					 * many dirty file pages at the tail
 					 * of the LRU.
 					 */
+	ZONE_WRITEBACK,			/* reclaim scanning has recently found
+					 * many pages under writeback
+					 */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -526,6 +529,11 @@ static inline int zone_is_reclaim_dirty(const struct zone *zone)
 {
 	return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags);
 }
 
+static inline int zone_is_reclaim_writeback(const struct zone *zone)
+{
+	return test_bit(ZONE_WRITEBACK, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
--
cgit v1.2.3
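The shrink_page_list() changes that consume ZONE_WRITEBACK are likewise not
part of this header-only diff.  A minimal sketch of the described behaviour
(the helper name is hypothetical) looks roughly like this:

/*
 * Illustrative sketch, not the upstream implementation.
 */
static void throttle_on_writeback(struct zone *zone, struct page *page)
{
	/*
	 * A PageReclaim page still under writeback in a flagged zone means
	 * the LRU is cycling faster than IO completes, so kswapd blocks on
	 * that IO instead of skipping ahead to reclaim clean young pages.
	 */
	if (current_is_kswapd() && PageReclaim(page) &&
	    zone_is_reclaim_writeback(zone))
		wait_on_page_writeback(page);
}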
From 72c3b51bda557ab38d6c5b2af750c27cba15f828 Mon Sep 17 00:00:00 2001
From: Cody P Schafer
Date: Wed, 3 Jul 2013 15:02:08 -0700
Subject: mm: fix comment referring to non-existent size_seqlock, change to span_seqlock

Signed-off-by: Cody P Schafer
Acked-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fce64afba042..9ec7cffcae21 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -733,7 +733,7 @@ typedef struct pglist_data {
 	 * or node_spanned_pages stay constant.  Holding this will also
 	 * guarantee that any pfn_valid() stays that way.
 	 *
-	 * Nests above zone->lock and zone->size_seqlock.
+	 * Nests above zone->lock and zone->span_seqlock
 	 */
 	spinlock_t node_size_lock;
 #endif
--
cgit v1.2.3

From 114d4b79f7e561fd76fb98eba79e18df7cdd60f0 Mon Sep 17 00:00:00 2001
From: Cody P Schafer
Date: Wed, 3 Jul 2013 15:02:09 -0700
Subject: mmzone: note that node_size_lock should be manipulated via pgdat_resize_lock()

Signed-off-by: Cody P Schafer
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9ec7cffcae21..e511f9429f1e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -733,6 +733,9 @@ typedef struct pglist_data {
 	 * or node_spanned_pages stay constant.  Holding this will also
 	 * guarantee that any pfn_valid() stays that way.
 	 *
+	 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
+	 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG.
+	 *
 	 * Nests above zone->lock and zone->span_seqlock
 	 */
 	spinlock_t node_size_lock;
--
cgit v1.2.3

From c3d5f5f0c2bc4eabeaf49f1a21e1aeb965246cd2 Mon Sep 17 00:00:00 2001
From: Jiang Liu
Date: Wed, 3 Jul 2013 15:03:14 -0700
Subject: mm: use a dedicated lock to protect totalram_pages and zone->managed_pages

Currently lock_memory_hotplug()/unlock_memory_hotplug() are used to
protect totalram_pages and zone->managed_pages.  Other than the memory
hotplug driver, totalram_pages and zone->managed_pages may also be
modified at runtime by other drivers, such as Xen balloon, virtio_balloon
etc.  For those cases, memory hotplug lock is a little too heavy, so
introduce a dedicated lock to protect totalram_pages and
zone->managed_pages.

Now we have a simplified locking rules totalram_pages and
zone->managed_pages as:

1) no locking for read accesses because they are unsigned long.
2) no locking for write accesses at boot time in single-threaded context.
3) serialize write accesses at runtime by acquiring the dedicated
   managed_page_count_lock.

Also adjust zone->managed_pages when freeing reserved pages into the
buddy system, to keep totalram_pages and zone->managed_pages in
consistence.

[akpm@linux-foundation.org: don't export adjust_managed_page_count to modules (for now)]
Signed-off-by: Jiang Liu
Cc: Mel Gorman
Cc: Michel Lespinasse
Cc: Rik van Riel
Cc: Minchan Kim
Cc: "H. Peter Anvin"
Cc: "Michael S. Tsirkin"
Cc:
Cc: Arnd Bergmann
Cc: Catalin Marinas
Cc: Chris Metcalf
Cc: David Howells
Cc: Geert Uytterhoeven
Cc: Ingo Molnar
Cc: Jeremy Fitzhardinge
Cc: Jianguo Wu
Cc: Joonsoo Kim
Cc: Kamezawa Hiroyuki
Cc: Konrad Rzeszutek Wilk
Cc: Marek Szyprowski
Cc: Rusty Russell
Cc: Tang Chen
Cc: Tejun Heo
Cc: Thomas Gleixner
Cc: Wen Congyang
Cc: Will Deacon
Cc: Yasuaki Ishimatsu
Cc: Yinghai Lu
Cc: Russell King
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e511f9429f1e..09d381b71fd8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -474,10 +474,16 @@ struct zone {
 	 * frequently read in proximity to zone->lock.  It's good to
 	 * give them a chance of being in the same cacheline.
 	 *
-	 * Write access to present_pages and managed_pages at runtime should
-	 * be protected by lock_memory_hotplug()/unlock_memory_hotplug().
-	 * Any reader who can't tolerant drift of present_pages and
-	 * managed_pages should hold memory hotplug lock to get a stable value.
+	 * Write access to present_pages at runtime should be protected by
+	 * lock_memory_hotplug()/unlock_memory_hotplug().  Any reader who can't
+	 * tolerant drift of present_pages should hold memory hotplug lock to
+	 * get a stable value.
+	 *
+	 * Read access to managed_pages should be safe because it's unsigned
+	 * long.  Write access to zone->managed_pages and totalram_pages are
+	 * protected by managed_page_count_lock at runtime.  Idealy only
+	 * adjust_managed_page_count() should be used instead of directly
+	 * touching zone->managed_pages and totalram_pages.
 	 */
 	unsigned long spanned_pages;
 	unsigned long present_pages;
--
cgit v1.2.3
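The mm/page_alloc.c side that implements locking rule 3 is not part of this
mmzone.h-only diff.  As a sketch of the intent (an approximation, not
necessarily the exact upstream code), a balloon driver giving pages back to
or taking pages from the buddy allocator at runtime updates both counters
under the dedicated lock:

/*
 * Sketch only: shows the locking rule, not the literal patch.
 */
static DEFINE_SPINLOCK(managed_page_count_lock);

void adjust_managed_page_count(struct page *page, long count)
{
	spin_lock(&managed_page_count_lock);
	page_zone(page)->managed_pages += count;
	totalram_pages += count;
	spin_unlock(&managed_page_count_lock);
}

Readers never take the lock; an unsigned long read is atomic enough that a
momentarily stale value is acceptable.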
From 55878e88c59221c3187e1c24ec3b15eb79c374c0 Mon Sep 17 00:00:00 2001
From: Cody P Schafer
Date: Wed, 3 Jul 2013 15:04:44 -0700
Subject: sparsemem: add BUILD_BUG_ON when sizeof mem_section is non-power-of-2

Instead of leaving a hidden trap for the next person who comes along and
wants to add something to mem_section, add a big fat warning about it
needing to be a power-of-2, and insert a BUILD_BUG_ON() in sparse_init()
to catch mistakes.

Right now non-power-of-2 mem_sections cause a number of WARNs at boot
(which don't clearly point to the size of mem_section as an issue), but
the system limps on (temporarily, at least).

This is based upon Dave Hansen's earlier RFC where he ran into the same
issue: "sparsemem: fix boot when SECTIONS_PER_ROOT is not power-of-2"
http://lkml.indiana.edu/hypermail/linux/kernel/1205.2/03077.html

Signed-off-by: Cody P Schafer
Acked-by: Dave Hansen
Cc: Jiang Liu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 09d381b71fd8..ae19af5ec02c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1137,6 +1137,10 @@ struct mem_section {
 	struct page_cgroup *page_cgroup;
 	unsigned long pad;
 #endif
+	/*
+	 * WARNING: mem_section must be a power-of-2 in size for the
+	 * calculation and use of SECTION_ROOT_MASK to make sense.
+	 */
 };
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
--
cgit v1.2.3

From b21fbccd4b8aba805cbc231998ec7bf83616a79e Mon Sep 17 00:00:00 2001
From: Zhang Yanfei
Date: Mon, 8 Jul 2013 16:00:07 -0700
Subject: mm: remove unused functions is_{normal_idx, normal, dma32, dma}

These functions are nowhere used, so remove them.

Signed-off-by: Zhang Yanfei
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 28 ----------------------------
 1 file changed, 28 deletions(-)

(limited to 'include/linux/mmzone.h')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ae19af5ec02c..af4a3b77a8de 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -869,11 +869,6 @@ static inline int is_highmem_idx(enum zone_type idx)
 #endif
 }
 
-static inline int is_normal_idx(enum zone_type idx)
-{
-	return (idx == ZONE_NORMAL);
-}
-
 /**
  * is_highmem - helper function to quickly check if a struct zone is a
  *              highmem zone or not.  This is an attempt to keep references
@@ -892,29 +887,6 @@ static inline int is_highmem(struct zone *zone)
 #endif
 }
 
-static inline int is_normal(struct zone *zone)
-{
-	return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
-}
-
-static inline int is_dma32(struct zone *zone)
-{
-#ifdef CONFIG_ZONE_DMA32
-	return zone == zone->zone_pgdat->node_zones + ZONE_DMA32;
-#else
-	return 0;
-#endif
-}
-
-static inline int is_dma(struct zone *zone)
-{
-#ifdef CONFIG_ZONE_DMA
-	return zone == zone->zone_pgdat->node_zones + ZONE_DMA;
-#else
-	return 0;
-#endif
-}
-
 /* These two functions are used to setup the per zone pages min values */
 struct ctl_table;
 int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
--
cgit v1.2.3
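The sparse_init() check described in the sparsemem BUILD_BUG_ON commit
(55878e88) above lives in mm/sparse.c and is therefore not shown in this
mmzone.h-limited series.  A sketch of the intended compile-time guard:

/*
 * Sketch only: approximates the check described in commit 55878e88.
 */
#include <linux/log2.h>

void __init sparse_init(void)
{
	/* SECTION_ROOT_MASK arithmetic assumes a power-of-2 mem_section. */
	BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));

	/* ... the rest of section map initialization is unchanged ... */
}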