From 24f7c6b981fb70084757382da464ea85d72af300 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 28 Aug 2013 10:17:56 +1000
Subject: mm: new shrinker API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current shrinker callout API uses an a single shrinker call for
multiple functions.  To determine the function, a special magical value is
passed in a parameter to change the behaviour.  This complicates the
implementation and return value specification for the different
behaviours.

Separate the two different behaviours into separate operations, one to
return a count of freeable objects in the cache, and another to scan a
certain number of objects in the cache for freeing.  In defining these new
operations, ensure the return values and resultant behaviours are clearly
defined and documented.

Modify shrink_slab() to use the new API and implement the callouts for all
the existing shrinkers.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Glauber Costa <glommer@parallels.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: Carlos Maiolino <cmaiolino@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Rientjes <rientjes@google.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: J. Bruce Fields <bfields@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Kent Overstreet <koverstreet@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Thomas Hellstrom <thellstrom@vmware.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/vmscan.c | 60 ++++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 40 insertions(+), 20 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2cff0d491c6d..4d4e859b4b9c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -205,19 +205,24 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
  *
  * Returns the number of slab objects which we shrunk.
  */
-unsigned long shrink_slab(struct shrink_control *shrink,
+unsigned long shrink_slab(struct shrink_control *shrinkctl,
 			  unsigned long nr_pages_scanned,
 			  unsigned long lru_pages)
 {
 	struct shrinker *shrinker;
-	unsigned long ret = 0;
+	unsigned long freed = 0;
 
 	if (nr_pages_scanned == 0)
 		nr_pages_scanned = SWAP_CLUSTER_MAX;
 
 	if (!down_read_trylock(&shrinker_rwsem)) {
-		/* Assume we'll be able to shrink next time */
-		ret = 1;
+		/*
+		 * If we would return 0, our callers would understand that we
+		 * have nothing else to shrink and give up trying. By returning
+		 * 1 we keep it going and assume we'll be able to shrink next
+		 * time.
+		 */
+		freed = 1;
 		goto out;
 	}
 
@@ -225,14 +230,16 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		unsigned long long delta;
 		long total_scan;
 		long max_pass;
-		int shrink_ret = 0;
 		long nr;
 		long new_nr;
 		long batch_size = shrinker->batch ? shrinker->batch
 						  : SHRINK_BATCH;
 
-		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
-		if (max_pass <= 0)
+		if (shrinker->count_objects)
+			max_pass = shrinker->count_objects(shrinker, shrinkctl);
+		else
+			max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
+		if (max_pass == 0)
 			continue;
 
 		/*
@@ -248,8 +255,8 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		do_div(delta, lru_pages + 1);
 		total_scan += delta;
 		if (total_scan < 0) {
-			printk(KERN_ERR "shrink_slab: %pF negative objects to "
-			       "delete nr=%ld\n",
+			printk(KERN_ERR
+			"shrink_slab: %pF negative objects to delete nr=%ld\n",
 			       shrinker->shrink, total_scan);
 			total_scan = max_pass;
 		}
@@ -277,20 +284,33 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		if (total_scan > max_pass * 2)
 			total_scan = max_pass * 2;
 
-		trace_mm_shrink_slab_start(shrinker, shrink, nr,
+		trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
 					nr_pages_scanned, lru_pages,
 					max_pass, delta, total_scan);
 
 		while (total_scan >= batch_size) {
-			int nr_before;
 
-			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
-			shrink_ret = do_shrinker_shrink(shrinker, shrink,
-							batch_size);
-			if (shrink_ret == -1)
-				break;
-			if (shrink_ret < nr_before)
-				ret += nr_before - shrink_ret;
+			if (shrinker->scan_objects) {
+				unsigned long ret;
+				shrinkctl->nr_to_scan = batch_size;
+				ret = shrinker->scan_objects(shrinker, shrinkctl);
+
+				if (ret == SHRINK_STOP)
+					break;
+				freed += ret;
+			} else {
+				int nr_before;
+				long ret;
+
+				nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
+				ret = do_shrinker_shrink(shrinker, shrinkctl,
+								batch_size);
+				if (ret == -1)
+					break;
+				if (ret < nr_before)
+					freed += nr_before - ret;
+			}
+
 			count_vm_events(SLABS_SCANNED, batch_size);
 			total_scan -= batch_size;
 
@@ -308,12 +328,12 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		else
 			new_nr = atomic_long_read(&shrinker->nr_in_batch);
 
-		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
+		trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
 	}
 	up_read(&shrinker_rwsem);
 out:
 	cond_resched();
-	return ret;
+	return freed;
 }
 
 static inline int is_page_cache_freeable(struct page *page)
-- 
cgit v1.2.3


From 0ce3d74450815500e31f16a0b65f6bab687985c3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 28 Aug 2013 10:18:03 +1000
Subject: shrinker: add node awareness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pass the node of the current zone being reclaimed to shrink_slab(),
allowing the shrinker control nodemask to be set appropriately for node
aware shrinkers.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Glauber Costa <glommer@openvz.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: Carlos Maiolino <cmaiolino@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Rientjes <rientjes@google.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: J. Bruce Fields <bfields@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Kent Overstreet <koverstreet@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Thomas Hellstrom <thellstrom@vmware.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/vmscan.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4d4e859b4b9c..fe0d5c458440 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2374,12 +2374,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		 */
 		if (global_reclaim(sc)) {
 			unsigned long lru_pages = 0;
+
+			nodes_clear(shrink->nodes_to_scan);
 			for_each_zone_zonelist(zone, z, zonelist,
 					gfp_zone(sc->gfp_mask)) {
 				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 					continue;
 
 				lru_pages += zone_reclaimable_pages(zone);
+				node_set(zone_to_nid(zone),
+					 shrink->nodes_to_scan);
 			}
 
 			shrink_slab(shrink, sc->nr_scanned, lru_pages);
@@ -2836,6 +2840,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
 		return true;
 
 	shrink_zone(zone, sc);
+	nodes_clear(shrink.nodes_to_scan);
+	node_set(zone_to_nid(zone), shrink.nodes_to_scan);
 
 	reclaim_state->reclaimed_slab = 0;
 	nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
@@ -3544,10 +3550,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 * number of slab pages and shake the slab until it is reduced
 		 * by the same nr_pages that we used for reclaiming unmapped
 		 * pages.
-		 *
-		 * Note that shrink_slab will free memory on all zones and may
-		 * take a long time.
 		 */
+		nodes_clear(shrink.nodes_to_scan);
+		node_set(zone_to_nid(zone), shrink.nodes_to_scan);
 		for (;;) {
 			unsigned long lru_pages = zone_reclaimable_pages(zone);
 
-- 
cgit v1.2.3


From 1d3d4437eae1bb2963faab427f65f90663c64aa1 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@openvz.org>
Date: Wed, 28 Aug 2013 10:18:04 +1000
Subject: vmscan: per-node deferred work
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The list_lru infrastructure already keeps per-node LRU lists in its
node-specific list_lru_node arrays and provide us with a per-node API, and
the shrinkers are properly equiped with node information.  This means that
we can now focus our shrinking effort in a single node, but the work that
is deferred from one run to another is kept global at nr_in_batch.  Work
can be deferred, for instance, during direct reclaim under a GFP_NOFS
allocation, where situation, all the filesystem shrinkers will be
prevented from running and accumulate in nr_in_batch the amount of work
they should have done, but could not.

This creates an impedance problem, where upon node pressure, work deferred
will accumulate and end up being flushed in other nodes.  The problem we
describe is particularly harmful in big machines, where many nodes can
accumulate at the same time, all adding to the global counter nr_in_batch.
 As we accumulate more and more, we start to ask for the caches to flush
even bigger numbers.  The result is that the caches are depleted and do
not stabilize.  To achieve stable steady state behavior, we need to tackle
it differently.

In this patch we keep the deferred count per-node, in the new array
nr_deferred[] (the name is also a bit more descriptive) and will never
accumulate that to other nodes.

Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: Carlos Maiolino <cmaiolino@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Rientjes <rientjes@google.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: J. Bruce Fields <bfields@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Kent Overstreet <koverstreet@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Thomas Hellstrom <thellstrom@vmware.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/vmscan.c | 241 +++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 140 insertions(+), 101 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index fe0d5c458440..799ebceeb4f7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -155,14 +155,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 }
 
 /*
- * Add a shrinker callback to be called from the vm
+ * Add a shrinker callback to be called from the vm.
  */
-void register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker)
 {
-	atomic_long_set(&shrinker->nr_in_batch, 0);
+	size_t size = sizeof(*shrinker->nr_deferred);
+
+	/*
+	 * If we only have one possible node in the system anyway, save
+	 * ourselves the trouble and disable NUMA aware behavior. This way we
+	 * will save memory and some small loop time later.
+	 */
+	if (nr_node_ids == 1)
+		shrinker->flags &= ~SHRINKER_NUMA_AWARE;
+
+	if (shrinker->flags & SHRINKER_NUMA_AWARE)
+		size *= nr_node_ids;
+
+	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+	if (!shrinker->nr_deferred)
+		return -ENOMEM;
+
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 	up_write(&shrinker_rwsem);
+	return 0;
 }
 EXPORT_SYMBOL(register_shrinker);
 
@@ -186,6 +203,118 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
 }
 
 #define SHRINK_BATCH 128
+
+static unsigned long
+shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+		 unsigned long nr_pages_scanned, unsigned long lru_pages)
+{
+	unsigned long freed = 0;
+	unsigned long long delta;
+	long total_scan;
+	long max_pass;
+	long nr;
+	long new_nr;
+	int nid = shrinkctl->nid;
+	long batch_size = shrinker->batch ? shrinker->batch
+					  : SHRINK_BATCH;
+
+	if (shrinker->count_objects)
+		max_pass = shrinker->count_objects(shrinker, shrinkctl);
+	else
+		max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
+	if (max_pass == 0)
+		return 0;
+
+	/*
+	 * copy the current shrinker scan count into a local variable
+	 * and zero it so that other concurrent shrinker invocations
+	 * don't also do this scanning work.
+	 */
+	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+	total_scan = nr;
+	delta = (4 * nr_pages_scanned) / shrinker->seeks;
+	delta *= max_pass;
+	do_div(delta, lru_pages + 1);
+	total_scan += delta;
+	if (total_scan < 0) {
+		printk(KERN_ERR
+		"shrink_slab: %pF negative objects to delete nr=%ld\n",
+		       shrinker->shrink, total_scan);
+		total_scan = max_pass;
+	}
+
+	/*
+	 * We need to avoid excessive windup on filesystem shrinkers
+	 * due to large numbers of GFP_NOFS allocations causing the
+	 * shrinkers to return -1 all the time. This results in a large
+	 * nr being built up so when a shrink that can do some work
+	 * comes along it empties the entire cache due to nr >>>
+	 * max_pass.  This is bad for sustaining a working set in
+	 * memory.
+	 *
+	 * Hence only allow the shrinker to scan the entire cache when
+	 * a large delta change is calculated directly.
+	 */
+	if (delta < max_pass / 4)
+		total_scan = min(total_scan, max_pass / 2);
+
+	/*
+	 * Avoid risking looping forever due to too large nr value:
+	 * never try to free more than twice the estimate number of
+	 * freeable entries.
+	 */
+	if (total_scan > max_pass * 2)
+		total_scan = max_pass * 2;
+
+	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+				nr_pages_scanned, lru_pages,
+				max_pass, delta, total_scan);
+
+	while (total_scan >= batch_size) {
+
+		if (shrinker->scan_objects) {
+			unsigned long ret;
+			shrinkctl->nr_to_scan = batch_size;
+			ret = shrinker->scan_objects(shrinker, shrinkctl);
+
+			if (ret == SHRINK_STOP)
+				break;
+			freed += ret;
+		} else {
+			int nr_before;
+			long ret;
+
+			nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
+			ret = do_shrinker_shrink(shrinker, shrinkctl,
+							batch_size);
+			if (ret == -1)
+				break;
+			if (ret < nr_before)
+				freed += nr_before - ret;
+		}
+
+		count_vm_events(SLABS_SCANNED, batch_size);
+		total_scan -= batch_size;
+
+		cond_resched();
+	}
+
+	/*
+	 * move the unused scan count back into the shrinker in a
+	 * manner that handles concurrent updates. If we exhausted the
+	 * scan, there is no need to do an update.
+	 */
+	if (total_scan > 0)
+		new_nr = atomic_long_add_return(total_scan,
+						&shrinker->nr_deferred[nid]);
+	else
+		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+
+	trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+	return freed;
+}
+
 /*
  * Call the shrink functions to age shrinkable caches
  *
@@ -227,108 +356,18 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
 	}
 
 	list_for_each_entry(shrinker, &shrinker_list, list) {
-		unsigned long long delta;
-		long total_scan;
-		long max_pass;
-		long nr;
-		long new_nr;
-		long batch_size = shrinker->batch ? shrinker->batch
-						  : SHRINK_BATCH;
-
-		if (shrinker->count_objects)
-			max_pass = shrinker->count_objects(shrinker, shrinkctl);
-		else
-			max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
-		if (max_pass == 0)
-			continue;
-
-		/*
-		 * copy the current shrinker scan count into a local variable
-		 * and zero it so that other concurrent shrinker invocations
-		 * don't also do this scanning work.
-		 */
-		nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
-
-		total_scan = nr;
-		delta = (4 * nr_pages_scanned) / shrinker->seeks;
-		delta *= max_pass;
-		do_div(delta, lru_pages + 1);
-		total_scan += delta;
-		if (total_scan < 0) {
-			printk(KERN_ERR
-			"shrink_slab: %pF negative objects to delete nr=%ld\n",
-			       shrinker->shrink, total_scan);
-			total_scan = max_pass;
-		}
-
-		/*
-		 * We need to avoid excessive windup on filesystem shrinkers
-		 * due to large numbers of GFP_NOFS allocations causing the
-		 * shrinkers to return -1 all the time. This results in a large
-		 * nr being built up so when a shrink that can do some work
-		 * comes along it empties the entire cache due to nr >>>
-		 * max_pass.  This is bad for sustaining a working set in
-		 * memory.
-		 *
-		 * Hence only allow the shrinker to scan the entire cache when
-		 * a large delta change is calculated directly.
-		 */
-		if (delta < max_pass / 4)
-			total_scan = min(total_scan, max_pass / 2);
-
-		/*
-		 * Avoid risking looping forever due to too large nr value:
-		 * never try to free more than twice the estimate number of
-		 * freeable entries.
-		 */
-		if (total_scan > max_pass * 2)
-			total_scan = max_pass * 2;
-
-		trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
-					nr_pages_scanned, lru_pages,
-					max_pass, delta, total_scan);
-
-		while (total_scan >= batch_size) {
-
-			if (shrinker->scan_objects) {
-				unsigned long ret;
-				shrinkctl->nr_to_scan = batch_size;
-				ret = shrinker->scan_objects(shrinker, shrinkctl);
+		for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+			if (!node_online(shrinkctl->nid))
+				continue;
 
-				if (ret == SHRINK_STOP)
-					break;
-				freed += ret;
-			} else {
-				int nr_before;
-				long ret;
-
-				nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
-				ret = do_shrinker_shrink(shrinker, shrinkctl,
-								batch_size);
-				if (ret == -1)
-					break;
-				if (ret < nr_before)
-					freed += nr_before - ret;
-			}
+			if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
+			    (shrinkctl->nid != 0))
+				break;
 
-			count_vm_events(SLABS_SCANNED, batch_size);
-			total_scan -= batch_size;
+			freed += shrink_slab_node(shrinkctl, shrinker,
+				 nr_pages_scanned, lru_pages);
 
-			cond_resched();
 		}
-
-		/*
-		 * move the unused scan count back into the shrinker in a
-		 * manner that handles concurrent updates. If we exhausted the
-		 * scan, there is no need to do an update.
-		 */
-		if (total_scan > 0)
-			new_nr = atomic_long_add_return(total_scan,
-					&shrinker->nr_in_batch);
-		else
-			new_nr = atomic_long_read(&shrinker->nr_in_batch);
-
-		trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
 	}
 	up_read(&shrinker_rwsem);
 out:
-- 
cgit v1.2.3


From a0b02131c5fcd8545b867db72224b3659e813f10 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 28 Aug 2013 10:18:16 +1000
Subject: shrinker: Kill old ->shrink API.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are no more users of this API, so kill it dead, dead, dead and
quietly bury the corpse in a shallow, unmarked grave in a dark forest deep
in the hills...

[glommer@openvz.org: added flowers to the grave]
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Glauber Costa <glommer@openvz.org>
Reviewed-by: Greg Thelen <gthelen@google.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: Carlos Maiolino <cmaiolino@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Rientjes <rientjes@google.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: J. Bruce Fields <bfields@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Kent Overstreet <koverstreet@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Thomas Hellstrom <thellstrom@vmware.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/vmscan.c | 41 ++++++++---------------------------------
 1 file changed, 8 insertions(+), 33 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 799ebceeb4f7..e36454220614 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -194,14 +194,6 @@ void unregister_shrinker(struct shrinker *shrinker)
 }
 EXPORT_SYMBOL(unregister_shrinker);
 
-static inline int do_shrinker_shrink(struct shrinker *shrinker,
-				     struct shrink_control *sc,
-				     unsigned long nr_to_scan)
-{
-	sc->nr_to_scan = nr_to_scan;
-	return (*shrinker->shrink)(shrinker, sc);
-}
-
 #define SHRINK_BATCH 128
 
 static unsigned long
@@ -218,10 +210,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
 	long batch_size = shrinker->batch ? shrinker->batch
 					  : SHRINK_BATCH;
 
-	if (shrinker->count_objects)
-		max_pass = shrinker->count_objects(shrinker, shrinkctl);
-	else
-		max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
+	max_pass = shrinker->count_objects(shrinker, shrinkctl);
 	if (max_pass == 0)
 		return 0;
 
@@ -240,7 +229,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
 	if (total_scan < 0) {
 		printk(KERN_ERR
 		"shrink_slab: %pF negative objects to delete nr=%ld\n",
-		       shrinker->shrink, total_scan);
+		       shrinker->scan_objects, total_scan);
 		total_scan = max_pass;
 	}
 
@@ -272,27 +261,13 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
 				max_pass, delta, total_scan);
 
 	while (total_scan >= batch_size) {
+		unsigned long ret;
 
-		if (shrinker->scan_objects) {
-			unsigned long ret;
-			shrinkctl->nr_to_scan = batch_size;
-			ret = shrinker->scan_objects(shrinker, shrinkctl);
-
-			if (ret == SHRINK_STOP)
-				break;
-			freed += ret;
-		} else {
-			int nr_before;
-			long ret;
-
-			nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
-			ret = do_shrinker_shrink(shrinker, shrinkctl,
-							batch_size);
-			if (ret == -1)
-				break;
-			if (ret < nr_before)
-				freed += nr_before - ret;
-		}
+		shrinkctl->nr_to_scan = batch_size;
+		ret = shrinker->scan_objects(shrinker, shrinkctl);
+		if (ret == SHRINK_STOP)
+			break;
+		freed += ret;
 
 		count_vm_events(SLABS_SCANNED, batch_size);
 		total_scan -= batch_size;
-- 
cgit v1.2.3


From 892f795df1eb119b560a3ee5a1ca3f385a852e84 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 11 Sep 2013 14:20:39 -0700
Subject: mm: vmscan: fix numa reclaim balance problem in kswapd

The way the page allocator interacts with kswapd creates aging imbalances,
where the amount of time a userspace page gets in memory under reclaim
pressure is dependent on which zone, which node the allocator took the
page frame from.

#1 fixes missed kswapd wakeups on NUMA systems, which lead to some
   nodes falling behind for a full reclaim cycle relative to the other
   nodes in the system

#3 fixes an interaction where kswapd and a continuous stream of page
   allocations keep the preferred zone of a task between the high and
   low watermark (allocations succeed + kswapd does not go to sleep)
   indefinitely, completely underutilizing the lower zones and
   thrashing on the preferred zone

These patches are the aging fairness part of the thrash-detection based
file LRU balancing.  Andrea recommended to submit them separately as they
are bugfixes in their own right.

The following test ran a foreground workload (memcachetest) with
background IO of various sizes on a 4 node 8G system (similar results were
observed with single-node 4G systems):

parallelio
                                               BAS                    FAIRALLO
                                              BASE                   FAIRALLOC
Ops memcachetest-0M              5170.00 (  0.00%)           5283.00 (  2.19%)
Ops memcachetest-791M            4740.00 (  0.00%)           5293.00 ( 11.67%)
Ops memcachetest-2639M           2551.00 (  0.00%)           4950.00 ( 94.04%)
Ops memcachetest-4487M           2606.00 (  0.00%)           3922.00 ( 50.50%)
Ops io-duration-0M                  0.00 (  0.00%)              0.00 (  0.00%)
Ops io-duration-791M               55.00 (  0.00%)             18.00 ( 67.27%)
Ops io-duration-2639M             235.00 (  0.00%)            103.00 ( 56.17%)
Ops io-duration-4487M             278.00 (  0.00%)            173.00 ( 37.77%)
Ops swaptotal-0M                    0.00 (  0.00%)              0.00 (  0.00%)
Ops swaptotal-791M             245184.00 (  0.00%)              0.00 (  0.00%)
Ops swaptotal-2639M            468069.00 (  0.00%)         108778.00 ( 76.76%)
Ops swaptotal-4487M            452529.00 (  0.00%)          76623.00 ( 83.07%)
Ops swapin-0M                       0.00 (  0.00%)              0.00 (  0.00%)
Ops swapin-791M                108297.00 (  0.00%)              0.00 (  0.00%)
Ops swapin-2639M               169537.00 (  0.00%)          50031.00 ( 70.49%)
Ops swapin-4487M               167435.00 (  0.00%)          34178.00 ( 79.59%)
Ops minorfaults-0M            1518666.00 (  0.00%)        1503993.00 (  0.97%)
Ops minorfaults-791M          1676963.00 (  0.00%)        1520115.00 (  9.35%)
Ops minorfaults-2639M         1606035.00 (  0.00%)        1799717.00 (-12.06%)
Ops minorfaults-4487M         1612118.00 (  0.00%)        1583825.00 (  1.76%)
Ops majorfaults-0M                  6.00 (  0.00%)              0.00 (  0.00%)
Ops majorfaults-791M            13836.00 (  0.00%)             10.00 ( 99.93%)
Ops majorfaults-2639M           22307.00 (  0.00%)           6490.00 ( 70.91%)
Ops majorfaults-4487M           21631.00 (  0.00%)           4380.00 ( 79.75%)

                 BAS    FAIRALLO
                BASE   FAIRALLOC
User          287.78      460.97
System       2151.67     3142.51
Elapsed      9737.00     8879.34

                                   BAS    FAIRALLO
                                  BASE   FAIRALLOC
Minor Faults                  53721925    57188551
Major Faults                    392195       15157
Swap Ins                       2994854      112770
Swap Outs                      4907092      134982
Direct pages scanned                 0       41824
Kswapd pages scanned          32975063     8128269
Kswapd pages reclaimed         6323069     7093495
Direct pages reclaimed               0       41824
Kswapd efficiency                  19%         87%
Kswapd velocity               3386.573     915.414
Direct efficiency                 100%        100%
Direct velocity                  0.000       4.710
Percentage direct scans             0%          0%
Zone normal velocity          2011.338     550.661
Zone dma32 velocity           1365.623     369.221
Zone dma velocity                9.612       0.242
Page writes by reclaim    18732404.000  614807.000
Page writes file              13825312      479825
Page writes anon               4907092      134982
Page reclaim immediate           85490        5647
Sector Reads                  12080532      483244
Sector Writes                 88740508    65438876
Page rescued immediate               0           0
Slabs scanned                    82560       12160
Direct inode steals                  0           0
Kswapd inode steals              24401       40013
Kswapd skipped wait                  0           0
THP fault alloc                      6           8
THP collapse alloc                5481        5812
THP splits                          75          22
THP fault fallback                   0           0
THP collapse fail                    0           0
Compaction stalls                    0          54
Compaction success                   0          45
Compaction failures                  0           9
Page migrate success            881492       82278
Page migrate failure                 0           0
Compaction pages isolated            0       60334
Compaction migrate scanned           0       53505
Compaction free scanned              0     1537605
Compaction cost                    914          86
NUMA PTE updates              46738231    41988419
NUMA hint faults              31175564    24213387
NUMA hint local faults        10427393     6411593
NUMA pages migrated             881492       55344
AutoNUMA cost                   156221      121361

The overall runtime was reduced, throughput for both the foreground
workload as well as the background IO improved, major faults, swapping and
reclaim activity shrunk significantly, reclaim efficiency more than
quadrupled.

This patch:

When the page allocator fails to get a page from all zones in its given
zonelist, it wakes up the per-node kswapds for all zones that are at their
low watermark.

However, with a system under load the free pages in a zone can fluctuate
enough that the allocation fails but the kswapd wakeup is also skipped
while the zone is still really close to the low watermark.

When one node misses a wakeup like this, it won't be aged before all the
other node's zones are down to their low watermarks again.  And skipping a
full aging cycle is an obvious fairness problem.

Kswapd runs until the high watermarks are restored, so it should also be
woken when the high watermarks are not met.  This ages nodes more equally
and creates a safety margin for the page counter fluctuation.

By using zone_balanced(), it will now check, in addition to the watermark,
if compaction requires more order-0 pages to create a higher order page.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Paul Bolle <paul.bollee@gmail.com>
Tested-by: Zlatko Calusic <zcalusic@bitsync.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2cff0d491c6d..758540d3ca83 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3237,7 +3237,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 	}
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
-	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+	if (zone_balanced(zone, order, 0, 0))
 		return;
 
 	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
-- 
cgit v1.2.3


From 0ec3b74c7f5599c8a4d2b33d430a5470af26ebf6 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 11 Sep 2013 14:22:26 -0700
Subject: mm: putback_lru_page: remove unnecessary call to page_lru_base_type()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The goal of this patch series is to improve performance of munlock() of
large mlocked memory areas on systems without THP.  This is motivated by
reported very long times of crash recovery of processes with such areas,
where munlock() can take several seconds.  See
http://lwn.net/Articles/548108/

The work was driven by a simple benchmark (to be included in mmtests) that
mmaps() e.g.  56GB with MAP_LOCKED | MAP_POPULATE and measures the time of
munlock().  Profiling was performed by attaching operf --pid to the
process and sending a signal to trigger the munlock() part and then notify
bach the monitoring wrapper to stop operf, so that only munlock() appears
in the profile.

The profiles have shown that CPU time is spent mostly by atomic operations
and repeated locking per single pages. This series aims to reduce both, starting
from simpler to more complex changes.

Patch 1 performs a simple cleanup in putback_lru_page() so that page lru base
	type is not determined without being actually needed.

Patch 2 removes an unnecessary call to lru_add_drain() which drains the per-cpu
	pagevec after each munlocked page is put there.

Patch 3 changes munlock_vma_range() to use an on-stack pagevec for isolating
	multiple non-THP pages under a single lru_lock instead of locking and
	processing each page separately.

Patch 4 changes the NR_MLOCK accounting to be called only once per the pvec
	introduced by previous patch.

Patch 5 uses the introduced pagevec to batch also the work of putback_lru_page
	when possible, bypassing the per-cpu pvec and associated overhead.

Patch 6 removes a redundant get_page/put_page pair which saves costly atomic
	operations.

Patch 7 avoids calling follow_page_mask() on each individual page, and obtains
	multiple page references under a single page table lock where possible.

Measurements were made using 3.11-rc3 as a baseline.  The first set of
measurements shows the possibly ideal conditions where batching should
help the most.  All memory is allocated from a single NUMA node and THP is
disabled.

timedmunlock
                            3.11-rc3              3.11-rc3              3.11-rc3              3.11-rc3              3.11-rc3              3.11-rc3              3.11-rc3              3.11-rc3
                                   0                     1                     2                     3                     4                     5                     6                     7
Elapsed min           3.38 (  0.00%)        3.39 ( -0.13%)        3.00 ( 11.33%)        2.70 ( 20.20%)        2.67 ( 21.11%)        2.37 ( 29.88%)        2.20 ( 34.91%)        1.91 ( 43.59%)
Elapsed mean          3.39 (  0.00%)        3.40 ( -0.23%)        3.01 ( 11.33%)        2.70 ( 20.26%)        2.67 ( 21.21%)        2.38 ( 29.88%)        2.21 ( 34.93%)        1.92 ( 43.46%)
Elapsed stddev        0.01 (  0.00%)        0.01 (-43.09%)        0.01 ( 15.42%)        0.01 ( 23.42%)        0.00 ( 89.78%)        0.01 ( -7.15%)        0.00 ( 76.69%)        0.02 (-91.77%)
Elapsed max           3.41 (  0.00%)        3.43 ( -0.52%)        3.03 ( 11.29%)        2.72 ( 20.16%)        2.67 ( 21.63%)        2.40 ( 29.50%)        2.21 ( 35.21%)        1.96 ( 42.39%)
Elapsed range         0.03 (  0.00%)        0.04 (-51.16%)        0.02 (  6.27%)        0.02 ( 14.67%)        0.00 ( 88.90%)        0.03 (-19.18%)        0.01 ( 73.70%)        0.06 (-113.35%

The second set of measurements simulates the worst possible conditions for
batching by using numactl --interleave, so that there is in fact only one
page per pagevec.  Even in this case the series seems to improve
performance thanks to reduced atomic operations and removal of
lru_add_drain().

timedmunlock
                            3.11-rc3              3.11-rc3              3.11-rc3              3.11-rc3              3.11-rc3              3.11-rc3              3.11-rc3              3.11-rc3
                                   0                     1                     2                     3                     4                     5                     6                     7
Elapsed min           4.00 (  0.00%)        4.04 ( -0.93%)        3.87 (  3.37%)        3.72 (  6.94%)        3.81 (  4.72%)        3.69 (  7.82%)        3.64 (  8.92%)        3.41 ( 14.81%)
Elapsed mean          4.17 (  0.00%)        4.15 (  0.51%)        4.03 (  3.49%)        3.89 (  6.84%)        3.86 (  7.48%)        3.89 (  6.69%)        3.70 ( 11.27%)        3.48 ( 16.59%)
Elapsed stddev        0.16 (  0.00%)        0.08 ( 50.76%)        0.10 ( 41.58%)        0.16 (  4.59%)        0.05 ( 72.38%)        0.19 (-12.91%)        0.05 ( 68.09%)        0.06 ( 66.03%)
Elapsed max           4.34 (  0.00%)        4.32 (  0.56%)        4.19 (  3.62%)        4.12 (  5.15%)        3.91 (  9.88%)        4.12 (  5.25%)        3.80 ( 12.58%)        3.56 ( 18.08%)
Elapsed range         0.34 (  0.00%)        0.28 ( 17.91%)        0.32 (  6.45%)        0.40 (-15.73%)        0.10 ( 70.06%)        0.43 (-24.84%)        0.15 ( 55.32%)        0.15 ( 56.16%)

For completeness, a third set of measurements shows the situation where
THP is enabled and allocations are again done on a single NUMA node.  Here
munlock() is already very fast thanks to huge pages, and this series does
not compromise that performance.  It seems that the removal of call to
lru_add_drain() still helps a bit.

timedmunlock
                            3.11-rc3              3.11-rc3              3.11-rc3              3.11-rc3              3.11-rc3              3.11-rc3              3.11-rc3              3.11-rc3
                                   0                     1                     2                     3                     4                     5                     6                     7
Elapsed min           0.01 (  0.00%)        0.01 ( -0.11%)        0.01 (  6.59%)        0.01 (  5.41%)        0.01 (  5.45%)        0.01 (  5.03%)        0.01 (  6.08%)        0.01 (  5.20%)
Elapsed mean          0.01 (  0.00%)        0.01 ( -0.27%)        0.01 (  6.39%)        0.01 (  5.30%)        0.01 (  5.32%)        0.01 (  5.03%)        0.01 (  5.97%)        0.01 (  5.22%)
Elapsed stddev        0.00 (  0.00%)        0.00 ( -9.59%)        0.00 ( 10.77%)        0.00 (  3.24%)        0.00 ( 24.42%)        0.00 ( 31.86%)        0.00 ( -7.46%)        0.00 (  6.11%)
Elapsed max           0.01 (  0.00%)        0.01 ( -0.01%)        0.01 (  6.83%)        0.01 (  5.42%)        0.01 (  5.79%)        0.01 (  5.53%)        0.01 (  6.08%)        0.01 (  5.26%)
Elapsed range         0.00 (  0.00%)        0.00 (  7.30%)        0.00 ( 24.38%)        0.00 (  6.10%)        0.00 ( 30.79%)        0.00 ( 42.52%)        0.00 (  6.11%)        0.00 ( 10.07%)

This patch (of 7):

In putback_lru_page() since commit c53954a092 (""mm: remove lru parameter
from __lru_cache_add and lru_cache_add_lru") it is no longer needed to
determine lru list via page_lru_base_type().

This patch replaces it with simple flag is_unevictable which says that the
page was put on the inevictable list.  This is the only information that
matters in subsequent tests.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Jörn Engel <joern@logfs.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Michel Lespinasse <walken@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 758540d3ca83..44c072a7cba2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -545,7 +545,7 @@ int remove_mapping(struct address_space *mapping, struct page *page)
  */
 void putback_lru_page(struct page *page)
 {
-	int lru;
+	bool is_unevictable;
 	int was_unevictable = PageUnevictable(page);
 
 	VM_BUG_ON(PageLRU(page));
@@ -560,14 +560,14 @@ redo:
 		 * unevictable page on [in]active list.
 		 * We know how to handle that.
 		 */
-		lru = page_lru_base_type(page);
+		is_unevictable = false;
 		lru_cache_add(page);
 	} else {
 		/*
 		 * Put unevictable pages directly on zone's unevictable
 		 * list.
 		 */
-		lru = LRU_UNEVICTABLE;
+		is_unevictable = true;
 		add_page_to_unevictable_list(page);
 		/*
 		 * When racing with an mlock or AS_UNEVICTABLE clearing
@@ -587,7 +587,7 @@ redo:
 	 * page is on unevictable list, it never be freed. To avoid that,
 	 * check after we added it to the list, again.
 	 */
-	if (lru == LRU_UNEVICTABLE && page_evictable(page)) {
+	if (is_unevictable && page_evictable(page)) {
 		if (!isolate_lru_page(page)) {
 			put_page(page);
 			goto redo;
@@ -598,9 +598,9 @@ redo:
 		 */
 	}
 
-	if (was_unevictable && lru != LRU_UNEVICTABLE)
+	if (was_unevictable && !is_unevictable)
 		count_vm_event(UNEVICTABLE_PGRESCUED);
-	else if (!was_unevictable && lru == LRU_UNEVICTABLE)
+	else if (!was_unevictable && is_unevictable)
 		count_vm_event(UNEVICTABLE_PGCULLED);
 
 	put_page(page);		/* drop ref from isolate */
-- 
cgit v1.2.3


From 6e543d5780e36ff5ee56c44d7e2e30db3457a7ed Mon Sep 17 00:00:00 2001
From: Lisa Du <cldu@marvell.com>
Date: Wed, 11 Sep 2013 14:22:36 -0700
Subject: mm: vmscan: fix do_try_to_free_pages() livelock

This patch is based on KOSAKI's work and I add a little more description,
please refer https://lkml.org/lkml/2012/6/14/74.

Currently, I found system can enter a state that there are lots of free
pages in a zone but only order-0 and order-1 pages which means the zone is
heavily fragmented, then high order allocation could make direct reclaim
path's long stall(ex, 60 seconds) especially in no swap and no compaciton
enviroment.  This problem happened on v3.4, but it seems issue still lives
in current tree, the reason is do_try_to_free_pages enter live lock:

kswapd will go to sleep if the zones have been fully scanned and are still
not balanced.  As kswapd thinks there's little point trying all over again
to avoid infinite loop.  Instead it changes order from high-order to
0-order because kswapd think order-0 is the most important.  Look at
73ce02e9 in detail.  If watermarks are ok, kswapd will go back to sleep
and may leave zone->all_unreclaimable =3D 0.  It assume high-order users
can still perform direct reclaim if they wish.

Direct reclaim continue to reclaim for a high order which is not a
COSTLY_ORDER without oom-killer until kswapd turn on
zone->all_unreclaimble= .  This is because to avoid too early oom-kill.
So it means direct_reclaim depends on kswapd to break this loop.

In worst case, direct-reclaim may continue to page reclaim forever when
kswapd sleeps forever until someone like watchdog detect and finally kill
the process.  As described in:
http://thread.gmane.org/gmane.linux.kernel.mm/103737

We can't turn on zone->all_unreclaimable from direct reclaim path because
direct reclaim path don't take any lock and this way is racy.  Thus this
patch removes zone->all_unreclaimable field completely and recalculates
zone reclaimable state every time.

Note: we can't take the idea that direct-reclaim see zone->pages_scanned
directly and kswapd continue to use zone->all_unreclaimable.  Because, it
is racy.  commit 929bea7c71 (vmscan: all_unreclaimable() use
zone->all_unreclaimable as a name) describes the detail.

[akpm@linux-foundation.org: uninline zone_reclaimable_pages() and zone_reclaimable()]
Cc: Aaditya Kumar <aaditya.kumar.30@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Nick Piggin <npiggin@gmail.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Bob Liu <lliubbo@gmail.com>
Cc: Neil Zhang <zhangwm@marvell.com>
Cc: Russell King - ARM Linux <linux@arm.linux.org.uk>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Lisa Du <cldu@marvell.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 66 +++++++++++++++++++++++++++++--------------------------------
 1 file changed, 31 insertions(+), 35 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 44c072a7cba2..fe715daeb8bc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -146,6 +146,25 @@ static bool global_reclaim(struct scan_control *sc)
 }
 #endif
 
+unsigned long zone_reclaimable_pages(struct zone *zone)
+{
+	int nr;
+
+	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
+	     zone_page_state(zone, NR_INACTIVE_FILE);
+
+	if (get_nr_swap_pages() > 0)
+		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
+		      zone_page_state(zone, NR_INACTIVE_ANON);
+
+	return nr;
+}
+
+bool zone_reclaimable(struct zone *zone)
+{
+	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+}
+
 static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
 	if (!mem_cgroup_disabled())
@@ -1789,7 +1808,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	 * latencies, so it's better to scan a minimum amount there as
 	 * well.
 	 */
-	if (current_is_kswapd() && zone->all_unreclaimable)
+	if (current_is_kswapd() && !zone_reclaimable(zone))
 		force_scan = true;
 	if (!global_reclaim(sc))
 		force_scan = true;
@@ -2244,8 +2263,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 		if (global_reclaim(sc)) {
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
-			if (zone->all_unreclaimable &&
-					sc->priority != DEF_PRIORITY)
+			if (sc->priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
 				continue;	/* Let kswapd poll it */
 			if (IS_ENABLED(CONFIG_COMPACTION)) {
 				/*
@@ -2283,11 +2302,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 	return aborted_reclaim;
 }
 
-static bool zone_reclaimable(struct zone *zone)
-{
-	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
-}
-
 /* All zones in zonelist are unreclaimable? */
 static bool all_unreclaimable(struct zonelist *zonelist,
 		struct scan_control *sc)
@@ -2301,7 +2315,7 @@ static bool all_unreclaimable(struct zonelist *zonelist,
 			continue;
 		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 			continue;
-		if (!zone->all_unreclaimable)
+		if (zone_reclaimable(zone))
 			return false;
 	}
 
@@ -2712,7 +2726,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 		 * DEF_PRIORITY. Effectively, it considers them balanced so
 		 * they must be considered balanced here as well!
 		 */
-		if (zone->all_unreclaimable) {
+		if (!zone_reclaimable(zone)) {
 			balanced_pages += zone->managed_pages;
 			continue;
 		}
@@ -2773,7 +2787,6 @@ static bool kswapd_shrink_zone(struct zone *zone,
 			       unsigned long lru_pages,
 			       unsigned long *nr_attempted)
 {
-	unsigned long nr_slab;
 	int testorder = sc->order;
 	unsigned long balance_gap;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
@@ -2818,15 +2831,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	shrink_zone(zone, sc);
 
 	reclaim_state->reclaimed_slab = 0;
-	nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
+	shrink_slab(&shrink, sc->nr_scanned, lru_pages);
 	sc->nr_reclaimed += reclaim_state->reclaimed_slab;
 
 	/* Account for the number of pages attempted to reclaim */
 	*nr_attempted += sc->nr_to_reclaim;
 
-	if (nr_slab == 0 && !zone_reclaimable(zone))
-		zone->all_unreclaimable = 1;
-
 	zone_clear_flag(zone, ZONE_WRITEBACK);
 
 	/*
@@ -2835,7 +2845,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	 * BDIs but as pressure is relieved, speculatively avoid congestion
 	 * waits.
 	 */
-	if (!zone->all_unreclaimable &&
+	if (zone_reclaimable(zone) &&
 	    zone_balanced(zone, testorder, 0, classzone_idx)) {
 		zone_clear_flag(zone, ZONE_CONGESTED);
 		zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
@@ -2901,8 +2911,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable &&
-			    sc.priority != DEF_PRIORITY)
+			if (sc.priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
 				continue;
 
 			/*
@@ -2980,8 +2990,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable &&
-			    sc.priority != DEF_PRIORITY)
+			if (sc.priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
 				continue;
 
 			sc.nr_scanned = 0;
@@ -3265,20 +3275,6 @@ unsigned long global_reclaimable_pages(void)
 	return nr;
 }
 
-unsigned long zone_reclaimable_pages(struct zone *zone)
-{
-	int nr;
-
-	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
-	     zone_page_state(zone, NR_INACTIVE_FILE);
-
-	if (get_nr_swap_pages() > 0)
-		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
-		      zone_page_state(zone, NR_INACTIVE_ANON);
-
-	return nr;
-}
-
 #ifdef CONFIG_HIBERNATION
 /*
  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -3576,7 +3572,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
 		return ZONE_RECLAIM_FULL;
 
-	if (zone->all_unreclaimable)
+	if (!zone_reclaimable(zone))
 		return ZONE_RECLAIM_FULL;
 
 	/*
-- 
cgit v1.2.3


From 3b38722efd9f66da63bbbd41520c2e6fa9db3d68 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Thu, 12 Sep 2013 15:13:21 -0700
Subject: memcg, vmscan: integrate soft reclaim tighter with zone shrinking
 code

This patchset is sitting out of tree for quite some time without any
objections.  I would be really happy if it made it into 3.12.  I do not
want to push it too hard but I think this work is basically ready and
waiting more doesn't help.

The basic idea is quite simple.  Pull soft reclaim into shrink_zone in the
first step and get rid of the previous soft reclaim infrastructure.
shrink_zone is done in two passes now.  First it tries to do the soft
limit reclaim and it falls back to reclaim-all mode if no group is over
the limit or no pages have been scanned.  The second pass happens at the
same priority so the only time we waste is the memcg tree walk which has
been updated in the third step to have only negligible overhead.

As a bonus we will get rid of a _lot_ of code by this and soft reclaim
will not stand out like before when it wasn't integrated into the zone
shrinking code and it reclaimed at priority 0 (the testing results show
that some workloads suffers from such an aggressive reclaim).  The clean
up is in a separate patch because I felt it would be easier to review that
way.

The second step is soft limit reclaim integration into targeted reclaim.
It should be rather straight forward.  Soft limit has been used only for
the global reclaim so far but it makes sense for any kind of pressure
coming from up-the-hierarchy, including targeted reclaim.

The third step (patches 4-8) addresses the tree walk overhead by enhancing
memcg iterators to enable skipping whole subtrees and tracking number of
over soft limit children at each level of the hierarchy.  This information
is updated same way the old soft limit tree was updated (from
memcg_check_events) so we shouldn't see an additional overhead.  In fact
mem_cgroup_update_soft_limit is much simpler than tree manipulation done
previously.

__shrink_zone uses mem_cgroup_soft_reclaim_eligible as a predicate for
mem_cgroup_iter so the decision whether a particular group should be
visited is done at the iterator level which allows us to decide to skip
the whole subtree as well (if there is no child in excess).  This reduces
the tree walk overhead considerably.

* TEST 1
========

My primary test case was a parallel kernel build with 2 groups (make is
running with -j8 with a distribution .config in a separate cgroup without
any hard limit) on a 32 CPU machine booted with 1GB memory and both builds
run taskset to Node 0 cpus.

I was mostly interested in 2 setups.  Default - no soft limit set and -
and 0 soft limit set to both groups.  The first one should tell us whether
the rework regresses the default behavior while the second one should show
us improvements in an extreme case where both workloads are always over
the soft limit.

/usr/bin/time -v has been used to collect the statistics and each
configuration had 3 runs after fresh boot without any other load on the
system.

base is mmotm-2013-07-18-16-40
rework all 8 patches applied on top of base

* No-limit
User
no-limit/base: min: 651.92 max: 672.65 avg: 664.33 std: 8.01 runs: 6
no-limit/rework: min: 657.34 [100.8%] max: 668.39 [99.4%] avg: 663.13 [99.8%] std: 3.61 runs: 6
System
no-limit/base: min: 69.33 max: 71.39 avg: 70.32 std: 0.79 runs: 6
no-limit/rework: min: 69.12 [99.7%] max: 71.05 [99.5%] avg: 70.04 [99.6%] std: 0.59 runs: 6
Elapsed
no-limit/base: min: 398.27 max: 422.36 avg: 408.85 std: 7.74 runs: 6
no-limit/rework: min: 386.36 [97.0%] max: 438.40 [103.8%] avg: 416.34 [101.8%] std: 18.85 runs: 6

The results are within noise. Elapsed time has a bigger variance but the
average looks good.

* 0-limit
User
0-limit/base: min: 573.76 max: 605.63 avg: 585.73 std: 12.21 runs: 6
0-limit/rework: min: 645.77 [112.6%] max: 666.25 [110.0%] avg: 656.97 [112.2%] std: 7.77 runs: 6
System
0-limit/base: min: 69.57 max: 71.13 avg: 70.29 std: 0.54 runs: 6
0-limit/rework: min: 68.68 [98.7%] max: 71.40 [100.4%] avg: 69.91 [99.5%] std: 0.87 runs: 6
Elapsed
0-limit/base: min: 1306.14 max: 1550.17 avg: 1430.35 std: 90.86 runs: 6
0-limit/rework: min: 404.06 [30.9%] max: 465.94 [30.1%] avg: 434.81 [30.4%] std: 22.68 runs: 6

The improvement is really huge here (even bigger than with my previous
testing and I suspect that this highly depends on the storage).  Page
fault statistics tell us at least part of the story:

Minor
0-limit/base: min: 37180461.00 max: 37319986.00 avg: 37247470.00 std: 54772.71 runs: 6
0-limit/rework: min: 36751685.00 [98.8%] max: 36805379.00 [98.6%] avg: 36774506.33 [98.7%] std: 17109.03 runs: 6
Major
0-limit/base: min: 170604.00 max: 221141.00 avg: 196081.83 std: 18217.01 runs: 6
0-limit/rework: min: 2864.00 [1.7%] max: 10029.00 [4.5%] avg: 5627.33 [2.9%] std: 2252.71 runs: 6

Same as with my previous testing Minor faults are more or less within
noise but Major fault count is way bellow the base kernel.

While this looks as a nice win it is fair to say that 0-limit
configuration is quite artificial. So I was playing with 0-no-limit
loads as well.

* TEST 2
========

The following results are from 2 groups configuration on a 16GB machine
(single NUMA node).

- A running stream IO (dd if=/dev/zero of=local.file bs=1024) with
  2*TotalMem with 0 soft limit.
- B running a mem_eater which consumes TotalMem-1G without any limit. The
  mem_eater consumes the memory in 100 chunks with 1s nap after each
  mmap+poppulate so that both loads have chance to fight for the memory.

The expected result is that B shouldn't be reclaimed and A shouldn't see
a big dropdown in elapsed time.

User
base: min: 2.68 max: 2.89 avg: 2.76 std: 0.09 runs: 3
rework: min: 3.27 [122.0%] max: 3.74 [129.4%] avg: 3.44 [124.6%] std: 0.21 runs: 3
System
base: min: 86.26 max: 88.29 avg: 87.28 std: 0.83 runs: 3
rework: min: 81.05 [94.0%] max: 84.96 [96.2%] avg: 83.14 [95.3%] std: 1.61 runs: 3
Elapsed
base: min: 317.28 max: 332.39 avg: 325.84 std: 6.33 runs: 3
rework: min: 281.53 [88.7%] max: 298.16 [89.7%] avg: 290.99 [89.3%] std: 6.98 runs: 3

System time improved slightly as well as Elapsed. My previous testing
has shown worse numbers but this again seem to depend on the storage
speed.

My theory is that the writeback doesn't catch up and prio-0 soft reclaim
falls into wait on writeback page too often in the base kernel. The
patched kernel doesn't do that because the soft reclaim is done from the
kswapd/direct reclaim context. This can be seen on the following graph
nicely. The A's group usage_in_bytes regurarly drops really low very often.

All 3 runs
http://labs.suse.cz/mhocko/soft_limit_rework/stream_io-vs-mem_eater/stream.png
resp. a detail of the single run
http://labs.suse.cz/mhocko/soft_limit_rework/stream_io-vs-mem_eater/stream-one-run.png

mem_eater seems to be doing better as well. It gets to the full
allocation size faster as can be seen on the following graph:
http://labs.suse.cz/mhocko/soft_limit_rework/stream_io-vs-mem_eater/mem_eater-one-run.png

/proc/meminfo collected during the test also shows that rework kernel
hasn't swapped that much (well almost not at all):
base: max: 123900 K avg: 56388.29 K
rework: max: 300 K avg: 128.68 K

kswapd and direct reclaim statistics are of no use unfortunatelly because
soft reclaim is not accounted properly as the counters are hidden by
global_reclaim() checks in the base kernel.

* TEST 3
========

Another test was the same configuration as TEST2 except the stream IO was
replaced by a single kbuild (16 parallel jobs bound to Node0 cpus same as
in TEST1) and mem_eater allocated TotalMem-200M so kbuild had only 200MB
left.

Kbuild did better with the rework kernel here as well:
User
base: min: 860.28 max: 872.86 avg: 868.03 std: 5.54 runs: 3
rework: min: 880.81 [102.4%] max: 887.45 [101.7%] avg: 883.56 [101.8%] std: 2.83 runs: 3
System
base: min: 84.35 max: 85.06 avg: 84.79 std: 0.31 runs: 3
rework: min: 85.62 [101.5%] max: 86.09 [101.2%] avg: 85.79 [101.2%] std: 0.21 runs: 3
Elapsed
base: min: 135.36 max: 243.30 avg: 182.47 std: 45.12 runs: 3
rework: min: 110.46 [81.6%] max: 116.20 [47.8%] avg: 114.15 [62.6%] std: 2.61 runs: 3
Minor
base: min: 36635476.00 max: 36673365.00 avg: 36654812.00 std: 15478.03 runs: 3
rework: min: 36639301.00 [100.0%] max: 36695541.00 [100.1%] avg: 36665511.00 [100.0%] std: 23118.23 runs: 3
Major
base: min: 14708.00 max: 53328.00 avg: 31379.00 std: 16202.24 runs: 3
rework: min: 302.00 [2.1%] max: 414.00 [0.8%] avg: 366.33 [1.2%] std: 47.22 runs: 3

Again we can see a significant improvement in Elapsed (it also seems to
be more stable), there is a huge dropdown for the Major page faults and
much more swapping:
base: max: 583736 K avg: 112547.43 K
rework: max: 4012 K avg: 124.36 K

Graphs from all three runs show the variability of the kbuild quite
nicely.  It even seems that it took longer after every run with the base
kernel which would be quite surprising as the source tree for the build is
removed and caches are dropped after each run so the build operates on a
freshly extracted sources everytime.
http://labs.suse.cz/mhocko/soft_limit_rework/stream_io-vs-mem_eater/kbuild-mem_eater.png

My other testing shows that this is just a matter of timing and other runs
behave differently the std for Elapsed time is similar ~50.  Example of
other three runs:
http://labs.suse.cz/mhocko/soft_limit_rework/stream_io-vs-mem_eater/kbuild-mem_eater2.png

So to wrap this up.  The series is still doing good and improves the soft
limit.

The testing results for bunch of cgroups with both stream IO and kbuild
loads can be found in "memcg: track children in soft limit excess to
improve soft limit".

This patch:

Memcg soft reclaim has been traditionally triggered from the global
reclaim paths before calling shrink_zone.  mem_cgroup_soft_limit_reclaim
then picked up a group which exceeds the soft limit the most and reclaimed
it with 0 priority to reclaim at least SWAP_CLUSTER_MAX pages.

The infrastructure requires per-node-zone trees which hold over-limit
groups and keep them up-to-date (via memcg_check_events) which is not cost
free.  Although this overhead hasn't turned out to be a bottle neck the
implementation is suboptimal because mem_cgroup_update_tree has no idea
which zones consumed memory over the limit so we could easily end up
having a group on a node-zone tree having only few pages from that
node-zone.

This patch doesn't try to fix node-zone trees management because it seems
that integrating soft reclaim into zone shrinking sounds much easier and
more appropriate for several reasons.  First of all 0 priority reclaim was
a crude hack which might lead to big stalls if the group's LRUs are big
and hard to reclaim (e.g.  a lot of dirty/writeback pages).  Soft reclaim
should be applicable also to the targeted reclaim which is awkward right
now without additional hacks.  Last but not least the whole infrastructure
eats quite some code.

After this patch shrink_zone is done in 2 passes.  First it tries to do
the soft reclaim if appropriate (only for global reclaim for now to keep
compatible with the original state) and fall back to ignoring soft limit
if no group is eligible to soft reclaim or nothing has been scanned during
the first pass.  Only groups which are over their soft limit or any of
their parents up the hierarchy is over the limit are considered eligible
during the first pass.

Soft limit tree which is not necessary anymore will be removed in the
follow up patch to make this patch smaller and easier to review.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Glauber Costa <glommer@openvz.org>
Reviewed-by: Tejun Heo <tj@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Glauber Costa <glommer@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 62 +++++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 36 insertions(+), 26 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index fe715daeb8bc..cf4643807ec2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -139,11 +139,21 @@ static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
 }
+
+static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
+{
+	return !mem_cgroup_disabled() && global_reclaim(sc);
+}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
 }
+
+static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
+{
+	return false;
+}
 #endif
 
 unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -2130,7 +2140,8 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }
 
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
+static void
+__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 {
 	unsigned long nr_reclaimed, nr_scanned;
 
@@ -2149,6 +2160,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 		do {
 			struct lruvec *lruvec;
 
+			if (soft_reclaim &&
+			    !mem_cgroup_soft_reclaim_eligible(memcg)) {
+				memcg = mem_cgroup_iter(root, memcg, &reclaim);
+				continue;
+			}
+
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 			shrink_lruvec(lruvec, sc);
@@ -2179,6 +2196,24 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 					 sc->nr_scanned - nr_scanned, sc));
 }
 
+
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
+{
+	bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
+	unsigned long nr_scanned = sc->nr_scanned;
+
+	__shrink_zone(zone, sc, do_soft_reclaim);
+
+	/*
+	 * No group is over the soft limit or those that are do not have
+	 * pages in the zone we are reclaiming so we have to reclaim everybody
+	 */
+	if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) {
+		__shrink_zone(zone, sc, false);
+		return;
+	}
+}
+
 /* Returns true if compaction should go ahead for a high-order request */
 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
 {
@@ -2240,8 +2275,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
-	unsigned long nr_soft_reclaimed;
-	unsigned long nr_soft_scanned;
 	bool aborted_reclaim = false;
 
 	/*
@@ -2281,18 +2314,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 					continue;
 				}
 			}
-			/*
-			 * This steals pages from memory cgroups over softlimit
-			 * and returns the number of reclaimed pages and
-			 * scanned pages. This works for global memory pressure
-			 * and balancing, not for a memcg's limit.
-			 */
-			nr_soft_scanned = 0;
-			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
-						sc->order, sc->gfp_mask,
-						&nr_soft_scanned);
-			sc->nr_reclaimed += nr_soft_reclaimed;
-			sc->nr_scanned += nr_soft_scanned;
 			/* need some check for avoid more shrink_zone() */
 		}
 
@@ -2880,8 +2901,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 {
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
-	unsigned long nr_soft_reclaimed;
-	unsigned long nr_soft_scanned;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.priority = DEF_PRIORITY,
@@ -2996,15 +3015,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 
 			sc.nr_scanned = 0;
 
-			nr_soft_scanned = 0;
-			/*
-			 * Call soft limit reclaim before calling shrink_zone.
-			 */
-			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
-							order, sc.gfp_mask,
-							&nr_soft_scanned);
-			sc.nr_reclaimed += nr_soft_reclaimed;
-
 			/*
 			 * There should be no need to raise the scanning
 			 * priority if enough pages are already being scanned
-- 
cgit v1.2.3


From a5b7c87f92076352dbff2fe0423ec255e1c9a71b Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Thu, 12 Sep 2013 15:13:25 -0700
Subject: vmscan, memcg: do softlimit reclaim also for targeted reclaim

Soft reclaim has been done only for the global reclaim (both background
and direct).  Since "memcg: integrate soft reclaim tighter with zone
shrinking code" there is no reason for this limitation anymore as the soft
limit reclaim doesn't use any special code paths and it is a part of the
zone shrinking code which is used by both global and targeted reclaims.

From the semantic point of view it is natural to consider soft limit
before touching all groups in the hierarchy tree which is touching the
hard limit because soft limit tells us where to push back when there is a
memory pressure.  It is not important whether the pressure comes from the
limit or imbalanced zones.

This patch simply enables soft reclaim unconditionally in
mem_cgroup_should_soft_reclaim so it is enabled for both global and
targeted reclaim paths.  mem_cgroup_soft_reclaim_eligible needs to learn
about the root of the reclaim to know where to stop checking soft limit
state of parents up the hierarchy.  Say we have

A (over soft limit)
 \
  B (below s.l., hit the hard limit)
 / \
C   D (below s.l.)

B is the source of the outside memory pressure now for D but we shouldn't
soft reclaim it because it is behaving well under B subtree and we can
still reclaim from C (pressumably it is over the limit).
mem_cgroup_soft_reclaim_eligible should therefore stop climbing up the
hierarchy at B (root of the memory pressure).

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Glauber Costa <glommer@openvz.org>
Reviewed-by: Tejun Heo <tj@kernel.org>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index cf4643807ec2..1896e7ca494b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -142,7 +142,7 @@ static bool global_reclaim(struct scan_control *sc)
 
 static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
 {
-	return !mem_cgroup_disabled() && global_reclaim(sc);
+	return !mem_cgroup_disabled();
 }
 #else
 static bool global_reclaim(struct scan_control *sc)
@@ -2161,7 +2161,7 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 			struct lruvec *lruvec;
 
 			if (soft_reclaim &&
-			    !mem_cgroup_soft_reclaim_eligible(memcg)) {
+			    !mem_cgroup_soft_reclaim_eligible(memcg, root)) {
 				memcg = mem_cgroup_iter(root, memcg, &reclaim);
 				continue;
 			}
-- 
cgit v1.2.3


From de57780dc659f95b17ccb649f003278dde0b5b86 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Thu, 12 Sep 2013 15:13:26 -0700
Subject: memcg: enhance memcg iterator to support predicates

The caller of the iterator might know that some nodes or even subtrees
should be skipped but there is no way to tell iterators about that so the
only choice left is to let iterators to visit each node and do the
selection outside of the iterating code.  This, however, doesn't scale
well with hierarchies with many groups where only few groups are
interesting.

This patch adds mem_cgroup_iter_cond variant of the iterator with a
callback which gets called for every visited node.  There are three
possible ways how the callback can influence the walk.  Either the node is
visited, it is skipped but the tree walk continues down the tree or the
whole subtree of the current group is skipped.

[hughd@google.com: fix memcg-less page reclaim]
Signed-off-by: Michal Hocko <mhocko@suse.cz>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Glauber Costa <glommer@openvz.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1896e7ca494b..f2e35099508b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2151,21 +2151,16 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 			.zone = zone,
 			.priority = sc->priority,
 		};
-		struct mem_cgroup *memcg;
+		struct mem_cgroup *memcg = NULL;
+		mem_cgroup_iter_filter filter = (soft_reclaim) ?
+			mem_cgroup_soft_reclaim_eligible : NULL;
 
 		nr_reclaimed = sc->nr_reclaimed;
 		nr_scanned = sc->nr_scanned;
 
-		memcg = mem_cgroup_iter(root, NULL, &reclaim);
-		do {
+		while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
 			struct lruvec *lruvec;
 
-			if (soft_reclaim &&
-			    !mem_cgroup_soft_reclaim_eligible(memcg, root)) {
-				memcg = mem_cgroup_iter(root, memcg, &reclaim);
-				continue;
-			}
-
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 			shrink_lruvec(lruvec, sc);
@@ -2185,8 +2180,7 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 				mem_cgroup_iter_break(root, memcg);
 				break;
 			}
-			memcg = mem_cgroup_iter(root, memcg, &reclaim);
-		} while (memcg);
+		}
 
 		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
 			   sc->nr_scanned - nr_scanned,
-- 
cgit v1.2.3


From e839b6a1c8d0895803bcbd587595a54f4221a625 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Thu, 12 Sep 2013 15:13:30 -0700
Subject: memcg, vmscan: do not attempt soft limit reclaim if it would not scan
 anything

mem_cgroup_should_soft_reclaim controls whether soft reclaim pass is
done and it always says yes currently.  Memcg iterators are clever to
skip nodes that are not soft reclaimable quite efficiently but
mem_cgroup_should_soft_reclaim can be more clever and do not start the
soft reclaim pass at all if it knows that nothing would be scanned
anyway.

In order to do that, simply reuse mem_cgroup_soft_reclaim_eligible for
the target group of the reclaim and allow the pass only if the whole
subtree wouldn't be skipped.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Glauber Costa <glommer@openvz.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index f2e35099508b..04dca89c4f34 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -142,7 +142,9 @@ static bool global_reclaim(struct scan_control *sc)
 
 static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
 {
-	return !mem_cgroup_disabled();
+	struct mem_cgroup *root = sc->target_mem_cgroup;
+	return !mem_cgroup_disabled() &&
+		mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE;
 }
 #else
 static bool global_reclaim(struct scan_control *sc)
-- 
cgit v1.2.3


From e975de998b9612aa97e3165f8827c09e0d36ce77 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Thu, 12 Sep 2013 15:13:34 -0700
Subject: memcg, vmscan: do not fall into reclaim-all pass too quickly

shrink_zone starts with soft reclaim pass first and then falls back to
regular reclaim if nothing has been scanned.  This behavior is natural
but there is a catch.  Memcg iterators, when used with the reclaim
cookie, are designed to help to prevent from over reclaim by
interleaving reclaimers (per node-zone-priority) so the tree walk might
miss many (even all) nodes in the hierarchy e.g.  when there are direct
reclaimers racing with each other or with kswapd in the global case or
multiple allocators reaching the limit for the target reclaim case.  To
make it even more complicated, targeted reclaim doesn't do the whole
tree walk because it stops reclaiming once it reclaims sufficient pages.
As a result groups over the limit might be missed, thus nothing is
scanned, and reclaim would fall back to the reclaim all mode.

This patch checks for the incomplete tree walk in shrink_zone.  If no
group has been visited and the hierarchy is soft reclaimable then we
must have missed some groups, in which case the __shrink_zone is called
again.  This doesn't guarantee there will be some progress of course
because the current reclaimer might be still racing with others but it
would at least give a chance to start the walk without a big risk of
reclaim latencies.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Glauber Costa <glommer@openvz.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 04dca89c4f34..fa91c20fe4b7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2142,10 +2142,11 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }
 
-static void
+static int
 __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 {
 	unsigned long nr_reclaimed, nr_scanned;
+	int groups_scanned = 0;
 
 	do {
 		struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2163,6 +2164,7 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 		while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
 			struct lruvec *lruvec;
 
+			groups_scanned++;
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 			shrink_lruvec(lruvec, sc);
@@ -2190,6 +2192,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 
 	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
 					 sc->nr_scanned - nr_scanned, sc));
+
+	return groups_scanned;
 }
 
 
@@ -2197,8 +2201,19 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 {
 	bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
 	unsigned long nr_scanned = sc->nr_scanned;
+	int scanned_groups;
 
-	__shrink_zone(zone, sc, do_soft_reclaim);
+	scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim);
+	/*
+         * memcg iterator might race with other reclaimer or start from
+         * a incomplete tree walk so the tree walk in __shrink_zone
+         * might have missed groups that are above the soft limit. Try
+         * another loop to catch up with others. Do it just once to
+         * prevent from reclaim latencies when other reclaimers always
+         * preempt this one.
+	 */
+	if (do_soft_reclaim && !scanned_groups)
+		__shrink_zone(zone, sc, do_soft_reclaim);
 
 	/*
 	 * No group is over the soft limit or those that are do not have
-- 
cgit v1.2.3


From f894ffa865301d4010d68b15be29912fa4039e77 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 12 Sep 2013 15:13:35 -0700
Subject: memcg: trivial cleanups

Clean up some mess made by the "Soft limit rework" series, and a few other
things.

Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index fa91c20fe4b7..76d1d5eaeec3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2205,12 +2205,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 
 	scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim);
 	/*
-         * memcg iterator might race with other reclaimer or start from
-         * a incomplete tree walk so the tree walk in __shrink_zone
-         * might have missed groups that are above the soft limit. Try
-         * another loop to catch up with others. Do it just once to
-         * prevent from reclaim latencies when other reclaimers always
-         * preempt this one.
+	 * memcg iterator might race with other reclaimer or start from
+	 * a incomplete tree walk so the tree walk in __shrink_zone
+	 * might have missed groups that are above the soft limit. Try
+	 * another loop to catch up with others. Do it just once to
+	 * prevent from reclaim latencies when other reclaimers always
+	 * preempt this one.
 	 */
 	if (do_soft_reclaim && !scanned_groups)
 		__shrink_zone(zone, sc, do_soft_reclaim);
-- 
cgit v1.2.3


From 20ba27f52eded4d14e309ac57971c9c83dec46e4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 24 Sep 2013 15:27:33 -0700
Subject: revert "memcg, vmscan: do not fall into reclaim-all pass too quickly"

Revert commit e975de998b96 ("memcg, vmscan: do not fall into reclaim-all
pass too quickly")

I merged this prematurely - Michal and Johannes still disagree about the
overall design direction and the future remains unclear.

Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ed1b775bdc9..97b0ed16749f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2176,11 +2176,10 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }
 
-static int
+static void
 __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 {
 	unsigned long nr_reclaimed, nr_scanned;
-	int groups_scanned = 0;
 
 	do {
 		struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2198,7 +2197,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 		while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
 			struct lruvec *lruvec;
 
-			groups_scanned++;
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 			shrink_lruvec(lruvec, sc);
@@ -2226,8 +2224,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 
 	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
 					 sc->nr_scanned - nr_scanned, sc));
-
-	return groups_scanned;
 }
 
 
@@ -2235,19 +2231,8 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 {
 	bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
 	unsigned long nr_scanned = sc->nr_scanned;
-	int scanned_groups;
 
-	scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim);
-	/*
-	 * memcg iterator might race with other reclaimer or start from
-	 * a incomplete tree walk so the tree walk in __shrink_zone
-	 * might have missed groups that are above the soft limit. Try
-	 * another loop to catch up with others. Do it just once to
-	 * prevent from reclaim latencies when other reclaimers always
-	 * preempt this one.
-	 */
-	if (do_soft_reclaim && !scanned_groups)
-		__shrink_zone(zone, sc, do_soft_reclaim);
+	__shrink_zone(zone, sc, do_soft_reclaim);
 
 	/*
 	 * No group is over the soft limit or those that are do not have
-- 
cgit v1.2.3


From 3120055e869f2a208480f238680d097eec8f0e02 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 24 Sep 2013 15:27:35 -0700
Subject: revert "memcg, vmscan: do not attempt soft limit reclaim if it would
 not scan anything"

Revert commit e839b6a1c8d0 ("memcg, vmscan: do not attempt soft limit
reclaim if it would not scan anything")

I merged this prematurely - Michal and Johannes still disagree about the
overall design direction and the future remains unclear.

Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 97b0ed16749f..cdd300b81485 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -142,9 +142,7 @@ static bool global_reclaim(struct scan_control *sc)
 
 static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
 {
-	struct mem_cgroup *root = sc->target_mem_cgroup;
-	return !mem_cgroup_disabled() &&
-		mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE;
+	return !mem_cgroup_disabled();
 }
 #else
 static bool global_reclaim(struct scan_control *sc)
-- 
cgit v1.2.3


From 694fbc0fe78518d06efa63910bf4ecee660e7852 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 24 Sep 2013 15:27:37 -0700
Subject: revert "memcg: enhance memcg iterator to support predicates"

Revert commit de57780dc659 ("memcg: enhance memcg iterator to support
predicates")

I merged this prematurely - Michal and Johannes still disagree about the
overall design direction and the future remains unclear.

Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index cdd300b81485..17a7134de4d7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2185,16 +2185,21 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 			.zone = zone,
 			.priority = sc->priority,
 		};
-		struct mem_cgroup *memcg = NULL;
-		mem_cgroup_iter_filter filter = (soft_reclaim) ?
-			mem_cgroup_soft_reclaim_eligible : NULL;
+		struct mem_cgroup *memcg;
 
 		nr_reclaimed = sc->nr_reclaimed;
 		nr_scanned = sc->nr_scanned;
 
-		while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
+		memcg = mem_cgroup_iter(root, NULL, &reclaim);
+		do {
 			struct lruvec *lruvec;
 
+			if (soft_reclaim &&
+			    !mem_cgroup_soft_reclaim_eligible(memcg, root)) {
+				memcg = mem_cgroup_iter(root, memcg, &reclaim);
+				continue;
+			}
+
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 			shrink_lruvec(lruvec, sc);
@@ -2214,7 +2219,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 				mem_cgroup_iter_break(root, memcg);
 				break;
 			}
-		}
+			memcg = mem_cgroup_iter(root, memcg, &reclaim);
+		} while (memcg);
 
 		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
 			   sc->nr_scanned - nr_scanned,
-- 
cgit v1.2.3


From b1aff7fcf86c88472b0a70f15d89d7a4adba07bb Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 24 Sep 2013 15:27:38 -0700
Subject: revert "vmscan, memcg: do softlimit reclaim also for targeted
 reclaim"

Revert commit a5b7c87f9207 ("vmscan, memcg: do softlimit reclaim also
for targeted reclaim")

I merged this prematurely - Michal and Johannes still disagree about the
overall design direction and the future remains unclear.

Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 17a7134de4d7..0e081cada4ba 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -142,7 +142,7 @@ static bool global_reclaim(struct scan_control *sc)
 
 static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
 {
-	return !mem_cgroup_disabled();
+	return !mem_cgroup_disabled() && global_reclaim(sc);
 }
 #else
 static bool global_reclaim(struct scan_control *sc)
@@ -2195,7 +2195,7 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 			struct lruvec *lruvec;
 
 			if (soft_reclaim &&
-			    !mem_cgroup_soft_reclaim_eligible(memcg, root)) {
+			    !mem_cgroup_soft_reclaim_eligible(memcg)) {
 				memcg = mem_cgroup_iter(root, memcg, &reclaim);
 				continue;
 			}
-- 
cgit v1.2.3


From 0608f43da64a1f1c42507304b5f25bc8b1227aa4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 24 Sep 2013 15:27:41 -0700
Subject: revert "memcg, vmscan: integrate soft reclaim tighter with zone
 shrinking code"

Revert commit 3b38722efd9f ("memcg, vmscan: integrate soft reclaim
tighter with zone shrinking code")

I merged this prematurely - Michal and Johannes still disagree about the
overall design direction and the future remains unclear.

Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 62 ++++++++++++++++++++++++++-----------------------------------
 1 file changed, 26 insertions(+), 36 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0e081cada4ba..beb35778c69f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -139,21 +139,11 @@ static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
 }
-
-static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
-{
-	return !mem_cgroup_disabled() && global_reclaim(sc);
-}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
 }
-
-static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
-{
-	return false;
-}
 #endif
 
 unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -2174,8 +2164,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }
 
-static void
-__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
 {
 	unsigned long nr_reclaimed, nr_scanned;
 
@@ -2194,12 +2183,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 		do {
 			struct lruvec *lruvec;
 
-			if (soft_reclaim &&
-			    !mem_cgroup_soft_reclaim_eligible(memcg)) {
-				memcg = mem_cgroup_iter(root, memcg, &reclaim);
-				continue;
-			}
-
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 			shrink_lruvec(lruvec, sc);
@@ -2230,24 +2213,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 					 sc->nr_scanned - nr_scanned, sc));
 }
 
-
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
-{
-	bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
-	unsigned long nr_scanned = sc->nr_scanned;
-
-	__shrink_zone(zone, sc, do_soft_reclaim);
-
-	/*
-	 * No group is over the soft limit or those that are do not have
-	 * pages in the zone we are reclaiming so we have to reclaim everybody
-	 */
-	if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) {
-		__shrink_zone(zone, sc, false);
-		return;
-	}
-}
-
 /* Returns true if compaction should go ahead for a high-order request */
 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
 {
@@ -2309,6 +2274,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
+	unsigned long nr_soft_reclaimed;
+	unsigned long nr_soft_scanned;
 	bool aborted_reclaim = false;
 
 	/*
@@ -2348,6 +2315,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 					continue;
 				}
 			}
+			/*
+			 * This steals pages from memory cgroups over softlimit
+			 * and returns the number of reclaimed pages and
+			 * scanned pages. This works for global memory pressure
+			 * and balancing, not for a memcg's limit.
+			 */
+			nr_soft_scanned = 0;
+			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+						sc->order, sc->gfp_mask,
+						&nr_soft_scanned);
+			sc->nr_reclaimed += nr_soft_reclaimed;
+			sc->nr_scanned += nr_soft_scanned;
 			/* need some check for avoid more shrink_zone() */
 		}
 
@@ -2941,6 +2920,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 {
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
+	unsigned long nr_soft_reclaimed;
+	unsigned long nr_soft_scanned;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.priority = DEF_PRIORITY,
@@ -3055,6 +3036,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 
 			sc.nr_scanned = 0;
 
+			nr_soft_scanned = 0;
+			/*
+			 * Call soft limit reclaim before calling shrink_zone.
+			 */
+			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+							order, sc.gfp_mask,
+							&nr_soft_scanned);
+			sc.nr_reclaimed += nr_soft_reclaimed;
+
 			/*
 			 * There should be no need to raise the scanning
 			 * priority if enough pages are already being scanned
-- 
cgit v1.2.3


From 117aad1e9e4d97448d1df3f84b08bd65811e6d6a Mon Sep 17 00:00:00 2001
From: Rafael Aquini <aquini@redhat.com>
Date: Mon, 30 Sep 2013 13:45:16 -0700
Subject: mm: avoid reinserting isolated balloon pages into LRU lists

Isolated balloon pages can wrongly end up in LRU lists when
migrate_pages() finishes its round without draining all the isolated
page list.

The same issue can happen when reclaim_clean_pages_from_list() tries to
reclaim pages from an isolated page list, before migration, in the CMA
path.  Such balloon page leak opens a race window against LRU lists
shrinkers that leads us to the following kernel panic:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000028
  IP: [<ffffffff810c2625>] shrink_page_list+0x24e/0x897
  PGD 3cda2067 PUD 3d713067 PMD 0
  Oops: 0000 [#1] SMP
  CPU: 0 PID: 340 Comm: kswapd0 Not tainted 3.12.0-rc1-22626-g4367597 #87
  Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
  RIP: shrink_page_list+0x24e/0x897
  RSP: 0000:ffff88003da499b8  EFLAGS: 00010286
  RAX: 0000000000000000 RBX: ffff88003e82bd60 RCX: 00000000000657d5
  RDX: 0000000000000000 RSI: 000000000000031f RDI: ffff88003e82bd40
  RBP: ffff88003da49ab0 R08: 0000000000000001 R09: 0000000081121a45
  R10: ffffffff81121a45 R11: ffff88003c4a9a28 R12: ffff88003e82bd40
  R13: ffff88003da0e800 R14: 0000000000000001 R15: ffff88003da49d58
  FS:  0000000000000000(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00000000067d9000 CR3: 000000003ace5000 CR4: 00000000000407b0
  Call Trace:
    shrink_inactive_list+0x240/0x3de
    shrink_lruvec+0x3e0/0x566
    __shrink_zone+0x94/0x178
    shrink_zone+0x3a/0x82
    balance_pgdat+0x32a/0x4c2
    kswapd+0x2f0/0x372
    kthread+0xa2/0xaa
    ret_from_fork+0x7c/0xb0
  Code: 80 7d 8f 01 48 83 95 68 ff ff ff 00 4c 89 e7 e8 5a 7b 00 00 48 85 c0 49 89 c5 75 08 80 7d 8f 00 74 3e eb 31 48 8b 80 18 01 00 00 <48> 8b 74 0d 48 8b 78 30 be 02 00 00 00 ff d2 eb
  RIP  [<ffffffff810c2625>] shrink_page_list+0x24e/0x897
   RSP <ffff88003da499b8>
  CR2: 0000000000000028
  ---[ end trace 703d2451af6ffbfd ]---
  Kernel panic - not syncing: Fatal exception

This patch fixes the issue, by assuring the proper tests are made at
putback_movable_pages() & reclaim_clean_pages_from_list() to avoid
isolated balloon pages being wrongly reinserted in LRU lists.

[akpm@linux-foundation.org: clarify awkward comment text]
Signed-off-by: Rafael Aquini <aquini@redhat.com>
Reported-by: Luiz Capitulino <lcapitulino@redhat.com>
Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index beb35778c69f..53f2f82f83ae 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,7 @@
 #include <asm/div64.h>
 
 #include <linux/swapops.h>
+#include <linux/balloon_compaction.h>
 
 #include "internal.h"
 
@@ -1113,7 +1114,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 	LIST_HEAD(clean_pages);
 
 	list_for_each_entry_safe(page, next, page_list, lru) {
-		if (page_is_file_cache(page) && !PageDirty(page)) {
+		if (page_is_file_cache(page) && !PageDirty(page) &&
+		    !isolated_balloon_page(page)) {
 			ClearPageActive(page);
 			list_move(&page->lru, &clean_pages);
 		}
-- 
cgit v1.2.3