From 24f7c6b981fb70084757382da464ea85d72af300 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Aug 2013 10:17:56 +1000 Subject: mm: new shrinker API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current shrinker callout API uses an a single shrinker call for multiple functions. To determine the function, a special magical value is passed in a parameter to change the behaviour. This complicates the implementation and return value specification for the different behaviours. Separate the two different behaviours into separate operations, one to return a count of freeable objects in the cache, and another to scan a certain number of objects in the cache for freeing. In defining these new operations, ensure the return values and resultant behaviours are clearly defined and documented. Modify shrink_slab() to use the new API and implement the callouts for all the existing shrinkers. Signed-off-by: Dave Chinner Signed-off-by: Glauber Costa Acked-by: Mel Gorman Cc: "Theodore Ts'o" Cc: Adrian Hunter Cc: Al Viro Cc: Artem Bityutskiy Cc: Arve Hjønnevåg Cc: Carlos Maiolino Cc: Christoph Hellwig Cc: Chuck Lever Cc: Daniel Vetter Cc: David Rientjes Cc: Gleb Natapov Cc: Greg Thelen Cc: J. Bruce Fields Cc: Jan Kara Cc: Jerome Glisse Cc: John Stultz Cc: KAMEZAWA Hiroyuki Cc: Kent Overstreet Cc: Kirill A. Shutemov Cc: Marcelo Tosatti Cc: Mel Gorman Cc: Steven Whitehouse Cc: Thomas Hellstrom Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- mm/vmscan.c | 60 ++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 20 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2cff0d491c6d..4d4e859b4b9c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -205,19 +205,24 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker, * * Returns the number of slab objects which we shrunk. */ -unsigned long shrink_slab(struct shrink_control *shrink, +unsigned long shrink_slab(struct shrink_control *shrinkctl, unsigned long nr_pages_scanned, unsigned long lru_pages) { struct shrinker *shrinker; - unsigned long ret = 0; + unsigned long freed = 0; if (nr_pages_scanned == 0) nr_pages_scanned = SWAP_CLUSTER_MAX; if (!down_read_trylock(&shrinker_rwsem)) { - /* Assume we'll be able to shrink next time */ - ret = 1; + /* + * If we would return 0, our callers would understand that we + * have nothing else to shrink and give up trying. By returning + * 1 we keep it going and assume we'll be able to shrink next + * time. + */ + freed = 1; goto out; } @@ -225,14 +230,16 @@ unsigned long shrink_slab(struct shrink_control *shrink, unsigned long long delta; long total_scan; long max_pass; - int shrink_ret = 0; long nr; long new_nr; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; - max_pass = do_shrinker_shrink(shrinker, shrink, 0); - if (max_pass <= 0) + if (shrinker->count_objects) + max_pass = shrinker->count_objects(shrinker, shrinkctl); + else + max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0); + if (max_pass == 0) continue; /* @@ -248,8 +255,8 @@ unsigned long shrink_slab(struct shrink_control *shrink, do_div(delta, lru_pages + 1); total_scan += delta; if (total_scan < 0) { - printk(KERN_ERR "shrink_slab: %pF negative objects to " - "delete nr=%ld\n", + printk(KERN_ERR + "shrink_slab: %pF negative objects to delete nr=%ld\n", shrinker->shrink, total_scan); total_scan = max_pass; } @@ -277,20 +284,33 @@ unsigned long shrink_slab(struct shrink_control *shrink, if (total_scan > max_pass * 2) total_scan = max_pass * 2; - trace_mm_shrink_slab_start(shrinker, shrink, nr, + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, nr_pages_scanned, lru_pages, max_pass, delta, total_scan); while (total_scan >= batch_size) { - int nr_before; - nr_before = do_shrinker_shrink(shrinker, shrink, 0); - shrink_ret = do_shrinker_shrink(shrinker, shrink, - batch_size); - if (shrink_ret == -1) - break; - if (shrink_ret < nr_before) - ret += nr_before - shrink_ret; + if (shrinker->scan_objects) { + unsigned long ret; + shrinkctl->nr_to_scan = batch_size; + ret = shrinker->scan_objects(shrinker, shrinkctl); + + if (ret == SHRINK_STOP) + break; + freed += ret; + } else { + int nr_before; + long ret; + + nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0); + ret = do_shrinker_shrink(shrinker, shrinkctl, + batch_size); + if (ret == -1) + break; + if (ret < nr_before) + freed += nr_before - ret; + } + count_vm_events(SLABS_SCANNED, batch_size); total_scan -= batch_size; @@ -308,12 +328,12 @@ unsigned long shrink_slab(struct shrink_control *shrink, else new_nr = atomic_long_read(&shrinker->nr_in_batch); - trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); + trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); } up_read(&shrinker_rwsem); out: cond_resched(); - return ret; + return freed; } static inline int is_page_cache_freeable(struct page *page) -- cgit v1.2.3 From 0ce3d74450815500e31f16a0b65f6bab687985c3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Aug 2013 10:18:03 +1000 Subject: shrinker: add node awareness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass the node of the current zone being reclaimed to shrink_slab(), allowing the shrinker control nodemask to be set appropriately for node aware shrinkers. Signed-off-by: Dave Chinner Signed-off-by: Glauber Costa Acked-by: Mel Gorman Cc: "Theodore Ts'o" Cc: Adrian Hunter Cc: Al Viro Cc: Artem Bityutskiy Cc: Arve Hjønnevåg Cc: Carlos Maiolino Cc: Christoph Hellwig Cc: Chuck Lever Cc: Daniel Vetter Cc: David Rientjes Cc: Gleb Natapov Cc: Greg Thelen Cc: J. Bruce Fields Cc: Jan Kara Cc: Jerome Glisse Cc: John Stultz Cc: KAMEZAWA Hiroyuki Cc: Kent Overstreet Cc: Kirill A. Shutemov Cc: Marcelo Tosatti Cc: Mel Gorman Cc: Steven Whitehouse Cc: Thomas Hellstrom Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- mm/vmscan.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 4d4e859b4b9c..fe0d5c458440 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2374,12 +2374,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, */ if (global_reclaim(sc)) { unsigned long lru_pages = 0; + + nodes_clear(shrink->nodes_to_scan); for_each_zone_zonelist(zone, z, zonelist, gfp_zone(sc->gfp_mask)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; lru_pages += zone_reclaimable_pages(zone); + node_set(zone_to_nid(zone), + shrink->nodes_to_scan); } shrink_slab(shrink, sc->nr_scanned, lru_pages); @@ -2836,6 +2840,8 @@ static bool kswapd_shrink_zone(struct zone *zone, return true; shrink_zone(zone, sc); + nodes_clear(shrink.nodes_to_scan); + node_set(zone_to_nid(zone), shrink.nodes_to_scan); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); @@ -3544,10 +3550,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * number of slab pages and shake the slab until it is reduced * by the same nr_pages that we used for reclaiming unmapped * pages. - * - * Note that shrink_slab will free memory on all zones and may - * take a long time. */ + nodes_clear(shrink.nodes_to_scan); + node_set(zone_to_nid(zone), shrink.nodes_to_scan); for (;;) { unsigned long lru_pages = zone_reclaimable_pages(zone); -- cgit v1.2.3 From 1d3d4437eae1bb2963faab427f65f90663c64aa1 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Wed, 28 Aug 2013 10:18:04 +1000 Subject: vmscan: per-node deferred work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The list_lru infrastructure already keeps per-node LRU lists in its node-specific list_lru_node arrays and provide us with a per-node API, and the shrinkers are properly equiped with node information. This means that we can now focus our shrinking effort in a single node, but the work that is deferred from one run to another is kept global at nr_in_batch. Work can be deferred, for instance, during direct reclaim under a GFP_NOFS allocation, where situation, all the filesystem shrinkers will be prevented from running and accumulate in nr_in_batch the amount of work they should have done, but could not. This creates an impedance problem, where upon node pressure, work deferred will accumulate and end up being flushed in other nodes. The problem we describe is particularly harmful in big machines, where many nodes can accumulate at the same time, all adding to the global counter nr_in_batch. As we accumulate more and more, we start to ask for the caches to flush even bigger numbers. The result is that the caches are depleted and do not stabilize. To achieve stable steady state behavior, we need to tackle it differently. In this patch we keep the deferred count per-node, in the new array nr_deferred[] (the name is also a bit more descriptive) and will never accumulate that to other nodes. Signed-off-by: Glauber Costa Cc: Dave Chinner Cc: Mel Gorman Cc: "Theodore Ts'o" Cc: Adrian Hunter Cc: Al Viro Cc: Artem Bityutskiy Cc: Arve Hjønnevåg Cc: Carlos Maiolino Cc: Christoph Hellwig Cc: Chuck Lever Cc: Daniel Vetter Cc: David Rientjes Cc: Gleb Natapov Cc: Greg Thelen Cc: J. Bruce Fields Cc: Jan Kara Cc: Jerome Glisse Cc: John Stultz Cc: KAMEZAWA Hiroyuki Cc: Kent Overstreet Cc: Kirill A. Shutemov Cc: Marcelo Tosatti Cc: Mel Gorman Cc: Steven Whitehouse Cc: Thomas Hellstrom Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- mm/vmscan.c | 241 +++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 140 insertions(+), 101 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index fe0d5c458440..799ebceeb4f7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -155,14 +155,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) } /* - * Add a shrinker callback to be called from the vm + * Add a shrinker callback to be called from the vm. */ -void register_shrinker(struct shrinker *shrinker) +int register_shrinker(struct shrinker *shrinker) { - atomic_long_set(&shrinker->nr_in_batch, 0); + size_t size = sizeof(*shrinker->nr_deferred); + + /* + * If we only have one possible node in the system anyway, save + * ourselves the trouble and disable NUMA aware behavior. This way we + * will save memory and some small loop time later. + */ + if (nr_node_ids == 1) + shrinker->flags &= ~SHRINKER_NUMA_AWARE; + + if (shrinker->flags & SHRINKER_NUMA_AWARE) + size *= nr_node_ids; + + shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); + if (!shrinker->nr_deferred) + return -ENOMEM; + down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); up_write(&shrinker_rwsem); + return 0; } EXPORT_SYMBOL(register_shrinker); @@ -186,6 +203,118 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker, } #define SHRINK_BATCH 128 + +static unsigned long +shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, + unsigned long nr_pages_scanned, unsigned long lru_pages) +{ + unsigned long freed = 0; + unsigned long long delta; + long total_scan; + long max_pass; + long nr; + long new_nr; + int nid = shrinkctl->nid; + long batch_size = shrinker->batch ? shrinker->batch + : SHRINK_BATCH; + + if (shrinker->count_objects) + max_pass = shrinker->count_objects(shrinker, shrinkctl); + else + max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0); + if (max_pass == 0) + return 0; + + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. + */ + nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); + + total_scan = nr; + delta = (4 * nr_pages_scanned) / shrinker->seeks; + delta *= max_pass; + do_div(delta, lru_pages + 1); + total_scan += delta; + if (total_scan < 0) { + printk(KERN_ERR + "shrink_slab: %pF negative objects to delete nr=%ld\n", + shrinker->shrink, total_scan); + total_scan = max_pass; + } + + /* + * We need to avoid excessive windup on filesystem shrinkers + * due to large numbers of GFP_NOFS allocations causing the + * shrinkers to return -1 all the time. This results in a large + * nr being built up so when a shrink that can do some work + * comes along it empties the entire cache due to nr >>> + * max_pass. This is bad for sustaining a working set in + * memory. + * + * Hence only allow the shrinker to scan the entire cache when + * a large delta change is calculated directly. + */ + if (delta < max_pass / 4) + total_scan = min(total_scan, max_pass / 2); + + /* + * Avoid risking looping forever due to too large nr value: + * never try to free more than twice the estimate number of + * freeable entries. + */ + if (total_scan > max_pass * 2) + total_scan = max_pass * 2; + + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, + nr_pages_scanned, lru_pages, + max_pass, delta, total_scan); + + while (total_scan >= batch_size) { + + if (shrinker->scan_objects) { + unsigned long ret; + shrinkctl->nr_to_scan = batch_size; + ret = shrinker->scan_objects(shrinker, shrinkctl); + + if (ret == SHRINK_STOP) + break; + freed += ret; + } else { + int nr_before; + long ret; + + nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0); + ret = do_shrinker_shrink(shrinker, shrinkctl, + batch_size); + if (ret == -1) + break; + if (ret < nr_before) + freed += nr_before - ret; + } + + count_vm_events(SLABS_SCANNED, batch_size); + total_scan -= batch_size; + + cond_resched(); + } + + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. If we exhausted the + * scan, there is no need to do an update. + */ + if (total_scan > 0) + new_nr = atomic_long_add_return(total_scan, + &shrinker->nr_deferred[nid]); + else + new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); + + trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); + return freed; +} + /* * Call the shrink functions to age shrinkable caches * @@ -227,108 +356,18 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl, } list_for_each_entry(shrinker, &shrinker_list, list) { - unsigned long long delta; - long total_scan; - long max_pass; - long nr; - long new_nr; - long batch_size = shrinker->batch ? shrinker->batch - : SHRINK_BATCH; - - if (shrinker->count_objects) - max_pass = shrinker->count_objects(shrinker, shrinkctl); - else - max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0); - if (max_pass == 0) - continue; - - /* - * copy the current shrinker scan count into a local variable - * and zero it so that other concurrent shrinker invocations - * don't also do this scanning work. - */ - nr = atomic_long_xchg(&shrinker->nr_in_batch, 0); - - total_scan = nr; - delta = (4 * nr_pages_scanned) / shrinker->seeks; - delta *= max_pass; - do_div(delta, lru_pages + 1); - total_scan += delta; - if (total_scan < 0) { - printk(KERN_ERR - "shrink_slab: %pF negative objects to delete nr=%ld\n", - shrinker->shrink, total_scan); - total_scan = max_pass; - } - - /* - * We need to avoid excessive windup on filesystem shrinkers - * due to large numbers of GFP_NOFS allocations causing the - * shrinkers to return -1 all the time. This results in a large - * nr being built up so when a shrink that can do some work - * comes along it empties the entire cache due to nr >>> - * max_pass. This is bad for sustaining a working set in - * memory. - * - * Hence only allow the shrinker to scan the entire cache when - * a large delta change is calculated directly. - */ - if (delta < max_pass / 4) - total_scan = min(total_scan, max_pass / 2); - - /* - * Avoid risking looping forever due to too large nr value: - * never try to free more than twice the estimate number of - * freeable entries. - */ - if (total_scan > max_pass * 2) - total_scan = max_pass * 2; - - trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - nr_pages_scanned, lru_pages, - max_pass, delta, total_scan); - - while (total_scan >= batch_size) { - - if (shrinker->scan_objects) { - unsigned long ret; - shrinkctl->nr_to_scan = batch_size; - ret = shrinker->scan_objects(shrinker, shrinkctl); + for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { + if (!node_online(shrinkctl->nid)) + continue; - if (ret == SHRINK_STOP) - break; - freed += ret; - } else { - int nr_before; - long ret; - - nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0); - ret = do_shrinker_shrink(shrinker, shrinkctl, - batch_size); - if (ret == -1) - break; - if (ret < nr_before) - freed += nr_before - ret; - } + if (!(shrinker->flags & SHRINKER_NUMA_AWARE) && + (shrinkctl->nid != 0)) + break; - count_vm_events(SLABS_SCANNED, batch_size); - total_scan -= batch_size; + freed += shrink_slab_node(shrinkctl, shrinker, + nr_pages_scanned, lru_pages); - cond_resched(); } - - /* - * move the unused scan count back into the shrinker in a - * manner that handles concurrent updates. If we exhausted the - * scan, there is no need to do an update. - */ - if (total_scan > 0) - new_nr = atomic_long_add_return(total_scan, - &shrinker->nr_in_batch); - else - new_nr = atomic_long_read(&shrinker->nr_in_batch); - - trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); } up_read(&shrinker_rwsem); out: -- cgit v1.2.3 From a0b02131c5fcd8545b867db72224b3659e813f10 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Aug 2013 10:18:16 +1000 Subject: shrinker: Kill old ->shrink API. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no more users of this API, so kill it dead, dead, dead and quietly bury the corpse in a shallow, unmarked grave in a dark forest deep in the hills... [glommer@openvz.org: added flowers to the grave] Signed-off-by: Dave Chinner Signed-off-by: Glauber Costa Reviewed-by: Greg Thelen Acked-by: Mel Gorman Cc: "Theodore Ts'o" Cc: Adrian Hunter Cc: Al Viro Cc: Artem Bityutskiy Cc: Arve Hjønnevåg Cc: Carlos Maiolino Cc: Christoph Hellwig Cc: Chuck Lever Cc: Daniel Vetter Cc: David Rientjes Cc: Gleb Natapov Cc: Greg Thelen Cc: J. Bruce Fields Cc: Jan Kara Cc: Jerome Glisse Cc: John Stultz Cc: KAMEZAWA Hiroyuki Cc: Kent Overstreet Cc: Kirill A. Shutemov Cc: Marcelo Tosatti Cc: Mel Gorman Cc: Steven Whitehouse Cc: Thomas Hellstrom Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- mm/vmscan.c | 41 ++++++++--------------------------------- 1 file changed, 8 insertions(+), 33 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 799ebceeb4f7..e36454220614 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -194,14 +194,6 @@ void unregister_shrinker(struct shrinker *shrinker) } EXPORT_SYMBOL(unregister_shrinker); -static inline int do_shrinker_shrink(struct shrinker *shrinker, - struct shrink_control *sc, - unsigned long nr_to_scan) -{ - sc->nr_to_scan = nr_to_scan; - return (*shrinker->shrink)(shrinker, sc); -} - #define SHRINK_BATCH 128 static unsigned long @@ -218,10 +210,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; - if (shrinker->count_objects) - max_pass = shrinker->count_objects(shrinker, shrinkctl); - else - max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0); + max_pass = shrinker->count_objects(shrinker, shrinkctl); if (max_pass == 0) return 0; @@ -240,7 +229,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, if (total_scan < 0) { printk(KERN_ERR "shrink_slab: %pF negative objects to delete nr=%ld\n", - shrinker->shrink, total_scan); + shrinker->scan_objects, total_scan); total_scan = max_pass; } @@ -272,27 +261,13 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, max_pass, delta, total_scan); while (total_scan >= batch_size) { + unsigned long ret; - if (shrinker->scan_objects) { - unsigned long ret; - shrinkctl->nr_to_scan = batch_size; - ret = shrinker->scan_objects(shrinker, shrinkctl); - - if (ret == SHRINK_STOP) - break; - freed += ret; - } else { - int nr_before; - long ret; - - nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0); - ret = do_shrinker_shrink(shrinker, shrinkctl, - batch_size); - if (ret == -1) - break; - if (ret < nr_before) - freed += nr_before - ret; - } + shrinkctl->nr_to_scan = batch_size; + ret = shrinker->scan_objects(shrinker, shrinkctl); + if (ret == SHRINK_STOP) + break; + freed += ret; count_vm_events(SLABS_SCANNED, batch_size); total_scan -= batch_size; -- cgit v1.2.3 From 892f795df1eb119b560a3ee5a1ca3f385a852e84 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Sep 2013 14:20:39 -0700 Subject: mm: vmscan: fix numa reclaim balance problem in kswapd The way the page allocator interacts with kswapd creates aging imbalances, where the amount of time a userspace page gets in memory under reclaim pressure is dependent on which zone, which node the allocator took the page frame from. #1 fixes missed kswapd wakeups on NUMA systems, which lead to some nodes falling behind for a full reclaim cycle relative to the other nodes in the system #3 fixes an interaction where kswapd and a continuous stream of page allocations keep the preferred zone of a task between the high and low watermark (allocations succeed + kswapd does not go to sleep) indefinitely, completely underutilizing the lower zones and thrashing on the preferred zone These patches are the aging fairness part of the thrash-detection based file LRU balancing. Andrea recommended to submit them separately as they are bugfixes in their own right. The following test ran a foreground workload (memcachetest) with background IO of various sizes on a 4 node 8G system (similar results were observed with single-node 4G systems): parallelio BAS FAIRALLO BASE FAIRALLOC Ops memcachetest-0M 5170.00 ( 0.00%) 5283.00 ( 2.19%) Ops memcachetest-791M 4740.00 ( 0.00%) 5293.00 ( 11.67%) Ops memcachetest-2639M 2551.00 ( 0.00%) 4950.00 ( 94.04%) Ops memcachetest-4487M 2606.00 ( 0.00%) 3922.00 ( 50.50%) Ops io-duration-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops io-duration-791M 55.00 ( 0.00%) 18.00 ( 67.27%) Ops io-duration-2639M 235.00 ( 0.00%) 103.00 ( 56.17%) Ops io-duration-4487M 278.00 ( 0.00%) 173.00 ( 37.77%) Ops swaptotal-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swaptotal-791M 245184.00 ( 0.00%) 0.00 ( 0.00%) Ops swaptotal-2639M 468069.00 ( 0.00%) 108778.00 ( 76.76%) Ops swaptotal-4487M 452529.00 ( 0.00%) 76623.00 ( 83.07%) Ops swapin-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-791M 108297.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-2639M 169537.00 ( 0.00%) 50031.00 ( 70.49%) Ops swapin-4487M 167435.00 ( 0.00%) 34178.00 ( 79.59%) Ops minorfaults-0M 1518666.00 ( 0.00%) 1503993.00 ( 0.97%) Ops minorfaults-791M 1676963.00 ( 0.00%) 1520115.00 ( 9.35%) Ops minorfaults-2639M 1606035.00 ( 0.00%) 1799717.00 (-12.06%) Ops minorfaults-4487M 1612118.00 ( 0.00%) 1583825.00 ( 1.76%) Ops majorfaults-0M 6.00 ( 0.00%) 0.00 ( 0.00%) Ops majorfaults-791M 13836.00 ( 0.00%) 10.00 ( 99.93%) Ops majorfaults-2639M 22307.00 ( 0.00%) 6490.00 ( 70.91%) Ops majorfaults-4487M 21631.00 ( 0.00%) 4380.00 ( 79.75%) BAS FAIRALLO BASE FAIRALLOC User 287.78 460.97 System 2151.67 3142.51 Elapsed 9737.00 8879.34 BAS FAIRALLO BASE FAIRALLOC Minor Faults 53721925 57188551 Major Faults 392195 15157 Swap Ins 2994854 112770 Swap Outs 4907092 134982 Direct pages scanned 0 41824 Kswapd pages scanned 32975063 8128269 Kswapd pages reclaimed 6323069 7093495 Direct pages reclaimed 0 41824 Kswapd efficiency 19% 87% Kswapd velocity 3386.573 915.414 Direct efficiency 100% 100% Direct velocity 0.000 4.710 Percentage direct scans 0% 0% Zone normal velocity 2011.338 550.661 Zone dma32 velocity 1365.623 369.221 Zone dma velocity 9.612 0.242 Page writes by reclaim 18732404.000 614807.000 Page writes file 13825312 479825 Page writes anon 4907092 134982 Page reclaim immediate 85490 5647 Sector Reads 12080532 483244 Sector Writes 88740508 65438876 Page rescued immediate 0 0 Slabs scanned 82560 12160 Direct inode steals 0 0 Kswapd inode steals 24401 40013 Kswapd skipped wait 0 0 THP fault alloc 6 8 THP collapse alloc 5481 5812 THP splits 75 22 THP fault fallback 0 0 THP collapse fail 0 0 Compaction stalls 0 54 Compaction success 0 45 Compaction failures 0 9 Page migrate success 881492 82278 Page migrate failure 0 0 Compaction pages isolated 0 60334 Compaction migrate scanned 0 53505 Compaction free scanned 0 1537605 Compaction cost 914 86 NUMA PTE updates 46738231 41988419 NUMA hint faults 31175564 24213387 NUMA hint local faults 10427393 6411593 NUMA pages migrated 881492 55344 AutoNUMA cost 156221 121361 The overall runtime was reduced, throughput for both the foreground workload as well as the background IO improved, major faults, swapping and reclaim activity shrunk significantly, reclaim efficiency more than quadrupled. This patch: When the page allocator fails to get a page from all zones in its given zonelist, it wakes up the per-node kswapds for all zones that are at their low watermark. However, with a system under load the free pages in a zone can fluctuate enough that the allocation fails but the kswapd wakeup is also skipped while the zone is still really close to the low watermark. When one node misses a wakeup like this, it won't be aged before all the other node's zones are down to their low watermarks again. And skipping a full aging cycle is an obvious fairness problem. Kswapd runs until the high watermarks are restored, so it should also be woken when the high watermarks are not met. This ages nodes more equally and creates a safety margin for the page counter fluctuation. By using zone_balanced(), it will now check, in addition to the watermark, if compaction requires more order-0 pages to create a higher order page. Signed-off-by: Johannes Weiner Cc: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Paul Bolle Tested-by: Zlatko Calusic Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2cff0d491c6d..758540d3ca83 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3237,7 +3237,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) } if (!waitqueue_active(&pgdat->kswapd_wait)) return; - if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + if (zone_balanced(zone, order, 0, 0)) return; trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); -- cgit v1.2.3 From 0ec3b74c7f5599c8a4d2b33d430a5470af26ebf6 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:26 -0700 Subject: mm: putback_lru_page: remove unnecessary call to page_lru_base_type() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The goal of this patch series is to improve performance of munlock() of large mlocked memory areas on systems without THP. This is motivated by reported very long times of crash recovery of processes with such areas, where munlock() can take several seconds. See http://lwn.net/Articles/548108/ The work was driven by a simple benchmark (to be included in mmtests) that mmaps() e.g. 56GB with MAP_LOCKED | MAP_POPULATE and measures the time of munlock(). Profiling was performed by attaching operf --pid to the process and sending a signal to trigger the munlock() part and then notify bach the monitoring wrapper to stop operf, so that only munlock() appears in the profile. The profiles have shown that CPU time is spent mostly by atomic operations and repeated locking per single pages. This series aims to reduce both, starting from simpler to more complex changes. Patch 1 performs a simple cleanup in putback_lru_page() so that page lru base type is not determined without being actually needed. Patch 2 removes an unnecessary call to lru_add_drain() which drains the per-cpu pagevec after each munlocked page is put there. Patch 3 changes munlock_vma_range() to use an on-stack pagevec for isolating multiple non-THP pages under a single lru_lock instead of locking and processing each page separately. Patch 4 changes the NR_MLOCK accounting to be called only once per the pvec introduced by previous patch. Patch 5 uses the introduced pagevec to batch also the work of putback_lru_page when possible, bypassing the per-cpu pvec and associated overhead. Patch 6 removes a redundant get_page/put_page pair which saves costly atomic operations. Patch 7 avoids calling follow_page_mask() on each individual page, and obtains multiple page references under a single page table lock where possible. Measurements were made using 3.11-rc3 as a baseline. The first set of measurements shows the possibly ideal conditions where batching should help the most. All memory is allocated from a single NUMA node and THP is disabled. timedmunlock 3.11-rc3 3.11-rc3 3.11-rc3 3.11-rc3 3.11-rc3 3.11-rc3 3.11-rc3 3.11-rc3 0 1 2 3 4 5 6 7 Elapsed min 3.38 ( 0.00%) 3.39 ( -0.13%) 3.00 ( 11.33%) 2.70 ( 20.20%) 2.67 ( 21.11%) 2.37 ( 29.88%) 2.20 ( 34.91%) 1.91 ( 43.59%) Elapsed mean 3.39 ( 0.00%) 3.40 ( -0.23%) 3.01 ( 11.33%) 2.70 ( 20.26%) 2.67 ( 21.21%) 2.38 ( 29.88%) 2.21 ( 34.93%) 1.92 ( 43.46%) Elapsed stddev 0.01 ( 0.00%) 0.01 (-43.09%) 0.01 ( 15.42%) 0.01 ( 23.42%) 0.00 ( 89.78%) 0.01 ( -7.15%) 0.00 ( 76.69%) 0.02 (-91.77%) Elapsed max 3.41 ( 0.00%) 3.43 ( -0.52%) 3.03 ( 11.29%) 2.72 ( 20.16%) 2.67 ( 21.63%) 2.40 ( 29.50%) 2.21 ( 35.21%) 1.96 ( 42.39%) Elapsed range 0.03 ( 0.00%) 0.04 (-51.16%) 0.02 ( 6.27%) 0.02 ( 14.67%) 0.00 ( 88.90%) 0.03 (-19.18%) 0.01 ( 73.70%) 0.06 (-113.35% The second set of measurements simulates the worst possible conditions for batching by using numactl --interleave, so that there is in fact only one page per pagevec. Even in this case the series seems to improve performance thanks to reduced atomic operations and removal of lru_add_drain(). timedmunlock 3.11-rc3 3.11-rc3 3.11-rc3 3.11-rc3 3.11-rc3 3.11-rc3 3.11-rc3 3.11-rc3 0 1 2 3 4 5 6 7 Elapsed min 4.00 ( 0.00%) 4.04 ( -0.93%) 3.87 ( 3.37%) 3.72 ( 6.94%) 3.81 ( 4.72%) 3.69 ( 7.82%) 3.64 ( 8.92%) 3.41 ( 14.81%) Elapsed mean 4.17 ( 0.00%) 4.15 ( 0.51%) 4.03 ( 3.49%) 3.89 ( 6.84%) 3.86 ( 7.48%) 3.89 ( 6.69%) 3.70 ( 11.27%) 3.48 ( 16.59%) Elapsed stddev 0.16 ( 0.00%) 0.08 ( 50.76%) 0.10 ( 41.58%) 0.16 ( 4.59%) 0.05 ( 72.38%) 0.19 (-12.91%) 0.05 ( 68.09%) 0.06 ( 66.03%) Elapsed max 4.34 ( 0.00%) 4.32 ( 0.56%) 4.19 ( 3.62%) 4.12 ( 5.15%) 3.91 ( 9.88%) 4.12 ( 5.25%) 3.80 ( 12.58%) 3.56 ( 18.08%) Elapsed range 0.34 ( 0.00%) 0.28 ( 17.91%) 0.32 ( 6.45%) 0.40 (-15.73%) 0.10 ( 70.06%) 0.43 (-24.84%) 0.15 ( 55.32%) 0.15 ( 56.16%) For completeness, a third set of measurements shows the situation where THP is enabled and allocations are again done on a single NUMA node. Here munlock() is already very fast thanks to huge pages, and this series does not compromise that performance. It seems that the removal of call to lru_add_drain() still helps a bit. timedmunlock 3.11-rc3 3.11-rc3 3.11-rc3 3.11-rc3 3.11-rc3 3.11-rc3 3.11-rc3 3.11-rc3 0 1 2 3 4 5 6 7 Elapsed min 0.01 ( 0.00%) 0.01 ( -0.11%) 0.01 ( 6.59%) 0.01 ( 5.41%) 0.01 ( 5.45%) 0.01 ( 5.03%) 0.01 ( 6.08%) 0.01 ( 5.20%) Elapsed mean 0.01 ( 0.00%) 0.01 ( -0.27%) 0.01 ( 6.39%) 0.01 ( 5.30%) 0.01 ( 5.32%) 0.01 ( 5.03%) 0.01 ( 5.97%) 0.01 ( 5.22%) Elapsed stddev 0.00 ( 0.00%) 0.00 ( -9.59%) 0.00 ( 10.77%) 0.00 ( 3.24%) 0.00 ( 24.42%) 0.00 ( 31.86%) 0.00 ( -7.46%) 0.00 ( 6.11%) Elapsed max 0.01 ( 0.00%) 0.01 ( -0.01%) 0.01 ( 6.83%) 0.01 ( 5.42%) 0.01 ( 5.79%) 0.01 ( 5.53%) 0.01 ( 6.08%) 0.01 ( 5.26%) Elapsed range 0.00 ( 0.00%) 0.00 ( 7.30%) 0.00 ( 24.38%) 0.00 ( 6.10%) 0.00 ( 30.79%) 0.00 ( 42.52%) 0.00 ( 6.11%) 0.00 ( 10.07%) This patch (of 7): In putback_lru_page() since commit c53954a092 (""mm: remove lru parameter from __lru_cache_add and lru_cache_add_lru") it is no longer needed to determine lru list via page_lru_base_type(). This patch replaces it with simple flag is_unevictable which says that the page was put on the inevictable list. This is the only information that matters in subsequent tests. Signed-off-by: Vlastimil Babka Reviewed-by: Jörn Engel Acked-by: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 758540d3ca83..44c072a7cba2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -545,7 +545,7 @@ int remove_mapping(struct address_space *mapping, struct page *page) */ void putback_lru_page(struct page *page) { - int lru; + bool is_unevictable; int was_unevictable = PageUnevictable(page); VM_BUG_ON(PageLRU(page)); @@ -560,14 +560,14 @@ redo: * unevictable page on [in]active list. * We know how to handle that. */ - lru = page_lru_base_type(page); + is_unevictable = false; lru_cache_add(page); } else { /* * Put unevictable pages directly on zone's unevictable * list. */ - lru = LRU_UNEVICTABLE; + is_unevictable = true; add_page_to_unevictable_list(page); /* * When racing with an mlock or AS_UNEVICTABLE clearing @@ -587,7 +587,7 @@ redo: * page is on unevictable list, it never be freed. To avoid that, * check after we added it to the list, again. */ - if (lru == LRU_UNEVICTABLE && page_evictable(page)) { + if (is_unevictable && page_evictable(page)) { if (!isolate_lru_page(page)) { put_page(page); goto redo; @@ -598,9 +598,9 @@ redo: */ } - if (was_unevictable && lru != LRU_UNEVICTABLE) + if (was_unevictable && !is_unevictable) count_vm_event(UNEVICTABLE_PGRESCUED); - else if (!was_unevictable && lru == LRU_UNEVICTABLE) + else if (!was_unevictable && is_unevictable) count_vm_event(UNEVICTABLE_PGCULLED); put_page(page); /* drop ref from isolate */ -- cgit v1.2.3 From 6e543d5780e36ff5ee56c44d7e2e30db3457a7ed Mon Sep 17 00:00:00 2001 From: Lisa Du Date: Wed, 11 Sep 2013 14:22:36 -0700 Subject: mm: vmscan: fix do_try_to_free_pages() livelock This patch is based on KOSAKI's work and I add a little more description, please refer https://lkml.org/lkml/2012/6/14/74. Currently, I found system can enter a state that there are lots of free pages in a zone but only order-0 and order-1 pages which means the zone is heavily fragmented, then high order allocation could make direct reclaim path's long stall(ex, 60 seconds) especially in no swap and no compaciton enviroment. This problem happened on v3.4, but it seems issue still lives in current tree, the reason is do_try_to_free_pages enter live lock: kswapd will go to sleep if the zones have been fully scanned and are still not balanced. As kswapd thinks there's little point trying all over again to avoid infinite loop. Instead it changes order from high-order to 0-order because kswapd think order-0 is the most important. Look at 73ce02e9 in detail. If watermarks are ok, kswapd will go back to sleep and may leave zone->all_unreclaimable =3D 0. It assume high-order users can still perform direct reclaim if they wish. Direct reclaim continue to reclaim for a high order which is not a COSTLY_ORDER without oom-killer until kswapd turn on zone->all_unreclaimble= . This is because to avoid too early oom-kill. So it means direct_reclaim depends on kswapd to break this loop. In worst case, direct-reclaim may continue to page reclaim forever when kswapd sleeps forever until someone like watchdog detect and finally kill the process. As described in: http://thread.gmane.org/gmane.linux.kernel.mm/103737 We can't turn on zone->all_unreclaimable from direct reclaim path because direct reclaim path don't take any lock and this way is racy. Thus this patch removes zone->all_unreclaimable field completely and recalculates zone reclaimable state every time. Note: we can't take the idea that direct-reclaim see zone->pages_scanned directly and kswapd continue to use zone->all_unreclaimable. Because, it is racy. commit 929bea7c71 (vmscan: all_unreclaimable() use zone->all_unreclaimable as a name) describes the detail. [akpm@linux-foundation.org: uninline zone_reclaimable_pages() and zone_reclaimable()] Cc: Aaditya Kumar Cc: Ying Han Cc: Nick Piggin Acked-by: Rik van Riel Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Bob Liu Cc: Neil Zhang Cc: Russell King - ARM Linux Reviewed-by: Michal Hocko Acked-by: Minchan Kim Acked-by: Johannes Weiner Signed-off-by: KOSAKI Motohiro Signed-off-by: Lisa Du Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 66 +++++++++++++++++++++++++++++-------------------------------- 1 file changed, 31 insertions(+), 35 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 44c072a7cba2..fe715daeb8bc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -146,6 +146,25 @@ static bool global_reclaim(struct scan_control *sc) } #endif +unsigned long zone_reclaimable_pages(struct zone *zone) +{ + int nr; + + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (get_nr_swap_pages() > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + + return nr; +} + +bool zone_reclaimable(struct zone *zone) +{ + return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; +} + static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) { if (!mem_cgroup_disabled()) @@ -1789,7 +1808,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * latencies, so it's better to scan a minimum amount there as * well. */ - if (current_is_kswapd() && zone->all_unreclaimable) + if (current_is_kswapd() && !zone_reclaimable(zone)) force_scan = true; if (!global_reclaim(sc)) force_scan = true; @@ -2244,8 +2263,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (zone->all_unreclaimable && - sc->priority != DEF_PRIORITY) + if (sc->priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ if (IS_ENABLED(CONFIG_COMPACTION)) { /* @@ -2283,11 +2302,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) return aborted_reclaim; } -static bool zone_reclaimable(struct zone *zone) -{ - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; -} - /* All zones in zonelist are unreclaimable? */ static bool all_unreclaimable(struct zonelist *zonelist, struct scan_control *sc) @@ -2301,7 +2315,7 @@ static bool all_unreclaimable(struct zonelist *zonelist, continue; if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (!zone->all_unreclaimable) + if (zone_reclaimable(zone)) return false; } @@ -2712,7 +2726,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * DEF_PRIORITY. Effectively, it considers them balanced so * they must be considered balanced here as well! */ - if (zone->all_unreclaimable) { + if (!zone_reclaimable(zone)) { balanced_pages += zone->managed_pages; continue; } @@ -2773,7 +2787,6 @@ static bool kswapd_shrink_zone(struct zone *zone, unsigned long lru_pages, unsigned long *nr_attempted) { - unsigned long nr_slab; int testorder = sc->order; unsigned long balance_gap; struct reclaim_state *reclaim_state = current->reclaim_state; @@ -2818,15 +2831,12 @@ static bool kswapd_shrink_zone(struct zone *zone, shrink_zone(zone, sc); reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); + shrink_slab(&shrink, sc->nr_scanned, lru_pages); sc->nr_reclaimed += reclaim_state->reclaimed_slab; /* Account for the number of pages attempted to reclaim */ *nr_attempted += sc->nr_to_reclaim; - if (nr_slab == 0 && !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; - zone_clear_flag(zone, ZONE_WRITEBACK); /* @@ -2835,7 +2845,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * BDIs but as pressure is relieved, speculatively avoid congestion * waits. */ - if (!zone->all_unreclaimable && + if (zone_reclaimable(zone) && zone_balanced(zone, testorder, 0, classzone_idx)) { zone_clear_flag(zone, ZONE_CONGESTED); zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); @@ -2901,8 +2911,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && - sc.priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; /* @@ -2980,8 +2990,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && - sc.priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; sc.nr_scanned = 0; @@ -3265,20 +3275,6 @@ unsigned long global_reclaimable_pages(void) return nr; } -unsigned long zone_reclaimable_pages(struct zone *zone) -{ - int nr; - - nr = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); - - if (get_nr_swap_pages() > 0) - nr += zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); - - return nr; -} - #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of @@ -3576,7 +3572,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) return ZONE_RECLAIM_FULL; - if (zone->all_unreclaimable) + if (!zone_reclaimable(zone)) return ZONE_RECLAIM_FULL; /* -- cgit v1.2.3 From 3b38722efd9f66da63bbbd41520c2e6fa9db3d68 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Sep 2013 15:13:21 -0700 Subject: memcg, vmscan: integrate soft reclaim tighter with zone shrinking code This patchset is sitting out of tree for quite some time without any objections. I would be really happy if it made it into 3.12. I do not want to push it too hard but I think this work is basically ready and waiting more doesn't help. The basic idea is quite simple. Pull soft reclaim into shrink_zone in the first step and get rid of the previous soft reclaim infrastructure. shrink_zone is done in two passes now. First it tries to do the soft limit reclaim and it falls back to reclaim-all mode if no group is over the limit or no pages have been scanned. The second pass happens at the same priority so the only time we waste is the memcg tree walk which has been updated in the third step to have only negligible overhead. As a bonus we will get rid of a _lot_ of code by this and soft reclaim will not stand out like before when it wasn't integrated into the zone shrinking code and it reclaimed at priority 0 (the testing results show that some workloads suffers from such an aggressive reclaim). The clean up is in a separate patch because I felt it would be easier to review that way. The second step is soft limit reclaim integration into targeted reclaim. It should be rather straight forward. Soft limit has been used only for the global reclaim so far but it makes sense for any kind of pressure coming from up-the-hierarchy, including targeted reclaim. The third step (patches 4-8) addresses the tree walk overhead by enhancing memcg iterators to enable skipping whole subtrees and tracking number of over soft limit children at each level of the hierarchy. This information is updated same way the old soft limit tree was updated (from memcg_check_events) so we shouldn't see an additional overhead. In fact mem_cgroup_update_soft_limit is much simpler than tree manipulation done previously. __shrink_zone uses mem_cgroup_soft_reclaim_eligible as a predicate for mem_cgroup_iter so the decision whether a particular group should be visited is done at the iterator level which allows us to decide to skip the whole subtree as well (if there is no child in excess). This reduces the tree walk overhead considerably. * TEST 1 ======== My primary test case was a parallel kernel build with 2 groups (make is running with -j8 with a distribution .config in a separate cgroup without any hard limit) on a 32 CPU machine booted with 1GB memory and both builds run taskset to Node 0 cpus. I was mostly interested in 2 setups. Default - no soft limit set and - and 0 soft limit set to both groups. The first one should tell us whether the rework regresses the default behavior while the second one should show us improvements in an extreme case where both workloads are always over the soft limit. /usr/bin/time -v has been used to collect the statistics and each configuration had 3 runs after fresh boot without any other load on the system. base is mmotm-2013-07-18-16-40 rework all 8 patches applied on top of base * No-limit User no-limit/base: min: 651.92 max: 672.65 avg: 664.33 std: 8.01 runs: 6 no-limit/rework: min: 657.34 [100.8%] max: 668.39 [99.4%] avg: 663.13 [99.8%] std: 3.61 runs: 6 System no-limit/base: min: 69.33 max: 71.39 avg: 70.32 std: 0.79 runs: 6 no-limit/rework: min: 69.12 [99.7%] max: 71.05 [99.5%] avg: 70.04 [99.6%] std: 0.59 runs: 6 Elapsed no-limit/base: min: 398.27 max: 422.36 avg: 408.85 std: 7.74 runs: 6 no-limit/rework: min: 386.36 [97.0%] max: 438.40 [103.8%] avg: 416.34 [101.8%] std: 18.85 runs: 6 The results are within noise. Elapsed time has a bigger variance but the average looks good. * 0-limit User 0-limit/base: min: 573.76 max: 605.63 avg: 585.73 std: 12.21 runs: 6 0-limit/rework: min: 645.77 [112.6%] max: 666.25 [110.0%] avg: 656.97 [112.2%] std: 7.77 runs: 6 System 0-limit/base: min: 69.57 max: 71.13 avg: 70.29 std: 0.54 runs: 6 0-limit/rework: min: 68.68 [98.7%] max: 71.40 [100.4%] avg: 69.91 [99.5%] std: 0.87 runs: 6 Elapsed 0-limit/base: min: 1306.14 max: 1550.17 avg: 1430.35 std: 90.86 runs: 6 0-limit/rework: min: 404.06 [30.9%] max: 465.94 [30.1%] avg: 434.81 [30.4%] std: 22.68 runs: 6 The improvement is really huge here (even bigger than with my previous testing and I suspect that this highly depends on the storage). Page fault statistics tell us at least part of the story: Minor 0-limit/base: min: 37180461.00 max: 37319986.00 avg: 37247470.00 std: 54772.71 runs: 6 0-limit/rework: min: 36751685.00 [98.8%] max: 36805379.00 [98.6%] avg: 36774506.33 [98.7%] std: 17109.03 runs: 6 Major 0-limit/base: min: 170604.00 max: 221141.00 avg: 196081.83 std: 18217.01 runs: 6 0-limit/rework: min: 2864.00 [1.7%] max: 10029.00 [4.5%] avg: 5627.33 [2.9%] std: 2252.71 runs: 6 Same as with my previous testing Minor faults are more or less within noise but Major fault count is way bellow the base kernel. While this looks as a nice win it is fair to say that 0-limit configuration is quite artificial. So I was playing with 0-no-limit loads as well. * TEST 2 ======== The following results are from 2 groups configuration on a 16GB machine (single NUMA node). - A running stream IO (dd if=/dev/zero of=local.file bs=1024) with 2*TotalMem with 0 soft limit. - B running a mem_eater which consumes TotalMem-1G without any limit. The mem_eater consumes the memory in 100 chunks with 1s nap after each mmap+poppulate so that both loads have chance to fight for the memory. The expected result is that B shouldn't be reclaimed and A shouldn't see a big dropdown in elapsed time. User base: min: 2.68 max: 2.89 avg: 2.76 std: 0.09 runs: 3 rework: min: 3.27 [122.0%] max: 3.74 [129.4%] avg: 3.44 [124.6%] std: 0.21 runs: 3 System base: min: 86.26 max: 88.29 avg: 87.28 std: 0.83 runs: 3 rework: min: 81.05 [94.0%] max: 84.96 [96.2%] avg: 83.14 [95.3%] std: 1.61 runs: 3 Elapsed base: min: 317.28 max: 332.39 avg: 325.84 std: 6.33 runs: 3 rework: min: 281.53 [88.7%] max: 298.16 [89.7%] avg: 290.99 [89.3%] std: 6.98 runs: 3 System time improved slightly as well as Elapsed. My previous testing has shown worse numbers but this again seem to depend on the storage speed. My theory is that the writeback doesn't catch up and prio-0 soft reclaim falls into wait on writeback page too often in the base kernel. The patched kernel doesn't do that because the soft reclaim is done from the kswapd/direct reclaim context. This can be seen on the following graph nicely. The A's group usage_in_bytes regurarly drops really low very often. All 3 runs http://labs.suse.cz/mhocko/soft_limit_rework/stream_io-vs-mem_eater/stream.png resp. a detail of the single run http://labs.suse.cz/mhocko/soft_limit_rework/stream_io-vs-mem_eater/stream-one-run.png mem_eater seems to be doing better as well. It gets to the full allocation size faster as can be seen on the following graph: http://labs.suse.cz/mhocko/soft_limit_rework/stream_io-vs-mem_eater/mem_eater-one-run.png /proc/meminfo collected during the test also shows that rework kernel hasn't swapped that much (well almost not at all): base: max: 123900 K avg: 56388.29 K rework: max: 300 K avg: 128.68 K kswapd and direct reclaim statistics are of no use unfortunatelly because soft reclaim is not accounted properly as the counters are hidden by global_reclaim() checks in the base kernel. * TEST 3 ======== Another test was the same configuration as TEST2 except the stream IO was replaced by a single kbuild (16 parallel jobs bound to Node0 cpus same as in TEST1) and mem_eater allocated TotalMem-200M so kbuild had only 200MB left. Kbuild did better with the rework kernel here as well: User base: min: 860.28 max: 872.86 avg: 868.03 std: 5.54 runs: 3 rework: min: 880.81 [102.4%] max: 887.45 [101.7%] avg: 883.56 [101.8%] std: 2.83 runs: 3 System base: min: 84.35 max: 85.06 avg: 84.79 std: 0.31 runs: 3 rework: min: 85.62 [101.5%] max: 86.09 [101.2%] avg: 85.79 [101.2%] std: 0.21 runs: 3 Elapsed base: min: 135.36 max: 243.30 avg: 182.47 std: 45.12 runs: 3 rework: min: 110.46 [81.6%] max: 116.20 [47.8%] avg: 114.15 [62.6%] std: 2.61 runs: 3 Minor base: min: 36635476.00 max: 36673365.00 avg: 36654812.00 std: 15478.03 runs: 3 rework: min: 36639301.00 [100.0%] max: 36695541.00 [100.1%] avg: 36665511.00 [100.0%] std: 23118.23 runs: 3 Major base: min: 14708.00 max: 53328.00 avg: 31379.00 std: 16202.24 runs: 3 rework: min: 302.00 [2.1%] max: 414.00 [0.8%] avg: 366.33 [1.2%] std: 47.22 runs: 3 Again we can see a significant improvement in Elapsed (it also seems to be more stable), there is a huge dropdown for the Major page faults and much more swapping: base: max: 583736 K avg: 112547.43 K rework: max: 4012 K avg: 124.36 K Graphs from all three runs show the variability of the kbuild quite nicely. It even seems that it took longer after every run with the base kernel which would be quite surprising as the source tree for the build is removed and caches are dropped after each run so the build operates on a freshly extracted sources everytime. http://labs.suse.cz/mhocko/soft_limit_rework/stream_io-vs-mem_eater/kbuild-mem_eater.png My other testing shows that this is just a matter of timing and other runs behave differently the std for Elapsed time is similar ~50. Example of other three runs: http://labs.suse.cz/mhocko/soft_limit_rework/stream_io-vs-mem_eater/kbuild-mem_eater2.png So to wrap this up. The series is still doing good and improves the soft limit. The testing results for bunch of cgroups with both stream IO and kbuild loads can be found in "memcg: track children in soft limit excess to improve soft limit". This patch: Memcg soft reclaim has been traditionally triggered from the global reclaim paths before calling shrink_zone. mem_cgroup_soft_limit_reclaim then picked up a group which exceeds the soft limit the most and reclaimed it with 0 priority to reclaim at least SWAP_CLUSTER_MAX pages. The infrastructure requires per-node-zone trees which hold over-limit groups and keep them up-to-date (via memcg_check_events) which is not cost free. Although this overhead hasn't turned out to be a bottle neck the implementation is suboptimal because mem_cgroup_update_tree has no idea which zones consumed memory over the limit so we could easily end up having a group on a node-zone tree having only few pages from that node-zone. This patch doesn't try to fix node-zone trees management because it seems that integrating soft reclaim into zone shrinking sounds much easier and more appropriate for several reasons. First of all 0 priority reclaim was a crude hack which might lead to big stalls if the group's LRUs are big and hard to reclaim (e.g. a lot of dirty/writeback pages). Soft reclaim should be applicable also to the targeted reclaim which is awkward right now without additional hacks. Last but not least the whole infrastructure eats quite some code. After this patch shrink_zone is done in 2 passes. First it tries to do the soft reclaim if appropriate (only for global reclaim for now to keep compatible with the original state) and fall back to ignoring soft limit if no group is eligible to soft reclaim or nothing has been scanned during the first pass. Only groups which are over their soft limit or any of their parents up the hierarchy is over the limit are considered eligible during the first pass. Soft limit tree which is not necessary anymore will be removed in the follow up patch to make this patch smaller and easier to review. Signed-off-by: Michal Hocko Reviewed-by: Glauber Costa Reviewed-by: Tejun Heo Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Ying Han Cc: Hugh Dickins Cc: Michel Lespinasse Cc: Greg Thelen Cc: KOSAKI Motohiro Cc: Balbir Singh Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 62 +++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 26 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index fe715daeb8bc..cf4643807ec2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -139,11 +139,21 @@ static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; } + +static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) +{ + return !mem_cgroup_disabled() && global_reclaim(sc); +} #else static bool global_reclaim(struct scan_control *sc) { return true; } + +static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) +{ + return false; +} #endif unsigned long zone_reclaimable_pages(struct zone *zone) @@ -2130,7 +2140,8 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static void shrink_zone(struct zone *zone, struct scan_control *sc) +static void +__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) { unsigned long nr_reclaimed, nr_scanned; @@ -2149,6 +2160,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) do { struct lruvec *lruvec; + if (soft_reclaim && + !mem_cgroup_soft_reclaim_eligible(memcg)) { + memcg = mem_cgroup_iter(root, memcg, &reclaim); + continue; + } + lruvec = mem_cgroup_zone_lruvec(zone, memcg); shrink_lruvec(lruvec, sc); @@ -2179,6 +2196,24 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) sc->nr_scanned - nr_scanned, sc)); } + +static void shrink_zone(struct zone *zone, struct scan_control *sc) +{ + bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); + unsigned long nr_scanned = sc->nr_scanned; + + __shrink_zone(zone, sc, do_soft_reclaim); + + /* + * No group is over the soft limit or those that are do not have + * pages in the zone we are reclaiming so we have to reclaim everybody + */ + if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) { + __shrink_zone(zone, sc, false); + return; + } +} + /* Returns true if compaction should go ahead for a high-order request */ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { @@ -2240,8 +2275,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; - unsigned long nr_soft_reclaimed; - unsigned long nr_soft_scanned; bool aborted_reclaim = false; /* @@ -2281,18 +2314,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) continue; } } - /* - * This steals pages from memory cgroups over softlimit - * and returns the number of reclaimed pages and - * scanned pages. This works for global memory pressure - * and balancing, not for a memcg's limit. - */ - nr_soft_scanned = 0; - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, - sc->order, sc->gfp_mask, - &nr_soft_scanned); - sc->nr_reclaimed += nr_soft_reclaimed; - sc->nr_scanned += nr_soft_scanned; /* need some check for avoid more shrink_zone() */ } @@ -2880,8 +2901,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, { int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ - unsigned long nr_soft_reclaimed; - unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .priority = DEF_PRIORITY, @@ -2996,15 +3015,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, sc.nr_scanned = 0; - nr_soft_scanned = 0; - /* - * Call soft limit reclaim before calling shrink_zone. - */ - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, - order, sc.gfp_mask, - &nr_soft_scanned); - sc.nr_reclaimed += nr_soft_reclaimed; - /* * There should be no need to raise the scanning * priority if enough pages are already being scanned -- cgit v1.2.3 From a5b7c87f92076352dbff2fe0423ec255e1c9a71b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Sep 2013 15:13:25 -0700 Subject: vmscan, memcg: do softlimit reclaim also for targeted reclaim Soft reclaim has been done only for the global reclaim (both background and direct). Since "memcg: integrate soft reclaim tighter with zone shrinking code" there is no reason for this limitation anymore as the soft limit reclaim doesn't use any special code paths and it is a part of the zone shrinking code which is used by both global and targeted reclaims. From the semantic point of view it is natural to consider soft limit before touching all groups in the hierarchy tree which is touching the hard limit because soft limit tells us where to push back when there is a memory pressure. It is not important whether the pressure comes from the limit or imbalanced zones. This patch simply enables soft reclaim unconditionally in mem_cgroup_should_soft_reclaim so it is enabled for both global and targeted reclaim paths. mem_cgroup_soft_reclaim_eligible needs to learn about the root of the reclaim to know where to stop checking soft limit state of parents up the hierarchy. Say we have A (over soft limit) \ B (below s.l., hit the hard limit) / \ C D (below s.l.) B is the source of the outside memory pressure now for D but we shouldn't soft reclaim it because it is behaving well under B subtree and we can still reclaim from C (pressumably it is over the limit). mem_cgroup_soft_reclaim_eligible should therefore stop climbing up the hierarchy at B (root of the memory pressure). Signed-off-by: Michal Hocko Reviewed-by: Glauber Costa Reviewed-by: Tejun Heo Cc: Balbir Singh Cc: Greg Thelen Cc: Hugh Dickins Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Michel Lespinasse Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index cf4643807ec2..1896e7ca494b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -142,7 +142,7 @@ static bool global_reclaim(struct scan_control *sc) static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) { - return !mem_cgroup_disabled() && global_reclaim(sc); + return !mem_cgroup_disabled(); } #else static bool global_reclaim(struct scan_control *sc) @@ -2161,7 +2161,7 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) struct lruvec *lruvec; if (soft_reclaim && - !mem_cgroup_soft_reclaim_eligible(memcg)) { + !mem_cgroup_soft_reclaim_eligible(memcg, root)) { memcg = mem_cgroup_iter(root, memcg, &reclaim); continue; } -- cgit v1.2.3 From de57780dc659f95b17ccb649f003278dde0b5b86 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Sep 2013 15:13:26 -0700 Subject: memcg: enhance memcg iterator to support predicates The caller of the iterator might know that some nodes or even subtrees should be skipped but there is no way to tell iterators about that so the only choice left is to let iterators to visit each node and do the selection outside of the iterating code. This, however, doesn't scale well with hierarchies with many groups where only few groups are interesting. This patch adds mem_cgroup_iter_cond variant of the iterator with a callback which gets called for every visited node. There are three possible ways how the callback can influence the walk. Either the node is visited, it is skipped but the tree walk continues down the tree or the whole subtree of the current group is skipped. [hughd@google.com: fix memcg-less page reclaim] Signed-off-by: Michal Hocko Cc: Balbir Singh Cc: Glauber Costa Cc: Greg Thelen Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Michel Lespinasse Cc: Tejun Heo Cc: Ying Han Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 1896e7ca494b..f2e35099508b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2151,21 +2151,16 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) .zone = zone, .priority = sc->priority, }; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = NULL; + mem_cgroup_iter_filter filter = (soft_reclaim) ? + mem_cgroup_soft_reclaim_eligible : NULL; nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - memcg = mem_cgroup_iter(root, NULL, &reclaim); - do { + while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { struct lruvec *lruvec; - if (soft_reclaim && - !mem_cgroup_soft_reclaim_eligible(memcg, root)) { - memcg = mem_cgroup_iter(root, memcg, &reclaim); - continue; - } - lruvec = mem_cgroup_zone_lruvec(zone, memcg); shrink_lruvec(lruvec, sc); @@ -2185,8 +2180,7 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) mem_cgroup_iter_break(root, memcg); break; } - memcg = mem_cgroup_iter(root, memcg, &reclaim); - } while (memcg); + } vmpressure(sc->gfp_mask, sc->target_mem_cgroup, sc->nr_scanned - nr_scanned, -- cgit v1.2.3 From e839b6a1c8d0895803bcbd587595a54f4221a625 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Sep 2013 15:13:30 -0700 Subject: memcg, vmscan: do not attempt soft limit reclaim if it would not scan anything mem_cgroup_should_soft_reclaim controls whether soft reclaim pass is done and it always says yes currently. Memcg iterators are clever to skip nodes that are not soft reclaimable quite efficiently but mem_cgroup_should_soft_reclaim can be more clever and do not start the soft reclaim pass at all if it knows that nothing would be scanned anyway. In order to do that, simply reuse mem_cgroup_soft_reclaim_eligible for the target group of the reclaim and allow the pass only if the whole subtree wouldn't be skipped. Signed-off-by: Michal Hocko Cc: Balbir Singh Cc: Glauber Costa Cc: Greg Thelen Cc: Hugh Dickins Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Michel Lespinasse Cc: Tejun Heo Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index f2e35099508b..04dca89c4f34 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -142,7 +142,9 @@ static bool global_reclaim(struct scan_control *sc) static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) { - return !mem_cgroup_disabled(); + struct mem_cgroup *root = sc->target_mem_cgroup; + return !mem_cgroup_disabled() && + mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE; } #else static bool global_reclaim(struct scan_control *sc) -- cgit v1.2.3 From e975de998b9612aa97e3165f8827c09e0d36ce77 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Sep 2013 15:13:34 -0700 Subject: memcg, vmscan: do not fall into reclaim-all pass too quickly shrink_zone starts with soft reclaim pass first and then falls back to regular reclaim if nothing has been scanned. This behavior is natural but there is a catch. Memcg iterators, when used with the reclaim cookie, are designed to help to prevent from over reclaim by interleaving reclaimers (per node-zone-priority) so the tree walk might miss many (even all) nodes in the hierarchy e.g. when there are direct reclaimers racing with each other or with kswapd in the global case or multiple allocators reaching the limit for the target reclaim case. To make it even more complicated, targeted reclaim doesn't do the whole tree walk because it stops reclaiming once it reclaims sufficient pages. As a result groups over the limit might be missed, thus nothing is scanned, and reclaim would fall back to the reclaim all mode. This patch checks for the incomplete tree walk in shrink_zone. If no group has been visited and the hierarchy is soft reclaimable then we must have missed some groups, in which case the __shrink_zone is called again. This doesn't guarantee there will be some progress of course because the current reclaimer might be still racing with others but it would at least give a chance to start the walk without a big risk of reclaim latencies. Signed-off-by: Michal Hocko Cc: Balbir Singh Cc: Glauber Costa Cc: Greg Thelen Cc: Hugh Dickins Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Michel Lespinasse Cc: Tejun Heo Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 04dca89c4f34..fa91c20fe4b7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2142,10 +2142,11 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static void +static int __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) { unsigned long nr_reclaimed, nr_scanned; + int groups_scanned = 0; do { struct mem_cgroup *root = sc->target_mem_cgroup; @@ -2163,6 +2164,7 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { struct lruvec *lruvec; + groups_scanned++; lruvec = mem_cgroup_zone_lruvec(zone, memcg); shrink_lruvec(lruvec, sc); @@ -2190,6 +2192,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + + return groups_scanned; } @@ -2197,8 +2201,19 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) { bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); unsigned long nr_scanned = sc->nr_scanned; + int scanned_groups; - __shrink_zone(zone, sc, do_soft_reclaim); + scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim); + /* + * memcg iterator might race with other reclaimer or start from + * a incomplete tree walk so the tree walk in __shrink_zone + * might have missed groups that are above the soft limit. Try + * another loop to catch up with others. Do it just once to + * prevent from reclaim latencies when other reclaimers always + * preempt this one. + */ + if (do_soft_reclaim && !scanned_groups) + __shrink_zone(zone, sc, do_soft_reclaim); /* * No group is over the soft limit or those that are do not have -- cgit v1.2.3 From f894ffa865301d4010d68b15be29912fa4039e77 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 12 Sep 2013 15:13:35 -0700 Subject: memcg: trivial cleanups Clean up some mess made by the "Soft limit rework" series, and a few other things. Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index fa91c20fe4b7..76d1d5eaeec3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2205,12 +2205,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim); /* - * memcg iterator might race with other reclaimer or start from - * a incomplete tree walk so the tree walk in __shrink_zone - * might have missed groups that are above the soft limit. Try - * another loop to catch up with others. Do it just once to - * prevent from reclaim latencies when other reclaimers always - * preempt this one. + * memcg iterator might race with other reclaimer or start from + * a incomplete tree walk so the tree walk in __shrink_zone + * might have missed groups that are above the soft limit. Try + * another loop to catch up with others. Do it just once to + * prevent from reclaim latencies when other reclaimers always + * preempt this one. */ if (do_soft_reclaim && !scanned_groups) __shrink_zone(zone, sc, do_soft_reclaim); -- cgit v1.2.3 From 20ba27f52eded4d14e309ac57971c9c83dec46e4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:33 -0700 Subject: revert "memcg, vmscan: do not fall into reclaim-all pass too quickly" Revert commit e975de998b96 ("memcg, vmscan: do not fall into reclaim-all pass too quickly") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ed1b775bdc9..97b0ed16749f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2176,11 +2176,10 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static int +static void __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) { unsigned long nr_reclaimed, nr_scanned; - int groups_scanned = 0; do { struct mem_cgroup *root = sc->target_mem_cgroup; @@ -2198,7 +2197,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { struct lruvec *lruvec; - groups_scanned++; lruvec = mem_cgroup_zone_lruvec(zone, memcg); shrink_lruvec(lruvec, sc); @@ -2226,8 +2224,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); - - return groups_scanned; } @@ -2235,19 +2231,8 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) { bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); unsigned long nr_scanned = sc->nr_scanned; - int scanned_groups; - scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim); - /* - * memcg iterator might race with other reclaimer or start from - * a incomplete tree walk so the tree walk in __shrink_zone - * might have missed groups that are above the soft limit. Try - * another loop to catch up with others. Do it just once to - * prevent from reclaim latencies when other reclaimers always - * preempt this one. - */ - if (do_soft_reclaim && !scanned_groups) - __shrink_zone(zone, sc, do_soft_reclaim); + __shrink_zone(zone, sc, do_soft_reclaim); /* * No group is over the soft limit or those that are do not have -- cgit v1.2.3 From 3120055e869f2a208480f238680d097eec8f0e02 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:35 -0700 Subject: revert "memcg, vmscan: do not attempt soft limit reclaim if it would not scan anything" Revert commit e839b6a1c8d0 ("memcg, vmscan: do not attempt soft limit reclaim if it would not scan anything") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 97b0ed16749f..cdd300b81485 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -142,9 +142,7 @@ static bool global_reclaim(struct scan_control *sc) static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) { - struct mem_cgroup *root = sc->target_mem_cgroup; - return !mem_cgroup_disabled() && - mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE; + return !mem_cgroup_disabled(); } #else static bool global_reclaim(struct scan_control *sc) -- cgit v1.2.3 From 694fbc0fe78518d06efa63910bf4ecee660e7852 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:37 -0700 Subject: revert "memcg: enhance memcg iterator to support predicates" Revert commit de57780dc659 ("memcg: enhance memcg iterator to support predicates") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index cdd300b81485..17a7134de4d7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2185,16 +2185,21 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) .zone = zone, .priority = sc->priority, }; - struct mem_cgroup *memcg = NULL; - mem_cgroup_iter_filter filter = (soft_reclaim) ? - mem_cgroup_soft_reclaim_eligible : NULL; + struct mem_cgroup *memcg; nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { + memcg = mem_cgroup_iter(root, NULL, &reclaim); + do { struct lruvec *lruvec; + if (soft_reclaim && + !mem_cgroup_soft_reclaim_eligible(memcg, root)) { + memcg = mem_cgroup_iter(root, memcg, &reclaim); + continue; + } + lruvec = mem_cgroup_zone_lruvec(zone, memcg); shrink_lruvec(lruvec, sc); @@ -2214,7 +2219,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) mem_cgroup_iter_break(root, memcg); break; } - } + memcg = mem_cgroup_iter(root, memcg, &reclaim); + } while (memcg); vmpressure(sc->gfp_mask, sc->target_mem_cgroup, sc->nr_scanned - nr_scanned, -- cgit v1.2.3 From b1aff7fcf86c88472b0a70f15d89d7a4adba07bb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:38 -0700 Subject: revert "vmscan, memcg: do softlimit reclaim also for targeted reclaim" Revert commit a5b7c87f9207 ("vmscan, memcg: do softlimit reclaim also for targeted reclaim") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 17a7134de4d7..0e081cada4ba 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -142,7 +142,7 @@ static bool global_reclaim(struct scan_control *sc) static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) { - return !mem_cgroup_disabled(); + return !mem_cgroup_disabled() && global_reclaim(sc); } #else static bool global_reclaim(struct scan_control *sc) @@ -2195,7 +2195,7 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) struct lruvec *lruvec; if (soft_reclaim && - !mem_cgroup_soft_reclaim_eligible(memcg, root)) { + !mem_cgroup_soft_reclaim_eligible(memcg)) { memcg = mem_cgroup_iter(root, memcg, &reclaim); continue; } -- cgit v1.2.3 From 0608f43da64a1f1c42507304b5f25bc8b1227aa4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:41 -0700 Subject: revert "memcg, vmscan: integrate soft reclaim tighter with zone shrinking code" Revert commit 3b38722efd9f ("memcg, vmscan: integrate soft reclaim tighter with zone shrinking code") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 62 ++++++++++++++++++++++++++----------------------------------- 1 file changed, 26 insertions(+), 36 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 0e081cada4ba..beb35778c69f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -139,21 +139,11 @@ static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; } - -static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) -{ - return !mem_cgroup_disabled() && global_reclaim(sc); -} #else static bool global_reclaim(struct scan_control *sc) { return true; } - -static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) -{ - return false; -} #endif unsigned long zone_reclaimable_pages(struct zone *zone) @@ -2174,8 +2164,7 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static void -__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) +static void shrink_zone(struct zone *zone, struct scan_control *sc) { unsigned long nr_reclaimed, nr_scanned; @@ -2194,12 +2183,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) do { struct lruvec *lruvec; - if (soft_reclaim && - !mem_cgroup_soft_reclaim_eligible(memcg)) { - memcg = mem_cgroup_iter(root, memcg, &reclaim); - continue; - } - lruvec = mem_cgroup_zone_lruvec(zone, memcg); shrink_lruvec(lruvec, sc); @@ -2230,24 +2213,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) sc->nr_scanned - nr_scanned, sc)); } - -static void shrink_zone(struct zone *zone, struct scan_control *sc) -{ - bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); - unsigned long nr_scanned = sc->nr_scanned; - - __shrink_zone(zone, sc, do_soft_reclaim); - - /* - * No group is over the soft limit or those that are do not have - * pages in the zone we are reclaiming so we have to reclaim everybody - */ - if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) { - __shrink_zone(zone, sc, false); - return; - } -} - /* Returns true if compaction should go ahead for a high-order request */ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { @@ -2309,6 +2274,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; bool aborted_reclaim = false; /* @@ -2348,6 +2315,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) continue; } } + /* + * This steals pages from memory cgroups over softlimit + * and returns the number of reclaimed pages and + * scanned pages. This works for global memory pressure + * and balancing, not for a memcg's limit. + */ + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + sc->order, sc->gfp_mask, + &nr_soft_scanned); + sc->nr_reclaimed += nr_soft_reclaimed; + sc->nr_scanned += nr_soft_scanned; /* need some check for avoid more shrink_zone() */ } @@ -2941,6 +2920,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, { int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .priority = DEF_PRIORITY, @@ -3055,6 +3036,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, sc.nr_scanned = 0; + nr_soft_scanned = 0; + /* + * Call soft limit reclaim before calling shrink_zone. + */ + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + order, sc.gfp_mask, + &nr_soft_scanned); + sc.nr_reclaimed += nr_soft_reclaimed; + /* * There should be no need to raise the scanning * priority if enough pages are already being scanned -- cgit v1.2.3 From 117aad1e9e4d97448d1df3f84b08bd65811e6d6a Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Mon, 30 Sep 2013 13:45:16 -0700 Subject: mm: avoid reinserting isolated balloon pages into LRU lists Isolated balloon pages can wrongly end up in LRU lists when migrate_pages() finishes its round without draining all the isolated page list. The same issue can happen when reclaim_clean_pages_from_list() tries to reclaim pages from an isolated page list, before migration, in the CMA path. Such balloon page leak opens a race window against LRU lists shrinkers that leads us to the following kernel panic: BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: [] shrink_page_list+0x24e/0x897 PGD 3cda2067 PUD 3d713067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 340 Comm: kswapd0 Not tainted 3.12.0-rc1-22626-g4367597 #87 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 RIP: shrink_page_list+0x24e/0x897 RSP: 0000:ffff88003da499b8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff88003e82bd60 RCX: 00000000000657d5 RDX: 0000000000000000 RSI: 000000000000031f RDI: ffff88003e82bd40 RBP: ffff88003da49ab0 R08: 0000000000000001 R09: 0000000081121a45 R10: ffffffff81121a45 R11: ffff88003c4a9a28 R12: ffff88003e82bd40 R13: ffff88003da0e800 R14: 0000000000000001 R15: ffff88003da49d58 FS: 0000000000000000(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000067d9000 CR3: 000000003ace5000 CR4: 00000000000407b0 Call Trace: shrink_inactive_list+0x240/0x3de shrink_lruvec+0x3e0/0x566 __shrink_zone+0x94/0x178 shrink_zone+0x3a/0x82 balance_pgdat+0x32a/0x4c2 kswapd+0x2f0/0x372 kthread+0xa2/0xaa ret_from_fork+0x7c/0xb0 Code: 80 7d 8f 01 48 83 95 68 ff ff ff 00 4c 89 e7 e8 5a 7b 00 00 48 85 c0 49 89 c5 75 08 80 7d 8f 00 74 3e eb 31 48 8b 80 18 01 00 00 <48> 8b 74 0d 48 8b 78 30 be 02 00 00 00 ff d2 eb RIP [] shrink_page_list+0x24e/0x897 RSP CR2: 0000000000000028 ---[ end trace 703d2451af6ffbfd ]--- Kernel panic - not syncing: Fatal exception This patch fixes the issue, by assuring the proper tests are made at putback_movable_pages() & reclaim_clean_pages_from_list() to avoid isolated balloon pages being wrongly reinserted in LRU lists. [akpm@linux-foundation.org: clarify awkward comment text] Signed-off-by: Rafael Aquini Reported-by: Luiz Capitulino Tested-by: Luiz Capitulino Cc: Mel Gorman Cc: Rik van Riel Cc: Hugh Dickins Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index beb35778c69f..53f2f82f83ae 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -48,6 +48,7 @@ #include #include +#include #include "internal.h" @@ -1113,7 +1114,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, LIST_HEAD(clean_pages); list_for_each_entry_safe(page, next, page_list, lru) { - if (page_is_file_cache(page) && !PageDirty(page)) { + if (page_is_file_cache(page) && !PageDirty(page) && + !isolated_balloon_page(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); } -- cgit v1.2.3