From 4008bab7b3969ad9f9dd1d02096a3f0aa5610bd2 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:28 -0700 Subject: mm/page_alloc: factor out setting of pcp->high and pcp->batch "Problems" with the current code: 1: there is a lack of synchronization in setting ->high and ->batch in percpu_pagelist_fraction_sysctl_handler() 2: stop_machine() in zone_pcp_update() is unnecissary. 3: zone_pcp_update() does not consider the case where percpu_pagelist_fraction is non-zero To fix: 1: add memory barriers, a safe ->batch value, an update side mutex when updating ->high and ->batch, and use ACCESS_ONCE() for ->batch users that expect a stable value. 2: avoid draining pages in zone_pcp_update(), rely upon the memory barriers added to fix #1 3: factor out quite a few functions, and then call the appropriate one. Note that it results in a change to the behavior of zone_pcp_update(), which is used by memory_hotplug. I'm rather certain that I've diserned (and preserved) the essential behavior (changing ->high and ->batch), and only eliminated unneeded actions (draining the per cpu pages), but this may not be the case. Further note that the draining of pages that previously took place in zone_pcp_update() occured after repeated draining when attempting to offline a page, and after the offline has "succeeded". It appears that the draining was added to zone_pcp_update() to avoid refactoring setup_pageset() into 2 funtions. This patch: Creates pageset_set_batch() for use in setup_pageset(). pageset_set_batch() imitates the functionality of setup_pagelist_highmark(), but uses the boot time (percpu_pagelist_fraction == 0) calculations for determining ->high based on ->batch. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c3edb624fccf..d4bcc20ab6f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4032,6 +4032,14 @@ static int __meminit zone_batchsize(struct zone *zone) #endif } +/* a companion to setup_pagelist_highmark() */ +static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) +{ + struct per_cpu_pages *pcp = &p->pcp; + pcp->high = 6 * batch; + pcp->batch = max(1UL, 1 * batch); +} + static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) { struct per_cpu_pages *pcp; @@ -4041,8 +4049,7 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp; pcp->count = 0; - pcp->high = 6 * batch; - pcp->batch = max(1UL, 1 * batch); + pageset_set_batch(p, batch); for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } @@ -4051,7 +4058,6 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. */ - static void setup_pagelist_highmark(struct per_cpu_pageset *p, unsigned long high) { -- cgit v1.2.3 From c8e251fadc6220261f6e0c6b8a4f1cdf27626165 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:29 -0700 Subject: mm/page_alloc: prevent concurrent updaters of pcp ->batch and ->high Because we are going to rely upon a careful transision between old and new ->high and ->batch values using memory barriers and will remove stop_machine(), we need to prevent multiple updaters from interweaving their memory writes. Add a simple mutex to protect both update loops. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d4bcc20ab6f0..8d4335779633 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -65,6 +65,9 @@ #include #include "internal.h" +/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ +static DEFINE_MUTEX(pcp_batch_high_lock); + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -5557,6 +5560,8 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (!write || (ret < 0)) return ret; + + mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { for_each_possible_cpu(cpu) { unsigned long high; @@ -5565,6 +5570,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, per_cpu_ptr(zone->pageset, cpu), high); } } + mutex_unlock(&pcp_batch_high_lock); return 0; } @@ -6078,7 +6084,9 @@ static int __meminit __zone_pcp_update(void *data) void __meminit zone_pcp_update(struct zone *zone) { + mutex_lock(&pcp_batch_high_lock); stop_machine(__zone_pcp_update, zone, NULL); + mutex_unlock(&pcp_batch_high_lock); } #endif -- cgit v1.2.3 From 8d7a8fa97abeb4fd6b3975d32c9f859875157770 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:31 -0700 Subject: mm/page_alloc: insert memory barriers to allow async update of pcp batch and high Introduce pageset_update() to perform a safe transision from one set of pcp->{batch,high} to a new set using memory barriers. This ensures that batch is always set to a safe value (1) prior to updating high, and ensure that high is fully updated before setting the real value of batch. It avoids ->batch ever rising above ->high. Suggested by Gilad Ben-Yossef in these threads: https://lkml.org/lkml/2013/4/9/23 https://lkml.org/lkml/2013/4/10/49 Also reproduces his proposed comment. Signed-off-by: Cody P Schafer Reviewed-by: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8d4335779633..eaaef2a09424 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4035,12 +4035,37 @@ static int __meminit zone_batchsize(struct zone *zone) #endif } +/* + * pcp->high and pcp->batch values are related and dependent on one another: + * ->batch must never be higher then ->high. + * The following function updates them in a safe manner without read side + * locking. + * + * Any new users of pcp->batch and pcp->high should ensure they can cope with + * those fields changing asynchronously (acording the the above rule). + * + * mutex_is_locked(&pcp_batch_high_lock) required when calling this function + * outside of boot time (or some other assurance that no concurrent updaters + * exist). + */ +static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, + unsigned long batch) +{ + /* start with a fail safe value for batch */ + pcp->batch = 1; + smp_wmb(); + + /* Update high, then batch, in order */ + pcp->high = high; + smp_wmb(); + + pcp->batch = batch; +} + /* a companion to setup_pagelist_highmark() */ static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) { - struct per_cpu_pages *pcp = &p->pcp; - pcp->high = 6 * batch; - pcp->batch = max(1UL, 1 * batch); + pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); } static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) @@ -4064,13 +4089,11 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) static void setup_pagelist_highmark(struct per_cpu_pageset *p, unsigned long high) { - struct per_cpu_pages *pcp; + unsigned long batch = max(1UL, high / 4); + if ((high / 4) > (PAGE_SHIFT * 8)) + batch = PAGE_SHIFT * 8; - pcp = &p->pcp; - pcp->high = high; - pcp->batch = max(1UL, high/4); - if ((high/4) > (PAGE_SHIFT * 8)) - pcp->batch = PAGE_SHIFT * 8; + pageset_update(&p->pcp, high, batch); } static void __meminit setup_zone_pageset(struct zone *zone) -- cgit v1.2.3 From 998d39cb236fe464af86a3492a24d2f67ee1efc2 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:32 -0700 Subject: mm/page_alloc: protect pcp->batch accesses with ACCESS_ONCE pcp->batch could change at any point, avoid relying on it being a stable value. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaaef2a09424..97b8f861e63d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1182,10 +1182,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; int to_drain; + unsigned long batch; local_irq_save(flags); - if (pcp->count >= pcp->batch) - to_drain = pcp->batch; + batch = ACCESS_ONCE(pcp->batch); + if (pcp->count >= batch) + to_drain = batch; else to_drain = pcp->count; if (to_drain > 0) { @@ -1353,8 +1355,9 @@ void free_hot_cold_page(struct page *page, int cold) list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pcppages_bulk(zone, pcp->batch, pcp); - pcp->count -= pcp->batch; + unsigned long batch = ACCESS_ONCE(pcp->batch); + free_pcppages_bulk(zone, batch, pcp); + pcp->count -= batch; } out: -- cgit v1.2.3 From 0a647f3811d6af56405a819341ceac23e31d4572 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:33 -0700 Subject: mm/page_alloc: convert zone_pcp_update() to rely on memory barriers instead of stop_machine() zone_pcp_update()'s goal is to adjust the ->high and ->mark members of a percpu pageset based on a zone's ->managed_pages. We don't need to drain the entire percpu pageset just to modify these fields. This lets us avoid calling setup_pageset() (and the draining required to call it) and instead allows simply setting the fields' values (with some attention paid to memory barriers to prevent the relationship between ->batch and ->high from being thrown off). This does change the behavior of zone_pcp_update() as the percpu pagesets will not be drained when zone_pcp_update() is called (they will end up being shrunk, not completely drained, later when a 0-order page is freed in free_hot_cold_page()). Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97b8f861e63d..8125263be60f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6085,33 +6085,18 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) #endif #ifdef CONFIG_MEMORY_HOTPLUG -static int __meminit __zone_pcp_update(void *data) -{ - struct zone *zone = data; - int cpu; - unsigned long batch = zone_batchsize(zone), flags; - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pset; - struct per_cpu_pages *pcp; - - pset = per_cpu_ptr(zone->pageset, cpu); - pcp = &pset->pcp; - - local_irq_save(flags); - if (pcp->count > 0) - free_pcppages_bulk(zone, pcp->count, pcp); - drain_zonestat(zone, pset); - setup_pageset(pset, batch); - local_irq_restore(flags); - } - return 0; -} - +/* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalulated. + */ void __meminit zone_pcp_update(struct zone *zone) { + unsigned cpu; + unsigned long batch; mutex_lock(&pcp_batch_high_lock); - stop_machine(__zone_pcp_update, zone, NULL); + batch = zone_batchsize(zone); + for_each_possible_cpu(cpu) + pageset_set_batch(per_cpu_ptr(zone->pageset, cpu), batch); mutex_unlock(&pcp_batch_high_lock); } #endif -- cgit v1.2.3 From 22a7f12b1606327f0e11fcdf9043ae00bf9917df Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:34 -0700 Subject: mm/page_alloc: when handling percpu_pagelist_fraction, don't unneedly recalulate high Simply moves calculation of the new 'high' value outside the for_each_possible_cpu() loop, as it does not depend on the cpu. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8125263be60f..386de0f11bea 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5575,7 +5575,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist * can have before it gets flushed back to buddy allocator. */ - int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -5589,12 +5588,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - for_each_possible_cpu(cpu) { - unsigned long high; - high = zone->managed_pages / percpu_pagelist_fraction; + unsigned long high; + high = zone->managed_pages / percpu_pagelist_fraction; + for_each_possible_cpu(cpu) setup_pagelist_highmark( - per_cpu_ptr(zone->pageset, cpu), high); - } + per_cpu_ptr(zone->pageset, cpu), high); } mutex_unlock(&pcp_batch_high_lock); return 0; -- cgit v1.2.3 From 88c90dbccaaed35991b5336fec84294de1d23538 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:35 -0700 Subject: mm/page_alloc: factor setup_pageset() into pageset_init() and pageset_set_batch() Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 386de0f11bea..a235149d9406 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4071,7 +4071,7 @@ static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); } -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +static void pageset_init(struct per_cpu_pageset *p) { struct per_cpu_pages *pcp; int migratetype; @@ -4080,11 +4080,16 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp; pcp->count = 0; - pageset_set_batch(p, batch); for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +{ + pageset_init(p); + pageset_set_batch(p, batch); +} + /* * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. -- cgit v1.2.3 From dd1895e2c5c9ed3a791d1d8eb4a6a3e241ec9d6e Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:36 -0700 Subject: mm/page_alloc: relocate comment to be directly above code it refers to. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a235149d9406..2793ce50f316 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3711,12 +3711,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } else { - /* we have to stop all cpus to guarantee there is no user - of zonelist */ #ifdef CONFIG_MEMORY_HOTPLUG if (zone) setup_zone_pageset(zone); #endif + /* we have to stop all cpus to guarantee there is no user + of zonelist */ stop_machine(__build_all_zonelists, pgdat, NULL); /* cpuset refresh routine should be here */ } -- cgit v1.2.3 From 56cef2b85c28d81efd39f2eeaddce28678756fe3 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:38 -0700 Subject: mm/page_alloc: factor zone_pageset_init() out of setup_zone_pageset() Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2793ce50f316..ee6fe7faabad 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4104,22 +4104,25 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pageset_update(&p->pcp, high, batch); } +static void __meminit zone_pageset_init(struct zone *zone, int cpu) +{ + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); + + pageset_init(pcp); + if (percpu_pagelist_fraction) + setup_pagelist_highmark(pcp, + (zone->managed_pages / + percpu_pagelist_fraction)); + else + pageset_set_batch(pcp, zone_batchsize(zone)); +} + static void __meminit setup_zone_pageset(struct zone *zone) { int cpu; - zone->pageset = alloc_percpu(struct per_cpu_pageset); - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - - setup_pageset(pcp, zone_batchsize(zone)); - - if (percpu_pagelist_fraction) - setup_pagelist_highmark(pcp, - (zone->managed_pages / - percpu_pagelist_fraction)); - } + for_each_possible_cpu(cpu) + zone_pageset_init(zone, cpu); } /* -- cgit v1.2.3 From 737af4c0110fc69a81dc7464a74a4113f7645255 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:39 -0700 Subject: mm/page_alloc: in zone_pcp_update(), uze zone_pageset_init() Previously, zone_pcp_update() called pageset_set_batch() directly, essentially assuming that percpu_pagelist_fraction == 0. Correct this by calling zone_pageset_init(), which chooses the appropriate ->batch and ->high calculations. Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ee6fe7faabad..c7344d17660b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6098,11 +6098,9 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) void __meminit zone_pcp_update(struct zone *zone) { unsigned cpu; - unsigned long batch; mutex_lock(&pcp_batch_high_lock); - batch = zone_batchsize(zone); for_each_possible_cpu(cpu) - pageset_set_batch(per_cpu_ptr(zone->pageset, cpu), batch); + zone_pageset_init(zone, cpu); mutex_unlock(&pcp_batch_high_lock); } #endif -- cgit v1.2.3 From 3664033c56f211a3dcf28d9d68c604ed447d8d79 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:40 -0700 Subject: mm/page_alloc: rename setup_pagelist_highmark() to match naming of pageset_set_batch() Signed-off-by: Cody P Schafer Cc: Gilad Ben-Yossef Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c7344d17660b..03a3f943d98e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4065,7 +4065,7 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, pcp->batch = batch; } -/* a companion to setup_pagelist_highmark() */ +/* a companion to pageset_set_high() */ static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) { pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); @@ -4091,10 +4091,10 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) } /* - * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * pageset_set_high() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. */ -static void setup_pagelist_highmark(struct per_cpu_pageset *p, +static void pageset_set_high(struct per_cpu_pageset *p, unsigned long high) { unsigned long batch = max(1UL, high / 4); @@ -4110,7 +4110,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu) pageset_init(pcp); if (percpu_pagelist_fraction) - setup_pagelist_highmark(pcp, + pageset_set_high(pcp, (zone->managed_pages / percpu_pagelist_fraction)); else @@ -5599,8 +5599,8 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, unsigned long high; high = zone->managed_pages / percpu_pagelist_fraction; for_each_possible_cpu(cpu) - setup_pagelist_highmark( - per_cpu_ptr(zone->pageset, cpu), high); + pageset_set_high(per_cpu_ptr(zone->pageset, cpu), + high); } mutex_unlock(&pcp_batch_high_lock); return 0; -- cgit v1.2.3 From 169f6c1999ca6d0c5e06e8d810817ed3d1ebf017 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 3 Jul 2013 15:01:41 -0700 Subject: mm/page_alloc: don't re-init pageset in zone_pcp_update() When memory hotplug is triggered, we call pageset_init() on per-cpu-pagesets which both contain pages and are in use, causing both the leakage of those pages and (potentially) bad behaviour if a page is allocated from a pageset while it is being cleared. Avoid this by factoring out pageset_set_high_and_batch() (which contains all needed logic too set a pageset's ->high and ->batch inrespective of system state) from zone_pageset_init() and using the new pageset_set_high_and_batch() instead of zone_pageset_init() in zone_pcp_update(). Signed-off-by: Cody P Schafer Cc: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 03a3f943d98e..fab9506273be 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4104,11 +4104,9 @@ static void pageset_set_high(struct per_cpu_pageset *p, pageset_update(&p->pcp, high, batch); } -static void __meminit zone_pageset_init(struct zone *zone, int cpu) +static void __meminit pageset_set_high_and_batch(struct zone *zone, + struct per_cpu_pageset *pcp) { - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - - pageset_init(pcp); if (percpu_pagelist_fraction) pageset_set_high(pcp, (zone->managed_pages / @@ -4117,6 +4115,14 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu) pageset_set_batch(pcp, zone_batchsize(zone)); } +static void __meminit zone_pageset_init(struct zone *zone, int cpu) +{ + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); + + pageset_init(pcp); + pageset_set_high_and_batch(zone, pcp); +} + static void __meminit setup_zone_pageset(struct zone *zone) { int cpu; @@ -6100,7 +6106,8 @@ void __meminit zone_pcp_update(struct zone *zone) unsigned cpu; mutex_lock(&pcp_batch_high_lock); for_each_possible_cpu(cpu) - zone_pageset_init(zone, cpu); + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); mutex_unlock(&pcp_batch_high_lock); } #endif -- cgit v1.2.3 From dacbde0963d62a4962d5e8a5cc38dfd1f016124b Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 3 Jul 2013 15:02:35 -0700 Subject: mm/page_alloc.c: add additional checking and return value for the 'table->data' - check the length of the procfs data before copying it into a fixed size array. - when __parse_numa_zonelist_order() fails, save the error code for return. - 'char*' --> 'char *' coding style fix Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fab9506273be..a662c74a0f5d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3256,18 +3256,25 @@ int numa_zonelist_order_handler(ctl_table *table, int write, static DEFINE_MUTEX(zl_order_mutex); mutex_lock(&zl_order_mutex); - if (write) - strcpy(saved_string, (char*)table->data); + if (write) { + if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { + ret = -EINVAL; + goto out; + } + strcpy(saved_string, (char *)table->data); + } ret = proc_dostring(table, write, buffer, length, ppos); if (ret) goto out; if (write) { int oldval = user_zonelist_order; - if (__parse_numa_zonelist_order((char*)table->data)) { + + ret = __parse_numa_zonelist_order((char *)table->data); + if (ret) { /* * bogus value. restore saved string */ - strncpy((char*)table->data, saved_string, + strncpy((char *)table->data, saved_string, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order = oldval; } else if (oldval != user_zonelist_order) { -- cgit v1.2.3 From cea27eb2a202959783f81254c48c250ddd80e129 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 3 Jul 2013 15:02:40 -0700 Subject: mm/memory-hotplug: fix lowmem count overflow when offline pages The logic for the memory-remove code fails to correctly account the Total High Memory when a memory block which contains High Memory is offlined as shown in the example below. The following patch fixes it. Before logic memory remove: MemTotal: 7603740 kB MemFree: 6329612 kB Buffers: 94352 kB Cached: 872008 kB SwapCached: 0 kB Active: 626932 kB Inactive: 519216 kB Active(anon): 180776 kB Inactive(anon): 222944 kB Active(file): 446156 kB Inactive(file): 296272 kB Unevictable: 0 kB Mlocked: 0 kB HighTotal: 7294672 kB HighFree: 5704696 kB LowTotal: 309068 kB LowFree: 624916 kB After logic memory remove: MemTotal: 7079452 kB MemFree: 5805976 kB Buffers: 94372 kB Cached: 872000 kB SwapCached: 0 kB Active: 626936 kB Inactive: 519236 kB Active(anon): 180780 kB Inactive(anon): 222944 kB Active(file): 446156 kB Inactive(file): 296292 kB Unevictable: 0 kB Mlocked: 0 kB HighTotal: 7294672 kB HighFree: 5181024 kB LowTotal: 4294752076 kB LowFree: 624952 kB [mhocko@suse.cz: fix CONFIG_HIGHMEM=n build] Signed-off-by: Wanpeng Li Reviewed-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: [2.6.24+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a662c74a0f5d..d711dcdda362 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6185,6 +6185,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) list_del(&page->lru); rmv_page_order(page); zone->free_area[order].nr_free--; +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages -= 1 << order; +#endif for (i = 0; i < (1 << order); i++) SetPageReserved((page+i)); pfn += (1 << order); -- cgit v1.2.3 From 11199692d83dd3fe1511203024fb9853d176ec4c Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:02:48 -0700 Subject: mm: change signature of free_reserved_area() to fix building warnings Change signature of free_reserved_area() according to Russell King's suggestion to fix following build warnings: arch/arm/mm/init.c: In function 'mem_init': arch/arm/mm/init.c:603:2: warning: passing argument 1 of 'free_reserved_area' makes integer from pointer without a cast [enabled by default] free_reserved_area(__va(PHYS_PFN_OFFSET), swapper_pg_dir, 0, NULL); ^ In file included from include/linux/mman.h:4:0, from arch/arm/mm/init.c:15: include/linux/mm.h:1301:22: note: expected 'long unsigned int' but argument is of type 'void *' extern unsigned long free_reserved_area(unsigned long start, unsigned long end, mm/page_alloc.c: In function 'free_reserved_area': >> mm/page_alloc.c:5134:3: warning: passing argument 1 of 'virt_to_phys' makes pointer from integer without a cast [enabled by default] In file included from arch/mips/include/asm/page.h:49:0, from include/linux/mmzone.h:20, from include/linux/gfp.h:4, from include/linux/mm.h:8, from mm/page_alloc.c:18: arch/mips/include/asm/io.h:119:29: note: expected 'const volatile void *' but argument is of type 'long unsigned int' mm/page_alloc.c: In function 'free_area_init_nodes': mm/page_alloc.c:5030:34: warning: array subscript is below array bounds [-Warray-bounds] Also address some minor code review comments. Signed-off-by: Jiang Liu Reported-by: Arnd Bergmann Cc: "H. Peter Anvin" Cc: "Michael S. Tsirkin" Cc: Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Marek Szyprowski Cc: Mel Gorman Cc: Michel Lespinasse Cc: Minchan Kim Cc: Rik van Riel Cc: Rusty Russell Cc: Tang Chen Cc: Tejun Heo Cc: Thomas Gleixner Cc: Wen Congyang Cc: Will Deacon Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d711dcdda362..be18ccd017bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5206,25 +5206,26 @@ early_param("movablecore", cmdline_parse_movablecore); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -unsigned long free_reserved_area(unsigned long start, unsigned long end, - int poison, char *s) +unsigned long free_reserved_area(void *start, void *end, int poison, char *s) { - unsigned long pages, pos; + void *pos; + unsigned long pages = 0; - pos = start = PAGE_ALIGN(start); - end &= PAGE_MASK; - for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) { + start = (void *)PAGE_ALIGN((unsigned long)start); + end = (void *)((unsigned long)end & PAGE_MASK); + for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { if (poison) - memset((void *)pos, poison, PAGE_SIZE); - free_reserved_page(virt_to_page((void *)pos)); + memset(pos, poison, PAGE_SIZE); + free_reserved_page(virt_to_page(pos)); } if (pages && s) - pr_info("Freeing %s memory: %ldK (%lx - %lx)\n", + pr_info("Freeing %s memory: %ldK (%p - %p)\n", s, pages << (PAGE_SHIFT - 10), start, end); return pages; } +EXPORT_SYMBOL(free_reserved_area); #ifdef CONFIG_HIGHMEM void free_highmem_page(struct page *page) -- cgit v1.2.3 From dbe67df4ba78c79db547c7864e1120981c144c97 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:02:51 -0700 Subject: mm: enhance free_reserved_area() to support poisoning memory with zero Address more review comments from last round of code review. 1) Enhance free_reserved_area() to support poisoning freed memory with pattern '0'. This could be used to get rid of poison_init_mem() on ARM64. 2) A previous patch has disabled memory poison for initmem on s390 by mistake, so restore to the original behavior. 3) Remove redundant PAGE_ALIGN() when calling free_reserved_area(). Signed-off-by: Jiang Liu Cc: Geert Uytterhoeven Cc: "H. Peter Anvin" Cc: "Michael S. Tsirkin" Cc: Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Marek Szyprowski Cc: Mel Gorman Cc: Michel Lespinasse Cc: Minchan Kim Cc: Rik van Riel Cc: Rusty Russell Cc: Tang Chen Cc: Tejun Heo Cc: Thomas Gleixner Cc: Wen Congyang Cc: Will Deacon Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index be18ccd017bb..6780b2e18aa1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5214,7 +5214,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { - if (poison) + if ((unsigned int)poison <= 0xFF) memset(pos, poison, PAGE_SIZE); free_reserved_page(virt_to_page(pos)); } -- cgit v1.2.3 From 834405c3b6aebf6853663796401cdfe11aac6275 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:04 -0700 Subject: mm: fix some trivial typos in comments Fix some trivial typos in comments. Signed-off-by: Jiang Liu Cc: Wen Congyang Cc: Tang Chen Cc: Yasuaki Ishimatsu Cc: Mel Gorman Cc: Minchan Kim Cc: Marek Szyprowski Cc: "H. Peter Anvin" Cc: "Michael S. Tsirkin" Cc: Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michel Lespinasse Cc: Rik van Riel Cc: Rusty Russell Cc: Tejun Heo Cc: Thomas Gleixner Cc: Will Deacon Cc: Yinghai Lu Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6780b2e18aa1..657daea88aa8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2845,7 +2845,7 @@ EXPORT_SYMBOL(free_pages_exact); * nr_free_zone_pages() counts the number of counts pages which are beyond the * high watermark within all zones at or below a given zone index. For each * zone, the number of pages is calculated as: - * present_pages - high_pages + * managed_pages - high_pages */ static unsigned long nr_free_zone_pages(int offset) { -- cgit v1.2.3 From 4f9f47745e948eca18bb97c82dbb4d53f2380086 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:07 -0700 Subject: mm: use managed_pages to calculate default zonelist order Use zone->managed_pages instead of zone->present_pages to calculate default zonelist order because managed_pages means allocatable pages. Signed-off-by: Jiang Liu Cc: Mel Gorman Cc: Minchan Kim Cc: Marek Szyprowski Cc: "H. Peter Anvin" Cc: "Michael S. Tsirkin" Cc: Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michel Lespinasse Cc: Rik van Riel Cc: Rusty Russell Cc: Tang Chen Cc: Tejun Heo Cc: Thomas Gleixner Cc: Wen Congyang Cc: Will Deacon Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 657daea88aa8..f22542f6dc12 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3438,8 +3438,8 @@ static int default_zonelist_order(void) z = &NODE_DATA(nid)->node_zones[zone_type]; if (populated_zone(z)) { if (zone_type < ZONE_NORMAL) - low_kmem_size += z->present_pages; - total_size += z->present_pages; + low_kmem_size += z->managed_pages; + total_size += z->managed_pages; } else if (zone_type == ZONE_NORMAL) { /* * If any node has only lowmem, then node order -- cgit v1.2.3 From 7b4b2a0d6c8500350784beb83a6a55e60ea3bea3 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:11 -0700 Subject: mm: accurately calculate zone->managed_pages for highmem zones Commit "mm: introduce new field 'managed_pages' to struct zone" assumes that all highmem pages will be freed into the buddy system by function mem_init(). But that's not always true, some architectures may reserve some highmem pages during boot. For example PPC may allocate highmem pages for giagant HugeTLB pages, and several architectures have code to check PageReserved flag to exclude highmem pages allocated during boot when freeing highmem pages into the buddy system. So treat highmem pages in the same way as normal pages, that is to: 1) reset zone->managed_pages to zero in mem_init(). 2) recalculate managed_pages when freeing pages into the buddy system. Signed-off-by: Jiang Liu Cc: "H. Peter Anvin" Cc: Tejun Heo Cc: Joonsoo Kim Cc: Yinghai Lu Cc: Mel Gorman Cc: Minchan Kim Cc: Kamezawa Hiroyuki Cc: Marek Szyprowski Cc: "Michael S. Tsirkin" Cc: Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Konrad Rzeszutek Wilk Cc: Michel Lespinasse Cc: Rik van Riel Cc: Rusty Russell Cc: Tang Chen Cc: Thomas Gleixner Cc: Wen Congyang Cc: Will Deacon Cc: Yasuaki Ishimatsu Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f22542f6dc12..22438eba00b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5232,6 +5232,7 @@ void free_highmem_page(struct page *page) { __free_reserved_page(page); totalram_pages++; + page_zone(page)->managed_pages++; totalhigh_pages++; } #endif -- cgit v1.2.3 From c3d5f5f0c2bc4eabeaf49f1a21e1aeb965246cd2 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:14 -0700 Subject: mm: use a dedicated lock to protect totalram_pages and zone->managed_pages Currently lock_memory_hotplug()/unlock_memory_hotplug() are used to protect totalram_pages and zone->managed_pages. Other than the memory hotplug driver, totalram_pages and zone->managed_pages may also be modified at runtime by other drivers, such as Xen balloon, virtio_balloon etc. For those cases, memory hotplug lock is a little too heavy, so introduce a dedicated lock to protect totalram_pages and zone->managed_pages. Now we have a simplified locking rules totalram_pages and zone->managed_pages as: 1) no locking for read accesses because they are unsigned long. 2) no locking for write accesses at boot time in single-threaded context. 3) serialize write accesses at runtime by acquiring the dedicated managed_page_count_lock. Also adjust zone->managed_pages when freeing reserved pages into the buddy system, to keep totalram_pages and zone->managed_pages in consistence. [akpm@linux-foundation.org: don't export adjust_managed_page_count to modules (for now)] Signed-off-by: Jiang Liu Cc: Mel Gorman Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: "H. Peter Anvin" Cc: "Michael S. Tsirkin" Cc: Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Marek Szyprowski Cc: Rusty Russell Cc: Tang Chen Cc: Tejun Heo Cc: Thomas Gleixner Cc: Wen Congyang Cc: Will Deacon Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22438eba00b6..93f292a60cb0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -103,6 +103,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { }; EXPORT_SYMBOL(node_states); +/* Protect totalram_pages and zone->managed_pages */ +static DEFINE_SPINLOCK(managed_page_count_lock); + unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; /* @@ -5206,6 +5209,14 @@ early_param("movablecore", cmdline_parse_movablecore); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +void adjust_managed_page_count(struct page *page, long count) +{ + spin_lock(&managed_page_count_lock); + page_zone(page)->managed_pages += count; + totalram_pages += count; + spin_unlock(&managed_page_count_lock); +} + unsigned long free_reserved_area(void *start, void *end, int poison, char *s) { void *pos; -- cgit v1.2.3 From 170a5a7eb2bf10161197e5490fbc29ca4561aedb Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:17 -0700 Subject: mm: make __free_pages_bootmem() only available at boot time In order to simpilify management of totalram_pages and zone->managed_pages, make __free_pages_bootmem() only available at boot time. With this change applied, __free_pages_bootmem() will only be used by bootmem.c and nobootmem.c at boot time, so mark it as __init. Other callers of __free_pages_bootmem() have been converted to use free_reserved_page(), which handles totalram_pages and zone->managed_pages in a safer way. This patch also fix a bug in free_pagetable() for x86_64, which should increase zone->managed_pages instead of zone->present_pages when freeing reserved pages. And now we have managed_pages_count_lock to protect totalram_pages and zone->managed_pages, so remove the redundant ppb_lock lock in put_page_bootmem(). This greatly simplifies the locking rules. Signed-off-by: Jiang Liu Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Yinghai Lu Cc: Wen Congyang Cc: Tang Chen Cc: Yasuaki Ishimatsu Cc: Mel Gorman Cc: Minchan Kim Cc: "Michael S. Tsirkin" Cc: Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Marek Szyprowski Cc: Michel Lespinasse Cc: Rik van Riel Cc: Rusty Russell Cc: Tejun Heo Cc: Will Deacon Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 93f292a60cb0..2437a7e17aba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -745,14 +745,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -/* - * Read access to zone->managed_pages is safe because it's unsigned long, - * but we still need to serialize writers. Currently all callers of - * __free_pages_bootmem() except put_page_bootmem() should only be used - * at boot time. So for shorter boot time, we shift the burden to - * put_page_bootmem() to serialize writers. - */ -void __meminit __free_pages_bootmem(struct page *page, unsigned int order) +void __init __free_pages_bootmem(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; unsigned int loop; -- cgit v1.2.3 From 3dcc0571cd64816309765b7c7e4691a4cadf2ee7 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:21 -0700 Subject: mm: correctly update zone->managed_pages Enhance adjust_managed_page_count() to adjust totalhigh_pages for highmem pages. And change code which directly adjusts totalram_pages to use adjust_managed_page_count() because it adjusts totalram_pages, totalhigh_pages and zone->managed_pages altogether in a safe way. Remove inc_totalhigh_pages() and dec_totalhigh_pages() from xen/balloon driver bacause adjust_managed_page_count() has already adjusted totalhigh_pages. This patch also fixes two bugs: 1) enhances virtio_balloon driver to adjust totalhigh_pages when reserve/unreserve pages. 2) enhance memory_hotplug.c to adjust totalhigh_pages when hot-removing memory. We still need to deal with modifications of totalram_pages in file arch/powerpc/platforms/pseries/cmm.c, but need help from PPC experts. [akpm@linux-foundation.org: remove ifdef, per Wanpeng Li, virtio_balloon.c cleanup, per Sergei] [akpm@linux-foundation.org: export adjust_managed_page_count() to modules, for drivers/virtio/virtio_balloon.c] Signed-off-by: Jiang Liu Cc: Chris Metcalf Cc: Rusty Russell Cc: "Michael S. Tsirkin" Cc: Konrad Rzeszutek Wilk Cc: Jeremy Fitzhardinge Cc: Wen Congyang Cc: Tang Chen Cc: Yasuaki Ishimatsu Cc: Mel Gorman Cc: Minchan Kim Cc: "H. Peter Anvin" Cc: Cc: Arnd Bergmann Cc: Catalin Marinas Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Marek Szyprowski Cc: Michel Lespinasse Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Cc: Will Deacon Cc: Yinghai Lu Cc: Russell King Cc: Sergei Shtylyov Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2437a7e17aba..1481439ee2e4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -780,11 +780,7 @@ void __init init_cma_reserved_pageblock(struct page *page) set_page_refcounted(page); set_pageblock_migratetype(page, MIGRATE_CMA); __free_pages(page, pageblock_order); - totalram_pages += pageblock_nr_pages; -#ifdef CONFIG_HIGHMEM - if (PageHighMem(page)) - totalhigh_pages += pageblock_nr_pages; -#endif + adjust_managed_page_count(page, pageblock_nr_pages); } #endif @@ -5207,8 +5203,13 @@ void adjust_managed_page_count(struct page *page, long count) spin_lock(&managed_page_count_lock); page_zone(page)->managed_pages += count; totalram_pages += count; +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages += count; +#endif spin_unlock(&managed_page_count_lock); } +EXPORT_SYMBOL(adjust_managed_page_count); unsigned long free_reserved_area(void *start, void *end, int poison, char *s) { -- cgit v1.2.3 From cdd91a77043ba81585236ef61f65c18222b212e6 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:27 -0700 Subject: mm: report available pages as "MemTotal" for each NUMA node As reported by https://bugzilla.kernel.org/show_bug.cgi?id=53501, "MemTotal" from /proc/meminfo means memory pages managed by the buddy system (managed_pages), but "MemTotal" from /sys/.../node/nodex/meminfo means physical pages present (present_pages) within the NUMA node. There's a difference between managed_pages and present_pages due to bootmem allocator and reserved pages. And Documentation/filesystems/proc.txt says MemTotal: Total usable ram (i.e. physical ram minus a few reserved bits and the kernel binary code) So change /sys/.../node/nodex/meminfo to report available pages within the node as "MemTotal". Signed-off-by: Jiang Liu Reported-by: Cc: Mel Gorman Cc: Minchan Kim Cc: "H. Peter Anvin" Cc: "Michael S. Tsirkin" Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Cc: Jianguo Wu Cc: Joonsoo Kim Cc: Kamezawa Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Marek Szyprowski Cc: Michel Lespinasse Cc: Rik van Riel Cc: Rusty Russell Cc: Tang Chen Cc: Tejun Heo Cc: Thomas Gleixner Cc: Wen Congyang Cc: Will Deacon Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1481439ee2e4..d9445c4f5fd7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2904,9 +2904,13 @@ EXPORT_SYMBOL(si_meminfo); #ifdef CONFIG_NUMA void si_meminfo_node(struct sysinfo *val, int nid) { + int zone_type; /* needs to be signed */ + unsigned long managed_pages = 0; pg_data_t *pgdat = NODE_DATA(nid); - val->totalram = pgdat->node_present_pages; + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + managed_pages += pgdat->node_zones[zone_type].managed_pages; + val->totalram = managed_pages; val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; -- cgit v1.2.3 From 7ee3d4e8cd560500192d80ca84d7f15d6dee0807 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:41 -0700 Subject: mm: introduce helper function mem_init_print_info() to simplify mem_init() Introduce helper function mem_init_print_info() to simplify mem_init() across different architectures, which also unifies the format and information printed. Function mem_init_print_info() calculates memory statistics information without walking each page, so it should be a little faster on some architectures. Also introduce another helper get_num_physpages() to kill the global variable num_physpages. Signed-off-by: Jiang Liu Cc: Mel Gorman Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d9445c4f5fd7..327516b7aee9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -61,6 +61,7 @@ #include #include +#include #include #include #include "internal.h" @@ -5246,6 +5247,57 @@ void free_highmem_page(struct page *page) } #endif + +void __init mem_init_print_info(const char *str) +{ + unsigned long physpages, codesize, datasize, rosize, bss_size; + unsigned long init_code_size, init_data_size; + + physpages = get_num_physpages(); + codesize = _etext - _stext; + datasize = _edata - _sdata; + rosize = __end_rodata - __start_rodata; + bss_size = __bss_stop - __bss_start; + init_data_size = __init_end - __init_begin; + init_code_size = _einittext - _sinittext; + + /* + * Detect special cases and adjust section sizes accordingly: + * 1) .init.* may be embedded into .data sections + * 2) .init.text.* may be out of [__init_begin, __init_end], + * please refer to arch/tile/kernel/vmlinux.lds.S. + * 3) .rodata.* may be embedded into .text or .data sections. + */ +#define adj_init_size(start, end, size, pos, adj) \ + if (start <= pos && pos < end && size > adj) \ + size -= adj; + + adj_init_size(__init_begin, __init_end, init_data_size, + _sinittext, init_code_size); + adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); + adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); + adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); + adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); + +#undef adj_init_size + + printk("Memory: %luK/%luK available " + "(%luK kernel code, %luK rwdata, %luK rodata, " + "%luK init, %luK bss, %luK reserved" +#ifdef CONFIG_HIGHMEM + ", %luK highmem" +#endif + "%s%s)\n", + nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), + codesize >> 10, datasize >> 10, rosize >> 10, + (init_data_size + init_code_size) >> 10, bss_size >> 10, + (physpages - totalram_pages) << (PAGE_SHIFT-10), +#ifdef CONFIG_HIGHMEM + totalhigh_pages << (PAGE_SHIFT-10), +#endif + str ? ", " : "", str ? str : ""); +} + /** * set_dma_reserve - set the specified number of pages reserved in the first zone * @new_dma_reserve: The number of pages to mark reserved -- cgit v1.2.3 From 7960aedde8cfa72e4caf488806ea7ea7d2fa8dba Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:52 -0700 Subject: mm: remove duplicated call of get_pfn_range_for_nid When calculating pages in a node, for each zone in that node, we will have zone_spanned_pages_in_node --> get_pfn_range_for_nid zone_absent_pages_in_node --> get_pfn_range_for_nid That is to say, we call the get_pfn_range_for_nid to get start_pfn and end_pfn of the node for MAX_NR_ZONES * 2 times. And this is totally unnecessary if we call the get_pfn_range_for_nid before zone_*_pages_in_node add two extra arguments node_start_pfn and node_end_pfn for zone_*_pages_in_node, then we can remove the get_pfn_range_in_node in zone_*_pages_in_node. [akpm@linux-foundation.org: make definitions more readable] Signed-off-by: Zhang Yanfei Cc: Michal Hocko Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 327516b7aee9..7d5e40fe0c29 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4421,13 +4421,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, */ static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *ignored) { - unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; - /* Get the start and end of the node and zone */ - get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + /* Get the start and end of the zone */ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; adjust_zone_range_for_zone_movable(nid, zone_type, @@ -4482,14 +4482,14 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, /* Return the number of page frames in holes in a zone on a node */ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *ignored) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; - unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; - get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); @@ -4502,6 +4502,8 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *zones_size) { return zones_size[zone_type]; @@ -4509,6 +4511,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, static inline unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *zholes_size) { if (!zholes_size) @@ -4520,21 +4524,27 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *zones_size, + unsigned long *zholes_size) { unsigned long realtotalpages, totalpages = 0; enum zone_type i; for (i = 0; i < MAX_NR_ZONES; i++) totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, - zones_size); + node_start_pfn, + node_end_pfn, + zones_size); pgdat->node_spanned_pages = totalpages; realtotalpages = totalpages; for (i = 0; i < MAX_NR_ZONES; i++) realtotalpages -= zone_absent_pages_in_node(pgdat->node_id, i, - zholes_size); + node_start_pfn, node_end_pfn, + zholes_size); pgdat->node_present_pages = realtotalpages; printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); @@ -4643,6 +4653,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * NOTE: pgdat should get zeroed by caller. */ static void __paginginit free_area_init_core(struct pglist_data *pgdat, + unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zones_size, unsigned long *zholes_size) { enum zone_type j; @@ -4664,8 +4675,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; - size = zone_spanned_pages_in_node(nid, j, zones_size); + size = zone_spanned_pages_in_node(nid, j, node_start_pfn, + node_end_pfn, zones_size); realsize = freesize = size - zone_absent_pages_in_node(nid, j, + node_start_pfn, + node_end_pfn, zholes_size); /* @@ -4779,6 +4793,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { pg_data_t *pgdat = NODE_DATA(nid); + unsigned long start_pfn = 0; + unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); @@ -4786,7 +4802,11 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; init_zone_allows_reclaim(nid); - calculate_node_totalpages(pgdat, zones_size, zholes_size); +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); +#endif + calculate_node_totalpages(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); alloc_node_mem_map(pgdat); #ifdef CONFIG_FLAT_NODE_MEM_MAP @@ -4795,7 +4815,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif - free_area_init_core(pgdat, zones_size, zholes_size); + free_area_init_core(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -- cgit v1.2.3 From bc732f1d55cf41627ee4c64078812b2fa592b394 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:06 -0700 Subject: mm/page_alloc.c: remove zone_type argument of build_zonelists_node The callers of build_zonelists_node always pass MAX_NR_ZONES -1 as the zone_type argument, so we can directly use the value in build_zonelists_node and remove zone_type argument. Signed-off-by: Zhang Yanfei Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7d5e40fe0c29..27f9d4beac98 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3153,12 +3153,10 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) * Add all populated zones of a node to the zonelist. */ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, - int nr_zones, enum zone_type zone_type) + int nr_zones) { struct zone *zone; - - BUG_ON(zone_type >= MAX_NR_ZONES); - zone_type++; + enum zone_type zone_type = MAX_NR_ZONES; do { zone_type--; @@ -3168,8 +3166,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); } - } while (zone_type); + return nr_zones; } @@ -3363,8 +3361,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) zonelist = &pgdat->node_zonelists[0]; for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } @@ -3378,7 +3375,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) struct zonelist *zonelist; zonelist = &pgdat->node_zonelists[1]; - j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + j = build_zonelists_node(pgdat, zonelist, 0); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } @@ -3586,7 +3583,7 @@ static void build_zonelists(pg_data_t *pgdat) local_node = pgdat->node_id; zonelist = &pgdat->node_zonelists[0]; - j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + j = build_zonelists_node(pgdat, zonelist, 0); /* * Now we build the zonelist so that it contains the zones @@ -3599,14 +3596,12 @@ static void build_zonelists(pg_data_t *pgdat) for (node = local_node + 1; node < MAX_NUMNODES; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); } for (node = 0; node < local_node; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); } zonelist->_zonerefs[j].zone = NULL; -- cgit v1.2.3 From 345606d42971fc4ed164fbabac118708d51b8e0a Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:08 -0700 Subject: mm/page_alloc.c: remove unlikely() from the current_order test In __rmqueue_fallback(), current_order loops down from MAX_ORDER - 1 to the order passed. MAX_ORDER is typically 11 and pageblock_order is typically 9 on x86. Integer division truncates, so pageblock_order / 2 is 4. For the first eight iterations, it's guaranteed that current_order >= pageblock_order / 2 if it even gets that far! So just remove the unlikely(), it's completely bogus. Signed-off-by: Zhang Yanfei Suggested-by: David Rientjes Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 27f9d4beac98..b5855e545eec 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1046,7 +1046,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * MIGRATE_CMA areas. */ if (!is_migrate_cma(migratetype) && - (unlikely(current_order >= pageblock_order / 2) || + (current_order >= pageblock_order / 2 || start_migratetype == MIGRATE_RECLAIMABLE || page_group_by_mobility_disabled)) { int pages; -- cgit v1.2.3 From 5f12733e9d976132e6cbbae9d08f71406fdacdfb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Jul 2013 16:00:40 -0700 Subject: mm: honor min_free_kbytes set by user min_free_kbytes is updated during memory hotplug (by init_per_zone_wmark_min) currently which is right thing to do in most cases but this could be unexpected if admin increased the value to prevent from allocation failures and the new min_free_kbytes would be decreased as a result of memory hotadd. This patch saves the user defined value and allows updating min_free_kbytes only if it is higher than the saved one. A warning is printed when the new value is ignored. Signed-off-by: Michal Hocko Cc: Mel Gorman Acked-by: Zhang Yanfei Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b5855e545eec..b100255dedda 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -204,6 +204,7 @@ static char * const zone_names[MAX_NR_ZONES] = { }; int min_free_kbytes = 1024; +int user_min_free_kbytes; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; @@ -5589,14 +5590,21 @@ static void __meminit setup_per_zone_inactive_ratio(void) int __meminit init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; + int new_min_free_kbytes; lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); - - min_free_kbytes = int_sqrt(lowmem_kbytes * 16); - if (min_free_kbytes < 128) - min_free_kbytes = 128; - if (min_free_kbytes > 65536) - min_free_kbytes = 65536; + new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); + + if (new_min_free_kbytes > user_min_free_kbytes) { + min_free_kbytes = new_min_free_kbytes; + if (min_free_kbytes < 128) + min_free_kbytes = 128; + if (min_free_kbytes > 65536) + min_free_kbytes = 65536; + } else { + pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", + new_min_free_kbytes, user_min_free_kbytes); + } setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); @@ -5614,8 +5622,10 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); - if (write) + if (write) { + user_min_free_kbytes = min_free_kbytes; setup_per_zone_wmarks(); + } return 0; } -- cgit v1.2.3