From 8af01f56a03e9cbd91a55d688fce1315021efba8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:22 -0400 Subject: cgroup: s/cgroup_subsys_state/cgroup_css/ s/task_subsys_state/task_css/ The names of the two struct cgroup_subsys_state accessors - cgroup_subsys_state() and task_subsys_state() - are somewhat awkward. The former clashes with the type name and the latter doesn't even indicate it's somehow related to cgroup. We're about to revamp large portion of cgroup API, so, let's rename them so that they're less awkward. Most per-controller usages of the accessors are localized in accessor wrappers and given the amount of scheduled changes, this isn't gonna add any noticeable headache. Rename cgroup_subsys_state() to cgroup_css() and task_subsys_state() to task_css(). This patch is pure rename. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- mm/memcontrol.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d12ca6f3c293..b47bd3ad3c2b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1037,8 +1037,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { - return mem_cgroup_from_css( - cgroup_subsys_state(cont, mem_cgroup_subsys_id)); + return mem_cgroup_from_css(cgroup_css(cont, mem_cgroup_subsys_id)); } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) @@ -1051,7 +1050,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) if (unlikely(!p)) return NULL; - return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id)); + return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id)); } struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) -- cgit v1.2.3 From a7c6d554aa01236ac2a9f851ab0f75704f76dfa2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: add/update accessors which obtain subsys specific data from css css (cgroup_subsys_state) is usually embedded in a subsys specific data structure. Subsystems either use container_of() directly to cast from css to such data structure or has an accessor function wrapping such cast. As cgroup as whole is moving towards using css as the main interface handle, add and update such accessors to ease dealing with css's. All accessors explicitly handle NULL input and return NULL in those cases. While this looks like an extra branch in the code, as all controllers specific data structures have css as the first field, the casting doesn't involve any offsetting and the compiler can trivially optimize out the branch. * blkio, freezer, cpuset, cpu, cpuacct and net_cls didn't have such accessor. Added. * memory, hugetlb and devices already had one but didn't explicitly handle NULL input. Updated. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b47bd3ad3c2b..11d659e3b08e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -486,7 +486,7 @@ static DEFINE_MUTEX(memcg_create_mutex); static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) { - return container_of(s, struct mem_cgroup, css); + return s ? container_of(s, struct mem_cgroup, css) : NULL; } /* Some nice accessors for the vmpressure. */ -- cgit v1.2.3 From 6387698699afd72d6304566fb6ccf84bffe07c56 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: add css_parent() Currently, controllers have to explicitly follow the cgroup hierarchy to find the parent of a given css. cgroup is moving towards using cgroup_subsys_state as the main controller interface construct, so let's provide a way to climb the hierarchy using just csses. This patch implements css_parent() which, given a css, returns its parent. The function is guarnateed to valid non-NULL parent css as long as the target css is not at the top of the hierarchy. freezer, cpuset, cpu, cpuacct, hugetlb, memory, net_cls and devices are converted to use css_parent() instead of accessing cgroup->parent directly. * __parent_ca() is dropped from cpuacct and its usage is replaced with parent_ca(). The only difference between the two was NULL test on cgroup->parent which is now embedded in css_parent() making the distinction moot. Note that eventually a css->parent field will be added to css and the NULL check in css_parent() will go away. This patch shouldn't cause any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- mm/memcontrol.c | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 11d659e3b08e..69b3e520f921 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1524,10 +1524,8 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) int mem_cgroup_swappiness(struct mem_cgroup *memcg) { - struct cgroup *cgrp = memcg->css.cgroup; - /* root ? */ - if (cgrp->parent == NULL) + if (!css_parent(&memcg->css)) return vm_swappiness; return memcg->swappiness; @@ -5026,11 +5024,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, { int retval = 0; struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - struct cgroup *parent = cont->parent; - struct mem_cgroup *parent_memcg = NULL; - - if (parent) - parent_memcg = mem_cgroup_from_cont(parent); + struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); mutex_lock(&memcg_create_mutex); @@ -5282,18 +5276,15 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, unsigned long long *mem_limit, unsigned long long *memsw_limit) { - struct cgroup *cgroup; unsigned long long min_limit, min_memsw_limit, tmp; min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - cgroup = memcg->css.cgroup; if (!memcg->use_hierarchy) goto out; - while (cgroup->parent) { - cgroup = cgroup->parent; - memcg = mem_cgroup_from_cont(cgroup); + while (css_parent(&memcg->css)) { + memcg = mem_cgroup_from_css(css_parent(&memcg->css)); if (!memcg->use_hierarchy) break; tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -5523,16 +5514,11 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - struct mem_cgroup *parent; - - if (val > 100) - return -EINVAL; + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); - if (cgrp->parent == NULL) + if (val > 100 || !parent) return -EINVAL; - parent = mem_cgroup_from_cont(cgrp->parent); - mutex_lock(&memcg_create_mutex); /* If under hierarchy, only empty-root can set this value */ @@ -5861,14 +5847,12 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - struct mem_cgroup *parent; + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (!cgrp->parent || !((val == 0) || (val == 1))) + if (!parent || !((val == 0) || (val == 1))) return -EINVAL; - parent = mem_cgroup_from_cont(cgrp->parent); - mutex_lock(&memcg_create_mutex); /* oom-kill-disable is a flag for subhierarchy. */ if ((parent->use_hierarchy) || memcg_has_children(memcg)) { @@ -6266,15 +6250,14 @@ free_out: static int mem_cgroup_css_online(struct cgroup *cont) { - struct mem_cgroup *memcg, *parent; + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); int error = 0; - if (!cont->parent) + if (!parent) return 0; mutex_lock(&memcg_create_mutex); - memcg = mem_cgroup_from_cont(cont); - parent = mem_cgroup_from_cont(cont->parent); memcg->use_hierarchy = parent->use_hierarchy; memcg->oom_kill_disable = parent->oom_kill_disable; -- cgit v1.2.3 From eb95419b023abacb415e2a18fea899023ce7624d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: pass around cgroup_subsys_state instead of cgroup in subsystem methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup * in subsystem implementations for the following reasons. * With unified hierarchy, subsystems will be dynamically bound and unbound from cgroups and thus css's (cgroup_subsys_state) may be created and destroyed dynamically over the lifetime of a cgroup, which is different from the current state where all css's are allocated and destroyed together with the associated cgroup. This in turn means that cgroup_css() should be synchronized and may return NULL, making it more cumbersome to use. * Differing levels of per-subsystem granularity in the unified hierarchy means that the task and descendant iterators should behave differently depending on the specific subsystem the iteration is being performed for. * In majority of the cases, subsystems only care about its part in the cgroup hierarchy - ie. the hierarchy of css's. Subsystem methods often obtain the matching css pointer from the cgroup and don't bother with the cgroup pointer itself. Passing around css fits much better. This patch converts all cgroup_subsys methods to take @css instead of @cgroup. The conversions are mostly straight-forward. A few noteworthy changes are * ->css_alloc() now takes css of the parent cgroup rather than the pointer to the new cgroup as the css for the new cgroup doesn't exist yet. Knowing the parent css is enough for all the existing subsystems. * In kernel/cgroup.c::offline_css(), unnecessary open coded css dereference is replaced with local variable access. This patch shouldn't cause any behavior differences. v2: Unnecessary explicit cgrp->subsys[] deref in css_online() replaced with local variable @css as suggested by Li Zefan. Rebased on top of new for-3.12 which includes for-3.11-fixes so that ->css_free() invocation added by da0a12caff ("cgroup: fix a leak when percpu_ref_init() fails") is converted too. Suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Daniel Wagner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe Cc: Steven Rostedt --- mm/memcontrol.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 69b3e520f921..32cca0f0af0d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6211,7 +6211,7 @@ static void __init mem_cgroup_soft_limit_tree_init(void) } static struct cgroup_subsys_state * __ref -mem_cgroup_css_alloc(struct cgroup *cont) +mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct mem_cgroup *memcg; long error = -ENOMEM; @@ -6226,7 +6226,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) goto free_out; /* root ? */ - if (cont->parent == NULL) { + if (parent_css == NULL) { root_mem_cgroup = memcg; res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); @@ -6248,10 +6248,10 @@ free_out: } static int -mem_cgroup_css_online(struct cgroup *cont) +mem_cgroup_css_online(struct cgroup_subsys_state *css) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); int error = 0; if (!parent) @@ -6308,9 +6308,9 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) mem_cgroup_iter_invalidate(root_mem_cgroup); } -static void mem_cgroup_css_offline(struct cgroup *cont) +static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); kmem_cgroup_css_offline(memcg); @@ -6319,9 +6319,9 @@ static void mem_cgroup_css_offline(struct cgroup *cont) mem_cgroup_destroy_all_caches(memcg); } -static void mem_cgroup_css_free(struct cgroup *cont) +static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); memcg_destroy_kmem(memcg); __mem_cgroup_free(memcg); @@ -6691,12 +6691,12 @@ static void mem_cgroup_clear_mc(void) mem_cgroup_end_move(from); } -static int mem_cgroup_can_attach(struct cgroup *cgroup, +static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); int ret = 0; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); unsigned long move_charge_at_immigrate; /* @@ -6738,7 +6738,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, return ret; } -static void mem_cgroup_cancel_attach(struct cgroup *cgroup, +static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { mem_cgroup_clear_mc(); @@ -6886,7 +6886,7 @@ retry: up_read(&mm->mmap_sem); } -static void mem_cgroup_move_task(struct cgroup *cont, +static void mem_cgroup_move_task(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); @@ -6901,16 +6901,16 @@ static void mem_cgroup_move_task(struct cgroup *cont, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup *cgroup, +static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup *cgroup, +static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup *cont, +static void mem_cgroup_move_task(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { } @@ -6920,15 +6920,15 @@ static void mem_cgroup_move_task(struct cgroup *cont, * Cgroup retains root cgroups across [un]mount cycles making it necessary * to verify sane_behavior flag on each mount attempt. */ -static void mem_cgroup_bind(struct cgroup *root) +static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) { /* * use_hierarchy is forced with sane_behavior. cgroup core * guarantees that @root doesn't have any children, so turning it * on for the root memcg is enough. */ - if (cgroup_sane_behavior(root)) - mem_cgroup_from_cont(root)->use_hierarchy = true; + if (cgroup_sane_behavior(root_css->cgroup)) + mem_cgroup_from_css(root_css)->use_hierarchy = true; } struct cgroup_subsys mem_cgroup_subsys = { -- cgit v1.2.3 From 182446d087906de40e514573a92a97b203695f71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: pass around cgroup_subsys_state instead of cgroup in file methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup. Please see the previous commit which converts the subsystem methods for rationale. This patch converts all cftype file operations to take @css instead of @cgroup. cftypes for the cgroup core files don't have their subsytem pointer set. These will automatically use the dummy_css added by the previous patch and can be converted the same way. Most subsystem conversions are straight forwards but there are some interesting ones. * freezer: update_if_frozen() is also converted to take @css instead of @cgroup for consistency. This will make the code look simpler too once iterators are converted to use css. * memory/vmpressure: mem_cgroup_from_css() needs to be exported to vmpressure while mem_cgroup_from_cont() can be made static. Updated accordingly. * cpu: cgroup_tg() doesn't have any user left. Removed. * cpuacct: cgroup_ca() doesn't have any user left. Removed. * hugetlb: hugetlb_cgroup_form_cgroup() doesn't have any user left. Removed. * net_cls: cgrp_cls_state() doesn't have any user left. Removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Daniel Wagner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe Cc: Steven Rostedt --- mm/memcontrol.c | 88 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 45 insertions(+), 43 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 32cca0f0af0d..ab64dfc84f8c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -483,7 +483,6 @@ enum res_type { */ static DEFINE_MUTEX(memcg_create_mutex); -static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) { return s ? container_of(s, struct mem_cgroup, css) : NULL; @@ -1035,7 +1034,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); } -struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) +static inline struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { return mem_cgroup_from_css(cgroup_css(cont, mem_cgroup_subsys_id)); } @@ -2951,10 +2950,10 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) } #ifdef CONFIG_SLABINFO -static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, - struct seq_file *m) +static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct memcg_cache_params *params; if (!memcg_can_account_kmem(memcg)) @@ -4999,9 +4998,10 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return 0; } -static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) +static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, + unsigned int event) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); int ret; if (mem_cgroup_is_root(memcg)) @@ -5014,16 +5014,17 @@ static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) } -static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) +static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return mem_cgroup_from_cont(cont)->use_hierarchy; + return mem_cgroup_from_css(css)->use_hierarchy; } -static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, - u64 val) +static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { int retval = 0; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); mutex_lock(&memcg_create_mutex); @@ -5094,11 +5095,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) return val << PAGE_SHIFT; } -static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, - struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) +static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); char str[64]; u64 val; int name, len; @@ -5131,11 +5132,11 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, str, len); } -static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) +static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) { int ret = -EINVAL; #ifdef CONFIG_MEMCG_KMEM - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); /* * For simplicity, we won't allow this to be disabled. It also can't * be changed if the cgroup has children already, or if tasks had @@ -5151,7 +5152,7 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) mutex_lock(&memcg_create_mutex); mutex_lock(&set_limit_mutex); if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { - if (cgroup_task_count(cont) || memcg_has_children(memcg)) { + if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { ret = -EBUSY; goto out; } @@ -5221,10 +5222,10 @@ out: * The user of this function is... * RES_LIMIT. */ -static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, +static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); enum res_type type; int name; unsigned long long val; @@ -5248,7 +5249,7 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, else if (type == _MEMSWAP) ret = mem_cgroup_resize_memsw_limit(memcg, val); else if (type == _KMEM) - ret = memcg_update_kmem_limit(cont, val); + ret = memcg_update_kmem_limit(css, val); else return -EINVAL; break; @@ -5297,9 +5298,9 @@ out: *memsw_limit = min_memsw_limit; } -static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) +static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); int name; enum res_type type; @@ -5332,17 +5333,17 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) return 0; } -static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, +static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; + return mem_cgroup_from_css(css)->move_charge_at_immigrate; } #ifdef CONFIG_MMU -static int mem_cgroup_move_charge_write(struct cgroup *cgrp, +static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); if (val >= (1 << NR_MOVE_TYPE)) return -EINVAL; @@ -5357,7 +5358,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, return 0; } #else -static int mem_cgroup_move_charge_write(struct cgroup *cgrp, +static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { return -ENOSYS; @@ -5365,13 +5366,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, #endif #ifdef CONFIG_NUMA -static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, - struct seq_file *m) +static int memcg_numa_stat_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { int nid; unsigned long total_nr, file_nr, anon_nr, unevictable_nr; unsigned long node_nr; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); seq_printf(m, "total=%lu", total_nr); @@ -5416,10 +5417,10 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); } -static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, +static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *mi; unsigned int i; @@ -5503,17 +5504,18 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, return 0; } -static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) +static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); return mem_cgroup_swappiness(memcg); } -static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, - u64 val) +static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); if (val > 100 || !parent) @@ -5829,10 +5831,10 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, spin_unlock(&memcg_oom_lock); } -static int mem_cgroup_oom_control_read(struct cgroup *cgrp, +static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); @@ -5843,10 +5845,10 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp, return 0; } -static int mem_cgroup_oom_control_write(struct cgroup *cgrp, +static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); /* cannot set to root cgroup and only 0 and 1 are allowed */ -- cgit v1.2.3 From 492eb21b98f88e411a8bb43d6edcd7d7022add10 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:25 -0400 Subject: cgroup: make hierarchy iterators deal with cgroup_subsys_state instead of cgroup cgroup is currently in the process of transitioning to using css (cgroup_subsys_state) as the primary handle instead of cgroup in subsystem API. For hierarchy iterators, this is beneficial because * In most cases, css is the only thing subsystems care about anyway. * On the planned unified hierarchy, iterations for different subsystems will need to skip over different subtrees of the hierarchy depending on which subsystems are enabled on each cgroup. Passing around css makes it unnecessary to explicitly specify the subsystem in question as css is intersection between cgroup and subsystem * For the planned unified hierarchy, css's would need to be created and destroyed dynamically independent from cgroup hierarchy. Having cgroup core manage css iteration makes enforcing deref rules a lot easier. Most subsystem conversions are straight-forward. Noteworthy changes are * blkio: cgroup_to_blkcg() is no longer used. Removed. * freezer: cgroup_freezer() is no longer used. Removed. * devices: cgroup_to_devcgroup() is no longer used. Removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe --- mm/memcontrol.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab64dfc84f8c..2285319e23a9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1082,7 +1082,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, struct mem_cgroup *last_visited) { - struct cgroup *prev_cgroup, *next_cgroup; + struct cgroup_subsys_state *prev_css, *next_css; /* * Root is not visited by cgroup iterators so it needs an @@ -1091,11 +1091,9 @@ static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, if (!last_visited) return root; - prev_cgroup = (last_visited == root) ? NULL - : last_visited->css.cgroup; + prev_css = (last_visited == root) ? NULL : &last_visited->css; skip_node: - next_cgroup = cgroup_next_descendant_pre( - prev_cgroup, root->css.cgroup); + next_css = css_next_descendant_pre(prev_css, &root->css); /* * Even if we found a group we have to make sure it is @@ -1104,13 +1102,13 @@ skip_node: * last_visited css is safe to use because it is * protected by css_get and the tree walk is rcu safe. */ - if (next_cgroup) { - struct mem_cgroup *mem = mem_cgroup_from_cont( - next_cgroup); + if (next_css) { + struct mem_cgroup *mem = mem_cgroup_from_css(next_css); + if (css_tryget(&mem->css)) return mem; else { - prev_cgroup = next_cgroup; + prev_css = next_css; goto skip_node; } } @@ -4939,10 +4937,10 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) */ static inline bool __memcg_has_children(struct mem_cgroup *memcg) { - struct cgroup *pos; + struct cgroup_subsys_state *pos; /* bounce at first found */ - cgroup_for_each_child(pos, memcg->css.cgroup) + css_for_each_child(pos, &memcg->css) return true; return false; } -- cgit v1.2.3 From 0942eeeef68f9493c1bcb1a52baf612b73fcf9fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: rename cgroup_iter to cgroup_task_iter cgroup now has multiple iterators and it's quite confusing to have something which walks over tasks of a single cgroup named cgroup_iter. Let's rename it to cgroup_task_iter. While at it, reformat / update comments and replace the overview comment above the interface function decls with proper function comments. Such overview can be useful but function comments should be more than enough here. This is pure rename and doesn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh --- mm/memcontrol.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2285319e23a9..00b055dd1d3d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1800,11 +1800,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct cgroup *cgroup = iter->css.cgroup; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *task; - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { + cgroup_task_iter_start(cgroup, &it); + while ((task = cgroup_task_iter_next(cgroup, &it))) { switch (oom_scan_process_thread(task, totalpages, NULL, false)) { case OOM_SCAN_SELECT: @@ -1817,7 +1817,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, case OOM_SCAN_CONTINUE: continue; case OOM_SCAN_ABORT: - cgroup_iter_end(cgroup, &it); + cgroup_task_iter_end(cgroup, &it); mem_cgroup_iter_break(memcg, iter); if (chosen) put_task_struct(chosen); @@ -1834,7 +1834,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, get_task_struct(chosen); } } - cgroup_iter_end(cgroup, &it); + cgroup_task_iter_end(cgroup, &it); } if (!chosen) -- cgit v1.2.3 From c59cd3d840b1b0a8f996cbbd9132128dcaabbeb9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: make cgroup_task_iter remember the cgroup being iterated Currently all cgroup_task_iter functions require @cgrp to be passed in, which is superflous and increases chance of usage error. Make cgroup_task_iter remember the cgroup being iterated and drop @cgrp argument from next and end functions. This patch doesn't introduce any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00b055dd1d3d..5a5f4dc649f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1804,7 +1804,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, struct task_struct *task; cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(cgroup, &it))) { + while ((task = cgroup_task_iter_next(&it))) { switch (oom_scan_process_thread(task, totalpages, NULL, false)) { case OOM_SCAN_SELECT: @@ -1817,7 +1817,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, case OOM_SCAN_CONTINUE: continue; case OOM_SCAN_ABORT: - cgroup_task_iter_end(cgroup, &it); + cgroup_task_iter_end(&it); mem_cgroup_iter_break(memcg, iter); if (chosen) put_task_struct(chosen); @@ -1834,7 +1834,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, get_task_struct(chosen); } } - cgroup_task_iter_end(cgroup, &it); + cgroup_task_iter_end(&it); } if (!chosen) -- cgit v1.2.3 From 72ec7029937f0518eff21b8762743c31591684f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: make task iterators deal with cgroup_subsys_state instead of cgroup cgroup is in the process of converting to css (cgroup_subsys_state) from cgroup as the principal subsystem interface handle. This is mostly to prepare for the unified hierarchy support where css's will be created and destroyed dynamically but also helps cleaning up subsystem implementations as css is usually what they are interested in anyway. This patch converts task iterators to deal with css instead of cgroup. Note that under unified hierarchy, different sets of tasks will be considered belonging to a given cgroup depending on the subsystem in question and making the iterators deal with css instead cgroup provides them with enough information about the iteration. While at it, fix several function comment formats in cpuset.c. This patch doesn't introduce any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley --- mm/memcontrol.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5a5f4dc649f0..95106a993777 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1799,12 +1799,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; for_each_mem_cgroup_tree(iter, memcg) { - struct cgroup *cgroup = iter->css.cgroup; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *task; - cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(&it))) { + css_task_iter_start(&iter->css, &it); + while ((task = css_task_iter_next(&it))) { switch (oom_scan_process_thread(task, totalpages, NULL, false)) { case OOM_SCAN_SELECT: @@ -1817,7 +1816,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, case OOM_SCAN_CONTINUE: continue; case OOM_SCAN_ABORT: - cgroup_task_iter_end(&it); + css_task_iter_end(&it); mem_cgroup_iter_break(memcg, iter); if (chosen) put_task_struct(chosen); @@ -1834,7 +1833,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, get_task_struct(chosen); } } - cgroup_task_iter_end(&it); + css_task_iter_end(&it); } if (!chosen) -- cgit v1.2.3 From 81eeaf0411204f52af8ef78ff107cfca2fcfec1d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: make cftype->[un]register_event() deal with cgroup_subsys_state instead of cgroup cgroup is in the process of converting to css (cgroup_subsys_state) from cgroup as the principal subsystem interface handle. This is mostly to prepare for the unified hierarchy support where css's will be created and destroyed dynamically but also helps cleaning up subsystem implementations as css is usually what they are interested in anyway. cftype->[un]register_event() is among the remaining couple interfaces which still use struct cgroup. Convert it to cgroup_subsys_state. The conversion is mostly mechanical and removes the last users of mem_cgroup_from_cont() and cg_to_vmpressure(), which are removed. v2: indentation update as suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh --- mm/memcontrol.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 95106a993777..2885e3e85047 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1034,11 +1034,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); } -static inline struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) -{ - return mem_cgroup_from_css(cgroup_css(cont, mem_cgroup_subsys_id)); -} - struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { /* @@ -5620,10 +5615,10 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) mem_cgroup_oom_notify_cb(iter); } -static int mem_cgroup_usage_register_event(struct cgroup *cgrp, +static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; enum res_type type = MEMFILE_TYPE(cft->private); @@ -5703,10 +5698,10 @@ unlock: return ret; } -static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, +static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; enum res_type type = MEMFILE_TYPE(cft->private); @@ -5782,10 +5777,10 @@ unlock: mutex_unlock(&memcg->thresholds_lock); } -static int mem_cgroup_oom_register_event(struct cgroup *cgrp, +static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *event; enum res_type type = MEMFILE_TYPE(cft->private); @@ -5807,10 +5802,10 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, return 0; } -static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, +static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *ev, *tmp; enum res_type type = MEMFILE_TYPE(cft->private); -- cgit v1.2.3 From bd8815a6d802fc16a7a106e170593aa05dc17e72 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:27 -0400 Subject: cgroup: make css_for_each_descendant() and friends include the origin css in the iteration Previously, all css descendant iterators didn't include the origin (root of subtree) css in the iteration. The reasons were maintaining consistency with css_for_each_child() and that at the time of introduction more use cases needed skipping the origin anyway; however, given that css_is_descendant() considers self to be a descendant, omitting the origin css has become more confusing and looking at the accumulated use cases rather clearly indicates that including origin would result in simpler code overall. While this is a change which can easily lead to subtle bugs, cgroup API including the iterators has recently gone through major restructuring and no out-of-tree changes will be applicable without adjustments making this a relatively acceptable opportunity for this type of change. The conversions are mostly straight-forward. If the iteration block had explicit origin handling before or after, it's moved inside the iteration. If not, if (pos == origin) continue; is added. Some conversions add extra reference get/put around origin handling by consolidating origin handling and the rest. While the extra ref operations aren't strictly necessary, this shouldn't cause any noticeable difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Michal Hocko Cc: Jens Axboe Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh --- mm/memcontrol.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2885e3e85047..b89d4cbc0c08 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1079,14 +1079,7 @@ static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, { struct cgroup_subsys_state *prev_css, *next_css; - /* - * Root is not visited by cgroup iterators so it needs an - * explicit visit. - */ - if (!last_visited) - return root; - - prev_css = (last_visited == root) ? NULL : &last_visited->css; + prev_css = last_visited ? &last_visited->css : NULL; skip_node: next_css = css_next_descendant_pre(prev_css, &root->css); -- cgit v1.2.3 From 90c7a79cc45becc6cdb8c026d55ace19e299a02d Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Wed, 11 Sep 2013 14:22:18 -0700 Subject: kmemcg: don't allocate extra memory for root memcg_cache_params The memcg_cache_params structure contains the common part and the union, which represents two different types of data: one for root cashes and another for child caches. The size of child data is fixed. The size of the memcg_caches array is calculated in runtime. Currently the size of memcg_cache_params for root caches is calculated incorrectly, because it includes the size of parameters for child caches. ssize_t size = memcg_caches_array_size(num_groups); size *= sizeof(void *); size += sizeof(struct memcg_cache_params); v2: Fix a typo in calculations Signed-off-by: Andrey Vagin Cc: Glauber Costa Cc: Johannes Weiner Cc: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3b83957b6439..5ca1dcf77ce9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3121,7 +3121,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) ssize_t size = memcg_caches_array_size(num_groups); size *= sizeof(void *); - size += sizeof(struct memcg_cache_params); + size += offsetof(struct memcg_cache_params, memcg_caches); s->memcg_params = kzalloc(size, GFP_KERNEL); if (!s->memcg_params) { @@ -3164,13 +3164,16 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { - size_t size = sizeof(struct memcg_cache_params); + size_t size; if (!memcg_kmem_enabled()) return 0; - if (!memcg) + if (!memcg) { + size = offsetof(struct memcg_cache_params, memcg_caches); size += memcg_limited_groups_array_size * sizeof(void *); + } else + size = sizeof(struct memcg_cache_params); s->memcg_params = kzalloc(size, GFP_KERNEL); if (!s->memcg_params) -- cgit v1.2.3 From 2bff24a3707093c435ab3241c47dcdb5f16e432b Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Wed, 11 Sep 2013 14:23:08 -0700 Subject: memcg: fix multiple large threshold notifications A memory cgroup with (1) multiple threshold notifications and (2) at least one threshold >=2G was not reliable. Specifically the notifications would either not fire or would not fire in the proper order. The __mem_cgroup_threshold() signaling logic depends on keeping 64 bit thresholds in sorted order. mem_cgroup_usage_register_event() sorts them with compare_thresholds(), which returns the difference of two 64 bit thresholds as an int. If the difference is positive but has bit[31] set, then sort() treats the difference as negative and breaks sort order. This fix compares the two arbitrary 64 bit thresholds returning the classic -1, 0, 1 result. The test below sets two notifications (at 0x1000 and 0x81001000): cd /sys/fs/cgroup/memory mkdir x for x in 4096 2164264960; do cgroup_event_listener x/memory.usage_in_bytes $x | sed "s/^/$x listener:/" & done echo $$ > x/cgroup.procs anon_leaker 500M v3.11-rc7 fails to signal the 4096 event listener: Leaking... Done leaking pages. Patched v3.11-rc7 properly notifies: Leaking... 4096 listener:2013:8:31:14:13:36 Done leaking pages. The fixed bug is old. It appears to date back to the introduction of memcg threshold notifications in v2.6.34-rc1-116-g2e72b6347c94 "memcg: implement memory thresholds" Signed-off-by: Greg Thelen Acked-by: Michal Hocko Acked-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5ca1dcf77ce9..c6bd28edd533 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5591,7 +5591,13 @@ static int compare_thresholds(const void *a, const void *b) const struct mem_cgroup_threshold *_a = a; const struct mem_cgroup_threshold *_b = b; - return _a->threshold - _b->threshold; + if (_a->threshold > _b->threshold) + return 1; + + if (_a->threshold < _b->threshold) + return -1; + + return 0; } static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) -- cgit v1.2.3 From c33bd8354f3a3bb26a98d2b6bf600b7b35657328 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 12 Sep 2013 15:13:19 -0700 Subject: memcg: remove redundant code in mem_cgroup_force_empty_write() vfs guarantees the cgroup won't be destroyed, so it's redundant to get a css reference. Signed-off-by: Li Zefan Acked-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c6bd28edd533..fe6b9f96abdd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4990,18 +4990,12 @@ static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, unsigned int event) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - int ret; if (mem_cgroup_is_root(memcg)) return -EINVAL; - css_get(&memcg->css); - ret = mem_cgroup_force_empty(memcg); - css_put(&memcg->css); - - return ret; + return mem_cgroup_force_empty(memcg); } - static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, struct cftype *cft) { -- cgit v1.2.3 From 3b38722efd9f66da63bbbd41520c2e6fa9db3d68 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Sep 2013 15:13:21 -0700 Subject: memcg, vmscan: integrate soft reclaim tighter with zone shrinking code This patchset is sitting out of tree for quite some time without any objections. I would be really happy if it made it into 3.12. I do not want to push it too hard but I think this work is basically ready and waiting more doesn't help. The basic idea is quite simple. Pull soft reclaim into shrink_zone in the first step and get rid of the previous soft reclaim infrastructure. shrink_zone is done in two passes now. First it tries to do the soft limit reclaim and it falls back to reclaim-all mode if no group is over the limit or no pages have been scanned. The second pass happens at the same priority so the only time we waste is the memcg tree walk which has been updated in the third step to have only negligible overhead. As a bonus we will get rid of a _lot_ of code by this and soft reclaim will not stand out like before when it wasn't integrated into the zone shrinking code and it reclaimed at priority 0 (the testing results show that some workloads suffers from such an aggressive reclaim). The clean up is in a separate patch because I felt it would be easier to review that way. The second step is soft limit reclaim integration into targeted reclaim. It should be rather straight forward. Soft limit has been used only for the global reclaim so far but it makes sense for any kind of pressure coming from up-the-hierarchy, including targeted reclaim. The third step (patches 4-8) addresses the tree walk overhead by enhancing memcg iterators to enable skipping whole subtrees and tracking number of over soft limit children at each level of the hierarchy. This information is updated same way the old soft limit tree was updated (from memcg_check_events) so we shouldn't see an additional overhead. In fact mem_cgroup_update_soft_limit is much simpler than tree manipulation done previously. __shrink_zone uses mem_cgroup_soft_reclaim_eligible as a predicate for mem_cgroup_iter so the decision whether a particular group should be visited is done at the iterator level which allows us to decide to skip the whole subtree as well (if there is no child in excess). This reduces the tree walk overhead considerably. * TEST 1 ======== My primary test case was a parallel kernel build with 2 groups (make is running with -j8 with a distribution .config in a separate cgroup without any hard limit) on a 32 CPU machine booted with 1GB memory and both builds run taskset to Node 0 cpus. I was mostly interested in 2 setups. Default - no soft limit set and - and 0 soft limit set to both groups. The first one should tell us whether the rework regresses the default behavior while the second one should show us improvements in an extreme case where both workloads are always over the soft limit. /usr/bin/time -v has been used to collect the statistics and each configuration had 3 runs after fresh boot without any other load on the system. base is mmotm-2013-07-18-16-40 rework all 8 patches applied on top of base * No-limit User no-limit/base: min: 651.92 max: 672.65 avg: 664.33 std: 8.01 runs: 6 no-limit/rework: min: 657.34 [100.8%] max: 668.39 [99.4%] avg: 663.13 [99.8%] std: 3.61 runs: 6 System no-limit/base: min: 69.33 max: 71.39 avg: 70.32 std: 0.79 runs: 6 no-limit/rework: min: 69.12 [99.7%] max: 71.05 [99.5%] avg: 70.04 [99.6%] std: 0.59 runs: 6 Elapsed no-limit/base: min: 398.27 max: 422.36 avg: 408.85 std: 7.74 runs: 6 no-limit/rework: min: 386.36 [97.0%] max: 438.40 [103.8%] avg: 416.34 [101.8%] std: 18.85 runs: 6 The results are within noise. Elapsed time has a bigger variance but the average looks good. * 0-limit User 0-limit/base: min: 573.76 max: 605.63 avg: 585.73 std: 12.21 runs: 6 0-limit/rework: min: 645.77 [112.6%] max: 666.25 [110.0%] avg: 656.97 [112.2%] std: 7.77 runs: 6 System 0-limit/base: min: 69.57 max: 71.13 avg: 70.29 std: 0.54 runs: 6 0-limit/rework: min: 68.68 [98.7%] max: 71.40 [100.4%] avg: 69.91 [99.5%] std: 0.87 runs: 6 Elapsed 0-limit/base: min: 1306.14 max: 1550.17 avg: 1430.35 std: 90.86 runs: 6 0-limit/rework: min: 404.06 [30.9%] max: 465.94 [30.1%] avg: 434.81 [30.4%] std: 22.68 runs: 6 The improvement is really huge here (even bigger than with my previous testing and I suspect that this highly depends on the storage). Page fault statistics tell us at least part of the story: Minor 0-limit/base: min: 37180461.00 max: 37319986.00 avg: 37247470.00 std: 54772.71 runs: 6 0-limit/rework: min: 36751685.00 [98.8%] max: 36805379.00 [98.6%] avg: 36774506.33 [98.7%] std: 17109.03 runs: 6 Major 0-limit/base: min: 170604.00 max: 221141.00 avg: 196081.83 std: 18217.01 runs: 6 0-limit/rework: min: 2864.00 [1.7%] max: 10029.00 [4.5%] avg: 5627.33 [2.9%] std: 2252.71 runs: 6 Same as with my previous testing Minor faults are more or less within noise but Major fault count is way bellow the base kernel. While this looks as a nice win it is fair to say that 0-limit configuration is quite artificial. So I was playing with 0-no-limit loads as well. * TEST 2 ======== The following results are from 2 groups configuration on a 16GB machine (single NUMA node). - A running stream IO (dd if=/dev/zero of=local.file bs=1024) with 2*TotalMem with 0 soft limit. - B running a mem_eater which consumes TotalMem-1G without any limit. The mem_eater consumes the memory in 100 chunks with 1s nap after each mmap+poppulate so that both loads have chance to fight for the memory. The expected result is that B shouldn't be reclaimed and A shouldn't see a big dropdown in elapsed time. User base: min: 2.68 max: 2.89 avg: 2.76 std: 0.09 runs: 3 rework: min: 3.27 [122.0%] max: 3.74 [129.4%] avg: 3.44 [124.6%] std: 0.21 runs: 3 System base: min: 86.26 max: 88.29 avg: 87.28 std: 0.83 runs: 3 rework: min: 81.05 [94.0%] max: 84.96 [96.2%] avg: 83.14 [95.3%] std: 1.61 runs: 3 Elapsed base: min: 317.28 max: 332.39 avg: 325.84 std: 6.33 runs: 3 rework: min: 281.53 [88.7%] max: 298.16 [89.7%] avg: 290.99 [89.3%] std: 6.98 runs: 3 System time improved slightly as well as Elapsed. My previous testing has shown worse numbers but this again seem to depend on the storage speed. My theory is that the writeback doesn't catch up and prio-0 soft reclaim falls into wait on writeback page too often in the base kernel. The patched kernel doesn't do that because the soft reclaim is done from the kswapd/direct reclaim context. This can be seen on the following graph nicely. The A's group usage_in_bytes regurarly drops really low very often. All 3 runs http://labs.suse.cz/mhocko/soft_limit_rework/stream_io-vs-mem_eater/stream.png resp. a detail of the single run http://labs.suse.cz/mhocko/soft_limit_rework/stream_io-vs-mem_eater/stream-one-run.png mem_eater seems to be doing better as well. It gets to the full allocation size faster as can be seen on the following graph: http://labs.suse.cz/mhocko/soft_limit_rework/stream_io-vs-mem_eater/mem_eater-one-run.png /proc/meminfo collected during the test also shows that rework kernel hasn't swapped that much (well almost not at all): base: max: 123900 K avg: 56388.29 K rework: max: 300 K avg: 128.68 K kswapd and direct reclaim statistics are of no use unfortunatelly because soft reclaim is not accounted properly as the counters are hidden by global_reclaim() checks in the base kernel. * TEST 3 ======== Another test was the same configuration as TEST2 except the stream IO was replaced by a single kbuild (16 parallel jobs bound to Node0 cpus same as in TEST1) and mem_eater allocated TotalMem-200M so kbuild had only 200MB left. Kbuild did better with the rework kernel here as well: User base: min: 860.28 max: 872.86 avg: 868.03 std: 5.54 runs: 3 rework: min: 880.81 [102.4%] max: 887.45 [101.7%] avg: 883.56 [101.8%] std: 2.83 runs: 3 System base: min: 84.35 max: 85.06 avg: 84.79 std: 0.31 runs: 3 rework: min: 85.62 [101.5%] max: 86.09 [101.2%] avg: 85.79 [101.2%] std: 0.21 runs: 3 Elapsed base: min: 135.36 max: 243.30 avg: 182.47 std: 45.12 runs: 3 rework: min: 110.46 [81.6%] max: 116.20 [47.8%] avg: 114.15 [62.6%] std: 2.61 runs: 3 Minor base: min: 36635476.00 max: 36673365.00 avg: 36654812.00 std: 15478.03 runs: 3 rework: min: 36639301.00 [100.0%] max: 36695541.00 [100.1%] avg: 36665511.00 [100.0%] std: 23118.23 runs: 3 Major base: min: 14708.00 max: 53328.00 avg: 31379.00 std: 16202.24 runs: 3 rework: min: 302.00 [2.1%] max: 414.00 [0.8%] avg: 366.33 [1.2%] std: 47.22 runs: 3 Again we can see a significant improvement in Elapsed (it also seems to be more stable), there is a huge dropdown for the Major page faults and much more swapping: base: max: 583736 K avg: 112547.43 K rework: max: 4012 K avg: 124.36 K Graphs from all three runs show the variability of the kbuild quite nicely. It even seems that it took longer after every run with the base kernel which would be quite surprising as the source tree for the build is removed and caches are dropped after each run so the build operates on a freshly extracted sources everytime. http://labs.suse.cz/mhocko/soft_limit_rework/stream_io-vs-mem_eater/kbuild-mem_eater.png My other testing shows that this is just a matter of timing and other runs behave differently the std for Elapsed time is similar ~50. Example of other three runs: http://labs.suse.cz/mhocko/soft_limit_rework/stream_io-vs-mem_eater/kbuild-mem_eater2.png So to wrap this up. The series is still doing good and improves the soft limit. The testing results for bunch of cgroups with both stream IO and kbuild loads can be found in "memcg: track children in soft limit excess to improve soft limit". This patch: Memcg soft reclaim has been traditionally triggered from the global reclaim paths before calling shrink_zone. mem_cgroup_soft_limit_reclaim then picked up a group which exceeds the soft limit the most and reclaimed it with 0 priority to reclaim at least SWAP_CLUSTER_MAX pages. The infrastructure requires per-node-zone trees which hold over-limit groups and keep them up-to-date (via memcg_check_events) which is not cost free. Although this overhead hasn't turned out to be a bottle neck the implementation is suboptimal because mem_cgroup_update_tree has no idea which zones consumed memory over the limit so we could easily end up having a group on a node-zone tree having only few pages from that node-zone. This patch doesn't try to fix node-zone trees management because it seems that integrating soft reclaim into zone shrinking sounds much easier and more appropriate for several reasons. First of all 0 priority reclaim was a crude hack which might lead to big stalls if the group's LRUs are big and hard to reclaim (e.g. a lot of dirty/writeback pages). Soft reclaim should be applicable also to the targeted reclaim which is awkward right now without additional hacks. Last but not least the whole infrastructure eats quite some code. After this patch shrink_zone is done in 2 passes. First it tries to do the soft reclaim if appropriate (only for global reclaim for now to keep compatible with the original state) and fall back to ignoring soft limit if no group is eligible to soft reclaim or nothing has been scanned during the first pass. Only groups which are over their soft limit or any of their parents up the hierarchy is over the limit are considered eligible during the first pass. Soft limit tree which is not necessary anymore will be removed in the follow up patch to make this patch smaller and easier to review. Signed-off-by: Michal Hocko Reviewed-by: Glauber Costa Reviewed-by: Tejun Heo Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Ying Han Cc: Hugh Dickins Cc: Michel Lespinasse Cc: Greg Thelen Cc: KOSAKI Motohiro Cc: Balbir Singh Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 163 ++++++++------------------------------------------------ 1 file changed, 21 insertions(+), 142 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe6b9f96abdd..6c32271a31c5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2004,57 +2004,28 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) } #endif -static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, - struct zone *zone, - gfp_t gfp_mask, - unsigned long *total_scanned) -{ - struct mem_cgroup *victim = NULL; - int total = 0; - int loop = 0; - unsigned long excess; - unsigned long nr_scanned; - struct mem_cgroup_reclaim_cookie reclaim = { - .zone = zone, - .priority = 0, - }; - - excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; - - while (1) { - victim = mem_cgroup_iter(root_memcg, victim, &reclaim); - if (!victim) { - loop++; - if (loop >= 2) { - /* - * If we have not been able to reclaim - * anything, it might because there are - * no reclaimable pages under this hierarchy - */ - if (!total) - break; - /* - * We want to do more targeted reclaim. - * excess >> 2 is not to excessive so as to - * reclaim too much, nor too less that we keep - * coming back to reclaim from this cgroup - */ - if (total >= (excess >> 2) || - (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) - break; - } - continue; - } - if (!mem_cgroup_reclaimable(victim, false)) - continue; - total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, - zone, &nr_scanned); - *total_scanned += nr_scanned; - if (!res_counter_soft_limit_excess(&root_memcg->res)) - break; +/* + * A group is eligible for the soft limit reclaim if it is + * a) is over its soft limit + * b) any parent up the hierarchy is over its soft limit + */ +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg) +{ + struct mem_cgroup *parent = memcg; + + if (res_counter_soft_limit_excess(&memcg->res)) + return true; + + /* + * If any parent up the hierarchy is over its soft limit then we + * have to obey and reclaim from this group as well. + */ + while((parent = parent_mem_cgroup(parent))) { + if (res_counter_soft_limit_excess(&parent->res)) + return true; } - mem_cgroup_iter_break(root_memcg, victim); - return total; + + return false; } /* @@ -4727,98 +4698,6 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, return ret; } -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, - gfp_t gfp_mask, - unsigned long *total_scanned) -{ - unsigned long nr_reclaimed = 0; - struct mem_cgroup_per_zone *mz, *next_mz = NULL; - unsigned long reclaimed; - int loop = 0; - struct mem_cgroup_tree_per_zone *mctz; - unsigned long long excess; - unsigned long nr_scanned; - - if (order > 0) - return 0; - - mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); - /* - * This loop can run a while, specially if mem_cgroup's continuously - * keep exceeding their soft limit and putting the system under - * pressure - */ - do { - if (next_mz) - mz = next_mz; - else - mz = mem_cgroup_largest_soft_limit_node(mctz); - if (!mz) - break; - - nr_scanned = 0; - reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, - gfp_mask, &nr_scanned); - nr_reclaimed += reclaimed; - *total_scanned += nr_scanned; - spin_lock(&mctz->lock); - - /* - * If we failed to reclaim anything from this memory cgroup - * it is time to move on to the next cgroup - */ - next_mz = NULL; - if (!reclaimed) { - do { - /* - * Loop until we find yet another one. - * - * By the time we get the soft_limit lock - * again, someone might have aded the - * group back on the RB tree. Iterate to - * make sure we get a different mem. - * mem_cgroup_largest_soft_limit_node returns - * NULL if no other cgroup is present on - * the tree - */ - next_mz = - __mem_cgroup_largest_soft_limit_node(mctz); - if (next_mz == mz) - css_put(&next_mz->memcg->css); - else /* next_mz == NULL or other memcg */ - break; - } while (1); - } - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); - excess = res_counter_soft_limit_excess(&mz->memcg->res); - /* - * One school of thought says that we should not add - * back the node to the tree if reclaim returns 0. - * But our reclaim could return 0, simply because due - * to priority we are exposing a smaller subset of - * memory to reclaim from. Consider this as a longer - * term TODO. - */ - /* If excess == 0, no tree ops */ - __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); - spin_unlock(&mctz->lock); - css_put(&mz->memcg->css); - loop++; - /* - * Could not reclaim anything and there are no more - * mem cgroups to try or we seem to be looping without - * reclaiming anything. - */ - if (!nr_reclaimed && - (next_mz == NULL || - loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) - break; - } while (!nr_reclaimed); - if (next_mz) - css_put(&next_mz->memcg->css); - return nr_reclaimed; -} - /** * mem_cgroup_force_empty_list - clears LRU of a group * @memcg: group to clear -- cgit v1.2.3 From e883110aad718b65de658db77387aaa69cce996d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Sep 2013 15:13:23 -0700 Subject: memcg: get rid of soft-limit tree infrastructure Now that the soft limit is integrated to the reclaim directly the whole soft-limit tree infrastructure is not needed anymore. Rip it out. Signed-off-by: Michal Hocko Reviewed-by: Glauber Costa Reviewed-by: Tejun Heo Cc: Balbir Singh Cc: Greg Thelen Cc: Hugh Dickins Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Michel Lespinasse Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 265 +------------------------------------------------------- 1 file changed, 2 insertions(+), 263 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6c32271a31c5..87a448dd9c10 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include @@ -139,7 +138,6 @@ static const char * const mem_cgroup_lru_names[] = { */ enum mem_cgroup_events_target { MEM_CGROUP_TARGET_THRESH, - MEM_CGROUP_TARGET_SOFTLIMIT, MEM_CGROUP_TARGET_NUMAINFO, MEM_CGROUP_NTARGETS, }; @@ -175,10 +173,6 @@ struct mem_cgroup_per_zone { struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; - struct rb_node tree_node; /* RB tree node */ - unsigned long long usage_in_excess;/* Set to the value by which */ - /* the soft limit is exceeded*/ - bool on_tree; struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; @@ -187,26 +181,6 @@ struct mem_cgroup_per_node { struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; }; -/* - * Cgroups above their limits are maintained in a RB-Tree, independent of - * their hierarchy representation - */ - -struct mem_cgroup_tree_per_zone { - struct rb_root rb_root; - spinlock_t lock; -}; - -struct mem_cgroup_tree_per_node { - struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; -}; - -struct mem_cgroup_tree { - struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; -}; - -static struct mem_cgroup_tree soft_limit_tree __read_mostly; - struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; u64 threshold; @@ -444,7 +418,6 @@ static bool move_file(void) * limit reclaim to prevent infinite loops, if they ever occur. */ #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 -#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, @@ -671,164 +644,6 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) return mem_cgroup_zoneinfo(memcg, nid, zid); } -static struct mem_cgroup_tree_per_zone * -soft_limit_tree_node_zone(int nid, int zid) -{ - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; -} - -static struct mem_cgroup_tree_per_zone * -soft_limit_tree_from_page(struct page *page) -{ - int nid = page_to_nid(page); - int zid = page_zonenum(page); - - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; -} - -static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz, - unsigned long long new_usage_in_excess) -{ - struct rb_node **p = &mctz->rb_root.rb_node; - struct rb_node *parent = NULL; - struct mem_cgroup_per_zone *mz_node; - - if (mz->on_tree) - return; - - mz->usage_in_excess = new_usage_in_excess; - if (!mz->usage_in_excess) - return; - while (*p) { - parent = *p; - mz_node = rb_entry(parent, struct mem_cgroup_per_zone, - tree_node); - if (mz->usage_in_excess < mz_node->usage_in_excess) - p = &(*p)->rb_left; - /* - * We can't avoid mem cgroups that are over their soft - * limit by the same amount - */ - else if (mz->usage_in_excess >= mz_node->usage_in_excess) - p = &(*p)->rb_right; - } - rb_link_node(&mz->tree_node, parent, p); - rb_insert_color(&mz->tree_node, &mctz->rb_root); - mz->on_tree = true; -} - -static void -__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) -{ - if (!mz->on_tree) - return; - rb_erase(&mz->tree_node, &mctz->rb_root); - mz->on_tree = false; -} - -static void -mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) -{ - spin_lock(&mctz->lock); - __mem_cgroup_remove_exceeded(memcg, mz, mctz); - spin_unlock(&mctz->lock); -} - - -static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) -{ - unsigned long long excess; - struct mem_cgroup_per_zone *mz; - struct mem_cgroup_tree_per_zone *mctz; - int nid = page_to_nid(page); - int zid = page_zonenum(page); - mctz = soft_limit_tree_from_page(page); - - /* - * Necessary to update all ancestors when hierarchy is used. - * because their event counter is not touched. - */ - for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = mem_cgroup_zoneinfo(memcg, nid, zid); - excess = res_counter_soft_limit_excess(&memcg->res); - /* - * We have to update the tree if mz is on RB-tree or - * mem is over its softlimit. - */ - if (excess || mz->on_tree) { - spin_lock(&mctz->lock); - /* if on-tree, remove it */ - if (mz->on_tree) - __mem_cgroup_remove_exceeded(memcg, mz, mctz); - /* - * Insert again. mz->usage_in_excess will be updated. - * If excess is 0, no tree ops. - */ - __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); - spin_unlock(&mctz->lock); - } - } -} - -static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) -{ - int node, zone; - struct mem_cgroup_per_zone *mz; - struct mem_cgroup_tree_per_zone *mctz; - - for_each_node(node) { - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = mem_cgroup_zoneinfo(memcg, node, zone); - mctz = soft_limit_tree_node_zone(node, zone); - mem_cgroup_remove_exceeded(memcg, mz, mctz); - } - } -} - -static struct mem_cgroup_per_zone * -__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) -{ - struct rb_node *rightmost = NULL; - struct mem_cgroup_per_zone *mz; - -retry: - mz = NULL; - rightmost = rb_last(&mctz->rb_root); - if (!rightmost) - goto done; /* Nothing to reclaim from */ - - mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); - /* - * Remove the node now but someone else can add it back, - * we will to add it back at the end of reclaim to its correct - * position in the tree. - */ - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); - if (!res_counter_soft_limit_excess(&mz->memcg->res) || - !css_tryget(&mz->memcg->css)) - goto retry; -done: - return mz; -} - -static struct mem_cgroup_per_zone * -mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) -{ - struct mem_cgroup_per_zone *mz; - - spin_lock(&mctz->lock); - mz = __mem_cgroup_largest_soft_limit_node(mctz); - spin_unlock(&mctz->lock); - return mz; -} - /* * Implementation Note: reading percpu statistics for memcg. * @@ -987,9 +802,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, case MEM_CGROUP_TARGET_THRESH: next = val + THRESHOLDS_EVENTS_TARGET; break; - case MEM_CGROUP_TARGET_SOFTLIMIT: - next = val + SOFTLIMIT_EVENTS_TARGET; - break; case MEM_CGROUP_TARGET_NUMAINFO: next = val + NUMAINFO_EVENTS_TARGET; break; @@ -1012,11 +824,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { - bool do_softlimit; bool do_numainfo __maybe_unused; - do_softlimit = mem_cgroup_event_ratelimit(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT); #if MAX_NUMNODES > 1 do_numainfo = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_NUMAINFO); @@ -1024,8 +833,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); mem_cgroup_threshold(memcg); - if (unlikely(do_softlimit)) - mem_cgroup_update_tree(memcg, page); #if MAX_NUMNODES > 1 if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); @@ -1867,6 +1674,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, return total; } +#if MAX_NUMNODES > 1 /** * test_mem_cgroup_node_reclaimable * @memcg: the target memcg @@ -1889,7 +1697,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, return false; } -#if MAX_NUMNODES > 1 /* * Always updating the nodemask is not very good - even if we have an empty @@ -1957,51 +1764,12 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) return node; } -/* - * Check all nodes whether it contains reclaimable pages or not. - * For quick scan, we make use of scan_nodes. This will allow us to skip - * unused nodes. But scan_nodes is lazily updated and may not cotain - * enough new information. We need to do double check. - */ -static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) -{ - int nid; - - /* - * quick check...making use of scan_node. - * We can skip unused nodes. - */ - if (!nodes_empty(memcg->scan_nodes)) { - for (nid = first_node(memcg->scan_nodes); - nid < MAX_NUMNODES; - nid = next_node(nid, memcg->scan_nodes)) { - - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) - return true; - } - } - /* - * Check rest of nodes. - */ - for_each_node_state(nid, N_MEMORY) { - if (node_isset(nid, memcg->scan_nodes)) - continue; - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) - return true; - } - return false; -} - #else int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) { return 0; } -static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) -{ - return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); -} #endif /* @@ -2876,9 +2644,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, unlock_page_cgroup(pc); /* - * "charge_statistics" updated event counter. Then, check it. - * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. - * if they exceeds softlimit. + * "charge_statistics" updated event counter. */ memcg_check_events(memcg, page); } @@ -5962,8 +5728,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; lruvec_init(&mz->lruvec); - mz->usage_in_excess = 0; - mz->on_tree = false; mz->memcg = memcg; } memcg->nodeinfo[node] = pn; @@ -6019,7 +5783,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) int node; size_t size = memcg_size(); - mem_cgroup_remove_from_trees(memcg); free_css_id(&mem_cgroup_subsys, &memcg->css); for_each_node(node) @@ -6056,29 +5819,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(parent_mem_cgroup); -static void __init mem_cgroup_soft_limit_tree_init(void) -{ - struct mem_cgroup_tree_per_node *rtpn; - struct mem_cgroup_tree_per_zone *rtpz; - int tmp, node, zone; - - for_each_node(node) { - tmp = node; - if (!node_state(node, N_NORMAL_MEMORY)) - tmp = -1; - rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); - BUG_ON(!rtpn); - - soft_limit_tree.rb_tree_per_node[node] = rtpn; - - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - rtpz = &rtpn->rb_tree_per_zone[zone]; - rtpz->rb_root = RB_ROOT; - spin_lock_init(&rtpz->lock); - } - } -} - static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -6859,7 +6599,6 @@ static int __init mem_cgroup_init(void) { hotcpu_notifier(memcg_cpu_hotplug_callback, 0); enable_swap_cgroup(); - mem_cgroup_soft_limit_tree_init(); memcg_stock_init(); return 0; } -- cgit v1.2.3 From a5b7c87f92076352dbff2fe0423ec255e1c9a71b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Sep 2013 15:13:25 -0700 Subject: vmscan, memcg: do softlimit reclaim also for targeted reclaim Soft reclaim has been done only for the global reclaim (both background and direct). Since "memcg: integrate soft reclaim tighter with zone shrinking code" there is no reason for this limitation anymore as the soft limit reclaim doesn't use any special code paths and it is a part of the zone shrinking code which is used by both global and targeted reclaims. From the semantic point of view it is natural to consider soft limit before touching all groups in the hierarchy tree which is touching the hard limit because soft limit tells us where to push back when there is a memory pressure. It is not important whether the pressure comes from the limit or imbalanced zones. This patch simply enables soft reclaim unconditionally in mem_cgroup_should_soft_reclaim so it is enabled for both global and targeted reclaim paths. mem_cgroup_soft_reclaim_eligible needs to learn about the root of the reclaim to know where to stop checking soft limit state of parents up the hierarchy. Say we have A (over soft limit) \ B (below s.l., hit the hard limit) / \ C D (below s.l.) B is the source of the outside memory pressure now for D but we shouldn't soft reclaim it because it is behaving well under B subtree and we can still reclaim from C (pressumably it is over the limit). mem_cgroup_soft_reclaim_eligible should therefore stop climbing up the hierarchy at B (root of the memory pressure). Signed-off-by: Michal Hocko Reviewed-by: Glauber Costa Reviewed-by: Tejun Heo Cc: Balbir Singh Cc: Greg Thelen Cc: Hugh Dickins Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Michel Lespinasse Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 87a448dd9c10..c016e001c5b2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1773,11 +1773,13 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) #endif /* - * A group is eligible for the soft limit reclaim if it is - * a) is over its soft limit + * A group is eligible for the soft limit reclaim under the given root + * hierarchy if + * a) it is over its soft limit * b) any parent up the hierarchy is over its soft limit */ -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg) +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, + struct mem_cgroup *root) { struct mem_cgroup *parent = memcg; @@ -1785,12 +1787,14 @@ bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg) return true; /* - * If any parent up the hierarchy is over its soft limit then we - * have to obey and reclaim from this group as well. + * If any parent up to the root in the hierarchy is over its soft limit + * then we have to obey and reclaim from this group as well. */ while((parent = parent_mem_cgroup(parent))) { if (res_counter_soft_limit_excess(&parent->res)) return true; + if (parent == root) + break; } return false; -- cgit v1.2.3 From de57780dc659f95b17ccb649f003278dde0b5b86 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Sep 2013 15:13:26 -0700 Subject: memcg: enhance memcg iterator to support predicates The caller of the iterator might know that some nodes or even subtrees should be skipped but there is no way to tell iterators about that so the only choice left is to let iterators to visit each node and do the selection outside of the iterating code. This, however, doesn't scale well with hierarchies with many groups where only few groups are interesting. This patch adds mem_cgroup_iter_cond variant of the iterator with a callback which gets called for every visited node. There are three possible ways how the callback can influence the walk. Either the node is visited, it is skipped but the tree walk continues down the tree or the whole subtree of the current group is skipped. [hughd@google.com: fix memcg-less page reclaim] Signed-off-by: Michal Hocko Cc: Balbir Singh Cc: Glauber Costa Cc: Greg Thelen Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Michel Lespinasse Cc: Tejun Heo Cc: Ying Han Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 70 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 15 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c016e001c5b2..a4bb857d902c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -875,6 +875,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } +static enum mem_cgroup_filter_t +mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, + mem_cgroup_iter_filter cond) +{ + if (!cond) + return VISIT; + return cond(memcg, root); +} + /* * Returns a next (in a pre-order walk) alive memcg (with elevated css * ref. count) or NULL if the whole root's subtree has been visited. @@ -882,7 +891,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) * helper function to be used by mem_cgroup_iter */ static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, - struct mem_cgroup *last_visited) + struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) { struct cgroup_subsys_state *prev_css, *next_css; @@ -900,11 +909,31 @@ skip_node: if (next_css) { struct mem_cgroup *mem = mem_cgroup_from_css(next_css); - if (css_tryget(&mem->css)) - return mem; - else { + switch (mem_cgroup_filter(mem, root, cond)) { + case SKIP: prev_css = next_css; goto skip_node; + case SKIP_TREE: + if (mem == root) + return NULL; + /* + * css_rightmost_descendant is not an optimal way to + * skip through a subtree (especially for imbalanced + * trees leaning to right) but that's what we have right + * now. More effective solution would be traversing + * right-up for first non-NULL without calling + * css_next_descendant_pre afterwards. + */ + prev_css = css_rightmost_descendant(next_css); + goto skip_node; + case VISIT: + if (css_tryget(&mem->css)) + return mem; + else { + prev_css = next_css; + goto skip_node; + } + break; } } @@ -968,6 +997,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation * @reclaim: cookie for shared reclaim walks, NULL for full walks + * @cond: filter for visited nodes, NULL for no filter * * Returns references to children of the hierarchy below @root, or * @root itself, or %NULL after a full round-trip. @@ -980,15 +1010,18 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * divide up the memcgs in the hierarchy among all concurrent * reclaimers operating on the same zone and priority. */ -struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, +struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim) + struct mem_cgroup_reclaim_cookie *reclaim, + mem_cgroup_iter_filter cond) { struct mem_cgroup *memcg = NULL; struct mem_cgroup *last_visited = NULL; - if (mem_cgroup_disabled()) - return NULL; + if (mem_cgroup_disabled()) { + /* first call must return non-NULL, second return NULL */ + return (struct mem_cgroup *)(unsigned long)!prev; + } if (!root) root = root_mem_cgroup; @@ -999,7 +1032,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) goto out_css_put; - return root; + if (mem_cgroup_filter(root, root, cond) == VISIT) + return root; + return NULL; } rcu_read_lock(); @@ -1022,7 +1057,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, last_visited = mem_cgroup_iter_load(iter, root, &seq); } - memcg = __mem_cgroup_iter_next(root, last_visited); + memcg = __mem_cgroup_iter_next(root, last_visited, cond); if (reclaim) { mem_cgroup_iter_update(iter, last_visited, memcg, seq); @@ -1033,7 +1068,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, reclaim->generation = iter->generation; } - if (prev && !memcg) + /* + * We have finished the whole tree walk or no group has been + * visited because filter told us to skip the root node. + */ + if (!memcg && (prev || (cond && !last_visited))) goto out_unlock; } out_unlock: @@ -1778,13 +1817,14 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) * a) it is over its soft limit * b) any parent up the hierarchy is over its soft limit */ -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, +enum mem_cgroup_filter_t +mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, struct mem_cgroup *root) { struct mem_cgroup *parent = memcg; if (res_counter_soft_limit_excess(&memcg->res)) - return true; + return VISIT; /* * If any parent up to the root in the hierarchy is over its soft limit @@ -1792,12 +1832,12 @@ bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, */ while((parent = parent_mem_cgroup(parent))) { if (res_counter_soft_limit_excess(&parent->res)) - return true; + return VISIT; if (parent == root) break; } - return false; + return SKIP; } /* -- cgit v1.2.3 From 7d910c054be42515cd3e43f2e1bec8c536632de2 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Sep 2013 15:13:28 -0700 Subject: memcg: track children in soft limit excess to improve soft limit Soft limit reclaim has to check the whole reclaim hierarchy while doing the first pass of the reclaim. This leads to a higher system time which can be visible especially when there are many groups in the hierarchy. This patch adds a per-memcg counter of children in excess. It also restores MEM_CGROUP_TARGET_SOFTLIMIT into mem_cgroup_event_ratelimit for a proper batching. If a group crosses soft limit for the first time it increases parent's children_in_excess up the hierarchy. The similarly if a group gets below the limit it will decrease the counter. The transition phase is recorded in soft_contributed flag. mem_cgroup_soft_reclaim_eligible then uses this information to better decide whether to skip the node or the whole subtree. The rule is simple. Skip the node with a children in excess or skip the whole subtree otherwise. This has been tested by a stream IO (dd if=/dev/zero of=file with 4*MemTotal size) which is quite sensitive to overhead during reclaim. The load is running in a group with soft limit set to 0 and without any limit. Apart from that there was a hierarchy with ~500, 2k and 8k groups (two groups on each level) without any pages in them. base denotes to the kernel on which the whole series is based on, rework is the kernel before this patch and reworkoptim is with this patch applied: * Run with soft limit set to 0 Elapsed 0-0-limit/base: min: 88.21 max: 94.61 avg: 91.73 std: 2.65 runs: 3 0-0-limit/rework: min: 76.05 [86.2%] max: 79.08 [83.6%] avg: 77.84 [84.9%] std: 1.30 runs: 3 0-0-limit/reworkoptim: min: 77.98 [88.4%] max: 80.36 [84.9%] avg: 78.92 [86.0%] std: 1.03 runs: 3 System 0.5k-0-limit/base: min: 34.86 max: 36.42 avg: 35.89 std: 0.73 runs: 3 0.5k-0-limit/rework: min: 43.26 [124.1%] max: 48.95 [134.4%] avg: 46.09 [128.4%] std: 2.32 runs: 3 0.5k-0-limit/reworkoptim: min: 46.98 [134.8%] max: 50.98 [140.0%] avg: 48.49 [135.1%] std: 1.77 runs: 3 Elapsed 0.5k-0-limit/base: min: 88.50 max: 97.52 avg: 93.92 std: 3.90 runs: 3 0.5k-0-limit/rework: min: 75.92 [85.8%] max: 78.45 [80.4%] avg: 77.34 [82.3%] std: 1.06 runs: 3 0.5k-0-limit/reworkoptim: min: 75.79 [85.6%] max: 79.37 [81.4%] avg: 77.55 [82.6%] std: 1.46 runs: 3 System 2k-0-limit/base: min: 34.57 max: 37.65 avg: 36.34 std: 1.30 runs: 3 2k-0-limit/rework: min: 64.17 [185.6%] max: 68.20 [181.1%] avg: 66.21 [182.2%] std: 1.65 runs: 3 2k-0-limit/reworkoptim: min: 49.78 [144.0%] max: 52.99 [140.7%] avg: 51.00 [140.3%] std: 1.42 runs: 3 Elapsed 2k-0-limit/base: min: 92.61 max: 97.83 avg: 95.03 std: 2.15 runs: 3 2k-0-limit/rework: min: 78.33 [84.6%] max: 84.08 [85.9%] avg: 81.09 [85.3%] std: 2.35 runs: 3 2k-0-limit/reworkoptim: min: 75.72 [81.8%] max: 78.57 [80.3%] avg: 76.73 [80.7%] std: 1.30 runs: 3 System 8k-0-limit/base: min: 39.78 max: 42.09 avg: 41.09 std: 0.97 runs: 3 8k-0-limit/rework: min: 200.86 [504.9%] max: 265.42 [630.6%] avg: 241.80 [588.5%] std: 29.06 runs: 3 8k-0-limit/reworkoptim: min: 53.70 [135.0%] max: 54.89 [130.4%] avg: 54.43 [132.5%] std: 0.52 runs: 3 Elapsed 8k-0-limit/base: min: 95.11 max: 98.61 avg: 96.81 std: 1.43 runs: 3 8k-0-limit/rework: min: 246.96 [259.7%] max: 331.47 [336.1%] avg: 301.32 [311.2%] std: 38.52 runs: 3 8k-0-limit/reworkoptim: min: 76.79 [80.7%] max: 81.71 [82.9%] avg: 78.97 [81.6%] std: 2.05 runs: 3 System time is increased by 30-40% but it is reduced a lot comparing to kernel without this patch. The higher time can be explained by the fact that the original soft reclaim scanned at priority 0 so it was much more effective for this workload (which is basically touch once and writeback). The Elapsed time looks better though (~20%). * Run with no soft limit set System 0-no-limit/base: min: 42.18 max: 50.38 avg: 46.44 std: 3.36 runs: 3 0-no-limit/rework: min: 40.57 [96.2%] max: 47.04 [93.4%] avg: 43.82 [94.4%] std: 2.64 runs: 3 0-no-limit/reworkoptim: min: 40.45 [95.9%] max: 45.28 [89.9%] avg: 42.10 [90.7%] std: 2.25 runs: 3 Elapsed 0-no-limit/base: min: 75.97 max: 78.21 avg: 76.87 std: 0.96 runs: 3 0-no-limit/rework: min: 75.59 [99.5%] max: 80.73 [103.2%] avg: 77.64 [101.0%] std: 2.23 runs: 3 0-no-limit/reworkoptim: min: 77.85 [102.5%] max: 82.42 [105.4%] avg: 79.64 [103.6%] std: 1.99 runs: 3 System 0.5k-no-limit/base: min: 44.54 max: 46.93 avg: 46.12 std: 1.12 runs: 3 0.5k-no-limit/rework: min: 42.09 [94.5%] max: 46.16 [98.4%] avg: 43.92 [95.2%] std: 1.69 runs: 3 0.5k-no-limit/reworkoptim: min: 42.47 [95.4%] max: 45.67 [97.3%] avg: 44.06 [95.5%] std: 1.31 runs: 3 Elapsed 0.5k-no-limit/base: min: 78.26 max: 81.49 avg: 79.65 std: 1.36 runs: 3 0.5k-no-limit/rework: min: 77.01 [98.4%] max: 80.43 [98.7%] avg: 78.30 [98.3%] std: 1.52 runs: 3 0.5k-no-limit/reworkoptim: min: 76.13 [97.3%] max: 77.87 [95.6%] avg: 77.18 [96.9%] std: 0.75 runs: 3 System 2k-no-limit/base: min: 62.96 max: 69.14 avg: 66.14 std: 2.53 runs: 3 2k-no-limit/rework: min: 76.01 [120.7%] max: 81.06 [117.2%] avg: 78.17 [118.2%] std: 2.12 runs: 3 2k-no-limit/reworkoptim: min: 62.57 [99.4%] max: 66.10 [95.6%] avg: 64.53 [97.6%] std: 1.47 runs: 3 Elapsed 2k-no-limit/base: min: 76.47 max: 84.22 avg: 79.12 std: 3.60 runs: 3 2k-no-limit/rework: min: 89.67 [117.3%] max: 93.26 [110.7%] avg: 91.10 [115.1%] std: 1.55 runs: 3 2k-no-limit/reworkoptim: min: 76.94 [100.6%] max: 79.21 [94.1%] avg: 78.45 [99.2%] std: 1.07 runs: 3 System 8k-no-limit/base: min: 104.74 max: 151.34 avg: 129.21 std: 19.10 runs: 3 8k-no-limit/rework: min: 205.23 [195.9%] max: 285.94 [188.9%] avg: 258.98 [200.4%] std: 38.01 runs: 3 8k-no-limit/reworkoptim: min: 161.16 [153.9%] max: 184.54 [121.9%] avg: 174.52 [135.1%] std: 9.83 runs: 3 Elapsed 8k-no-limit/base: min: 125.43 max: 181.00 avg: 154.81 std: 22.80 runs: 3 8k-no-limit/rework: min: 254.05 [202.5%] max: 355.67 [196.5%] avg: 321.46 [207.6%] std: 47.67 runs: 3 8k-no-limit/reworkoptim: min: 193.77 [154.5%] max: 222.72 [123.0%] avg: 210.18 [135.8%] std: 12.13 runs: 3 Both System and Elapsed are in stdev with the base kernel for all configurations except for 8k where both System and Elapsed are up by 35%. I do not have a good explanation for this because there is no soft reclaim pass going on as no group is above the limit which is checked in mem_cgroup_should_soft_reclaim. Then I have tested kernel build with the same configuration to see the behavior with a more general behavior. * Soft limit set to 0 for the build System 0-0-limit/base: min: 242.70 max: 245.17 avg: 243.85 std: 1.02 runs: 3 0-0-limit/rework min: 237.86 [98.0%] max: 240.22 [98.0%] avg: 239.00 [98.0%] std: 0.97 runs: 3 0-0-limit/reworkoptim: min: 241.11 [99.3%] max: 243.53 [99.3%] avg: 242.01 [99.2%] std: 1.08 runs: 3 Elapsed 0-0-limit/base: min: 348.48 max: 360.86 avg: 356.04 std: 5.41 runs: 3 0-0-limit/rework min: 286.95 [82.3%] max: 290.26 [80.4%] avg: 288.27 [81.0%] std: 1.43 runs: 3 0-0-limit/reworkoptim: min: 286.55 [82.2%] max: 289.00 [80.1%] avg: 287.69 [80.8%] std: 1.01 runs: 3 System 0.5k-0-limit/base: min: 251.77 max: 254.41 avg: 252.70 std: 1.21 runs: 3 0.5k-0-limit/rework min: 286.44 [113.8%] max: 289.30 [113.7%] avg: 287.60 [113.8%] std: 1.23 runs: 3 0.5k-0-limit/reworkoptim: min: 252.18 [100.2%] max: 253.16 [99.5%] avg: 252.62 [100.0%] std: 0.41 runs: 3 Elapsed 0.5k-0-limit/base: min: 347.83 max: 353.06 avg: 350.04 std: 2.21 runs: 3 0.5k-0-limit/rework min: 290.19 [83.4%] max: 295.62 [83.7%] avg: 293.12 [83.7%] std: 2.24 runs: 3 0.5k-0-limit/reworkoptim: min: 293.91 [84.5%] max: 294.87 [83.5%] avg: 294.29 [84.1%] std: 0.42 runs: 3 System 2k-0-limit/base: min: 263.05 max: 271.52 avg: 267.94 std: 3.58 runs: 3 2k-0-limit/rework min: 458.99 [174.5%] max: 468.31 [172.5%] avg: 464.45 [173.3%] std: 3.97 runs: 3 2k-0-limit/reworkoptim: min: 267.10 [101.5%] max: 279.38 [102.9%] avg: 272.78 [101.8%] std: 5.05 runs: 3 Elapsed 2k-0-limit/base: min: 372.33 max: 379.32 avg: 375.47 std: 2.90 runs: 3 2k-0-limit/rework min: 334.40 [89.8%] max: 339.52 [89.5%] avg: 337.44 [89.9%] std: 2.20 runs: 3 2k-0-limit/reworkoptim: min: 301.47 [81.0%] max: 319.19 [84.1%] avg: 307.90 [82.0%] std: 8.01 runs: 3 System 8k-0-limit/base: min: 320.50 max: 332.10 avg: 325.46 std: 4.88 runs: 3 8k-0-limit/rework min: 1115.76 [348.1%] max: 1165.66 [351.0%] avg: 1132.65 [348.0%] std: 23.34 runs: 3 8k-0-limit/reworkoptim: min: 403.75 [126.0%] max: 409.22 [123.2%] avg: 406.16 [124.8%] std: 2.28 runs: 3 Elapsed 8k-0-limit/base: min: 475.48 max: 585.19 avg: 525.54 std: 45.30 runs: 3 8k-0-limit/rework min: 616.25 [129.6%] max: 625.90 [107.0%] avg: 620.68 [118.1%] std: 3.98 runs: 3 8k-0-limit/reworkoptim: min: 420.18 [88.4%] max: 428.28 [73.2%] avg: 423.05 [80.5%] std: 3.71 runs: 3 Apart from 8k the system time is comparable with the base kernel while Elapsed is up to 20% better with all configurations. * No soft limit set System 0-no-limit/base: min: 234.76 max: 237.42 avg: 236.25 std: 1.11 runs: 3 0-no-limit/rework min: 233.09 [99.3%] max: 238.65 [100.5%] avg: 236.09 [99.9%] std: 2.29 runs: 3 0-no-limit/reworkoptim: min: 236.12 [100.6%] max: 240.53 [101.3%] avg: 237.94 [100.7%] std: 1.88 runs: 3 Elapsed 0-no-limit/base: min: 288.52 max: 295.42 avg: 291.29 std: 2.98 runs: 3 0-no-limit/rework min: 283.17 [98.1%] max: 284.33 [96.2%] avg: 283.78 [97.4%] std: 0.48 runs: 3 0-no-limit/reworkoptim: min: 288.50 [100.0%] max: 290.79 [98.4%] avg: 289.78 [99.5%] std: 0.95 runs: 3 System 0.5k-no-limit/base: min: 286.51 max: 293.23 avg: 290.21 std: 2.78 runs: 3 0.5k-no-limit/rework min: 291.69 [101.8%] max: 294.38 [100.4%] avg: 292.97 [101.0%] std: 1.10 runs: 3 0.5k-no-limit/reworkoptim: min: 277.05 [96.7%] max: 288.76 [98.5%] avg: 284.17 [97.9%] std: 5.11 runs: 3 Elapsed 0.5k-no-limit/base: min: 294.94 max: 298.92 avg: 296.47 std: 1.75 runs: 3 0.5k-no-limit/rework min: 292.55 [99.2%] max: 294.21 [98.4%] avg: 293.55 [99.0%] std: 0.72 runs: 3 0.5k-no-limit/reworkoptim: min: 294.41 [99.8%] max: 301.67 [100.9%] avg: 297.78 [100.4%] std: 2.99 runs: 3 System 2k-no-limit/base: min: 443.41 max: 466.66 avg: 457.66 std: 10.19 runs: 3 2k-no-limit/rework min: 490.11 [110.5%] max: 516.02 [110.6%] avg: 501.42 [109.6%] std: 10.83 runs: 3 2k-no-limit/reworkoptim: min: 435.25 [98.2%] max: 458.11 [98.2%] avg: 446.73 [97.6%] std: 9.33 runs: 3 Elapsed 2k-no-limit/base: min: 330.85 max: 333.75 avg: 332.52 std: 1.23 runs: 3 2k-no-limit/rework min: 343.06 [103.7%] max: 349.59 [104.7%] avg: 345.95 [104.0%] std: 2.72 runs: 3 2k-no-limit/reworkoptim: min: 330.01 [99.7%] max: 333.92 [100.1%] avg: 332.22 [99.9%] std: 1.64 runs: 3 System 8k-no-limit/base: min: 1175.64 max: 1259.38 avg: 1222.39 std: 34.88 runs: 3 8k-no-limit/rework min: 1226.31 [104.3%] max: 1241.60 [98.6%] avg: 1233.74 [100.9%] std: 6.25 runs: 3 8k-no-limit/reworkoptim: min: 1023.45 [87.1%] max: 1056.74 [83.9%] avg: 1038.92 [85.0%] std: 13.69 runs: 3 Elapsed 8k-no-limit/base: min: 613.36 max: 619.60 avg: 616.47 std: 2.55 runs: 3 8k-no-limit/rework min: 627.56 [102.3%] max: 642.33 [103.7%] avg: 633.44 [102.8%] std: 6.39 runs: 3 8k-no-limit/reworkoptim: min: 545.89 [89.0%] max: 555.36 [89.6%] avg: 552.06 [89.6%] std: 4.37 runs: 3 and these numbers look good as well. System time is around 100% (suprisingly better for the 8k case) and Elapsed is copies that trend. Signed-off-by: Michal Hocko Cc: Balbir Singh Cc: Glauber Costa Cc: Greg Thelen Cc: Hugh Dickins Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Michel Lespinasse Cc: Tejun Heo Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a4bb857d902c..a18e228f140b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -138,6 +138,7 @@ static const char * const mem_cgroup_lru_names[] = { */ enum mem_cgroup_events_target { MEM_CGROUP_TARGET_THRESH, + MEM_CGROUP_TARGET_SOFTLIMIT, MEM_CGROUP_TARGET_NUMAINFO, MEM_CGROUP_NTARGETS, }; @@ -315,6 +316,22 @@ struct mem_cgroup { atomic_t numainfo_events; atomic_t numainfo_updating; #endif + /* + * Protects soft_contributed transitions. + * See mem_cgroup_update_soft_limit + */ + spinlock_t soft_lock; + + /* + * If true then this group has increased parents' children_in_excess + * when it got over the soft limit. + * When a group falls bellow the soft limit, parents' children_in_excess + * is decreased and soft_contributed changed to false. + */ + bool soft_contributed; + + /* Number of children that are in soft limit excess */ + atomic_t children_in_excess; struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ @@ -802,6 +819,9 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, case MEM_CGROUP_TARGET_THRESH: next = val + THRESHOLDS_EVENTS_TARGET; break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; case MEM_CGROUP_TARGET_NUMAINFO: next = val + NUMAINFO_EVENTS_TARGET; break; @@ -814,6 +834,42 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, return false; } +/* + * Called from rate-limitted memcg_check_events when enough + * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure + * that all the parents up the hierarchy will be noticed that this group + * is in excess or that it is not in excess anymore. mmecg->soft_contributed + * makes the transition a single action whenever the state flips from one to + * other. + */ +static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) +{ + unsigned long long excess = res_counter_soft_limit_excess(&memcg->res); + struct mem_cgroup *parent = memcg; + int delta = 0; + + spin_lock(&memcg->soft_lock); + if (excess) { + if (!memcg->soft_contributed) { + delta = 1; + memcg->soft_contributed = true; + } + } else { + if (memcg->soft_contributed) { + delta = -1; + memcg->soft_contributed = false; + } + } + + /* + * Necessary to update all ancestors when hierarchy is used + * because their event counter is not touched. + */ + while (delta && (parent = parent_mem_cgroup(parent))) + atomic_add(delta, &parent->children_in_excess); + spin_unlock(&memcg->soft_lock); +} + /* * Check events in order. * @@ -824,8 +880,11 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { + bool do_softlimit; bool do_numainfo __maybe_unused; + do_softlimit = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_SOFTLIMIT); #if MAX_NUMNODES > 1 do_numainfo = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_NUMAINFO); @@ -833,6 +892,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); mem_cgroup_threshold(memcg); + if (unlikely(do_softlimit)) + mem_cgroup_update_soft_limit(memcg); #if MAX_NUMNODES > 1 if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); @@ -1816,6 +1877,9 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) * hierarchy if * a) it is over its soft limit * b) any parent up the hierarchy is over its soft limit + * + * If the given group doesn't have any children over the limit then it + * doesn't make any sense to iterate its subtree. */ enum mem_cgroup_filter_t mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, @@ -1837,6 +1901,8 @@ mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, break; } + if (!atomic_read(&memcg->children_in_excess)) + return SKIP_TREE; return SKIP; } @@ -5892,6 +5958,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); vmpressure_init(&memcg->vmpressure); + spin_lock_init(&memcg->soft_lock); return &memcg->css; @@ -5969,6 +6036,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) mem_cgroup_invalidate_reclaim_iterators(memcg); mem_cgroup_reparent_charges(memcg); + if (memcg->soft_contributed) { + while ((memcg = parent_mem_cgroup(memcg))) + atomic_dec(&memcg->children_in_excess); + } mem_cgroup_destroy_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } -- cgit v1.2.3 From e839b6a1c8d0895803bcbd587595a54f4221a625 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Sep 2013 15:13:30 -0700 Subject: memcg, vmscan: do not attempt soft limit reclaim if it would not scan anything mem_cgroup_should_soft_reclaim controls whether soft reclaim pass is done and it always says yes currently. Memcg iterators are clever to skip nodes that are not soft reclaimable quite efficiently but mem_cgroup_should_soft_reclaim can be more clever and do not start the soft reclaim pass at all if it knows that nothing would be scanned anyway. In order to do that, simply reuse mem_cgroup_soft_reclaim_eligible for the target group of the reclaim and allow the pass only if the whole subtree wouldn't be skipped. Signed-off-by: Michal Hocko Cc: Balbir Singh Cc: Glauber Costa Cc: Greg Thelen Cc: Hugh Dickins Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Michel Lespinasse Cc: Tejun Heo Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a18e228f140b..848fc6c195d2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1885,7 +1885,11 @@ enum mem_cgroup_filter_t mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, struct mem_cgroup *root) { - struct mem_cgroup *parent = memcg; + struct mem_cgroup *parent; + + if (!memcg) + memcg = root_mem_cgroup; + parent = memcg; if (res_counter_soft_limit_excess(&memcg->res)) return VISIT; -- cgit v1.2.3 From 1be171d60bddcce2602c5d009029274d67736fd7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Sep 2013 15:13:32 -0700 Subject: memcg: track all children over limit in the root Children in soft limit excess are currently tracked up the hierarchy in memcg->children_in_excess. Nevertheless there still might exist tons of groups that are not in hierarchy relation to the root cgroup (e.g. all first level groups if root_mem_cgroup->use_hierarchy == false). As the whole tree walk has to be done when the iteration starts at root_mem_cgroup the iterator should be able to skip the walk if there is no child above the limit without iterating them. This can be done easily if the root tracks all children rather than only hierarchical children. This is done by this patch which updates root_mem_cgroup children_in_excess if root_mem_cgroup->use_hierarchy == false so the root knows about all children in excess. Please note that this is not an issue for inner memcgs which have use_hierarchy == false because then only the single group is visited so no special optimization is necessary. Signed-off-by: Michal Hocko Cc: Balbir Singh Cc: Glauber Costa Cc: Greg Thelen Cc: Hugh Dickins Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Michel Lespinasse Cc: Tejun Heo Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 848fc6c195d2..46717d6c62b7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -864,9 +864,15 @@ static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) /* * Necessary to update all ancestors when hierarchy is used * because their event counter is not touched. + * We track children even outside the hierarchy for the root + * cgroup because tree walk starting at root should visit + * all cgroups and we want to prevent from pointless tree + * walk if no children is below the limit. */ while (delta && (parent = parent_mem_cgroup(parent))) atomic_add(delta, &parent->children_in_excess); + if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) + atomic_add(delta, &root_mem_cgroup->children_in_excess); spin_unlock(&memcg->soft_lock); } @@ -6043,6 +6049,9 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) if (memcg->soft_contributed) { while ((memcg = parent_mem_cgroup(memcg))) atomic_dec(&memcg->children_in_excess); + + if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) + atomic_dec(&root_mem_cgroup->children_in_excess); } mem_cgroup_destroy_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); -- cgit v1.2.3 From f894ffa865301d4010d68b15be29912fa4039e77 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 12 Sep 2013 15:13:35 -0700 Subject: memcg: trivial cleanups Clean up some mess made by the "Soft limit rework" series, and a few other things. Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 46717d6c62b7..c4524458b7d0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -279,7 +279,7 @@ struct mem_cgroup { * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? */ - unsigned long move_charge_at_immigrate; + unsigned long move_charge_at_immigrate; /* * set > 0 if pages under this cgroup are moving to other cgroup. */ @@ -324,7 +324,7 @@ struct mem_cgroup { /* * If true then this group has increased parents' children_in_excess - * when it got over the soft limit. + * when it got over the soft limit. * When a group falls bellow the soft limit, parents' children_in_excess * is decreased and soft_contributed changed to false. */ @@ -835,12 +835,12 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, } /* - * Called from rate-limitted memcg_check_events when enough + * Called from rate-limited memcg_check_events when enough * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure - * that all the parents up the hierarchy will be noticed that this group + * that all the parents up the hierarchy will be notified that this group * is in excess or that it is not in excess anymore. mmecg->soft_contributed * makes the transition a single action whenever the state flips from one to - * other. + * the other. */ static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) { @@ -1881,8 +1881,8 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) /* * A group is eligible for the soft limit reclaim under the given root * hierarchy if - * a) it is over its soft limit - * b) any parent up the hierarchy is over its soft limit + * a) it is over its soft limit + * b) any parent up the hierarchy is over its soft limit * * If the given group doesn't have any children over the limit then it * doesn't make any sense to iterate its subtree. @@ -1904,7 +1904,7 @@ mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, * If any parent up to the root in the hierarchy is over its soft limit * then we have to obey and reclaim from this group as well. */ - while((parent = parent_mem_cgroup(parent))) { + while ((parent = parent_mem_cgroup(parent))) { if (res_counter_soft_limit_excess(&parent->res)) return VISIT; if (parent == root) @@ -2309,7 +2309,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) flush_work(&stock->work); } out: - put_online_cpus(); + put_online_cpus(); } /* @@ -2741,7 +2741,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, * is accessed after testing USED bit. To make pc->mem_cgroup visible * before USED bit, we need memory barrier here. * See mem_cgroup_add_lru_list(), etc. - */ + */ smp_wmb(); SetPageCgroupUsed(pc); @@ -3483,9 +3483,9 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) * the page allocator. Therefore, the following sequence when backed by * the SLUB allocator: * - * memcg_stop_kmem_account(); - * kmalloc() - * memcg_resume_kmem_account(); + * memcg_stop_kmem_account(); + * kmalloc() + * memcg_resume_kmem_account(); * * would effectively ignore the fact that we should skip accounting, * since it will drive us directly to this function without passing @@ -4514,7 +4514,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ - if (curusage >= oldusage) + if (curusage >= oldusage) retry_count--; else oldusage = curusage; @@ -4535,7 +4535,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, int enlarge = 0; /* see mem_cgroup_resize_res_limit */ - retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; + retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); while (retry_count) { if (signal_pending(current)) { -- cgit v1.2.3 From 519e52473ebe9db5cdef44670d5a97f1fd53d721 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Sep 2013 15:13:42 -0700 Subject: mm: memcg: enable memcg OOM killer only for user faults System calls and kernel faults (uaccess, gup) can handle an out of memory situation gracefully and just return -ENOMEM. Enable the memcg OOM killer only for user faults, where it's really the only option available. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: azurIt Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c4524458b7d0..0980bbf6438d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2454,7 +2454,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return CHARGE_RETRY; /* If we don't need to call oom-killer at el, return immediately */ - if (!oom_check) + if (!oom_check || !current->memcg_oom.may_oom) return CHARGE_NOMEM; /* check OOM */ if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) -- cgit v1.2.3 From fb2a6fc56be66c169f8b80e07ed999ba453a2db2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Sep 2013 15:13:43 -0700 Subject: mm: memcg: rework and document OOM waiting and wakeup The memcg OOM handler open-codes a sleeping lock for OOM serialization (trylock, wait, repeat) because the required locking is so specific to memcg hierarchies. However, it would be nice if this construct would be clearly recognizable and not be as obfuscated as it is right now. Clean up as follows: 1. Remove the return value of mem_cgroup_oom_unlock() 2. Rename mem_cgroup_oom_lock() to mem_cgroup_oom_trylock(). 3. Pull the prepare_to_wait() out of the memcg_oom_lock scope. This makes it more obvious that the task has to be on the waitqueue before attempting to OOM-trylock the hierarchy, to not miss any wakeups before going to sleep. It just didn't matter until now because it was all lumped together into the global memcg_oom_lock spinlock section. 4. Pull the mem_cgroup_oom_notify() out of the memcg_oom_lock scope. It is proctected by the hierarchical OOM-lock. 5. The memcg_oom_lock spinlock is only required to propagate the OOM lock in any given hierarchy atomically. Restrict its scope to mem_cgroup_oom_(trylock|unlock). 6. Do not wake up the waitqueue unconditionally at the end of the function. Only the lockholder has to wake up the next in line after releasing the lock. Note that the lockholder kicks off the OOM-killer, which in turn leads to wakeups from the uncharges of the exiting task. But a contender is not guaranteed to see them if it enters the OOM path after the OOM kills but before the lockholder releases the lock. Thus there has to be an explicit wakeup after releasing the lock. 7. Put the OOM task on the waitqueue before marking the hierarchy as under OOM as that is the point where we start to receive wakeups. No point in listening before being on the waitqueue. 8. Likewise, unmark the hierarchy before finishing the sleep, for symmetry. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: azurIt Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 83 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 46 insertions(+), 37 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0980bbf6438d..04250cbf46c6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1916,15 +1916,18 @@ mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, return SKIP; } +static DEFINE_SPINLOCK(memcg_oom_lock); + /* * Check OOM-Killer is already running under our hierarchy. * If someone is running, return false. - * Has to be called with memcg_oom_lock */ -static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) +static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) { struct mem_cgroup *iter, *failed = NULL; + spin_lock(&memcg_oom_lock); + for_each_mem_cgroup_tree(iter, memcg) { if (iter->oom_lock) { /* @@ -1938,33 +1941,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) iter->oom_lock = true; } - if (!failed) - return true; - - /* - * OK, we failed to lock the whole subtree so we have to clean up - * what we set up to the failing subtree - */ - for_each_mem_cgroup_tree(iter, memcg) { - if (iter == failed) { - mem_cgroup_iter_break(memcg, iter); - break; + if (failed) { + /* + * OK, we failed to lock the whole subtree so we have + * to clean up what we set up to the failing subtree + */ + for_each_mem_cgroup_tree(iter, memcg) { + if (iter == failed) { + mem_cgroup_iter_break(memcg, iter); + break; + } + iter->oom_lock = false; } - iter->oom_lock = false; } - return false; + + spin_unlock(&memcg_oom_lock); + + return !failed; } -/* - * Has to be called with memcg_oom_lock - */ -static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) +static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) { struct mem_cgroup *iter; + spin_lock(&memcg_oom_lock); for_each_mem_cgroup_tree(iter, memcg) iter->oom_lock = false; - return 0; + spin_unlock(&memcg_oom_lock); } static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) @@ -1988,7 +1991,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) atomic_add_unless(&iter->under_oom, -1, 0); } -static DEFINE_SPINLOCK(memcg_oom_lock); static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); struct oom_wait_info { @@ -2035,45 +2037,52 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { struct oom_wait_info owait; - bool locked, need_to_kill; + bool locked; owait.memcg = memcg; owait.wait.flags = 0; owait.wait.func = memcg_oom_wake_function; owait.wait.private = current; INIT_LIST_HEAD(&owait.wait.task_list); - need_to_kill = true; - mem_cgroup_mark_under_oom(memcg); - /* At first, try to OOM lock hierarchy under memcg.*/ - spin_lock(&memcg_oom_lock); - locked = mem_cgroup_oom_lock(memcg); /* + * As with any blocking lock, a contender needs to start + * listening for wakeups before attempting the trylock, + * otherwise it can miss the wakeup from the unlock and sleep + * indefinitely. This is just open-coded because our locking + * is so particular to memcg hierarchies. + * * Even if signal_pending(), we can't quit charge() loop without * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL * under OOM is always welcomed, use TASK_KILLABLE here. */ prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); - if (!locked || memcg->oom_kill_disable) - need_to_kill = false; + mem_cgroup_mark_under_oom(memcg); + + locked = mem_cgroup_oom_trylock(memcg); + if (locked) mem_cgroup_oom_notify(memcg); - spin_unlock(&memcg_oom_lock); - if (need_to_kill) { + if (locked && !memcg->oom_kill_disable) { + mem_cgroup_unmark_under_oom(memcg); finish_wait(&memcg_oom_waitq, &owait.wait); mem_cgroup_out_of_memory(memcg, mask, order); } else { schedule(); + mem_cgroup_unmark_under_oom(memcg); finish_wait(&memcg_oom_waitq, &owait.wait); } - spin_lock(&memcg_oom_lock); - if (locked) - mem_cgroup_oom_unlock(memcg); - memcg_wakeup_oom(memcg); - spin_unlock(&memcg_oom_lock); - mem_cgroup_unmark_under_oom(memcg); + if (locked) { + mem_cgroup_oom_unlock(memcg); + /* + * There is no guarantee that an OOM-lock contender + * sees the wakeups triggered by the OOM kill + * uncharges. Wake any sleepers explicitely. + */ + memcg_oom_recover(memcg); + } if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) return false; -- cgit v1.2.3 From 3812c8c8f3953921ef18544110dafc3505c1ac62 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Sep 2013 15:13:44 -0700 Subject: mm: memcg: do not trap chargers with full callstack on OOM The memcg OOM handling is incredibly fragile and can deadlock. When a task fails to charge memory, it invokes the OOM killer and loops right there in the charge code until it succeeds. Comparably, any other task that enters the charge path at this point will go to a waitqueue right then and there and sleep until the OOM situation is resolved. The problem is that these tasks may hold filesystem locks and the mmap_sem; locks that the selected OOM victim may need to exit. For example, in one reported case, the task invoking the OOM killer was about to charge a page cache page during a write(), which holds the i_mutex. The OOM killer selected a task that was just entering truncate() and trying to acquire the i_mutex: OOM invoking task: mem_cgroup_handle_oom+0x241/0x3b0 mem_cgroup_cache_charge+0xbe/0xe0 add_to_page_cache_locked+0x4c/0x140 add_to_page_cache_lru+0x22/0x50 grab_cache_page_write_begin+0x8b/0xe0 ext3_write_begin+0x88/0x270 generic_file_buffered_write+0x116/0x290 __generic_file_aio_write+0x27c/0x480 generic_file_aio_write+0x76/0xf0 # takes ->i_mutex do_sync_write+0xea/0x130 vfs_write+0xf3/0x1f0 sys_write+0x51/0x90 system_call_fastpath+0x18/0x1d OOM kill victim: do_truncate+0x58/0xa0 # takes i_mutex do_last+0x250/0xa30 path_openat+0xd7/0x440 do_filp_open+0x49/0xa0 do_sys_open+0x106/0x240 sys_open+0x20/0x30 system_call_fastpath+0x18/0x1d The OOM handling task will retry the charge indefinitely while the OOM killed task is not releasing any resources. A similar scenario can happen when the kernel OOM killer for a memcg is disabled and a userspace task is in charge of resolving OOM situations. In this case, ALL tasks that enter the OOM path will be made to sleep on the OOM waitqueue and wait for userspace to free resources or increase the group's limit. But a userspace OOM handler is prone to deadlock itself on the locks held by the waiting tasks. For example one of the sleeping tasks may be stuck in a brk() call with the mmap_sem held for writing but the userspace handler, in order to pick an optimal victim, may need to read files from /proc/, which tries to acquire the same mmap_sem for reading and deadlocks. This patch changes the way tasks behave after detecting a memcg OOM and makes sure nobody loops or sleeps with locks held: 1. When OOMing in a user fault, invoke the OOM killer and restart the fault instead of looping on the charge attempt. This way, the OOM victim can not get stuck on locks the looping task may hold. 2. When OOMing in a user fault but somebody else is handling it (either the kernel OOM killer or a userspace handler), don't go to sleep in the charge context. Instead, remember the OOMing memcg in the task struct and then fully unwind the page fault stack with -ENOMEM. pagefault_out_of_memory() will then call back into the memcg code to check if the -ENOMEM came from the memcg, and then either put the task to sleep on the memcg's OOM waitqueue or just restart the fault. The OOM victim can no longer get stuck on any lock a sleeping task may hold. Debugged by Michal Hocko. Signed-off-by: Johannes Weiner Reported-by: azurIt Acked-by: Michal Hocko Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 154 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 107 insertions(+), 47 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 04250cbf46c6..4b5cfb509270 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -255,6 +255,7 @@ struct mem_cgroup { bool oom_lock; atomic_t under_oom; + atomic_t oom_wakeups; int swappiness; /* OOM-Killer disable */ @@ -2020,6 +2021,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait, static void memcg_wakeup_oom(struct mem_cgroup *memcg) { + atomic_inc(&memcg->oom_wakeups); /* for filtering, pass "memcg" as argument. */ __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } @@ -2031,19 +2033,17 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) } /* - * try to call OOM killer. returns false if we should exit memory-reclaim loop. + * try to call OOM killer */ -static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, - int order) +static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - struct oom_wait_info owait; bool locked; + int wakeups; - owait.memcg = memcg; - owait.wait.flags = 0; - owait.wait.func = memcg_oom_wake_function; - owait.wait.private = current; - INIT_LIST_HEAD(&owait.wait.task_list); + if (!current->memcg_oom.may_oom) + return; + + current->memcg_oom.in_memcg_oom = 1; /* * As with any blocking lock, a contender needs to start @@ -2051,12 +2051,8 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, * otherwise it can miss the wakeup from the unlock and sleep * indefinitely. This is just open-coded because our locking * is so particular to memcg hierarchies. - * - * Even if signal_pending(), we can't quit charge() loop without - * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL - * under OOM is always welcomed, use TASK_KILLABLE here. */ - prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); + wakeups = atomic_read(&memcg->oom_wakeups); mem_cgroup_mark_under_oom(memcg); locked = mem_cgroup_oom_trylock(memcg); @@ -2066,15 +2062,95 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, if (locked && !memcg->oom_kill_disable) { mem_cgroup_unmark_under_oom(memcg); - finish_wait(&memcg_oom_waitq, &owait.wait); mem_cgroup_out_of_memory(memcg, mask, order); + mem_cgroup_oom_unlock(memcg); + /* + * There is no guarantee that an OOM-lock contender + * sees the wakeups triggered by the OOM kill + * uncharges. Wake any sleepers explicitely. + */ + memcg_oom_recover(memcg); } else { - schedule(); - mem_cgroup_unmark_under_oom(memcg); - finish_wait(&memcg_oom_waitq, &owait.wait); + /* + * A system call can just return -ENOMEM, but if this + * is a page fault and somebody else is handling the + * OOM already, we need to sleep on the OOM waitqueue + * for this memcg until the situation is resolved. + * Which can take some time because it might be + * handled by a userspace task. + * + * However, this is the charge context, which means + * that we may sit on a large call stack and hold + * various filesystem locks, the mmap_sem etc. and we + * don't want the OOM handler to deadlock on them + * while we sit here and wait. Store the current OOM + * context in the task_struct, then return -ENOMEM. + * At the end of the page fault handler, with the + * stack unwound, pagefault_out_of_memory() will check + * back with us by calling + * mem_cgroup_oom_synchronize(), possibly putting the + * task to sleep. + */ + current->memcg_oom.oom_locked = locked; + current->memcg_oom.wakeups = wakeups; + css_get(&memcg->css); + current->memcg_oom.wait_on_memcg = memcg; } +} + +/** + * mem_cgroup_oom_synchronize - complete memcg OOM handling + * + * This has to be called at the end of a page fault if the the memcg + * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. + * + * Memcg supports userspace OOM handling, so failed allocations must + * sleep on a waitqueue until the userspace task resolves the + * situation. Sleeping directly in the charge context with all kinds + * of locks held is not a good idea, instead we remember an OOM state + * in the task and mem_cgroup_oom_synchronize() has to be called at + * the end of the page fault to put the task to sleep and clean up the + * OOM state. + * + * Returns %true if an ongoing memcg OOM situation was detected and + * finalized, %false otherwise. + */ +bool mem_cgroup_oom_synchronize(void) +{ + struct oom_wait_info owait; + struct mem_cgroup *memcg; + + /* OOM is global, do not handle */ + if (!current->memcg_oom.in_memcg_oom) + return false; + + /* + * We invoked the OOM killer but there is a chance that a kill + * did not free up any charges. Everybody else might already + * be sleeping, so restart the fault and keep the rampage + * going until some charges are released. + */ + memcg = current->memcg_oom.wait_on_memcg; + if (!memcg) + goto out; + + if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) + goto out_memcg; + + owait.memcg = memcg; + owait.wait.flags = 0; + owait.wait.func = memcg_oom_wake_function; + owait.wait.private = current; + INIT_LIST_HEAD(&owait.wait.task_list); - if (locked) { + prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); + /* Only sleep if we didn't miss any wakeups since OOM */ + if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) + schedule(); + finish_wait(&memcg_oom_waitq, &owait.wait); +out_memcg: + mem_cgroup_unmark_under_oom(memcg); + if (current->memcg_oom.oom_locked) { mem_cgroup_oom_unlock(memcg); /* * There is no guarantee that an OOM-lock contender @@ -2083,11 +2159,10 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, */ memcg_oom_recover(memcg); } - - if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) - return false; - /* Give chance to dying process */ - schedule_timeout_uninterruptible(1); + css_put(&memcg->css); + current->memcg_oom.wait_on_memcg = NULL; +out: + current->memcg_oom.in_memcg_oom = 0; return true; } @@ -2400,12 +2475,11 @@ enum { CHARGE_RETRY, /* need to retry but retry is not bad */ CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ - CHARGE_OOM_DIE, /* the current is killed because of OOM */ }; static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages, unsigned int min_pages, - bool oom_check) + bool invoke_oom) { unsigned long csize = nr_pages * PAGE_SIZE; struct mem_cgroup *mem_over_limit; @@ -2462,14 +2536,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (mem_cgroup_wait_acct_move(mem_over_limit)) return CHARGE_RETRY; - /* If we don't need to call oom-killer at el, return immediately */ - if (!oom_check || !current->memcg_oom.may_oom) - return CHARGE_NOMEM; - /* check OOM */ - if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) - return CHARGE_OOM_DIE; + if (invoke_oom) + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - return CHARGE_RETRY; + return CHARGE_NOMEM; } /* @@ -2572,7 +2642,7 @@ again: } do { - bool oom_check; + bool invoke_oom = oom && !nr_oom_retries; /* If killed, bypass charge */ if (fatal_signal_pending(current)) { @@ -2580,14 +2650,8 @@ again: goto bypass; } - oom_check = false; - if (oom && !nr_oom_retries) { - oom_check = true; - nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - } - - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, - oom_check); + ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, + nr_pages, invoke_oom); switch (ret) { case CHARGE_OK: break; @@ -2600,16 +2664,12 @@ again: css_put(&memcg->css); goto nomem; case CHARGE_NOMEM: /* OOM routine works */ - if (!oom) { + if (!oom || invoke_oom) { css_put(&memcg->css); goto nomem; } - /* If oom, we never return -ENOMEM */ nr_oom_retries--; break; - case CHARGE_OOM_DIE: /* Killed by OOM Killer */ - css_put(&memcg->css); - goto bypass; } } while (ret != CHARGE_OK); -- cgit v1.2.3 From 6de5a8bfcae6e3b427d642eff078d8305b324b52 Mon Sep 17 00:00:00 2001 From: Sha Zhengju Date: Thu, 12 Sep 2013 15:13:47 -0700 Subject: memcg: rename RESOURCE_MAX to RES_COUNTER_MAX RESOURCE_MAX is far too general name, change it to RES_COUNTER_MAX. Signed-off-by: Sha Zhengju Signed-off-by: Qiang Huang Acked-by: Michal Hocko Cc: Daisuke Nishimura Cc: Jeff Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4b5cfb509270..2c71f243186e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4967,7 +4967,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) */ mutex_lock(&memcg_create_mutex); mutex_lock(&set_limit_mutex); - if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { + if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { ret = -EBUSY; goto out; @@ -4977,7 +4977,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) ret = memcg_update_cache_sizes(memcg); if (ret) { - res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); + res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); goto out; } static_key_slow_inc(&memcg_kmem_enabled_key); -- cgit v1.2.3 From 68b4876d996e8749142b2895bc2e251448996363 Mon Sep 17 00:00:00 2001 From: Sha Zhengju Date: Thu, 12 Sep 2013 15:13:50 -0700 Subject: memcg: remove MEMCG_NR_FILE_MAPPED While accounting memcg page stat, it's not worth to use MEMCG_NR_FILE_MAPPED as an extra layer of indirection because of the complexity and presumed performance overhead. We can use MEM_CGROUP_STAT_FILE_MAPPED directly. Signed-off-by: Sha Zhengju Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Acked-by: Fengguang Wu Reviewed-by: Greg Thelen Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2c71f243186e..f3803462a7e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -84,21 +84,6 @@ static int really_do_swap_account __initdata = 0; #endif -/* - * Statistics for memory cgroup. - */ -enum mem_cgroup_stat_index { - /* - * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. - */ - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ - MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ - MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ - MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ - MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ - MEM_CGROUP_STAT_NSTATS, -}; - static const char * const mem_cgroup_stat_names[] = { "cache", "rss", @@ -2231,7 +2216,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) } void mem_cgroup_update_page_stat(struct page *page, - enum mem_cgroup_page_stat_item idx, int val) + enum mem_cgroup_stat_index idx, int val) { struct mem_cgroup *memcg; struct page_cgroup *pc = lookup_page_cgroup(page); @@ -2244,14 +2229,6 @@ void mem_cgroup_update_page_stat(struct page *page, if (unlikely(!memcg || !PageCgroupUsed(pc))) return; - switch (idx) { - case MEMCG_NR_FILE_MAPPED: - idx = MEM_CGROUP_STAT_FILE_MAPPED; - break; - default: - BUG(); - } - this_cpu_add(memcg->stat->count[idx], val); } -- cgit v1.2.3 From 658b72c5a7a033f0dde61b15dff86bf423ce425e Mon Sep 17 00:00:00 2001 From: Sha Zhengju Date: Thu, 12 Sep 2013 15:13:52 -0700 Subject: memcg: check for proper lock held in mem_cgroup_update_page_stat We should call mem_cgroup_begin_update_page_stat() before mem_cgroup_update_page_stat() to get proper locks, however the latter doesn't do any checking that we use proper locking, which would be hard. Suggested by Michal Hock we could at least test for rcu_read_lock_held() because RCU is held if !mem_cgroup_disabled(). Signed-off-by: Sha Zhengju Acked-by: Michal Hocko Reviewed-by: Greg Thelen Cc: Fengguang Wu Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3803462a7e6..0093bc36c5fc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2225,6 +2225,7 @@ void mem_cgroup_update_page_stat(struct page *page, if (mem_cgroup_disabled()) return; + VM_BUG_ON(!rcu_read_lock_held()); memcg = pc->mem_cgroup; if (unlikely(!memcg || !PageCgroupUsed(pc))) return; -- cgit v1.2.3 From 3ea67d06e4679a16f69f66f43a8d6ee4778985fc Mon Sep 17 00:00:00 2001 From: Sha Zhengju Date: Thu, 12 Sep 2013 15:13:53 -0700 Subject: memcg: add per cgroup writeback pages accounting Add memcg routines to count writeback pages, later dirty pages will also be accounted. After Kame's commit 89c06bd52fb9 ("memcg: use new logic for page stat accounting"), we can use 'struct page' flag to test page state instead of per page_cgroup flag. But memcg has a feature to move a page from a cgroup to another one and may have race between "move" and "page stat accounting". So in order to avoid the race we have designed a new lock: mem_cgroup_begin_update_page_stat() modify page information -->(a) mem_cgroup_update_page_stat() -->(b) mem_cgroup_end_update_page_stat() It requires both (a) and (b)(writeback pages accounting) to be pretected in mem_cgroup_{begin/end}_update_page_stat(). It's full no-op for !CONFIG_MEMCG, almost no-op if memcg is disabled (but compiled in), rcu read lock in the most cases (no task is moving), and spin_lock_irqsave on top in the slow path. There're two writeback interfaces to modify: test_{clear/set}_page_writeback(). And the lock order is: --> memcg->move_lock --> mapping->tree_lock Signed-off-by: Sha Zhengju Acked-by: Michal Hocko Reviewed-by: Greg Thelen Cc: Fengguang Wu Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0093bc36c5fc..d5ff3ce13029 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -89,6 +89,7 @@ static const char * const mem_cgroup_stat_names[] = { "rss", "rss_huge", "mapped_file", + "writeback", "swap", }; @@ -3654,6 +3655,20 @@ void mem_cgroup_split_huge_fixup(struct page *head) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline +void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, + struct mem_cgroup *to, + unsigned int nr_pages, + enum mem_cgroup_stat_index idx) +{ + /* Update stat data for mem_cgroup */ + preempt_disable(); + WARN_ON_ONCE(from->stat->count[idx] < nr_pages); + __this_cpu_add(from->stat->count[idx], -nr_pages); + __this_cpu_add(to->stat->count[idx], nr_pages); + preempt_enable(); +} + /** * mem_cgroup_move_account - move account of the page * @page: the page @@ -3699,13 +3714,14 @@ static int mem_cgroup_move_account(struct page *page, move_lock_mem_cgroup(from, &flags); - if (!anon && page_mapped(page)) { - /* Update mapped_file data for mem_cgroup */ - preempt_disable(); - __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); - __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); - preempt_enable(); - } + if (!anon && page_mapped(page)) + mem_cgroup_move_account_page_stat(from, to, nr_pages, + MEM_CGROUP_STAT_FILE_MAPPED); + + if (PageWriteback(page)) + mem_cgroup_move_account_page_stat(from, to, nr_pages, + MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_charge_statistics(from, page, anon, -nr_pages); /* caller should have done css_get */ -- cgit v1.2.3 From 8f939a9f4c769500c58c31ec81ce17297388f4bf Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:34 -0700 Subject: revert "memcg: track all children over limit in the root" Revert commit 1be171d60bdd ("memcg: track all children over limit in the root") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d5ff3ce13029..5ee0af463f2e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -851,15 +851,9 @@ static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) /* * Necessary to update all ancestors when hierarchy is used * because their event counter is not touched. - * We track children even outside the hierarchy for the root - * cgroup because tree walk starting at root should visit - * all cgroups and we want to prevent from pointless tree - * walk if no children is below the limit. */ while (delta && (parent = parent_mem_cgroup(parent))) atomic_add(delta, &parent->children_in_excess); - if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) - atomic_add(delta, &root_mem_cgroup->children_in_excess); spin_unlock(&memcg->soft_lock); } @@ -6112,9 +6106,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) if (memcg->soft_contributed) { while ((memcg = parent_mem_cgroup(memcg))) atomic_dec(&memcg->children_in_excess); - - if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) - atomic_dec(&root_mem_cgroup->children_in_excess); } mem_cgroup_destroy_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); -- cgit v1.2.3 From 3120055e869f2a208480f238680d097eec8f0e02 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:35 -0700 Subject: revert "memcg, vmscan: do not attempt soft limit reclaim if it would not scan anything" Revert commit e839b6a1c8d0 ("memcg, vmscan: do not attempt soft limit reclaim if it would not scan anything") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5ee0af463f2e..5cf7726764cc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1872,11 +1872,7 @@ enum mem_cgroup_filter_t mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, struct mem_cgroup *root) { - struct mem_cgroup *parent; - - if (!memcg) - memcg = root_mem_cgroup; - parent = memcg; + struct mem_cgroup *parent = memcg; if (res_counter_soft_limit_excess(&memcg->res)) return VISIT; -- cgit v1.2.3 From 30361e51cae7a4df3fec89f935a450a6fe6f16fa Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:36 -0700 Subject: revert "memcg: track children in soft limit excess to improve soft limit" Revert commit 7d910c054be4 ("memcg: track children in soft limit excess to improve soft limit") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 71 --------------------------------------------------------- 1 file changed, 71 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5cf7726764cc..916892c2b8e0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -124,7 +124,6 @@ static const char * const mem_cgroup_lru_names[] = { */ enum mem_cgroup_events_target { MEM_CGROUP_TARGET_THRESH, - MEM_CGROUP_TARGET_SOFTLIMIT, MEM_CGROUP_TARGET_NUMAINFO, MEM_CGROUP_NTARGETS, }; @@ -303,22 +302,6 @@ struct mem_cgroup { atomic_t numainfo_events; atomic_t numainfo_updating; #endif - /* - * Protects soft_contributed transitions. - * See mem_cgroup_update_soft_limit - */ - spinlock_t soft_lock; - - /* - * If true then this group has increased parents' children_in_excess - * when it got over the soft limit. - * When a group falls bellow the soft limit, parents' children_in_excess - * is decreased and soft_contributed changed to false. - */ - bool soft_contributed; - - /* Number of children that are in soft limit excess */ - atomic_t children_in_excess; struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ @@ -806,9 +789,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, case MEM_CGROUP_TARGET_THRESH: next = val + THRESHOLDS_EVENTS_TARGET; break; - case MEM_CGROUP_TARGET_SOFTLIMIT: - next = val + SOFTLIMIT_EVENTS_TARGET; - break; case MEM_CGROUP_TARGET_NUMAINFO: next = val + NUMAINFO_EVENTS_TARGET; break; @@ -821,42 +801,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, return false; } -/* - * Called from rate-limited memcg_check_events when enough - * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure - * that all the parents up the hierarchy will be notified that this group - * is in excess or that it is not in excess anymore. mmecg->soft_contributed - * makes the transition a single action whenever the state flips from one to - * the other. - */ -static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) -{ - unsigned long long excess = res_counter_soft_limit_excess(&memcg->res); - struct mem_cgroup *parent = memcg; - int delta = 0; - - spin_lock(&memcg->soft_lock); - if (excess) { - if (!memcg->soft_contributed) { - delta = 1; - memcg->soft_contributed = true; - } - } else { - if (memcg->soft_contributed) { - delta = -1; - memcg->soft_contributed = false; - } - } - - /* - * Necessary to update all ancestors when hierarchy is used - * because their event counter is not touched. - */ - while (delta && (parent = parent_mem_cgroup(parent))) - atomic_add(delta, &parent->children_in_excess); - spin_unlock(&memcg->soft_lock); -} - /* * Check events in order. * @@ -867,11 +811,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { - bool do_softlimit; bool do_numainfo __maybe_unused; - do_softlimit = mem_cgroup_event_ratelimit(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT); #if MAX_NUMNODES > 1 do_numainfo = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_NUMAINFO); @@ -879,8 +820,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); mem_cgroup_threshold(memcg); - if (unlikely(do_softlimit)) - mem_cgroup_update_soft_limit(memcg); #if MAX_NUMNODES > 1 if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); @@ -1864,9 +1803,6 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) * hierarchy if * a) it is over its soft limit * b) any parent up the hierarchy is over its soft limit - * - * If the given group doesn't have any children over the limit then it - * doesn't make any sense to iterate its subtree. */ enum mem_cgroup_filter_t mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, @@ -1888,8 +1824,6 @@ mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, break; } - if (!atomic_read(&memcg->children_in_excess)) - return SKIP_TREE; return SKIP; } @@ -6021,7 +5955,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); vmpressure_init(&memcg->vmpressure); - spin_lock_init(&memcg->soft_lock); return &memcg->css; @@ -6099,10 +6032,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) mem_cgroup_invalidate_reclaim_iterators(memcg); mem_cgroup_reparent_charges(memcg); - if (memcg->soft_contributed) { - while ((memcg = parent_mem_cgroup(memcg))) - atomic_dec(&memcg->children_in_excess); - } mem_cgroup_destroy_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } -- cgit v1.2.3 From 694fbc0fe78518d06efa63910bf4ecee660e7852 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:37 -0700 Subject: revert "memcg: enhance memcg iterator to support predicates" Revert commit de57780dc659 ("memcg: enhance memcg iterator to support predicates") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 70 +++++++++++++-------------------------------------------- 1 file changed, 15 insertions(+), 55 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 916892c2b8e0..65e7bec4b0f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -862,15 +862,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } -static enum mem_cgroup_filter_t -mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, - mem_cgroup_iter_filter cond) -{ - if (!cond) - return VISIT; - return cond(memcg, root); -} - /* * Returns a next (in a pre-order walk) alive memcg (with elevated css * ref. count) or NULL if the whole root's subtree has been visited. @@ -878,7 +869,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, * helper function to be used by mem_cgroup_iter */ static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, - struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) + struct mem_cgroup *last_visited) { struct cgroup_subsys_state *prev_css, *next_css; @@ -896,31 +887,11 @@ skip_node: if (next_css) { struct mem_cgroup *mem = mem_cgroup_from_css(next_css); - switch (mem_cgroup_filter(mem, root, cond)) { - case SKIP: + if (css_tryget(&mem->css)) + return mem; + else { prev_css = next_css; goto skip_node; - case SKIP_TREE: - if (mem == root) - return NULL; - /* - * css_rightmost_descendant is not an optimal way to - * skip through a subtree (especially for imbalanced - * trees leaning to right) but that's what we have right - * now. More effective solution would be traversing - * right-up for first non-NULL without calling - * css_next_descendant_pre afterwards. - */ - prev_css = css_rightmost_descendant(next_css); - goto skip_node; - case VISIT: - if (css_tryget(&mem->css)) - return mem; - else { - prev_css = next_css; - goto skip_node; - } - break; } } @@ -984,7 +955,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation * @reclaim: cookie for shared reclaim walks, NULL for full walks - * @cond: filter for visited nodes, NULL for no filter * * Returns references to children of the hierarchy below @root, or * @root itself, or %NULL after a full round-trip. @@ -997,18 +967,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * divide up the memcgs in the hierarchy among all concurrent * reclaimers operating on the same zone and priority. */ -struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim, - mem_cgroup_iter_filter cond) + struct mem_cgroup_reclaim_cookie *reclaim) { struct mem_cgroup *memcg = NULL; struct mem_cgroup *last_visited = NULL; - if (mem_cgroup_disabled()) { - /* first call must return non-NULL, second return NULL */ - return (struct mem_cgroup *)(unsigned long)!prev; - } + if (mem_cgroup_disabled()) + return NULL; if (!root) root = root_mem_cgroup; @@ -1019,9 +986,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) goto out_css_put; - if (mem_cgroup_filter(root, root, cond) == VISIT) - return root; - return NULL; + return root; } rcu_read_lock(); @@ -1044,7 +1009,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, last_visited = mem_cgroup_iter_load(iter, root, &seq); } - memcg = __mem_cgroup_iter_next(root, last_visited, cond); + memcg = __mem_cgroup_iter_next(root, last_visited); if (reclaim) { mem_cgroup_iter_update(iter, last_visited, memcg, seq); @@ -1055,11 +1020,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, reclaim->generation = iter->generation; } - /* - * We have finished the whole tree walk or no group has been - * visited because filter told us to skip the root node. - */ - if (!memcg && (prev || (cond && !last_visited))) + if (prev && !memcg) goto out_unlock; } out_unlock: @@ -1804,14 +1765,13 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) * a) it is over its soft limit * b) any parent up the hierarchy is over its soft limit */ -enum mem_cgroup_filter_t -mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, struct mem_cgroup *root) { struct mem_cgroup *parent = memcg; if (res_counter_soft_limit_excess(&memcg->res)) - return VISIT; + return true; /* * If any parent up to the root in the hierarchy is over its soft limit @@ -1819,12 +1779,12 @@ mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, */ while ((parent = parent_mem_cgroup(parent))) { if (res_counter_soft_limit_excess(&parent->res)) - return VISIT; + return true; if (parent == root) break; } - return SKIP; + return false; } static DEFINE_SPINLOCK(memcg_oom_lock); -- cgit v1.2.3 From b1aff7fcf86c88472b0a70f15d89d7a4adba07bb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:38 -0700 Subject: revert "vmscan, memcg: do softlimit reclaim also for targeted reclaim" Revert commit a5b7c87f9207 ("vmscan, memcg: do softlimit reclaim also for targeted reclaim") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 65e7bec4b0f0..47cdc7eb1a6b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1760,13 +1760,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) #endif /* - * A group is eligible for the soft limit reclaim under the given root - * hierarchy if - * a) it is over its soft limit + * A group is eligible for the soft limit reclaim if + * a) it is over its soft limit * b) any parent up the hierarchy is over its soft limit */ -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, - struct mem_cgroup *root) +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg) { struct mem_cgroup *parent = memcg; @@ -1774,14 +1772,12 @@ bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, return true; /* - * If any parent up to the root in the hierarchy is over its soft limit - * then we have to obey and reclaim from this group as well. + * If any parent up the hierarchy is over its soft limit then we + * have to obey and reclaim from this group as well. */ while ((parent = parent_mem_cgroup(parent))) { if (res_counter_soft_limit_excess(&parent->res)) return true; - if (parent == root) - break; } return false; -- cgit v1.2.3 From bb4cc1a8b5eaf3b9e5707d7c270400b05d11a2b7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:40 -0700 Subject: revert "memcg: get rid of soft-limit tree infrastructure" Revert commit e883110aad71 ("memcg: get rid of soft-limit tree infrastructure") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 265 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 263 insertions(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 47cdc7eb1a6b..852dbec07ce6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -124,6 +125,7 @@ static const char * const mem_cgroup_lru_names[] = { */ enum mem_cgroup_events_target { MEM_CGROUP_TARGET_THRESH, + MEM_CGROUP_TARGET_SOFTLIMIT, MEM_CGROUP_TARGET_NUMAINFO, MEM_CGROUP_NTARGETS, }; @@ -159,6 +161,10 @@ struct mem_cgroup_per_zone { struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; + struct rb_node tree_node; /* RB tree node */ + unsigned long long usage_in_excess;/* Set to the value by which */ + /* the soft limit is exceeded*/ + bool on_tree; struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; @@ -167,6 +173,26 @@ struct mem_cgroup_per_node { struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; }; +/* + * Cgroups above their limits are maintained in a RB-Tree, independent of + * their hierarchy representation + */ + +struct mem_cgroup_tree_per_zone { + struct rb_root rb_root; + spinlock_t lock; +}; + +struct mem_cgroup_tree_per_node { + struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; +}; + +struct mem_cgroup_tree { + struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; +}; + +static struct mem_cgroup_tree soft_limit_tree __read_mostly; + struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; u64 threshold; @@ -405,6 +431,7 @@ static bool move_file(void) * limit reclaim to prevent infinite loops, if they ever occur. */ #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 +#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, @@ -631,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) return mem_cgroup_zoneinfo(memcg, nid, zid); } +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_node_zone(int nid, int zid) +{ + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_from_page(struct page *page) +{ + int nid = page_to_nid(page); + int zid = page_zonenum(page); + + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static void +__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz, + unsigned long long new_usage_in_excess) +{ + struct rb_node **p = &mctz->rb_root.rb_node; + struct rb_node *parent = NULL; + struct mem_cgroup_per_zone *mz_node; + + if (mz->on_tree) + return; + + mz->usage_in_excess = new_usage_in_excess; + if (!mz->usage_in_excess) + return; + while (*p) { + parent = *p; + mz_node = rb_entry(parent, struct mem_cgroup_per_zone, + tree_node); + if (mz->usage_in_excess < mz_node->usage_in_excess) + p = &(*p)->rb_left; + /* + * We can't avoid mem cgroups that are over their soft + * limit by the same amount + */ + else if (mz->usage_in_excess >= mz_node->usage_in_excess) + p = &(*p)->rb_right; + } + rb_link_node(&mz->tree_node, parent, p); + rb_insert_color(&mz->tree_node, &mctz->rb_root); + mz->on_tree = true; +} + +static void +__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) +{ + if (!mz->on_tree) + return; + rb_erase(&mz->tree_node, &mctz->rb_root); + mz->on_tree = false; +} + +static void +mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) +{ + spin_lock(&mctz->lock); + __mem_cgroup_remove_exceeded(memcg, mz, mctz); + spin_unlock(&mctz->lock); +} + + +static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) +{ + unsigned long long excess; + struct mem_cgroup_per_zone *mz; + struct mem_cgroup_tree_per_zone *mctz; + int nid = page_to_nid(page); + int zid = page_zonenum(page); + mctz = soft_limit_tree_from_page(page); + + /* + * Necessary to update all ancestors when hierarchy is used. + * because their event counter is not touched. + */ + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + mz = mem_cgroup_zoneinfo(memcg, nid, zid); + excess = res_counter_soft_limit_excess(&memcg->res); + /* + * We have to update the tree if mz is on RB-tree or + * mem is over its softlimit. + */ + if (excess || mz->on_tree) { + spin_lock(&mctz->lock); + /* if on-tree, remove it */ + if (mz->on_tree) + __mem_cgroup_remove_exceeded(memcg, mz, mctz); + /* + * Insert again. mz->usage_in_excess will be updated. + * If excess is 0, no tree ops. + */ + __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); + spin_unlock(&mctz->lock); + } + } +} + +static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) +{ + int node, zone; + struct mem_cgroup_per_zone *mz; + struct mem_cgroup_tree_per_zone *mctz; + + for_each_node(node) { + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + mz = mem_cgroup_zoneinfo(memcg, node, zone); + mctz = soft_limit_tree_node_zone(node, zone); + mem_cgroup_remove_exceeded(memcg, mz, mctz); + } + } +} + +static struct mem_cgroup_per_zone * +__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +{ + struct rb_node *rightmost = NULL; + struct mem_cgroup_per_zone *mz; + +retry: + mz = NULL; + rightmost = rb_last(&mctz->rb_root); + if (!rightmost) + goto done; /* Nothing to reclaim from */ + + mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); + /* + * Remove the node now but someone else can add it back, + * we will to add it back at the end of reclaim to its correct + * position in the tree. + */ + __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + if (!res_counter_soft_limit_excess(&mz->memcg->res) || + !css_tryget(&mz->memcg->css)) + goto retry; +done: + return mz; +} + +static struct mem_cgroup_per_zone * +mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +{ + struct mem_cgroup_per_zone *mz; + + spin_lock(&mctz->lock); + mz = __mem_cgroup_largest_soft_limit_node(mctz); + spin_unlock(&mctz->lock); + return mz; +} + /* * Implementation Note: reading percpu statistics for memcg. * @@ -789,6 +974,9 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, case MEM_CGROUP_TARGET_THRESH: next = val + THRESHOLDS_EVENTS_TARGET; break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; case MEM_CGROUP_TARGET_NUMAINFO: next = val + NUMAINFO_EVENTS_TARGET; break; @@ -811,8 +999,11 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { + bool do_softlimit; bool do_numainfo __maybe_unused; + do_softlimit = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_SOFTLIMIT); #if MAX_NUMNODES > 1 do_numainfo = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_NUMAINFO); @@ -820,6 +1011,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); mem_cgroup_threshold(memcg); + if (unlikely(do_softlimit)) + mem_cgroup_update_tree(memcg, page); #if MAX_NUMNODES > 1 if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); @@ -1661,7 +1854,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, return total; } -#if MAX_NUMNODES > 1 /** * test_mem_cgroup_node_reclaimable * @memcg: the target memcg @@ -1684,6 +1876,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, return false; } +#if MAX_NUMNODES > 1 /* * Always updating the nodemask is not very good - even if we have an empty @@ -1751,12 +1944,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) return node; } +/* + * Check all nodes whether it contains reclaimable pages or not. + * For quick scan, we make use of scan_nodes. This will allow us to skip + * unused nodes. But scan_nodes is lazily updated and may not cotain + * enough new information. We need to do double check. + */ +static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) +{ + int nid; + + /* + * quick check...making use of scan_node. + * We can skip unused nodes. + */ + if (!nodes_empty(memcg->scan_nodes)) { + for (nid = first_node(memcg->scan_nodes); + nid < MAX_NUMNODES; + nid = next_node(nid, memcg->scan_nodes)) { + + if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) + return true; + } + } + /* + * Check rest of nodes. + */ + for_each_node_state(nid, N_MEMORY) { + if (node_isset(nid, memcg->scan_nodes)) + continue; + if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) + return true; + } + return false; +} + #else int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) { return 0; } +static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) +{ + return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); +} #endif /* @@ -2692,7 +2924,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, unlock_page_cgroup(pc); /* - * "charge_statistics" updated event counter. + * "charge_statistics" updated event counter. Then, check it. + * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. + * if they exceeds softlimit. */ memcg_check_events(memcg, page); } @@ -5791,6 +6025,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; lruvec_init(&mz->lruvec); + mz->usage_in_excess = 0; + mz->on_tree = false; mz->memcg = memcg; } memcg->nodeinfo[node] = pn; @@ -5846,6 +6082,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) int node; size_t size = memcg_size(); + mem_cgroup_remove_from_trees(memcg); free_css_id(&mem_cgroup_subsys, &memcg->css); for_each_node(node) @@ -5882,6 +6119,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(parent_mem_cgroup); +static void __init mem_cgroup_soft_limit_tree_init(void) +{ + struct mem_cgroup_tree_per_node *rtpn; + struct mem_cgroup_tree_per_zone *rtpz; + int tmp, node, zone; + + for_each_node(node) { + tmp = node; + if (!node_state(node, N_NORMAL_MEMORY)) + tmp = -1; + rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); + BUG_ON(!rtpn); + + soft_limit_tree.rb_tree_per_node[node] = rtpn; + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + rtpz = &rtpn->rb_tree_per_zone[zone]; + rtpz->rb_root = RB_ROOT; + spin_lock_init(&rtpz->lock); + } + } +} + static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -6662,6 +6922,7 @@ static int __init mem_cgroup_init(void) { hotcpu_notifier(memcg_cpu_hotplug_callback, 0); enable_swap_cgroup(); + mem_cgroup_soft_limit_tree_init(); memcg_stock_init(); return 0; } -- cgit v1.2.3 From 0608f43da64a1f1c42507304b5f25bc8b1227aa4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:41 -0700 Subject: revert "memcg, vmscan: integrate soft reclaim tighter with zone shrinking code" Revert commit 3b38722efd9f ("memcg, vmscan: integrate soft reclaim tighter with zone shrinking code") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 163 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 142 insertions(+), 21 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 852dbec07ce6..1c52ddbc839b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1991,28 +1991,57 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) } #endif -/* - * A group is eligible for the soft limit reclaim if - * a) it is over its soft limit - * b) any parent up the hierarchy is over its soft limit - */ -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg) -{ - struct mem_cgroup *parent = memcg; - - if (res_counter_soft_limit_excess(&memcg->res)) - return true; - - /* - * If any parent up the hierarchy is over its soft limit then we - * have to obey and reclaim from this group as well. - */ - while ((parent = parent_mem_cgroup(parent))) { - if (res_counter_soft_limit_excess(&parent->res)) - return true; +static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, + struct zone *zone, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + struct mem_cgroup *victim = NULL; + int total = 0; + int loop = 0; + unsigned long excess; + unsigned long nr_scanned; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = 0, + }; + + excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; + + while (1) { + victim = mem_cgroup_iter(root_memcg, victim, &reclaim); + if (!victim) { + loop++; + if (loop >= 2) { + /* + * If we have not been able to reclaim + * anything, it might because there are + * no reclaimable pages under this hierarchy + */ + if (!total) + break; + /* + * We want to do more targeted reclaim. + * excess >> 2 is not to excessive so as to + * reclaim too much, nor too less that we keep + * coming back to reclaim from this cgroup + */ + if (total >= (excess >> 2) || + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) + break; + } + continue; + } + if (!mem_cgroup_reclaimable(victim, false)) + continue; + total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, + zone, &nr_scanned); + *total_scanned += nr_scanned; + if (!res_counter_soft_limit_excess(&root_memcg->res)) + break; } - - return false; + mem_cgroup_iter_break(root_memcg, victim); + return total; } static DEFINE_SPINLOCK(memcg_oom_lock); @@ -4761,6 +4790,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, return ret; } +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + unsigned long nr_reclaimed = 0; + struct mem_cgroup_per_zone *mz, *next_mz = NULL; + unsigned long reclaimed; + int loop = 0; + struct mem_cgroup_tree_per_zone *mctz; + unsigned long long excess; + unsigned long nr_scanned; + + if (order > 0) + return 0; + + mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); + /* + * This loop can run a while, specially if mem_cgroup's continuously + * keep exceeding their soft limit and putting the system under + * pressure + */ + do { + if (next_mz) + mz = next_mz; + else + mz = mem_cgroup_largest_soft_limit_node(mctz); + if (!mz) + break; + + nr_scanned = 0; + reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, + gfp_mask, &nr_scanned); + nr_reclaimed += reclaimed; + *total_scanned += nr_scanned; + spin_lock(&mctz->lock); + + /* + * If we failed to reclaim anything from this memory cgroup + * it is time to move on to the next cgroup + */ + next_mz = NULL; + if (!reclaimed) { + do { + /* + * Loop until we find yet another one. + * + * By the time we get the soft_limit lock + * again, someone might have aded the + * group back on the RB tree. Iterate to + * make sure we get a different mem. + * mem_cgroup_largest_soft_limit_node returns + * NULL if no other cgroup is present on + * the tree + */ + next_mz = + __mem_cgroup_largest_soft_limit_node(mctz); + if (next_mz == mz) + css_put(&next_mz->memcg->css); + else /* next_mz == NULL or other memcg */ + break; + } while (1); + } + __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + excess = res_counter_soft_limit_excess(&mz->memcg->res); + /* + * One school of thought says that we should not add + * back the node to the tree if reclaim returns 0. + * But our reclaim could return 0, simply because due + * to priority we are exposing a smaller subset of + * memory to reclaim from. Consider this as a longer + * term TODO. + */ + /* If excess == 0, no tree ops */ + __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); + spin_unlock(&mctz->lock); + css_put(&mz->memcg->css); + loop++; + /* + * Could not reclaim anything and there are no more + * mem cgroups to try or we seem to be looping without + * reclaiming anything. + */ + if (!nr_reclaimed && + (next_mz == NULL || + loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) + break; + } while (!nr_reclaimed); + if (next_mz) + css_put(&next_mz->memcg->css); + return nr_reclaimed; +} + /** * mem_cgroup_force_empty_list - clears LRU of a group * @memcg: group to clear -- cgit v1.2.3