summaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c665
1 files changed, 499 insertions, 166 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 13b9d0f221b8..67dd2a881433 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,6 +45,7 @@
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
+#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
@@ -55,10 +56,12 @@
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
+#include <linux/file.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
+#include "slab.h"
#include <asm/uaccess.h>
@@ -226,6 +229,46 @@ struct mem_cgroup_eventfd_list {
struct eventfd_ctx *eventfd;
};
+/*
+ * cgroup_event represents events which userspace want to receive.
+ */
+struct mem_cgroup_event {
+ /*
+ * memcg which the event belongs to.
+ */
+ struct mem_cgroup *memcg;
+ /*
+ * eventfd to signal userspace about the event.
+ */
+ struct eventfd_ctx *eventfd;
+ /*
+ * Each of these stored in a list by the cgroup.
+ */
+ struct list_head list;
+ /*
+ * register_event() callback will be used to add new userspace
+ * waiter for changes related to this event. Use eventfd_signal()
+ * on eventfd to send notification to userspace.
+ */
+ int (*register_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args);
+ /*
+ * unregister_event() callback will be called when userspace closes
+ * the eventfd or on cgroup removing. This callback must be set,
+ * if you want provide notification functionality.
+ */
+ void (*unregister_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd);
+ /*
+ * All fields below needed to unregister event when
+ * userspace closes eventfd.
+ */
+ poll_table pt;
+ wait_queue_head_t *wqh;
+ wait_queue_t wait;
+ struct work_struct remove;
+};
+
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
@@ -312,7 +355,7 @@ struct mem_cgroup {
atomic_t dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
- struct tcp_memcontrol tcp_mem;
+ struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
/* analogous to slab_common's slab_caches list. per-memcg */
@@ -330,6 +373,10 @@ struct mem_cgroup {
atomic_t numainfo_updating;
#endif
+ /* List of events which userspace want to receive */
+ struct list_head event_list;
+ spinlock_t event_list_lock;
+
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
};
@@ -337,7 +384,7 @@ struct mem_cgroup {
static size_t memcg_size(void)
{
return sizeof(struct mem_cgroup) +
- nr_node_ids * sizeof(struct mem_cgroup_per_node);
+ nr_node_ids * sizeof(struct mem_cgroup_per_node *);
}
/* internal only representation about the status of kmem accounting. */
@@ -489,14 +536,32 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
-struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
+static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
- return &mem_cgroup_from_css(css)->vmpressure;
+ return (memcg == root_mem_cgroup);
}
-static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
+/*
+ * We restrict the id in the range of [1, 65535], so it can fit into
+ * an unsigned short.
+ */
+#define MEM_CGROUP_ID_MAX USHRT_MAX
+
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
- return (memcg == root_mem_cgroup);
+ /*
+ * The ID of the root cgroup is 0, but memcg treat 0 as an
+ * invalid ID, so we return (cgroup_id + 1).
+ */
+ return memcg->css.cgroup->id + 1;
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+ struct cgroup_subsys_state *css;
+
+ css = css_from_id(id - 1, &mem_cgroup_subsys);
+ return mem_cgroup_from_css(css);
}
/* Writing them here to avoid exposing memcg's inner layout */
@@ -551,13 +616,13 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
if (!memcg || mem_cgroup_is_root(memcg))
return NULL;
- return &memcg->tcp_mem.cg_proto;
+ return &memcg->tcp_mem;
}
EXPORT_SYMBOL(tcp_proto_cgroup);
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
- if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
+ if (!memcg_proto_activated(&memcg->tcp_mem))
return;
static_key_slow_dec(&memcg_socket_limit_enabled);
}
@@ -570,16 +635,11 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
/*
* This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
- * There are two main reasons for not using the css_id for this:
- * 1) this works better in sparse environments, where we have a lot of memcgs,
- * but only a few kmem-limited. Or also, if we have, for instance, 200
- * memcgs, and none but the 200th is kmem-limited, we'd have to have a
- * 200 entry array for that.
- *
- * 2) In order not to violate the cgroup API, we would like to do all memory
- * allocation in ->create(). At that point, we haven't yet allocated the
- * css_id. Having a separate index prevents us from messing with the cgroup
- * core for this
+ * The main reason for not using cgroup id for this:
+ * this works better in sparse environments, where we have a lot of memcgs,
+ * but only a few kmem-limited. Or also, if we have, for instance, 200
+ * memcgs, and none but the 200th is kmem-limited, we'd have to have a
+ * 200 entry array for that.
*
* The current size of the caches array is stored in
* memcg_limited_groups_array_size. It will double each time we have to
@@ -594,14 +654,14 @@ int memcg_limited_groups_array_size;
* cgroups is a reasonable guess. In the future, it could be a parameter or
* tunable, but that is strictly not necessary.
*
- * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
+ * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
* this constant directly from cgroup, but it is understandable that this is
* better kept as an internal representation in cgroup.c. In any case, the
- * css_id space is not getting any smaller, and we don't have to necessarily
+ * cgrp_id space is not getting any smaller, and we don't have to necessarily
* increase ours as well if it increases.
*/
#define MEMCG_CACHES_MIN_SIZE 4
-#define MEMCG_CACHES_MAX_SIZE 65535
+#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
/*
* A lot of the calls to the cache allocation functions are expected to be
@@ -1408,7 +1468,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
return true;
if (!root_memcg->use_hierarchy || !memcg)
return false;
- return css_is_ancestor(&memcg->css, &root_memcg->css);
+ return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
}
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
@@ -1628,13 +1688,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
- struct cgroup *task_cgrp;
- struct cgroup *mem_cgrp;
/*
- * Need a buffer in BSS, can't rely on allocations. The code relies
- * on the assumption that OOM is serialized for memory controller.
- * If this assumption is broken, revisit this code.
+ * protects memcg_name and makes sure that parallel ooms do not
+ * interleave
*/
+ static DEFINE_SPINLOCK(oom_info_lock);
+ struct cgroup *task_cgrp;
+ struct cgroup *mem_cgrp;
static char memcg_name[PATH_MAX];
int ret;
struct mem_cgroup *iter;
@@ -1643,6 +1703,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
if (!p)
return;
+ spin_lock(&oom_info_lock);
rcu_read_lock();
mem_cgrp = memcg->css.cgroup;
@@ -1711,6 +1772,7 @@ done:
pr_cont("\n");
}
+ spin_unlock(&oom_info_lock);
}
/*
@@ -2675,7 +2737,10 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
goto bypass;
if (unlikely(task_in_memcg_oom(current)))
- goto bypass;
+ goto nomem;
+
+ if (gfp_mask & __GFP_NOFAIL)
+ oom = false;
/*
* We always charge the cgroup the mm_struct belongs to.
@@ -2826,15 +2891,10 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
*/
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
- struct cgroup_subsys_state *css;
-
/* ID 0 is unused ID */
if (!id)
return NULL;
- css = css_lookup(&mem_cgroup_subsys, id);
- if (!css)
- return NULL;
- return mem_cgroup_from_css(css);
+ return mem_cgroup_from_id(id);
}
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2942,7 +3002,8 @@ static DEFINE_MUTEX(set_limit_mutex);
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
{
return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
- (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
+ (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK) ==
+ KMEM_ACCOUNTED_MASK;
}
/*
@@ -2955,14 +3016,13 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
VM_BUG_ON(p->is_root_cache);
cachep = p->root_cache;
- return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
+ return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
}
#ifdef CONFIG_SLABINFO
-static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *m)
+static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
struct memcg_cache_params *params;
if (!memcg_can_account_kmem(memcg))
@@ -2984,21 +3044,14 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
struct res_counter *fail_res;
struct mem_cgroup *_memcg;
int ret = 0;
- bool may_oom;
ret = res_counter_charge(&memcg->kmem, size, &fail_res);
if (ret)
return ret;
- /*
- * Conditions under which we can wait for the oom_killer. Those are
- * the same conditions tested by the core page allocator
- */
- may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
-
_memcg = memcg;
ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
- &_memcg, may_oom);
+ &_memcg, oom_gfp_allowed(gfp));
if (ret == -EINTR) {
/*
@@ -3076,7 +3129,7 @@ int memcg_cache_id(struct mem_cgroup *memcg)
* But when we create a new cache, we can call this as well if its parent
* is kmem-limited. That will have to hold set_limit_mutex as well.
*/
-int memcg_update_cache_sizes(struct mem_cgroup *memcg)
+static int memcg_update_cache_sizes(struct mem_cgroup *memcg)
{
int num, ret;
@@ -3138,7 +3191,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
{
struct memcg_cache_params *cur_params = s->memcg_params;
- VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
+ VM_BUG_ON(!is_root_cache(s));
if (num_groups > memcg_limited_groups_array_size) {
int i;
@@ -3399,7 +3452,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
idx = memcg_cache_id(memcg);
mutex_lock(&memcg_cache_mutex);
- new_cachep = cachep->memcg_params->memcg_caches[idx];
+ new_cachep = cache_from_memcg_idx(cachep, idx);
if (new_cachep) {
css_put(&memcg->css);
goto out;
@@ -3445,8 +3498,8 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
* we'll take the set_limit_mutex to protect ourselves against this.
*/
mutex_lock(&set_limit_mutex);
- for (i = 0; i < memcg_limited_groups_array_size; i++) {
- c = s->memcg_params->memcg_caches[i];
+ for_each_memcg_cache_index(i) {
+ c = cache_from_memcg_idx(s, i);
if (!c)
continue;
@@ -3579,8 +3632,8 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
* code updating memcg_caches will issue a write barrier to match this.
*/
read_barrier_depends();
- if (likely(cachep->memcg_params->memcg_caches[idx])) {
- cachep = cachep->memcg_params->memcg_caches[idx];
+ if (likely(cache_from_memcg_idx(cachep, idx))) {
+ cachep = cache_from_memcg_idx(cachep, idx);
goto out;
}
@@ -4350,7 +4403,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
* css_get() was called in uncharge().
*/
if (do_swap_account && swapout && memcg)
- swap_cgroup_record(ent, css_id(&memcg->css));
+ swap_cgroup_record(ent, mem_cgroup_id(memcg));
}
#endif
@@ -4402,8 +4455,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
{
unsigned short old_id, new_id;
- old_id = css_id(&from->css);
- new_id = css_id(&to->css);
+ old_id = mem_cgroup_id(from);
+ new_id = mem_cgroup_id(to);
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
mem_cgroup_swap_statistics(from, false);
@@ -5102,14 +5155,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
return val << PAGE_SHIFT;
}
-static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct file *file,
- char __user *buf, size_t nbytes, loff_t *ppos)
+static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- char str[64];
u64 val;
- int name, len;
+ int name;
enum res_type type;
type = MEMFILE_TYPE(cft->private);
@@ -5135,8 +5186,7 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
BUG();
}
- len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
- return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+ return val;
}
static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
@@ -5373,48 +5423,52 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
#endif
#ifdef CONFIG_NUMA
-static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *m)
+static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
- int nid;
- unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
- unsigned long node_nr;
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-
- total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
- seq_printf(m, "total=%lu", total_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
- seq_printf(m, " N%d=%lu", nid, node_nr);
- }
- seq_putc(m, '\n');
-
- file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
- seq_printf(m, "file=%lu", file_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- LRU_ALL_FILE);
- seq_printf(m, " N%d=%lu", nid, node_nr);
- }
- seq_putc(m, '\n');
+ struct numa_stat {
+ const char *name;
+ unsigned int lru_mask;
+ };
- anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
- seq_printf(m, "anon=%lu", anon_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- LRU_ALL_ANON);
- seq_printf(m, " N%d=%lu", nid, node_nr);
+ static const struct numa_stat stats[] = {
+ { "total", LRU_ALL },
+ { "file", LRU_ALL_FILE },
+ { "anon", LRU_ALL_ANON },
+ { "unevictable", BIT(LRU_UNEVICTABLE) },
+ };
+ const struct numa_stat *stat;
+ int nid;
+ unsigned long nr;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+ for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+ nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
+ seq_printf(m, "%s=%lu", stat->name, nr);
+ for_each_node_state(nid, N_MEMORY) {
+ nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask);
+ seq_printf(m, " N%d=%lu", nid, nr);
+ }
+ seq_putc(m, '\n');
+ }
+
+ for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+ struct mem_cgroup *iter;
+
+ nr = 0;
+ for_each_mem_cgroup_tree(iter, memcg)
+ nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
+ seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
+ for_each_node_state(nid, N_MEMORY) {
+ nr = 0;
+ for_each_mem_cgroup_tree(iter, memcg)
+ nr += mem_cgroup_node_nr_lru_pages(
+ iter, nid, stat->lru_mask);
+ seq_printf(m, " N%d=%lu", nid, nr);
+ }
+ seq_putc(m, '\n');
}
- seq_putc(m, '\n');
- unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
- seq_printf(m, "unevictable=%lu", unevictable_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- BIT(LRU_UNEVICTABLE));
- seq_printf(m, " N%d=%lu", nid, node_nr);
- }
- seq_putc(m, '\n');
return 0;
}
#endif /* CONFIG_NUMA */
@@ -5424,10 +5478,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
}
-static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft,
- struct seq_file *m)
+static int memcg_stat_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
struct mem_cgroup *mi;
unsigned int i;
@@ -5636,13 +5689,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
mem_cgroup_oom_notify_cb(iter);
}
-static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- enum res_type type = MEMFILE_TYPE(cft->private);
u64 threshold, usage;
int i, size, ret;
@@ -5719,13 +5770,23 @@ unlock:
return ret;
}
-static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd)
+static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+}
+
+static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+}
+
+static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, enum res_type type)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- enum res_type type = MEMFILE_TYPE(cft->private);
u64 usage;
int i, j, size;
@@ -5798,14 +5859,23 @@ unlock:
mutex_unlock(&memcg->thresholds_lock);
}
-static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+}
+
+static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+}
+
+static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_eventfd_list *event;
- enum res_type type = MEMFILE_TYPE(cft->private);
- BUG_ON(type != _OOM_TYPE);
event = kmalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
@@ -5823,14 +5893,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
return 0;
}
-static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd)
+static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_eventfd_list *ev, *tmp;
- enum res_type type = MEMFILE_TYPE(cft->private);
-
- BUG_ON(type != _OOM_TYPE);
spin_lock(&memcg_oom_lock);
@@ -5844,17 +5910,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
spin_unlock(&memcg_oom_lock);
}
-static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct cgroup_map_cb *cb)
+static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-
- cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
- if (atomic_read(&memcg->under_oom))
- cb->fill(cb, "under_oom", 1);
- else
- cb->fill(cb, "under_oom", 0);
+ seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
+ seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
return 0;
}
@@ -5947,41 +6008,261 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
}
#endif
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * "cgroup.event_control" implementation.
+ *
+ * This is way over-engineered. It tries to support fully configurable
+ * events for each user. Such level of flexibility is completely
+ * unnecessary especially in the light of the planned unified hierarchy.
+ *
+ * Please deprecate this and replace with something simpler if at all
+ * possible.
+ */
+
+/*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void memcg_event_remove(struct work_struct *work)
+{
+ struct mem_cgroup_event *event =
+ container_of(work, struct mem_cgroup_event, remove);
+ struct mem_cgroup *memcg = event->memcg;
+
+ remove_wait_queue(event->wqh, &event->wait);
+
+ event->unregister_event(memcg, event->eventfd);
+
+ /* Notify userspace the event is going away. */
+ eventfd_signal(event->eventfd, 1);
+
+ eventfd_ctx_put(event->eventfd);
+ kfree(event);
+ css_put(&memcg->css);
+}
+
+/*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct mem_cgroup_event *event =
+ container_of(wait, struct mem_cgroup_event, wait);
+ struct mem_cgroup *memcg = event->memcg;
+ unsigned long flags = (unsigned long)key;
+
+ if (flags & POLLHUP) {
+ /*
+ * If the event has been detached at cgroup removal, we
+ * can simply return knowing the other side will cleanup
+ * for us.
+ *
+ * We can't race against event freeing since the other
+ * side will require wqh->lock via remove_wait_queue(),
+ * which we hold.
+ */
+ spin_lock(&memcg->event_list_lock);
+ if (!list_empty(&event->list)) {
+ list_del_init(&event->list);
+ /*
+ * We are in atomic context, but cgroup_event_remove()
+ * may sleep, so we have to call it in workqueue.
+ */
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&memcg->event_list_lock);
+ }
+
+ return 0;
+}
+
+static void memcg_event_ptable_queue_proc(struct file *file,
+ wait_queue_head_t *wqh, poll_table *pt)
+{
+ struct mem_cgroup_event *event =
+ container_of(pt, struct mem_cgroup_event, pt);
+
+ event->wqh = wqh;
+ add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+static int memcg_write_event_control(struct cgroup_subsys_state *css,
+ struct cftype *cft, const char *buffer)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup_event *event;
+ struct cgroup_subsys_state *cfile_css;
+ unsigned int efd, cfd;
+ struct fd efile;
+ struct fd cfile;
+ const char *name;
+ char *endp;
+ int ret;
+
+ efd = simple_strtoul(buffer, &endp, 10);
+ if (*endp != ' ')
+ return -EINVAL;
+ buffer = endp + 1;
+
+ cfd = simple_strtoul(buffer, &endp, 10);
+ if ((*endp != ' ') && (*endp != '\0'))
+ return -EINVAL;
+ buffer = endp + 1;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+
+ event->memcg = memcg;
+ INIT_LIST_HEAD(&event->list);
+ init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
+ init_waitqueue_func_entry(&event->wait, memcg_event_wake);
+ INIT_WORK(&event->remove, memcg_event_remove);
+
+ efile = fdget(efd);
+ if (!efile.file) {
+ ret = -EBADF;
+ goto out_kfree;
+ }
+
+ event->eventfd = eventfd_ctx_fileget(efile.file);
+ if (IS_ERR(event->eventfd)) {
+ ret = PTR_ERR(event->eventfd);
+ goto out_put_efile;
+ }
+
+ cfile = fdget(cfd);
+ if (!cfile.file) {
+ ret = -EBADF;
+ goto out_put_eventfd;
+ }
+
+ /* the process need read permission on control file */
+ /* AV: shouldn't we check that it's been opened for read instead? */
+ ret = inode_permission(file_inode(cfile.file), MAY_READ);
+ if (ret < 0)
+ goto out_put_cfile;
+
+ /*
+ * Determine the event callbacks and set them in @event. This used
+ * to be done via struct cftype but cgroup core no longer knows
+ * about these events. The following is crude but the whole thing
+ * is for compatibility anyway.
+ *
+ * DO NOT ADD NEW FILES.
+ */
+ name = cfile.file->f_dentry->d_name.name;
+
+ if (!strcmp(name, "memory.usage_in_bytes")) {
+ event->register_event = mem_cgroup_usage_register_event;
+ event->unregister_event = mem_cgroup_usage_unregister_event;
+ } else if (!strcmp(name, "memory.oom_control")) {
+ event->register_event = mem_cgroup_oom_register_event;
+ event->unregister_event = mem_cgroup_oom_unregister_event;
+ } else if (!strcmp(name, "memory.pressure_level")) {
+ event->register_event = vmpressure_register_event;
+ event->unregister_event = vmpressure_unregister_event;
+ } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
+ event->register_event = memsw_cgroup_usage_register_event;
+ event->unregister_event = memsw_cgroup_usage_unregister_event;
+ } else {
+ ret = -EINVAL;
+ goto out_put_cfile;
+ }
+
+ /*
+ * Verify @cfile should belong to @css. Also, remaining events are
+ * automatically removed on cgroup destruction but the removal is
+ * asynchronous, so take an extra ref on @css.
+ */
+ rcu_read_lock();
+
+ ret = -EINVAL;
+ cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
+ &mem_cgroup_subsys);
+ if (cfile_css == css && css_tryget(css))
+ ret = 0;
+
+ rcu_read_unlock();
+ if (ret)
+ goto out_put_cfile;
+
+ ret = event->register_event(memcg, event->eventfd, buffer);
+ if (ret)
+ goto out_put_css;
+
+ efile.file->f_op->poll(efile.file, &event->pt);
+
+ spin_lock(&memcg->event_list_lock);
+ list_add(&event->list, &memcg->event_list);
+ spin_unlock(&memcg->event_list_lock);
+
+ fdput(cfile);
+ fdput(efile);
+
+ return 0;
+
+out_put_css:
+ css_put(css);
+out_put_cfile:
+ fdput(cfile);
+out_put_eventfd:
+ eventfd_ctx_put(event->eventfd);
+out_put_efile:
+ fdput(efile);
+out_kfree:
+ kfree(event);
+
+ return ret;
+}
+
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
- .read = mem_cgroup_read,
- .register_event = mem_cgroup_usage_register_event,
- .unregister_event = mem_cgroup_usage_unregister_event,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "soft_limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "failcnt",
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "stat",
- .read_seq_string = memcg_stat_show,
+ .seq_show = memcg_stat_show,
},
{
.name = "force_empty",
@@ -5994,6 +6275,12 @@ static struct cftype mem_cgroup_files[] = {
.read_u64 = mem_cgroup_hierarchy_read,
},
{
+ .name = "cgroup.event_control", /* XXX: for compat */
+ .write_string = memcg_write_event_control,
+ .flags = CFTYPE_NO_PREFIX,
+ .mode = S_IWUGO,
+ },
+ {
.name = "swappiness",
.read_u64 = mem_cgroup_swappiness_read,
.write_u64 = mem_cgroup_swappiness_write,
@@ -6005,21 +6292,17 @@ static struct cftype mem_cgroup_files[] = {
},
{
.name = "oom_control",
- .read_map = mem_cgroup_oom_control_read,
+ .seq_show = mem_cgroup_oom_control_read,
.write_u64 = mem_cgroup_oom_control_write,
- .register_event = mem_cgroup_oom_register_event,
- .unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
{
.name = "pressure_level",
- .register_event = vmpressure_register_event,
- .unregister_event = vmpressure_unregister_event,
},
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
- .read_seq_string = memcg_numa_stat_show,
+ .seq_show = memcg_numa_stat_show,
},
#endif
#ifdef CONFIG_MEMCG_KMEM
@@ -6027,29 +6310,29 @@ static struct cftype mem_cgroup_files[] = {
.name = "kmem.limit_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.failcnt",
.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
#ifdef CONFIG_SLABINFO
{
.name = "kmem.slabinfo",
- .read_seq_string = mem_cgroup_slabinfo_read,
+ .seq_show = mem_cgroup_slabinfo_read,
},
#endif
#endif
@@ -6061,27 +6344,25 @@ static struct cftype memsw_cgroup_files[] = {
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
- .read = mem_cgroup_read,
- .register_event = mem_cgroup_usage_register_event,
- .unregister_event = mem_cgroup_usage_unregister_event,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.failcnt",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{ }, /* terminate */
};
@@ -6166,7 +6447,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
size_t size = memcg_size();
mem_cgroup_remove_from_trees(memcg);
- free_css_id(&mem_cgroup_subsys, &memcg->css);
for_each_node(node)
free_mem_cgroup_per_zone_info(memcg, node);
@@ -6254,6 +6534,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
+ INIT_LIST_HEAD(&memcg->event_list);
+ spin_lock_init(&memcg->event_list_lock);
return &memcg->css;
@@ -6269,6 +6551,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
int error = 0;
+ if (css->cgroup->id > MEM_CGROUP_ID_MAX)
+ return -ENOSPC;
+
if (!parent)
return 0;
@@ -6326,6 +6611,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup_event *event, *tmp;
+
+ /*
+ * Unregister events and notify userspace.
+ * Notify userspace about cgroup removing only after rmdir of cgroup
+ * directory to avoid race between userspace and kernelspace.
+ */
+ spin_lock(&memcg->event_list_lock);
+ list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
+ list_del_init(&event->list);
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&memcg->event_list_lock);
kmem_cgroup_css_offline(memcg);
@@ -6338,6 +6636,42 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ /*
+ * XXX: css_offline() would be where we should reparent all
+ * memory to prepare the cgroup for destruction. However,
+ * memcg does not do css_tryget() and res_counter charging
+ * under the same RCU lock region, which means that charging
+ * could race with offlining. Offlining only happens to
+ * cgroups with no tasks in them but charges can show up
+ * without any tasks from the swapin path when the target
+ * memcg is looked up from the swapout record and not from the
+ * current task as it usually is. A race like this can leak
+ * charges and put pages with stale cgroup pointers into
+ * circulation:
+ *
+ * #0 #1
+ * lookup_swap_cgroup_id()
+ * rcu_read_lock()
+ * mem_cgroup_lookup()
+ * css_tryget()
+ * rcu_read_unlock()
+ * disable css_tryget()
+ * call_rcu()
+ * offline_css()
+ * reparent_charges()
+ * res_counter_charge()
+ * css_put()
+ * css_free()
+ * pc->mem_cgroup = dead memcg
+ * add page to lru
+ *
+ * The bulk of the charges are still moved in offline_css() to
+ * avoid pinning a lot of pages in case a long-term reference
+ * like a swapout record is deferring the css_free() to long
+ * after offlining. But this makes sure we catch any charges
+ * made after offlining:
+ */
+ mem_cgroup_reparent_charges(memcg);
memcg_destroy_kmem(memcg);
__mem_cgroup_free(memcg);
@@ -6540,7 +6874,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
}
/* There is a swap entry and a page doesn't exist or isn't charged */
if (ent.val && !ret &&
- css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
+ mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
ret = MC_TARGET_SWAP;
if (target)
target->ent = ent;
@@ -6591,10 +6925,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
@@ -6783,9 +7117,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
* to be unlocked in __split_huge_page_splitting(), where the main
* part of thp split is not executed yet.
*/
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (mc.precharge < HPAGE_PMD_NR) {
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
@@ -6802,7 +7136,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
put_page(page);
}
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
@@ -6960,7 +7294,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
.bind = mem_cgroup_bind,
.base_cftypes = mem_cgroup_files,
.early_init = 0,
- .use_id = 1,
};
#ifdef CONFIG_MEMCG_SWAP