From d1a4c0b37c296e600ffe08edb0db2dc1b8f550d7 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:04 +0000 Subject: tcp memory pressure controls This patch introduces memory pressure controls for the tcp protocol. It uses the generic socket memory pressure code introduced in earlier patches, and fills in the necessary data in cg_proto struct. Signed-off-by: Glauber Costa Reviewed-by: KAMEZAWA Hiroyuki CC: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/tcp_memcontrol.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 net/ipv4/tcp_memcontrol.c (limited to 'net/ipv4/tcp_memcontrol.c') diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c new file mode 100644 index 000000000000..4a68d2c24556 --- /dev/null +++ b/net/ipv4/tcp_memcontrol.c @@ -0,0 +1,74 @@ +#include +#include +#include +#include +#include + +static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto) +{ + return container_of(cg_proto, struct tcp_memcontrol, cg_proto); +} + +static void memcg_tcp_enter_memory_pressure(struct sock *sk) +{ + if (!sk->sk_cgrp->memory_pressure) + *sk->sk_cgrp->memory_pressure = 1; +} +EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure); + +int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) +{ + /* + * The root cgroup does not use res_counters, but rather, + * rely on the data already collected by the network + * subsystem + */ + struct res_counter *res_parent = NULL; + struct cg_proto *cg_proto, *parent_cg; + struct tcp_memcontrol *tcp; + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + + cg_proto = tcp_prot.proto_cgroup(memcg); + if (!cg_proto) + return 0; + + tcp = tcp_from_cgproto(cg_proto); + + tcp->tcp_prot_mem[0] = sysctl_tcp_mem[0]; + tcp->tcp_prot_mem[1] = sysctl_tcp_mem[1]; + tcp->tcp_prot_mem[2] = sysctl_tcp_mem[2]; + tcp->tcp_memory_pressure = 0; + + parent_cg = tcp_prot.proto_cgroup(parent); + if (parent_cg) + res_parent = parent_cg->memory_allocated; + + res_counter_init(&tcp->tcp_memory_allocated, res_parent); + percpu_counter_init(&tcp->tcp_sockets_allocated, 0); + + cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure; + cg_proto->memory_pressure = &tcp->tcp_memory_pressure; + cg_proto->sysctl_mem = tcp->tcp_prot_mem; + cg_proto->memory_allocated = &tcp->tcp_memory_allocated; + cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated; + cg_proto->memcg = memcg; + + return 0; +} +EXPORT_SYMBOL(tcp_init_cgroup); + +void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct cg_proto *cg_proto; + struct tcp_memcontrol *tcp; + + cg_proto = tcp_prot.proto_cgroup(memcg); + if (!cg_proto) + return; + + tcp = tcp_from_cgproto(cg_proto); + percpu_counter_destroy(&tcp->tcp_sockets_allocated); +} +EXPORT_SYMBOL(tcp_destroy_cgroup); -- cgit v1.2.3 From 3dc43e3e4d0b52197d3205214fe8f162f9e0c334 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:05 +0000 Subject: per-netns ipv4 sysctl_tcp_mem This patch allows each namespace to independently set up its levels for tcp memory pressure thresholds. This patch alone does not buy much: we need to make this values per group of process somehow. This is achieved in the patches that follows in this patchset. Signed-off-by: Glauber Costa Reviewed-by: KAMEZAWA Hiroyuki CC: David S. Miller CC: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/tcp_memcontrol.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_memcontrol.c') diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 4a68d2c24556..bfb0c2b8df46 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -1,6 +1,8 @@ #include #include #include +#include +#include #include #include @@ -28,6 +30,7 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) struct tcp_memcontrol *tcp; struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct net *net = current->nsproxy->net_ns; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) @@ -35,9 +38,9 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) tcp = tcp_from_cgproto(cg_proto); - tcp->tcp_prot_mem[0] = sysctl_tcp_mem[0]; - tcp->tcp_prot_mem[1] = sysctl_tcp_mem[1]; - tcp->tcp_prot_mem[2] = sysctl_tcp_mem[2]; + tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0]; + tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1]; + tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2]; tcp->tcp_memory_pressure = 0; parent_cg = tcp_prot.proto_cgroup(parent); -- cgit v1.2.3 From 3aaabe2342c36bf48567b88fa78b819eee14bb5e Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:06 +0000 Subject: tcp buffer limitation: per-cgroup limit This patch uses the "tcp.limit_in_bytes" field of the kmem_cgroup to effectively control the amount of kernel memory pinned by a cgroup. This value is ignored in the root cgroup, and in all others, caps the value specified by the admin in the net namespaces' view of tcp_sysctl_mem. If namespaces are being used, the admin is allowed to set a value bigger than cgroup's maximum, the same way it is allowed to set pretty much unlimited values in a real box. Signed-off-by: Glauber Costa Reviewed-by: Hiroyouki Kamezawa CC: David S. Miller CC: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/tcp_memcontrol.c | 137 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_memcontrol.c') diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index bfb0c2b8df46..e3533903409d 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -6,6 +6,19 @@ #include #include +static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft); +static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, + const char *buffer); + +static struct cftype tcp_files[] = { + { + .name = "kmem.tcp.limit_in_bytes", + .write_string = tcp_cgroup_write, + .read_u64 = tcp_cgroup_read, + .private = RES_LIMIT, + }, +}; + static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto) { return container_of(cg_proto, struct tcp_memcontrol, cg_proto); @@ -34,7 +47,7 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) - return 0; + goto create_files; tcp = tcp_from_cgproto(cg_proto); @@ -57,7 +70,9 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated; cg_proto->memcg = memcg; - return 0; +create_files: + return cgroup_add_files(cgrp, ss, tcp_files, + ARRAY_SIZE(tcp_files)); } EXPORT_SYMBOL(tcp_init_cgroup); @@ -66,6 +81,7 @@ void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct cg_proto *cg_proto; struct tcp_memcontrol *tcp; + u64 val; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) @@ -73,5 +89,122 @@ void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) tcp = tcp_from_cgproto(cg_proto); percpu_counter_destroy(&tcp->tcp_sockets_allocated); + + val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); + + if (val != RESOURCE_MAX) + jump_label_dec(&memcg_socket_limit_enabled); } EXPORT_SYMBOL(tcp_destroy_cgroup); + +static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) +{ + struct net *net = current->nsproxy->net_ns; + struct tcp_memcontrol *tcp; + struct cg_proto *cg_proto; + u64 old_lim; + int i; + int ret; + + cg_proto = tcp_prot.proto_cgroup(memcg); + if (!cg_proto) + return -EINVAL; + + if (val > RESOURCE_MAX) + val = RESOURCE_MAX; + + tcp = tcp_from_cgproto(cg_proto); + + old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); + ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val); + if (ret) + return ret; + + for (i = 0; i < 3; i++) + tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, + net->ipv4.sysctl_tcp_mem[i]); + + if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX) + jump_label_dec(&memcg_socket_limit_enabled); + else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX) + jump_label_inc(&memcg_socket_limit_enabled); + + return 0; +} + +static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, + const char *buffer) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + unsigned long long val; + int ret = 0; + + switch (cft->private) { + case RES_LIMIT: + /* see memcontrol.c */ + ret = res_counter_memparse_write_strategy(buffer, &val); + if (ret) + break; + ret = tcp_update_limit(memcg, val); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) +{ + struct tcp_memcontrol *tcp; + struct cg_proto *cg_proto; + + cg_proto = tcp_prot.proto_cgroup(memcg); + if (!cg_proto) + return default_val; + + tcp = tcp_from_cgproto(cg_proto); + return res_counter_read_u64(&tcp->tcp_memory_allocated, type); +} + +static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + u64 val; + + switch (cft->private) { + case RES_LIMIT: + val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX); + break; + default: + BUG(); + } + return val; +} + +unsigned long long tcp_max_memory(const struct mem_cgroup *memcg) +{ + struct tcp_memcontrol *tcp; + struct cg_proto *cg_proto; + + cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg); + if (!cg_proto) + return 0; + + tcp = tcp_from_cgproto(cg_proto); + return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); +} + +void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx) +{ + struct tcp_memcontrol *tcp; + struct cg_proto *cg_proto; + + cg_proto = tcp_prot.proto_cgroup(memcg); + if (!cg_proto) + return; + + tcp = tcp_from_cgproto(cg_proto); + + tcp->tcp_prot_mem[idx] = val; +} -- cgit v1.2.3 From 5a6dd343770d2ae2c25f7a4b1998c091e6252f42 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:07 +0000 Subject: Display current tcp memory allocation in kmem cgroup This patch introduces kmem.tcp.usage_in_bytes file, living in the kmem_cgroup filesystem. It is a simple read-only file that displays the amount of kernel memory currently consumed by the cgroup. Signed-off-by: Glauber Costa Reviewed-by: Hiroyouki Kamezawa CC: David S. Miller CC: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/tcp_memcontrol.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'net/ipv4/tcp_memcontrol.c') diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index e3533903409d..9481f235803e 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -17,6 +17,11 @@ static struct cftype tcp_files[] = { .read_u64 = tcp_cgroup_read, .private = RES_LIMIT, }, + { + .name = "kmem.tcp.usage_in_bytes", + .read_u64 = tcp_cgroup_read, + .private = RES_USAGE, + }, }; static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto) @@ -167,6 +172,19 @@ static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) return res_counter_read_u64(&tcp->tcp_memory_allocated, type); } +static u64 tcp_read_usage(struct mem_cgroup *memcg) +{ + struct tcp_memcontrol *tcp; + struct cg_proto *cg_proto; + + cg_proto = tcp_prot.proto_cgroup(memcg); + if (!cg_proto) + return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT; + + tcp = tcp_from_cgproto(cg_proto); + return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); +} + static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); @@ -176,6 +194,9 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) case RES_LIMIT: val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX); break; + case RES_USAGE: + val = tcp_read_usage(memcg); + break; default: BUG(); } -- cgit v1.2.3 From ffea59e50494198a0db4d6ad8f6721b8fd994f65 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:08 +0000 Subject: Display current tcp failcnt in kmem cgroup This patch introduces kmem.tcp.failcnt file, living in the kmem_cgroup filesystem. Following the pattern in the other memcg resources, this files keeps a counter of how many times allocation failed due to limits being hit in this cgroup. The root cgroup will always show a failcnt of 0. Signed-off-by: Glauber Costa Reviewed-by: Hiroyouki Kamezawa CC: David S. Miller CC: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/tcp_memcontrol.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'net/ipv4/tcp_memcontrol.c') diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 9481f235803e..d438fba31ba8 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -9,6 +9,7 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft); static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, const char *buffer); +static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event); static struct cftype tcp_files[] = { { @@ -22,6 +23,12 @@ static struct cftype tcp_files[] = { .read_u64 = tcp_cgroup_read, .private = RES_USAGE, }, + { + .name = "kmem.tcp.failcnt", + .private = RES_FAILCNT, + .trigger = tcp_cgroup_reset, + .read_u64 = tcp_cgroup_read, + }, }; static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto) @@ -197,12 +204,36 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) case RES_USAGE: val = tcp_read_usage(memcg); break; + case RES_FAILCNT: + val = tcp_read_stat(memcg, RES_FAILCNT, 0); + break; default: BUG(); } return val; } +static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event) +{ + struct mem_cgroup *memcg; + struct tcp_memcontrol *tcp; + struct cg_proto *cg_proto; + + memcg = mem_cgroup_from_cont(cont); + cg_proto = tcp_prot.proto_cgroup(memcg); + if (!cg_proto) + return 0; + tcp = tcp_from_cgproto(cg_proto); + + switch (event) { + case RES_FAILCNT: + res_counter_reset_failcnt(&tcp->tcp_memory_allocated); + break; + } + + return 0; +} + unsigned long long tcp_max_memory(const struct mem_cgroup *memcg) { struct tcp_memcontrol *tcp; -- cgit v1.2.3 From 0850f0f5c54261a6236f013e8bac154bcce424d6 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:09 +0000 Subject: Display maximum tcp memory allocation in kmem cgroup This patch introduces kmem.tcp.max_usage_in_bytes file, living in the kmem_cgroup filesystem. The root cgroup will display a value equal to RESOURCE_MAX. This is to avoid introducing any locking schemes in the network paths when cgroups are not being actively used. All others, will see the maximum memory ever used by this cgroup. Signed-off-by: Glauber Costa Reviewed-by: Hiroyouki Kamezawa CC: David S. Miller CC: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/tcp_memcontrol.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_memcontrol.c') diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index d438fba31ba8..171d7b64f803 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -29,6 +29,12 @@ static struct cftype tcp_files[] = { .trigger = tcp_cgroup_reset, .read_u64 = tcp_cgroup_read, }, + { + .name = "kmem.tcp.max_usage_in_bytes", + .private = RES_MAX_USAGE, + .trigger = tcp_cgroup_reset, + .read_u64 = tcp_cgroup_read, + }, }; static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto) @@ -205,7 +211,8 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) val = tcp_read_usage(memcg); break; case RES_FAILCNT: - val = tcp_read_stat(memcg, RES_FAILCNT, 0); + case RES_MAX_USAGE: + val = tcp_read_stat(memcg, cft->private, 0); break; default: BUG(); @@ -226,6 +233,9 @@ static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event) tcp = tcp_from_cgproto(cg_proto); switch (event) { + case RES_MAX_USAGE: + res_counter_reset_max(&tcp->tcp_memory_allocated); + break; case RES_FAILCNT: res_counter_reset_failcnt(&tcp->tcp_memory_allocated); break; -- cgit v1.2.3 From c48e074c7c75c0de4652ea5f5bf4e74c8cf4e3dd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 15 Dec 2011 01:05:10 +0000 Subject: tcp_memcontrol: fix reversed if condition We should only dereference the pointer if it's valid, not the other way round. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/ipv4/tcp_memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_memcontrol.c') diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 171d7b64f803..7fed04f875c1 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -44,7 +44,7 @@ static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto) static void memcg_tcp_enter_memory_pressure(struct sock *sk) { - if (!sk->sk_cgrp->memory_pressure) + if (sk->sk_cgrp->memory_pressure) *sk->sk_cgrp->memory_pressure = 1; } EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure); -- cgit v1.2.3 From 1398eee08222a038fa5f017900f387e81f6e3ff4 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Thu, 12 Jan 2012 02:16:06 +0000 Subject: net: decrement memcg jump label when limit, not usage, is changed The logic of the current code is that whenever we destroy a cgroup that had its limit set (set meaning different than maximum), we should decrement the jump_label counter. Otherwise we assume it was never incremented. But what the code actually does is test for RES_USAGE instead of RES_LIMIT. Usage being different than maximum is likely to be true most of the time. The effect of this is that the key must become negative, and since the jump_label test says: !!atomic_read(&key->enabled); we'll have jump_labels still on when no one else is using this functionality. Signed-off-by: Glauber Costa CC: David S. Miller Signed-off-by: David S. Miller --- net/ipv4/tcp_memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_memcontrol.c') diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 7fed04f875c1..49978788a9dc 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -108,7 +108,7 @@ void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) tcp = tcp_from_cgproto(cg_proto); percpu_counter_destroy(&tcp->tcp_sockets_allocated); - val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); + val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); if (val != RESOURCE_MAX) jump_label_dec(&memcg_socket_limit_enabled); -- cgit v1.2.3