From e56c57d0d3fdbbdf583d3af96bfb803b8dfa713e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Nov 2011 17:07:07 -0500 Subject: net: rename sk_clone to sk_clone_lock Make clear that sk_clone() and inet_csk_clone() return a locked socket. Add _lock() prefix and kerneldoc. Suggested-by: Linus Torvalds Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 4ed7b1d12f5e..2de9dc295956 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1204,7 +1204,14 @@ void sk_release_kernel(struct sock *sk) } EXPORT_SYMBOL(sk_release_kernel); -struct sock *sk_clone(const struct sock *sk, const gfp_t priority) +/** + * sk_clone_lock - clone a socket, and lock its clone + * @sk: the socket to clone + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * + * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + */ +struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { struct sock *newsk; @@ -1297,7 +1304,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) out: return newsk; } -EXPORT_SYMBOL_GPL(sk_clone); +EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { -- cgit v1.2.3 From 6e3e939f3b1bf8534b32ad09ff199d88800835a0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 9 Nov 2011 10:15:42 +0100 Subject: net: add wireless TX status socket option The 802.1X EAPOL handshake hostapd does requires knowing whether the frame was ack'ed by the peer. Currently, we fudge this pretty badly by not even transmitting the frame as a normal data frame but injecting it with radiotap and getting the status out of radiotap monitor as well. This is rather complex, confuses users (mon.wlan0 presence) and doesn't work with all hardware. To get rid of that hack, introduce a real wifi TX status option for data frame transmissions. This works similar to the existing TX timestamping in that it reflects the SKB back to the socket's error queue with a SCM_WIFI_STATUS cmsg that has an int indicating ACK status (0/1). Since it is possible that at some point we will want to have TX timestamping and wifi status in a single errqueue SKB (there's little point in not doing that), redefine SO_EE_ORIGIN_TIMESTAMPING to SO_EE_ORIGIN_TXSTATUS which can collect more than just the timestamp; keep the old constant as an alias of course. Currently the internal APIs don't make that possible, but it wouldn't be hard to split them up in a way that makes it possible. Thanks to Neil Horman for helping me figure out the functions that add the control messages. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/core/sock.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 4ed7b1d12f5e..cbdf51c0d5ac 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -740,6 +740,11 @@ set_rcvbuf: case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; + + case SO_WIFI_STATUS: + sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); + break; + default: ret = -ENOPROTOOPT; break; @@ -961,6 +966,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = !!sock_flag(sk, SOCK_RXQ_OVFL); break; + case SO_WIFI_STATUS: + v.val = !!sock_flag(sk, SOCK_WIFI_STATUS); + break; + default: return -ENOPROTOOPT; } -- cgit v1.2.3 From 5bc1421e34ecfe0bd4b26dc3232b7d5e25179144 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 22 Nov 2011 05:10:51 +0000 Subject: net: add network priority cgroup infrastructure (v4) This patch adds in the infrastructure code to create the network priority cgroup. The cgroup, in addition to the standard processes file creates two control files: 1) prioidx - This is a read-only file that exports the index of this cgroup. This is a value that is both arbitrary and unique to a cgroup in this subsystem, and is used to index the per-device priority map 2) priomap - This is a writeable file. On read it reports a table of 2-tuples where name is the name of a network interface and priority is indicates the priority assigned to frames egresessing on the named interface and originating from a pid in this cgroup This cgroup allows for skb priority to be set prior to a root qdisc getting selected. This is benenficial for DCB enabled systems, in that it allows for any application to use dcb configured priorities so without application modification Signed-off-by: Neil Horman Signed-off-by: John Fastabend CC: Robert Love CC: "David S. Miller" Signed-off-by: David S. Miller --- net/core/sock.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 9a8b3fac1401..16069139797c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -125,6 +125,7 @@ #include #include #include +#include #include @@ -221,10 +222,16 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); -#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP) +#if defined(CONFIG_CGROUPS) +#if !defined(CONFIG_NET_CLS_CGROUP) int net_cls_subsys_id = -1; EXPORT_SYMBOL_GPL(net_cls_subsys_id); #endif +#if !defined(CONFIG_NETPRIO_CGROUP) +int net_prio_subsys_id = -1; +EXPORT_SYMBOL_GPL(net_prio_subsys_id); +#endif +#endif static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) { @@ -1120,6 +1127,18 @@ void sock_update_classid(struct sock *sk) sk->sk_classid = classid; } EXPORT_SYMBOL(sock_update_classid); + +void sock_update_netprioidx(struct sock *sk) +{ + struct cgroup_netprio_state *state; + if (in_interrupt()) + return; + rcu_read_lock(); + state = task_netprio_state(current); + sk->sk_cgrp_prioidx = state ? state->prioidx : 0; + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(sock_update_netprioidx); #endif /** @@ -1147,6 +1166,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, atomic_set(&sk->sk_wmem_alloc, 1); sock_update_classid(sk); + sock_update_netprioidx(sk); } return sk; -- cgit v1.2.3 From 08e29af3a9096ffdff477e537daea67faefd3952 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 28 Nov 2011 12:04:18 +0000 Subject: net: optimize socket timestamping We can test/set multiple bits from sk_flags at once, to shorten a bit socket setup/dismantle phase. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 16069139797c..9777da86aeac 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -276,14 +276,14 @@ static void sock_warn_obsolete_bsdism(const char *name) } } -static void sock_disable_timestamp(struct sock *sk, int flag) +#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) + +static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { - if (sock_flag(sk, flag)) { - sock_reset_flag(sk, flag); - if (!sock_flag(sk, SOCK_TIMESTAMP) && - !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) { + if (sk->sk_flags & flags) { + sk->sk_flags &= ~flags; + if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP)) net_disable_timestamp(); - } } } @@ -689,7 +689,7 @@ set_rcvbuf: SOCK_TIMESTAMPING_RX_SOFTWARE); else sock_disable_timestamp(sk, - SOCK_TIMESTAMPING_RX_SOFTWARE); + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE, val & SOF_TIMESTAMPING_SOFTWARE); sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE, @@ -1187,8 +1187,7 @@ static void __sk_free(struct sock *sk) RCU_INIT_POINTER(sk->sk_filter, NULL); } - sock_disable_timestamp(sk, SOCK_TIMESTAMP); - sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); + sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); if (atomic_read(&sk->sk_omem_alloc)) printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n", @@ -1326,8 +1325,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) if (newsk->sk_prot->sockets_allocated) percpu_counter_inc(newsk->sk_prot->sockets_allocated); - if (sock_flag(newsk, SOCK_TIMESTAMP) || - sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE)) + if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); } out: @@ -2165,16 +2163,15 @@ EXPORT_SYMBOL(sock_get_timestampns); void sock_enable_timestamp(struct sock *sk, int flag) { if (!sock_flag(sk, flag)) { + unsigned long previous_flags = sk->sk_flags; + sock_set_flag(sk, flag); /* * we just set one of the two flags which require net * time stamping, but time stamping might have been on * already because of the other one */ - if (!sock_flag(sk, - flag == SOCK_TIMESTAMP ? - SOCK_TIMESTAMPING_RX_SOFTWARE : - SOCK_TIMESTAMP)) + if (!(previous_flags & SK_FLAGS_TIMESTAMP)) net_enable_timestamp(); } } -- cgit v1.2.3 From 180d8cd942ce336b2c869d324855c40c5db478ad Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:02 +0000 Subject: foundations of per-cgroup memory pressure controlling. This patch replaces all uses of struct sock fields' memory_pressure, memory_allocated, sockets_allocated, and sysctl_mem to acessor macros. Those macros can either receive a socket argument, or a mem_cgroup argument, depending on the context they live in. Since we're only doing a macro wrapping here, no performance impact at all is expected in the case where we don't have cgroups disabled. Signed-off-by: Glauber Costa Reviewed-by: Hiroyouki Kamezawa CC: David S. Miller CC: Eric W. Biederman CC: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 57 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 23 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 9777da86aeac..a3d4205e7238 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1323,7 +1323,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_wq = NULL; if (newsk->sk_prot->sockets_allocated) - percpu_counter_inc(newsk->sk_prot->sockets_allocated); + sk_sockets_allocated_inc(newsk); if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); @@ -1713,28 +1713,28 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) long allocated; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - allocated = atomic_long_add_return(amt, prot->memory_allocated); + + allocated = sk_memory_allocated_add(sk, amt); /* Under limit. */ - if (allocated <= prot->sysctl_mem[0]) { - if (prot->memory_pressure && *prot->memory_pressure) - *prot->memory_pressure = 0; + if (allocated <= sk_prot_mem_limits(sk, 0)) { + sk_leave_memory_pressure(sk); return 1; } /* Under pressure. */ - if (allocated > prot->sysctl_mem[1]) - if (prot->enter_memory_pressure) - prot->enter_memory_pressure(sk); + if (allocated > sk_prot_mem_limits(sk, 1)) + sk_enter_memory_pressure(sk); /* Over hard limit. */ - if (allocated > prot->sysctl_mem[2]) + if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; /* guarantee minimum buffer size under pressure */ if (kind == SK_MEM_RECV) { if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0]) return 1; + } else { /* SK_MEM_SEND */ if (sk->sk_type == SOCK_STREAM) { if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) @@ -1744,13 +1744,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) return 1; } - if (prot->memory_pressure) { + if (sk_has_memory_pressure(sk)) { int alloc; - if (!*prot->memory_pressure) + if (!sk_under_memory_pressure(sk)) return 1; - alloc = percpu_counter_read_positive(prot->sockets_allocated); - if (prot->sysctl_mem[2] > alloc * + alloc = sk_sockets_allocated_read_positive(sk); + if (sk_prot_mem_limits(sk, 2) > alloc * sk_mem_pages(sk->sk_wmem_queued + atomic_read(&sk->sk_rmem_alloc) + sk->sk_forward_alloc)) @@ -1773,7 +1773,9 @@ suppress_allocation: /* Alas. Undo changes. */ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - atomic_long_sub(amt, prot->memory_allocated); + + sk_memory_allocated_sub(sk, amt); + return 0; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -1784,15 +1786,13 @@ EXPORT_SYMBOL(__sk_mem_schedule); */ void __sk_mem_reclaim(struct sock *sk) { - struct proto *prot = sk->sk_prot; - - atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, - prot->memory_allocated); + sk_memory_allocated_sub(sk, + sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; - if (prot->memory_pressure && *prot->memory_pressure && - (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0])) - *prot->memory_pressure = 0; + if (sk_under_memory_pressure(sk) && + (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) + sk_leave_memory_pressure(sk); } EXPORT_SYMBOL(__sk_mem_reclaim); @@ -2507,16 +2507,27 @@ static char proto_method_implemented(const void *method) { return method == NULL ? 'n' : 'y'; } +static long sock_prot_memory_allocated(struct proto *proto) +{ + return proto->memory_allocated != NULL ? proto_memory_allocated(proto): -1L; +} + +static char *sock_prot_memory_pressure(struct proto *proto) +{ + return proto->memory_pressure != NULL ? + proto_memory_pressure(proto) ? "yes" : "no" : "NI"; +} static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { + seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), - proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L, - proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", + sock_prot_memory_allocated(proto), + sock_prot_memory_pressure(proto), proto->max_header, proto->slab == NULL ? "no" : "yes", module_name(proto->owner), -- cgit v1.2.3 From e1aab161e0135aafcd439be20b4f35e4b0922d95 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:03 +0000 Subject: socket: initial cgroup code. The goal of this work is to move the memory pressure tcp controls to a cgroup, instead of just relying on global conditions. To avoid excessive overhead in the network fast paths, the code that accounts allocated memory to a cgroup is hidden inside a static_branch(). This branch is patched out until the first non-root cgroup is created. So when nobody is using cgroups, even if it is mounted, no significant performance penalty should be seen. This patch handles the generic part of the code, and has nothing tcp-specific. Signed-off-by: Glauber Costa Reviewed-by: KAMEZAWA Hiroyuki CC: Kirill A. Shutemov CC: David S. Miller CC: Eric W. Biederman CC: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index a3d4205e7238..6a871b8fdd20 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -111,6 +111,7 @@ #include #include #include +#include #include #include @@ -142,6 +143,9 @@ static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; +struct jump_label_key memcg_socket_limit_enabled; +EXPORT_SYMBOL(memcg_socket_limit_enabled); + /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket @@ -1711,23 +1715,27 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) struct proto *prot = sk->sk_prot; int amt = sk_mem_pages(size); long allocated; + int parent_status = UNDER_LIMIT; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - allocated = sk_memory_allocated_add(sk, amt); + allocated = sk_memory_allocated_add(sk, amt, &parent_status); /* Under limit. */ - if (allocated <= sk_prot_mem_limits(sk, 0)) { + if (parent_status == UNDER_LIMIT && + allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); return 1; } - /* Under pressure. */ - if (allocated > sk_prot_mem_limits(sk, 1)) + /* Under pressure. (we or our parents) */ + if ((parent_status > SOFT_LIMIT) || + allocated > sk_prot_mem_limits(sk, 1)) sk_enter_memory_pressure(sk); - /* Over hard limit. */ - if (allocated > sk_prot_mem_limits(sk, 2)) + /* Over hard limit (we or our parents) */ + if ((parent_status == OVER_LIMIT) || + (allocated > sk_prot_mem_limits(sk, 2))) goto suppress_allocation; /* guarantee minimum buffer size under pressure */ @@ -1774,7 +1782,7 @@ suppress_allocation: /* Alas. Undo changes. */ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - sk_memory_allocated_sub(sk, amt); + sk_memory_allocated_sub(sk, amt, parent_status); return 0; } @@ -1787,7 +1795,7 @@ EXPORT_SYMBOL(__sk_mem_schedule); void __sk_mem_reclaim(struct sock *sk) { sk_memory_allocated_sub(sk, - sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); + sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 0); sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; if (sk_under_memory_pressure(sk) && -- cgit v1.2.3 From d1a4c0b37c296e600ffe08edb0db2dc1b8f550d7 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:04 +0000 Subject: tcp memory pressure controls This patch introduces memory pressure controls for the tcp protocol. It uses the generic socket memory pressure code introduced in earlier patches, and fills in the necessary data in cg_proto struct. Signed-off-by: Glauber Costa Reviewed-by: KAMEZAWA Hiroyuki CC: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/sock.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 6a871b8fdd20..5a6a90620656 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -136,6 +136,46 @@ #include #endif +static DEFINE_RWLOCK(proto_list_lock); +static LIST_HEAD(proto_list); + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss) +{ + struct proto *proto; + int ret = 0; + + read_lock(&proto_list_lock); + list_for_each_entry(proto, &proto_list, node) { + if (proto->init_cgroup) { + ret = proto->init_cgroup(cgrp, ss); + if (ret) + goto out; + } + } + + read_unlock(&proto_list_lock); + return ret; +out: + list_for_each_entry_continue_reverse(proto, &proto_list, node) + if (proto->destroy_cgroup) + proto->destroy_cgroup(cgrp, ss); + read_unlock(&proto_list_lock); + return ret; +} + +void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) +{ + struct proto *proto; + + read_lock(&proto_list_lock); + list_for_each_entry_reverse(proto, &proto_list, node) + if (proto->destroy_cgroup) + proto->destroy_cgroup(cgrp, ss); + read_unlock(&proto_list_lock); +} +#endif + /* * Each address family might have different locking rules, so we have * one slock key per address family: @@ -2291,9 +2331,6 @@ void sk_common_release(struct sock *sk) } EXPORT_SYMBOL(sk_common_release); -static DEFINE_RWLOCK(proto_list_lock); -static LIST_HEAD(proto_list); - #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { -- cgit v1.2.3 From 36b77a52087a9fca4228c06e0730750f9b6468f0 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 16 Dec 2011 00:51:59 +0000 Subject: net: fix sleeping while atomic problem in sock mem_cgroup. We can't scan the proto_list to initialize sock cgroups, as it holds a rwlock, and we also want to keep the code generic enough to avoid calling the initialization functions of protocols directly, Convert proto_list_lock into a mutex, so we can sleep and do the necessary allocations. This lock is seldom taken, so there shouldn't be any performance penalties associated with that Signed-off-by: Glauber Costa CC: Hiroyouki Kamezawa CC: David S. Miller CC: Eric Dumazet CC: Stephen Rothwell CC: Randy Dunlap Signed-off-by: David S. Miller --- net/core/sock.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 5a6a90620656..0c67facf42f3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -136,7 +136,7 @@ #include #endif -static DEFINE_RWLOCK(proto_list_lock); +static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM @@ -145,7 +145,7 @@ int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss) struct proto *proto; int ret = 0; - read_lock(&proto_list_lock); + mutex_lock(&proto_list_mutex); list_for_each_entry(proto, &proto_list, node) { if (proto->init_cgroup) { ret = proto->init_cgroup(cgrp, ss); @@ -154,13 +154,13 @@ int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss) } } - read_unlock(&proto_list_lock); + mutex_unlock(&proto_list_mutex); return ret; out: list_for_each_entry_continue_reverse(proto, &proto_list, node) if (proto->destroy_cgroup) proto->destroy_cgroup(cgrp, ss); - read_unlock(&proto_list_lock); + mutex_unlock(&proto_list_mutex); return ret; } @@ -168,11 +168,11 @@ void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct proto *proto; - read_lock(&proto_list_lock); + mutex_lock(&proto_list_mutex); list_for_each_entry_reverse(proto, &proto_list, node) if (proto->destroy_cgroup) proto->destroy_cgroup(cgrp, ss); - read_unlock(&proto_list_lock); + mutex_unlock(&proto_list_mutex); } #endif @@ -2479,10 +2479,10 @@ int proto_register(struct proto *prot, int alloc_slab) } } - write_lock(&proto_list_lock); + mutex_lock(&proto_list_mutex); list_add(&prot->node, &proto_list); assign_proto_idx(prot); - write_unlock(&proto_list_lock); + mutex_unlock(&proto_list_mutex); return 0; out_free_timewait_sock_slab_name: @@ -2505,10 +2505,10 @@ EXPORT_SYMBOL(proto_register); void proto_unregister(struct proto *prot) { - write_lock(&proto_list_lock); + mutex_lock(&proto_list_mutex); release_proto_idx(prot); list_del(&prot->node); - write_unlock(&proto_list_lock); + mutex_unlock(&proto_list_mutex); if (prot->slab != NULL) { kmem_cache_destroy(prot->slab); @@ -2531,9 +2531,9 @@ EXPORT_SYMBOL(proto_unregister); #ifdef CONFIG_PROC_FS static void *proto_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(proto_list_lock) + __acquires(proto_list_mutex) { - read_lock(&proto_list_lock); + mutex_lock(&proto_list_mutex); return seq_list_start_head(&proto_list, *pos); } @@ -2543,9 +2543,9 @@ static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void proto_seq_stop(struct seq_file *seq, void *v) - __releases(proto_list_lock) + __releases(proto_list_mutex) { - read_unlock(&proto_list_lock); + mutex_unlock(&proto_list_mutex); } static char proto_method_implemented(const void *method) -- cgit v1.2.3 From 0fd7bac6b6157eed6cf0cb86a1e88ba29e57c033 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 21 Dec 2011 07:11:44 +0000 Subject: net: relax rcvbuf limits skb->truesize might be big even for a small packet. Its even bigger after commit 87fb4b7b533 (net: more accurate skb truesize) and big MTU. We should allow queueing at least one packet per receiver, even with a low RCVBUF setting. Reported-by: Michal Simek Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 4ed7b1d12f5e..b23f174ab84c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -288,11 +288,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; - /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces - number of warnings when compiling with -W --ANK - */ - if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= - (unsigned)sk->sk_rcvbuf) { + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { atomic_inc(&sk->sk_drops); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; -- cgit v1.2.3 From f3f511e1ce6f1a6f0a5bb8320e9f802e76f6b999 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Thu, 5 Jan 2012 20:16:39 +0000 Subject: net: fix sock_clone reference mismatch with tcp memcontrol Sockets can also be created through sock_clone. Because it copies all data in the sock structure, it also copies the memcg-related pointer, and all should be fine. However, since we now use reference counts in socket creation, we are left with some sockets that have no reference counts. It matters when we destroy them, since it leads to a mismatch. Signed-off-by: Glauber Costa CC: David S. Miller CC: Greg Thelen CC: Hiroyouki Kamezawa CC: Laurent Chavey Signed-off-by: David S. Miller --- net/core/sock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 002939cfc069..e80b64fbd663 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1362,6 +1362,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sk_set_socket(newsk, NULL); newsk->sk_wq = NULL; + sk_update_clone(sk, newsk); + if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); -- cgit v1.2.3 From 475f1b52645a29936b9df1d8fcd45f7e56bd4a9f Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 9 Jan 2012 16:33:16 +1100 Subject: net: sk_update_clone is only used in net/core/sock.c so move it there. Fixes build errors when CONFIG_INET is not defined: In file included from include/linux/tcp.h:211:0, from include/linux/ipv6.h:221, from include/net/ipv6.h:16, from include/linux/sunrpc/clnt.h:26, from include/linux/nfs_fs.h:50, from init/do_mounts.c:20: include/net/sock.h: In function 'sk_update_clone': include/net/sock.h:1109:3: error: implicit declaration of function 'sock_update_memcg' [-Werror=implicit-function-declaration] Signed-off-by: Stephen Rothwell Signed-off-by: David S. Miller --- net/core/sock.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index e80b64fbd663..c3ae73de0239 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1272,6 +1272,12 @@ void sk_release_kernel(struct sock *sk) } EXPORT_SYMBOL(sk_release_kernel); +static void sk_update_clone(const struct sock *sk, struct sock *newsk) +{ + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + sock_update_memcg(newsk); +} + /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone -- cgit v1.2.3 From 3969eb3859e4fad4b32ca8f96d4ec8551c20704a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 9 Jan 2012 13:44:23 -0800 Subject: net: Fix build with INET disabled. > net/core/sock.c: In function 'sk_update_clone': > net/core/sock.c:1278:3: error: implicit declaration of function 'sock_update_memcg' Reported-by: Randy Dunlap Signed-off-by: David S. Miller --- net/core/sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index c3ae73de0239..5c5af9988f94 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -112,6 +112,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 0e90b31f4ba77027a7c21cbfc66404df0851ca21 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 20 Jan 2012 04:57:16 +0000 Subject: net: introduce res_counter_charge_nofail() for socket allocations There is a case in __sk_mem_schedule(), where an allocation is beyond the maximum, but yet we are allowed to proceed. It happens under the following condition: sk->sk_wmem_queued + size >= sk->sk_sndbuf The network code won't revert the allocation in this case, meaning that at some point later it'll try to do it. Since this is never communicated to the underlying res_counter code, there is an inbalance in res_counter uncharge operation. I see two ways of fixing this: 1) storing the information about those allocations somewhere in memcg, and then deducting from that first, before we start draining the res_counter, 2) providing a slightly different allocation function for the res_counter, that matches the original behavior of the network code more closely. I decided to go for #2 here, believing it to be more elegant, since #1 would require us to do basically that, but in a more obscure way. Signed-off-by: Glauber Costa Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Michal Hocko CC: Tejun Heo CC: Li Zefan CC: Laurent Chavey Acked-by: Tejun Heo Signed-off-by: David S. Miller --- net/core/sock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 5c5af9988f94..3e81fd2e3c75 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1827,7 +1827,7 @@ suppress_allocation: /* Alas. Undo changes. */ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - sk_memory_allocated_sub(sk, amt, parent_status); + sk_memory_allocated_sub(sk, amt); return 0; } @@ -1840,7 +1840,7 @@ EXPORT_SYMBOL(__sk_mem_schedule); void __sk_mem_reclaim(struct sock *sk) { sk_memory_allocated_sub(sk, - sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 0); + sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; if (sk_under_memory_pressure(sk) && -- cgit v1.2.3 From 2b73bc65e2771372c818db7955709c8caedbf8b9 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 10 Feb 2012 05:43:38 +0000 Subject: netprio_cgroup: fix wrong memory access when NETPRIO_CGROUP=m When the netprio_cgroup module is not loaded, net_prio_subsys_id is -1, and so sock_update_prioidx() accesses cgroup_subsys array with negative index subsys[-1]. Make the code resembles cls_cgroup code, which is bug free. Origionally-authored-by: Li Zefan Signed-off-by: Li Zefan Signed-off-by: Neil Horman CC: "David S. Miller" Signed-off-by: David S. Miller --- net/core/sock.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 3e81fd2e3c75..02f8dfe320b7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1171,13 +1171,10 @@ EXPORT_SYMBOL(sock_update_classid); void sock_update_netprioidx(struct sock *sk) { - struct cgroup_netprio_state *state; if (in_interrupt()) return; - rcu_read_lock(); - state = task_netprio_state(current); - sk->sk_cgrp_prioidx = state ? state->prioidx : 0; - rcu_read_unlock(); + + sk->sk_cgrp_prioidx = task_netprioidx(current); } EXPORT_SYMBOL_GPL(sock_update_netprioidx); #endif -- cgit v1.2.3