From aa395145165cb06a0d0885221bbe0ce4a564391d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Apr 2010 13:03:51 +0000 Subject: net: sk_sleep() helper Define a new function to return the waitqueue of a "struct sock". static inline wait_queue_head_t *sk_sleep(struct sock *sk) { return sk->sk_sleep; } Change all read occurrences of sk_sleep by a call to this function. Needed for a future RCU conversion. sk_sleep wont be a field directly available. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 56df440a950b..8ab05146a447 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1160,6 +1160,10 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock) sk->sk_socket = sock; } +static inline wait_queue_head_t *sk_sleep(struct sock *sk) +{ + return sk->sk_sleep; +} /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. * Note that parent inode held reference count on this struct sock, @@ -1346,8 +1350,8 @@ static inline int sk_has_allocations(const struct sock *sk) * tp->rcv_nxt check sock_def_readable * ... { * schedule ... - * if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - * wake_up_interruptible(sk->sk_sleep) + * if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk))) + * wake_up_interruptible(sk_sleep(sk)) * ... * } * @@ -1368,7 +1372,7 @@ static inline int sk_has_sleeper(struct sock *sk) * This memory barrier is paired in the sock_poll_wait. */ smp_mb__after_lock(); - return sk->sk_sleep && waitqueue_active(sk->sk_sleep); + return sk_sleep(sk) && waitqueue_active(sk_sleep(sk)); } /** -- cgit v1.2.3 From f68c224fedff2157f3fad7f7da674cbc96567c84 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Apr 2010 16:06:59 -0700 Subject: dst: rcu check refinement __sk_dst_get() might be called from softirq, with socket lock held. [ 159.026180] include/net/sock.h:1200 invoked rcu_dereference_check() without protection! [ 159.026261] [ 159.026261] other info that might help us debug this: [ 159.026263] [ 159.026425] [ 159.026426] rcu_scheduler_active = 1, debug_locks = 0 [ 159.026552] 2 locks held by swapper/0: [ 159.026609] #0: (&icsk->icsk_retransmit_timer){+.-...}, at: [] run_timer_softirq+0x105/0x350 [ 159.026839] #1: (slock-AF_INET){+.-...}, at: [] tcp_write_timer+0x2f/0x1e0 [ 159.027063] [ 159.027064] stack backtrace: [ 159.027172] Pid: 0, comm: swapper Not tainted 2.6.34-rc5-03707-gde498c8-dirty #36 [ 159.027252] Call Trace: [ 159.027306] [] lockdep_rcu_dereference +0xaf/0xc0 [ 159.027411] [] tcp_current_mss+0xa7/0xb0 [ 159.027537] [] tcp_write_wakeup+0x89/0x190 [ 159.027600] [] tcp_send_probe0+0x16/0x100 [ 159.027726] [] tcp_write_timer+0x179/0x1e0 [ 159.027790] [] run_timer_softirq+0x191/0x350 [ 159.027980] [] __do_softirq+0xcd/0x200 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 8ab05146a447..86a8ca177a29 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1197,7 +1197,8 @@ static inline struct dst_entry * __sk_dst_get(struct sock *sk) { return rcu_dereference_check(sk->sk_dst_cache, rcu_read_lock_held() || - sock_owned_by_user(sk)); + sock_owned_by_user(sk) || + lockdep_is_held(&sk->sk_lock.slock)); } static inline struct dst_entry * -- cgit v1.2.3 From 0b53ff2eadb1db6818894435f85989fb05d7e718 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 26 Apr 2010 20:40:43 +0000 Subject: net: fix a lockdep rcu warning in __sk_dst_set() __sk_dst_set() might be called while no state can be integrated in a rcu_dereference_check() condition. So use rcu_dereference_raw() to shutup lockdep warnings (if CONFIG_PROVE_RCU is set) Signed-off-by: Eric Dumazet Acked-by: Paul E. McKenney Signed-off-by: David S. Miller --- include/net/sock.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 86a8ca177a29..4081db86a352 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1236,8 +1236,11 @@ __sk_dst_set(struct sock *sk, struct dst_entry *dst) struct dst_entry *old_dst; sk_tx_queue_clear(sk); - old_dst = rcu_dereference_check(sk->sk_dst_cache, - lockdep_is_held(&sk->sk_dst_lock)); + /* + * This can be called while sk is owned by the caller only, + * with no state that can be checked in a rcu_dereference_check() cond + */ + old_dst = rcu_dereference_raw(sk->sk_dst_cache); rcu_assign_pointer(sk->sk_dst_cache, dst); dst_release(old_dst); } -- cgit v1.2.3 From c58dc01babfd58ec9e71a6ce080150dc27755d88 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 27 Apr 2010 15:05:31 -0700 Subject: net: Make RFS socket operations not be inet specific. Idea from Eric Dumazet. As for placement inside of struct sock, I tried to choose a place that otherwise has a 32-bit hole on 64-bit systems. Signed-off-by: David S. Miller Acked-by: Eric Dumazet --- include/net/sock.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 4081db86a352..07822280d953 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -198,6 +198,7 @@ struct sock_common { * @sk_rcvlowat: %SO_RCVLOWAT setting * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting + * @sk_rxhash: flow hash received from netif layer * @sk_filter: socket filtering instructions * @sk_protinfo: private area, net family specific, when not using slab * @sk_timer: sock cleanup timer @@ -279,6 +280,9 @@ struct sock { int sk_gso_type; unsigned int sk_gso_max_size; int sk_rcvlowat; +#ifdef CONFIG_RPS + __u32 sk_rxhash; +#endif unsigned long sk_flags; unsigned long sk_lingertime; struct sk_buff_head sk_error_queue; @@ -620,6 +624,40 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) return sk->sk_backlog_rcv(sk, skb); } +static inline void sock_rps_record_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + struct rps_sock_flow_table *sock_flow_table; + + rcu_read_lock(); + sock_flow_table = rcu_dereference(rps_sock_flow_table); + rps_record_sock_flow(sock_flow_table, sk->sk_rxhash); + rcu_read_unlock(); +#endif +} + +static inline void sock_rps_reset_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + struct rps_sock_flow_table *sock_flow_table; + + rcu_read_lock(); + sock_flow_table = rcu_dereference(rps_sock_flow_table); + rps_reset_sock_flow(sock_flow_table, sk->sk_rxhash); + rcu_read_unlock(); +#endif +} + +static inline void sock_rps_save_rxhash(struct sock *sk, u32 rxhash) +{ +#ifdef CONFIG_RPS + if (unlikely(sk->sk_rxhash != rxhash)) { + sock_rps_reset_flow(sk); + sk->sk_rxhash = rxhash; + } +#endif +} + #define sk_wait_event(__sk, __timeo, __condition) \ ({ int __rc; \ release_sock(__sk); \ -- cgit v1.2.3 From c377411f2494a931ff7facdbb3a6839b1266bcf6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Apr 2010 15:13:20 -0700 Subject: net: sk_add_backlog() take rmem_alloc into account Current socket backlog limit is not enough to really stop DDOS attacks, because user thread spend many time to process a full backlog each round, and user might crazy spin on socket lock. We should add backlog size and receive_queue size (aka rmem_alloc) to pace writers, and let user run without being slow down too much. Introduce a sk_rcvqueues_full() helper, to avoid taking socket lock in stress situations. Under huge stress from a multiqueue/RPS enabled NIC, a single flow udp receiver can now process ~200.000 pps (instead of ~100 pps before the patch) on a 8 core machine. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 07822280d953..cf12b1e61fa6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -256,7 +256,6 @@ struct sock { struct sk_buff *head; struct sk_buff *tail; int len; - int limit; } sk_backlog; wait_queue_head_t *sk_sleep; struct dst_entry *sk_dst_cache; @@ -608,10 +607,20 @@ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) skb->next = NULL; } +/* + * Take into account size of receive queue and backlog queue + */ +static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb) +{ + unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); + + return qsize + skb->truesize > sk->sk_rcvbuf; +} + /* The per-socket spinlock must be held here. */ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb) { - if (sk->sk_backlog.len >= max(sk->sk_backlog.limit, sk->sk_rcvbuf << 1)) + if (sk_rcvqueues_full(sk, skb)) return -ENOBUFS; __sk_add_backlog(sk, skb); -- cgit v1.2.3 From 4b0b72f7dd617b13abd1b04c947e15873e011a24 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 28 Apr 2010 14:35:48 -0700 Subject: net: speedup udp receive path Since commit 95766fff ([UDP]: Add memory accounting.), each received packet needs one extra sock_lock()/sock_release() pair. This added latency because of possible backlog handling. Then later, ticket spinlocks added yet another latency source in case of DDOS. This patch introduces lock_sock_bh() and unlock_sock_bh() synchronization primitives, avoiding one atomic operation and backlog processing. skb_free_datagram_locked() uses them instead of full blown lock_sock()/release_sock(). skb is orphaned inside locked section for proper socket memory reclaim, and finally freed outside of it. UDP receive path now take the socket spinlock only once. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index cf12b1e61fa6..d361c7769fe0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1021,6 +1021,16 @@ extern void release_sock(struct sock *sk); SINGLE_DEPTH_NESTING) #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) +static inline void lock_sock_bh(struct sock *sk) +{ + spin_lock_bh(&sk->sk_lock.slock); +} + +static inline void unlock_sock_bh(struct sock *sk) +{ + spin_unlock_bh(&sk->sk_lock.slock); +} + extern struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot); -- cgit v1.2.3 From 767dd03369ac18af58efdef0383d6eb986eab426 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 28 Apr 2010 19:14:43 +0000 Subject: net: speedup sock_recv_ts_and_drops() sock_recv_ts_and_drops() is fat and slow (~ 4% of cpu time on some profiles) We can test all socket flags at once to make fast path fast again. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index d361c7769fe0..e1777db5b9ab 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1635,7 +1635,24 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) sk->sk_stamp = kt; } -extern void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); +extern void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb); + +static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb) +{ +#define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \ + (1UL << SOCK_RCVTSTAMP) | \ + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ + (1UL << SOCK_TIMESTAMPING_SOFTWARE) | \ + (1UL << SOCK_TIMESTAMPING_RAW_HARDWARE) | \ + (1UL << SOCK_TIMESTAMPING_SYS_HARDWARE)) + + if (sk->sk_flags & FLAGS_TS_OR_DROPS) + __sock_recv_ts_and_drops(msg, sk, skb); + else + sk->sk_stamp = skb->tstamp; +} /** * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped -- cgit v1.2.3 From 43815482370c510c569fd18edb57afcb0fa8cab6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Apr 2010 11:01:49 +0000 Subject: net: sock_def_readable() and friends RCU conversion sk_callback_lock rwlock actually protects sk->sk_sleep pointer, so we need two atomic operations (and associated dirtying) per incoming packet. RCU conversion is pretty much needed : 1) Add a new structure, called "struct socket_wq" to hold all fields that will need rcu_read_lock() protection (currently: a wait_queue_head_t and a struct fasync_struct pointer). [Future patch will add a list anchor for wakeup coalescing] 2) Attach one of such structure to each "struct socket" created in sock_alloc_inode(). 3) Respect RCU grace period when freeing a "struct socket_wq" 4) Change sk_sleep pointer in "struct sock" by sk_wq, pointer to "struct socket_wq" 5) Change sk_sleep() function to use new sk->sk_wq instead of sk->sk_sleep 6) Change sk_has_sleeper() to wq_has_sleeper() that must be used inside a rcu_read_lock() section. 7) Change all sk_has_sleeper() callers to : - Use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock) - Use wq_has_sleeper() to eventually wakeup tasks. - Use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock) 8) sock_wake_async() is modified to use rcu protection as well. 9) Exceptions : macvtap, drivers/net/tun.c, af_unix use integrated "struct socket_wq" instead of dynamically allocated ones. They dont need rcu freeing. Some cleanups or followups are probably needed, (possible sk_callback_lock conversion to a spinlock for example...). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index e1777db5b9ab..cc7f91ec972c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -159,7 +159,7 @@ struct sock_common { * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings * @sk_lock: synchronizer * @sk_rcvbuf: size of receive buffer in bytes - * @sk_sleep: sock wait queue + * @sk_wq: sock wait queue and async head * @sk_dst_cache: destination cache * @sk_dst_lock: destination cache lock * @sk_policy: flow policy @@ -257,7 +257,7 @@ struct sock { struct sk_buff *tail; int len; } sk_backlog; - wait_queue_head_t *sk_sleep; + struct socket_wq *sk_wq; struct dst_entry *sk_dst_cache; #ifdef CONFIG_XFRM struct xfrm_policy *sk_policy[2]; @@ -1219,7 +1219,7 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock) static inline wait_queue_head_t *sk_sleep(struct sock *sk) { - return sk->sk_sleep; + return &sk->sk_wq->wait; } /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. @@ -1233,14 +1233,14 @@ static inline void sock_orphan(struct sock *sk) write_lock_bh(&sk->sk_callback_lock); sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); - sk->sk_sleep = NULL; + sk->sk_wq = NULL; write_unlock_bh(&sk->sk_callback_lock); } static inline void sock_graft(struct sock *sk, struct socket *parent) { write_lock_bh(&sk->sk_callback_lock); - sk->sk_sleep = &parent->wait; + rcu_assign_pointer(sk->sk_wq, parent->wq); parent->sk = sk; sk_set_socket(sk, parent); security_sock_graft(sk, parent); @@ -1392,12 +1392,12 @@ static inline int sk_has_allocations(const struct sock *sk) } /** - * sk_has_sleeper - check if there are any waiting processes - * @sk: socket + * wq_has_sleeper - check if there are any waiting processes + * @sk: struct socket_wq * - * Returns true if socket has waiting processes + * Returns true if socket_wq has waiting processes * - * The purpose of the sk_has_sleeper and sock_poll_wait is to wrap the memory + * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory * barrier call. They were added due to the race found within the tcp code. * * Consider following tcp code paths: @@ -1410,9 +1410,10 @@ static inline int sk_has_allocations(const struct sock *sk) * ... ... * tp->rcv_nxt check sock_def_readable * ... { - * schedule ... - * if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk))) - * wake_up_interruptible(sk_sleep(sk)) + * schedule rcu_read_lock(); + * wq = rcu_dereference(sk->sk_wq); + * if (wq && waitqueue_active(&wq->wait)) + * wake_up_interruptible(&wq->wait) * ... * } * @@ -1421,19 +1422,18 @@ static inline int sk_has_allocations(const struct sock *sk) * could then endup calling schedule and sleep forever if there are no more * data on the socket. * - * The sk_has_sleeper is always called right after a call to read_lock, so we - * can use smp_mb__after_lock barrier. */ -static inline int sk_has_sleeper(struct sock *sk) +static inline bool wq_has_sleeper(struct socket_wq *wq) { + /* * We need to be sure we are in sync with the * add_wait_queue modifications to the wait queue. * * This memory barrier is paired in the sock_poll_wait. */ - smp_mb__after_lock(); - return sk_sleep(sk) && waitqueue_active(sk_sleep(sk)); + smp_mb(); + return wq && waitqueue_active(&wq->wait); } /** @@ -1442,7 +1442,7 @@ static inline int sk_has_sleeper(struct sock *sk) * @wait_address: socket wait queue * @p: poll_table * - * See the comments in the sk_has_sleeper function. + * See the comments in the wq_has_sleeper function. */ static inline void sock_poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) @@ -1453,7 +1453,7 @@ static inline void sock_poll_wait(struct file *filp, * We need to be sure we are in sync with the * socket flags modification. * - * This memory barrier is paired in the sk_has_sleeper. + * This memory barrier is paired in the wq_has_sleeper. */ smp_mb(); } -- cgit v1.2.3 From 1183f3838c588545592c042c0ce15015661ce7f2 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Sun, 2 May 2010 13:42:39 -0700 Subject: net: fix compile error due to double return type in SOCK_DEBUG Fix this one: include/net/sock.h: error: two or more data types in declaration specifiers Signed-off-by: Jan Engelhardt Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index b4603cd54fcd..1ad6435f252e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -74,7 +74,7 @@ printk(KERN_DEBUG msg); } while (0) #else /* Validate arguments and do nothing */ -static void inline int __attribute__ ((format (printf, 2, 3))) +static inline void __attribute__ ((format (printf, 2, 3))) SOCK_DEBUG(struct sock *sk, const char *msg, ...) { } -- cgit v1.2.3