From 9cd3e072b0be17446e37d7414eac8a3499e0601e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2015 20:03:10 -0800 Subject: net: rename SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA This patch is a cleanup to make following patch easier to review. Goal is to move SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA from (struct socket)->flags to a (struct socket_wq)->flags to benefit from RCU protection in sock_wake_async() To ease backports, we rename both constants. Two new helpers, sk_set_bit(int nr, struct sock *sk) and sk_clear_bit(int net, struct sock *sk) are added so that following patch can change their implementation. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 7f89e4ba18d1..c155d09d8af4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2005,6 +2005,16 @@ static inline unsigned long sock_wspace(struct sock *sk) return amt; } +static inline void sk_set_bit(int nr, struct sock *sk) +{ + set_bit(nr, &sk->sk_socket->flags); +} + +static inline void sk_clear_bit(int nr, struct sock *sk) +{ + clear_bit(nr, &sk->sk_socket->flags); +} + static inline void sk_wake_async(struct sock *sk, int how, int band) { if (sock_flag(sk, SOCK_FASYNC)) -- cgit v1.2.3 From ceb5d58b217098a657f3850b7a2640f995032e62 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2015 20:03:11 -0800 Subject: net: fix sock_wake_async() rcu protection Dmitry provided a syzkaller (http://github.com/google/syzkaller) triggering a fault in sock_wake_async() when async IO is requested. Said program stressed af_unix sockets, but the issue is generic and should be addressed in core networking stack. The problem is that by the time sock_wake_async() is called, we should not access the @flags field of 'struct socket', as the inode containing this socket might be freed without further notice, and without RCU grace period. We already maintain an RCU protected structure, "struct socket_wq" so moving SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA into it is the safe route. It also reduces number of cache lines needing dirtying, so might provide a performance improvement anyway. In followup patches, we might move remaining flags (SOCK_NOSPACE, SOCK_PASSCRED, SOCK_PASSSEC) to save 8 bytes and let 'struct socket' being mostly read and let it being shared between cpus. Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index c155d09d8af4..0434138c5f95 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -384,8 +384,10 @@ struct sock { int sk_rcvbuf; struct sk_filter __rcu *sk_filter; - struct socket_wq __rcu *sk_wq; - + union { + struct socket_wq __rcu *sk_wq; + struct socket_wq *sk_wq_raw; + }; #ifdef CONFIG_XFRM struct xfrm_policy *sk_policy[2]; #endif @@ -2005,20 +2007,27 @@ static inline unsigned long sock_wspace(struct sock *sk) return amt; } +/* Note: + * We use sk->sk_wq_raw, from contexts knowing this + * pointer is not NULL and cannot disappear/change. + */ static inline void sk_set_bit(int nr, struct sock *sk) { - set_bit(nr, &sk->sk_socket->flags); + set_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_clear_bit(int nr, struct sock *sk) { - clear_bit(nr, &sk->sk_socket->flags); + clear_bit(nr, &sk->sk_wq_raw->flags); } -static inline void sk_wake_async(struct sock *sk, int how, int band) +static inline void sk_wake_async(const struct sock *sk, int how, int band) { - if (sock_flag(sk, SOCK_FASYNC)) - sock_wake_async(sk->sk_socket, how, band); + if (sock_flag(sk, SOCK_FASYNC)) { + rcu_read_lock(); + sock_wake_async(rcu_dereference(sk->sk_wq), how, band); + rcu_read_unlock(); + } } /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might -- cgit v1.2.3 From 6bd4f355df2eae80b8a5c7b097371cd1e05f20d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Dec 2015 21:53:57 -0800 Subject: ipv6: kill sk_dst_lock While testing the np->opt RCU conversion, I found that UDP/IPv6 was using a mixture of xchg() and sk_dst_lock to protect concurrent changes to sk->sk_dst_cache, leading to possible corruptions and crashes. ip6_sk_dst_lookup_flow() uses sk_dst_check() anyway, so the simplest way to fix the mess is to remove sk_dst_lock completely, as we did for IPv4. __ip6_dst_store() and ip6_dst_store() share same implementation. sk_setup_caps() being called with socket lock being held or not, we have to use sk_dst_set() instead of __sk_dst_set() Note that I had to move the "np->dst_cookie = rt6_get_cookie(rt);" in ip6_dst_store() before the sk_setup_caps(sk, dst) call. This is because ip6_dst_store() can be called from process context, without any lock held. As soon as the dst is installed in sk->sk_dst_cache, dst can be freed from another cpu doing a concurrent ip6_dst_store() Doing the dst dereference before doing the install is needed to make sure no use after free would trigger. Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. Miller --- include/net/sock.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 0434138c5f95..52d27ee924f4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -254,7 +254,6 @@ struct cg_proto; * @sk_wq: sock wait queue and async head * @sk_rx_dst: receive input route used by early demux * @sk_dst_cache: destination cache - * @sk_dst_lock: destination cache lock * @sk_policy: flow policy * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed @@ -393,7 +392,7 @@ struct sock { #endif struct dst_entry *sk_rx_dst; struct dst_entry __rcu *sk_dst_cache; - spinlock_t sk_dst_lock; + /* Note: 32bit hole on 64bit arches */ atomic_t sk_wmem_alloc; atomic_t sk_omem_alloc; int sk_sndbuf; -- cgit v1.2.3 From 01ce63c90170283a9855d1db4fe81934dddce648 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 4 Dec 2015 15:14:04 -0200 Subject: sctp: update the netstamp_needed counter when copying sockets Dmitry Vyukov reported that SCTP was triggering a WARN on socket destroy related to disabling sock timestamp. When SCTP accepts an association or peel one off, it copies sock flags but forgot to call net_enable_timestamp() if a packet timestamping flag was copied, leading to extra calls to net_disable_timestamp() whenever such clones were closed. The fix is to call net_enable_timestamp() whenever we copy a sock with that flag on, like tcp does. Reported-by: Dmitry Vyukov Signed-off-by: Marcelo Ricardo Leitner Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 52d27ee924f4..b1d475b5db68 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -740,6 +740,8 @@ enum sock_flags { SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ }; +#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) + static inline void sock_copy_flags(struct sock *nsk, struct sock *osk) { nsk->sk_flags = osk->sk_flags; -- cgit v1.2.3 From d188ba86dd07a72ebebfa22fe9cb0b0572e57740 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Dec 2015 07:22:02 -0800 Subject: xfrm: add rcu protection to sk->sk_policy[] XFRM can deal with SYNACK messages, sent while listener socket is not locked. We add proper rcu protection to __xfrm_sk_clone_policy() and xfrm_sk_policy_lookup() This might serve as the first step to remove xfrm.xfrm_policy_lock use in fast path. Fixes: fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer") Signed-off-by: Eric Dumazet Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index b1d475b5db68..eaef41433d7a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -388,7 +388,7 @@ struct sock { struct socket_wq *sk_wq_raw; }; #ifdef CONFIG_XFRM - struct xfrm_policy *sk_policy[2]; + struct xfrm_policy __rcu *sk_policy[2]; #endif struct dst_entry *sk_rx_dst; struct dst_entry __rcu *sk_dst_cache; -- cgit v1.2.3 From 79462ad02e861803b3840cc782248c7359451cd9 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 14 Dec 2015 22:03:39 +0100 Subject: net: add validation for the socket syscall protocol argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 郭永刚 reported that one could simply crash the kernel as root by using a simple program: int socket_fd; struct sockaddr_in addr; addr.sin_port = 0; addr.sin_addr.s_addr = INADDR_ANY; addr.sin_family = 10; socket_fd = socket(10,3,0x40000000); connect(socket_fd , &addr,16); AF_INET, AF_INET6 sockets actually only support 8-bit protocol identifiers. inet_sock's skc_protocol field thus is sized accordingly, thus larger protocol identifiers simply cut off the higher bits and store a zero in the protocol fields. This could lead to e.g. NULL function pointer because as a result of the cut off inet_num is zero and we call down to inet_autobind, which is NULL for raw sockets. kernel: Call Trace: kernel: [] ? inet_autobind+0x2e/0x70 kernel: [] inet_dgram_connect+0x54/0x80 kernel: [] SYSC_connect+0xd9/0x110 kernel: [] ? ptrace_notify+0x5b/0x80 kernel: [] ? syscall_trace_enter_phase2+0x108/0x200 kernel: [] SyS_connect+0xe/0x10 kernel: [] tracesys_phase2+0x84/0x89 I found no particular commit which introduced this problem. CVE: CVE-2015-8543 Cc: Cong Wang Reported-by: 郭永刚 Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/sock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index eaef41433d7a..c4205e0a3a2d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -403,6 +403,7 @@ struct sock { sk_no_check_rx : 1, sk_userlocks : 4, sk_protocol : 8, +#define SK_PROTOCOL_MAX U8_MAX sk_type : 16; kmemcheck_bitfield_end(flags); int sk_wmem_queued; -- cgit v1.2.3 From 5037e9ef9454917b047f9f3a19b4dd179fbf7cd4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Dec 2015 14:08:53 -0800 Subject: net: fix IP early demux races David Wilder reported crashes caused by dst reuse. I am seeing a crash on a distro V4.2.3 kernel caused by a double release of a dst_entry. In ipv4_dst_destroy() the call to list_empty() finds a poisoned next pointer, indicating the dst_entry has already been removed from the list and freed. The crash occurs 18 to 24 hours into a run of a network stress exerciser. Thanks to his detailed report and analysis, we were able to understand the core issue. IP early demux can associate a dst to skb, after a lookup in TCP/UDP sockets. When socket cache is not properly set, we want to store into sk->sk_dst_cache the dst for future IP early demux lookups, by acquiring a stable refcount on the dst. Problem is this acquisition is simply using an atomic_inc(), which works well, unless the dst was queued for destruction from dst_release() noticing dst refcount went to zero, if DST_NOCACHE was set on dst. We need to make sure current refcount is not zero before incrementing it, or risk double free as David reported. This patch, being a stable candidate, adds two new helpers, and use them only from IP early demux problematic paths. It might be possible to merge in net-next skb_dst_force() and skb_dst_force_safe(), but I prefer having the smallest patch for stable kernels : Maybe some skb_dst_force() callers do not expect skb->dst can suddenly be cleared. Can probably be backported back to linux-3.6 kernels Reported-by: David J. Wilder Tested-by: David J. Wilder Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index c4205e0a3a2d..28790fe18206 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -817,7 +817,7 @@ void sk_stream_write_space(struct sock *sk); static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { /* dont let skb dst not refcounted, we are going to leave rcu lock */ - skb_dst_force(skb); + skb_dst_force_safe(skb); if (!sk->sk_backlog.tail) sk->sk_backlog.head = skb; -- cgit v1.2.3 From 7bbadd2d1009575dad675afc16650ebb5aa10612 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 14 Dec 2015 23:30:43 +0100 Subject: net: fix warnings in 'make htmldocs' by moving macro definition out of field declaration Docbook does not like the definition of macros inside a field declaration and adds a warning. Move the definition out. Fixes: 79462ad02e86180 ("net: add validation for the socket syscall protocol argument") Reported-by: kbuild test robot Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 28790fe18206..14d3c0734007 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -403,8 +403,8 @@ struct sock { sk_no_check_rx : 1, sk_userlocks : 4, sk_protocol : 8, -#define SK_PROTOCOL_MAX U8_MAX sk_type : 16; +#define SK_PROTOCOL_MAX U8_MAX kmemcheck_bitfield_end(flags); int sk_wmem_queued; gfp_t sk_allocation; -- cgit v1.2.3