From cb820f8e4b7f73d1a32175e6591735b25bb5398d Mon Sep 17 00:00:00 2001
From: Richard Cochran
Date: Fri, 19 Jul 2013 19:40:09 +0200
Subject: net: Provide a generic socket error queue delivery method for Tx time stamps.

This patch moves the private error queue delivery function from the
af_packet code to the core socket method. In this way, network layers
that need the error queue only for transmit time stamping can share
common code.

Signed-off-by: Richard Cochran
Signed-off-by: David S. Miller
---
 include/net/sock.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 95a5a2c6925a..e0473f61693f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2249,6 +2249,8 @@ static inline struct sock *skb_steal_sock(struct sk_buff *skb)
 extern void sock_enable_timestamp(struct sock *sk, int flag);
 extern int sock_get_timestamp(struct sock *, struct timeval __user *);
 extern int sock_get_timestampns(struct sock *, struct timespec __user *);
+extern int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
+			      int level, int type);

 /*
  * Enable debug/info messages
-- cgit v1.2.3

From 64dc61306ce7da370833289739e2f52dfc6b37ba Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 22 Jul 2013 20:26:31 -0700
Subject: net: add sk_stream_is_writeable() helper

Several call sites hardcode the following condition:

    sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)

Let's use a helper, because TCP_NOTSENT_LOWAT support will change this
condition for TCP sockets.

Signed-off-by: Eric Dumazet
Cc: Neal Cardwell
Cc: Yuchung Cheng
Acked-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 include/net/sock.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index e0473f61693f..d0b5fdee50a2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1088,6 +1088,10 @@ static inline struct cg_proto *parent_cg_proto(struct proto *proto,
 }
 #endif

+static inline bool sk_stream_is_writeable(const struct sock *sk)
+{
+	return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk);
+}
 static inline bool sk_has_memory_pressure(const struct sock *sk)
 {
-- cgit v1.2.3

From c9bee3b7fdecb0c1d070c7b54113b3bdfb9a3d36 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 22 Jul 2013 20:27:07 -0700
Subject: tcp: TCP_NOTSENT_LOWAT socket option

The idea of this patch is to add an optional limit on the number of
unsent bytes in TCP sockets, to reduce kernel memory usage.

A TCP receiver might announce a big window, and TCP sender autotuning
might allow a large number of bytes in the write queue, but this has
little performance benefit if a large part of this buffering is
wasted: the write queue needs to be large only to deal with a large
BDP, not necessarily to cope with scheduling delays (incoming ACKs
make room for the application to queue more bytes).

For most workloads, a value of 128 KB or less is enough to give
applications time to react to POLLOUT events (or to be woken up in a
blocking sendmsg()).

This patch adds two ways to set the limit:

1) The per-socket option TCP_NOTSENT_LOWAT

2) A sysctl (/proc/sys/net/ipv4/tcp_notsent_lowat) for sockets not
   using the TCP_NOTSENT_LOWAT socket option (or setting a zero value)

The default value is UINT_MAX (0xFFFFFFFF), meaning the limit has no
effect.
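
[Illustration, not part of the commit: an application would opt in per
socket roughly as follows. This is a minimal sketch assuming a
connected TCP socket fd and libc headers that define TCP_NOTSENT_LOWAT.]

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>

	/* Cap unsent bytes buffered in the kernel at 128 KB. */
	static int set_notsent_lowat(int fd)
	{
		int lowat = 128 * 1024;

		return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
				  &lowat, sizeof(lowat));
	}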
This changes poll()/select()/epoll() to report POLLOUT only if the
number of unsent bytes is below tp->notsent_lowat.

Note this might increase the number of sendmsg()/sendfile() calls when
using non-blocking sockets, and increase the number of context switches
for blocking sockets.

Note this is not related to SO_SNDLOWAT (SO_SNDLOWAT is defined as:
"Specify the minimum number of bytes in the buffer until the socket
layer will pass the data to the protocol").

Tested:

netperf sessions, while watching the /proc/net/protocols "memory"
column for TCP.

With 200 concurrent netperf -t TCP_STREAM sessions, the amount of
kernel memory used by TCP buffers shrinks by ~55% (20567 pages instead
of 45458).

lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols
TCPv6  1880  2    45458  no  208  yes  ipv6    y y y y y y y y y y y y y n y y y y y
TCP    1696  508  45458  no  208  yes  kernel  y y y y y y y y y y y y y n y y y y y

lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols
TCPv6  1880  2    20567  no  208  yes  ipv6    y y y y y y y y y y y y y n y y y y y
TCP    1696  508  20567  no  208  yes  kernel  y y y y y y y y y y y y y n y y y y y

Using 128 KB has no adverse effect on the throughput or CPU usage of a
single flow, although there is an increase in context switches. A bonus
is that we hold the socket lock for a shorter amount of time, which
should improve latencies of ACK processing.

lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3
OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf.

Local       Remote      Local  Elapsed Throughput Throughput  Local Local  Remote Remote Local   Remote  Service
Send Socket Recv Socket Send   Time               Units       CPU   CPU    CPU    CPU    Service Service Demand
Size        Size        Size   (sec)                          Util  Util   Util   Util   Demand  Demand  Units
Final       Final                                             %     Method %      Method
1651584     6291456     16384  20.00   17447.90   10^6bits/s  3.13  S      -1.00  U      0.353   -1.000  usec/KB

 Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3':

           412,514 context-switches

     200.034645535 seconds time elapsed

lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3
OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf.

Local       Remote      Local  Elapsed Throughput Throughput  Local Local  Remote Remote Local   Remote  Service
Send Socket Recv Socket Send   Time               Units       CPU   CPU    CPU    CPU    Service Service Demand
Size        Size        Size   (sec)                          Util  Util   Util   Util   Demand  Demand  Units
Final       Final                                             %     Method %      Method
1593240     6291456     16384  20.00   17321.16   10^6bits/s  3.35  S      -1.00  U      0.381   -1.000  usec/KB

 Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3':

         2,675,818 context-switches

     200.029651391 seconds time elapsed

Signed-off-by: Eric Dumazet
Cc: Neal Cardwell
Cc: Yuchung Cheng
Acked-by: Yuchung Cheng
Signed-off-by: David S. Miller
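
[Illustration, not part of the commit: a non-blocking writer keyed off
the new POLLOUT semantics could look roughly like this, assuming fd is
a non-blocking TCP socket that already has TCP_NOTSENT_LOWAT set.]

	#include <errno.h>
	#include <poll.h>
	#include <unistd.h>

	/* POLLOUT now also means: unsent bytes < tp->notsent_lowat. */
	static int send_all(int fd, const char *buf, size_t len)
	{
		size_t off = 0;

		while (off < len) {
			struct pollfd pfd = { .fd = fd, .events = POLLOUT };
			ssize_t n;

			if (poll(&pfd, 1, -1) < 0 && errno != EINTR)
				return -1;
			n = write(fd, buf + off, len - off);
			if (n > 0)
				off += n;
			else if (n < 0 && errno != EAGAIN && errno != EINTR)
				return -1;
		}
		return 0;
	}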
---
 include/net/sock.h | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index d0b5fdee50a2..b9f2b095b1ab 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -746,11 +746,6 @@ static inline int sk_stream_wspace(const struct sock *sk)

 extern void sk_stream_write_space(struct sock *sk);

-static inline bool sk_stream_memory_free(const struct sock *sk)
-{
-	return sk->sk_wmem_queued < sk->sk_sndbuf;
-}
-
 /* OOB backlog add */
 static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
@@ -950,6 +945,7 @@ struct proto {
 	unsigned int		inuse_idx;
 #endif

+	bool			(*stream_memory_free)(const struct sock *sk);
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
 	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
@@ -1088,11 +1084,22 @@ static inline struct cg_proto *parent_cg_proto(struct proto *proto,
 }
 #endif

+static inline bool sk_stream_memory_free(const struct sock *sk)
+{
+	if (sk->sk_wmem_queued >= sk->sk_sndbuf)
+		return false;
+
+	return sk->sk_prot->stream_memory_free ?
+		sk->sk_prot->stream_memory_free(sk) : true;
+}
+
 static inline bool sk_stream_is_writeable(const struct sock *sk)
 {
-	return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk);
+	return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
+	       sk_stream_memory_free(sk);
 }
+
 static inline bool sk_has_memory_pressure(const struct sock *sk)
 {
-- cgit v1.2.3

From f2f872f9272a79a1048877ea14c15576f46c225e Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 30 Jul 2013 17:55:08 -0700
Subject: netem: Introduce skb_orphan_partial() helper

Commit 547669d483e578 ("tcp: xps: fix reordering issues") added
unexpected reordering when netem is used in an MQ setup on a high
performance test bed.

ETH=eth0
tc qd del dev $ETH root 2>/dev/null
tc qd add dev $ETH root handle 1: mq
for i in `seq 1 32`
do
 tc qd add dev $ETH parent 1:$i netem delay 100ms
done

As all TCP packets are orphaned by netem, the TCP stack believes it can
set skb->ooo_okay on all packets.

In order to allow producers to send more packets, we want to keep
sk_wmem_alloc from reaching the sk_sndbuf limit. We can do that by
accounting one byte per skb in netem queues, so that the TCP stack is
not fooled too much.

Tested:

With the above MQ/netem setup, scaling the number of concurrent flows
gives linear results and no reorders/retransmits.

lpq83:~# for n in 1 10 20 30 40 50 60 70 80 90 100
 do echo -n "n:$n " ; ./super_netperf $n -H 10.7.7.84; done
n:1 198.46
n:10 2002.69
n:20 4000.98
n:30 6006.35
n:40 8020.93
n:50 10032.3
n:60 12081.9
n:70 13971.3
n:80 16009.7
n:90 17117.3
n:100 17425.5

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
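
[Illustration, not part of the commit: a sketch of the approach the
message describes, reconstructed from "accounting one byte per skb".
The in-tree version may differ in details, for instance it would also
have to handle TCP's own skb destructor.]

	/* Transfer all accounted bytes but one, instead of fully
	 * orphaning the skb: sk_wmem_alloc still tracks one byte per
	 * queued skb, so the TCP stack is not fooled into setting
	 * skb->ooo_okay, yet producers are no longer throttled.
	 */
	void skb_orphan_partial(struct sk_buff *skb)
	{
		if (skb->destructor == sock_wfree) {
			atomic_sub(skb->truesize - 1,
				   &skb->sk->sk_wmem_alloc);
			skb->truesize = 1;
		} else {
			skb_orphan(skb);
		}
	}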
---
 include/net/sock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index b9f2b095b1ab..53d4714709a1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1520,6 +1520,7 @@ extern struct sk_buff		*sock_rmalloc(struct sock *sk,
 					      unsigned long size, int force,
 					      gfp_t priority);
 extern void			sock_wfree(struct sk_buff *skb);
+extern void			skb_orphan_partial(struct sk_buff *skb);
 extern void			sock_rfree(struct sk_buff *skb);
 extern void			sock_edemux(struct sk_buff *skb);
-- cgit v1.2.3

From 28d6427109d13b0f447cba5761f88d3548e83605 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 8 Aug 2013 14:38:47 -0700
Subject: net: attempt high order allocations in sock_alloc_send_pskb()

Adding paged-frag skbs to af_unix sockets introduced a performance
regression on large sends, because of additional page allocations, even
though each skb could carry at least 100% more payload than before.

We can instruct sock_alloc_send_pskb() to attempt high order
allocations. Most of the time, it then does a single page allocation
instead of 8.

I added an additional parameter to sock_alloc_send_pskb() to let other
users opt in to this new feature in follow-up patches.

Tested:

Before patch:

$ netperf -t STREAM_STREAM
STREAM STREAM TEST
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

  2304 212992 212992    10.00    46861.15

After patch:

$ netperf -t STREAM_STREAM
STREAM STREAM TEST
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

  2304 212992 212992    10.00    57981.11

Signed-off-by: Eric Dumazet
Cc: David Rientjes
Signed-off-by: David S. Miller
---
 include/net/sock.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index ab6a8b75207e..e4bbcbfd07ea 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1539,7 +1539,8 @@ extern struct sk_buff		*sock_alloc_send_pskb(struct sock *sk,
 						      unsigned long header_len,
 						      unsigned long data_len,
 						      int noblock,
-						      int *errcode);
+						      int *errcode,
+						      int max_page_order);
 extern void *sock_kmalloc(struct sock *sk, int size, gfp_t priority);
 extern void sock_kfree_s(struct sock *sk, void *mem, int size);
-- cgit v1.2.3

From 95bd09eb27507691520d39ee1044d6ad831c1168 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 27 Aug 2013 05:46:32 -0700
Subject: tcp: TSO packets automatic sizing

After hearing many people complain over the past years that TSO is
bursty or even buggy, we are proud to present automatic sizing of TSO
packets.

One part of the problem is that tcp_tso_should_defer() uses a heuristic
relying on upcoming ACKs instead of a timer, but more generally, having
big TSO packets makes little sense at low rates, as it tends to create
micro bursts on the network, and the general consensus is to reduce the
amount of buffering.

This patch introduces a per-socket sk_pacing_rate that approximates the
current sending rate, and allows us to size the TSO packets so that we
try to send one packet every ms. This field could be set by other
transports.

The patch has no impact on high speed flows, where having large TSO
packets makes sense to reach line rate. For other flows, this helps
better packet scheduling and ACK clocking. This patch increases
performance of TCP flows in lossy environments.

A new sysctl (tcp_min_tso_segs) is added, to specify the minimal size
of a TSO packet (default being 2).

A follow-up patch will provide a new packet scheduler (FQ), using
sk_pacing_rate as an input to perform optional per-flow pacing.

This explains why we chose to set sk_pacing_rate to twice the current
rate, allowing 'slow start' ramp up:

    sk_pacing_rate = 2 * cwnd * mss / srtt

v2: Neal Cardwell reported suspect deferring of the last two segments
on an initial write of 10 MSS; I had to change tcp_tso_should_defer()
to take tp->xmit_size_goal_segs into account.

Signed-off-by: Eric Dumazet
Cc: Neal Cardwell
Cc: Yuchung Cheng
Cc: Van Jacobson
Cc: Tom Herbert
Acked-by: Yuchung Cheng
Acked-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 include/net/sock.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index e4bbcbfd07ea..6ba2e7b0e2b1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -232,6 +232,7 @@ struct cg_proto;
  *	@sk_napi_id: id of the last napi context to receive data for sk
  *	@sk_ll_usec: usecs to busypoll when there is no data
  *	@sk_allocation: allocation mode
+ *	@sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
  *	@sk_sndbuf: size of send buffer in bytes
  *	@sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  *		   %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -361,6 +362,7 @@ struct sock {
 	kmemcheck_bitfield_end(flags);
 	int			sk_wmem_queued;
 	gfp_t			sk_allocation;
+	u32			sk_pacing_rate; /* bytes per second */
 	netdev_features_t	sk_route_caps;
 	netdev_features_t	sk_route_nocaps;
 	int			sk_gso_type;
-- cgit v1.2.3
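
[Illustration, not part of the commit: the sizing arithmetic described
in the message above amounts to roughly the following; the helper name
and signature are invented for this sketch.]

	/* Aim for ~1 ms worth of data per TSO packet at the current
	 * pacing rate (bytes per second), but never go below the
	 * tcp_min_tso_segs floor. Sketch only, not the patch's code.
	 */
	static u32 tso_size_goal(u32 pacing_rate, u32 mss, u32 min_tso_segs)
	{
		u32 bytes = pacing_rate / 1000;	/* 1 ms worth of data */
		u32 segs = bytes / mss;

		if (segs < min_tso_segs)
			segs = min_tso_segs;
		return segs * mss;
	}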
From 559835ea7292e2f09304d81eda16f4209433245e Mon Sep 17 00:00:00 2001
From: Pravin B Shelar
Date: Tue, 24 Sep 2013 10:25:40 -0700
Subject: vxlan: Use RCU apis to access sk_user_data.

Use of the RCU API makes the vxlan code easier to understand. It also
fixes a bug due to a missing ACCESS_ONCE() on the sk_user_data
dereference: in rare cases, without ACCESS_ONCE() the compiler might
treat the local vs as an alias for sk->sk_user_data, resulting in
multiple dereferences of sk_user_data within an RCU read-side critical
section, during which its value could change.

CC: Jesse Gross
Signed-off-by: Pravin B Shelar
Signed-off-by: David S. Miller
---
 include/net/sock.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 6ba2e7b0e2b1..1d37a8086bed 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -409,6 +409,11 @@ struct sock {
 	void			(*sk_destruct)(struct sock *sk);
 };

+#define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
+
+#define rcu_dereference_sk_user_data(sk)	rcu_dereference(__sk_user_data((sk)))
+#define rcu_assign_sk_user_data(sk, ptr)	rcu_assign_pointer(__sk_user_data((sk)), ptr)
+
 /*
  * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK
  * or not whether his port will be reused by someone else. SK_FORCE_REUSE
-- cgit v1.2.3
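
[Illustration, not part of the commit: the new accessors are used in
pairs. The struct vxlan_sock pointer vs and process() below are
hypothetical stand-ins for the vxlan code.]

	/* Writer side: publish vs, serialized against other writers. */
	rcu_assign_sk_user_data(sk, vs);

	/* Reader side: must run under rcu_read_lock(); vs must not be
	 * used after rcu_read_unlock().
	 */
	rcu_read_lock();
	vs = rcu_dereference_sk_user_data(sk);
	if (vs)
		process(vs);	/* hypothetical consumer */
	rcu_read_unlock();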