From 071d5080e33d6f24139e4213c2d9f97a2c21b602 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 9 Jul 2015 13:16:29 -0700 Subject: tcp: add tcp_in_slow_start helper Add a helper to test the slow start condition in various congestion control modules and other places. This is to prepare a slight improvement in policy as to exactly when to slow start. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Nandita Dukkipati Signed-off-by: David S. Miller --- include/net/tcp.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 950cfecaad3c..dba22fc1b065 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -989,6 +989,11 @@ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) #define TCP_INFINITE_SSTHRESH 0x7fffffff +static inline bool tcp_in_slow_start(const struct tcp_sock *tp) +{ + return tp->snd_cwnd <= tp->snd_ssthresh; +} + static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) { return tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH; @@ -1065,7 +1070,7 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); /* If in slow start, ensure cwnd grows to twice what was ACKed. */ - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) return tp->snd_cwnd < 2 * tp->max_packets_out; return tp->is_cwnd_limited; -- cgit v1.2.3 From 76174004a0f19785a328f40388e87e982bbf69b9 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 9 Jul 2015 13:16:30 -0700 Subject: tcp: do not slow start when cwnd equals ssthresh In the original design slow start is only used to raise cwnd when cwnd is stricly below ssthresh. It makes little sense to slow start when cwnd == ssthresh: especially when hystart has set ssthresh in the initial ramp, or after recovery when cwnd resets to ssthresh. Not doing so will also help reduce the buffer bloat slightly. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Nandita Dukkipati Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index dba22fc1b065..364426a2be5a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -991,7 +991,7 @@ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) static inline bool tcp_in_slow_start(const struct tcp_sock *tp) { - return tp->snd_cwnd <= tp->snd_ssthresh; + return tp->snd_cwnd < tp->snd_ssthresh; } static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) -- cgit v1.2.3 From 6f021c62d64f38092bc2a0c5fe7b81d5e5b21a00 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Aug 2015 12:30:00 -0700 Subject: tcp: fix slow start after idle vs TSO/GSO slow start after idle might reduce cwnd, but we perform this after first packet was cooked and sent. With TSO/GSO, it means that we might send a full TSO packet even if cwnd should have been reduced to IW10. Moving the SSAI check in skb_entail() makes sense, because we slightly reduce number of times this check is done, especially for large send() and TCP Small queue callbacks from softirq context. As Neal pointed out, we also need to perform the check if/when receive window opens. Tested: Following packetdrill test demonstrates the problem // Test of slow start after idle `sysctl -q net.ipv4.tcp_slow_start_after_idle=1` 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 +0 < S 0:0(0) win 65535 +0 > S. 0:0(0) ack 1 +.100 < . 1:1(0) ack 1 win 511 +0 accept(3, ..., ...) = 4 +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0 +0 write(4, ..., 26000) = 26000 +0 > . 1:5001(5000) ack 1 +0 > . 5001:10001(5000) ack 1 +0 %{ assert tcpi_snd_cwnd == 10 }% +.100 < . 1:1(0) ack 10001 win 511 +0 %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }% +0 > . 10001:20001(10000) ack 1 +0 > P. 20001:26001(6000) ack 1 +.100 < . 1:1(0) ack 26001 win 511 +0 %{ assert tcpi_snd_cwnd == 36, tcpi_snd_cwnd }% +4 write(4, ..., 20000) = 20000 // If slow start after idle works properly, we should send 5 MSS here (cwnd/2) +0 > . 26001:31001(5000) ack 1 +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }% +0 > . 31001:36001(5000) ack 1 Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 364426a2be5a..309801f7eb82 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1165,6 +1165,19 @@ static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) } u32 tcp_default_init_rwnd(u32 mss); +void tcp_cwnd_restart(struct sock *sk, s32 delta); + +static inline void tcp_slow_start_after_idle_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + s32 delta; + + if (!sysctl_tcp_slow_start_after_idle || tp->packets_out) + return; + delta = tcp_time_stamp - tp->lsndtime; + if (delta > inet_csk(sk)->icsk_rto) + tcp_cwnd_restart(sk, delta); +} /* Determine a window scaling and initial window to offer. */ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, -- cgit v1.2.3 From 43e122b014c955a33220fabbd09c4b5e4f422c3c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Aug 2015 17:38:02 -0700 Subject: tcp: refine pacing rate determination When TCP pacing was added back in linux-3.12, we chose to apply a fixed ratio of 200 % against current rate, to allow probing for optimal throughput even during slow start phase, where cwnd can be doubled every other gRTT. At Google, we found it was better applying a different ratio while in Congestion Avoidance phase. This ratio was set to 120 %. We've used the normal tcp_in_slow_start() helper for a while, then tuned the condition to select the conservative ratio as soon as cwnd >= ssthresh/2 : - After cwnd reduction, it is safer to ramp up more slowly, as we approach optimal cwnd. - Initial ramp up (ssthresh == INFINITY) still allows doubling cwnd every other RTT. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 309801f7eb82..4a7b03947a38 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -281,6 +281,8 @@ extern unsigned int sysctl_tcp_notsent_lowat; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_autocorking; extern int sysctl_tcp_invalid_ratelimit; +extern int sysctl_tcp_pacing_ss_ratio; +extern int sysctl_tcp_pacing_ca_ratio; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; -- cgit v1.2.3 From c3a8d9474684d391b0afc3970d9b249add15ec07 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 Aug 2015 15:58:47 +0200 Subject: tcp: use dctcp if enabled on the route to the initiator Currently, the following case doesn't use DCTCP, even if it should: A responder has f.e. Cubic as system wide default, but for a specific route to the initiating host, DCTCP is being set in RTAX_CC_ALGO. The initiating host then uses DCTCP as congestion control, but since the initiator sets ECT(0), tcp_ecn_create_request() doesn't set ecn_ok, and we have to fall back to Reno after 3WHS completes. We were thinking on how to solve this in a minimal, non-intrusive way without bloating tcp_ecn_create_request() needlessly: lets cache the CA ecn option flag in RTAX_FEATURES. In other words, when ECT(0) is set on the SYN packet, set ecn_ok=1 iff route RTAX_FEATURES contains the unexposed (internal-only) DST_FEATURE_ECN_CA. This allows to only do a single metric feature lookup inside tcp_ecn_create_request(). Joint work with Florian Westphal. Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 4a7b03947a38..0cab28cd43a9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -888,7 +888,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find_key(u32 key); -u32 tcp_ca_get_key_by_name(const char *name); +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else -- cgit v1.2.3