From 291a00d1a70f96b393da9ac90c58a82bc7949fc8 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 1 Jul 2015 14:11:14 -0700
Subject: tcp: reduce cwnd if retransmit is lost in CA_Loss

If the retransmission in CA_Loss is lost again, we should not
continue to slow start or raise cwnd in congestion avoidance mode.
Instead we should enter fast recovery and use PRR to reduce cwnd,
following the principle in RFC5681:

"... or the loss of a retransmission, should be taken as two
 indications of congestion and, therefore, cwnd (and ssthresh) MUST
 be lowered twice in this case."

This is especially important to reduce loss when the CA_Loss
state was caused by a traffic policer dropping the entire inflight.
The CA_Loss state has a problem where a loss of L packets causes the
sender to send a burst of L packets. So a policer that's dropping
most packets in a given RTT can cause a huge retransmit storm. By
contrast, PRR includes logic to bound the number of outbound packets
that result from a given ACK. So switching to CA_Recovery on lost
retransmits in CA_Loss avoids this retransmit storm problem when
in CA_Loss.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 684f095d196e..923e0e568bfa 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -109,6 +109,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
 #define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
 #define FLAG_ECE		0x40 /* ECE in this ACK				*/
+#define FLAG_LOST_RETRANS	0x80 /* This ACK marks some retransmission lost */
 #define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
 #define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
 #define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
@@ -1037,7 +1038,7 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
  * highest SACK block). Also calculate the lowest snd_nxt among the remaining
  * retransmitted skbs to avoid some costly processing per ACKs.
  */
-static void tcp_mark_lost_retrans(struct sock *sk)
+static void tcp_mark_lost_retrans(struct sock *sk, int *flag)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -1078,7 +1079,7 @@ static void tcp_mark_lost_retrans(struct sock *sk)
 		if (after(received_upto, ack_seq)) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 			tp->retrans_out -= tcp_skb_pcount(skb);
-
+			*flag |= FLAG_LOST_RETRANS;
 			tcp_skb_mark_lost_uncond_verify(tp, skb);
 			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
 		} else {
@@ -1818,7 +1819,7 @@ advance_sp:
 	    ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
 		tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
 
-	tcp_mark_lost_retrans(sk);
+	tcp_mark_lost_retrans(sk, &state->flag);
 	tcp_verify_left_out(tp);
 out:
 
@@ -2676,7 +2677,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 	tp->prior_ssthresh = 0;
 	tcp_init_undo(tp);
 
-	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+	if (!tcp_in_cwnd_reduction(sk)) {
 		if (!ece_ack)
 			tp->prior_ssthresh = tcp_current_ssthresh(sk);
 		tcp_init_cwnd_reduction(sk);
@@ -2852,9 +2853,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		break;
 	case TCP_CA_Loss:
 		tcp_process_loss(sk, flag, is_dupack);
-		if (icsk->icsk_ca_state != TCP_CA_Open)
+		if (icsk->icsk_ca_state != TCP_CA_Open &&
+		    !(flag & FLAG_LOST_RETRANS))
 			return;
-		/* Fall through to processing in Open state. */
+		/* Change state if cwnd is undone or retransmits are lost */
 	default:
 		if (tcp_is_reno(tp)) {
 			if (flag & FLAG_SND_UNA_ADVANCED)
-- 
cgit v1.2.3


From 3759824da87b30ce7a35b4873b62b0ba38905ef5 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 1 Jul 2015 14:11:15 -0700
Subject: tcp: PRR uses CRB mode by default and SS mode conditionally

PRR slow start is often too aggressive especially when drops are
caused by traffic policers. The policers mainly use token bucket
to enforce the rate so sending (twice) faster than the delivery
rate causes excessive drops.

This patch changes PRR to the conservative reduction bound
(CRB) mode in RFC 6937 by default. CRB follows the packet
conservation rule to send at most the delivery rate by default.

But if many packets are lost and the pipe is empty, CRB may take N
round trips to repair N losses. We conditionally turn on slow start
mode if all these conditions are met to speed up the recovery:

  1) on the second round or later in recovery
  2) retransmission sent in the previous round is delivered on this ACK
  3) no retransmission is marked lost on this ACK

By using packet conservation by default, this change reduces the loss
retransmits significantly on networks that deploy traffic policers,
up to 20% reduction of overall loss rate.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 923e0e568bfa..ad1482dd215e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2476,15 +2476,14 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
 	return false;
 }
 
-/* The cwnd reduction in CWR and Recovery use the PRR algorithm
- * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
+/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
  * It computes the number of packets to send (sndcnt) based on packets newly
  * delivered:
  *   1) If the packets in flight is larger than ssthresh, PRR spreads the
  *	cwnd reductions across a full RTT.
- *   2) If packets in flight is lower than ssthresh (such as due to excess
- *	losses and/or application stalls), do not perform any further cwnd
- *	reductions, but instead slow start up to ssthresh.
+ *   2) Otherwise PRR uses packet conservation to send as much as delivered.
+ *      But when the retransmits are acked without further losses, PRR
+ *      slow starts cwnd up to ssthresh to speed up the recovery.
  */
 static void tcp_init_cwnd_reduction(struct sock *sk)
 {
@@ -2501,7 +2500,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
 }
 
 static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
-			       int fast_rexmit)
+			       int fast_rexmit, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int sndcnt = 0;
@@ -2510,16 +2509,18 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
 				 (tp->packets_out - tp->sacked_out);
 
 	tp->prr_delivered += newly_acked_sacked;
-	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+	if (delta < 0) {
 		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
 			       tp->prior_cwnd - 1;
 		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
-	} else {
+	} else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
+		   !(flag & FLAG_LOST_RETRANS)) {
 		sndcnt = min_t(int, delta,
 			       max_t(int, tp->prr_delivered - tp->prr_out,
 				     newly_acked_sacked) + 1);
+	} else {
+		sndcnt = min(delta, newly_acked_sacked);
 	}
-
 	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
 	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
 }
@@ -2580,7 +2581,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
 	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
 		tcp_try_keep_open(sk);
 	} else {
-		tcp_cwnd_reduction(sk, prior_unsacked, 0);
+		tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
 	}
 }
 
@@ -2737,7 +2738,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 
 /* Undo during fast recovery after partial ACK. */
 static bool tcp_try_undo_partial(struct sock *sk, const int acked,
-				 const int prior_unsacked)
+				 const int prior_unsacked, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2753,7 +2754,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
 		 * mark more packets lost or retransmit more.
 		 */
 		if (tp->retrans_out) {
-			tcp_cwnd_reduction(sk, prior_unsacked, 0);
+			tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
 			return true;
 		}
 
@@ -2840,7 +2841,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 			if (tcp_is_reno(tp) && is_dupack)
 				tcp_add_reno_sack(sk);
 		} else {
-			if (tcp_try_undo_partial(sk, acked, prior_unsacked))
+			if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag))
 				return;
 			/* Partial ACK arrived. Force fast retransmit. */
 			do_lost = tcp_is_reno(tp) ||
@@ -2891,7 +2892,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 
 	if (do_lost)
 		tcp_update_scoreboard(sk, fast_rexmit);
-	tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit);
+	tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag);
 	tcp_xmit_retransmit_queue(sk);
 }
 
-- 
cgit v1.2.3


From 2251ae46af72d013a6a537e7681e94b18b63e332 Mon Sep 17 00:00:00 2001
From: Jon Maxwell <jmaxwell37@gmail.com>
Date: Wed, 8 Jul 2015 10:12:28 +1000
Subject: tcp: v1 always send a quick ack when quickacks are enabled

V1 of this patch contains Eric Dumazet's suggestion to move the per
dst RTAX_QUICKACK check into tcp_in_quickack_mode(). Thanks Eric.

I ran some tests and after setting the "ip route change quickack 1"
knob there were still many delayed ACKs sent. This occurred
because when icsk_ack.quick=0 the !icsk_ack.pingpong value is
subsequently ignored as tcp_in_quickack_mode() checks both these
values. The condition for a quick ack to trigger requires
that both icsk_ack.quick != 0 and icsk_ack.pingpong=0. Currently
only icsk_ack.pingpong is controlled by the knob. But the
icsk_ack.quick value changes dynamically depending on heuristics.
The crux of the matter is that delayed acks still cannot be entirely
disabled even with the RTAX_QUICKACK per dst knob enabled. This
patch ensures that a quick ack is always sent when the RTAX_QUICKACK
per dst knob is turned on.

The "ip route change quickack 1" knob was recently added to enable
quickacks. It was modeled around the TCP_QUICKACK setsockopt() option.
The issue is that even with "ip route change quickack 1" enabled
we still see delayed ACKs under some conditions. It would be nice
to be able to completely disable delayed ACKs.

Here is an example:

# netstat -s|grep dela
    3 delayed acks sent

For all routes enable the knob

# ip route change quickack 1

Generate some traffic across a slow link and we still see the delayed
acks.

# netstat -s|grep dela
    106 delayed acks sent
    1 delayed acks further delayed because of locked socket

The issue is that both the "ip route change quickack 1" knob and
the TCP_QUICKACK option set the icsk_ack.pingpong variable to 0.
However at the business end in the __tcp_ack_snd_check() routine,
tcp_in_quickack_mode() checks that both icsk_ack.quick != 0
and icsk_ack.pingpong=0 in order to trigger a quickack. As
icsk_ack.quick is determined by heuristics it can be 0. When
that occurs the icsk_ack.pingpong value is ignored and a delayed
ACK is sent regardless.

This patch moves the RTAX_QUICKACK per dst check into the
tcp_in_quickack_mode() routine which ensures that a quickack is
always sent when the quickack knob is enabled for that dst.

Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ad1482dd215e..7f4a8d5f6eb0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -197,11 +197,13 @@ static void tcp_enter_quickack_mode(struct sock *sk)
  * and the session is not interactive.
  */
 
-static inline bool tcp_in_quickack_mode(const struct sock *sk)
+static bool tcp_in_quickack_mode(struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct dst_entry *dst = __sk_dst_get(sk);
 
-	return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
+	return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
+		(icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
 }
 
 static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
@@ -3951,7 +3953,6 @@ void tcp_reset(struct sock *sk)
 static void tcp_fin(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	const struct dst_entry *dst;
 
 	inet_csk_schedule_ack(sk);
 
@@ -3963,9 +3964,7 @@ static void tcp_fin(struct sock *sk)
 	case TCP_ESTABLISHED:
 		/* Move to CLOSE_WAIT */
 		tcp_set_state(sk, TCP_CLOSE_WAIT);
-		dst = __sk_dst_get(sk);
-		if (!dst || !dst_metric(dst, RTAX_QUICKACK))
-			inet_csk(sk)->icsk_ack.pingpong = 1;
+		inet_csk(sk)->icsk_ack.pingpong = 1;
 		break;
 
 	case TCP_CLOSE_WAIT:
-- 
cgit v1.2.3


From b20a3fa30a281b52b2576b509efbe5cd47a5a79b Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 9 Jul 2015 13:16:31 -0700
Subject: tcp: update congestion state first before raising cwnd

The congestion state and cwnd can be updated in the wrong order.
For example, upon receiving a dubious ACK, we incorrectly raise
the cwnd first (tcp_may_raise_cwnd()/tcp_cong_avoid()) because
the state is still Open, then enter recovery state to reduce cwnd.

For another example, if the ACK indicates spurious timeout or
retransmits, we first revert the cwnd reduction and congestion
state back to Open state.  But we don't raise the cwnd even though
the ACK does not indicate any congestion.

To fix this problem we should first call tcp_fastretrans_alert() to
process the dubious ACK and update the congestion state, then call
tcp_may_raise_cwnd() that raises cwnd based on the current state.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7f4a8d5f6eb0..1578fc2a6f39 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3568,10 +3568,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 				    &sack_state);
 	acked -= tp->packets_out;
 
-	/* Advance cwnd if state allows */
-	if (tcp_may_raise_cwnd(sk, flag))
-		tcp_cong_avoid(sk, ack, acked);
-
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
 		tcp_fastretrans_alert(sk, acked, prior_unsacked,
@@ -3580,6 +3576,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
 
+	/* Advance cwnd if state allows */
+	if (tcp_may_raise_cwnd(sk, flag))
+		tcp_cong_avoid(sk, ack, acked);
+
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
 		struct dst_entry *dst = __sk_dst_get(sk);
 		if (dst)
-- 
cgit v1.2.3


From f82b681a511f4d61069e9586a9cf97bdef371ef3 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 13 Jul 2015 12:10:20 -0700
Subject: tcp: don't use F-RTO on non-recurring timeouts

Currently F-RTO may repeatedly send new data packets on non-recurring
timeouts in CA_Loss mode. This is a bug because F-RTO (RFC5682)
should only be used on either new recovery or recurring timeouts.

This hampers recovery progress during frequent timeout &
repair, because we prioritize sending new data packets instead of
repairing the holes when the bandwidth is already scarce.

Fix it by correcting the test of a new recovery episode.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 684f095d196e..728f5b3d3c64 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1917,14 +1917,13 @@ void tcp_enter_loss(struct sock *sk)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	bool new_recovery = false;
+	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
 	bool is_reneg;			/* is receiver reneging on SACKs? */
 
 	/* Reduce ssthresh if it has not yet been made inside this window. */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
 	    !after(tp->high_seq, tp->snd_una) ||
 	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
-		new_recovery = true;
 		tp->prior_ssthresh = tcp_current_ssthresh(sk);
 		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
 		tcp_ca_event(sk, CA_EVENT_LOSS);
-- 
cgit v1.2.3


From b56ea2985d389a3676638203323ebe22c261b7fe Mon Sep 17 00:00:00 2001
From: Rick Jones <rick.jones2@hp.com>
Date: Tue, 21 Jul 2015 16:14:13 -0700
Subject: net: track success and failure of TCP PMTU probing

Track success and failure of TCP PMTU probing.

Signed-off-by: Rick Jones <rick.jones2@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1578fc2a6f39..cda3ffedadb6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2593,6 +2593,7 @@ static void tcp_mtup_probe_failed(struct sock *sk)
 
 	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
 	icsk->icsk_mtup.probe_size = 0;
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
 }
 
 static void tcp_mtup_probe_success(struct sock *sk)
@@ -2612,6 +2613,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
 	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
 	icsk->icsk_mtup.probe_size = 0;
 	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
 }
 
 /* Do a simple retransmit without using the backoff mechanisms in
-- 
cgit v1.2.3


From 6f021c62d64f38092bc2a0c5fe7b81d5e5b21a00 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Aug 2015 12:30:00 -0700
Subject: tcp: fix slow start after idle vs TSO/GSO

slow start after idle might reduce cwnd, but we perform this
after first packet was cooked and sent.

With TSO/GSO, it means that we might send a full TSO packet
even if cwnd should have been reduced to IW10.

Moving the SSAI check in skb_entail() makes sense, because
we slightly reduce number of times this check is done,
especially for large send() and TCP Small queue callbacks from
softirq context.

As Neal pointed out, we also need to perform the check
if/when receive window opens.

Tested:

Following packetdrill test demonstrates the problem
// Test of slow start after idle

`sysctl -q net.ipv4.tcp_slow_start_after_idle=1`

0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0    setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0    bind(3, ..., ...) = 0
+0    listen(3, 1) = 0

+0    < S 0:0(0) win 65535 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0    > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+.100 < . 1:1(0) ack 1 win 511
+0    accept(3, ..., ...) = 4
+0    setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0

+0    write(4, ..., 26000) = 26000
+0    > . 1:5001(5000) ack 1
+0    > . 5001:10001(5000) ack 1
+0    %{ assert tcpi_snd_cwnd == 10 }%

+.100 < . 1:1(0) ack 10001 win 511
+0    %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }%
+0    > . 10001:20001(10000) ack 1
+0    > P. 20001:26001(6000) ack 1

+.100 < . 1:1(0) ack 26001 win 511
+0    %{ assert tcpi_snd_cwnd == 36, tcpi_snd_cwnd }%

+4 write(4, ..., 20000) = 20000
// If slow start after idle works properly, we should send 5 MSS here (cwnd/2)
+0    > . 26001:31001(5000) ack 1
+0    %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }%
+0    > . 31001:36001(5000) ack 1

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4e4d6bcd0ca9..0abca2841de2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3332,6 +3332,9 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 			tp->pred_flags = 0;
 			tcp_fast_path_check(sk);
 
+			if (tcp_send_head(sk))
+				tcp_slow_start_after_idle_check(sk);
+
 			if (nwin > tp->max_window) {
 				tp->max_window = nwin;
 				tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
-- 
cgit v1.2.3


From 43e122b014c955a33220fabbd09c4b5e4f422c3c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Aug 2015 17:38:02 -0700
Subject: tcp: refine pacing rate determination

When TCP pacing was added back in linux-3.12, we chose
to apply a fixed ratio of 200 % against current rate,
to allow probing for optimal throughput even during
slow start phase, where cwnd can be doubled every other gRTT.

At Google, we found it was better applying a different ratio
while in Congestion Avoidance phase.
This ratio was set to 120 %.

We've used the normal tcp_in_slow_start() helper for a while,
then tuned the condition to select the conservative ratio
as soon as cwnd >= ssthresh/2 :

- After cwnd reduction, it is safer to ramp up more slowly,
  as we approach optimal cwnd.
- Initial ramp up (ssthresh == INFINITY) still allows doubling
  cwnd every other RTT.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0abca2841de2..dc08e2352665 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -753,13 +753,29 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
  * TCP pacing, to smooth the burst on large writes when packets
  * in flight is significantly lower than cwnd (or rwin)
  */
+int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
+int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
+
 static void tcp_update_pacing_rate(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	u64 rate;
 
 	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
-	rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
+	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
+
+	/* current rate is (cwnd * mss) / srtt
+	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
+	 * In Congestion Avoidance phase, set it to 120 % the current rate.
+	 *
+	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
+	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
+	 *	 end of slow start and should slow down.
+	 */
+	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
+		rate *= sysctl_tcp_pacing_ss_ratio;
+	else
+		rate *= sysctl_tcp_pacing_ca_ratio;
 
 	rate *= max(tp->snd_cwnd, tp->packets_out);
 
-- 
cgit v1.2.3


From c3a8d9474684d391b0afc3970d9b249add15ec07 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 31 Aug 2015 15:58:47 +0200
Subject: tcp: use dctcp if enabled on the route to the initiator

Currently, the following case doesn't use DCTCP, even if it should:
A responder has f.e. Cubic as system wide default, but for a specific
route to the initiating host, DCTCP is being set in RTAX_CC_ALGO. The
initiating host then uses DCTCP as congestion control, but since the
initiator sets ECT(0), tcp_ecn_create_request() doesn't set ecn_ok,
and we have to fall back to Reno after 3WHS completes.

We were thinking about how to solve this in a minimal, non-intrusive
way without bloating tcp_ecn_create_request() needlessly: let's cache
the CA ecn option flag in RTAX_FEATURES. In other words, when ECT(0)
is set on the SYN packet, set ecn_ok=1 iff route RTAX_FEATURES
contains the unexposed (internal-only) DST_FEATURE_ECN_CA. This allows
us to do only a single metric feature lookup inside tcp_ecn_create_request().

Joint work with Florian Westphal.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index dc08e2352665..a8f515bb19c4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6003,14 +6003,17 @@ static void tcp_ecn_create_request(struct request_sock *req,
 	const struct net *net = sock_net(listen_sk);
 	bool th_ecn = th->ece && th->cwr;
 	bool ect, ecn_ok;
+	u32 ecn_ok_dst;
 
 	if (!th_ecn)
 		return;
 
 	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
-	ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
+	ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
+	ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
 
-	if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk))
+	if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
+	    (ecn_ok_dst & DST_FEATURE_ECN_CA))
 		inet_rsk(req)->ecn_ok = 1;
 }
 
-- 
cgit v1.2.3