From 4d846f02392a710f9604892ac3329e628e60a230 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 16 Apr 2012 23:28:07 +0000
Subject: tcp: fix tcp_grow_window() for large incoming frames

tcp_grow_window() has to grow rcv_ssthresh up to window_clamp, allowing
sender to increase its window.

tcp_grow_window() still assumes a tcp frame is under MSS, but its no
longer true with LRO/GRO.

This patch fixes one of the performance issue we noticed with GRO on.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9944c1d9a218..3ff364065376 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -335,6 +335,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
 			incr = __tcp_grow_window(sk, skb);
 
 		if (incr) {
+			incr = max_t(int, incr, 2 * skb->len);
 			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
 					       tp->window_clamp);
 			inet_csk(sk)->icsk_ack.quick |= 1;
-- 
cgit v1.2.3


From 22b4a4f22da4b39c6f7f679fd35f3d35c91bf851 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 18 Apr 2012 10:14:23 +0000
Subject: tcp: fix retransmit of partially acked frames

Alexander Beregalov reported skb_over_panic errors and provided stack
trace.

I occurs commit a21d45726aca (tcp: avoid order-1 allocations on wifi and
tx path) added a regression, when a retransmit is done after a partial
ACK.

tcp_retransmit_skb() tries to aggregate several frames if the first one
has enough available room to hold the following ones payload. This is
controlled by /proc/sys/net/ipv4/tcp_retrans_collapse tunable (default :
enabled)

Problem is we must make sure _pskb_trim_head() doesnt fool
skb_availroom() when pulling some bytes from skb (this pull is done when
receiver ACK part of the frame).

Reported-by: Alexander Beregalov <a.beregalov@gmail.com>
Cc: Marc MERLIN <marc@merlins.org>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 376b2cfbb685..7ac6423117ad 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1096,6 +1096,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
 	eat = min_t(int, len, skb_headlen(skb));
 	if (eat) {
 		__skb_pull(skb, eat);
+		skb->avail_size -= eat;
 		len -= eat;
 		if (!len)
 			return;
-- 
cgit v1.2.3


From 62ad6fcd743792bf294f2a7ba26ab8f462065150 Mon Sep 17 00:00:00 2001
From: Shan Wei <davidshan@tencent.com>
Date: Tue, 24 Apr 2012 18:15:41 +0000
Subject: udp_diag: implement idiag_get_info for udp/udplite to get queue
 information

When we use netlink to monitor queue information for udp socket,
idiag_rqueue and idiag_wqueue of inet_diag_msg are returned with 0.

Keep consistent with netstat, just return back allocated rmem/wmem size.

Signed-off-by: Shan Wei <davidshan@tencent.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_diag.c | 2 +-
 net/ipv4/udp_diag.c  | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 8d25a1c557eb..8f8db724bfaf 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -141,7 +141,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 			goto rtattr_failure;
 
 	if (icsk == NULL) {
-		r->idiag_rqueue = r->idiag_wqueue = 0;
+		handler->idiag_get_info(sk, r, NULL);
 		goto out;
 	}
 
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 8a949f19deb6..a7f86a3cd502 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -146,9 +146,17 @@ static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
 	return udp_dump_one(&udp_table, in_skb, nlh, req);
 }
 
+static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+		void *info)
+{
+	r->idiag_rqueue = sk_rmem_alloc_get(sk);
+	r->idiag_wqueue = sk_wmem_alloc_get(sk);
+}
+
 static const struct inet_diag_handler udp_diag_handler = {
 	.dump		 = udp_diag_dump,
 	.dump_one	 = udp_diag_dump_one,
+	.idiag_get_info  = udp_diag_get_info,
 	.idiag_type	 = IPPROTO_UDP,
 };
 
@@ -167,6 +175,7 @@ static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *
 static const struct inet_diag_handler udplite_diag_handler = {
 	.dump		 = udplite_diag_dump,
 	.dump_one	 = udplite_diag_dump_one,
+	.idiag_get_info  = udp_diag_get_info,
 	.idiag_type	 = IPPROTO_UDPLITE,
 };
 
-- 
cgit v1.2.3


From 651913ce9de2bbcedef608c5d6cf39c244248509 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Fri, 27 Apr 2012 11:29:37 -0400
Subject: tcp: clean up use of jiffies in tcp_rcv_rtt_measure()

Clean up a reference to jiffies in tcp_rcv_rtt_measure() that should
instead reference tcp_time_stamp. Since the result of the subtraction
is passed into a function taking u32, this should not change any
behavior (and indeed the generated assembly does not change on
x86_64). However, it seems worth cleaning this up for consistency and
clarity (and perhaps to avoid bugs if this is copied and pasted
somewhere else).

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3ff364065376..2a702e3a4ae6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -495,7 +495,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
 		goto new_measure;
 	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
 		return;
-	tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1);
+	tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
 
 new_measure:
 	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
-- 
cgit v1.2.3


From 1cebce36d660c83bd1353e41f3e66abd4686f215 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 30 Apr 2012 06:00:18 +0000
Subject: tcp: fix infinite cwnd in tcp_complete_cwr()

When the cwnd reduction is done, ssthresh may be infinite
if TCP enters CWR via ECN or F-RTO. If cwnd is not undone, i.e.,
undo_marker is set, tcp_complete_cwr() falsely set cwnd to the
infinite ssthresh value. The correct operation is to keep cwnd
intact because it has been updated in ECN or F-RTO.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2a702e3a4ae6..d99efd7dfb6c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2868,11 +2868,14 @@ static inline void tcp_complete_cwr(struct sock *sk)
 
 	/* Do not moderate cwnd if it's already undone in cwr or recovery. */
 	if (tp->undo_marker) {
-		if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR)
+		if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
 			tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
-		else /* PRR */
+			tp->snd_cwnd_stamp = tcp_time_stamp;
+		} else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) {
+			/* PRR algorithm. */
 			tp->snd_cwnd = tp->snd_ssthresh;
-		tp->snd_cwnd_stamp = tcp_time_stamp;
+			tp->snd_cwnd_stamp = tcp_time_stamp;
+		}
 	}
 	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
 }
-- 
cgit v1.2.3


From b49960a05e32121d29316cfdf653894b88ac9190 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 2 May 2012 02:28:41 +0000
Subject: tcp: change tcp_adv_win_scale and tcp_rmem[2]

tcp_adv_win_scale default value is 2, meaning we expect a good citizen
skb to have skb->len / skb->truesize ratio of 75% (3/4)

In 2.6 kernels we (mis)accounted for typical MSS=1460 frame :
1536 + 64 + 256 = 1856 'estimated truesize', and 1856 * 3/4 = 1392.
So these skbs were considered as not bloated.

With recent truesize fixes, a typical MSS=1460 frame truesize is now the
more precise :
2048 + 256 = 2304. But 2304 * 3/4 = 1728.
So these skb are not good citizen anymore, because 1460 < 1728

(GRO can escape this problem because it build skbs with a too low
truesize.)

This also means tcp advertises a too optimistic window for a given
allocated rcvspace : When receiving frames, sk_rmem_alloc can hit
sk_rcvbuf limit and we call tcp_prune_queue()/tcp_collapse() too often,
especially when application is slow to drain its receive queue or in
case of losses (netperf is fast, scp is slow). This is a major latency
source.

We should adjust the len/truesize ratio to 50% instead of 75%

This patch :

1) changes tcp_adv_win_scale default to 1 instead of 2

2) increase tcp_rmem[2] limit from 4MB to 6MB to take into account
better truesize tracking and to allow autotuning tcp receive window to
reach same value than before. Note that same amount of kernel memory is
consumed compared to 2.6 kernels.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c       | 9 +++++----
 net/ipv4/tcp_input.c | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8bb6adeb62c0..1272a88c2a63 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3243,7 +3243,7 @@ void __init tcp_init(void)
 {
 	struct sk_buff *skb = NULL;
 	unsigned long limit;
-	int max_share, cnt;
+	int max_rshare, max_wshare, cnt;
 	unsigned int i;
 	unsigned long jiffy = jiffies;
 
@@ -3303,15 +3303,16 @@ void __init tcp_init(void)
 	tcp_init_mem(&init_net);
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
-	max_share = min(4UL*1024*1024, limit);
+	max_wshare = min(4UL*1024*1024, limit);
+	max_rshare = min(6UL*1024*1024, limit);
 
 	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
 	sysctl_tcp_wmem[1] = 16*1024;
-	sysctl_tcp_wmem[2] = max(64*1024, max_share);
+	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
 
 	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
 	sysctl_tcp_rmem[1] = 87380;
-	sysctl_tcp_rmem[2] = max(87380, max_share);
+	sysctl_tcp_rmem[2] = max(87380, max_rshare);
 
 	pr_info("Hash tables configured (established %u bind %u)\n",
 		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d99efd7dfb6c..257b61789eeb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -85,7 +85,7 @@ int sysctl_tcp_ecn __read_mostly = 2;
 EXPORT_SYMBOL(sysctl_tcp_ecn);
 int sysctl_tcp_dsack __read_mostly = 1;
 int sysctl_tcp_app_win __read_mostly = 31;
-int sysctl_tcp_adv_win_scale __read_mostly = 2;
+int sysctl_tcp_adv_win_scale __read_mostly = 1;
 EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
 
 int sysctl_tcp_stdurg __read_mostly;
-- 
cgit v1.2.3