From 6b5a5c0dbb11dcff4e1b0f1ef87a723197948ed4 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 21 Nov 2011 17:15:14 +0000 Subject: tcp: do not scale TSO segment size with reordering degree Since 2005 (c1b4a7e69576d65efc31a8cea0714173c2841244) tcp_tso_should_defer has been using tcp_max_burst() as a target limit for deciding how large to make outgoing TSO packets when not using sysctl_tcp_tso_win_divisor. But since 2008 (dd9e0dda66ba38a2ddd1405ac279894260dc5c36) tcp_max_burst() returns the reordering degree. We should not have tcp_tso_should_defer attempt to build larger segments just because there is more reordering. This commit splits the notion of deferral size used in TSO from the notion of burst size used in cwnd moderation, and returns the TSO deferral limit to its original value. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 63170e297540..58f69acd3d22 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1581,7 +1581,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) * frame, so if we have space for more than 3 frames * then send now. */ - if (limit > tcp_max_burst(tp) * tp->mss_cache) + if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache) goto send_now; } -- cgit v1.2.3 From 117632e64d2a5f464e491fe221d7169a3814a77b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 3 Dec 2011 21:39:53 +0000 Subject: tcp: take care of misalignments We discovered that TCP stack could retransmit misaligned skbs if a malicious peer acknowledged sub MSS frame. This currently can happen only if output interface is non SG enabled : If SG is enabled, tcp builds headless skbs (all payload is included in fragments), so the tcp trimming process only removes parts of skb fragments, header stay aligned. Some arches cant handle misalignments, so force a head reallocation and shrink headroom to MAX_TCP_HEADER. Dont care about misaligments on x86 and PPC (or other arches setting NET_IP_ALIGN to 0) This patch introduces __pskb_copy() which can specify the headroom of new head, and pskb_copy() becomes a wrapper on top of __pskb_copy() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 58f69acd3d22..50788d67bdb7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2147,7 +2147,15 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + /* make sure skb->data is aligned on arches that require it */ + if (unlikely(NET_IP_ALIGN && ((unsigned long)skb->data & 3))) { + struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, + GFP_ATOMIC); + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; + } else { + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + } if (err == 0) { /* Update global TCP statistics. */ -- cgit v1.2.3 From 4fa48bf3c75069d636fc8830743c929a062e80dc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 4 Dec 2011 08:51:08 +0000 Subject: tcp: fix tcp_trim_head() commit f07d960df3 (tcp: avoid frag allocation for small frames) breaked assumption in tcp stack that skb is either linear (skb->data_len == 0), or fully fragged (skb->data_len == skb->len) tcp_trim_head() made this assumption, we must fix it. Thanks to Vijay for providing a very detailed explanation. Reported-by: Vijay Subramanian Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 50788d67bdb7..cf3068038942 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1093,6 +1093,13 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) { int i, k, eat; + eat = min_t(int, len, skb_headlen(skb)); + if (eat) { + __skb_pull(skb, eat); + len -= eat; + if (!len) + return; + } eat = len; k = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { @@ -1124,11 +1131,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) return -ENOMEM; - /* If len == headlen, we avoid __skb_pull to preserve alignment. */ - if (unlikely(len < skb_headlen(skb))) - __skb_pull(skb, len); - else - __pskb_trim_head(skb, len - skb_headlen(skb)); + __pskb_trim_head(skb, len); TCP_SKB_CB(skb)->seq += len; skb->ip_summed = CHECKSUM_PARTIAL; -- cgit v1.2.3 From 180d8cd942ce336b2c869d324855c40c5db478ad Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:02 +0000 Subject: foundations of per-cgroup memory pressure controlling. This patch replaces all uses of struct sock fields' memory_pressure, memory_allocated, sockets_allocated, and sysctl_mem to acessor macros. Those macros can either receive a socket argument, or a mem_cgroup argument, depending on the context they live in. Since we're only doing a macro wrapping here, no performance impact at all is expected in the case where we don't have cgroups disabled. Signed-off-by: Glauber Costa Reviewed-by: Hiroyouki Kamezawa CC: David S. Miller CC: Eric W. Biederman CC: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index cf3068038942..8c8de2780c7a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1922,7 +1922,7 @@ u32 __tcp_select_window(struct sock *sk) if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; - if (tcp_memory_pressure) + if (sk_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); -- cgit v1.2.3 From 5b35e1e6e9ca651e6b291c96d1106043c9af314a Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 28 Jan 2012 17:29:46 +0000 Subject: tcp: fix tcp_trim_head() to adjust segment count with skb MSS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes tcp_trim_head() to recalculate the number of segments in the skb with the skb's existing MSS, so trimming the head causes the skb segment count to be monotonically non-increasing - it should stay the same or go down, but not increase. Previously tcp_trim_head() used the current MSS of the connection. But if there was a decrease in MSS between original transmission and ACK (e.g. due to PMTUD), this could cause tcp_trim_head() to counter-intuitively increase the segment count when trimming bytes off the head of an skb. This violated assumptions in tcp_tso_acked() that tcp_trim_head() only decreases the packet count, so that packets_acked in tcp_tso_acked() could underflow, leading tcp_clean_rtx_queue() to pass u32 pkts_acked values as large as 0xffffffff to ca_ops->pkts_acked(). As an aside, if tcp_trim_head() had really wanted the skb to reflect the current MSS, it should have called tcp_set_skb_tso_segs() unconditionally, since a decrease in MSS would mean that a single-packet skb should now be sliced into multiple segments. Signed-off-by: Neal Cardwell Acked-by: Nandita Dukkipati Acked-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8c8de2780c7a..4ff3b6dc74fc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1141,11 +1141,9 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) sk_mem_uncharge(sk, len); sock_set_flag(sk, SOCK_QUEUE_SHRUNK); - /* Any change of skb->len requires recalculation of tso - * factor and mss. - */ + /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk)); + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); return 0; } -- cgit v1.2.3