From 5d134f1c1f36166e8a738de92c4d2f4c262ff91b Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 5 Jan 2013 16:10:48 +0000 Subject: tcp: make sysctl_tcp_ecn namespace aware As per suggestion from Eric Dumazet this patch makes tcp_ecn sysctl namespace aware. The reason behind this patch is to ease the testing of ecn problems on the internet and allows applications to tune their own use of ecn. Cc: Eric Dumazet Cc: David Miller Cc: Stephen Hemminger Signed-off-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a28e4db8a952..38e11841be09 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -81,8 +81,6 @@ int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; EXPORT_SYMBOL(sysctl_tcp_reordering); -int sysctl_tcp_ecn __read_mostly = 2; -EXPORT_SYMBOL(sysctl_tcp_ecn); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; -- cgit v1.2.3 From cef401de7be8c4e155c6746bfccf721a4fa5fab9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 25 Jan 2013 20:34:37 +0000 Subject: net: fix possible wrong checksum generation Pravin Shelar mentioned that GSO could potentially generate wrong TX checksum if skb has fragments that are overwritten by the user between the checksum computation and transmit. He suggested to linearize skbs but this extra copy can be avoided for normal tcp skbs cooked by tcp_sendmsg(). This patch introduces a new SKB_GSO_SHARED_FRAG flag, set in skb_shinfo(skb)->gso_type if at least one frag can be modified by the user. Typical sources of such possible overwrites are {vm}splice(), sendfile(), and macvtap/tun/virtio_net drivers. Tested: $ netperf -H 7.7.8.84 MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.8.84 () port 0 AF_INET Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 10.00 3959.52 $ netperf -H 7.7.8.84 -t TCP_SENDFILE TCP SENDFILE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.8.84 () port 0 AF_INET Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 10.00 3216.80 Performance of the SENDFILE is impacted by the extra allocation and copy, and because we use order-0 pages, while the TCP_STREAM uses bigger pages. Reported-by: Pravin Shelar Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0905997e5873..492c7cfe1453 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1240,13 +1240,13 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, */ if (!skb_shinfo(prev)->gso_size) { skb_shinfo(prev)->gso_size = mss; - skb_shinfo(prev)->gso_type = sk->sk_gso_type; + skb_shinfo(prev)->gso_type |= sk->sk_gso_type; } /* CHECKME: To clear or not to clear? Mimics normal skb currently */ if (skb_shinfo(skb)->gso_segs <= 1) { skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; + skb_shinfo(skb)->gso_type &= SKB_GSO_SHARED_FRAG; } /* Difference in this won't matter, both ACKed by the same cumul. ACK */ -- cgit v1.2.3 From 66555e92fb7a619188c02cceae4bbc414f15f96d Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 31 Jan 2013 11:16:46 -0800 Subject: tcp: detect SYN/data drop when F-RTO is disabled On receiving the SYN-ACK, Fast Open checks icsk_retransmit for SYN retransmission to detect SYN/data drops. But if F-RTO is disabled, icsk_retransmit is reset at step D of tcp_fastretrans_alert() ( under tcp_ack()) before tcp_rcv_fastopen_synack(). The fix is to use total_retrans instead which accounts for SYN retransmission regardless the use of F-RTO. Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 18f97ca76b00..8aca4ee95ff9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5649,8 +5649,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, * the remote receives only the retransmitted (regular) SYNs: either * the original SYN-data or the corresponding SYN-ACK is lost. */ - syn_drop = (cookie->len <= 0 && data && - inet_csk(sk)->icsk_retransmits); + syn_drop = (cookie->len <= 0 && data && tp->total_retrans); tcp_fastopen_cache_set(sk, mss, cookie, syn_drop); -- cgit v1.2.3 From 2e5f421211ff76c17130b4597bc06df4eeead24f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 3 Feb 2013 09:13:05 +0000 Subject: tcp: frto should not set snd_cwnd to 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 9dc274151a548 (tcp: fix ABC in tcp_slow_start()) uncovered a bug in FRTO code : tcp_process_frto() is setting snd_cwnd to 0 if the number of in flight packets is 0. As Neal pointed out, if no packet is in flight we lost our chance to disambiguate whether a loss timeout was spurious. We should assume it was a proper loss. Reported-by: Pasi Kärkkäinen Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Cc: Ilpo Järvinen Cc: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8aca4ee95ff9..680c4224ed96 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3484,7 +3484,8 @@ static bool tcp_process_frto(struct sock *sk, int flag) ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED))) tp->undo_marker = 0; - if (!before(tp->snd_una, tp->frto_highmark)) { + if (!before(tp->snd_una, tp->frto_highmark) || + !tcp_packets_in_flight(tp)) { tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); return true; } -- cgit v1.2.3 From ca2eb5679f8ddffff60156af42595df44a315ef0 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 5 Feb 2013 07:25:17 +0000 Subject: tcp: remove Appropriate Byte Count support TCP Appropriate Byte Count was added by me, but later disabled. There is no point in maintaining it since it is a potential source of bugs and Linux already implements other better window protection heuristics. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e376aa9591bc..f56bd1082f54 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -98,7 +98,6 @@ int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_thin_dupack __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; -int sysctl_tcp_abc __read_mostly; int sysctl_tcp_early_retrans __read_mostly = 2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ @@ -2007,7 +2006,6 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; tp->frto_counter = 0; - tp->bytes_acked = 0; tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); @@ -2056,7 +2054,6 @@ void tcp_enter_loss(struct sock *sk, int how) tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tp->bytes_acked = 0; tcp_clear_retrans_partial(tp); if (tcp_is_reno(tp)) @@ -2684,7 +2681,6 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) struct tcp_sock *tp = tcp_sk(sk); tp->high_seq = tp->snd_nxt; - tp->bytes_acked = 0; tp->snd_cwnd_cnt = 0; tp->prior_cwnd = tp->snd_cwnd; tp->prr_delivered = 0; @@ -2735,7 +2731,6 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) struct tcp_sock *tp = tcp_sk(sk); tp->prior_ssthresh = 0; - tp->bytes_acked = 0; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; tcp_init_cwnd_reduction(sk, set_ssthresh); @@ -3417,7 +3412,6 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) { tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_cnt = 0; - tp->bytes_acked = 0; TCP_ECN_queue_cwr(tp); tcp_moderate_cwnd(tp); } @@ -3609,15 +3603,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, prior_snd_una)) flag |= FLAG_SND_UNA_ADVANCED; - if (sysctl_tcp_abc) { - if (icsk->icsk_ca_state < TCP_CA_CWR) - tp->bytes_acked += ack - prior_snd_una; - else if (icsk->icsk_ca_state == TCP_CA_Loss) - /* we assume just one segment left network */ - tp->bytes_acked += min(ack - prior_snd_una, - tp->mss_cache); - } - prior_fackets = tp->fackets_out; prior_in_flight = tcp_packets_in_flight(tp); -- cgit v1.2.3 From 6731d2095bd4aef18027c72ef845ab1087c3ba63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 4 Feb 2013 02:14:25 +0000 Subject: tcp: fix for zero packets_in_flight was too broad MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are transients during normal FRTO procedure during which the packets_in_flight can go to zero between write_queue state updates and firing the resulting segments out. As FRTO processing occurs during that window the check must be more precise to not match "spuriously" :-). More specificly, e.g., when packets_in_flight is zero but FLAG_DATA_ACKED is true the problematic branch that set cwnd into zero would not be taken and new segments might be sent out later. Signed-off-by: Ilpo Järvinen Tested-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 680c4224ed96..ad70a962c20e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3484,8 +3484,7 @@ static bool tcp_process_frto(struct sock *sk, int flag) ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED))) tp->undo_marker = 0; - if (!before(tp->snd_una, tp->frto_highmark) || - !tcp_packets_in_flight(tp)) { + if (!before(tp->snd_una, tp->frto_highmark)) { tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); return true; } @@ -3505,6 +3504,11 @@ static bool tcp_process_frto(struct sock *sk, int flag) } } else { if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { + if (!tcp_packets_in_flight(tp)) { + tcp_enter_frto_loss(sk, 2, flag); + return true; + } + /* Prevent sending of new data. */ tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)); -- cgit v1.2.3 From ee684b6f2830047d19877e5547989740f18b1a5d Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Mon, 11 Feb 2013 05:50:19 +0000 Subject: tcp: send packets with a socket timestamp A socket timestamp is a sum of the global tcp_time_stamp and a per-socket offset. A socket offset is added in places where externally visible tcp timestamp option is parsed/initialized. Connections in the SYN_RECV state are not supported, global tcp_time_stamp is used for them, because repair mode doesn't support this state. In a future it can be implemented by the similar way as for TIME_WAIT sockets. Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Cc: Eric Dumazet Cc: Pavel Emelyanov Signed-off-by: Andrey Vagin Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ea678b62e94f..d9bfaea34322 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3860,7 +3860,7 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr ++ptr; tp->rx_opt.rcv_tsval = ntohl(*ptr); ++ptr; - tp->rx_opt.rcv_tsecr = ntohl(*ptr); + tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; return true; } return false; @@ -3884,7 +3884,11 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, if (tcp_parse_aligned_timestamp(tp, th)) return true; } + tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); + if (tp->rx_opt.saw_tstamp) + tp->rx_opt.rcv_tsecr -= tp->tsoffset; + return true; } @@ -5665,6 +5669,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, int saved_clamp = tp->rx_opt.mss_clamp; tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc); + if (tp->rx_opt.saw_tstamp) + tp->rx_opt.rcv_tsecr -= tp->tsoffset; if (th->ack) { /* rfc793: -- cgit v1.2.3 From c9af6db4c11ccc6c3e7f19bbc15d54023956f97c Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 11 Feb 2013 09:27:41 +0000 Subject: net: Fix possible wrong checksum generation. Patch cef401de7be8c4e (net: fix possible wrong checksum generation) fixed wrong checksum calculation but it broke TSO by defining new GSO type but not a netdev feature for that type. net_gso_ok() would not allow hardware checksum/segmentation offload of such packets without the feature. Following patch fixes TSO and wrong checksum. This patch uses same logic that Eric Dumazet used. Patch introduces new flag SKBTX_SHARED_FRAG if at least one frag can be modified by the user. but SKBTX_SHARED_FRAG flag is kept in skb shared info tx_flags rather than gso_type. tx_flags is better compared to gso_type since we can have skb with shared frag without gso packet. It does not link SHARED_FRAG to GSO, So there is no need to define netdev feature for this. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d9bfaea34322..a759e19496d2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1239,13 +1239,13 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, */ if (!skb_shinfo(prev)->gso_size) { skb_shinfo(prev)->gso_size = mss; - skb_shinfo(prev)->gso_type |= sk->sk_gso_type; + skb_shinfo(prev)->gso_type = sk->sk_gso_type; } /* CHECKME: To clear or not to clear? Mimics normal skb currently */ if (skb_shinfo(skb)->gso_segs <= 1) { skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type &= SKB_GSO_SHARED_FRAG; + skb_shinfo(skb)->gso_type = 0; } /* Difference in this won't matter, both ACKed by the same cumul. ACK */ -- cgit v1.2.3 From aab2b4bf224ef8358d262f95b568b8ad0cecf0a0 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 4 Mar 2013 06:23:05 +0000 Subject: tcp: fix double-counted receiver RTT when leaving receiver fast path We should not update ts_recent and call tcp_rcv_rtt_measure_ts() both before and after going to step5. That wastes CPU and double-counts the receiver-side RTT sample. Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a759e19496d2..0d9bdacce99f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5485,6 +5485,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tcp_checksum_complete_user(sk, skb)) goto csum_error; + if ((int)skb->truesize > sk->sk_forward_alloc) + goto step5; + /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: @@ -5496,9 +5499,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_rcv_rtt_measure_ts(sk, skb); - if ((int)skb->truesize > sk->sk_forward_alloc) - goto step5; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ -- cgit v1.2.3