From a1c1f281b84a751fdb5ff919da3b09df7297619f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?=
Date: Tue, 13 May 2008 02:53:26 -0700
Subject: tcp FRTO: Fix fallback to conventional recovery
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It seems that commit 009a2e3e4ec ("[TCP] FRTO: Improve interoperability
with other undo_marker users") ran into another land-mine, which caused
the fallback to conventional recovery to break:

1. A cumulative ACK arrives after the FRTO retransmission.

2. tcp_try_to_open sees zero retrans_out and clears retrans_stamp,
   which should be kept, as it would be in the CA_Loss state.

3. The undo_marker change allowed tcp_packet_delayed to return true
   because of the cleared retrans_stamp once FRTO was terminated,
   causing LossUndo to occur, which reverts all the loss markings
   FRTO made.

As a result, conventional recovery basically recovered only one loss
per RTT, which is not that efficient. It was quite unobvious that the
undo_marker change broke something like this; it took quite a long
session to track down because of the non-intuitiveness of the bug
(luckily I had a trivial reproducer at hand, and I also learned to use
kprobes in the process :-)).

Together with the NewReno+FRTO fix and the FRTO in-order work-around,
this fixes Damon's problems; this and the first-mentioned fix are
enough to resolve Bugzilla #10063.

Signed-off-by: Ilpo Järvinen
Reported-by: Damon L. Chesser
Tested-by: Damon L. Chesser
Tested-by: Sebastian Hyrwall
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 26c936930e92..d6edb98fd526 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2482,7 +2482,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
 
 	tcp_verify_left_out(tp);
 
-	if (tp->retrans_out == 0)
+	if (!tp->frto_counter && tp->retrans_out == 0)
 		tp->retrans_stamp = 0;
 
 	if (flag & FLAG_ECE)
--
cgit v1.2.3
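A note on step 3 above: tcp_packet_delayed() treats a zeroed retrans_stamp as
evidence that the retransmission was spurious, which is what lets the bogus
LossUndo fire. The following standalone C sketch models the before/after
clearing rule; the field names mirror the kernel's, but the program itself is
an invented illustration, not kernel code.

#include <stdint.h>
#include <stdio.h>

/* Minimal model of the fields involved; not the kernel structs. */
struct tp_model {
	uint32_t retrans_stamp;	/* timestamp of first retransmission, 0 = none */
	int	 frto_counter;	/* nonzero while FRTO is in progress */
	uint32_t retrans_out;	/* retransmitted-and-unacked segment count */
};

/* Modeled on tcp_packet_delayed(): a zeroed retrans_stamp alone is
 * enough to claim the retransmission was spurious, enabling undo. */
static int packet_delayed(const struct tp_model *tp)
{
	return tp->retrans_stamp == 0;
}

int main(void)
{
	struct tp_model tp = { .retrans_stamp = 12345,
			       .frto_counter = 1, .retrans_out = 0 };

	/* Old rule: cleared the stamp whenever retrans_out hit zero,
	 * even mid-FRTO, so a later undo check passed spuriously. */
	if (tp.retrans_out == 0)
		tp.retrans_stamp = 0;
	printf("old rule: packet_delayed=%d (bogus LossUndo)\n",
	       packet_delayed(&tp));

	/* Fixed rule: keep the stamp while frto_counter is set. */
	tp.retrans_stamp = 12345;
	if (!tp.frto_counter && tp.retrans_out == 0)
		tp.retrans_stamp = 0;
	printf("fixed rule: packet_delayed=%d\n", packet_delayed(&tp));
	return 0;
}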
From 79d44516b4b178ffb6e2159c75584cfcfc097914 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?=
Date: Tue, 13 May 2008 02:54:19 -0700
Subject: tcp FRTO: work-around inorder receivers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If the receiver successfully consumes segments only in order, FRTO's
fallback to conventional recovery produces an RTO loop, because FRTO's
forward transmissions will always get dropped and need to be resent,
yet by default they're not marked as lost (and lost-marked segments are
the only ones we retransmit in CA_Loss).

The price to pay for this is occasionally retransmitting the forward
transmission(s) unnecessarily. SACK blocks help to avoid this somewhat,
so it's mainly a concern for the NewReno case, though SACK is not fully
immune either.

This change has the side-effect of fixing a SACK+FRTO problem where
snd_nxt of the RTO time was no longer available when the fallback
became necessary (this problem would only have occurred when the RTO
covered two or more segments and ECE arrived in step 3; no need to
figure out how to fix that unless the TODO item of selective behavior
is considered in the future).

Signed-off-by: Ilpo Järvinen
Reported-by: Damon L. Chesser
Tested-by: Damon L. Chesser
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d6edb98fd526..b54d9d37b636 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1842,9 +1842,16 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 		}
 
-		/* Don't lost mark skbs that were fwd transmitted after RTO */
-		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) &&
-		    !after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) {
+		/* Marking forward transmissions that were made after RTO lost
+		 * can cause unnecessary retransmissions in some scenarios,
+		 * SACK blocks will mitigate that in some but not in all cases.
+		 * We used to not mark them but it was causing break-ups with
+		 * receivers that do only in-order receival.
+		 *
+		 * TODO: we could detect presence of such receiver and select
+		 * different behavior per flow.
+		 */
+		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
 			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 			tp->lost_out += tcp_skb_pcount(skb);
 		}
@@ -1860,7 +1867,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
 	tp->reordering = min_t(unsigned int, tp->reordering,
 			       sysctl_tcp_reordering);
 	tcp_set_ca_state(sk, TCP_CA_Loss);
-	tp->high_seq = tp->frto_highmark;
+	tp->high_seq = tp->snd_nxt;
 	TCP_ECN_queue_cwr(tp);
 
 	tcp_clear_retrans_hints_partial(tp);
--
cgit v1.2.3
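To see what the loss-marking change does, the sketch below models the old and
new predicates over a toy array of segments. The sequence numbers, the struct,
and the SACKED_ACKED flag value are invented for illustration; the real code
uses after(), which also handles sequence-number wraparound.

#include <stdio.h>

#define SACKED_ACKED 0x1

struct seg {
	unsigned int end_seq;
	unsigned int sacked;
};

/* frto_highmark is snd_nxt at RTO time; segments beyond it are
 * FRTO's forward transmissions. Values here are made up. */
static const unsigned int frto_highmark = 3000;

static int mark_lost_old(const struct seg *s)
{
	return !(s->sacked & SACKED_ACKED) &&
	       !(s->end_seq > frto_highmark);	/* skipped fwd transmissions */
}

static int mark_lost_new(const struct seg *s)
{
	return !(s->sacked & SACKED_ACKED);	/* marks them too */
}

int main(void)
{
	struct seg segs[] = {
		{ 1000, 0 }, { 2000, SACKED_ACKED },
		{ 3000, 0 }, { 4000, 0 /* fwd transmission */ },
	};
	for (unsigned int i = 0; i < 4; i++)
		printf("end_seq %u: old=%d new=%d\n", segs[i].end_seq,
		       mark_lost_old(&segs[i]), mark_lost_new(&segs[i]));
	return 0;
}

With a strictly in-order receiver the forward transmission at 4000 is dropped
anyway, so marking it lost (new=1) is what breaks the RTO loop.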
From 8aca6cb1179ed9bef9351028c8d8af852903eae2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?=
Date: Wed, 4 Jun 2008 11:34:22 -0700
Subject: tcp: Fix inconsistency source (CA_Open only when !tcp_left_out(tp))
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is possible that this skip path causes TCP to end up in an invalid
state where ca_state was left at CA_Open even though some segments had
already come into sacked_out. If the next valid ACK doesn't contain new
SACK information, TCP fails to enter tcp_fastretrans_alert(). Thus at
least high_seq is set incorrectly, to too high a seqno, because some
new data segments could be sent in between (and also, limited transmit
is not being correctly invoked there). Reordering in both directions
can easily cause this situation to occur.

I guess we would want to use tcp_moderate_cwnd(tp) there as well, since
it may be possible to use this to trigger an oversized burst to the
network by sending an old ACK with a huge amount of SACK info, but I'm
a bit unsure about its effects (mainly on FlightSize), so to be on the
safe side I just fixed it minimally for now to keep TCP's state
consistent (obviously, such nasty ACKs have been possible this far).
Though it seems that FlightSize is already underestimated by some
amount, so probably in the long term we might want to trigger recovery
there too, if appropriate, to make the FlightSize calculation resemble
reality at the time when the losses were discovered (but such a change
scares me too much now, and it anyway requires some more thinking about
how to do it, as it likely involves some code shuffling).

This bug was found by Brian Vowell while running my TCP debug patch to
find the cause of another TCP issue (fackets_out miscount).

Signed-off-by: Ilpo Järvinen
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b54d9d37b636..54a0b7412782 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2483,6 +2483,20 @@ static inline void tcp_complete_cwr(struct sock *sk)
 	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
 }
 
+static void tcp_try_keep_open(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int state = TCP_CA_Open;
+
+	if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker)
+		state = TCP_CA_Disorder;
+
+	if (inet_csk(sk)->icsk_ca_state != state) {
+		tcp_set_ca_state(sk, state);
+		tp->high_seq = tp->snd_nxt;
+	}
+}
+
 static void tcp_try_to_open(struct sock *sk, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -2496,15 +2510,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
 		tcp_enter_cwr(sk, 1);
 
 	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
-		int state = TCP_CA_Open;
-
-		if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker)
-			state = TCP_CA_Disorder;
-
-		if (inet_csk(sk)->icsk_ca_state != state) {
-			tcp_set_ca_state(sk, state);
-			tp->high_seq = tp->snd_nxt;
-		}
+		tcp_try_keep_open(sk);
 		tcp_moderate_cwnd(tp);
 	} else {
 		tcp_cwnd_down(sk, flag);
@@ -3310,8 +3316,11 @@ no_queue:
 	return 1;
 
 old_ack:
-	if (TCP_SKB_CB(skb)->sacked)
+	if (TCP_SKB_CB(skb)->sacked) {
 		tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+		if (icsk->icsk_ca_state == TCP_CA_Open)
+			tcp_try_keep_open(sk);
+	}
 
 uninteresting_ack:
 	SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
--
cgit v1.2.3
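The invariant this patch restores is that CA_Open is held only while
tcp_left_out(tp) is zero and nothing is outstanding as retransmitted or
undoable. A compilable toy model of the tcp_try_keep_open() decision follows;
the struct and the initial values are invented, only the decision logic
mirrors the patch.

#include <assert.h>
#include <stdio.h>

enum ca_state { CA_Open, CA_Disorder, CA_CWR, CA_Recovery, CA_Loss };

struct tp_model {
	unsigned int sacked_out, lost_out, retrans_out, undo_marker;
	enum ca_state ca_state;
};

static unsigned int left_out(const struct tp_model *tp)
{
	return tp->sacked_out + tp->lost_out;	/* tcp_left_out() analogue */
}

/* The rule tcp_try_keep_open() applies: CA_Open only when nothing is
 * sacked, lost, retransmitted, or pending undo. */
static enum ca_state pick_state(const struct tp_model *tp)
{
	if (left_out(tp) || tp->retrans_out || tp->undo_marker)
		return CA_Disorder;
	return CA_Open;
}

int main(void)
{
	/* An old ACK carrying new SACK info used to leave this state:
	 * sacked_out > 0 while ca_state stayed CA_Open. */
	struct tp_model tp = { .sacked_out = 2, .ca_state = CA_Open };

	tp.ca_state = pick_state(&tp);
	assert(tp.ca_state == CA_Disorder);
	printf("state corrected to CA_Disorder\n");
	return 0;
}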
From a6604471db5e7a33474a7f16c64d6b118fae3e74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?=
Date: Wed, 4 Jun 2008 12:07:44 -0700
Subject: tcp: fix skb vs fack_count out-of-sync condition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This bug is able to corrupt fackets_out in very rare cases. In order
for it to cause corruption:

1) A DSACK in the middle of a previous SACK block must be generated.

2) In order to take that particular branch, part or all of the DSACKed
   segment must already be SACKed, so that we have it in the cache in
   the first place.

3) The new info must reach high enough that fackets_out will be updated
   on this iteration.

...then fack_count is updated while skb isn't, we walk that particular
segment again, updating fack_count twice for a single skb, and finally
that value is assigned to fackets_out by tcp_sacktag_one.

It is safe to call tcp_sacktag_one just once for a segment (at the
DSACK); there is no need to call it again for the plain SACK.

Potential problems from the miscount are limited to premature entry
into recovery and an inflated reordering metric (which could even
cancel each other out in the luckiest scenarios :-)). Both are quite
insignificant even in the worst case, and there is also code to reset
them (fackets_out once sacked_out becomes zero, the reordering metric
on RTO).

This has been reported by a number of people; because it occurred quite
rarely, it has been very evasive. Andy Furniss was able to get it to
occur a couple of times, so that a bit more info was collected about
the problem using a debug patch, though it still required a lot of
checking around. Thanks also to others who have tried to help here.

This is listed as Bugzilla #10346. The bug was introduced by me in
commit 68f8353b48 ([TCP]: Rewrite SACK block processing &
sack_recv_cache use); I probably thought back then that there was a
need to scan that entry twice, or didn't dare to make it go through
just once there. Going through twice would have required restoring
fack_count after the walk, but as noted above, I chose to drop the
additional walk step altogether here.

Signed-off-by: Ilpo Järvinen
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 54a0b7412782..eba873e9b560 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1392,9 +1392,9 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
 
 	if (before(next_dup->start_seq, skip_to_seq)) {
 		skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq, fack_count);
-		tcp_sacktag_walk(skb, sk, NULL,
-				 next_dup->start_seq, next_dup->end_seq,
-				 1, fack_count, reord, flag);
+		skb = tcp_sacktag_walk(skb, sk, NULL,
+				       next_dup->start_seq, next_dup->end_seq,
+				       1, fack_count, reord, flag);
 	}
 
 	return skb;
--
cgit v1.2.3
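The skb/fack_count desynchronization is easiest to see with a toy walk
function: if the caller throws away the walker's return value, the next walk
re-counts segments that were already counted. A standalone C sketch, with
array indices standing in for skbs and invented packet counts:

#include <stdio.h>

/* Toy walk: advances through pcounts[] from idx, adding packet
 * counts into *fack_count, and returns the new position. */
static int walk(const int *pcounts, int idx, int end, int *fack_count)
{
	for (; idx < end; idx++)
		*fack_count += pcounts[idx];
	return idx;
}

int main(void)
{
	const int pcounts[] = { 1, 2, 1, 3 };
	int fack_count, idx;

	/* Bug analogue: the return value is dropped, so segments 0..1
	 * are walked (and counted) a second time by the next call. */
	fack_count = 0;
	walk(pcounts, 0, 2, &fack_count);	/* DSACK part */
	idx = walk(pcounts, 0, 4, &fack_count);	/* resumes at 0, not 2 */
	printf("buggy fack_count=%d (idx=%d)\n", fack_count, idx);

	/* Fix analogue: keep the position in sync with fack_count,
	 * like assigning tcp_sacktag_walk()'s return value to skb. */
	fack_count = 0;
	idx = walk(pcounts, 0, 2, &fack_count);
	idx = walk(pcounts, idx, 4, &fack_count);
	printf("fixed fack_count=%d (idx=%d)\n", fack_count, idx);
	return 0;
}

The buggy variant reports 10 where the true total is 7, the same kind of
inflation that ended up in fackets_out via tcp_sacktag_one.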
From ec0a196626bd12e0ba108d7daa6d95a4fb25c2c5 Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Thu, 12 Jun 2008 16:31:35 -0700
Subject: tcp: Revert 'process defer accept as established' changes.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts two changesets, ec3c0982a2dd1e671bad8e9d26c28dcba0039d87
("[TCP]: TCP_DEFER_ACCEPT updates - process as established") and the
follow-on bug fix 9ae27e0adbf471c7a6b80102e38e1d5a346b3b38
("tcp: Fix slab corruption with ipv6 and tcp6fuzz").

This change causes several problems, first reported by Ingo Molnar as
a distcc-over-loopback regression where connections were getting stuck.

Ilpo Järvinen first spotted the locking problems. The new function
added by this code, tcp_defer_accept_check(), only has the child socket
locked, yet it is modifying state of the parent listening socket.
Fixing that is non-trivial at best, because we can't simply grab the
parent listening socket's lock at this point: that would create an ABBA
deadlock. The normal ordering is parent listening socket --> child
socket, but this code path would require the reverse lock ordering.

Next is a problem noticed by Vitaliy Gusev, he noted:

----------------------------------------
>--- a/net/ipv4/tcp_timer.c
>+++ b/net/ipv4/tcp_timer.c
>@@ -481,6 +481,11 @@ static void tcp_keepalive_timer (unsigned long data)
> 		goto death;
> 	}
>
>+	if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) {
>+		tcp_send_active_reset(sk, GFP_ATOMIC);
>+		goto death;

Here socket sk is not attached to listening socket's request queue.
tcp_done() will not call inet_csk_destroy_sock() (and
tcp_v4_destroy_sock() which should release this sk) as socket is not
DEAD. Therefore socket sk will be lost for freeing.
----------------------------------------

Finally, Alexey Kuznetsov argues that there might not even be any real
value or advantage to these new semantics even if we fix all of the
bugs:

----------------------------------------
Hiding from accept() sockets with only out-of-order data only
is the only thing which is impossible with old approach. Is this
really so valuable? My opinion: no, this is nothing but a new
loophole to consume memory without control.
----------------------------------------

So revert this thing for now.

Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 45 ---------------------------------------------
 1 file changed, 45 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index eba873e9b560..cad73b7dfef0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4541,49 +4541,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
 	}
 }
 
-static int tcp_defer_accept_check(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if (tp->defer_tcp_accept.request) {
-		int queued_data = tp->rcv_nxt - tp->copied_seq;
-		int hasfin = !skb_queue_empty(&sk->sk_receive_queue) ?
-			     tcp_hdr((struct sk_buff *)
-				     sk->sk_receive_queue.prev)->fin : 0;
-
-		if (queued_data && hasfin)
-			queued_data--;
-
-		if (queued_data &&
-		    tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) {
-			if (sock_flag(sk, SOCK_KEEPOPEN)) {
-				inet_csk_reset_keepalive_timer(sk,
-						keepalive_time_when(tp));
-			} else {
-				inet_csk_delete_keepalive_timer(sk);
-			}
-
-			inet_csk_reqsk_queue_add(
-					tp->defer_tcp_accept.listen_sk,
-					tp->defer_tcp_accept.request,
-					sk);
-
-			tp->defer_tcp_accept.listen_sk->sk_data_ready(
-					tp->defer_tcp_accept.listen_sk, 0);
-
-			sock_put(tp->defer_tcp_accept.listen_sk);
-			sock_put(sk);
-			tp->defer_tcp_accept.listen_sk = NULL;
-			tp->defer_tcp_accept.request = NULL;
-		} else if (hasfin ||
-			   tp->defer_tcp_accept.listen_sk->sk_state != TCP_LISTEN) {
-			tcp_reset(sk);
-			return -1;
-		}
-	}
-	return 0;
-}
-
 static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -4944,8 +4901,6 @@ step5:
 
 	tcp_data_snd_check(sk);
 	tcp_ack_snd_check(sk);
-
-	tcp_defer_accept_check(sk);
 	return 0;
 
 csum_error:
--
cgit v1.2.3
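For readers unfamiliar with the ABBA pattern mentioned in the revert message,
here is a minimal pthreads sketch of the two opposite lock orderings. All
names are invented; the two paths run sequentially here so the program
terminates, but with two threads interleaving them they can deadlock.

#include <pthread.h>

static pthread_mutex_t listener_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t child_lock = PTHREAD_MUTEX_INITIALIZER;

/* Normal ordering: parent listening socket, then child socket. */
static void accept_path(void)
{
	pthread_mutex_lock(&listener_lock);
	pthread_mutex_lock(&child_lock);
	/* ... move the request onto the accept queue ... */
	pthread_mutex_unlock(&child_lock);
	pthread_mutex_unlock(&listener_lock);
}

/* tcp_defer_accept_check() ran with only the child locked, yet needed
 * to modify the listener: the reverse order, hence a potential ABBA
 * deadlock against the path above. */
static void defer_check_path(void)
{
	pthread_mutex_lock(&child_lock);
	pthread_mutex_lock(&listener_lock);	/* B then A: the hazard */
	pthread_mutex_unlock(&listener_lock);
	pthread_mutex_unlock(&child_lock);
}

int main(void)
{
	accept_path();
	defer_check_path();
	return 0;
}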