From 7d2b55f80d62b6b4b72d9ef4318a2a49df3e2830 Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Wed, 16 Nov 2011 08:58:01 +0000
Subject: tcp: make is_dupack a parameter to tcp_fastretrans_alert()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Allow callers to decide whether an ACK is a duplicate ACK. This is a
prerequisite to allowing fastretrans_alert to be called from new
contexts, such as the no_queue and old_ack code paths, from which we
have extra info that tells us whether an ACK is a dupack.

Signed-off-by: Neal Cardwell
Acked-by: Ilpo Järvinen
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)
(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 52b5c2d0ecd0..f772aaa4319e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3009,11 +3009,11 @@ static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
  * tcp_xmit_retransmit_queue().
  */
 static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
-                                  int newly_acked_sacked, int flag)
+                                  int newly_acked_sacked, bool is_dupack,
+                                  int flag)
 {
     struct inet_connection_sock *icsk = inet_csk(sk);
     struct tcp_sock *tp = tcp_sk(sk);
-    int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
     int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
                                 (tcp_fackets_out(tp) > tp->reordering));
     int fast_rexmit = 0, mib_idx;
@@ -3681,10 +3681,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
     u32 prior_snd_una = tp->snd_una;
     u32 ack_seq = TCP_SKB_CB(skb)->seq;
     u32 ack = TCP_SKB_CB(skb)->ack_seq;
+    bool is_dupack = false;
     u32 prior_in_flight;
     u32 prior_fackets;
     int prior_packets;
     int prior_sacked = tp->sacked_out;
+    int pkts_acked = 0;
     int newly_acked_sacked = 0;
     int frto_cwnd = 0;

@@ -3757,6 +3759,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
     /* See if we can take anything off of the retransmit queue. */
     flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);

+    pkts_acked = prior_packets - tp->packets_out;
     newly_acked_sacked = (prior_packets - prior_sacked) -
                          (tp->packets_out - tp->sacked_out);
@@ -3771,8 +3774,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
         if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
             tcp_may_raise_cwnd(sk, flag))
             tcp_cong_avoid(sk, ack, prior_in_flight);
-        tcp_fastretrans_alert(sk, prior_packets - tp->packets_out,
-                              newly_acked_sacked, flag);
+        is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
+        tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
+                              is_dupack, flag);
     } else {
         if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
             tcp_cong_avoid(sk, ack, prior_in_flight);
--
cgit v1.2.3
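The dupack test this patch turns into a parameter can be exercised
stand-alone. The sketch below is not kernel code; the flag values mirror
the tcp_input.c definitions of this era (FLAG_NOT_DUP is a composite of
the data, window-update, and acked flags) and should be treated as
assumptions:

#include <stdbool.h>
#include <stdio.h>

#define FLAG_DATA              0x01  /* incoming frame contained data */
#define FLAG_WIN_UPDATE        0x02  /* incoming ACK was a window update */
#define FLAG_DATA_ACKED        0x04  /* this ACK acknowledged new data */
#define FLAG_SYN_ACKED         0x10  /* this ACK acknowledged SYN */
#define FLAG_SND_UNA_ADVANCED  0x400 /* snd_una was changed */

#define FLAG_ACKED     (FLAG_DATA_ACKED | FLAG_SYN_ACKED)
#define FLAG_NOT_DUP   (FLAG_DATA | FLAG_WIN_UPDATE | FLAG_ACKED)

/* RFC 5681-style dupack: advances nothing and carries nothing new. */
static bool ack_is_dupack(int flag)
{
    return !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
}

int main(void)
{
    printf("bare ACK:      %d\n", ack_is_dupack(0));               /* 1 */
    printf("window update: %d\n", ack_is_dupack(FLAG_WIN_UPDATE)); /* 0 */
    printf("acks new data: %d\n", ack_is_dupack(FLAG_SND_UNA_ADVANCED |
                                                FLAG_DATA_ACKED)); /* 0 */
    return 0;
}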
From 5628adf1a0ff39b9e76e1a8e1c94dadabfd46914 Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Wed, 16 Nov 2011 08:58:02 +0000
Subject: tcp: use DSACKs that arrive when packets_out is 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bug: Senders ignored DSACKs after recovery when there were no
outstanding packets (a common scenario for HTTP servers).

The change: when there are no outstanding packets (the "no_queue" goto
label), call tcp_fastretrans_alert() in order to use DSACKs to undo
congestion window reductions.

Other patches in this series will provide other changes that are
necessary to fully fix this problem.

Signed-off-by: Neal Cardwell
Acked-by: Ilpo Järvinen
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 4 ++++
 1 file changed, 4 insertions(+)
(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f772aaa4319e..b49e41864037 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3788,6 +3788,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
     return 1;

 no_queue:
+    /* If data was DSACKed, see if we can undo a cwnd reduction. */
+    if (flag & FLAG_DSACKING_ACK)
+        tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
+                              is_dupack, flag);
     /* If this ack opens up a zero window, clear backoff.  It was
      * being used to time the probes, and is probably far higher than
      * it needs to be for normal retransmission.
--
cgit v1.2.3

From e95ae2f2cf10f7bf27b492aa6188f3cd745de162 Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Wed, 16 Nov 2011 08:58:03 +0000
Subject: tcp: use SACKs and DSACKs that arrive on ACKs below snd_una
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bug: When the ACK field is below snd_una (which can happen when
ACKs are reordered), senders ignored DSACKs (preventing undo) and did
not call tcp_fastretrans_alert, so they did not increment prr_delivered
to reflect newly-SACKed sequence ranges, and did not call
tcp_xmit_retransmit_queue, thus passing up chances to send out more
retransmitted and new packets based on any newly-SACKed packets.

The change: When the ACK field is below snd_una (the "old_ack" goto
label), call tcp_fastretrans_alert to allow undo based on any
newly-arrived DSACKs and try to send out more packets based on
newly-SACKed packets.

Other patches in this series will provide other changes that are
necessary to fully fix this problem.

Signed-off-by: Neal Cardwell
Acked-by: Ilpo Järvinen
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)
(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b49e41864037..751d39060fb8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3805,10 +3805,14 @@ invalid_ack:
     return -1;

 old_ack:
+    /* If data was SACKed, tag it and see if we should send more data.
+     * If data was DSACKed, see if we can undo a cwnd reduction.
+     */
     if (TCP_SKB_CB(skb)->sacked) {
-        tcp_sacktag_write_queue(sk, skb, prior_snd_una);
-        if (icsk->icsk_ca_state == TCP_CA_Open)
-            tcp_try_keep_open(sk);
+        flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+        newly_acked_sacked = tp->sacked_out - prior_sacked;
+        tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
+                              is_dupack, flag);
     }

     SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
--
cgit v1.2.3
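The delivery accounting that feeds tcp_fastretrans_alert() (and, through
it, PRR's prr_delivered) can be pictured with plain integers. A minimal
sketch with invented counter values; the field names follow struct
tcp_sock:

#include <stdio.h>

struct sock_counts {
    int packets_out;    /* packets in flight */
    int sacked_out;     /* packets SACKed */
};

int main(void)
{
    struct sock_counts prior = { .packets_out = 10, .sacked_out = 2 };
    struct sock_counts now   = { .packets_out = 10, .sacked_out = 5 };

    /* Normal path: cumulative ACKs plus new SACKs (both terms). */
    int newly_acked_sacked = (prior.packets_out - prior.sacked_out) -
                             (now.packets_out - now.sacked_out);

    /* old_ack path: snd_una did not move, so only new SACKs count. */
    int old_ack_delivered = now.sacked_out - prior.sacked_out;

    printf("%d %d\n", newly_acked_sacked, old_ack_delivered); /* 3 3 */
    return 0;
}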
From f698204bd0bfdc645642e271da117b56b795aee0 Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Wed, 16 Nov 2011 08:58:04 +0000
Subject: tcp: allow undo from reordered DSACKs

Previously, SACK-enabled connections hung around in TCP_CA_Disorder
state while snd_una==high_seq, just waiting to accumulate DSACKs and
hopefully undo a cwnd reduction. This could and did lead to the
following unfortunate scenario: if some incoming ACKs advance snd_una
beyond high_seq then we were setting undo_marker to 0 and moving to
TCP_CA_Open, so if (due to reordering in the ACK return path) we
shortly thereafter received a DSACK then we were no longer able to
undo the cwnd reduction.

The change: Simplify the congestion avoidance state machine by removing
the behavior where SACK-enabled connections hung around in the
TCP_CA_Disorder state just waiting for DSACKs. Instead, when snd_una
advances to high_seq or beyond we typically move to TCP_CA_Open
immediately and allow an undo in either TCP_CA_Open or TCP_CA_Disorder
if we later receive enough DSACKs.

Other patches in this series will provide other changes that are
necessary to fully fix this problem.

Signed-off-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)
(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 751d39060fb8..a4efdd7cf5a1 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2858,7 +2858,7 @@ static void tcp_try_keep_open(struct sock *sk)
     struct tcp_sock *tp = tcp_sk(sk);
     int state = TCP_CA_Open;

-    if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker)
+    if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
         state = TCP_CA_Disorder;

     if (inet_csk(sk)->icsk_ca_state != state) {
@@ -3066,17 +3066,6 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
         }
         break;

-    case TCP_CA_Disorder:
-        tcp_try_undo_dsack(sk);
-        if (!tp->undo_marker ||
-            /* For SACK case do not Open to allow to undo
-             * catching for all duplicate ACKs. */
-            tcp_is_reno(tp) || tp->snd_una != tp->high_seq) {
-            tp->undo_marker = 0;
-            tcp_set_ca_state(sk, TCP_CA_Open);
-        }
-        break;
-
     case TCP_CA_Recovery:
         if (tcp_is_reno(tp))
             tcp_reset_reno_sack(tp);
@@ -3117,7 +3106,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
             tcp_add_reno_sack(sk);
     }

-    if (icsk->icsk_ca_state == TCP_CA_Disorder)
+    if (icsk->icsk_ca_state <= TCP_CA_Disorder)
         tcp_try_undo_dsack(sk);

     if (!tcp_time_to_recover(sk)) {
--
cgit v1.2.3

From 8cd6d6162d998da579d40a1ee061bf8ce1610ff8 Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Wed, 16 Nov 2011 08:58:05 +0000
Subject: tcp: skip cwnd moderation in TCP_CA_Open in tcp_try_to_open

The problem: Senders were overriding cwnd values picked during an undo
by calling tcp_moderate_cwnd() in tcp_try_to_open().

The fix: Don't moderate cwnd in tcp_try_to_open() if we're in
TCP_CA_Open, since doing so is generally unnecessary and specifically
would override a DSACK-based undo of a cwnd reduction made in fast
recovery.

Signed-off-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a4efdd7cf5a1..78dd38cd5496 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2881,7 +2881,8 @@ static void tcp_try_to_open(struct sock *sk, int flag)

     if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
         tcp_try_keep_open(sk);
-        tcp_moderate_cwnd(tp);
+        if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+            tcp_moderate_cwnd(tp);
     } else {
         tcp_cwnd_down(sk, flag);
     }
--
cgit v1.2.3
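Why moderation in TCP_CA_Open is harmful is easiest to see numerically.
The sketch below assumes tcp_moderate_cwnd() clamps cwnd to roughly
packets-in-flight plus a small burst allowance, which matches its
behavior in this era; the scenario numbers are invented:

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
    return a < b ? a : b;
}

/* Approximation of tcp_moderate_cwnd()'s clamp. */
static unsigned int moderate_cwnd(unsigned int cwnd,
                                  unsigned int packets_in_flight,
                                  unsigned int max_burst)
{
    return min_u(cwnd, packets_in_flight + max_burst);
}

int main(void)
{
    /* Invented scenario: a DSACK-based undo just restored cwnd to 40,
     * but only 3 packets are in flight and max_burst is 3. */
    unsigned int undone_cwnd = 40;

    printf("after moderation: %u\n", moderate_cwnd(undone_cwnd, 3, 3));
    /* prints 6: the undo to 40 would be lost, hence the guard above */
    return 0;
}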
From fdf5af0daf8019cec2396cdef8fb042d80fe71fa Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 2 Dec 2011 23:41:42 +0000
Subject: tcp: drop SYN+FIN messages

Denys Fedoryshchenko reported that SYN+FIN attacks were bringing his
linux machines to their limits.

Don't call conn_request() if the TCP flags include the SYN flag.

Reported-by: Denys Fedoryshchenko
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 2 ++
 1 file changed, 2 insertions(+)
(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 78dd38cd5496..0cbb44076cfa 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5811,6 +5811,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
             goto discard;

         if (th->syn) {
+            if (th->fin)
+                goto discard;
             if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
                 return 1;

--
cgit v1.2.3

From dfd56b8b38fff3586f36232db58e1e9f7885a605 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 10 Dec 2011 09:48:31 +0000
Subject: net: use IS_ENABLED(CONFIG_IPV6)

Instead of testing defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0cbb44076cfa..b9cbc351c511 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2663,7 +2663,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
                tp->snd_ssthresh, tp->prior_ssthresh,
                tp->packets_out);
     }
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
     else if (sk->sk_family == AF_INET6) {
         struct ipv6_pinfo *np = inet6_sk(sk);
         printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
--
cgit v1.2.3
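For readers unfamiliar with IS_ENABLED(): Kconfig defines CONFIG_FOO as
1 for built-in code and CONFIG_FOO_MODULE as 1 for modular builds, and
the macro folds the two defined() tests into one expression. The
stand-alone sketch below reproduces the token-pasting trick along the
lines of include/linux/kconfig.h; treat the exact macro spelling here as
an approximation, not the kernel's verbatim implementation:

#include <stdio.h>

/* If 'val' expands to 1, pasting yields "0," and the second argument
 * of __take_second_arg becomes 1; otherwise the junk token swallows the
 * 1 and the result is 0. (GCC/Clang accept the empty-varargs case.) */
#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val

#define __is_defined(x)              ___is_defined(x)
#define ___is_defined(val)           ____is_defined(__ARG_PLACEHOLDER_##val)
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)

#define IS_ENABLED(option)  (__is_defined(option) || \
                             __is_defined(option##_MODULE))

#define CONFIG_IPV6 1   /* pretend IPv6 is built in */

int main(void)
{
    printf("IPV6=%d FOO=%d\n",
           IS_ENABLED(CONFIG_IPV6), IS_ENABLED(CONFIG_FOO)); /* 1 0 */
    return 0;
}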
From 180d8cd942ce336b2c869d324855c40c5db478ad Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Sun, 11 Dec 2011 21:47:02 +0000
Subject: foundations of per-cgroup memory pressure controlling.

This patch replaces all uses of struct sock fields' memory_pressure,
memory_allocated, sockets_allocated, and sysctl_mem with accessor
macros. Those macros can either receive a socket argument, or a
mem_cgroup argument, depending on the context they live in.

Since we're only doing a macro wrapping here, no performance impact at
all is expected in the case where cgroups are disabled.

Signed-off-by: Glauber Costa
Reviewed-by: Hiroyuki Kamezawa
CC: David S. Miller
CC: Eric W. Biederman
CC: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b9cbc351c511..f131d92d25ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -322,7 +322,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
     /* Check #1 */
     if (tp->rcv_ssthresh < tp->window_clamp &&
         (int)tp->rcv_ssthresh < tcp_space(sk) &&
-        !tcp_memory_pressure) {
+        !sk_under_memory_pressure(sk)) {
         int incr;

         /* Check #2. Increase window, if skb with such overhead
@@ -411,8 +411,8 @@ static void tcp_clamp_window(struct sock *sk)

     if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
         !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
-        !tcp_memory_pressure &&
-        atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+        !sk_under_memory_pressure(sk) &&
+        sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
         sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
                             sysctl_tcp_rmem[2]);
     }
@@ -4866,7 +4866,7 @@ static int tcp_prune_queue(struct sock *sk)

     if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
         tcp_clamp_window(sk);
-    else if (tcp_memory_pressure)
+    else if (sk_under_memory_pressure(sk))
         tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);

     tcp_collapse_ofo_queue(sk);
@@ -4932,11 +4932,11 @@ static int tcp_should_expand_sndbuf(const struct sock *sk)
         return 0;

     /* If we are under global TCP memory pressure, do not expand.  */
-    if (tcp_memory_pressure)
+    if (sk_under_memory_pressure(sk))
         return 0;

     /* If we are under soft global TCP memory pressure, do not expand.  */
-    if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
+    if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
         return 0;

     /* If we filled the congestion window, do not expand.  */
--
cgit v1.2.3

From ab56222a32b9dbaae19c1d37f07b0ac4fc3c27ec Mon Sep 17 00:00:00 2001
From: Vijay Subramanian
Date: Tue, 20 Dec 2011 13:23:24 +0000
Subject: tcp: Replace constants with #define macros

to record the state of SACK/FACK and DSACK for better readability and
maintenance.

Signed-off-by: Vijay Subramanian
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f131d92d25ee..2877c3e09587 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -865,13 +865,13 @@ static void tcp_disable_fack(struct tcp_sock *tp)
     /* RFC3517 uses different metric in lost marker => reset on change */
     if (tcp_is_fack(tp))
         tp->lost_skb_hint = NULL;
-    tp->rx_opt.sack_ok &= ~2;
+    tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
 }

 /* Take a notice that peer is sending D-SACKs */
 static void tcp_dsack_seen(struct tcp_sock *tp)
 {
-    tp->rx_opt.sack_ok |= 4;
+    tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
 }

 /* Initialize metrics on socket. */
@@ -3878,7 +3878,7 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
         case TCPOPT_SACK_PERM:
             if (opsize == TCPOLEN_SACK_PERM && th->syn &&
                 !estab && sysctl_tcp_sack) {
-                opt_rx->sack_ok = 1;
+                opt_rx->sack_ok = TCP_SACK_SEEN;
                 tcp_sack_reset(opt_rx);
             }
             break;
--
cgit v1.2.3
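The named bit values are implied by the constants being replaced above
(sack_ok = 1, &= ~2, |= 4), matching the definitions this patch relies
on in include/net/tcp.h. A stand-alone check:

#include <stdio.h>

#define TCP_SACK_SEEN     (1 << 0)  /* peer is SACK capable */
#define TCP_FACK_ENABLED  (1 << 1)  /* FACK is enabled locally */
#define TCP_DSACK_SEEN    (1 << 2)  /* D-SACK seen on an ACK */

int main(void)
{
    unsigned int sack_ok = 0;

    sack_ok = TCP_SACK_SEEN;        /* was: opt_rx->sack_ok = 1 */
    sack_ok |= TCP_FACK_ENABLED;
    sack_ok &= ~TCP_FACK_ENABLED;   /* was: sack_ok &= ~2 */
    sack_ok |= TCP_DSACK_SEEN;      /* was: sack_ok |= 4 */

    printf("sack_ok = 0x%x\n", sack_ok);  /* 0x5 */
    return 0;
}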
From 974c12360dfe6ab01201fe9e708e7755c413f8b6 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 19 Jan 2012 14:42:21 +0000
Subject: tcp: detect loss above high_seq in recovery

Correctly implement a loss detection heuristic: new sequences (above
high_seq) sent during the fast recovery are deemed lost when higher
sequences are SACKed.

Current code does not catch these losses, because tcp_mark_head_lost()
does not check packets beyond high_seq. The fix is straightforward:
check packets up to the highest SACKed packet. In addition, all the
FLAG_DATA_LOST logic is ineffective and redundant and can be removed.

Update the loss heuristic comments. The algorithm above is documented
as heuristic B, but it is redundant too because heuristic A already
covers B.

Note that this change only marks some forward-retransmitted packets
LOST. It does NOT forbid TCP performing further CWR on new losses. A
potential follow-up patch under preparation is to perform another CWR
on "new" losses such as:
1) a sequence above high_seq is lost (by resetting high_seq to snd_nxt)
2) a retransmission is lost.

Signed-off-by: Yuchung Cheng
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 41 +++++++++++++++--------------------------
 1 file changed, 15 insertions(+), 26 deletions(-)
(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2877c3e09587..976034f82320 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -105,7 +105,6 @@ int sysctl_tcp_abc __read_mostly;
 #define FLAG_SYN_ACKED      0x10 /* This ACK acknowledged SYN. */
 #define FLAG_DATA_SACKED    0x20 /* New SACK. */
 #define FLAG_ECE            0x40 /* ECE in this ACK */
-#define FLAG_DATA_LOST      0x80 /* SACK detected data lossage. */
 #define FLAG_SLOWPATH       0x100 /* Do not skip RFC checks for window update.*/
 #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */
 #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
@@ -1040,13 +1039,11 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
  * These 6 states form finite state machine, controlled by the following events:
  * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
  * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
- * 3. Loss detection event of one of three flavors:
+ * 3. Loss detection event of two flavors:
  *    A. Scoreboard estimator decided the packet is lost.
  *       A'. Reno "three dupacks" marks head of queue lost.
- *       A''. Its FACK modfication, head until snd.fack is lost.
- *    B. SACK arrives sacking data transmitted after never retransmitted
- *       hole was sent out.
- *    C. SACK arrives sacking SND.NXT at the moment, when the
+ *       A''. Its FACK modification, head until snd.fack is lost.
+ *    B. SACK arrives sacking SND.NXT at the moment, when the
  *       segment was retransmitted.
  * 4. D-SACK added new rule: D-SACK changes any tag to S.
@@ -1153,7 +1150,7 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
 }

 /* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
- * Event "C". Later note: FACK people cheated me again 8), we have to account
+ * Event "B". Later note: FACK people cheated me again 8), we have to account
  * for reordering! Ugly, but should help.
  *
  * Search retransmitted skbs from write_queue that were sent when snd_nxt was
@@ -1844,10 +1841,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
         if (found_dup_sack && ((i + 1) == first_sack_index))
             next_dup = &sp[i + 1];

-        /* Event "B" in the comment above. */
-        if (after(end_seq, tp->high_seq))
-            state.flag |= FLAG_DATA_LOST;
-
         /* Skip too early cached blocks */
         while (tcp_sack_cache_ok(tp, cache) &&
                !before(start_seq, cache->end_seq))
@@ -2515,8 +2508,11 @@ static void tcp_timeout_skbs(struct sock *sk)
     tcp_verify_left_out(tp);
 }

-/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
- * is against sacked "cnt", otherwise it's against facked "cnt"
+/* Detect loss in event "A" above by marking head of queue up as lost.
+ * For FACK or non-SACK(Reno) senders, the first "packets" number of segments
+ * are considered lost. For RFC3517 SACK, a segment is considered lost if it
+ * has at least tp->reordering SACKed segments above it; "packets" refers to
+ * the maximum SACKed segments to pass before reaching this limit.
  */
 static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 {
@@ -2525,6 +2521,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
     int cnt, oldcnt;
     int err;
     unsigned int mss;
+    /* Use SACK to deduce losses of new sequences sent during recovery */
+    const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;

     WARN_ON(packets > tp->packets_out);
     if (tp->lost_skb_hint) {
@@ -2546,7 +2544,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
             tp->lost_skb_hint = skb;
             tp->lost_cnt_hint = cnt;

-        if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
+        if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
             break;

         oldcnt = cnt;
@@ -3033,19 +3031,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
     if (tcp_check_sack_reneging(sk, flag))
         return;

-    /* C. Process data loss notification, provided it is valid. */
-    if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) &&
-        before(tp->snd_una, tp->high_seq) &&
-        icsk->icsk_ca_state != TCP_CA_Open &&
-        tp->fackets_out > tp->reordering) {
-        tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
-        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
-    }
-
-    /* D. Check consistency of the current state. */
+    /* C. Check consistency of the current state. */
     tcp_verify_left_out(tp);

-    /* E. Check state exit conditions. State can be terminated
+    /* D. Check state exit conditions. State can be terminated
      * when high_seq is ACKed. */
     if (icsk->icsk_ca_state == TCP_CA_Open) {
         WARN_ON(tp->retrans_out != 0);
@@ -3077,7 +3066,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
         }
     }

-    /* F. Process state. */
+    /* E. Process state. */
     switch (icsk->icsk_ca_state) {
     case TCP_CA_Recovery:
         if (!(flag & FLAG_SND_UNA_ADVANCED)) {
--
cgit v1.2.3
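A toy rendering of the bound change (not the kernel's exact walk;
sequence numbers are invented, and the loss rule is the patch's own
wording that a segment is lost once at least tp->reordering SACKed
segments sit above it):

#include <stdbool.h>
#include <stdio.h>

struct seg { unsigned int end_seq; bool sacked; };

/* Walk from the highest segment down, counting SACKed segments above
 * each hole; the scan ignores everything beyond loss_high. */
static int count_lost(const struct seg *q, int n, unsigned int loss_high,
                      int reordering)
{
    int sacked_above = 0, lost = 0;

    for (int i = n - 1; i >= 0; i--) {
        if (q[i].end_seq > loss_high)
            continue;
        if (q[i].sacked)
            sacked_above++;
        else if (sacked_above >= reordering)
            lost++;
    }
    return lost;
}

int main(void)
{
    struct seg q[] = {
        { 1000, false }, { 2000, true }, { 3000, true }, { 4000, true },
        /* sent during recovery (above high_seq): */
        { 5000, false }, { 6000, true }, { 7000, true }, { 8000, true },
    };
    unsigned int high_seq = 4000, snd_nxt = 8000;

    printf("old bound (high_seq): %d lost\n", count_lost(q, 8, high_seq, 3));
    printf("new bound (snd_nxt):  %d lost\n", count_lost(q, 8, snd_nxt, 3));
    /* prints 1 then 2: only the new bound catches the hole at 5000 */
    return 0;
}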
From cc9a672ee522d4805495b98680f4a3db5d0a0af9 Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Sun, 12 Feb 2012 18:37:09 +0000
Subject: tcp: allow tcp_sacktag_one() to tag ranges not aligned with skbs

This commit allows callers of tcp_sacktag_one() to pass in sequence
ranges that do not align with skb boundaries, as tcp_shifted_skb()
needs to do in an upcoming fix in this patch series.

In fact, now tcp_sacktag_one() does not need to depend on an input skb
at all, which makes its semantics and dependencies more clear.

Signed-off-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)
(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 976034f82320..4e8a81fda65c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1307,25 +1307,26 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
     return in_sack;
 }

-static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk,
-                          struct tcp_sacktag_state *state,
+/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
+static u8 tcp_sacktag_one(struct sock *sk,
+                          struct tcp_sacktag_state *state, u8 sacked,
+                          u32 start_seq, u32 end_seq,
                           int dup_sack, int pcount)
 {
     struct tcp_sock *tp = tcp_sk(sk);
-    u8 sacked = TCP_SKB_CB(skb)->sacked;
     int fack_count = state->fack_count;

     /* Account D-SACK for retransmitted packet. */
     if (dup_sack && (sacked & TCPCB_RETRANS)) {
         if (tp->undo_marker && tp->undo_retrans &&
-            after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
+            after(end_seq, tp->undo_marker))
             tp->undo_retrans--;
         if (sacked & TCPCB_SACKED_ACKED)
             state->reord = min(fack_count, state->reord);
     }

     /* Nothing to do; acked frame is about to be dropped (was ACKed). */
-    if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+    if (!after(end_seq, tp->snd_una))
         return sacked;

     if (!(sacked & TCPCB_SACKED_ACKED)) {
@@ -1344,13 +1345,13 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk,
             /* New sack for not retransmitted frame,
              * which was in hole. It is reordering.
              */
-            if (before(TCP_SKB_CB(skb)->seq,
+            if (before(start_seq,
                        tcp_highest_sack_seq(tp)))
                 state->reord = min(fack_count,
                                    state->reord);

             /* SACK enhanced F-RTO (RFC4138; Appendix B) */
-            if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark))
+            if (!after(end_seq, tp->frto_highmark))
                 state->flag |= FLAG_ONLY_ORIG_SACKED;
         }
@@ -1368,8 +1369,7 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk,

     /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
     if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
-        before(TCP_SKB_CB(skb)->seq,
-               TCP_SKB_CB(tp->lost_skb_hint)->seq))
+        before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
         tp->lost_cnt_hint += pcount;

     if (fack_count > tp->fackets_out)
@@ -1425,7 +1425,11 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
     }

     /* We discard results */
-    tcp_sacktag_one(skb, sk, state, dup_sack, pcount);
+    tcp_sacktag_one(sk, state,
+                    TCP_SKB_CB(skb)->sacked,
+                    TCP_SKB_CB(skb)->seq,
+                    TCP_SKB_CB(skb)->end_seq,
+                    dup_sack, pcount);

     /* Difference in this won't matter, both ACKed by the same cumul. ACK */
     TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
@@ -1664,10 +1668,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
             break;

         if (in_sack) {
-            TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk,
-                                                      state,
-                                                      dup_sack,
-                                                      tcp_skb_pcount(skb));
+            TCP_SKB_CB(skb)->sacked =
+                tcp_sacktag_one(sk,
+                                state,
+                                TCP_SKB_CB(skb)->sacked,
+                                TCP_SKB_CB(skb)->seq,
+                                TCP_SKB_CB(skb)->end_seq,
+                                dup_sack,
+                                tcp_skb_pcount(skb));

             if (!before(TCP_SKB_CB(skb)->seq,
                         tcp_highest_sack_seq(tp)))
--
cgit v1.2.3
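The shape of the refactor, reduced to invented types: tagging is now
driven by a sequence range rather than by an skb, so a caller can tag
just the bytes that actually moved:

#include <stdio.h>

struct toy_skb { unsigned int seq, end_seq; };

/* Before: the tagger could only consider a whole skb. */
static void tag_skb(const struct toy_skb *skb)
{
    printf("tagged [%u, %u)\n", skb->seq, skb->end_seq);
}

/* After: any range, aligned with skb boundaries or not. */
static void tag_range(unsigned int start_seq, unsigned int end_seq)
{
    printf("tagged [%u, %u)\n", start_seq, end_seq);
}

int main(void)
{
    struct toy_skb skb = { .seq = 1000, .end_seq = 3000 };
    unsigned int shifted = 500;  /* bytes merged into the previous skb */

    tag_skb(&skb);                          /* all 2000 bytes, or nothing */
    tag_range(skb.seq, skb.seq + shifted);  /* exactly the new 500 bytes */
    return 0;
}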
From daef52bab1fd26e24e8e9578f8fb33ba1d0cb412 Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Sun, 12 Feb 2012 18:37:10 +0000
Subject: tcp: fix range tcp_shifted_skb() passes to tcp_sacktag_one()

Fix the newly-SACKed range to be the range of newly-shifted bytes.

Previously (since 832d11c5cd076abc0aa1eaf7be96c81d1a59ce41)
tcp_shifted_skb() incorrectly called tcp_sacktag_one() with the start
and end sequence numbers of the skb it passes in set to the range just
beyond the range that is newly-SACKed.

This commit also removes a special-case adjustment to lost_cnt_hint in
tcp_shifted_skb(), since the pre-existing adjustment of lost_cnt_hint
in tcp_sacktag_one() now properly handles this, now that the correct
start sequence number is passed in.

Signed-off-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)
(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4e8a81fda65c..8116d06e042c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1388,6 +1388,9 @@ static u8 tcp_sacktag_one(struct sock *sk,
     return sacked;
 }

+/* Shift newly-SACKed bytes from this skb to the immediately previous
+ * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
+ */
 static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
                            struct tcp_sacktag_state *state,
                            unsigned int pcount, int shifted, int mss,
@@ -1395,12 +1398,11 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 {
     struct tcp_sock *tp = tcp_sk(sk);
     struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
+    u32 start_seq = TCP_SKB_CB(skb)->seq;   /* start of newly-SACKed */
+    u32 end_seq = start_seq + shifted;      /* end of newly-SACKed */

     BUG_ON(!pcount);

-    if (skb == tp->lost_skb_hint)
-        tp->lost_cnt_hint += pcount;
-
     TCP_SKB_CB(prev)->end_seq += shifted;
     TCP_SKB_CB(skb)->seq += shifted;
@@ -1424,12 +1426,11 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
         skb_shinfo(skb)->gso_type = 0;
     }

-    /* We discard results */
-    tcp_sacktag_one(sk, state,
-                    TCP_SKB_CB(skb)->sacked,
-                    TCP_SKB_CB(skb)->seq,
-                    TCP_SKB_CB(skb)->end_seq,
-                    dup_sack, pcount);
+    /* Adjust counters and hints for the newly sacked sequence range but
+     * discard the return value since prev is already marked.
+     */
+    tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
+                    start_seq, end_seq, dup_sack, pcount);

     /* Difference in this won't matter, both ACKed by the same cumul. ACK */
     TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
--
cgit v1.2.3
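The off-by-a-range bug is easy to see with numbers. tcp_shifted_skb()
advances skb->seq by 'shifted' before tagging, so tagging from the
updated skb fields covered the bytes beyond the newly-SACKed range; the
values below are invented:

#include <stdio.h>

int main(void)
{
    unsigned int seq = 1000, end_seq = 3000, shifted = 500;

    /* Correct newly-SACKed range, captured before the shift: */
    unsigned int start_ok = seq, end_ok = seq + shifted;

    /* What the old code tagged (skb fields after the shift): */
    seq += shifted;
    unsigned int start_bug = seq, end_bug = end_seq;

    printf("newly SACKed: [%u,%u)  old code tagged: [%u,%u)\n",
           start_ok, end_ok, start_bug, end_bug);
    /* [1000,1500) vs [1500,3000): disjoint ranges */
    return 0;
}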
From 0af2a0d0576205dda778d25c6c344fc6508fc81d Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Mon, 13 Feb 2012 20:22:08 +0000
Subject: tcp: fix tcp_shifted_skb() adjustment of lost_cnt_hint for FACK

This commit ensures that lost_cnt_hint is correctly updated in
tcp_shifted_skb() for FACK TCP senders. The lost_cnt_hint adjustment in
tcp_sacktag_one() only applies to non-FACK senders, so FACK senders
need their own adjustment.

This applies the spirit of 1e5289e121372a3494402b1b131b41bfe1cf9b7f,
except that, with the sequence range passed into tcp_sacktag_one() now
correct, we need only a special-case adjustment for FACK.

Signed-off-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 4 ++++
 1 file changed, 4 insertions(+)
(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8116d06e042c..53c8ce4046b2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1403,6 +1403,10 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,

     BUG_ON(!pcount);

+    /* Adjust hint for FACK. Non-FACK is handled in tcp_sacktag_one(). */
+    if (tcp_is_fack(tp) && (skb == tp->lost_skb_hint))
+        tp->lost_cnt_hint += pcount;
+
     TCP_SKB_CB(prev)->end_seq += shifted;
     TCP_SKB_CB(skb)->seq += shifted;
--
cgit v1.2.3
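After this series the two hint-adjustment paths split cleanly by SACK
flavor. A toy model (invented struct; the predicate mirrors
tcp_is_fack()'s test of the FACK bit in sack_ok):

#include <stdbool.h>
#include <stdio.h>

#define TCP_FACK_ENABLED (1 << 1)

struct toy_tp { unsigned int sack_ok; int lost_cnt_hint; };

static bool toy_is_fack(const struct toy_tp *tp)
{
    return tp->sack_ok & TCP_FACK_ENABLED;
}

/* Non-FACK: tcp_sacktag_one() bumps the hint when the newly-SACKed
 * range starts below the hint skb. FACK: tcp_shifted_skb() bumps it
 * when the shifted skb is the hint itself. */
static void adjust_hint(struct toy_tp *tp, bool skb_is_hint,
                        bool range_below_hint, int pcount)
{
    if (toy_is_fack(tp)) {
        if (skb_is_hint)
            tp->lost_cnt_hint += pcount;
    } else {
        if (range_below_hint)
            tp->lost_cnt_hint += pcount;
    }
}

int main(void)
{
    struct toy_tp tp = { .sack_ok = TCP_FACK_ENABLED, .lost_cnt_hint = 0 };

    adjust_hint(&tp, true, false, 2);
    printf("lost_cnt_hint = %d\n", tp.lost_cnt_hint);  /* 2 */
    return 0;
}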