From 58d607d3e52f2b15902f58a1161da9fb3b0f6d47 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 15 Sep 2015 15:24:20 -0700
Subject: tcp: provide skb->hash to synack packets

In commit b73c3d0e4f0e ("net: Save TX flow hash in sock and set in skbuf
on xmit"), Tom provided an l4 hash to most outgoing TCP packets.

We'd like to provide one as well for SYNACK packets, so that all packets
of a given flow share the same txhash, to later enable the bonding driver
to also use skb->hash to perform slave selection.

Note that a SYNACK retransmit shuffles the tx hash, as Tom did in commit
265f94ff54d62 ("net: Recompute sk_txhash on negative routing advice") for
established sockets.

This has the nice effect of making TCP flows resilient to some kinds of
black holes, even at the connection establishment phase.

Signed-off-by: Eric Dumazet
Cc: Tom Herbert
Cc: Mahesh Bandewar
Acked-by: Tom Herbert
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f9a8a12b62ee..d0ad3554c333 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2987,6 +2987,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	rcu_read_lock();
 	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
 #endif
+	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
 	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
 					     foc) + sizeof(*th);

@@ -3505,6 +3506,7 @@ int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
 	struct flowi fl;
 	int res;

+	tcp_rsk(req)->txhash = net_tx_rndhash();
 	res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
 	if (!res) {
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
--
cgit v1.2.3
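The effect of the two added lines is easier to see outside the kernel. The
sketch below is an illustration only (plain user-space C, not kernel code):
a per-flow random hash keeps every packet of the flow on one bonding slave,
and re-randomizing that hash on a SYNACK retransmit gives the flow a chance
to land on a different slave/path if the first one is black-holed. The
modulo slave selection and the helper names are assumptions made for the
example, not the bonding driver's actual algorithm.

    /* Illustration only: toy model of per-flow tx hash and slave selection. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>
    #include <time.h>

    static uint32_t demo_rndhash(void)          /* stand-in for net_tx_rndhash() */
    {
        uint32_t h = (uint32_t)rand();
        return h ? h : 1;                       /* avoid a hash of 0 */
    }

    static unsigned int pick_slave(uint32_t txhash, unsigned int nr_slaves)
    {
        return txhash % nr_slaves;              /* toy l4-hash slave selection */
    }

    int main(void)
    {
        unsigned int nr_slaves = 4;
        uint32_t txhash;

        srand((unsigned int)time(NULL));

        txhash = demo_rndhash();                /* chosen when the SYNACK is built */
        printf("initial txhash %#x -> slave %u\n",
               (unsigned int)txhash, pick_slave(txhash, nr_slaves));

        /* A SYNACK retransmit re-randomizes the hash, so the flow has a good
         * chance of moving to another slave if the first path is black-holed.
         */
        txhash = demo_rndhash();
        printf("after rtx txhash %#x -> slave %u\n",
               (unsigned int)txhash, pick_slave(txhash, nr_slaves));
        return 0;
    }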
From f9b9958229638245b5709f27c76c199a465f1496 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Fri, 18 Sep 2015 11:40:33 -0700
Subject: tcp: send loss probe after 1s if no RTT available

This patch makes TLP use a 1 sec timer by default when the RTT is not
available due to SYN/ACK retransmission or SYN cookies.

Prior to this change, the lack of an RTT prevented TLP, so the first data
packets sent could only be recovered by fast recovery or RTO. If fast
recovery fails to trigger, the RTO is 3 seconds when the SYN/ACK is
retransmitted. With this patch we can trigger fast recovery in 1 sec
instead.

Note that we need to check Fast Open more carefully: a Fast Open
connection could be (accepted and then) closed before it receives the
final ACK of the 3WHS, so the state is FIN_WAIT_1. Without the new check,
TLP would retransmit the FIN instead of the SYN/ACK.

Signed-off-by: Yuchung Cheng
Signed-off-by: Nandita Dukkipati
Signed-off-by: Neal Cardwell
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d0ad3554c333..4cd0b50d4e46 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2165,7 +2165,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	/* Don't do any loss probe on a Fast Open connection before 3WHS
 	 * finishes.
 	 */
-	if (sk->sk_state == TCP_SYN_RECV)
+	if (tp->fastopen_rsk)
 		return false;

 	/* TLP is only scheduled when next timer event is RTO. */
@@ -2175,7 +2175,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	/* Schedule a loss probe in 2*RTT for SACK capable connections
 	 * in Open state, that are either limited by cwnd or application.
 	 */
-	if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||
+	if (sysctl_tcp_early_retrans < 3 || !tp->packets_out ||
 	    !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
 		return false;

@@ -2184,9 +2184,10 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 		return false;

 	/* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
-	 * for delayed ack when there's one outstanding packet.
+	 * for delayed ack when there's one outstanding packet. If no RTT
+	 * sample is available then probe after TCP_TIMEOUT_INIT.
 	 */
-	timeout = rtt << 1;
+	timeout = rtt << 1 ? : TCP_TIMEOUT_INIT;
 	if (tp->packets_out == 1)
 		timeout = max_t(u32, timeout,
 				(rtt + (rtt >> 1) + TCP_DELACK_MAX));
--
cgit v1.2.3
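To make the timeout arithmetic concrete, here is a user-space restatement of
the probe-timeout computation after this patch. It is an illustration only:
the constants are stand-ins (the kernel works in jiffies, takes rtt from
tp->srtt_us, and applies the other checks in tcp_schedule_loss_probe() that
are omitted here). It also spells out the GNU "a ? : b" shorthand used in
the patch, which means "a if a is non-zero, else b".

    /* Illustration only: probe timeout per the patched logic. */
    #include <stdio.h>
    #include <stdint.h>

    #define HZ                1000u
    #define TCP_TIMEOUT_INIT  (1u * HZ)     /* 1 sec, the RFC 6298 initial RTO */
    #define TCP_DELACK_MAX    (HZ / 5u)     /* 200 ms */

    static uint32_t max_u32(uint32_t a, uint32_t b) { return a > b ? a : b; }

    /* rtt == 0 means "no RTT sample yet" (SYN/ACK retransmitted, or syncookies). */
    static uint32_t tlp_timeout(uint32_t rtt, uint32_t packets_out)
    {
        /* "timeout = rtt << 1 ? : TCP_TIMEOUT_INIT;" written out in full:
         * use 2*RTT, or fall back to 1 sec when no RTT is available.
         */
        uint32_t timeout = (rtt << 1) ? (rtt << 1) : TCP_TIMEOUT_INIT;

        /* With a single outstanding packet, allow for a delayed ACK:
         * at least 1.5*rtt + TCP_DELACK_MAX.
         */
        if (packets_out == 1)
            timeout = max_u32(timeout, rtt + (rtt >> 1) + TCP_DELACK_MAX);
        return timeout;
    }

    int main(void)
    {
        printf("rtt=50ms, 1 pkt  -> %u ms\n", tlp_timeout(50, 1));  /* 275 ms  */
        printf("rtt=50ms, 3 pkts -> %u ms\n", tlp_timeout(50, 3));  /* 100 ms  */
        printf("no RTT,   1 pkt  -> %u ms\n", tlp_timeout(0, 1));   /* 1000 ms */
        return 0;
    }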
From 37bfbdda0b036a3720924e04c0171d9038159c2c Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 25 Sep 2015 07:39:17 -0700
Subject: tcp: remove tcp_synack_options() socket argument

We do not use the socket in this function.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4cd0b50d4e46..87392cb51b11 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -612,12 +612,11 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 }

 /* Set up TCP options for SYN-ACKs. */
-static unsigned int tcp_synack_options(struct sock *sk,
-				       struct request_sock *req,
-				       unsigned int mss, struct sk_buff *skb,
-				       struct tcp_out_options *opts,
-				       const struct tcp_md5sig_key *md5,
-				       struct tcp_fastopen_cookie *foc)
+static unsigned int tcp_synack_options(struct request_sock *req,
+				       unsigned int mss, struct sk_buff *skb,
+				       struct tcp_out_options *opts,
+				       const struct tcp_md5sig_key *md5,
+				       struct tcp_fastopen_cookie *foc)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -2989,8 +2988,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
 #endif
 	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
-	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
-					     foc) + sizeof(*th);
+	tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) +
+			  sizeof(*th);

 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
--
cgit v1.2.3

From 6ac705b1805863b1899e85f641bb265f9e6e9d99 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 25 Sep 2015 07:39:18 -0700
Subject: tcp: remove tcp_ecn_make_synack() socket argument

SYNACK packets might be sent without holding the socket lock.

For DCTCP/ECN's sake, we should call INET_ECN_xmit() while the socket lock
is owned, and only when we init/change congestion control.

This also fixes a bug when the congestion module is changed from dctcp to
another one on a listener: we now clear the ECN bits properly.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 87392cb51b11..ba6194152d39 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -357,14 +357,10 @@ static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
 }

 static void
-tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
-		    struct sock *sk)
+tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
 {
-	if (inet_rsk(req)->ecn_ok) {
+	if (inet_rsk(req)->ecn_ok)
 		th->ece = 1;
-		if (tcp_ca_needs_ecn(sk))
-			INET_ECN_xmit(sk);
-	}
 }

 /* Set up ECN state for a packet on a ESTABLISHED socket that is about to
@@ -2998,7 +2994,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	memset(th, 0, sizeof(struct tcphdr));
 	th->syn = 1;
 	th->ack = 1;
-	tcp_ecn_make_synack(req, th, sk);
+	tcp_ecn_make_synack(req, th);
 	th->source = htons(ireq->ir_num);
 	th->dest = ireq->ir_rmt_port;
 	/* Setting of flags are superfluous here for callers (and ECE is
--
cgit v1.2.3

From 5d062de7f8ea1ca7c635957ff1144fba815ba34c Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 25 Sep 2015 07:39:19 -0700
Subject: tcp: constify tcp_make_synack() socket argument

The listener socket is not locked when tcp_make_synack() is called, so we
had better make sure no field is written.

There is one exception: since SYNACK packets are attached to the listener
at this moment (or to the SYN_RECV child in case of Fast Open),
sock_wmalloc() needs to update sk->sk_wmem_alloc, but this is done using
atomic operations, so this is safe.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ba6194152d39..9eb67a8933f1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2944,20 +2944,25 @@ int tcp_send_synack(struct sock *sk)
  * Allocate one skb and build a SYNACK packet.
  * @dst is consumed : Caller should not use it again.
  */
-struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
+struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 				struct request_sock *req,
 				struct tcp_fastopen_cookie *foc)
 {
-	struct tcp_out_options opts;
 	struct inet_request_sock *ireq = inet_rsk(req);
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcphdr *th;
-	struct sk_buff *skb;
+	const struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_md5sig_key *md5 = NULL;
+	struct tcp_out_options opts;
+	struct sk_buff *skb;
 	int tcp_header_size;
+	struct tcphdr *th;
+	u16 user_mss;
 	int mss;

-	skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
+	/* sk is a const pointer, because we want to express multiple cpus
+	 * might call us concurrently.
+	 * sock_wmalloc() will change sk->sk_wmem_alloc in an atomic way.
+	 */
+	skb = sock_wmalloc((struct sock *)sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
 	if (unlikely(!skb)) {
 		dst_release(dst);
 		return NULL;
@@ -2968,8 +2973,9 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	skb_dst_set(skb, dst);

 	mss = dst_metric_advmss(dst);
-	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
-		mss = tp->rx_opt.user_mss;
+	user_mss = READ_ONCE(tp->rx_opt.user_mss);
+	if (user_mss && user_mss < mss)
+		mss = user_mss;

 	memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
@@ -3009,7 +3015,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,

 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(min(req->rcv_wnd, 65535U));
-	tcp_options_write((__be32 *)(th + 1), tp, &opts);
+	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
 	th->doff = (tcp_header_size >> 2);
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
--
cgit v1.2.3
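The user_mss change above is the classic pattern of reading a racy field
once into a local variable and then using only that snapshot. A simplified
user-space analogue follows, as an illustration only: the macros here are
toy stand-ins for the kernel's READ_ONCE()/WRITE_ONCE(), and the threading
setup is invented for the example. The point is that testing the field and
then re-reading it could mix two different values (e.g. pass the non-zero
test and then clamp to 0), while a single snapshot cannot.

    /* Illustration only: read a concurrently written field exactly once. */
    #include <pthread.h>
    #include <stdio.h>
    #include <stdint.h>

    #define READ_ONCE_U16(x)      (*(volatile uint16_t *)&(x))
    #define WRITE_ONCE_U16(x, v)  (*(volatile uint16_t *)&(x) = (v))

    static uint16_t shared_user_mss;    /* changed by another thread, e.g. a setsockopt() path */

    static void *writer(void *arg)
    {
        (void)arg;
        for (int i = 0; i < 1000000; i++)
            WRITE_ONCE_U16(shared_user_mss, (i & 1) ? 1400 : 0);
        return NULL;
    }

    static int clamp_mss(int dst_advmss)
    {
        uint16_t user_mss = READ_ONCE_U16(shared_user_mss);  /* one snapshot */

        if (user_mss && user_mss < dst_advmss)
            return user_mss;
        return dst_advmss;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, writer, NULL);
        for (int i = 0; i < 1000000; i++) {
            if (clamp_mss(1460) == 0) {     /* cannot happen with the snapshot pattern */
                printf("bogus mss\n");
                return 1;
            }
        }
        pthread_join(t, NULL);
        printf("mss stayed sane\n");
        return 0;
    }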
From ea3bea3a1d38aab1542176b2ff11a99ce3db9656 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 25 Sep 2015 07:39:23 -0700
Subject: tcp/dccp: constify rtx_synack() and friends

This is done to make sure we do not change the listener socket while
sending SYNACK packets without holding the socket lock.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9eb67a8933f1..53ce6cf55598 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3502,7 +3502,7 @@ void tcp_send_probe0(struct sock *sk)
 			  TCP_RTO_MAX);
 }

-int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
+int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
 {
 	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
 	struct flowi fl;
 	int res;
--
cgit v1.2.3

From d2e1339f40db753286ca0a92c92a847e08c5d2de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bendik=20R=C3=B8nning=20Opstad?=
Date: Wed, 23 Sep 2015 18:49:53 +0200
Subject: tcp: Fix CWV being too strict on thin streams
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Application limited streams such as thin streams, that transmit small
amounts of payload in relatively few packets per RTT, can be prevented
from growing the CWND when in congestion avoidance. This leads to
increased sojourn times for data segments in streams that often transmit
time-dependent data.

Currently, a connection is considered CWND limited only after having
successfully transmitted at least one packet with new data, while at the
same time failing to transmit some unsent data from the output queue
because the CWND is full. Applications that produce small amounts of data
may be left in a state where the connection is never considered CWND
limited, because all unsent data is successfully transmitted each time an
incoming ACK opens up for more data to be transmitted in the send window.

Fix by always testing whether the CWND is fully used after successful
packet transmissions, such that a connection is considered CWND limited
whenever the CWND has been filled. This is the correct behavior as
specified in RFC2861 (section 3.1).

Cc: Andreas Petlund
Cc: Carsten Griwodz
Cc: Jonas Markussen
Cc: Kenneth Klette Jonassen
Cc: Mads Johannessen
Signed-off-by: Bendik Rønning Opstad
Acked-by: Eric Dumazet
Tested-by: Eric Dumazet
Acked-by: Neal Cardwell
Tested-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9e53dd9bfcad..09bb082ca1a7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1822,7 +1822,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,

 	/* Ok, it looks like it is advisable to defer. */

-	if (cong_win < send_win && cong_win < skb->len)
+	if (cong_win < send_win && cong_win <= skb->len)
 		*is_cwnd_limited = true;

 	return true;
@@ -2055,7 +2055,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,

 		cwnd_quota = tcp_cwnd_test(tp, skb);
 		if (!cwnd_quota) {
-			is_cwnd_limited = true;
 			if (push_one == 2)
 				/* Force out a loss probe pkt. */
 				cwnd_quota = 1;
@@ -2137,6 +2136,7 @@ repair:
 		/* Send one loss probe per tail loss episode. */
 		if (push_one != 2)
 			tcp_schedule_loss_probe(sk);
+		is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
 		tcp_cwnd_validate(sk, is_cwnd_limited);
 		return false;
 	}
--
cgit v1.2.3
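The behavioural difference is easiest to see with concrete numbers. Below
is a user-space sketch of the old and new decisions, as an illustration
only; the struct, field, and function names are invented stand-ins for the
tcp_write_xmit()/tcp_cwnd_validate() logic. A thin stream that sends
everything it has and exactly fills the cwnd was not considered cwnd
limited under the old rule, so congestion-window validation could keep the
cwnd from growing; under the new rule it is considered cwnd limited.

    /* Illustration only: old vs new "is this connection cwnd limited?" test
     * for one transmission round. Not kernel code.
     */
    #include <stdbool.h>
    #include <stdio.h>

    struct round {
        unsigned int snd_cwnd;           /* congestion window, in packets */
        unsigned int packets_in_flight;  /* after this round's transmissions */
        bool hit_cwnd_with_data_left;    /* old trigger: cwnd exhausted while unsent data queued */
    };

    static bool old_rule(const struct round *r)
    {
        return r->hit_cwnd_with_data_left;
    }

    static bool new_rule(const struct round *r)
    {
        /* Patch: also count the case where the cwnd ended up fully used. */
        return r->hit_cwnd_with_data_left ||
               r->packets_in_flight >= r->snd_cwnd;
    }

    int main(void)
    {
        /* Thin stream: 10 small packets available, cwnd of 10, everything sent. */
        struct round thin = { .snd_cwnd = 10, .packets_in_flight = 10,
                              .hit_cwnd_with_data_left = false };

        printf("old rule: cwnd limited = %d\n", old_rule(&thin));  /* 0: cwnd may stay frozen */
        printf("new rule: cwnd limited = %d\n", new_rule(&thin));  /* 1: cwnd may grow */
        return 0;
    }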
From ca6fb06518836ef9b65dc0aac02ff97704d52a05 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 2 Oct 2015 11:43:35 -0700
Subject: tcp: attach SYNACK messages to request sockets instead of listener

If a listen backlog is very big (to avoid syncookies), then the listener
sk->sk_wmem_alloc is the main source of false sharing, as we need to touch
it twice per SYNACK re-transmit and TX completion.

(One SYN packet takes the listener lock once, but up to 6 SYNACKs are
generated.)

By attaching the skb to the request socket, we remove this source of
contention.

Tested:

 listen(fd, 10485760); // single listener (no SO_REUSEPORT)
 16 RX/TX queue NIC

Sustain a SYNFLOOD attack of ~320,000 SYN per second,
Sending ~1,400,000 SYNACK per second.

Perf profiles now show listener spinlock being next bottleneck.

    20.29%  [kernel]  [k] queued_spin_lock_slowpath
    10.06%  [kernel]  [k] __inet_lookup_established
     5.12%  [kernel]  [k] reqsk_timer_handler
     3.22%  [kernel]  [k] get_next_timer_interrupt
     3.00%  [kernel]  [k] tcp_make_synack
     2.77%  [kernel]  [k] ipt_do_table
     2.70%  [kernel]  [k] run_timer_softirq
     2.50%  [kernel]  [k] ip_finish_output
     2.04%  [kernel]  [k] cascade

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 09bb082ca1a7..55ed3266b05f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2947,7 +2947,8 @@ int tcp_send_synack(struct sock *sk)
  */
 struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 				struct request_sock *req,
-				struct tcp_fastopen_cookie *foc)
+				struct tcp_fastopen_cookie *foc,
+				bool attach_req)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	const struct tcp_sock *tp = tcp_sk(sk);
@@ -2959,11 +2960,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	u16 user_mss;
 	int mss;

-	/* sk is a const pointer, because we want to express multiple cpus
-	 * might call us concurrently.
-	 * sock_wmalloc() will change sk->sk_wmem_alloc in an atomic way.
-	 */
-	skb = sock_wmalloc((struct sock *)sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
+	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
 	if (unlikely(!skb)) {
 		dst_release(dst);
 		return NULL;
@@ -2971,6 +2968,17 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	/* Reserve space for headers. */
 	skb_reserve(skb, MAX_TCP_HEADER);

+	if (attach_req) {
+		skb->destructor = sock_edemux;
+		sock_hold(req_to_sk(req));
+		skb->sk = req_to_sk(req);
+	} else {
+		/* sk is a const pointer, because we want to express multiple
+		 * cpu might call us concurrently.
+		 * sk->sk_wmem_alloc in an atomic, we can promote to rw.
+		 */
+		skb_set_owner_w(skb, (struct sock *)sk);
+	}
 	skb_dst_set(skb, dst);

 	mss = dst_metric_advmss(dst);
@@ -3510,7 +3518,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
 	int res;

 	tcp_rsk(req)->txhash = net_tx_rndhash();
-	res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
+	res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL, true);
 	if (!res) {
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
--
cgit v1.2.3
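The "one shared counter touched by every SYNACK" cost that motivates this
patch can be reproduced with a toy user-space program. The sketch below is
an illustration only, not kernel code and not a rigorous benchmark: thread
counts, iteration counts, and names are arbitrary. It contrasts several
threads hammering a single shared atomic (the listener's sk_wmem_alloc in
the analogy) with each thread charging its own padded counter (the
per-request accounting in the analogy); on most multi-core machines the
shared-counter run is noticeably slower because the cache line bounces
between CPUs.

    /* Illustration only: shared counter contention vs per-request counters. */
    #define _POSIX_C_SOURCE 199309L
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <time.h>

    #define NTHREADS 4
    #define NITER    5000000L

    struct padded { _Alignas(64) atomic_long v; };   /* keep counters on separate cache lines */

    static atomic_long listener_wmem;                /* one counter shared by all threads */
    static struct padded per_req_wmem[NTHREADS];     /* one counter per "request" */

    static void *use_shared(void *arg)
    {
        (void)arg;
        for (long i = 0; i < NITER; i++) {
            atomic_fetch_add(&listener_wmem, 1);     /* alloc: charge the listener */
            atomic_fetch_sub(&listener_wmem, 1);     /* TX completion: uncharge */
        }
        return NULL;
    }

    static void *use_private(void *arg)
    {
        atomic_long *ctr = arg;
        for (long i = 0; i < NITER; i++) {
            atomic_fetch_add(ctr, 1);                /* charge this request only */
            atomic_fetch_sub(ctr, 1);
        }
        return NULL;
    }

    static double run(void *(*fn)(void *), int private)
    {
        pthread_t t[NTHREADS];
        struct timespec a, b;

        clock_gettime(CLOCK_MONOTONIC, &a);
        for (int i = 0; i < NTHREADS; i++)
            pthread_create(&t[i], NULL, fn, private ? (void *)&per_req_wmem[i].v : NULL);
        for (int i = 0; i < NTHREADS; i++)
            pthread_join(t[i], NULL);
        clock_gettime(CLOCK_MONOTONIC, &b);
        return (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) / 1e9;
    }

    int main(void)
    {
        printf("shared listener counter : %.2f s\n", run(use_shared, 0));
        printf("per-request counters    : %.2f s\n", run(use_private, 1));
        return 0;
    }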
From ed53d0ab761f5c71d77c8dc05fd19c0a851200db Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 8 Oct 2015 19:33:23 -0700
Subject: net: shrink struct sock and request_sock by 8 bytes

One 32-bit hole follows skc_refcnt; use it.

skc_incoming_cpu can also be a union for the request_sock rcv_wnd.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 55ed3266b05f..6e79fcb0addb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3023,7 +3023,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);

 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
-	th->window = htons(min(req->rcv_wnd, 65535U));
+	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
 	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
 	th->doff = (tcp_header_size >> 2);
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
--
cgit v1.2.3
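The 8-byte saving comes purely from struct layout, which is easy to
reproduce in user space. The sketch below is illustrative only: the field
names merely echo struct sock_common, and the layout is heavily simplified.
It shows that dropping a 32-bit field into the padding hole that follows a
32-bit refcount, and overlaying two fields that are never needed at the
same time in a union, keeps the structure from growing, whereas carrying
both fields separately at the end costs 8 more bytes.

    /* Illustration only: how a hole plus a union saves 8 bytes on LP64. */
    #include <stdio.h>
    #include <stdint.h>

    struct before {                  /* both fields carried separately */
        void     *owner;             /* stand-in for earlier pointer-sized members */
        uint32_t  refcnt;
        /* 4-byte hole here (alignment for the pointer below) */
        void     *prot;
        uint32_t  incoming_cpu;
        uint32_t  rcv_wnd;
    };                               /* typically 32 bytes */

    struct after {                   /* same information, no growth */
        void     *owner;
        uint32_t  refcnt;
        union {                      /* 32-bit field placed in the old hole */
            uint32_t incoming_cpu;   /* meaningful for full sockets */
            uint32_t rcv_wnd;        /* meaningful for request socks only */
        };
        void     *prot;
    };                               /* typically 24 bytes */

    int main(void)
    {
        printf("before: %zu bytes\n", sizeof(struct before));
        printf("after : %zu bytes\n", sizeof(struct after));
        return 0;
    }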
From dc6ef6be52154490c5c03f742e28bc781cc751b2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 16 Oct 2015 13:00:01 -0700
Subject: tcp: do not set queue_mapping on SYNACK

At the time of commit fff326990789 ("tcp: reflect SYN queue_mapping into
SYNACK packets") we had few ways to cope with SYN floods.

We no longer need to reflect incoming skb queue mappings, and can instead
pick a TX queue based on the cpu cooking the SYNACK, with normal XPS
affinities.

Note that all SYNACK retransmits were picking TX queue 0; this is no
longer a win given that SYNACK retransmits are now distributed across all
cpus.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6e79fcb0addb..19adedb8c5cc 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3518,7 +3518,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
 	int res;

 	tcp_rsk(req)->txhash = net_tx_rndhash();
-	res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL, true);
+	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true);
 	if (!res) {
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
--
cgit v1.2.3

From af82f4e84866ecd360a53f770d6217637116e6c1 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Fri, 16 Oct 2015 21:57:43 -0700
Subject: tcp: remove tcp_mark_lost_retrans()

Remove the existing lost retransmit detection because RACK subsumes it
completely. This also stops overloading the ack_seq field of the skb
control block.

Signed-off-by: Yuchung Cheng
Signed-off-by: Neal Cardwell
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 19adedb8c5cc..f6f7f9b4901b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2655,8 +2655,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 			net_dbg_ratelimited("retrans_out leaked\n");
 		}
 #endif
-		if (!tp->retrans_out)
-			tp->lost_retrans_low = tp->snd_nxt;
 		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
 		tp->retrans_out += tcp_skb_pcount(skb);

@@ -2664,10 +2662,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		if (!tp->retrans_stamp)
 			tp->retrans_stamp = tcp_skb_timestamp(skb);

-		/* snd_nxt is stored to detect loss of retransmitted segment,
-		 * see tcp_input.c tcp_sacktag_write_queue().
-		 */
-		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
 	} else if (err != -EBUSY) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
--
cgit v1.2.3

From 9e17f8a475fca81950fdddc08df428ed66cf441f Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 1 Nov 2015 15:36:55 -0800
Subject: net: make skb_set_owner_w() more robust

skb_set_owner_w() is called from various places that assume skb->sk
always points to a full-blown socket (as it changes sk->sk_wmem_alloc).

We'd like to attach skbs to request sockets, and in the future to
timewait sockets as well. For these kinds of pseudo sockets, we need to
take a traditional refcount and use sock_edemux() as the destructor.

It is now time to un-inline skb_set_owner_w(), as it has become too big.

Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener")
Signed-off-by: Eric Dumazet
Bisected-by: Haiyang Zhang
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f4f9793eb025..cb7ca569052c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2963,9 +2963,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	skb_reserve(skb, MAX_TCP_HEADER);

 	if (attach_req) {
-		skb->destructor = sock_edemux;
-		sock_hold(req_to_sk(req));
-		skb->sk = req_to_sk(req);
+		skb_set_owner_w(skb, req_to_sk(req));
 	} else {
 		/* sk is a const pointer, because we want to express multiple
 		 * cpu might call us concurrently.
--
cgit v1.2.3
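The shape of the generalized helper is an ordinary ownership pattern: one
entry point that either does full write-memory accounting (full socket) or
simply takes a reference and registers a destructor that drops it when the
buffer is freed (request-sock-like pseudo owner). A stand-alone sketch of
that pattern follows, as an illustration only: the types and function names
are invented, and the real skb_set_owner_w() lives in net/core/sock.c,
whose exact body is not part of this diff.

    /* Illustration only: one "set owner" helper that branches on owner kind. */
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct owner {
        int  refcnt;
        bool full;      /* full socket vs request-sock-like pseudo owner */
        long wmem;      /* write-memory accounting, only meaningful when full */
    };

    struct buffer {
        struct owner *owner;
        void (*destructor)(struct buffer *);
        size_t truesize;
    };

    static void put_owner(struct owner *o)
    {
        if (--o->refcnt == 0)
            free(o);
    }

    static void dtor_lightweight(struct buffer *b)   /* plays the role of sock_edemux */
    {
        put_owner(b->owner);
    }

    static void dtor_full(struct buffer *b)          /* plays the role of sock_wfree */
    {
        b->owner->wmem -= (long)b->truesize;
        put_owner(b->owner);
    }

    static void buffer_set_owner(struct buffer *b, struct owner *o)
    {
        o->refcnt++;
        b->owner = o;
        if (!o->full) {                  /* pseudo socket: refcount only */
            b->destructor = dtor_lightweight;
            return;
        }
        o->wmem += (long)b->truesize;    /* full socket: account the memory too */
        b->destructor = dtor_full;
    }

    static void buffer_free(struct buffer *b)
    {
        if (b->destructor)
            b->destructor(b);
    }

    int main(void)
    {
        struct owner *req = malloc(sizeof(*req));
        struct buffer synack = { .truesize = 512 };

        *req = (struct owner){ .refcnt = 1, .full = false };
        buffer_set_owner(&synack, req);   /* SYNACK now pins the request */
        buffer_free(&synack);             /* TX completion drops that reference */
        put_owner(req);                   /* drop the original reference */
        printf("done\n");
        return 0;
    }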
From 07e100f984975cb0417a7d5e626d0409efbad478 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 16 Dec 2015 13:53:10 -0800
Subject: tcp: restore fastopen with no data in SYN packet

Yuchung tracked a regression caused by commit 57be5bdad759 ("ip: convert
tcp_sendmsg() to iov_iter primitives") for TCP Fast Open.

Some Fast Open users do not actually add any data in the SYN packet.

Fixes: 57be5bdad759 ("ip: convert tcp_sendmsg() to iov_iter primitives")
Reported-by: Yuchung Cheng
Signed-off-by: Eric Dumazet
Cc: Al Viro
Acked-by: Yuchung Cheng
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index cb7ca569052c..9bfc39ff2285 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3150,7 +3150,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_fastopen_request *fo = tp->fastopen_req;
-	int syn_loss = 0, space, err = 0, copied;
+	int syn_loss = 0, space, err = 0;
 	unsigned long last_syn_loss = 0;
 	struct sk_buff *syn_data;

@@ -3188,17 +3188,18 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 		goto fallback;
 	syn_data->ip_summed = CHECKSUM_PARTIAL;
 	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
-	copied = copy_from_iter(skb_put(syn_data, space), space,
-				&fo->data->msg_iter);
-	if (unlikely(!copied)) {
-		kfree_skb(syn_data);
-		goto fallback;
-	}
-	if (copied != space) {
-		skb_trim(syn_data, copied);
-		space = copied;
+	if (space) {
+		int copied = copy_from_iter(skb_put(syn_data, space), space,
+					    &fo->data->msg_iter);
+		if (unlikely(!copied)) {
+			kfree_skb(syn_data);
+			goto fallback;
+		}
+		if (copied != space) {
+			skb_trim(syn_data, copied);
+			space = copied;
+		}
 	}
-
 	/* No more data pending in inet_wait_for_connect() */
 	if (space == fo->size)
 		fo->data = NULL;
--
cgit v1.2.3
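The regression and the fix boil down to "do not treat a zero-length copy as
a failure". The user-space sketch below is an illustration only; the names
are invented stand-ins for the copy_from_iter() call in
tcp_send_syn_data(). With no pending data (space == 0) the old logic saw a
copy of 0 bytes, took the failure path, and fell back to a plain SYN,
silently disabling Fast Open; the new logic only copies when there is
something to copy, so a cookie-only Fast Open SYN still goes out.

    /* Illustration only: old vs new handling of a zero-length SYN payload. */
    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    static size_t copy_bytes(char *dst, const char *src, size_t len)
    {
        memcpy(dst, src, len);
        return len;             /* returns 0 when len == 0, like copy_from_iter() */
    }

    static bool build_syn_data_old(char *pkt, const char *data, size_t space)
    {
        size_t copied = copy_bytes(pkt, data, space);

        if (copied == 0)        /* space == 0 lands here: spurious fallback */
            return false;
        return true;
    }

    static bool build_syn_data_new(char *pkt, const char *data, size_t space)
    {
        if (space) {
            size_t copied = copy_bytes(pkt, data, space);

            if (copied == 0)
                return false;
        }
        return true;            /* zero payload is fine: cookie-only Fast Open SYN */
    }

    int main(void)
    {
        char pkt[64];

        printf("old, no data: %s\n",
               build_syn_data_old(pkt, "", 0) ? "fast open" : "fallback");
        printf("new, no data: %s\n",
               build_syn_data_new(pkt, "", 0) ? "fast open" : "fallback");
        return 0;
    }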