From 5d134f1c1f36166e8a738de92c4d2f4c262ff91b Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 5 Jan 2013 16:10:48 +0000 Subject: tcp: make sysctl_tcp_ecn namespace aware As per suggestion from Eric Dumazet this patch makes tcp_ecn sysctl namespace aware. The reason behind this patch is to ease the testing of ecn problems on the internet and allows applications to tune their own use of ecn. Cc: Eric Dumazet Cc: David Miller Cc: Stephen Hemminger Signed-off-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5d451593ef16..667a6adfccf8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -314,7 +314,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); tp->ecn_flags = 0; - if (sysctl_tcp_ecn == 1) { + if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; } -- cgit v1.2.3 From cef401de7be8c4e155c6746bfccf721a4fa5fab9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 25 Jan 2013 20:34:37 +0000 Subject: net: fix possible wrong checksum generation Pravin Shelar mentioned that GSO could potentially generate wrong TX checksum if skb has fragments that are overwritten by the user between the checksum computation and transmit. He suggested to linearize skbs but this extra copy can be avoided for normal tcp skbs cooked by tcp_sendmsg(). This patch introduces a new SKB_GSO_SHARED_FRAG flag, set in skb_shinfo(skb)->gso_type if at least one frag can be modified by the user. Typical sources of such possible overwrites are {vm}splice(), sendfile(), and macvtap/tun/virtio_net drivers. Tested: $ netperf -H 7.7.8.84 MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.8.84 () port 0 AF_INET Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 10.00 3959.52 $ netperf -H 7.7.8.84 -t TCP_SENDFILE TCP SENDFILE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.8.84 () port 0 AF_INET Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 10.00 3216.80 Performance of the SENDFILE is impacted by the extra allocation and copy, and because we use order-0 pages, while the TCP_STREAM uses bigger pages. Reported-by: Pravin Shelar Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 667a6adfccf8..367e2ec01da1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1133,6 +1133,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { + skb_shinfo(skb)->gso_type &= SKB_GSO_SHARED_FRAG; if (skb->len <= mss_now || !sk_can_gso(sk) || skb->ip_summed == CHECKSUM_NONE) { /* Avoid the costly divide in the normal @@ -1140,11 +1141,10 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, */ skb_shinfo(skb)->gso_segs = 1; skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; } else { skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now); skb_shinfo(skb)->gso_size = mss_now; - skb_shinfo(skb)->gso_type = sk->sk_gso_type; + skb_shinfo(skb)->gso_type |= sk->sk_gso_type; } } -- cgit v1.2.3 From ee684b6f2830047d19877e5547989740f18b1a5d Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Mon, 11 Feb 2013 05:50:19 +0000 Subject: tcp: send packets with a socket timestamp A socket timestamp is a sum of the global tcp_time_stamp and a per-socket offset. A socket offset is added in places where externally visible tcp timestamp option is parsed/initialized. Connections in the SYN_RECV state are not supported, global tcp_time_stamp is used for them, because repair mode doesn't support this state. In a future it can be implemented by the similar way as for TIME_WAIT sockets. Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Cc: Eric Dumazet Cc: Pavel Emelyanov Signed-off-by: Andrey Vagin Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 367e2ec01da1..564bf89d9fd3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -622,7 +622,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { opts->options |= OPTION_TS; - opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -806,7 +806,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcb ? tcb->when : 0; + opts->tsval = tcb ? tcb->when + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } -- cgit v1.2.3 From c9af6db4c11ccc6c3e7f19bbc15d54023956f97c Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 11 Feb 2013 09:27:41 +0000 Subject: net: Fix possible wrong checksum generation. Patch cef401de7be8c4e (net: fix possible wrong checksum generation) fixed wrong checksum calculation but it broke TSO by defining new GSO type but not a netdev feature for that type. net_gso_ok() would not allow hardware checksum/segmentation offload of such packets without the feature. Following patch fixes TSO and wrong checksum. This patch uses same logic that Eric Dumazet used. Patch introduces new flag SKBTX_SHARED_FRAG if at least one frag can be modified by the user. but SKBTX_SHARED_FRAG flag is kept in skb shared info tx_flags rather than gso_type. tx_flags is better compared to gso_type since we can have skb with shared frag without gso packet. It does not link SHARED_FRAG to GSO, So there is no need to define netdev feature for this. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 564bf89d9fd3..6182d90e97b0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1133,7 +1133,6 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { - skb_shinfo(skb)->gso_type &= SKB_GSO_SHARED_FRAG; if (skb->len <= mss_now || !sk_can_gso(sk) || skb->ip_summed == CHECKSUM_NONE) { /* Avoid the costly divide in the normal @@ -1141,10 +1140,11 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, */ skb_shinfo(skb)->gso_segs = 1; skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; } else { skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now); skb_shinfo(skb)->gso_size = mss_now; - skb_shinfo(skb)->gso_type |= sk->sk_gso_type; + skb_shinfo(skb)->gso_type = sk->sk_gso_type; } } -- cgit v1.2.3 From 14bbd6a565e1bcdc240d44687edb93f721cfdf99 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 14 Feb 2013 09:44:49 +0000 Subject: net: Add skb_unclone() helper function. This function will be used in next GRE_GSO patch. This patch does not change any functionality. Signed-off-by: Pravin B Shelar Acked-by: Eric Dumazet --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6182d90e97b0..fd0cea114b5d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1331,7 +1331,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) /* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { - if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; __pskb_trim_head(skb, len); -- cgit v1.2.3 From 1b63edd6ecc55c3a61b40297b49e2323783bddfd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 22 Feb 2013 08:59:06 +0000 Subject: tcp: fix SYN-data space mis-accounting In fast open the sender unncessarily reduces the space available for data in SYN by 12 bytes. This is because in the sender incorrectly reserves space for TS option twice in tcp_send_syn_data(): tcp_mtu_to_mss() already accounts for TS option space. But it further reserves MAX_TCP_OPTION_SPACE when computing the payload space. Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fd0cea114b5d..e2b4461074da 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1351,8 +1351,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) return 0; } -/* Calculate MSS. Not accounting for SACKs here. */ -int tcp_mtu_to_mss(struct sock *sk, int pmtu) +/* Calculate MSS not accounting any TCP options. */ +static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1381,13 +1381,17 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) /* Then reserve room for full set of TCP options and 8 bytes of data */ if (mss_now < 48) mss_now = 48; - - /* Now subtract TCP options size, not including SACKs */ - mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); - return mss_now; } +/* Calculate MSS. Not accounting for SACKs here. */ +int tcp_mtu_to_mss(struct sock *sk, int pmtu) +{ + /* Subtract TCP options size, not including SACKs */ + return __tcp_mtu_to_mss(sk, pmtu) - + (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); +} + /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) { @@ -2930,7 +2934,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) */ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; - space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - + space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; syn_data = skb_copy_expand(syn, skb_headroom(syn), space, -- cgit v1.2.3