From 419f9f896074ce8b21e88066e6f3515f18e5641c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 11 Apr 2010 02:15:53 +0000 Subject: tcp: Handle CHECKSUM_PARTIAL for SYNACK packets for IPv4 tcp: Handle CHECKSUM_PARTIAL for SYNACK packets for IPv4 This patch moves the common code between tcp_v4_send_check and tcp_v4_gso_send_check into a new function __tcp_v4_send_check. It then uses the new function in tcp_v4_send_synack so that it handles CHECKSUM_PARTIAL properly. Signed-off-by: Herbert Xu Tested-by: Yinghai Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3c23e70885f4..aebfd28c5089 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -519,26 +519,31 @@ out: sock_put(sk); } -/* This routine computes an IPv4 TCP checksum. */ -void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) +static void __tcp_v4_send_check(struct sk_buff *skb, + __be32 saddr, __be32 daddr) { - struct inet_sock *inet = inet_sk(sk); struct tcphdr *th = tcp_hdr(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { - th->check = ~tcp_v4_check(len, inet->inet_saddr, - inet->inet_daddr, 0); + th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } else { - th->check = tcp_v4_check(len, inet->inet_saddr, - inet->inet_daddr, + th->check = tcp_v4_check(skb->len, saddr, daddr, csum_partial(th, th->doff << 2, skb->csum)); } } +/* This routine computes an IPv4 TCP checksum. */ +void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) +{ + struct inet_sock *inet = inet_sk(sk); + + __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); +} + int tcp_v4_gso_send_check(struct sk_buff *skb) { const struct iphdr *iph; @@ -551,10 +556,8 @@ int tcp_v4_gso_send_check(struct sk_buff *skb) th = tcp_hdr(skb); th->check = 0; - th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); skb->ip_summed = CHECKSUM_PARTIAL; + __tcp_v4_send_check(skb, iph->saddr, iph->daddr); return 0; } @@ -763,13 +766,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, rvp); if (skb) { - struct tcphdr *th = tcp_hdr(skb); - - th->check = tcp_v4_check(skb->len, - ireq->loc_addr, - ireq->rmt_addr, - csum_partial(th, skb->len, - skb->csum)); + __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, ireq->rmt_addr, -- cgit v1.2.3 From bb29624614c2afe2873ee8ee97cf09df42701694 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 11 Apr 2010 02:15:55 +0000 Subject: inet: Remove unused send_check length argument inet: Remove unused send_check length argument This patch removes the unused length argument from the send_check function in struct inet_connection_sock_af_ops. Signed-off-by: Herbert Xu Tested-by: Yinghai Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index aebfd28c5089..a24995cdc4b6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -537,7 +537,7 @@ static void __tcp_v4_send_check(struct sk_buff *skb, } /* This routine computes an IPv4 TCP checksum. */ -void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) +void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) { struct inet_sock *inet = inet_sk(sk); -- cgit v1.2.3 From fec5e652e58fa6017b2c9e06466cb2a6538de5b4 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 16 Apr 2010 16:01:27 -0700 Subject: rfs: Receive Flow Steering This patch implements receive flow steering (RFS). RFS steers received packets for layer 3 and 4 processing to the CPU where the application for the corresponding flow is running. RFS is an extension of Receive Packet Steering (RPS). The basic idea of RFS is that when an application calls recvmsg (or sendmsg) the application's running CPU is stored in a hash table that is indexed by the connection's rxhash which is stored in the socket structure. The rxhash is passed in skb's received on the connection from netif_receive_skb. For each received packet, the associated rxhash is used to look up the CPU in the hash table, if a valid CPU is set then the packet is steered to that CPU using the RPS mechanisms. The convolution of the simple approach is that it would potentially allow OOO packets. If threads are thrashing around CPUs or multiple threads are trying to read from the same sockets, a quickly changing CPU value in the hash table could cause rampant OOO packets-- we consider this a non-starter. To avoid OOO packets, this solution implements two types of hash tables: rps_sock_flow_table and rps_dev_flow_table. rps_sock_table is a global hash table. Each entry is just a CPU number and it is populated in recvmsg and sendmsg as described above. This table contains the "desired" CPUs for flows. rps_dev_flow_table is specific to each device queue. Each entry contains a CPU and a tail queue counter. The CPU is the "current" CPU for a matching flow. The tail queue counter holds the value of a tail queue counter for the associated CPU's backlog queue at the time of last enqueue for a flow matching the entry. Each backlog queue has a queue head counter which is incremented on dequeue, and so a queue tail counter is computed as queue head count + queue length. When a packet is enqueued on a backlog queue, the current value of the queue tail counter is saved in the hash entry of the rps_dev_flow_table. And now the trick: when selecting the CPU for RPS (get_rps_cpu) the rps_sock_flow table and the rps_dev_flow table for the RX queue are consulted. When the desired CPU for the flow (found in the rps_sock_flow table) does not match the current CPU (found in the rps_dev_flow table), the current CPU is changed to the desired CPU if one of the following is true: - The current CPU is unset (equal to RPS_NO_CPU) - Current CPU is offline - The current CPU's queue head counter >= queue tail counter in the rps_dev_flow table. This checks if the queue tail has advanced beyond the last packet that was enqueued using this table entry. This guarantees that all packets queued using this entry have been dequeued, thus preserving in order delivery. Making each queue have its own rps_dev_flow table has two advantages: 1) the tail queue counters will be written on each receive, so keeping the table local to interrupting CPU s good for locality. 2) this allows lockless access to the table-- the CPU number and queue tail counter need to be accessed together under mutual exclusion from netif_receive_skb, we assume that this is only called from device napi_poll which is non-reentrant. This patch implements RFS for TCP and connected UDP sockets. It should be usable for other flow oriented protocols. There are two configuration parameters for RFS. The "rps_flow_entries" kernel init parameter sets the number of entries in the rps_sock_flow_table, the per rxqueue sysfs entry "rps_flow_cnt" contains the number of entries in the rps_dev_flow table for the rxqueue. Both are rounded to power of two. The obvious benefit of RFS (over just RPS) is that it achieves CPU locality between the receive processing for a flow and the applications processing; this can result in increased performance (higher pps, lower latency). The benefits of RFS are dependent on cache hierarchy, application load, and other factors. On simple benchmarks, we don't necessarily see improvement and sometimes see degradation. However, for more complex benchmarks and for applications where cache pressure is much higher this technique seems to perform very well. Below are some benchmark results which show the potential benfit of this patch. The netperf test has 500 instances of netperf TCP_RR test with 1 byte req. and resp. The RPC test is an request/response test similar in structure to netperf RR test ith 100 threads on each host, but does more work in userspace that netperf. e1000e on 8 core Intel No RFS or RPS 104K tps at 30% CPU No RFS (best RPS config): 290K tps at 63% CPU RFS 303K tps at 61% CPU RPC test tps CPU% 50/90/99% usec latency Latency StdDev No RFS/RPS 103K 48% 757/900/3185 4472.35 RPS only: 174K 73% 415/993/2468 491.66 RFS 223K 73% 379/651/1382 315.61 Signed-off-by: Tom Herbert Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a24995cdc4b6..ad08392a738c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1672,6 +1672,8 @@ process: skb->dev = NULL; + inet_rps_save_rxhash(sk, skb->rxhash); + bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { -- cgit v1.2.3 From 0eae88f31ca2b88911ce843452054139e028771f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Apr 2010 19:06:52 -0700 Subject: net: Fix various endianness glitches Sparse can help us find endianness bugs, but we need to make some cleanups to be able to more easily spot real bugs. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ad08392a738c..4d6717d1e61c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1286,8 +1286,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_release; /* Secret recipe starts with IP addresses */ - *mess++ ^= daddr; - *mess++ ^= saddr; + *mess++ ^= (__force u32)daddr; + *mess++ ^= (__force u32)saddr; /* plus variable length Initiator Cookie */ c = (u8 *)mess; -- cgit v1.2.3 From c58dc01babfd58ec9e71a6ce080150dc27755d88 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 27 Apr 2010 15:05:31 -0700 Subject: net: Make RFS socket operations not be inet specific. Idea from Eric Dumazet. As for placement inside of struct sock, I tried to choose a place that otherwise has a 32-bit hole on 64-bit systems. Signed-off-by: David S. Miller Acked-by: Eric Dumazet --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4d6717d1e61c..771f8146a2e5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1672,7 +1672,7 @@ process: skb->dev = NULL; - inet_rps_save_rxhash(sk, skb->rxhash); + sock_rps_save_rxhash(sk, skb->rxhash); bh_lock_sock_nested(sk); ret = 0; -- cgit v1.2.3 From a465419b1febb603821f924805529cff89cafeed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 16 May 2010 00:36:33 -0700 Subject: net: Introduce sk_route_nocaps TCP-MD5 sessions have intermittent failures, when route cache is invalidated. ip_queue_xmit() has to find a new route, calls sk_setup_caps(sk, &rt->u.dst), destroying the sk->sk_route_caps &= ~NETIF_F_GSO_MASK that MD5 desperately try to make all over its way (from tcp_transmit_skb() for example) So we send few bad packets, and everything is fine when tcp_transmit_skb() is called again for this socket. Since ip_queue_xmit() is at a lower level than TCP-MD5, I chose to use a socket field, sk_route_nocaps, containing bits to mask on sk_route_caps. Reported-by: Bhaskar Dutta Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 771f8146a2e5..202cf09c4cd4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -891,7 +891,7 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, kfree(newkey); return -ENOMEM; } - sk->sk_route_caps &= ~NETIF_F_GSO_MASK; + sk_nocaps_add(sk, NETIF_F_GSO_MASK); } if (tcp_alloc_md5sig_pool(sk) == NULL) { kfree(newkey); @@ -1021,7 +1021,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, return -EINVAL; tp->md5sig_info = p; - sk->sk_route_caps &= ~NETIF_F_GSO_MASK; + sk_nocaps_add(sk, NETIF_F_GSO_MASK); } newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation); @@ -1462,7 +1462,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (newkey != NULL) tcp_v4_md5_do_add(newsk, newinet->inet_daddr, newkey, key->keylen); - newsk->sk_route_caps &= ~NETIF_F_GSO_MASK; + sk_nocaps_add(newsk, NETIF_F_GSO_MASK); } #endif -- cgit v1.2.3