From 1b3c61dc1aebf5d3d6c3981ba3eedc1e66f3ecda Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Tue, 13 May 2014 10:17:34 -0700 Subject: net: Use fwmark reflection in PMTU discovery. Currently, routing lookups used for Path PMTU Discovery in absence of a socket or on unmarked sockets use a mark of 0. This causes PMTUD not to work when using routing based on netfilter fwmark mangling and fwmark ip rules, such as: iptables -j MARK --set-mark 17 ip rule add fwmark 17 lookup 100 This patch causes these route lookups to use the fwmark from the received ICMP error when the fwmark_reflect sysctl is enabled. This allows the administrator to make PMTUD work by configuring appropriate fwmark rules to mark the inbound ICMP packets. Black-box tested using user-mode linux by pointing different fwmarks at routing tables egressing on different interfaces, and using iptables mangling to mark packets inbound on each interface with the interface's fwmark. ICMPv4 and ICMPv6 PMTU discovery work as expected when mark reflection is enabled and fail when it is disabled. Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/ipv4/route.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index db1e0da871f4..50e1e0feddfc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -993,6 +993,9 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, struct flowi4 fl4; struct rtable *rt; + if (!mark) + mark = IP4_REPLY_MARK(net, skb->mark); + __build_flow_key(&fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); @@ -1010,6 +1013,10 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) struct rtable *rt; __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + + if (!fl4.flowi4_mark) + fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); + rt = __ip_route_output_key(sock_net(sk), &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); -- cgit v1.2.3 From fbdc0ad095c0a299e9abf5d8ac8f58374951149a Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 22 May 2014 16:36:55 +0800 Subject: ipv4: initialise the itag variable in __mkroute_input the value of itag is a random value from stack, and may not be initiated by fib_validate_source, which called fib_combine_itag if CONFIG_IP_ROUTE_CLASSID is not set This will make the cached dst uncertainty Signed-off-by: Li RongQing Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index db1e0da871f4..5e676be3daeb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1519,7 +1519,7 @@ static int __mkroute_input(struct sk_buff *skb, struct in_device *out_dev; unsigned int flags = 0; bool do_cache; - u32 itag; + u32 itag = 0; /* get a working reference to the output device */ out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); -- cgit v1.2.3 From 73f156a6e8c1074ac6327e0abd1169e95eb66463 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Jun 2014 05:26:03 -0700 Subject: inetpeer: get rid of ip_id_count Ideally, we would need to generate IP ID using a per destination IP generator. linux kernels used inet_peer cache for this purpose, but this had a huge cost on servers disabling MTU discovery. 1) each inet_peer struct consumes 192 bytes 2) inetpeer cache uses a binary tree of inet_peer structs, with a nominal size of ~66000 elements under load. 3) lookups in this tree are hitting a lot of cache lines, as tree depth is about 20. 4) If server deals with many tcp flows, we have a high probability of not finding the inet_peer, allocating a fresh one, inserting it in the tree with same initial ip_id_count, (cf secure_ip_id()) 5) We garbage collect inet_peer aggressively. IP ID generation do not have to be 'perfect' Goal is trying to avoid duplicates in a short period of time, so that reassembly units have a chance to complete reassembly of fragments belonging to one message before receiving other fragments with a recycled ID. We simply use an array of generators, and a Jenkin hash using the dst IP as a key. ipv6_select_ident() is put back into net/ipv6/ip6_output.c where it belongs (it is only used from this file) secure_ip_id() and secure_ipv6_id() no longer are needed. Rename ip_select_ident_more() to ip_select_ident_segs() to avoid unnecessary decrement/increment of the number of segments. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4154eb76b0ad..082239ffe34a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include #include @@ -456,39 +457,19 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, return neigh_create(&arp_tbl, pkey, dev); } -/* - * Peer allocation may fail only in serious out-of-memory conditions. However - * we still can generate some output. - * Random ID selection looks a bit dangerous because we have no chances to - * select ID being unique in a reasonable period of time. - * But broken packet identifier may be better than no packet at all. - */ -static void ip_select_fb_ident(struct iphdr *iph) -{ - static DEFINE_SPINLOCK(ip_fb_id_lock); - static u32 ip_fallback_id; - u32 salt; +atomic_t *ip_idents __read_mostly; +EXPORT_SYMBOL(ip_idents); - spin_lock_bh(&ip_fb_id_lock); - salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); - iph->id = htons(salt & 0xFFFF); - ip_fallback_id = salt; - spin_unlock_bh(&ip_fb_id_lock); -} - -void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) +void __ip_select_ident(struct iphdr *iph, int segs) { - struct net *net = dev_net(dst->dev); - struct inet_peer *peer; + static u32 ip_idents_hashrnd __read_mostly; + u32 hash, id; - peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); - if (peer) { - iph->id = htons(inet_getid(peer, more)); - inet_putpeer(peer); - return; - } + net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); - ip_select_fb_ident(iph); + hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd); + id = ip_idents_reserve(hash, segs); + iph->id = htons(id); } EXPORT_SYMBOL(__ip_select_ident); @@ -2711,6 +2692,12 @@ int __init ip_rt_init(void) { int rc = 0; + ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); + if (!ip_idents) + panic("IP: failed to allocate ip_idents\n"); + + prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + #ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) -- cgit v1.2.3 From 7f502361531e9eecb396cf99bdc9e9a59f7ebd7f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jun 2014 01:26:23 -0700 Subject: ipv4: irq safe sk_dst_[re]set() and ipv4_sk_update_pmtu() fix We have two different ways to handle changes to sk->sk_dst First way (used by TCP) assumes socket lock is owned by caller, and use no extra lock : __sk_dst_set() & __sk_dst_reset() Another way (used by UDP) uses sk_dst_lock because socket lock is not always taken. Note that sk_dst_lock is not softirq safe. These ways are not inter changeable for a given socket type. ipv4_sk_update_pmtu(), added in linux-3.8, added a race, as it used the socket lock as synchronization, but users might be UDP sockets. Instead of converting sk_dst_lock to a softirq safe version, use xchg() as we did for sk_rx_dst in commit e47eb5dfb296b ("udp: ipv4: do not use sk_dst_lock from softirq context") In a follow up patch, we probably can remove sk_dst_lock, as it is only used in IPv6. Signed-off-by: Eric Dumazet Cc: Steffen Klassert Fixes: 9cb3a50c5f63e ("ipv4: Invalidate the socket cached route on pmtu events if possible") Signed-off-by: David S. Miller --- net/ipv4/route.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 082239ffe34a..3162ea923ded 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1010,7 +1010,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; - struct dst_entry *dst; + struct dst_entry *odst = NULL; bool new = false; bh_lock_sock(sk); @@ -1018,16 +1018,17 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) if (!ip_sk_accept_pmtu(sk)) goto out; - rt = (struct rtable *) __sk_dst_get(sk); + odst = sk_dst_get(sk); - if (sock_owned_by_user(sk) || !rt) { + if (sock_owned_by_user(sk) || !odst) { __ipv4_sk_update_pmtu(skb, sk, mtu); goto out; } __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); - if (!__sk_dst_check(sk, 0)) { + rt = (struct rtable *)odst; + if (odst->obsolete && odst->ops->check(odst, 0) == NULL) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; @@ -1037,8 +1038,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); - dst = dst_check(&rt->dst, 0); - if (!dst) { + if (!dst_check(&rt->dst, 0)) { if (new) dst_release(&rt->dst); @@ -1050,10 +1050,11 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) } if (new) - __sk_dst_set(sk, &rt->dst); + sk_dst_set(sk, &rt->dst); out: bh_unlock_sock(sk); + dst_release(odst); } EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); -- cgit v1.2.3