From 351638e7deeed2ec8ce451b53d33921b3da68f83 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 28 May 2013 01:30:21 +0000
Subject: net: pass info struct via netdevice notifier

So far, only net_device * could be passed along with netdevice notifier
event. This patch provides a possibility to pass custom structure
able to provide info that event listener needs to know.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>

v2->v3: fix typo on simeth
	shortened dev_getter
	shortened notifier_info struct name
v1->v2: fix notifier_call parameter in call_netdevice_notifier()
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv6/route.c')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ad0aa6b0b86a..194c3cde1536 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2681,9 +2681,9 @@ errout:
 }
 
 static int ip6_route_dev_notify(struct notifier_block *this,
-				unsigned long event, void *data)
+				unsigned long event, void *ptr)
 {
-	struct net_device *dev = (struct net_device *)data;
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct net *net = dev_net(dev);
 
 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
-- 
cgit v1.2.3


From 29a3cad5c6ae9e7fbf1509d01d39c3c3c38f11f9 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Tue, 28 May 2013 20:34:26 +0000
Subject: ipv6: Correct comparisons and calculations using skb->tail and
 skb-transport_header

This corrects an regression introduced by "net: Use 16bits for *_headers
fields of struct skbuff" when NET_SKBUFF_DATA_USES_OFFSET is not set. In
that case skb->tail will be a pointer whereas skb->transport_header
will be an offset from head. This is corrected by using wrappers that
ensure that comparisons and calculations are always made using pointers.

Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6/route.c')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 194c3cde1536..2b874185ebb2 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1649,7 +1649,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	int optlen, on_link;
 	u8 *lladdr;
 
-	optlen = skb->tail - skb->transport_header;
+	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
 	optlen -= sizeof(*msg);
 
 	if (optlen < 0) {
-- 
cgit v1.2.3


From fe2c6338fd2c6f383c4d4164262f35c8f3708e1f Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 11 Jun 2013 23:04:25 -0700
Subject: net: Convert uses of typedef ctl_table to struct ctl_table

Reduce the uses of this unnecessary typedef.

Done via perl script:

$ git grep --name-only -w ctl_table net | \
  xargs perl -p -i -e '\
	sub trim { my ($local) = @_; $local =~ s/(^\s+|\s+$)//g; return $local; } \
        s/\b(?<!struct\s)ctl_table\b(\s*\*\s*|\s+\w+)/"struct ctl_table " . trim($1)/ge'

Reflow the modified lines that now exceed 80 columns.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv6/route.c')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2b874185ebb2..7ca87b37c0ef 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2790,7 +2790,7 @@ static const struct file_operations rt6_stats_seq_fops = {
 #ifdef CONFIG_SYSCTL
 
 static
-int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
+int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
 			      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct net *net;
@@ -2805,7 +2805,7 @@ int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
 	return 0;
 }
 
-ctl_table ipv6_route_table_template[] = {
+struct ctl_table ipv6_route_table_template[] = {
 	{
 		.procname	=	"flush",
 		.data		=	&init_net.ipv6.sysctl.flush_delay,
-- 
cgit v1.2.3


From 52bd4c0c1551daa2efa7bb9e01a2f4ea6d1311bb Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 28 Jun 2013 17:35:48 +0200
Subject: ipv6: fix ecmp lookup when oif is specified

There is no reason to skip ECMP lookup when oif is specified, but this implies
to check oif given by user when selecting another route.
When the new route does not match oif requirement, we simply keep the initial
one.

Spotted-by: dingzhi <zhi.ding@6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'net/ipv6/route.c')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7ca87b37c0ef..9ff0b78a9c15 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -83,6 +83,7 @@ static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 					   struct sk_buff *skb, u32 mtu);
 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
 					struct sk_buff *skb);
+static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
 static struct rt6_info *rt6_add_route_info(struct net *net,
@@ -394,7 +395,8 @@ static int rt6_info_hash_nhsfn(unsigned int candidate_count,
 }
 
 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
-					     struct flowi6 *fl6)
+					     struct flowi6 *fl6, int oif,
+					     int strict)
 {
 	struct rt6_info *sibling, *next_sibling;
 	int route_choosen;
@@ -408,6 +410,8 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 				&match->rt6i_siblings, rt6i_siblings) {
 			route_choosen--;
 			if (route_choosen == 0) {
+				if (rt6_score_route(sibling, oif, strict) < 0)
+					break;
 				match = sibling;
 				break;
 			}
@@ -743,7 +747,7 @@ restart:
 	rt = fn->leaf;
 	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
 	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
-		rt = rt6_multipath_select(rt, fl6);
+		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
 	BACKTRACK(net, &fl6->saddr);
 out:
 	dst_use(&rt->dst, jiffies);
@@ -875,8 +879,8 @@ restart_2:
 
 restart:
 	rt = rt6_select(fn, oif, strict | reachable);
-	if (rt->rt6i_nsiblings && oif == 0)
-		rt = rt6_multipath_select(rt, fl6);
+	if (rt->rt6i_nsiblings)
+		rt = rt6_multipath_select(rt, fl6, oif, strict | reachable);
 	BACKTRACK(net, &fl6->saddr);
 	if (rt == net->ipv6.ip6_null_entry ||
 	    rt->rt6i_flags & RTF_CACHE)
-- 
cgit v1.2.3


From 3630d40067a21d4dfbadc6002bb469ce26ac5d52 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Wed, 3 Jul 2013 20:45:04 +0200
Subject: ipv6: rt6_check_neigh should successfully verify neigh if no NUD
 information are available

After the removal of rt->n we do not create a neighbour entry at route
insertion time (rt6_bind_neighbour is gone). As long as no neighbour is
created because of "useful traffic" we skip this routing entry because
rt6_check_neigh cannot pick up a valid neighbour (neigh == NULL) and
thus returns false.

This change was introduced by commit
887c95cc1da53f66a5890fdeab13414613010097 ("ipv6: Complete neighbour
entry removal from dst_entry.")

To quote RFC4191:
"If the host has no information about the router's reachability, then
the host assumes the router is reachable."

and also:
"A host MUST NOT probe a router's reachability in the absence of useful
traffic that the host would have sent to the router if it were reachable."

So, just assume the router is reachable and let's rt6_probe do the
rest. We don't need to create a neighbour on route insertion time.

If we don't compile with CONFIG_IPV6_ROUTER_PREF (RFC4191 support)
a neighbour is only valid if its nud_state is NUD_VALID. I did not find
any references that we should probe the router on route insertion time
via the other RFCs. So skip this route in that case.

v2:
a) use IS_ENABLED instead of #ifdefs (thanks to Sergei Shtylyov)

Reported-by: Pierre Emeriaud <petrus.lt@gmail.com>
Cc: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv6/route.c')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9ff0b78a9c15..bd5fd7054031 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -551,6 +551,8 @@ static inline bool rt6_check_neigh(struct rt6_info *rt)
 			ret = true;
 #endif
 		read_unlock(&neigh->lock);
+	} else if (IS_ENABLED(CONFIG_IPV6_ROUTER_PREF)) {
+		ret = true;
 	}
 	rcu_read_unlock_bh();
 
-- 
cgit v1.2.3


From 1eb4f758286884e7566627164bca4c4a16952a83 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Wed, 10 Jul 2013 23:00:57 +0200
Subject: ipv6: in case of link failure remove route directly instead of
 letting it expire

We could end up expiring a route which is part of an ecmp route set. Doing
so would invalidate the rt->rt6i_nsiblings calculations and could provoke
the following panic:

[   80.144667] ------------[ cut here ]------------
[   80.145172] kernel BUG at net/ipv6/ip6_fib.c:733!
[   80.145172] invalid opcode: 0000 [#1] SMP
[   80.145172] Modules linked in: 8021q nf_conntrack_netbios_ns nf_conntrack_broadcast ipt_MASQUERADE ip6table_mangle ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 iptable_nat nf_nat_ipv4 nf_nat iptable_mangle nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ebtable_filter ebtables ip6table_filter ip6_tables
+snd_hda_intel snd_hda_codec snd_hwdep snd_seq snd_seq_device snd_pcm snd_page_alloc snd_timer virtio_balloon snd soundcore i2c_piix4 i2c_core virtio_net virtio_blk
[   80.145172] CPU: 1 PID: 786 Comm: ping6 Not tainted 3.10.0+ #118
[   80.145172] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[   80.145172] task: ffff880117fa0000 ti: ffff880118770000 task.ti: ffff880118770000
[   80.145172] RIP: 0010:[<ffffffff815f3b5d>]  [<ffffffff815f3b5d>] fib6_add+0x75d/0x830
[   80.145172] RSP: 0018:ffff880118771798  EFLAGS: 00010202
[   80.145172] RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff88011350e480
[   80.145172] RDX: ffff88011350e238 RSI: 0000000000000004 RDI: ffff88011350f738
[   80.145172] RBP: ffff880118771848 R08: ffff880117903280 R09: 0000000000000001
[   80.145172] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88011350f680
[   80.145172] R13: ffff880117903280 R14: ffff880118771890 R15: ffff88011350ef90
[   80.145172] FS:  00007f02b5127740(0000) GS:ffff88011fd00000(0000) knlGS:0000000000000000
[   80.145172] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[   80.145172] CR2: 00007f981322a000 CR3: 00000001181b1000 CR4: 00000000000006e0
[   80.145172] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   80.145172] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[   80.145172] Stack:
[   80.145172]  0000000000000001 ffff880100000000 ffff880100000000 ffff880117903280
[   80.145172]  0000000000000000 ffff880119a4cf00 0000000000000400 00000000000007fa
[   80.145172]  0000000000000000 0000000000000000 0000000000000000 ffff88011350f680
[   80.145172] Call Trace:
[   80.145172]  [<ffffffff815eeceb>] ? rt6_bind_peer+0x4b/0x90
[   80.145172]  [<ffffffff815ed985>] __ip6_ins_rt+0x45/0x70
[   80.145172]  [<ffffffff815eee35>] ip6_ins_rt+0x35/0x40
[   80.145172]  [<ffffffff815ef1e4>] ip6_pol_route.isra.44+0x3a4/0x4b0
[   80.145172]  [<ffffffff815ef34a>] ip6_pol_route_output+0x2a/0x30
[   80.145172]  [<ffffffff81616077>] fib6_rule_action+0xd7/0x210
[   80.145172]  [<ffffffff815ef320>] ? ip6_pol_route_input+0x30/0x30
[   80.145172]  [<ffffffff81553026>] fib_rules_lookup+0xc6/0x140
[   80.145172]  [<ffffffff81616374>] fib6_rule_lookup+0x44/0x80
[   80.145172]  [<ffffffff815ef320>] ? ip6_pol_route_input+0x30/0x30
[   80.145172]  [<ffffffff815edea3>] ip6_route_output+0x73/0xb0
[   80.145172]  [<ffffffff815dfdf3>] ip6_dst_lookup_tail+0x2c3/0x2e0
[   80.145172]  [<ffffffff813007b1>] ? list_del+0x11/0x40
[   80.145172]  [<ffffffff81082a4c>] ? remove_wait_queue+0x3c/0x50
[   80.145172]  [<ffffffff815dfe4d>] ip6_dst_lookup_flow+0x3d/0xa0
[   80.145172]  [<ffffffff815fda77>] rawv6_sendmsg+0x267/0xc20
[   80.145172]  [<ffffffff815a8a83>] inet_sendmsg+0x63/0xb0
[   80.145172]  [<ffffffff8128eb93>] ? selinux_socket_sendmsg+0x23/0x30
[   80.145172]  [<ffffffff815218d6>] sock_sendmsg+0xa6/0xd0
[   80.145172]  [<ffffffff81524a68>] SYSC_sendto+0x128/0x180
[   80.145172]  [<ffffffff8109825c>] ? update_curr+0xec/0x170
[   80.145172]  [<ffffffff81041d09>] ? kvm_clock_get_cycles+0x9/0x10
[   80.145172]  [<ffffffff810afd1e>] ? __getnstimeofday+0x3e/0xd0
[   80.145172]  [<ffffffff8152509e>] SyS_sendto+0xe/0x10
[   80.145172]  [<ffffffff8164efd9>] system_call_fastpath+0x16/0x1b
[   80.145172] Code: fe ff ff 41 f6 45 2a 06 0f 85 ca fe ff ff 49 8b 7e 08 4c 89 ee e8 94 ef ff ff e9 b9 fe ff ff 48 8b 82 28 05 00 00 e9 01 ff ff ff <0f> 0b 49 8b 54 24 30 0d 00 00 40 00 89 83 14 01 00 00 48 89 53
[   80.145172] RIP  [<ffffffff815f3b5d>] fib6_add+0x75d/0x830
[   80.145172]  RSP <ffff880118771798>
[   80.387413] ---[ end trace 02f20b7a8b81ed95 ]---
[   80.390154] Kernel panic - not syncing: Fatal exception in interrupt

Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Cc: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net/ipv6/route.c')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index bd5fd7054031..5b127e09c224 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1080,10 +1080,13 @@ static void ip6_link_failure(struct sk_buff *skb)
 
 	rt = (struct rt6_info *) skb_dst(skb);
 	if (rt) {
-		if (rt->rt6i_flags & RTF_CACHE)
-			rt6_update_expires(rt, 0);
-		else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
+		if (rt->rt6i_flags & RTF_CACHE) {
+			dst_hold(&rt->dst);
+			if (ip6_del_rt(rt))
+				dst_free(&rt->dst);
+		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
 			rt->rt6i_node->fn_sernum = -1;
+		}
 	}
 }
 
-- 
cgit v1.2.3


From afc154e978de1eb11c555bc8bcec1552f75ebc43 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Thu, 11 Jul 2013 12:43:42 +0200
Subject: ipv6: fix route selection if kernel is not compiled with
 CONFIG_IPV6_ROUTER_PREF

This is a follow-up patch to 3630d40067a21d4dfbadc6002bb469ce26ac5d52
("ipv6: rt6_check_neigh should successfully verify neigh if no NUD
information are available").

Since the removal of rt->n in rt6_info we can end up with a dst ==
NULL in rt6_check_neigh. In case the kernel is not compiled with
CONFIG_IPV6_ROUTER_PREF we should also select a route with unkown
NUD state but we must not avoid doing round robin selection on routes
with the same target. So introduce and pass down a boolean ``do_rr'' to
indicate when we should update rt->rr_ptr. As soon as no route is valid
we do backtracking and do a lookup on a higher level in the fib trie.

v2:
a) Improved rt6_check_neigh logic (no need to create neighbour there)
   and documented return values.

v3:
a) Introduce enum rt6_nud_state to get rid of the magic numbers
   (thanks to David Miller).
b) Update and shorten commit message a bit to actualy reflect
   the source.

Reported-by: Pierre Emeriaud <petrus.lt@gmail.com>
Cc: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 63 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 40 insertions(+), 23 deletions(-)

(limited to 'net/ipv6/route.c')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 5b127e09c224..a8c891aa2464 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -65,6 +65,12 @@
 #include <linux/sysctl.h>
 #endif
 
+enum rt6_nud_state {
+	RT6_NUD_FAIL_HARD = -2,
+	RT6_NUD_FAIL_SOFT = -1,
+	RT6_NUD_SUCCEED = 1
+};
+
 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
 				    const struct in6_addr *dest);
 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
@@ -531,28 +537,29 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif)
 	return 0;
 }
 
-static inline bool rt6_check_neigh(struct rt6_info *rt)
+static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
 {
 	struct neighbour *neigh;
-	bool ret = false;
+	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
 
 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
 	    !(rt->rt6i_flags & RTF_GATEWAY))
-		return true;
+		return RT6_NUD_SUCCEED;
 
 	rcu_read_lock_bh();
 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
 	if (neigh) {
 		read_lock(&neigh->lock);
 		if (neigh->nud_state & NUD_VALID)
-			ret = true;
+			ret = RT6_NUD_SUCCEED;
 #ifdef CONFIG_IPV6_ROUTER_PREF
 		else if (!(neigh->nud_state & NUD_FAILED))
-			ret = true;
+			ret = RT6_NUD_SUCCEED;
 #endif
 		read_unlock(&neigh->lock);
-	} else if (IS_ENABLED(CONFIG_IPV6_ROUTER_PREF)) {
-		ret = true;
+	} else {
+		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
+		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_SOFT;
 	}
 	rcu_read_unlock_bh();
 
@@ -566,43 +573,52 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
 
 	m = rt6_check_dev(rt, oif);
 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
-		return -1;
+		return RT6_NUD_FAIL_HARD;
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
 #endif
-	if (!rt6_check_neigh(rt) && (strict & RT6_LOOKUP_F_REACHABLE))
-		return -1;
+	if (strict & RT6_LOOKUP_F_REACHABLE) {
+		int n = rt6_check_neigh(rt);
+		if (n < 0)
+			return n;
+	}
 	return m;
 }
 
 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
-				   int *mpri, struct rt6_info *match)
+				   int *mpri, struct rt6_info *match,
+				   bool *do_rr)
 {
 	int m;
+	bool match_do_rr = false;
 
 	if (rt6_check_expired(rt))
 		goto out;
 
 	m = rt6_score_route(rt, oif, strict);
-	if (m < 0)
+	if (m == RT6_NUD_FAIL_SOFT && !IS_ENABLED(CONFIG_IPV6_ROUTER_PREF)) {
+		match_do_rr = true;
+		m = 0; /* lowest valid score */
+	} else if (m < 0) {
 		goto out;
+	}
+
+	if (strict & RT6_LOOKUP_F_REACHABLE)
+		rt6_probe(rt);
 
 	if (m > *mpri) {
-		if (strict & RT6_LOOKUP_F_REACHABLE)
-			rt6_probe(match);
+		*do_rr = match_do_rr;
 		*mpri = m;
 		match = rt;
-	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
-		rt6_probe(rt);
 	}
-
 out:
 	return match;
 }
 
 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 				     struct rt6_info *rr_head,
-				     u32 metric, int oif, int strict)
+				     u32 metric, int oif, int strict,
+				     bool *do_rr)
 {
 	struct rt6_info *rt, *match;
 	int mpri = -1;
@@ -610,10 +626,10 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 	match = NULL;
 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
 	     rt = rt->dst.rt6_next)
-		match = find_match(rt, oif, strict, &mpri, match);
+		match = find_match(rt, oif, strict, &mpri, match, do_rr);
 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
 	     rt = rt->dst.rt6_next)
-		match = find_match(rt, oif, strict, &mpri, match);
+		match = find_match(rt, oif, strict, &mpri, match, do_rr);
 
 	return match;
 }
@@ -622,15 +638,16 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
 {
 	struct rt6_info *match, *rt0;
 	struct net *net;
+	bool do_rr = false;
 
 	rt0 = fn->rr_ptr;
 	if (!rt0)
 		fn->rr_ptr = rt0 = fn->leaf;
 
-	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
+	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
+			     &do_rr);
 
-	if (!match &&
-	    (strict & RT6_LOOKUP_F_REACHABLE)) {
+	if (do_rr) {
 		struct rt6_info *next = rt0->dst.rt6_next;
 
 		/* no entries matched; do round-robin */
-- 
cgit v1.2.3


From 2ac3ac8f86f2fe065d746d9a9abaca867adec577 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= <mkubecek@suse.cz>
Date: Thu, 1 Aug 2013 10:04:14 +0200
Subject: ipv6: prevent fib6_run_gc() contention

On a high-traffic router with many processors and many IPv6 dst
entries, soft lockup in fib6_run_gc() can occur when number of
entries reaches gc_thresh.

This happens because fib6_run_gc() uses fib6_gc_lock to allow
only one thread to run the garbage collector but ip6_dst_gc()
doesn't update net->ipv6.ip6_rt_last_gc until fib6_run_gc()
returns. On a system with many entries, this can take some time
so that in the meantime, other threads pass the tests in
ip6_dst_gc() (ip6_rt_last_gc is still not updated) and wait for
the lock. They then have to run the garbage collector one after
another which blocks them for quite long.

Resolve this by replacing special value ~0UL of expire parameter
to fib6_run_gc() by explicit "force" parameter to choose between
spin_lock_bh() and spin_trylock_bh() and call fib6_run_gc() with
force=false if gc_thresh is reached but not max_size.

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv6/route.c')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a8c891aa2464..824c424f9648 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1326,7 +1326,7 @@ static int ip6_dst_gc(struct dst_ops *ops)
 		goto out;
 
 	net->ipv6.ip6_rt_gc_expire++;
-	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
+	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, entries > rt_max_size);
 	net->ipv6.ip6_rt_last_gc = now;
 	entries = dst_entries_get_slow(ops);
 	if (entries < ops->gc_thresh)
@@ -2827,7 +2827,7 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
 	net = (struct net *)ctl->extra1;
 	delay = net->ipv6.sysctl.flush_delay;
 	proc_dointvec(ctl, write, buffer, lenp, ppos);
-	fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
+	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 49a18d86f66d33a20144ecb5a34bba0d1856b260 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= <mkubecek@suse.cz>
Date: Thu, 1 Aug 2013 10:04:24 +0200
Subject: ipv6: update ip6_rt_last_gc every time GC is run

As pointed out by Eric Dumazet, net->ipv6.ip6_rt_last_gc should
hold the last time garbage collector was run so that we should
update it whenever fib6_run_gc() calls fib6_clean_all(), not only
if we got there from ip6_dst_gc().

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net/ipv6/route.c')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 824c424f9648..b70f8979003b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1311,7 +1311,6 @@ static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
 
 static int ip6_dst_gc(struct dst_ops *ops)
 {
-	unsigned long now = jiffies;
 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
@@ -1321,13 +1320,12 @@ static int ip6_dst_gc(struct dst_ops *ops)
 	int entries;
 
 	entries = dst_entries_get_fast(ops);
-	if (time_after(rt_last_gc + rt_min_interval, now) &&
+	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
 	    entries <= rt_max_size)
 		goto out;
 
 	net->ipv6.ip6_rt_gc_expire++;
 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, entries > rt_max_size);
-	net->ipv6.ip6_rt_last_gc = now;
 	entries = dst_entries_get_slow(ops);
 	if (entries < ops->gc_thresh)
 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
-- 
cgit v1.2.3