From 57b354e66b67c4c72468a26d4313d1217ef32e17 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 16 May 2013 23:36:32 +0000
Subject: dev: remove duplicate 'skb->dev = dev' in dev_forward_skb()

This was added by commit 59b9997baba5 (Revert "net: maintain namespace
isolation between vlan and real device").
In fact, before the initial commit - the one that is reverted -, this
statement was not present.
'skb->dev = dev' is already done in eth_type_trans(), which is call just
after.

Spotted-by: Alain Ritoux <alain.ritoux@6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index fc1e289397f5..18e9730cc4be 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1629,7 +1629,6 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 		return NET_RX_DROP;
 	}
 	skb->skb_iif = 0;
-	skb->dev = dev;
 	skb_dst_drop(skb);
 	skb->tstamp.tv64 = 0;
 	skb->pkt_type = PACKET_HOST;
-- 
cgit v1.2.3


From 99bbc70741903c063b3ccad90a3e06fc55df9245 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Mon, 20 May 2013 04:02:32 +0000
Subject: rps: selective flow shedding during softnet overflow

A cpu executing the network receive path sheds packets when its input
queue grows to netdev_max_backlog. A single high rate flow (such as a
spoofed source DoS) can exceed a single cpu processing rate and will
degrade throughput of other flows hashed onto the same cpu.

This patch adds a more fine grained hashtable. If the netdev backlog
is above a threshold, IRQ cpus track the ratio of total traffic of
each flow (using 4096 buckets, configurable). The ratio is measured
by counting the number of packets per flow over the last 256 packets
from the source cpu. Any flow that occupies a large fraction of this
(set at 50%) will see packet drop while above the threshold.

Tested:
Setup is a muli-threaded UDP echo server with network rx IRQ on cpu0,
kernel receive (RPS) on cpu0 and application threads on cpus 2--7
each handling 20k req/s. Throughput halves when hit with a 400 kpps
antagonist storm. With this patch applied, antagonist overload is
dropped and the server processes its complete load.

The patch is effective when kernel receive processing is the
bottleneck. The above RPS scenario is a extreme, but the same is
reached with RFS and sufficient kernel processing (iptables, packet
socket tap, ..).

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 18e9730cc4be..7229bc30e509 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3064,6 +3064,46 @@ static int rps_ipi_queued(struct softnet_data *sd)
 	return 0;
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+	struct softnet_data *sd;
+	unsigned int old_flow, new_flow;
+
+	if (qlen < (netdev_max_backlog >> 1))
+		return false;
+
+	sd = &__get_cpu_var(softnet_data);
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl) {
+		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
+		old_flow = fl->history[fl->history_head];
+		fl->history[fl->history_head] = new_flow;
+
+		fl->history_head++;
+		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+		if (likely(fl->buckets[old_flow]))
+			fl->buckets[old_flow]--;
+
+		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+			fl->count++;
+			rcu_read_unlock();
+			return true;
+		}
+	}
+	rcu_read_unlock();
+#endif
+	return false;
+}
+
 /*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
  * queue (may be a remote CPU queue).
@@ -3073,13 +3113,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 {
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int qlen;
 
 	sd = &per_cpu(softnet_data, cpu);
 
 	local_irq_save(flags);
 
 	rps_lock(sd);
-	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+	qlen = skb_queue_len(&sd->input_pkt_queue);
+	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -6269,6 +6311,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 		sd->backlog.gro_list = NULL;
 		sd->backlog.gro_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+		sd->flow_limit = NULL;
+#endif
 	}
 
 	dev_boot_phase = 0;
-- 
cgit v1.2.3


From 42e52bf9e3ae80fd44b21ddfcd64c54e6db2ff76 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Sat, 25 May 2013 04:12:10 +0000
Subject: net: add netnotifier event for upper device change

Now when upper device is changed, event is not propagated via RT Netlink
to userspace. Userspace might never now about the change. Fix this by
adding upper-device-change notifier event.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 7229bc30e509..50c02ded1d69 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4411,7 +4411,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 	else
 		list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
 	dev_hold(upper_dev);
-
+	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
 	return 0;
 }
 
@@ -4471,6 +4471,7 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 	list_del_rcu(&upper->list);
 	dev_put(upper_dev);
 	kfree_rcu(upper, rcu);
+	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
-- 
cgit v1.2.3


From 0d89d2035fe063461a5ddb609b2c12e7fb006e44 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Thu, 23 May 2013 21:02:52 +0000
Subject: MPLS: Add limited GSO support

In the case where a non-MPLS packet is received and an MPLS stack is
added it may well be the case that the original skb is GSO but the
NIC used for transmit does not support GSO of MPLS packets.

The aim of this code is to provide GSO in software for MPLS packets
whose skbs are GSO.

SKB Usage:

When an implementation adds an MPLS stack to a non-MPLS packet it should do
the following to skb metadata:

* Set skb->inner_protocol to the old non-MPLS ethertype of the packet.
  skb->inner_protocol is added by this patch.

* Set skb->protocol to the new MPLS ethertype of the packet.

* Set skb->network_header to correspond to the
  end of the L3 header, including the MPLS label stack.

I have posted a patch, "[PATCH v3.29] datapath: Add basic MPLS support to
kernel" which adds MPLS support to the kernel datapath of Open vSwtich.
That patch sets the above requirements in datapath/actions.c:push_mpls()
and was used to exercise this code.  The datapath patch is against the Open
vSwtich tree but it is intended that it be added to the Open vSwtich code
present in the mainline Linux kernel at some point.

Features:

I believe that the approach that I have taken is at least partially
consistent with the handling of other protocols.  Jesse, I understand that
you have some ideas here.  I am more than happy to change my implementation.

This patch adds dev->mpls_features which may be used by devices
to advertise features supported for MPLS packets.

A new NETIF_F_MPLS_GSO feature is added for devices which support
hardware MPLS GSO offload.  Currently no devices support this
and MPLS GSO always falls back to software.

Alternate Implementation:

One possible alternate implementation is to teach netif_skb_features()
and skb_network_protocol() about MPLS, in a similar way to their
understanding of VLANs. I believe this would avoid the need
for net/mpls/mpls_gso.c and in particular the calls to
__skb_push() and __skb_push() in mpls_gso_segment().

I have decided on the implementation in this patch as it should
not introduce any overhead in the case where mpls_gso is not compiled
into the kernel or inserted as a module.

MPLS GSO suggested by Jesse Gross.
Based in part on "v4 GRE: Add TCP segmentation offload for GRE"
by Pravin B Shelar.

Cc: Jesse Gross <jesse@nicira.com>
Cc: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 50c02ded1d69..2f09cb29cc95 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5277,6 +5277,10 @@ int register_netdevice(struct net_device *dev)
 	 */
 	dev->hw_enc_features |= NETIF_F_SG;
 
+	/* Make NETIF_F_SG inheritable to MPLS.
+	 */
+	dev->mpls_features |= NETIF_F_SG;
+
 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
 	ret = notifier_to_errno(ret);
 	if (ret)
-- 
cgit v1.2.3


From da6e378ba918cd0feeb90eeb84d8b42148bb0c82 Mon Sep 17 00:00:00 2001
From: dingtianhong <dingtianhong@huawei.com>
Date: Mon, 27 May 2013 19:53:31 +0000
Subject: netpoll: remove return value from netpoll_rx_disable()

The netpoll_rx_disable() will always return 0, it is no use and looks wordy,
so remove the unnecessary code and get rid of it in _dev_open and _dev_close.

Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 2f09cb29cc95..5f747974ac58 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1198,9 +1198,7 @@ static int __dev_open(struct net_device *dev)
 	 * If we don't do this there is a chance ndo_poll_controller
 	 * or ndo_poll may be running while we open the device
 	 */
-	ret = netpoll_rx_disable(dev);
-	if (ret)
-		return ret;
+	netpoll_rx_disable(dev);
 
 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
 	ret = notifier_to_errno(ret);
@@ -1309,9 +1307,7 @@ static int __dev_close(struct net_device *dev)
 	LIST_HEAD(single);
 
 	/* Temporarily disable netpoll until the interface is down */
-	retval = netpoll_rx_disable(dev);
-	if (retval)
-		return retval;
+	netpoll_rx_disable(dev);
 
 	list_add(&dev->unreg_list, &single);
 	retval = __dev_close_many(&single);
@@ -1353,14 +1349,11 @@ static int dev_close_many(struct list_head *head)
  */
 int dev_close(struct net_device *dev)
 {
-	int ret = 0;
 	if (dev->flags & IFF_UP) {
 		LIST_HEAD(single);
 
 		/* Block netpoll rx while the interface is going down */
-		ret = netpoll_rx_disable(dev);
-		if (ret)
-			return ret;
+		netpoll_rx_disable(dev);
 
 		list_add(&dev->unreg_list, &single);
 		dev_close_many(&single);
@@ -1368,7 +1361,7 @@ int dev_close(struct net_device *dev)
 
 		netpoll_rx_enable(dev);
 	}
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(dev_close);
 
-- 
cgit v1.2.3


From 351638e7deeed2ec8ce451b53d33921b3da68f83 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 28 May 2013 01:30:21 +0000
Subject: net: pass info struct via netdevice notifier

So far, only net_device * could be passed along with netdevice notifier
event. This patch provides a possibility to pass custom structure
able to provide info that event listener needs to know.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>

v2->v3: fix typo on simeth
	shortened dev_getter
	shortened notifier_info struct name
v1->v2: fix notifier_call parameter in call_netdevice_notifier()
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 46 insertions(+), 10 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 5f747974ac58..54fce6006a83 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1391,6 +1391,20 @@ void dev_disable_lro(struct net_device *dev)
 }
 EXPORT_SYMBOL(dev_disable_lro);
 
+static void netdev_notifier_info_init(struct netdev_notifier_info *info,
+				      struct net_device *dev)
+{
+	info->dev = dev;
+}
+
+static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
+				   struct net_device *dev)
+{
+	struct netdev_notifier_info info;
+
+	netdev_notifier_info_init(&info, dev);
+	return nb->notifier_call(nb, val, &info);
+}
 
 static int dev_boot_phase = 1;
 
@@ -1423,7 +1437,7 @@ int register_netdevice_notifier(struct notifier_block *nb)
 		goto unlock;
 	for_each_net(net) {
 		for_each_netdev(net, dev) {
-			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
+			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
 			err = notifier_to_errno(err);
 			if (err)
 				goto rollback;
@@ -1431,7 +1445,7 @@ int register_netdevice_notifier(struct notifier_block *nb)
 			if (!(dev->flags & IFF_UP))
 				continue;
 
-			nb->notifier_call(nb, NETDEV_UP, dev);
+			call_netdevice_notifier(nb, NETDEV_UP, dev);
 		}
 	}
 
@@ -1447,10 +1461,11 @@ rollback:
 				goto outroll;
 
 			if (dev->flags & IFF_UP) {
-				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
-				nb->notifier_call(nb, NETDEV_DOWN, dev);
+				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+							dev);
+				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 			}
-			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
+			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 		}
 	}
 
@@ -1488,10 +1503,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
 	for_each_net(net) {
 		for_each_netdev(net, dev) {
 			if (dev->flags & IFF_UP) {
-				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
-				nb->notifier_call(nb, NETDEV_DOWN, dev);
+				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+							dev);
+				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 			}
-			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
+			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 		}
 	}
 unlock:
@@ -1500,6 +1516,25 @@ unlock:
 }
 EXPORT_SYMBOL(unregister_netdevice_notifier);
 
+/**
+ *	call_netdevice_notifiers_info - call all network notifier blocks
+ *	@val: value passed unmodified to notifier function
+ *	@dev: net_device pointer passed unmodified to notifier function
+ *	@info: notifier information data
+ *
+ *	Call all network notifier blocks.  Parameters and return value
+ *	are as for raw_notifier_call_chain().
+ */
+
+int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
+				  struct netdev_notifier_info *info)
+{
+	ASSERT_RTNL();
+	netdev_notifier_info_init(info, dev);
+	return raw_notifier_call_chain(&netdev_chain, val, info);
+}
+EXPORT_SYMBOL(call_netdevice_notifiers_info);
+
 /**
  *	call_netdevice_notifiers - call all network notifier blocks
  *      @val: value passed unmodified to notifier function
@@ -1511,8 +1546,9 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);
 
 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 {
-	ASSERT_RTNL();
-	return raw_notifier_call_chain(&netdev_chain, val, dev);
+	struct netdev_notifier_info info;
+
+	return call_netdevice_notifiers_info(val, dev, &info);
 }
 EXPORT_SYMBOL(call_netdevice_notifiers);
 
-- 
cgit v1.2.3


From be9efd3653284f2827fd82861e8e9db9a8f726e1 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 28 May 2013 01:30:22 +0000
Subject: net: pass changed flags along with NETDEV_CHANGE event
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use new netdevice notifier infrastructure to pass along changed flags.

Signed-off-by: Timo Teräs <timo.teras@iki.fi>
Signed-off-by: Jiri Pirko <jiri@resnulli.us>

v2->v3: shortened notifier_info struct name
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 54fce6006a83..6eb621cc3b81 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4771,8 +4771,13 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
 	}
 
 	if (dev->flags & IFF_UP &&
-	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
-		call_netdevice_notifiers(NETDEV_CHANGE, dev);
+	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
+		struct netdev_notifier_change_info change_info;
+
+		change_info.flags_changed = changes;
+		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
+					      &change_info.info);
+	}
 }
 
 /**
-- 
cgit v1.2.3


From 75538c2b85cf22eb9af6adfaf26ed7219025adeb Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Wed, 29 May 2013 11:30:50 +0800
Subject: net: always pass struct netdev_notifier_info to netdevice notifiers

commit 351638e7deeed2ec8ce451b53d3 (net: pass info struct via netdevice notifier)
breaks booting of my KVM guest, this is due to we still forget to pass
struct netdev_notifier_info in several places. This patch completes it.

Cc: Jiri Pirko <jiri@resnulli.us>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 6eb621cc3b81..b2e9057be3bf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1391,12 +1391,6 @@ void dev_disable_lro(struct net_device *dev)
 }
 EXPORT_SYMBOL(dev_disable_lro);
 
-static void netdev_notifier_info_init(struct netdev_notifier_info *info,
-				      struct net_device *dev)
-{
-	info->dev = dev;
-}
-
 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 				   struct net_device *dev)
 {
-- 
cgit v1.2.3


From ced14f6804a979d1972415bc23f2f8ddb18595dd Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Tue, 28 May 2013 20:34:25 +0000
Subject: net: Correct comparisons and calculations using skb->tail and
 skb-transport_header

This corrects an regression introduced by "net: Use 16bits for *_headers
fields of struct skbuff" when NET_SKBUFF_DATA_USES_OFFSET is not set. In
that case skb->tail will be a pointer whereas skb->transport_header
will be an offset from head. This is corrected by using wrappers that
ensure that comparisons and calculations are always made using pointers.

Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index b2e9057be3bf..d4d874a25e45 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1724,7 +1724,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 			skb_reset_mac_header(skb2);
 
 			if (skb_network_header(skb2) < skb2->data ||
-			    skb2->network_header > skb2->tail) {
+			    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
 				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
 						     ntohs(skb2->protocol),
 						     dev->name);
@@ -3892,7 +3892,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb)
 	NAPI_GRO_CB(skb)->frag0 = NULL;
 	NAPI_GRO_CB(skb)->frag0_len = 0;
 
-	if (skb->mac_header == skb->tail &&
+	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
 	    pinfo->nr_frags &&
 	    !PageHighMem(skb_frag_page(frag0))) {
 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
-- 
cgit v1.2.3


From 430f03cde2fb9596d8b562824471e298a8080df9 Mon Sep 17 00:00:00 2001
From: Baruch Siach <baruch@tkos.co.il>
Date: Sun, 2 Jun 2013 20:43:55 +0000
Subject: net: mark netdev_create_hash __net_init

netdev_create_hash() is only called from netdev_init() which is marked
__net_init.

Signed-off-by: Baruch Siach <baruch@tkos.co.il>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index d4d874a25e45..9c18557f93c6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6088,7 +6088,7 @@ netdev_features_t netdev_increment_features(netdev_features_t all,
 }
 EXPORT_SYMBOL(netdev_increment_features);
 
-static struct hlist_head *netdev_create_hash(void)
+static struct hlist_head * __net_init netdev_create_hash(void)
 {
 	int i;
 	struct hlist_head *hash;
-- 
cgit v1.2.3


From af12fa6e46aa651e7b86a4c4117b562518fef184 Mon Sep 17 00:00:00 2001
From: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Date: Mon, 10 Jun 2013 11:39:41 +0300
Subject: net: add napi_id and hash

Adds a napi_id and a hashing mechanism to lookup a napi by id.
This will be used by subsequent patches to implement low latency
Ethernet device polling.
Based on a code sample by Eric Dumazet.

Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 9c18557f93c6..fa007dba6beb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -129,6 +129,7 @@
 #include <linux/inetdevice.h>
 #include <linux/cpu_rmap.h>
 #include <linux/static_key.h>
+#include <linux/hashtable.h>
 
 #include "net-sysfs.h"
 
@@ -166,6 +167,12 @@ static struct list_head offload_base __read_mostly;
 DEFINE_RWLOCK(dev_base_lock);
 EXPORT_SYMBOL(dev_base_lock);
 
+/* protects napi_hash addition/deletion and napi_gen_id */
+static DEFINE_SPINLOCK(napi_hash_lock);
+
+static unsigned int napi_gen_id;
+static DEFINE_HASHTABLE(napi_hash, 8);
+
 seqcount_t devnet_rename_seq;
 
 static inline void dev_base_seq_inc(struct net *net)
@@ -4136,6 +4143,58 @@ void napi_complete(struct napi_struct *n)
 }
 EXPORT_SYMBOL(napi_complete);
 
+/* must be called under rcu_read_lock(), as we dont take a reference */
+struct napi_struct *napi_by_id(unsigned int napi_id)
+{
+	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
+	struct napi_struct *napi;
+
+	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
+		if (napi->napi_id == napi_id)
+			return napi;
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(napi_by_id);
+
+void napi_hash_add(struct napi_struct *napi)
+{
+	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
+
+		spin_lock(&napi_hash_lock);
+
+		/* 0 is not a valid id, we also skip an id that is taken
+		 * we expect both events to be extremely rare
+		 */
+		napi->napi_id = 0;
+		while (!napi->napi_id) {
+			napi->napi_id = ++napi_gen_id;
+			if (napi_by_id(napi->napi_id))
+				napi->napi_id = 0;
+		}
+
+		hlist_add_head_rcu(&napi->napi_hash_node,
+			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+
+		spin_unlock(&napi_hash_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(napi_hash_add);
+
+/* Warning : caller is responsible to make sure rcu grace period
+ * is respected before freeing memory containing @napi
+ */
+void napi_hash_del(struct napi_struct *napi)
+{
+	spin_lock(&napi_hash_lock);
+
+	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
+		hlist_del_rcu(&napi->napi_hash_node);
+
+	spin_unlock(&napi_hash_lock);
+}
+EXPORT_SYMBOL_GPL(napi_hash_del);
+
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight)
 {
-- 
cgit v1.2.3


From 60877a32bce00041528576e6b8df5abe9251fa73 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 20 Jun 2013 01:15:51 -0700
Subject: net: allow large number of tx queues

netif_alloc_netdev_queues() uses kcalloc() to allocate memory
for the "struct netdev_queue *_tx" array.

For large number of tx queues, kcalloc() might fail, so this
patch does a fallback to vzalloc().

As vmalloc() adds overhead on a critical network path, add __GFP_REPEAT
to kzalloc() flags to do this fallback only when really needed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index fa007dba6beb..722f633926e0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -130,6 +130,7 @@
 #include <linux/cpu_rmap.h>
 #include <linux/static_key.h>
 #include <linux/hashtable.h>
+#include <linux/vmalloc.h>
 
 #include "net-sysfs.h"
 
@@ -5253,17 +5254,28 @@ static void netdev_init_one_queue(struct net_device *dev,
 #endif
 }
 
+static void netif_free_tx_queues(struct net_device *dev)
+{
+	if (is_vmalloc_addr(dev->_tx))
+		vfree(dev->_tx);
+	else
+		kfree(dev->_tx);
+}
+
 static int netif_alloc_netdev_queues(struct net_device *dev)
 {
 	unsigned int count = dev->num_tx_queues;
 	struct netdev_queue *tx;
+	size_t sz = count * sizeof(*tx);
 
-	BUG_ON(count < 1);
-
-	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
-	if (!tx)
-		return -ENOMEM;
+	BUG_ON(count < 1 || count > 0xffff);
 
+	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+	if (!tx) {
+		tx = vzalloc(sz);
+		if (!tx)
+			return -ENOMEM;
+	}
 	dev->_tx = tx;
 
 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
@@ -5811,7 +5823,7 @@ free_all:
 
 free_pcpu:
 	free_percpu(dev->pcpu_refcnt);
-	kfree(dev->_tx);
+	netif_free_tx_queues(dev);
 #ifdef CONFIG_RPS
 	kfree(dev->_rx);
 #endif
@@ -5836,7 +5848,7 @@ void free_netdev(struct net_device *dev)
 
 	release_net(dev_net(dev));
 
-	kfree(dev->_tx);
+	netif_free_tx_queues(dev);
 #ifdef CONFIG_RPS
 	kfree(dev->_rx);
 #endif
-- 
cgit v1.2.3


From 621e84d6f373dcb273ebfd772638b8e7dc3c2c48 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Wed, 26 Jun 2013 16:11:27 +0200
Subject: dev: introduce skb_scrub_packet()

The goal of this new function is to perform all needed cleanup before sending
an skb into another netns.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 722f633926e0..370354a9c5f6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1652,22 +1652,13 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 		}
 	}
 
-	skb_orphan(skb);
-
 	if (unlikely(!is_skb_forwardable(dev, skb))) {
 		atomic_long_inc(&dev->rx_dropped);
 		kfree_skb(skb);
 		return NET_RX_DROP;
 	}
-	skb->skb_iif = 0;
-	skb_dst_drop(skb);
-	skb->tstamp.tv64 = 0;
-	skb->pkt_type = PACKET_HOST;
+	skb_scrub_packet(skb);
 	skb->protocol = eth_type_trans(skb, dev);
-	skb->mark = 0;
-	secpath_reset(skb);
-	nf_reset(skb);
-	nf_reset_trace(skb);
 	return netif_rx(skb);
 }
 EXPORT_SYMBOL_GPL(dev_forward_skb);
-- 
cgit v1.2.3


From 06a23fe31ca3992863721f21bdb0307af93da807 Mon Sep 17 00:00:00 2001
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Tue, 2 Jul 2013 20:30:10 +0900
Subject: core/dev: set pkt_type after eth_type_trans() in dev_forward_skb()

The dev_forward_skb() assignment of pkt_type should be done
after the call to eth_type_trans().

ip-encapsulated packets can be handled by localhost. But skb->pkt_type
can be PACKET_OTHERHOST when packet comes via veth into ip tunnel device.
In that case, the packet is dropped by ip_rcv().
Although this example uses gretap. l2tp-eth also has same issue.
For l2tp-eth case, add dummy device for ip address and ip l2tp command.

netns A |                     root netns                      | netns B
   veth<->veth=bridge=gretap <-loop back-> gretap=bridge=veth<->veth

arp packet ->
pkt_type
         BROADCAST------------>ip_rcv()------------------------>

                                                             <- arp reply
                                                                pkt_type
                               ip_rcv()<-----------------OTHERHOST
                               drop

sample operations
  ip link add tapa type gretap remote 172.17.107.4 local 172.17.107.3
  ip link add tapb type gretap remote 172.17.107.3 local 172.17.107.4
  ip link set tapa up
  ip link set tapb up
  ip address add 172.17.107.3 dev tapa
  ip address add 172.17.107.4 dev tapb
  ip route get 172.17.107.3
  > local 172.17.107.3 dev lo  src 172.17.107.3
  >    cache <local>
  ip route get 172.17.107.4
  > local 172.17.107.4 dev lo  src 172.17.107.4
  >    cache <local>
  ip link add vetha type veth peer name vetha-peer
  ip link add vethb type veth peer name vethb-peer
  brctl addbr bra
  brctl addbr brb
  brctl addif bra tapa
  brctl addif bra vetha-peer
  brctl addif brb tapb
  brctl addif brb vethb-peer
  brctl show
  > bridge name     bridge id               STP enabled     interfaces
  > bra             8000.6ea21e758ff1       no              tapa
  >                                                         vetha-peer
  > brb             8000.420020eb92d5       no              tapb
  >                                                         vethb-peer
  ip link set vetha-peer up
  ip link set vethb-peer up
  ip link set bra up
  ip link set brb up
  ip netns add a
  ip netns add b
  ip link set vetha netns a
  ip link set vethb netns b
  ip netns exec a ip address add 10.0.0.3/24 dev vetha
  ip netns exec b ip address add 10.0.0.4/24 dev vethb
  ip netns exec a ip link set vetha up
  ip netns exec b ip link set vethb up
  ip netns exec a arping -I vetha 10.0.0.4
  ARPING 10.0.0.4 from 10.0.0.3 vetha
  ^CSent 2 probes (2 broadcast(s))
  Received 0 response(s)

Cc: Jason Wang <jasowang@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Hong Zhiguo <honkiko@gmail.com>
Cc: Rami Rosen <ramirose@gmail.com>
Cc: Tom Parkin <tparkin@katalix.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Pravin B Shelar <pshelar@nicira.com>
Cc: Jesse Gross <jesse@nicira.com>
Cc: dev@openvswitch.org
Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 370354a9c5f6..6a93cd8cd264 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1659,6 +1659,12 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 	}
 	skb_scrub_packet(skb);
 	skb->protocol = eth_type_trans(skb, dev);
+
+	/* eth_type_trans() can set pkt_type.
+	 * clear pkt_type _after_ calling eth_type_trans()
+	 */
+	skb->pkt_type = PACKET_HOST;
+
 	return netif_rx(skb);
 }
 EXPORT_SYMBOL_GPL(dev_forward_skb);
-- 
cgit v1.2.3