From 57b354e66b67c4c72468a26d4313d1217ef32e17 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 16 May 2013 23:36:32 +0000 Subject: dev: remove duplicate 'skb->dev = dev' in dev_forward_skb() This was added by commit 59b9997baba5 (Revert "net: maintain namespace isolation between vlan and real device"). In fact, before the initial commit - the one that is reverted -, this statement was not present. 'skb->dev = dev' is already done in eth_type_trans(), which is call just after. Spotted-by: Alain Ritoux Signed-off-by: Nicolas Dichtel Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index fc1e289397f5..18e9730cc4be 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1629,7 +1629,6 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) return NET_RX_DROP; } skb->skb_iif = 0; - skb->dev = dev; skb_dst_drop(skb); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; -- cgit v1.2.3 From 99bbc70741903c063b3ccad90a3e06fc55df9245 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 20 May 2013 04:02:32 +0000 Subject: rps: selective flow shedding during softnet overflow A cpu executing the network receive path sheds packets when its input queue grows to netdev_max_backlog. A single high rate flow (such as a spoofed source DoS) can exceed a single cpu processing rate and will degrade throughput of other flows hashed onto the same cpu. This patch adds a more fine grained hashtable. If the netdev backlog is above a threshold, IRQ cpus track the ratio of total traffic of each flow (using 4096 buckets, configurable). The ratio is measured by counting the number of packets per flow over the last 256 packets from the source cpu. Any flow that occupies a large fraction of this (set at 50%) will see packet drop while above the threshold. Tested: Setup is a muli-threaded UDP echo server with network rx IRQ on cpu0, kernel receive (RPS) on cpu0 and application threads on cpus 2--7 each handling 20k req/s. Throughput halves when hit with a 400 kpps antagonist storm. With this patch applied, antagonist overload is dropped and the server processes its complete load. The patch is effective when kernel receive processing is the bottleneck. The above RPS scenario is a extreme, but the same is reached with RFS and sufficient kernel processing (iptables, packet socket tap, ..). Signed-off-by: Willem de Bruijn Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 18e9730cc4be..7229bc30e509 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3064,6 +3064,46 @@ static int rps_ipi_queued(struct softnet_data *sd) return 0; } +#ifdef CONFIG_NET_FLOW_LIMIT +int netdev_flow_limit_table_len __read_mostly = (1 << 12); +#endif + +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) +{ +#ifdef CONFIG_NET_FLOW_LIMIT + struct sd_flow_limit *fl; + struct softnet_data *sd; + unsigned int old_flow, new_flow; + + if (qlen < (netdev_max_backlog >> 1)) + return false; + + sd = &__get_cpu_var(softnet_data); + + rcu_read_lock(); + fl = rcu_dereference(sd->flow_limit); + if (fl) { + new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1); + old_flow = fl->history[fl->history_head]; + fl->history[fl->history_head] = new_flow; + + fl->history_head++; + fl->history_head &= FLOW_LIMIT_HISTORY - 1; + + if (likely(fl->buckets[old_flow])) + fl->buckets[old_flow]--; + + if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { + fl->count++; + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); +#endif + return false; +} + /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). @@ -3073,13 +3113,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, { struct softnet_data *sd; unsigned long flags; + unsigned int qlen; sd = &per_cpu(softnet_data, cpu); local_irq_save(flags); rps_lock(sd); - if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { + qlen = skb_queue_len(&sd->input_pkt_queue); + if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { if (skb_queue_len(&sd->input_pkt_queue)) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); @@ -6269,6 +6311,10 @@ static int __init net_dev_init(void) sd->backlog.weight = weight_p; sd->backlog.gro_list = NULL; sd->backlog.gro_count = 0; + +#ifdef CONFIG_NET_FLOW_LIMIT + sd->flow_limit = NULL; +#endif } dev_boot_phase = 0; -- cgit v1.2.3 From 42e52bf9e3ae80fd44b21ddfcd64c54e6db2ff76 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 25 May 2013 04:12:10 +0000 Subject: net: add netnotifier event for upper device change Now when upper device is changed, event is not propagated via RT Netlink to userspace. Userspace might never now about the change. Fix this by adding upper-device-change notifier event. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 7229bc30e509..50c02ded1d69 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4411,7 +4411,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, else list_add_tail_rcu(&upper->list, &dev->upper_dev_list); dev_hold(upper_dev); - + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); return 0; } @@ -4471,6 +4471,7 @@ void netdev_upper_dev_unlink(struct net_device *dev, list_del_rcu(&upper->list); dev_put(upper_dev); kfree_rcu(upper, rcu); + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); } EXPORT_SYMBOL(netdev_upper_dev_unlink); -- cgit v1.2.3 From 0d89d2035fe063461a5ddb609b2c12e7fb006e44 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 23 May 2013 21:02:52 +0000 Subject: MPLS: Add limited GSO support In the case where a non-MPLS packet is received and an MPLS stack is added it may well be the case that the original skb is GSO but the NIC used for transmit does not support GSO of MPLS packets. The aim of this code is to provide GSO in software for MPLS packets whose skbs are GSO. SKB Usage: When an implementation adds an MPLS stack to a non-MPLS packet it should do the following to skb metadata: * Set skb->inner_protocol to the old non-MPLS ethertype of the packet. skb->inner_protocol is added by this patch. * Set skb->protocol to the new MPLS ethertype of the packet. * Set skb->network_header to correspond to the end of the L3 header, including the MPLS label stack. I have posted a patch, "[PATCH v3.29] datapath: Add basic MPLS support to kernel" which adds MPLS support to the kernel datapath of Open vSwtich. That patch sets the above requirements in datapath/actions.c:push_mpls() and was used to exercise this code. The datapath patch is against the Open vSwtich tree but it is intended that it be added to the Open vSwtich code present in the mainline Linux kernel at some point. Features: I believe that the approach that I have taken is at least partially consistent with the handling of other protocols. Jesse, I understand that you have some ideas here. I am more than happy to change my implementation. This patch adds dev->mpls_features which may be used by devices to advertise features supported for MPLS packets. A new NETIF_F_MPLS_GSO feature is added for devices which support hardware MPLS GSO offload. Currently no devices support this and MPLS GSO always falls back to software. Alternate Implementation: One possible alternate implementation is to teach netif_skb_features() and skb_network_protocol() about MPLS, in a similar way to their understanding of VLANs. I believe this would avoid the need for net/mpls/mpls_gso.c and in particular the calls to __skb_push() and __skb_push() in mpls_gso_segment(). I have decided on the implementation in this patch as it should not introduce any overhead in the case where mpls_gso is not compiled into the kernel or inserted as a module. MPLS GSO suggested by Jesse Gross. Based in part on "v4 GRE: Add TCP segmentation offload for GRE" by Pravin B Shelar. Cc: Jesse Gross Cc: Pravin B Shelar Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 50c02ded1d69..2f09cb29cc95 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5277,6 +5277,10 @@ int register_netdevice(struct net_device *dev) */ dev->hw_enc_features |= NETIF_F_SG; + /* Make NETIF_F_SG inheritable to MPLS. + */ + dev->mpls_features |= NETIF_F_SG; + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) -- cgit v1.2.3 From da6e378ba918cd0feeb90eeb84d8b42148bb0c82 Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Mon, 27 May 2013 19:53:31 +0000 Subject: netpoll: remove return value from netpoll_rx_disable() The netpoll_rx_disable() will always return 0, it is no use and looks wordy, so remove the unnecessary code and get rid of it in _dev_open and _dev_close. Signed-off-by: Ding Tianhong Signed-off-by: David S. Miller --- net/core/dev.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 2f09cb29cc95..5f747974ac58 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1198,9 +1198,7 @@ static int __dev_open(struct net_device *dev) * If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device */ - ret = netpoll_rx_disable(dev); - if (ret) - return ret; + netpoll_rx_disable(dev); ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); ret = notifier_to_errno(ret); @@ -1309,9 +1307,7 @@ static int __dev_close(struct net_device *dev) LIST_HEAD(single); /* Temporarily disable netpoll until the interface is down */ - retval = netpoll_rx_disable(dev); - if (retval) - return retval; + netpoll_rx_disable(dev); list_add(&dev->unreg_list, &single); retval = __dev_close_many(&single); @@ -1353,14 +1349,11 @@ static int dev_close_many(struct list_head *head) */ int dev_close(struct net_device *dev) { - int ret = 0; if (dev->flags & IFF_UP) { LIST_HEAD(single); /* Block netpoll rx while the interface is going down */ - ret = netpoll_rx_disable(dev); - if (ret) - return ret; + netpoll_rx_disable(dev); list_add(&dev->unreg_list, &single); dev_close_many(&single); @@ -1368,7 +1361,7 @@ int dev_close(struct net_device *dev) netpoll_rx_enable(dev); } - return ret; + return 0; } EXPORT_SYMBOL(dev_close); -- cgit v1.2.3 From 351638e7deeed2ec8ce451b53d33921b3da68f83 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 28 May 2013 01:30:21 +0000 Subject: net: pass info struct via netdevice notifier So far, only net_device * could be passed along with netdevice notifier event. This patch provides a possibility to pass custom structure able to provide info that event listener needs to know. Signed-off-by: Jiri Pirko v2->v3: fix typo on simeth shortened dev_getter shortened notifier_info struct name v1->v2: fix notifier_call parameter in call_netdevice_notifier() Signed-off-by: David S. Miller --- net/core/dev.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 10 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 5f747974ac58..54fce6006a83 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1391,6 +1391,20 @@ void dev_disable_lro(struct net_device *dev) } EXPORT_SYMBOL(dev_disable_lro); +static void netdev_notifier_info_init(struct netdev_notifier_info *info, + struct net_device *dev) +{ + info->dev = dev; +} + +static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, + struct net_device *dev) +{ + struct netdev_notifier_info info; + + netdev_notifier_info_init(&info, dev); + return nb->notifier_call(nb, val, &info); +} static int dev_boot_phase = 1; @@ -1423,7 +1437,7 @@ int register_netdevice_notifier(struct notifier_block *nb) goto unlock; for_each_net(net) { for_each_netdev(net, dev) { - err = nb->notifier_call(nb, NETDEV_REGISTER, dev); + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); err = notifier_to_errno(err); if (err) goto rollback; @@ -1431,7 +1445,7 @@ int register_netdevice_notifier(struct notifier_block *nb) if (!(dev->flags & IFF_UP)) continue; - nb->notifier_call(nb, NETDEV_UP, dev); + call_netdevice_notifier(nb, NETDEV_UP, dev); } } @@ -1447,10 +1461,11 @@ rollback: goto outroll; if (dev->flags & IFF_UP) { - nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); - nb->notifier_call(nb, NETDEV_DOWN, dev); + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); } - nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } @@ -1488,10 +1503,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb) for_each_net(net) { for_each_netdev(net, dev) { if (dev->flags & IFF_UP) { - nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); - nb->notifier_call(nb, NETDEV_DOWN, dev); + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); } - nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } unlock: @@ -1500,6 +1516,25 @@ unlock: } EXPORT_SYMBOL(unregister_netdevice_notifier); +/** + * call_netdevice_notifiers_info - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * @info: notifier information data + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). + */ + +int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, + struct netdev_notifier_info *info) +{ + ASSERT_RTNL(); + netdev_notifier_info_init(info, dev); + return raw_notifier_call_chain(&netdev_chain, val, info); +} +EXPORT_SYMBOL(call_netdevice_notifiers_info); + /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function @@ -1511,8 +1546,9 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - ASSERT_RTNL(); - return raw_notifier_call_chain(&netdev_chain, val, dev); + struct netdev_notifier_info info; + + return call_netdevice_notifiers_info(val, dev, &info); } EXPORT_SYMBOL(call_netdevice_notifiers); -- cgit v1.2.3 From be9efd3653284f2827fd82861e8e9db9a8f726e1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 28 May 2013 01:30:22 +0000 Subject: net: pass changed flags along with NETDEV_CHANGE event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use new netdevice notifier infrastructure to pass along changed flags. Signed-off-by: Timo Teräs Signed-off-by: Jiri Pirko v2->v3: shortened notifier_info struct name Signed-off-by: David S. Miller --- net/core/dev.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 54fce6006a83..6eb621cc3b81 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4771,8 +4771,13 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) } if (dev->flags & IFF_UP && - (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) - call_netdevice_notifiers(NETDEV_CHANGE, dev); + (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { + struct netdev_notifier_change_info change_info; + + change_info.flags_changed = changes; + call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + &change_info.info); + } } /** -- cgit v1.2.3 From 75538c2b85cf22eb9af6adfaf26ed7219025adeb Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 29 May 2013 11:30:50 +0800 Subject: net: always pass struct netdev_notifier_info to netdevice notifiers commit 351638e7deeed2ec8ce451b53d3 (net: pass info struct via netdevice notifier) breaks booting of my KVM guest, this is due to we still forget to pass struct netdev_notifier_info in several places. This patch completes it. Cc: Jiri Pirko Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/dev.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 6eb621cc3b81..b2e9057be3bf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1391,12 +1391,6 @@ void dev_disable_lro(struct net_device *dev) } EXPORT_SYMBOL(dev_disable_lro); -static void netdev_notifier_info_init(struct netdev_notifier_info *info, - struct net_device *dev) -{ - info->dev = dev; -} - static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { -- cgit v1.2.3 From ced14f6804a979d1972415bc23f2f8ddb18595dd Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 28 May 2013 20:34:25 +0000 Subject: net: Correct comparisons and calculations using skb->tail and skb-transport_header This corrects an regression introduced by "net: Use 16bits for *_headers fields of struct skbuff" when NET_SKBUFF_DATA_USES_OFFSET is not set. In that case skb->tail will be a pointer whereas skb->transport_header will be an offset from head. This is corrected by using wrappers that ensure that comparisons and calculations are always made using pointers. Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index b2e9057be3bf..d4d874a25e45 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1724,7 +1724,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb2); if (skb_network_header(skb2) < skb2->data || - skb2->network_header > skb2->tail) { + skb_network_header(skb2) > skb_tail_pointer(skb2)) { net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ntohs(skb2->protocol), dev->name); @@ -3892,7 +3892,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; - if (skb->mac_header == skb->tail && + if (skb_mac_header(skb) == skb_tail_pointer(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); -- cgit v1.2.3 From 430f03cde2fb9596d8b562824471e298a8080df9 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Sun, 2 Jun 2013 20:43:55 +0000 Subject: net: mark netdev_create_hash __net_init netdev_create_hash() is only called from netdev_init() which is marked __net_init. Signed-off-by: Baruch Siach Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index d4d874a25e45..9c18557f93c6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6088,7 +6088,7 @@ netdev_features_t netdev_increment_features(netdev_features_t all, } EXPORT_SYMBOL(netdev_increment_features); -static struct hlist_head *netdev_create_hash(void) +static struct hlist_head * __net_init netdev_create_hash(void) { int i; struct hlist_head *hash; -- cgit v1.2.3 From af12fa6e46aa651e7b86a4c4117b562518fef184 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Mon, 10 Jun 2013 11:39:41 +0300 Subject: net: add napi_id and hash Adds a napi_id and a hashing mechanism to lookup a napi by id. This will be used by subsequent patches to implement low latency Ethernet device polling. Based on a code sample by Eric Dumazet. Signed-off-by: Eliezer Tamir Signed-off-by: Eric Dumazet Tested-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/dev.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 9c18557f93c6..fa007dba6beb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -129,6 +129,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -166,6 +167,12 @@ static struct list_head offload_base __read_mostly; DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); +/* protects napi_hash addition/deletion and napi_gen_id */ +static DEFINE_SPINLOCK(napi_hash_lock); + +static unsigned int napi_gen_id; +static DEFINE_HASHTABLE(napi_hash, 8); + seqcount_t devnet_rename_seq; static inline void dev_base_seq_inc(struct net *net) @@ -4136,6 +4143,58 @@ void napi_complete(struct napi_struct *n) } EXPORT_SYMBOL(napi_complete); +/* must be called under rcu_read_lock(), as we dont take a reference */ +struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} +EXPORT_SYMBOL_GPL(napi_by_id); + +void napi_hash_add(struct napi_struct *napi) +{ + if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + + spin_lock(&napi_hash_lock); + + /* 0 is not a valid id, we also skip an id that is taken + * we expect both events to be extremely rare + */ + napi->napi_id = 0; + while (!napi->napi_id) { + napi->napi_id = ++napi_gen_id; + if (napi_by_id(napi->napi_id)) + napi->napi_id = 0; + } + + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + + spin_unlock(&napi_hash_lock); + } +} +EXPORT_SYMBOL_GPL(napi_hash_add); + +/* Warning : caller is responsible to make sure rcu grace period + * is respected before freeing memory containing @napi + */ +void napi_hash_del(struct napi_struct *napi) +{ + spin_lock(&napi_hash_lock); + + if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) + hlist_del_rcu(&napi->napi_hash_node); + + spin_unlock(&napi_hash_lock); +} +EXPORT_SYMBOL_GPL(napi_hash_del); + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { -- cgit v1.2.3 From 60877a32bce00041528576e6b8df5abe9251fa73 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Jun 2013 01:15:51 -0700 Subject: net: allow large number of tx queues netif_alloc_netdev_queues() uses kcalloc() to allocate memory for the "struct netdev_queue *_tx" array. For large number of tx queues, kcalloc() might fail, so this patch does a fallback to vzalloc(). As vmalloc() adds overhead on a critical network path, add __GFP_REPEAT to kzalloc() flags to do this fallback only when really needed. Signed-off-by: Eric Dumazet Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- net/core/dev.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index fa007dba6beb..722f633926e0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -130,6 +130,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -5253,17 +5254,28 @@ static void netdev_init_one_queue(struct net_device *dev, #endif } +static void netif_free_tx_queues(struct net_device *dev) +{ + if (is_vmalloc_addr(dev->_tx)) + vfree(dev->_tx); + else + kfree(dev->_tx); +} + static int netif_alloc_netdev_queues(struct net_device *dev) { unsigned int count = dev->num_tx_queues; struct netdev_queue *tx; + size_t sz = count * sizeof(*tx); - BUG_ON(count < 1); - - tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); - if (!tx) - return -ENOMEM; + BUG_ON(count < 1 || count > 0xffff); + tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!tx) { + tx = vzalloc(sz); + if (!tx) + return -ENOMEM; + } dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); @@ -5811,7 +5823,7 @@ free_all: free_pcpu: free_percpu(dev->pcpu_refcnt); - kfree(dev->_tx); + netif_free_tx_queues(dev); #ifdef CONFIG_RPS kfree(dev->_rx); #endif @@ -5836,7 +5848,7 @@ void free_netdev(struct net_device *dev) release_net(dev_net(dev)); - kfree(dev->_tx); + netif_free_tx_queues(dev); #ifdef CONFIG_RPS kfree(dev->_rx); #endif -- cgit v1.2.3 From 621e84d6f373dcb273ebfd772638b8e7dc3c2c48 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 26 Jun 2013 16:11:27 +0200 Subject: dev: introduce skb_scrub_packet() The goal of this new function is to perform all needed cleanup before sending an skb into another netns. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/dev.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 722f633926e0..370354a9c5f6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1652,22 +1652,13 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) } } - skb_orphan(skb); - if (unlikely(!is_skb_forwardable(dev, skb))) { atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } - skb->skb_iif = 0; - skb_dst_drop(skb); - skb->tstamp.tv64 = 0; - skb->pkt_type = PACKET_HOST; + skb_scrub_packet(skb); skb->protocol = eth_type_trans(skb, dev); - skb->mark = 0; - secpath_reset(skb); - nf_reset(skb); - nf_reset_trace(skb); return netif_rx(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); -- cgit v1.2.3 From 06a23fe31ca3992863721f21bdb0307af93da807 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 2 Jul 2013 20:30:10 +0900 Subject: core/dev: set pkt_type after eth_type_trans() in dev_forward_skb() The dev_forward_skb() assignment of pkt_type should be done after the call to eth_type_trans(). ip-encapsulated packets can be handled by localhost. But skb->pkt_type can be PACKET_OTHERHOST when packet comes via veth into ip tunnel device. In that case, the packet is dropped by ip_rcv(). Although this example uses gretap. l2tp-eth also has same issue. For l2tp-eth case, add dummy device for ip address and ip l2tp command. netns A | root netns | netns B veth<->veth=bridge=gretap <-loop back-> gretap=bridge=veth<->veth arp packet -> pkt_type BROADCAST------------>ip_rcv()------------------------> <- arp reply pkt_type ip_rcv()<-----------------OTHERHOST drop sample operations ip link add tapa type gretap remote 172.17.107.4 local 172.17.107.3 ip link add tapb type gretap remote 172.17.107.3 local 172.17.107.4 ip link set tapa up ip link set tapb up ip address add 172.17.107.3 dev tapa ip address add 172.17.107.4 dev tapb ip route get 172.17.107.3 > local 172.17.107.3 dev lo src 172.17.107.3 > cache ip route get 172.17.107.4 > local 172.17.107.4 dev lo src 172.17.107.4 > cache ip link add vetha type veth peer name vetha-peer ip link add vethb type veth peer name vethb-peer brctl addbr bra brctl addbr brb brctl addif bra tapa brctl addif bra vetha-peer brctl addif brb tapb brctl addif brb vethb-peer brctl show > bridge name bridge id STP enabled interfaces > bra 8000.6ea21e758ff1 no tapa > vetha-peer > brb 8000.420020eb92d5 no tapb > vethb-peer ip link set vetha-peer up ip link set vethb-peer up ip link set bra up ip link set brb up ip netns add a ip netns add b ip link set vetha netns a ip link set vethb netns b ip netns exec a ip address add 10.0.0.3/24 dev vetha ip netns exec b ip address add 10.0.0.4/24 dev vethb ip netns exec a ip link set vetha up ip netns exec b ip link set vethb up ip netns exec a arping -I vetha 10.0.0.4 ARPING 10.0.0.4 from 10.0.0.3 vetha ^CSent 2 probes (2 broadcast(s)) Received 0 response(s) Cc: Jason Wang Cc: "Michael S. Tsirkin" Cc: Eric Dumazet Cc: Patrick McHardy Cc: Hong Zhiguo Cc: Rami Rosen Cc: Tom Parkin Cc: Cong Wang Cc: Pravin B Shelar Cc: Jesse Gross Cc: dev@openvswitch.org Signed-off-by: Isaku Yamahata Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 370354a9c5f6..6a93cd8cd264 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1659,6 +1659,12 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) } skb_scrub_packet(skb); skb->protocol = eth_type_trans(skb, dev); + + /* eth_type_trans() can set pkt_type. + * clear pkt_type _after_ calling eth_type_trans() + */ + skb->pkt_type = PACKET_HOST; + return netif_rx(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); -- cgit v1.2.3