From 588f033075d8c7efe28695402114eab3f9da47c4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2011 04:12:55 +0000 Subject: net: use jump_label for netstamp_needed netstamp_needed seems a good candidate to jump_label conversion. This avoids 3 conditional branches per incoming packet in fast path. No measurable difference, given that these conditional branches are predicted on modern cpus. Only a small icache reduction, thanks to the unlikely() stuff. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 6ba50a1e404c..51f89cd0a3f4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -137,6 +137,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -1449,34 +1450,32 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); -/* When > 0 there are consumers of rx skb time stamps */ -static atomic_t netstamp_needed = ATOMIC_INIT(0); +static struct jump_label_key netstamp_needed __read_mostly; void net_enable_timestamp(void) { - atomic_inc(&netstamp_needed); + jump_label_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { - atomic_dec(&netstamp_needed); + jump_label_dec(&netstamp_needed); } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { - if (atomic_read(&netstamp_needed)) + skb->tstamp.tv64 = 0; + if (static_branch(&netstamp_needed)) __net_timestamp(skb); - else - skb->tstamp.tv64 = 0; } -static inline void net_timestamp_check(struct sk_buff *skb) -{ - if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed)) - __net_timestamp(skb); -} +#define net_timestamp_check(COND, SKB) \ + if (static_branch(&netstamp_needed)) { \ + if ((COND) && !(SKB)->tstamp.tv64) \ + __net_timestamp(SKB); \ + } \ static int net_hwtstamp_validate(struct ifreq *ifr) { @@ -2997,8 +2996,7 @@ int netif_rx(struct sk_buff *skb) if (netpoll_rx(skb)) return NET_RX_DROP; - if (netdev_tstamp_prequeue) - net_timestamp_check(skb); + net_timestamp_check(netdev_tstamp_prequeue, skb); trace_netif_rx(skb); #ifdef CONFIG_RPS @@ -3230,8 +3228,7 @@ static int __netif_receive_skb(struct sk_buff *skb) int ret = NET_RX_DROP; __be16 type; - if (!netdev_tstamp_prequeue) - net_timestamp_check(skb); + net_timestamp_check(!netdev_tstamp_prequeue, skb); trace_netif_receive_skb(skb); @@ -3362,8 +3359,7 @@ out: */ int netif_receive_skb(struct sk_buff *skb) { - if (netdev_tstamp_prequeue) - net_timestamp_check(skb); + net_timestamp_check(netdev_tstamp_prequeue, skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; -- cgit v1.2.3 From bc5787c6125cc2c868eaace46c46ce6e83dcfcb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Tue, 15 Nov 2011 15:29:55 +0000 Subject: net: remove legacy ethtool ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As all drivers are converted, we may now remove discrete offload setting callback handling. Signed-off-by: Michał Mirosław Acked-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 51f89cd0a3f4..185e246d61fd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1321,8 +1321,6 @@ EXPORT_SYMBOL(dev_close); */ void dev_disable_lro(struct net_device *dev) { - u32 flags; - /* * If we're trying to disable lro on a vlan device * use the underlying physical device instead @@ -1330,15 +1328,9 @@ void dev_disable_lro(struct net_device *dev) if (is_vlan_dev(dev)) dev = vlan_dev_real_dev(dev); - if (dev->ethtool_ops && dev->ethtool_ops->get_flags) - flags = dev->ethtool_ops->get_flags(dev); - else - flags = ethtool_op_get_flags(dev); - - if (!(flags & ETH_FLAG_LRO)) - return; + dev->wanted_features &= ~NETIF_F_LRO; + netdev_update_features(dev); - __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); } -- cgit v1.2.3 From c8f44affb7244f2ac3e703cab13d55ede27621bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Tue, 15 Nov 2011 15:29:55 +0000 Subject: net: introduce and use netdev_features_t for device features sets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v2: add couple missing conversions in drivers split unexporting netdev_fix_features() implemented %pNF convert sock::sk_route_(no?)caps Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- net/core/dev.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 185e246d61fd..f1cca59c4638 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1914,7 +1914,8 @@ EXPORT_SYMBOL(skb_checksum_help); * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. */ -struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features) +struct sk_buff *skb_gso_segment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_type *ptype; @@ -1944,9 +1945,9 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features) if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) dev->ethtool_ops->get_drvinfo(dev, &info); - WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n", - info.driver, dev ? dev->features : 0L, - skb->sk ? skb->sk->sk_route_caps : 0L, + WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d ip_summed=%d\n", + info.driver, dev ? &dev->features : NULL, + skb->sk ? &skb->sk->sk_route_caps : NULL, skb->len, skb->data_len, skb->ip_summed); if (skb_header_cloned(skb) && @@ -2055,7 +2056,7 @@ static void dev_gso_skb_destructor(struct sk_buff *skb) * This function segments the given skb and stores the list of segments * in skb->next. */ -static int dev_gso_segment(struct sk_buff *skb, int features) +static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs; @@ -2094,7 +2095,7 @@ static inline void skb_orphan_try(struct sk_buff *skb) } } -static bool can_checksum_protocol(unsigned long features, __be16 protocol) +static bool can_checksum_protocol(netdev_features_t features, __be16 protocol) { return ((features & NETIF_F_GEN_CSUM) || ((features & NETIF_F_V4_CSUM) && @@ -2105,7 +2106,8 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol) protocol == htons(ETH_P_FCOE))); } -static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features) +static netdev_features_t harmonize_features(struct sk_buff *skb, + __be16 protocol, netdev_features_t features) { if (!can_checksum_protocol(features, protocol)) { features &= ~NETIF_F_ALL_CSUM; @@ -2117,10 +2119,10 @@ static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features return features; } -u32 netif_skb_features(struct sk_buff *skb) +netdev_features_t netif_skb_features(struct sk_buff *skb) { __be16 protocol = skb->protocol; - u32 features = skb->dev->features; + netdev_features_t features = skb->dev->features; if (protocol == htons(ETH_P_8021Q)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; @@ -2166,7 +2168,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, unsigned int skb_len; if (likely(!skb->next)) { - u32 features; + netdev_features_t features; /* * If device doesn't need skb->dst, release it right now while @@ -5350,7 +5352,8 @@ static void rollback_registered(struct net_device *dev) list_del(&single); } -static u32 netdev_fix_features(struct net_device *dev, u32 features) +static netdev_features_t netdev_fix_features(struct net_device *dev, + netdev_features_t features) { /* Fix illegal checksum combinations */ if ((features & NETIF_F_HW_CSUM) && @@ -5412,7 +5415,7 @@ static u32 netdev_fix_features(struct net_device *dev, u32 features) int __netdev_update_features(struct net_device *dev) { - u32 features; + netdev_features_t features; int err = 0; ASSERT_RTNL(); @@ -5428,16 +5431,16 @@ int __netdev_update_features(struct net_device *dev) if (dev->features == features) return 0; - netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n", - dev->features, features); + netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", + &dev->features, &features); if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); if (unlikely(err < 0)) { netdev_err(dev, - "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n", - err, features, dev->features); + "set_features() failed (%d); wanted %pNF, left %pNF\n", + err, &features, &dev->features); return -1; } @@ -6361,7 +6364,8 @@ static int dev_cpu_callback(struct notifier_block *nfb, * @one to the master device with current feature set @all. Will not * enable anything that is off in @mask. Returns the new feature set. */ -u32 netdev_increment_features(u32 all, u32 one, u32 mask) +netdev_features_t netdev_increment_features(netdev_features_t all, + netdev_features_t one, netdev_features_t mask) { if (mask & NETIF_F_GEN_CSUM) mask |= NETIF_F_ALL_CSUM; -- cgit v1.2.3 From 34324dc2bf27c1773045fea63cb11f7e2a6ad2b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Tue, 15 Nov 2011 15:29:55 +0000 Subject: net: remove NETIF_F_NO_CSUM feature bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only distinct use is checking if NETIF_F_NOCACHE_COPY should be enabled by default. The check heuristics is altered a bit here, so it hits other people than before. The default shouldn't be trusted for performance-critical cases anyway. For all other uses NETIF_F_NO_CSUM is equivalent to NETIF_F_HW_CSUM. Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- net/core/dev.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index f1cca59c4638..26c49d55e79d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5362,12 +5362,6 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } - if ((features & NETIF_F_NO_CSUM) && - (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - netdev_warn(dev, "mixed no checksumming and other settings.\n"); - features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); - } - /* Fix illegal SG+CSUM combinations. */ if ((features & NETIF_F_SG) && !(features & NETIF_F_ALL_CSUM)) { @@ -5624,11 +5618,12 @@ int register_netdevice(struct net_device *dev) dev->wanted_features = dev->features & dev->hw_features; /* Turn on no cache copy if HW is doing checksum */ - dev->hw_features |= NETIF_F_NOCACHE_COPY; - if ((dev->features & NETIF_F_ALL_CSUM) && - !(dev->features & NETIF_F_NO_CSUM)) { - dev->wanted_features |= NETIF_F_NOCACHE_COPY; - dev->features |= NETIF_F_NOCACHE_COPY; + if (!(dev->flags & IFF_LOOPBACK)) { + dev->hw_features |= NETIF_F_NOCACHE_COPY; + if (dev->features & NETIF_F_ALL_CSUM) { + dev->wanted_features |= NETIF_F_NOCACHE_COPY; + dev->features |= NETIF_F_NOCACHE_COPY; + } } /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. @@ -6374,10 +6369,6 @@ netdev_features_t netdev_increment_features(netdev_features_t all, all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; all &= one | ~NETIF_F_ALL_FOR_ALL; - /* If device needs checksumming, downgrade to it. */ - if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM)) - all &= ~NETIF_F_NO_CSUM; - /* If one device supports hw checksumming, set for all. */ if (all & NETIF_F_GEN_CSUM) all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); -- cgit v1.2.3 From adc9300e78e6091a7eaa1821213836379d4dbaa8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Nov 2011 03:13:26 +0000 Subject: net: use jump_label to shortcut RPS if not setup Most machines dont use RPS/RFS, and pay a fair amount of instructions in netif_receive_skb() / netif_rx() / get_rps_cpu() just to discover RPS/RFS is not setup. Add a jump_label named rps_needed. If no device rps_map or global rps_sock_flow_table is setup, netif_receive_skb() / netif_rx() do a single instruction instead of many ones, including conditional jumps. jmp +0 (if CONFIG_JUMP_LABEL=y) Signed-off-by: Eric Dumazet CC: Tom Herbert Signed-off-by: David S. Miller --- net/core/dev.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 26c49d55e79d..f78959996148 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2711,6 +2711,8 @@ EXPORT_SYMBOL(__skb_get_rxhash); struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); +struct jump_label_key rps_needed __read_mostly; + static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) @@ -2994,7 +2996,7 @@ int netif_rx(struct sk_buff *skb) trace_netif_rx(skb); #ifdef CONFIG_RPS - { + if (static_branch(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; @@ -3009,14 +3011,13 @@ int netif_rx(struct sk_buff *skb) rcu_read_unlock(); preempt_enable(); - } -#else + } else +#endif { unsigned int qtail; ret = enqueue_to_backlog(skb, get_cpu(), &qtail); put_cpu(); } -#endif return ret; } EXPORT_SYMBOL(netif_rx); @@ -3359,7 +3360,7 @@ int netif_receive_skb(struct sk_buff *skb) return NET_RX_SUCCESS; #ifdef CONFIG_RPS - { + if (static_branch(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu, ret; @@ -3370,16 +3371,12 @@ int netif_receive_skb(struct sk_buff *skb) if (cpu >= 0) { ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - } else { - rcu_read_unlock(); - ret = __netif_receive_skb(skb); + return ret; } - - return ret; + rcu_read_unlock(); } -#else - return __netif_receive_skb(skb); #endif + return __netif_receive_skb(skb); } EXPORT_SYMBOL(netif_receive_skb); -- cgit v1.2.3 From 5bc1421e34ecfe0bd4b26dc3232b7d5e25179144 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 22 Nov 2011 05:10:51 +0000 Subject: net: add network priority cgroup infrastructure (v4) This patch adds in the infrastructure code to create the network priority cgroup. The cgroup, in addition to the standard processes file creates two control files: 1) prioidx - This is a read-only file that exports the index of this cgroup. This is a value that is both arbitrary and unique to a cgroup in this subsystem, and is used to index the per-device priority map 2) priomap - This is a writeable file. On read it reports a table of 2-tuples where name is the name of a network interface and priority is indicates the priority assigned to frames egresessing on the named interface and originating from a pid in this cgroup This cgroup allows for skb priority to be set prior to a root qdisc getting selected. This is benenficial for DCB enabled systems, in that it allows for any application to use dcb configured priorities so without application modification Signed-off-by: Neil Horman Signed-off-by: John Fastabend CC: Robert Love CC: "David S. Miller" Signed-off-by: David S. Miller --- net/core/dev.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index f78959996148..8afb244b205f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2449,6 +2449,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } +#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +static void skb_update_prio(struct sk_buff *skb) +{ + struct netprio_map *map = rcu_dereference(skb->dev->priomap); + + if ((!skb->priority) && (skb->sk) && map) + skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx]; +} +#else +#define skb_update_prio(skb) +#endif + static DEFINE_PER_CPU(int, xmit_recursion); #define RECURSION_LIMIT 10 @@ -2489,6 +2501,8 @@ int dev_queue_xmit(struct sk_buff *skb) */ rcu_read_lock_bh(); + skb_update_prio(skb); + txq = dev_pick_tx(dev, skb); q = rcu_dereference_bh(txq->qdisc); -- cgit v1.2.3 From 5cac98dd06bc43a7baab3523184f70fd359e9f35 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 27 Nov 2011 21:14:46 +0000 Subject: net: Fix corruption in /proc/*/net/dev_mcast I just hit this during my testing. Isn't there another bug lurking? BUG kmalloc-8: Redzone overwritten INFO: 0xc0000000de9dec48-0xc0000000de9dec4b. First byte 0x0 instead of 0xcc INFO: Allocated in .__seq_open_private+0x30/0xa0 age=0 cpu=5 pid=3896 .__kmalloc+0x1e0/0x2d0 .__seq_open_private+0x30/0xa0 .seq_open_net+0x60/0xe0 .dev_mc_seq_open+0x4c/0x70 .proc_reg_open+0xd8/0x260 .__dentry_open.clone.11+0x2b8/0x400 .do_last+0xf4/0x950 .path_openat+0xf8/0x480 .do_filp_open+0x48/0xc0 .do_sys_open+0x140/0x250 syscall_exit+0x0/0x40 dev_mc_seq_ops uses dev_seq_start/next/stop but only allocates sizeof(struct seq_net_private) of private data, whereas it expects sizeof(struct dev_iter_state): struct dev_iter_state { struct seq_net_private p; unsigned int pos; /* bucket << BUCKET_SPACE + offset */ }; Create dev_seq_open_ops and use it so we don't have to expose struct dev_iter_state. [ Problem added by commit f04565ddf52e4 (dev: use name hash for dev_seq_ops) -Eric ] Signed-off-by: Anton Blanchard Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 6ba50a1e404c..1482eea0bbf0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4282,6 +4282,12 @@ static int dev_seq_open(struct inode *inode, struct file *file) sizeof(struct dev_iter_state)); } +int dev_seq_open_ops(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + return seq_open_net(inode, file, ops, sizeof(struct dev_iter_state)); +} + static const struct file_operations dev_seq_fops = { .owner = THIS_MODULE, .open = dev_seq_open, -- cgit v1.2.3 From 4504b8613b6c65d7787a434f8dcf7901bfe3983d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 28 Nov 2011 05:23:23 +0000 Subject: net: use skb_flow_dissect() in __skb_get_rxhash() No functional changes. This uses the code we factorized in skb_flow_dissect() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 125 +++++++-------------------------------------------------- 1 file changed, 14 insertions(+), 111 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 8afb244b205f..962e3de25a35 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -133,11 +133,9 @@ #include #include #include -#include -#include -#include #include #include +#include #include "net-sysfs.h" @@ -2598,123 +2596,28 @@ static inline void ____napi_schedule(struct softnet_data *sd, */ void __skb_get_rxhash(struct sk_buff *skb) { - int nhoff, hash = 0, poff; - const struct ipv6hdr *ip6; - const struct iphdr *ip; - const struct vlan_hdr *vlan; - u8 ip_proto; - u32 addr1, addr2; - u16 proto; - union { - u32 v32; - u16 v16[2]; - } ports; - - nhoff = skb_network_offset(skb); - proto = skb->protocol; - -again: - switch (proto) { - case __constant_htons(ETH_P_IP): -ip: - if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) - goto done; - - ip = (const struct iphdr *) (skb->data + nhoff); - if (ip_is_fragment(ip)) - ip_proto = 0; - else - ip_proto = ip->protocol; - addr1 = (__force u32) ip->saddr; - addr2 = (__force u32) ip->daddr; - nhoff += ip->ihl * 4; - break; - case __constant_htons(ETH_P_IPV6): -ipv6: - if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) - goto done; - - ip6 = (const struct ipv6hdr *) (skb->data + nhoff); - ip_proto = ip6->nexthdr; - addr1 = (__force u32) ip6->saddr.s6_addr32[3]; - addr2 = (__force u32) ip6->daddr.s6_addr32[3]; - nhoff += 40; - break; - case __constant_htons(ETH_P_8021Q): - if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff)) - goto done; - vlan = (const struct vlan_hdr *) (skb->data + nhoff); - proto = vlan->h_vlan_encapsulated_proto; - nhoff += sizeof(*vlan); - goto again; - case __constant_htons(ETH_P_PPP_SES): - if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff)) - goto done; - proto = *((__be16 *) (skb->data + nhoff + - sizeof(struct pppoe_hdr))); - nhoff += PPPOE_SES_HLEN; - switch (proto) { - case __constant_htons(PPP_IP): - goto ip; - case __constant_htons(PPP_IPV6): - goto ipv6; - default: - goto done; - } - default: - goto done; - } - - switch (ip_proto) { - case IPPROTO_GRE: - if (pskb_may_pull(skb, nhoff + 16)) { - u8 *h = skb->data + nhoff; - __be16 flags = *(__be16 *)h; + struct flow_keys keys; + u32 hash; - /* - * Only look inside GRE if version zero and no - * routing - */ - if (!(flags & (GRE_VERSION|GRE_ROUTING))) { - proto = *(__be16 *)(h + 2); - nhoff += 4; - if (flags & GRE_CSUM) - nhoff += 4; - if (flags & GRE_KEY) - nhoff += 4; - if (flags & GRE_SEQ) - nhoff += 4; - goto again; - } - } - break; - case IPPROTO_IPIP: - goto again; - default: - break; - } + if (!skb_flow_dissect(skb, &keys)) + return; - ports.v32 = 0; - poff = proto_ports_offset(ip_proto); - if (poff >= 0) { - nhoff += poff; - if (pskb_may_pull(skb, nhoff + 4)) { - ports.v32 = * (__force u32 *) (skb->data + nhoff); - if (ports.v16[1] < ports.v16[0]) - swap(ports.v16[0], ports.v16[1]); - skb->l4_rxhash = 1; - } + if (keys.ports) { + if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]) + swap(keys.port16[0], keys.port16[1]); + skb->l4_rxhash = 1; } /* get a consistent hash (same value on both flow directions) */ - if (addr2 < addr1) - swap(addr1, addr2); + if ((__force u32)keys.dst < (__force u32)keys.src) + swap(keys.dst, keys.src); - hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports, hashrnd); if (!hash) hash = 1; -done: skb->rxhash = hash; } EXPORT_SYMBOL(__skb_get_rxhash); -- cgit v1.2.3 From b90e5794c5bdef91d26c623e992257947c506e35 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 28 Nov 2011 11:16:50 +0000 Subject: net: dont call jump_label_dec from irq context Igor Maravic reported an error caused by jump_label_dec() being called from IRQ context : BUG: sleeping function called from invalid context at kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 0, name: swapper 1 lock held by swapper/0: #0: (&n->timer){+.-...}, at: [] call_timer_fn+0x0/0x340 Pid: 0, comm: swapper Not tainted 3.2.0-rc2-net-next-mpls+ #1 Call Trace: [] __might_sleep+0x137/0x1f0 [] mutex_lock_nested+0x2f/0x370 [] ? trace_hardirqs_off+0xd/0x10 [] ? local_clock+0x6f/0x80 [] ? lock_release_holdtime.part.22+0x15/0x1a0 [] ? sock_def_write_space+0x59/0x160 [] ? arp_error_report+0x3e/0x90 [] atomic_dec_and_mutex_lock+0x5d/0x80 [] jump_label_dec+0x1d/0x50 [] net_disable_timestamp+0x15/0x20 [] sock_disable_timestamp+0x45/0x50 [] __sk_free+0x80/0x200 [] ? sk_send_sigurg+0x70/0x70 [] ? arp_error_report+0x3e/0x90 [] sock_wfree+0x3a/0x70 [] skb_release_head_state+0x70/0x120 [] __kfree_skb+0x16/0x30 [] kfree_skb+0x49/0x170 [] arp_error_report+0x3e/0x90 [] neigh_invalidate+0x89/0xc0 [] neigh_timer_handler+0x9e/0x2a0 [] ? neigh_update+0x640/0x640 [] __do_softirq+0xc8/0x3a0 Since jump_label_{inc|dec} must be called from process context only, we must defer jump_label_dec() if net_disable_timestamp() is called from interrupt context. Reported-by: Igor Maravic Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 962e3de25a35..c7ef6c5d3782 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1441,15 +1441,38 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) EXPORT_SYMBOL(call_netdevice_notifiers); static struct jump_label_key netstamp_needed __read_mostly; +#ifdef HAVE_JUMP_LABEL +/* We are not allowed to call jump_label_dec() from irq context + * If net_disable_timestamp() is called from irq context, defer the + * jump_label_dec() calls. + */ +static atomic_t netstamp_needed_deferred; +#endif void net_enable_timestamp(void) { +#ifdef HAVE_JUMP_LABEL + int deferred = atomic_xchg(&netstamp_needed_deferred, 0); + + if (deferred) { + while (--deferred) + jump_label_dec(&netstamp_needed); + return; + } +#endif + WARN_ON(in_interrupt()); jump_label_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { +#ifdef HAVE_JUMP_LABEL + if (in_interrupt()) { + atomic_inc(&netstamp_needed_deferred); + return; + } +#endif jump_label_dec(&netstamp_needed); } EXPORT_SYMBOL(net_disable_timestamp); -- cgit v1.2.3 From 7346649826382b769cfadf4a2fe8a84d060c55e9 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 28 Nov 2011 16:32:44 +0000 Subject: net: Add queue state xoff flag for stack Create separate queue state flags so that either the stack or drivers can turn on XOFF. Added a set of functions used in the stack to determine if a queue is really stopped (either by stack or driver) Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index c7ef6c5d3782..cb8f753b4238 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2270,7 +2270,7 @@ gso: return rc; } txq_trans_update(txq); - if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) + if (unlikely(netif_xmit_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); @@ -2558,7 +2558,7 @@ int dev_queue_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, txq, cpu); - if (!netif_tx_queue_stopped(txq)) { + if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); rc = dev_hard_start_xmit(skb, dev, txq); __this_cpu_dec(xmit_recursion); -- cgit v1.2.3 From 114cf5802165ee93e3ab461c9c505cd94a08b800 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 28 Nov 2011 16:33:09 +0000 Subject: bql: Byte queue limits Networking stack support for byte queue limits, uses dynamic queue limits library. Byte queue limits are maintained per transmit queue, and a dql structure has been added to netdev_queue structure for this purpose. Configuration of bql is in the tx- sysfs directory for the queue under the byte_queue_limits directory. Configuration includes: limit_min, bql minimum limit limit_max, bql maximum limit hold_time, bql slack hold time Also under the directory are: limit, current byte limit inflight, current number of bytes on the queue Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index cb8f753b4238..91a599122074 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5470,6 +5470,9 @@ static void netdev_init_one_queue(struct net_device *dev, queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; +#ifdef CONFIG_BQL + dql_init(&queue->dql, HZ); +#endif } static int netif_alloc_netdev_queues(struct net_device *dev) -- cgit v1.2.3 From 6977a79d36baf8b295b1893621874202e1d02094 Mon Sep 17 00:00:00 2001 From: Igor Maravic Date: Fri, 25 Nov 2011 07:44:54 +0000 Subject: net: Fix skb_update_prio RCU usage. Change function rcu_dereference to rcu_dereference_bh to avoid warning [ INFO: suspicious RCU usage. ] ------------------------------- net/core/dev.c:2459 suspicious rcu_dereference_check() usage! because we are locking with rcu_read_lock_bh(); in function dev_queue_xmit(struct sk_buff *skb) Signed-off-by: Igor Maravic Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 91a599122074..278463e91e3a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2473,7 +2473,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, #if IS_ENABLED(CONFIG_NETPRIO_CGROUP) static void skb_update_prio(struct sk_buff *skb) { - struct netprio_map *map = rcu_dereference(skb->dev->priomap); + struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); if ((!skb->priority) && (skb->sk) && map) skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx]; -- cgit v1.2.3 From 8f891489866ec62a87494eff3ed17c88152c32d4 Mon Sep 17 00:00:00 2001 From: "RongQing.Li" Date: Wed, 30 Nov 2011 23:43:07 -0500 Subject: net/core: fix rollback handler in register_netdevice_notifier Within nested statements, the break statement terminates only the do, for, switch, or while statement that immediately encloses it, So replace the break with goto. Signed-off-by: RongQing.Li Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 1482eea0bbf0..5a13edfc9f73 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1396,7 +1396,7 @@ rollback: for_each_net(net) { for_each_netdev(net, dev) { if (dev == last) - break; + goto outroll; if (dev->flags & IFF_UP) { nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); @@ -1407,6 +1407,7 @@ rollback: } } +outroll: raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } -- cgit v1.2.3 From b536db9332cf90c4f44ca809f028645205fa89ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Nov 2011 21:42:26 +0000 Subject: net: net_device flags is an unsigned int commit b00055aacdb ([NET] core: add RFC2863 operstate) changed net_device flags from unsigned short to unsigned int. Some core functions still assume its an unsigned short. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 278463e91e3a..e0c3deec59b0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4459,7 +4459,7 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) static int __dev_set_promiscuity(struct net_device *dev, int inc) { - unsigned short old_flags = dev->flags; + unsigned int old_flags = dev->flags; uid_t uid; gid_t gid; @@ -4516,7 +4516,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc) */ int dev_set_promiscuity(struct net_device *dev, int inc) { - unsigned short old_flags = dev->flags; + unsigned int old_flags = dev->flags; int err; err = __dev_set_promiscuity(dev, inc); @@ -4543,7 +4543,7 @@ EXPORT_SYMBOL(dev_set_promiscuity); int dev_set_allmulti(struct net_device *dev, int inc) { - unsigned short old_flags = dev->flags; + unsigned int old_flags = dev->flags; ASSERT_RTNL(); @@ -4646,7 +4646,7 @@ EXPORT_SYMBOL(dev_get_flags); int __dev_change_flags(struct net_device *dev, unsigned int flags) { - int old_flags = dev->flags; + unsigned int old_flags = dev->flags; int ret; ASSERT_RTNL(); @@ -4729,10 +4729,10 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) * Change settings on device based state flags. The flags are * in the userspace exported format. */ -int dev_change_flags(struct net_device *dev, unsigned flags) +int dev_change_flags(struct net_device *dev, unsigned int flags) { - int ret, changes; - int old_flags = dev->flags; + int ret; + unsigned int changes, old_flags = dev->flags; ret = __dev_change_flags(dev, flags); if (ret < 0) -- cgit v1.2.3 From e52ac3398c3d772d372b9b62ab408fd5eec96840 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 16 Jan 2012 12:38:59 +0000 Subject: net: Use device model to get driver name in skb_gso_segment() ethtool operations generally require the caller to hold RTNL and are not safe to call in atomic context. The device model provides this information for most devices; we'll only lose it for some old ISA drivers. Signed-off-by: Ben Hutchings Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index f494675471a9..7e6b7dcaacde 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1962,13 +1962,13 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { struct net_device *dev = skb->dev; - struct ethtool_drvinfo info = {}; + const char *driver = ""; - if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) - dev->ethtool_ops->get_drvinfo(dev, &info); + if (dev && dev->dev.parent) + driver = dev_driver_string(dev->dev.parent); WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d ip_summed=%d\n", - info.driver, dev ? &dev->features : NULL, + driver, dev ? &dev->features : NULL, skb->sk ? &skb->sk->sk_route_caps : NULL, skb->len, skb->data_len, skb->ip_summed); -- cgit v1.2.3 From 36c92474498ad6cc3afb24b91a67a444d79978fe Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 17 Jan 2012 07:57:56 +0000 Subject: net: WARN if skb_checksum_help() is called on skb requiring segmentation skb_checksum_help() has never done anything useful with skbs that require segmentation. Setting skb->ip_summed = CHECKSUM_NONE makes them invalid and provokes a later WARNing in skb_gso_segment(). Passing such an skb to skb_checksum_help() indicates a bug, so we should warn about it immediately. Move the warning from skb_gso_segment() into a shared function, and add gso_type and gso_size to it. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 7e6b7dcaacde..17db2f2e5236 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1887,6 +1887,22 @@ void skb_set_dev(struct sk_buff *skb, struct net_device *dev) EXPORT_SYMBOL(skb_set_dev); #endif /* CONFIG_NET_NS */ +static void skb_warn_bad_offload(const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + const char *driver = ""; + + if (dev && dev->dev.parent) + driver = dev_driver_string(dev->dev.parent); + + WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " + "gso_type=%d ip_summed=%d\n", + driver, dev ? &dev->features : NULL, + skb->sk ? &skb->sk->sk_route_caps : NULL, + skb->len, skb->data_len, skb_shinfo(skb)->gso_size, + skb_shinfo(skb)->gso_type, skb->ip_summed); +} + /* * Invalidate hardware checksum when packet is to be mangled, and * complete checksum manually on outgoing path. @@ -1900,8 +1916,8 @@ int skb_checksum_help(struct sk_buff *skb) goto out_set_summed; if (unlikely(skb_shinfo(skb)->gso_size)) { - /* Let GSO fix up the checksum. */ - goto out_set_summed; + skb_warn_bad_offload(skb); + return -EINVAL; } offset = skb_checksum_start_offset(skb); @@ -1961,16 +1977,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, __skb_pull(skb, skb->mac_len); if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - struct net_device *dev = skb->dev; - const char *driver = ""; - - if (dev && dev->dev.parent) - driver = dev_driver_string(dev->dev.parent); - - WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d ip_summed=%d\n", - driver, dev ? &dev->features : NULL, - skb->sk ? &skb->sk->sk_route_caps : NULL, - skb->len, skb->data_len, skb->ip_summed); + skb_warn_bad_offload(skb); if (skb_header_cloned(skb) && (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) -- cgit v1.2.3 From 65e9d2faab70d07b9a38ac6ed298f191d24541fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Tue, 17 Jan 2012 10:00:40 +0000 Subject: net: fix NULL-deref in WARN() in skb_gso_segment() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug was introduced in commit c8f44affb7244f2ac3e703cab13d55ede27621bb. Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- net/core/dev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 17db2f2e5236..115dee1d985d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1889,6 +1889,7 @@ EXPORT_SYMBOL(skb_set_dev); static void skb_warn_bad_offload(const struct sk_buff *skb) { + static const netdev_features_t null_features = 0; struct net_device *dev = skb->dev; const char *driver = ""; @@ -1897,8 +1898,8 @@ static void skb_warn_bad_offload(const struct sk_buff *skb) WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " "gso_type=%d ip_summed=%d\n", - driver, dev ? &dev->features : NULL, - skb->sk ? &skb->sk->sk_route_caps : NULL, + driver, dev ? &dev->features : &null_features, + skb->sk ? &skb->sk->sk_route_caps : &null_features, skb->len, skb->data_len, skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, skb->ip_summed); } -- cgit v1.2.3 From 5ca3b72c5da47d95b83857b768def6172fbc080a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Feb 2012 08:51:50 +0000 Subject: gro: more generic L2 header check Shlomo Pongratz reported GRO L2 header check was suited for Ethernet only, and failed on IB/ipoib traffic. He provided a patch faking a zeroed header to let GRO aggregates frames. Roland Dreier, Herbert Xu, and others suggested we change GRO L2 header check to be more generic, ie not assuming L2 header is 14 bytes, but taking into account hard_header_len. __napi_gro_receive() has special handling for the common case (Ethernet) to avoid a memcmp() call and use an inline optimized function instead. Signed-off-by: Eric Dumazet Reported-by: Shlomo Pongratz Cc: Roland Dreier Cc: Or Gerlitz Cc: Herbert Xu Tested-by: Sean Hefty Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 115dee1d985d..6ca32f6b3105 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3500,14 +3500,20 @@ static inline gro_result_t __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; + unsigned int maclen = skb->dev->hard_header_len; for (p = napi->gro_list; p; p = p->next) { unsigned long diffs; diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; - diffs |= compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_gro_mac_header(skb), + maclen); NAPI_GRO_CB(p)->same_flow = !diffs; NAPI_GRO_CB(p)->flush = 0; } -- cgit v1.2.3