From 76620aafd66f0004829764940c5466144969cffc Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 16 Apr 2009 02:02:07 -0700 Subject: gro: New frags interface to avoid copying shinfo It turns out that copying a 16-byte area at ~800k times a second can be really expensive :) This patch redesigns the frags GRO interface to avoid copying that area twice. The two disciples of the frags interface have been converted. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 81 ++++++++++++++++++++++++++-------------------------------- 1 file changed, 36 insertions(+), 45 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 91d792d17e09..619fa141b8f5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2519,16 +2519,10 @@ void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) } EXPORT_SYMBOL(napi_reuse_skb); -struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, - struct napi_gro_fraginfo *info) +struct sk_buff *napi_get_frags(struct napi_struct *napi) { struct net_device *dev = napi->dev; struct sk_buff *skb = napi->skb; - struct ethhdr *eth; - skb_frag_t *frag; - int i; - - napi->skb = NULL; if (!skb) { skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN); @@ -2536,47 +2530,14 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, goto out; skb_reserve(skb, NET_IP_ALIGN); - } - - BUG_ON(info->nr_frags > MAX_SKB_FRAGS); - frag = &info->frags[info->nr_frags - 1]; - for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) { - skb_fill_page_desc(skb, i, frag->page, frag->page_offset, - frag->size); - frag++; + napi->skb = skb; } - skb_shinfo(skb)->nr_frags = info->nr_frags; - - skb->data_len = info->len; - skb->len += info->len; - skb->truesize += info->len; - - skb_reset_mac_header(skb); - skb_gro_reset_offset(skb); - - eth = skb_gro_header(skb, sizeof(*eth)); - if (!eth) { - napi_reuse_skb(napi, skb); - skb = NULL; - goto out; - } - - skb_gro_pull(skb, sizeof(*eth)); - - /* - 
* This works because the only protocols we care about don't require - * special handling. We'll fix it up properly at the end. - */ - skb->protocol = eth->h_proto; - - skb->ip_summed = info->ip_summed; - skb->csum = info->csum; out: return skb; } -EXPORT_SYMBOL(napi_fraginfo_skb); +EXPORT_SYMBOL(napi_get_frags); int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) { @@ -2606,9 +2567,39 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) } EXPORT_SYMBOL(napi_frags_finish); -int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) +struct sk_buff *napi_frags_skb(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + struct ethhdr *eth; + + napi->skb = NULL; + + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb); + + eth = skb_gro_header(skb, sizeof(*eth)); + if (!eth) { + napi_reuse_skb(napi, skb); + skb = NULL; + goto out; + } + + skb_gro_pull(skb, sizeof(*eth)); + + /* + * This works because the only protocols we care about don't require + * special handling. We'll fix it up properly at the end. + */ + skb->protocol = eth->h_proto; + +out: + return skb; +} +EXPORT_SYMBOL(napi_frags_skb); + +int napi_gro_frags(struct napi_struct *napi) { - struct sk_buff *skb = napi_fraginfo_skb(napi, info); + struct sk_buff *skb = napi_frags_skb(napi); if (!skb) return NET_RX_DROP; @@ -2712,7 +2703,7 @@ void netif_napi_del(struct napi_struct *napi) struct sk_buff *skb, *next; list_del_init(&napi->dev_list); - kfree_skb(napi->skb); + napi_free_frags(napi); for (skb = napi->gro_list; skb; skb = next) { next = skb->next; -- cgit v1.2.3 From edbd9e30306067c3a45c035eb95a6f49daaa2337 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 27 Apr 2009 05:44:29 -0700 Subject: gro: Fix handling of headers that extend over the tail The skb_gro_* code fails to handle the case where a header starts in the linear area but ends in the frags area. 
Since the goal of skb_gro_* is to optimise the case of completely non-linear packets, we can simply bail out if we have anything in the linear area. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index e48c08af76ad..6785b067ad50 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2378,18 +2378,13 @@ void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) unsigned int offset = skb_gro_offset(skb); hlen += offset; - if (hlen <= skb_headlen(skb)) - return skb->data + offset; - - if (unlikely(!skb_shinfo(skb)->nr_frags || - skb_shinfo(skb)->frags[0].size <= - hlen - skb_headlen(skb) || + if (unlikely(skb_headlen(skb) || + skb_shinfo(skb)->frags[0].size < hlen || PageHighMem(skb_shinfo(skb)->frags[0].page))) return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; return page_address(skb_shinfo(skb)->frags[0].page) + - skb_shinfo(skb)->frags[0].page_offset + - offset - skb_headlen(skb); + skb_shinfo(skb)->frags[0].page_offset + offset; } EXPORT_SYMBOL(skb_gro_header); -- cgit v1.2.3 From 513de11bba246b7a67df4c314d9fc936b6a75d0e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 3 May 2009 14:43:10 -0700 Subject: net: Avoid modulus in skb_tx_hash() for forwarding case. Based almost entirely upon a patch by Eric Dumazet. The common case is to have num-tx-queues <= num_rx_queues and even if num_tx_queues is larger it will not be significantly larger. Therefore, a subtraction loop is always going to be faster than modulus. Signed-off-by: David S. 
Miller --- net/core/dev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 81442957c5c2..3c8073fe970a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1735,8 +1735,12 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) { u32 hash; - if (skb_rx_queue_recorded(skb)) - return skb_get_rx_queue(skb) % dev->real_num_tx_queues; + if (skb_rx_queue_recorded(skb)) { + hash = skb_get_rx_queue(skb); + while (unlikely (hash >= dev->real_num_tx_queues)) + hash -= dev->real_num_tx_queues; + return hash; + } if (skb->sk && skb->sk->sk_hash) hash = skb->sk->sk_hash; -- cgit v1.2.3 From f001fde5eadd915f4858d22ed70d7040f48767cf Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 5 May 2009 02:48:28 +0000 Subject: net: introduce a list of device addresses dev_addr_list (v6) v5 -> v6 (current): -removed so far unused static functions -corrected dev_addr_del_multiple to call del instead of add v4 -> v5: -added device address type (suggested by davem) -removed refcounting (better to have simplier code then safe potentially few bytes) v3 -> v4: -changed kzalloc to kmalloc in __hw_addr_add_ii() -ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init() v2 -> v3: -removed unnecessary rcu read locking -moved dev_addr_flush() calling to ensure no null dereference of dev_addr v1 -> v2: -added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush -removed unnecessary rcu_read locking in dev_addr_init -use compare_ether_addr_64bits instead of compare_ether_addr -use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr -use call_rcu instead of rcu_synchronize -moved is_etherdev_addr into __KERNEL__ ifdef This patch introduces a new list in struct net_device and brings a set of functions to handle the work with device address list. 
The list is a replacement for the original dev_addr field and because in some situations there is need to carry several device addresses with the net device. To be backward compatible, dev_addr is made to point to the first member of the list so original drivers sees no difference. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 250 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 250 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 3c8073fe970a..637ea71b0a0d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3434,6 +3434,252 @@ void dev_set_rx_mode(struct net_device *dev) netif_addr_unlock_bh(dev); } +/* hw addresses list handling functions */ + +static int __hw_addr_add(struct list_head *list, unsigned char *addr, + int addr_len, unsigned char addr_type) +{ + struct netdev_hw_addr *ha; + int alloc_size; + + if (addr_len > MAX_ADDR_LEN) + return -EINVAL; + + alloc_size = sizeof(*ha); + if (alloc_size < L1_CACHE_BYTES) + alloc_size = L1_CACHE_BYTES; + ha = kmalloc(alloc_size, GFP_ATOMIC); + if (!ha) + return -ENOMEM; + memcpy(ha->addr, addr, addr_len); + ha->type = addr_type; + list_add_tail_rcu(&ha->list, list); + return 0; +} + +static void ha_rcu_free(struct rcu_head *head) +{ + struct netdev_hw_addr *ha; + + ha = container_of(head, struct netdev_hw_addr, rcu_head); + kfree(ha); +} + +static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr, + int addr_len, unsigned char addr_type, + int ignore_index) +{ + struct netdev_hw_addr *ha; + int i = 0; + + list_for_each_entry(ha, list, list) { + if (i++ != ignore_index && + !memcmp(ha->addr, addr, addr_len) && + (ha->type == addr_type || !addr_type)) { + list_del_rcu(&ha->list); + call_rcu(&ha->rcu_head, ha_rcu_free); + return 0; + } + } + return -ENOENT; +} + +static int __hw_addr_add_multiple_ii(struct list_head *to_list, + struct list_head *from_list, + int addr_len, unsigned char addr_type, + int 
ignore_index) +{ + int err; + struct netdev_hw_addr *ha, *ha2; + unsigned char type; + + list_for_each_entry(ha, from_list, list) { + type = addr_type ? addr_type : ha->type; + err = __hw_addr_add(to_list, ha->addr, addr_len, type); + if (err) + goto unroll; + } + return 0; + +unroll: + list_for_each_entry(ha2, from_list, list) { + if (ha2 == ha) + break; + type = addr_type ? addr_type : ha2->type; + __hw_addr_del_ii(to_list, ha2->addr, addr_len, type, + ignore_index); + } + return err; +} + +static void __hw_addr_del_multiple_ii(struct list_head *to_list, + struct list_head *from_list, + int addr_len, unsigned char addr_type, + int ignore_index) +{ + struct netdev_hw_addr *ha; + unsigned char type; + + list_for_each_entry(ha, from_list, list) { + type = addr_type ? addr_type : ha->type; + __hw_addr_del_ii(to_list, ha->addr, addr_len, addr_type, + ignore_index); + } +} + +static void __hw_addr_flush(struct list_head *list) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, list, list) { + list_del_rcu(&ha->list); + call_rcu(&ha->rcu_head, ha_rcu_free); + } +} + +/* Device addresses handling functions */ + +static void dev_addr_flush(struct net_device *dev) +{ + /* rtnl_mutex must be held here */ + + __hw_addr_flush(&dev->dev_addr_list); + dev->dev_addr = NULL; +} + +static int dev_addr_init(struct net_device *dev) +{ + unsigned char addr[MAX_ADDR_LEN]; + struct netdev_hw_addr *ha; + int err; + + /* rtnl_mutex must be held here */ + + INIT_LIST_HEAD(&dev->dev_addr_list); + memset(addr, 0, sizeof(*addr)); + err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr), + NETDEV_HW_ADDR_T_LAN); + if (!err) { + /* + * Get the first (previously created) address from the list + * and set dev_addr pointer to this location. 
+ */ + ha = list_first_entry(&dev->dev_addr_list, + struct netdev_hw_addr, list); + dev->dev_addr = ha->addr; + } + return err; +} + +/** + * dev_addr_add - Add a device address + * @dev: device + * @addr: address to add + * @addr_type: address type + * + * Add a device address to the device or increase the reference count if + * it already exists. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + err = __hw_addr_add(&dev->dev_addr_list, addr, dev->addr_len, + addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add); + +/** + * dev_addr_del - Release a device address. + * @dev: device + * @addr: address to delete + * @addr_type: address type + * + * Release reference to a device address and remove it from the device + * if the reference count drops to zero. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_del(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, + addr_type, 0); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_del); + +/** + * dev_addr_add_multiple - Add device addresses from another device + * @to_dev: device to which addresses will be added + * @from_dev: device from which addresses will be added + * @addr_type: address type - 0 means type will be used from from_dev + * + * Add device addresses of the one device to another. + ** + * The caller must hold the rtnl_mutex. 
+ */ +int dev_addr_add_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list, + &from_dev->dev_addr_list, + to_dev->addr_len, addr_type, 0); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add_multiple); + +/** + * dev_addr_del_multiple - Delete device addresses by another device + * @to_dev: device where the addresses will be deleted + * @from_dev: device by which addresses the addresses will be deleted + * @addr_type: address type - 0 means type will used from from_dev + * + * Deletes addresses in to device by the list of addresses in from device. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_del_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + __hw_addr_del_multiple_ii(&to_dev->dev_addr_list, + &from_dev->dev_addr_list, + to_dev->addr_len, addr_type, 0); + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return 0; +} +EXPORT_SYMBOL(dev_addr_del_multiple); + +/* unicast and multicast addresses handling functions */ + int __dev_addr_delete(struct dev_addr_list **list, int *count, void *addr, int alen, int glbl) { @@ -4776,6 +5022,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; + dev_addr_init(dev); netdev_init_queues(dev); INIT_LIST_HEAD(&dev->napi_list); @@ -4801,6 +5048,9 @@ void free_netdev(struct net_device *dev) kfree(dev->_tx); + /* Flush device addresses */ + dev_addr_flush(dev); + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); -- cgit v1.2.3 From ab9c73ccb52f40576ce017528d542eda3c6ae766 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 8 May 2009 13:30:17 
+0000 Subject: net: check retval of dev_addr_init() Add missed checking of dev_addr_init return value in alloc_netdev_mq. Signed-off-by: Jiri Pirko net/core/dev.c | 15 ++++++++++++--- 1 files changed, 12 insertions(+), 3 deletions(-) Signed-off-by: David S. Miller --- net/core/dev.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 637ea71b0a0d..14dd725aaab7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5007,13 +5007,16 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, if (!tx) { printk(KERN_ERR "alloc_netdev: Unable to allocate " "tx qdiscs.\n"); - kfree(p); - return NULL; + goto free_p; } dev = (struct net_device *) (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); dev->padded = (char *)dev - (char *)p; + + if (dev_addr_init(dev)) + goto free_tx; + dev_net_set(dev, &init_net); dev->_tx = tx; @@ -5022,13 +5025,19 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; - dev_addr_init(dev); netdev_init_queues(dev); INIT_LIST_HEAD(&dev->napi_list); setup(dev); strcpy(dev->name, name); return dev; + +free_tx: + kfree(tx); + +free_p: + kfree(p); + return NULL; } EXPORT_SYMBOL(alloc_netdev_mq); -- cgit v1.2.3 From 7004bf252c53da18f6b55103e0c92f777f846806 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 May 2009 00:34:33 +0000 Subject: net: add tx_packets/tx_bytes/tx_dropped counters in struct netdev_queue offsetof(struct net_device, features)=0x44 offsetof(struct net_device, stats.tx_packets)=0x54 offsetof(struct net_device, stats.tx_bytes)=0x5c offsetof(struct net_device, stats.tx_dropped)=0x6c Network drivers that touch dev->stats.tx_packets/stats.tx_bytes in their tx path can slow down SMP operations, since they dirty a cache line that should stay shared (dev->features is needed in rx and tx paths) We could move away stats field in net_device but it wont help that much. 
(Two cache lines dirtied in tx path, we can do one only) Better solution is to add tx_packets/tx_bytes/tx_dropped in struct netdev_queue because this structure is already touched in tx path and counters updates will then be free (no increase in size) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 14dd725aaab7..6d3630d16271 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4943,13 +4943,30 @@ void netdev_run_todo(void) * the internal statistics structure is used. */ const struct net_device_stats *dev_get_stats(struct net_device *dev) - { +{ const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_get_stats) return ops->ndo_get_stats(dev); - else - return &dev->stats; + else { + unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0; + struct net_device_stats *stats = &dev->stats; + unsigned int i; + struct netdev_queue *txq; + + for (i = 0; i < dev->num_tx_queues; i++) { + txq = netdev_get_tx_queue(dev, i); + tx_bytes += txq->tx_bytes; + tx_packets += txq->tx_packets; + tx_dropped += txq->tx_dropped; + } + if (tx_bytes || tx_packets || tx_dropped) { + stats->tx_bytes = tx_bytes; + stats->tx_packets = tx_packets; + stats->tx_dropped = tx_dropped; + } + return stats; + } } EXPORT_SYMBOL(dev_get_stats); -- cgit v1.2.3 From 93f154b594fe47e4a7e5358b309add449a046cd3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 May 2009 22:19:19 -0700 Subject: net: release dst entry in dev_hard_start_xmit() One point of contention in high network loads is the dst_release() performed when a transmited skb is freed. This is because NIC tx completion calls dev_kree_skb() long after original call to dev_queue_xmit(skb). CPU cache is cold and the atomic op in dst_release() stalls. 
On SMP, this is quite visible if one CPU is 100% handling softirqs for a network device, since dst_clone() is done by other cpus, involving cache line ping pongs. It seems the right place to release dst is in dev_hard_start_xmit(), for most devices but ones that are virtual, and some exceptions. David Miller suggested to define a new device flag, set in alloc_netdev_mq() (so that most devices set it at init time), and carefully unset in devices which don't want a NULL skb->dst in their ndo_start_xmit(). The list of devices that must clear this flag is: - loopback device, because it calls netif_rx() and quoting Patrick: "ip_route_input() doesn't accept loopback addresses, so loopback packets already need to have a dst_entry attached." - appletalk/ipddp.c: needs skb->dst in its xmit function - And all devices that call dev_queue_xmit() again from their xmit function (as some classifiers need skb->dst): bonding, vlan, macvlan, eql, ifb, hdlc_fr Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- net/core/dev.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 6d3630d16271..92ebeca29901 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1688,6 +1688,14 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, goto gso; } + /* + * If device doesnt need skb->dst, release it right now while + * its hot in this cpu cache + */ + if ((dev->priv_flags & IFF_XMIT_DST_RELEASE) && skb->dst) { + dst_release(skb->dst); + skb->dst = NULL; + } rc = ops->ndo_start_xmit(skb, dev); /* * TODO: if skb_orphan() was called by @@ -5045,6 +5053,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, netdev_init_queues(dev); INIT_LIST_HEAD(&dev->napi_list); + dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); strcpy(dev->name, name); return dev; -- cgit v1.2.3 From 4ea7e38696c7e798c47ebbecadfd392f23f814f9 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 21 May 2009 07:36:08 +0000 Subject: dropmon: add ability to detect when hardware dropsrxpackets Patch to add the ability to detect drops in hardware interfaces via dropwatch. Adds a tracepoint to net_rx_action to signal everytime a napi instance is polled. The dropmon code then periodically checks to see if the rx_frames counter has changed, and if so, adds a drop notification to the netlink protocol, using the reserved all-0's vector to indicate the drop location was in hardware, rather than somewhere in the code. Signed-off-by: Neil Horman include/linux/net_dropmon.h | 8 ++ include/trace/napi.h | 11 +++ net/core/dev.c | 5 + net/core/drop_monitor.c | 124 ++++++++++++++++++++++++++++++++++++++++++-- net/core/net-traces.c | 4 + net/core/netpoll.c | 2 6 files changed, 149 insertions(+), 5 deletions(-) Signed-off-by: David S. 
Miller --- net/core/dev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 92ebeca29901..3942266d1f6c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -126,6 +126,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -2771,8 +2772,10 @@ static void net_rx_action(struct softirq_action *h) * accidently calling ->poll() when NAPI is not scheduled. */ work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) + if (test_bit(NAPI_STATE_SCHED, &n->state)) { work = n->poll(n, weight); + trace_napi_poll(n); + } WARN_ON_ONCE(work > weight); -- cgit v1.2.3 From e3804cbebb67887879102925961d41b503f7fbe3 Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Mon, 25 May 2009 01:53:53 -0700 Subject: net: remove COMPAT_NET_DEV_OPS All drivers are already converted to new net_device_ops API and nobody uses old API anymore. Signed-off-by: Alexander Beregalov Signed-off-by: David S. Miller --- net/core/dev.c | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 3942266d1f6c..241613f6dd2f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4580,39 +4580,6 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) } EXPORT_SYMBOL(netdev_fix_features); -/* Some devices need to (re-)set their netdev_ops inside - * ->init() or similar. If that happens, we have to setup - * the compat pointers again. 
- */ -void netdev_resync_ops(struct net_device *dev) -{ -#ifdef CONFIG_COMPAT_NET_DEV_OPS - const struct net_device_ops *ops = dev->netdev_ops; - - dev->init = ops->ndo_init; - dev->uninit = ops->ndo_uninit; - dev->open = ops->ndo_open; - dev->change_rx_flags = ops->ndo_change_rx_flags; - dev->set_rx_mode = ops->ndo_set_rx_mode; - dev->set_multicast_list = ops->ndo_set_multicast_list; - dev->set_mac_address = ops->ndo_set_mac_address; - dev->validate_addr = ops->ndo_validate_addr; - dev->do_ioctl = ops->ndo_do_ioctl; - dev->set_config = ops->ndo_set_config; - dev->change_mtu = ops->ndo_change_mtu; - dev->neigh_setup = ops->ndo_neigh_setup; - dev->tx_timeout = ops->ndo_tx_timeout; - dev->get_stats = ops->ndo_get_stats; - dev->vlan_rx_register = ops->ndo_vlan_rx_register; - dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; - dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; -#ifdef CONFIG_NET_POLL_CONTROLLER - dev->poll_controller = ops->ndo_poll_controller; -#endif -#endif -} -EXPORT_SYMBOL(netdev_resync_ops); - /** * register_netdevice - register a network device * @dev: device to register @@ -4652,23 +4619,6 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; -#ifdef CONFIG_COMPAT_NET_DEV_OPS - /* Netdevice_ops API compatibility support. - * This is temporary until all network devices are converted. - */ - if (dev->netdev_ops) { - netdev_resync_ops(dev); - } else { - char drivername[64]; - pr_info("%s (%s): not using net_device_ops yet\n", - dev->name, netdev_drivername(dev, drivername, 64)); - - /* This works only because net_device_ops and the - compatibility structure are the same. 
*/ - dev->netdev_ops = (void *) &(dev->init); - } -#endif - /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); -- cgit v1.2.3 From 08baf561083bc27a953aa087dd8a664bb2b88e8e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 May 2009 22:58:01 -0700 Subject: net: txq_trans_update() helper We would like to get rid of netdev->trans_start = jiffies; that about all net drivers have to use in their start_xmit() function, and use txq->trans_start instead. This can be done generically in core network, as suggested by David. Some devices, (particularly loopback) dont need trans_start update, because they dont have transmit watchdog. We could add a new device flag, or rely on fact that txq->tran_start can be updated is txq->xmit_lock_owner is different than -1. Use a helper function to hide our choice. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 241613f6dd2f..5eb3e48ab31d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1698,6 +1698,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->dst = NULL; } rc = ops->ndo_start_xmit(skb, dev); + if (rc == 0) + txq_trans_update(txq); /* * TODO: if skb_orphan() was called by * dev->hard_start_xmit() (for example, the unmodified @@ -1727,6 +1729,7 @@ gso: skb->next = nskb; return rc; } + txq_trans_update(txq); if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); -- cgit v1.2.3 From 78a478d0efd9e86e5345b436e130497b4e5846e8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:21 +0000 Subject: gro: Inline skb_gro_header and cache frag0 virtual address The function skb_gro_header is called four times per packet which quickly adds up at 10Gb/s. This patch inlines it to allow better optimisations. 
Some architectures perform multiplication for page_address, which is done by each skb_gro_header invocation. This patch caches that value in skb->cb to avoid the unnecessary multiplications. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 5eb3e48ab31d..bdb1a738193d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2390,21 +2390,6 @@ void napi_gro_flush(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_flush); -void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) -{ - unsigned int offset = skb_gro_offset(skb); - - hlen += offset; - if (unlikely(skb_headlen(skb) || - skb_shinfo(skb)->frags[0].size < hlen || - PageHighMem(skb_shinfo(skb)->frags[0].page))) - return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; - - return page_address(skb_shinfo(skb)->frags[0].page) + - skb_shinfo(skb)->frags[0].page_offset + offset; -} -EXPORT_SYMBOL(skb_gro_header); - int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; @@ -2520,6 +2505,18 @@ int napi_skb_finish(int ret, struct sk_buff *skb) } EXPORT_SYMBOL(napi_skb_finish); +void skb_gro_reset_offset(struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->data_offset = 0; + NAPI_GRO_CB(skb)->frag0 = NULL; + + if (!skb_headlen(skb) && !PageHighMem(skb_shinfo(skb)->frags[0].page)) + NAPI_GRO_CB(skb)->frag0 = + page_address(skb_shinfo(skb)->frags[0].page) + + skb_shinfo(skb)->frags[0].page_offset; +} +EXPORT_SYMBOL(skb_gro_reset_offset); + int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { skb_gro_reset_offset(skb); -- cgit v1.2.3 From 78d3fd0b7de844a6dad56e9620fc9d2271b32ab9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:23 +0000 Subject: gro: Only use skb_gro_header for completely non-linear packets Currently skb_gro_header is used for packets which put 
the hardware header in skb->data with the rest in frags. Since the drivers that need this optimisation all provide completely non-linear packets, we can gain extra optimisations by only performing the frag0 optimisation for completely non-linear packets. In particular, we can simply test frag0 (instead of skb_headlen) to see whether the optimisation is in force. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index bdb1a738193d..f9d90c56b6f0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2510,7 +2510,8 @@ void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->data_offset = 0; NAPI_GRO_CB(skb)->frag0 = NULL; - if (!skb_headlen(skb) && !PageHighMem(skb_shinfo(skb)->frags[0].page)) + if (skb->mac_header == skb->tail && + !PageHighMem(skb_shinfo(skb)->frags[0].page)) NAPI_GRO_CB(skb)->frag0 = page_address(skb_shinfo(skb)->frags[0].page) + skb_shinfo(skb)->frags[0].page_offset; -- cgit v1.2.3 From 7489594cb249aeb178287c9a43a9e4f366044259 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:27 +0000 Subject: gro: Optimise length comparison in skb_gro_header By caching frag0_len, we can avoid checking both frag0 and the length separately in skb_gro_header. This helps as skb_gro_header is called four times per packet which amounts to a few million times at 10Gb/s. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/core/dev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index f9d90c56b6f0..b1722a2d1fbe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2509,12 +2509,15 @@ void skb_gro_reset_offset(struct sk_buff *skb) { NAPI_GRO_CB(skb)->data_offset = 0; NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; if (skb->mac_header == skb->tail && - !PageHighMem(skb_shinfo(skb)->frags[0].page)) + !PageHighMem(skb_shinfo(skb)->frags[0].page)) { NAPI_GRO_CB(skb)->frag0 = page_address(skb_shinfo(skb)->frags[0].page) + skb_shinfo(skb)->frags[0].page_offset; + NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size; + } } EXPORT_SYMBOL(skb_gro_reset_offset); -- cgit v1.2.3 From a5b1cf288d4200506ab62fbb86cc81ace948a306 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:28 +0000 Subject: gro: Avoid unnecessary comparison after skb_gro_header For the overwhelming majority of cases, skb_gro_header's return value cannot be NULL. Yet we must check it because of its current form. This patch splits it up into multiple functions in order to avoid this. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/core/dev.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index b1722a2d1fbe..cd29e613bc5a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2590,17 +2590,24 @@ struct sk_buff *napi_frags_skb(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; struct ethhdr *eth; + unsigned int hlen; + unsigned int off; napi->skb = NULL; skb_reset_mac_header(skb); skb_gro_reset_offset(skb); - eth = skb_gro_header(skb, sizeof(*eth)); - if (!eth) { - napi_reuse_skb(napi, skb); - skb = NULL; - goto out; + off = skb_gro_offset(skb); + hlen = off + sizeof(*eth); + eth = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + eth = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!eth)) { + napi_reuse_skb(napi, skb); + skb = NULL; + goto out; + } } skb_gro_pull(skb, sizeof(*eth)); -- cgit v1.2.3 From cb18978cbf454c236db5e4191a12ef71eef9b3a0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:31 +0000 Subject: gro: Open-code final pskb_may_pull As we know the only packets which need the final pskb_may_pull are completely non-linear, and have all the required bits in frag0, we can perform a straight memcpy instead of going through pskb_may_pull and doing skb_copy_bits. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/core/dev.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index cd29e613bc5a..ed4550fd9ece 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2452,10 +2452,25 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) ret = GRO_HELD; pull: - if (unlikely(!pskb_may_pull(skb, skb_gro_offset(skb)))) { - if (napi->gro_list == skb) - napi->gro_list = skb->next; - ret = GRO_DROP; + if (skb_headlen(skb) < skb_gro_offset(skb)) { + int grow = skb_gro_offset(skb) - skb_headlen(skb); + + BUG_ON(skb->end - skb->tail < grow); + + memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + + skb->tail += grow; + skb->data_len -= grow; + + skb_shinfo(skb)->frags[0].page_offset += grow; + skb_shinfo(skb)->frags[0].size -= grow; + + if (unlikely(!skb_shinfo(skb)->frags[0].size)) { + put_page(skb_shinfo(skb)->frags[0].page); + memmove(skb_shinfo(skb)->frags, + skb_shinfo(skb)->frags + 1, + --skb_shinfo(skb)->nr_frags); + } } ok: -- cgit v1.2.3 From 1ce8e7b57b3a4527ef83da1c5c7bd8a6b9d87b56 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 May 2009 04:42:37 +0000 Subject: net: ALIGN/PTR_ALIGN cleanup in alloc_netdev_mq()/netdev_priv() Use ALIGN() and PTR_ALIGN() macros instead of handcoding them. Get rid of NETDEV_ALIGN_CONST ugly define Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index ed4550fd9ece..32ceee17896e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4988,18 +4988,18 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, struct netdev_queue *tx; struct net_device *dev; size_t alloc_size; - void *p; + struct net_device *p; BUG_ON(strlen(name) >= sizeof(dev->name)); alloc_size = sizeof(struct net_device); if (sizeof_priv) { /* ensure 32-byte alignment of private area */ - alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; + alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); alloc_size += sizeof_priv; } /* ensure 32-byte alignment of whole construct */ - alloc_size += NETDEV_ALIGN_CONST; + alloc_size += NETDEV_ALIGN - 1; p = kzalloc(alloc_size, GFP_KERNEL); if (!p) { @@ -5014,8 +5014,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, goto free_p; } - dev = (struct net_device *) - (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); + dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; if (dev_addr_init(dev)) -- cgit v1.2.3 From ccffad25b5136958d4769ed6de5e87992dd9c65c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 22 May 2009 23:22:17 +0000 Subject: net: convert unicast addr list This patch converts the unicast address list to a standard list_head using the previously introduced struct netdev_hw_addr. It also relaxes the locking. The original spinlock (still used for multicast addresses) is not needed and is no longer used to protect this list. All reading and writing takes place under rtnl (with no changes). I also removed the possibility to specify the length of the address while adding or deleting a unicast address. It's always dev->addr_len. The conversion touched especially the e1000 and ixgbe code, where the change is not so trivial. 
Signed-off-by: Jiri Pirko drivers/net/bnx2.c | 13 +-- drivers/net/e1000/e1000_main.c | 24 +++-- drivers/net/ixgbe/ixgbe_common.c | 14 ++-- drivers/net/ixgbe/ixgbe_common.h | 4 +- drivers/net/ixgbe/ixgbe_main.c | 6 +- drivers/net/ixgbe/ixgbe_type.h | 4 +- drivers/net/macvlan.c | 11 +- drivers/net/mv643xx_eth.c | 11 +- drivers/net/niu.c | 7 +- drivers/net/virtio_net.c | 7 +- drivers/s390/net/qeth_l2_main.c | 6 +- drivers/scsi/fcoe/fcoe.c | 16 ++-- include/linux/netdevice.h | 18 ++-- net/8021q/vlan.c | 4 +- net/8021q/vlan_dev.c | 10 +- net/core/dev.c | 195 +++++++++++++++++++++++++++----------- net/dsa/slave.c | 10 +- net/packet/af_packet.c | 4 +- 18 files changed, 227 insertions(+), 137 deletions(-) Signed-off-by: David S. Miller --- net/core/dev.c | 195 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 138 insertions(+), 57 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 32ceee17896e..e2fcc5f10177 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3473,8 +3473,9 @@ void dev_set_rx_mode(struct net_device *dev) /* hw addresses list handling functions */ -static int __hw_addr_add(struct list_head *list, unsigned char *addr, - int addr_len, unsigned char addr_type) +static int __hw_addr_add(struct list_head *list, int *delta, + unsigned char *addr, int addr_len, + unsigned char addr_type) { struct netdev_hw_addr *ha; int alloc_size; @@ -3482,6 +3483,15 @@ static int __hw_addr_add(struct list_head *list, unsigned char *addr, if (addr_len > MAX_ADDR_LEN) return -EINVAL; + list_for_each_entry(ha, list, list) { + if (!memcmp(ha->addr, addr, addr_len) && + ha->type == addr_type) { + ha->refcount++; + return 0; + } + } + + alloc_size = sizeof(*ha); if (alloc_size < L1_CACHE_BYTES) alloc_size = L1_CACHE_BYTES; @@ -3490,7 +3500,11 @@ static int __hw_addr_add(struct list_head *list, unsigned char *addr, return -ENOMEM; memcpy(ha->addr, addr, addr_len); ha->type = addr_type; + ha->refcount = 1; + ha->synced = 
false; list_add_tail_rcu(&ha->list, list); + if (delta) + (*delta)++; return 0; } @@ -3502,29 +3516,30 @@ static void ha_rcu_free(struct rcu_head *head) kfree(ha); } -static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr, - int addr_len, unsigned char addr_type, - int ignore_index) +static int __hw_addr_del(struct list_head *list, int *delta, + unsigned char *addr, int addr_len, + unsigned char addr_type) { struct netdev_hw_addr *ha; - int i = 0; list_for_each_entry(ha, list, list) { - if (i++ != ignore_index && - !memcmp(ha->addr, addr, addr_len) && + if (!memcmp(ha->addr, addr, addr_len) && (ha->type == addr_type || !addr_type)) { + if (--ha->refcount) + return 0; list_del_rcu(&ha->list); call_rcu(&ha->rcu_head, ha_rcu_free); + if (delta) + (*delta)--; return 0; } } return -ENOENT; } -static int __hw_addr_add_multiple_ii(struct list_head *to_list, - struct list_head *from_list, - int addr_len, unsigned char addr_type, - int ignore_index) +static int __hw_addr_add_multiple(struct list_head *to_list, int *to_delta, + struct list_head *from_list, int addr_len, + unsigned char addr_type) { int err; struct netdev_hw_addr *ha, *ha2; @@ -3532,7 +3547,8 @@ static int __hw_addr_add_multiple_ii(struct list_head *to_list, list_for_each_entry(ha, from_list, list) { type = addr_type ? addr_type : ha->type; - err = __hw_addr_add(to_list, ha->addr, addr_len, type); + err = __hw_addr_add(to_list, to_delta, ha->addr, + addr_len, type); if (err) goto unroll; } @@ -3543,27 +3559,69 @@ unroll: if (ha2 == ha) break; type = addr_type ? 
addr_type : ha2->type; - __hw_addr_del_ii(to_list, ha2->addr, addr_len, type, - ignore_index); + __hw_addr_del(to_list, to_delta, ha2->addr, + addr_len, type); } return err; } -static void __hw_addr_del_multiple_ii(struct list_head *to_list, - struct list_head *from_list, - int addr_len, unsigned char addr_type, - int ignore_index) +static void __hw_addr_del_multiple(struct list_head *to_list, int *to_delta, + struct list_head *from_list, int addr_len, + unsigned char addr_type) { struct netdev_hw_addr *ha; unsigned char type; list_for_each_entry(ha, from_list, list) { type = addr_type ? addr_type : ha->type; - __hw_addr_del_ii(to_list, ha->addr, addr_len, addr_type, - ignore_index); + __hw_addr_del(to_list, to_delta, ha->addr, + addr_len, addr_type); + } +} + +static int __hw_addr_sync(struct list_head *to_list, int *to_delta, + struct list_head *from_list, int *from_delta, + int addr_len) +{ + int err = 0; + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, from_list, list) { + if (!ha->synced) { + err = __hw_addr_add(to_list, to_delta, ha->addr, + addr_len, ha->type); + if (err) + break; + ha->synced = true; + ha->refcount++; + } else if (ha->refcount == 1) { + __hw_addr_del(to_list, to_delta, ha->addr, + addr_len, ha->type); + __hw_addr_del(from_list, from_delta, ha->addr, + addr_len, ha->type); + } } + return err; } +static void __hw_addr_unsync(struct list_head *to_list, int *to_delta, + struct list_head *from_list, int *from_delta, + int addr_len) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, from_list, list) { + if (ha->synced) { + __hw_addr_del(to_list, to_delta, ha->addr, + addr_len, ha->type); + ha->synced = false; + __hw_addr_del(from_list, from_delta, ha->addr, + addr_len, ha->type); + } + } +} + + static void __hw_addr_flush(struct list_head *list) { struct netdev_hw_addr *ha, *tmp; @@ -3594,7 +3652,7 @@ static int dev_addr_init(struct net_device *dev) INIT_LIST_HEAD(&dev->dev_addr_list); memset(addr, 
0, sizeof(*addr)); - err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr), + err = __hw_addr_add(&dev->dev_addr_list, NULL, addr, sizeof(*addr), NETDEV_HW_ADDR_T_LAN); if (!err) { /* @@ -3626,7 +3684,7 @@ int dev_addr_add(struct net_device *dev, unsigned char *addr, ASSERT_RTNL(); - err = __hw_addr_add(&dev->dev_addr_list, addr, dev->addr_len, + err = __hw_addr_add(&dev->dev_addr_list, NULL, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); @@ -3649,11 +3707,20 @@ int dev_addr_del(struct net_device *dev, unsigned char *addr, unsigned char addr_type) { int err; + struct netdev_hw_addr *ha; ASSERT_RTNL(); - err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, - addr_type, 0); + /* + * We can not remove the first address from the list because + * dev->dev_addr points to that. + */ + ha = list_first_entry(&dev->dev_addr_list, struct netdev_hw_addr, list); + if (ha->addr == dev->dev_addr && ha->refcount == 1) + return -ENOENT; + + err = __hw_addr_del(&dev->dev_addr_list, NULL, addr, dev->addr_len, + addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return err; @@ -3680,9 +3747,9 @@ int dev_addr_add_multiple(struct net_device *to_dev, if (from_dev->addr_len != to_dev->addr_len) return -EINVAL; - err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list, - &from_dev->dev_addr_list, - to_dev->addr_len, addr_type, 0); + err = __hw_addr_add_multiple(&to_dev->dev_addr_list, NULL, + &from_dev->dev_addr_list, + to_dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); return err; @@ -3707,9 +3774,9 @@ int dev_addr_del_multiple(struct net_device *to_dev, if (from_dev->addr_len != to_dev->addr_len) return -EINVAL; - __hw_addr_del_multiple_ii(&to_dev->dev_addr_list, - &from_dev->dev_addr_list, - to_dev->addr_len, addr_type, 0); + __hw_addr_del_multiple(&to_dev->dev_addr_list, NULL, + &from_dev->dev_addr_list, + to_dev->addr_len, addr_type); 
call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); return 0; } @@ -3779,24 +3846,22 @@ int __dev_addr_add(struct dev_addr_list **list, int *count, * dev_unicast_delete - Release secondary unicast address. * @dev: device * @addr: address to delete - * @alen: length of @addr * * Release reference to a secondary unicast address and remove it * from the device if the reference count drops to zero. * * The caller must hold the rtnl_mutex. */ -int dev_unicast_delete(struct net_device *dev, void *addr, int alen) +int dev_unicast_delete(struct net_device *dev, void *addr) { int err; ASSERT_RTNL(); - netif_addr_lock_bh(dev); - err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0); + err = __hw_addr_del(&dev->uc_list, &dev->uc_count, addr, + dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); - netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_unicast_delete); @@ -3805,24 +3870,22 @@ EXPORT_SYMBOL(dev_unicast_delete); * dev_unicast_add - add a secondary unicast address * @dev: device * @addr: address to add - * @alen: length of @addr * * Add a secondary unicast address to the device or increase * the reference count if it already exists. * * The caller must hold the rtnl_mutex. */ -int dev_unicast_add(struct net_device *dev, void *addr, int alen) +int dev_unicast_add(struct net_device *dev, void *addr) { int err; ASSERT_RTNL(); - netif_addr_lock_bh(dev); - err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0); + err = __hw_addr_add(&dev->uc_list, &dev->uc_count, addr, + dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); - netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_unicast_add); @@ -3879,8 +3942,7 @@ void __dev_addr_unsync(struct dev_addr_list **to, int *to_count, * @from: source device * * Add newly added addresses to the destination device and release - * addresses that have no users left. The source device must be - * locked by netif_tx_lock_bh. 
+ * addresses that have no users left. * * This function is intended to be called from the dev->set_rx_mode * function of layered software devices. @@ -3889,12 +3951,15 @@ int dev_unicast_sync(struct net_device *to, struct net_device *from) { int err = 0; - netif_addr_lock_bh(to); - err = __dev_addr_sync(&to->uc_list, &to->uc_count, - &from->uc_list, &from->uc_count); + ASSERT_RTNL(); + + if (to->addr_len != from->addr_len) + return -EINVAL; + + err = __hw_addr_sync(&to->uc_list, &to->uc_count, + &from->uc_list, &from->uc_count, to->addr_len); if (!err) __dev_set_rx_mode(to); - netif_addr_unlock_bh(to); return err; } EXPORT_SYMBOL(dev_unicast_sync); @@ -3910,18 +3975,33 @@ EXPORT_SYMBOL(dev_unicast_sync); */ void dev_unicast_unsync(struct net_device *to, struct net_device *from) { - netif_addr_lock_bh(from); - netif_addr_lock(to); + ASSERT_RTNL(); - __dev_addr_unsync(&to->uc_list, &to->uc_count, - &from->uc_list, &from->uc_count); - __dev_set_rx_mode(to); + if (to->addr_len != from->addr_len) + return; - netif_addr_unlock(to); - netif_addr_unlock_bh(from); + __hw_addr_unsync(&to->uc_list, &to->uc_count, + &from->uc_list, &from->uc_count, to->addr_len); + __dev_set_rx_mode(to); } EXPORT_SYMBOL(dev_unicast_unsync); +static void dev_unicast_flush(struct net_device *dev) +{ + /* rtnl_mutex must be held here */ + + __hw_addr_flush(&dev->uc_list); + dev->uc_count = 0; +} + +static void dev_unicast_init(struct net_device *dev) +{ + /* rtnl_mutex must be held here */ + + INIT_LIST_HEAD(&dev->uc_list); +} + + static void __dev_addr_discard(struct dev_addr_list **list) { struct dev_addr_list *tmp; @@ -3940,9 +4020,6 @@ static void dev_addr_discard(struct net_device *dev) { netif_addr_lock_bh(dev); - __dev_addr_discard(&dev->uc_list); - dev->uc_count = 0; - __dev_addr_discard(&dev->mc_list); dev->mc_count = 0; @@ -4535,6 +4612,7 @@ static void rollback_registered(struct net_device *dev) /* * Flush the unicast and multicast chains */ + dev_unicast_flush(dev); 
dev_addr_discard(dev); if (dev->netdev_ops->ndo_uninit) @@ -5020,6 +5098,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, if (dev_addr_init(dev)) goto free_tx; + dev_unicast_init(dev); + dev_net_set(dev, &init_net); dev->_tx = tx; @@ -5223,6 +5303,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* * Flush the unicast and multicast chains */ + dev_unicast_flush(dev); dev_addr_discard(dev); netdev_unregister_kobject(dev); -- cgit v1.2.3 From adf30907d63893e4208dfe3f5c88ae12bc2f25d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Jun 2009 05:19:30 +0000 Subject: net: skb->dst accessors Define three accessors to get/set dst attached to a skb struct dst_entry *skb_dst(const struct sk_buff *skb) void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) void skb_dst_drop(struct sk_buff *skb) This one should replace occurrences of : dst_release(skb->dst) skb->dst = NULL; Delete skb->dst field Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index e2fcc5f10177..34b49a6a22fd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1693,10 +1693,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, * If device doesnt need skb->dst, release it right now while * its hot in this cpu cache */ - if ((dev->priv_flags & IFF_XMIT_DST_RELEASE) && skb->dst) { - dst_release(skb->dst); - skb->dst = NULL; - } + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + rc = ops->ndo_start_xmit(skb, dev); if (rc == 0) txq_trans_update(txq); -- cgit v1.2.3 From 3b8bcfd5d31ea0fec58681d035544ace707d2536 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 30 May 2009 01:39:53 +0200 Subject: net: introduce pre-up netdev notifier NETDEV_UP is called after the device is set UP, but sometimes it is useful to be able to veto the device UP. Introduce a new NETDEV_PRE_UP notifier that can be used for exactly this. The first use case will be cfg80211 denying interfaces to be set UP if the device is known to be rfkill'ed. Signed-off-by: Johannes Berg Acked-by: David S. Miller Signed-off-by: John W. 
Linville --- net/core/dev.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 34b49a6a22fd..1f38401fc028 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1048,7 +1048,7 @@ void dev_load(struct net *net, const char *name) int dev_open(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; - int ret = 0; + int ret; ASSERT_RTNL(); @@ -1065,6 +1065,11 @@ int dev_open(struct net_device *dev) if (!netif_device_present(dev)) return -ENODEV; + ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); + ret = notifier_to_errno(ret); + if (ret) + return ret; + /* * Call device private open method */ -- cgit v1.2.3 From 4cf704fbea96075942bd033fd75aa4e76ae1c8a1 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Jun 2009 00:18:51 -0700 Subject: net/core/dev.c: Use frag list abstraction interfaces. Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 1f38401fc028..4913089c91dc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1820,7 +1820,7 @@ int dev_queue_xmit(struct sk_buff *skb) if (netif_needs_gso(dev, skb)) goto gso; - if (skb_shinfo(skb)->frag_list && + if (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST) && __skb_linearize(skb)) goto out_kfree_skb; @@ -2407,7 +2407,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) if (!(skb->dev->features & NETIF_F_GRO)) goto normal; - if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list) + if (skb_is_gso(skb) || skb_has_frags(skb)) goto normal; rcu_read_lock(); -- cgit v1.2.3 From 0c27922e4933ceb86644f4a9b1af212ffe5aad75 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Jun 2009 03:49:24 +0000 Subject: net: dev_addr_init() fix commit f001fde5eadd915f4858d22ed70d7040f48767cf (net: introduce a list of device addresses dev_addr_list (v6)) added one 
regression Vegard Nossum found in his testing. With the help of kmemcheck, Vegard found that some uninitialized memory was read and reported to the user, potentially leaking kernel data. ( thread can be found on http://lkml.org/lkml/2009/5/30/177 ) dev_addr_init() incorrectly uses sizeof() operator. We were initializing one byte instead of MAX_ADDR_LEN bytes. Reported-by: Vegard Nossum Signed-off-by: Eric Dumazet Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 4913089c91dc..81b392ef5114 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3655,8 +3655,8 @@ static int dev_addr_init(struct net_device *dev) /* rtnl_mutex must be held here */ INIT_LIST_HEAD(&dev->dev_addr_list); - memset(addr, 0, sizeof(*addr)); - err = __hw_addr_add(&dev->dev_addr_list, NULL, addr, sizeof(*addr), + memset(addr, 0, sizeof(addr)); + err = __hw_addr_add(&dev->dev_addr_list, NULL, addr, sizeof(addr), NETDEV_HW_ADDR_T_LAN); if (!err) { /* -- cgit v1.2.3 From fcb94e422479da52ed90bab230c59617a0462416 Mon Sep 17 00:00:00 2001 From: Sergey Lapin Date: Mon, 8 Jun 2009 12:18:47 +0000 Subject: Add constants for the ieee 802.15.4 stack IEEE 802.15.4 stack requires several constants to be defined/adjusted. Signed-off-by: Dmitry Eremin-Solenikov Signed-off-by: Sergey Lapin Signed-off-by: David S. 
Miller --- net/core/dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 81b392ef5114..11560e3258b5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -269,7 +269,8 @@ static const unsigned short netdev_lock_type[] = ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, - ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE}; + ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_IEEE802154_PHY, + ARPHRD_VOID, ARPHRD_NONE}; static const char *netdev_lock_name[] = {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", @@ -286,7 +287,8 @@ static const char *netdev_lock_name[] = "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", - "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"}; + "_xmit_PHONET_PIPE", "_xmit_IEEE802154", "_xmit_IEEE802154_PHY", + "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; -- cgit v1.2.3 From 746e6ad23cd6fec2edce056e014a0eabeffa838c Mon Sep 17 00:00:00 2001 From: John Dykstra Date: Thu, 11 Jun 2009 20:57:21 -0700 Subject: [PATCH] net core: Some interface flags not returned by SIOCGIFFLAGS Commit b00055aacdb172c05067612278ba27265fcd05ce " [NET] core: add RFC2863 operstate" defined new interface flag values. Its documentation specified that these flags could be accessed from user space via SIOCGIFFLAGS. However, this does not work because the new flags do not fit in that ioctl's argument width. Change the documentation to match the code's behavior. Also change the source to explicitly show the truncation. 
This _should_ have no effect on executable code, and did not with gcc 4.2.4 generating x86 code. A new ioctl could be defined to return all interface flags to user space. However, since this has been broken for three years with no one complaining, there doesn't seem much need. They are still accessible via netlink. Reported-by: "Fredrik Arnerup" Signed-off-by: John Dykstra Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 11560e3258b5..a09bf658970f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4209,7 +4209,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm switch (cmd) { case SIOCGIFFLAGS: /* Get interface flags */ - ifr->ifr_flags = dev_get_flags(dev); + ifr->ifr_flags = (short) dev_get_flags(dev); return 0; case SIOCGIFMETRIC: /* Get the metric on the interface -- cgit v1.2.3 From da6782927de809d9d427bd4bd6a4024243e41f13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 5 Jun 2009 05:35:28 +0000 Subject: bridge: Simplify interface for ATM LANE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch changes FDB entry check for ATM LANE bridge integration. There's no point in holding a FDB entry around SKB building. br_fdb_get()/br_fdb_put() pair are changed into single br_fdb_test_addr() hook that checks if the addr has FDB entry pointing to other port to the one the request arrived on. FDB entry refcounting is removed as it's not used anywhere else. Signed-off-by: Michał Mirosław Acked-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/core/dev.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index a09bf658970f..ea00e36f48e1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2071,11 +2071,13 @@ static inline int deliver_skb(struct sk_buff *skb, } #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) -/* These hooks defined here for ATM */ -struct net_bridge; -struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, - unsigned char *addr); -void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly; + +#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE) +/* This hook is defined here for ATM LANE */ +int (*br_fdb_test_addr_hook)(struct net_device *dev, + unsigned char *addr) __read_mostly; +EXPORT_SYMBOL(br_fdb_test_addr_hook); +#endif /* * If bridge module is loaded call bridging hook. @@ -2083,6 +2085,8 @@ void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly; */ struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff *skb) __read_mostly; +EXPORT_SYMBOL(br_handle_frame_hook); + static inline struct sk_buff *handle_bridge(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) @@ -5665,12 +5669,6 @@ EXPORT_SYMBOL(net_enable_timestamp); EXPORT_SYMBOL(net_disable_timestamp); EXPORT_SYMBOL(dev_get_flags); -#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) -EXPORT_SYMBOL(br_handle_frame_hook); -EXPORT_SYMBOL(br_fdb_get_hook); -EXPORT_SYMBOL(br_fdb_put_hook); -#endif - EXPORT_SYMBOL(dev_load); EXPORT_PER_CPU_SYMBOL(softnet_data); -- cgit v1.2.3