From 553675fb5e9ce3d71aa6cb527f98cd34793c0dbc Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 16 May 2013 11:35:20 +0000 Subject: vxlan: listen on multiple ports The commit 823aa873bc782f1c51b1ce8ec6da7cfcaf93836e Author: stephen hemminger Date: Sat Apr 27 11:31:57 2013 +0000 vxlan: allow choosing destination port per vxlan introduced per-vxlan UDP port configuration but only did half of the necessary work. It added per vxlan destination for sending, but overlooked the handling of multiple ports for incoming traffic. This patch changes the listening port management to handle multiple incoming UDP ports. The earlier per-namespace structure is now a hash list per namespace. It is also now possible to define the same virtual network id but with different UDP port values which can be useful for migration. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 263 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 181 insertions(+), 82 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ba81f3c39a83..80d359c6310b 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -44,6 +44,8 @@ #define VXLAN_VERSION "0.1" +#define PORT_HASH_BITS 8 +#define PORT_HASH_SIZE (1<vni_list[hash_32(id, VNI_HASH_BITS)]; +} + +/* Socket hash table head */ +static inline struct hlist_head *vs_head(struct net *net, __be16 port) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); - return &vn->vni_list[hash_32(id, VNI_HASH_BITS)]; + return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; +} + +/* Find VXLAN socket based on network namespace and UDP port */ +static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port) +{ + struct vxlan_sock *vs; + + hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { + if (inet_sk(vs->sock->sk)->inet_sport == port) + return vs; + } + return NULL; } /* Look up VNI in a per net namespace table */ -static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id) +static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port) { + struct vxlan_sock *vs; struct vxlan_dev *vxlan; - hlist_for_each_entry_rcu(vxlan, vni_head(net, id), hlist) { + vs = vxlan_find_port(net, port); + if (!vs) + return NULL; + + hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) { if (vxlan->default_dst.remote_vni == id) return vxlan; } @@ -592,20 +631,18 @@ static void vxlan_snoop(struct net_device *dev, static bool vxlan_group_used(struct vxlan_net *vn, const struct vxlan_dev *this) { - const struct vxlan_dev *vxlan; - unsigned h; + struct vxlan_dev *vxlan; - for (h = 0; h < VNI_HASH_SIZE; ++h) - hlist_for_each_entry(vxlan, &vn->vni_list[h], hlist) { - if (vxlan == this) - continue; + list_for_each_entry(vxlan, &vn->vxlan_list, next) { + if (vxlan == this) + continue; - if (!netif_running(vxlan->dev)) - continue; + if (!netif_running(vxlan->dev)) + continue; - if (vxlan->default_dst.remote_ip == this->default_dst.remote_ip) - return true; - } + if (vxlan->default_dst.remote_ip == this->default_dst.remote_ip) + return true; + } return false; } @@ -615,7 +652,7 @@ static int vxlan_join_group(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); - struct sock *sk = vn->sock->sk; + struct sock *sk = vxlan->vn_sock->sock->sk; struct ip_mreqn mreq = { .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, .imr_ifindex = vxlan->default_dst.remote_ifindex, @@ -643,7 +680,7 @@ static int vxlan_leave_group(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); int err = 0; - struct sock *sk = vn->sock->sk; + struct sock *sk = vxlan->vn_sock->sock->sk; struct ip_mreqn mreq = { .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, .imr_ifindex = vxlan->default_dst.remote_ifindex, @@ -670,6 +707,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) struct vxlanhdr *vxh; struct vxlan_dev *vxlan; struct pcpu_tstats *stats; + __be16 port; __u32 vni; int err; @@ -693,9 +731,11 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) /* Is this VNI defined? */ vni = ntohl(vxh->vx_vni) >> 8; - vxlan = vxlan_find_vni(sock_net(sk), vni); + port = inet_sk(sk)->inet_sport; + vxlan = vxlan_find_vni(sock_net(sk), vni, port); if (!vxlan) { - netdev_dbg(skb->dev, "unknown vni %d\n", vni); + netdev_dbg(skb->dev, "unknown vni %d port %u\n", + vni, ntohs(port)); goto drop; } @@ -875,7 +915,7 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) return false; } -static void vxlan_sock_free(struct sk_buff *skb) +static void vxlan_sock_put(struct sk_buff *skb) { sock_put(skb->sk); } @@ -883,13 +923,13 @@ static void vxlan_sock_free(struct sk_buff *skb) /* On transmit, associate with the tunnel socket */ static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb) { - struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); - struct sock *sk = vn->sock->sk; + struct vxlan_dev *vxlan = netdev_priv(dev); + struct sock *sk = vxlan->vn_sock->sock->sk; skb_orphan(skb); sock_hold(sk); skb->sk = sk; - skb->destructor = vxlan_sock_free; + skb->destructor = vxlan_sock_put; } /* Compute source port for outgoing packet @@ -1031,7 +1071,7 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_dev *dst_vxlan; ip_rt_put(rt); - dst_vxlan = vxlan_find_vni(dev_net(dev), vni); + dst_vxlan = vxlan_find_vni(dev_net(dev), vni, dst_port); if (!dst_vxlan) goto tx_error; vxlan_encap_bypass(skb, vxlan, dst_vxlan); @@ -1306,6 +1346,7 @@ static void vxlan_setup(struct net_device *dev) dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + INIT_LIST_HEAD(&vxlan->next); spin_lock_init(&vxlan->hash_lock); init_timer_deferrable(&vxlan->age_timer); @@ -1390,11 +1431,78 @@ static const struct ethtool_ops vxlan_ethtool_ops = { .get_link = ethtool_op_get_link, }; +static void vxlan_del_work(struct work_struct *work) +{ + struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work); + + sk_release_kernel(vs->sock->sk); + kfree_rcu(vs, rcu); +} + +/* Create new listen socket if needed */ +static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) +{ + struct vxlan_sock *vs; + struct sock *sk; + struct sockaddr_in vxlan_addr = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + }; + int rc; + unsigned h; + + vs = kmalloc(sizeof(*vs), GFP_KERNEL); + if (!vs) + return ERR_PTR(-ENOMEM); + + for (h = 0; h < VNI_HASH_SIZE; ++h) + INIT_HLIST_HEAD(&vs->vni_list[h]); + + INIT_WORK(&vs->del_work, vxlan_del_work); + + /* Create UDP socket for encapsulation receive. */ + rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vs->sock); + if (rc < 0) { + pr_debug("UDP socket create failed\n"); + kfree(vs); + return ERR_PTR(rc); + } + + /* Put in proper namespace */ + sk = vs->sock->sk; + sk_change_net(sk, net); + + vxlan_addr.sin_port = port; + + rc = kernel_bind(vs->sock, (struct sockaddr *) &vxlan_addr, + sizeof(vxlan_addr)); + if (rc < 0) { + pr_debug("bind for UDP socket %pI4:%u (%d)\n", + &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc); + sk_release_kernel(sk); + kfree(vs); + return ERR_PTR(rc); + } + + /* Disable multicast loopback */ + inet_sk(sk)->mc_loop = 0; + + /* Mark socket as an encapsulation socket. */ + udp_sk(sk)->encap_type = 1; + udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; + udp_encap_enable(); + + vs->refcnt = 1; + return vs; +} + static int vxlan_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { + struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; + struct vxlan_sock *vs; __u32 vni; int err; @@ -1402,10 +1510,6 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, return -EINVAL; vni = nla_get_u32(data[IFLA_VXLAN_ID]); - if (vxlan_find_vni(net, vni)) { - pr_info("duplicate VNI %u\n", vni); - return -EEXIST; - } dst->remote_vni = vni; if (data[IFLA_VXLAN_GROUP]) @@ -1471,22 +1575,58 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, if (data[IFLA_VXLAN_PORT]) vxlan->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]); + if (vxlan_find_vni(net, vni, vxlan->dst_port)) { + pr_info("duplicate VNI %u\n", vni); + return -EEXIST; + } + + vs = vxlan_find_port(net, vxlan->dst_port); + if (vs) + ++vs->refcnt; + else { + /* Drop lock because socket create acquires RTNL lock */ + rtnl_unlock(); + vs = vxlan_socket_create(net, vxlan->dst_port); + rtnl_lock(); + if (IS_ERR(vs)) + return PTR_ERR(vs); + + hlist_add_head_rcu(&vs->hlist, vs_head(net, vxlan->dst_port)); + } + vxlan->vn_sock = vs; + SET_ETHTOOL_OPS(dev, &vxlan_ethtool_ops); err = register_netdevice(dev); - if (!err) - hlist_add_head_rcu(&vxlan->hlist, vni_head(net, dst->remote_vni)); + if (err) { + if (--vs->refcnt == 0) { + rtnl_unlock(); + sk_release_kernel(vs->sock->sk); + kfree(vs); + rtnl_lock(); + } + return err; + } - return err; + list_add(&vxlan->next, &vn->vxlan_list); + hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni)); + + return 0; } static void vxlan_dellink(struct net_device *dev, struct list_head *head) { struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_sock *vs = vxlan->vn_sock; hlist_del_rcu(&vxlan->hlist); - + list_del(&vxlan->next); unregister_netdevice_queue(dev, head); + + if (--vs->refcnt == 0) { + hlist_del_rcu(&vs->hlist); + schedule_work(&vs->del_work); + } } static size_t vxlan_get_size(const struct net_device *dev) @@ -1572,46 +1712,12 @@ static struct rtnl_link_ops vxlan_link_ops __read_mostly = { static __net_init int vxlan_init_net(struct net *net) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); - struct sock *sk; - struct sockaddr_in vxlan_addr = { - .sin_family = AF_INET, - .sin_addr.s_addr = htonl(INADDR_ANY), - }; - int rc; unsigned h; - /* Create UDP socket for encapsulation receive. */ - rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock); - if (rc < 0) { - pr_debug("UDP socket create failed\n"); - return rc; - } - /* Put in proper namespace */ - sk = vn->sock->sk; - sk_change_net(sk, net); - - vxlan_addr.sin_port = htons(vxlan_port); - - rc = kernel_bind(vn->sock, (struct sockaddr *) &vxlan_addr, - sizeof(vxlan_addr)); - if (rc < 0) { - pr_debug("bind for UDP socket %pI4:%u (%d)\n", - &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc); - sk_release_kernel(sk); - vn->sock = NULL; - return rc; - } - - /* Disable multicast loopback */ - inet_sk(sk)->mc_loop = 0; + INIT_LIST_HEAD(&vn->vxlan_list); - /* Mark socket as an encapsulation socket. */ - udp_sk(sk)->encap_type = 1; - udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; - udp_encap_enable(); - - for (h = 0; h < VNI_HASH_SIZE; ++h) - INIT_HLIST_HEAD(&vn->vni_list[h]); + for (h = 0; h < PORT_HASH_SIZE; ++h) + INIT_HLIST_HEAD(&vn->sock_list[h]); return 0; } @@ -1620,18 +1726,11 @@ static __net_exit void vxlan_exit_net(struct net *net) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan; - unsigned h; rtnl_lock(); - for (h = 0; h < VNI_HASH_SIZE; ++h) - hlist_for_each_entry(vxlan, &vn->vni_list[h], hlist) - dev_close(vxlan->dev); + list_for_each_entry(vxlan, &vn->vxlan_list, next) + dev_close(vxlan->dev); rtnl_unlock(); - - if (vn->sock) { - sk_release_kernel(vn->sock->sk); - vn->sock = NULL; - } } static struct pernet_operations vxlan_net_ops = { -- cgit v1.2.3 From 784e4616a40166fc60a1741a2c64155e99fe9247 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 27 May 2013 22:35:51 +0000 Subject: vxlan: remove the unused rcu head from struct vxlan_rdst Cc: Stephen Hemminger Cc: David S. Miller Signed-off-by: Cong Wang Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 5ed64d496f83..289d79a48c82 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -97,7 +97,6 @@ struct vxlan_net { }; struct vxlan_rdst { - struct rcu_head rcu; __be32 remote_ip; __be16 remote_port; u32 remote_vni; -- cgit v1.2.3 From 31fec5aa21d166cf81702a669c1398784b513b8a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 27 May 2013 22:35:52 +0000 Subject: vxlan: use unsigned int instead of unsigned 'unsigned int' is slightly better. Cc: Stephen Hemminger Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 289d79a48c82..6fc962042046 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1269,7 +1269,7 @@ static int vxlan_open(struct net_device *dev) /* Purge the forwarding table */ static void vxlan_flush(struct vxlan_dev *vxlan) { - unsigned h; + unsigned int h; spin_lock_bh(&vxlan->hash_lock); for (h = 0; h < FDB_HASH_SIZE; ++h) { @@ -1333,7 +1333,7 @@ static void vxlan_free(struct net_device *dev) static void vxlan_setup(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - unsigned h; + unsigned int h; int low, high; eth_hw_addr_random(dev); @@ -1459,7 +1459,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) .sin_addr.s_addr = htonl(INADDR_ANY), }; int rc; - unsigned h; + unsigned int h; vs = kmalloc(sizeof(*vs), GFP_KERNEL); if (!vs) @@ -1722,7 +1722,7 @@ static struct rtnl_link_ops vxlan_link_ops __read_mostly = { static __net_init int vxlan_init_net(struct net *net) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); - unsigned h; + unsigned int h; INIT_LIST_HEAD(&vn->vxlan_list); -- cgit v1.2.3 From 7332a13b038be05cf44fcf0cacfdd8aade77b16f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 27 May 2013 22:35:53 +0000 Subject: vxlan: defer vxlan init as late as possible When vxlan is compiled as builtin, its init code runs before IPv6 init, this could cause problems if we create IPv6 socket in the latter patch. Cc: Stephen Hemminger Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 6fc962042046..8111565c35fc 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1771,7 +1771,7 @@ out2: out1: return rc; } -module_init(vxlan_init_module); +late_initcall(vxlan_init_module); static void __exit vxlan_cleanup_module(void) { -- cgit v1.2.3 From 0e6fbc5b6c6218987c93b8c7ca60cf786062899d Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 17 Jun 2013 17:49:56 -0700 Subject: ip_tunnels: extend iptunnel_xmit() Refactor various ip tunnels xmit functions and extend iptunnel_xmit() so that there is more code sharing. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index f6dce13c8f89..284c6c00c353 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1021,7 +1021,6 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_dev *vxlan = netdev_priv(dev); struct rtable *rt; const struct iphdr *old_iph; - struct iphdr *iph; struct vxlanhdr *vxh; struct udphdr *uh; struct flowi4 fl4; @@ -1030,6 +1029,7 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, u32 vni; __be16 df = 0; __u8 tos, ttl; + int err; dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; vni = rdst->remote_vni; @@ -1097,13 +1097,6 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, vxlan_encap_bypass(skb, vxlan, dst_vxlan); return NETDEV_TX_OK; } - - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | - IPSKB_REROUTED); - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_FLAGS); vxh->vx_vni = htonl(vni << 8); @@ -1118,27 +1111,18 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, uh->len = htons(skb->len); uh->check = 0; - __skb_push(skb, sizeof(*iph)); - skb_reset_network_header(skb); - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr) >> 2; - iph->frag_off = df; - iph->protocol = IPPROTO_UDP; - iph->tos = ip_tunnel_ecn_encap(tos, old_iph, skb); - iph->daddr = dst; - iph->saddr = fl4.saddr; - iph->ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - tunnel_ip_select_ident(skb, old_iph, &rt->dst); - - nf_reset(skb); - vxlan_set_owner(dev, skb); if (handle_offloads(skb)) goto drop; - iptunnel_xmit(skb, dev); + tos = ip_tunnel_ecn_encap(tos, old_iph, skb); + ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); + + err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, dst, + IPPROTO_UDP, tos, ttl, df); + iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + return NETDEV_TX_OK; drop: -- cgit v1.2.3 From b7153984074e51a50dad905871b705e0d67aa147 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:09 -0700 Subject: vxlan: fix out of order operation on module removal If vxlan is removed with active vxlan's it would crash because rtnl_link_unregister (which calls vxlan_dellink), was invoked before unregister_pernet_device (which calls vxlan_stop). Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 284c6c00c353..d3005d3a768d 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1771,8 +1771,8 @@ late_initcall(vxlan_init_module); static void __exit vxlan_cleanup_module(void) { - rtnl_link_unregister(&vxlan_link_ops); unregister_pernet_device(&vxlan_net_ops); + rtnl_link_unregister(&vxlan_link_ops); rcu_barrier(); } module_exit(vxlan_cleanup_module); -- cgit v1.2.3 From 758c57d16adcbec3c03e85f0c9a5b4ca31f6c507 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:09 -0700 Subject: vxlan: fix crash from work pending on module removal Switch to using a per module work queue so that all the socket deletion callbacks are done when module is removed. Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index d3005d3a768d..eb94bf5812cb 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -148,6 +148,7 @@ struct vxlan_dev { /* salt for hash table */ static u32 vxlan_salt __read_mostly; +static struct workqueue_struct *vxlan_wq; /* Virtual Network hash table head */ static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id) @@ -1631,7 +1632,7 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head) if (--vs->refcnt == 0) { hlist_del_rcu(&vs->hlist); - schedule_work(&vs->del_work); + queue_work(vxlan_wq, &vs->del_work); } } @@ -1750,6 +1751,10 @@ static int __init vxlan_init_module(void) { int rc; + vxlan_wq = alloc_workqueue("vxlan", 0, 0); + if (!vxlan_wq) + return -ENOMEM; + get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); rc = register_pernet_device(&vxlan_net_ops); @@ -1765,6 +1770,7 @@ static int __init vxlan_init_module(void) out2: unregister_pernet_device(&vxlan_net_ops); out1: + destroy_workqueue(vxlan_wq); return rc; } late_initcall(vxlan_init_module); @@ -1773,6 +1779,7 @@ static void __exit vxlan_cleanup_module(void) { unregister_pernet_device(&vxlan_net_ops); rtnl_link_unregister(&vxlan_link_ops); + destroy_workqueue(vxlan_wq); rcu_barrier(); } module_exit(vxlan_cleanup_module); -- cgit v1.2.3 From 7c47cedf43a8b3086c3dcf26cbc058747ee21bec Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:10 -0700 Subject: vxlan: move IGMP join/leave to work queue Do join/leave from work queue to avoid lock inversion problems between normal socket and RTNL. The code comes out cleaner as well. Uses Cong Wang's suggestion to turn refcnt into a real atomic since now need to handle case where last use of socket is IGMP worker. Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 103 ++++++++++++++++++++-------------------------------- 1 file changed, 40 insertions(+), 63 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index eb94bf5812cb..b061c98474ee 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -85,7 +85,7 @@ struct vxlan_sock { struct hlist_node hlist; struct rcu_head rcu; struct work_struct del_work; - unsigned int refcnt; + atomic_t refcnt; struct socket *sock; struct hlist_head vni_list[VNI_HASH_SIZE]; }; @@ -131,6 +131,7 @@ struct vxlan_dev { __u8 ttl; u32 flags; /* VXLAN_F_* below */ + struct work_struct igmp_work; unsigned long age_interval; struct timer_list age_timer; spinlock_t hash_lock; @@ -648,76 +649,58 @@ static bool vxlan_snoop(struct net_device *dev, /* See if multicast group is already in use by other ID */ -static bool vxlan_group_used(struct vxlan_net *vn, - const struct vxlan_dev *this) +static bool vxlan_group_used(struct vxlan_net *vn, __be32 remote_ip) { struct vxlan_dev *vxlan; list_for_each_entry(vxlan, &vn->vxlan_list, next) { - if (vxlan == this) - continue; - if (!netif_running(vxlan->dev)) continue; - if (vxlan->default_dst.remote_ip == this->default_dst.remote_ip) + if (vxlan->default_dst.remote_ip == remote_ip) return true; } return false; } -/* kernel equivalent to IP_ADD_MEMBERSHIP */ -static int vxlan_join_group(struct net_device *dev) +static void vxlan_sock_hold(struct vxlan_sock *vs) { - struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); - struct sock *sk = vxlan->vn_sock->sock->sk; - struct ip_mreqn mreq = { - .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, - .imr_ifindex = vxlan->default_dst.remote_ifindex, - }; - int err; - - /* Already a member of group */ - if (vxlan_group_used(vn, vxlan)) - return 0; + atomic_inc(&vs->refcnt); +} - /* Need to drop RTNL to call multicast join */ - rtnl_unlock(); - lock_sock(sk); - err = ip_mc_join_group(sk, &mreq); - release_sock(sk); - rtnl_lock(); +static void vxlan_sock_release(struct vxlan_sock *vs) +{ + if (!atomic_dec_and_test(&vs->refcnt)) + return; - return err; + hlist_del_rcu(&vs->hlist); + queue_work(vxlan_wq, &vs->del_work); } - -/* kernel equivalent to IP_DROP_MEMBERSHIP */ -static int vxlan_leave_group(struct net_device *dev) +/* Callback to update multicast group membership. + * Scheduled when vxlan goes up/down. + */ +static void vxlan_igmp_work(struct work_struct *work) { - struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); - int err = 0; - struct sock *sk = vxlan->vn_sock->sock->sk; + struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_work); + struct vxlan_net *vn = net_generic(dev_net(vxlan->dev), vxlan_net_id); + struct vxlan_sock *vs = vxlan->vn_sock; + struct sock *sk = vs->sock->sk; struct ip_mreqn mreq = { .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, .imr_ifindex = vxlan->default_dst.remote_ifindex, }; - /* Only leave group when last vxlan is done. */ - if (vxlan_group_used(vn, vxlan)) - return 0; - - /* Need to drop RTNL to call multicast leave */ - rtnl_unlock(); lock_sock(sk); - err = ip_mc_leave_group(sk, &mreq); + if (vxlan_group_used(vn, vxlan->default_dst.remote_ip)) + ip_mc_join_group(sk, &mreq); + else + ip_mc_leave_group(sk, &mreq); release_sock(sk); - rtnl_lock(); - return err; + vxlan_sock_release(vs); + dev_put(vxlan->dev); } /* Callback from net/ipv4/udp.c to receive packets */ @@ -1249,12 +1232,11 @@ static int vxlan_init(struct net_device *dev) static int vxlan_open(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - int err; if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { - err = vxlan_join_group(dev); - if (err) - return err; + vxlan_sock_hold(vxlan->vn_sock); + dev_hold(dev); + queue_work(vxlan_wq, &vxlan->igmp_work); } if (vxlan->age_interval) @@ -1285,8 +1267,11 @@ static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) - vxlan_leave_group(dev); + if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { + vxlan_sock_hold(vxlan->vn_sock); + dev_hold(dev); + queue_work(vxlan_wq, &vxlan->igmp_work); + } del_timer_sync(&vxlan->age_timer); @@ -1355,6 +1340,7 @@ static void vxlan_setup(struct net_device *dev) INIT_LIST_HEAD(&vxlan->next); spin_lock_init(&vxlan->hash_lock); + INIT_WORK(&vxlan->igmp_work, vxlan_igmp_work); init_timer_deferrable(&vxlan->age_timer); vxlan->age_timer.function = vxlan_cleanup; @@ -1498,8 +1484,8 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) udp_sk(sk)->encap_type = 1; udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; udp_encap_enable(); + atomic_set(&vs->refcnt, 1); - vs->refcnt = 1; return vs; } @@ -1589,7 +1575,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, vs = vxlan_find_port(net, vxlan->dst_port); if (vs) - ++vs->refcnt; + atomic_inc(&vs->refcnt); else { /* Drop lock because socket create acquires RTNL lock */ rtnl_unlock(); @@ -1606,12 +1592,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, err = register_netdevice(dev); if (err) { - if (--vs->refcnt == 0) { - rtnl_unlock(); - sk_release_kernel(vs->sock->sk); - kfree(vs); - rtnl_lock(); - } + vxlan_sock_release(vs); return err; } @@ -1629,11 +1610,7 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head) hlist_del_rcu(&vxlan->hlist); list_del(&vxlan->next); unregister_netdevice_queue(dev, head); - - if (--vs->refcnt == 0) { - hlist_del_rcu(&vs->hlist); - queue_work(vxlan_wq, &vs->del_work); - } + vxlan_sock_release(vs); } static size_t vxlan_get_size(const struct net_device *dev) -- cgit v1.2.3 From 8385f50a03a8ad3d2c6d76b1117c959261ab7a1c Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:10 -0700 Subject: vxlan: send notification when MAC migrates When learned entry migrates to another IP send a notification that entry has changed. Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index b061c98474ee..1f2aa26550e9 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -629,6 +629,7 @@ static bool vxlan_snoop(struct net_device *dev, f->remote.remote_ip = src_ip; f->updated = jiffies; + vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH); } else { /* learned new entry */ spin_lock(&vxlan->hash_lock); -- cgit v1.2.3 From 1c51a9159ddefa5119724a4c7da3fd3ef44b68d5 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:11 -0700 Subject: vxlan: fix race caused by dropping rtnl_unlock It is possible for two cpu's to race creating vxlan device. For most cases this is harmless, but the ability to assign "next avaliable vxlan device" relies on rtnl lock being held across the whole operation. Therfore two instances of calling: ip li add vxlan%d vxlan ... could collide and create two devices with same name. To fix this defer creation of socket to a work queue, and handle possible races there. Introduce a lock to ensure that changes to vxlan socket hash list is SMP safe. Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 111 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 84 insertions(+), 27 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 1f2aa26550e9..71da8be98801 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -94,6 +94,7 @@ struct vxlan_sock { struct vxlan_net { struct list_head vxlan_list; struct hlist_head sock_list[PORT_HASH_SIZE]; + spinlock_t sock_lock; }; struct vxlan_rdst { @@ -131,7 +132,9 @@ struct vxlan_dev { __u8 ttl; u32 flags; /* VXLAN_F_* below */ + struct work_struct sock_work; struct work_struct igmp_work; + unsigned long age_interval; struct timer_list age_timer; spinlock_t hash_lock; @@ -151,6 +154,8 @@ struct vxlan_dev { static u32 vxlan_salt __read_mostly; static struct workqueue_struct *vxlan_wq; +static void vxlan_sock_work(struct work_struct *work); + /* Virtual Network hash table head */ static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id) { @@ -670,12 +675,15 @@ static void vxlan_sock_hold(struct vxlan_sock *vs) atomic_inc(&vs->refcnt); } -static void vxlan_sock_release(struct vxlan_sock *vs) +static void vxlan_sock_release(struct vxlan_net *vn, struct vxlan_sock *vs) { if (!atomic_dec_and_test(&vs->refcnt)) return; + spin_lock(&vn->sock_lock); hlist_del_rcu(&vs->hlist); + spin_unlock(&vn->sock_lock); + queue_work(vxlan_wq, &vs->del_work); } @@ -700,7 +708,7 @@ static void vxlan_igmp_work(struct work_struct *work) ip_mc_leave_group(sk, &mreq); release_sock(sk); - vxlan_sock_release(vs); + vxlan_sock_release(vn, vs); dev_put(vxlan->dev); } @@ -1222,10 +1230,29 @@ static void vxlan_cleanup(unsigned long arg) /* Setup stats when device is created */ static int vxlan_init(struct net_device *dev) { + struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); + struct vxlan_sock *vs; + __u32 vni = vxlan->default_dst.remote_vni; + dev->tstats = alloc_percpu(struct pcpu_tstats); if (!dev->tstats) return -ENOMEM; + spin_lock(&vn->sock_lock); + vs = vxlan_find_port(dev_net(dev), vxlan->dst_port); + if (vs) { + /* If we have a socket with same port already, reuse it */ + atomic_inc(&vs->refcnt); + vxlan->vn_sock = vs; + hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni)); + } else { + /* otherwise make new socket outside of RTNL */ + dev_hold(dev); + queue_work(vxlan_wq, &vxlan->sock_work); + } + spin_unlock(&vn->sock_lock); + return 0; } @@ -1233,9 +1260,14 @@ static int vxlan_init(struct net_device *dev) static int vxlan_open(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_sock *vs = vxlan->vn_sock; + + /* socket hasn't been created */ + if (!vs) + return -ENOTCONN; if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { - vxlan_sock_hold(vxlan->vn_sock); + vxlan_sock_hold(vs); dev_hold(dev); queue_work(vxlan_wq, &vxlan->igmp_work); } @@ -1267,9 +1299,10 @@ static void vxlan_flush(struct vxlan_dev *vxlan) static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_sock *vs = vxlan->vn_sock; - if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { - vxlan_sock_hold(vxlan->vn_sock); + if (vs && IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { + vxlan_sock_hold(vs); dev_hold(dev); queue_work(vxlan_wq, &vxlan->igmp_work); } @@ -1342,6 +1375,7 @@ static void vxlan_setup(struct net_device *dev) INIT_LIST_HEAD(&vxlan->next); spin_lock_init(&vxlan->hash_lock); INIT_WORK(&vxlan->igmp_work, vxlan_igmp_work); + INIT_WORK(&vxlan->sock_work, vxlan_sock_work); init_timer_deferrable(&vxlan->age_timer); vxlan->age_timer.function = vxlan_cleanup; @@ -1433,7 +1467,6 @@ static void vxlan_del_work(struct work_struct *work) kfree_rcu(vs, rcu); } -/* Create new listen socket if needed */ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) { struct vxlan_sock *vs; @@ -1490,13 +1523,52 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) return vs; } +/* Scheduled at device creation to bind to a socket */ +static void vxlan_sock_work(struct work_struct *work) +{ + struct vxlan_dev *vxlan + = container_of(work, struct vxlan_dev, sock_work); + struct net_device *dev = vxlan->dev; + struct net *net = dev_net(dev); + __u32 vni = vxlan->default_dst.remote_vni; + __be16 port = vxlan->dst_port; + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_sock *nvs, *ovs; + + nvs = vxlan_socket_create(net, port); + if (IS_ERR(nvs)) { + netdev_err(vxlan->dev, "Can not create UDP socket, %ld\n", + PTR_ERR(nvs)); + goto out; + } + + spin_lock(&vn->sock_lock); + /* Look again to see if can reuse socket */ + ovs = vxlan_find_port(net, port); + if (ovs) { + atomic_inc(&ovs->refcnt); + vxlan->vn_sock = ovs; + hlist_add_head_rcu(&vxlan->hlist, vni_head(ovs, vni)); + spin_unlock(&vn->sock_lock); + + sk_release_kernel(nvs->sock->sk); + kfree(nvs); + } else { + vxlan->vn_sock = nvs; + hlist_add_head_rcu(&nvs->hlist, vs_head(net, port)); + hlist_add_head_rcu(&vxlan->hlist, vni_head(nvs, vni)); + spin_unlock(&vn->sock_lock); + } +out: + dev_put(dev); +} + static int vxlan_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; - struct vxlan_sock *vs; __u32 vni; int err; @@ -1574,31 +1646,13 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, return -EEXIST; } - vs = vxlan_find_port(net, vxlan->dst_port); - if (vs) - atomic_inc(&vs->refcnt); - else { - /* Drop lock because socket create acquires RTNL lock */ - rtnl_unlock(); - vs = vxlan_socket_create(net, vxlan->dst_port); - rtnl_lock(); - if (IS_ERR(vs)) - return PTR_ERR(vs); - - hlist_add_head_rcu(&vs->hlist, vs_head(net, vxlan->dst_port)); - } - vxlan->vn_sock = vs; - SET_ETHTOOL_OPS(dev, &vxlan_ethtool_ops); err = register_netdevice(dev); - if (err) { - vxlan_sock_release(vs); + if (err) return err; - } list_add(&vxlan->next, &vn->vxlan_list); - hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni)); return 0; } @@ -1606,12 +1660,14 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, static void vxlan_dellink(struct net_device *dev, struct list_head *head) { struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_sock *vs = vxlan->vn_sock; hlist_del_rcu(&vxlan->hlist); list_del(&vxlan->next); unregister_netdevice_queue(dev, head); - vxlan_sock_release(vs); + if (vs) + vxlan_sock_release(vn, vs); } static size_t vxlan_get_size(const struct net_device *dev) @@ -1700,6 +1756,7 @@ static __net_init int vxlan_init_net(struct net *net) unsigned int h; INIT_LIST_HEAD(&vn->vxlan_list); + spin_lock_init(&vn->sock_lock); for (h = 0; h < PORT_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vn->sock_list[h]); -- cgit v1.2.3 From ebf4063e869d959daf75efb4ef1c7bc80dcd4800 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:11 -0700 Subject: vxlan: move cleanup to uninit Put destruction of per-cpu statistics removal in ndo_uninit since it is created by ndo_init. This also avoids any problems that might be cause by destructor being called after module removed. Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 71da8be98801..500f9ce437ec 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1256,6 +1256,17 @@ static int vxlan_init(struct net_device *dev) return 0; } +static void vxlan_uninit(struct net_device *dev) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); + struct vxlan_sock *vs = vxlan->vn_sock; + + if (vs) + vxlan_sock_release(vn, vs); + free_percpu(dev->tstats); +} + /* Start ageing timer and join group when device is brought up */ static int vxlan_open(struct net_device *dev) { @@ -1321,6 +1332,7 @@ static void vxlan_set_multicast_list(struct net_device *dev) static const struct net_device_ops vxlan_netdev_ops = { .ndo_init = vxlan_init, + .ndo_uninit = vxlan_uninit, .ndo_open = vxlan_open, .ndo_stop = vxlan_stop, .ndo_start_xmit = vxlan_xmit, @@ -1339,12 +1351,6 @@ static struct device_type vxlan_type = { .name = "vxlan", }; -static void vxlan_free(struct net_device *dev) -{ - free_percpu(dev->tstats); - free_netdev(dev); -} - /* Initialize the device structure. */ static void vxlan_setup(struct net_device *dev) { @@ -1357,7 +1363,7 @@ static void vxlan_setup(struct net_device *dev) dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM; dev->netdev_ops = &vxlan_netdev_ops; - dev->destructor = vxlan_free; + dev->destructor = free_netdev; SET_NETDEV_DEVTYPE(dev, &vxlan_type); dev->tx_queue_len = 0; @@ -1660,14 +1666,10 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, static void vxlan_dellink(struct net_device *dev, struct list_head *head) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); - struct vxlan_sock *vs = vxlan->vn_sock; hlist_del_rcu(&vxlan->hlist); list_del(&vxlan->next); unregister_netdevice_queue(dev, head); - if (vs) - vxlan_sock_release(vn, vs); } static size_t vxlan_get_size(const struct net_device *dev) -- cgit v1.2.3 From 4ad169300a7350a034b86c543070aed109882a86 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:11 -0700 Subject: vxlan: make vxlan_xmit_one void The function vxlan_xmit_one always returns NETDEV_TX_OK, so there is no point in keeping track of return values etc. Signed-off-by: Stephen Hemminger Acked-by: David L Stevens --- drivers/net/vxlan.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 500f9ce437ec..e65241c3d176 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1008,8 +1008,8 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, } } -static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, - struct vxlan_rdst *rdst, bool did_rsc) +static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, + struct vxlan_rdst *rdst, bool did_rsc) { struct vxlan_dev *vxlan = netdev_priv(dev); struct rtable *rt; @@ -1032,7 +1032,7 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (did_rsc) { /* short-circuited back to local bridge */ vxlan_encap_bypass(skb, vxlan, vxlan); - return NETDEV_TX_OK; + return; } goto drop; } @@ -1088,7 +1088,7 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (!dst_vxlan) goto tx_error; vxlan_encap_bypass(skb, vxlan, dst_vxlan); - return NETDEV_TX_OK; + return; } vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_FLAGS); @@ -1116,7 +1116,7 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, IPPROTO_UDP, tos, ttl, df); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); - return NETDEV_TX_OK; + return; drop: dev->stats.tx_dropped++; @@ -1126,7 +1126,6 @@ tx_error: dev->stats.tx_errors++; tx_free: dev_kfree_skb(skb); - return NETDEV_TX_OK; } /* Transmit local packets over Vxlan @@ -1142,7 +1141,6 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) bool did_rsc = false; struct vxlan_rdst *rdst0, *rdst; struct vxlan_fdb *f; - int rc1, rc; skb_reset_mac_header(skb); eth = eth_hdr(skb); @@ -1170,24 +1168,18 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } else rdst0 = &f->remote; - rc = NETDEV_TX_OK; /* if there are multiple destinations, send copies */ for (rdst = rdst0->remote_next; rdst; rdst = rdst->remote_next) { struct sk_buff *skb1; skb1 = skb_clone(skb, GFP_ATOMIC); - if (skb1) { - rc1 = vxlan_xmit_one(skb1, dev, rdst, did_rsc); - if (rc == NETDEV_TX_OK) - rc = rc1; - } + if (skb1) + vxlan_xmit_one(skb1, dev, rdst, did_rsc); } - rc1 = vxlan_xmit_one(skb, dev, rdst0, did_rsc); - if (rc == NETDEV_TX_OK) - rc = rc1; - return rc; + vxlan_xmit_one(skb, dev, rdst0, did_rsc); + return NETDEV_TX_OK; } /* Walk the forwarding table and purge stale entries */ -- cgit v1.2.3 From 3e61aa8f0a68e6e007c223688f442be04a44b0f4 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:12 -0700 Subject: vxlan: convert remotes list to list_rcu Based on initial work by Mike Rapoport Use list macros and RCU for tracking multiple remotes. Note: this code assumes list always has at least one entry, because delete is not supported. Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 97 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 42 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index e65241c3d176..117b7fa6f33b 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -102,7 +102,7 @@ struct vxlan_rdst { __be16 remote_port; u32 remote_vni; u32 remote_ifindex; - struct vxlan_rdst *remote_next; + struct list_head list; }; /* Forwarding table entry */ @@ -111,7 +111,7 @@ struct vxlan_fdb { struct rcu_head rcu; unsigned long updated; /* jiffies */ unsigned long used; - struct vxlan_rdst remote; + struct list_head remotes; u16 state; /* see ndm_state */ u8 flags; /* see ndm_flags */ u8 eth_addr[ETH_ALEN]; @@ -170,6 +170,14 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port) return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; } +/* First remote destination for a forwarding entry. + * Guaranteed to be non-NULL because remotes are never deleted. + */ +static inline struct vxlan_rdst *first_remote(struct vxlan_fdb *fdb) +{ + return list_first_or_null_rcu(&fdb->remotes, struct vxlan_rdst, list); +} + /* Find VXLAN socket based on network namespace and UDP port */ static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port) { @@ -275,7 +283,7 @@ static inline size_t vxlan_nlmsg_size(void) } static void vxlan_fdb_notify(struct vxlan_dev *vxlan, - const struct vxlan_fdb *fdb, int type) + struct vxlan_fdb *fdb, int type) { struct net *net = dev_net(vxlan->dev); struct sk_buff *skb; @@ -285,7 +293,7 @@ static void vxlan_fdb_notify(struct vxlan_dev *vxlan, if (skb == NULL) goto errout; - err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, &fdb->remote); + err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, first_remote(fdb)); if (err < 0) { /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -304,11 +312,16 @@ static void vxlan_ip_miss(struct net_device *dev, __be32 ipa) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb f; + struct vxlan_rdst remote; memset(&f, 0, sizeof f); f.state = NUD_STALE; - f.remote.remote_ip = ipa; /* goes to NDA_DST */ - f.remote.remote_vni = VXLAN_N_VID; + + remote.remote_ip = ipa; /* goes to NDA_DST */ + remote.remote_vni = VXLAN_N_VID; + + INIT_LIST_HEAD(&f.remotes); + list_add_rcu(&remote.list, &f.remotes); vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH); } @@ -318,6 +331,7 @@ static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) struct vxlan_fdb f; memset(&f, 0, sizeof f); + INIT_LIST_HEAD(&f.remotes); f.state = NUD_STALE; memcpy(f.eth_addr, eth_addr, ETH_ALEN); @@ -377,17 +391,17 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, static int vxlan_fdb_append(struct vxlan_fdb *f, __be32 ip, __be16 port, __u32 vni, __u32 ifindex) { - struct vxlan_rdst *rd_prev, *rd; + struct vxlan_rdst *rd; - rd_prev = NULL; - for (rd = &f->remote; rd; rd = rd->remote_next) { + /* protected by vxlan->hash_lock */ + list_for_each_entry(rd, &f->remotes, list) { if (rd->remote_ip == ip && rd->remote_port == port && rd->remote_vni == vni && rd->remote_ifindex == ifindex) return 0; - rd_prev = rd; } + rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) return -ENOBUFS; @@ -395,8 +409,9 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, rd->remote_port = port; rd->remote_vni = vni; rd->remote_ifindex = ifindex; - rd->remote_next = NULL; - rd_prev->remote_next = rd; + + list_add_tail_rcu(&rd->list, &f->remotes); + return 1; } @@ -448,16 +463,14 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, return -ENOMEM; notify = 1; - f->remote.remote_ip = ip; - f->remote.remote_port = port; - f->remote.remote_vni = vni; - f->remote.remote_ifindex = ifindex; - f->remote.remote_next = NULL; f->state = state; f->flags = ndm_flags; f->updated = f->used = jiffies; + INIT_LIST_HEAD(&f->remotes); memcpy(f->eth_addr, mac, ETH_ALEN); + vxlan_fdb_append(f, ip, port, vni, ifindex); + ++vxlan->addrcnt; hlist_add_head_rcu(&f->hlist, vxlan_fdb_head(vxlan, mac)); @@ -472,13 +485,10 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, static void vxlan_fdb_free(struct rcu_head *head) { struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu); + struct vxlan_rdst *rd, *nd; - while (f->remote.remote_next) { - struct vxlan_rdst *rd = f->remote.remote_next; - - f->remote.remote_next = rd->remote_next; + list_for_each_entry_safe(rd, nd, &f->remotes, list) kfree(rd); - } kfree(f); } @@ -588,23 +598,24 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { struct vxlan_rdst *rd; - for (rd = &f->remote; rd; rd = rd->remote_next) { - if (idx < cb->args[0]) - goto skip; + if (idx < cb->args[0]) + goto skip; + + list_for_each_entry_rcu(rd, &f->remotes, list) { err = vxlan_fdb_info(skb, vxlan, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI, rd); if (err < 0) - break; -skip: - ++idx; + goto out; } +skip: + ++idx; } } - +out: return idx; } @@ -620,7 +631,9 @@ static bool vxlan_snoop(struct net_device *dev, f = vxlan_find_mac(vxlan, src_mac); if (likely(f)) { - if (likely(f->remote.remote_ip == src_ip)) + struct vxlan_rdst *rdst = first_remote(f); + + if (likely(rdst->remote_ip == src_ip)) return false; /* Don't migrate static entries, drop packets */ @@ -630,9 +643,9 @@ static bool vxlan_snoop(struct net_device *dev, if (net_ratelimit()) netdev_info(dev, "%pM migrated from %pI4 to %pI4\n", - src_mac, &f->remote.remote_ip, &src_ip); + src_mac, &rdst->remote_ip, &src_ip); - f->remote.remote_ip = src_ip; + rdst->remote_ip = src_ip; f->updated = jiffies; vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH); } else { @@ -866,7 +879,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb) } f = vxlan_find_mac(vxlan, n->ha); - if (f && f->remote.remote_ip == htonl(INADDR_ANY)) { + if (f && first_remote(f)->remote_ip == htonl(INADDR_ANY)) { /* bridge-local neighbor */ neigh_release(n); goto out; @@ -1165,17 +1178,17 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) (vxlan->flags & VXLAN_F_L2MISS) && !is_multicast_ether_addr(eth->h_dest)) vxlan_fdb_miss(vxlan, eth->h_dest); - } else - rdst0 = &f->remote; - + } else { + rdst = rdst0 = first_remote(f); - /* if there are multiple destinations, send copies */ - for (rdst = rdst0->remote_next; rdst; rdst = rdst->remote_next) { - struct sk_buff *skb1; + /* if there are multiple destinations, send copies */ + list_for_each_entry_continue_rcu(rdst, &f->remotes, list) { + struct sk_buff *skb1; - skb1 = skb_clone(skb, GFP_ATOMIC); - if (skb1) - vxlan_xmit_one(skb1, dev, rdst, did_rsc); + skb1 = skb_clone(skb, GFP_ATOMIC); + if (skb1) + vxlan_xmit_one(skb1, dev, rdst, did_rsc); + } } vxlan_xmit_one(skb, dev, rdst0, did_rsc); -- cgit v1.2.3 From 9daaa397b3e18282715eeb0d7be79ea5bbadc119 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:12 -0700 Subject: vxlan: port module param should be ushort UDP ports are limited to 16 bits. Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 117b7fa6f33b..f89a58bb3f26 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -70,8 +70,8 @@ struct vxlanhdr { * The IANA assigned port is 4789, but the Linux default is 8472 * for compatability with early adopters. */ -static unsigned int vxlan_port __read_mostly = 8472; -module_param_named(udp_port, vxlan_port, uint, 0444); +static unsigned short vxlan_port __read_mostly = 8472; +module_param_named(udp_port, vxlan_port, ushort, 0444); MODULE_PARM_DESC(udp_port, "Destination UDP port"); static bool log_ecn_error = true; -- cgit v1.2.3 From bb3fd6878a983f36c994bfbd71b01b2625ddf52b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:40 -0700 Subject: vxlan: Use initializer for dummy structures For the notification code, a couple of places build fdb entries on the stack, use structure initialization instead and fix formatting. Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index f89a58bb3f26..d2b9ab79c9ae 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -311,14 +311,13 @@ errout: static void vxlan_ip_miss(struct net_device *dev, __be32 ipa) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_fdb f; - struct vxlan_rdst remote; - - memset(&f, 0, sizeof f); - f.state = NUD_STALE; - - remote.remote_ip = ipa; /* goes to NDA_DST */ - remote.remote_vni = VXLAN_N_VID; + struct vxlan_fdb f = { + .state = NUD_STALE, + }; + struct vxlan_rdst remote = { + .remote_ip = ipa, /* goes to NDA_DST */ + .remote_vni = VXLAN_N_VID, + }; INIT_LIST_HEAD(&f.remotes); list_add_rcu(&remote.list, &f.remotes); @@ -328,11 +327,11 @@ static void vxlan_ip_miss(struct net_device *dev, __be32 ipa) static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) { - struct vxlan_fdb f; + struct vxlan_fdb f = { + .state = NUD_STALE, + }; - memset(&f, 0, sizeof f); INIT_LIST_HEAD(&f.remotes); - f.state = NUD_STALE; memcpy(f.eth_addr, eth_addr, ETH_ALEN); vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH); @@ -1485,6 +1484,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) struct sockaddr_in vxlan_addr = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), + .sin_port = port, }; int rc; unsigned int h; @@ -1510,8 +1510,6 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) sk = vs->sock->sk; sk_change_net(sk, net); - vxlan_addr.sin_port = port; - rc = kernel_bind(vs->sock, (struct sockaddr *) &vxlan_addr, sizeof(vxlan_addr)); if (rc < 0) { -- cgit v1.2.3 From 234f5b73794435f065c5fb13371415fe46956a4b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:41 -0700 Subject: vxlan: cosmetic cleanup's Fix whitespace and spelling Signed-off-by: Stephen Hemminger Acked-by: David L Stevens --- drivers/net/vxlan.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index d2b9ab79c9ae..3b03cd4bdf37 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -68,7 +68,7 @@ struct vxlanhdr { /* UDP port for VXLAN traffic. * The IANA assigned port is 4789, but the Linux default is 8472 - * for compatability with early adopters. + * for compatibility with early adopters. */ static unsigned short vxlan_port __read_mostly = 8472; module_param_named(udp_port, vxlan_port, ushort, 0444); @@ -210,9 +210,9 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port) /* Fill in neighbour message in skbuff. */ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, - const struct vxlan_fdb *fdb, - u32 portid, u32 seq, int type, unsigned int flags, - const struct vxlan_rdst *rdst) + const struct vxlan_fdb *fdb, + u32 portid, u32 seq, int type, unsigned int flags, + const struct vxlan_rdst *rdst) { unsigned long now = jiffies; struct nda_cacheinfo ci; @@ -1031,7 +1031,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4; __be32 dst; __be16 src_port, dst_port; - u32 vni; + u32 vni; __be16 df = 0; __u8 tos, ttl; int err; -- cgit v1.2.3 From 60d9d4c6dbd1bad80fb9a77775fc704302a563c9 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 20 Jun 2013 00:26:31 -0700 Subject: vxlan: Fix sparse warnings. Fix following sparse warnings. drivers/net/vxlan.c:238:44: warning: incorrect type in argument 3 (different base types) drivers/net/vxlan.c:238:44: expected restricted __be32 [usertype] value drivers/net/vxlan.c:238:44: got unsigned int const [unsigned] [usertype] remote_vni drivers/net/vxlan.c:1735:18: warning: incorrect type in initializer (different signedness) drivers/net/vxlan.c:1735:18: expected int *id drivers/net/vxlan.c:1735:18: got unsigned int static [toplevel] * Signed-off-by: Pravin B Shelar Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3b03cd4bdf37..212a25601fa6 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -78,7 +78,7 @@ static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); -static unsigned int vxlan_net_id; +static int vxlan_net_id; /* per UDP socket information */ struct vxlan_sock { @@ -250,7 +250,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, nla_put_be16(skb, NDA_PORT, rdst->remote_port)) goto nla_put_failure; if (rdst->remote_vni != vxlan->default_dst.remote_vni && - nla_put_be32(skb, NDA_VNI, rdst->remote_vni)) + nla_put_u32(skb, NDA_VNI, rdst->remote_vni)) goto nla_put_failure; if (rdst->remote_ifindex && nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) -- cgit v1.2.3 From afbd8bae9c798c5cdbe4439d3a50536b5438247c Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 25 Jun 2013 16:01:51 +0300 Subject: vxlan: add implicit fdb entry for default destination Signed-off-by: Mike Rapoport Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 68 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 19 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 212a25601fa6..bdfe46e50c49 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -80,6 +80,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static int vxlan_net_id; +static const u8 all_zeros_mac[ETH_ALEN]; + /* per UDP socket information */ struct vxlan_sock { struct hlist_node hlist; @@ -1151,7 +1153,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); struct ethhdr *eth; bool did_rsc = false; - struct vxlan_rdst *rdst0, *rdst; + struct vxlan_rdst *rdst; struct vxlan_fdb *f; skb_reset_mac_header(skb); @@ -1171,26 +1173,27 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } if (f == NULL) { - rdst0 = &vxlan->default_dst; - - if (rdst0->remote_ip == htonl(INADDR_ANY) && - (vxlan->flags & VXLAN_F_L2MISS) && - !is_multicast_ether_addr(eth->h_dest)) - vxlan_fdb_miss(vxlan, eth->h_dest); - } else { - rdst = rdst0 = first_remote(f); + f = vxlan_find_mac(vxlan, all_zeros_mac); + if (f == NULL) { + if ((vxlan->flags & VXLAN_F_L2MISS) && + !is_multicast_ether_addr(eth->h_dest)) + vxlan_fdb_miss(vxlan, eth->h_dest); + + dev->stats.tx_dropped++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } + } - /* if there are multiple destinations, send copies */ - list_for_each_entry_continue_rcu(rdst, &f->remotes, list) { - struct sk_buff *skb1; + list_for_each_entry_rcu(rdst, &f->remotes, list) { + struct sk_buff *skb1; - skb1 = skb_clone(skb, GFP_ATOMIC); - if (skb1) - vxlan_xmit_one(skb1, dev, rdst, did_rsc); - } + skb1 = skb_clone(skb, GFP_ATOMIC); + if (skb1) + vxlan_xmit_one(skb1, dev, rdst, did_rsc); } - vxlan_xmit_one(skb, dev, rdst0, did_rsc); + dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -1260,12 +1263,25 @@ static int vxlan_init(struct net_device *dev) return 0; } +static void vxlan_fdb_delete_defualt(struct vxlan_dev *vxlan) +{ + struct vxlan_fdb *f; + + spin_lock_bh(&vxlan->hash_lock); + f = __vxlan_find_mac(vxlan, all_zeros_mac); + if (f) + vxlan_fdb_destroy(vxlan, f); + spin_unlock_bh(&vxlan->hash_lock); +} + static void vxlan_uninit(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_sock *vs = vxlan->vn_sock; + vxlan_fdb_delete_defualt(vxlan); + if (vs) vxlan_sock_release(vn, vs); free_percpu(dev->tstats); @@ -1304,7 +1320,9 @@ static void vxlan_flush(struct vxlan_dev *vxlan) hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); - vxlan_fdb_destroy(vxlan, f); + /* the all_zeros_mac entry is deleted at vxlan_uninit */ + if (!is_zero_ether_addr(f->eth_addr)) + vxlan_fdb_destroy(vxlan, f); } } spin_unlock_bh(&vxlan->hash_lock); @@ -1657,10 +1675,22 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, SET_ETHTOOL_OPS(dev, &vxlan_ethtool_ops); - err = register_netdevice(dev); + /* create an fdb entry for default destination */ + err = vxlan_fdb_create(vxlan, all_zeros_mac, + vxlan->default_dst.remote_ip, + NUD_REACHABLE|NUD_PERMANENT, + NLM_F_EXCL|NLM_F_CREATE, + vxlan->dst_port, vxlan->default_dst.remote_vni, + vxlan->default_dst.remote_ifindex, NTF_SELF); if (err) return err; + err = register_netdevice(dev); + if (err) { + vxlan_fdb_delete_defualt(vxlan); + return err; + } + list_add(&vxlan->next, &vn->vxlan_list); return 0; -- cgit v1.2.3 From a5e7c10a7ec244f272703f36f339c967efe1fc0d Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 25 Jun 2013 16:01:52 +0300 Subject: vxlan: introduce vxlan_fdb_find_rdst which will be reused by vxlan_fdb_delete Signed-off-by: Mike Rapoport Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index bdfe46e50c49..306bd94efa89 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -388,21 +388,34 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, return f; } -/* Add/update destinations for multicast */ -static int vxlan_fdb_append(struct vxlan_fdb *f, - __be32 ip, __be16 port, __u32 vni, __u32 ifindex) +/* caller should hold vxlan->hash_lock */ +static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, + __be32 ip, __be16 port, + __u32 vni, __u32 ifindex) { struct vxlan_rdst *rd; - /* protected by vxlan->hash_lock */ list_for_each_entry(rd, &f->remotes, list) { if (rd->remote_ip == ip && rd->remote_port == port && rd->remote_vni == vni && rd->remote_ifindex == ifindex) - return 0; + return rd; } + return NULL; +} + +/* Add/update destinations for multicast */ +static int vxlan_fdb_append(struct vxlan_fdb *f, + __be32 ip, __be16 port, __u32 vni, __u32 ifindex) +{ + struct vxlan_rdst *rd; + + rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); + if (rd) + return 0; + rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) return -ENOBUFS; -- cgit v1.2.3 From f0b074be7b61a800e39053d73dabf850649c1c8f Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 25 Jun 2013 16:01:53 +0300 Subject: vxlan: introduce vxlan_fdb_parse which will be reused by vxlan_fdb_delete Signed-off-by: Mike Rapoport Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 81 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 31 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 306bd94efa89..ee7cc71e57fd 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -518,58 +518,77 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) call_rcu(&f->rcu, vxlan_fdb_free); } -/* Add static entry (via netlink) */ -static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 flags) +static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, + __be32 *ip, __be16 *port, u32 *vni, u32 *ifindex) { - struct vxlan_dev *vxlan = netdev_priv(dev); struct net *net = dev_net(vxlan->dev); - __be32 ip; - __be16 port; - u32 vni, ifindex; - int err; - - if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { - pr_info("RTM_NEWNEIGH with invalid state %#x\n", - ndm->ndm_state); - return -EINVAL; - } - - if (tb[NDA_DST] == NULL) - return -EINVAL; - if (nla_len(tb[NDA_DST]) != sizeof(__be32)) - return -EAFNOSUPPORT; + if (tb[NDA_DST]) { + if (nla_len(tb[NDA_DST]) != sizeof(__be32)) + return -EAFNOSUPPORT; - ip = nla_get_be32(tb[NDA_DST]); + *ip = nla_get_be32(tb[NDA_DST]); + } else { + *ip = htonl(INADDR_ANY); + } if (tb[NDA_PORT]) { if (nla_len(tb[NDA_PORT]) != sizeof(__be16)) return -EINVAL; - port = nla_get_be16(tb[NDA_PORT]); - } else - port = vxlan->dst_port; + *port = nla_get_be16(tb[NDA_PORT]); + } else { + *port = vxlan->dst_port; + } if (tb[NDA_VNI]) { if (nla_len(tb[NDA_VNI]) != sizeof(u32)) return -EINVAL; - vni = nla_get_u32(tb[NDA_VNI]); - } else - vni = vxlan->default_dst.remote_vni; + *vni = nla_get_u32(tb[NDA_VNI]); + } else { + *vni = vxlan->default_dst.remote_vni; + } if (tb[NDA_IFINDEX]) { struct net_device *tdev; if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) return -EINVAL; - ifindex = nla_get_u32(tb[NDA_IFINDEX]); - tdev = dev_get_by_index(net, ifindex); + *ifindex = nla_get_u32(tb[NDA_IFINDEX]); + tdev = dev_get_by_index(net, *ifindex); if (!tdev) return -EADDRNOTAVAIL; dev_put(tdev); - } else - ifindex = 0; + } else { + *ifindex = 0; + } + + return 0; +} + +/* Add static entry (via netlink) */ +static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 flags) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + /* struct net *net = dev_net(vxlan->dev); */ + __be32 ip; + __be16 port; + u32 vni, ifindex; + int err; + + if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { + pr_info("RTM_NEWNEIGH with invalid state %#x\n", + ndm->ndm_state); + return -EINVAL; + } + + if (tb[NDA_DST] == NULL) + return -EINVAL; + + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex); + if (err) + return err; spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags, -- cgit v1.2.3 From bc7892ba39992c6d645e906f1d52a626395b4b11 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 25 Jun 2013 16:01:54 +0300 Subject: vxlan: allow removal of single destination from fdb entry When the last item is deleted from the remote destinations list, the fdb entry is destroyed. Signed-off-by: Mike Rapoport Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ee7cc71e57fd..c1825201f9e2 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -105,6 +105,7 @@ struct vxlan_rdst { u32 remote_vni; u32 remote_ifindex; struct list_head list; + struct rcu_head rcu; }; /* Forwarding table entry */ @@ -496,6 +497,12 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, return 0; } +static void vxlan_fdb_free_rdst(struct rcu_head *head) +{ + struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu); + kfree(rd); +} + static void vxlan_fdb_free(struct rcu_head *head) { struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu); @@ -605,14 +612,43 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; - int err = -ENOENT; + struct vxlan_rdst *rd = NULL; + __be32 ip; + __be16 port; + u32 vni, ifindex; + int err; + + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex); + if (err) + return err; + + err = -ENOENT; spin_lock_bh(&vxlan->hash_lock); f = vxlan_find_mac(vxlan, addr); - if (f) { - vxlan_fdb_destroy(vxlan, f); - err = 0; + if (!f) + goto out; + + if (ip != htonl(INADDR_ANY)) { + rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); + if (!rd) + goto out; + } + + err = 0; + + /* remove a destination if it's not the only one on the list, + * otherwise destroy the fdb entry + */ + if (rd && !list_is_singular(&f->remotes)) { + list_del_rcu(&rd->list); + call_rcu(&rd->rcu, vxlan_fdb_free_rdst); + goto out; } + + vxlan_fdb_destroy(vxlan, f); + +out: spin_unlock_bh(&vxlan->hash_lock); return err; -- cgit v1.2.3 From 58e4c767046a35f11a55af6ce946054ddf4a8580 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 25 Jun 2013 16:01:56 +0300 Subject: vxlan: fdb: allow specifying multiple destinations for zero MAC The zero MAC entry in the fdb is used as default destination. With multiple default destinations it is possible to use vxlan in environments that disable multicast on the infrastructure level, e.g. public clouds. Signed-off-by: Mike Rapoport Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index c1825201f9e2..3e75f9726c33 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -458,7 +458,8 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, notify = 1; } if ((flags & NLM_F_APPEND) && - is_multicast_ether_addr(f->eth_addr)) { + (is_multicast_ether_addr(f->eth_addr) || + is_zero_ether_addr(f->eth_addr))) { int rc = vxlan_fdb_append(f, ip, port, vni, ifindex); if (rc < 0) -- cgit v1.2.3 From ba609e9bf15bd7df35e2c06a2e5aaf9ab9289b10 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 25 Jun 2013 17:06:01 -0700 Subject: vxlan: fix function name spelling Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3e75f9726c33..227b54a1f88a 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1332,7 +1332,7 @@ static int vxlan_init(struct net_device *dev) return 0; } -static void vxlan_fdb_delete_defualt(struct vxlan_dev *vxlan) +static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan) { struct vxlan_fdb *f; @@ -1349,7 +1349,7 @@ static void vxlan_uninit(struct net_device *dev) struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_sock *vs = vxlan->vn_sock; - vxlan_fdb_delete_defualt(vxlan); + vxlan_fdb_delete_default(vxlan); if (vs) vxlan_sock_release(vn, vs); @@ -1756,7 +1756,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, err = register_netdevice(dev); if (err) { - vxlan_fdb_delete_defualt(vxlan); + vxlan_fdb_delete_default(vxlan); return err; } -- cgit v1.2.3 From f89e57c4f5d7efdbe73b6214c73058aa335870e4 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 11 Jul 2013 11:38:06 -0700 Subject: vxlan: Fix kernel crash on rmmod. vxlan exit module unregisters vxlan net and then it unregisters rtnl ops which triggers vxlan_dellink() from __rtnl_kill_links(). vxlan_dellink() deletes vxlan-dev from vxlan_list which has list-head in vxlan-net-struct but that is already gone due to net-unregister. That is how we are getting following crash. Following commit fixes the crash by fixing module exit path. BUG: unable to handle kernel paging request at ffff8804102c8000 IP: [] __list_del_entry+0x29/0xd0 PGD 2972067 PUD 83e019067 PMD 83df97067 PTE 80000004102c8060 Oops: 0000 [#1] SMP DEBUG_PAGEALLOC Modules linked in: --- CPU: 19 PID: 6712 Comm: rmmod Tainted: GF 3.10.0+ #95 Hardware name: Dell Inc. PowerEdge R620/0KCKR5, BIOS 1.4.8 10/25/2012 task: ffff88080c47c580 ti: ffff88080ac50000 task.ti: ffff88080ac50000 RIP: 0010:[] [] __list_del_entry+0x29/0xd0 RSP: 0018:ffff88080ac51e08 EFLAGS: 00010206 RAX: ffff8804102c8000 RBX: ffff88040f0d4b10 RCX: dead000000200200 RDX: ffff8804102c8000 RSI: ffff88080ac51e58 RDI: ffff88040f0d4b10 RBP: ffff88080ac51e08 R08: 0000000000000001 R09: 2222222222222222 R10: 2222222222222222 R11: 2222222222222222 R12: ffff88080ac51e58 R13: ffffffffa07b8840 R14: ffffffff81ae48c0 R15: ffff88080ac51e58 FS: 00007f9ef105c700(0000) GS:ffff88082a800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff8804102c8000 CR3: 00000008227e5000 CR4: 00000000000407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffff88080ac51e28 ffffffff812cc6a1 2222222222222222 ffff88040f0d4000 ffff88080ac51e48 ffffffffa07b3311 ffff88040f0d4000 ffffffff81ae49c8 ffff88080ac51e98 ffffffff81492fc2 ffff88080ac51e58 ffff88080ac51e58 Call Trace: [] list_del+0x11/0x40 [] vxlan_dellink+0x51/0x70 [vxlan] [] __rtnl_link_unregister+0xa2/0xb0 [] rtnl_link_unregister+0x1e/0x30 [] vxlan_cleanup_module+0x1c/0x2f [vxlan] [] SyS_delete_module+0x1d1/0x2c0 [] ? trace_hardirqs_on_thunk+0x3a/0x3f [] system_call_fastpath+0x16/0x1b Code: eb 9f 55 48 8b 17 48 b9 00 01 10 00 00 00 ad de 48 8b 47 08 48 89 e5 48 39 ca 74 29 48 b9 00 02 20 00 00 00 ad de 48 39 c8 74 7a <4c> 8b 00 4c 39 c7 75 53 4c 8b 42 08 4c 39 c7 75 2b 48 89 42 08 RIP [] __list_del_entry+0x29/0xd0 RSP CR2: ffff8804102c8000 Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 227b54a1f88a..0ba1e7edbb1b 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1916,9 +1916,9 @@ late_initcall(vxlan_init_module); static void __exit vxlan_cleanup_module(void) { - unregister_pernet_device(&vxlan_net_ops); rtnl_link_unregister(&vxlan_link_ops); destroy_workqueue(vxlan_wq); + unregister_pernet_device(&vxlan_net_ops); rcu_barrier(); } module_exit(vxlan_cleanup_module); -- cgit v1.2.3 From fe5c3561e6f0ac7c9546209f01351113c1b77ec8 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sat, 13 Jul 2013 10:18:18 -0700 Subject: vxlan: add necessary locking on device removal The socket management is now done in workqueue (outside of RTNL) and protected by vn->sock_lock. There were two possible bugs, first the vxlan device was removed from the VNI hash table per socket without holding lock. And there was a race when device is created and the workqueue could run after deletion. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 0ba1e7edbb1b..a5ba8dd7e6be 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1767,9 +1767,15 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, static void vxlan_dellink(struct net_device *dev, struct list_head *head) { + struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); + flush_workqueue(vxlan_wq); + + spin_lock(&vn->sock_lock); hlist_del_rcu(&vxlan->hlist); + spin_unlock(&vn->sock_lock); + list_del(&vxlan->next); unregister_netdevice_queue(dev, head); } -- cgit v1.2.3 From 372675a4a9ac0a0af962d44dadeea69926ce45e0 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 18 Jul 2013 08:38:26 -0700 Subject: vxlan: unregister on namespace exit Fix memory leaks and other badness from VXLAN network namespace teardown. When network namespace is removed, all the vxlan devices should be unregistered (not closed). Signed-off-by: Stephen Hemminger Reviewed-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index a5ba8dd7e6be..f101034a297a 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1878,10 +1878,12 @@ static __net_exit void vxlan_exit_net(struct net *net) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan; + LIST_HEAD(list); rtnl_lock(); list_for_each_entry(vxlan, &vn->vxlan_list, next) - dev_close(vxlan->dev); + unregister_netdevice_queue(vxlan->dev, &list); + unregister_netdevice_many(&list); rtnl_unlock(); } -- cgit v1.2.3 From 3fc2de2faba387218bdf9dbc6b13f513ac3b060a Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 18 Jul 2013 08:40:15 -0700 Subject: vxlan: fix igmp races There are two race conditions in existing code for doing IGMP management in workqueue in vxlan. First, the vxlan_group_used function checks the list of vxlan's without any protection, and it is possible for open followed by close to occur before the igmp work queue runs. To solve these move the check into vxlan_open/stop so it is protected by RTNL. And split into two work structures so that there is no racy reference to underlying device state. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 53 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 15 deletions(-) (limited to 'drivers/net/vxlan.c') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index f101034a297a..f4c6db419ddb 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -136,7 +136,8 @@ struct vxlan_dev { u32 flags; /* VXLAN_F_* below */ struct work_struct sock_work; - struct work_struct igmp_work; + struct work_struct igmp_join; + struct work_struct igmp_leave; unsigned long age_interval; struct timer_list age_timer; @@ -736,7 +737,6 @@ static bool vxlan_snoop(struct net_device *dev, return false; } - /* See if multicast group is already in use by other ID */ static bool vxlan_group_used(struct vxlan_net *vn, __be32 remote_ip) { @@ -770,12 +770,13 @@ static void vxlan_sock_release(struct vxlan_net *vn, struct vxlan_sock *vs) queue_work(vxlan_wq, &vs->del_work); } -/* Callback to update multicast group membership. - * Scheduled when vxlan goes up/down. +/* Callback to update multicast group membership when first VNI on + * multicast asddress is brought up + * Done as workqueue because ip_mc_join_group acquires RTNL. */ -static void vxlan_igmp_work(struct work_struct *work) +static void vxlan_igmp_join(struct work_struct *work) { - struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_work); + struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join); struct vxlan_net *vn = net_generic(dev_net(vxlan->dev), vxlan_net_id); struct vxlan_sock *vs = vxlan->vn_sock; struct sock *sk = vs->sock->sk; @@ -785,10 +786,27 @@ static void vxlan_igmp_work(struct work_struct *work) }; lock_sock(sk); - if (vxlan_group_used(vn, vxlan->default_dst.remote_ip)) - ip_mc_join_group(sk, &mreq); - else - ip_mc_leave_group(sk, &mreq); + ip_mc_join_group(sk, &mreq); + release_sock(sk); + + vxlan_sock_release(vn, vs); + dev_put(vxlan->dev); +} + +/* Inverse of vxlan_igmp_join when last VNI is brought down */ +static void vxlan_igmp_leave(struct work_struct *work) +{ + struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave); + struct vxlan_net *vn = net_generic(dev_net(vxlan->dev), vxlan_net_id); + struct vxlan_sock *vs = vxlan->vn_sock; + struct sock *sk = vs->sock->sk; + struct ip_mreqn mreq = { + .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, + .imr_ifindex = vxlan->default_dst.remote_ifindex, + }; + + lock_sock(sk); + ip_mc_leave_group(sk, &mreq); release_sock(sk); vxlan_sock_release(vn, vs); @@ -1359,6 +1377,7 @@ static void vxlan_uninit(struct net_device *dev) /* Start ageing timer and join group when device is brought up */ static int vxlan_open(struct net_device *dev) { + struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_sock *vs = vxlan->vn_sock; @@ -1366,10 +1385,11 @@ static int vxlan_open(struct net_device *dev) if (!vs) return -ENOTCONN; - if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { + if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)) && + ! vxlan_group_used(vn, vxlan->default_dst.remote_ip)) { vxlan_sock_hold(vs); dev_hold(dev); - queue_work(vxlan_wq, &vxlan->igmp_work); + queue_work(vxlan_wq, &vxlan->igmp_join); } if (vxlan->age_interval) @@ -1400,13 +1420,15 @@ static void vxlan_flush(struct vxlan_dev *vxlan) /* Cleanup timer and forwarding table on shutdown */ static int vxlan_stop(struct net_device *dev) { + struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_sock *vs = vxlan->vn_sock; - if (vs && IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { + if (vs && IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)) && + ! vxlan_group_used(vn, vxlan->default_dst.remote_ip)) { vxlan_sock_hold(vs); dev_hold(dev); - queue_work(vxlan_wq, &vxlan->igmp_work); + queue_work(vxlan_wq, &vxlan->igmp_leave); } del_timer_sync(&vxlan->age_timer); @@ -1471,7 +1493,8 @@ static void vxlan_setup(struct net_device *dev) INIT_LIST_HEAD(&vxlan->next); spin_lock_init(&vxlan->hash_lock); - INIT_WORK(&vxlan->igmp_work, vxlan_igmp_work); + INIT_WORK(&vxlan->igmp_join, vxlan_igmp_join); + INIT_WORK(&vxlan->igmp_leave, vxlan_igmp_leave); INIT_WORK(&vxlan->sock_work, vxlan_sock_work); init_timer_deferrable(&vxlan->age_timer); -- cgit v1.2.3