From b7153984074e51a50dad905871b705e0d67aa147 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:09 -0700 Subject: [PATCH 01/20] vxlan: fix out of order operation on module removal If vxlan is removed with active vxlan's it would crash because rtnl_link_unregister (which calls vxlan_dellink), was invoked before unregister_pernet_device (which calls vxlan_stop). Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 284c6c00c353..d3005d3a768d 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1771,8 +1771,8 @@ late_initcall(vxlan_init_module); static void __exit vxlan_cleanup_module(void) { - rtnl_link_unregister(&vxlan_link_ops); unregister_pernet_device(&vxlan_net_ops); + rtnl_link_unregister(&vxlan_link_ops); rcu_barrier(); } module_exit(vxlan_cleanup_module); From 758c57d16adcbec3c03e85f0c9a5b4ca31f6c507 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:09 -0700 Subject: [PATCH 02/20] vxlan: fix crash from work pending on module removal Switch to using a per module work queue so that all the socket deletion callbacks are done when module is removed. 
Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index d3005d3a768d..eb94bf5812cb 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -148,6 +148,7 @@ struct vxlan_dev { /* salt for hash table */ static u32 vxlan_salt __read_mostly; +static struct workqueue_struct *vxlan_wq; /* Virtual Network hash table head */ static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id) @@ -1631,7 +1632,7 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head) if (--vs->refcnt == 0) { hlist_del_rcu(&vs->hlist); - schedule_work(&vs->del_work); + queue_work(vxlan_wq, &vs->del_work); } } @@ -1750,6 +1751,10 @@ static int __init vxlan_init_module(void) { int rc; + vxlan_wq = alloc_workqueue("vxlan", 0, 0); + if (!vxlan_wq) + return -ENOMEM; + get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); rc = register_pernet_device(&vxlan_net_ops); @@ -1765,6 +1770,7 @@ static int __init vxlan_init_module(void) out2: unregister_pernet_device(&vxlan_net_ops); out1: + destroy_workqueue(vxlan_wq); return rc; } late_initcall(vxlan_init_module); @@ -1773,6 +1779,7 @@ static void __exit vxlan_cleanup_module(void) { unregister_pernet_device(&vxlan_net_ops); rtnl_link_unregister(&vxlan_link_ops); + destroy_workqueue(vxlan_wq); rcu_barrier(); } module_exit(vxlan_cleanup_module); From 7c47cedf43a8b3086c3dcf26cbc058747ee21bec Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:10 -0700 Subject: [PATCH 03/20] vxlan: move IGMP join/leave to work queue Do join/leave from work queue to avoid lock inversion problems between normal socket and RTNL. The code comes out cleaner as well. Uses Cong Wang's suggestion to turn refcnt into a real atomic since now need to handle case where last use of socket is IGMP worker. 
Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 107 +++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 65 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index eb94bf5812cb..b061c98474ee 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -85,7 +85,7 @@ struct vxlan_sock { struct hlist_node hlist; struct rcu_head rcu; struct work_struct del_work; - unsigned int refcnt; + atomic_t refcnt; struct socket *sock; struct hlist_head vni_list[VNI_HASH_SIZE]; }; @@ -131,6 +131,7 @@ struct vxlan_dev { __u8 ttl; u32 flags; /* VXLAN_F_* below */ + struct work_struct igmp_work; unsigned long age_interval; struct timer_list age_timer; spinlock_t hash_lock; @@ -648,76 +649,58 @@ static bool vxlan_snoop(struct net_device *dev, /* See if multicast group is already in use by other ID */ -static bool vxlan_group_used(struct vxlan_net *vn, - const struct vxlan_dev *this) +static bool vxlan_group_used(struct vxlan_net *vn, __be32 remote_ip) { struct vxlan_dev *vxlan; list_for_each_entry(vxlan, &vn->vxlan_list, next) { - if (vxlan == this) - continue; - if (!netif_running(vxlan->dev)) continue; - if (vxlan->default_dst.remote_ip == this->default_dst.remote_ip) + if (vxlan->default_dst.remote_ip == remote_ip) return true; } return false; } -/* kernel equivalent to IP_ADD_MEMBERSHIP */ -static int vxlan_join_group(struct net_device *dev) +static void vxlan_sock_hold(struct vxlan_sock *vs) { - struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); - struct sock *sk = vxlan->vn_sock->sock->sk; - struct ip_mreqn mreq = { - .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, - .imr_ifindex = vxlan->default_dst.remote_ifindex, - }; - int err; - - /* Already a member of group */ - if (vxlan_group_used(vn, vxlan)) - return 0; - - /* Need to drop RTNL to call multicast join */ - rtnl_unlock(); - lock_sock(sk); - err = ip_mc_join_group(sk, &mreq); - release_sock(sk); - 
rtnl_lock(); - - return err; + atomic_inc(&vs->refcnt); } - -/* kernel equivalent to IP_DROP_MEMBERSHIP */ -static int vxlan_leave_group(struct net_device *dev) +static void vxlan_sock_release(struct vxlan_sock *vs) { - struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); - int err = 0; - struct sock *sk = vxlan->vn_sock->sock->sk; + if (!atomic_dec_and_test(&vs->refcnt)) + return; + + hlist_del_rcu(&vs->hlist); + queue_work(vxlan_wq, &vs->del_work); +} + +/* Callback to update multicast group membership. + * Scheduled when vxlan goes up/down. + */ +static void vxlan_igmp_work(struct work_struct *work) +{ + struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_work); + struct vxlan_net *vn = net_generic(dev_net(vxlan->dev), vxlan_net_id); + struct vxlan_sock *vs = vxlan->vn_sock; + struct sock *sk = vs->sock->sk; struct ip_mreqn mreq = { .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, .imr_ifindex = vxlan->default_dst.remote_ifindex, }; - /* Only leave group when last vxlan is done. 
*/ - if (vxlan_group_used(vn, vxlan)) - return 0; - - /* Need to drop RTNL to call multicast leave */ - rtnl_unlock(); lock_sock(sk); - err = ip_mc_leave_group(sk, &mreq); + if (vxlan_group_used(vn, vxlan->default_dst.remote_ip)) + ip_mc_join_group(sk, &mreq); + else + ip_mc_leave_group(sk, &mreq); release_sock(sk); - rtnl_lock(); - return err; + vxlan_sock_release(vs); + dev_put(vxlan->dev); } /* Callback from net/ipv4/udp.c to receive packets */ @@ -1249,12 +1232,11 @@ static int vxlan_init(struct net_device *dev) static int vxlan_open(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - int err; if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { - err = vxlan_join_group(dev); - if (err) - return err; + vxlan_sock_hold(vxlan->vn_sock); + dev_hold(dev); + queue_work(vxlan_wq, &vxlan->igmp_work); } if (vxlan->age_interval) @@ -1285,8 +1267,11 @@ static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) - vxlan_leave_group(dev); + if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { + vxlan_sock_hold(vxlan->vn_sock); + dev_hold(dev); + queue_work(vxlan_wq, &vxlan->igmp_work); + } del_timer_sync(&vxlan->age_timer); @@ -1355,6 +1340,7 @@ static void vxlan_setup(struct net_device *dev) INIT_LIST_HEAD(&vxlan->next); spin_lock_init(&vxlan->hash_lock); + INIT_WORK(&vxlan->igmp_work, vxlan_igmp_work); init_timer_deferrable(&vxlan->age_timer); vxlan->age_timer.function = vxlan_cleanup; @@ -1498,8 +1484,8 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) udp_sk(sk)->encap_type = 1; udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; udp_encap_enable(); + atomic_set(&vs->refcnt, 1); - vs->refcnt = 1; return vs; } @@ -1589,7 +1575,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, vs = vxlan_find_port(net, vxlan->dst_port); if (vs) - ++vs->refcnt; + atomic_inc(&vs->refcnt); else { /* Drop lock because socket create 
acquires RTNL lock */ rtnl_unlock(); @@ -1606,12 +1592,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, err = register_netdevice(dev); if (err) { - if (--vs->refcnt == 0) { - rtnl_unlock(); - sk_release_kernel(vs->sock->sk); - kfree(vs); - rtnl_lock(); - } + vxlan_sock_release(vs); return err; } @@ -1629,11 +1610,7 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head) hlist_del_rcu(&vxlan->hlist); list_del(&vxlan->next); unregister_netdevice_queue(dev, head); - - if (--vs->refcnt == 0) { - hlist_del_rcu(&vs->hlist); - queue_work(vxlan_wq, &vs->del_work); - } + vxlan_sock_release(vs); } static size_t vxlan_get_size(const struct net_device *dev) From 8385f50a03a8ad3d2c6d76b1117c959261ab7a1c Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:10 -0700 Subject: [PATCH 04/20] vxlan: send notification when MAC migrates When learned entry migrates to another IP send a notification that entry has changed. Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index b061c98474ee..1f2aa26550e9 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -629,6 +629,7 @@ static bool vxlan_snoop(struct net_device *dev, f->remote.remote_ip = src_ip; f->updated = jiffies; + vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH); } else { /* learned new entry */ spin_lock(&vxlan->hash_lock); From 1c51a9159ddefa5119724a4c7da3fd3ef44b68d5 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:11 -0700 Subject: [PATCH 05/20] vxlan: fix race caused by dropping rtnl_unlock It is possible for two CPUs to race creating vxlan device. For most cases this is harmless, but the ability to assign "next available vxlan device" relies on rtnl lock being held across the whole operation. Therefore two instances of calling: ip li add vxlan%d vxlan ... could collide and create two devices with same name. 
To fix this defer creation of socket to a work queue, and handle possible races there. Introduce a lock to ensure that changes to vxlan socket hash list is SMP safe. Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 111 +++++++++++++++++++++++++++++++++----------- 1 file changed, 84 insertions(+), 27 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 1f2aa26550e9..71da8be98801 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -94,6 +94,7 @@ struct vxlan_sock { struct vxlan_net { struct list_head vxlan_list; struct hlist_head sock_list[PORT_HASH_SIZE]; + spinlock_t sock_lock; }; struct vxlan_rdst { @@ -131,7 +132,9 @@ struct vxlan_dev { __u8 ttl; u32 flags; /* VXLAN_F_* below */ + struct work_struct sock_work; struct work_struct igmp_work; + unsigned long age_interval; struct timer_list age_timer; spinlock_t hash_lock; @@ -151,6 +154,8 @@ struct vxlan_dev { static u32 vxlan_salt __read_mostly; static struct workqueue_struct *vxlan_wq; +static void vxlan_sock_work(struct work_struct *work); + /* Virtual Network hash table head */ static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id) { @@ -670,12 +675,15 @@ static void vxlan_sock_hold(struct vxlan_sock *vs) atomic_inc(&vs->refcnt); } -static void vxlan_sock_release(struct vxlan_sock *vs) +static void vxlan_sock_release(struct vxlan_net *vn, struct vxlan_sock *vs) { if (!atomic_dec_and_test(&vs->refcnt)) return; + spin_lock(&vn->sock_lock); hlist_del_rcu(&vs->hlist); + spin_unlock(&vn->sock_lock); + queue_work(vxlan_wq, &vs->del_work); } @@ -700,7 +708,7 @@ static void vxlan_igmp_work(struct work_struct *work) ip_mc_leave_group(sk, &mreq); release_sock(sk); - vxlan_sock_release(vs); + vxlan_sock_release(vn, vs); dev_put(vxlan->dev); } @@ -1222,10 +1230,29 @@ static void vxlan_cleanup(unsigned long arg) /* Setup stats when device is created */ static int vxlan_init(struct net_device *dev) { + struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_net *vn 
= net_generic(dev_net(dev), vxlan_net_id); + struct vxlan_sock *vs; + __u32 vni = vxlan->default_dst.remote_vni; + dev->tstats = alloc_percpu(struct pcpu_tstats); if (!dev->tstats) return -ENOMEM; + spin_lock(&vn->sock_lock); + vs = vxlan_find_port(dev_net(dev), vxlan->dst_port); + if (vs) { + /* If we have a socket with same port already, reuse it */ + atomic_inc(&vs->refcnt); + vxlan->vn_sock = vs; + hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni)); + } else { + /* otherwise make new socket outside of RTNL */ + dev_hold(dev); + queue_work(vxlan_wq, &vxlan->sock_work); + } + spin_unlock(&vn->sock_lock); + return 0; } @@ -1233,9 +1260,14 @@ static int vxlan_init(struct net_device *dev) static int vxlan_open(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_sock *vs = vxlan->vn_sock; + + /* socket hasn't been created */ + if (!vs) + return -ENOTCONN; if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { - vxlan_sock_hold(vxlan->vn_sock); + vxlan_sock_hold(vs); dev_hold(dev); queue_work(vxlan_wq, &vxlan->igmp_work); } @@ -1267,9 +1299,10 @@ static void vxlan_flush(struct vxlan_dev *vxlan) static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_sock *vs = vxlan->vn_sock; - if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { - vxlan_sock_hold(vxlan->vn_sock); + if (vs && IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { + vxlan_sock_hold(vs); dev_hold(dev); queue_work(vxlan_wq, &vxlan->igmp_work); } @@ -1342,6 +1375,7 @@ static void vxlan_setup(struct net_device *dev) INIT_LIST_HEAD(&vxlan->next); spin_lock_init(&vxlan->hash_lock); INIT_WORK(&vxlan->igmp_work, vxlan_igmp_work); + INIT_WORK(&vxlan->sock_work, vxlan_sock_work); init_timer_deferrable(&vxlan->age_timer); vxlan->age_timer.function = vxlan_cleanup; @@ -1433,7 +1467,6 @@ static void vxlan_del_work(struct work_struct *work) kfree_rcu(vs, rcu); } -/* Create new listen socket if needed */ static struct vxlan_sock 
*vxlan_socket_create(struct net *net, __be16 port) { struct vxlan_sock *vs; @@ -1490,13 +1523,52 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) return vs; } +/* Scheduled at device creation to bind to a socket */ +static void vxlan_sock_work(struct work_struct *work) +{ + struct vxlan_dev *vxlan + = container_of(work, struct vxlan_dev, sock_work); + struct net_device *dev = vxlan->dev; + struct net *net = dev_net(dev); + __u32 vni = vxlan->default_dst.remote_vni; + __be16 port = vxlan->dst_port; + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_sock *nvs, *ovs; + + nvs = vxlan_socket_create(net, port); + if (IS_ERR(nvs)) { + netdev_err(vxlan->dev, "Can not create UDP socket, %ld\n", + PTR_ERR(nvs)); + goto out; + } + + spin_lock(&vn->sock_lock); + /* Look again to see if can reuse socket */ + ovs = vxlan_find_port(net, port); + if (ovs) { + atomic_inc(&ovs->refcnt); + vxlan->vn_sock = ovs; + hlist_add_head_rcu(&vxlan->hlist, vni_head(ovs, vni)); + spin_unlock(&vn->sock_lock); + + sk_release_kernel(nvs->sock->sk); + kfree(nvs); + } else { + vxlan->vn_sock = nvs; + hlist_add_head_rcu(&nvs->hlist, vs_head(net, port)); + hlist_add_head_rcu(&vxlan->hlist, vni_head(nvs, vni)); + spin_unlock(&vn->sock_lock); + } +out: + dev_put(dev); +} + static int vxlan_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; - struct vxlan_sock *vs; __u32 vni; int err; @@ -1574,31 +1646,13 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, return -EEXIST; } - vs = vxlan_find_port(net, vxlan->dst_port); - if (vs) - atomic_inc(&vs->refcnt); - else { - /* Drop lock because socket create acquires RTNL lock */ - rtnl_unlock(); - vs = vxlan_socket_create(net, vxlan->dst_port); - rtnl_lock(); - if (IS_ERR(vs)) - return PTR_ERR(vs); - - 
hlist_add_head_rcu(&vs->hlist, vs_head(net, vxlan->dst_port)); - } - vxlan->vn_sock = vs; - SET_ETHTOOL_OPS(dev, &vxlan_ethtool_ops); err = register_netdevice(dev); - if (err) { - vxlan_sock_release(vs); + if (err) return err; - } list_add(&vxlan->next, &vn->vxlan_list); - hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni)); return 0; } @@ -1606,12 +1660,14 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, static void vxlan_dellink(struct net_device *dev, struct list_head *head) { struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_sock *vs = vxlan->vn_sock; hlist_del_rcu(&vxlan->hlist); list_del(&vxlan->next); unregister_netdevice_queue(dev, head); - vxlan_sock_release(vs); + if (vs) + vxlan_sock_release(vn, vs); } static size_t vxlan_get_size(const struct net_device *dev) @@ -1700,6 +1756,7 @@ static __net_init int vxlan_init_net(struct net *net) unsigned int h; INIT_LIST_HEAD(&vn->vxlan_list); + spin_lock_init(&vn->sock_lock); for (h = 0; h < PORT_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vn->sock_list[h]); From ebf4063e869d959daf75efb4ef1c7bc80dcd4800 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:11 -0700 Subject: [PATCH 06/20] vxlan: move cleanup to uninit Put destruction of per-cpu statistics in ndo_uninit since they are created by ndo_init. This also avoids any problems that might be caused by the destructor being called after the module is removed. 
Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 71da8be98801..500f9ce437ec 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1256,6 +1256,17 @@ static int vxlan_init(struct net_device *dev) return 0; } +static void vxlan_uninit(struct net_device *dev) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); + struct vxlan_sock *vs = vxlan->vn_sock; + + if (vs) + vxlan_sock_release(vn, vs); + free_percpu(dev->tstats); +} + /* Start ageing timer and join group when device is brought up */ static int vxlan_open(struct net_device *dev) { @@ -1321,6 +1332,7 @@ static void vxlan_set_multicast_list(struct net_device *dev) static const struct net_device_ops vxlan_netdev_ops = { .ndo_init = vxlan_init, + .ndo_uninit = vxlan_uninit, .ndo_open = vxlan_open, .ndo_stop = vxlan_stop, .ndo_start_xmit = vxlan_xmit, @@ -1339,12 +1351,6 @@ static struct device_type vxlan_type = { .name = "vxlan", }; -static void vxlan_free(struct net_device *dev) -{ - free_percpu(dev->tstats); - free_netdev(dev); -} - /* Initialize the device structure. 
*/ static void vxlan_setup(struct net_device *dev) { @@ -1357,7 +1363,7 @@ static void vxlan_setup(struct net_device *dev) dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM; dev->netdev_ops = &vxlan_netdev_ops; - dev->destructor = vxlan_free; + dev->destructor = free_netdev; SET_NETDEV_DEVTYPE(dev, &vxlan_type); dev->tx_queue_len = 0; @@ -1660,14 +1666,10 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, static void vxlan_dellink(struct net_device *dev, struct list_head *head) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); - struct vxlan_sock *vs = vxlan->vn_sock; hlist_del_rcu(&vxlan->hlist); list_del(&vxlan->next); unregister_netdevice_queue(dev, head); - if (vs) - vxlan_sock_release(vn, vs); } static size_t vxlan_get_size(const struct net_device *dev) From 4ad169300a7350a034b86c543070aed109882a86 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:11 -0700 Subject: [PATCH 07/20] vxlan: make vxlan_xmit_one void The function vxlan_xmit_one always returns NETDEV_TX_OK, so there is no point in keeping track of return values etc. 
Signed-off-by: Stephen Hemminger Acked-by: David L Stevens --- drivers/net/vxlan.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 500f9ce437ec..e65241c3d176 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1008,8 +1008,8 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, } } -static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, - struct vxlan_rdst *rdst, bool did_rsc) +static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, + struct vxlan_rdst *rdst, bool did_rsc) { struct vxlan_dev *vxlan = netdev_priv(dev); struct rtable *rt; @@ -1032,7 +1032,7 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (did_rsc) { /* short-circuited back to local bridge */ vxlan_encap_bypass(skb, vxlan, vxlan); - return NETDEV_TX_OK; + return; } goto drop; } @@ -1088,7 +1088,7 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (!dst_vxlan) goto tx_error; vxlan_encap_bypass(skb, vxlan, dst_vxlan); - return NETDEV_TX_OK; + return; } vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_FLAGS); @@ -1116,7 +1116,7 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, IPPROTO_UDP, tos, ttl, df); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); - return NETDEV_TX_OK; + return; drop: dev->stats.tx_dropped++; @@ -1126,7 +1126,6 @@ tx_error: dev->stats.tx_errors++; tx_free: dev_kfree_skb(skb); - return NETDEV_TX_OK; } /* Transmit local packets over Vxlan @@ -1142,7 +1141,6 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) bool did_rsc = false; struct vxlan_rdst *rdst0, *rdst; struct vxlan_fdb *f; - int rc1, rc; skb_reset_mac_header(skb); eth = eth_hdr(skb); @@ -1170,24 +1168,18 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } else 
rdst0 = &f->remote; - rc = NETDEV_TX_OK; /* if there are multiple destinations, send copies */ for (rdst = rdst0->remote_next; rdst; rdst = rdst->remote_next) { struct sk_buff *skb1; skb1 = skb_clone(skb, GFP_ATOMIC); - if (skb1) { - rc1 = vxlan_xmit_one(skb1, dev, rdst, did_rsc); - if (rc == NETDEV_TX_OK) - rc = rc1; - } + if (skb1) + vxlan_xmit_one(skb1, dev, rdst, did_rsc); } - rc1 = vxlan_xmit_one(skb, dev, rdst0, did_rsc); - if (rc == NETDEV_TX_OK) - rc = rc1; - return rc; + vxlan_xmit_one(skb, dev, rdst0, did_rsc); + return NETDEV_TX_OK; } /* Walk the forwarding table and purge stale entries */ From 3e61aa8f0a68e6e007c223688f442be04a44b0f4 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:12 -0700 Subject: [PATCH 08/20] vxlan: convert remotes list to list_rcu Based on initial work by Mike Rapoport Use list macros and RCU for tracking multiple remotes. Note: this code assumes list always has at least one entry, because delete is not supported. Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 97 +++++++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 42 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index e65241c3d176..117b7fa6f33b 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -102,7 +102,7 @@ struct vxlan_rdst { __be16 remote_port; u32 remote_vni; u32 remote_ifindex; - struct vxlan_rdst *remote_next; + struct list_head list; }; /* Forwarding table entry */ @@ -111,7 +111,7 @@ struct vxlan_fdb { struct rcu_head rcu; unsigned long updated; /* jiffies */ unsigned long used; - struct vxlan_rdst remote; + struct list_head remotes; u16 state; /* see ndm_state */ u8 flags; /* see ndm_flags */ u8 eth_addr[ETH_ALEN]; @@ -170,6 +170,14 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port) return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; } +/* First remote destination for a forwarding entry. 
+ * Guaranteed to be non-NULL because remotes are never deleted. + */ +static inline struct vxlan_rdst *first_remote(struct vxlan_fdb *fdb) +{ + return list_first_or_null_rcu(&fdb->remotes, struct vxlan_rdst, list); +} + /* Find VXLAN socket based on network namespace and UDP port */ static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port) { @@ -275,7 +283,7 @@ static inline size_t vxlan_nlmsg_size(void) } static void vxlan_fdb_notify(struct vxlan_dev *vxlan, - const struct vxlan_fdb *fdb, int type) + struct vxlan_fdb *fdb, int type) { struct net *net = dev_net(vxlan->dev); struct sk_buff *skb; @@ -285,7 +293,7 @@ static void vxlan_fdb_notify(struct vxlan_dev *vxlan, if (skb == NULL) goto errout; - err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, &fdb->remote); + err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, first_remote(fdb)); if (err < 0) { /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -304,11 +312,16 @@ static void vxlan_ip_miss(struct net_device *dev, __be32 ipa) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb f; + struct vxlan_rdst remote; memset(&f, 0, sizeof f); f.state = NUD_STALE; - f.remote.remote_ip = ipa; /* goes to NDA_DST */ - f.remote.remote_vni = VXLAN_N_VID; + + remote.remote_ip = ipa; /* goes to NDA_DST */ + remote.remote_vni = VXLAN_N_VID; + + INIT_LIST_HEAD(&f.remotes); + list_add_rcu(&remote.list, &f.remotes); vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH); } @@ -318,6 +331,7 @@ static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) struct vxlan_fdb f; memset(&f, 0, sizeof f); + INIT_LIST_HEAD(&f.remotes); f.state = NUD_STALE; memcpy(f.eth_addr, eth_addr, ETH_ALEN); @@ -377,17 +391,17 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, static int vxlan_fdb_append(struct vxlan_fdb *f, __be32 ip, __be16 port, __u32 vni, __u32 ifindex) { - struct vxlan_rdst *rd_prev, *rd; + struct vxlan_rdst *rd; - rd_prev = NULL; - for (rd = &f->remote; 
rd; rd = rd->remote_next) { + /* protected by vxlan->hash_lock */ + list_for_each_entry(rd, &f->remotes, list) { if (rd->remote_ip == ip && rd->remote_port == port && rd->remote_vni == vni && rd->remote_ifindex == ifindex) return 0; - rd_prev = rd; } + rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) return -ENOBUFS; @@ -395,8 +409,9 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, rd->remote_port = port; rd->remote_vni = vni; rd->remote_ifindex = ifindex; - rd->remote_next = NULL; - rd_prev->remote_next = rd; + + list_add_tail_rcu(&rd->list, &f->remotes); + return 1; } @@ -448,16 +463,14 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, return -ENOMEM; notify = 1; - f->remote.remote_ip = ip; - f->remote.remote_port = port; - f->remote.remote_vni = vni; - f->remote.remote_ifindex = ifindex; - f->remote.remote_next = NULL; f->state = state; f->flags = ndm_flags; f->updated = f->used = jiffies; + INIT_LIST_HEAD(&f->remotes); memcpy(f->eth_addr, mac, ETH_ALEN); + vxlan_fdb_append(f, ip, port, vni, ifindex); + ++vxlan->addrcnt; hlist_add_head_rcu(&f->hlist, vxlan_fdb_head(vxlan, mac)); @@ -472,13 +485,10 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, static void vxlan_fdb_free(struct rcu_head *head) { struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu); + struct vxlan_rdst *rd, *nd; - while (f->remote.remote_next) { - struct vxlan_rdst *rd = f->remote.remote_next; - - f->remote.remote_next = rd->remote_next; + list_for_each_entry_safe(rd, nd, &f->remotes, list) kfree(rd); - } kfree(f); } @@ -588,23 +598,24 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { struct vxlan_rdst *rd; - for (rd = &f->remote; rd; rd = rd->remote_next) { - if (idx < cb->args[0]) - goto skip; + if (idx < cb->args[0]) + goto skip; + + list_for_each_entry_rcu(rd, &f->remotes, list) { err = vxlan_fdb_info(skb, vxlan, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 
RTM_NEWNEIGH, NLM_F_MULTI, rd); if (err < 0) - break; -skip: - ++idx; + goto out; } +skip: + ++idx; } } - +out: return idx; } @@ -620,7 +631,9 @@ static bool vxlan_snoop(struct net_device *dev, f = vxlan_find_mac(vxlan, src_mac); if (likely(f)) { - if (likely(f->remote.remote_ip == src_ip)) + struct vxlan_rdst *rdst = first_remote(f); + + if (likely(rdst->remote_ip == src_ip)) return false; /* Don't migrate static entries, drop packets */ @@ -630,9 +643,9 @@ static bool vxlan_snoop(struct net_device *dev, if (net_ratelimit()) netdev_info(dev, "%pM migrated from %pI4 to %pI4\n", - src_mac, &f->remote.remote_ip, &src_ip); + src_mac, &rdst->remote_ip, &src_ip); - f->remote.remote_ip = src_ip; + rdst->remote_ip = src_ip; f->updated = jiffies; vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH); } else { @@ -866,7 +879,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb) } f = vxlan_find_mac(vxlan, n->ha); - if (f && f->remote.remote_ip == htonl(INADDR_ANY)) { + if (f && first_remote(f)->remote_ip == htonl(INADDR_ANY)) { /* bridge-local neighbor */ neigh_release(n); goto out; @@ -1165,17 +1178,17 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) (vxlan->flags & VXLAN_F_L2MISS) && !is_multicast_ether_addr(eth->h_dest)) vxlan_fdb_miss(vxlan, eth->h_dest); - } else - rdst0 = &f->remote; + } else { + rdst = rdst0 = first_remote(f); + /* if there are multiple destinations, send copies */ + list_for_each_entry_continue_rcu(rdst, &f->remotes, list) { + struct sk_buff *skb1; - /* if there are multiple destinations, send copies */ - for (rdst = rdst0->remote_next; rdst; rdst = rdst->remote_next) { - struct sk_buff *skb1; - - skb1 = skb_clone(skb, GFP_ATOMIC); - if (skb1) - vxlan_xmit_one(skb1, dev, rdst, did_rsc); + skb1 = skb_clone(skb, GFP_ATOMIC); + if (skb1) + vxlan_xmit_one(skb1, dev, rdst, did_rsc); + } } vxlan_xmit_one(skb, dev, rdst0, did_rsc); From 9daaa397b3e18282715eeb0d7be79ea5bbadc119 Mon Sep 17 00:00:00 2001 From: Stephen 
Hemminger Date: Mon, 17 Jun 2013 14:16:12 -0700 Subject: [PATCH 09/20] vxlan: port module param should be ushort UDP ports are limited to 16 bits. Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 117b7fa6f33b..f89a58bb3f26 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -70,8 +70,8 @@ struct vxlanhdr { * The IANA assigned port is 4789, but the Linux default is 8472 * for compatability with early adopters. */ -static unsigned int vxlan_port __read_mostly = 8472; -module_param_named(udp_port, vxlan_port, uint, 0444); +static unsigned short vxlan_port __read_mostly = 8472; +module_param_named(udp_port, vxlan_port, ushort, 0444); MODULE_PARM_DESC(udp_port, "Destination UDP port"); static bool log_ecn_error = true; From bb3fd6878a983f36c994bfbd71b01b2625ddf52b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:40 -0700 Subject: [PATCH 10/20] vxlan: Use initializer for dummy structures For the notification code, a couple of places build fdb entries on the stack, use structure initialization instead and fix formatting. 
Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index f89a58bb3f26..d2b9ab79c9ae 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -311,14 +311,13 @@ errout: static void vxlan_ip_miss(struct net_device *dev, __be32 ipa) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_fdb f; - struct vxlan_rdst remote; - - memset(&f, 0, sizeof f); - f.state = NUD_STALE; - - remote.remote_ip = ipa; /* goes to NDA_DST */ - remote.remote_vni = VXLAN_N_VID; + struct vxlan_fdb f = { + .state = NUD_STALE, + }; + struct vxlan_rdst remote = { + .remote_ip = ipa, /* goes to NDA_DST */ + .remote_vni = VXLAN_N_VID, + }; INIT_LIST_HEAD(&f.remotes); list_add_rcu(&remote.list, &f.remotes); @@ -328,11 +327,11 @@ static void vxlan_ip_miss(struct net_device *dev, __be32 ipa) static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) { - struct vxlan_fdb f; + struct vxlan_fdb f = { + .state = NUD_STALE, + }; - memset(&f, 0, sizeof f); INIT_LIST_HEAD(&f.remotes); - f.state = NUD_STALE; memcpy(f.eth_addr, eth_addr, ETH_ALEN); vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH); @@ -1485,6 +1484,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) struct sockaddr_in vxlan_addr = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), + .sin_port = port, }; int rc; unsigned int h; @@ -1510,8 +1510,6 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) sk = vs->sock->sk; sk_change_net(sk, net); - vxlan_addr.sin_port = port; - rc = kernel_bind(vs->sock, (struct sockaddr *) &vxlan_addr, sizeof(vxlan_addr)); if (rc < 0) { From 234f5b73794435f065c5fb13371415fe46956a4b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 17 Jun 2013 14:16:41 -0700 Subject: [PATCH 11/20] vxlan: cosmetic cleanup's Fix whitespace and spelling Signed-off-by: Stephen Hemminger Acked-by: 
David L Stevens --- drivers/net/vxlan.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index d2b9ab79c9ae..3b03cd4bdf37 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -68,7 +68,7 @@ struct vxlanhdr { /* UDP port for VXLAN traffic. * The IANA assigned port is 4789, but the Linux default is 8472 - * for compatability with early adopters. + * for compatibility with early adopters. */ static unsigned short vxlan_port __read_mostly = 8472; module_param_named(udp_port, vxlan_port, ushort, 0444); @@ -210,9 +210,9 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port) /* Fill in neighbour message in skbuff. */ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, - const struct vxlan_fdb *fdb, - u32 portid, u32 seq, int type, unsigned int flags, - const struct vxlan_rdst *rdst) + const struct vxlan_fdb *fdb, + u32 portid, u32 seq, int type, unsigned int flags, + const struct vxlan_rdst *rdst) { unsigned long now = jiffies; struct nda_cacheinfo ci; @@ -1031,7 +1031,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4; __be32 dst; __be16 src_port, dst_port; - u32 vni; + u32 vni; __be16 df = 0; __u8 tos, ttl; int err; From 60d9d4c6dbd1bad80fb9a77775fc704302a563c9 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 20 Jun 2013 00:26:31 -0700 Subject: [PATCH 12/20] vxlan: Fix sparse warnings. Fix the following sparse warnings. 
drivers/net/vxlan.c:238:44: warning: incorrect type in argument 3 (different base types) drivers/net/vxlan.c:238:44: expected restricted __be32 [usertype] value drivers/net/vxlan.c:238:44: got unsigned int const [unsigned] [usertype] remote_vni drivers/net/vxlan.c:1735:18: warning: incorrect type in initializer (different signedness) drivers/net/vxlan.c:1735:18: expected int *id drivers/net/vxlan.c:1735:18: got unsigned int static [toplevel] * Signed-off-by: Pravin B Shelar Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3b03cd4bdf37..212a25601fa6 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -78,7 +78,7 @@ static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); -static unsigned int vxlan_net_id; +static int vxlan_net_id; /* per UDP socket information */ struct vxlan_sock { @@ -250,7 +250,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, nla_put_be16(skb, NDA_PORT, rdst->remote_port)) goto nla_put_failure; if (rdst->remote_vni != vxlan->default_dst.remote_vni && - nla_put_be32(skb, NDA_VNI, rdst->remote_vni)) + nla_put_u32(skb, NDA_VNI, rdst->remote_vni)) goto nla_put_failure; if (rdst->remote_ifindex && nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) From afbd8bae9c798c5cdbe4439d3a50536b5438247c Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 25 Jun 2013 16:01:51 +0300 Subject: [PATCH 13/20] vxlan: add implicit fdb entry for default destination Signed-off-by: Mike Rapoport Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 68 ++++++++++++++++++++++++++++++++------------- 1 file changed, 49 insertions(+), 19 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 212a25601fa6..bdfe46e50c49 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -80,6 +80,8 @@ 
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static int vxlan_net_id; +static const u8 all_zeros_mac[ETH_ALEN]; + /* per UDP socket information */ struct vxlan_sock { struct hlist_node hlist; @@ -1151,7 +1153,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); struct ethhdr *eth; bool did_rsc = false; - struct vxlan_rdst *rdst0, *rdst; + struct vxlan_rdst *rdst; struct vxlan_fdb *f; skb_reset_mac_header(skb); @@ -1171,26 +1173,27 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } if (f == NULL) { - rdst0 = &vxlan->default_dst; + f = vxlan_find_mac(vxlan, all_zeros_mac); + if (f == NULL) { + if ((vxlan->flags & VXLAN_F_L2MISS) && + !is_multicast_ether_addr(eth->h_dest)) + vxlan_fdb_miss(vxlan, eth->h_dest); - if (rdst0->remote_ip == htonl(INADDR_ANY) && - (vxlan->flags & VXLAN_F_L2MISS) && - !is_multicast_ether_addr(eth->h_dest)) - vxlan_fdb_miss(vxlan, eth->h_dest); - } else { - rdst = rdst0 = first_remote(f); - - /* if there are multiple destinations, send copies */ - list_for_each_entry_continue_rcu(rdst, &f->remotes, list) { - struct sk_buff *skb1; - - skb1 = skb_clone(skb, GFP_ATOMIC); - if (skb1) - vxlan_xmit_one(skb1, dev, rdst, did_rsc); + dev->stats.tx_dropped++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; } } - vxlan_xmit_one(skb, dev, rdst0, did_rsc); + list_for_each_entry_rcu(rdst, &f->remotes, list) { + struct sk_buff *skb1; + + skb1 = skb_clone(skb, GFP_ATOMIC); + if (skb1) + vxlan_xmit_one(skb1, dev, rdst, did_rsc); + } + + dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -1260,12 +1263,25 @@ static int vxlan_init(struct net_device *dev) return 0; } +static void vxlan_fdb_delete_defualt(struct vxlan_dev *vxlan) +{ + struct vxlan_fdb *f; + + spin_lock_bh(&vxlan->hash_lock); + f = __vxlan_find_mac(vxlan, all_zeros_mac); + if (f) + vxlan_fdb_destroy(vxlan, f); + spin_unlock_bh(&vxlan->hash_lock); +} + static void 
vxlan_uninit(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_sock *vs = vxlan->vn_sock; + vxlan_fdb_delete_defualt(vxlan); + if (vs) vxlan_sock_release(vn, vs); free_percpu(dev->tstats); @@ -1304,7 +1320,9 @@ static void vxlan_flush(struct vxlan_dev *vxlan) hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); - vxlan_fdb_destroy(vxlan, f); + /* the all_zeros_mac entry is deleted at vxlan_uninit */ + if (!is_zero_ether_addr(f->eth_addr)) + vxlan_fdb_destroy(vxlan, f); } } spin_unlock_bh(&vxlan->hash_lock); @@ -1657,10 +1675,22 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, SET_ETHTOOL_OPS(dev, &vxlan_ethtool_ops); - err = register_netdevice(dev); + /* create an fdb entry for default destination */ + err = vxlan_fdb_create(vxlan, all_zeros_mac, + vxlan->default_dst.remote_ip, + NUD_REACHABLE|NUD_PERMANENT, + NLM_F_EXCL|NLM_F_CREATE, + vxlan->dst_port, vxlan->default_dst.remote_vni, + vxlan->default_dst.remote_ifindex, NTF_SELF); if (err) return err; + err = register_netdevice(dev); + if (err) { + vxlan_fdb_delete_defualt(vxlan); + return err; + } + list_add(&vxlan->next, &vn->vxlan_list); return 0; From a5e7c10a7ec244f272703f36f339c967efe1fc0d Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 25 Jun 2013 16:01:52 +0300 Subject: [PATCH 14/20] vxlan: introduce vxlan_fdb_find_rdst which will be reused by vxlan_fdb_delete Signed-off-by: Mike Rapoport Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index bdfe46e50c49..306bd94efa89 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -388,20 +388,33 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, return f; } +/* caller should hold vxlan->hash_lock */ +static struct 
vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, + __be32 ip, __be16 port, + __u32 vni, __u32 ifindex) +{ + struct vxlan_rdst *rd; + + list_for_each_entry(rd, &f->remotes, list) { + if (rd->remote_ip == ip && + rd->remote_port == port && + rd->remote_vni == vni && + rd->remote_ifindex == ifindex) + return rd; + } + + return NULL; +} + /* Add/update destinations for multicast */ static int vxlan_fdb_append(struct vxlan_fdb *f, __be32 ip, __be16 port, __u32 vni, __u32 ifindex) { struct vxlan_rdst *rd; - /* protected by vxlan->hash_lock */ - list_for_each_entry(rd, &f->remotes, list) { - if (rd->remote_ip == ip && - rd->remote_port == port && - rd->remote_vni == vni && - rd->remote_ifindex == ifindex) - return 0; - } + rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); + if (rd) + return 0; rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) From f0b074be7b61a800e39053d73dabf850649c1c8f Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 25 Jun 2013 16:01:53 +0300 Subject: [PATCH 15/20] vxlan: introduce vxlan_fdb_parse which will be reused by vxlan_fdb_delete Signed-off-by: Mike Rapoport Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 83 ++++++++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 32 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 306bd94efa89..ee7cc71e57fd 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -518,13 +518,60 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) call_rcu(&f->rcu, vxlan_fdb_free); } +static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, + __be32 *ip, __be16 *port, u32 *vni, u32 *ifindex) +{ + struct net *net = dev_net(vxlan->dev); + + if (tb[NDA_DST]) { + if (nla_len(tb[NDA_DST]) != sizeof(__be32)) + return -EAFNOSUPPORT; + + *ip = nla_get_be32(tb[NDA_DST]); + } else { + *ip = htonl(INADDR_ANY); + } + + if (tb[NDA_PORT]) { + if (nla_len(tb[NDA_PORT]) != sizeof(__be16)) + return -EINVAL; + *port = 
nla_get_be16(tb[NDA_PORT]); + } else { + *port = vxlan->dst_port; + } + + if (tb[NDA_VNI]) { + if (nla_len(tb[NDA_VNI]) != sizeof(u32)) + return -EINVAL; + *vni = nla_get_u32(tb[NDA_VNI]); + } else { + *vni = vxlan->default_dst.remote_vni; + } + + if (tb[NDA_IFINDEX]) { + struct net_device *tdev; + + if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) + return -EINVAL; + *ifindex = nla_get_u32(tb[NDA_IFINDEX]); + tdev = dev_get_by_index(net, *ifindex); + if (!tdev) + return -EADDRNOTAVAIL; + dev_put(tdev); + } else { + *ifindex = 0; + } + + return 0; +} + /* Add static entry (via netlink) */ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 flags) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct net *net = dev_net(vxlan->dev); + /* struct net *net = dev_net(vxlan->dev); */ __be32 ip; __be16 port; u32 vni, ifindex; @@ -539,37 +586,9 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], if (tb[NDA_DST] == NULL) return -EINVAL; - if (nla_len(tb[NDA_DST]) != sizeof(__be32)) - return -EAFNOSUPPORT; - - ip = nla_get_be32(tb[NDA_DST]); - - if (tb[NDA_PORT]) { - if (nla_len(tb[NDA_PORT]) != sizeof(__be16)) - return -EINVAL; - port = nla_get_be16(tb[NDA_PORT]); - } else - port = vxlan->dst_port; - - if (tb[NDA_VNI]) { - if (nla_len(tb[NDA_VNI]) != sizeof(u32)) - return -EINVAL; - vni = nla_get_u32(tb[NDA_VNI]); - } else - vni = vxlan->default_dst.remote_vni; - - if (tb[NDA_IFINDEX]) { - struct net_device *tdev; - - if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) - return -EINVAL; - ifindex = nla_get_u32(tb[NDA_IFINDEX]); - tdev = dev_get_by_index(net, ifindex); - if (!tdev) - return -EADDRNOTAVAIL; - dev_put(tdev); - } else - ifindex = 0; + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex); + if (err) + return err; spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags, From bc7892ba39992c6d645e906f1d52a626395b4b11 Mon Sep 17 00:00:00 2001 From: 
Mike Rapoport Date: Tue, 25 Jun 2013 16:01:54 +0300 Subject: [PATCH 16/20] vxlan: allow removal of single destination from fdb entry When the last item is deleted from the remote destinations list, the fdb entry is destroyed. Signed-off-by: Mike Rapoport Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ee7cc71e57fd..c1825201f9e2 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -105,6 +105,7 @@ struct vxlan_rdst { u32 remote_vni; u32 remote_ifindex; struct list_head list; + struct rcu_head rcu; }; /* Forwarding table entry */ @@ -496,6 +497,12 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, return 0; } +static void vxlan_fdb_free_rdst(struct rcu_head *head) +{ + struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu); + kfree(rd); +} + static void vxlan_fdb_free(struct rcu_head *head) { struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu); @@ -605,14 +612,43 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; - int err = -ENOENT; + struct vxlan_rdst *rd = NULL; + __be32 ip; + __be16 port; + u32 vni, ifindex; + int err; + + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex); + if (err) + return err; + + err = -ENOENT; spin_lock_bh(&vxlan->hash_lock); f = vxlan_find_mac(vxlan, addr); - if (f) { - vxlan_fdb_destroy(vxlan, f); - err = 0; + if (!f) + goto out; + + if (ip != htonl(INADDR_ANY)) { + rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); + if (!rd) + goto out; } + + err = 0; + + /* remove a destination if it's not the only one on the list, + * otherwise destroy the fdb entry + */ + if (rd && !list_is_singular(&f->remotes)) { + list_del_rcu(&rd->list); + call_rcu(&rd->rcu, vxlan_fdb_free_rdst); + goto out; + } + + vxlan_fdb_destroy(vxlan, f); + +out: 
spin_unlock_bh(&vxlan->hash_lock); return err; From f693dff7107063f0ce08502052b78c4d4feb0e87 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 25 Jun 2013 16:01:55 +0300 Subject: [PATCH 17/20] rtnetlink: allow using zero MAC address in rtnl_fdb_{add,del} This is required for multiple default destinations management in VXLAN Signed-off-by: Mike Rapoport Signed-off-by: Stephen Hemminger --- net/core/rtnetlink.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9007533867f0..3de740834d1f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2109,10 +2109,6 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) } addr = nla_data(tb[NDA_LLADDR]); - if (is_zero_ether_addr(addr)) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ether address\n"); - return -EINVAL; - } err = -EOPNOTSUPP; @@ -2210,10 +2206,6 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) } addr = nla_data(tb[NDA_LLADDR]); - if (is_zero_ether_addr(addr)) { - pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ether address\n"); - return -EINVAL; - } err = -EOPNOTSUPP; From 58e4c767046a35f11a55af6ce946054ddf4a8580 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 25 Jun 2013 16:01:56 +0300 Subject: [PATCH 18/20] vxlan: fdb: allow specifying multiple destinations for zero MAC The zero MAC entry in the fdb is used as default destination. With multiple default destinations it is possible to use vxlan in environments that disable multicast on the infrastructure level, e.g. public clouds. 
Signed-off-by: Mike Rapoport Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index c1825201f9e2..3e75f9726c33 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -458,7 +458,8 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, notify = 1; } if ((flags & NLM_F_APPEND) && - is_multicast_ether_addr(f->eth_addr)) { + (is_multicast_ether_addr(f->eth_addr) || + is_zero_ether_addr(f->eth_addr))) { int rc = vxlan_fdb_append(f, ip, port, vni, ifindex); if (rc < 0) From 537f7f8494be4219eb0ef47121ea16a6f9f0f49e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 25 Jun 2013 09:34:36 -0700 Subject: [PATCH 19/20] bridge: check for zero ether address in fdb add The check for all-zero ether address was removed from rtnetlink core, since Vxlan uses all-zero ether address to signify default address. Need to add check back in for bridge. Signed-off-by: Stephen Hemminger --- net/bridge/br_fdb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index ebfa4443c69b..60aca9109a50 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -707,6 +707,11 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], } } + if (is_zero_ether_addr(addr)) { + pr_info("bridge: RTM_NEWNEIGH with invalid ether address\n"); + return -EINVAL; + } + p = br_port_get_rtnl(dev); if (p == NULL) { pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n", From ba609e9bf15bd7df35e2c06a2e5aaf9ab9289b10 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 25 Jun 2013 17:06:01 -0700 Subject: [PATCH 20/20] vxlan: fix function name spelling Signed-off-by: Stephen Hemminger --- drivers/net/vxlan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3e75f9726c33..227b54a1f88a 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1332,7 +1332,7 @@ 
static int vxlan_init(struct net_device *dev) return 0; } -static void vxlan_fdb_delete_defualt(struct vxlan_dev *vxlan) +static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan) { struct vxlan_fdb *f; @@ -1349,7 +1349,7 @@ static void vxlan_uninit(struct net_device *dev) struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_sock *vs = vxlan->vn_sock; - vxlan_fdb_delete_defualt(vxlan); + vxlan_fdb_delete_default(vxlan); if (vs) vxlan_sock_release(vn, vs); @@ -1756,7 +1756,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, err = register_netdevice(dev); if (err) { - vxlan_fdb_delete_defualt(vxlan); + vxlan_fdb_delete_default(vxlan); return err; }