Re: [PATCH 3/3] net: macvtap driver
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: 2010-01-28 17:40:24
Also in:
bridge, lkml
On Wed, Jan 27, 2010 at 10:09:27PM +0100, Arnd Bergmann wrote:
quoted hunk ↗ jump to hunk
In order to use macvlan with qemu and other tools that require a tap file descriptor, the macvtap driver adds a small backend with a character device with the same interface as the tun driver, with a minimum set of features. Macvtap interfaces are created in the same way as macvlan interfaces using ip link, but the netif is just used as a handle for configuration and accounting, while the data goes through the chardev. Each macvtap interface has its own character device, simplifying permission management significantly over the generic tun/tap driver. Cc: Patrick McHardy <redacted> Cc: Stephen Hemminger <redacted> Cc: David S. Miller" <davem@davemloft.net> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Herbert Xu <herbert@gondor.apana.org.au> Cc: Or Gerlitz <redacted> Cc: netdev@vger.kernel.org Cc: bridge@lists.linux-foundation.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Arnd Bergmann <arnd@arndb.de> --- drivers/net/Kconfig | 12 + drivers/net/Makefile | 1 + drivers/net/macvtap.c | 572 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/if_macvlan.h | 1 + 4 files changed, 586 insertions(+), 0 deletions(-) create mode 100644 drivers/net/macvtap.cdiff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index cb0e534..411e207 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig@@ -90,6 +90,18 @@ config MACVLAN To compile this driver as a module, choose M here: the module will be called macvlan. +config MACVTAP + tristate "MAC-VLAN based tap driver (EXPERIMENTAL)" + depends on MACVLAN + help + This adds a specialized tap character device driver that is based + on the MAC-VLAN network interface, called macvtap. A macvtap device + can be added in the same way as a macvlan device, using 'type + macvlan', and then be accessed through the tap user space interface. + + To compile this driver as a module, choose M here: the module + will be called macvtap. + config EQUALIZER tristate "EQL (serial line load balancing) support" ---help---diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 0b763cb..9595803 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile@@ -169,6 +169,7 @@ obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o obj-$(CONFIG_DUMMY) += dummy.o obj-$(CONFIG_IFB) += ifb.o obj-$(CONFIG_MACVLAN) += macvlan.o +obj-$(CONFIG_MACVTAP) += macvtap.o obj-$(CONFIG_DE600) += de600.o obj-$(CONFIG_DE620) += de620.o obj-$(CONFIG_LANCE) += lance.odiff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c new file mode 100644 index 0000000..2916202 --- /dev/null +++ b/drivers/net/macvtap.c@@ -0,0 +1,572 @@ +#include <linux/etherdevice.h> +#include <linux/if_macvlan.h> +#include <linux/interrupt.h> +#include <linux/nsproxy.h> +#include <linux/compat.h> +#include <linux/if_tun.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/cache.h> +#include <linux/sched.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/wait.h> +#include <linux/cdev.h> +#include <linux/fs.h> + +#include <net/net_namespace.h> +#include <net/rtnetlink.h> +#include <net/sock.h> + +/* + * A macvtap queue is the central object of this driver, it connects + * an open character device to a macvlan interface. There can be + * multiple queues on one interface, which map back to queues + * implemented in hardware on the underlying device. + * + * macvtap_proto is used to allocate queues through the sock allocation + * mechanism. + * + * TODO: multiqueue support is currently not implemented, even though + * macvtap is basically prepared for that. We will need to add this + * here as well as in virtio-net and qemu to get line rate on 10gbit + * adapters from a guest. + */ +struct macvtap_queue { + struct sock sk; + struct socket sock; + struct macvlan_dev *vlan; + struct file *file; +}; + +static struct proto macvtap_proto = { + .name = "macvtap", + .owner = THIS_MODULE, + .obj_size = sizeof (struct macvtap_queue), +}; + +/* + * Minor number matches netdev->ifindex, so need a potentially + * large value. This also makes it possible to split the + * tap functionality out again in the future by offering it + * from other drivers besides macvtap. As long as every device + * only has one tap, the interface numbers assure that the + * device nodes are unique. + */ +static unsigned int macvtap_major; +#define MACVTAP_NUM_DEVS 65536 +static struct class *macvtap_class; +static struct cdev macvtap_cdev; + +/* + * RCU usage: + * The macvtap_queue is referenced both from the chardev struct file + * and from the struct macvlan_dev using rcu_read_lock. + * + * We never actually update the contents of a macvtap_queue atomically + * with RCU but it is used for race-free destruction of a queue when + * either the file or the macvlan_dev goes away. Pointers back to + * the dev and the file are implicitly valid as long as the queue + * exists. + * + * The callbacks from macvlan are always done with rcu_read_lock held + * already, while in the file_operations, we get it ourselves. + * + * When destroying a queue, we remove the pointers from the file and + * from the dev and then synchronize_rcu to make sure no thread is + * still using the queue. There may still be references to the struct + * sock inside of the queue from outbound SKBs, but these never + * reference back to the file or the dev. The data structure is freed + * through __sk_free when both our references and any pending SKBs + * are gone. + * + * macvtap_lock is only used to prevent multiple concurrent open() + * calls to assign a new vlan->tap pointer. It could be moved into + * the macvlan_dev itself but is extremely rarely used. + */ +static DEFINE_SPINLOCK(macvtap_lock); + +/* + * Choose the next free queue, for now there is only one + */ +static int macvtap_set_queue(struct net_device *dev, struct file *file, + struct macvtap_queue *q) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + int err = -EBUSY; + + spin_lock(&macvtap_lock); + if (rcu_dereference(vlan->tap)) + goto out; + + err = 0; + q->vlan = vlan; + rcu_assign_pointer(vlan->tap, q); + + q->file = file; + rcu_assign_pointer(file->private_data, q); + +out: + spin_unlock(&macvtap_lock); + return err; +} + +/* + * We must destroy each queue exactly once, when either + * the netdev or the file go away. + * + * Using the spinlock makes sure that we don't get + * to the queue again after destroying it. + * + * synchronize_rcu serializes with the packet flow + * that uses rcu_read_lock. + */ +static void macvtap_del_queue(struct macvtap_queue **qp) +{ + struct macvtap_queue *q; + + spin_lock(&macvtap_lock); + q = rcu_dereference(*qp); + if (!q) { + spin_unlock(&macvtap_lock); + return; + } + + rcu_assign_pointer(q->vlan->tap, NULL); + rcu_assign_pointer(q->file->private_data, NULL); + spin_unlock(&macvtap_lock); + + synchronize_rcu(); + sock_put(&q->sk); +} + +/* + * Since we only support one queue, just dereference the pointer. + */ +static struct macvtap_queue *macvtap_get_queue(struct net_device *dev, + struct sk_buff *skb) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + + return rcu_dereference(vlan->tap); +} + +static void macvtap_del_queues(struct net_device *dev) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + macvtap_del_queue(&vlan->tap); +} + +static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file) +{ + rcu_read_lock_bh(); + return rcu_dereference(file->private_data); +} + +static inline void macvtap_file_put_queue(void) +{ + rcu_read_unlock_bh(); +} +
I find such wrappers around rcu obscure this, already sufficiently complex, pattern. Might be just me.
+/*
+ * Forward happens for data that gets sent from one macvlan
+ * endpoint to another one in bridge mode. We just take
+ * the skb and put it into the receive queue.
+ */
+static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
+{
+ struct macvtap_queue *q = macvtap_get_queue(dev, skb);
+ if (!q)
+ return -ENOLINK;
+
+ skb_queue_tail(&q->sk.sk_receive_queue, skb);
+ wake_up(q->sk.sk_sleep);
+ return 0;
+}
+
+/*
+ * Receive is for data from the external interface (lowerdev),
+ * in case of macvtap, we can treat that the same way as
+ * forward, which macvlan cannot.
+ */
+static int macvtap_receive(struct sk_buff *skb)
+{
+ skb_push(skb, ETH_HLEN);
+ return macvtap_forward(skb->dev, skb);
+}
+
+static int macvtap_newlink(struct net *src_net,
+ struct net_device *dev,
+ struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct device *classdev;
+ dev_t devt;
+ int err;
+
+ err = macvlan_common_newlink(src_net, dev, tb, data,
+ macvtap_receive, macvtap_forward);
+ if (err)
+ goto out;
+
+ devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
+
+ classdev = device_create(macvtap_class, &dev->dev, devt,
+ dev, "tap%d", dev->ifindex);
+ if (IS_ERR(classdev)) {
+ err = PTR_ERR(classdev);
+ macvtap_del_queues(dev);
+ }
+
+out:
+ return err;
+}
+
+static void macvtap_dellink(struct net_device *dev,
+ struct list_head *head)
+{
+ device_destroy(macvtap_class,
+ MKDEV(MAJOR(macvtap_major), dev->ifindex));
+
+ macvtap_del_queues(dev);
+ macvlan_dellink(dev, head);
+}
+
+static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
+ .kind = "macvtap",
+ .newlink = macvtap_newlink,
+ .dellink = macvtap_dellink,
+};
+
+
+static void macvtap_sock_write_space(struct sock *sk)
+{
+ if (!sock_writeable(sk) ||
+ !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
+ return;
+
+ if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ wake_up_interruptible_sync(sk->sk_sleep);
+}
+
+static int macvtap_open(struct inode *inode, struct file *file)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct net_device *dev = dev_get_by_index(net, iminor(inode));
+ struct macvtap_queue *q;
+ int err;
+This seems to keep reference to device as long as character device is open, which, if I understand correctly, will start printing error messages to kernel log about once a second if you try to remove the device. I suspect the best way to fix this issue would be to use some kind of notifier so that macvtap can disconnect on device removal.
+ err = -ENODEV;
+ if (!dev)
+ goto out;
+
+ /* check if this is a macvtap device */
+ err = -EINVAL;
+ if (dev->rtnl_link_ops != &macvtap_link_ops)
+ goto out;
+
+ err = -ENOMEM;
+ q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
+ &macvtap_proto);
+ if (!q)
+ goto out;
+
+ init_waitqueue_head(&q->sock.wait);
+ q->sock.type = SOCK_RAW;
+ q->sock.state = SS_CONNECTED;
+ sock_init_data(&q->sock, &q->sk);
+ q->sk.sk_allocation = GFP_ATOMIC; /* for now */
+ q->sk.sk_write_space = macvtap_sock_write_space;
+
+ err = macvtap_set_queue(dev, file, q);
+ if (err)
+ sock_put(&q->sk);
+
+out:
+ return err;
+}
+
+static int macvtap_release(struct inode *inode, struct file *file)
+{
+ macvtap_del_queue((struct macvtap_queue **)&file->private_data);
+ return 0;
+}
+
+static unsigned int macvtap_poll(struct file *file, poll_table * wait)
+{
+ struct macvtap_queue *q = macvtap_file_get_queue(file);
+ unsigned int mask = POLLERR;
+
+ if (!q)
+ goto out;
+
+ mask = 0;
+ poll_wait(file, &q->sock.wait, wait);
+
+ if (!skb_queue_empty(&q->sk.sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ if (sock_writeable(&q->sk) ||
+ (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
+ sock_writeable(&q->sk)))
+ mask |= POLLOUT | POLLWRNORM;
+
+out:
+ macvtap_file_put_queue();
+ return mask;
+}
+
+/* Get packet from user space buffer */
+static ssize_t macvtap_get_user(struct macvtap_queue *q,
+ const struct iovec *iv, size_t count,
+ int noblock)
+{
+ struct sk_buff *skb;
+ size_t len = count;
+ int err;
+
+ if (unlikely(len < ETH_HLEN))
+ return -EINVAL;
+
+ skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
+
+ if (!skb) {
+ macvlan_count_rx(q->vlan, 0, false, false);
+ return err;
+ }
+
+ skb_reserve(skb, NET_IP_ALIGN);
+ skb_put(skb, count);
+
+ if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
+ macvlan_count_rx(q->vlan, 0, false, false);
+ kfree_skb(skb);
+ return -EFAULT;
+ }
+
+ skb_set_network_header(skb, ETH_HLEN);
+
+ macvlan_start_xmit(skb, q->vlan->dev);
+
+ return count;
+}
+I am surprised there's no GSO support. Would it be a good idea to share code with tun driver? That already has GSO ... Also, network header pointer seems off for vlan packets?
+static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t result = -ENOLINK;
+ struct macvtap_queue *q = macvtap_file_get_queue(file);
+
+ if (!q)
+ goto out;
+
+ result = macvtap_get_user(q, iv, iov_length(iv, count),
+ file->f_flags & O_NONBLOCK);
+out:
+ macvtap_file_put_queue();
+ return result;
+}
+
+/* Put packet to the user space buffer */
+static ssize_t macvtap_put_user(struct macvtap_queue *q,
+ const struct sk_buff *skb,
+ const struct iovec *iv, int len)
+{
+ struct macvlan_dev *vlan = q->vlan;
+ int ret;
+
+ len = min_t(int, skb->len, len);
+
+ ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len);
+
+ macvlan_count_rx(vlan, len, ret == 0, 0);
+
+ return ret ? ret : len;
+}
+
+static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct macvtap_queue *q = macvtap_file_get_queue(file);
+
+ DECLARE_WAITQUEUE(wait, current);
+ struct sk_buff *skb;
+ ssize_t len, ret = 0;
+
+ if (!q) {
+ ret = -ENOLINK;
+ goto out;
+ }
+
+ len = iov_length(iv, count);
+ if (len < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ add_wait_queue(q->sk.sk_sleep, &wait);
+ while (len) {
+ current->state = TASK_INTERRUPTIBLE;
+
+ /* Read frames from the queue */
+ skb = skb_dequeue(&q->sk.sk_receive_queue);
+ if (!skb) {
+ if (file->f_flags & O_NONBLOCK) {
+ ret = -EAGAIN;
+ break;
+ }
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ /* Nothing to read, let's sleep */
+ schedule();
+ continue;
+ }
+ ret = macvtap_put_user(q, skb, iv, len);
+ kfree_skb(skb);
+ break;
+ }
+
+ current->state = TASK_RUNNING;
+ remove_wait_queue(q->sk.sk_sleep, &wait);
+
+out:
+ macvtap_file_put_queue();
+ return ret;
+}
+Same GSO comment here.
+/*
+ * provide compatibility with generic tun/tap interface
+ */
+static long macvtap_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{All of these seem to be stubs, and tun has many more that you didn't stub out. So, why do you bother to support any ioctls at all?
quoted hunk ↗ jump to hunk
+ struct macvtap_queue *q; + void __user *argp = (void __user *)arg; + struct ifreq __user *ifr = argp; + unsigned int __user *up = argp; + unsigned int u; + int err; + + switch (cmd) { + case TUNSETIFF: + /* ignore the name, just look at flags */ + if (get_user(u, &ifr->ifr_flags)) + return -EFAULT; + if (u != (IFF_TAP | IFF_NO_PI)) + return -EINVAL; + return 0; + + case TUNGETIFF: + q = macvtap_file_get_queue(file); + if (!q) + return -ENOLINK; + err = 0; + if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) || + put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags)) + err = -EFAULT; + macvtap_file_put_queue(); + return err; + + case TUNGETFEATURES: + if (put_user((IFF_TAP | IFF_NO_PI), up)) + return -EFAULT; + return 0; + + case TUNSETSNDBUF: + /* ignore */ + return 0; + + case TUNSETOFFLOAD: + /* let the user check for future flags */ + if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | + TUN_F_TSO_ECN | TUN_F_UFO)) + return -EINVAL; + + /* TODO: add support for these, so far we don't + support any offload */ + if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | + TUN_F_TSO_ECN | TUN_F_UFO)) + return -EINVAL; + + return 0; + + default: + return -EINVAL; + } +} + +#ifdef CONFIG_COMPAT +static long macvtap_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + +static const struct file_operations macvtap_fops = { + .owner = THIS_MODULE, + .open = macvtap_open, + .release = macvtap_release, + .aio_read = macvtap_aio_read, + .aio_write = macvtap_aio_write, + .poll = macvtap_poll, + .llseek = no_llseek, + .unlocked_ioctl = macvtap_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = macvtap_compat_ioctl, +#endif +}; + +static int macvtap_init(void) +{ + int err; + + err = alloc_chrdev_region(&macvtap_major, 0, + MACVTAP_NUM_DEVS, "macvtap"); + if (err) + goto out1; + + cdev_init(&macvtap_cdev, &macvtap_fops); + err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS); + if (err) + goto out2; + + macvtap_class = class_create(THIS_MODULE, "macvtap"); + if (IS_ERR(macvtap_class)) { + err = PTR_ERR(macvtap_class); + goto out3; + } + + err = macvlan_link_register(&macvtap_link_ops); + if (err) + goto out4; + + return 0; + +out4: + class_unregister(macvtap_class); +out3: + cdev_del(&macvtap_cdev); +out2: + unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); +out1: + return err; +} +module_init(macvtap_init); + +static void macvtap_exit(void) +{ + rtnl_link_unregister(&macvtap_link_ops); + class_unregister(macvtap_class); + cdev_del(&macvtap_cdev); + unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); +} +module_exit(macvtap_exit); + +MODULE_ALIAS_RTNL_LINK("macvtap"); +MODULE_AUTHOR("Arnd Bergmann [off-list ref]"); +MODULE_LICENSE("GPL");diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 9a11544..51f1512 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h@@ -34,6 +34,7 @@ struct macvlan_dev { enum macvlan_mode mode; int (*receive)(struct sk_buff *skb); int (*forward)(struct net_device *dev, struct sk_buff *skb); + struct macvtap_queue *tap; }; static inline void macvlan_count_rx(const struct macvlan_dev *vlan,-- 1.6.3.3