Thread (39 messages) 39 messages, 5 authors, 2010-09-29

Re: [PATCH v11 13/17] Add mp(mediate passthru) device.

From: Ben Hutchings <hidden>
Date: 2010-09-27 21:23:42
Also in: kvm, lkml

On Sat, 2010-09-25 at 12:27 +0800, xiaohui.xin@intel.com wrote:
quoted hunk ↗ jump to hunk
From: Xin Xiaohui <redacted>

The patch add mp(mediate passthru) device, which now
based on vhost-net backend driver and provides proto_ops
to send/receive guest buffers data from/to guest vitio-net
driver.

Signed-off-by: Xin Xiaohui <redacted>
Signed-off-by: Zhao Yu <redacted>
Reviewed-by: Jeff Dike <redacted>
---
 drivers/vhost/mpassthru.c | 1407 +++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 1407 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 0000000..d86d94c
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
[...]
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
This is unsafe; consider this usage:

	if (foo)
		DBG("bar\n");
	else
		baz();

You should use the standard pr_debug() or dev_dbg() instead.

[...]
+struct page_ctor {
+       struct list_head        readq;
+       int                     wq_len;
+       int                     rq_len;
+       spinlock_t              read_lock;
Only one queue?!

I would have appreciated some introductory comments on these structures.
I still don't have any sort of clear picture of how this is all supposed
to work.

[...]
+/* The main function to allocate external buffers */
+static struct skb_ext_page *page_ctor(struct mpassthru_port *port,
+		struct sk_buff *skb, int npages)
+{
+	int i;
+	unsigned long flags;
+	struct page_ctor *ctor;
+	struct page_info *info = NULL;
+
+	ctor = container_of(port, struct page_ctor, port);
+
+	spin_lock_irqsave(&ctor->read_lock, flags);
+	if (!list_empty(&ctor->readq)) {
+		info = list_first_entry(&ctor->readq, struct page_info, list);
+		list_del(&info->list);
+		ctor->rq_len--;
+	}
+	spin_unlock_irqrestore(&ctor->read_lock, flags);
+	if (!info)
+		return NULL;
+
+	for (i = 0; i < info->pnum; i++)
+		get_page(info->pages[i]);
+	info->skb = skb;
+	return &info->ext_page;
+}
Why isn't the npages parameter used?

[...]
+static void relinquish_resource(struct page_ctor *ctor)
+{
+	if (!(ctor->dev->flags & IFF_UP) &&
+			!(ctor->wq_len + ctor->rq_len))
+		printk(KERN_INFO "relinquish_resource\n");
+}
Looks like something's missing here.
+static void mp_ki_dtor(struct kiocb *iocb)
+{
+	struct page_info *info = (struct page_info *)(iocb->private);
+	int i;
+
+	if (info->flags == INFO_READ) {
+		for (i = 0; i < info->pnum; i++) {
+			if (info->pages[i]) {
+				set_page_dirty_lock(info->pages[i]);
+				put_page(info->pages[i]);
+			}
+		}
+		mp_hash_delete(info->ctor, info);
+		if (info->skb) {
+			info->skb->destructor = NULL;
+			kfree_skb(info->skb);
+		}
+		info->ctor->rq_len--;
Doesn't rq_len represent the number of buffers queued between the guest
and the driver?  It is already decremented in page_ctor() so it seems
like it gets decremented twice for each buffer.  Also don't you need to
hold the read_lock when updating rq_len?
+	} else
+		info->ctor->wq_len--;
Maybe you should define rq_len and wq_len both as atomic_t.

[...]
+static void __mp_detach(struct mp_struct *mp)
+{
+	mp->mfile = NULL;
+
+	mp_dev_change_flags(mp->dev, mp->dev->flags & ~IFF_UP);
+	page_ctor_detach(mp);
+	mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
This is racy; you should hold rtnl_lock over all these changes.

[...]
+typedef u32 key_mp_t;
+static inline key_mp_t mp_hash(struct page *page, int buckets)
+{
+	key_mp_t k;
+
+	k = ((((unsigned long)page << 32UL) >> 32UL) / 0x38) % buckets ;
+	return k;
+}
This is never going to work on a 32-bit machine, and what is the purpose
of the magic number 0x38?

Try using hash_ptr() from <linux/hash.h>.
+static struct page_info *mp_hash_delete(struct page_ctor *ctor,
+					struct page_info *info)
+{
+	key_mp_t key = mp_hash(info->pages[0], HASH_BUCKETS);
+	struct page_info *tmp = NULL;
+	int i;
+
+	tmp = ctor->hash_table[key];
+	while (tmp) {
+		if (tmp == info) {
+			if (!tmp->prev) {
+				ctor->hash_table[key] = tmp->next;
+				if (tmp->next)
+					tmp->next->prev = NULL;
+			} else {
+				tmp->prev->next = tmp->next;
+				if (tmp->next)
+					tmp->next->prev = tmp->prev;
+			}
+			return tmp;
+		}
+		tmp = tmp->next;
+	}
+	return tmp;
+}
+
+static struct page_info *mp_hash_lookup(struct page_ctor *ctor,
+					struct page *page)
+{
+	key_mp_t key = mp_hash(page, HASH_BUCKETS);
+	struct page_info *tmp = NULL;
+
+	int i;
+	tmp = ctor->hash_table[key];
+	while (tmp) {
+		for (i = 0; i < tmp->pnum; i++) {
+			if (tmp->pages[i] == page)
+				return tmp;
+		}
+		tmp = tmp->next;
+	}
+	return tmp;
+}
How are thse serialised?
+/* The main function to transform the guest user space address
+ * to host kernel address via get_user_pages(). Thus the hardware
+ * can do DMA directly to the external buffer address.
+ */
+static struct page_info *alloc_page_info(struct page_ctor *ctor,
+		struct kiocb *iocb, struct iovec *iov,
+		int count, struct frag *frags,
+		int npages, int total)
+{
+	int rc;
+	int i, j, n = 0;
+	int len;
+	unsigned long base, lock_limit;
+	struct page_info *info = NULL;
+
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	lock_limit >>= PAGE_SHIFT;
+
+	if (ctor->lock_pages + count > lock_limit && npages) {
+		printk(KERN_INFO "exceed the locked memory rlimit.");
+		return NULL;
+	}
What if the process is locking pages with mlock() as well?  Doesn't this
allow it to lock twice as many pages as it should be able to?
+	info = kmem_cache_alloc(ext_page_info_cache, GFP_KERNEL);
+	
+	if (!info)
+		return NULL;
+	info->skb = NULL;
+	info->next = info->prev = NULL;
+
+	for (i = j = 0; i < count; i++) {
+		base = (unsigned long)iov[i].iov_base;
+		len = iov[i].iov_len;
+
+		if (!len)
+			continue;
+		n = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+
+		rc = get_user_pages_fast(base, n, npages ? 1 : 0,
+				&info->pages[j]);
+		if (rc != n)
+			goto failed;
+
+		while (n--) {
+			frags[j].offset = base & ~PAGE_MASK;
+			frags[j].size = min_t(int, len,
+					PAGE_SIZE - frags[j].offset);
+			len -= frags[j].size;
+			base += frags[j].size;
+			j++;
+		}
+	}
+
+#ifdef CONFIG_HIGHMEM
+	if (npages && !(dev->features & NETIF_F_HIGHDMA)) {
+		for (i = 0; i < j; i++) {
+			if (PageHighMem(info->pages[i]))
+				goto failed;
+		}
+	}
+#endif
Shouldn't you try to allocate lowmem pages explicitly, rather than
failing at this point?

[...]
+static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
+		struct msghdr *m, size_t total_len,
+		int flags)
+{
+	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+	struct page_ctor *ctor;
+	struct iovec *iov = m->msg_iov;
+	int count = m->msg_iovlen;
+	int npages, payload;
+	struct page_info *info;
+	struct frag frags[MAX_SKB_FRAGS];
+	unsigned long base;
+	int i, len;
+	unsigned long flag;
+
+	if (!(flags & MSG_DONTWAIT))
+		return -EINVAL;
+
+	ctor = mp->ctor;
+	if (!ctor)
+		return -EINVAL;
+
+	/* Error detections in case invalid external buffer */
+	if (count > 2 && iov[1].iov_len < ctor->port.hdr_len &&
+			mp->dev->features & NETIF_F_SG) {
+		return -EINVAL;
+	}
+
+	npages = ctor->port.npages;
+	payload = ctor->port.data_len;
+
+	/* If KVM guest virtio-net FE driver use SG feature */
+	if (count > 2) {
+		for (i = 2; i < count; i++) {
+			base = (unsigned long)iov[i].iov_base & ~PAGE_MASK;
+			len = iov[i].iov_len;
+			if (npages == 1)
+				len = min_t(int, len, PAGE_SIZE - base);
+			else if (base)
+				break;
+			payload -= len;
+			if (payload <= 0)
+				goto proceed;
+			if (npages == 1 || (len & ~PAGE_MASK))
+				break;
+		}
+	}
+
+	if ((((unsigned long)iov[1].iov_base & ~PAGE_MASK)
+				- NET_SKB_PAD - NET_IP_ALIGN) >= 0)
+		goto proceed;
+
+	return -EINVAL;
+
+proceed:
+	/* skip the virtnet head */
+	if (count > 1) {
+		iov++;
+		count--;
+	}
+
+	if (!ctor->lock_pages || !ctor->rq_len) {
+		set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
+				iocb->ki_user_data * 4096 * 2,
+				iocb->ki_user_data * 4096 * 2);
+	}
+
+	/* Translate address to kernel */
+	info = alloc_page_info(ctor, iocb, iov, count, frags, npages, 0);
+	if (!info)
+		return -ENOMEM;
I'm not convinced that the checks above this ensure that there will be
<= MAX_SKB_FRAGS fragments.

[...]
+static int mp_chr_open(struct inode *inode, struct file * file)
+{
+	struct mp_file *mfile;
+	cycle_kernel_lock();
Seriously?

[...]
+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long count, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct mp_struct *mp = mp_get(file->private_data);
+	struct sock *sk = mp->socket.sk;
+	struct sk_buff *skb;
+	int len, err;
+	ssize_t result = 0;
+
+	if (!mp)
+		return -EBADFD;
+
+	/* currently, async is not supported.
+	 * but we may support real async aio from user application,
+	 * maybe qemu virtio-net backend.
+	 */
+	if (!is_sync_kiocb(iocb))
+		return -EFAULT;
+
+	len = iov_length(iov, count);
+
+	if (unlikely(len) < ETH_HLEN)
+		return -EINVAL;
The first close-paren is in the wrong place.
+	skb = sock_alloc_send_skb(sk, len + NET_IP_ALIGN,
+				  file->f_flags & O_NONBLOCK, &err);
+
+	if (!skb)
+		return -EFAULT;
Why EFAULT?
+	skb_reserve(skb, NET_IP_ALIGN);
+	skb_put(skb, len);
+
+	if (skb_copy_datagram_from_iovec(skb, 0, iov, 0, len)) {
+		kfree_skb(skb);
+		return -EAGAIN;
+	}
+
+	skb->protocol = eth_type_trans(skb, mp->dev);
Why are you calling eth_type_trans() on transmit?

[...]
+static int mp_device_event(struct notifier_block *unused,
+		unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct mpassthru_port *port;
+	struct mp_struct *mp = NULL;
+	struct socket *sock = NULL;
+	struct sock *sk;
+
+	port = dev->mp_port;
+	if (port == NULL)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UNREGISTER:
+		sock = dev->mp_port->sock;
+		mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+		do_unbind(mp->mfile);
[...]

This can deadlock - netdev notifiers are called under the RTNL lock and
do_unbind() acquires the mp_mutex, whereas in other places they are
acquired in the opposite order.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help