Thread (8 messages) 8 messages, 3 authors, 2025-07-11

Re: [PATCH net-next 1/2] vhost: basic in order support

From: Jason Wang <jasowang@redhat.com>
Date: 2025-07-11 01:44:26
Also in: kvm, lkml, virtualization

On Thu, Jul 10, 2025 at 5:05 PM Eugenio Perez Martin
[off-list ref] wrote:
On Tue, Jul 8, 2025 at 8:48 AM Jason Wang [off-list ref] wrote:
quoted
This patch adds basic in order support for vhost. Two optimizations
are implemented in this patch:

1) Since driver uses descriptor in order, vhost can deduce the next
   avail ring head by counting the number of descriptors that has been
   used in next_avail_head. This eliminate the need to access the
   available ring in vhost.

2) vhost_add_used_and_singal_n() is extended to accept the number of
   batched buffers per used elem. While this increases the times of
   usersapce memory access but it helps to reduce the chance of
s/usersapce/userspace/
quoted
   used ring access of both the driver and vhost.

Vhost-net will be the first user for this.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   |   6 ++-
 drivers/vhost/vhost.c | 121 +++++++++++++++++++++++++++++++++++-------
 drivers/vhost/vhost.h |   8 ++-
 3 files changed, 111 insertions(+), 24 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 7cbfc7d718b3..4f9c67f17b49 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -374,7 +374,8 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
        while (j) {
                add = min(UIO_MAXIOV - nvq->done_idx, j);
                vhost_add_used_and_signal_n(vq->dev, vq,
-                                           &vq->heads[nvq->done_idx], add);
+                                           &vq->heads[nvq->done_idx],
+                                           NULL, add);
                nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
                j -= add;
        }
@@ -457,7 +458,8 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
        if (!nvq->done_idx)
                return;

-       vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
+       vhost_add_used_and_signal_n(dev, vq, vq->heads, NULL,
+                                   nvq->done_idx);
        nvq->done_idx = 0;
 }
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 3a5ebb973dba..c7ed069fc49e 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -364,6 +364,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
        vq->avail = NULL;
        vq->used = NULL;
        vq->last_avail_idx = 0;
+       vq->next_avail_head = 0;
        vq->avail_idx = 0;
        vq->last_used_idx = 0;
        vq->signalled_used = 0;
@@ -455,6 +456,8 @@ static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
        vq->log = NULL;
        kfree(vq->heads);
        vq->heads = NULL;
+       kfree(vq->nheads);
+       vq->nheads = NULL;
 }

 /* Helper to allocate iovec buffers for all vqs. */
@@ -472,7 +475,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
                                        GFP_KERNEL);
                vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
                                          GFP_KERNEL);
-               if (!vq->indirect || !vq->log || !vq->heads)
+               vq->nheads = kmalloc_array(dev->iov_limit, sizeof(*vq->nheads),
+                                          GFP_KERNEL);
+               if (!vq->indirect || !vq->log || !vq->heads || !vq->nheads)
                        goto err_nomem;
        }
        return 0;
@@ -1990,14 +1995,15 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
                        break;
                }
                if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
-                       vq->last_avail_idx = s.num & 0xffff;
+                       vq->next_avail_head = vq->last_avail_idx =
+                                             s.num & 0xffff;
                        vq->last_used_idx = (s.num >> 16) & 0xffff;
                } else {
                        if (s.num > 0xffff) {
                                r = -EINVAL;
                                break;
                        }
-                       vq->last_avail_idx = s.num;
+                       vq->next_avail_head = vq->last_avail_idx = s.num;
Why not just reuse last_avail_idx instead of creating next_avail_head?

At first glance it seemed to me that it was done this way to support
rewinding, but in_order path will happily reuse next_avail_head
without checking for last_avail_idx except for checking if the ring is
empty. Am I missing something?
Because the driver can submit a batch of available buffers so
last_avail_idx is not necessarily equal to next_avail_head.
quoted
                }
                /* Forget the cached index value. */
                vq->avail_idx = vq->last_avail_idx;
@@ -2590,11 +2596,12 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
                      unsigned int *out_num, unsigned int *in_num,
                      struct vhost_log *log, unsigned int *log_num)
 {
+       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
        struct vring_desc desc;
        unsigned int i, head, found = 0;
        u16 last_avail_idx = vq->last_avail_idx;
        __virtio16 ring_head;
-       int ret, access;
+       int ret, access, c = 0;

        if (vq->avail_idx == vq->last_avail_idx) {
                ret = vhost_get_avail_idx(vq);
@@ -2605,17 +2612,21 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
                        return vq->num;
        }

-       /* Grab the next descriptor number they're advertising, and increment
-        * the index we've seen. */
-       if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
-               vq_err(vq, "Failed to read head: idx %d address %p\n",
-                      last_avail_idx,
-                      &vq->avail->ring[last_avail_idx % vq->num]);
-               return -EFAULT;
+       if (in_order)
+               head = vq->next_avail_head & (vq->num - 1);
+       else {
+               /* Grab the next descriptor number they're
+                * advertising, and increment the index we've seen. */
+               if (unlikely(vhost_get_avail_head(vq, &ring_head,
+                                                 last_avail_idx))) {
+                       vq_err(vq, "Failed to read head: idx %d address %p\n",
+                               last_avail_idx,
+                               &vq->avail->ring[last_avail_idx % vq->num]);
+                       return -EFAULT;
+               }
+               head = vhost16_to_cpu(vq, ring_head);
        }

-       head = vhost16_to_cpu(vq, ring_head);
-
        /* If their number is silly, that's an error. */
        if (unlikely(head >= vq->num)) {
                vq_err(vq, "Guest says index %u > %u is available",
@@ -2658,6 +2669,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
                                                "in indirect descriptor at idx %d\n", i);
                                return ret;
                        }
+                       ++c;
                        continue;
                }
@@ -2693,10 +2705,12 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
                        }
                        *out_num += ret;
                }
+               ++c;
        } while ((i = next_desc(vq, &desc)) != -1);

        /* On success, increment avail index. */
        vq->last_avail_idx++;
+       vq->next_avail_head += c;

        /* Assume notifications from guest are disabled at this point,
         * if they aren't we would need to update avail_event index. */
@@ -2720,8 +2734,9 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
                cpu_to_vhost32(vq, head),
                cpu_to_vhost32(vq, len)
        };
+       u16 nheads = 1;

-       return vhost_add_used_n(vq, &heads, 1);
+       return vhost_add_used_n(vq, &heads, &nheads, 1);
 }
 EXPORT_SYMBOL_GPL(vhost_add_used);
@@ -2757,10 +2772,10 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
        return 0;
 }

-/* After we've used one of their buffers, we tell them about it.  We'll then
- * want to notify the guest, using eventfd. */
-int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
-                    unsigned count)
+static int vhost_add_used_n_ooo(struct vhost_virtqueue *vq,
+                               struct vring_used_elem *heads,
+                               u16 *nheads,
+                               unsigned count)
nheads is not used in this function and it is checked to be NULL in
the caller, should we remove it from the parameter list?
Exactly.
quoted
 {
        int start, n, r;
@@ -2775,6 +2790,70 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
        }
        r = __vhost_add_used_n(vq, heads, count);

+       return r;
Nit: We can merge with the previous statement and do "return
__vhost_add_used_n(vq, heads, count);"
Right.
quoted
+}
+
+static int vhost_add_used_n_in_order(struct vhost_virtqueue *vq,
+                                    struct vring_used_elem *heads,
+                                    u16 *nheads,
Nit: we can const-ify nheads, and do the same for _in_order variant
and vhost_add_used_n. Actually we can do it with heads too but it
requires more changes to existing code. I think it would be nice to
constify *nheads if you need to respin.
Let me do that.
quoted
+                                    unsigned count)
+{
+       vring_used_elem_t __user *used;
+       u16 old, new = vq->last_used_idx;
+       int start, i;
+
+       if (!nheads)
+               return -EINVAL;
+
+       start = vq->last_used_idx & (vq->num - 1);
+       used = vq->used->ring + start;
+
+       for (i = 0; i < count; i++) {
+               if (vhost_put_used(vq, &heads[i], start, 1)) {
+                       vq_err(vq, "Failed to write used");
+                       return -EFAULT;
+               }
+               start += nheads[i];
+               new += nheads[i];
+               if (start >= vq->num)
+                       start -= vq->num;
+       }
+
+       if (unlikely(vq->log_used)) {
+               /* Make sure data is seen before log. */
+               smp_wmb();
+               /* Log used ring entry write. */
+               log_used(vq, ((void __user *)used - (void __user *)vq->used),
+                        (vq->num - start) * sizeof *used);
+               if (start + count > vq->num)
+                       log_used(vq, 0,
+                                (start + count - vq->num) * sizeof *used);
+       }
+
+       old = vq->last_used_idx;
+       vq->last_used_idx = new;
+       /* If the driver never bothers to signal in a very long while,
+        * used index might wrap around. If that happens, invalidate
+        * signalled_used index we stored. TODO: make sure driver
+        * signals at least once in 2^16 and remove this. */
+       if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
+               vq->signalled_used_valid = false;
+       return 0;
+}
+
+/* After we've used one of their buffers, we tell them about it.  We'll then
+ * want to notify the guest, using eventfd. */
+int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
+                    u16 *nheads, unsigned count)
+{
+       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
+       int r;
+
+       if (!in_order || !nheads)
+               r = vhost_add_used_n_ooo(vq, heads, nheads, count);
+       else
+               r = vhost_add_used_n_in_order(vq, heads, nheads, count);
+
I just realized the original code didn't do it either, but we should
return if r < 0 here. Otherwise, used->ring[] has a random value and
used->idx is incremented covering these values. This should be
triggable in a guest that set used->idx valid but used->ring[]
invalid, for example.
This looks like a bug, I will send an independent fix.
quoted
        /* Make sure buffer is written before we update index. */
        smp_wmb();
        if (vhost_put_used_idx(vq)) {
@@ -2853,9 +2932,11 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
 /* multi-buffer version of vhost_add_used_and_signal */
 void vhost_add_used_and_signal_n(struct vhost_dev *dev,
                                 struct vhost_virtqueue *vq,
-                                struct vring_used_elem *heads, unsigned count)
+                                struct vring_used_elem *heads,
+                                u16 *nheads,
+                                unsigned count)
 {
-       vhost_add_used_n(vq, heads, count);
+       vhost_add_used_n(vq, heads, nheads, count);
        vhost_signal(dev, vq);
 }
 EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index bb75a292d50c..dca9f309d396 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -103,6 +103,8 @@ struct vhost_virtqueue {
         * Values are limited to 0x7fff, and the high bit is used as
         * a wrap counter when using VIRTIO_F_RING_PACKED. */
        u16 last_avail_idx;
+       /* Next avail ring head when VIRTIO_F_IN_ORDER is neogitated */
s/neogitated/negotiated/
Will fix it.
quoted
+       u16 next_avail_head;

        /* Caches available index value from user. */
        u16 avail_idx;
@@ -129,6 +131,7 @@ struct vhost_virtqueue {
        struct iovec iotlb_iov[64];
        struct iovec *indirect;
        struct vring_used_elem *heads;
+       u16 *nheads;
        /* Protected by virtqueue mutex. */
        struct vhost_iotlb *umem;
        struct vhost_iotlb *iotlb;
@@ -213,11 +216,12 @@ bool vhost_vq_is_setup(struct vhost_virtqueue *vq);
 int vhost_vq_init_access(struct vhost_virtqueue *);
 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
 int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
-                    unsigned count);
+                    u16 *nheads, unsigned count);
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
                               unsigned int id, int len);
 void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
-                              struct vring_used_elem *heads, unsigned count);
+                                struct vring_used_elem *heads, u16 *nheads,
+                                unsigned count);
 void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
 void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
 bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);
--
2.31.1
Thanks
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help