Thread (7 messages) 7 messages, 2 authors, 2025-11-19

Re: [PATCH net] vhost: rewind next_avail_head while discarding descriptors

From: Jason Wang <jasowang@redhat.com>
Date: 2025-11-14 01:53:29
Also in: kvm, lkml, stable, virtualization

On Thu, Nov 13, 2025 at 4:13 PM Michael S. Tsirkin [off-list ref] wrote:
On Thu, Nov 13, 2025 at 09:54:20AM +0800, Jason Wang wrote:
quoted
When discarding descriptors with IN_ORDER, we should rewind
next_avail_head otherwise it would run out of sync with
last_avail_idx. This would cause driver to report
"id X is not a head".

Fixing this by returning the number of descriptors that is used for
each buffer via vhost_get_vq_desc_n() so caller can use the value
while discarding descriptors.

Fixes: 67a873df0c41 ("vhost: basic in order support")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Wang <jasowang@redhat.com>
Wow that change really caused a lot of fallout.

Thanks for the patch! Yet something to improve:

quoted
---
 drivers/vhost/net.c   | 53 ++++++++++++++++++++++++++-----------------
 drivers/vhost/vhost.c | 43 ++++++++++++++++++++++++-----------
 drivers/vhost/vhost.h |  9 +++++++-
 3 files changed, 70 insertions(+), 35 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 35ded4330431..8f7f50acb6d6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -592,14 +592,15 @@ static void vhost_net_busy_poll(struct vhost_net *net,
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
                                  struct vhost_net_virtqueue *tnvq,
                                  unsigned int *out_num, unsigned int *in_num,
-                                 struct msghdr *msghdr, bool *busyloop_intr)
+                                 struct msghdr *msghdr, bool *busyloop_intr,
+                                 unsigned int *ndesc)
 {
      struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
      struct vhost_virtqueue *rvq = &rnvq->vq;
      struct vhost_virtqueue *tvq = &tnvq->vq;

-     int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
-                               out_num, in_num, NULL, NULL);
+     int r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
+                                 out_num, in_num, NULL, NULL, ndesc);

      if (r == tvq->num && tvq->busyloop_timeout) {
              /* Flush batched packets first */
@@ -610,8 +611,8 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,

              vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);

-             r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
-                                   out_num, in_num, NULL, NULL);
+             r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
+                                     out_num, in_num, NULL, NULL, ndesc);
      }

      return r;
@@ -642,12 +643,14 @@ static int get_tx_bufs(struct vhost_net *net,
                     struct vhost_net_virtqueue *nvq,
                     struct msghdr *msg,
                     unsigned int *out, unsigned int *in,
-                    size_t *len, bool *busyloop_intr)
+                    size_t *len, bool *busyloop_intr,
+                    unsigned int *ndesc)
 {
      struct vhost_virtqueue *vq = &nvq->vq;
      int ret;

-     ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
+     ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg,
+                                    busyloop_intr, ndesc);

      if (ret < 0 || ret == vq->num)
              return ret;
@@ -766,6 +769,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
      int sent_pkts = 0;
      bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
      bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
+     unsigned int ndesc = 0;

      do {
              bool busyloop_intr = false;
@@ -774,7 +778,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
                      vhost_tx_batch(net, nvq, sock, &msg);

              head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
-                                &busyloop_intr);
+                                &busyloop_intr, &ndesc);
              /* On error, stop handling until the next kick. */
              if (unlikely(head < 0))
                      break;
@@ -806,7 +810,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
                              goto done;
                      } else if (unlikely(err != -ENOSPC)) {
                              vhost_tx_batch(net, nvq, sock, &msg);
-                             vhost_discard_vq_desc(vq, 1);
+                             vhost_discard_vq_desc(vq, 1, ndesc);
                              vhost_net_enable_vq(net, vq);
                              break;
                      }
@@ -829,7 +833,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
              err = sock->ops->sendmsg(sock, &msg, len);
              if (unlikely(err < 0)) {
                      if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) {
-                             vhost_discard_vq_desc(vq, 1);
+                             vhost_discard_vq_desc(vq, 1, ndesc);
                              vhost_net_enable_vq(net, vq);
                              break;
                      }
@@ -868,6 +872,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
      int err;
      struct vhost_net_ubuf_ref *ubufs;
      struct ubuf_info_msgzc *ubuf;
+     unsigned int ndesc = 0;
      bool zcopy_used;
      int sent_pkts = 0;
@@ -879,7 +884,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)

              busyloop_intr = false;
              head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
-                                &busyloop_intr);
+                                &busyloop_intr, &ndesc);
              /* On error, stop handling until the next kick. */
              if (unlikely(head < 0))
                      break;
@@ -941,7 +946,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
                                      vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
                      }
                      if (retry) {
-                             vhost_discard_vq_desc(vq, 1);
+                             vhost_discard_vq_desc(vq, 1, ndesc);
                              vhost_net_enable_vq(net, vq);
                              break;
                      }
@@ -1045,11 +1050,12 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
                     unsigned *iovcount,
                     struct vhost_log *log,
                     unsigned *log_num,
-                    unsigned int quota)
+                    unsigned int quota,
+                    unsigned int *ndesc)
 {
      struct vhost_virtqueue *vq = &nvq->vq;
      bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
-     unsigned int out, in;
+     unsigned int out, in, desc_num, n = 0;
      int seg = 0;
      int headcount = 0;
      unsigned d;
@@ -1064,9 +1070,9 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
                      r = -ENOBUFS;
                      goto err;
              }
-             r = vhost_get_vq_desc(vq, vq->iov + seg,
-                                   ARRAY_SIZE(vq->iov) - seg, &out,
-                                   &in, log, log_num);
+             r = vhost_get_vq_desc_n(vq, vq->iov + seg,
+                                     ARRAY_SIZE(vq->iov) - seg, &out,
+                                     &in, log, log_num, &desc_num);
              if (unlikely(r < 0))
                      goto err;
@@ -1093,6 +1099,7 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
              ++headcount;
              datalen -= len;
              seg += in;
+             n += desc_num;
      }

      *iovcount = seg;
@@ -1113,9 +1120,11 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
              nheads[0] = headcount;
      }

+     *ndesc = n;
+
      return headcount;
 err:
-     vhost_discard_vq_desc(vq, headcount);
+     vhost_discard_vq_desc(vq, headcount, n);
So here ndesc and n are the same, but below in vhost_discard_vq_desc
they are different. Fun.
Not necessarily the same, a buffer could contain more than 1 descriptor.
quoted
      return r;
 }
@@ -1151,6 +1160,7 @@ static void handle_rx(struct vhost_net *net)
      struct iov_iter fixup;
      __virtio16 num_buffers;
      int recv_pkts = 0;
+     unsigned int ndesc;

      mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
      sock = vhost_vq_get_backend(vq);
@@ -1182,7 +1192,8 @@ static void handle_rx(struct vhost_net *net)
              headcount = get_rx_bufs(nvq, vq->heads + count,
                                      vq->nheads + count,
                                      vhost_len, &in, vq_log, &log,
-                                     likely(mergeable) ? UIO_MAXIOV : 1);
+                                     likely(mergeable) ? UIO_MAXIOV : 1,
+                                     &ndesc);
              /* On error, stop handling until the next kick. */
              if (unlikely(headcount < 0))
                      goto out;
@@ -1228,7 +1239,7 @@ static void handle_rx(struct vhost_net *net)
              if (unlikely(err != sock_len)) {
                      pr_debug("Discarded rx packet: "
                               " len %d, expected %zd\n", err, sock_len);
-                     vhost_discard_vq_desc(vq, headcount);
+                     vhost_discard_vq_desc(vq, headcount, ndesc);
                      continue;
              }
              /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
@@ -1252,7 +1263,7 @@ static void handle_rx(struct vhost_net *net)
                  copy_to_iter(&num_buffers, sizeof num_buffers,
                               &fixup) != sizeof num_buffers) {
                      vq_err(vq, "Failed num_buffers write");
-                     vhost_discard_vq_desc(vq, headcount);
+                     vhost_discard_vq_desc(vq, headcount, ndesc);
                      goto out;
              }
              nvq->done_idx += headcount;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8570fdf2e14a..b56568807588 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2792,18 +2792,11 @@ static int get_indirect(struct vhost_virtqueue *vq,
      return 0;
 }

-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found.  A negative code is
- * returned on error. */
A new module API with no docs at all is not good.
Please add documentation to this one. vhost_get_vq_desc
is a subset and could refer to it.
Fixed.
quoted
-int vhost_get_vq_desc(struct vhost_virtqueue *vq,
-                   struct iovec iov[], unsigned int iov_size,
-                   unsigned int *out_num, unsigned int *in_num,
-                   struct vhost_log *log, unsigned int *log_num)
+int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
+                     struct iovec iov[], unsigned int iov_size,
+                     unsigned int *out_num, unsigned int *in_num,
+                     struct vhost_log *log, unsigned int *log_num,
+                     unsigned int *ndesc)
quoted
 {
      bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
      struct vring_desc desc;
@@ -2921,16 +2914,40 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
      vq->last_avail_idx++;
      vq->next_avail_head += c;

+     if (ndesc)
+             *ndesc = c;
+
      /* Assume notifications from guest are disabled at this point,
       * if they aren't we would need to update avail_event index. */
      BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
      return head;
 }
+EXPORT_SYMBOL_GPL(vhost_get_vq_desc_n);
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access.  Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which is
+ * never a valid descriptor number) if none was found.  A negative code is
+ * returned on error.
+ */
+int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+                   struct iovec iov[], unsigned int iov_size,
+                   unsigned int *out_num, unsigned int *in_num,
+                   struct vhost_log *log, unsigned int *log_num)
+{
+     return vhost_get_vq_desc_n(vq, iov, iov_size, out_num, in_num,
+                                log, log_num, NULL);
+}
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);

 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
-void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
+void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n,
+                        unsigned int ndesc)
ndesc is number of descriptors? And n is what, in that case?
The semantic of n is not changed which is the number of buffers, ndesc
is the number of descriptors.
quoted
 {
+     vq->next_avail_head -= ndesc;
      vq->last_avail_idx -= n;
 }
 EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 621a6d9a8791..69a39540df3d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -230,7 +230,14 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
                    struct iovec iov[], unsigned int iov_size,
                    unsigned int *out_num, unsigned int *in_num,
                    struct vhost_log *log, unsigned int *log_num);
-void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
+
+int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
+                     struct iovec iov[], unsigned int iov_size,
+                     unsigned int *out_num, unsigned int *in_num,
+                     struct vhost_log *log, unsigned int *log_num,
+                     unsigned int *ndesc);
+
+void vhost_discard_vq_desc(struct vhost_virtqueue *, int n, unsigned int ndesc);

 bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work);
 bool vhost_vq_has_work(struct vhost_virtqueue *vq);
--
2.31.1
Thanks
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help