Thread (9 messages) 9 messages, 3 authors, 2023-08-03

RE: [PATCH v3 2/3] tools: hv: Add vmbus_bufring

From: Saurabh Singh Sengar <hidden>
Date: 2023-08-03 12:07:12
Also in: linux-doc, lkml

-----Original Message-----
From: Michael Kelley (LINUX) <redacted>
Sent: Thursday, August 3, 2023 3:14 AM
To: Saurabh Sengar <ssengar@linux.microsoft.com>; KY Srinivasan
[off-list ref]; Haiyang Zhang [off-list ref];
wei.liu@kernel.org; Dexuan Cui [off-list ref];
gregkh@linuxfoundation.org; corbet@lwn.net; linux-kernel@vger.kernel.org;
linux-hyperv@vger.kernel.org; linux-doc@vger.kernel.org
Subject: [EXTERNAL] RE: [PATCH v3 2/3] tools: hv: Add vmbus_bufring

From: Saurabh Sengar <ssengar@linux.microsoft.com> Sent: Friday, July 14,
2023 3:26 AM
quoted
Provide a userspace interface for userspace drivers or applications to
read/write a VMBus ringbuffer. A significant part of this code is
borrowed from DPDK[1]. Current library is supported exclusively for
the x86 architecture.

To build this library:
make -C tools/hv libvmbus_bufring.a

Applications using this library can include the vmbus_bufring.h header
file and libvmbus_bufring.a statically.

[1]
https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgith
ub.com%2FDPDK%2Fdpdk%2F&data=05%7C01%7Cssengar%40microsoft.com
%7C7aa6d
quoted
4dbbcb44895db5008db93a193c9%7C72f988bf86f141af91ab2d7cd011db47%7
C1%7C0
quoted
%7C638266094508922046%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLj
AwMDAiLCJQ
quoted
IjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata
=O0cvl
quoted
EWlbNS51VoaBHo5l2wWDDjAFJVdfDeT3t%2FR36Y%3D&reserved=0

Signed-off-by: Mary Hardy <redacted>
Signed-off-by: Saurabh Sengar <ssengar@linux.microsoft.com>
---
[V3]
- Made ring buffer data offset depend on page size
- remove rte_smp_rwmb macro and reused rte_compiler_barrier instead
- Added legal counsel sign-off
- Removed "Link:" tag
- Improve commit messages
- new library compilation dependent on x86
- simplify mmap

[V2]
- simpler sysfs path, less parsing

 tools/hv/Build           |   1 +
 tools/hv/Makefile        |  13 +-
 tools/hv/vmbus_bufring.c | 297
+++++++++++++++++++++++++++++++++++++++
 tools/hv/vmbus_bufring.h | 154 ++++++++++++++++++++
 4 files changed, 464 insertions(+), 1 deletion(-)  create mode 100644
tools/hv/vmbus_bufring.c  create mode 100644 tools/hv/vmbus_bufring.h
diff --git a/tools/hv/Build b/tools/hv/Build index
6cf51fa4b306..2a667d3d94cb 100644
--- a/tools/hv/Build
+++ b/tools/hv/Build
@@ -1,3 +1,4 @@
 hv_kvp_daemon-y += hv_kvp_daemon.o
 hv_vss_daemon-y += hv_vss_daemon.o
 hv_fcopy_daemon-y += hv_fcopy_daemon.o
+vmbus_bufring-y += vmbus_bufring.o
diff --git a/tools/hv/Makefile b/tools/hv/Makefile index
fe770e679ae8..33cf488fd20f 100644
--- a/tools/hv/Makefile
+++ b/tools/hv/Makefile
@@ -11,14 +11,19 @@ srctree := $(patsubst %/,%,$(dir $(CURDIR)))
srctree := $(patsubst %/,%,$(dir $(srctree)))  endif

+include $(srctree)/tools/scripts/Makefile.arch
+
 # Do not use make's built-in rules
 # (this improves performance and avoids hard-to-debug behaviour);
MAKEFLAGS += -r

 override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include

+ifeq ($(SRCARCH),x86)
+ALL_LIBS := libvmbus_bufring.a
+endif
 ALL_TARGETS := hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon
-ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS))
+ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) $(patsubst
%,$(OUTPUT)%,$(ALL_LIBS))

 ALL_SCRIPTS := hv_get_dhcp_info.sh hv_get_dns_info.sh
hv_set_ifconfig.sh
@@ -27,6 +32,12 @@ all: $(ALL_PROGRAMS)  export srctree OUTPUT CC LD
CFLAGS  include $(srctree)/tools/build/Makefile.include

+HV_VMBUS_BUFRING_IN := $(OUTPUT)vmbus_bufring.o
+$(HV_VMBUS_BUFRING_IN): FORCE
+	$(Q)$(MAKE) $(build)=vmbus_bufring
+$(OUTPUT)libvmbus_bufring.a : vmbus_bufring.o
+	$(AR) rcs $@ $^
+
 HV_KVP_DAEMON_IN := $(OUTPUT)hv_kvp_daemon-in.o
 $(HV_KVP_DAEMON_IN): FORCE
 	$(Q)$(MAKE) $(build)=hv_kvp_daemon
diff --git a/tools/hv/vmbus_bufring.c b/tools/hv/vmbus_bufring.c new
file mode 100644 index 000000000000..fb1f0489c625
--- /dev/null
+++ b/tools/hv/vmbus_bufring.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/*
+ * Copyright (c) 2009-2012,2016,2023 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ */
+
+#include <errno.h>
+#include <emmintrin.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include "vmbus_bufring.h"
+
+#define	rte_compiler_barrier()	({ asm volatile ("" : : : "memory"); })
+#define RINGDATA_START_OFFSET	(getpagesize())
+#define VMBUS_RQST_ERROR	0xFFFFFFFFFFFFFFFF
+#define ALIGN(val, align)	((typeof(val))((val) & (~((typeof(val))((align) -
1)))))
quoted
+
+/* Increase bufring index by inc with wraparound */ static inline
+uint32_t vmbus_br_idxinc(uint32_t idx, uint32_t inc, uint32_t sz) {
+	idx += inc;
+	if (idx >= sz)
+		idx -= sz;
+
+	return idx;
+}
+
+void vmbus_br_setup(struct vmbus_br *br, void *buf, unsigned int
+blen) {
+	br->vbr = buf;
+	br->windex = br->vbr->windex;
+	br->dsize = blen - RINGDATA_START_OFFSET; }
+
+static inline __always_inline void
+rte_smp_mb(void)
+{
+	asm volatile("lock addl $0, -128(%%rsp); " ::: "memory"); }
+
+static inline int
+rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t
+src) {
+	uint8_t res;
+
+	asm volatile("lock ; "
+		     "cmpxchgl %[src], %[dst];"
+		     "sete %[res];"
+		     : [res] "=a" (res),     /* output */
+		     [dst] "=m" (*dst)
+		     : [src] "r" (src),      /* input */
+		     "a" (exp),
+		     "m" (*dst)
+		     : "memory");            /* no-clobber list */
+	return res;
+}
+
+static inline uint32_t
+vmbus_txbr_copyto(const struct vmbus_br *tbr, uint32_t windex,
+		  const void *src0, uint32_t cplen) {
+	uint8_t *br_data = (uint8_t *)tbr->vbr + RINGDATA_START_OFFSET;
+	uint32_t br_dsize = tbr->dsize;
+	const uint8_t *src = src0;
+
+	if (cplen > br_dsize - windex) {
+		uint32_t fraglen = br_dsize - windex;
+
+		/* Wrap-around detected */
+		memcpy(br_data + windex, src, fraglen);
+		memcpy(br_data, src + fraglen, cplen - fraglen);
+	} else {
+		memcpy(br_data + windex, src, cplen);
+	}
+
+	return vmbus_br_idxinc(windex, cplen, br_dsize); }
+
+/*
+ * Write scattered channel packet to TX bufring.
+ *
+ * The offset of this channel packet is written as a 64bits value
+ * immediately after this channel packet.
+ *
+ * The write goes through three stages:
+ *  1. Reserve space in ring buffer for the new data.
+ *     Writer atomically moves priv_write_index.
+ *  2. Copy the new data into the ring.
+ *  3. Update the tail of the ring (visible to host) that indicates
+ *     next read location. Writer updates write_index
+ */
+static int
+vmbus_txbr_write(struct vmbus_br *tbr, const struct iovec iov[], int iovlen,
+		 bool *need_sig)
+{
+	struct vmbus_bufring *vbr = tbr->vbr;
+	uint32_t ring_size = tbr->dsize;
+	uint32_t old_windex, next_windex, windex, total;
+	uint64_t save_windex;
+	int i;
+
+	total = 0;
+	for (i = 0; i < iovlen; i++)
+		total += iov[i].iov_len;
+	total += sizeof(save_windex);
+
+	/* Reserve space in ring */
+	do {
+		uint32_t avail;
+
+		/* Get current free location */
+		old_windex = tbr->windex;
+
+		/* Prevent compiler reordering this with calculation */
+		rte_compiler_barrier();
+
+		avail = vmbus_br_availwrite(tbr, old_windex);
+
+		/* If not enough space in ring, then tell caller. */
+		if (avail <= total)
+			return -EAGAIN;
+
+		next_windex = vmbus_br_idxinc(old_windex, total, ring_size);
+
+		/* Atomic update of next write_index for other threads */
+	} while (!rte_atomic32_cmpset(&tbr->windex, old_windex,
+next_windex));
+
+	/* Space from old..new is now reserved */
+	windex = old_windex;
+	for (i = 0; i < iovlen; i++)
+		windex = vmbus_txbr_copyto(tbr, windex, iov[i].iov_base,
+iov[i].iov_len);
+
+	/* Set the offset of the current channel packet. */
+	save_windex = ((uint64_t)old_windex) << 32;
+	windex = vmbus_txbr_copyto(tbr, windex, &save_windex,
+				   sizeof(save_windex));
+
+	/* The region reserved should match region used */
+	if (windex != next_windex)
+		return -EINVAL;
+
+	/* Ensure that data is available before updating host index */
+	rte_compiler_barrier();
+
+	/* Checkin for our reservation. wait for our turn to update host */
+	while (!rte_atomic32_cmpset(&vbr->windex, old_windex,
next_windex))
quoted
+		_mm_pause();
+
+	return 0;
+}
+
+int rte_vmbus_chan_send(struct vmbus_br *txbr, uint16_t type, void
*data,
quoted
+			uint32_t dlen, uint32_t flags)
+{
+	struct vmbus_chanpkt pkt;
+	unsigned int pktlen, pad_pktlen;
+	const uint32_t hlen = sizeof(pkt);
+	bool send_evt = false;
+	uint64_t pad = 0;
+	struct iovec iov[3];
+	int error;
+
+	pktlen = hlen + dlen;
+	pad_pktlen = ALIGN(pktlen, sizeof(uint64_t));
This ALIGN function rounds down.  So pad_pktlen could be less than pktlen.
Thanks for pointing this, will fix.
quoted
+
+	pkt.hdr.type = type;
+	pkt.hdr.flags = flags;
+	pkt.hdr.hlen = hlen >> VMBUS_CHANPKT_SIZE_SHIFT;
+	pkt.hdr.tlen = pad_pktlen >> VMBUS_CHANPKT_SIZE_SHIFT;
+	pkt.hdr.xactid = VMBUS_RQST_ERROR; /* doesn't support multiple
+requests at same time */
+
+	iov[0].iov_base = &pkt;
+	iov[0].iov_len = hlen;
+	iov[1].iov_base = data;
+	iov[1].iov_len = dlen;
+	iov[2].iov_base = &pad;
+	iov[2].iov_len = pad_pktlen - pktlen;
Given the way your ALIGN function works, the above could produce a
negative value for iov[2].iov_len.  Then bad things will happen. :-(
Got it.
quoted
+
+	error = vmbus_txbr_write(txbr, iov, 3, &send_evt);
+
+	return error;
+}
+
<snip>
quoted
+	 * we can request that the receiver interrupt the sender
+	 * when the ring transitions from being full to being able
+	 * to handle a message of size "pending_send_sz".
+	 *
+	 * Add necessary state for this enhancement.
+	 */
+	volatile uint32_t pending_send;
+	uint32_t reserved1[12];
+
+	union {
+		struct {
+			uint32_t feat_pending_send_sz:1;
+		};
+		uint32_t value;
+	} feature_bits;
+
+	/*
+	 * Ring data starts here + RingDataStartOffset
This mention of RingDataStartOffset looks stale.  I could not find it defined
anywhere.
Will correct it to:
Ring data starts after PAGE_SIZE offset from the start of this struct (RINGDATA_START_OFFSET).

- Saurabh
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help