RE: [PATCH v3 2/3] tools: hv: Add vmbus_bufring
From: Saurabh Singh Sengar <hidden>
Date: 2023-08-03 12:07:12
Also in:
linux-doc, lkml
-----Original Message----- From: Michael Kelley (LINUX) <redacted> Sent: Thursday, August 3, 2023 3:14 AM To: Saurabh Sengar <ssengar@linux.microsoft.com>; KY Srinivasan [off-list ref]; Haiyang Zhang [off-list ref]; wei.liu@kernel.org; Dexuan Cui [off-list ref]; gregkh@linuxfoundation.org; corbet@lwn.net; linux-kernel@vger.kernel.org; linux-hyperv@vger.kernel.org; linux-doc@vger.kernel.org Subject: [EXTERNAL] RE: [PATCH v3 2/3] tools: hv: Add vmbus_bufring From: Saurabh Sengar <ssengar@linux.microsoft.com> Sent: Friday, July 14, 2023 3:26 AMquoted
Provide a userspace interface for userspace drivers or applications to read/write a VMBus ringbuffer. A significant part of this code is borrowed from DPDK[1]. Current library is supported exclusively for the x86 architecture. To build this library: make -C tools/hv libvmbus_bufring.a Applications using this library can include the vmbus_bufring.h header file and libvmbus_bufring.a statically. [1] https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FDPDK%2Fdpdk%2F&data=05%7C01%7Cssengar%40microsoft.com %7C7aa6dquoted
4dbbcb44895db5008db93a193c9%7C72f988bf86f141af91ab2d7cd011db47%7 C1%7C0quoted
%7C638266094508922046%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLj AwMDAiLCJQquoted
IjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata =O0cvlquoted
EWlbNS51VoaBHo5l2wWDDjAFJVdfDeT3t%2FR36Y%3D&reserved=0 Signed-off-by: Mary Hardy <redacted> Signed-off-by: Saurabh Sengar <ssengar@linux.microsoft.com> --- [V3] - Made ring buffer data offset depend on page size - remove rte_smp_rwmb macro and reused rte_compiler_barrier instead - Added legal counsel sign-off - Removed "Link:" tag - Improve commit messages - new library compilation dependent on x86 - simplify mmap [V2] - simpler sysfs path, less parsing tools/hv/Build | 1 + tools/hv/Makefile | 13 +- tools/hv/vmbus_bufring.c | 297 +++++++++++++++++++++++++++++++++++++++ tools/hv/vmbus_bufring.h | 154 ++++++++++++++++++++ 4 files changed, 464 insertions(+), 1 deletion(-) create mode 100644 tools/hv/vmbus_bufring.c create mode 100644 tools/hv/vmbus_bufring.hdiff --git a/tools/hv/Build b/tools/hv/Build index6cf51fa4b306..2a667d3d94cb 100644--- a/tools/hv/Build +++ b/tools/hv/Build@@ -1,3 +1,4 @@ hv_kvp_daemon-y += hv_kvp_daemon.o hv_vss_daemon-y += hv_vss_daemon.o hv_fcopy_daemon-y += hv_fcopy_daemon.o +vmbus_bufring-y += vmbus_bufring.odiff --git a/tools/hv/Makefile b/tools/hv/Makefile indexfe770e679ae8..33cf488fd20f 100644--- a/tools/hv/Makefile +++ b/tools/hv/Makefile@@ -11,14 +11,19 @@ srctree := $(patsubst %/,%,$(dir $(CURDIR)))srctree := $(patsubst %/,%,$(dir $(srctree))) endif +include $(srctree)/tools/scripts/Makefile.arch + # Do not use make's built-in rules # (this improves performance and avoids hard-to-debug behaviour); MAKEFLAGS += -r override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include +ifeq ($(SRCARCH),x86) +ALL_LIBS := libvmbus_bufring.a +endif ALL_TARGETS := hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon -ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) +ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) $(patsubst %,$(OUTPUT)%,$(ALL_LIBS)) ALL_SCRIPTS := hv_get_dhcp_info.sh hv_get_dns_info.sh hv_set_ifconfig.sh@@ -27,6 +32,12 @@ all: $(ALL_PROGRAMS) export srctree OUTPUT CC LDCFLAGS include $(srctree)/tools/build/Makefile.include +HV_VMBUS_BUFRING_IN := $(OUTPUT)vmbus_bufring.o +$(HV_VMBUS_BUFRING_IN): FORCE + $(Q)$(MAKE) $(build)=vmbus_bufring +$(OUTPUT)libvmbus_bufring.a : vmbus_bufring.o + $(AR) rcs $@ $^ + HV_KVP_DAEMON_IN := $(OUTPUT)hv_kvp_daemon-in.o $(HV_KVP_DAEMON_IN): FORCE $(Q)$(MAKE) $(build)=hv_kvp_daemondiff --git a/tools/hv/vmbus_bufring.c b/tools/hv/vmbus_bufring.c newfile mode 100644 index 000000000000..fb1f0489c625--- /dev/null +++ b/tools/hv/vmbus_bufring.c@@ -0,0 +1,297 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* + * Copyright (c) 2009-2012,2016,2023 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + */ + +#include <errno.h> +#include <emmintrin.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <sys/uio.h> +#include "vmbus_bufring.h" + +#define rte_compiler_barrier() ({ asm volatile ("" : : : "memory"); }) +#define RINGDATA_START_OFFSET (getpagesize()) +#define VMBUS_RQST_ERROR 0xFFFFFFFFFFFFFFFF +#define ALIGN(val, align) ((typeof(val))((val) & (~((typeof(val))((align) -1)))))quoted
+ +/* Increase bufring index by inc with wraparound */ static inline +uint32_t vmbus_br_idxinc(uint32_t idx, uint32_t inc, uint32_t sz) { + idx += inc; + if (idx >= sz) + idx -= sz; + + return idx; +} + +void vmbus_br_setup(struct vmbus_br *br, void *buf, unsigned int +blen) { + br->vbr = buf; + br->windex = br->vbr->windex; + br->dsize = blen - RINGDATA_START_OFFSET; } + +static inline __always_inline void +rte_smp_mb(void) +{ + asm volatile("lock addl $0, -128(%%rsp); " ::: "memory"); } + +static inline int +rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t +src) { + uint8_t res; + + asm volatile("lock ; " + "cmpxchgl %[src], %[dst];" + "sete %[res];" + : [res] "=a" (res), /* output */ + [dst] "=m" (*dst) + : [src] "r" (src), /* input */ + "a" (exp), + "m" (*dst) + : "memory"); /* no-clobber list */ + return res; +} + +static inline uint32_t +vmbus_txbr_copyto(const struct vmbus_br *tbr, uint32_t windex, + const void *src0, uint32_t cplen) { + uint8_t *br_data = (uint8_t *)tbr->vbr + RINGDATA_START_OFFSET; + uint32_t br_dsize = tbr->dsize; + const uint8_t *src = src0; + + if (cplen > br_dsize - windex) { + uint32_t fraglen = br_dsize - windex; + + /* Wrap-around detected */ + memcpy(br_data + windex, src, fraglen); + memcpy(br_data, src + fraglen, cplen - fraglen); + } else { + memcpy(br_data + windex, src, cplen); + } + + return vmbus_br_idxinc(windex, cplen, br_dsize); } + +/* + * Write scattered channel packet to TX bufring. + * + * The offset of this channel packet is written as a 64bits value + * immediately after this channel packet. + * + * The write goes through three stages: + * 1. Reserve space in ring buffer for the new data. + * Writer atomically moves priv_write_index. + * 2. Copy the new data into the ring. + * 3. Update the tail of the ring (visible to host) that indicates + * next read location. Writer updates write_index + */ +static int +vmbus_txbr_write(struct vmbus_br *tbr, const struct iovec iov[], int iovlen, + bool *need_sig) +{ + struct vmbus_bufring *vbr = tbr->vbr; + uint32_t ring_size = tbr->dsize; + uint32_t old_windex, next_windex, windex, total; + uint64_t save_windex; + int i; + + total = 0; + for (i = 0; i < iovlen; i++) + total += iov[i].iov_len; + total += sizeof(save_windex); + + /* Reserve space in ring */ + do { + uint32_t avail; + + /* Get current free location */ + old_windex = tbr->windex; + + /* Prevent compiler reordering this with calculation */ + rte_compiler_barrier(); + + avail = vmbus_br_availwrite(tbr, old_windex); + + /* If not enough space in ring, then tell caller. */ + if (avail <= total) + return -EAGAIN; + + next_windex = vmbus_br_idxinc(old_windex, total, ring_size); + + /* Atomic update of next write_index for other threads */ + } while (!rte_atomic32_cmpset(&tbr->windex, old_windex, +next_windex)); + + /* Space from old..new is now reserved */ + windex = old_windex; + for (i = 0; i < iovlen; i++) + windex = vmbus_txbr_copyto(tbr, windex, iov[i].iov_base, +iov[i].iov_len); + + /* Set the offset of the current channel packet. */ + save_windex = ((uint64_t)old_windex) << 32; + windex = vmbus_txbr_copyto(tbr, windex, &save_windex, + sizeof(save_windex)); + + /* The region reserved should match region used */ + if (windex != next_windex) + return -EINVAL; + + /* Ensure that data is available before updating host index */ + rte_compiler_barrier(); + + /* Checkin for our reservation. wait for our turn to update host */ + while (!rte_atomic32_cmpset(&vbr->windex, old_windex,next_windex))quoted
+ _mm_pause(); + + return 0; +} + +int rte_vmbus_chan_send(struct vmbus_br *txbr, uint16_t type, void*data,quoted
+ uint32_t dlen, uint32_t flags) +{ + struct vmbus_chanpkt pkt; + unsigned int pktlen, pad_pktlen; + const uint32_t hlen = sizeof(pkt); + bool send_evt = false; + uint64_t pad = 0; + struct iovec iov[3]; + int error; + + pktlen = hlen + dlen; + pad_pktlen = ALIGN(pktlen, sizeof(uint64_t));This ALIGN function rounds down. So pad_pktlen could be less than pktlen.
Thanks for pointing this, will fix.
quoted
+ + pkt.hdr.type = type; + pkt.hdr.flags = flags; + pkt.hdr.hlen = hlen >> VMBUS_CHANPKT_SIZE_SHIFT; + pkt.hdr.tlen = pad_pktlen >> VMBUS_CHANPKT_SIZE_SHIFT; + pkt.hdr.xactid = VMBUS_RQST_ERROR; /* doesn't support multiple +requests at same time */ + + iov[0].iov_base = &pkt; + iov[0].iov_len = hlen; + iov[1].iov_base = data; + iov[1].iov_len = dlen; + iov[2].iov_base = &pad; + iov[2].iov_len = pad_pktlen - pktlen;Given the way your ALIGN function works, the above could produce a negative value for iov[2].iov_len. Then bad things will happen. :-(
Got it.
quoted
+ + error = vmbus_txbr_write(txbr, iov, 3, &send_evt); + + return error; +} +
<snip>
quoted
+ * we can request that the receiver interrupt the sender + * when the ring transitions from being full to being able + * to handle a message of size "pending_send_sz". + * + * Add necessary state for this enhancement. + */ + volatile uint32_t pending_send; + uint32_t reserved1[12]; + + union { + struct { + uint32_t feat_pending_send_sz:1; + }; + uint32_t value; + } feature_bits; + + /* + * Ring data starts here + RingDataStartOffsetThis mention of RingDataStartOffset looks stale. I could not find it defined anywhere.
Will correct it to: Ring data starts after PAGE_SIZE offset from the start of this struct (RINGDATA_START_OFFSET). - Saurabh