Re: [PATCH net-next v4 5/5] tools: virtio: introduce vhost_net_test
From: Jason Wang <jasowang@redhat.com>
Date: 2024-02-04 01:30:48
Also in:
lkml, virtualization
On Fri, Feb 2, 2024 at 8:24 PM Yunsheng Lin [off-list ref] wrote:
On 2024/2/2 12:05, Jason Wang wrote:quoted
On Tue, Jan 30, 2024 at 7:38 PM Yunsheng Lin [off-list ref] wrote:quoted
introduce vhost_net_test basing on virtio_test to test vhost_net changing in the kernel.Let's describe what kind of test is being done and how it is done here.How about something like below: This patch introduces testing for both vhost_net tx and rx. Steps for vhost_net tx testing: 1. Prepare a out buf 2. Kick the vhost_net to do tx processing 3. Do the receiving in the tun side 4. verify the data received by tun is correct Steps for vhost_net rx testing:: 1. Prepare a in buf 2. Do the sending in the tun side 3. Kick the vhost_net to do rx processing 4. verify the data received by vhost_net is correct
It looks like some important details were lost, e.g the logic for batching etc.
quoted
quoted
+ +static int tun_alloc(struct vdev_info *dev) +{ + struct ifreq ifr; + int len = HDR_LEN;Any reason you can't just use the virtio_net uapi?I didn't find a macro for that in include/uapi/linux/virtio_net.h. Did you mean using something like below? sizeof(struct virtio_net_hdr_mrg_rxbuf)
Yes.
quoted
quoted
+ int fd, e; + + fd = open("/dev/net/tun", O_RDWR); + if (fd < 0) { + perror("Cannot open /dev/net/tun"); + return fd; + } + + memset(&ifr, 0, sizeof(ifr)); + + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; + snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid()); + + e = ioctl(fd, TUNSETIFF, &ifr); + if (e < 0) { + perror("ioctl[TUNSETIFF]"); + close(fd); + return e; + } + + e = ioctl(fd, TUNSETVNETHDRSZ, &len); + if (e < 0) { + perror("ioctl[TUNSETVNETHDRSZ]"); + close(fd); + return e; + } + + e = ioctl(fd, SIOCGIFHWADDR, &ifr); + if (e < 0) { + perror("ioctl[SIOCGIFHWADDR]"); + close(fd); + return e; + } + + memcpy(dev->mac, &ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); + return fd; +} + +static void vdev_create_socket(struct vdev_info *dev) +{ + struct ifreq ifr; + + dev->sock = socket(AF_PACKET, SOCK_RAW, htons(TEST_PTYPE)); + assert(dev->sock != -1); + + snprintf(ifr.ifr_name, IFNAMSIZ, "tun_%d", getpid());Nit: it might be better to accept the device name instead of repeating the snprintf trick here, this would facilitate the future changes.I am not sure I understand what did you mean by "accept the device name" here. The above is used to get ifindex of the tun netdevice created in tun_alloc(), so that we can use it in vdev_send_packet() to send a packet using the tun netdevice created in tun_alloc(). Is there anything obvious I missed here?
I meant a const char *ifname for this function and let the caller to pass the name.
quoted
quoted
+ assert(ioctl(dev->sock, SIOCGIFINDEX, &ifr) >= 0); + + dev->ifindex = ifr.ifr_ifindex; + + /* Set the flags that bring the device up */ + assert(ioctl(dev->sock, SIOCGIFFLAGS, &ifr) >= 0); + ifr.ifr_flags |= (IFF_UP | IFF_RUNNING); + assert(ioctl(dev->sock, SIOCSIFFLAGS, &ifr) >= 0); +} + +static void vdev_send_packet(struct vdev_info *dev) +{ + char *sendbuf = dev->test_buf + HDR_LEN; + struct sockaddr_ll saddrll = {0}; + int sockfd = dev->sock; + int ret; + + saddrll.sll_family = PF_PACKET; + saddrll.sll_ifindex = dev->ifindex; + saddrll.sll_halen = ETH_ALEN; + saddrll.sll_protocol = htons(TEST_PTYPE); + + ret = sendto(sockfd, sendbuf, TEST_BUF_LEN, 0, + (struct sockaddr *)&saddrll, + sizeof(struct sockaddr_ll)); + assert(ret >= 0); +} +...quoted
quoted
+ +static void vq_info_add(struct vdev_info *dev, int idx, int num, int fd) +{ + struct vhost_vring_file backend = { .index = idx, .fd = fd }; + struct vq_info *info = &dev->vqs[idx]; + int r; + + info->idx = idx; + info->kick = eventfd(0, EFD_NONBLOCK); + info->call = eventfd(0, EFD_NONBLOCK);If we don't care about the callback, let's just avoid to set the call here? (As I see vq_callback is a NULL)Sure, will remove the vq_callback related code.quoted
quoted
+ r = posix_memalign(&info->ring, 4096, vring_size(num, 4096)); + assert(r >= 0); + vq_reset(info, num, &dev->vdev); + vhost_vq_setup(dev, info); + info->fds.fd = info->call; + info->fds.events = POLLIN; + + r = ioctl(dev->control, VHOST_NET_SET_BACKEND, &backend); + assert(!r); +} + +static void vdev_info_init(struct vdev_info *dev, unsigned long long features) +{ + struct ether_header *eh; + int i, r; + + dev->vdev.features = features; + INIT_LIST_HEAD(&dev->vdev.vqs); + spin_lock_init(&dev->vdev.vqs_list_lock); + + dev->buf_size = (HDR_LEN + TEST_BUF_LEN) * 2; + dev->buf = malloc(dev->buf_size); + assert(dev->buf); + dev->test_buf = dev->buf; + dev->res_buf = dev->test_buf + HDR_LEN + TEST_BUF_LEN; + + memset(dev->test_buf, 0, HDR_LEN + TEST_BUF_LEN); + eh = (struct ether_header *)(dev->test_buf + HDR_LEN); + eh->ether_type = htons(TEST_PTYPE); + memcpy(eh->ether_dhost, dev->mac, ETHER_ADDR_LEN); + memcpy(eh->ether_shost, dev->mac, ETHER_ADDR_LEN); + + for (i = sizeof(*eh); i < TEST_BUF_LEN; i++) + dev->test_buf[i + HDR_LEN] = (char)i; + + dev->control = open("/dev/vhost-net", O_RDWR); + assert(dev->control >= 0); + + r = ioctl(dev->control, VHOST_SET_OWNER, NULL); + assert(r >= 0); + + dev->mem = malloc(offsetof(struct vhost_memory, regions) + + sizeof(dev->mem->regions[0])); + assert(dev->mem); + memset(dev->mem, 0, offsetof(struct vhost_memory, regions) + + sizeof(dev->mem->regions[0])); + dev->mem->nregions = 1; + dev->mem->regions[0].guest_phys_addr = (long)dev->buf; + dev->mem->regions[0].userspace_addr = (long)dev->buf; + dev->mem->regions[0].memory_size = dev->buf_size; + + r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem); + assert(r >= 0); + + r = ioctl(dev->control, VHOST_SET_FEATURES, &features); + assert(r >= 0); + + dev->nvqs = 2; +} + +static void wait_for_interrupt(struct vq_info *vq) +{ + unsigned long long val; + + poll(&vq->fds, 1, -1); + + if (vq->fds.revents & POLLIN) + read(vq->fds.fd, &val, sizeof(val)); +} + +static void verify_res_buf(char *res_buf) +{ + int i; + + for (i = ETHER_HDR_LEN; i < TEST_BUF_LEN; i++) + assert(res_buf[i] == (char)i); +} + +static void run_tx_test(struct vdev_info *dev, struct vq_info *vq, + bool delayed, int batch, int bufs)It might be better to describe the test design briefly above as a comment. Or we can start from simple test logic and add sophisticated ones on top.Does something described in the comment log as suggested by you make sense to you? Steps for vhost_net tx testing: 1. Prepare a out buf 2. Kick the vhost_net to do tx processing 3. Do the receiving in the tun side 4. verify the data received by tun is correct
See my reply above.
quoted
quoted
+{ + const bool random_batch = batch == RANDOM_BATCH; + long long spurious = 0; + struct scatterlist sl; + unsigned int len; + int r; + + for (;;) { + long started_before = vq->started; + long completed_before = vq->completed; + + virtqueue_disable_cb(vq->vq); + do { + if (random_batch) + batch = (random() % vq->vring.num) + 1; + + while (vq->started < bufs && + (vq->started - vq->completed) < batch) { + sg_init_one(&sl, dev->test_buf, HDR_LEN + TEST_BUF_LEN); + r = virtqueue_add_outbuf(vq->vq, &sl, 1, + dev->test_buf + vq->started, + GFP_ATOMIC); + if (unlikely(r != 0)) { + if (r == -ENOSPC && + vq->started > started_before) + r = 0; + else + r = -1; + break; + } + + ++vq->started; + + if (unlikely(!virtqueue_kick(vq->vq))) { + r = -1; + break; + } + } + + if (vq->started >= bufs) + r = -1; + + /* Flush out completed bufs if any */ + while (virtqueue_get_buf(vq->vq, &len)) { + int n; + + n = recvfrom(dev->sock, dev->res_buf, TEST_BUF_LEN, 0, NULL, NULL); + assert(n == TEST_BUF_LEN); + verify_res_buf(dev->res_buf); + + ++vq->completed; + r = 0; + } + } while (r == 0); + + if (vq->completed == completed_before && vq->started == started_before) + ++spurious; + + assert(vq->completed <= bufs); + assert(vq->started <= bufs); + if (vq->completed == bufs) + break; + + if (delayed) { + if (virtqueue_enable_cb_delayed(vq->vq)) + wait_for_interrupt(vq); + } else { + if (virtqueue_enable_cb(vq->vq)) + wait_for_interrupt(vq); + } + } + printf("TX spurious wakeups: 0x%llx started=0x%lx completed=0x%lx\n", + spurious, vq->started, vq->completed); +} + +static void run_rx_test(struct vdev_info *dev, struct vq_info *vq, + bool delayed, int batch, int bufs) +{ + const bool random_batch = batch == RANDOM_BATCH; + long long spurious = 0; + struct scatterlist sl; + unsigned int len; + int r; + + for (;;) { + long started_before = vq->started; + long completed_before = vq->completed; + + do { + if (random_batch) + batch = (random() % vq->vring.num) + 1; + + while (vq->started < bufs && + (vq->started - vq->completed) < batch) { + sg_init_one(&sl, dev->res_buf, HDR_LEN + TEST_BUF_LEN); + + r = virtqueue_add_inbuf(vq->vq, &sl, 1, + dev->res_buf + vq->started, + GFP_ATOMIC); + if (unlikely(r != 0)) { + if (r == -ENOSPC &&Drivers usually maintain a #free_slots, this can help to avoid the trick for checking ENOSPC?The above "(vq->started - vq->completed) < batch" seems to ensure that the 'r' can't be '-ENOSPC'?
Well, if this is true any reason we still check ENOSPEC here?
We just need to ensure the batch <= desc_num, and the 'r == -ENOSPC' checking seems to be unnecessary.quoted
quoted
+ vq->started > started_before) + r = 0; + else + r = -1; + break; + } + + ++vq->started; + + vdev_send_packet(dev); + + if (unlikely(!virtqueue_kick(vq->vq))) { + r = -1; + break; + } + } + + if (vq->started >= bufs) + r = -1; + + /* Flush out completed bufs if any */ + while (virtqueue_get_buf(vq->vq, &len)) { + struct ether_header *eh; + + eh = (struct ether_header *)(dev->res_buf + HDR_LEN); + + /* tun netdev is up and running, ignore the + * non-TEST_PTYPE packet. + */I wonder if it's better to set up some kind of qdisc to avoid the unexpected packet here, or is it too complicated?Yes, at least I don't know to do that yet.
For example, the blackhole qdisc? Thanks