Thread (8 messages) 8 messages, 3 authors, 2018-10-03

Re: [PATCH 18.11] pci/vfio: allow mapping MSI-X BARs if kernel allows it

From: Takeshi Yoshimura <hidden>
Date: 2018-07-31 09:38:13

2018-07-30 20:17 GMT+09:00 Anatoly Burakov [off-list ref]:
quoted hunk ↗ jump to hunk
Currently, DPDK will skip mapping some areas (or even an entire BAR)
if MSI-X happens to be in it but is smaller than page address.

Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this
as a capability flag. Capability flags themselves are also only
supported since kernel 4.6 [2].

This commit will introduce support for checking VFIO capabilities,
and will use it to check if we are allowed to map BARs with MSI-X
tables in them, along with backwards compatibility for older
kernels, including a workaround for a variable rename in VFIO
region info structure [3].

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6

[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=c84982adb23bcf3b99b79ca33527cd2625fbe279

[3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=ff63eb638d63b95e489f976428f1df01391e15e4

Signed-off-by: Anatoly Burakov <redacted>
---
 drivers/bus/pci/linux/pci_vfio.c         | 127 ++++++++++++++++++++---
 lib/librte_eal/common/include/rte_vfio.h |  26 +++++
 2 files changed, 140 insertions(+), 13 deletions(-)
diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index 686386d6a..e7765ee11 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -415,6 +415,88 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
        return 0;
 }

+/*
+ * region info may contain capability headers, so we need to keep reallocating
+ * the memory until we match allocated memory size with argsz.
+ */
+static int
+pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
+               int region)
+{
+       struct vfio_region_info *ri;
+       size_t argsz = sizeof(*ri);
+       int ret;
+
+       ri = malloc(sizeof(*ri));
+       if (ri == NULL) {
+               RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n");
+               return -1;
+       }
+again:
+       memset(ri, 0, argsz);
+       ri->argsz = argsz;
+       ri->index = region;
+
+       ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, info);
+       if (ret) {
+               free(ri);
+               return ret;
+       }
+       if (ri->argsz != argsz) {
+               argsz = ri->argsz;
+               ri = realloc(ri, argsz);
+
+               if (ri == NULL) {
+                       RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n");
+                       return -1;
+               }
+               goto again;
+       }
+       *info = ri;
+
+       return 0;
+}
+
+static struct vfio_info_cap_header *
+pci_vfio_info_cap(struct vfio_region_info *info, int cap)
+{
+       struct vfio_info_cap_header *h;
+       size_t offset;
+
+       if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
+               /* VFIO info does not advertise capabilities */
+               return NULL;
+       }
+
+       offset = VFIO_CAP_OFFSET(info);
+       while (offset != 0) {
+               h = RTE_PTR_ADD(info, offset);
+               if (h->id == cap)
+                       return h;
+               offset = h->next;
+       }
+       return NULL;
+}
+
+static int
+pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
+{
+       struct vfio_region_info *info;
+       int ret;
+
+       ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
+       if (ret < 0)
+               return -1;
+
+       ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
+
+       /* cleanup */
+       free(info);
+
+       return ret;
+}
+
+
 static int
 pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 {
@@ -464,56 +546,75 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
        if (ret < 0) {
                RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n",
                                pci_addr);
-               goto err_vfio_dev_fd;
+               goto err_vfio_res;
+       }
+       /* if we found our MSI-X BAR region, check if we can mmap it */
+       if (vfio_res->msix_table.bar_index != -1) {
+               int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
+                               vfio_res->msix_table.bar_index);
+               if (ret < 0) {
+                       RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
+                       goto err_vfio_res;
+               } else if (ret != 0) {
+                       /* we can map it, so we don't care where it is */
+                       RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
+                       vfio_res->msix_table.bar_index = -1;
+               }
        }

        for (i = 0; i < (int) vfio_res->nb_maps; i++) {
-               struct vfio_region_info reg = { .argsz = sizeof(reg) };
+               struct vfio_region_info *reg;
                void *bar_addr;

-               reg.index = i;
-
-               ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
-               if (ret) {
+               ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
+               if (ret < 0) {
                        RTE_LOG(ERR, EAL, "  %s cannot get device region info "
-                                       "error %i (%s)\n", pci_addr, errno, strerror(errno));
+                               "error %i (%s)\n", pci_addr, errno,
+                               strerror(errno));
                        goto err_vfio_res;
                }

                /* chk for io port region */
                ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
-               if (ret < 0)
+               if (ret < 0) {
+                       free(reg);
                        goto err_vfio_res;
-               else if (ret) {
+               } else if (ret) {
                        RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
                                        i);
+                       free(reg);
                        continue;
                }

                /* skip non-mmapable BARs */
-               if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
+               if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
+                       free(reg);
                        continue;
+               }

                /* try mapping somewhere close to the end of hugepages */
                if (pci_map_addr == NULL)
                        pci_map_addr = pci_find_max_end_va();

                bar_addr = pci_map_addr;
-               pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
+               pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);

                maps[i].addr = bar_addr;
-               maps[i].offset = reg.offset;
-               maps[i].size = reg.size;
+               maps[i].offset = reg->offset;
+               maps[i].size = reg->size;
                maps[i].path = NULL; /* vfio doesn't have per-resource paths */

                ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
                if (ret < 0) {
                        RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n",
                                        pci_addr, i, strerror(errno));
+                       free(reg);
                        goto err_vfio_res;
                }

                dev->mem_resource[i].addr = maps[i].addr;
+
+               free(reg);
        }

        if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index 5ca13fcce..f6617e004 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -14,6 +14,8 @@
 extern "C" {
 #endif

+#include <stdint.h>
+
 /*
  * determine if VFIO is present on the system
  */
@@ -44,6 +46,30 @@ extern "C" {
 #define RTE_VFIO_NOIOMMU 8
 #endif

+/*
+ * capabilities are only supported on kernel 4.6+. there were also some API
+ * changes as well, so add a macro to get cap offset.
+ */
+#ifdef VFIO_REGION_INFO_FLAG_CAPS
+#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS
+#define VFIO_CAP_OFFSET(x) (x->cap_offset)
+#else
+#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3)
+#define VFIO_CAP_OFFSET(x) (x->resv)
+struct vfio_info_cap_header {
+       uint16_t id;
+       uint16_t version;
+       uint32_t next;
+};
+#endif
+
+/* kernels 4.16+ can map BAR containing MSI-X table */
+#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#else
+#define RTE_VFIO_CAP_MSIX_MAPPABLE 3
+#endif
+
 #else /* not VFIO_PRESENT */

 /* we don't need an actual definition, only pointer is used */
--
2.17.1
Hi Anatoly,
I have tested the patch on our ppc64le machine, but the
ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, info) in
pci_vfio_get_region_info() failed.
This may be an issue of ppc64le VFIO implementation. Let me investigate more...

Thanks,
Takeshi
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help