Re: [PATCH bpf-next v4 1/2] libbpf: add support for using AF_XDP sockets

From: Maciej Fijalkowski <hidden>
Date: 2019-02-18 11:21:54

On Mon, 18 Feb 2019 09:59:30 +0100
Magnus Karlsson [off-list ref] wrote:

On Fri, Feb 15, 2019 at 6:09 PM Daniel Borkmann [off-list ref] wrote:

quoted

On 02/08/2019 02:05 PM, Magnus Karlsson wrote:

quoted

This commit adds AF_XDP support to libbpf. The main reason for this is
to facilitate writing applications that use AF_XDP by offering
higher-level APIs that hide many of the details of the AF_XDP
uapi. This is in the same vein as libbpf facilitates XDP adoption by
offering easy-to-use higher level interfaces of XDP
functionality. Hopefully this will facilitate adoption of AF_XDP, make
applications using it simpler and smaller, and finally also make it
possible for applications to benefit from optimizations in the AF_XDP
user space access code. Previously, people just copied and pasted the
code from the sample application into their application, which is not
desirable.

The interface is composed of two parts:

* Low-level access interface to the four rings and the packet
* High-level control plane interface for creating and setting
  up umems and af_xdp sockets as well as a simple XDP program.

Tested-by: Björn Töpel <redacted>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>

[...]

quoted

diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
new file mode 100644
index 0000000..a982a76
--- /dev/null
+++ b/tools/lib/bpf/xsk.c

@@ -0,0 +1,742 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * AF_XDP user-space access library.
+ *
+ * Copyright(c) 2018 - 2019 Intel Corporation.
+ *
+ * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <asm/barrier.h>
+#include <linux/compiler.h>
+#include <linux/filter.h>
+#include <linux/if_ether.h>
+#include <linux/if_link.h>
+#include <linux/if_packet.h>
+#include <linux/if_xdp.h>
+#include <linux/rtnetlink.h>
+#include <net/if.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "bpf.h"
+#include "libbpf.h"
+#include "libbpf_util.h"
+#include "nlattr.h"
+#include "xsk.h"
+
+#ifndef SOL_XDP
+ #define SOL_XDP 283
+#endif
+
+#ifndef AF_XDP
+ #define AF_XDP 44
+#endif
+
+#ifndef PF_XDP
+ #define PF_XDP AF_XDP
+#endif
+
+struct xsk_umem {
+     struct xsk_ring_prod *fill;
+     struct xsk_ring_cons *comp;
+     char *umem_area;
+     struct xsk_umem_config config;
+     int fd;
+     int refcount;
+};
+
+struct xsk_socket {
+     struct xsk_ring_cons *rx;
+     struct xsk_ring_prod *tx;
+     __u64 outstanding_tx;
+     struct xsk_umem *umem;
+     struct xsk_socket_config config;
+     int fd;
+     int xsks_map;
+     int ifindex;
+     int prog_fd;
+     int qidconf_map_fd;
+     int xsks_map_fd;
+     __u32 queue_id;
+};
+
+struct xsk_nl_info {
+     bool xdp_prog_attached;
+     int ifindex;
+     int fd;
+};
+
+#define MAX_QUEUES 128

Why is this a fixed constant here, shouldn't this be dynamic due to being NIC
specific anyway?

It was only here for simplicity. If a NIC had more queues, it would
require a recompile of the lib. Obviously, not desirable in a distro.
What I could do is to read the max "combined" queues (pre-set maximum
in the ethtool output) from the same interface as ethool uses and size
the array after that. Or is there a simpler way? What to do if the NIC
does not have a "combined", or is there no such NIC (seems the common
HW ones set this)?

quoted

[...]

quoted

+void *xsk_umem__get_data(struct xsk_umem *umem, __u64 addr)
+{
+     return &((char *)(umem->umem_area))[addr];
+}

There's also a xsk_umem__get_data_raw() doing the same. Why having both, resp.
when to choose which? ;)

There is enough to have the xsk_umem__get_data_raw() function.
xsk_umem__get_data() is just a convenience function for which the
application does not have to store the beginning of the umem. But as
the application always has to provide this anyway in the
xsk_umem__create() function, it might as well store this pointer. I
will delete xsk_umem__get_data() and rename xsk_umem__get_data_raw()
to xsk_umem__get_data().

quoted

+int xsk_umem__fd(const struct xsk_umem *umem)
+{
+     return umem ? umem->fd : -EINVAL;
+}
+
+int xsk_socket__fd(const struct xsk_socket *xsk)
+{
+     return xsk ? xsk->fd : -EINVAL;
+}
+
+static bool xsk_page_aligned(void *buffer)
+{
+     unsigned long addr = (unsigned long)buffer;
+
+     return !(addr & (getpagesize() - 1));
+}
+
+static void xsk_set_umem_config(struct xsk_umem_config *cfg,
+                             const struct xsk_umem_config *usr_cfg)
+{
+     if (!usr_cfg) {
+             cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+             cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+             cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
+             cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
+             return;
+     }
+
+     cfg->fill_size = usr_cfg->fill_size;
+     cfg->comp_size = usr_cfg->comp_size;
+     cfg->frame_size = usr_cfg->frame_size;
+     cfg->frame_headroom = usr_cfg->frame_headroom;

Just optional nit, might be a bit nicer to have it in this form:

        cfg->fill_size = usr_cfg ? usr_cfg->fill_size :
                         XSK_RING_PROD__DEFAULT_NUM_DESCS;

I actually think the current form is clearer when there are multiple
lines. If there was only one line, I would agree with you.

quoted

+}
+
+static void xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
+                                   const struct xsk_socket_config *usr_cfg)
+{
+     if (!usr_cfg) {
+             cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+             cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+             cfg->libbpf_flags = 0;
+             cfg->xdp_flags = 0;
+             cfg->bind_flags = 0;
+             return;
+     }
+
+     cfg->rx_size = usr_cfg->rx_size;
+     cfg->tx_size = usr_cfg->tx_size;
+     cfg->libbpf_flags = usr_cfg->libbpf_flags;
+     cfg->xdp_flags = usr_cfg->xdp_flags;
+     cfg->bind_flags = usr_cfg->bind_flags;

(Ditto)

quoted

+}
+
+int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
+                  struct xsk_ring_prod *fill, struct xsk_ring_cons *comp,
+                  const struct xsk_umem_config *usr_config)
+{
+     struct xdp_mmap_offsets off;
+     struct xdp_umem_reg mr;
+     struct xsk_umem *umem;
+     socklen_t optlen;
+     void *map;
+     int err;
+
+     if (!umem_area || !umem_ptr || !fill || !comp)
+             return -EFAULT;
+     if (!size && !xsk_page_aligned(umem_area))
+             return -EINVAL;
+
+     umem = calloc(1, sizeof(*umem));
+     if (!umem)
+             return -ENOMEM;
+
+     umem->fd = socket(AF_XDP, SOCK_RAW, 0);
+     if (umem->fd < 0) {
+             err = -errno;
+             goto out_umem_alloc;
+     }
+
+     umem->umem_area = umem_area;
+     xsk_set_umem_config(&umem->config, usr_config);
+
+     mr.addr = (uintptr_t)umem_area;
+     mr.len = size;
+     mr.chunk_size = umem->config.frame_size;
+     mr.headroom = umem->config.frame_headroom;
+
+     err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
+     if (err) {
+             err = -errno;
+             goto out_socket;
+     }
+     err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_FILL_RING,
+                      &umem->config.fill_size,
+                      sizeof(umem->config.fill_size));
+     if (err) {
+             err = -errno;
+             goto out_socket;
+     }
+     err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
+                      &umem->config.comp_size,
+                      sizeof(umem->config.comp_size));
+     if (err) {
+             err = -errno;
+             goto out_socket;
+     }
+
+     optlen = sizeof(off);
+     err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
+     if (err) {
+             err = -errno;
+             goto out_socket;
+     }
+
+     map = xsk_mmap(NULL, off.fr.desc +
+                    umem->config.fill_size * sizeof(__u64),
+                    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
+                    umem->fd, XDP_UMEM_PGOFF_FILL_RING);
+     if (map == MAP_FAILED) {
+             err = -errno;
+             goto out_socket;
+     }
+
+     umem->fill = fill;
+     fill->mask = umem->config.fill_size - 1;
+     fill->size = umem->config.fill_size;
+     fill->producer = map + off.fr.producer;
+     fill->consumer = map + off.fr.consumer;
+     fill->ring = map + off.fr.desc;
+     fill->cached_cons = umem->config.fill_size;
+
+     map = xsk_mmap(NULL,
+                    off.cr.desc + umem->config.comp_size * sizeof(__u64),
+                    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
+                    umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING);
+     if (map == MAP_FAILED) {
+             err = -errno;
+             goto out_mmap;
+     }
+
+     umem->comp = comp;
+     comp->mask = umem->config.comp_size - 1;
+     comp->size = umem->config.comp_size;
+     comp->producer = map + off.cr.producer;
+     comp->consumer = map + off.cr.consumer;
+     comp->ring = map + off.cr.desc;
+
+     *umem_ptr = umem;
+     return 0;
+
+out_mmap:
+     munmap(umem->fill,
+            off.fr.desc + umem->config.fill_size * sizeof(__u64));
+out_socket:
+     close(umem->fd);
+out_umem_alloc:
+     free(umem);
+     return err;
+}
+
+static int xsk_parse_nl(void *cookie, void *msg, struct nlattr **tb)
+{
+     struct nlattr *tb_parsed[IFLA_XDP_MAX + 1];
+     struct xsk_nl_info *nl_info = cookie;
+     struct ifinfomsg *ifinfo = msg;
+     unsigned char mode;
+     int err;
+
+     if (nl_info->ifindex && nl_info->ifindex != ifinfo->ifi_index)
+             return 0;
+
+     if (!tb[IFLA_XDP])
+             return 0;
+
+     err = libbpf_nla_parse_nested(tb_parsed, IFLA_XDP_MAX, tb[IFLA_XDP],
+                                   NULL);
+     if (err)
+             return err;
+
+     if (!tb_parsed[IFLA_XDP_ATTACHED] || !tb_parsed[IFLA_XDP_FD])
+             return 0;
+
+     mode = libbpf_nla_getattr_u8(tb_parsed[IFLA_XDP_ATTACHED]);
+     if (mode == XDP_ATTACHED_NONE)
+             return 0;
+
+     nl_info->xdp_prog_attached = true;
+     nl_info->fd = libbpf_nla_getattr_u32(tb_parsed[IFLA_XDP_FD]);

Hm, I don't think this works if I read the intention of this helper correctly.

IFLA_XDP_FD is never set for retrieving the prog from the kernel. So the
above is a bug.

We also have bpf_get_link_xdp_id(). This should probably just be reused in
this context here.

If bpf_get_link_xdp_id() will fit my bill, I will happily use it. I
will check it out and hopefully I can drop all this code. Thanks.

I see that all you need to know is whether there's already attached XDP program
to xsk socket's related interface, no?
If so, then within the xsk_setup_xdp_prog, you could do something like:

	u32 prog_id = 0;

	bpf_get_link_xdp_id(xsk->ifindex, &prog_id, xsk->config.xdp_flags);
	if (!prog_id) {
		// create maps
		// load xdp prog
	} else {
		xsk->fd = prog_id;
	}

	xsk_update_bpf_maps(xsk, true, xsk->fd);

If that's ok then xsk_xdp_prog_attached and xsk_parse_nl could be dropped.

quoted

+     return 0;
+}
+
+static bool xsk_xdp_prog_attached(struct xsk_socket *xsk)
+{
+     struct xsk_nl_info nl_info;
+     unsigned int nl_pid;
+     char err_buf[256];
+     int sock, err;
+
+     sock = libbpf_netlink_open(&nl_pid);
+     if (sock < 0)
+             return false;
+
+     nl_info.xdp_prog_attached = false;
+     nl_info.ifindex = xsk->ifindex;
+     nl_info.fd = -1;
+
+     err = libbpf_nl_get_link(sock, nl_pid, xsk_parse_nl, &nl_info);
+     if (err) {
+             libbpf_strerror(err, err_buf, sizeof(err_buf));
+             pr_warning("Error:\n%s\n", err_buf);
+             close(sock);
+             return false;
+     }
+
+     close(sock);
+     xsk->prog_fd = nl_info.fd;
+     return nl_info.xdp_prog_attached;
+}

(See bpf_get_link_xdp_id().)

quoted

+
+static int xsk_load_xdp_prog(struct xsk_socket *xsk)
+{
+     char bpf_log_buf[BPF_LOG_BUF_SIZE];
+     int err, prog_fd;
+
+     /* This is the C-program:
+      * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
+      * {
+      *     int *qidconf, index = ctx->rx_queue_index;

[...]

quoted

+
+int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
+                    __u32 queue_id, struct xsk_umem *umem,
+                    struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
+                    const struct xsk_socket_config *usr_config)
+{
+     struct sockaddr_xdp sxdp = {};
+     struct xdp_mmap_offsets off;
+     struct xsk_socket *xsk;
+     socklen_t optlen;
+     void *map;
+     int err;
+
+     if (!umem || !xsk_ptr || !rx || !tx)
+             return -EFAULT;
+
+     if (umem->refcount) {
+             pr_warning("Error: shared umems not supported by libbpf.\n");
+             return -EBUSY;
+     }
+
+     xsk = calloc(1, sizeof(*xsk));
+     if (!xsk)
+             return -ENOMEM;
+
+     if (umem->refcount++ > 0) {

Should this refcount rather be atomic actually?

Neither our config nor data plane interfaces are reentrant for
performance reasons. Any concurrency has to be handled explicitly on
the application level. This so that it only penalizes apps that really
need this.

Thanks for all your reviews: Magnus

quoted

+             xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
+             if (xsk->fd < 0) {
+                     err = -errno;
+                     goto out_xsk_alloc;
+             }
+     } else {
+             xsk->fd = umem->fd;
+     }
+
+     xsk->outstanding_tx = 0;
+     xsk->queue_id = queue_id;
+     xsk->umem = umem;
+     xsk->ifindex = if_nametoindex(ifname);
+     if (!xsk->ifindex) {
+             err = -errno;
+             goto out_socket;
+     }
+
+     xsk_set_xdp_socket_config(&xsk->config, usr_config);

[...]

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help