Re: [PATCH v6 bpf-next 03/13] bpf: introduce BPF token object

From: Andrii Nakryiko <hidden>
Date: 2023-10-12 00:31:20
Also in: bpf, linux-fsdevel, netdev

On Tue, Oct 10, 2023 at 7:35 PM Hou Tao [off-list ref] wrote:

Hi,

On 9/28/2023 6:57 AM, Andrii Nakryiko wrote:

quoted

Add new kind of BPF kernel object, BPF token. BPF token is meant to
allow delegating privileged BPF functionality, like loading a BPF
program or creating a BPF map, from privileged process to a *trusted*
unprivileged process, all while have a good amount of control over which
privileged operations could be performed using provided BPF token.

This is achieved through mounting BPF FS instance with extra delegation
mount options, which determine what operations are delegatable, and also
constraining it to the owning user namespace (as mentioned in the
previous patch).

SNIP

quoted

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 70bfa997e896..78692911f4a0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h

@@ -847,6 +847,37 @@ union bpf_iter_link_info {
  *           Returns zero on success. On error, -1 is returned and *errno*
  *           is set appropriately.
  *
+ * BPF_TOKEN_CREATE
+ *   Description
+ *           Create BPF token with embedded information about what
+ *           BPF-related functionality it allows:
+ *           - a set of allowed bpf() syscall commands;
+ *           - a set of allowed BPF map types to be created with
+ *           BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed;
+ *           - a set of allowed BPF program types and BPF program attach
+ *           types to be loaded with BPF_PROG_LOAD command, if
+ *           BPF_PROG_LOAD itself is allowed.
+ *
+ *           BPF token is created (derived) from an instance of BPF FS,
+ *           assuming it has necessary delegation mount options specified.
+ *           BPF FS mount is specified with openat()-style path FD + string.
+ *           This BPF token can be passed as an extra parameter to various
+ *           bpf() syscall commands to grant BPF subsystem functionality to
+ *           unprivileged processes.
+ *
+ *           When created, BPF token is "associated" with the owning
+ *           user namespace of BPF FS instance (super block) that it was
+ *           derived from, and subsequent BPF operations performed with
+ *           BPF token would be performing capabilities checks (i.e.,
+ *           CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within
+ *           that user namespace. Without BPF token, such capabilities
+ *           have to be granted in init user namespace, making bpf()
+ *           syscall incompatible with user namespace, for the most part.
+ *
+ *   Return
+ *           A new file descriptor (a nonnegative integer), or -1 if an
+ *           error occurred (in which case, *errno* is set appropriately).
+ *
  * NOTES
  *   eBPF objects (maps and programs) can be shared between processes.
  *

@@ -901,6 +932,8 @@ enum bpf_cmd {
      BPF_ITER_CREATE,
      BPF_LINK_DETACH,
      BPF_PROG_BIND_MAP,
+     BPF_TOKEN_CREATE,
+     __MAX_BPF_CMD,
 };

 enum bpf_map_type {

@@ -1694,6 +1727,12 @@ union bpf_attr {
              __u32           flags;          /* extra flags */
      } prog_bind_map;

+     struct { /* struct used by BPF_TOKEN_CREATE command */
+             __u32           flags;
+             __u32           bpffs_path_fd;
+             __u64           bpffs_pathname;

Because bppfs_pathname is a string pointer, so __aligned_u64 is preferred.

ok, I'll use __aligned_u64, even though it can never be unaligned in this case

quoted

+     } token_create;
+
 } __attribute__((aligned(8)));

 /* The description below is an attempt at providing documentation to eBPF

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f526b7573e97..4ce95acfcaa7 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile

@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
 endif
 CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)

-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 24b3faf901f4..de1fdf396521 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c

@@ -99,9 +99,9 @@ static const struct inode_operations bpf_prog_iops = { };
 static const struct inode_operations bpf_map_iops  = { };
 static const struct inode_operations bpf_link_iops  = { };

-static struct inode *bpf_get_inode(struct super_block *sb,
-                                const struct inode *dir,
-                                umode_t mode)
+struct inode *bpf_get_inode(struct super_block *sb,
+                         const struct inode *dir,
+                         umode_t mode)
 {
      struct inode *inode;

@@ -603,11 +603,13 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 {
      struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
      umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
+     u64 mask;

      if (mode != S_IRWXUGO)
              seq_printf(m, ",mode=%o", mode);

-     if (opts->delegate_cmds == ~0ULL)
+     mask = (1ULL << __MAX_BPF_CMD) - 1;
+     if ((opts->delegate_cmds & mask) == mask)
              seq_printf(m, ",delegate_cmds=any");

Should we add a BUILD_BUG_ON assertion to guarantee __MAX_BPF_CMD is
less than sizeof(u64) * 8 ?

yep, good idea, will add for CMD and all others

quoted

      else if (opts->delegate_cmds)
              seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds);

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7445dad01fb3..b47791a80930 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c

@@ -5304,6 +5304,20 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
      return ret;
 }

+#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_pathname
+
+static int token_create(union bpf_attr *attr)
+{
+     if (CHECK_ATTR(BPF_TOKEN_CREATE))
+             return -EINVAL;
+
+     /* no flags are supported yet */
+     if (attr->token_create.flags)
+             return -EINVAL;
+
+     return bpf_token_create(attr);
+}
+
 static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 {
      union bpf_attr attr;

@@ -5437,6 +5451,9 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
      case BPF_PROG_BIND_MAP:
              err = bpf_prog_bind_map(&attr);
              break;
+     case BPF_TOKEN_CREATE:
+             err = token_create(&attr);
+             break;
      default:
              err = -EINVAL;
              break;

diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
new file mode 100644
index 000000000000..779aad5007a3
--- /dev/null
+++ b/kernel/bpf/token.c

SNIP

quoted

+#define BPF_TOKEN_INODE_NAME "bpf-token"
+
+static const struct inode_operations bpf_token_iops = { };
+
+static const struct file_operations bpf_token_fops = {
+     .release        = bpf_token_release,
+     .show_fdinfo    = bpf_token_show_fdinfo,
+};
+
+int bpf_token_create(union bpf_attr *attr)
+{
+     struct bpf_mount_opts *mnt_opts;
+     struct bpf_token *token = NULL;
+     struct inode *inode;
+     struct file *file;
+     struct path path;
+     umode_t mode;
+     int err, fd;
+
+     err = user_path_at(attr->token_create.bpffs_path_fd,
+                        u64_to_user_ptr(attr->token_create.bpffs_pathname),
+                        LOOKUP_FOLLOW | LOOKUP_EMPTY, &path);
+     if (err)
+             return err;

Need to check the mount is a bpffs mount instead of other filesystem mount.

yep, missed that. Fixed, will check `path.mnt->mnt_sb->s_op != &bpf_super_ops`.

quoted

+
+     if (path.mnt->mnt_root != path.dentry) {
+             err = -EINVAL;
+             goto out_path;
+     }
+     err = path_permission(&path, MAY_ACCESS);
+     if (err)
+             goto out_path;
+
+     mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+     inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode);
+     if (IS_ERR(inode)) {
+             err = PTR_ERR(inode);
+             goto out_path;
+     }
+
+     inode->i_op = &bpf_token_iops;
+     inode->i_fop = &bpf_token_fops;
+     clear_nlink(inode); /* make sure it is unlinked */
+
+     file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops);
+     if (IS_ERR(file)) {
+             iput(inode);
+             err = PTR_ERR(file);
+             goto out_file;

goto out_path ?

eagle eye, fixed, thanks!

quoted

+     }
+
+     token = bpf_token_alloc();
+     if (!token) {
+             err = -ENOMEM;
+             goto out_file;
+     }
+
+     /* remember bpffs owning userns for future ns_capable() checks */
+     token->userns = get_user_ns(path.dentry->d_sb->s_user_ns);
+
+     mnt_opts = path.dentry->d_sb->s_fs_info;
+     token->allowed_cmds = mnt_opts->delegate_cmds;
+
+     fd = get_unused_fd_flags(O_CLOEXEC);
+     if (fd < 0) {
+             err = fd;
+             goto out_token;
+     }
+
+     file->private_data = token;
+     fd_install(fd, file);
+
+     path_put(&path);
+     return fd;
+
+out_token:
+     bpf_token_free(token);
+out_file:
+     fput(file);
+out_path:
+     path_put(&path);
+     return err;
+}
+

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help