[PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
From: Alexei Starovoitov <ast@kernel.org>
Date: 2019-08-27 20:52:56
Also in:
bpf, netdev
Subsystem:
bpf [core], bpf [general] (safe dynamic programs and tools), bpf [networking] (tcx & tc bpf, sock_addr), bpf [security & lsm] (security audit and enforcement using bpf), bpf [selftests] (test runners & infrastructure), bpf [storage & cgroups], bpf [tracing], capabilities, kernel selftest framework, networking [general], selinux security module, the rest, tracing, xdp (express data path) · Maintainers:
Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman, Kumar Kartikeya Dwivedi, Martin KaFai Lau, KP Singh, Matt Bobrowski, Song Liu, Serge Hallyn, Shuah Khan, "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Paul Moore, Stephen Smalley, Linus Torvalds, Steven Rostedt, Masami Hiramatsu, David S. Miller, Jesper Dangaard Brouer, John Fastabend
Introduce CAP_BPF that allows loading all types of BPF programs, create most map types, load BTF, iterate programs and maps. CAP_BPF alone is not enough to attach or run programs. Networking: CAP_BPF and CAP_NET_ADMIN are necessary to: - attach to cgroup-bpf hooks like INET_INGRESS, INET_SOCK_CREATE, INET4_CONNECT - run networking bpf programs (like xdp, skb, flow_dissector) Tracing: CAP_BPF and perf_paranoid_tracepoint_raw() (which is kernel.perf_event_paranoid == -1) are necessary to: - attach bpf program to raw tracepoint - use bpf_trace_printk() in all program types (not only tracing programs) - create bpf stackmap To attach bpf to perf_events perf_event_open() needs to succeed as usual. CAP_BPF controls BPF side. CAP_NET_ADMIN controls intersection where BPF calls into networking. perf_paranoid_tracepoint_raw controls intersection where BPF calls into tracing. In the future CAP_TRACING could be introduced to control creation of kprobe/uprobe and attaching bpf to perf_events. In such case bpf_probe_read() thin wrapper would be controlled by CAP_BPF. Whereas probe_read() would be controlled by CAP_TRACING. CAP_TRACING would also control generic kprobe+probe_read. CAP_BPF and CAP_TRACING would be necessary for tracing bpf programs that want to use bpf_probe_read. Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- I would prefer to introduce CAP_TRACING soon, since it will make tracing and networking permission model symmetrical. include/linux/filter.h | 1 + include/uapi/linux/capability.h | 5 ++- kernel/bpf/arraymap.c | 2 +- kernel/bpf/cgroup.c | 2 +- kernel/bpf/core.c | 10 ++++-- kernel/bpf/cpumap.c | 2 +- kernel/bpf/hashtab.c | 4 +-- kernel/bpf/lpm_trie.c | 2 +- kernel/bpf/queue_stack_maps.c | 2 +- kernel/bpf/reuseport_array.c | 2 +- kernel/bpf/stackmap.c | 2 +- kernel/bpf/syscall.c | 32 ++++++++++------- kernel/bpf/verifier.c | 4 +-- kernel/trace/bpf_trace.c | 2 +- net/core/bpf_sk_storage.c | 2 +- net/core/filter.c | 10 +++--- security/selinux/include/classmap.h | 4 +-- tools/testing/selftests/bpf/test_verifier.c | 39 ++++++++++++++++----- 18 files changed, 84 insertions(+), 43 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 92c6e31fb008..16cea50af014 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h@@ -857,6 +857,7 @@ static inline bool bpf_dump_raw_ok(void) return kallsyms_show_value() == 1; } +bool cap_bpf_tracing(void); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 240fdb9a60f6..b3390f34c9f5 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h@@ -366,8 +366,11 @@ struct vfs_ns_cap_data { #define CAP_AUDIT_READ 37 +/* Allow bpf() syscall except attach and tracing */ -#define CAP_LAST_CAP CAP_AUDIT_READ +#define CAP_BPF 38 + +#define CAP_LAST_CAP CAP_BPF #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1c65ce0098a9..045e30b7160d 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c@@ -73,7 +73,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int ret, numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); + bool unpriv = !capable(CAP_BPF); u64 cost, array_size, mask64; struct bpf_map_memory mem; struct bpf_array *array;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6a6a154cfa7b..97f733354421 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c@@ -795,7 +795,7 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) + if (cap_bpf_tracing()) return bpf_get_trace_printk_proto(); /* fall through */ default:
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8191a7db2777..5756c8a56f44 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c@@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !capable(CAP_SYS_ADMIN)) + !capable(CAP_BPF)) return; spin_lock_bh(&bpf_lock);
@@ -768,7 +768,7 @@ static int bpf_jit_charge_modmem(u32 pages) { if (atomic_long_add_return(pages, &bpf_jit_current) > (bpf_jit_limit >> PAGE_SHIFT)) { - if (!capable(CAP_SYS_ADMIN)) { + if (!capable(CAP_BPF)) { atomic_long_sub(pages, &bpf_jit_current); return -EPERM; }
@@ -2104,6 +2104,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); +bool cap_bpf_tracing(void) +{ + return capable(CAP_SYS_ADMIN) || + (capable(CAP_BPF) && !perf_paranoid_tracepoint_raw()); +} + /* All definitions of tracepoints related to BPF. */ #define CREATE_TRACE_POINTS #include <linux/bpf_trace.h>
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ef49e17ae47c..ca483c9a9c2e 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c@@ -84,7 +84,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) int ret, cpu; u64 cost; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_BPF)) return ERR_PTR(-EPERM); /* check sanity of attributes */
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 22066a62c8c9..f459315625ac 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c@@ -244,9 +244,9 @@ static int htab_map_alloc_check(union bpf_attr *attr) BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); - if (lru && !capable(CAP_SYS_ADMIN)) + if (lru && !capable(CAP_BPF)) /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_SYS_ADMIN for now. + * maps. Hence, limit to CAP_BPF. */ return -EPERM;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 56e6c75d354d..a45fa5464d98 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c@@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) u64 cost = sizeof(*trie), cost_per_node; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_BPF)) return ERR_PTR(-EPERM); /* check sanity of attributes */
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index f697647ceb54..ca0ba9edca86 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c@@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_BPF)) return -EPERM; /* check sanity of attributes */
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 50c083ba978c..bfad7d41a061 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c@@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) struct bpf_map_memory mem; u64 array_size; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_BPF)) return ERR_PTR(-EPERM); array_size = sizeof(*array);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 052580c33d26..c540b2b3fc4a 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c@@ -90,7 +90,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) u64 cost, n_buckets; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!cap_bpf_tracing()) return ERR_PTR(-EPERM); if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c0f62fd67c6b..ef7b06ca30e5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c@@ -1176,7 +1176,7 @@ static int map_freeze(const union bpf_attr *attr) err = -EBUSY; goto err_put; } - if (!capable(CAP_SYS_ADMIN)) { + if (!capable(CAP_BPF)) { err = -EPERM; goto err_put; }
@@ -1634,7 +1634,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_BPF)) return -EPERM; /* copy eBPF program license from user space */
@@ -1647,11 +1647,11 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) is_gpl = license_is_gpl_compatible(license); if (attr->insn_cnt == 0 || - attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) + attr->insn_cnt > (capable(CAP_BPF) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_BPF)) return -EPERM; bpf_prog_load_fixup_attach_type(attr);
@@ -1802,6 +1802,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) char tp_name[128]; int tp_fd, err; + if (!cap_bpf_tracing()) + return -EPERM; + if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name), sizeof(tp_name) - 1) < 0) return -EFAULT;
@@ -2080,7 +2083,10 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_NET_ADMIN) || !capable(CAP_BPF)) + /* test_run callback is available for networking progs only. + * Add cap_bpf_tracing() above when tracing progs become runable. + */ return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL;
@@ -2117,7 +2123,7 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_BPF)) return -EPERM; next_id++;
@@ -2143,7 +2149,7 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_BPF)) return -EPERM; spin_lock_bh(&prog_idr_lock);
@@ -2177,7 +2183,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) attr->open_flags & ~BPF_OBJ_FLAG_MASK) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_BPF)) return -EPERM; f_flags = bpf_get_file_flag(attr->open_flags);
@@ -2352,7 +2358,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; - if (!capable(CAP_SYS_ADMIN)) { + if (!capable(CAP_BPF)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0;
@@ -2670,7 +2676,7 @@ static int bpf_btf_load(const union bpf_attr *attr) if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_BPF)) return -EPERM; return btf_new_fd(attr);
@@ -2683,7 +2689,7 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_BPF)) return -EPERM; return btf_get_fd_by_id(attr->btf_id);
@@ -2752,7 +2758,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (CHECK_ATTR(BPF_TASK_FD_QUERY)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!cap_bpf_tracing()) return -EPERM; if (attr->task_fd_query.flags != 0)
@@ -2820,7 +2826,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz union bpf_attr attr = {}; int err; - if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) + if (sysctl_unprivileged_bpf_disabled && !capable(CAP_BPF)) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 10c0ff93f52b..5810e8cc9342 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c@@ -987,7 +987,7 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) reg->umax_value = U64_MAX; /* constant backtracking is enabled for root only for now */ - reg->precise = capable(CAP_SYS_ADMIN) ? false : true; + reg->precise = capable(CAP_BPF) ? false : true; } /* Mark a register as having a completely unknown (scalar) value. */
@@ -9233,7 +9233,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; - is_priv = capable(CAP_SYS_ADMIN); + is_priv = capable(CAP_BPF); /* grab the mutex to protect few globals used by verifier */ if (!is_priv)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1255d14576..2bf58ff5bf75 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c@@ -1246,7 +1246,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) u32 *ids, prog_cnt, ids_len; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!cap_bpf_tracing()) return -EPERM; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL;
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index da5639a5bd3b..0b29f6abbeba 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c@@ -616,7 +616,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_BPF)) return -EPERM; if (attr->value_size >= KMALLOC_MAX_SIZE -
diff --git a/net/core/filter.c b/net/core/filter.c
index 0c1059cdad3d..986277abfde2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c@@ -5990,7 +5990,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_BPF)) return NULL; switch (func_id) {
@@ -5999,7 +5999,9 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); + if (cap_bpf_tracing()) + return bpf_get_trace_printk_proto(); + /* fall through */ default: return NULL; }
@@ -6563,7 +6565,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_BPF)) return false; break; }
@@ -6575,7 +6577,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_BPF)) return false; break; default:
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 201f7e588a29..1c925bc04072 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h@@ -26,9 +26,9 @@ "audit_control", "setfcap" #define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \ - "wake_alarm", "block_suspend", "audit_read" + "wake_alarm", "block_suspend", "audit_read", "bpf" -#if CAP_LAST_CAP > CAP_AUDIT_READ +#if CAP_LAST_CAP > CAP_BPF #error New capability defined, please update COMMON_CAP2_PERMS. #endif
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 44e2d640b088..b31b961f1020 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c@@ -805,10 +805,18 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, } } +struct libcap { + struct __user_cap_header_struct hdr; + struct __user_cap_data_struct data[2]; +}; + static int set_admin(bool admin) { cap_t caps; - const cap_value_t cap_val = CAP_SYS_ADMIN; + /* need CAP_BPF to load progs and CAP_NET_ADMIN to run networking progs */ + const cap_value_t cap_val[] = {38/*CAP_BPF*/, CAP_NET_ADMIN}; + const cap_value_t cap_val_admin = CAP_SYS_ADMIN; + struct libcap *cap; int ret = -1; caps = cap_get_proc();
@@ -816,11 +824,23 @@ static int set_admin(bool admin) perror("cap_get_proc"); return -1; } - if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val, + cap = (struct libcap *)caps; + if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val_admin, CAP_CLEAR)) { + perror("cap_set_flag clear admin"); + goto out; + } + if (cap_set_flag(caps, CAP_EFFECTIVE, 2, cap_val, admin ? CAP_SET : CAP_CLEAR)) { - perror("cap_set_flag"); + perror("cap_set_flag set_or_clear bpf+net"); goto out; } + /* libcap is likely old and simply ignores CAP_BPF, + * so update effective bits manually + */ + if (admin) + cap->data[1].effective |= 1 << (38 - 32); + else + cap->data[1].effective &= ~(1 << (38 - 32)); if (cap_set_proc(caps)) { perror("cap_set_proc"); goto out;
@@ -1013,8 +1033,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv, static bool is_admin(void) { cap_t caps; - cap_flag_value_t sysadmin = CAP_CLEAR; - const cap_value_t cap_val = CAP_SYS_ADMIN; + cap_flag_value_t bpf_priv = CAP_CLEAR; + cap_flag_value_t net_priv = CAP_CLEAR; + struct libcap *cap; #ifdef CAP_IS_SUPPORTED if (!CAP_IS_SUPPORTED(CAP_SETFCAP)) {
@@ -1027,11 +1048,13 @@ static bool is_admin(void) perror("cap_get_proc"); return false; } - if (cap_get_flag(caps, cap_val, CAP_EFFECTIVE, &sysadmin)) - perror("cap_get_flag"); + cap = (struct libcap *)caps; + bpf_priv = cap->data[1].effective & (1 << (38/* CAP_BPF */ - 32)); + if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &net_priv)) + perror("cap_get_flag NET"); if (cap_free(caps)) perror("cap_free"); - return (sysadmin == CAP_SET); + return bpf_priv == CAP_SET && net_priv == CAP_SET; } static void get_unpriv_disabled()
--
2.20.0