Thread (7 messages) 7 messages, 2 authors, 2021-11-04
STALE1704d
Revisions (5)
  1. v1 current
  2. v2 [diff vs current]
  3. v3 [diff vs current]
  4. v4 [diff vs current]
  5. v5 [diff vs current]

[PATCH bpf-next 1/2] bpf: introduce helper bpf_find_vma

From: Song Liu <hidden>
Date: 2021-10-27 22:00:56
Also in: bpf
Subsystem: bpf [core], bpf [general] (safe dynamic programs and tools), bpf [iterator], bpf [security & lsm] (security audit and enforcement using bpf), bpf [tracing], the rest, tracing · Maintainers: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman, Kumar Kartikeya Dwivedi, Yonghong Song, KP Singh, Matt Bobrowski, Song Liu, Linus Torvalds, Steven Rostedt, Masami Hiramatsu

In some profiler use cases, it is necessary to map an address to the
backing file, e.g., a shared library. bpf_find_vma helper provides a
flexible way to achieve this. bpf_find_vma maps an address of a task to
the vma (vm_area_struct) for this address, and feed the vma to an callback
BPF function. The callback function is necessary here, as we need to
ensure mmap_sem is unlocked.

It is necessary to lock mmap_sem for find_vma. To lock and unlock mmap_sem
safely when irqs are disable, we use the same mechanism as stackmap with
build_id. Specifically, when irqs are disabled, the unlocked is postponed
in an irq_work.

Signed-off-by: Song Liu <redacted>
---
 include/linux/bpf.h            |   1 +
 include/uapi/linux/bpf.h       |  20 +++++++
 kernel/bpf/task_iter.c         | 102 ++++++++++++++++++++++++++++++++-
 kernel/bpf/verifier.c          |  36 ++++++++++++
 kernel/trace/bpf_trace.c       |   2 +
 tools/include/uapi/linux/bpf.h |  19 ++++++
 6 files changed, 179 insertions(+), 1 deletion(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 31421c74ba081..65def8467b609 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2109,6 +2109,7 @@ extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
 extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
 extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
 extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
+extern const struct bpf_func_proto bpf_find_vma_proto;
 
 const struct bpf_func_proto *tracing_prog_func_proto(
   enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c108200378834..e1401ae4decc9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4915,6 +4915,25 @@ union bpf_attr {
  *		Dynamically cast a *sk* pointer to a *unix_sock* pointer.
  *	Return
  *		*sk* if casting is valid, or **NULL** otherwise.
+ *
+ * long bpf_find_vma(struct task_struct *task, u64 addr, void *callback_fn, void *callback_ctx, u64 flags)
+ *	Description
+ *		Find vma of *task* that contains *addr*, call *callback_fn*
+ *		function with *task*, *vma*, and *callback_ctx*.
+ *		The *callback_fn* should be a static function and
+ *		the *callback_ctx* should be a pointer to the stack.
+ *		The *flags* is used to control certain aspects of the helper.
+ *		Currently, the *flags* must be 0.
+ *
+ *		The expected callback signature is
+ *
+ *		long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*ctx);
+ *
+ *	Return
+ *		0 on success.
+ *		**-ENOENT** if *task->mm* is NULL, or no vma contains *addr*.
+ *		**-EBUSY** if failed to try lock mmap_lock.
+ *		**-EINVAL** for invalid **flags**.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5096,6 +5115,7 @@ union bpf_attr {
 	FN(get_branch_snapshot),	\
 	FN(trace_vprintk),		\
 	FN(skc_to_unix_sock),		\
+	FN(find_vma),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index b48750bfba5aa..ad30f2e885356 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -8,6 +8,7 @@
 #include <linux/fdtable.h>
 #include <linux/filter.h>
 #include <linux/btf_ids.h>
+#include <linux/irq_work.h>
 
 struct bpf_iter_seq_task_common {
 	struct pid_namespace *ns;
@@ -21,6 +22,25 @@ struct bpf_iter_seq_task_info {
 	u32 tid;
 };
 
+/* irq_work to run mmap_read_unlock() */
+struct task_iter_irq_work {
+	struct irq_work irq_work;
+	struct mm_struct *mm;
+};
+
+static DEFINE_PER_CPU(struct task_iter_irq_work, mmap_unlock_work);
+
+static void do_mmap_read_unlock(struct irq_work *entry)
+{
+	struct task_iter_irq_work *work;
+
+	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
+		return;
+
+	work = container_of(entry, struct task_iter_irq_work, irq_work);
+	mmap_read_unlock_non_owner(work->mm);
+}
+
 static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
 					     u32 *tid,
 					     bool skip_if_dup_files)
@@ -586,9 +606,89 @@ static struct bpf_iter_reg task_vma_reg_info = {
 	.seq_info		= &task_vma_seq_info,
 };
 
+BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
+	   bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
+{
+	struct task_iter_irq_work *work = NULL;
+	struct mm_struct *mm = task->mm;
+	struct vm_area_struct *vma;
+	bool irq_work_busy = false;
+	int ret = -ENOENT;
+
+	if (flags)
+		return -EINVAL;
+
+	if (!mm)
+		return -ENOENT;
+
+	/*
+	 * Similar to stackmap with build_id support, we cannot simply do
+	 * mmap_read_unlock when the irq is disabled. Instead, we need do
+	 * the unlock in the irq_work.
+	 */
+	if (irqs_disabled()) {
+		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+			work = this_cpu_ptr(&mmap_unlock_work);
+			if (irq_work_is_busy(&work->irq_work)) {
+				/* cannot queue more mmap_unlock, abort. */
+				irq_work_busy = true;
+			}
+		} else {
+			/*
+			 * PREEMPT_RT does not allow to trylock mmap sem in
+			 * interrupt disabled context, abort.
+			 */
+			irq_work_busy = true;
+		}
+	}
+
+	if (irq_work_busy || !mmap_read_trylock(mm))
+		return -EBUSY;
+
+	vma = find_vma(mm, start);
+
+	if (vma && vma->vm_start <= start && vma->vm_end > start) {
+		callback_fn((u64)(long)task, (u64)(long)vma,
+			    (u64)(long)callback_ctx, 0, 0);
+		ret = 0;
+	}
+	if (!work) {
+		mmap_read_unlock(current->mm);
+	} else {
+		work->mm = current->mm;
+
+		/* The lock will be released once we're out of interrupt
+		 * context. Tell lockdep that we've released it now so
+		 * it doesn't complain that we forgot to release it.
+		 */
+		rwsem_release(&current->mm->mmap_lock.dep_map, _RET_IP_);
+		irq_work_queue(&work->irq_work);
+	}
+	return ret;
+}
+
+BTF_ID_LIST_SINGLE(btf_find_vma_ids, struct, task_struct)
+
+const struct bpf_func_proto bpf_find_vma_proto = {
+	.func		= bpf_find_vma,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg1_btf_id	= &btf_find_vma_ids[0],
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_FUNC,
+	.arg4_type	= ARG_PTR_TO_STACK_OR_NULL,
+	.arg5_type	= ARG_ANYTHING,
+};
+
 static int __init task_iter_init(void)
 {
-	int ret;
+	struct task_iter_irq_work *work;
+	int ret, cpu;
+
+	for_each_possible_cpu(cpu) {
+		work = per_cpu_ptr(&mmap_unlock_work, cpu);
+		init_irq_work(&work->irq_work, do_mmap_read_unlock);
+	}
 
 	task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
 	ret = bpf_iter_reg_target(&task_reg_info);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c6616e3258038..393ab21529b08 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6119,6 +6119,35 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
 	return 0;
 }
 
+BTF_ID_LIST_SINGLE(btf_set_find_vma_ids, struct, vm_area_struct)
+
+static int set_find_vma_callback_state(struct bpf_verifier_env *env,
+				       struct bpf_func_state *caller,
+				       struct bpf_func_state *callee,
+				       int insn_idx)
+{
+	/* bpf_find_vma(struct task_struct *task, u64 start,
+	 *               void *callback_fn, void *callback_ctx, u64 flags)
+	 * (callback_fn)(struct task_struct *task,
+	 *               struct vm_area_struct *vma, void *ctx);
+	 */
+	callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+	callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+	callee->regs[BPF_REG_2].btf =  btf_vmlinux;
+	callee->regs[BPF_REG_2].btf_id = btf_set_find_vma_ids[0];
+
+	/* pointer to stack or null */
+	callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
+
+	/* unused */
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+	callee->in_callback_fn = true;
+	return 0;
+}
+
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state;
@@ -6476,6 +6505,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return -EINVAL;
 	}
 
+	if (func_id == BPF_FUNC_find_vma) {
+		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+					set_find_vma_callback_state);
+		if (err < 0)
+			return -EINVAL;
+	}
+
 	if (func_id == BPF_FUNC_snprintf) {
 		err = check_bpf_snprintf_call(env, regs);
 		if (err < 0)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index cbcd0d6fca7c7..c95397c55d0e8 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1208,6 +1208,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_func_ip_proto_tracing;
 	case BPF_FUNC_get_branch_snapshot:
 		return &bpf_get_branch_snapshot_proto;
+	case BPF_FUNC_find_vma:
+		return &bpf_find_vma_proto;
 	case BPF_FUNC_trace_vprintk:
 		return bpf_get_trace_vprintk_proto();
 	default:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c108200378834..056c00da1b5d6 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4915,6 +4915,24 @@ union bpf_attr {
  *		Dynamically cast a *sk* pointer to a *unix_sock* pointer.
  *	Return
  *		*sk* if casting is valid, or **NULL** otherwise.
+ * long bpf_find_vma(struct task_struct *task, u64 addr, void *callback_fn, void *callback_ctx, u64 flags)
+ *	Description
+ *		Find vma of *task* that contains *addr*, call *callback_fn*
+ *		function with *task*, *vma*, and *callback_ctx*.
+ *		The *callback_fn* should be a static function and
+ *		the *callback_ctx* should be a pointer to the stack.
+ *		The *flags* is used to control certain aspects of the helper.
+ *		Currently, the *flags* must be 0.
+ *
+ *		The expected callback signature is
+ *
+ *		long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*ctx);
+ *
+ *	Return
+ *		0 on success.
+ *		**-ENOENT** if *task->mm* is NULL, or no vma contains *addr*.
+ *		**-EBUSY** if failed to try lock mmap_lock.
+ *		**-EINVAL** for invalid **flags**.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5096,6 +5114,7 @@ union bpf_attr {
 	FN(get_branch_snapshot),	\
 	FN(trace_vprintk),		\
 	FN(skc_to_unix_sock),		\
+	FN(find_vma),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
2.30.2
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help