Re: [PATCH bpf] tools/runqslower: fix handle__sched_switch for updated tp sched_switch
From: Song Liu <hidden>
Date: 2022-03-30 00:39:33
Also in:
bpf
On Mar 29, 2022, at 5:00 PM, Andrii Nakryiko [off-list ref] wrote: On Tue, Mar 29, 2022 at 4:19 PM Song Liu [off-list ref] wrote:quoted
TP_PROTO of sched_switch is updated with a new arg prev_state, which causes runqslower load failure: libbpf: prog 'handle__sched_switch': BPF program load failed: Permission denied libbpf: prog 'handle__sched_switch': -- BEGIN PROG LOAD LOG -- R1 type=ctx expected=fp 0: R1=ctx(off=0,imm=0) R10=fp0 ; int handle__sched_switch(u64 *ctx) 0: (bf) r7 = r1 ; R1=ctx(off=0,imm=0) R7_w=ctx(off=0,imm=0) ; struct task_struct *next = (struct task_struct *)ctx[2]; 1: (79) r6 = *(u64 *)(r7 +16) func 'sched_switch' arg2 has btf_id 186 type STRUCT 'task_struct' 2: R6_w=ptr_task_struct(off=0,imm=0) R7_w=ctx(off=0,imm=0) ; struct task_struct *prev = (struct task_struct *)ctx[1]; 2: (79) r2 = *(u64 *)(r7 +8) ; R2_w=scalar() R7_w=ctx(off=0,imm=0) 3: (b7) r1 = 0 ; R1_w=0 ; struct runq_event event = {}; 4: (7b) *(u64 *)(r10 -8) = r1 ; R1_w=P0 R10=fp0 fp-8_w=00000000 5: (7b) *(u64 *)(r10 -16) = r1 ; R1_w=P0 R10=fp0 fp-16_w=00000000 6: (7b) *(u64 *)(r10 -24) = r1 ; R1_w=P0 R10=fp0 fp-24_w=00000000 7: (7b) *(u64 *)(r10 -32) = r1 ; R1_w=P0 R10=fp0 fp-32_w=00000000 ; if (prev->__state == TASK_RUNNING) 8: (61) r1 = *(u32 *)(r2 +24) R2 invalid mem access 'scalar' processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 -- END PROG LOAD LOG -- libbpf: failed to load program 'handle__sched_switch' libbpf: failed to load object 'runqslower_bpf' libbpf: failed to load BPF skeleton 'runqslower_bpf': -13 failed to load BPF object: -13 Update runqslower to fix this issue. Also, as we are on this, use BPF_PROG in runqslower for cleaner code. Fixes: fa2c3254d7cf ("sched/tracing: Don't re-read p->state when emitting sched_switch event") Signed-off-by: Song Liu <song@kernel.org> --- tools/bpf/runqslower/runqslower.bpf.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-)It would be much less disruptive if that prev_state was added after "next", but oh well...
Maybe we should change that. +Valentin and Steven, how about we change the order with the attached diff (not the original patch in this thread, but the one at the end of this email)?
But anyways, let's handle this in a way that can handle both old kernels and new ones and do the same change in libbpf-tool's runqslower ([0]). Can you please follow up there as well?
Yeah, I will also fix that one.
We can use BPF CO-RE to detect which order of arguments running kernel has by checking prev_state field existence in struct trace_event_raw_sched_switch. Can you please try that? Use bpf_core_field_exists() for that.
Do you mean something like
if (bpf_core_field_exists(ctx->prev_state))
/* use ctx[2] and ctx[3] */
else
/* use ctx[1] and ctx[2] */
? I think we will need BTF for the arguments, which doesn't exist yet.
Did I miss something?
I was thinking about adding something like struct tp_sched_switch_args
for all the raw tracepoints, but haven't got time to look into how.
Thanks,
Song
[0] https://github.com/iovisor/bcc/blob/master/libbpf-tools/runqslower.bpf.cquoted
diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c index 9a5c1f008fe6..30e491d8308f 100644 --- a/tools/bpf/runqslower/runqslower.bpf.c +++ b/tools/bpf/runqslower/runqslower.bpf.c@@ -2,6 +2,7 @@// Copyright (c) 2019 Facebook #include "vmlinux.h" #include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> #include "runqslower.h" #define TASK_RUNNING 0@@ -43,31 +44,21 @@ static int trace_enqueue(struct task_struct *t)} SEC("tp_btf/sched_wakeup") -int handle__sched_wakeup(u64 *ctx) +int BPF_PROG(handle__sched_wakeup, struct task_struct *p) { - /* TP_PROTO(struct task_struct *p) */ - struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p); } SEC("tp_btf/sched_wakeup_new") -int handle__sched_wakeup_new(u64 *ctx) +int BPF_PROG(handle__sched_wakeup_new, struct task_struct *p) { - /* TP_PROTO(struct task_struct *p) */ - struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p); } SEC("tp_btf/sched_switch") -int handle__sched_switch(u64 *ctx) +int BPF_PROG(handle__sched_switch, bool preempt, unsigned long prev_state, + struct task_struct *prev, struct task_struct *next) { - /* TP_PROTO(bool preempt, struct task_struct *prev, - * struct task_struct *next) - */ - struct task_struct *prev = (struct task_struct *)ctx[1]; - struct task_struct *next = (struct task_struct *)ctx[2]; struct runq_event event = {}; u64 *tsp, delta_us; long state; -- 2.30.2
diff --git i/include/trace/events/sched.h w/include/trace/events/sched.h
index 65e786756321..fbb99a61f714 100644
--- i/include/trace/events/sched.h
+++ w/include/trace/events/sched.h@@ -222,11 +222,11 @@ static inline long __trace_sched_switch_state(bool preempt, TRACE_EVENT(sched_switch, TP_PROTO(bool preempt, - unsigned int prev_state, struct task_struct *prev, - struct task_struct *next), + struct task_struct *next, + unsigned int prev_state), - TP_ARGS(preempt, prev_state, prev, next), + TP_ARGS(preempt, prev, next, prev_state), TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN )
diff --git i/kernel/sched/core.c w/kernel/sched/core.c
index d575b4914925..25784f3efd81 100644
--- i/kernel/sched/core.c
+++ w/kernel/sched/core.c@@ -6376,7 +6376,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next); + trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf);
diff --git i/kernel/trace/fgraph.c w/kernel/trace/fgraph.c
index 19028e072cdb..d7a81d277f66 100644
--- i/kernel/trace/fgraph.c
+++ w/kernel/trace/fgraph.c@@ -415,9 +415,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) static void ftrace_graph_probe_sched_switch(void *ignore, bool preempt, - unsigned int prev_state, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, + unsigned int prev_state) + { unsigned long long timestamp; int index;
diff --git i/kernel/trace/ftrace.c w/kernel/trace/ftrace.c
index 4f1d2f5e7263..957354488242 100644
--- i/kernel/trace/ftrace.c
+++ w/kernel/trace/ftrace.c@@ -7420,9 +7420,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) static void ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, - unsigned int prev_state, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, + unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *pid_list;
diff --git i/kernel/trace/trace_events.c w/kernel/trace/trace_events.c
index e11e167b7809..e7b6a2155e96 100644
--- i/kernel/trace/trace_events.c
+++ w/kernel/trace/trace_events.c@@ -773,9 +773,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable) static void event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, - unsigned int prev_state, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, + unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list;
@@ -799,9 +799,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, static void event_filter_pid_sched_switch_probe_post(void *data, bool preempt, - unsigned int prev_state, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, + unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list;
diff --git i/kernel/trace/trace_sched_switch.c w/kernel/trace/trace_sched_switch.c
index 45796d8bd4b2..c9ffdcfe622e 100644
--- i/kernel/trace/trace_sched_switch.c
+++ w/kernel/trace/trace_sched_switch.c@@ -22,8 +22,8 @@ static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, - unsigned int prev_state, - struct task_struct *prev, struct task_struct *next) + struct task_struct *prev, struct task_struct *next, + unsigned int prev_state) { int flags;
diff --git i/kernel/trace/trace_sched_wakeup.c w/kernel/trace/trace_sched_wakeup.c
index 46429f9a96fa..330aee1c1a49 100644
--- i/kernel/trace/trace_sched_wakeup.c
+++ w/kernel/trace/trace_sched_wakeup.c@@ -426,8 +426,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, static void notrace probe_wakeup_sched_switch(void *ignore, bool preempt, - unsigned int prev_state, - struct task_struct *prev, struct task_struct *next) + struct task_struct *prev, struct task_struct *next, + unsigned int prev_state) { struct trace_array_cpu *data; u64 T0, T1, delta;