Thread (7 messages) 7 messages, 3 authors, 2022-03-31

Re: [PATCH bpf] tools/runqslower: fix handle__sched_switch for updated tp sched_switch

From: Song Liu <hidden>
Date: 2022-03-30 00:39:33
Also in: bpf

On Mar 29, 2022, at 5:00 PM, Andrii Nakryiko [off-list ref] wrote:

On Tue, Mar 29, 2022 at 4:19 PM Song Liu [off-list ref] wrote:
quoted
TP_PROTO of sched_switch is updated with a new arg prev_state, which
causes runqslower load failure:

libbpf: prog 'handle__sched_switch': BPF program load failed: Permission denied
libbpf: prog 'handle__sched_switch': -- BEGIN PROG LOAD LOG --
R1 type=ctx expected=fp
0: R1=ctx(off=0,imm=0) R10=fp0
; int handle__sched_switch(u64 *ctx)
0: (bf) r7 = r1                       ; R1=ctx(off=0,imm=0) R7_w=ctx(off=0,imm=0)
; struct task_struct *next = (struct task_struct *)ctx[2];
1: (79) r6 = *(u64 *)(r7 +16)
func 'sched_switch' arg2 has btf_id 186 type STRUCT 'task_struct'
2: R6_w=ptr_task_struct(off=0,imm=0) R7_w=ctx(off=0,imm=0)
; struct task_struct *prev = (struct task_struct *)ctx[1];
2: (79) r2 = *(u64 *)(r7 +8)          ; R2_w=scalar() R7_w=ctx(off=0,imm=0)
3: (b7) r1 = 0                        ; R1_w=0
; struct runq_event event = {};
4: (7b) *(u64 *)(r10 -8) = r1         ; R1_w=P0 R10=fp0 fp-8_w=00000000
5: (7b) *(u64 *)(r10 -16) = r1        ; R1_w=P0 R10=fp0 fp-16_w=00000000
6: (7b) *(u64 *)(r10 -24) = r1        ; R1_w=P0 R10=fp0 fp-24_w=00000000
7: (7b) *(u64 *)(r10 -32) = r1        ; R1_w=P0 R10=fp0 fp-32_w=00000000
; if (prev->__state == TASK_RUNNING)
8: (61) r1 = *(u32 *)(r2 +24)
R2 invalid mem access 'scalar'
processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
-- END PROG LOAD LOG --
libbpf: failed to load program 'handle__sched_switch'
libbpf: failed to load object 'runqslower_bpf'
libbpf: failed to load BPF skeleton 'runqslower_bpf': -13
failed to load BPF object: -13

Update runqslower to fix this issue. Also, as we are on this, use BPF_PROG
in runqslower for cleaner code.

Fixes: fa2c3254d7cf ("sched/tracing: Don't re-read p->state when emitting sched_switch event")
Signed-off-by: Song Liu <song@kernel.org>
---
tools/bpf/runqslower/runqslower.bpf.c | 19 +++++--------------
1 file changed, 5 insertions(+), 14 deletions(-)
It would be much less disruptive if that prev_state was added after
"next", but oh well...
Maybe we should change that. 

+Valentin and Steven, how about we change the order with the attached 
diff (not the original patch in this thread, but the one at the end of 
this email)?
But anyways, let's handle this in a way that can handle both old
kernels and new ones and do the same change in libbpf-tool's
runqslower ([0]). Can you please follow up there as well?
Yeah, I will also fix that one. 

We can use BPF CO-RE to detect which order of arguments running kernel
has by checking prev_state field existence in struct
trace_event_raw_sched_switch. Can you please try that? Use
bpf_core_field_exists() for that.
Do you mean something like

if (bpf_core_field_exists(ctx->prev_state))    
    /* use ctx[2] and ctx[3] */
else
    /* use ctx[1] and ctx[2] */

? I think we will need BTF for the arguments, which doesn't exist yet.
Did I miss something? 

I was thinking about adding something like struct tp_sched_switch_args 
for all the raw tracepoints, but haven't got time to look into how. 

Thanks,
Song

 [0] https://github.com/iovisor/bcc/blob/master/libbpf-tools/runqslower.bpf.c

quoted
diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c
index 9a5c1f008fe6..30e491d8308f 100644
--- a/tools/bpf/runqslower/runqslower.bpf.c
+++ b/tools/bpf/runqslower/runqslower.bpf.c
@@ -2,6 +2,7 @@
// Copyright (c) 2019 Facebook
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
#include "runqslower.h"

#define TASK_RUNNING 0
@@ -43,31 +44,21 @@ static int trace_enqueue(struct task_struct *t)
}

SEC("tp_btf/sched_wakeup")
-int handle__sched_wakeup(u64 *ctx)
+int BPF_PROG(handle__sched_wakeup, struct task_struct *p)
{
-       /* TP_PROTO(struct task_struct *p) */
-       struct task_struct *p = (void *)ctx[0];
-
       return trace_enqueue(p);
}

SEC("tp_btf/sched_wakeup_new")
-int handle__sched_wakeup_new(u64 *ctx)
+int BPF_PROG(handle__sched_wakeup_new, struct task_struct *p)
{
-       /* TP_PROTO(struct task_struct *p) */
-       struct task_struct *p = (void *)ctx[0];
-
       return trace_enqueue(p);
}

SEC("tp_btf/sched_switch")
-int handle__sched_switch(u64 *ctx)
+int BPF_PROG(handle__sched_switch, bool preempt, unsigned long prev_state,
+            struct task_struct *prev, struct task_struct *next)
{
-       /* TP_PROTO(bool preempt, struct task_struct *prev,
-        *          struct task_struct *next)
-        */
-       struct task_struct *prev = (struct task_struct *)ctx[1];
-       struct task_struct *next = (struct task_struct *)ctx[2];
       struct runq_event event = {};
       u64 *tsp, delta_us;
       long state;
--
2.30.2

diff --git i/include/trace/events/sched.h w/include/trace/events/sched.h
index 65e786756321..fbb99a61f714 100644
--- i/include/trace/events/sched.h
+++ w/include/trace/events/sched.h
@@ -222,11 +222,11 @@ static inline long __trace_sched_switch_state(bool preempt,
 TRACE_EVENT(sched_switch,
 
 	TP_PROTO(bool preempt,
-		 unsigned int prev_state,
 		 struct task_struct *prev,
-		 struct task_struct *next),
+		 struct task_struct *next,
+		 unsigned int prev_state),
 
-	TP_ARGS(preempt, prev_state, prev, next),
+	TP_ARGS(preempt, prev, next, prev_state),
 
 	TP_STRUCT__entry(
 		__array(	char,	prev_comm,	TASK_COMM_LEN	)
diff --git i/kernel/sched/core.c w/kernel/sched/core.c
index d575b4914925..25784f3efd81 100644
--- i/kernel/sched/core.c
+++ w/kernel/sched/core.c
@@ -6376,7 +6376,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 		migrate_disable_switch(rq, prev);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
-		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);
+		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
 
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
diff --git i/kernel/trace/fgraph.c w/kernel/trace/fgraph.c
index 19028e072cdb..d7a81d277f66 100644
--- i/kernel/trace/fgraph.c
+++ w/kernel/trace/fgraph.c
@@ -415,9 +415,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 
 static void
 ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
-				unsigned int prev_state,
 				struct task_struct *prev,
-				struct task_struct *next)
+				struct task_struct *next,
+				unsigned int prev_state)
+
 {
 	unsigned long long timestamp;
 	int index;
diff --git i/kernel/trace/ftrace.c w/kernel/trace/ftrace.c
index 4f1d2f5e7263..957354488242 100644
--- i/kernel/trace/ftrace.c
+++ w/kernel/trace/ftrace.c
@@ -7420,9 +7420,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
 
 static void
 ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
-				     unsigned int prev_state,
 				     struct task_struct *prev,
-				     struct task_struct *next)
+				     struct task_struct *next,
+				     unsigned int prev_state)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *pid_list;
diff --git i/kernel/trace/trace_events.c w/kernel/trace/trace_events.c
index e11e167b7809..e7b6a2155e96 100644
--- i/kernel/trace/trace_events.c
+++ w/kernel/trace/trace_events.c
@@ -773,9 +773,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
 
 static void
 event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
-					unsigned int prev_state,
 					 struct task_struct *prev,
-					struct task_struct *next)
+					 struct task_struct *next,
+					 unsigned int prev_state)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *no_pid_list;
@@ -799,9 +799,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 
 static void
 event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
-					 unsigned int prev_state,
 					 struct task_struct *prev,
-					 struct task_struct *next)
+					 struct task_struct *next,
+					 unsigned int prev_state)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *no_pid_list;
diff --git i/kernel/trace/trace_sched_switch.c w/kernel/trace/trace_sched_switch.c
index 45796d8bd4b2..c9ffdcfe622e 100644
--- i/kernel/trace/trace_sched_switch.c
+++ w/kernel/trace/trace_sched_switch.c
@@ -22,8 +22,8 @@ static DEFINE_MUTEX(sched_register_mutex);
 
 static void
 probe_sched_switch(void *ignore, bool preempt,
-		   unsigned int prev_state,
-		   struct task_struct *prev, struct task_struct *next)
+		   struct task_struct *prev, struct task_struct *next,
+		   unsigned int prev_state)
 {
 	int flags;
 
diff --git i/kernel/trace/trace_sched_wakeup.c w/kernel/trace/trace_sched_wakeup.c
index 46429f9a96fa..330aee1c1a49 100644
--- i/kernel/trace/trace_sched_wakeup.c
+++ w/kernel/trace/trace_sched_wakeup.c
@@ -426,8 +426,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 
 static void notrace
 probe_wakeup_sched_switch(void *ignore, bool preempt,
-			  unsigned int prev_state,
-			  struct task_struct *prev, struct task_struct *next)
+			  struct task_struct *prev, struct task_struct *next,
+			  unsigned int prev_state)
 {
 	struct trace_array_cpu *data;
 	u64 T0, T1, delta;

Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help