[RFC PATCH v2 1/3] sched, tracing: add to report task state in symbolic chars
From: Ze Gao <hidden>
Date: 2023-07-26 12:16:38
Also in:
linux-perf-users, lkml
Subsystem:
the rest, tracing · Maintainers:
Linus Torvalds, Steven Rostedt, Masami Hiramatsu
Internal representations of task state are likely to be changed or ordered, and reporting them to userspace without exporting them as part of API is not a good choice, which can easily break a userspace observability tool as kernel evolves. For example, perf suffers from this and still reports wrong states by this patch. OTOH, some masqueraded state like TASK_REPORT_IDLE and TASK_REPORT_MAX are also reported inadvertently, which confuses things even more. So add a new variable in company with the old raw value to report task state in symbolic char, which is self-explaining and no further translation is needed, and also report priorities in 'short' to save some buffer space. Of course this does not break any userspace tool. Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use the old conventions for the rest. Signed-off-by: Ze Gao <redacted> --- include/trace/events/sched.h | 60 +++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 25 deletions(-)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714..0fcf68f49e45 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h@@ -6,6 +6,7 @@ #define _TRACE_SCHED_H #include <linux/kthread.h> +#include <linux/sched.h> #include <linux/sched/numa_balancing.h> #include <linux/tracepoint.h> #include <linux/binfmts.h>
@@ -214,6 +215,27 @@ static inline long __trace_sched_switch_state(bool preempt, return state ? (1 << (state - 1)) : state; } + +static inline char __trace_sched_switch_state_char(bool preempt, + unsigned int prev_state, + struct task_struct *p) +{ + long state; + +#ifdef CONFIG_SCHED_DEBUG + BUG_ON(p != current); +#endif /* CONFIG_SCHED_DEBUG */ + + /* + * For PREEMPT_ACTIVE, we introduce 'p' to report it and use the old + * conventions for the rest. + */ + if (preempt) + return 'p'; + + state = __task_state_index(prev_state, p->exit_state); + return task_index_to_char(state); +} #endif /* CREATE_TRACE_POINTS */ /*
@@ -231,41 +253,29 @@ TRACE_EVENT(sched_switch, TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) __field( pid_t, prev_pid ) - __field( int, prev_prio ) - __field( long, prev_state ) + __field( short, prev_prio ) + __field( int, prev_state ) + __field( char, prev_state_char ) __array( char, next_comm, TASK_COMM_LEN ) __field( pid_t, next_pid ) - __field( int, next_prio ) + __field( short, next_prio ) ), TP_fast_assign( memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); - __entry->prev_pid = prev->pid; - __entry->prev_prio = prev->prio; - __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev); + __entry->prev_pid = prev->pid; + __entry->prev_prio = (short) prev->prio; + __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev); + __entry->prev_state_char = __trace_sched_switch_state_char(preempt, prev_state, prev); memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); - __entry->next_pid = next->pid; - __entry->next_prio = next->prio; + __entry->next_pid = next->pid; + __entry->next_prio = (short) next->prio; /* XXX SCHED_DEADLINE */ ), - TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - - (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? - __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", - { TASK_INTERRUPTIBLE, "S" }, - { TASK_UNINTERRUPTIBLE, "D" }, - { __TASK_STOPPED, "T" }, - { __TASK_TRACED, "t" }, - { EXIT_DEAD, "X" }, - { EXIT_ZOMBIE, "Z" }, - { TASK_PARKED, "P" }, - { TASK_DEAD, "I" }) : - "R", - - __entry->prev_state & TASK_REPORT_MAX ? "+" : "", - __entry->next_comm, __entry->next_pid, __entry->next_prio) + TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state_char, __entry->next_comm, + __entry->next_pid, __entry->next_prio) ); /*
--
2.40.1