Thread (17 messages) 17 messages, 3 authors, 4d ago
COOLING4d
Revisions (3)
  1. rfc [diff vs current]
  2. v2 [diff vs current]
  3. v3 current

[RFC PATCH v3 2/3] trace: integrate stackmap into ftrace stack recording path

From: Li Pengfei <hidden>
Date: 2026-05-26 11:53:52
Also in: lkml
Subsystem: the rest, tracing · Maintainers: Linus Torvalds, Steven Rostedt, Masami Hiramatsu

From: Pengfei Li <redacted>

Add TRACE_STACK_ID event type and integrate ftrace_stackmap into
__ftrace_trace_stack(). When the 'stackmap' trace option is enabled,
the stack recording path stores a 4-byte stack_id in the ring buffer
instead of the full stack trace.

Changes:
- New TRACE_STACK_ID in trace_type enum
- New stack_id_entry in trace_entries.h
- New TRACE_ITER(STACKMAP) trace option flag; when CONFIG_FTRACE_STACKMAP
  is disabled, TRACE_ITER_STACKMAP_BIT is defined as -1 so that
  TRACE_ITER(STACKMAP) evaluates to 0 (following the existing pattern
  used by TRACE_ITER_PROF_TEXT_OFFSET)
- 'stackmap' is added to TOP_LEVEL_TRACE_FLAGS and ZEROED_TRACE_FLAGS
  so it is only exposed under the top-level trace instance, matching
  the convention already used for global-only options such as 'printk'
  and 'record-cmd'. Secondary instances under tracing/instances/*/
  do not see the option at all, avoiding a confusing no-op.
- Modified __ftrace_trace_stack() to call ftrace_stackmap_get_id()
  when the stackmap option is active. If reserving a TRACE_STACK_ID
  ring-buffer slot fails after a successful get_id(), the path falls
  through to the full-stack recording so the event still gets a stack
  trace recorded.
- Stackmap pointer is published with smp_store_release() and read with
  smp_load_acquire(), so a tracer that observes a non-NULL pointer also
  observes a fully initialized stackmap
- NULL check on tr->stackmap is retained as defense-in-depth: events
  that fire before fs_initcall (when the map is created) or after a
  failed ftrace_stackmap_create() observe a NULL pointer and fall back
  to full stack recording without dereferencing it
- ftrace_stackmap_create() takes the owning trace_array so the
  stackmap can later check tracing state during reset
- Added stack_id print handler in trace_output.c
- Added TRACE_STACK_ID to trace_valid_entry() in trace_selftest.c
  so ftrace startup selftests don't reject the new entry type when
  the stackmap option is enabled

Fallback behavior: if the stackmap pointer is NULL, if
ftrace_stackmap_get_id() fails (e.g. pool exhausted or map being reset),
or if the TRACE_STACK_ID ring-buffer reservation fails, the full stack
trace is recorded as before -- no new failure modes are introduced.

Per-instance stackmap support is left as a follow-up; gating the
option via TOP_LEVEL_TRACE_FLAGS makes the global-only scope
explicit at the tracefs interface rather than relying on a silent
runtime fallback.

Usage:
  echo 1 > /sys/kernel/debug/tracing/options/stackmap
  echo 1 > /sys/kernel/debug/tracing/options/stacktrace

Signed-off-by: Pengfei Li <redacted>
---
 kernel/trace/trace.c          | 78 ++++++++++++++++++++++++++++++++++-
 kernel/trace/trace.h          | 16 +++++++
 kernel/trace/trace_entries.h  | 15 +++++++
 kernel/trace/trace_output.c   | 23 +++++++++++
 kernel/trace/trace_selftest.c |  1 +
 5 files changed, 131 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6eb4d3097a4d..36120355e549 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -57,6 +57,7 @@
 
 #include "trace.h"
 #include "trace_output.h"
+#include "trace_stackmap.h"
 
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 /*
@@ -509,12 +510,13 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
 /* trace_options that are only supported by global_trace */
 #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER(PRINTK) |			\
 	       TRACE_ITER(PRINTK_MSGONLY) | TRACE_ITER(RECORD_CMD) |	\
-	       TRACE_ITER(PROF_TEXT_OFFSET) | FPROFILE_DEFAULT_FLAGS)
+	       TRACE_ITER(PROF_TEXT_OFFSET) | TRACE_ITER(STACKMAP) |	\
+	       FPROFILE_DEFAULT_FLAGS)
 
 /* trace_flags that are default zero for instances */
 #define ZEROED_TRACE_FLAGS \
 	(TRACE_ITER(EVENT_FORK) | TRACE_ITER(FUNC_FORK) | TRACE_ITER(TRACE_PRINTK) | \
-	 TRACE_ITER(COPY_MARKER))
+	 TRACE_ITER(COPY_MARKER) | TRACE_ITER(STACKMAP))
 
 /*
  * The global_trace is the descriptor that holds the top-level tracing
@@ -2184,6 +2186,49 @@ void __ftrace_trace_stack(struct trace_array *tr,
 	}
 #endif
 
+#ifdef CONFIG_FTRACE_STACKMAP
+	/*
+	 * If stackmap dedup is enabled, try to store only the stack_id
+	 * in the ring buffer instead of the full stack trace.
+	 */
+	if (tr->trace_flags & TRACE_ITER(STACKMAP)) {
+		struct ftrace_stackmap *smap;
+		struct stack_id_entry *sid_entry;
+		int sid;
+
+		smap = smp_load_acquire(&tr->stackmap);
+		if (!smap)
+			goto full_stack;
+
+		sid = ftrace_stackmap_get_id(smap, fstack->calls, nr_entries);
+		if (sid >= 0) {
+			event = __trace_buffer_lock_reserve(buffer,
+					TRACE_STACK_ID,
+					sizeof(*sid_entry), trace_ctx);
+			if (!event) {
+				/*
+				 * Could not reserve a TRACE_STACK_ID slot;
+				 * fall back to the full-stack path so the
+				 * event still gets a stack trace recorded.
+				 */
+				goto full_stack;
+			}
+			sid_entry = ring_buffer_event_data(event);
+			sid_entry->stack_id = sid;
+			/*
+			 * stack_id is a synthetic side-event attached to a
+			 * primary trace event that was already subject to
+			 * filtering. No per-event filter is defined for
+			 * TRACE_STACK_ID, so commit unconditionally.
+			 */
+			__buffer_unlock_commit(buffer, event);
+			goto out;
+		}
+		/* On stackmap failure, record the full stack instead. */
+	}
+full_stack:
+#endif
+
 	event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
 				    struct_size(entry, caller, nr_entries),
 				    trace_ctx);
@@ -9222,6 +9267,35 @@ static __init void tracer_init_tracefs_work_func(struct work_struct *work)
 			NULL, &tracing_dyn_info_fops);
 #endif
 
+#ifdef CONFIG_FTRACE_STACKMAP
+	{
+		struct ftrace_stackmap *smap;
+
+		smap = ftrace_stackmap_create(&global_trace);
+		if (!IS_ERR(smap)) {
+			/*
+			 * Use smp_store_release to ensure the stackmap
+			 * structure is fully initialized before publishing
+			 * the pointer to concurrent trace event readers.
+			 */
+			smp_store_release(&global_trace.stackmap, smap);
+			trace_create_file("stack_map", TRACE_MODE_WRITE, NULL,
+					smap, &ftrace_stackmap_fops);
+			trace_create_file("stack_map_stat", TRACE_MODE_READ, NULL,
+					smap, &ftrace_stackmap_stat_fops);
+			trace_create_file("stack_map_bin", TRACE_MODE_READ, NULL,
+					smap, &ftrace_stackmap_bin_fops);
+		} else {
+			pr_warn("ftrace stackmap init failed, dedup disabled\n");
+			/*
+			 * global_trace is statically defined; its stackmap
+			 * field is zero-initialized via BSS, so leaving it
+			 * NULL ensures the smp_load_acquire() in
+			 * __ftrace_trace_stack() falls back to full stack.
+			 */
+		}
+	}
+#endif
 	create_trace_instances(NULL);
 
 	update_tracer_options();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 80fe152af1dd..7e7d5e5a35ff 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -57,6 +57,7 @@ enum trace_type {
 	TRACE_TIMERLAT,
 	TRACE_RAW_DATA,
 	TRACE_FUNC_REPEATS,
+	TRACE_STACK_ID,
 
 	__TRACE_LAST_TYPE,
 };
@@ -453,6 +454,9 @@ struct trace_array {
 	struct cond_snapshot	*cond_snapshot;
 #endif
 	struct trace_func_repeats	__percpu *last_func_repeats;
+#ifdef CONFIG_FTRACE_STACKMAP
+	struct ftrace_stackmap		*stackmap;
+#endif
 	/*
 	 * On boot up, the ring buffer is set to the minimum size, so that
 	 * we do not waste memory on systems that are not using tracing.
@@ -579,6 +583,8 @@ extern void __ftrace_bad_type(void);
 			  TRACE_GRAPH_RET);		\
 		IF_ASSIGN(var, ent, struct func_repeats_entry,		\
 			  TRACE_FUNC_REPEATS);				\
+		IF_ASSIGN(var, ent, struct stack_id_entry,		\
+			  TRACE_STACK_ID);				\
 		__ftrace_bad_type();					\
 	} while (0)
 
@@ -1449,7 +1455,16 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 # define STACK_FLAGS
 #endif
 
+#ifdef CONFIG_FTRACE_STACKMAP
+# define STACKMAP_FLAGS				\
+			C(STACKMAP,		"stackmap"),
+#else
+# define STACKMAP_FLAGS
+# define TRACE_ITER_STACKMAP_BIT	-1
+#endif
+
 #ifdef CONFIG_FUNCTION_PROFILER
+
 # define PROFILER_FLAGS					\
 		C(PROF_TEXT_OFFSET,	"prof-text-offset"),
 # ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -1506,6 +1521,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 		FUNCTION_FLAGS					\
 		FGRAPH_FLAGS					\
 		STACK_FLAGS					\
+		STACKMAP_FLAGS					\
 		BRANCH_FLAGS					\
 		PROFILER_FLAGS					\
 		FPROFILE_FLAGS
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 54417468fdeb..89ed14b7e5fd 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -250,6 +250,21 @@ FTRACE_ENTRY(user_stack, userstack_entry,
 		 (void *)__entry->caller[6], (void *)__entry->caller[7])
 );
 
+/*
+ * Stack ID entry - stores only a stack_id referencing the stackmap.
+ * Used when CONFIG_FTRACE_STACKMAP is enabled to deduplicate stacks.
+ */
+FTRACE_ENTRY(stack_id, stack_id_entry,
+
+	TRACE_STACK_ID,
+
+	F_STRUCT(
+		__field(	int,		stack_id	)
+	),
+
+	F_printk("<stack_id %d>", __entry->stack_id)
+);
+
 /*
  * trace_printk entry:
  */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a5ad76175d10..68678ea88159 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1517,6 +1517,28 @@ static struct trace_event trace_user_stack_event = {
 	.funcs		= &trace_user_stack_funcs,
 };
 
+/* TRACE_STACK_ID */
+static enum print_line_t trace_stack_id_print(struct trace_iterator *iter,
+					      int flags, struct trace_event *event)
+{
+	struct stack_id_entry *field;
+	struct trace_seq *s = &iter->seq;
+
+	trace_assign_type(field, iter->ent);
+	trace_seq_printf(s, "<stack_id %d>\n", field->stack_id);
+
+	return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_stack_id_funcs = {
+	.trace		= trace_stack_id_print,
+};
+
+static struct trace_event trace_stack_id_event = {
+	.type		= TRACE_STACK_ID,
+	.funcs		= &trace_stack_id_funcs,
+};
+
 /* TRACE_HWLAT */
 static enum print_line_t
 trace_hwlat_print(struct trace_iterator *iter, int flags,
@@ -1908,6 +1930,7 @@ static struct trace_event *events[] __initdata = {
 	&trace_wake_event,
 	&trace_stack_event,
 	&trace_user_stack_event,
+	&trace_stack_id_event,
 	&trace_bputs_event,
 	&trace_bprint_event,
 	&trace_print_event,
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 929c84075315..0c97065b0d68 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -14,6 +14,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_CTX:
 	case TRACE_WAKE:
 	case TRACE_STACK:
+	case TRACE_STACK_ID:
 	case TRACE_PRINT:
 	case TRACE_BRANCH:
 	case TRACE_GRAPH_ENT:
-- 
2.34.1
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help