Thread (12 messages) 12 messages, 4 authors, 2021-03-26

Re: [PATCH RESEND WITH CCs v3 4/4] perf tools: determine if LR is the return address

From: Arnaldo Carvalho de Melo <acme@kernel.org>
Date: 2021-03-06 12:56:13
Also in: lkml

Em Fri, Mar 05, 2021 at 10:54:03AM +0200, James Clark escreveu:
I've tested this patchset on a few different applications and have seen it significantly improve
quality of frame pointer stacks on aarch64. For example with GDB 10 and default build options,
'bfd_calc_gnu_debuglink_crc32' is a leaf function, and its caller 'gdb_bfd_crc' is ommitted,
but with the patchset it is included. I've also confirmed that this is correct from looking at
the source code.

Before:

        # Children      Self  Command          Shared Object               Symbol                                                                                                                                                                                                                                                                                                                                                                                                                                                          
        # ........  ........  ...............  ..........................  ...........
        #
            34.55%     0.00%  gdb-100          gdb-100                     [.] _start
                   0.78%
                        _start
                        __libc_start_main
                        main
                        gdb_main
                        captured_command_loop
                        gdb_do_one_event
                        check_async_event_handlers
                        fetch_inferior_event
                        inferior_event_handler
                        do_all_inferior_continuations
                        attach_post_wait
                        post_create_inferior
                        svr4_solib_create_inferior_hook
                        solib_add
                        solib_read_symbols
                        symbol_file_add_with_addrs
                        read_symbols
                        elf_symfile_read
                        find_separate_debug_file_by_debuglink[abi:cxx11]
                        find_separate_debug_file
                        separate_debug_file_exists
                        gdb_bfd_crc
                        bfd_calc_gnu_debuglink_crc32

After:

        # Children      Self  Command          Shared Object               Symbol                                                                                                                                                                                                                                                                                                                                                                                                                                                          
        # ........  ........  ...............  ..........................  ...........
        #
            34.55%     0.00%  gdb-100          gdb-100                     [.] _start
                   0.78%
                        _start
                        __libc_start_main
                        main
                        gdb_main
                        captured_command_loop
                        gdb_do_one_event
                        check_async_event_handlers
                        fetch_inferior_event
                        inferior_event_handler
                        do_all_inferior_continuations
                        attach_post_wait
                        post_create_inferior
                        svr4_solib_create_inferior_hook
                        solib_add
                        solib_read_symbols
                        symbol_file_add_with_addrs
                        read_symbols
                        elf_symfile_read
                        find_separate_debug_file_by_debuglink[abi:cxx11]
                        find_separate_debug_file
                        separate_debug_file_exists
                        get_file_crc   <--------------------- leaf frame caller added
                        bfd_calc_gnu_debuglink_crc32

There is a question about whether the overhead of recording all the registers is acceptable, for
filesize and time. We could make it a manual step, at the cost of not showing better frame pointer
stacks by default.
Can someone quantify this, i.e. how much space per perf.data for a
typical scenario? But anyway, I'm applying it as is now, we can change
it if needed, its not like files with the extra registers won't be
valid if/when we decide not to collect it by default in the future.

If we decide to make this selectable, we should have it as a .perfconfig
knob as well, so that one can set it and change the default, etc.

- Arnaldo
 
Tested-by: James Clark <redacted>

On 04/03/2021 18:32, Alexandre Truong wrote:
quoted
On arm64 and frame pointer mode (e.g: perf record --callgraph fp),
use dwarf unwind info to check if the link register is the return
address in order to inject it to the frame pointer stack.

Write the following application:

	int a = 10;

	void f2(void)
	{
		for (int i = 0; i < 1000000; i++)
			a *= a;
	}

	void f1()
	{
		for (int i = 0; i < 10; i++)
			f2();
	}

	int main (void)
	{
		f1();
		return 0;
	}

with the following compilation flags:
	gcc -fno-omit-frame-pointer -fno-inline -O2

The compiler omits the frame pointer for f2 on arm. This is a problem
with any leaf call, for example an application with many different
calls to malloc() would always omit the calling frame, even if it
can be determined.

	./perf record --call-graph fp ./a.out
	./perf report

currently gives the following stack:

0xffffea52f361
_start
__libc_start_main
main
f2

After this change, perf report correctly shows f1() calling f2(),
even though it was missing from the frame pointer unwind:

	./perf report

0xffffea52f361
_start
__libc_start_main
main
f1
f2

Signed-off-by: Alexandre Truong <redacted>
Cc: John Garry <redacted>
Cc: Will Deacon <will@kernel.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Leo Yan <redacted>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <redacted>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Kemeng Shi <redacted>
Cc: Ian Rogers <irogers@google.com>
Cc: Andi Kleen <redacted>
Cc: Kan Liang <redacted>
Cc: Jin Yao <redacted>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Al Grant <redacted>
Cc: James Clark <redacted>
Cc: Wilco Dijkstra <redacted>
---
 tools/perf/util/Build                         |  1 +
 .../util/arm-frame-pointer-unwind-support.c   | 44 +++++++++++++++++++
 .../util/arm-frame-pointer-unwind-support.h   |  7 +++
 tools/perf/util/machine.c                     |  9 ++--
 4 files changed, 58 insertions(+), 3 deletions(-)
 create mode 100644 tools/perf/util/arm-frame-pointer-unwind-support.c
 create mode 100644 tools/perf/util/arm-frame-pointer-unwind-support.h
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 188521f34347..3b82cb992bce 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -1,3 +1,4 @@
+perf-y += arm-frame-pointer-unwind-support.o
 perf-y += annotate.o
 perf-y += block-info.o
 perf-y += block-range.o
diff --git a/tools/perf/util/arm-frame-pointer-unwind-support.c b/tools/perf/util/arm-frame-pointer-unwind-support.c
new file mode 100644
index 000000000000..964efd08e72e
--- /dev/null
+++ b/tools/perf/util/arm-frame-pointer-unwind-support.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../arch/arm64/include/uapi/asm/perf_regs.h"
+#include "arch/arm64/include/perf_regs.h"
+#include "event.h"
+#include "arm-frame-pointer-unwind-support.h"
+#include "callchain.h"
+#include "unwind.h"
+
+struct entries {
+	u64 stack[2];
+	size_t length;
+};
+
+static bool get_leaf_frame_caller_enabled(struct perf_sample *sample)
+{
+	return callchain_param.record_mode == CALLCHAIN_FP && sample->user_regs.regs
+		&& sample->user_regs.mask == PERF_REGS_MASK;
+}
+
+static int add_entry(struct unwind_entry *entry, void *arg)
+{
+	struct entries *entries = arg;
+
+	entries->stack[entries->length++] = entry->ip;
+	return 0;
+}
+
+u64 get_leaf_frame_caller_aarch64(struct perf_sample *sample, struct thread *thread)
+{
+	int ret;
+
+	struct entries entries = {{0, 0}, 0};
+
+	if (!get_leaf_frame_caller_enabled(sample))
+		return 0;
+
+	ret = unwind__get_entries(add_entry, &entries, thread, sample, 2);
+
+	if (ret || entries.length != 2)
+		return ret;
+
+	return callchain_param.order == ORDER_CALLER ?
+		entries.stack[0] : entries.stack[1];
+}
diff --git a/tools/perf/util/arm-frame-pointer-unwind-support.h b/tools/perf/util/arm-frame-pointer-unwind-support.h
new file mode 100644
index 000000000000..16dc03fa9abe
--- /dev/null
+++ b/tools/perf/util/arm-frame-pointer-unwind-support.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PERF_ARM_FRAME_POINTER_UNWIND_SUPPORT_H
+#define __PERF_ARM_FRAME_POINTER_UNWIND_SUPPORT_H
+
+u64 get_leaf_frame_caller_aarch64(struct perf_sample *sample, struct thread *thread);
+
+#endif /* __PERF_ARM_FRAME_POINTER_UNWIND_SUPPORT_H */
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 7f03ffa016b0..dfb72dbc0e2d 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -34,6 +34,7 @@
 #include "bpf-event.h"
 #include <internal/lib.h> // page_size
 #include "cgroup.h"
+#include "arm-frame-pointer-unwind-support.h"
 
 #include <linux/ctype.h>
 #include <symbol/kallsyms.h>
@@ -2671,10 +2672,12 @@ static int find_prev_cpumode(struct ip_callchain *chain, struct thread *thread,
 	return err;
 }
 
-static u64 get_leaf_frame_caller(struct perf_sample *sample __maybe_unused,
-		struct thread *thread __maybe_unused)
+static u64 get_leaf_frame_caller(struct perf_sample *sample, struct thread *thread)
 {
-	return 0;
+	if (strncmp(thread->maps->machine->env->arch, "aarch64", 7) == 0)
+		return get_leaf_frame_caller_aarch64(sample, thread);
+	else
+		return 0;
 }
 
 static int thread__resolve_callchain_sample(struct thread *thread,
-- 

- Arnaldo
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help