Re: [PATCH RESEND WITH CCs v3 4/4] perf tools: determine if LR is the return address
From: Arnaldo Carvalho de Melo <acme@kernel.org>
Date: 2021-03-06 12:56:13
Also in:
lkml
Em Fri, Mar 05, 2021 at 10:54:03AM +0200, James Clark escreveu:
I've tested this patchset on a few different applications and have seen it significantly improve
quality of frame pointer stacks on aarch64. For example with GDB 10 and default build options,
'bfd_calc_gnu_debuglink_crc32' is a leaf function, and its caller 'get_file_crc' is omitted,
but with the patchset it is included. I've also confirmed that this is correct from looking at
the source code.
Before:
# Children Self Command Shared Object Symbol
# ........ ........ ............... .......................... ...........
#
34.55% 0.00% gdb-100 gdb-100 [.] _start
0.78%
_start
__libc_start_main
main
gdb_main
captured_command_loop
gdb_do_one_event
check_async_event_handlers
fetch_inferior_event
inferior_event_handler
do_all_inferior_continuations
attach_post_wait
post_create_inferior
svr4_solib_create_inferior_hook
solib_add
solib_read_symbols
symbol_file_add_with_addrs
read_symbols
elf_symfile_read
find_separate_debug_file_by_debuglink[abi:cxx11]
find_separate_debug_file
separate_debug_file_exists
gdb_bfd_crc
bfd_calc_gnu_debuglink_crc32
After:
# Children Self Command Shared Object Symbol
# ........ ........ ............... .......................... ...........
#
34.55% 0.00% gdb-100 gdb-100 [.] _start
0.78%
_start
__libc_start_main
main
gdb_main
captured_command_loop
gdb_do_one_event
check_async_event_handlers
fetch_inferior_event
inferior_event_handler
do_all_inferior_continuations
attach_post_wait
post_create_inferior
svr4_solib_create_inferior_hook
solib_add
solib_read_symbols
symbol_file_add_with_addrs
read_symbols
elf_symfile_read
find_separate_debug_file_by_debuglink[abi:cxx11]
find_separate_debug_file
separate_debug_file_exists
get_file_crc <--------------------- leaf frame caller added
bfd_calc_gnu_debuglink_crc32
There is a question about whether the overhead of recording all the registers is acceptable, for
filesize and time. We could make it a manual step, at the cost of not showing better frame pointer
stacks by default.

Can someone quantify this, i.e. how much space per perf.data for a typical scenario? But anyway, I'm applying it as is now, we can change it if needed, it's not like files with the extra registers won't be valid if/when we decide not to collect it by default in the future. If we decide to make this selectable, we should have it as a .perfconfig knob as well, so that one can set it and change the default, etc.

- Arnaldo
Tested-by: James Clark <redacted>

On 04/03/2021 18:32, Alexandre Truong wrote:
On arm64 and frame pointer mode (e.g: perf record --callgraph fp), use dwarf unwind info to check if the link register is the return address in order to inject it to the frame pointer stack. Write the following application: int a = 10; void f2(void) { for (int i = 0; i < 1000000; i++) a *= a; } void f1() { for (int i = 0; i < 10; i++) f2(); } int main (void) { f1(); return 0; } with the following compilation flags: gcc -fno-omit-frame-pointer -fno-inline -O2 The compiler omits the frame pointer for f2 on arm. This is a problem with any leaf call, for example an application with many different calls to malloc() would always omit the calling frame, even if it can be determined. ./perf record --call-graph fp ./a.out ./perf report currently gives the following stack: 0xffffea52f361 _start __libc_start_main main f2 After this change, perf report correctly shows f1() calling f2(), even though it was missing from the frame pointer unwind: ./perf report 0xffffea52f361 _start __libc_start_main main f1 f2 Signed-off-by: Alexandre Truong <redacted> Cc: John Garry <redacted> Cc: Will Deacon <will@kernel.org> Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Leo Yan <redacted> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Jiri Olsa <redacted> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Kemeng Shi <redacted> Cc: Ian Rogers <irogers@google.com> Cc: Andi Kleen <redacted> Cc: Kan Liang <redacted> Cc: Jin Yao <redacted> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Suzuki K Poulose <suzuki.poulose@arm.com> Cc: Al Grant <redacted> Cc: James Clark <redacted> Cc: Wilco Dijkstra <redacted> --- tools/perf/util/Build | 1 + .../util/arm-frame-pointer-unwind-support.c | 44 +++++++++++++++++++ .../util/arm-frame-pointer-unwind-support.h | 7 +++ tools/perf/util/machine.c | 9 ++-- 4 files changed, 58 
insertions(+), 3 deletions(-) create mode 100644 tools/perf/util/arm-frame-pointer-unwind-support.c create mode 100644 tools/perf/util/arm-frame-pointer-unwind-support.hdiff --git a/tools/perf/util/Build b/tools/perf/util/Build index 188521f34347..3b82cb992bce 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build@@ -1,3 +1,4 @@ +perf-y += arm-frame-pointer-unwind-support.o perf-y += annotate.o perf-y += block-info.o perf-y += block-range.odiff --git a/tools/perf/util/arm-frame-pointer-unwind-support.c b/tools/perf/util/arm-frame-pointer-unwind-support.c new file mode 100644 index 000000000000..964efd08e72e --- /dev/null +++ b/tools/perf/util/arm-frame-pointer-unwind-support.c@@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../arch/arm64/include/uapi/asm/perf_regs.h" +#include "arch/arm64/include/perf_regs.h" +#include "event.h" +#include "arm-frame-pointer-unwind-support.h" +#include "callchain.h" +#include "unwind.h" + +struct entries { + u64 stack[2]; + size_t length; +}; + +static bool get_leaf_frame_caller_enabled(struct perf_sample *sample) +{ + return callchain_param.record_mode == CALLCHAIN_FP && sample->user_regs.regs + && sample->user_regs.mask == PERF_REGS_MASK; +} + +static int add_entry(struct unwind_entry *entry, void *arg) +{ + struct entries *entries = arg; + + entries->stack[entries->length++] = entry->ip; + return 0; +} + +u64 get_leaf_frame_caller_aarch64(struct perf_sample *sample, struct thread *thread) +{ + int ret; + + struct entries entries = {{0, 0}, 0}; + + if (!get_leaf_frame_caller_enabled(sample)) + return 0; + + ret = unwind__get_entries(add_entry, &entries, thread, sample, 2); + + if (ret || entries.length != 2) + return ret; + + return callchain_param.order == ORDER_CALLER ? 
+ entries.stack[0] : entries.stack[1]; +}diff --git a/tools/perf/util/arm-frame-pointer-unwind-support.h b/tools/perf/util/arm-frame-pointer-unwind-support.h new file mode 100644 index 000000000000..16dc03fa9abe --- /dev/null +++ b/tools/perf/util/arm-frame-pointer-unwind-support.h@@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_ARM_FRAME_POINTER_UNWIND_SUPPORT_H +#define __PERF_ARM_FRAME_POINTER_UNWIND_SUPPORT_H + +u64 get_leaf_frame_caller_aarch64(struct perf_sample *sample, struct thread *thread); + +#endif /* __PERF_ARM_FRAME_POINTER_UNWIND_SUPPORT_H */diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 7f03ffa016b0..dfb72dbc0e2d 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c@@ -34,6 +34,7 @@ #include "bpf-event.h" #include <internal/lib.h> // page_size #include "cgroup.h" +#include "arm-frame-pointer-unwind-support.h" #include <linux/ctype.h> #include <symbol/kallsyms.h>@@ -2671,10 +2672,12 @@ static int find_prev_cpumode(struct ip_callchain *chain, struct thread *thread, return err; } -static u64 get_leaf_frame_caller(struct perf_sample *sample __maybe_unused, - struct thread *thread __maybe_unused) +static u64 get_leaf_frame_caller(struct perf_sample *sample, struct thread *thread) { - return 0; + if (strncmp(thread->maps->machine->env->arch, "aarch64", 7) == 0) + return get_leaf_frame_caller_aarch64(sample, thread); + else + return 0; } static int thread__resolve_callchain_sample(struct thread *thread,
-- - Arnaldo