[RFC PATCH v3 15/15] perf synthetic-events: use workqueue parallel_for
From: Riccardo Mancini <hidden>
Date: 2021-08-20 10:55:19
Also in:
lkml
Subsystem:
performance events subsystem, the rest · Maintainers:
Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim, Linus Torvalds
To generate synthetic events, perf has the option to use multiple threads. These threads are created manually using pthread_create. This patch replaces the manual pthread_create with a workqueue, using the parallel_for utility. Experimental results show that workqueue has a slightly higher overhead, but this is repaid by the improved work balancing among threads. Results of perf bench before and after are reported below: Command: sudo ./perf bench internals synthesize -t Average synthesis time in usec is reported. Laptop (2 cores 4 threads i7), avg num events ~21500: N pthread (before) workqueue (after) 1 121475.200 +- 2227.757 118882.900 +- 1389.398 2 72834.100 +- 1860.677 67668.600 +- 2847.693 3 70650.200 +- 540.096 55694.200 +- 496.155 4 55554.300 +- 259.968 50901.400 +- 434.327 VM (16 vCPU over 16 cores 32 threads Xeon), avg num events ~2920: N pthread (before) workqueue (after) 1 35182.400 +- 3561.189 37528.300 +- 2972.887 2 29188.400 +- 2191.767 28250.300 +- 1694.575 3 22172.200 +- 788.659 19062.400 +- 611.201 4 21600.700 +- 728.941 16812.900 +- 1085.359 5 19395.800 +- 1070.617 14764.600 +- 1339.113 6 18553.000 +- 1272.486 12814.200 +- 408.462 7 14691.400 +- 485.105 12382.200 +- 464.964 8 16036.400 +- 842.728 15015.000 +- 1648.844 9 15606.800 +- 470.100 13230.800 +- 1288.246 10 15527.000 +- 822.317 12661.800 +- 873.199 11 13097.400 +- 513.870 13082.700 +- 974.378 12 14053.700 +- 592.427 13123.400 +- 1054.939 13 15446.400 +- 765.850 12837.200 +- 770.646 14 14979.400 +- 1056.955 13695.400 +- 1066.302 15 12578.000 +- 846.142 15053.600 +- 992.118 16 12394.800 +- 602.295 13683.700 +- 911.517 Signed-off-by: Riccardo Mancini <redacted> --- tools/perf/bench/synthesize.c | 12 +-- tools/perf/builtin-kvm.c | 2 +- tools/perf/builtin-record.c | 3 +- tools/perf/builtin-top.c | 3 +- tools/perf/builtin-trace.c | 3 +- tools/perf/tests/mmap-thread-lookup.c | 2 +- tools/perf/util/synthetic-events.c | 135 +++++++++----------------- tools/perf/util/synthetic-events.h | 8 +- 8 
files changed, 56 insertions(+), 112 deletions(-)
diff --git a/tools/perf/bench/synthesize.c b/tools/perf/bench/synthesize.c
index 738821113a005a6c..f1880116f4375c46 100644
--- a/tools/perf/bench/synthesize.c
+++ b/tools/perf/bench/synthesize.c@@ -63,7 +63,6 @@ static int do_run_single_threaded(struct perf_session *session, struct perf_thread_map *threads, struct target *target, bool data_mmap) { - const unsigned int nr_threads_synthesize = 1; struct timeval start, end, diff; u64 runtime_us; unsigned int i;
@@ -81,8 +80,7 @@ static int do_run_single_threaded(struct perf_session *session, NULL, target, threads, process_synthesized_event, - data_mmap, - nr_threads_synthesize); + data_mmap); if (err) return err;
@@ -148,8 +146,7 @@ static int run_single_threaded(void) return err; } -static int do_run_multi_threaded(struct target *target, - unsigned int nr_threads_synthesize) +static int do_run_multi_threaded(struct target *target) { struct timeval start, end, diff; u64 runtime_us;
@@ -172,8 +169,7 @@ static int do_run_multi_threaded(struct target *target, NULL, target, NULL, process_synthesized_event, - false, - nr_threads_synthesize); + false); if (err) { perf_session__delete(session); return err;
@@ -236,7 +232,7 @@ static int run_multi_threaded(void) printf(" Number of synthesis threads: %u\n", nr_threads_synthesize); - err = do_run_multi_threaded(&target, nr_threads_synthesize); + err = do_run_multi_threaded(&target); if (err) return err;
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index aa1b127ffb5be047..7afa1a41a627f353 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c@@ -1456,7 +1456,7 @@ static int kvm_events_live(struct perf_kvm_stat *kvm, perf_session__set_id_hdr_size(kvm->session); ordered_events__set_copy_on_queue(&kvm->session->ordered_events, true); machine__synthesize_threads(&kvm->session->machines.host, &kvm->opts.target, - kvm->evlist->core.threads, false, 1); + kvm->evlist->core.threads, false); err = kvm_live_open_events(kvm); if (err) goto out;
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 4d7b610b1d0bb9af..cccc2d0f9977d5b3 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c@@ -1481,8 +1481,7 @@ static int record__synthesize(struct record *rec, bool tail) } err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads, - f, opts->sample_address, - rec->opts.nr_threads_synthesize); + f, opts->sample_address); if (rec->opts.nr_threads_synthesize > 1) perf_set_singlethreaded();
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 9b4f220920780a95..36cd1294d9b4ebd3 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c@@ -1272,8 +1272,7 @@ static int __cmd_top(struct perf_top *top) pr_debug("Couldn't synthesize cgroup events.\n"); machine__synthesize_threads(&top->session->machines.host, &opts->target, - top->evlist->core.threads, false, - top->nr_threads_synthesize); + top->evlist->core.threads, false); if (top->nr_threads_synthesize > 1) perf_set_singlethreaded();
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 2bf21194c7b3959e..e2b50ba55a5ea93d 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c@@ -1628,8 +1628,7 @@ static int trace__symbols_init(struct trace *trace, struct evlist *evlist) goto out; err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target, - evlist->core.threads, trace__tool_process, false, - 1); + evlist->core.threads, trace__tool_process, false); out: if (err) symbol__exit();
diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c
index 8d9d4cbff76d17d5..809be9510e849d1b 100644
--- a/tools/perf/tests/mmap-thread-lookup.c
+++ b/tools/perf/tests/mmap-thread-lookup.c@@ -135,7 +135,7 @@ static int synth_all(struct machine *machine) { return perf_event__synthesize_threads(NULL, perf_event__process, - machine, 0, 1); + machine, 0); } static int synth_process(struct machine *machine)
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index a7e981b2d7decd3b..5f41e2f9579e3f77 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c@@ -23,6 +23,7 @@ #include <linux/string.h> #include <linux/zalloc.h> #include <linux/perf_event.h> +#include <linux/err.h> #include <asm/bug.h> #include <perf/evsel.h> #include <perf/cpumap.h>
@@ -42,6 +43,8 @@ #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> +#include "util/workqueue/workqueue.h" +#include "util/util.h" #define DEFAULT_PROC_MAP_PARSE_TIMEOUT 500
@@ -883,16 +886,13 @@ static int __perf_event__synthesize_threads(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool mmap_data, - struct dirent **dirent, - int start, - int num) + char *d_name) { union perf_event *comm_event, *mmap_event, *fork_event; union perf_event *namespaces_event; int err = -1; char *end; pid_t pid; - int i; comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size); if (comm_event == NULL)
@@ -912,24 +912,22 @@ static int __perf_event__synthesize_threads(struct perf_tool *tool, if (namespaces_event == NULL) goto out_free_fork; - for (i = start; i < start + num; i++) { - if (!isdigit(dirent[i]->d_name[0])) - continue; + if (!isdigit(d_name[0])) + goto out_free_namespaces; - pid = (pid_t)strtol(dirent[i]->d_name, &end, 10); - /* only interested in proper numerical dirents */ - if (*end) - continue; - /* - * We may race with exiting thread, so don't stop just because - * one thread couldn't be synthesized. - */ - __event__synthesize_thread(comm_event, mmap_event, fork_event, - namespaces_event, pid, 1, process, - tool, machine, mmap_data); - } + pid = (pid_t)strtol(d_name, &end, 10); + /* only interested in proper numerical dirents */ + if (*end) + goto out_free_namespaces; + /* + * We may race with exiting thread, so don't stop just because + * one thread couldn't be synthesized. + */ + __event__synthesize_thread(comm_event, mmap_event, fork_event, + namespaces_event, pid, 1, process, + tool, machine, mmap_data); err = 0; - +out_free_namespaces: free(namespaces_event); out_free_fork: free(fork_event);
@@ -947,36 +945,28 @@ struct synthesize_threads_arg { struct machine *machine; bool mmap_data; struct dirent **dirent; - int num; - int start; }; -static void *synthesize_threads_worker(void *arg) +static void synthesize_threads_worker(int i, void *arg) { struct synthesize_threads_arg *args = arg; __perf_event__synthesize_threads(args->tool, args->process, args->machine, args->mmap_data, - args->dirent, - args->start, args->num); - return NULL; + args->dirent[i]->d_name); } int perf_event__synthesize_threads(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, - bool mmap_data, - unsigned int nr_threads_synthesize) + bool mmap_data) { - struct synthesize_threads_arg *args = NULL; - pthread_t *synthesize_threads = NULL; + struct synthesize_threads_arg args; char proc_path[PATH_MAX]; struct dirent **dirent; - int num_per_thread; - int m, n, i, j; - int thread_nr; - int base = 0; - int err = -1; + int n, i; + int err = -1, ret; + char err_buf[WORKQUEUE_STRERR_BUFSIZE]; if (machine__is_default_guest(machine))
@@ -987,60 +977,27 @@ int perf_event__synthesize_threads(struct perf_tool *tool, if (n < 0) return err; - if (nr_threads_synthesize == UINT_MAX) - thread_nr = sysconf(_SC_NPROCESSORS_ONLN); - else - thread_nr = nr_threads_synthesize; - - if (thread_nr <= 1) { - err = __perf_event__synthesize_threads(tool, process, - machine, mmap_data, - dirent, base, n); + if (perf_singlethreaded) { + for (i = 0; i < n; i++) + err = __perf_event__synthesize_threads(tool, process, + machine, mmap_data, + dirent[i]->d_name); goto free_dirent; } - if (thread_nr > n) - thread_nr = n; - - synthesize_threads = calloc(sizeof(pthread_t), thread_nr); - if (synthesize_threads == NULL) - goto free_dirent; - args = calloc(sizeof(*args), thread_nr); - if (args == NULL) - goto free_threads; + args.tool = tool; + args.process = process; + args.machine = machine; + args.mmap_data = mmap_data; + args.dirent = dirent; - num_per_thread = n / thread_nr; - m = n % thread_nr; - for (i = 0; i < thread_nr; i++) { - args[i].tool = tool; - args[i].process = process; - args[i].machine = machine; - args[i].mmap_data = mmap_data; - args[i].dirent = dirent; - } - for (i = 0; i < m; i++) { - args[i].num = num_per_thread + 1; - args[i].start = i * args[i].num; - } - if (i != 0) - base = args[i-1].start + args[i-1].num; - for (j = i; j < thread_nr; j++) { - args[j].num = num_per_thread; - args[j].start = base + (j - i) * args[i].num; + err = parallel_for(global_wq, 0, n, 1, synthesize_threads_worker, &args); + if (err) { + ret = workqueue_strerror(global_wq, err, err_buf, sizeof(err_buf)); + pr_err("parallel_for: %s\n", + ret <= 0 ? 
"Error generating error msg" : err_buf); } - for (i = 0; i < thread_nr; i++) { - if (pthread_create(&synthesize_threads[i], NULL, - synthesize_threads_worker, &args[i])) - goto out_join; - } - err = 0; -out_join: - for (i = 0; i < thread_nr; i++) - pthread_join(synthesize_threads[i], NULL); - free(args); -free_threads: - free(synthesize_threads); free_dirent: for (i = 0; i < n; i++) zfree(&dirent[i]);
@@ -1775,26 +1732,22 @@ int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_ int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool, struct target *target, struct perf_thread_map *threads, - perf_event__handler_t process, bool data_mmap, - unsigned int nr_threads_synthesize) + perf_event__handler_t process, bool data_mmap) { if (target__has_task(target)) return perf_event__synthesize_thread_map(tool, threads, process, machine, data_mmap); else if (target__has_cpu(target)) return perf_event__synthesize_threads(tool, process, - machine, data_mmap, - nr_threads_synthesize); + machine, data_mmap); /* command specified */ return 0; } int machine__synthesize_threads(struct machine *machine, struct target *target, - struct perf_thread_map *threads, bool data_mmap, - unsigned int nr_threads_synthesize) + struct perf_thread_map *threads, bool data_mmap) { return __machine__synthesize_threads(machine, NULL, target, threads, - perf_event__process, data_mmap, - nr_threads_synthesize); + perf_event__process, data_mmap); } static struct perf_record_event_update *event_update_event__new(size_t size, u64 type, u64 id)
diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h
index c845e2b9b444df57..cd3c43451a237563 100644
--- a/tools/perf/util/synthetic-events.h
+++ b/tools/perf/util/synthetic-events.h@@ -54,7 +54,7 @@ int perf_event__synthesize_stat_round(struct perf_tool *tool, u64 time, u64 type int perf_event__synthesize_stat(struct perf_tool *tool, u32 cpu, u32 thread, u64 id, struct perf_counts_values *count, perf_event__handler_t process, struct machine *machine); int perf_event__synthesize_thread_map2(struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine); int perf_event__synthesize_thread_map(struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine, bool mmap_data); -int perf_event__synthesize_threads(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool mmap_data, unsigned int nr_threads_synthesize); +int perf_event__synthesize_threads(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool mmap_data); int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct evlist *evlist, perf_event__handler_t process); int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); pid_t perf_event__synthesize_comm(struct perf_tool *tool, union perf_event *event, pid_t pid, perf_event__handler_t process, struct machine *machine);
@@ -65,11 +65,9 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool, struct target *target, struct perf_thread_map *threads, - perf_event__handler_t process, bool data_mmap, - unsigned int nr_threads_synthesize); + perf_event__handler_t process, bool data_mmap); int machine__synthesize_threads(struct machine *machine, struct target *target, - struct perf_thread_map *threads, bool data_mmap, - unsigned int nr_threads_synthesize); + struct perf_thread_map *threads, bool data_mmap); #ifdef HAVE_AUXTRACE_SUPPORT int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, struct perf_tool *tool,
--
2.31.1