[RFC PATCH v3 15/15] perf synthetic-events: use workqueue parallel_for

From: Riccardo Mancini <hidden>
Date: 2021-08-20 10:55:19
Also in: lkml
Subsystem: performance events subsystem, the rest · Maintainers: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim, Linus Torvalds

To generate synthetic events, perf has the option to use multiple
threads. These threads are created manually using pthread_created.

This patch replaces the manual pthread_create with a workqueue,
using the parallel_for utility.

Experimental results show that workqueue has a slightly higher overhead,
but this is repayed by the improved work balancing among threads.

Results of perf bench before and after are reported below:
Command: sudo ./perf bench internals synthesize -t
Average synthesis time in usec is reported.

Laptop (2 cores 4 threads i7), avg num events ~21500:
 N    pthread (before)        workqueue (after)
 1  121475.200 +- 2227.757  118882.900 +- 1389.398
 2   72834.100 +- 1860.677   67668.600 +- 2847.693
 3   70650.200 +-  540.096   55694.200 +-  496.155
 4   55554.300 +-  259.968   50901.400 +-  434.327

VM (16 vCPU over 16 cores 32 threads Xeon), avg num events ~2920:
 N    pthread (before)        workqueue (after)
 1  35182.400 +- 3561.189   37528.300 +- 2972.887
 2  29188.400 +- 2191.767   28250.300 +- 1694.575
 3  22172.200 +-  788.659   19062.400 +-  611.201
 4  21600.700 +-  728.941   16812.900 +- 1085.359
 5  19395.800 +- 1070.617   14764.600 +- 1339.113
 6  18553.000 +- 1272.486   12814.200 +-  408.462
 7  14691.400 +-  485.105   12382.200 +-  464.964
 8  16036.400 +-  842.728   15015.000 +- 1648.844
 9  15606.800 +-  470.100   13230.800 +- 1288.246
10  15527.000 +-  822.317   12661.800 +-  873.199
11  13097.400 +-  513.870   13082.700 +-  974.378
12  14053.700 +-  592.427   13123.400 +- 1054.939
13  15446.400 +-  765.850   12837.200 +-  770.646
14  14979.400 +- 1056.955   13695.400 +- 1066.302
15  12578.000 +-  846.142   15053.600 +-  992.118
16  12394.800 +-  602.295   13683.700 +-  911.517

Signed-off-by: Riccardo Mancini <redacted>
---
 tools/perf/bench/synthesize.c         |  12 +--
 tools/perf/builtin-kvm.c              |   2 +-
 tools/perf/builtin-record.c           |   3 +-
 tools/perf/builtin-top.c              |   3 +-
 tools/perf/builtin-trace.c            |   3 +-
 tools/perf/tests/mmap-thread-lookup.c |   2 +-
 tools/perf/util/synthetic-events.c    | 135 +++++++++-----------------
 tools/perf/util/synthetic-events.h    |   8 +-
 8 files changed, 56 insertions(+), 112 deletions(-)

diff --git a/tools/perf/bench/synthesize.c b/tools/perf/bench/synthesize.c
index 738821113a005a6c..f1880116f4375c46 100644
--- a/tools/perf/bench/synthesize.c
+++ b/tools/perf/bench/synthesize.c

@@ -63,7 +63,6 @@ static int do_run_single_threaded(struct perf_session *session,
 				struct perf_thread_map *threads,
 				struct target *target, bool data_mmap)
 {
-	const unsigned int nr_threads_synthesize = 1;
 	struct timeval start, end, diff;
 	u64 runtime_us;
 	unsigned int i;

@@ -81,8 +80,7 @@ static int do_run_single_threaded(struct perf_session *session,
 						NULL,
 						target, threads,
 						process_synthesized_event,
-						data_mmap,
-						nr_threads_synthesize);
+						data_mmap);
 		if (err)
 			return err;

@@ -148,8 +146,7 @@ static int run_single_threaded(void)
 	return err;
 }
 
-static int do_run_multi_threaded(struct target *target,
-				unsigned int nr_threads_synthesize)
+static int do_run_multi_threaded(struct target *target)
 {
 	struct timeval start, end, diff;
 	u64 runtime_us;

@@ -172,8 +169,7 @@ static int do_run_multi_threaded(struct target *target,
 						NULL,
 						target, NULL,
 						process_synthesized_event,
-						false,
-						nr_threads_synthesize);
+						false);
 		if (err) {
 			perf_session__delete(session);
 			return err;

@@ -236,7 +232,7 @@ static int run_multi_threaded(void)
 		printf("  Number of synthesis threads: %u\n",
 			nr_threads_synthesize);
 
-		err = do_run_multi_threaded(&target, nr_threads_synthesize);
+		err = do_run_multi_threaded(&target);
 		if (err)
 			return err;

diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index aa1b127ffb5be047..7afa1a41a627f353 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c

@@ -1456,7 +1456,7 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
 	perf_session__set_id_hdr_size(kvm->session);
 	ordered_events__set_copy_on_queue(&kvm->session->ordered_events, true);
 	machine__synthesize_threads(&kvm->session->machines.host, &kvm->opts.target,
-				    kvm->evlist->core.threads, false, 1);
+				    kvm->evlist->core.threads, false);
 	err = kvm_live_open_events(kvm);
 	if (err)
 		goto out;

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 4d7b610b1d0bb9af..cccc2d0f9977d5b3 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c

@@ -1481,8 +1481,7 @@ static int record__synthesize(struct record *rec, bool tail)
 	}
 
 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
-					    f, opts->sample_address,
-					    rec->opts.nr_threads_synthesize);
+					    f, opts->sample_address);
 
 	if (rec->opts.nr_threads_synthesize > 1)
 		perf_set_singlethreaded();

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 9b4f220920780a95..36cd1294d9b4ebd3 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c

@@ -1272,8 +1272,7 @@ static int __cmd_top(struct perf_top *top)
 		pr_debug("Couldn't synthesize cgroup events.\n");
 
 	machine__synthesize_threads(&top->session->machines.host, &opts->target,
-				    top->evlist->core.threads, false,
-				    top->nr_threads_synthesize);
+				    top->evlist->core.threads, false);
 
 	if (top->nr_threads_synthesize > 1)
 		perf_set_singlethreaded();

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 2bf21194c7b3959e..e2b50ba55a5ea93d 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c

@@ -1628,8 +1628,7 @@ static int trace__symbols_init(struct trace *trace, struct evlist *evlist)
 		goto out;
 
 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
-					    evlist->core.threads, trace__tool_process, false,
-					    1);
+					    evlist->core.threads, trace__tool_process, false);
 out:
 	if (err)
 		symbol__exit();

diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c
index 8d9d4cbff76d17d5..809be9510e849d1b 100644
--- a/tools/perf/tests/mmap-thread-lookup.c
+++ b/tools/perf/tests/mmap-thread-lookup.c

@@ -135,7 +135,7 @@ static int synth_all(struct machine *machine)
 {
 	return perf_event__synthesize_threads(NULL,
 					      perf_event__process,
-					      machine, 0, 1);
+					      machine, 0);
 }
 
 static int synth_process(struct machine *machine)

diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index a7e981b2d7decd3b..5f41e2f9579e3f77 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c

@@ -23,6 +23,7 @@
 #include <linux/string.h>
 #include <linux/zalloc.h>
 #include <linux/perf_event.h>
+#include <linux/err.h>
 #include <asm/bug.h>
 #include <perf/evsel.h>
 #include <perf/cpumap.h>

@@ -42,6 +43,8 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <unistd.h>
+#include "util/workqueue/workqueue.h"
+#include "util/util.h"
 
 #define DEFAULT_PROC_MAP_PARSE_TIMEOUT 500

@@ -883,16 +886,13 @@ static int __perf_event__synthesize_threads(struct perf_tool *tool,
 					    perf_event__handler_t process,
 					    struct machine *machine,
 					    bool mmap_data,
-					    struct dirent **dirent,
-					    int start,
-					    int num)
+					    char *d_name)
 {
 	union perf_event *comm_event, *mmap_event, *fork_event;
 	union perf_event *namespaces_event;
 	int err = -1;
 	char *end;
 	pid_t pid;
-	int i;
 
 	comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
 	if (comm_event == NULL)

@@ -912,24 +912,22 @@ static int __perf_event__synthesize_threads(struct perf_tool *tool,
 	if (namespaces_event == NULL)
 		goto out_free_fork;
 
-	for (i = start; i < start + num; i++) {
-		if (!isdigit(dirent[i]->d_name[0]))
-			continue;
+	if (!isdigit(d_name[0]))
+		goto out_free_namespaces;
 
-		pid = (pid_t)strtol(dirent[i]->d_name, &end, 10);
-		/* only interested in proper numerical dirents */
-		if (*end)
-			continue;
-		/*
-		 * We may race with exiting thread, so don't stop just because
-		 * one thread couldn't be synthesized.
-		 */
-		__event__synthesize_thread(comm_event, mmap_event, fork_event,
-					   namespaces_event, pid, 1, process,
-					   tool, machine, mmap_data);
-	}
+	pid = (pid_t)strtol(d_name, &end, 10);
+	/* only interested in proper numerical dirents */
+	if (*end)
+		goto out_free_namespaces;
+	/*
+	 * We may race with exiting thread, so don't stop just because
+	 * one thread couldn't be synthesized.
+	 */
+	__event__synthesize_thread(comm_event, mmap_event, fork_event,
+					namespaces_event, pid, 1, process,
+					tool, machine, mmap_data);
 	err = 0;
-
+out_free_namespaces:
 	free(namespaces_event);
 out_free_fork:
 	free(fork_event);

@@ -947,36 +945,28 @@ struct synthesize_threads_arg {
 	struct machine *machine;
 	bool mmap_data;
 	struct dirent **dirent;
-	int num;
-	int start;
 };
 
-static void *synthesize_threads_worker(void *arg)
+static void synthesize_threads_worker(int i, void *arg)
 {
 	struct synthesize_threads_arg *args = arg;
 
 	__perf_event__synthesize_threads(args->tool, args->process,
 					 args->machine, args->mmap_data,
-					 args->dirent,
-					 args->start, args->num);
-	return NULL;
+					 args->dirent[i]->d_name);
 }
 
 int perf_event__synthesize_threads(struct perf_tool *tool,
 				   perf_event__handler_t process,
 				   struct machine *machine,
-				   bool mmap_data,
-				   unsigned int nr_threads_synthesize)
+				   bool mmap_data)
 {
-	struct synthesize_threads_arg *args = NULL;
-	pthread_t *synthesize_threads = NULL;
+	struct synthesize_threads_arg args;
 	char proc_path[PATH_MAX];
 	struct dirent **dirent;
-	int num_per_thread;
-	int m, n, i, j;
-	int thread_nr;
-	int base = 0;
-	int err = -1;
+	int n, i;
+	int err = -1, ret;
+	char err_buf[WORKQUEUE_STRERR_BUFSIZE];
 
 
 	if (machine__is_default_guest(machine))

@@ -987,60 +977,27 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
 	if (n < 0)
 		return err;
 
-	if (nr_threads_synthesize == UINT_MAX)
-		thread_nr = sysconf(_SC_NPROCESSORS_ONLN);
-	else
-		thread_nr = nr_threads_synthesize;
-
-	if (thread_nr <= 1) {
-		err = __perf_event__synthesize_threads(tool, process,
-						       machine, mmap_data,
-						       dirent, base, n);
+	if (perf_singlethreaded) {
+		for (i = 0; i < n; i++)
+			err = __perf_event__synthesize_threads(tool, process,
+								machine, mmap_data,
+								dirent[i]->d_name);
 		goto free_dirent;
 	}
-	if (thread_nr > n)
-		thread_nr = n;
-
-	synthesize_threads = calloc(sizeof(pthread_t), thread_nr);
-	if (synthesize_threads == NULL)
-		goto free_dirent;
 
-	args = calloc(sizeof(*args), thread_nr);
-	if (args == NULL)
-		goto free_threads;
+	args.tool = tool;
+	args.process = process;
+	args.machine = machine;
+	args.mmap_data = mmap_data;
+	args.dirent = dirent;
 
-	num_per_thread = n / thread_nr;
-	m = n % thread_nr;
-	for (i = 0; i < thread_nr; i++) {
-		args[i].tool = tool;
-		args[i].process = process;
-		args[i].machine = machine;
-		args[i].mmap_data = mmap_data;
-		args[i].dirent = dirent;
-	}
-	for (i = 0; i < m; i++) {
-		args[i].num = num_per_thread + 1;
-		args[i].start = i * args[i].num;
-	}
-	if (i != 0)
-		base = args[i-1].start + args[i-1].num;
-	for (j = i; j < thread_nr; j++) {
-		args[j].num = num_per_thread;
-		args[j].start = base + (j - i) * args[i].num;
+	err = parallel_for(global_wq, 0, n, 1, synthesize_threads_worker, &args);
+	if (err) {
+		ret = workqueue_strerror(global_wq, err, err_buf, sizeof(err_buf));
+		pr_err("parallel_for: %s\n",
+			ret <= 0 ? "Error generating error msg" : err_buf);
 	}
 
-	for (i = 0; i < thread_nr; i++) {
-		if (pthread_create(&synthesize_threads[i], NULL,
-				   synthesize_threads_worker, &args[i]))
-			goto out_join;
-	}
-	err = 0;
-out_join:
-	for (i = 0; i < thread_nr; i++)
-		pthread_join(synthesize_threads[i], NULL);
-	free(args);
-free_threads:
-	free(synthesize_threads);
 free_dirent:
 	for (i = 0; i < n; i++)
 		zfree(&dirent[i]);

@@ -1775,26 +1732,22 @@ int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_
 
 int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool,
 				  struct target *target, struct perf_thread_map *threads,
-				  perf_event__handler_t process, bool data_mmap,
-				  unsigned int nr_threads_synthesize)
+				  perf_event__handler_t process, bool data_mmap)
 {
 	if (target__has_task(target))
 		return perf_event__synthesize_thread_map(tool, threads, process, machine, data_mmap);
 	else if (target__has_cpu(target))
 		return perf_event__synthesize_threads(tool, process,
-						      machine, data_mmap,
-						      nr_threads_synthesize);
+						      machine, data_mmap);
 	/* command specified */
 	return 0;
 }
 
 int machine__synthesize_threads(struct machine *machine, struct target *target,
-				struct perf_thread_map *threads, bool data_mmap,
-				unsigned int nr_threads_synthesize)
+				struct perf_thread_map *threads, bool data_mmap)
 {
 	return __machine__synthesize_threads(machine, NULL, target, threads,
-					     perf_event__process, data_mmap,
-					     nr_threads_synthesize);
+					     perf_event__process, data_mmap);
 }
 
 static struct perf_record_event_update *event_update_event__new(size_t size, u64 type, u64 id)

diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h
index c845e2b9b444df57..cd3c43451a237563 100644
--- a/tools/perf/util/synthetic-events.h
+++ b/tools/perf/util/synthetic-events.h

@@ -54,7 +54,7 @@ int perf_event__synthesize_stat_round(struct perf_tool *tool, u64 time, u64 type
 int perf_event__synthesize_stat(struct perf_tool *tool, u32 cpu, u32 thread, u64 id, struct perf_counts_values *count, perf_event__handler_t process, struct machine *machine);
 int perf_event__synthesize_thread_map2(struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine);
 int perf_event__synthesize_thread_map(struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine, bool mmap_data);
-int perf_event__synthesize_threads(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool mmap_data, unsigned int nr_threads_synthesize);
+int perf_event__synthesize_threads(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool mmap_data);
 int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct evlist *evlist, perf_event__handler_t process);
 int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, struct perf_tool *tool, perf_event__handler_t process, struct machine *machine);
 pid_t perf_event__synthesize_comm(struct perf_tool *tool, union perf_event *event, pid_t pid, perf_event__handler_t process, struct machine *machine);

@@ -65,11 +65,9 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type,
 
 int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool,
 				  struct target *target, struct perf_thread_map *threads,
-				  perf_event__handler_t process, bool data_mmap,
-				  unsigned int nr_threads_synthesize);
+				  perf_event__handler_t process, bool data_mmap);
 int machine__synthesize_threads(struct machine *machine, struct target *target,
-				struct perf_thread_map *threads, bool data_mmap,
-				unsigned int nr_threads_synthesize);
+				struct perf_thread_map *threads, bool data_mmap);
 
 #ifdef HAVE_AUXTRACE_SUPPORT
 int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, struct perf_tool *tool,

-- 
2.31.1

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help