Thread (6 messages) 6 messages, 3 authors, 2024-11-26

Re: [PATCH v2 1/4] powerpc/perf: Add perf interface to expose vpa counters

From: Gautam Menghani <hidden>
Date: 2024-11-26 07:36:29
Also in: lkml

On Mon, Nov 18, 2024 at 05:11:11PM +0530, Kajol Jain wrote:
quoted hunk ↗ jump to hunk
To support performance measurement for KVM on PowerVM(KoP)
feature, PowerVM hypervisor has added couple of new software
counters in Virtual Process Area(VPA) of the partition.

Commit e1f288d2f9c69 ("KVM: PPC: Book3S HV nestedv2: Add
support for reading VPA counters for pseries guests")
have updated the paca fields with corresponding changes.

Proposed perf interface is to expose these new software
counters for monitoring of context switch latencies and
runtime aggregate. Perf interface driver is called
"vpa_pmu" and it has dependency on KVM and perf, hence
added new config called "VPA_PMU" which depends on
"CONFIG_KVM_BOOK3S_64_HV" and "CONFIG_HV_PERF_CTRS".
Since, kvm and kvm_host are currently compiled as built-in
modules, this perf interface takes the same path and
registered as a module.

vpa_pmu perf interface needs access to some of the kvm
functions and structures like kvmhv_get_l2_counters_status(),
hence kvm_book3s_64.h and kvm_ppc.h are included.
Below are the events added to monitor KoP:

  vpa_pmu/l1_to_l2_lat/
  vpa_pmu/l2_to_l1_lat/
  vpa_pmu/l2_runtime_agg/

and vpa_pmu driver supports only per-cpu monitoring with this patch.
Example usage:

[command]# perf stat -e vpa_pmu/l1_to_l2_lat/ -a -I 1000
     1.001017682            727,200      vpa_pmu/l1_to_l2_lat/
     2.003540491          1,118,824      vpa_pmu/l1_to_l2_lat/
     3.005699458          1,919,726      vpa_pmu/l1_to_l2_lat/
     4.007827011          2,364,630      vpa_pmu/l1_to_l2_lat/

Signed-off-by: Kajol Jain <redacted>
Co-developed-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
---
Changelog:

v1 -> v2
- Rebase the patch on top of kvm typo fix patch:
  https://github.com/linuxppc/linux/commit/590d2f9347f7974d7954400e5d937672fd844a8b
- Fix the config check reported by kernel test robot:
  https://lore.kernel.org/oe-kbuild-all/202411171117.Eq9JtACb-lkp@intel.com/ (local)

 arch/powerpc/include/asm/kvm_book3s_64.h |   3 +
 arch/powerpc/kvm/book3s_hv.c             |  19 +++
 arch/powerpc/perf/Makefile               |   2 +
 arch/powerpc/perf/vpa-pmu.c              | 197 +++++++++++++++++++++++
 arch/powerpc/platforms/pseries/Kconfig   |  14 ++
 5 files changed, 235 insertions(+)
 create mode 100644 arch/powerpc/perf/vpa-pmu.c
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 11065313d4c1..f620e3126d68 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -688,6 +688,9 @@ int kvmhv_counters_tracepoint_regfunc(void);
 void kvmhv_counters_tracepoint_unregfunc(void);
 int kvmhv_get_l2_counters_status(void);
 void kvmhv_set_l2_counters_status(int cpu, bool status);
+u64 kvmhv_get_l1_to_l2_cs_time(void);
+u64 kvmhv_get_l2_to_l1_cs_time(void);
+u64 kvmhv_get_l2_runtime_agg(void);
 
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d575f7c7ab38..e618533dfc4b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4153,6 +4153,7 @@ void kvmhv_set_l2_counters_status(int cpu, bool status)
 	else
 		lppaca_of(cpu).l2_counters_enable = 0;
 }
+EXPORT_SYMBOL(kvmhv_set_l2_counters_status);
 
 int kvmhv_counters_tracepoint_regfunc(void)
 {
@@ -4192,6 +4193,24 @@ static void do_trace_nested_cs_time(struct kvm_vcpu *vcpu)
 	*l2_runtime_agg_ptr = l2_runtime_ns;
 }
 
+u64 kvmhv_get_l1_to_l2_cs_time(void)
+{
+	return tb_to_ns(be64_to_cpu(get_lppaca()->l1_to_l2_cs_tb));
+}
+EXPORT_SYMBOL(kvmhv_get_l1_to_l2_cs_time);
+
+u64 kvmhv_get_l2_to_l1_cs_time(void)
+{
+	return tb_to_ns(be64_to_cpu(get_lppaca()->l2_to_l1_cs_tb));
+}
+EXPORT_SYMBOL(kvmhv_get_l2_to_l1_cs_time);
+
+u64 kvmhv_get_l2_runtime_agg(void)
+{
+	return tb_to_ns(be64_to_cpu(get_lppaca()->l2_runtime_tb));
+}
+EXPORT_SYMBOL(kvmhv_get_l2_runtime_agg);
+
 #else
 int kvmhv_get_l2_counters_status(void)
 {
diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile
index 4f53d0b97539..ac2cf58d62db 100644
--- a/arch/powerpc/perf/Makefile
+++ b/arch/powerpc/perf/Makefile
@@ -16,6 +16,8 @@ obj-$(CONFIG_FSL_EMB_PERF_EVENT_E500) += e500-pmu.o e6500-pmu.o
 
 obj-$(CONFIG_HV_PERF_CTRS) += hv-24x7.o hv-gpci.o hv-common.o
 
+obj-$(CONFIG_VPA_PMU) += vpa-pmu.o
Do we need a new config option for this? I see you need 2 dependencies
for vpa_pmu, so can we do the following instead?

obj-$(CONFIG_KVM_BOOK3S_64_HV) += vpa-pmu.o

and then in the init func of vpa_pmu, we can add the check for HV_PERF_CTRS

static int __init pseries_vpa_pmu_init(void)
{
	<snip>

	if (!firmware_has_feature(FW_FEATURE_LPAR) || is_kvm_guest() ||
		!IS_ENABLED(CONFIG_HV_PERF_CTRS))
		return -ENODEV;

Thoughts?
quoted hunk ↗ jump to hunk
+
 obj-$(CONFIG_PPC_8xx) += 8xx-pmu.o
 
 obj-$(CONFIG_PPC64)		+= $(obj64-y)
diff --git a/arch/powerpc/perf/vpa-pmu.c b/arch/powerpc/perf/vpa-pmu.c
new file mode 100644
index 000000000000..2c785eee2f71
--- /dev/null
+++ b/arch/powerpc/perf/vpa-pmu.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Performance monitoring support for Virtual Processor Area(VPA) based counters
+ *
+ * Copyright (C) 2024 IBM Corporation
+ */
+#define pr_fmt(fmt) "vpa_pmu: " fmt
+
+#include <linux/module.h>
+#include <linux/perf_event.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s_64.h>
+
+#define MODULE_VERS "1.0"
+#define MODULE_NAME "pseries_vpa_pmu"
+
+#define EVENT(_name, _code)     enum{_name = _code}
+
+#define VPA_PMU_EVENT_VAR(_id)  event_attr_##_id
+#define VPA_PMU_EVENT_PTR(_id)  (&event_attr_##_id.attr.attr)
+
+static ssize_t vpa_pmu_events_sysfs_show(struct device *dev,
+				 struct device_attribute *attr, char *page)
+{
+	struct perf_pmu_events_attr *pmu_attr;
+
+	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+
+	return sprintf(page, "event=0x%02llx\n", pmu_attr->id);
+}
+
+#define VPA_PMU_EVENT_ATTR(_name, _id)				\
+	PMU_EVENT_ATTR(_name, VPA_PMU_EVENT_VAR(_id), _id,	\
+			vpa_pmu_events_sysfs_show)
+
+EVENT(L1_TO_L2_CS_LAT,	0x1);
+EVENT(L2_TO_L1_CS_LAT,	0x2);
+EVENT(L2_RUNTIME_AGG,	0x3);
+
+VPA_PMU_EVENT_ATTR(l1_to_l2_lat,  L1_TO_L2_CS_LAT);
+VPA_PMU_EVENT_ATTR(l2_to_l1_lat,  L2_TO_L1_CS_LAT);
+VPA_PMU_EVENT_ATTR(l2_runtime_agg, L2_RUNTIME_AGG);
+
+static struct attribute *vpa_pmu_events_attr[] = {
+	VPA_PMU_EVENT_PTR(L1_TO_L2_CS_LAT),
+	VPA_PMU_EVENT_PTR(L2_TO_L1_CS_LAT),
+	VPA_PMU_EVENT_PTR(L2_RUNTIME_AGG),
+	NULL
+};
+
+static const struct attribute_group vpa_pmu_events_group = {
+	.name = "events",
+	.attrs = vpa_pmu_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-31");
+static struct attribute *vpa_pmu_format_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group vpa_pmu_format_group = {
+	.name = "format",
+	.attrs = vpa_pmu_format_attr,
+};
+
+static const struct attribute_group *vpa_pmu_attr_groups[] = {
+	&vpa_pmu_events_group,
+	&vpa_pmu_format_group,
+	NULL
+};
+
+static int vpa_pmu_event_init(struct perf_event *event)
+{
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	/* it does not support event sampling mode */
+	if (is_sampling_event(event))
+		return -EOPNOTSUPP;
+
+	/* no branch sampling */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
+	/* Invalid event code */
+	if ((event->attr.config <= 0) || (event->attr.config > 3))
+		return -EINVAL;
+
+	return 0;
+}
+
+static unsigned long get_counter_data(struct perf_event *event)
+{
+	unsigned int config = event->attr.config;
+	u64 data;
+
+	switch (config) {
+	case L1_TO_L2_CS_LAT:
+		data = kvmhv_get_l1_to_l2_cs_time();
+		break;
+	case L2_TO_L1_CS_LAT:
+		data = kvmhv_get_l2_to_l1_cs_time();
+		break;
+	case L2_RUNTIME_AGG:
+		data = kvmhv_get_l2_runtime_agg();
+		break;
+	default:
+		data = 0;
+		break;
+	}
+
+	return data;
+}
+
+static int vpa_pmu_add(struct perf_event *event, int flags)
+{
+	u64 data;
+
+	kvmhv_set_l2_counters_status(
+			smp_processor_id(), true);
+
+	data = get_counter_data(event);
+	local64_set(&event->hw.prev_count, data);
+
+	return 0;
+}
+
+static void vpa_pmu_read(struct perf_event *event)
+{
+	u64 prev_data, new_data, final_data;
+
+	prev_data = local64_read(&event->hw.prev_count);
+	new_data = get_counter_data(event);
+	final_data = new_data - prev_data;
+
+	local64_add(final_data, &event->count);
+}
+
+static void vpa_pmu_del(struct perf_event *event, int flags)
+{
+	vpa_pmu_read(event);
+
+	/*
+	 * Disable vpa counter accumulation
+	 */
+	kvmhv_set_l2_counters_status(
+			smp_processor_id(), false);
+}
+
This won't work well with the kvm_hv:kvmppc_vcpu_stats tracepoint.
Consider the below scenario:

1. I start recording data with the kvmppc_vcpu_stats tracepoint (with trace-cmd)
and while the data is being recorded, I start using the vpa_pmu driver as well.
2. I now stop the vpa_pmu driver. This disables the l2_counters_enable
counter in lppaca. (The VPA flag can be disabled for 1 or more cpus
depending on the pid/tid options specified)
3. When I hit ctrl^c on the trace-cmd capture, I see partial data with
'trace-cmd report'.

To get around this, we can use the fact that the 'l2_counters_enable'
flag in lppaca supports all non-zero values. So we can see if we can use
that as a counter instead of just using 0/1 to enable/disable recording
of data. This may need some brainstorming to get right.

I see the patches have been pulled in already, but I think we need to
consider the above points.

Thanks,
Gautam
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help