[RFC,0/8] EDAC, mce_amd: Add a tracepoint for the decoded error
From: Borislav Petkov <bp@alien8.de>
Date: 2017-07-28 15:09:36
Also in:
lkml
Subsystem:
edac-amd64, edac-core, the rest, tracing, x86 architecture (32-bit and 64-bit) · Maintainers:
Yazen Ghannam, Borislav Petkov, Tony Luck, Linus Torvalds, Steven Rostedt, Masami Hiramatsu, Thomas Gleixner, Ingo Molnar, Dave Hansen
Ok,
here's a working version. It looks pretty straight-forward (to me, at
least) and it does what it is supposed to when I inject an MCE:
# tracer: nop
#
# _-----=> irqs-off
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / delay
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
# | | | |||| | |
kworker/1:2-76 [001] .... 316.852022: mce_record: CPU: 1, MCGc/s: 100010a/0, MC5: dc00410000020f0f, IPID: 0000000000000000, ADDR/MISC/SYND: 0000000056071033/0000000000000000/0000000000000000, RIP: 00:<0000000000000000>, TSC: 0, PROCESSOR: 2:600f20, TIME: 1501261272, SOCKET: 1, APIC: 1 MC5 Error: AG payload array parity error.
cache level: L3/GEN, mem/io: GEN, mem-tx: GEN, part-proc: GEN (timed out)
kworker/1:2-76 [001] .... 328.372055: mce_record: CPU: 1, MCGc/s: 100010a/0, MC5: dc00410000020f0f, IPID: 0000000000000000, ADDR/MISC/SYND: 0000000056071033/0000000000000000/0000000000000000, RIP: 00:<0000000000000000>, TSC: 0, PROCESSOR: 2:600f20, TIME: 1501261283, SOCKET: 1, APIC: 1 MC5 Error: AG payload array parity error.
cache level: L3/GEN, mem/io: GEN, mem-tx: GEN, part-proc: GEN (timed out)
diff ontop of the current pile:
---diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 181264989db5..ba4c99ace544 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h@@ -146,6 +146,7 @@ struct mca_config { bool ser; bool recovery; bool bios_cmci_threshold; + bool has_decoder; u8 banks; s8 bootlog; int tolerant;
@@ -195,7 +196,8 @@ enum mce_notifier_prios { MCE_PRIO_SRAO = INT_MAX - 1, MCE_PRIO_EXTLOG = INT_MAX - 2, MCE_PRIO_NFIT = INT_MAX - 3, - MCE_PRIO_EDAC = INT_MAX - 4, + MCE_PRIO_DECODER = INT_MAX - 4, + MCE_PRIO_EDAC = INT_MAX - 5, MCE_PRIO_MCELOG = 1, MCE_PRIO_LOWEST = 0, };
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 6dde0497efc7..9486a2ca6556 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c@@ -162,6 +162,9 @@ void mce_register_decode_chain(struct notifier_block *nb) atomic_inc(&num_notifiers); + if (nb->priority == MCE_PRIO_DECODER) + mca_cfg.has_decoder = true; + blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_register_decode_chain);
@@ -556,7 +559,8 @@ static int mce_first_notifier(struct notifier_block *nb, unsigned long val, return NOTIFY_STOP; /* Emit the trace record: */ - trace_mce_record(m); + if (!mca_cfg.has_decoder) + trace_mce_record(m, ""); set_bit(0, &mce_need_notify);
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index fd4a615200a8..d0b28edd72b8 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c@@ -7,6 +7,8 @@ #include <asm/cpu.h> +#include <trace/events/mce.h> + #include "mce_amd.h" static struct amd_decoder_ops *fam_ops;
@@ -1082,7 +1084,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) amd_decode_err_code(m->status & 0xffff); if (ras_userspace_consumers()) { - trace_mce_decode(sb.buffer); + trace_mce_record(m, sb.buffer); } else { char *l;
@@ -1097,7 +1099,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) static struct notifier_block amd_mce_dec_nb = { .notifier_call = amd_decode_mce, - .priority = MCE_PRIO_EDAC, + .priority = MCE_PRIO_DECODER, }; static int __init mce_amd_init(void)
diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h
index 70f02149808c..82ff4014fab1 100644
--- a/include/trace/events/mce.h
+++ b/include/trace/events/mce.h@@ -10,9 +10,9 @@ TRACE_EVENT(mce_record, - TP_PROTO(struct mce *m), + TP_PROTO(struct mce *m, const char *param_str), - TP_ARGS(m), + TP_ARGS(m, param_str), TP_STRUCT__entry( __field( u64, mcgcap )
@@ -32,6 +32,7 @@ TRACE_EVENT(mce_record, __field( u8, cs ) __field( u8, bank ) __field( u8, cpuvendor ) + __string( str, param_str ) ), TP_fast_assign(
@@ -52,9 +53,10 @@ TRACE_EVENT(mce_record, __entry->cs = m->cs; __entry->bank = m->bank; __entry->cpuvendor = m->cpuvendor; + __assign_str(str, param_str); ), - TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR/MISC/SYND: %016Lx/%016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x", + TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR/MISC/SYND: %016Lx/%016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x %s", __entry->cpu, __entry->mcgcap, __entry->mcgstatus, __entry->bank, __entry->status,
@@ -65,7 +67,8 @@ TRACE_EVENT(mce_record, __entry->cpuvendor, __entry->cpuid, __entry->walltime, __entry->socketid, - __entry->apicid) + __entry->apicid, + __get_str(str)) ); #endif /* _TRACE_MCE_H */