--- v2
+++ v3
@@ -1,62 +1,188 @@
-r3 in MCE handler.
-
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
-During Machine Check interrupt on pseries platform, register r3 points
-RTAS extended event log passed by hypervisor. Since hypervisor uses r3
-to pass pointer to rtas log, it stores the original r3 value at the
-start of the memory (first 8 bytes) pointed by r3. Since hypervisor
-stores this info and rtas log is in BE format, linux should make
-sure to restore r3 value in correct endian format.
+Extract the MCE error details from the RTAS extended log and display them
+on the console.
-Without this patch when MCE handler, after recovery, returns to code that
-that caused the MCE may end up with Data SLB access interrupt for invalid
-address followed by kernel panic or hang.
+With this patch you should now see MCE logs like the ones below:
-[ 62.878965] Severe Machine check interrupt [Recovered]
-[ 62.878968] NIP [d00000000ca301b8]: init_module+0x1b8/0x338 [bork_kernel]
-[ 62.878969] Initiator: CPU
-[ 62.878970] Error type: SLB [Multihit]
-[ 62.878971] Effective address: d00000000ca70000
-cpu 0xa: Vector: 380 (Data SLB Access) at [c0000000fc7775b0]
- pc: c0000000009694c0: vsnprintf+0x80/0x480
- lr: c0000000009698e0: vscnprintf+0x20/0x60
- sp: c0000000fc777830
- msr: 8000000002009033
- dar: a803a30c000000d0
- current = 0xc00000000bc9ef00
- paca = 0xc00000001eca5c00 softe: 3 irq_happened: 0x01
- pid = 8860, comm = insmod
-[c0000000fc7778b0] c0000000009698e0 vscnprintf+0x20/0x60
-[c0000000fc7778e0] c00000000016b6c4 vprintk_emit+0xb4/0x4b0
-[c0000000fc777960] c00000000016d40c vprintk_func+0x5c/0xd0
-[c0000000fc777980] c00000000016cbb4 printk+0x38/0x4c
-[c0000000fc7779a0] d00000000ca301c0 init_module+0x1c0/0x338 [bork_kernel]
-[c0000000fc777a40] c00000000000d9c4 do_one_initcall+0x54/0x230
-[c0000000fc777b00] c0000000001b3b74 do_init_module+0x8c/0x248
-[c0000000fc777b90] c0000000001b2478 load_module+0x12b8/0x15b0
-[c0000000fc777d30] c0000000001b29e8 sys_finit_module+0xa8/0x110
-[c0000000fc777e30] c00000000000b204 system_call+0x58/0x6c
---- Exception: c00 (System Call) at 00007fff8bda0644
-SP (7fffdfbfe980) is in userspace
-
-This patch fixes this issue.
+[ 142.371818] Severe Machine check interrupt [Recovered]
+[ 142.371822] NIP [d00000000ca301b8]: init_module+0x1b8/0x338 [bork_kernel]
+[ 142.371822] Initiator: CPU
+[ 142.371823] Error type: SLB [Multihit]
+[ 142.371824] Effective address: d00000000ca70000
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
- arch/powerpc/platforms/pseries/ras.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
+ arch/powerpc/include/asm/rtas.h | 5 +
+ arch/powerpc/platforms/pseries/ras.c | 128 +++++++++++++++++++++++++++++++++-
+ 2 files changed, 131 insertions(+), 2 deletions(-)
+diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
+index 3f2fba7ef23b..8100a95c133a 100644
+--- a/arch/powerpc/include/asm/rtas.h
++++ b/arch/powerpc/include/asm/rtas.h
+@@ -190,6 +190,11 @@ static inline uint8_t rtas_error_extended(const struct rtas_error_log *elog)
+ return (elog->byte1 & 0x04) >> 2;
+ }
+
++static inline uint8_t rtas_error_initiator(const struct rtas_error_log *elog)
++{
++ return (elog->byte2 & 0xf0) >> 4;
++}
++
+ #define rtas_error_type(x) ((x)->byte3)
+
+ static inline
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
-index afdf05444bc2..cd9446980092 100644
+index e56759d92356..cd9446980092 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
-@@ -360,7 +360,7 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
+@@ -422,7 +422,130 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
+ return 0; /* need to perform reset */
+ }
+
+-static int mce_handle_error(struct rtas_error_log *errp)
++#define VAL_TO_STRING(ar, val) (((val) < ARRAY_SIZE(ar)) ? (ar)[(val)] : "Unknown")
++
++static void pseries_print_mce_info(struct pt_regs *regs,
++ struct rtas_error_log *errp, int disposition)
++{
++ const char *level, *sevstr;
++ struct pseries_errorlog *pseries_log;
++ struct pseries_mc_errorlog *mce_log;
++ uint8_t error_type, err_sub_type;
++ uint8_t initiator = rtas_error_initiator(errp);
++ uint64_t addr;
++
++ static const char * const initiators[] = {
++ "Unknown",
++ "CPU",
++ "PCI",
++ "ISA",
++ "Memory",
++ "Power Mgmt",
++ };
++ static const char * const mc_err_types[] = {
++ "UE",
++ "SLB",
++ "ERAT",
++ "TLB",
++ "D-Cache",
++ "Unknown",
++ "I-Cache",
++ };
++ static const char * const mc_ue_types[] = {
++ "Indeterminate",
++ "Instruction fetch",
++ "Page table walk ifetch",
++ "Load/Store",
++ "Page table walk Load/Store",
++ };
++
++ /* SLB sub errors valid values are 0x0, 0x1, 0x2 */
++ static const char * const mc_slb_types[] = {
++ "Parity",
++ "Multihit",
++ "Indeterminate",
++ };
++
++ /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */
++ static const char * const mc_soft_types[] = {
++ "Unknown",
++ "Parity",
++ "Multihit",
++ "Indeterminate",
++ };
++
++ pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
++ if (pseries_log == NULL)
++ return;
++
++ mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
++
++ error_type = rtas_mc_error_type(mce_log);
++ err_sub_type = rtas_mc_error_sub_type(mce_log);
++
++ switch (rtas_error_severity(errp)) {
++ case RTAS_SEVERITY_NO_ERROR:
++ level = KERN_INFO;
++ sevstr = "Harmless";
++ break;
++ case RTAS_SEVERITY_WARNING:
++ level = KERN_WARNING;
++ sevstr = "";
++ break;
++ case RTAS_SEVERITY_ERROR:
++ case RTAS_SEVERITY_ERROR_SYNC:
++ level = KERN_ERR;
++ sevstr = "Severe";
++ break;
++ case RTAS_SEVERITY_FATAL:
++ default:
++ level = KERN_ERR;
++ sevstr = "Fatal";
++ break;
++ }
++
++ printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
++ disposition == RTAS_DISP_FULLY_RECOVERED ?
++ "Recovered" : "Not recovered");
++ if (user_mode(regs)) {
++ printk("%s NIP: [%016lx] PID: %d Comm: %s\n", level,
++ regs->nip, current->pid, current->comm);
++ } else {
++ printk("%s NIP [%016lx]: %pS\n", level, regs->nip,
++ (void *)regs->nip);
++ }
++ printk("%s Initiator: %s\n", level,
++ VAL_TO_STRING(initiators, initiator));
++
++ switch (error_type) {
++ case PSERIES_MC_ERROR_TYPE_UE:
++ printk("%s Error type: %s [%s]\n", level,
++ VAL_TO_STRING(mc_err_types, error_type),
++ VAL_TO_STRING(mc_ue_types, err_sub_type));
++ break;
++ case PSERIES_MC_ERROR_TYPE_SLB:
++ printk("%s Error type: %s [%s]\n", level,
++ VAL_TO_STRING(mc_err_types, error_type),
++ VAL_TO_STRING(mc_slb_types, err_sub_type));
++ break;
++ case PSERIES_MC_ERROR_TYPE_ERAT:
++ case PSERIES_MC_ERROR_TYPE_TLB:
++ printk("%s Error type: %s [%s]\n", level,
++ VAL_TO_STRING(mc_err_types, error_type),
++ VAL_TO_STRING(mc_soft_types, err_sub_type));
++ break;
++ default:
++ printk("%s Error type: %s\n", level,
++ VAL_TO_STRING(mc_err_types, error_type));
++ break;
++ }
++
++ addr = rtas_mc_get_effective_addr(mce_log);
++ if (addr)
++ printk("%s Effective address: %016llx\n", level, addr);
++}
++
++static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
+ {
+ struct pseries_errorlog *pseries_log;
+ struct pseries_mc_errorlog *mce_log;
+@@ -442,6 +565,7 @@ static int mce_handle_error(struct rtas_error_log *errp)
+ slb_flush_and_rebolt();
+ disposition = RTAS_DISP_FULLY_RECOVERED;
}
++ pseries_print_mce_info(regs, errp, disposition);
- savep = __va(regs->gpr[3]);
-- regs->gpr[3] = savep[0]; /* restore original r3 */
-+ regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */
+ out:
+ return disposition;
+@@ -461,7 +585,7 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
+ int recovered = 0;
+ int disposition;
- /* If it isn't an extended log we can use the per cpu 64bit buffer */
- h = (struct rtas_error_log *)&savep[1];
+- disposition = mce_handle_error(err);
++ disposition = mce_handle_error(regs, err);
+
+ if (!(regs->msr & MSR_RI)) {
+ /* If MSR_RI isn't set, we cannot recover */