[PATCH 3/4] rasdaemon: Enumerate memory on noncpu nodes
From: Naveen Krishna Chatradhi <hidden>
Date: 2021-08-10 17:23:01
Subsystem:
the rest · Maintainer:
Linus Torvalds
From: Muralidhara M K <redacted> Newer heterogeneous systems from AMD have GPU nodes (with HBM2 memory) connected to the CPUs via xGMI links. The node ID information is available in the MCA_IPID[47:44](InstanceIdHI) register. The UMC PHYs on Aldebaran nodes are enumerated as csrows. The UMC channels connected to HBMs are enumerated as ranks. Signed-off-by: Muralidhara M K <redacted> Signed-off-by: Naveen Krishna Chatradhi <redacted> --- mce-amd-smca.c | 47 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-)
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 3c346f4..9381aa1 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c@@ -78,6 +78,16 @@ enum smca_bank_types { /* Maximum number of MCA banks per CPU. */ #define MAX_NR_BANKS 64 +/* + * On newer heterogeneous systems the data fabrics of the CPUs and GPUs + * are connected directly via custom links, as is done with + * 2 socket CPU systems and also within a socket for Multi-chip Module + * (MCM) CPUs like Naples. + * The first GPU node(non cpu) is assumed to have an "AMD Node ID" value + * of 8 (the second GPU node has 9, etc.). + */ +#define NONCPU_NODE_INDEX 8 + /* SMCA Extended error strings */ /* Load Store */ static const char * const smca_ls_mce_desc[] = {
@@ -531,6 +541,26 @@ static int find_umc_channel(struct mce_event *e) { return EXTRACT(e->ipid, 0, 31) >> 20; } + +/* + * The HBM memory managed by the UMCCH of the noncpu node + * can be calculated based on bits [15:12] of IPID + */ +static int find_hbm_channel(struct mce_event *e) +{ + int umc, tmp; + + umc = EXTRACT(e->ipid, 0, 31) >> 20; + + /* + * The HBM channel managed by the UMC of the noncpu node + * can be calculated based on bits [15:12] of IPID as follows + */ + tmp = ((e->ipid >> 12) & 0xf); + + return (umc % 2) ? tmp + 4 : tmp; +} + /* Decode extended errors according to Scalable MCA specification */ static void decode_smca_error(struct mce_event *e) {
@@ -539,6 +569,7 @@ static void decode_smca_error(struct mce_event *e) unsigned short xec = (e->status >> 16) & 0x3f; const struct smca_hwid *s_hwid; uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63); + uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47); unsigned int csrow = -1, channel = -1; unsigned int i;
@@ -548,14 +579,16 @@ static void decode_smca_error(struct mce_event *e) bank_type = s_hwid->bank_type; break; } + if (mcatype_instancehi >= NONCPU_NODE_INDEX) + bank_type = SMCA_UMC_V2; } - if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) { + if (i >= MAX_NR_BANKS) { strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID"); return; } - if (bank_type >= N_SMCA_BANK_TYPES) { + if (bank_type >= MAX_NR_BANKS) { strcpy(e->mcastatus_msg, "Don't know how to decode this bank"); return; }
@@ -580,6 +613,16 @@ static void decode_smca_error(struct mce_event *e) mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d", channel, csrow); } + + if (bank_type == SMCA_UMC_V2 && xec == 0) { + /* The UMCPHY is reported as csrow in case of noncpu nodes */ + csrow = find_umc_channel(e) / 2; + /* UMCCH is managing the HBM memory */ + channel = find_hbm_channel(e); + mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d", + channel, csrow); + } + } int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)
--
2.17.1