[PATCH v4 3/3] drm/xe/xe_ras: Add error-event support for CRI
From: Riana Tauro <hidden>
Date: 2026-07-01 09:45:02
Also in:
dri-devel, intel-xe
Subsystem:
drm drivers, intel drm xe driver (lunar lake and newer), the rest · Maintainers:
David Airlie, Simona Vetter, Matthew Brost, Thomas Hellström, Rodrigo Vivi, Linus Torvalds
Add error-event support for correctable errors in CRI. On receiving an
interrupt, report an error event to userspace for every component that
has crossed its threshold.
Cc: Michal Wajdeczko <redacted>
Signed-off-by: Riana Tauro <redacted>
---
v2: add warnings for unexpected values from the system controller (Michal)
    send an event at most once per component for each interrupt (Raag)
    use correct parameters for xe_ras_get_counter() (Sashiko)
---
drivers/gpu/drm/xe/xe_ras.c | 75 +++++++++++++++++++++++++++++++++++++
1 file changed, 75 insertions(+)
diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c
index 44f4e1a3455b..b71d51285954 100644
--- a/drivers/gpu/drm/xe/xe_ras.c
+++ b/drivers/gpu/drm/xe/xe_ras.c@@ -77,6 +77,18 @@ static u8 drm_to_xe_ras_severity(u8 severity) } } +static u8 xe_to_drm_ras_severity(u8 severity) +{ + switch (severity) { + case XE_RAS_SEV_CORRECTABLE: + return DRM_XE_RAS_ERR_SEV_CORRECTABLE; + case XE_RAS_SEV_UNCORRECTABLE: + return DRM_XE_RAS_ERR_SEV_UNCORRECTABLE; + default: + return DRM_XE_RAS_ERR_SEV_MAX; + } +} + static u8 drm_to_xe_ras_component(u8 component) { switch (component) {
@@ -95,6 +107,24 @@ static u8 drm_to_xe_ras_component(u8 component) } } +static u8 xe_to_drm_ras_component(u8 component) +{ + switch (component) { + case XE_RAS_COMP_DEVICE_MEMORY: + return DRM_XE_RAS_ERR_COMP_DEVICE_MEMORY; + case XE_RAS_COMP_CORE_COMPUTE: + return DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; + case XE_RAS_COMP_PCIE: + return DRM_XE_RAS_ERR_COMP_PCIE; + case XE_RAS_COMP_FABRIC: + return DRM_XE_RAS_ERR_COMP_FABRIC; + case XE_RAS_COMP_SOC_INTERNAL: + return DRM_XE_RAS_ERR_COMP_SOC_INTERNAL; + default: + return DRM_XE_RAS_ERR_COMP_MAX; + } +} + static int ras_status_to_errno(u32 status) { switch (status) {
@@ -131,14 +161,41 @@ static inline const char *comp_to_str(u8 component) return xe_ras_components[component]; } +static void ras_send_error_event(struct xe_device *xe, u8 severity, u8 component) +{ + u8 drm_severity, drm_component; + u32 value; + int ret; + + drm_severity = xe_to_drm_ras_severity(severity); + if (drm_severity == DRM_XE_RAS_ERR_SEV_MAX) { + xe_warn(xe, "sysctrl: unexpected severity %u\n", severity); + return; + } + + drm_component = xe_to_drm_ras_component(component); + if (drm_component == DRM_XE_RAS_ERR_COMP_MAX) { + xe_warn(xe, "sysctrl: unexpected component %u\n", component); + return; + } + + ret = xe_ras_get_counter(xe, drm_severity, drm_component, &value); + if (ret) + return; + + xe_drm_ras_event(xe, drm_component, drm_severity, value, GFP_KERNEL); +} + void xe_ras_counter_threshold_crossed(struct xe_device *xe, struct xe_sysctrl_event_response *response) { struct xe_ras_threshold_crossed *pending = (void *)&response->data; struct xe_ras_error_class *errors = pending->counters; u32 id, ncounters = pending->ncounters; + u8 sent = 0; BUILD_BUG_ON(sizeof(response->data) < sizeof(*pending)); + BUILD_BUG_ON(XE_RAS_COMP_MAX > (BITS_PER_BYTE * sizeof(sent))); xe_device_assert_mem_access(xe); if (!ncounters || ncounters > XE_RAS_NUM_COUNTERS)
@@ -154,6 +211,24 @@ void xe_ras_counter_threshold_crossed(struct xe_device *xe, xe_warn(xe, "[RAS]: %s %s detected\n", comp_to_str(component), sev_to_str(severity)); + + if (severity != XE_RAS_SEV_CORRECTABLE) { + xe_warn(xe, "sysctrl: unexpected severity %s (%u)\n", sev_to_str(severity), + severity); + continue; + } + + if (component >= XE_RAS_COMP_MAX) { + xe_warn(xe, "sysctrl: unexpected component %u\n", component); + continue; + } + + /* Send event once per component */ + if (sent & BIT(component)) + continue; + sent |= BIT(component); + + ras_send_error_event(xe, severity, component); } }
--
2.47.1