Re: [PATCH v2 11/11] RAS: add DeviceTree firmware-first CPER provider
From: Himanshu Chauhan <hidden>
Date: 2026-02-26 07:02:01
Also in:
linux-acpi, linux-devicetree, linux-doc
On Fri, Feb 20, 2026 at 7:13 PM Ahmed Tiba [off-list ref] wrote:
quoted hunk ↗ jump to hunk
Add a DeviceTree firmware-first CPER provider that reuses the shared GHES helpers, wire it into the RAS Kconfig/Makefile and document it in the admin guide. Update MAINTAINERS now that the driver exists. Signed-off-by: Ahmed Tiba <redacted> --- Documentation/admin-guide/RAS/main.rst | 18 +++ MAINTAINERS | 1 + drivers/acpi/apei/apei-internal.h | 10 +- drivers/acpi/apei/ghes_cper.c | 2 + drivers/ras/Kconfig | 12 ++ drivers/ras/Makefile | 1 + drivers/ras/esource-dt.c | 264 +++++++++++++++++++++++++++++++++ include/acpi/ghes_cper.h | 9 ++ 8 files changed, 308 insertions(+), 9 deletions(-)diff --git a/Documentation/admin-guide/RAS/main.rst b/Documentation/admin-guide/RAS/main.rst index 5a45db32c49b..4ffabaaeabb1 100644 --- a/Documentation/admin-guide/RAS/main.rst +++ b/Documentation/admin-guide/RAS/main.rst@@ -205,6 +205,24 @@ Architecture (MCA)\ [#f3]_. .. [#f3] For more details about the Machine Check Architecture (MCA), please read Documentation/arch/x86/x86_64/machinecheck.rst at the Kernel tree. +Firmware-first CPER via DeviceTree +---------------------------------- + +Some systems expose Common Platform Error Record (CPER) data +via DeviceTree instead of ACPI HEST tables. +Enable ``CONFIG_RAS_ESOURCE_DT`` to build the ``drivers/ras/esource-dt.c`` +driver and describe the CPER error source buffer with the +``Documentation/devicetree/bindings/firmware/arm,ras-ffh.yaml`` binding. +The driver reuses the GHES CPER helper object in +``drivers/acpi/apei/ghes_cper.c`` so the logging, notifier chains, and +memory failure handling match the ACPI GHES behaviour even when +ACPI is disabled. + +Once a platform describes a firmware-first provider, both ACPI GHES and the +DeviceTree driver reuse the same code paths. This keeps the behaviour +consistent regardless of whether the error source is described via ACPI +tables or DeviceTree. 
+ EDAC - Error Detection And Correction *************************************diff --git a/MAINTAINERS b/MAINTAINERS index 47db7877b485..fa6113b482b7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS@@ -22031,6 +22031,7 @@ RAS ERROR STATUS M: Ahmed Tiba <ahmed.tiba@arm.com> S: Maintained F: Documentation/devicetree/bindings/firmware/arm,ras-ffh.yaml +F: drivers/ras/esource-dt.c RAS INFRASTRUCTURE M: Tony Luck <tony.luck@intel.com>diff --git a/drivers/acpi/apei/apei-internal.h b/drivers/acpi/apei/apei-internal.h index 77c10a7a7a9f..c16ac541f15b 100644 --- a/drivers/acpi/apei/apei-internal.h +++ b/drivers/acpi/apei/apei-internal.h@@ -8,6 +8,7 @@ #define APEI_INTERNAL_H #include <linux/acpi.h> +#include <acpi/ghes_cper.h> struct apei_exec_context;@@ -120,15 +121,6 @@ int apei_exec_collect_resources(struct apei_exec_context *ctx, struct dentry; struct dentry *apei_get_debugfs_dir(void); -static inline u32 cper_estatus_len(struct acpi_hest_generic_status *estatus) -{ - if (estatus->raw_data_length) - return estatus->raw_data_offset + \ - estatus->raw_data_length; - else - return sizeof(*estatus) + estatus->data_length; -} - int apei_osc_setup(void); int einj_get_available_error_type(u32 *type, int einj_action);diff --git a/drivers/acpi/apei/ghes_cper.c b/drivers/acpi/apei/ghes_cper.c index 29b790160e91..9b2d1b8cf9f4 100644 --- a/drivers/acpi/apei/ghes_cper.c +++ b/drivers/acpi/apei/ghes_cper.c@@ -42,7 +42,9 @@ #include <asm/fixmap.h> #include <asm/tlbflush.h> +#ifdef CONFIG_ACPI_APEI #include "apei-internal.h" +#endif ATOMIC_NOTIFIER_HEAD(ghes_report_chain);diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig index fc4f4bb94a4c..ea6d96713020 100644 --- a/drivers/ras/Kconfig +++ b/drivers/ras/Kconfig@@ -34,6 +34,18 @@ if RAS source "arch/x86/ras/Kconfig" source "drivers/ras/amd/atl/Kconfig" +config RAS_ESOURCE_DT + bool "DeviceTree firmware-first CPER error source block provider" + depends on OF + depends on ARM64 + select GHES_CPER_HELPERS + help + Enable support for 
firmware-first Common Platform Error Record (CPER) + error source block providers that are described via DeviceTree + instead of ACPI HEST tables. The driver reuses the existing GHES + CPER helpers so the error processing matches the ACPI code paths, + but it can be built even when ACPI is disabled. + config RAS_FMPM tristate "FRU Memory Poison Manager" default mdiff --git a/drivers/ras/Makefile b/drivers/ras/Makefile index 11f95d59d397..53558a1707b3 100644 --- a/drivers/ras/Makefile +++ b/drivers/ras/Makefile@@ -2,6 +2,7 @@ obj-$(CONFIG_RAS) += ras.o obj-$(CONFIG_DEBUG_FS) += debugfs.o obj-$(CONFIG_RAS_CEC) += cec.o +obj-$(CONFIG_RAS_ESOURCE_DT) += esource-dt.o obj-$(CONFIG_RAS_FMPM) += amd/fmpm.o obj-y += amd/atl/diff --git a/drivers/ras/esource-dt.c b/drivers/ras/esource-dt.c new file mode 100644 index 000000000000..b575a2258536 --- /dev/null +++ b/drivers/ras/esource-dt.c@@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * DeviceTree provider for firmware-first CPER error source block. + * + * This driver shares the GHES CPER helpers so we keep the reporting and + * notifier behaviour identical to ACPI GHES + * + * Copyright (C) 2025 ARM Ltd. + * Author: Ahmed Tiba <ahmed.tiba@arm.com> + */ + +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/device.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/module.h> +#include <linux/of_address.h> +#include <linux/of_irq.h> +#include <linux/panic.h> +#include <linux/platform_device.h> +#include <linux/slab.h> +#include <linux/spinlock.h> + +#include <acpi/ghes.h> +#include <acpi/ghes_cper.h> + +static atomic_t ghes_ffh_source_ids = ATOMIC_INIT(0); + +struct ghes_ffh_ack { + void __iomem *addr; + u64 preserve; + u64 set; + u8 width; + bool present; +};
Please don't use "ffh" in these names. In ACPI, FFH stands for Functional Fixed Hardware, so reusing the abbreviation for a firmware-first provider is confusing. Per the ACPI specification, FFH can be used for register reads/writes while handling errors. I am starting to feel that all this churn should be avoided; all the GHES code is also being moved in the name of CPER helpers.
quoted hunk ↗ jump to hunk
+ +struct ghes_ffh { + struct device *dev; + void __iomem *status; + size_t status_len; + + struct ghes_ffh_ack ack; + + struct acpi_hest_generic *generic; + struct acpi_hest_generic_status *estatus; + + bool sync; + int irq; + + /* Serializes access to the firmware-owned buffer. */ + spinlock_t lock; +}; + +static int ghes_ffh_init_pool(void) +{ + if (ghes_estatus_pool) + return 0; + + return ghes_estatus_pool_init(1); +} + +static int ghes_ffh_copy_status(struct ghes_ffh *ctx) +{ + memcpy_fromio(ctx->estatus, ctx->status, ctx->status_len); + return 0; +} + +static void ghes_ffh_ack(struct ghes_ffh *ctx) +{ + u64 val; + + if (!ctx->ack.present) + return; + + if (ctx->ack.width == 64) { + val = readq(ctx->ack.addr); + val &= ctx->ack.preserve; + val |= ctx->ack.set; + writeq(val, ctx->ack.addr); + } else { + val = readl(ctx->ack.addr); + val &= (u32)ctx->ack.preserve; + val |= (u32)ctx->ack.set; + writel(val, ctx->ack.addr); + } +} + +static void ghes_ffh_fatal(struct ghes_ffh *ctx) +{ + __ghes_print_estatus(KERN_EMERG, ctx->generic, ctx->estatus); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_STILL_OK); + panic("GHES: fatal firmware-first CPER record from %s\n", + dev_name(ctx->dev)); +} + +static void ghes_ffh_process(struct ghes_ffh *ctx) +{ + unsigned long flags; + int sev; + + spin_lock_irqsave(&ctx->lock, flags); + + if (ghes_ffh_copy_status(ctx)) + goto out; + + sev = ghes_severity(ctx->estatus->error_severity); + if (sev >= GHES_SEV_PANIC) + ghes_ffh_fatal(ctx); + + if (!ghes_estatus_cached(ctx->estatus)) { + if (ghes_print_estatus(NULL, ctx->generic, ctx->estatus)) + ghes_estatus_cache_add(ctx->generic, ctx->estatus); + } + + ghes_cper_handle_status(ctx->dev, ctx->generic, ctx->estatus, ctx->sync); + + ghes_ffh_ack(ctx); + +out: + spin_unlock_irqrestore(&ctx->lock, flags); +} + +static irqreturn_t ghes_ffh_irq(int irq, void *data) +{ + struct ghes_ffh *ctx = data; + + ghes_ffh_process(ctx); + + return IRQ_HANDLED; +} + +static int ghes_ffh_init_ack(struct 
platform_device *pdev, + struct ghes_ffh *ctx) +{ + struct resource *res; + size_t size; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 1); + if (!res) + return 0; + + ctx->ack.addr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ctx->ack.addr)) + return PTR_ERR(ctx->ack.addr); + + size = resource_size(res); + switch (size) { + case 4: + ctx->ack.width = 32; + ctx->ack.preserve = ~0U; + break; + case 8: + ctx->ack.width = 64; + ctx->ack.preserve = ~0ULL; + break; + default: + dev_err(&pdev->dev, "Unsupported ack resource size %zu\n", size); + return -EINVAL; + } + + ctx->ack.set = BIT_ULL(0); + ctx->ack.present = true; + return 0; +} + +static int ghes_ffh_probe(struct platform_device *pdev) +{ + struct ghes_ffh *ctx; + struct resource *res; + int rc; + + ctx = devm_kzalloc(&pdev->dev, sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + spin_lock_init(&ctx->lock); + ctx->dev = &pdev->dev; + ctx->sync = of_property_read_bool(pdev->dev.of_node, "arm,sea-notify"); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&pdev->dev, "status region missing\n"); + return -EINVAL; + } + + ctx->status_len = resource_size(res); + if (!ctx->status_len) { + dev_err(&pdev->dev, "Status region has zero length\n"); + return -EINVAL; + } + + ctx->status = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ctx->status)) + return PTR_ERR(ctx->status); + + rc = ghes_ffh_init_ack(pdev, ctx); + if (rc) + return rc; + + rc = ghes_ffh_init_pool(); + if (rc) + return rc; + + ctx->estatus = devm_kzalloc(&pdev->dev, ctx->status_len, GFP_KERNEL); + if (!ctx->estatus) + return -ENOMEM; + + ctx->generic = devm_kzalloc(&pdev->dev, sizeof(*ctx->generic), GFP_KERNEL); + if (!ctx->generic) + return -ENOMEM; + + ctx->generic->header.type = ACPI_HEST_TYPE_GENERIC_ERROR; + ctx->generic->header.source_id = + atomic_inc_return(&ghes_ffh_source_ids); + ctx->generic->notify.type = ctx->sync ? 
+ ACPI_HEST_NOTIFY_SEA : ACPI_HEST_NOTIFY_EXTERNAL; + ctx->generic->error_block_length = ctx->status_len; + + ctx->irq = platform_get_irq_optional(pdev, 0); + if (ctx->irq <= 0) { + if (ctx->irq == -EPROBE_DEFER) + return ctx->irq; + dev_err(&pdev->dev, "interrupt is required (%d)\n", ctx->irq); + return -EINVAL; + } + + rc = devm_request_threaded_irq(&pdev->dev, ctx->irq, + NULL, ghes_ffh_irq, + IRQF_ONESHOT, + dev_name(&pdev->dev), ctx); + if (rc) + return rc; + + platform_set_drvdata(pdev, ctx); + dev_info(&pdev->dev, "Firmware-first CPER status provider (interrupt)\n"); + return 0; +} + +static void ghes_ffh_remove(struct platform_device *pdev) +{ +} + +static const struct of_device_id ghes_ffh_of_match[] = { + { .compatible = "arm,ras-ffh" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, ghes_ffh_of_match); + +static struct platform_driver ghes_ffh_driver = { + .driver = { + .name = "esource-dt", + .of_match_table = ghes_ffh_of_match, + }, + .probe = ghes_ffh_probe, + .remove = ghes_ffh_remove, +}; + +module_platform_driver(ghes_ffh_driver); + +MODULE_AUTHOR("Ahmed Tiba [off-list ref]"); +MODULE_DESCRIPTION("Firmware-first CPER provider for DeviceTree platforms"); +MODULE_LICENSE("GPL");diff --git a/include/acpi/ghes_cper.h b/include/acpi/ghes_cper.h index f7c9fba62585..d43185c020ee 100644 --- a/include/acpi/ghes_cper.h +++ b/include/acpi/ghes_cper.h@@ -75,6 +75,15 @@ static inline bool is_hest_sync_notify(struct ghes *ghes) return notify_type == ACPI_HEST_NOTIFY_SEA; } +static inline u32 cper_estatus_len(struct acpi_hest_generic_status *estatus) +{ + if (estatus->raw_data_length) + return estatus->raw_data_offset + \ + estatus->raw_data_length; + else + return sizeof(*estatus) + estatus->data_length; +} + struct ghes_vendor_record_entry { struct work_struct work; int error_severity; --2.43.0