--- v5
+++ v4
@@ -1,459 +1,28 @@
-Converge the APEI path and native AER path of clearing the AER registers
-of the error device.
+dpc_process_error() clears both AER fatal and non fatal status
+registers. Instead of clearing each status registers via a different
+function call use pci_aer_clear_status().
-In APEI path, the system firmware clears the AER registers before
-handing off the record to OS. But in "native AER" path, the execution
-path of clearing the AER register is as follows:
-
- aer_isr_one_error
- aer_print_port_info
- if (find_source_device())
- aer_process_err_devices
- handle_error_source
- pci_write_config_dword(dev, PCI_ERR_COR_STATUS, ...)
-
-The above path has a bug, if the find_source_device() fails, AER
-registers are not cleared from the error device. This means, the error
-device will keep reporting the error again and again and would lead
-to message spew.
-
-Related Bug Report:
- https://lore.kernel.org/linux-pci/20151229155822.GA17321@localhost/
- https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1521173
-
-The above bug could be avoided, if the AER registers are cleared during
-the AER IRQ handler aer_irq(), which would provide guarantee that the AER
-error registers are always cleared. This is similar to how APEI handles
-these errors.
-
-The main aim is that:
-
- When an interrupt handler deals with a interrupt, it must *always*
- clear the source of the interrupt.
+This helps clean up the code a bit.
Signed-off-by: Naveen Naidu <naveennaidu479@gmail.com>
---
- drivers/pci/pci.h | 13 ++-
- drivers/pci/pcie/aer.c | 249 ++++++++++++++++++++++++++++-------------
- 2 files changed, 184 insertions(+), 78 deletions(-)
+ drivers/pci/pcie/dpc.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
-diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
-index 9be7a966fda7..eb88d8bfeaf7 100644
---- a/drivers/pci/pci.h
-+++ b/drivers/pci/pci.h
-@@ -424,7 +424,6 @@ static inline bool pci_dev_is_added(const struct pci_dev *dev)
- #define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */
-
- struct aer_err_info {
-- struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
- int error_dev_num;
-
- u16 id;
-@@ -440,6 +439,18 @@ struct aer_err_info {
- struct aer_header_log_regs tlp; /* TLP Header */
- };
-
-+/* Preliminary AER error information processed from Root port */
-+struct aer_devices_err_info {
-+ struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
-+ struct aer_err_info err_info;
-+};
-+
-+/* AER information associated with each error device */
-+struct aer_dev_err_info {
-+ struct pci_dev *dev;
-+ struct aer_err_info err_info;
-+};
-+
- int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
- void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
- #endif /* CONFIG_PCIEAER */
-diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
-index 241ff361b43c..d3937f5384e4 100644
---- a/drivers/pci/pcie/aer.c
-+++ b/drivers/pci/pcie/aer.c
-@@ -36,6 +36,18 @@
-
- #define AER_ERROR_SOURCES_MAX 128
-
-+/*
-+ * There can be 128 maximum error sources (AER_ERROR_SOURCES_MAX) and each
-+ * error source can have maximum of 5 error devices (AER_MAX_MULTI_ERR_DEVICES)
-+ * so the maximum error devices we can report is:
-+ *
-+ * AER_ERROR_DEVICES_MAX = AER_ERROR_SOURCES_MAX * AER_MAX_MULTI_ERR_DEVICES == (128 * 5) == 640
-+ *
-+ * But since, the size in KFIFO should be a power of two, the closest value
-+ * to 640 is 1024
-+ */
-+# define AER_ERROR_DEVICES_MAX 1024
-+
- #define AER_MAX_TYPEOF_COR_ERRS 16 /* as per PCI_ERR_COR_STATUS */
- #define AER_MAX_TYPEOF_UNCOR_ERRS 27 /* as per PCI_ERR_UNCOR_STATUS*/
-
-@@ -46,7 +58,7 @@ struct aer_err_source {
-
- struct aer_rpc {
- struct pci_dev *rpd; /* Root Port device */
-- DECLARE_KFIFO(aer_fifo, struct aer_err_source, AER_ERROR_SOURCES_MAX);
-+ DECLARE_KFIFO(aer_fifo, struct aer_dev_err_info, AER_ERROR_DEVICES_MAX);
- };
-
- /* AER stats for the device */
-@@ -803,14 +815,14 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
-
- /**
- * add_error_device - list device to be handled
-- * @e_info: pointer to error info
-+ * @e_dev: pointer to error info
- * @dev: pointer to pci_dev to be added
- */
--static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
-+static int add_error_device(struct aer_devices_err_info *e_dev, struct pci_dev *dev)
- {
-- if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
-- e_info->dev[e_info->error_dev_num] = pci_dev_get(dev);
-- e_info->error_dev_num++;
-+ if (e_dev->err_info.error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
-+ e_dev->dev[e_dev->err_info.error_dev_num] = pci_dev_get(dev);
-+ e_dev->err_info.error_dev_num++;
- return 0;
+diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
+index df3f3a10f8bc..faf4a1e77fab 100644
+--- a/drivers/pci/pcie/dpc.c
++++ b/drivers/pci/pcie/dpc.c
+@@ -288,8 +288,7 @@ void dpc_process_error(struct pci_dev *pdev)
+ dpc_get_aer_uncorrect_severity(pdev, &info) &&
+ aer_get_device_error_info(pdev, &info)) {
+ aer_print_error(pdev, &info);
+- pci_aer_clear_nonfatal_status(pdev);
+- pci_aer_clear_fatal_status(pdev);
++ pci_aer_clear_status(pdev);
}
- return -ENOSPC;
-@@ -877,18 +889,18 @@ static bool is_error_source(struct pci_dev *dev, struct aer_err_info *e_info)
-
- static int find_device_iter(struct pci_dev *dev, void *data)
- {
-- struct aer_err_info *e_info = (struct aer_err_info *)data;
-+ struct aer_devices_err_info *e_dev = (struct aer_devices_err_info *)data;
-
-- if (is_error_source(dev, e_info)) {
-+ if (is_error_source(dev, &e_dev->err_info)) {
- /* List this device */
-- if (add_error_device(e_info, dev)) {
-+ if (add_error_device(e_dev, dev)) {
- /* We cannot handle more... Stop iteration */
- /* TODO: Should print error message here? */
- return 1;
- }
-
- /* If there is only a single error, stop iteration */
-- if (!e_info->multi_error_valid)
-+ if (!e_dev->err_info.multi_error_valid)
- return 1;
- }
- return 0;
-@@ -897,7 +909,7 @@ static int find_device_iter(struct pci_dev *dev, void *data)
- /**
- * find_source_device - search through device hierarchy for source device
- * @parent: pointer to Root Port pci_dev data structure
-- * @e_info: including detailed error information such like id
-+ * @e_dev: including detailed error information such like id
- *
- * Return true if found.
- *
-@@ -907,26 +919,26 @@ static int find_device_iter(struct pci_dev *dev, void *data)
- * e_info->error_dev_num and e_info->dev[], based on the given information.
- */
- static bool find_source_device(struct pci_dev *parent,
-- struct aer_err_info *e_info)
-+ struct aer_devices_err_info *e_dev)
- {
- struct pci_dev *dev = parent;
- int result;
-
- /* Must reset in this function */
-- e_info->error_dev_num = 0;
-+ e_dev->err_info.error_dev_num = 0;
-
- /* Is Root Port an agent that sends error message? */
-- result = find_device_iter(dev, e_info);
-+ result = find_device_iter(dev, e_dev);
- if (result)
- return true;
-
- if (pci_pcie_type(parent) == PCI_EXP_TYPE_RC_EC)
-- pcie_walk_rcec(parent, find_device_iter, e_info);
-+ pcie_walk_rcec(parent, find_device_iter, e_dev);
- else
-- pci_walk_bus(parent->subordinate, find_device_iter, e_info);
-+ pci_walk_bus(parent->subordinate, find_device_iter, e_dev);
-
-- if (!e_info->error_dev_num) {
-- pci_info(parent, "can't find device of ID%04x\n", e_info->id);
-+ if (!e_dev->err_info.error_dev_num) {
-+ pci_info(parent, "can't find device of ID%04x\n", e_dev->err_info.id);
- return false;
- }
- return true;
-@@ -940,24 +952,42 @@ static bool find_source_device(struct pci_dev *parent,
- * Invoked when an error being detected by Root Port.
- */
- static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
-+{
-+ /*
-+ * Correctable error does not need software intervention.
-+ * No need to go through error recovery process.
-+ */
-+ if (info->severity == AER_NONFATAL)
-+ pcie_do_recovery(dev, pci_channel_io_normal, aer_root_reset);
-+ else if (info->severity == AER_FATAL)
-+ pcie_do_recovery(dev, pci_channel_io_frozen, aer_root_reset);
-+ pci_dev_put(dev);
-+}
-+
-+/**
-+ * clear_error_source_aer_registers - clear AER registers of the error source device
-+ * @dev: pointer to pci_dev data structure of error source device
-+ * @info: comprehensive error information
-+ *
-+ * Invoked when an error being detected by Root Port but before we handle the
-+ * error.
-+ */
-+static void clear_error_source_aer_registers(struct pci_dev *dev, struct aer_err_info info)
- {
- int aer = dev->aer_cap;
-
-- if (info->severity == AER_CORRECTABLE) {
-- /*
-- * Correctable error does not need software intervention.
-- * No need to go through error recovery process.
-- */
-+ if (info.severity == AER_CORRECTABLE) {
- if (aer)
- pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS,
-- info->status);
-+ info.status);
- if (pcie_aer_is_native(dev))
- pcie_clear_device_status(dev);
-- } else if (info->severity == AER_NONFATAL)
-- pcie_do_recovery(dev, pci_channel_io_normal, aer_root_reset);
-- else if (info->severity == AER_FATAL)
-- pcie_do_recovery(dev, pci_channel_io_frozen, aer_root_reset);
-- pci_dev_put(dev);
-+ } else if (info.severity == AER_NONFATAL) {
-+ pci_aer_clear_nonfatal_status(dev);
-+ } else if (info.severity == AER_FATAL) {
-+ pci_aer_clear_fatal_status(dev);
-+ }
-+
}
- #ifdef CONFIG_ACPI_APEI_PCIEAER
-@@ -1093,70 +1123,112 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
- return 1;
- }
-
--static inline void aer_process_err_devices(struct aer_err_info *e_info)
--{
-- int i;
--
-- /* Report all before handle them, not to lost records by reset etc. */
-- for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
-- if (aer_get_device_error_info(e_info->dev[i], e_info))
-- aer_print_error(e_info->dev[i], e_info);
-- }
-- for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
-- if (aer_get_device_error_info(e_info->dev[i], e_info))
-- handle_error_source(e_info->dev[i], e_info);
-- }
--}
--
- /**
-- * aer_isr_one_error - consume an error detected by root port
-- * @rpc: pointer to the root port which holds an error
-+ * aer_find_corr_error_source_device - find the error source which detected the corrected error
-+ * @rp: pointer to Root Port pci_dev data structure
- * @e_src: pointer to an error source
-+ * @e_info: including detailed error information such like id
-+ *
-+ * Return true if found.
-+ *
-+ * Process the error information received at the Root Port, set these values
-+ * in the aer_devices_err_info and find all the devices that are related to
-+ * the error.
- */
--static void aer_isr_one_error(struct aer_rpc *rpc,
-- struct aer_err_source *e_src)
-+static bool aer_find_corr_error_source_device(struct pci_dev *rp,
-+ struct aer_err_source *e_src,
-+ struct aer_devices_err_info *e_info)
- {
-- struct pci_dev *pdev = rpc->rpd;
-- struct aer_err_info e_info;
--
-- pci_rootport_aer_stats_incr(pdev, e_src);
--
-- /*
-- * There is a possibility that both correctable error and
-- * uncorrectable error being logged. Report correctable error first.
-- */
- if (e_src->status & PCI_ERR_ROOT_COR_RCV) {
-- e_info.id = ERR_COR_ID(e_src->id);
-- e_info.severity = AER_CORRECTABLE;
-+ e_info->err_info.id = ERR_COR_ID(e_src->id);
-+ e_info->err_info.severity = AER_CORRECTABLE;
-
- if (e_src->status & PCI_ERR_ROOT_MULTI_COR_RCV)
-- e_info.multi_error_valid = 1;
-+ e_info->err_info.multi_error_valid = 1;
- else
-- e_info.multi_error_valid = 0;
-- aer_print_port_info(pdev, &e_info);
-+ e_info->err_info.multi_error_valid = 0;
-
-- if (find_source_device(pdev, &e_info))
-- aer_process_err_devices(&e_info);
-+ if (!find_source_device(rp, e_info))
-+ return false;
- }
-+ return true;
-+}
-
-+/**
-+ * aer_find_uncorr_error_source_device - find the error source which detected the uncorrected error
-+ * @rp: pointer to Root Port pci_dev data structure
-+ * @e_src: pointer to an error source
-+ * @e_info: including detailed error information such like id
-+ *
-+ * Return true if found.
-+ *
-+ * Process the error information received at the Root Port, set these values
-+ * in the aer_devices_err_info and find all the devices that are related to
-+ * the error.
-+ */
-+static bool aer_find_uncorr_error_source_device(struct pci_dev *rp,
-+ struct aer_err_source *e_src,
-+ struct aer_devices_err_info *e_info)
-+{
- if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
-- e_info.id = ERR_UNCOR_ID(e_src->id);
-+ e_info->err_info.id = ERR_UNCOR_ID(e_src->id);
-
- if (e_src->status & PCI_ERR_ROOT_FATAL_RCV)
-- e_info.severity = AER_FATAL;
-+ e_info->err_info.severity = AER_FATAL;
- else
-- e_info.severity = AER_NONFATAL;
-+ e_info->err_info.severity = AER_NONFATAL;
-
- if (e_src->status & PCI_ERR_ROOT_MULTI_UNCOR_RCV)
-- e_info.multi_error_valid = 1;
-+ e_info->err_info.multi_error_valid = 1;
- else
-- e_info.multi_error_valid = 0;
-+ e_info->err_info.multi_error_valid = 0;
-+
-+ if (!find_source_device(rp, e_info))
-+ return false;
-+ }
-
-- aer_print_port_info(pdev, &e_info);
-+ return true;
-+}
-
-- if (find_source_device(pdev, &e_info))
-- aer_process_err_devices(&e_info);
-+/**
-+ * aer_isr_one_error - consume an error detected by root port
-+ * @rp: pointer to Root Port pci_dev data structure
-+ * @e_dev: pointer to an error device
-+ */
-+static void aer_isr_one_error(struct pci_dev *rp, struct aer_dev_err_info *e_dev)
-+{
-+ aer_print_port_info(rp, &e_dev->err_info);
-+ aer_print_error(e_dev->dev, &e_dev->err_info);
-+ handle_error_source(e_dev->dev, &e_dev->err_info);
-+}
-+
-+static bool aer_add_err_devices_to_queue(struct aer_rpc *rpc,
-+ struct aer_devices_err_info *e_info)
-+{
-+ int i;
-+ struct aer_dev_err_info *e_dev;
-+
-+ e_dev = kzalloc(sizeof(*e_dev), GFP_ATOMIC);
-+ if (!e_dev)
-+ return false;
-+
-+ for (i = 0; i < e_info->err_info.error_dev_num && e_info->dev[i]; i++) {
-+ e_dev->err_info = e_info->err_info;
-+ e_dev->dev = e_info->dev[i];
-+
-+ /*
-+ * Store the AER register information for each error device on
-+ * the queue
-+ */
-+ if (aer_get_device_error_info(e_dev->dev, &e_dev->err_info)) {
-+ if (!kfifo_put(&rpc->aer_fifo, *e_dev))
-+ return false;
-+
-+ clear_error_source_aer_registers(e_dev->dev, e_dev->err_info);
-+ }
- }
-+
-+ return true;
- }
-
- /**
-@@ -1170,13 +1242,13 @@ static irqreturn_t aer_isr(int irq, void *context)
- {
- struct pcie_device *dev = (struct pcie_device *)context;
- struct aer_rpc *rpc = get_service_data(dev);
-- struct aer_err_source e_src;
-+ struct aer_dev_err_info e_dev;
-
- if (kfifo_is_empty(&rpc->aer_fifo))
- return IRQ_NONE;
-
-- while (kfifo_get(&rpc->aer_fifo, &e_src))
-- aer_isr_one_error(rpc, &e_src);
-+ while (kfifo_get(&rpc->aer_fifo, &e_dev))
-+ aer_isr_one_error(rpc->rpd, &e_dev);
- return IRQ_HANDLED;
- }
-
-@@ -1194,6 +1266,11 @@ static irqreturn_t aer_irq(int irq, void *context)
- struct pci_dev *rp = rpc->rpd;
- int aer = rp->aer_cap;
- struct aer_err_source e_src = {};
-+ struct aer_devices_err_info *e_info;
-+
-+ e_info = kzalloc(sizeof(*e_info), GFP_ATOMIC);
-+ if (!e_info)
-+ return IRQ_NONE;
-
- pci_read_config_dword(rp, aer + PCI_ERR_ROOT_STATUS, &e_src.status);
- if (!(e_src.status & (PCI_ERR_ROOT_UNCOR_RCV|PCI_ERR_ROOT_COR_RCV)))
-@@ -1202,8 +1279,26 @@ static irqreturn_t aer_irq(int irq, void *context)
- pci_read_config_dword(rp, aer + PCI_ERR_ROOT_ERR_SRC, &e_src.id);
- pci_write_config_dword(rp, aer + PCI_ERR_ROOT_STATUS, e_src.status);
-
-- if (!kfifo_put(&rpc->aer_fifo, e_src))
-- return IRQ_HANDLED;
-+ pci_rootport_aer_stats_incr(rp, &e_src);
-+
-+ /*
-+ * There is a possibility that both correctable error and
-+ * uncorrectable error are being logged. Find the devices which caused
-+ * correctable errors first so that they can be added to the queue first
-+ * and will be reported first.
-+ *
-+ * Before adding the error device to the queue to be handled, clear the
-+ * AER status registers.
-+ */
-+ if (aer_find_corr_error_source_device(rp, &e_src, e_info)) {
-+ if (!aer_add_err_devices_to_queue(rpc, e_info))
-+ return IRQ_NONE;
-+ }
-+
-+ if (aer_find_uncorr_error_source_device(rp, &e_src, e_info)) {
-+ if (!aer_add_err_devices_to_queue(rpc, e_info))
-+ return IRQ_NONE;
-+ }
-
- return IRQ_WAKE_THREAD;
- }
--
2.25.1