[PATCH RFC net-next 16/19] net/mlx5: Report devlink health on FW issues
From: Eran Ben Elisha <hidden>
Date: 2018-12-31 14:32:25
Subsystem:
mellanox mlx5 ib driver, mellanox mlx5 core vpi driver, networking drivers, the rest · Maintainers:
Leon Romanovsky, Saeed Mahameed, Tariq Toukan, Mark Bloch, Andrew Lunn, "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Linus Torvalds
From: Moshe Shemesh <redacted> Use devlink_health_report() to report any symptom of a FW issue, such as a FW counter miss or a new health syndrome. Signed-off-by: Moshe Shemesh <redacted> --- .../net/ethernet/mellanox/mlx5/core/devlink.c | 21 +++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/devlink.h | 1 + .../net/ethernet/mellanox/mlx5/core/health.c | 10 +++++++++ include/linux/mlx5/driver.h | 2 ++ 4 files changed, 34 insertions(+)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 07bc473a8ebb..04c904214e4c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c@@ -205,6 +205,27 @@ mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter, return 0; } +void mlx5_fw_reporter_err_work(struct work_struct *work) +{ + struct mlx5_fw_reporter_ctx fw_reporter_ctx; + struct mlx5_core_health *health; + struct mlx5_core_dev *dev; + struct mlx5_priv *priv; + + health = container_of(work, struct mlx5_core_health, report_work); + priv = container_of(health, struct mlx5_priv, health); + dev = container_of(priv, struct mlx5_core_dev, priv); + + fw_reporter_ctx.err_synd = health->synd; + fw_reporter_ctx.miss_counter = health->miss_counter; + if (fw_reporter_ctx.err_synd) + devlink_health_report(dev->fw_reporter, "FW syndrom reported", + &fw_reporter_ctx); + else if (fw_reporter_ctx.miss_counter) + devlink_health_report(dev->fw_reporter, "FW miss counter reported", + &fw_reporter_ctx); +} + static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = { .name = "FW", .objdump_size = SAVED_TRACES_BUFFER_SIZE_BYTE,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
index 34f6bfed1cfb..082a648a3af3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h@@ -16,5 +16,6 @@ int mlx5_devlink_register(struct devlink *devlink, struct device *dev); void mlx5_devlink_unregister(struct devlink *devlink); int mlx5_fw_reporter_create(struct mlx5_core_dev *dev); void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev); +void mlx5_fw_reporter_err_work(struct work_struct *work); #endif /* __MLX5_DEVLINK_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 2fa78edde1fe..4d0ad792b226 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c@@ -38,6 +38,7 @@ #include <linux/mlx5/driver.h> #include <linux/mlx5/cmd.h> #include "mlx5_core.h" +#include "devlink.h" #include "lib/eq.h" #include "lib/mlx5.h"
@@ -289,7 +290,9 @@ static void poll_health(struct timer_list *t) { struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer); struct mlx5_core_health *health = &dev->priv.health; + struct health_buffer __iomem *h = health->health; u32 count; + u8 synd; if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) goto out;
@@ -304,8 +307,13 @@ static void poll_health(struct timer_list *t) if (health->miss_counter == MAX_MISSES) { dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n"); mlx5_print_health_info(dev); + queue_work(health->wq, &health->report_work); } + synd = ioread8(&h->synd); + if (synd && synd != health->synd) + queue_work(health->wq, &health->report_work); + if (in_fatal(dev) && !health->sick) { health->sick = true; mlx5_print_health_info(dev);
@@ -356,6 +364,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev) set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); spin_unlock_irqrestore(&health->wq_lock, flags); cancel_delayed_work_sync(&health->recover_work); + cancel_work_sync(&health->report_work); cancel_work_sync(&health->work); }
@@ -395,6 +404,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev) return -ENOMEM; spin_lock_init(&health->wq_lock); INIT_WORK(&health->work, health_care); + INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work); INIT_DELAYED_WORK(&health->recover_work, health_recover); return 0;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index bedc9bc08963..b2dc32b553b4 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h@@ -432,12 +432,14 @@ struct mlx5_core_health { struct timer_list timer; u32 prev; int miss_counter; + u8 synd; bool sick; /* wq spinlock to synchronize draining */ spinlock_t wq_lock; struct workqueue_struct *wq; unsigned long flags; struct work_struct work; + struct work_struct report_work; struct delayed_work recover_work; };
--
2.17.1