Thread (31 messages), 4 authors, 2019-01-04
STALE 2711d

[PATCH RFC net-next 16/19] net/mlx5: Report devlink health on FW issues

From: Eran Ben Elisha <hidden>
Date: 2018-12-31 14:32:25
Subsystem: mellanox mlx5 ib driver, mellanox mlx5 core vpi driver, networking drivers, the rest · Maintainers: Leon Romanovsky, Saeed Mahameed, Tariq Toukan, Mark Bloch, Andrew Lunn, "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Linus Torvalds

From: Moshe Shemesh <redacted>

Use devlink_health_report() to report any symptom of a FW issue, such as
a FW counter miss or a new health syndrome.

Signed-off-by: Moshe Shemesh <redacted>
---
 .../net/ethernet/mellanox/mlx5/core/devlink.c | 21 +++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/devlink.h |  1 +
 .../net/ethernet/mellanox/mlx5/core/health.c  | 10 +++++++++
 include/linux/mlx5/driver.h                   |  2 ++
 4 files changed, 34 insertions(+)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 07bc473a8ebb..04c904214e4c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -205,6 +205,27 @@ mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
 	return 0;
 }
 
+void mlx5_fw_reporter_err_work(struct work_struct *work)
+{
+	struct mlx5_fw_reporter_ctx fw_reporter_ctx;
+	struct mlx5_core_health *health;
+	struct mlx5_core_dev *dev;
+	struct mlx5_priv *priv;
+
+	health = container_of(work, struct mlx5_core_health, report_work);
+	priv = container_of(health, struct mlx5_priv, health);
+	dev = container_of(priv, struct mlx5_core_dev, priv);
+
+	fw_reporter_ctx.err_synd = health->synd;
+	fw_reporter_ctx.miss_counter = health->miss_counter;
+	if (fw_reporter_ctx.err_synd)
+		devlink_health_report(dev->fw_reporter, "FW syndrom reported",
+				      &fw_reporter_ctx);
+	else if (fw_reporter_ctx.miss_counter)
+		devlink_health_report(dev->fw_reporter, "FW miss counter reported",
+				      &fw_reporter_ctx);
+}
+
 static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
 		.name = "FW",
 		.objdump_size = SAVED_TRACES_BUFFER_SIZE_BYTE,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
index 34f6bfed1cfb..082a648a3af3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
@@ -16,5 +16,6 @@ int mlx5_devlink_register(struct devlink *devlink, struct device *dev);
 void mlx5_devlink_unregister(struct devlink *devlink);
 int mlx5_fw_reporter_create(struct mlx5_core_dev *dev);
 void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev);
+void mlx5_fw_reporter_err_work(struct work_struct *work);
 
 #endif /* __MLX5_DEVLINK_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 2fa78edde1fe..4d0ad792b226 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -38,6 +38,7 @@
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/cmd.h>
 #include "mlx5_core.h"
+#include "devlink.h"
 #include "lib/eq.h"
 #include "lib/mlx5.h"
 
@@ -289,7 +290,9 @@ static void poll_health(struct timer_list *t)
 {
 	struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer);
 	struct mlx5_core_health *health = &dev->priv.health;
+	struct health_buffer __iomem *h = health->health;
 	u32 count;
+	u8 synd;
 
 	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
 		goto out;
@@ -304,8 +307,13 @@ static void poll_health(struct timer_list *t)
 	if (health->miss_counter == MAX_MISSES) {
 		dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n");
 		mlx5_print_health_info(dev);
+		queue_work(health->wq, &health->report_work);
 	}
 
+	synd = ioread8(&h->synd);
+	if (synd && synd != health->synd)
+		queue_work(health->wq, &health->report_work);
+
 	if (in_fatal(dev) && !health->sick) {
 		health->sick = true;
 		mlx5_print_health_info(dev);
@@ -356,6 +364,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
 	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
 	spin_unlock_irqrestore(&health->wq_lock, flags);
 	cancel_delayed_work_sync(&health->recover_work);
+	cancel_work_sync(&health->report_work);
 	cancel_work_sync(&health->work);
 }
 
@@ -395,6 +404,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 		return -ENOMEM;
 	spin_lock_init(&health->wq_lock);
 	INIT_WORK(&health->work, health_care);
+	INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
 	INIT_DELAYED_WORK(&health->recover_work, health_recover);
 
 	return 0;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index bedc9bc08963..b2dc32b553b4 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -432,12 +432,14 @@ struct mlx5_core_health {
 	struct timer_list		timer;
 	u32				prev;
 	int				miss_counter;
+	u8				synd;
 	bool				sick;
 	/* wq spinlock to synchronize draining */
 	spinlock_t			wq_lock;
 	struct workqueue_struct	       *wq;
 	unsigned long			flags;
 	struct work_struct		work;
+	struct work_struct		report_work;
 	struct delayed_work		recover_work;
 };
 
-- 
2.17.1
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help