Thread (5 messages) 5 messages, 3 authors, 2026-02-27

Re: [PATCH, net-next] net: mana: Trigger VF reset/recovery on health check failure due to HWC timeout

From: Dipayaan Roy <hidden>
Date: 2026-02-27 08:10:21
Also in: linux-hyperv, linux-rdma, lkml

On Thu, Feb 26, 2026 at 07:48:31PM +0000, Long Li wrote:
quoted
The periodic GF stats query is used as a mechanism for the HWC health check.
If this HWC command times out, it is a strong indication that the device/SoC is in a
faulty state and requires recovery.

Today, when a timeout is detected, the driver marks hwc_timeout_occurred,
clears cached stats, and stops rescheduling the periodic work. However, the
device itself is left in the same failing state.

Extend the timeout handling path to trigger the existing MANA VF recovery
service by queueing a GDMA_EQE_HWC_RESET_REQUEST work item.
This is expected to initiate the appropriate recovery flow by attempting a
suspend/resume first and, if that fails, triggering a bus rescan.

This change is intentionally limited to HWC command timeouts and does not
trigger recovery for errors reported by the SoC as a normal command response.

Signed-off-by: Dipayaan Roy <redacted>
---
 .../net/ethernet/microsoft/mana/gdma_main.c   | 14 +++-------
 drivers/net/ethernet/microsoft/mana/mana_en.c | 28 ++++++++++++++++++-
 include/net/mana/gdma.h                       | 16 +++++++++--
 3 files changed, 45 insertions(+), 13 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c
b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 0055c231acf6..16c438d2aaa3 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -490,15 +490,9 @@ static void mana_serv_reset(struct pci_dev *pdev)
 		dev_info(&pdev->dev, "MANA reset cycle completed\n");

 out:
-	gc->in_service = false;
+	clear_bit(GC_IN_SERVICE, &gc->flags);
 }

-struct mana_serv_work {
-	struct work_struct serv_work;
-	struct pci_dev *pdev;
-	enum gdma_eqe_type type;
-};
-
 static void mana_do_service(enum gdma_eqe_type type, struct pci_dev *pdev)
{
 	switch (type) {
@@ -542,7 +536,7 @@ static void mana_recovery_delayed_func(struct
work_struct *w)
 	spin_unlock_irqrestore(&work->lock, flags);  }

-static void mana_serv_func(struct work_struct *w)
+void mana_serv_func(struct work_struct *w)
 {
 	struct mana_serv_work *mns_wk;
 	struct pci_dev *pdev;
@@ -624,7 +618,7 @@ static void mana_gd_process_eqe(struct gdma_queue
*eq)
 			break;
 		}

-		if (gc->in_service) {
+		if (test_bit(GC_IN_SERVICE, &gc->flags)) {
 			dev_info(gc->dev, "Already in service\n");
 			break;
 		}
@@ -641,7 +635,7 @@ static void mana_gd_process_eqe(struct gdma_queue
*eq)
 		}

 		dev_info(gc->dev, "Start MANA service type:%d\n", type);
-		gc->in_service = true;
+		set_bit(GC_IN_SERVICE, &gc->flags);
 		mns_wk->pdev = to_pci_dev(gc->dev);
 		mns_wk->type = type;
 		pci_dev_get(mns_wk->pdev);
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c
b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 91c418097284..8da574cf06f2 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -879,7 +879,7 @@ static void mana_tx_timeout(struct net_device *netdev,
unsigned int txqueue)
 	struct gdma_context *gc = ac->gdma_dev->gdma_context;

 	/* Already in service, hence tx queue reset is not required.*/
-	if (gc->in_service)
+	if (test_bit(GC_IN_SERVICE, &gc->flags))
 		return;

 	/* Note: If there are pending queue reset work for this port(apc), @@ -
3533,6 +3533,8 @@ static void mana_gf_stats_work_handler(struct work_struct
*work)  {
 	struct mana_context *ac =
 		container_of(to_delayed_work(work), struct mana_context,
gf_stats_work);
+	struct gdma_context *gc = ac->gdma_dev->gdma_context;
+	struct mana_serv_work *mns_wk;
 	int err;

 	err = mana_query_gf_stats(ac);
@@ -3540,6 +3542,30 @@ static void mana_gf_stats_work_handler(struct
work_struct *work)
 		/* HWC timeout detected - reset stats and stop rescheduling */
 		ac->hwc_timeout_occurred = true;
 		memset(&ac->hc_stats, 0, sizeof(ac->hc_stats));
+		dev_warn(gc->dev,
+			 "Gf stats wk handler: gf stats query timed out.\n");
+
+		/* As HWC timed out, indicating a faulty HW state and needs a
+		 * reset.
+		 */
+		if (!test_and_set_bit(GC_IN_SERVICE, &gc->flags)) {
+			if (!try_module_get(THIS_MODULE)) {
+				dev_info(gc->dev, "Module is unloading\n");
+				return;
+			}
+
+			mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC);
+			if (!mns_wk) {
+				module_put(THIS_MODULE);
Maybe it's not necessary, but check whether you want to call clear_bit(GC_IN_SERVICE, &gc->flags) here?
Yes, it makes sense to clear it here.
quoted
+				return;
+			}
+
+			mns_wk->pdev = to_pci_dev(gc->dev);
+			mns_wk->type = GDMA_EQE_HWC_RESET_REQUEST;
+			pci_dev_get(mns_wk->pdev);
+			INIT_WORK(&mns_wk->serv_work, mana_serv_func);
+			schedule_work(&mns_wk->serv_work);
+		}
 		return;
 	}
 	schedule_delayed_work(&ac->gf_stats_work,
MANA_GF_STATS_PERIOD); diff --git a/include/net/mana/gdma.h
Regards
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help