DORMANT · no replies · REVIEWED: 1 (0M)

[PATCH net-next] net/mlx5e: bound TX CQ poll softirq residency with a time budget

From: Jose Fernandez (Anthropic) <hidden>
Date: 2026-07-03 01:37:25
Also in: linux-rdma, lkml
Subsystem: mellanox ethernet driver (mlx5e), mellanox mlx5 core vpi driver, networking drivers, the rest · Maintainers: Saeed Mahameed, Tariq Toukan, Mark Bloch, Leon Romanovsky, Andrew Lunn, "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Linus Torvalds

Under strict IOMMU invalidation (iommu.strict=1), each per-fragment DMA
unmap in the TX completion path issues a synchronous TLB invalidate and
waits for CMD_SYNC, spinning IRQ-off in the SMMU command queue. Under
cross-CPU command-queue contention this per-unmap cost inflates from
microseconds to hundreds of microseconds. The CQE-count budget in
mlx5e_poll_tx_cq() (MLX5E_TX_CQ_POLL_BUDGET, 128 CQEs per poll) does not
bound time in this regime: one CQE can cover a multi-WQE batch with many
fragments, so a single poll invocation can accumulate seconds of softirq
residency and trip the soft-lockup watchdog on arm64/SMMU-v3 systems.

Bound the invocation by time: check local_clock() every 8 CQEs against
a budget (default 500us; module parameter tx_cq_time_budget_us,
runtime-writable, 0 disables) and break out of the CQE loop when
exceeded, reporting busy exactly like the existing CQE-budget
exhaustion path so NAPI keeps the poll scheduled. Remaining
completions are delayed by one reschedule, never stranded. The time
check runs only between CQEs: the inner WQE walk is never interrupted
mid-CQE, so sqcc and dma_fifo_cc always advance together. A new ethtool
statistic (tx_time_budget_exit) counts early exits.

Also add cond_resched() in mlx5e_free_txqsq_descs(): the teardown path
walks the same per-fragment unmaps in process context.

Tested on arm64 with SMMU-v3 under strict mode: throughput cost is
within run-to-run variance at every measured load shape; under active
invalidation-storm contention, the bounded poll measured 35-50% faster
than the unbounded one, because bounded polling yields cores back to
the transmit path.

Assisted-by: Claude:unspecified
Signed-off-by: Jose Fernandez (Anthropic) <redacted>
Reviewed-by: Ben Cressey <redacted>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c |  5 ++++
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |  2 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c    | 29 +++++++++++++++++++++-
 3 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 7f33261ba655..b940280af19d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -171,6 +171,7 @@ static const struct counter_desc sw_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqes) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_wake) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqe_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_time_budget_exit) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_xmit) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_mpwqe) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_inlnw) },
@@ -426,6 +427,7 @@ static void mlx5e_stats_grp_sw_update_stats_sq(struct mlx5e_sw_stats *s,
 	s->tx_queue_wake            += sq_stats->wake;
 	s->tx_queue_dropped         += sq_stats->dropped;
 	s->tx_cqe_err               += sq_stats->cqe_err;
+	s->tx_time_budget_exit      += sq_stats->time_budget_exit;
 	s->tx_recover               += sq_stats->recover;
 	s->tx_xmit_more             += sq_stats->xmit_more;
 	s->tx_csum_partial_inner    += sq_stats->csum_partial_inner;
@@ -2323,6 +2325,7 @@ static const struct counter_desc sq_stats_desc[] = {
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqes) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, wake) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqe_err) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, time_budget_exit) },
 };
 
 static const struct counter_desc rq_xdpsq_stats_desc[] = {
@@ -2399,6 +2402,7 @@ static const struct counter_desc ptp_sq_stats_desc[] = {
 	{ MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, cqes) },
 	{ MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, wake) },
 	{ MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, cqe_err) },
+	{ MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, time_budget_exit) },
 };
 
 static const struct counter_desc ptp_ch_stats_desc[] = {
@@ -2476,6 +2480,7 @@ static const struct counter_desc qos_sq_stats_desc[] = {
 	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, cqes) },
 	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, wake) },
 	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, cqe_err) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, time_budget_exit) },
 };
 
 #define NUM_RQ_STATS			ARRAY_SIZE(rq_stats_desc)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 09f155acb461..5ba954f42ccd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -187,6 +187,7 @@ struct mlx5e_sw_stats {
 	u64 tx_cqes;
 	u64 tx_queue_wake;
 	u64 tx_cqe_err;
+	u64 tx_time_budget_exit;
 	u64 tx_xdp_xmit;
 	u64 tx_xdp_mpwqe;
 	u64 tx_xdp_inlnw;
@@ -445,6 +446,7 @@ struct mlx5e_sq_stats {
 	u64 cqes ____cacheline_aligned_in_smp;
 	u64 wake;
 	u64 cqe_err;
+	u64 time_budget_exit;
 };
 
 struct mlx5e_xdpsq_stats {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 0b5e600e4a6a..994df912b765 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -43,6 +43,13 @@
 #include "en_accel/macsec.h"
 #include "en/ptp.h"
 #include <net/ipv6.h>
+#include <linux/moduleparam.h>
+#include <linux/sched/clock.h>
+
+static unsigned int mlx5e_tx_cq_time_budget_us = 500;
+module_param_named(tx_cq_time_budget_us, mlx5e_tx_cq_time_budget_us, uint, 0644);
+MODULE_PARM_DESC(tx_cq_time_budget_us,
+		 "Max microseconds one TX CQ poll may spend before yielding (0 = unbounded)");
 
 static void mlx5e_dma_unmap_wqe_err(struct mlx5e_txqsq *sq, u8 num_dma)
 {
@@ -760,9 +767,12 @@ void mlx5e_txqsq_wake(struct mlx5e_txqsq *sq)
 bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
 {
 	struct mlx5e_sq_stats *stats;
+	bool time_exceeded = false;
+	u64 time_budget_end = 0;
 	struct mlx5e_txqsq *sq;
 	struct mlx5_cqe64 *cqe;
 	u32 dma_fifo_cc;
+	u32 budget_us;
 	u32 nbytes;
 	u16 npkts;
 	u16 sqcc;
@@ -790,6 +800,10 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
 	/* avoid dirtying sq cache line every cqe */
 	dma_fifo_cc = sq->dma_fifo_cc;
 
+	budget_us = READ_ONCE(mlx5e_tx_cq_time_budget_us);
+	if (budget_us)
+		time_budget_end = local_clock() + (u64)budget_us * NSEC_PER_USEC;
+
 	i = 0;
 	do {
 		struct mlx5e_tx_wqe_info *wi;
@@ -842,8 +856,19 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
 			stats->cqe_err++;
 		}
 
+		/* Check between CQEs only (sqcc/dma_fifo_cc must advance together). */
+		if (unlikely(time_budget_end && (i & 7) == 7 &&
+			     local_clock() >= time_budget_end)) {
+			time_exceeded = true;
+			i++;
+			break;
+		}
+
 	} while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq)));
 
+	if (unlikely(time_exceeded))
+		stats->time_budget_exit++;
+
 	stats->cqes += i;
 
 	mlx5_cqwq_update_db_record(&cq->wq);
@@ -858,7 +883,7 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
 
 	mlx5e_txqsq_wake(sq);
 
-	return (i == MLX5E_TX_CQ_POLL_BUDGET);
+	return time_exceeded || (i == MLX5E_TX_CQ_POLL_BUDGET);
 }
 
 static void mlx5e_tx_wi_kfree_fifo_skbs(struct mlx5e_txqsq *sq, struct mlx5e_tx_wqe_info *wi)
@@ -879,6 +904,8 @@ void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq)
 	dma_fifo_cc = sq->dma_fifo_cc;
 
 	while (sqcc != sq->pc) {
+		cond_resched();
+
 		ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc);
 		wi = &sq->db.wqe_info[ci];
 
---
base-commit: 08bc5b2636afcbadc31bb17243eec094e048bd79
change-id: 20260702-mlx5e-tx-cq-time-budget-02cccf37bf54

Best regards,
--  
Jose Fernandez (Anthropic) [off-list ref]
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help