[PATCH net-next] net/mlx5e: bound TX CQ poll softirq residency with a time budget
From: Jose Fernandez (Anthropic) <hidden>
Date: 2026-07-03 01:37:25
Also in: linux-rdma, lkml
Subsystem: mellanox ethernet driver (mlx5e), mellanox mlx5 core vpi driver, networking drivers, the rest
Maintainers: Saeed Mahameed, Tariq Toukan, Mark Bloch, Leon Romanovsky, Andrew Lunn, "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Linus Torvalds
Under strict IOMMU invalidation (iommu.strict=1), each per-fragment DMA
unmap in the TX completion path issues a synchronous TLB invalidate and
waits for CMD_SYNC, spinning IRQ-off in the SMMU command queue. Under
cross-CPU command-queue contention this per-unmap cost inflates from
microseconds to hundreds of microseconds. mlx5e_poll_tx_cq()'s per-CQE
budget (128) does not bound time in this regime: one CQE can cover a
multi-WQE batch with many fragments, so a single poll invocation can
accumulate seconds of softirq residency and trip the soft-lockup
watchdog on arm64/SMMU-v3 systems.

Bound the invocation by time: check local_clock() every 8 CQEs against
a budget (default 500us; module parameter tx_cq_time_budget_us,
runtime-writable, 0 disables) and break out of the CQE loop when
exceeded, reporting busy exactly like the existing CQE-budget
exhaustion path so NAPI keeps the poll scheduled. Remaining completions
are delayed by one reschedule, never stranded. The inner WQE walk is
never interrupted mid-CQE (sqcc/dma_fifo_cc accounting). A new ethtool
statistic (tx_time_budget_exit) counts early exits.

Also add cond_resched() in mlx5e_free_txqsq_descs(): the teardown path
walks the same per-fragment unmaps in process context.

Tested on arm64 with SMMU-v3 under strict mode: throughput cost is
within run-to-run variance at every measured load shape; under active
invalidation-storm contention, the bounded poll measures 35-50% faster
than unbounded (bounded polling yields cores back to the transmit
path).

Assisted-by: Claude:unspecified
Signed-off-by: Jose Fernandez (Anthropic) <redacted>
Reviewed-by: Ben Cressey <redacted>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c |  5 ++++
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |  2 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c    | 29 +++++++++++++++++++++-
 3 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 7f33261ba655..b940280af19d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c@@ -171,6 +171,7 @@ static const struct counter_desc sw_stats_desc[] = { { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqes) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_wake) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqe_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_time_budget_exit) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_xmit) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_mpwqe) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_inlnw) },
@@ -426,6 +427,7 @@ static void mlx5e_stats_grp_sw_update_stats_sq(struct mlx5e_sw_stats *s, s->tx_queue_wake += sq_stats->wake; s->tx_queue_dropped += sq_stats->dropped; s->tx_cqe_err += sq_stats->cqe_err; + s->tx_time_budget_exit += sq_stats->time_budget_exit; s->tx_recover += sq_stats->recover; s->tx_xmit_more += sq_stats->xmit_more; s->tx_csum_partial_inner += sq_stats->csum_partial_inner;
@@ -2323,6 +2325,7 @@ static const struct counter_desc sq_stats_desc[] = { { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqes) }, { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, wake) }, { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqe_err) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, time_budget_exit) }, }; static const struct counter_desc rq_xdpsq_stats_desc[] = {
@@ -2399,6 +2402,7 @@ static const struct counter_desc ptp_sq_stats_desc[] = { { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, cqes) }, { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, wake) }, { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, cqe_err) }, + { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, time_budget_exit) }, }; static const struct counter_desc ptp_ch_stats_desc[] = {
@@ -2476,6 +2480,7 @@ static const struct counter_desc qos_sq_stats_desc[] = { { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, cqes) }, { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, wake) }, { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, cqe_err) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, time_budget_exit) }, }; #define NUM_RQ_STATS ARRAY_SIZE(rq_stats_desc)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 09f155acb461..5ba954f42ccd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h@@ -187,6 +187,7 @@ struct mlx5e_sw_stats { u64 tx_cqes; u64 tx_queue_wake; u64 tx_cqe_err; + u64 tx_time_budget_exit; u64 tx_xdp_xmit; u64 tx_xdp_mpwqe; u64 tx_xdp_inlnw;
@@ -445,6 +446,7 @@ struct mlx5e_sq_stats { u64 cqes ____cacheline_aligned_in_smp; u64 wake; u64 cqe_err; + u64 time_budget_exit; }; struct mlx5e_xdpsq_stats {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 0b5e600e4a6a..994df912b765 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c@@ -43,6 +43,13 @@ #include "en_accel/macsec.h" #include "en/ptp.h" #include <net/ipv6.h> +#include <linux/moduleparam.h> +#include <linux/sched/clock.h> + +static unsigned int mlx5e_tx_cq_time_budget_us = 500; +module_param_named(tx_cq_time_budget_us, mlx5e_tx_cq_time_budget_us, uint, 0644); +MODULE_PARM_DESC(tx_cq_time_budget_us, + "Max microseconds one TX CQ poll may spend before yielding (0 = unbounded)"); static void mlx5e_dma_unmap_wqe_err(struct mlx5e_txqsq *sq, u8 num_dma) {
@@ -760,9 +767,12 @@ void mlx5e_txqsq_wake(struct mlx5e_txqsq *sq) bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) { struct mlx5e_sq_stats *stats; + bool time_exceeded = false; + u64 time_budget_end = 0; struct mlx5e_txqsq *sq; struct mlx5_cqe64 *cqe; u32 dma_fifo_cc; + u32 budget_us; u32 nbytes; u16 npkts; u16 sqcc;
@@ -790,6 +800,10 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) /* avoid dirtying sq cache line every cqe */ dma_fifo_cc = sq->dma_fifo_cc; + budget_us = READ_ONCE(mlx5e_tx_cq_time_budget_us); + if (budget_us) + time_budget_end = local_clock() + (u64)budget_us * NSEC_PER_USEC; + i = 0; do { struct mlx5e_tx_wqe_info *wi;
@@ -842,8 +856,19 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) stats->cqe_err++; } + /* Check between CQEs only (sqcc/dma_fifo_cc must advance together). */ + if (unlikely(time_budget_end && (i & 7) == 7 && + local_clock() >= time_budget_end)) { + time_exceeded = true; + i++; + break; + } + } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); + if (unlikely(time_exceeded)) + stats->time_budget_exit++; + stats->cqes += i; mlx5_cqwq_update_db_record(&cq->wq);
@@ -858,7 +883,7 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) mlx5e_txqsq_wake(sq); - return (i == MLX5E_TX_CQ_POLL_BUDGET); + return time_exceeded || (i == MLX5E_TX_CQ_POLL_BUDGET); } static void mlx5e_tx_wi_kfree_fifo_skbs(struct mlx5e_txqsq *sq, struct mlx5e_tx_wqe_info *wi)
@@ -879,6 +904,8 @@ void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq) dma_fifo_cc = sq->dma_fifo_cc; while (sqcc != sq->pc) { + cond_resched(); + ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); wi = &sq->db.wqe_info[ci];
---
base-commit: 08bc5b2636afcbadc31bb17243eec094e048bd79
change-id: 20260702-mlx5e-tx-cq-time-budget-02cccf37bf54

Best regards,
--
Jose Fernandez (Anthropic) [off-list ref]