Thread (3 messages) 3 messages, 2 authors, 2026-02-27

Re: [net-next] net/smc: transition to RDMA core CQ pooling

From: "D. Wythe" <alibuda@linux.alibaba.com >
Date: 2026-02-27 09:33:57
Also in: linux-rdma, linux-s390, lkml

On Thu, Feb 26, 2026 at 07:53:00PM -0800, Jakub Kicinski wrote:
This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.

For local reproduction steps see https://netdev-ai.bots.linux.dev/ai-local.html
---
net/smc: transition to RDMA core CQ pooling

This commit refactors SMC-R completion handling to use the ib_cqe API
and RDMA core CQ pooling, replacing the previous per-device global CQs
and manual tasklet polling. The refactoring aims to eliminate scalability
bottlenecks by enabling multi-CQ parallelization and O(1) context retrieval.
quoted
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 5feafa98ab1a..4a4924c5cade 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
[ ... ]
quoted
@@ -63,55 +60,52 @@ void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
 	wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
 }

-static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
+static void smc_wr_tx_rdma_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
 {
-	u32 i;
+	struct smc_link *link = wc->qp->qp_context;

-	for (i = 0; i < link->wr_tx_cnt; i++) {
-		if (link->wr_tx_pends[i].wr_id == wr_id)
-			return i;
-	}
-	return link->wr_tx_cnt;
+	/* terminate link */
+	if (wc->status)
+		smcr_link_down_cond_sched(link);
+}
+
+static void smc_wr_reg_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct smc_link *link = wc->qp->qp_context;
+
+	if (wc->status)
+		link->wr_reg_state = FAILED;
+	else
+		link->wr_reg_state = CONFIRMED;
+	smc_wr_wakeup_reg_wait(link);
 }

-static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
+static void smc_wr_tx_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
 {
-	struct smc_wr_tx_pend pnd_snd;
+	struct smc_wr_tx_pend *tx_pend, pnd_snd;
+	struct smc_ib_send_wr *send_wr;
 	struct smc_link *link;
 	u32 pnd_snd_idx;

 	link = wc->qp->qp_context;

-	if (wc->opcode == IB_WC_REG_MR) {
-		if (wc->status)
-			link->wr_reg_state = FAILED;
-		else
-			link->wr_reg_state = CONFIRMED;
-		smc_wr_wakeup_reg_wait(link);
-		return;
-	}
+	send_wr = container_of(wc->wr_cqe, struct smc_ib_send_wr, cqe);
+	pnd_snd_idx = send_wr->idx;
+
+	tx_pend = (pnd_snd_idx == link->wr_tx_cnt) ? link->wr_tx_v2_pend :
+		&link->wr_tx_pends[pnd_snd_idx];
+
+	tx_pend->wc_status = wc->status;
+	memcpy(&pnd_snd, tx_pend, sizeof(pnd_snd));
+	/* clear the full struct smc_wr_tx_pend including .priv */
+	memset(tx_pend, 0, sizeof(*tx_pend));

-	pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
 	if (pnd_snd_idx == link->wr_tx_cnt) {
-		if (link->lgr->smc_version != SMC_V2 ||
-		    link->wr_tx_v2_pend->wr_id != wc->wr_id)
-			return;
-		link->wr_tx_v2_pend->wc_status = wc->status;
-		memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
-		/* clear the full struct smc_wr_tx_pend including .priv */
-		memset(link->wr_tx_v2_pend, 0,
-		       sizeof(*link->wr_tx_v2_pend));
 		memset(link->lgr->wr_tx_buf_v2, 0,
 		       sizeof(*link->lgr->wr_tx_buf_v2));
 	} else {
-		link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
-		if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
+		if (tx_pend->compl_requested)
                    ^^^^^^

Does this read the correct field? The code above memsets tx_pend to zero,
so compl_requested is always 0 and complete() is never called.

The refactoring created a local copy pnd_snd before the memset:

memcpy(&pnd_snd, tx_pend, sizeof(pnd_snd));
memset(tx_pend, 0, sizeof(*tx_pend));

But then it reads from the zeroed tx_pend instead of pnd_snd. Should this be
pnd_snd.compl_requested instead?

This affects smc_wr_tx_send_wait(), which sets compl_requested and waits for
the completion that never arrives, causing LLC message timeouts.
Nice catch! It should indeed be pnd_snd.compl_requested here since tx_pend has been zeroed.
Will fix in v2.
quoted
 			complete(&link->wr_tx_compl[pnd_snd_idx]);
-		memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
-		       sizeof(pnd_snd));
-		/* clear the full struct smc_wr_tx_pend including .priv */
-		memset(&link->wr_tx_pends[pnd_snd_idx], 0,
-		       sizeof(link->wr_tx_pends[pnd_snd_idx]));
 		memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
 		       sizeof(link->wr_tx_bufs[pnd_snd_idx]));
 		if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
[ ... ]
quoted
@@ -550,7 +472,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
 	lnk->wr_tx_cnt = min_t(size_t, lnk->max_send_wr,
 			       lnk->qp_attr.cap.max_send_wr);
 	lnk->wr_rx_cnt = min_t(size_t, lnk->max_recv_wr,
-			       lnk->qp_attr.cap.max_recv_wr);
+			       lnk->qp_attr.cap.max_recv_wr - 1);	/* -1 for ib_draib_rq() */
                                                                                        ^^^^^^^^^^^^^^
This isn't a bug, but there's a typo in the comment: ib_draib_rq() should be
ib_drain_rq().
Will fix in v2. Thanks.

D. Wythe
quoted
 }
[ ... ]
-- 
pw-bot: cr
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help