Re: [net-next] net/smc: transition to RDMA core CQ pooling
From: "D. Wythe" <alibuda@linux.alibaba.com >
Date: 2026-02-27 09:33:57
Also in:
linux-rdma, linux-s390, lkml
On Thu, Feb 26, 2026 at 07:53:00PM -0800, Jakub Kicinski wrote:
This is an AI-generated review of your patch. The human sending this email has considered the AI review valid, or at least plausible. For local reproduction steps see https://netdev-ai.bots.linux.dev/ai-local.html --- net/smc: transition to RDMA core CQ pooling This commit refactors SMC-R completion handling to use the ib_cqe API and RDMA core CQ pooling, replacing the previous per-device global CQs and manual tasklet polling. The refactoring aims to eliminate scalability bottlenecks by enabling multi-CQ parallelization and O(1) context retrieval.quoted
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 5feafa98ab1a..4a4924c5cade 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c[ ... ]quoted
@@ -63,55 +60,52 @@ void smc_wr_tx_wait_no_pending_sends(struct smc_link *link) wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link)); } -static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id) +static void smc_wr_tx_rdma_process_cqe(struct ib_cq *cq, struct ib_wc *wc) { - u32 i; + struct smc_link *link = wc->qp->qp_context; - for (i = 0; i < link->wr_tx_cnt; i++) { - if (link->wr_tx_pends[i].wr_id == wr_id) - return i; - } - return link->wr_tx_cnt; + /* terminate link */ + if (wc->status) + smcr_link_down_cond_sched(link); +} + +static void smc_wr_reg_process_cqe(struct ib_cq *cq, struct ib_wc *wc) +{ + struct smc_link *link = wc->qp->qp_context; + + if (wc->status) + link->wr_reg_state = FAILED; + else + link->wr_reg_state = CONFIRMED; + smc_wr_wakeup_reg_wait(link); } -static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) +static void smc_wr_tx_process_cqe(struct ib_cq *cq, struct ib_wc *wc) { - struct smc_wr_tx_pend pnd_snd; + struct smc_wr_tx_pend *tx_pend, pnd_snd; + struct smc_ib_send_wr *send_wr; struct smc_link *link; u32 pnd_snd_idx; link = wc->qp->qp_context; - if (wc->opcode == IB_WC_REG_MR) { - if (wc->status) - link->wr_reg_state = FAILED; - else - link->wr_reg_state = CONFIRMED; - smc_wr_wakeup_reg_wait(link); - return; - } + send_wr = container_of(wc->wr_cqe, struct smc_ib_send_wr, cqe); + pnd_snd_idx = send_wr->idx; + + tx_pend = (pnd_snd_idx == link->wr_tx_cnt) ? link->wr_tx_v2_pend : + &link->wr_tx_pends[pnd_snd_idx]; + + tx_pend->wc_status = wc->status; + memcpy(&pnd_snd, tx_pend, sizeof(pnd_snd)); + /* clear the full struct smc_wr_tx_pend including .priv */ + memset(tx_pend, 0, sizeof(*tx_pend)); - pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id); if (pnd_snd_idx == link->wr_tx_cnt) { - if (link->lgr->smc_version != SMC_V2 || - link->wr_tx_v2_pend->wr_id != wc->wr_id) - return; - link->wr_tx_v2_pend->wc_status = wc->status; - memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd)); - /* clear the full struct smc_wr_tx_pend including .priv */ - memset(link->wr_tx_v2_pend, 0, - sizeof(*link->wr_tx_v2_pend)); memset(link->lgr->wr_tx_buf_v2, 0, sizeof(*link->lgr->wr_tx_buf_v2)); } else { - link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; - if (link->wr_tx_pends[pnd_snd_idx].compl_requested) + if (tx_pend->compl_requested)^^^^^^ Does this read the correct field? The code above memsets tx_pend to zero, so compl_requested is always 0 and complete() is never called. The refactoring created a local copy pnd_snd before the memset: memcpy(&pnd_snd, tx_pend, sizeof(pnd_snd)); memset(tx_pend, 0, sizeof(*tx_pend)); But then it reads from the zeroed tx_pend instead of pnd_snd. Should this be pnd_snd.compl_requested instead? This affects smc_wr_tx_send_wait(), which sets compl_requested and waits for the completion that never arrives, causing LLC message timeouts.
Nice catch! It should indeed be pnd_snd.compl_requested here since tx_pend has been zeroed. Will fix in v2.
quoted
complete(&link->wr_tx_compl[pnd_snd_idx]); - memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], - sizeof(pnd_snd)); - /* clear the full struct smc_wr_tx_pend including .priv */ - memset(&link->wr_tx_pends[pnd_snd_idx], 0, - sizeof(link->wr_tx_pends[pnd_snd_idx])); memset(&link->wr_tx_bufs[pnd_snd_idx], 0, sizeof(link->wr_tx_bufs[pnd_snd_idx])); if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))[ ... ]quoted
@@ -550,7 +472,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) lnk->wr_tx_cnt = min_t(size_t, lnk->max_send_wr, lnk->qp_attr.cap.max_send_wr); lnk->wr_rx_cnt = min_t(size_t, lnk->max_recv_wr, - lnk->qp_attr.cap.max_recv_wr); + lnk->qp_attr.cap.max_recv_wr - 1); /* -1 for ib_draib_rq() */^^^^^^^^^^^^^^ This isn't a bug, but there's a typo in the comment: ib_draib_rq() should be ib_drain_rq().
Will fix in v2. Thanks. D. Wythe
quoted
}[ ... ] -- pw-bot: cr