[PATCH net-next v11 05/12] enic: add admin CQ service with MSI-X interrupt and workqueue polling
From: Satish Kharat <satishkh@cisco.com>
Date: 2026-07-03 18:09:05
Also in:
lkml
Subsystem:
cisco vic ethernet nic driver, networking drivers, the rest · Maintainers:
Satish Kharat, Andrew Lunn, "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Linus Torvalds
Add completion queue (CQ) service for the admin channel work queue (WQ)
and receive queue (RQ), driven by a dedicated MSI-X interrupt and a
workqueue-based CQ poller.

The admin WQ CQ service advances the completion ring and returns the
number of descriptors consumed. The admin RQ CQ service does the same
for receive completions and copies each received message into a
preallocated buffer. Received messages are enqueued for deferred
dispatch by a separate work_struct so the CQ poller stays short.

When the MSI-X interrupt fires, the ISR schedules the CQ poll
work_struct. The work handler drains all pending completions, kicks
message dispatch if work was done, and returns credits to unmask the
interrupt.

The poll handler snapshots the pending credit count before draining the
CQ so it acknowledges exactly what the hardware reported for this
interrupt; any credits that accrue during draining are serviced by the
next interrupt. At least one credit is returned so the write always
carries a non-zero acknowledgment and re-arms the vector, since the
admin channel is not re-polled like the NAPI data path.

Log a rate-limited warning when admin RQ buffer refill fails so that
transient memory pressure is visible without flooding the log.

Signed-off-by: Satish Kharat <satishkh@cisco.com>
---
 drivers/net/ethernet/cisco/enic/enic.h       |   8 +
 drivers/net/ethernet/cisco/enic/enic_admin.c | 322 ++++++++++++++++++++++++++-
 drivers/net/ethernet/cisco/enic/enic_admin.h |  12 +
 3 files changed, 338 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/cisco/enic/enic.h b/drivers/net/ethernet/cisco/enic/enic.h
index 398227448b37..401123e6df1d 100644
--- a/drivers/net/ethernet/cisco/enic/enic.h
+++ b/drivers/net/ethernet/cisco/enic/enic.h@@ -301,6 +301,14 @@ struct enic { struct vnic_rq admin_rq; struct vnic_cq admin_cq[2]; struct vnic_intr admin_intr; + struct work_struct admin_poll_work; + unsigned int admin_intr_index; + struct work_struct admin_msg_work; + spinlock_t admin_msg_lock; /* protects admin_msg_list */ + struct list_head admin_msg_list; + unsigned int admin_msg_count; /* current depth of admin_msg_list */ + void (*admin_rq_handler)(struct enic *enic, void *buf, + unsigned int len); }; static inline struct net_device *vnic_get_netdev(struct vnic_dev *vdev)
diff --git a/drivers/net/ethernet/cisco/enic/enic_admin.c b/drivers/net/ethernet/cisco/enic/enic_admin.c
index b2be42092106..c9b19ed002fb 100644
--- a/drivers/net/ethernet/cisco/enic/enic_admin.c
+++ b/drivers/net/ethernet/cisco/enic/enic_admin.c@@ -4,6 +4,7 @@ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/dma-mapping.h> +#include <linux/interrupt.h> #include "vnic_dev.h" #include "vnic_wq.h"
@@ -15,6 +16,7 @@ #include "enic.h" #include "enic_admin.h" #include "cq_desc.h" +#include "cq_enet_desc.h" #include "wq_enet_desc.h" #include "rq_enet_desc.h"
@@ -94,6 +96,342 @@ static void enic_admin_rq_drain(struct enic *enic)
 	vnic_rq_clean(&enic->admin_rq, enic_admin_rq_buf_clean);
 }
 
+/* Return the color bit of a completion descriptor. The type/color byte is
+ * read from the last byte of the descriptor, so this works for any
+ * descriptor size.
+ */
+static unsigned int enic_admin_cq_color(void *cq_desc, unsigned int desc_size)
+{
+	u8 type_color = *((u8 *)cq_desc + desc_size - 1);
+
+	return (type_color >> CQ_DESC_COLOR_SHIFT) & CQ_DESC_COLOR_MASK;
+}
+
+/* Advance admin_cq[0] past every completion whose color differs from
+ * cq->last_color and return how many were consumed.
+ */
+unsigned int enic_admin_wq_cq_service(struct enic *enic)
+{
+	struct vnic_cq *cq = &enic->admin_cq[0];
+	unsigned int work = 0;
+	void *desc;
+
+	desc = vnic_cq_to_clean(cq);
+	while (enic_admin_cq_color(desc, cq->ring.desc_size) !=
+	       cq->last_color) {
+		vnic_cq_inc_to_clean(cq);
+		work++;
+		desc = vnic_cq_to_clean(cq);
+	}
+
+	return work;
+}
+
+/* Upper bound on pending admin messages. A buggy or hostile VF could flood
+ * the PF admin channel faster than admin_msg_work drains it; cap the backlog
+ * so a guest cannot drive the host out of memory.
+ */
+#define ENIC_ADMIN_MSG_MAX 256
+
+/* Report whether the deferred-dispatch backlog has reached its cap. */
+static bool enic_admin_msg_backlog_full(struct enic *enic)
+{
+	bool full;
+
+	spin_lock_bh(&enic->admin_msg_lock);
+	full = enic->admin_msg_count >= ENIC_ADMIN_MSG_MAX;
+	spin_unlock_bh(&enic->admin_msg_lock);
+
+	return full;
+}
+
+/* Copy a received admin message and queue it for admin_msg_work.
+ *
+ * The message is dropped when the backlog already holds ENIC_ADMIN_MSG_MAX
+ * entries or when the allocation fails.
+ */
+static void enic_admin_msg_enqueue(struct enic *enic, void *buf,
+				   unsigned int len)
+{
+	struct enic_admin_msg *msg;
+	bool queued = false;
+
+	/* Check the cap before allocating so a flood against a full backlog
+	 * does not cost an allocation and a copy per dropped message. The cap
+	 * is enforced again under the lock when the message is inserted.
+	 */
+	if (enic_admin_msg_backlog_full(enic))
+		goto drop;
+
+	msg = kmalloc(struct_size(msg, data, len), GFP_KERNEL);
+	if (!msg)
+		return;
+
+	msg->len = len;
+	memcpy(msg->data, buf, len);
+
+	/* Take the lock with _bh like the other users of admin_msg_lock. */
+	spin_lock_bh(&enic->admin_msg_lock);
+	if (enic->admin_msg_count < ENIC_ADMIN_MSG_MAX) {
+		list_add_tail(&msg->list, &enic->admin_msg_list);
+		enic->admin_msg_count++;
+		queued = true;
+	}
+	spin_unlock_bh(&enic->admin_msg_lock);
+	if (queued)
+		return;
+
+	kfree(msg);
+drop:
+	if (net_ratelimit())
+		netdev_warn(enic->netdev,
+			    "admin msg backlog full (%u); dropping\n",
+			    ENIC_ADMIN_MSG_MAX);
+}
+
+/* Service admin_cq[1]: for every completed admin RQ descriptor, queue the
+ * received message for deferred dispatch, release the RX buffer and advance
+ * both rings, then refill the RQ.
+ *
+ * Returns the number of completions consumed, including dropped messages.
+ */
+unsigned int enic_admin_rq_cq_service(struct enic *enic)
+{
+	struct vnic_cq *cq = &enic->admin_cq[1];
+	struct vnic_rq *rq = &enic->admin_rq;
+	struct cq_enet_rq_desc *rq_desc;
+	struct vnic_rq_buf *buf;
+	u16 bwf, bytes_written;
+	unsigned int work = 0;
+	void *desc;
+
+	/* The admin RQ and its CQ form a single in-order channel: firmware
+	 * posts exactly one CQE per consumed RQ descriptor, in submission
+	 * order. Each CQE therefore pairs with rq->to_clean below without a
+	 * completed_index cross-check, mirroring the in-order assumption of
+	 * the main enic RX path.
+	 */
+	desc = vnic_cq_to_clean(cq);
+	while (enic_admin_cq_color(desc, cq->ring.desc_size) !=
+	       cq->last_color) {
+		/* Ensure DMA descriptor fields are read after
+		 * the color/valid check. dma_rmb() is the
+		 * correct barrier for DMA-written descriptors.
+		 */
+		dma_rmb();
+		buf = rq->to_clean;
+
+		/* Decode the actual number of bytes hardware wrote into
+		 * the RX buffer. buf->len is the static allocation size
+		 * (ENIC_ADMIN_BUF_SIZE) and would expose uninitialised
+		 * heap memory beyond the real payload. bytes_written_flags
+		 * is at the same offset in every cq_enet_rq_desc[_32|_64]
+		 * variant.
+		 */
+		rq_desc = desc;
+		bwf = le16_to_cpu(rq_desc->bytes_written_flags);
+		bytes_written = bwf & CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
+		if (bytes_written > buf->len) {
+			/* Never copy past the posted buffer. Warn like the
+			 * other drop paths so the drop is not silent.
+			 */
+			netdev_warn_once(enic->netdev,
+					 "admin RQ: oversized completion dropped\n");
+			goto next_desc;
+		}
+
+		dma_sync_single_for_cpu(&enic->pdev->dev,
+					buf->dma_addr, buf->len,
+					DMA_FROM_DEVICE);
+
+		/* Drop on hardware error indications. Admin messages
+		 * are internal to the VIC, not received over the wire.
+		 * Firmware sets TRUNCATED when the message does not fit
+		 * in the posted buffer, and FCS_OK is always set on
+		 * healthy admin completions.
+		 */
+		if (bwf & CQ_ENET_RQ_DESC_FLAGS_TRUNCATED) {
+			netdev_warn_once(enic->netdev,
+					 "admin RQ: truncated message dropped\n");
+			goto next_desc;
+		}
+		if (!(rq_desc->flags & CQ_ENET_RQ_DESC_FLAGS_FCS_OK)) {
+			netdev_warn_once(enic->netdev,
+					 "admin RQ: bad FCS, dropping message\n");
+			goto next_desc;
+		}
+
+		enic_admin_msg_enqueue(enic, buf->os_buf, bytes_written);
+
+next_desc:
+		enic_admin_rq_buf_clean(rq, rq->to_clean);
+		rq->to_clean = rq->to_clean->next;
+		rq->ring.desc_avail++;
+
+		vnic_cq_inc_to_clean(cq);
+		work++;
+		desc = vnic_cq_to_clean(cq);
+	}
+
+	if (enic_admin_rq_fill(enic, GFP_KERNEL) && net_ratelimit())
+		netdev_warn(enic->netdev,
+			    "admin RQ refill failed\n");
+
+	return work;
+}
+
+/* MSI-X handler for the admin vector: defer all work to admin_poll_work. */
+static irqreturn_t enic_admin_isr_msix(int irq, void *data)
+{
+	struct enic *enic = data;
+
+	schedule_work(&enic->admin_poll_work);
+
+	return IRQ_HANDLED;
+}
+
+/* Dispatch queued admin messages to enic->admin_rq_handler.
+ *
+ * The backlog is moved to a private list under the lock so the handler is
+ * called without admin_msg_lock held. Messages are freed after dispatch, or
+ * freed undelivered when no handler is registered.
+ */
+static void enic_admin_msg_work_handler(struct work_struct *work)
+{
+	struct enic *enic = container_of(work, struct enic, admin_msg_work);
+	struct enic_admin_msg *msg, *tmp;
+	LIST_HEAD(local_list);
+
+	spin_lock_bh(&enic->admin_msg_lock);
+	list_splice_init(&enic->admin_msg_list, &local_list);
+	enic->admin_msg_count = 0;
+	spin_unlock_bh(&enic->admin_msg_lock);
+
+	list_for_each_entry_safe(msg, tmp, &local_list, list) {
+		if (enic->admin_rq_handler)
+			enic->admin_rq_handler(enic, msg->data, msg->len);
+		list_del(&msg->list);
+		kfree(msg);
+	}
+}
+
+/* Work handler scheduled by the admin MSI-X ISR: drain the admin RQ CQ,
+ * kick message dispatch if anything was consumed, then return credits and
+ * unmask the vector.
+ */
+static void enic_admin_poll_work_handler(struct work_struct *work)
+{
+	struct enic *enic = container_of(work, struct enic, admin_poll_work);
+	unsigned int credits;
+	unsigned int rq_work;
+
+	/* Snapshot the pending credit count before draining so we acknowledge
+	 * exactly what the hardware reported for this interrupt. Credits that
+	 * accrue while enic_admin_rq_cq_service() runs are left for the next
+	 * interrupt, which is harmless on this low-rate control path.
+	 */
+	credits = vnic_intr_credits(&enic->admin_intr);
+
+	rq_work = enic_admin_rq_cq_service(enic);
+
+	if (rq_work > 0)
+		schedule_work(&enic->admin_msg_work);
+
+	/* Acknowledge the snapshotted credits and unmask the vector. Unlike
+	 * the NAPI data path, the admin channel is not re-polled, so the vector
+	 * must be re-armed here to receive the next completion. Return at least
+	 * one credit so the write always carries a non-zero acknowledgment and
+	 * re-arms the vector even on a spurious wakeup that found no credits.
+	 */
+	vnic_intr_return_credits(&enic->admin_intr,
+				 credits ?: 1,
+				 1 /* unmask */, 0);
+}
+
+/* Allocate the admin interrupt control block and request its MSI-X IRQ.
+ *
+ * The admin vector uses index enic->intr_count. Returns 0 on success,
+ * -ENODEV when the device is not in MSI-X mode or no vector is available at
+ * that index, or the error from vnic_intr_alloc()/request_irq().
+ */
+static int enic_admin_setup_intr(struct enic *enic)
+{
+	unsigned int intr_index = enic->intr_count;
+	int err;
+
+	if (vnic_dev_get_intr_mode(enic->vdev) != VNIC_DEV_INTR_MODE_MSIX ||
+	    intr_index >= enic->intr_avail)
+		return -ENODEV;
+
+	/* The admin INTR uses a slot in the same RES_TYPE_INTR_CTRL
+	 * strided array of per-vector control blocks (mask, coalescing
+	 * timer, credit return) that the data-path IRQs occupy in BAR0.
+	 * vnic_intr_alloc() defaults to RES_TYPE_INTR_CTRL, which is what
+	 * we want here.
+	 */
+	err = vnic_intr_alloc(enic->vdev, &enic->admin_intr, intr_index);
+	if (err) {
+		netdev_warn(enic->netdev,
+			    "Failed to alloc admin intr at index %u: %d\n",
+			    intr_index, err);
+		return err;
+	}
+
+	enic->admin_intr_index = intr_index;
+
+	/* A V2 VF opens the admin channel during probe, before
+	 * register_netdev() resolves the "eth%d" name template, so using
+	 * netdev->name here would register the literal "eth%d-admin" in
+	 * /proc/interrupts. Use the already-stable PCI device name instead.
+	 */
+	snprintf(enic->msix[intr_index].devname,
+		 sizeof(enic->msix[intr_index].devname),
+		 "%s-admin", pci_name(enic->pdev));
+	enic->msix[intr_index].isr = enic_admin_isr_msix;
+	enic->msix[intr_index].devid = enic;
+
+	err = request_irq(enic->msix_entry[intr_index].vector,
+			  enic->msix[intr_index].isr, 0,
+			  enic->msix[intr_index].devname,
+			  enic->msix[intr_index].devid);
+	if (err) {
+		netdev_warn(enic->netdev,
+			    "Failed to request admin MSI-X irq: %d\n", err);
+		vnic_intr_free(&enic->admin_intr);
+		return err;
+	}
+
+	enic->msix[intr_index].requested = 1;
+
+	netdev_dbg(enic->netdev,
+		   "admin channel using MSI-X interrupt (index %u)\n",
+		   intr_index);
+
+	return 0;
+}
+
+/* Release the admin MSI-X IRQ and quiesce the CQ poll work.
+ *
+ * free_irq() comes first so the ISR can no longer schedule admin_poll_work
+ * once cancel_work_sync() has returned.
+ */
+static void enic_admin_teardown_intr(struct enic *enic)
+{
+	unsigned int intr_index = enic->admin_intr_index;
+
+	free_irq(enic->msix_entry[intr_index].vector,
+		 enic->msix[intr_index].devid);
+	cancel_work_sync(&enic->admin_poll_work);
+
+	/* enic_admin_poll_work_handler() unmasks the vector when it finishes,
+	 * so a handler that was still running can undo a mask applied by the
+	 * caller before teardown. Mask again now that the work is quiesced so
+	 * the vector is left masked.
+	 */
+	vnic_intr_mask(&enic->admin_intr);
+	enic->msix[intr_index].requested = 0;
+}
+
 static int enic_admin_qp_type_set(struct enic *enic, u32 enable)
 {
 	u64 a0 = QP_TYPE_ADMIN, a1 = enable;
@@ -173,6 +434,7 @@ static int enic_admin_alloc_resources(struct enic *enic) static void enic_admin_free_resources(struct enic *enic) { + vnic_intr_free(&enic->admin_intr); vnic_cq_free(&enic->admin_cq[1]); vnic_cq_free(&enic->admin_cq[0]); vnic_rq_free(&enic->admin_rq);
@@ -181,6 +443,8 @@ static void enic_admin_free_resources(struct enic *enic) static void enic_admin_init_resources(struct enic *enic) { + unsigned int intr_offset = enic->admin_intr_index; + vnic_wq_init(&enic->admin_wq, 0, 0, 0); /* cq_index, err_intr_enable, err_intr_offset */ vnic_rq_init(&enic->admin_rq,
@@ -189,20 +453,35 @@ static void enic_admin_init_resources(struct enic *enic) VNIC_CQ_FC_DISABLE, VNIC_CQ_COLOR_ENABLE, 0, 0, 1, /* cq_head, cq_tail, cq_tail_color */ - VNIC_CQ_INTR_DISABLE, + VNIC_CQ_INTR_DISABLE, /* polled synchronously by mbox send */ VNIC_CQ_ENTRY_ENABLE, VNIC_CQ_MSG_DISABLE, - 0, /* interrupt_offset */ + intr_offset, 0 /* cq_message_addr */); vnic_cq_init(&enic->admin_cq[1], VNIC_CQ_FC_DISABLE, VNIC_CQ_COLOR_ENABLE, 0, 0, 1, /* cq_head, cq_tail, cq_tail_color */ - VNIC_CQ_INTR_DISABLE, + VNIC_CQ_INTR_ENABLE, VNIC_CQ_ENTRY_ENABLE, VNIC_CQ_MSG_DISABLE, - 0, /* interrupt_offset */ + intr_offset, 0 /* cq_message_addr */); + vnic_intr_init(&enic->admin_intr, + 0, 0, 1); /* coalescing_timer, coalescing_type, mask_on_assertion */ +} + +static void enic_admin_msg_drain(struct enic *enic) +{ + struct enic_admin_msg *msg, *tmp; + + spin_lock_bh(&enic->admin_msg_lock); + list_for_each_entry_safe(msg, tmp, &enic->admin_msg_list, list) { + list_del(&msg->list); + kfree(msg); + } + enic->admin_msg_count = 0; + spin_unlock_bh(&enic->admin_msg_lock); } int enic_admin_channel_open(struct enic *enic)
@@ -220,6 +499,19 @@ int enic_admin_channel_open(struct enic *enic) return err; } + spin_lock_init(&enic->admin_msg_lock); + INIT_LIST_HEAD(&enic->admin_msg_list); + INIT_WORK(&enic->admin_msg_work, enic_admin_msg_work_handler); + INIT_WORK(&enic->admin_poll_work, enic_admin_poll_work_handler); + + err = enic_admin_setup_intr(enic); + if (err) { + netdev_err(enic->netdev, + "Admin channel requires MSI-X, SR-IOV unavailable: %d\n", + err); + goto free_resources; + } + enic_admin_init_resources(enic); vnic_wq_enable(&enic->admin_wq);
@@ -239,17 +531,31 @@ int enic_admin_channel_open(struct enic *enic) goto disable_queues; } + vnic_intr_unmask(&enic->admin_intr); + + netdev_dbg(enic->netdev, + "admin channel open: intr=%u wq_avail=%u rq_avail=%u cq0_color=%u cq1_color=%u\n", + enic->admin_intr_index, + vnic_wq_desc_avail(&enic->admin_wq), + vnic_rq_desc_avail(&enic->admin_rq), + enic->admin_cq[0].last_color, + enic->admin_cq[1].last_color); + enic->admin_chan_up = true; return 0; disable_queues: + enic_admin_teardown_intr(enic); enic_admin_qp_type_set(enic, QP_DISABLE); if (vnic_wq_disable(&enic->admin_wq)) netdev_warn(enic->netdev, "Failed to disable admin WQ\n"); if (vnic_rq_disable(&enic->admin_rq)) netdev_warn(enic->netdev, "Failed to disable admin RQ\n"); + cancel_work_sync(&enic->admin_msg_work); + enic_admin_msg_drain(enic); enic_admin_rq_drain(enic); +free_resources: enic_admin_free_resources(enic); return err; }
@@ -268,6 +574,13 @@ void enic_admin_channel_close(struct enic *enic) if (!enic->admin_chan_up) return; + netdev_dbg(enic->netdev, "admin channel close\n"); + + vnic_intr_mask(&enic->admin_intr); + enic_admin_teardown_intr(enic); + cancel_work_sync(&enic->admin_msg_work); + enic_admin_msg_drain(enic); + enic_admin_qp_type_set(enic, QP_DISABLE); err = vnic_wq_disable(&enic->admin_wq);
@@ -283,6 +596,7 @@ void enic_admin_channel_close(struct enic *enic) enic_admin_rq_drain(enic); vnic_cq_clean(&enic->admin_cq[0]); vnic_cq_clean(&enic->admin_cq[1]); + vnic_intr_clean(&enic->admin_intr); enic_admin_free_resources(enic); enic->admin_chan_up = false;
diff --git a/drivers/net/ethernet/cisco/enic/enic_admin.h b/drivers/net/ethernet/cisco/enic/enic_admin.h
index 569aadeb9312..62c80220b0ca 100644
--- a/drivers/net/ethernet/cisco/enic/enic_admin.h
+++ b/drivers/net/ethernet/cisco/enic/enic_admin.h@@ -9,7 +9,19 @@ struct enic; +/* Wrapper for received admin messages queued for deferred processing. + * The admin CQ poll work handler enqueues these; a separate work handler + * processes them where sleeping (mutex, GFP_KERNEL) is safe. + */ +struct enic_admin_msg { + struct list_head list; + unsigned int len; + u8 data[] __aligned(8); +}; + int enic_admin_channel_open(struct enic *enic); void enic_admin_channel_close(struct enic *enic); +unsigned int enic_admin_wq_cq_service(struct enic *enic); +unsigned int enic_admin_rq_cq_service(struct enic *enic); #endif /* _ENIC_ADMIN_H_ */
--
2.43.0