Re: [Patch v7 12/12] RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter
From: Paolo Abeni <pabeni@redhat.com>
Date: 2022-10-18 10:12:24
Also in:
linux-hyperv, linux-rdma, lkml
On Mon, 2022-10-17 at 12:20 -0700, longli@linuxonhyperv.com wrote:
quoted hunk ↗ jump to hunk
diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c new file mode 100644 index 000000000000..57e5f9dca454 --- /dev/null +++ b/drivers/infiniband/hw/mana/main.c
[...]
+static int mana_gd_destroy_doorbell_page(struct gdma_context *gc,
+ int doorbell_page)
+{
+ struct gdma_destroy_resource_range_req req = {};
+ struct gdma_resp_hdr resp = {};
+ int err;
+
+ mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_RESOURCE_RANGE,
+ sizeof(req), sizeof(resp));
+
+ req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE;
+ req.num_resources = 1;
+ req.allocated_resources = doorbell_page;
+
+ err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+ if (err || resp.status) {
+ dev_err(gc->dev,
+ "Failed to destroy doorbell page: ret %d, 0x%x\n",
+ err, resp.status);
+ return err ? err : -EPROTO;Minor nit: the preferred style is: return err ?: -EPROTO; a few other occurences below.
+ } + + return 0; +}
[...]
+int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
+ mana_handle_t *gdma_region)
+{
+ struct gdma_dma_region_add_pages_req *add_req = NULL;
+ struct gdma_create_dma_region_resp create_resp = {};
+ struct gdma_create_dma_region_req *create_req;
+ size_t num_pages_cur, num_pages_to_handle;
+ unsigned int create_req_msg_size;
+ struct hw_channel_context *hwc;
+ struct ib_block_iter biter;
+ size_t max_pgs_create_cmd;
+ struct gdma_context *gc;
+ size_t num_pages_total;
+ struct gdma_dev *mdev;
+ unsigned long page_sz;
+ void *request_buf;
+ unsigned int i;
+ int err;
+
+ mdev = dev->gdma_dev;
+ gc = mdev->gdma_context;
+ hwc = gc->hwc.driver_data;
+
+ /* Hardware requires dma region to align to chosen page size */
+ page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0);
+ if (!page_sz) {
+ ibdev_dbg(&dev->ib_dev, "failed to find page size.\n");
+ return -ENOMEM;
+ }
+ num_pages_total = ib_umem_num_dma_blocks(umem, page_sz);
+
+ max_pgs_create_cmd =
+ (hwc->max_req_msg_size - sizeof(*create_req)) / sizeof(u64);
+ num_pages_to_handle =
+ min_t(size_t, num_pages_total, max_pgs_create_cmd);
+ create_req_msg_size =
+ struct_size(create_req, page_addr_list, num_pages_to_handle);
+
+ request_buf = kzalloc(hwc->max_req_msg_size, GFP_KERNEL);
+ if (!request_buf)
+ return -ENOMEM;
+
+ create_req = request_buf;
+ mana_gd_init_req_hdr(&create_req->hdr, GDMA_CREATE_DMA_REGION,
+ create_req_msg_size, sizeof(create_resp));
+
+ create_req->length = umem->length;
+ create_req->offset_in_page = umem->address & (page_sz - 1);
+ create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT;
+ create_req->page_count = num_pages_total;
+ create_req->page_addr_list_len = num_pages_to_handle;
+
+ ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n",
+ umem->length, num_pages_total);
+
+ ibdev_dbg(&dev->ib_dev, "page_sz %lu offset_in_page %u\n",
+ page_sz, create_req->offset_in_page);
+
+ ibdev_dbg(&dev->ib_dev, "num_pages_to_handle %lu, gdma_page_type %u",
+ num_pages_to_handle, create_req->gdma_page_type);
+
+ __rdma_umem_block_iter_start(&biter, umem, page_sz);
+
+ for (i = 0; i < num_pages_to_handle; ++i) {
+ dma_addr_t cur_addr;
+
+ __rdma_block_iter_next(&biter);
+ cur_addr = rdma_block_iter_dma_address(&biter);
+
+ create_req->page_addr_list[i] = cur_addr;
+ }
+
+ err = mana_gd_send_request(gc, create_req_msg_size, create_req,
+ sizeof(create_resp), &create_resp);
+ if (err || create_resp.hdr.status) {
+ ibdev_dbg(&dev->ib_dev,
+ "Failed to create DMA region: %d, 0x%x\n", err,
+ create_resp.hdr.status);
+ if (!err)
+ err = -EPROTO;
+
+ kfree(request_buf);
+ return err;Minor nit: you can avoid a little code doplication replacing the above 2 lines with: goto out; and ...
+ }
+
+ *gdma_region = create_resp.dma_region_handle;
+ ibdev_dbg(&dev->ib_dev, "Created DMA region with handle 0x%llx\n",
+ *gdma_region);
+
+ num_pages_cur = num_pages_to_handle;
+
+ if (num_pages_cur < num_pages_total) {
+ unsigned int add_req_msg_size;
+ size_t max_pgs_add_cmd =
+ (hwc->max_req_msg_size - sizeof(*add_req)) /
+ sizeof(u64);
+
+ num_pages_to_handle =
+ min_t(size_t, num_pages_total - num_pages_cur,
+ max_pgs_add_cmd);
+
+ /* Calculate the max num of pages that will be handled */
+ add_req_msg_size = struct_size(add_req, page_addr_list,
+ num_pages_to_handle);
+ add_req = request_buf;
+
+ while (num_pages_cur < num_pages_total) {
+ struct gdma_general_resp add_resp = {};
+ u32 expected_status = 0;
+
+ if (num_pages_cur + num_pages_to_handle <
+ num_pages_total) {
+ /* Status indicating more pages are needed */
+ expected_status = GDMA_STATUS_MORE_ENTRIES;
+ }
+
+ memset(add_req, 0, add_req_msg_size);
+
+ mana_gd_init_req_hdr(&add_req->hdr,
+ GDMA_DMA_REGION_ADD_PAGES,
+ add_req_msg_size,
+ sizeof(add_resp));
+ add_req->dma_region_handle = *gdma_region;
+ add_req->page_addr_list_len = num_pages_to_handle;
+
+ for (i = 0; i < num_pages_to_handle; ++i) {
+ dma_addr_t cur_addr =
+ rdma_block_iter_dma_address(&biter);
+ add_req->page_addr_list[i] = cur_addr;
+ __rdma_block_iter_next(&biter);
+
+ ibdev_dbg(&dev->ib_dev,
+ "page_addr_list %lu addr 0x%llx\n",
+ num_pages_cur + i, cur_addr);
+ }
+
+ err = mana_gd_send_request(gc, add_req_msg_size,
+ add_req, sizeof(add_resp),
+ &add_resp);
+ if (err || add_resp.hdr.status != expected_status) {
+ ibdev_dbg(&dev->ib_dev,
+ "Failed put DMA pages %u: %d,0x%x\n",
+ i, err, add_resp.hdr.status);
+ err = -EPROTO;
+ break;
+ }
+
+ num_pages_cur += num_pages_to_handle;
+ num_pages_to_handle =
+ min_t(size_t, num_pages_total - num_pages_cur,
+ max_pgs_add_cmd);
+ add_req_msg_size = sizeof(*add_req) +
+ num_pages_to_handle * sizeof(u64);
+ }
+ }
+
+ kfree(request_buf);
+
+ if (err)
+ mana_ib_gd_destroy_dma_region(dev, create_resp.dma_region_handle);... here: if (err) mana_ib_gd_destroy_dma_region(dev, create_resp.dma_region_handle); out: kfree(request_buf);
+ + return err; +}
[...]
quoted hunk ↗ jump to hunk
diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c new file mode 100644 index 000000000000..fec7d4a06ace --- /dev/null +++ b/drivers/infiniband/hw/mana/qp.c@@ -0,0 +1,505 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, + struct net_device *ndev, + mana_handle_t default_rxobj, + mana_handle_t ind_table[], + u32 log_ind_tbl_size, u32 rx_hash_key_len, + u8 *rx_hash_key) +{ + struct mana_port_context *mpc = netdev_priv(ndev); + struct mana_cfg_rx_steer_req *req = NULL; + struct mana_cfg_rx_steer_resp resp = {}; + mana_handle_t *req_indir_tab; + struct gdma_context *gc; + struct gdma_dev *mdev; + u32 req_buf_size; + int i, err; + + mdev = dev->gdma_dev; + gc = mdev->gdma_context; + + req_buf_size = + sizeof(*req) + sizeof(mana_handle_t) * MANA_INDIRECT_TABLE_SIZE; + req = kzalloc(req_buf_size, GFP_KERNEL); + if (!req) + return -ENOMEM; + + mana_gd_init_req_hdr(&req->hdr, MANA_CONFIG_VPORT_RX, req_buf_size, + sizeof(resp)); + + req->vport = mpc->port_handle; + req->rx_enable = 1; + req->update_default_rxobj = 1; + req->default_rxobj = default_rxobj; + req->hdr.dev_id = mdev->dev_id; + + /* If there are more than 1 entries in indirection table, enable RSS */ + if (log_ind_tbl_size) + req->rss_enable = true; + + req->num_indir_entries = MANA_INDIRECT_TABLE_SIZE; + req->indir_tab_offset = sizeof(*req); + req->update_indir_tab = true; + + req_indir_tab = (mana_handle_t *)(req + 1); + /* The ind table passed to the hardware must have + * MANA_INDIRECT_TABLE_SIZE entries. Adjust the verb + * ind_table to MANA_INDIRECT_TABLE_SIZE if required + */ + ibdev_dbg(&dev->ib_dev, "ind table size %u\n", 1 << log_ind_tbl_size); + for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) { + req_indir_tab[i] = ind_table[i % (1 << log_ind_tbl_size)]; + ibdev_dbg(&dev->ib_dev, "index %u handle 0x%llx\n", i, + req_indir_tab[i]); + } + + req->update_hashkey = true; + if (rx_hash_key_len) + memcpy(req->hashkey, rx_hash_key, rx_hash_key_len); + else + netdev_rss_key_fill(req->hashkey, MANA_HASH_KEY_SIZE); + + ibdev_dbg(&dev->ib_dev, "vport handle %llu default_rxobj 0x%llx\n", + req->vport, default_rxobj); + + err = mana_gd_send_request(gc, req_buf_size, req, sizeof(resp), &resp); + if (err) { + netdev_err(ndev, "Failed to configure vPort RX: %d\n", err); + goto out; + } + + if (resp.hdr.status) { + netdev_err(ndev, "vPort RX configuration failed: 0x%x\n", + resp.hdr.status); + err = -EPROTO;
This is confusing: if this error condition is reached, both error and succesful configuration will be logged. I guess an additional: goto out; is needed.
+ }
+
+ netdev_info(ndev, "Configured steering vPort %llu log_entries %u\n",
+ mpc->port_handle, log_ind_tbl_size);
+
+out:
+ kfree(req);
+ return err;
+}
+
+static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
+ struct ib_qp_init_attr *attr,
+ struct ib_udata *udata)
+{
+ struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
+ struct mana_ib_dev *mdev =
+ container_of(pd->device, struct mana_ib_dev, ib_dev);
+ struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl;
+ struct mana_ib_create_qp_rss_resp resp = {};
+ struct mana_ib_create_qp_rss ucmd = {};
+ struct gdma_dev *gd = mdev->gdma_dev;
+ mana_handle_t *mana_ind_table;
+ struct mana_port_context *mpc;
+ struct mana_context *mc;
+ struct net_device *ndev;
+ struct mana_ib_cq *cq;
+ struct mana_ib_wq *wq;
+ unsigned int ind_tbl_size;
+ struct ib_cq *ibcq;
+ struct ib_wq *ibwq;
+ u32 port;
+ int ret;
+ int i;
This causes a build warning with clang:
../drivers/infiniband/hw/mana/qp.c:172:6: warning: variable 'i' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
if (!mana_ind_table) {
^~~~~~~~~~~~~~~
../drivers/infiniband/hw/mana/qp.c:241:9: note: uninitialized use occurs here
while (i-- > 0) {
^
../drivers/infiniband/hw/mana/qp.c:172:2: note: remove the 'if' if its condition is always false
if (!mana_ind_table) {
^~~~~~~~~~~~~~~~~~~~~~
../drivers/infiniband/hw/mana/qp.c:113:7: note: initialize the variable 'i' to silence this warning
int i;
+
+ mc = gd->driver_data;
+
+ if (!udata || udata->inlen < sizeof(ucmd))
+ return -EINVAL;
+
+ ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen));
+ if (ret) {
+ ibdev_dbg(&mdev->ib_dev,
+ "Failed copy from udata for create rss-qp, err %d\n",
+ ret);
+ return -EFAULT;
+ }
+
+ if (attr->cap.max_recv_wr > MAX_SEND_BUFFERS_PER_QUEUE) {
+ ibdev_dbg(&mdev->ib_dev,
+ "Requested max_recv_wr %d exceeding limit\n",
+ attr->cap.max_recv_wr);
+ return -EINVAL;
+ }
+
+ if (attr->cap.max_recv_sge > MAX_RX_WQE_SGL_ENTRIES) {
+ ibdev_dbg(&mdev->ib_dev,
+ "Requested max_recv_sge %d exceeding limit\n",
+ attr->cap.max_recv_sge);
+ return -EINVAL;
+ }
+
+ ind_tbl_size = 1 << ind_tbl->log_ind_tbl_size;
+ if (ind_tbl_size > MANA_INDIRECT_TABLE_SIZE) {
+ ibdev_dbg(&mdev->ib_dev,
+ "Indirect table size %d exceeding limit\n",
+ ind_tbl_size);
+ return -EINVAL;
+ }
+
+ if (ucmd.rx_hash_function != MANA_IB_RX_HASH_FUNC_TOEPLITZ) {
+ ibdev_dbg(&mdev->ib_dev,
+ "RX Hash function is not supported, %d\n",
+ ucmd.rx_hash_function);
+ return -EINVAL;
+ }
+
+ /* IB ports start with 1, MANA start with 0 */
+ port = ucmd.port;
+ if (port < 1 || port > mc->num_ports) {
+ ibdev_dbg(&mdev->ib_dev, "Invalid port %u in creating qp\n",
+ port);
+ return -EINVAL;
+ }
+ ndev = mc->ports[port - 1];
+ mpc = netdev_priv(ndev);
+
+ ibdev_dbg(&mdev->ib_dev, "rx_hash_function %d port %d\n",
+ ucmd.rx_hash_function, port);
+
+ mana_ind_table = kcalloc(ind_tbl_size, sizeof(mana_handle_t),
+ GFP_KERNEL);
+ if (!mana_ind_table) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ qp->port = port;
+
+ for (i = 0; i < ind_tbl_size; i++) {
+ struct mana_obj_spec wq_spec = {};
+ struct mana_obj_spec cq_spec = {};
+
+ ibwq = ind_tbl->ind_tbl[i];
+ wq = container_of(ibwq, struct mana_ib_wq, ibwq);
+
+ ibcq = ibwq->cq;
+ cq = container_of(ibcq, struct mana_ib_cq, ibcq);
+
+ wq_spec.gdma_region = wq->gdma_region;
+ wq_spec.queue_size = wq->wq_buf_size;
+
+ cq_spec.gdma_region = cq->gdma_region;
+ cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE;
+ cq_spec.modr_ctx_id = 0;
+ cq_spec.attached_eq = GDMA_CQ_NO_EQ;
+
+ ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ,
+ &wq_spec, &cq_spec, &wq->rx_object);
+ if (ret)
+ goto fail;
+
+ /* The GDMA regions are now owned by the WQ object */
+ wq->gdma_region = GDMA_INVALID_DMA_REGION;
+ cq->gdma_region = GDMA_INVALID_DMA_REGION;
+
+ wq->id = wq_spec.queue_index;
+ cq->id = cq_spec.queue_index;
+
+ ibdev_dbg(&mdev->ib_dev,
+ "ret %d rx_object 0x%llx wq id %llu cq id %llu\n",
+ ret, wq->rx_object, wq->id, cq->id);
+
+ resp.entries[i].cqid = cq->id;
+ resp.entries[i].wqid = wq->id;
+
+ mana_ind_table[i] = wq->rx_object;
+ }
+ resp.num_entries = i;
+
+ ret = mana_ib_cfg_vport_steering(mdev, ndev, wq->rx_object,
+ mana_ind_table,
+ ind_tbl->log_ind_tbl_size,
+ ucmd.rx_hash_key_len,
+ ucmd.rx_hash_key);
+ if (ret)
+ goto fail;
+
+ ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
+ if (ret) {
+ ibdev_dbg(&mdev->ib_dev,
+ "Failed to copy to udata create rss-qp, %d\n",
+ ret);
+ goto fail;
+ }
+
+ kfree(mana_ind_table);
+
+ return 0;
+
+fail:
+ while (i-- > 0) {
+ ibwq = ind_tbl->ind_tbl[i];
+ wq = container_of(ibwq, struct mana_ib_wq, ibwq);
+ mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object);
+ }
+
+ kfree(mana_ind_table);
+
+ return ret;
+}Cheers, Paolo