Thread (13 messages) 13 messages, 3 authors, 2022-01-05

RE: [External] : Re: [PATCH rdma-core 2/5] RDMA-CORE/erdma: Add userspace verbs implementation

From: Devesh Sharma <hidden>
Date: 2021-12-28 12:18:05

-----Original Message-----
From: Cheng Xu <chengyou@linux.alibaba.com>
Sent: Monday, December 27, 2021 1:29 PM
To: Devesh Sharma <redacted>; leon@kernel.org
Cc: dledford@redhat.com; jgg@mellanox.com; linux-rdma@vger.kernel.org;
KaiShen@linux.alibaba.com
Subject: [External] : Re: [PATCH rdma-core 2/5] RDMA-CORE/erdma: Add
userspace verbs implementation



On 12/27/21 2:29 PM, Devesh Sharma wrote:
quoted
<...>
quoted
quoted
+	++page->used;
+
+	for (i = 0; !page->free[i]; ++i)
+		;/* nothing */
Why?
page->free[i] is a 64bits bitmap, all zeros means the corespoding
entries are all used, then we need to traverse next. If stopped, then we get a
valid entry to use.
The loop could be a candidate for compiler optimization?
quoted
quoted
+
+	j = ffsl(page->free[i]) - 1;
+	page->free[i] &= ~(1UL << j);
+
+	db_records = page->page_buf + (i * bits_perlong + j) *
+ERDMA_DBRECORDS_SIZE;
+
+out:
+	pthread_mutex_unlock(&ctx->dbrecord_pages_mutex);
+
+	return db_records;
+}
+
+void erdma_dealloc_dbrecords(struct erdma_context *ctx, uint64_t
+*dbrecords) {
+	struct erdma_dbrecord_page *page;
+	int page_mask = ~(ctx->page_size - 1);
+	int idx;
+
+	pthread_mutex_lock(&ctx->dbrecord_pages_mutex);
+	for (page = ctx->dbrecord_pages; page; page = page->next)
+		if (((uintptr_t)dbrecords & page_mask) == (uintptr_t)page-
quoted
page_buf)
+			break;
+
+	if (!page)
+		goto out;
+
+	idx = ((void *)dbrecords - page->page_buf) /
ERDMA_DBRECORDS_SIZE;
+	page->free[idx / (8 * sizeof(unsigned long))] |=
+		1UL << (idx % (8 * sizeof(unsigned long)));
+
+	if (!--page->used) {
+		if (page->prev)
+			page->prev->next = page->next;
+		else
+			ctx->dbrecord_pages = page->next;
+		if (page->next)
+			page->next->prev = page->prev;
+
+		free(page->page_buf);
+		free(page);
+	}
+
+out:
+	pthread_mutex_unlock(&ctx->dbrecord_pages_mutex);
+}
<...>
quoted
quoted
+static void __erdma_alloc_dbs(struct erdma_qp *qp, struct
+erdma_context
+*ctx) {
+	uint32_t qpn = qp->id;
+
+	if (ctx->sdb_type == ERDMA_SDB_PAGE) {
+		/* qpn[4:0] as the index in this db page. */
+		qp->sq.db = ctx->sdb + (qpn & 31) * ERDMA_SQDB_SIZE;
+	} else if (ctx->sdb_type == ERDMA_SDB_ENTRY) {
+		/* for type 'ERDMA_SDB_ENTRY', each uctx has 2 dwqe,
totally takes 256Bytes. */
+		qp->sq.db = ctx->sdb + ctx->sdb_offset * 256;
Generally we use macros to define hard-coded integers. E.g 256 should use
a macro.

OK, will fix.
quoted
quoted
+	} else {
+		/* qpn[4:0] as the index in this db page. */
+		qp->sq.db = ctx->sdb + (qpn & 31) * ERDMA_SQDB_SIZE;
+	}
+
+	/* qpn[6:0] as the index in this rq db page. */
+	qp->rq.db = ctx->rdb + (qpn & 127) * ERDMA_RQDB_SPACE_SIZE; }
+
+struct ibv_qp *erdma_create_qp(struct ibv_pd *pd, struct
+ibv_qp_init_attr *attr) {
+	struct erdma_cmd_create_qp cmd = {};
+	struct erdma_cmd_create_qp_resp resp = {};
+	struct erdma_qp *qp;
+	struct ibv_context *base_ctx = pd->context;
+	struct erdma_context *ctx = to_ectx(base_ctx);
+	uint64_t *db_records  = NULL;
+	int rv, tbl_idx, tbl_off;
+	int sq_size = 0, rq_size = 0, total_bufsize = 0;
+
+	memset(&cmd, 0, sizeof(cmd));
+	memset(&resp, 0, sizeof(resp));
No need of memset due to declaration step.
OK, will fix.
quoted
quoted
+
+	qp = calloc(1, sizeof(*qp));
+	if (!qp)
+		return NULL;
+
+	sq_size = roundup_pow_of_two(attr->cap.max_send_wr *
MAX_WQEBB_PER_SQE) << SQEBB_SHIFT;
+	sq_size = align(sq_size, ctx->page_size);
+	rq_size = align(roundup_pow_of_two(attr->cap.max_recv_wr) <<
RQE_SHIFT, ctx->page_size);
+	total_bufsize = sq_size + rq_size;
+	rv = posix_memalign(&qp->qbuf, ctx->page_size, total_bufsize);
+	if (rv || !qp->qbuf) {
+		errno = ENOMEM;
+		goto error_alloc;
+	}
<...>
quoted
quoted
+
+int erdma_destroy_qp(struct ibv_qp *base_qp) {
+	struct erdma_qp *qp = to_eqp(base_qp);
+	struct ibv_context *base_ctx = base_qp->pd->context;
+	struct erdma_context *ctx = to_ectx(base_ctx);
+	int rv, tbl_idx, tbl_off;
+
+	pthread_spin_lock(&qp->sq_lock);
+	pthread_spin_lock(&qp->rq_lock);
Why to hold these?
Here, we are destroying the qp resources, such as the queue buffers.
we want to avoid race condition with post_send and post_recv.
Application should make sure no one is accessing the QP when qp-destroy is called. Probably you can be easy on these locks.
quoted
quoted
+
+	pthread_mutex_lock(&ctx->qp_table_mutex);
+	tbl_idx = qp->id >> ERDMA_QP_TABLE_SHIFT;
+	tbl_off = qp->id & ERDMA_QP_TABLE_MASK;
+
+	ctx->qp_table[tbl_idx].table[tbl_off] = NULL;
+	ctx->qp_table[tbl_idx].refcnt--;
+
+	if (ctx->qp_table[tbl_idx].refcnt == 0) {
+		free(ctx->qp_table[tbl_idx].table);
+		ctx->qp_table[tbl_idx].table = NULL;
+	}
+
+	pthread_mutex_unlock(&ctx->qp_table_mutex);
+
+	rv = ibv_cmd_destroy_qp(base_qp);
+	if (rv) {
+		pthread_spin_unlock(&qp->rq_lock);
+		pthread_spin_unlock(&qp->sq_lock);
+		return rv;
+	}
+	pthread_spin_destroy(&qp->rq_lock);
+	pthread_spin_destroy(&qp->sq_lock);
+
+	if (qp->db_records)
+		erdma_dealloc_dbrecords(ctx, qp->db_records);
+
+	if (qp->qbuf)
+		free(qp->qbuf);
+
+	free(qp);
+
+	return 0;
+}
+
<...>
quoted
quoted
+
+int erdma_post_send(struct ibv_qp *base_qp, struct ibv_send_wr *wr,
+struct ibv_send_wr **bad_wr) {
+	struct erdma_qp *qp = to_eqp(base_qp);
+	uint16_t sq_pi;
+	int new_sqe = 0, rv = 0;
+
+	*bad_wr = NULL;
+
+	if (base_qp->state == IBV_QPS_ERR) {
Post_send is allowed in Error state. Thus the check is redundant.
Does this have specification? We didn't find the description in IB
specification.
ERDMA uses iWarp, and in this specification (link: [1]), it says that two actions
should be taken: "post wqe, and then flush it" or "return an immediate
error" when post WR in ERROR state. After modify qp to err, our hardware
will flush all the wqes, thus for the newly posted wr, we return an error
immediately.
Also, I review other providers' implementation, ocrdma/efa/bnxt_re also
don't allow post_send in ERROR state. Does this can have a little difference
depended on different HCAs?
Well its indeed. You can omit it for now.
Thanks,
Cheng Xu

[1]
https://urldefense.com/v3/__https://datatracker.ietf.org/doc/html/draft-
hilland-rddp-verbs-00*section-
6.2.4__;Iw!!ACWV5N9M2RV99hQ!fcFF2OWy2n6miTv9UwPpXs1y1RbO4pxYs
XOCc59hTAL0Z4nagUt0Z2Aaht8jX0alE_TU$


quoted
quoted
+		*bad_wr = wr;
+		return -EIO;
+	}
+
+	pthread_spin_lock(&qp->sq_lock);
+
+	sq_pi = qp->sq.pi;
+
+	while (wr) {
+		if ((uint16_t)(sq_pi - qp->sq.ci) >= qp->sq.depth) {
+			rv = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
<...>
quoted
quoted
--
2.27.0
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help