Re: [PATCH 4/8] [I/OAT] Utility functions for offloading sk_buff to iovec copies

From: Andrew Morton <hidden>
Date: 2006-03-05 07:17:13
Also in: lkml

Chris Leech [off-list ref] wrote:

+
+#define NUM_PAGES_SPANNED(start, length) \
+	((PAGE_ALIGN((unsigned long)start + length) - \
+	((unsigned long)start & PAGE_MASK)) >> PAGE_SHIFT)

static inline all-lower-case functions are much nicer.

+/*
+ * Lock down all the iovec pages needed for len bytes.
+ * Return a struct dma_locked_list to keep track of pages locked down.
+ *
+ * We are allocating a single chunk of memory, and then carving it up into
+ * 3 sections, the latter 2 whose size depends on the number of iovecs and the
+ * total number of pages, respectively.
+ */
+int dma_lock_iovec_pages(struct iovec *iov, size_t len, struct dma_locked_list
+	**locked_list)

Please rename this to dma_pin_iovec_pages().  Locking a page is a quite
different concept from pinning it, and this function doesn't lock any
pages.

+{
+	struct dma_locked_list *local_list;
+	struct page **pages;
+	int i;
+	int ret;
+
+	int nr_iovecs = 0;
+	int iovec_len_used = 0;
+	int iovec_pages_used = 0;

Extraneous blank line there.

+	/* don't lock down non-user-based iovecs */
+	if (segment_eq(get_fs(), KERNEL_DS)) {
+		*locked_list = NULL;
+		return 0;
+	}

hm, haven't seen that before.  Makes sense, I guess.

+	/* determine how many iovecs/pages there are, up front */
+	do {
+		iovec_len_used += iov[nr_iovecs].iov_len;
+		iovec_pages_used += NUM_PAGES_SPANNED(iov[nr_iovecs].iov_base,
+		                                      iov[nr_iovecs].iov_len);
+		nr_iovecs++;
+	} while (iovec_len_used < len);
+
+	/* single kmalloc for locked list, page_list[], and the page arrays */
+	local_list = kmalloc(sizeof(*local_list)
+		+ (nr_iovecs * sizeof (struct dma_page_list))
+		+ (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL);

What is the upper bound on the size of this allocation?

+	if (!local_list)
+		return -ENOMEM;
+
+	/* list of pages starts right after the page list array */
+	pages = (struct page **) &local_list->page_list[nr_iovecs];
+
+	/* it's a userspace pointer */
+	might_sleep();

kmalloc(GFP_KERNEL) already did that.

+	for (i = 0; i < nr_iovecs; i++) {
+		struct dma_page_list *page_list = &local_list->page_list[i];
+
+		len -= iov[i].iov_len;
+
+		if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) {
+			dma_unlock_iovec_pages(local_list);
+			return -EFAULT;
+		}

A return statement buried down in the guts of a largeish function isn't
good from a code maintainability POV.

+		page_list->nr_pages = NUM_PAGES_SPANNED(iov[i].iov_base,
+		                                        iov[i].iov_len);
+		page_list->base_address = iov[i].iov_base;
+
+		page_list->pages = pages;
+		pages += page_list->nr_pages;
+
+		/* lock pages down */
+		down_read(&current->mm->mmap_sem);
+		ret = get_user_pages(
+			current,
+			current->mm,
+			(unsigned long) iov[i].iov_base,
+			page_list->nr_pages,
+			1,
+			0,
+			page_list->pages,
+			NULL);

Yes, it has a lot of args.  It's nice to add comments like this:

		ret = get_user_pages(
			current,
			current->mm,
			(unsigned long) iov[i].iov_base,
			page_list->nr_pages,
			1,			/* write */
			0,			/* force */
			page_list->pages,
			NULL);

+		up_read(&current->mm->mmap_sem);
+
+		if (ret != page_list->nr_pages) {
+			goto mem_error;
+		}

Unneded braces.

+		local_list->nr_iovecs = i + 1;
+	}
+
+	*locked_list = local_list;
+	return 0;

Suggest you change this function to return locked_list, or an IS_ERR value
on error.

+void dma_unlock_iovec_pages(struct dma_locked_list *locked_list)
+{
+	int i, j;
+
+	if (!locked_list)
+		return;
+
+	for (i = 0; i < locked_list->nr_iovecs; i++) {
+		struct dma_page_list *page_list = &locked_list->page_list[i];
+		for (j = 0; j < page_list->nr_pages; j++) {
+			SetPageDirty(page_list->pages[j]);
+			page_cache_release(page_list->pages[j]);
+		}
+	}
+
+	kfree(locked_list);
+}

SetPageDirty() is very wrong.  It fails to mark pagecache pages as dirty in
the radix tree so they won't get written back.

You'll need to use set_page_dirty_lock() here or, if you happen to have
protected the inode which backs this potential mmap (really the
address_space) from reclaim then set_page_dirty() will work.  Probably
it'll be set_page_dirty_lock().

If this is called from cant-sleep context then things get ugly.  If it's
called from interrupt context then moreso.  See fs/direct-io.c,
bio_set_pages_dirty(), bio_check_pages_dirty(), etc.


I don't see a check for "did we write to user pages" here.  Because we
don't need to dirty the pages if we were reading them (transmitting from
userspace).

But given that dma_lock_iovec_pages() is only set up for writing to
userspace I guess this code is implicitly receive-only.  It's hard to tell
when the description, is, like the code comments, so scant.

+static dma_cookie_t dma_memcpy_tokerneliovec(struct dma_chan *chan, struct
+	iovec *iov, unsigned char *kdata, size_t len)

You owe us two underscores ;)

+/*
+ * We have already locked down the pages we will be using in the iovecs.

"pinned"

+ * Each entry in iov array has corresponding entry in locked_list->page_list.
+ * Using array indexing to keep iov[] and page_list[] in sync.
+ * Initial elements in iov array's iov->iov_len will be 0 if already copied into
+ *   by another call.
+ * iov array length remaining guaranteed to be bigger than len.
+ */
+dma_cookie_t dma_memcpy_toiovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_locked_list *locked_list, unsigned char *kdata, size_t len)
+{
+	int iov_byte_offset;
+	int copy;
+	dma_cookie_t dma_cookie = 0;
+	int iovec_idx;
+	int page_idx;
+
+	if (!chan)
+		return memcpy_toiovec(iov, kdata, len);
+
+	/* -> kernel copies (e.g. smbfs) */
+	if (!locked_list)
+		return dma_memcpy_tokerneliovec(chan, iov, kdata, len);
+
+	iovec_idx = 0;
+	while (iovec_idx < locked_list->nr_iovecs) {
+		struct dma_page_list *page_list;
+
+		/* skip already used-up iovecs */
+		while (!iov[iovec_idx].iov_len)
+			iovec_idx++;

Is it assured that this array was zero-terminated?

+
+dma_cookie_t dma_memcpy_pg_toiovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_locked_list *locked_list, struct page *page,
+	unsigned int offset, size_t len)

pleeeeeze comment your code.

+{
+	int iov_byte_offset;
+	int copy;
+	dma_cookie_t dma_cookie = 0;
+	int iovec_idx;
+	int page_idx;
+	int err;
+
+	/* this needs as-yet-unimplemented buf-to-buff, so punt. */
+	/* TODO: use dma for this */
+	if (!chan || !locked_list) {

Really you should rename locked_list to pinned_list throughout, and
dma_locked_list to dma_pinned_list.

+	iovec_idx = 0;
+	while (iovec_idx < locked_list->nr_iovecs) {
+		struct dma_page_list *page_list;
+
+		/* skip already used-up iovecs */
+		while (!iov[iovec_idx].iov_len)
+			iovec_idx++;

Can this also run off the end?

+int dma_lock_iovec_pages(struct iovec *iov, size_t len, struct dma_locked_list
+	**locked_list)
+{
+	*locked_list = NULL;
+
+	return 0;
+}
+
+void dma_unlock_iovec_pages(struct dma_locked_list* locked_list)
+{ }

You might want to make these guys static inlines in a header and not
compile this file at all if !CONFIG_DMA_ENGINE.

+struct dma_page_list
+{

   struct dma_page_list {

+struct dma_locked_list
+{

   struct dma_pinned_list {

+	int nr_iovecs;
+	struct dma_page_list page_list[0];

We can use [] instead of [0] now that gcc-2.95.x has gone away.

+int dma_lock_iovec_pages(struct iovec *iov, size_t len,
+	struct dma_locked_list	**locked_list);
+void dma_unlock_iovec_pages(struct dma_locked_list* locked_list);

"pin", "unpin".

+#ifdef CONFIG_NET_DMA
+
+/**
+ *	dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
+ *	@skb - buffer to copy
+ *	@offset - offset in the buffer to start copying from
+ *	@iovec - io vector to copy to
+ *	@len - amount of data to copy from buffer to iovec
+ *	@locked_list - locked iovec buffer data
+ *
+ *	Note: the iovec is modified during the copy.

Modifying the caller's iovec is a bit rude.    Hard to avoid, I guess.

+ */
+int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
+			struct sk_buff *skb, int offset, struct iovec *to,
+			size_t len, struct dma_locked_list *locked_list)
+{
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+	dma_cookie_t cookie = 0;
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		if ((cookie = dma_memcpy_toiovec(chan, to, locked_list,
+		     skb->data + offset, copy)) < 0)
+			goto fault;
+		if ((len -= copy) == 0)
+			goto end;

Please avoid

	if ((lhs = rhs))

constructs.  Instead do

	lhs = rhs;
	if (lhs)

(entire patchset - there are quite a lot)

+		offset += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		BUG_TRAP(start <= offset + len);

<wonders why BUG_TRAP still exists>

+		if ((copy = end - offset) > 0) {
...
+			if (!(len -= copy))
...
+			if ((copy = end - offset) > 0) {
...
+				if ((len -= copy) == 0)

See above.

+#else
+
+int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
+			const struct sk_buff *skb, int offset, struct iovec *to,
+			size_t len, struct dma_locked_list *locked_list)
+{
+	return skb_copy_datagram_iovec(skb, offset, to, len);
+}
+
+#endif

Again, consider putting this in a header as an inline, avoid compiling this
file altogether.

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help