Re: [PATCH] Btrfs: fix dio write vs buffered read race

From: Miao Xie <hidden>
Date: 2012-06-21 02:39:41
Subsystem: btrfs file system, filesystems (vfs and infrastructure), the rest · Maintainers: Chris Mason, David Sterba, Alexander Viro, Christian Brauner, Linus Torvalds

On tue, 19 Jun 2012 10:58:09 -0400, Josef Bacik wrote:

Miao pointed out there's a problem with mixing dio writes and buffered
reads.  If the read happens between us invalidating the page range and
actually locking the extent we can bring in pages into page cache.  Then
once the write finishes if somebody tries to read again it will just find
uptodate pages and we'll read stale data.  So we need to lock the extent and
check for uptodate bits in the range.  If there are uptodate bits we need to
unlock and invalidate again.  This will keep this race from happening since
we will hold the extent locked until we create the ordered extent, and then
teh read side always waits for ordered extents.  Thanks,

Sorry to replay late. 
My test still failed. I thought it is because the reader may hold the lock of
the pages and the page invalidation will fail, the pages which have old data
still exist in the page cache.

Why not update the existed page directly? Just like this:
(Maybe we should move this patch to vfs layer)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 876cddd..fc0f485 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c

@@ -18,6 +18,7 @@
 
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/pagevec.h>
 #include <linux/highmem.h>
 #include <linux/time.h>
 #include <linux/init.h>

@@ -1328,6 +1329,91 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	return num_written ? num_written : ret;
 }
 
+static void btrfs_dio_update_existed_pages(struct inode *inode,
+					   struct iov_iter *it,
+					   loff_t pos, size_t size)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct pagevec pvec;
+	pgoff_t index;
+	pgoff_t end;
+	size_t copied;
+	loff_t copy_pos = pos;
+	int offset = pos & (PAGE_CACHE_SIZE - 1);
+	int i;
+
+	BUG_ON(pos & (PAGE_CACHE_SIZE - 1));
+	BUG_ON(size & (PAGE_CACHE_SIZE - 1));
+
+	pagevec_init(&pvec, 0);
+	index = pos >> PAGE_CACHE_SHIFT;
+	end = (pos + size - 1) >> PAGE_CACHE_SHIFT;
+
+	while (index <= end && pagevec_lookup(&pvec, mapping, index,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+		i = 0;
+		while (i < pagevec_count(&pvec)) {
+			struct page *page = pvec.pages[i];
+			size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset,
+					     size);
+
+			index = page->index;
+			if (index > end)
+				break;
+
+			lock_page(page);
+			WARN_ON(page->index != index);
+			BUG_ON(PageDirty(page));
+			BUG_ON(PageWriteback(page));
+
+			if (page->mapping != mapping) {
+				unlock_page(page);
+				i++;
+				continue;
+			}
+
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				i++;
+				continue;
+			}
+
+			if ((index << PAGE_CACHE_SHIFT) > copy_pos) {
+				copied = (index << PAGE_CACHE_SHIFT) - copy_pos;
+				iov_iter_advance(it, copied);
+				offset = 0;
+				size -= copied;
+				count = min_t(size_t, PAGE_CACHE_SIZE, size);
+			}
+
+			pagefault_disable();
+			copied = iov_iter_copy_from_user_atomic(page, it,
+								offset,
+								count);
+			pagefault_enable();
+			flush_dcache_page(page);
+
+			if (copied < count)
+				copied = 0;
+
+			iov_iter_advance(it, copied);
+			size -= copied;
+			copy_pos += copied;
+
+			if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
+				offset += copied;
+			} else {
+				offset = 0;
+				i++;
+			}
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+		index++;
+	}
+}
+
 static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 				    const struct iovec *iov,
 				    unsigned long nr_segs, loff_t pos,

@@ -1356,6 +1442,11 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 		mark_inode_dirty(inode);
 	}
 
+	if (written > 0) {
+		iov_iter_init(&i, iov, nr_segs, count, 0);
+		btrfs_dio_update_existed_pages(inode, &i, pos, written);
+	}
+
 	if (written < 0 || written == count)
 		return written;

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3f2c8cb..6de67a5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c

@@ -6350,6 +6350,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
 out:
 	return retval;
 }
+
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)

quoted hunk ↗ jump to hunk

Signed-off-by: Josef Bacik <redacted>
---
 fs/btrfs/inode.c |   26 +++++++++++++++++++++++---
 1 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9d8c45d..9b631eb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c

@@ -6360,12 +6360,32 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 		 */
 		ordered = btrfs_lookup_ordered_range(inode, lockstart,
 						     lockend - lockstart + 1);
-		if (!ordered)
+
+		/*
+		 * We need to make sure there are no buffered pages in this
+		 * range either, we could have raced between the invalidate in
+		 * generic_file_direct_write and locking the extent.  The
+		 * invalidate needs to happen so that reads after a write do not
+		 * get stale data.
+		 */
+		if (!ordered && (!writing ||
+		    !test_range_bit(&BTRFS_I(inode)->io_tree,
+				    lockstart, lockend, EXTENT_UPTODATE, 0,
+				    cached_state)))
 			break;
+
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 				     &cached_state, GFP_NOFS);
-		btrfs_start_ordered_extent(inode, ordered, 1);
-		btrfs_put_ordered_extent(ordered);
+
+		if (ordered) {
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+		} else {
+			invalidate_mapping_pages(file->f_mapping,
+						 lockstart >> PAGE_CACHE_SHIFT,
+						 lockend >> PAGE_CACHE_SHIFT);
+		}
+
 		cond_resched();
 	}

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help