Re: [PATCH] libfs: Fix DIO mode aligment
From: Благодаренко Артём <hidden>
Date: 2021-03-01 16:17:05
Hello Ted!
Thank you very much for the new version of the fix.
The problem can be easily reproduced, using loop device.
Here is how I reproduced it originally (without the patch)
# truncate -s 512MB /tmp/lustre-ost
# losetup -b 4096 /dev/loop0 /tmp/lustre-ost
# mkfs.ext4 /dev/loop0
mke2fs 1.45.6.wc1 (20-Mar-2020)
detected raid stride 4096 too large, use optimum 512
Discarding device blocks: done
Creating filesystem with 125000 4k blocks and 125056 inodes
Filesystem UUID: 541ad524-556a-45be-84fe-5b68c1ca4562
Superblock backups stored on blocks:
32768, 98304
Allocating group tables: done
Writing inode tables: done
Creating journal (4096 blocks): done
Writing superblocks and filesystem accounting information: done
# git rev-parse HEAD
0f880f553dfa09dabe4cb03752b97a07b66eb998
[root@CO82 e2fsprogs-hpe]# misc/e2image /dev/loop0 /tmp/ost-image
e2image 1.45.2.cr2 (09-Apr-2020)
misc/e2image: Attempt to read block from filesystem resulted in short read while trying to open /dev/loop0
Couldn't find valid filesystem superblock.
With your patch e2image works fine.
[root@CO82 e2fsprogs-kernel]# truncate -s 512MB /tmp/lustre-ost
[root@CO82 e2fsprogs-kernel]# losetup -b 4096 /dev/loop0 /tmp/lustre-ost
[root@CO82 e2fsprogs-kernel]# mkfs.ext4 /dev/loop0
mke2fs 1.45.6.wc1 (20-Mar-2020)
detected raid stride 4096 too large, use optimum 512
Discarding device blocks: done
Creating filesystem with 125000 4k blocks and 125056 inodes
Filesystem UUID: 42f6fc7d-382a-4681-99b8-79f3fcd2b4bf
Superblock backups stored on blocks:
32768, 98304
Allocating group tables: done
Writing inode tables: done
Creating journal (4096 blocks): done
Writing superblocks and filesystem accounting information: done
[root@CO82 e2fsprogs-kernel]# git rev-parse HEAD
67f2b54667e65cf5a478fcea8b85722be9ee6e8d
[root@CO82 e2fsprogs-kernel]# misc/e2image /dev/loop0 /tmp/ost-image
e2image 1.46.1 (9-Feb-2021)
Thanks again.
Best regards,
Artem Blagodarenko.
quoted hunk ↗ jump to hunk
On 27 Feb 2021, at 02:17, Theodore Ts'o [off-list ref] wrote: I've rewritten the patch because it was simplest way to review the code. The resulting code is simpler and has a smaller number of lines of code. I don't have any 4k advanced format disks handy, so I'd appreciate it if someone can test it. It passes the existing regression tests, and I've run a number of manual tests to simulate a advanced format HDD, with the tests being run with clang and UBSAN and ADDRSAN enabled. If someone with access to an advanced format disk can test running debugfs -D on an advanced format disk, that would be great, thanks. - Ted commit c001596110e834a85b01a47a20811b318cb3b9e4 Author: Theodore Ts'o [off-list ref] Date: Fri Feb 26 17:41:06 2021 -0500 libext2fs: fix unix_io's Direct I/O support The previous Direct I/O support worked on HDD's with 512 byte logical sector sizes, and on FreeBSD which required 4k aligned memory buffers. However, it was incomplete and was not correctly working on HDD's with 4k logical sector sizes (aka Advanced Format Disks). Based on a patch from Alexey Lyashkov [off-list ref] but rewritten to work with the latest e2fsprogs and to minimize changes to make it easier to review. Signed-off-by: Theodore Ts'o [off-list ref] Reported-by: Artem Blagodarenko [off-list ref]diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c index c395d615..996c31a1 100644 --- a/lib/ext2fs/io_manager.c +++ b/lib/ext2fs/io_manager.c@@ -134,9 +134,11 @@ errcode_t io_channel_alloc_buf(io_channel io, int count, void *ptr)else size = -count; - if (io->align) + if (io->align) { + if (io->align > size) + size = io->align; return ext2fs_get_memalign(size, io->align, ptr); - else + } else return ext2fs_get_mem(size, ptr); }diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c index 73f5667e..8965535c 100644 --- a/lib/ext2fs/unix_io.c +++ b/lib/ext2fs/unix_io.c@@ -218,6 +218,8 @@ static errcode_t raw_read_blk(io_channel channel,int actual = 0; unsigned char *buf = bufv; ssize_t really_read = 0; + unsigned long long aligned_blk; + int align_size, offset; size = (count < 0) ? -count : (ext2_loff_t) count * channel->block_size; mutex_lock(data, STATS_MTX);@@ -226,7 +228,7 @@ static errcode_t raw_read_blk(io_channel channel,location = ((ext2_loff_t) block * channel->block_size) + data->offset; if (data->flags & IO_FLAG_FORCE_BOUNCE) { - if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) { + if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) { retval = errno ? errno : EXT2_ET_LLSEEK_FAILED; goto error_out; }@@ -237,6 +239,7 @@ static errcode_t raw_read_blk(io_channel channel,/* Try an aligned pread */ if ((channel->align == 0) || (IS_ALIGNED(buf, channel->align) && + IS_ALIGNED(location, channel->align) && IS_ALIGNED(size, channel->align))) { actual = pread64(data->dev, buf, size, location); if (actual == size)@@ -248,6 +251,7 @@ static errcode_t raw_read_blk(io_channel channel,if ((sizeof(off_t) >= sizeof(ext2_loff_t)) && ((channel->align == 0) || (IS_ALIGNED(buf, channel->align) && + IS_ALIGNED(location, channel->align) && IS_ALIGNED(size, channel->align)))) { actual = pread(data->dev, buf, size, location); if (actual == size)@@ -256,12 +260,13 @@ static errcode_t raw_read_blk(io_channel channel,} #endif /* HAVE_PREAD */ - if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) { + if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) { retval = errno ? errno : EXT2_ET_LLSEEK_FAILED; goto error_out; } if ((channel->align == 0) || (IS_ALIGNED(buf, channel->align) && + IS_ALIGNED(location, channel->align) && IS_ALIGNED(size, channel->align))) { actual = read(data->dev, buf, size); if (actual != size) {@@ -286,23 +291,39 @@ static errcode_t raw_read_blk(io_channel channel,* to the O_DIRECT rules, so we need to do this the hard way... */ bounce_read: + if ((channel->block_size > channel->align) && + (channel->block_size % channel->align) == 0) + align_size = channel->block_size; + else + align_size = channel->align; + aligned_blk = location / align_size; + offset = location % align_size; + + if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) { + retval = errno ? errno : EXT2_ET_LLSEEK_FAILED; + goto error_out; + } while (size > 0) { mutex_lock(data, BOUNCE_MTX); - actual = read(data->dev, data->bounce, channel->block_size); - if (actual != channel->block_size) { + actual = read(data->dev, data->bounce, align_size); + if (actual != align_size) { mutex_unlock(data, BOUNCE_MTX); actual = really_read; buf -= really_read; size += really_read; goto short_read; } - actual = size; - if (size > channel->block_size) - actual = channel->block_size; - memcpy(buf, data->bounce, actual); + if ((actual + offset) > align_size) + actual = align_size - offset; + if (actual > size) + actual = size; + memcpy(buf, data->bounce + offset, actual); + really_read += actual; size -= actual; buf += actual; + offset = 0; + aligned_blk++; mutex_unlock(data, BOUNCE_MTX); } return 0;@@ -326,6 +347,8 @@ static errcode_t raw_write_blk(io_channel channel,int actual = 0; errcode_t retval; const unsigned char *buf = bufv; + unsigned long long aligned_blk; + int align_size, offset; if (count == 1) size = channel->block_size;@@ -342,7 +365,7 @@ static errcode_t raw_write_blk(io_channel channel,location = ((ext2_loff_t) block * channel->block_size) + data->offset; if (data->flags & IO_FLAG_FORCE_BOUNCE) { - if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) { + if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) { retval = errno ? errno : EXT2_ET_LLSEEK_FAILED; goto error_out; }@@ -353,6 +376,7 @@ static errcode_t raw_write_blk(io_channel channel,/* Try an aligned pwrite */ if ((channel->align == 0) || (IS_ALIGNED(buf, channel->align) && + IS_ALIGNED(location, channel->align) && IS_ALIGNED(size, channel->align))) { actual = pwrite64(data->dev, buf, size, location); if (actual == size)@@ -363,6 +387,7 @@ static errcode_t raw_write_blk(io_channel channel,if ((sizeof(off_t) >= sizeof(ext2_loff_t)) && ((channel->align == 0) || (IS_ALIGNED(buf, channel->align) && + IS_ALIGNED(location, channel->align) && IS_ALIGNED(size, channel->align)))) { actual = pwrite(data->dev, buf, size, location); if (actual == size)@@ -370,13 +395,14 @@ static errcode_t raw_write_blk(io_channel channel,} #endif /* HAVE_PWRITE */ - if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) { + if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) { retval = errno ? errno : EXT2_ET_LLSEEK_FAILED; goto error_out; } if ((channel->align == 0) || (IS_ALIGNED(buf, channel->align) && + IS_ALIGNED(location, channel->align) && IS_ALIGNED(size, channel->align))) { actual = write(data->dev, buf, size); if (actual < 0) {@@ -400,40 +426,59 @@ static errcode_t raw_write_blk(io_channel channel,* to the O_DIRECT rules, so we need to do this the hard way... */ bounce_write: + if ((channel->block_size > channel->align) && + (channel->block_size % channel->align) == 0) + align_size = channel->block_size; + else + align_size = channel->align; + aligned_blk = location / align_size; + offset = location % align_size; + while (size > 0) { + int actual_w; + mutex_lock(data, BOUNCE_MTX); - if (size < channel->block_size) { + if (size < align_size || offset) { + if (ext2fs_llseek(data->dev, aligned_blk * align_size, + SEEK_SET) < 0) { + retval = errno ? errno : EXT2_ET_LLSEEK_FAILED; + goto error_out; + } actual = read(data->dev, data->bounce, - channel->block_size); - if (actual != channel->block_size) { + align_size); + if (actual != align_size) { if (actual < 0) { mutex_unlock(data, BOUNCE_MTX); retval = errno; goto error_out; } memset((char *) data->bounce + actual, 0, - channel->block_size - actual); + align_size - actual); } } actual = size; - if (size > channel->block_size) - actual = channel->block_size; - memcpy(data->bounce, buf, actual); - if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) { + if ((actual + offset) > align_size) + actual = align_size - offset; + if (actual > size) + actual = size; + memcpy(((char *)data->bounce) + offset, buf, actual); + if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) { retval = errno ? errno : EXT2_ET_LLSEEK_FAILED; goto error_out; } - actual = write(data->dev, data->bounce, channel->block_size); + actual_w = write(data->dev, data->bounce, align_size); mutex_unlock(data, BOUNCE_MTX); - if (actual < 0) { + if (actual_w < 0) { retval = errno; goto error_out; } - if (actual != channel->block_size) + if (actual_w != align_size) goto short_write; size -= actual; buf += actual; location += actual; + aligned_blk++; + offset = 0; } return 0;