From 20370c657303514c9f73fe7117336d6fc95c2521 Mon Sep 17 00:00:00 2001 From: Bryant Duffy-Ly Date: Wed, 2 Mar 2022 10:25:42 -0500 Subject: [PATCH 1/4] Add O_DIRECT support We want to first pass a mapping of unwritten extents to the blockdev_direct_IO call. Then based upon the amount of bytes written we want to convert those unwritten extents into written. --- kmod/src/data.c | 286 +++++++++++++++++++++++++++++++++++++++++++++++- kmod/src/file.c | 6 + 2 files changed, 287 insertions(+), 5 deletions(-) diff --git a/kmod/src/data.c b/kmod/src/data.c index 2c0822dbb..3aa8cc860 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "format.h" #include "super.h" @@ -471,6 +472,7 @@ static int alloc_block(struct super_block *sb, struct inode *inode, ext->map = blkno; ext->flags = 0; ret = 0; + out: if (ret < 0 && blkno > 0) { err = scoutfs_free_data(sb, datinf->alloc, datinf->wri, @@ -488,8 +490,68 @@ static int alloc_block(struct super_block *sb, struct inode *inode, return ret; } +static int alloc_block_dio(struct super_block *sb, struct inode *inode, + struct scoutfs_extent *ext, struct buffer_head *bh, + u64 iblock, struct scoutfs_lock *lock) +{ + DECLARE_DATA_INFO(sb, datinf); + const u64 ino = scoutfs_ino(inode); + struct data_ext_args args = { + .ino = ino, + .inode = inode, + .lock = lock, + }; + u64 blkno = 0; + u64 blocks = 0; + u64 count = 0; + u64 last; + u8 ext_fl = 0; + int ret = 0; + bool first = true; + int err; + + last = (bh->b_size - 1) >> SCOUTFS_BLOCK_SM_SHIFT; + + while(blocks < last) { + if (ext->len >= last && first) + count = min_t(u64, last, SCOUTFS_FALLOCATE_ALLOC_LIMIT); + else + count = min_t(u64, last - blocks, SCOUTFS_FALLOCATE_ALLOC_LIMIT); + + mutex_lock(&datinf->mutex); + ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri, + &datinf->dalloc, count, &blkno, &count); + if (ret == 0) { + ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock, + count, blkno, + ext_fl | SEF_UNWRITTEN); + if (ret < 0) { + err = scoutfs_free_data(sb, datinf->alloc, + datinf->wri, + &datinf->data_freed, + blkno, count); + BUG_ON(err); /* inconsistent */ + } + } + + mutex_unlock(&datinf->mutex); + + if (ret < 0) + break; + + blocks += count; + first = false; + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, + iblock, 1, ext); + if (ret < 0) + break; + } + + return ret; +} + static int scoutfs_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) + struct buffer_head *bh, int create, bool dio_flag) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); const u64 ino = scoutfs_ino(inode); @@ -530,7 +592,7 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock, } /* convert unwritten to written, could be staging */ - if (create && ext.map && (ext.flags & SEF_UNWRITTEN)) { + if (create && ext.map && !dio_flag && (ext.flags & SEF_UNWRITTEN)) { un.start = iblock; un.len = 1; un.map = ext.map + (iblock - ext.start); @@ -542,11 +604,26 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock, set_buffer_new(bh); } goto out; + } else if (create && ext.map && dio_flag) { + un.start = iblock; + un.len = 1; + un.map = ext.map + (iblock - ext.start); + un.flags = ext.flags; + ret = scoutfs_ext_set(sb, &data_ext_ops, &args, + un.start, un.len, un.map, un.flags); + if (ret == 0) { + ext = un; + set_buffer_new(bh); + } + goto out; } /* allocate and map blocks containing our logical block */ if (create && !ext.map) { - ret = alloc_block(sb, inode, &ext, iblock, lock); + if (dio_flag) + ret = alloc_block_dio(sb, inode, &ext, bh, iblock, lock); + else + ret = alloc_block(sb, inode, &ext, iblock, lock); if (ret == 0) set_buffer_new(bh); } else { @@ -580,7 +657,7 @@ static int scoutfs_get_block_read(struct inode *inode, sector_t iblock, int ret; down_read(&si->extent_sem); - ret = scoutfs_get_block(inode, iblock, bh, create); + ret = scoutfs_get_block(inode, iblock, bh, create, false); up_read(&si->extent_sem); return ret; @@ -593,12 +670,62 @@ static int scoutfs_get_block_write(struct inode *inode, sector_t iblock, int ret; down_write(&si->extent_sem); - ret = scoutfs_get_block(inode, iblock, bh, create); + ret = scoutfs_get_block(inode, iblock, bh, create, false); up_write(&si->extent_sem); return ret; } + +static int scoutfs_get_block_write_dio(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + struct super_block *sb = inode->i_sb; + struct scoutfs_lock *lock = NULL; + LIST_HEAD(ind_locks); + int ret; + + lock = scoutfs_per_task_get(&si->pt_data_lock); + if (WARN_ON_ONCE(!lock)) { + return -EINVAL; + } + + if (inode) + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, + true, false); + else + ret = scoutfs_hold_trans(sb, false); + if (ret) + goto out; + + if (inode) + ret = scoutfs_dirty_inode_item(inode, lock); + + if (ret < 0) + goto out_unlock; + + down_write(&si->extent_sem); + ret = scoutfs_get_block(inode, iblock, bh, create, true); + up_write(&si->extent_sem); + + if (inode) { + scoutfs_inode_set_data_seq(inode); + scoutfs_inode_inc_data_version(inode); + inode_inc_iversion(inode); + + if (ret > 0) + i_size_write(inode, ret); + scoutfs_update_inode_item(inode, lock, &ind_locks); + } + +out_unlock: + scoutfs_release_trans(sb); + scoutfs_inode_index_unlock(sb, &ind_locks); +out: + return ret; +} + /* * This is almost never used. We can't block on a cluster lock while * holding the page lock because lock invalidation gets the page lock @@ -861,6 +988,154 @@ static int scoutfs_write_end(struct file *file, struct address_space *mapping, return ret; } +static s64 convert_unwritten_items(struct super_block *sb, struct inode *inode, + u64 ino, u64 iblock, u64 last, + struct scoutfs_lock *lock) +{ + struct data_ext_args args = { + .ino = ino, + .inode = inode, + .lock = lock, + }; + struct scoutfs_extent ext; + struct scoutfs_extent un; + u64 offset; + s64 ret; + int i; + + ret = 0; + + for (i = 0; iblock <= last; i++) { + if (i == EXTENTS_PER_HOLD) { + ret = iblock; + break; + } + + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, + iblock, 1, &ext); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + break; + } + + /* done if we went past the region */ + if (ext.start > last) { + ret = 0; + break; + } + + /* nothing to do when already marked written */ + if (!(ext.flags & SEF_UNWRITTEN)) { + iblock = ext.start + ext.len; + continue; + } + + iblock = max(ext.start, iblock); + offset = iblock - ext.start; + + un.start = iblock; + un.map = ext.map ? ext.map + offset : 0; + un.len = min(ext.len - offset, last - iblock + 1); + un.flags = ext.flags & ~(SEF_OFFLINE|SEF_UNWRITTEN); + + ret = scoutfs_ext_set(sb, &data_ext_ops, &args, + un.start, un.len, un.map, un.flags); + if (ret < 0) + break; + + iblock += un.len; + } + + return ret; +} + +static ssize_t +convert_unwritten_extent(struct inode *inode, loff_t offset, ssize_t count) +{ + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + struct super_block *sb = inode->i_sb; + const u64 ino = scoutfs_ino(inode); + struct scoutfs_lock *lock = NULL; + LIST_HEAD(ind_locks); + u64 iblock; + u64 last; + ssize_t ret = 0; + + lock = scoutfs_per_task_get(&si->pt_data_lock); + if (WARN_ON_ONCE(!lock)) { + ret = -EINVAL; + goto out; + } + + iblock = offset >> SCOUTFS_BLOCK_SM_SHIFT; + last = (offset + count - 1) >> SCOUTFS_BLOCK_SM_SHIFT; + while(iblock <= last) { + if (inode) + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, + true, false); + else + ret = scoutfs_hold_trans(sb, false); + if (ret) + break; + + if (inode) + ret = scoutfs_dirty_inode_item(inode, lock); + else + ret = 0; + + if (ret == 0) { + down_write(&si->extent_sem); + ret = convert_unwritten_items(sb, inode, ino, iblock, + last, lock); + up_write(&si->extent_sem); + } + + if (ret < 0) + goto out; + + if (inode) { + scoutfs_inode_set_data_seq(inode); + scoutfs_inode_inc_data_version(inode); + inode_inc_iversion(inode); + + if (ret > 0) + i_size_write(inode, ret); + scoutfs_update_inode_item(inode, lock, &ind_locks); + } + scoutfs_release_trans(sb); + if (inode) + scoutfs_inode_index_unlock(sb, &ind_locks); + + if (ret <= 0) + break; + + iblock = ret; + } + +out: + return ret; +} + +static ssize_t +scoutfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + scoutfs_get_block_write_dio); + if (ret > 0 && (rw & WRITE)) + { + ret = convert_unwritten_extent(inode, offset, ret); + } + + return ret; +} + /* * Try to allocate unwritten extents for any unallocated regions of the * logical block extent from the caller. The caller manages locks and @@ -1804,6 +2079,7 @@ const struct address_space_operations scoutfs_file_aops = { .writepages = scoutfs_writepages, .write_begin = scoutfs_write_begin, .write_end = scoutfs_write_end, + .direct_IO = scoutfs_direct_IO, }; const struct file_operations scoutfs_file_fops = { diff --git a/kmod/src/file.c b/kmod/src/file.c index 586d77fda..9ee31de81 100644 --- a/kmod/src/file.c +++ b/kmod/src/file.c @@ -47,6 +47,9 @@ ssize_t scoutfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov, DECLARE_DATA_WAIT(dw); int ret; + if (!is_sync_kiocb(iocb)) + return -EINVAL; + retry: /* protect checked extents from release */ mutex_lock(&inode->i_mutex); @@ -97,6 +100,9 @@ ssize_t scoutfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, DECLARE_DATA_WAIT(dw); int ret; + if (!is_sync_kiocb(iocb)) + return -EINVAL; + if (iocb->ki_left == 0) /* Does this even happen? */ return 0; From bb11617fe376b45cd0e203090e6fb5fd15bb11f9 Mon Sep 17 00:00:00 2001 From: Bryant Duffy-Ly Date: Wed, 16 Mar 2022 12:11:27 -0400 Subject: [PATCH 2/4] Fix EOF extent in last block Currently if there is an extent on the last block the code will only set EOF on ENOENT. In the case that the last block has an extent it wont go to the next iteration due to iblock <= last. This then doesnt set the EOF on the last block in these cases. We want to just allow the loop to keep looping and rely on if (ext.start > last) to protect us from infinite loop. --- kmod/src/data.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kmod/src/data.c b/kmod/src/data.c index 3aa8cc860..7b908bd7c 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -1748,13 +1748,14 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, iblock = start >> SCOUTFS_BLOCK_SM_SHIFT; last = (start + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT; - while (iblock <= last) { + while (true) { ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &ext); if (ret < 0) { - if (ret == -ENOENT) + if (ret == -ENOENT) { ret = 0; - last_flags = FIEMAP_EXTENT_LAST; + last_flags = FIEMAP_EXTENT_LAST; + } break; } From 9fc759ce47709ec3792b087ff9b6c75ba918dd1d Mon Sep 17 00:00:00 2001 From: Bryant Duffy-Ly Date: Wed, 16 Mar 2022 19:04:34 -0400 Subject: [PATCH 3/4] Fix truncate for O_DIRECT In the buffered case page tail zeroing happens automatically. In the O_DIRECT case it does not so we need to add it in our setattr path just like EXT2. We want to zero the end of the block that contains i_size during truncate, so we just call block_truncate_page in set_inode_size. --- kmod/src/data.c | 4 ++-- kmod/src/data.h | 2 ++ kmod/src/inode.c | 11 ++++++++++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/kmod/src/data.c b/kmod/src/data.c index 7b908bd7c..7d8d6a8a3 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -663,8 +663,8 @@ static int scoutfs_get_block_read(struct inode *inode, sector_t iblock, return ret; } -static int scoutfs_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) +int scoutfs_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); int ret; diff --git a/kmod/src/data.h b/kmod/src/data.h index c056915e2..2d4a119a9 100644 --- a/kmod/src/data.h +++ b/kmod/src/data.h @@ -49,6 +49,8 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); +int scoutfs_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); int scoutfs_data_init_offline_extent(struct inode *inode, u64 size, struct scoutfs_lock *lock); int scoutfs_data_move_blocks(struct inode *from, u64 from_off, diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 9d22f52a7..7d007688b 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "format.h" #include "super.h" @@ -354,15 +355,22 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock, { struct scoutfs_inode_info *si = SCOUTFS_I(inode); struct super_block *sb = inode->i_sb; + SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent); LIST_HEAD(ind_locks); int ret; if (!S_ISREG(inode->i_mode)) return 0; + scoutfs_per_task_add(&si->pt_data_lock, &pt_ent, lock); + ret = block_truncate_page(inode->i_mapping, new_size, scoutfs_get_block_write); + scoutfs_per_task_del(&si->pt_data_lock, &pt_ent); + if (ret) + goto out; + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false); if (ret) - return ret; + goto out; if (new_size != i_size_read(inode)) scoutfs_inode_inc_data_version(inode); @@ -378,6 +386,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock, scoutfs_release_trans(sb); scoutfs_inode_index_unlock(sb, &ind_locks); +out: return ret; } From 1029d5a0fe20e0b63e3524674f96f4ee71e4cbc8 Mon Sep 17 00:00:00 2001 From: "Bryant G. Duffy-Ly" Date: Fri, 18 Mar 2022 07:54:28 -0500 Subject: [PATCH 4/4] Enable and Disable correct unit tests for O_DIRECT Signed-off-by: Bryant G. Duffy-Ly --- tests/golden/xfstests | 22 ++++++---------------- tests/tests/xfstests.sh | 10 ++++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/golden/xfstests b/tests/golden/xfstests index 9a818f512..b598b5007 100644 --- a/tests/golden/xfstests +++ b/tests/golden/xfstests @@ -45,9 +45,14 @@ generic/107 generic/117 generic/124 generic/129 +generic/130 generic/131 +generic/135 generic/169 generic/184 +generic/211 +generic/212 +generic/214 generic/221 generic/228 generic/236 @@ -100,13 +105,9 @@ generic/078 generic/079 generic/081 generic/082 -generic/091 -generic/094 generic/096 generic/110 generic/111 -generic/113 -generic/114 generic/115 generic/116 generic/118 @@ -115,9 +116,7 @@ generic/121 generic/122 generic/123 generic/128 -generic/130 generic/134 -generic/135 generic/136 generic/138 generic/139 @@ -165,7 +164,6 @@ generic/194 generic/195 generic/196 generic/197 -generic/198 generic/199 generic/200 generic/201 @@ -173,11 +171,6 @@ generic/202 generic/203 generic/205 generic/206 -generic/207 -generic/210 -generic/211 -generic/212 -generic/214 generic/216 generic/217 generic/218 @@ -185,13 +178,11 @@ generic/219 generic/220 generic/222 generic/223 -generic/225 generic/227 generic/229 generic/230 generic/235 generic/238 -generic/240 generic/244 generic/250 generic/252 @@ -203,7 +194,6 @@ generic/259 generic/260 generic/261 generic/262 -generic/263 generic/264 generic/265 generic/266 @@ -282,4 +272,4 @@ shared/004 shared/032 shared/051 shared/289 -Passed all 75 tests +Passed all 80 tests diff --git a/tests/tests/xfstests.sh b/tests/tests/xfstests.sh index 140a38191..0a4df47ff 100644 --- a/tests/tests/xfstests.sh +++ b/tests/tests/xfstests.sh @@ -64,19 +64,29 @@ generic/029 # mmap missing generic/030 # mmap missing generic/075 # file content mismatch failures (fds, etc) generic/080 # mmap missing +generic/091 # skip fsx tests +generic/094 # odirect streaming pre-alloc treated as failure in xfstests generic/103 # enospc causes trans commit failures generic/105 # needs trigage: something about acls generic/108 # mount fails on failing device? generic/112 # file content mismatch failures (fds, etc) +generic/113 # block aio dio runs +generic/114 # block aio dio runs generic/120 # (can't exec 'cause no mmap) generic/126 # (can't exec 'cause no mmap) generic/141 # mmap missing +generic/198 # block aio dio runs +generic/207 # block aio dio runs +generic/210 # block aio dio runs generic/213 # enospc causes trans commit failures generic/215 # mmap missing +generic/225 # odirect streaming pre-alloc treated as failure in xfstests generic/237 # wrong error return from failing setfacl? +generic/240 # block aio dio runs generic/246 # mmap missing generic/247 # mmap missing generic/248 # mmap missing +generic/263 # do not support allocate mode FALLOC_FL_PUNCH_HOLE, FALLOC_FL_KEEP_SIZE, FALLOC_FL_ZERO_RANGE... generic/319 # utils output change? update branch? generic/321 # requires selinux enabled for '+' in ls? generic/325 # mmap missing