From 48966b42bb997c81b407244efdecd29d65f9e7e5 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 17 Feb 2022 11:07:32 -0800 Subject: [PATCH 1/2] Add simple fallocate test Signed-off-by: Zach Brown --- tests/golden/fallocate | 3 +++ tests/sequence | 1 + tests/tests/fallocate.sh | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+) create mode 100644 tests/golden/fallocate create mode 100644 tests/tests/fallocate.sh diff --git a/tests/golden/fallocate b/tests/golden/fallocate new file mode 100644 index 00000000..44b6c818 --- /dev/null +++ b/tests/golden/fallocate @@ -0,0 +1,3 @@ +== creating reasonably large per-mount files +== 10s of racing cold reads and fallocate nop +== cleaning up files diff --git a/tests/sequence b/tests/sequence index c6e463f1..b86452dd 100644 --- a/tests/sequence +++ b/tests/sequence @@ -4,6 +4,7 @@ inode-items-updated.sh simple-inode-index.sh simple-staging.sh simple-release-extents.sh +fallocate.sh setattr_more.sh offline-extent-waiting.sh move-blocks.sh diff --git a/tests/tests/fallocate.sh b/tests/tests/fallocate.sh new file mode 100644 index 00000000..bf49cbe7 --- /dev/null +++ b/tests/tests/fallocate.sh @@ -0,0 +1,38 @@ + +t_require_commands fallocate cat + +echo "== creating reasonably large per-mount files" +for n in $(t_fs_nrs); do + eval path="\$T_D${n}/file-$n" + + LC_ALL=C fallocate -l 128MiB "$path" || \ + t_fail "initial creating fallocate failed" +done + +# +# we had lock inversions between read and fallocate, dropping +# the cache each time forces waiting for IO during the calls +# with the inverted locks held so we have a better chance +# of the deadlock happening. +# +DURATION=10 +echo "== ${DURATION}s of racing cold reads and fallocate nop" +END=$((SECONDS + DURATION)) +while [ $SECONDS -le $END ]; do + + echo 3 > /proc/sys/vm/drop_caches + + for n in $(t_fs_nrs); do + eval path="\$T_D${n}/file-$n" + + LC_ALL=C fallocate -o 0 -l 4KiB "$path" & + cat "$path" > /dev/null & + done + + wait || t_fail "fallocate or cat failed" +done + +echo "== cleaning up files" +rm -f "$T_D0"/file-* + +t_pass From 4d6350b3b0579ac019f3f8444f59a950e9f301c8 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 17 Feb 2022 14:48:13 -0800 Subject: [PATCH 2/2] Fix lock ordering in fallocate We were seeing ABBA deadlocks on the dio_count wait and extent_sem between fallocate and reads. It turns out that fallocate got lock ordering wrong. This brings fallocate in line with the rest of the adherents to the lock heirarchy. Most importantly, the extent_sem is used after the dio_count. While we're at it we bring the i_mutex down to just before the cluster lock for consistency. Signed-off-by: Zach Brown --- kmod/src/data.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/kmod/src/data.c b/kmod/src/data.c index aae6f601..2c0822db 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -983,9 +983,6 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) u64 last; s64 ret; - mutex_lock(&inode->i_mutex); - down_write(&si->extent_sem); - /* XXX support more flags */ if (mode & ~(FALLOC_FL_KEEP_SIZE)) { ret = -EOPNOTSUPP; @@ -1003,18 +1000,22 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; } + mutex_lock(&inode->i_mutex); + ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, inode, &lock); if (ret) - goto out; + goto out_mutex; inode_dio_wait(inode); + down_write(&si->extent_sem); + if (!(mode & FALLOC_FL_KEEP_SIZE) && (offset + len > i_size_read(inode))) { ret = inode_newsize_ok(inode, offset + len); if (ret) - goto out; + goto out_extent; } iblock = offset >> SCOUTFS_BLOCK_SM_SHIFT; @@ -1024,7 +1025,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true); if (ret) - goto out; + goto out_extent; ret = fallocate_extents(sb, inode, iblock, last, lock); @@ -1050,17 +1051,19 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) } if (ret <= 0) - goto out; + goto out_extent; iblock += ret; ret = 0; } -out: - scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE); +out_extent: up_write(&si->extent_sem); +out_mutex: + scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE); mutex_unlock(&inode->i_mutex); +out: trace_scoutfs_data_fallocate(sb, ino, mode, offset, len, ret); return ret; }