From 3d99fda0f6e5cc689e156c0b92a4a91ee0647b6a Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 28 Jun 2023 12:21:25 -0700 Subject: [PATCH 1/3] Preallocate data around iblock when noncontig If the _contig_only option isn't set then we try to preallocate aligned regions of files. The initial implementation naively only allowed one preallocation attempt in each aligned region. If it got a small allocation that didn't fill the region then every future allocation in the region would be a single block. This changes every preallocation in the region to attempt to fill the hole in the region that iblock fell in. It uses an extra extent search (item cache search) to try and avoid thousands of single block allocations. Signed-off-by: Zach Brown --- kmod/src/data.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/kmod/src/data.c b/kmod/src/data.c index bf934084..e05c23c4 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -456,11 +456,13 @@ static int alloc_block(struct super_block *sb, struct inode *inode, } else { /* - * Preallocation of aligned regions only preallocates if - * the aligned region contains no extents at all. This - * could be fooled by offline sparse extents but we - * don't want to iterate over all offline extents in the - * aligned region. + * Preallocation within aligned regions tries to + * allocate an extent to fill the hole in the region + * that contains iblock. We search for a next extent + * from the start of the region. If it's at the start + * we might have to search again to find an existing + * extent at the end of the region. (This next could be + * given to us by the caller). 
*/ div64_u64_rem(iblock, opts.data_prealloc_blocks, &rem); start = iblock - rem; @@ -468,8 +470,20 @@ static int alloc_block(struct super_block *sb, struct inode *inode, ret = scoutfs_ext_next(sb, &data_ext_ops, &args, start, 1, &found); if (ret < 0 && ret != -ENOENT) goto out; - if (found.len && found.start < start + count) - count = 1; + + /* trim count if there's an extent in the region before iblock */ + if (found.len && found.start < iblock) { + count -= (found.start + found.len) - start; + start = found.start + found.len; + /* see if there's also an extent after iblock */ + ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &found); + if (ret < 0 && ret != -ENOENT) + goto out; + } + + /* trim count by a next extent in the region */ + if (found.len && found.start > start && found.start < start + count) + count = (found.start - start); } /* overall prealloc limit */ From 564b942eadb0fb523806a23d14769cf4debc7101 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 28 Jun 2023 15:01:36 -0700 Subject: [PATCH 2/3] Write test for hole filling noncontig prealloc Add a test which exercises filling holes in prealloc regions when the _contig_only prealloc option is not set. Signed-off-by: Zach Brown --- tests/golden/data-prealloc | 322 +++++++++++++++++++++++++++++++++++ tests/tests/data-prealloc.sh | 94 ++++++++++ 2 files changed, 416 insertions(+) diff --git a/tests/golden/data-prealloc b/tests/golden/data-prealloc index 93854034..f6ac8d65 100644 --- a/tests/golden/data-prealloc +++ b/tests/golden/data-prealloc @@ -24,3 +24,325 @@ /mnt/test/test/data-prealloc/file-2: 5 extents found /mnt/test/test/data-prealloc/file-1: 3 extents found /mnt/test/test/data-prealloc/file-2: 3 extents found +== block writes into region allocs hole +wrote blk 24 +wrote blk 32 +wrote blk 40 +wrote blk 55 +wrote blk 63 +wrote blk 71 +wrote blk 72 +wrote blk 79 +wrote blk 80 +wrote blk 87 +wrote blk 88 +wrote blk 95 +before: +24.. 1: +32.. 1: +40.. 1: +55.. 1: +63.. 1: +71.. 2: +79.. 
2: +87.. 2: +95.. 1: eof +writing into existing 0 at pos 0 +wrote blk 0 +0.. 1: +1.. 7: unwritten +24.. 1: +32.. 1: +40.. 1: +55.. 1: +63.. 1: +71.. 2: +79.. 2: +87.. 2: +95.. 1: eof +writing into existing 0 at pos 1 +wrote blk 15 +0.. 1: +1.. 14: unwritten +15.. 1: +24.. 1: +32.. 1: +40.. 1: +55.. 1: +63.. 1: +71.. 2: +79.. 2: +87.. 2: +95.. 1: eof +writing into existing 0 at pos 2 +wrote blk 19 +0.. 1: +1.. 14: unwritten +15.. 1: +16.. 3: unwritten +19.. 1: +20.. 4: unwritten +24.. 1: +32.. 1: +40.. 1: +55.. 1: +63.. 1: +71.. 2: +79.. 2: +87.. 2: +95.. 1: eof +writing into existing 1 at pos 0 +wrote blk 25 +0.. 1: +1.. 14: unwritten +15.. 1: +16.. 3: unwritten +19.. 1: +20.. 4: unwritten +24.. 1: +25.. 1: +26.. 6: unwritten +32.. 1: +40.. 1: +55.. 1: +63.. 1: +71.. 2: +79.. 2: +87.. 2: +95.. 1: eof +writing into existing 1 at pos 1 +wrote blk 39 +0.. 1: +1.. 14: unwritten +15.. 1: +16.. 3: unwritten +19.. 1: +20.. 4: unwritten +24.. 1: +25.. 1: +26.. 6: unwritten +32.. 1: +33.. 6: unwritten +39.. 1: +40.. 1: +55.. 1: +63.. 1: +71.. 2: +79.. 2: +87.. 2: +95.. 1: eof +writing into existing 1 at pos 2 +wrote blk 44 +0.. 1: +1.. 14: unwritten +15.. 1: +16.. 3: unwritten +19.. 1: +20.. 4: unwritten +24.. 1: +25.. 1: +26.. 6: unwritten +32.. 1: +33.. 6: unwritten +39.. 1: +40.. 1: +41.. 3: unwritten +44.. 1: +45.. 3: unwritten +55.. 1: +63.. 1: +71.. 2: +79.. 2: +87.. 2: +95.. 1: eof +writing into existing 2 at pos 0 +wrote blk 48 +0.. 1: +1.. 14: unwritten +15.. 1: +16.. 3: unwritten +19.. 1: +20.. 4: unwritten +24.. 1: +25.. 1: +26.. 6: unwritten +32.. 1: +33.. 6: unwritten +39.. 1: +40.. 1: +41.. 3: unwritten +44.. 1: +45.. 3: unwritten +48.. 1: +49.. 6: unwritten +55.. 1: +63.. 1: +71.. 2: +79.. 2: +87.. 2: +95.. 1: eof +writing into existing 2 at pos 1 +wrote blk 62 +0.. 1: +1.. 14: unwritten +15.. 1: +16.. 3: unwritten +19.. 1: +20.. 4: unwritten +24.. 1: +25.. 1: +26.. 6: unwritten +32.. 1: +33.. 6: unwritten +39.. 1: +40.. 1: +41.. 3: unwritten +44.. 1: +45.. 
3: unwritten +48.. 1: +49.. 6: unwritten +55.. 1: +56.. 6: unwritten +62.. 1: +63.. 1: +71.. 2: +79.. 2: +87.. 2: +95.. 1: eof +writing into existing 2 at pos 2 +wrote blk 67 +0.. 1: +1.. 14: unwritten +15.. 1: +16.. 3: unwritten +19.. 1: +20.. 4: unwritten +24.. 1: +25.. 1: +26.. 6: unwritten +32.. 1: +33.. 6: unwritten +39.. 1: +40.. 1: +41.. 3: unwritten +44.. 1: +45.. 3: unwritten +48.. 1: +49.. 6: unwritten +55.. 1: +56.. 6: unwritten +62.. 1: +63.. 1: +64.. 3: unwritten +67.. 1: +68.. 3: unwritten +71.. 2: +79.. 2: +87.. 2: +95.. 1: eof +writing into existing 3 at pos 0 +wrote blk 73 +0.. 1: +1.. 14: unwritten +15.. 1: +16.. 3: unwritten +19.. 1: +20.. 4: unwritten +24.. 1: +25.. 1: +26.. 6: unwritten +32.. 1: +33.. 6: unwritten +39.. 1: +40.. 1: +41.. 3: unwritten +44.. 1: +45.. 3: unwritten +48.. 1: +49.. 6: unwritten +55.. 1: +56.. 6: unwritten +62.. 1: +63.. 1: +64.. 3: unwritten +67.. 1: +68.. 3: unwritten +71.. 2: +73.. 1: +74.. 5: unwritten +79.. 2: +87.. 2: +95.. 1: eof +writing into existing 3 at pos 1 +wrote blk 86 +0.. 1: +1.. 14: unwritten +15.. 1: +16.. 3: unwritten +19.. 1: +20.. 4: unwritten +24.. 1: +25.. 1: +26.. 6: unwritten +32.. 1: +33.. 6: unwritten +39.. 1: +40.. 1: +41.. 3: unwritten +44.. 1: +45.. 3: unwritten +48.. 1: +49.. 6: unwritten +55.. 1: +56.. 6: unwritten +62.. 1: +63.. 1: +64.. 3: unwritten +67.. 1: +68.. 3: unwritten +71.. 2: +73.. 1: +74.. 5: unwritten +79.. 2: +81.. 5: unwritten +86.. 1: +87.. 2: +95.. 1: eof +writing into existing 3 at pos 2 +wrote blk 92 +0.. 1: +1.. 14: unwritten +15.. 1: +16.. 3: unwritten +19.. 1: +20.. 4: unwritten +24.. 1: +25.. 1: +26.. 6: unwritten +32.. 1: +33.. 6: unwritten +39.. 1: +40.. 1: +41.. 3: unwritten +44.. 1: +45.. 3: unwritten +48.. 1: +49.. 6: unwritten +55.. 1: +56.. 6: unwritten +62.. 1: +63.. 1: +64.. 3: unwritten +67.. 1: +68.. 3: unwritten +71.. 2: +73.. 1: +74.. 5: unwritten +79.. 2: +81.. 5: unwritten +86.. 1: +87.. 2: +89.. 3: unwritten +92.. 1: +93.. 2: unwritten +95.. 
1: eof diff --git a/tests/tests/data-prealloc.sh b/tests/tests/data-prealloc.sh index b0bb81d2..09f42637 100644 --- a/tests/tests/data-prealloc.sh +++ b/tests/tests/data-prealloc.sh @@ -6,6 +6,15 @@ # t_require_commands scoutfs stat filefrag dd touch truncate +write_block() +{ + local file="$1" + local blk="$2" + + dd if=/dev/zero of="$file" bs=4096 seek=$blk count=1 conv=notrunc status=none + echo "wrote blk $blk" +} + write_forwards() { local prefix="$1" @@ -70,6 +79,25 @@ print_extents_found() filefrag "$prefix"* 2>&1 | grep "extent.*found" | t_filter_fs } +# +# print the logical start, len, and flags if they're there. +# +print_logical_extents() +{ + local file="$1" + + filefrag -v -b4096 "$file" 2>&1 | t_filter_fs | awk ' + ($1 ~ /[0-9]+:/) { + if ($NF !~ /[0-9]+:/) { + flags=$NF + } else { + flags="" + } + print $2, $6, flags + } + ' +} + t_save_all_sysfs_mount_options data_prealloc_blocks t_save_all_sysfs_mount_options data_prealloc_contig_only restore_options() @@ -133,4 +161,70 @@ t_set_sysfs_mount_option 0 data_prealloc_contig_only 0 write_forwards $prefix 3 print_extents_found $prefix +# +# prepare aligned regions of 8 blocks that we'll write into. +# We'll write into the first, last, and middle block of each +# region. Regions are prepared with no existing extents, one at +# the start, one at the end, or one at both ends. +# +# Let's keep this last because it creates a ton of output to read +# through. 
+# +echo "== block writes into region allocs hole" +t_set_sysfs_mount_option 0 data_prealloc_blocks 8 +t_set_sysfs_mount_option 0 data_prealloc_contig_only 1 +touch "$prefix" +truncate -s 0 "$prefix" + +# write initial blocks in regions +base=0 +for sides in 0 1 2 3; do + for i in 0 1 2; do + case "$sides" in + # none + 0) ;; + # left + 1) write_block $prefix $((base + 0)) ;; + # right + 2) write_block $prefix $((base + 7)) ;; + # both + 3) write_block $prefix $((base + 0)) + write_block $prefix $((base + 7)) ;; + esac + ((base+=8)) + done +done + +echo before: +print_logical_extents "$prefix" + +# now write into the first, middle, and last empty block of each +t_set_sysfs_mount_option 0 data_prealloc_contig_only 0 +base=0 +for sides in 0 1 2 3; do + for i in 0 1 2; do + echo "writing into existing $sides at pos $i" + case "$sides" in + # none + 0) left=$base; right=$((base + 7));; + # left + 1) left=$((base + 1)); right=$((base + 7));; + # right + 2) left=$((base)); right=$((base + 6));; + # both + 3) left=$((base + 1)); right=$((base + 6));; + esac + case "$i" in + # start + 0) write_block $prefix $left ;; + # end + 1) write_block $prefix $right ;; + # mid (both has 6 blocks internally) + 2) write_block $prefix $((left + 3)) ;; + esac + print_logical_extents "$prefix" + ((base+=8)) + done +done + t_pass From 847916860d6bf5a0a12aa556aeacd3652a5938d1 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 28 Jun 2023 16:47:41 -0700 Subject: [PATCH 3/3] Advance move_blocks extent search offset The move_blocks ioctl finds extents to move in the source file by searching from the starting block offset of the region to move. Logically, this is fine. After each extent item is deleted the next search will find the next extent. The problem is that deleted items still exist in the item cache. The next iteration has to skip over all the deleted extents from the start of the region. 
This is fine with large extents, but with heavily fragmented extents this creates a huge amplification of the number of items to traverse when moving the fragmented extents in a large file. (It's not quite O(n^2)/2 for the total extents -- deleted items are purged as we write out the dirty items in each transaction -- but it's still immense.) The fix is to simply start searching for the next extent after the one we just moved. Signed-off-by: Zach Brown --- kmod/src/data.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kmod/src/data.c b/kmod/src/data.c index e05c23c4..6b581a3b 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -1267,6 +1267,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off, from_iblock = from_off >> SCOUTFS_BLOCK_SM_SHIFT; count = (byte_len + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT; to_iblock = to_off >> SCOUTFS_BLOCK_SM_SHIFT; + from_start = from_iblock; /* only move extent blocks inside i_size, careful not to wrap */ from_size = i_size_read(from); @@ -1343,7 +1344,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off, /* find the next extent to move */ ret = scoutfs_ext_next(sb, &data_ext_ops, &from_args, - from_iblock, 1, &ext); + from_start, 1, &ext); if (ret < 0) { if (ret == -ENOENT) { done = true; @@ -1431,6 +1432,12 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off, i_size_read(from); i_size_write(to, to_size); } + + /* find next after moved extent, avoiding wrapping */ + if (from_start + len < from_start) + from_start = from_iblock + count + 1; + else + from_start += len; }