From e8c64b42177a971159c41935fc7998eb6f2b63c7 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Wed, 13 Jul 2022 09:33:13 -0700
Subject: [PATCH 1/5] Move freed data extents in multiple server commits

As _get_log_trees() in the server prepares the log_trees item for the
client's commit, it moves all the freed data extents from the log_trees
item into core data extent allocator btree items. If the freed blocks
are very fragmented then it can exceed a commit's metadata allocation
budget trying to dirty blocks in the free data extent btree.

The fix is to move the freed data extents in multiple commits. First we
move a limited number in the main commit that does all the rest of the
work preparing the commit. Then we try to move the remaining freed
extents in multiple additional commits.

Signed-off-by: Zach Brown
---
 kmod/src/server.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 1 deletion(-)

diff --git a/kmod/src/server.c b/kmod/src/server.c
index ca2c6524..32e99ecd 100644
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -1226,6 +1226,82 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 	return ret;
 }
 
+/*
+ * The calling get_log_trees ran out of available blocks in its commit's
+ * metadata allocator while moving extents from the log tree's
+ * data_freed into the core data_avail. This finishes moving the
+ * extents in as many additional commits as it takes. The logs mutex
+ * is nested inside holding commits so we recheck the persistent item
+ * each time we commit to make sure it's still what we think. The
+ * caller is still going to send the item to the client so we update
+ * the caller's copy each time we make progress. This is a best-effort
+ * attempt to clean up and it's valid to leave extents in data_freed,
+ * so we don't return errors to the caller. The client will continue
+ * the work later in get_log_trees or as the rid is reclaimed.
+ */
+static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
+	const u64 rid = le64_to_cpu(lt->rid);
+	const u64 nr = le64_to_cpu(lt->nr);
+	struct scoutfs_log_trees drain;
+	struct scoutfs_key key;
+	COMMIT_HOLD(hold);
+	int ret = 0;
+	int err;
+
+	scoutfs_key_init_log_trees(&key, rid, nr);
+
+	while (lt->data_freed.total_len != 0) {
+		server_hold_commit(sb, &hold);
+		mutex_lock(&server->logs_mutex);
+
+		ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX, &drain);
+		if (ret < 0)
+			break;
+
+		/* careful to only keep draining the caller's specific open trans */
+		if (drain.nr != lt->nr || drain.get_trans_seq != lt->get_trans_seq ||
+		    drain.commit_trans_seq != lt->commit_trans_seq || drain.flags != lt->flags) {
+			ret = -ENOENT;
+			break;
+		}
+
+		ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri,
+					  &super->logs_root, &key);
+		if (ret < 0)
+			break;
+
+		/* moving can modify and return errors, always update caller and item */
+		mutex_lock(&server->alloc_mutex);
+		ret = alloc_move_empty(sb, &super->data_alloc, &drain.data_freed,
+				       COMMIT_HOLD_ALLOC_BUDGET / 2);
+		mutex_unlock(&server->alloc_mutex);
+		if (ret == -EINPROGRESS)
+			ret = 0;
+
+		*lt = drain;
+		err = scoutfs_btree_force(sb, &server->alloc, &server->wri,
+					  &super->logs_root, &key, &drain, sizeof(drain));
+		BUG_ON(err < 0); /* dirtying must guarantee success */
+
+		mutex_unlock(&server->logs_mutex);
+
+		ret = server_apply_commit(sb, &hold, ret);
+		if (ret < 0) {
+			ret = 0; /* don't try to abort, ignoring ret */
+			break;
+		}
+	}
+
+	/* try to cleanly abort and write any partial dirty btree blocks, but ignore result */
+	if (ret < 0) {
+		mutex_unlock(&server->logs_mutex);
+		server_apply_commit(sb, &hold, 0);
+	}
+}
+
 /*
  * Give the client roots to all the trees that they'll use to build
  * their transaction.
@@ -1351,7 +1427,9 @@ static int server_get_log_trees(struct super_block *sb,
 		goto update;
 	}
 
-	ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 0);
+	ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 100);
+	if (ret == -EINPROGRESS)
+		ret = 0;
 	if (ret < 0) {
 		err_str = "emptying committed data_freed";
 		goto update;
@@ -1429,6 +1507,10 @@ out:
 		scoutfs_err(sb, "error %d getting log trees for rid %016llx: %s",
 			    ret, rid, err_str);
 
+	/* try to drain excessive data_freed with additional commits, if needed, ignoring err */
+	if (ret == 0)
+		try_drain_data_freed(sb, &lt);
+
 	return scoutfs_net_response(sb, conn, cmd, id, ret, &lt, sizeof(lt));
 }
 
From 198d3cda3228184f6d8b0955b0a07c3dc4aba542 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Thu, 28 Jul 2022 11:28:06 -0700
Subject: [PATCH 2/5] Add scoutfs_alloc_meta_low_since()

Add scoutfs_alloc_meta_low_since() to test if the metadata avail or
freed resources have been used by a given amount since a previous
snapshot.

Signed-off-by: Zach Brown
---
 kmod/src/alloc.c | 21 +++++++++++++++++++++
 kmod/src/alloc.h |  2 ++
 2 files changed, 23 insertions(+)

diff --git a/kmod/src/alloc.c b/kmod/src/alloc.c
index 76bff395..0284150e 100644
--- a/kmod/src/alloc.c
+++ b/kmod/src/alloc.c
@@ -1351,6 +1351,27 @@ void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total,
 	} while (read_seqretry(&alloc->seqlock, seq));
 }
 
+/*
+ * Returns true if the caller's consumption of nr from either avail or
+ * freed would end up exceeding their budget relative to the starting
+ * remaining snapshot they took.
+ */
+bool scoutfs_alloc_meta_low_since(struct scoutfs_alloc *alloc, u32 avail_start, u32 freed_start,
+				  u32 budget, u32 nr)
+{
+	u32 avail_use;
+	u32 freed_use;
+	u32 avail;
+	u32 freed;
+
+	scoutfs_alloc_meta_remaining(alloc, &avail, &freed);
+
+	avail_use = avail_start - avail;
+	freed_use = freed_start - freed;
+
+	return ((avail_use + nr) > budget) || ((freed_use + nr) > budget);
+}
+
 bool scoutfs_alloc_test_flag(struct super_block *sb, struct scoutfs_alloc *alloc, u32 flag)
 {
 
diff --git a/kmod/src/alloc.h b/kmod/src/alloc.h
index b3a7e3c6..5e0d4ff3 100644
--- a/kmod/src/alloc.h
+++ b/kmod/src/alloc.h
@@ -159,6 +159,8 @@ int scoutfs_alloc_splice_list(struct super_block *sb,
 bool scoutfs_alloc_meta_low(struct super_block *sb, struct scoutfs_alloc *alloc, u32 nr);
 void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total,
 				  u32 *freed_space);
+bool scoutfs_alloc_meta_low_since(struct scoutfs_alloc *alloc, u32 avail_start, u32 freed_start,
+				  u32 budget, u32 nr);
 
 bool scoutfs_alloc_test_flag(struct super_block *sb, struct scoutfs_alloc *alloc, u32 flag);
 
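The new helper is meant to be paired with scoutfs_alloc_meta_remaining(): sample the
remaining counts once up front, then ask before each step of work whether that step
would push consumption since the sample past the caller's budget. A minimal sketch of
that calling pattern follows; have_more_work, budget, nr and the loop body are
placeholders for illustration only, the real user is scoutfs_alloc_move() in the next
patch.

	u32 avail_start = 0;
	u32 freed_start = 0;

	scoutfs_alloc_meta_remaining(alloc, &avail_start, &freed_start);

	while (have_more_work) {
		/* nr: metadata blocks the next step might dirty */
		if (scoutfs_alloc_meta_low_since(alloc, avail_start, freed_start,
						 budget, nr)) {
			ret = -EINPROGRESS;	/* stop, let the caller commit and retry */
			break;
		}
		/* ... do one bounded step that dirties at most nr blocks ... */
	}
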
From 233fbb39f343d9e42555408a9083b8d2d06c91c5 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Wed, 27 Jul 2022 14:19:44 -0700
Subject: [PATCH 3/5] Limit alloc_move per-call allocator consumption

Recently scoutfs_alloc_move() was changed to try to limit the number of
metadata blocks it could allocate or free. The intent was to stop
concurrent holders of a transaction from fully consuming the available
allocator for the transaction.

The limiting logic was a bit off. It stopped when the allocator had the
caller's limit remaining, not when it had consumed the caller's limit.
This is overly permissive and could still allow concurrent callers to
consume the allocator. It was also triggering warning messages when a
call consumed more than its allowed budget while holding a transaction.

Unfortunately, we don't have per-caller tracking of allocator resource
consumption. The best we can do is sample the allocators as we start
and return if they drop by the caller's limit. This is overly
conservative in that it attributes any consumption by concurrent
callers to all callers.

This isn't perfect but it makes the failure case less likely and the
impact shouldn't be significant. We don't often have a lot of
concurrency and the limits are larger than callers will typically
consume.

Signed-off-by: Zach Brown
---
 kmod/src/alloc.c  | 26 +++++++++++++++-----------
 kmod/src/alloc.h  |  2 +-
 kmod/src/server.c |  4 ++--
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/kmod/src/alloc.c b/kmod/src/alloc.c
index 0284150e..a36387b7 100644
--- a/kmod/src/alloc.c
+++ b/kmod/src/alloc.c
@@ -892,12 +892,11 @@ static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *r
  * -ENOENT is returned if we run out of extents in the source tree
  * before moving the total.
  *
- * If meta_reserved is non-zero then -EINPROGRESS can be returned if the
- * current meta allocator's avail blocks or room for freed blocks would
- * have fallen under the reserved amount. The could have been
- * successfully dirtied in this case but the number of blocks moved is
- * not returned. The caller is expected to deal with the partial
- * progress by commiting the dirty trees and examining the resulting
+ * If meta_budget is non-zero then -EINPROGRESS can be returned if the
+ * caller's budget is consumed in the allocator during this call (though
+ * not necessarily by us, we don't have per-thread tracking of allocator
+ * consumption :/). The call can still have made progress and the caller
+ * is expected to commit the dirty trees and examine the resulting
  * modified trees to see if they need to continue moving extents.
  *
  * The caller can specify that extents in the source tree should first
@@ -914,7 +913,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 		       struct scoutfs_block_writer *wri,
 		       struct scoutfs_alloc_root *dst,
 		       struct scoutfs_alloc_root *src, u64 total,
-		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved)
+		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_budget)
 {
 	struct alloc_ext_args args = {
 		.alloc = alloc,
@@ -922,6 +921,8 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 	};
 	struct scoutfs_extent found;
 	struct scoutfs_extent ext;
+	u32 avail_start = 0;
+	u32 freed_start = 0;
 	u64 moved = 0;
 	u64 count;
 	int ret = 0;
@@ -932,6 +933,9 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 		vacant = NULL;
 	}
 
+	if (meta_budget != 0)
+		scoutfs_alloc_meta_remaining(alloc, &avail_start, &freed_start);
+
 	while (moved < total) {
 		count = total - moved;
 
@@ -964,10 +968,10 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 		if (ret < 0)
 			break;
 
-		if (meta_reserved != 0 &&
-		    scoutfs_alloc_meta_low(sb, alloc, meta_reserved +
-					   extent_mod_blocks(src->root.height) +
-					   extent_mod_blocks(dst->root.height))) {
+		if (meta_budget != 0 &&
+		    scoutfs_alloc_meta_low_since(alloc, avail_start, freed_start, meta_budget,
+						 extent_mod_blocks(src->root.height) +
+						 extent_mod_blocks(dst->root.height))) {
 			ret = -EINPROGRESS;
 			break;
 		}
diff --git a/kmod/src/alloc.h b/kmod/src/alloc.h
index 5e0d4ff3..4496802c 100644
--- a/kmod/src/alloc.h
+++ b/kmod/src/alloc.h
@@ -131,7 +131,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 		       struct scoutfs_block_writer *wri,
 		       struct scoutfs_alloc_root *dst,
 		       struct scoutfs_alloc_root *src, u64 total,
-		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved);
+		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_budget);
 int scoutfs_alloc_insert(struct super_block *sb, struct scoutfs_alloc *alloc,
 			 struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
 			 u64 start, u64 len);
diff --git a/kmod/src/server.c b/kmod/src/server.c
index 32e99ecd..17f38481 100644
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -694,13 +694,13 @@ static int alloc_move_refill_zoned(struct super_block *sb, struct scoutfs_alloc_
 
 static int alloc_move_empty(struct super_block *sb,
 			    struct scoutfs_alloc_root *dst,
-			    struct scoutfs_alloc_root *src, u64 meta_reserved)
+			    struct scoutfs_alloc_root *src, u64 meta_budget)
 {
 	DECLARE_SERVER_INFO(sb, server);
 
 	return scoutfs_alloc_move(sb, &server->alloc, &server->wri, dst, src,
				  le64_to_cpu(src->total_len), NULL, NULL, 0,
-				  meta_reserved);
+				  meta_budget);
 }
 
 /*
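To make the fix concrete, a small worked comparison with invented numbers (the real nr
is the sum of the two extent_mod_blocks() costs): suppose a call starts while the
transaction's allocator has 1000 avail blocks, the caller's budget is 100, and the next
extent move might dirty 8 metadata blocks.

	/* Illustration only, invented numbers, not part of the patch. */
	bool old_low, new_low;

	/*
	 * Old check: back off when the allocator itself is nearly empty.
	 * Only true once avail < 108, i.e. after ~892 of the 1000 blocks
	 * have been consumed, possibly all by this one caller.
	 */
	old_low = scoutfs_alloc_meta_low(sb, alloc, 100 + 8);

	/*
	 * New check: back off once roughly 100 blocks have been consumed
	 * since the call sampled 1000 avail and 1000 freed room, no matter
	 * how much the allocator still has overall.
	 */
	new_low = scoutfs_alloc_meta_low_since(alloc, 1000, 1000, 100, 8);
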
From acb94dd9b78b7bb68b4d244589d30ad9004486c8 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Fri, 15 Jul 2022 10:20:03 -0700
Subject: [PATCH 4/5] Add test of large fragmented free lists

Add a test which gives the server a transaction with a free list block
that contains blknos that each dirty an individual btree block in the
global data free extent btree.

Signed-off-by: Zach Brown
---
 tests/Makefile                       |   3 +-
 tests/golden/large-fragmented-free   |   3 +
 tests/sequence                       |   1 +
 tests/src/fragmented_data_extents.c  | 113 +++++++++++++++++++++++++++
 tests/tests/large-fragmented-free.sh |  22 ++++++
 5 files changed, 141 insertions(+), 1 deletion(-)
 create mode 100644 tests/golden/large-fragmented-free
 create mode 100644 tests/src/fragmented_data_extents.c
 create mode 100644 tests/tests/large-fragmented-free.sh

diff --git a/tests/Makefile b/tests/Makefile
index 60eea516..fcc43df1 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -10,7 +10,8 @@ BIN := src/createmany \
 	src/bulk_create_paths \
 	src/stage_tmpfile \
 	src/find_xattrs \
-	src/create_xattr_loop
+	src/create_xattr_loop \
+	src/fragmented_data_extents
 
 DEPS := $(wildcard src/*.d)
 
diff --git a/tests/golden/large-fragmented-free b/tests/golden/large-fragmented-free
new file mode 100644
index 00000000..58aea5f7
--- /dev/null
+++ b/tests/golden/large-fragmented-free
@@ -0,0 +1,3 @@
+== creating fragmented extents
+== unlink file with moved extents to free extents per block
+== cleanup
diff --git a/tests/sequence b/tests/sequence
index 37f30647..db6e9ba0 100644
--- a/tests/sequence
+++ b/tests/sequence
@@ -9,6 +9,7 @@ fallocate.sh
 setattr_more.sh
 offline-extent-waiting.sh
 move-blocks.sh
+large-fragmented-free.sh
 enospc.sh
 srch-basic-functionality.sh
 simple-xattr-unit.sh
diff --git a/tests/src/fragmented_data_extents.c b/tests/src/fragmented_data_extents.c
new file mode 100644
index 00000000..d3fcc877
--- /dev/null
+++ b/tests/src/fragmented_data_extents.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (C) 2021 Versity Software, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/*
+ * This creates fragmented data extents.
+ *
+ * A file is created that has alternating free and allocated extents.
+ * This also results in the global allocator having the matching
+ * fragmented free extent pattern. While that file is being created,
+ * occasionally an allocated extent is moved to another file. This
+ * results in a file that has fragmented extents at a given stride that
+ * can be deleted to create free data extents with a given stride.
+ *
+ * We don't have hole punching so to do this quickly we use a goofy
+ * combination of fallocate, truncate, and our move_blocks ioctl.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+
+#include "ioctl.h"
+
+#define BLOCK_SIZE 4096
+
+int main(int argc, char **argv)
+{
+	struct scoutfs_ioctl_move_blocks mb = {0,};
+	unsigned long long freed_extents;
+	unsigned long long move_stride;
+	unsigned long long i;
+	int alloc_fd;
+	int trunc_fd;
+	off_t off;
+	int ret;
+
+	if (argc != 5) {
+		printf("%s <freed_extents> <move_stride> <alloc_file> <move_file>\n", argv[0]);
+		return 1;
+	}
+
+	freed_extents = strtoull(argv[1], NULL, 0);
+	move_stride = strtoull(argv[2], NULL, 0);
+
+	alloc_fd = open(argv[3], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
+	if (alloc_fd == -1) {
+		fprintf(stderr, "error opening %s: %d (%s)\n", argv[3], errno, strerror(errno));
+		exit(1);
+	}
+
+	trunc_fd = open(argv[4], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
+	if (trunc_fd == -1) {
+		fprintf(stderr, "error opening %s: %d (%s)\n", argv[4], errno, strerror(errno));
+		exit(1);
+	}
+
+	for (i = 0, off = 0; i < freed_extents; i++, off += BLOCK_SIZE * 2) {
+
+		ret = fallocate(alloc_fd, 0, off, BLOCK_SIZE * 2);
+		if (ret < 0) {
+			fprintf(stderr, "fallocate at off %llu error: %d (%s)\n",
+				(unsigned long long)off, errno, strerror(errno));
+			exit(1);
+		}
+
+		ret = ftruncate(alloc_fd, off + BLOCK_SIZE);
+		if (ret < 0) {
+			fprintf(stderr, "truncate to off %llu error: %d (%s)\n",
+				(unsigned long long)off + BLOCK_SIZE, errno, strerror(errno));
+			exit(1);
+		}
+
+		if ((i % move_stride) == 0) {
+			mb.from_fd = alloc_fd;
+			mb.from_off = off;
+			mb.len = BLOCK_SIZE;
+			mb.to_off = i * BLOCK_SIZE;
+
+			ret = ioctl(trunc_fd, SCOUTFS_IOC_MOVE_BLOCKS, &mb);
+			if (ret < 0) {
+				fprintf(stderr, "move from off %llu error: %d (%s)\n",
+					(unsigned long long)off,
+					errno, strerror(errno));
+			}
+		}
+	}
+
+	if (alloc_fd > -1)
+		close(alloc_fd);
+	if (trunc_fd > -1)
+		close(trunc_fd);
+
+	return 0;
+}
diff --git a/tests/tests/large-fragmented-free.sh b/tests/tests/large-fragmented-free.sh
new file mode 100644
index 00000000..e27d1bbe
--- /dev/null
+++ b/tests/tests/large-fragmented-free.sh
@@ -0,0 +1,22 @@
+#
+# Make sure the server can handle a transaction with a data_freed whose
+# blocks all hit different btree blocks in the main free list. It
+# probably has to be merged in multiple commits.
+#
+
+t_require_commands fragmented_data_extents
+
+EXTENTS_PER_BTREE_BLOCK=600
+EXTENTS_PER_LIST_BLOCK=8192
+FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
+
+echo "== creating fragmented extents"
+fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"
+
+echo "== unlink file with moved extents to free extents per block"
+rm -f "$T_D0/move"
+
+echo "== cleanup"
+rm -f "$T_D0/alloc"
+
+t_pass
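For scale, as the variable names suggest: the test creates 600 * 8192 = 4,915,200
alternating allocated and freed 4 KiB extents in "alloc" and moves one of every 600
allocated extents into "move". That leaves "move" with 4,915,200 / 600 = 8,192 extents,
roughly one free list block's worth of blknos, spaced about one btree block's worth of
extents apart on disk. Unlinking "move" then frees all 8,192 of them in one client
transaction, so each freed blkno should dirty a different btree block in the core data
free extent btree, which is exactly the data_freed pattern the multi-commit drain in
the first patch has to absorb.
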
From 1cbc927ccb64e86e3e5273291f5309e9e082f7bb Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Thu, 28 Jul 2022 16:24:23 -0700
Subject: [PATCH 5/5] Only clear trying inode deletion bit when set

try_delete_inode_items() is responsible for making sure that it's safe
to delete an inode's persistent items. One of the things it has to
check is that there isn't another deletion attempt on the inode in this
mount. It sets a bit in lock data while it's working and backs off if
the bit is already set.

Unfortunately it was always clearing this bit as it exited, regardless
of whether it had set it or not. This would let the next attempt
perform the deletion again before the working task had finished.

This was often not a problem because background orphan scanning is the
only source of regular concurrent deletion attempts. But it's a big
problem if a deletion attempt takes a very long time. That gives an
orphan scan attempt enough time to clear the bit, try again, and
clobber whoever is performing the very slow deletion.

I hit this in a test that built files with an absurd number of
fragmented extents. The second concurrent orphan attempt was able to
proceed with deletion and performed a bunch of duplicate data extent
frees and caused corruption.

The fix is to only clear the bit if we set it. Now all concurrent
attempts will back off until the first task is done.

Signed-off-by: Zach Brown
---
 kmod/src/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kmod/src/inode.c b/kmod/src/inode.c
index 9d22f52a..62ef3093 100644
--- a/kmod/src/inode.c
+++ b/kmod/src/inode.c
@@ -1685,6 +1685,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
 	struct scoutfs_lock *lock = NULL;
 	struct scoutfs_inode sinode;
 	struct scoutfs_key key;
+	bool clear_trying = false;
 	u64 group_nr;
 	int bit_nr;
 	int ret;
@@ -1704,6 +1705,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
 		ret = 0;
 		goto out;
 	}
+	clear_trying = true;
 
 	/* can't delete if it's cached in local or remote mounts */
 	if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) {
@@ -1730,7 +1732,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
 	ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
 
 out:
-	if (ldata)
+	if (clear_trying)
 		clear_bit(bit_nr, ldata->trying);
 
 	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);