diff --git a/kmod/src/alloc.c b/kmod/src/alloc.c index 76bff395..a36387b7 100644 --- a/kmod/src/alloc.c +++ b/kmod/src/alloc.c @@ -892,12 +892,11 @@ static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *r * -ENOENT is returned if we run out of extents in the source tree * before moving the total. * - * If meta_reserved is non-zero then -EINPROGRESS can be returned if the - * current meta allocator's avail blocks or room for freed blocks would - * have fallen under the reserved amount. The could have been - * successfully dirtied in this case but the number of blocks moved is - * not returned. The caller is expected to deal with the partial - * progress by commiting the dirty trees and examining the resulting + * If meta_budget is non-zero then -EINPROGRESS can be returned if + * the caller's budget is consumed in the allocator during this call + * (though not necessarily by us, we don't have per-thread tracking of + * allocator consumption :/). The call can still have made progress and the + * caller is expected to commit the dirty trees and examine the resulting + * modified trees to see if they need to continue moving extents. 
* * The caller can specify that extents in the source tree should first @@ -914,7 +913,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *dst, struct scoutfs_alloc_root *src, u64 total, - __le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved) + __le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_budget) { struct alloc_ext_args args = { .alloc = alloc, @@ -922,6 +921,8 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc, }; struct scoutfs_extent found; struct scoutfs_extent ext; + u32 avail_start = 0; + u32 freed_start = 0; u64 moved = 0; u64 count; int ret = 0; @@ -932,6 +933,9 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc, vacant = NULL; } + if (meta_budget != 0) + scoutfs_alloc_meta_remaining(alloc, &avail_start, &freed_start); + while (moved < total) { count = total - moved; @@ -964,10 +968,10 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc, if (ret < 0) break; - if (meta_reserved != 0 && - scoutfs_alloc_meta_low(sb, alloc, meta_reserved + - extent_mod_blocks(src->root.height) + - extent_mod_blocks(dst->root.height))) { + if (meta_budget != 0 && + scoutfs_alloc_meta_low_since(alloc, avail_start, freed_start, meta_budget, + extent_mod_blocks(src->root.height) + + extent_mod_blocks(dst->root.height))) { ret = -EINPROGRESS; break; } @@ -1351,6 +1355,27 @@ void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total, } while (read_seqretry(&alloc->seqlock, seq)); } +/* + * Returns true if the caller's consumption of nr from either avail or + * freed would end up exceeding their budget relative to the starting + * remaining snapshot they took. 
+ */ +bool scoutfs_alloc_meta_low_since(struct scoutfs_alloc *alloc, u32 avail_start, u32 freed_start, + u32 budget, u32 nr) +{ + u32 avail_use; + u32 freed_use; + u32 avail; + u32 freed; + + scoutfs_alloc_meta_remaining(alloc, &avail, &freed); + + avail_use = avail_start - avail; + freed_use = freed_start - freed; + + return ((avail_use + nr) > budget) || ((freed_use + nr) > budget); +} + bool scoutfs_alloc_test_flag(struct super_block *sb, struct scoutfs_alloc *alloc, u32 flag) { diff --git a/kmod/src/alloc.h b/kmod/src/alloc.h index b3a7e3c6..4496802c 100644 --- a/kmod/src/alloc.h +++ b/kmod/src/alloc.h @@ -131,7 +131,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *dst, struct scoutfs_alloc_root *src, u64 total, - __le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved); + __le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_budget); int scoutfs_alloc_insert(struct super_block *sb, struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root, u64 start, u64 len); @@ -159,6 +159,8 @@ int scoutfs_alloc_splice_list(struct super_block *sb, bool scoutfs_alloc_meta_low(struct super_block *sb, struct scoutfs_alloc *alloc, u32 nr); void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total, u32 *freed_space); +bool scoutfs_alloc_meta_low_since(struct scoutfs_alloc *alloc, u32 avail_start, u32 freed_start, + u32 budget, u32 nr); bool scoutfs_alloc_test_flag(struct super_block *sb, struct scoutfs_alloc *alloc, u32 flag); diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 9d22f52a..62ef3093 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -1685,6 +1685,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino) struct scoutfs_lock *lock = NULL; struct scoutfs_inode sinode; struct scoutfs_key key; + bool clear_trying = false; u64 group_nr; int bit_nr; int ret; @@ -1704,6 +1705,7 @@ 
static int try_delete_inode_items(struct super_block *sb, u64 ino) ret = 0; goto out; } + clear_trying = true; /* can't delete if it's cached in local or remote mounts */ if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) { @@ -1730,7 +1732,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino) ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock); out: - if (ldata) + if (clear_trying) clear_bit(bit_nr, ldata->trying); scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE); diff --git a/kmod/src/server.c b/kmod/src/server.c index ca2c6524..17f38481 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -694,13 +694,13 @@ static int alloc_move_refill_zoned(struct super_block *sb, struct scoutfs_alloc_ static int alloc_move_empty(struct super_block *sb, struct scoutfs_alloc_root *dst, - struct scoutfs_alloc_root *src, u64 meta_reserved) + struct scoutfs_alloc_root *src, u64 meta_budget) { DECLARE_SERVER_INFO(sb, server); return scoutfs_alloc_move(sb, &server->alloc, &server->wri, dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0, - meta_reserved); + meta_budget); } /* @@ -1226,6 +1226,82 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l return ret; } +/* + * The calling get_log_trees ran out of available blocks in its commit's + * metadata allocator while moving extents from the log tree's + * data_freed into the core data_avail. This finishes moving the + * extents in as many additional commits as it takes. The logs mutex + * is nested inside holding commits so we recheck the persistent item + * each time we commit to make sure it's still what we think. The + * caller is still going to send the item to the client so we update the + * caller's copy each time we make progress. This is a best-effort attempt + * to clean up and it's valid to leave extents in data_freed, so we don't + * return errors to the caller. The client will continue the work later + * in get_log_trees or as the rid is reclaimed. 
+ */ +static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt) +{ + DECLARE_SERVER_INFO(sb, server); + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + const u64 rid = le64_to_cpu(lt->rid); + const u64 nr = le64_to_cpu(lt->nr); + struct scoutfs_log_trees drain; + struct scoutfs_key key; + COMMIT_HOLD(hold); + int ret = 0; + int err; + + scoutfs_key_init_log_trees(&key, rid, nr); + + while (lt->data_freed.total_len != 0) { + server_hold_commit(sb, &hold); + mutex_lock(&server->logs_mutex); + + ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX, &drain); + if (ret < 0) + break; + + /* careful to only keep draining the caller's specific open trans */ + if (drain.nr != lt->nr || drain.get_trans_seq != lt->get_trans_seq || + drain.commit_trans_seq != lt->commit_trans_seq || drain.flags != lt->flags) { + ret = -ENOENT; + break; + } + + ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri, + &super->logs_root, &key); + if (ret < 0) + break; + + /* moving can modify and return errors, always update caller and item */ + mutex_lock(&server->alloc_mutex); + ret = alloc_move_empty(sb, &super->data_alloc, &drain.data_freed, + COMMIT_HOLD_ALLOC_BUDGET / 2); + mutex_unlock(&server->alloc_mutex); + if (ret == -EINPROGRESS) + ret = 0; + + *lt = drain; + err = scoutfs_btree_force(sb, &server->alloc, &server->wri, + &super->logs_root, &key, &drain, sizeof(drain)); + BUG_ON(err < 0); /* dirtying must guarantee success */ + + mutex_unlock(&server->logs_mutex); + + ret = server_apply_commit(sb, &hold, ret); + if (ret < 0) { + ret = 0; /* don't try to abort, ignoring ret */ + break; + } + } + + /* try to cleanly abort and write any partial dirty btree blocks, but ignore result */ + if (ret < 0) { + mutex_unlock(&server->logs_mutex); + server_apply_commit(sb, &hold, 0); + } +} + /* * Give the client roots to all the trees that they'll use to build * their transaction. 
@@ -1351,7 +1427,9 @@ static int server_get_log_trees(struct super_block *sb, goto update; } - ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 0); + ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 100); + if (ret == -EINPROGRESS) + ret = 0; if (ret < 0) { err_str = "emptying committed data_freed"; goto update; @@ -1429,6 +1507,10 @@ out: scoutfs_err(sb, "error %d getting log trees for rid %016llx: %s", ret, rid, err_str); + /* try to drain excessive data_freed with additional commits, if needed, ignoring err */ + if (ret == 0) + try_drain_data_freed(sb, &lt); + return scoutfs_net_response(sb, conn, cmd, id, ret, &lt, sizeof(lt)); } diff --git a/tests/Makefile b/tests/Makefile index 60eea516..fcc43df1 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -10,7 +10,8 @@ BIN := src/createmany \ src/bulk_create_paths \ src/stage_tmpfile \ src/find_xattrs \ - src/create_xattr_loop + src/create_xattr_loop \ + src/fragmented_data_extents DEPS := $(wildcard src/*.d) diff --git a/tests/golden/large-fragmented-free b/tests/golden/large-fragmented-free new file mode 100644 index 00000000..58aea5f7 --- /dev/null +++ b/tests/golden/large-fragmented-free @@ -0,0 +1,3 @@ +== creating fragmented extents +== unlink file with moved extents to free extents per block +== cleanup diff --git a/tests/sequence b/tests/sequence index 37f30647..db6e9ba0 100644 --- a/tests/sequence +++ b/tests/sequence @@ -9,6 +9,7 @@ fallocate.sh setattr_more.sh offline-extent-waiting.sh move-blocks.sh +large-fragmented-free.sh enospc.sh srch-basic-functionality.sh simple-xattr-unit.sh diff --git a/tests/src/fragmented_data_extents.c b/tests/src/fragmented_data_extents.c new file mode 100644 index 00000000..d3fcc877 --- /dev/null +++ b/tests/src/fragmented_data_extents.c @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2021 Versity Software, Inc. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* + * This creates fragmented data extents. + * + * A file is created that has alternating free and allocated extents. + * This also results in the global allocator having the matching + * fragmented free extent pattern. While that file is being created, + * occasionally an allocated extent is moved to another file. This + * results in a file that has fragmented extents at a given stride that + * can be deleted to create free data extents with a given stride. + * + * We don't have hole punching so to do this quickly we use a goofy + * combination of fallocate, truncate, and our move_blocks ioctl. 
 + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/ioctl.h> + +#include "ioctl.h" + +#define BLOCK_SIZE 4096 + +int main(int argc, char **argv) +{ + struct scoutfs_ioctl_move_blocks mb = {0,}; + unsigned long long freed_extents; + unsigned long long move_stride; + unsigned long long i; + int alloc_fd; + int trunc_fd; + off_t off; + int ret; + + if (argc != 5) { + printf("%s <freed_extents> <move_stride> <alloc_file> <move_file>\n", argv[0]); + return 1; + } + + freed_extents = strtoull(argv[1], NULL, 0); + move_stride = strtoull(argv[2], NULL, 0); + + alloc_fd = open(argv[3], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); + if (alloc_fd == -1) { + fprintf(stderr, "error opening %s: %d (%s)\n", argv[3], errno, strerror(errno)); + exit(1); + } + + trunc_fd = open(argv[4], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); + if (trunc_fd == -1) { + fprintf(stderr, "error opening %s: %d (%s)\n", argv[4], errno, strerror(errno)); + exit(1); + } + + for (i = 0, off = 0; i < freed_extents; i++, off += BLOCK_SIZE * 2) { + + ret = fallocate(alloc_fd, 0, off, BLOCK_SIZE * 2); + if (ret < 0) { + fprintf(stderr, "fallocate at off %llu error: %d (%s)\n", + (unsigned long long)off, errno, strerror(errno)); + exit(1); + } + + ret = ftruncate(alloc_fd, off + BLOCK_SIZE); + if (ret < 0) { + fprintf(stderr, "truncate to off %llu error: %d (%s)\n", + (unsigned long long)off + BLOCK_SIZE, errno, strerror(errno)); + exit(1); + } + + if ((i % move_stride) == 0) { + mb.from_fd = alloc_fd; + mb.from_off = off; + mb.len = BLOCK_SIZE; + mb.to_off = i * BLOCK_SIZE; + + ret = ioctl(trunc_fd, SCOUTFS_IOC_MOVE_BLOCKS, &mb); + if (ret < 0) { + fprintf(stderr, "move from off %llu error: %d (%s)\n", + (unsigned long long)off, + errno, strerror(errno)); + } + } + } + + if (alloc_fd > -1) + close(alloc_fd); + if (trunc_fd > -1) + close(trunc_fd); + + return 0; +} diff --git a/tests/tests/large-fragmented-free.sh b/tests/tests/large-fragmented-free.sh new file mode 
100644 index 00000000..e27d1bbe --- /dev/null +++ b/tests/tests/large-fragmented-free.sh @@ -0,0 +1,22 @@ +# +# Make sure the server can handle a transaction with a data_freed whose +# blocks all hit different btree blocks in the main free list. It +# probably has to be merged in multiple commits. +# + +t_require_commands fragmented_data_extents + +EXTENTS_PER_BTREE_BLOCK=600 +EXTENTS_PER_LIST_BLOCK=8192 +FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK)) + +echo "== creating fragmented extents" +fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move" + +echo "== unlink file with moved extents to free extents per block" +rm -f "$T_D0/move" + +echo "== cleanup" +rm -f "$T_D0/alloc" + +t_pass