diff --git a/kmod/src/alloc.c b/kmod/src/alloc.c index 1a9aeefe..76bff395 100644 --- a/kmod/src/alloc.c +++ b/kmod/src/alloc.c @@ -84,6 +84,21 @@ static u64 smallest_order_length(u64 len) return 1ULL << (free_extent_order(len) * 3); } +/* + * An extent modification dirties three distinct leaves of an allocator + * btree as it adds and removes the blkno and size sorted items for the + * old and new lengths of the extent. Dirtying the paths to these + * leaves can grow the tree and grow/shrink neighbours at each level. + * We over-estimate the number of blocks allocated and freed (the paths + * share a root, growth doesn't free) to err on the simpler and safer + * side. The overhead is minimal given the relatively large list blocks + * and relatively short allocator trees. + */ +static u32 extent_mod_blocks(u32 height) +{ + return ((1 + height) * 2) * 3; +} + /* * Free extents don't have flags and are stored in two indexes sorted by * block location and by length order, largest first. The location key @@ -877,6 +892,14 @@ static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *r * -ENOENT is returned if we run out of extents in the source tree * before moving the total. * + * If meta_reserved is non-zero then -EINPROGRESS can be returned if the + * current meta allocator's avail blocks or room for freed blocks would + * have fallen under the reserved amount. The trees could have been + * successfully dirtied in this case but the number of blocks moved is + * not returned. The caller is expected to deal with the partial + * progress by committing the dirty trees and examining the resulting + * modified trees to see if they need to continue moving extents. + * * The caller can specify that extents in the source tree should first * be found based on their zone bitmaps. 
We'll first try to find * extents in the exclusive zones, then vacant zones, and then we'll @@ -891,7 +914,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *dst, struct scoutfs_alloc_root *src, u64 total, - __le64 *exclusive, __le64 *vacant, u64 zone_blocks) + __le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved) { struct alloc_ext_args args = { .alloc = alloc, @@ -941,6 +964,14 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc, if (ret < 0) break; + if (meta_reserved != 0 && + scoutfs_alloc_meta_low(sb, alloc, meta_reserved + + extent_mod_blocks(src->root.height) + + extent_mod_blocks(dst->root.height))) { + ret = -EINPROGRESS; + break; + } + /* searching set start/len, finish initializing alloced extent */ ext.map = found.map ? ext.start - found.start + found.map : 0; ext.flags = found.flags; @@ -1065,15 +1096,6 @@ out: * than completely exhausting the avail list or overflowing the freed * list. * - * An extent modification dirties three distinct leaves of an allocator - * btree as it adds and removes the blkno and size sorted items for the - * old and new lengths of the extent. Dirtying the paths to these - * leaves can grow the tree and grow/shrink neighbours at each level. - * We over-estimate the number of blocks allocated and freed (the paths - * share a root, growth doesn't free) to err on the simpler and safer - * side. The overhead is minimal given the relatively large list blocks - * and relatively short allocator trees. - * * The caller tells us how many extents they're about to modify and how * many other additional blocks they may cow manually. 
And finally, the * caller could be the first to dirty the avail and freed blocks in the @@ -1082,7 +1104,7 @@ out: static bool list_has_blocks(struct super_block *sb, struct scoutfs_alloc *alloc, struct scoutfs_alloc_root *root, u32 extents, u32 addl_blocks) { - u32 tree_blocks = (((1 + root->root.height) * 2) * 3) * extents; + u32 tree_blocks = extent_mod_blocks(root->root.height) * extents; u32 most = 1 + tree_blocks + addl_blocks; if (le32_to_cpu(alloc->avail.first_nr) < most) { diff --git a/kmod/src/alloc.h b/kmod/src/alloc.h index 353cbffc..b3a7e3c6 100644 --- a/kmod/src/alloc.h +++ b/kmod/src/alloc.h @@ -131,7 +131,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *dst, struct scoutfs_alloc_root *src, u64 total, - __le64 *exclusive, __le64 *vacant, u64 zone_blocks); + __le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved); int scoutfs_alloc_insert(struct super_block *sb, struct scoutfs_alloc *alloc, struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root, u64 start, u64 len); diff --git a/kmod/src/server.c b/kmod/src/server.c index 05d01478..2665c4ea 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -689,23 +689,18 @@ static int alloc_move_refill_zoned(struct super_block *sb, struct scoutfs_alloc_ return scoutfs_alloc_move(sb, &server->alloc, &server->wri, dst, src, min(target - le64_to_cpu(dst->total_len), le64_to_cpu(src->total_len)), - exclusive, vacant, zone_blocks); -} - -static inline int alloc_move_refill(struct super_block *sb, struct scoutfs_alloc_root *dst, - struct scoutfs_alloc_root *src, u64 lo, u64 target) -{ - return alloc_move_refill_zoned(sb, dst, src, lo, target, NULL, NULL, 0); + exclusive, vacant, zone_blocks, 0); } static int alloc_move_empty(struct super_block *sb, struct scoutfs_alloc_root *dst, - struct scoutfs_alloc_root *src) + struct scoutfs_alloc_root *src, u64 meta_reserved) { DECLARE_SERVER_INFO(sb, 
server); return scoutfs_alloc_move(sb, &server->alloc, &server->wri, - dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0); + dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0, + meta_reserved); } /* @@ -1344,7 +1339,7 @@ static int server_get_log_trees(struct super_block *sb, goto unlock; } - ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed); + ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 0); if (ret < 0) { err_str = "emptying committed data_freed"; goto unlock; @@ -1544,9 +1539,11 @@ static int server_get_roots(struct super_block *sb, * read and we finalize the tree so that it will be merged. We reclaim * all the allocator items. * - * The caller holds the commit rwsem which means we do all this work in - * one server commit. We'll need to keep the total amount of blocks in - * trees in check. + * The caller holds the commit rwsem which means we have to do our work + * in one commit. The allocator btrees can be very large and very + * fragmented. We return -EINPROGRESS if we couldn't fully reclaim the + * allocators in one commit. The caller should apply the current + * commit and call again in a new commit. * * By the time we're evicting a client they've either synced their data * or have been forcefully removed. The free blocks in the allocator @@ -1606,9 +1603,9 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid) } /* - * All of these can return errors after having modified the - * allocator trees. We have to try and update the roots in the - * log item. + * All of these can return errors, perhaps indicating successful + * partial progress, after having modified the allocator trees. + * We always have to update the roots in the log item. 
*/ mutex_lock(&server->alloc_mutex); ret = (err_str = "splice meta_freed to other_freed", scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, server->other_freed, &lt.meta_avail)) ?: (err_str = "empty data_avail", - alloc_move_empty(sb, &super->data_alloc, &lt.data_avail)) ?: + alloc_move_empty(sb, &super->data_alloc, &lt.data_avail, 100)) ?: (err_str = "empty data_freed", - alloc_move_empty(sb, &super->data_alloc, &lt.data_freed)); + alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 100)); mutex_unlock(&server->alloc_mutex); - /* the transaction is no longer open */ - lt.commit_trans_seq = lt.get_trans_seq; + /* only finalize, allowing merging, once the allocators are fully freed */ + if (ret == 0) { + /* the transaction is no longer open */ + lt.commit_trans_seq = lt.get_trans_seq; - /* the mount is no longer writing to the zones */ - zero_data_alloc_zone_bits(&lt); - le64_add_cpu(&lt.flags, SCOUTFS_LOG_TREES_FINALIZED); - lt.finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb)); + /* the mount is no longer writing to the zones */ + zero_data_alloc_zone_bits(&lt); + le64_add_cpu(&lt.flags, SCOUTFS_LOG_TREES_FINALIZED); + lt.finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb)); + } err = scoutfs_btree_update(sb, &server->alloc, &server->wri, &super->logs_root, &key, &lt, sizeof(lt)); @@ -1638,7 +1638,7 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid) out: mutex_unlock(&server->logs_mutex); - if (ret < 0) + if (ret < 0 && ret != -EINPROGRESS) scoutfs_err(sb, "server error %d reclaiming log trees for rid %016llx: %s", ret, rid, err_str); @@ -3536,26 +3536,37 @@ struct farewell_request { * Reclaim all the resources for a mount which has gone away. It's sent * us a farewell promising to leave or we actively fenced it. * - * It's safe to call this multiple times for a given rid. 
Each - * individual action knows to recognize that it's already been performed - * and return success. + * This can be called multiple times across different servers for + * different reclaim attempts. The existence of the mounted_client item + * triggers reclaim and must be deleted last. Each step knows that it + * can be called multiple times and safely recognizes that its work + * might have already been done. + * + * Some steps (reclaiming large fragmented allocators) may need multiple + * calls to complete. They return -EINPROGRESS which tells us to apply + * the server commit and retry. */ static int reclaim_rid(struct super_block *sb, u64 rid) { COMMIT_HOLD(hold); int ret; + int err; - server_hold_commit(sb, &hold); + do { + server_hold_commit(sb, &hold); - /* delete mounted client last, recovery looks for it */ - ret = scoutfs_lock_server_farewell(sb, rid) ?: - reclaim_open_log_tree(sb, rid) ?: - cancel_srch_compact(sb, rid) ?: - cancel_log_merge(sb, rid) ?: - scoutfs_omap_remove_rid(sb, rid) ?: - delete_mounted_client(sb, rid); + err = scoutfs_lock_server_farewell(sb, rid) ?: + reclaim_open_log_tree(sb, rid) ?: + cancel_srch_compact(sb, rid) ?: + cancel_log_merge(sb, rid) ?: + scoutfs_omap_remove_rid(sb, rid) ?: + delete_mounted_client(sb, rid); - return server_apply_commit(sb, &hold, ret); + ret = server_apply_commit(sb, &hold, err == -EINPROGRESS ? 0 : err); + + } while (err == -EINPROGRESS && ret == 0); + + return ret; } /*