From daea8d5bc124bb94f4f3097928242d2fcec69e5f Mon Sep 17 00:00:00 2001
From: Auke Kok
Date: Wed, 25 Mar 2026 06:23:15 -0700
Subject: [PATCH 1/2] Reclaim orphaned log_trees entries from unmounted clients

An unfinalized log_trees entry whose rid is not in mounted_clients is
an orphan left behind by incomplete reclaim. Previously this
permanently blocked log merges because the finalize loop treated it as
an active client that would never commit.

Call reclaim_open_log_tree for orphaned rids before starting a log
merge. Once reclaimed, the existing merge and freeing paths include
them normally.

Also skip orphans in get_stable_trans_seq so their open transaction
doesn't artificially lower the stable sequence.

Signed-off-by: Auke Kok
---
 kmod/src/server.c | 105 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 96 insertions(+), 9 deletions(-)

diff --git a/kmod/src/server.c b/kmod/src/server.c
index 7f979df7..ea9a0746 100644
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -256,6 +256,14 @@ static void server_down(struct server_info *server)
 	cmpxchg(&server->status, was, SERVER_DOWN);
 }
 
+static void init_mounted_client_key(struct scoutfs_key *key, u64 rid)
+{
+	*key = (struct scoutfs_key) {
+		.sk_zone = SCOUTFS_MOUNTED_CLIENT_ZONE,
+		.skmc_rid = cpu_to_le64(rid),
+	};
+}
+
 /*
  * The per-holder allocation block use budget balances batching
  * efficiency and concurrency. The larger this gets, the fewer
@@ -963,6 +971,28 @@ static int find_log_trees_item(struct super_block *sb,
 	return ret;
 }
 
+/*
+ * Return true if the given rid has a mounted_clients entry.
+ */
+static bool rid_is_mounted(struct super_block *sb, u64 rid)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_key key;
+	int ret;
+
+	init_mounted_client_key(&key, rid);
+
+	mutex_lock(&server->mounted_clients_mutex);
+	ret = scoutfs_btree_lookup(sb, &super->mounted_clients, &key, &iref);
+	if (ret == 0)
+		scoutfs_btree_put_iref(&iref);
+	mutex_unlock(&server->mounted_clients_mutex);
+
+	return ret != -ENOENT;	/* errors count as mounted, fail safe */
+}
+
 /*
  * Find the log_trees item with the greatest nr for each rid. Fills the
  * caller's log_trees and sets the key before the returned log_trees for
@@ -1221,6 +1251,60 @@ static int do_finalize_ours(struct super_block *sb,
  * happens to arrive at just the right time. That's fine, merging will
  * ignore and tear down the empty input.
  */
+
+static int reclaim_open_log_tree(struct super_block *sb, u64 rid);
+
+/*
+ * Reclaim log trees for rids that have no mounted_clients entry.
+ * They block merges by appearing active. reclaim_open_log_tree
+ * may need multiple commits to drain allocators (-EINPROGRESS).
+ *
+ * The caller holds logs_mutex and a commit, both are dropped and
+ * re-acquired around each reclaim call. Returns >0 if any orphans
+ * were reclaimed so the caller can re-check state that may have
+ * changed while the lock was dropped.
+ */
+static int reclaim_orphan_log_trees(struct super_block *sb, u64 rid,
+				    struct commit_hold *hold)
+{
+	struct server_info *server = SCOUTFS_SB(sb)->server_info;
+	struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
+	struct scoutfs_log_trees lt;
+	struct scoutfs_key key;
+	bool found = false;
+	u64 orphan_rid;
+	int ret;
+	int err;
+
+	scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
+	while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
+
+		if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
+		    le64_to_cpu(lt.rid) == rid ||
+		    rid_is_mounted(sb, le64_to_cpu(lt.rid)))
+			continue;
+
+		orphan_rid = le64_to_cpu(lt.rid);
+		scoutfs_err(sb, "reclaiming orphan log trees for rid %016llx nr %llu",
+			    orphan_rid, le64_to_cpu(lt.nr));
+		found = true;
+
+		do {
+			mutex_unlock(&server->logs_mutex);
+			err = reclaim_open_log_tree(sb, orphan_rid);
+			ret = server_apply_commit(sb, hold,
+						  err == -EINPROGRESS ? 0 : err);
+			server_hold_commit(sb, hold);
+			mutex_lock(&server->logs_mutex);
+		} while (err == -EINPROGRESS && ret == 0);
+
+		if (ret < 0)
+			break;
+	}
+
+	return ret < 0 ? ret : found;
+}
+
 #define FINALIZE_POLL_MIN_DELAY_MS	5U
 #define FINALIZE_POLL_MAX_DELAY_MS	100U
 #define FINALIZE_POLL_DELAY_GROWTH_PCT	150U
@@ -1261,6 +1345,16 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 			break;
 		}
 
+		ret = reclaim_orphan_log_trees(sb, rid, hold);
+		if (ret < 0) {
+			err_str = "reclaiming orphan log trees";
+			break;
+		}
+		if (ret > 0) {
+			/* lock was dropped, re-check merge status */
+			continue;
+		}
+
 		/* look for finalized and other active log btrees */
 		saw_finalized = false;
 		others_active = false;
@@ -1981,7 +2075,8 @@ static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
 	scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
 	while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
 		if ((le64_to_cpu(lt.get_trans_seq) > le64_to_cpu(lt.commit_trans_seq)) &&
-		    le64_to_cpu(lt.get_trans_seq) <= last_seq) {
+		    le64_to_cpu(lt.get_trans_seq) <= last_seq &&
+		    rid_is_mounted(sb, le64_to_cpu(lt.rid))) {
 			last_seq = le64_to_cpu(lt.get_trans_seq) - 1;
 		}
 	}
@@ -3533,14 +3628,6 @@ out:
 	return scoutfs_net_response(sb, conn, cmd, id, ret, &nst, sizeof(nst));
 }
 
-static void init_mounted_client_key(struct scoutfs_key *key, u64 rid)
-{
-	*key = (struct scoutfs_key) {
-		.sk_zone = SCOUTFS_MOUNTED_CLIENT_ZONE,
-		.skmc_rid = cpu_to_le64(rid),
-	};
-}
-
 static bool invalid_mounted_client_item(struct scoutfs_btree_item_ref *iref)
 {
 	return (iref->val_len != sizeof(struct scoutfs_mounted_client_btree_val));

From 8a730464abf3c20b9db12aaa1e7bb373952b9bfb Mon Sep 17 00:00:00 2001
From: Auke Kok
Date: Wed, 25 Mar 2026 06:23:40 -0700
Subject: [PATCH 2/2] Add orphan-log-trees test and reclaim_skip_finalize trigger

Add a reclaim_skip_finalize trigger that prevents reclaim from setting
FINALIZED on log_trees entries. The test arms this trigger,
force-unmounts a client to create an orphan, and verifies the log merge
succeeds without timeout and the orphan reclaim message appears in
dmesg.

Signed-off-by: Auke Kok
---
 kmod/src/server.c               |  2 +-
 kmod/src/triggers.c             |  1 +
 kmod/src/triggers.h             |  1 +
 tests/funcs/filter.sh           |  3 ++
 tests/golden/orphan-log-trees   |  3 ++
 tests/sequence                  |  1 +
 tests/tests/orphan-log-trees.sh | 52 +++++++++++++++++++++++++++++++++
 7 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 tests/golden/orphan-log-trees
 create mode 100644 tests/tests/orphan-log-trees.sh

diff --git a/kmod/src/server.c b/kmod/src/server.c
index ea9a0746..e4791980 100644
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -2023,7 +2023,7 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
 	mutex_unlock(&server->alloc_mutex);
 
 	/* only finalize, allowing merging, once the allocators are fully freed */
-	if (ret == 0) {
+	if (ret == 0 && !scoutfs_trigger(sb, RECLAIM_SKIP_FINALIZE)) {
 		/* the transaction is no longer open */
 		lt.commit_trans_seq = lt.get_trans_seq;
 
diff --git a/kmod/src/triggers.c b/kmod/src/triggers.c
index 317f0911..4616d013 100644
--- a/kmod/src/triggers.c
+++ b/kmod/src/triggers.c
@@ -45,6 +45,7 @@ static char *names[] = {
 	[SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE] = "srch_force_log_rotate",
 	[SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe",
 	[SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge",
+	[SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE] = "reclaim_skip_finalize",
 };
 
 bool scoutfs_trigger_test_and_clear(struct super_block *sb, unsigned int t)
diff --git a/kmod/src/triggers.h b/kmod/src/triggers.h
index eeb33b49..64a798b7 100644
--- a/kmod/src/triggers.h
+++ b/kmod/src/triggers.h
@@ -8,6 +8,7 @@ enum scoutfs_trigger {
 	SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE,
 	SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE,
 	SCOUTFS_TRIGGER_STATFS_LOCK_PURGE,
+	SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE,
 	SCOUTFS_TRIGGER_NR,
 };
 
diff --git a/tests/funcs/filter.sh b/tests/funcs/filter.sh
index 3dc593c8..cb766300 100644
--- a/tests/funcs/filter.sh
+++ b/tests/funcs/filter.sh
@@ -123,6 +123,9 @@ t_filter_dmesg()
 	re="$re|hrtimer: interrupt took .*"
re="$re|clocksource: Long readout interval" + # orphan log trees reclaim is handled, not an error + re="$re|scoutfs .* reclaiming orphan log trees" + # fencing tests force unmounts and trigger timeouts re="$re|scoutfs .* forcing unmount" re="$re|scoutfs .* reconnect timed out" diff --git a/tests/golden/orphan-log-trees b/tests/golden/orphan-log-trees new file mode 100644 index 00000000..08003b10 --- /dev/null +++ b/tests/golden/orphan-log-trees @@ -0,0 +1,3 @@ +== create orphan log_trees entry via trigger +== verify orphan is reclaimed and merge completes +== verify orphan reclaim was logged diff --git a/tests/sequence b/tests/sequence index 1b6a74f3..e296553a 100644 --- a/tests/sequence +++ b/tests/sequence @@ -50,6 +50,7 @@ setup-error-teardown.sh resize-devices.sh change-devices.sh fence-and-reclaim.sh +orphan-log-trees.sh quorum-heartbeat-timeout.sh orphan-inodes.sh mount-unmount-race.sh diff --git a/tests/tests/orphan-log-trees.sh b/tests/tests/orphan-log-trees.sh new file mode 100644 index 00000000..14927eb6 --- /dev/null +++ b/tests/tests/orphan-log-trees.sh @@ -0,0 +1,52 @@ +# +# Test that orphaned log_trees entries from unmounted rids are +# finalized and merged. +# +# An orphan log_trees entry is one whose rid has no mounted_clients +# entry. This can happen from incomplete reclaim across server +# failovers. We simulate it with the reclaim_skip_finalize trigger +# which makes reclaim_open_log_tree skip the finalization step. 
+#
+
+t_require_commands touch scoutfs
+t_require_mounts 2
+
+TIMEOUT=90
+
+echo "== create orphan log_trees entry via trigger"
+sv=$(t_server_nr)
+cl=$(t_first_client_nr)
+rid=$(t_mount_rid $cl)
+
+touch "$T_D0/file" "$T_D1/file"
+sync
+
+# arm the trigger so reclaim skips finalization
+t_trigger_arm_silent reclaim_skip_finalize $sv
+
+# force unmount the client, server will fence and reclaim it
+# but the trigger makes reclaim leave log_trees unfinalized
+t_force_umount $cl
+
+# wait for fencing to run
+verify_fenced() {
+	grep -q "running rid '$rid'" "$T_FENCED_LOG" 2>/dev/null
+}
+t_wait_until_timeout $TIMEOUT verify_fenced
+
+# give the server time to complete reclaim after fence
+sleep 5
+
+# remount the client so t_force_log_merge can sync all mounts.
+# the client gets a new rid; the old rid's log_trees is the orphan.
+t_mount $cl
+
+echo "== verify orphan is reclaimed and merge completes"
+t_force_log_merge
+
+echo "== verify orphan reclaim was logged"
+if ! dmesg | grep -q "reclaiming orphan log trees for rid $rid"; then
+	t_fail "expected orphan reclaim message for rid $rid in dmesg"
+fi
+
+t_pass