diff --git a/kmod/src/server.c b/kmod/src/server.c
index 7f979df7..e4791980 100644
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -256,6 +256,14 @@ static void server_down(struct server_info *server)
 	cmpxchg(&server->status, was, SERVER_DOWN);
 }
 
+static void init_mounted_client_key(struct scoutfs_key *key, u64 rid)
+{
+	*key = (struct scoutfs_key) {
+		.sk_zone = SCOUTFS_MOUNTED_CLIENT_ZONE,
+		.skmc_rid = cpu_to_le64(rid),
+	};
+}
+
 /*
  * The per-holder allocation block use budget balances batching
  * efficiency and concurrency. The larger this gets, the fewer
@@ -963,6 +971,34 @@ static int find_log_trees_item(struct super_block *sb,
 	return ret;
 }
 
+/*
+ * Return true if the given rid has a mounted_clients entry.
+ *
+ * Only -ENOENT from the lookup reports the rid as unmounted.  Any other
+ * lookup error is conservatively reported as mounted so that a failed
+ * read can't make callers reclaim a live client's log trees or advance
+ * the stable seq past its open transaction.  NOTE(review): relies on
+ * scoutfs_btree_lookup() returning -ENOENT for a missing key.
+ */
+static bool rid_is_mounted(struct super_block *sb, u64 rid)
+{
+	DECLARE_SERVER_INFO(sb, server);
+	struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_key key;
+	int ret;
+
+	init_mounted_client_key(&key, rid);
+
+	mutex_lock(&server->mounted_clients_mutex);
+	ret = scoutfs_btree_lookup(sb, &super->mounted_clients, &key, &iref);
+	if (ret == 0)
+		scoutfs_btree_put_iref(&iref);
+	mutex_unlock(&server->mounted_clients_mutex);
+
+	return ret != -ENOENT;
+}
+
 /*
  * Find the log_trees item with the greatest nr for each rid. Fills the
  * caller's log_trees and sets the key before the returned log_trees for
@@ -1221,6 +1257,60 @@ static int do_finalize_ours(struct super_block *sb,
  * happens to arrive at just the right time. That's fine, merging will
  * ignore and tear down the empty input.
  */
+
+static int reclaim_open_log_tree(struct super_block *sb, u64 rid);
+
+/*
+ * Reclaim log trees for rids that have no mounted_clients entry.
+ * They block merges by appearing active. reclaim_open_log_tree
+ * may need multiple commits to drain allocators (-EINPROGRESS).
+ *
+ * The caller holds logs_mutex and a commit, both are dropped and
+ * re-acquired around each reclaim call. Returns >0 if any orphans
+ * were reclaimed so the caller can re-check state that may have
+ * changed while the lock was dropped.
+ */
+static int reclaim_orphan_log_trees(struct super_block *sb, u64 rid,
+				    struct commit_hold *hold)
+{
+	struct server_info *server = SCOUTFS_SB(sb)->server_info;
+	struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
+	struct scoutfs_log_trees lt;
+	struct scoutfs_key key;
+	bool found = false;
+	u64 orphan_rid;
+	int ret;
+	int err;
+
+	scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
+	while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
+
+		if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
+		    le64_to_cpu(lt.rid) == rid ||
+		    rid_is_mounted(sb, le64_to_cpu(lt.rid)))
+			continue;
+
+		orphan_rid = le64_to_cpu(lt.rid);
+		scoutfs_err(sb, "reclaiming orphan log trees for rid %016llx nr %llu",
+			    orphan_rid, le64_to_cpu(lt.nr));
+		found = true;
+
+		do {
+			mutex_unlock(&server->logs_mutex);
+			err = reclaim_open_log_tree(sb, orphan_rid);
+			ret = server_apply_commit(sb, hold,
+					err == -EINPROGRESS ? 0 : err);
+			server_hold_commit(sb, hold);
+			mutex_lock(&server->logs_mutex);
+		} while (err == -EINPROGRESS && ret == 0);
+
+		if (ret < 0)
+			break;
+	}
+
+	return ret < 0 ? ret : found;
+}
+
 #define FINALIZE_POLL_MIN_DELAY_MS 5U
 #define FINALIZE_POLL_MAX_DELAY_MS 100U
 #define FINALIZE_POLL_DELAY_GROWTH_PCT 150U
@@ -1261,6 +1351,16 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 			break;
 		}
 
+		ret = reclaim_orphan_log_trees(sb, rid, hold);
+		if (ret < 0) {
+			err_str = "reclaiming orphan log trees";
+			break;
+		}
+		if (ret > 0) {
+			/* lock was dropped, re-check merge status */
+			continue;
+		}
+
 		/* look for finalized and other active log btrees */
 		saw_finalized = false;
 		others_active = false;
@@ -1929,7 +2029,7 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
 	mutex_unlock(&server->alloc_mutex);
 
 	/* only finalize, allowing merging, once the allocators are fully freed */
-	if (ret == 0) {
+	if (ret == 0 && !scoutfs_trigger(sb, RECLAIM_SKIP_FINALIZE)) {
 		/* the transaction is no longer open */
 		lt.commit_trans_seq = lt.get_trans_seq;
 
@@ -1981,7 +2081,8 @@ static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
 	scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
 	while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
 		if ((le64_to_cpu(lt.get_trans_seq) > le64_to_cpu(lt.commit_trans_seq)) &&
-		    le64_to_cpu(lt.get_trans_seq) <= last_seq) {
+		    le64_to_cpu(lt.get_trans_seq) <= last_seq &&
+		    rid_is_mounted(sb, le64_to_cpu(lt.rid))) {
 			last_seq = le64_to_cpu(lt.get_trans_seq) - 1;
 		}
 	}
@@ -3533,14 +3634,6 @@ out:
 	return scoutfs_net_response(sb, conn, cmd, id, ret, &nst, sizeof(nst));
 }
 
-static void init_mounted_client_key(struct scoutfs_key *key, u64 rid)
-{
-	*key = (struct scoutfs_key) {
-		.sk_zone = SCOUTFS_MOUNTED_CLIENT_ZONE,
-		.skmc_rid = cpu_to_le64(rid),
-	};
-}
-
 static bool invalid_mounted_client_item(struct scoutfs_btree_item_ref *iref)
 {
 	return (iref->val_len != sizeof(struct scoutfs_mounted_client_btree_val));
diff --git a/kmod/src/triggers.c b/kmod/src/triggers.c
index 317f0911..4616d013 100644
--- a/kmod/src/triggers.c
+++ b/kmod/src/triggers.c
@@ -45,6 +45,7 @@ static char *names[] = {
 	[SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE] = "srch_force_log_rotate",
 	[SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe",
 	[SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge",
+	[SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE] = "reclaim_skip_finalize",
 };
 
 bool scoutfs_trigger_test_and_clear(struct super_block *sb, unsigned int t)
diff --git a/kmod/src/triggers.h b/kmod/src/triggers.h
index eeb33b49..64a798b7 100644
--- a/kmod/src/triggers.h
+++ b/kmod/src/triggers.h
@@ -8,6 +8,7 @@ enum scoutfs_trigger {
 	SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE,
 	SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE,
 	SCOUTFS_TRIGGER_STATFS_LOCK_PURGE,
+	SCOUTFS_TRIGGER_RECLAIM_SKIP_FINALIZE,
 	SCOUTFS_TRIGGER_NR,
 };
 
diff --git a/tests/funcs/filter.sh b/tests/funcs/filter.sh
index 3dc593c8..cb766300 100644
--- a/tests/funcs/filter.sh
+++ b/tests/funcs/filter.sh
@@ -123,6 +123,9 @@ t_filter_dmesg()
 	re="$re|hrtimer: interrupt took .*"
 	re="$re|clocksource: Long readout interval"
 
+	# orphan log trees reclaim is handled, not an error
+	re="$re|scoutfs .* reclaiming orphan log trees"
+
 	# fencing tests force unmounts and trigger timeouts
 	re="$re|scoutfs .* forcing unmount"
 	re="$re|scoutfs .* reconnect timed out"
diff --git a/tests/golden/orphan-log-trees b/tests/golden/orphan-log-trees
new file mode 100644
index 00000000..08003b10
--- /dev/null
+++ b/tests/golden/orphan-log-trees
@@ -0,0 +1,3 @@
+== create orphan log_trees entry via trigger
+== verify orphan is reclaimed and merge completes
+== verify orphan reclaim was logged
diff --git a/tests/sequence b/tests/sequence
index 1b6a74f3..e296553a 100644
--- a/tests/sequence
+++ b/tests/sequence
@@ -50,6 +50,7 @@ setup-error-teardown.sh
 resize-devices.sh
 change-devices.sh
 fence-and-reclaim.sh
+orphan-log-trees.sh
 quorum-heartbeat-timeout.sh
 orphan-inodes.sh
 mount-unmount-race.sh
diff --git a/tests/tests/orphan-log-trees.sh b/tests/tests/orphan-log-trees.sh
new file mode 100644
index 00000000..14927eb6
--- /dev/null
+++ b/tests/tests/orphan-log-trees.sh
@@ -0,0 +1,52 @@
+#
+# Test that orphaned log_trees entries from unmounted rids are
+# finalized and merged.
+#
+# An orphan log_trees entry is one whose rid has no mounted_clients
+# entry. This can happen from incomplete reclaim across server
+# failovers. We simulate it with the reclaim_skip_finalize trigger
+# which makes reclaim_open_log_tree skip the finalization step.
+#
+
+t_require_commands touch scoutfs
+t_require_mounts 2
+
+TIMEOUT=90
+
+echo "== create orphan log_trees entry via trigger"
+sv=$(t_server_nr)
+cl=$(t_first_client_nr)
+rid=$(t_mount_rid $cl)
+
+touch "$T_D0/file" "$T_D1/file"
+sync
+
+# arm the trigger so reclaim skips finalization
+t_trigger_arm_silent reclaim_skip_finalize $sv
+
+# force unmount the client, server will fence and reclaim it
+# but the trigger makes reclaim leave log_trees unfinalized
+t_force_umount $cl
+
+# wait for fencing to run
+verify_fenced() {
+	grep -q "running rid '$rid'" "$T_FENCED_LOG" 2>/dev/null
+}
+t_wait_until_timeout $TIMEOUT verify_fenced
+
+# give the server time to complete reclaim after fence
+sleep 5
+
+# remount the client so t_force_log_merge can sync all mounts.
+# the client gets a new rid; the old rid's log_trees is the orphan.
+t_mount $cl
+
+echo "== verify orphan is reclaimed and merge completes"
+t_force_log_merge
+
+echo "== verify orphan reclaim was logged"
+if ! dmesg | grep -q "reclaiming orphan log trees for rid $rid"; then
+	t_fail "expected orphan reclaim message for rid $rid in dmesg"
+fi
+
+t_pass