From 3f786596e0bf31be20e9abacd407c0e837bf67c1 Mon Sep 17 00:00:00 2001 From: Chris Kirby Date: Wed, 28 May 2025 13:48:22 -0500 Subject: [PATCH 01/20] Don't overrun the block budget in server_log_merge_free_work(). This fixes a potential fence post failure like the following: error: 1 holders exceeded alloc budget av: bef 7407 now 7392, fr: bef 8185 now 7672 The code is only accounting for the freed btree blocks, not the dirtying of other items. So it's possible to be at exactly (COMMIT_HOLD_ALLOC_BUDGET / 2), dirty some log btree blocks, loop again, then consume another (COMMIT_HOLD_ALLOC_BUDGET / 2) and blow past the total budget. In this example, we went over by 13 blocks. By only consuming up to 1/8 of the budget on each loop, and committing when we have consumed 3/4 of the budget, we can avoid the fence post condition. Signed-off-by: Chris Kirby --- kmod/src/server.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kmod/src/server.c b/kmod/src/server.c index 9244f8d2..1fc18025 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -2531,7 +2531,7 @@ static void server_log_merge_free_work(struct work_struct *work) ret = scoutfs_btree_free_blocks(sb, &server->alloc, &server->wri, &fr.key, - &fr.root, COMMIT_HOLD_ALLOC_BUDGET / 2); + &fr.root, COMMIT_HOLD_ALLOC_BUDGET / 8); if (ret < 0) { err_str = "freeing log btree"; break; @@ -2550,7 +2550,7 @@ static void server_log_merge_free_work(struct work_struct *work) /* freed blocks are in allocator, we *have* to update fr */ BUG_ON(ret < 0); - if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) { + if (server_hold_alloc_used_since(sb, &hold) >= (COMMIT_HOLD_ALLOC_BUDGET * 3) / 4) { mutex_unlock(&server->logs_mutex); ret = server_apply_commit(sb, &hold, ret); commit = false; From 70bd93621397cdad1b1f08855a9f93fa28f8e766 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Thu, 12 Jun 2025 15:19:39 -0400 Subject: [PATCH 02/20] Ignore sparse error about stat.h on el8. 
On el8, sparse is at 0.6.4 in epel-release, but it fails with: ``` [SP src/util.c] src/util.c: note: in included file (through /usr/include/sys/stat.h): /usr/include/bits/statx.h:30:6: error: not a function /usr/include/bits/statx.h:30:6: error: bad constant expression type ``` This is due to us needing O_DIRECT from , so we set _GNU_SOURCE before including it, but this causes (through _USE_GNU in sys/stat.h) statx.h to be included, and that has __has_include, and sparse is too dumb to understand it. Just shut it up. Signed-off-by: Auke Kok --- utils/sparse.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/utils/sparse.sh b/utils/sparse.sh index 0b93324f..adef573b 100755 --- a/utils/sparse.sh +++ b/utils/sparse.sh @@ -22,6 +22,11 @@ RE="$RE|warning: memset with byte count of 4194304" # some sparse versions don't know about some builtins RE="$RE|error: undefined identifier '__builtin_fpclassify'" +# on el8, sparse can't handle __has_include for some reason when _GNU_SOURCE +# is defined, and we need that for O_DIRECT. +RE="$RE|note: in included file .through /usr/include/sys/stat.h.:" +RE="$RE|/usr/include/bits/statx.h:30:6: error: " + # # don't filter out 'too many errors' here, it can signify that # sparse doesn't understand something and is throwing a *ton* From 0d262de4ac6abe11ea5e757e87a07fbfbc271665 Mon Sep 17 00:00:00 2001 From: Chris Kirby Date: Thu, 5 Jun 2025 18:38:08 -0500 Subject: [PATCH 03/20] Fix dirtied block calculation in extent_mod_blocks() Free extents are stored in two btrees: one sorted by block number, one by size. So if you insert a new extent between two existing extents, you can be modifying two items in the by-block-number tree. And depending on the size of those items, that can result in three items over in the -by-size tree. So that's a 5x multiplier per level. If we're shrinking the tree and adding more freed blocks, we're conceptually dirtying two blocks at each level to merge. (current *2 in the code). 
But if they fall under the low water mark then one of them is freed, so we can have *3 per level in this case. Signed-off-by: Chris Kirby --- kmod/src/alloc.c | 47 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/kmod/src/alloc.c b/kmod/src/alloc.c index 894b92ed..40c33c20 100644 --- a/kmod/src/alloc.c +++ b/kmod/src/alloc.c @@ -86,18 +86,47 @@ static u64 smallest_order_length(u64 len) } /* - * An extent modification dirties three distinct leaves of an allocator - * btree as it adds and removes the blkno and size sorted items for the - * old and new lengths of the extent. Dirtying the paths to these - * leaves can grow the tree and grow/shrink neighbours at each level. - * We over-estimate the number of blocks allocated and freed (the paths - * share a root, growth doesn't free) to err on the simpler and safer - * side. The overhead is minimal given the relatively large list blocks - * and relatively short allocator trees. + * Moving an extent between trees can dirty blocks in several ways. This + * function calculates worst case number of blocks across these scenarions. + * We treat the alloc and free counts independently, so the values below are + * max(allocated, freed), not the sum. + * + * We track extents with two separate btree items: by block number and by size. + * + * If we're removing an extent from the btree (allocating), we can dirty + * two blocks if the keys are in different leaves. If we wind up merging + * leaves because we fall below the low water mark, we can wind up freeing + * three leaves. 
+ * + * That sequence is as follows, assuming the original keys are removed from + * blocks A and B: + * + * Allocate new dirty A' and B' + * Free old stable A and B + * B' has fallen below the low water mark, so copy B' into A' + * Free B' + * + * An extent insertion (freeing an extent) can dirty up to five distinct items + * in the btree as it adds and removes the blkno and size sorted items for the + * old and new lengths of the extent: + * + * In the by-blkno portion of the btree, we can dirty (allocate for COW) up + * to two blocks- either by merging adjacent extents, which can cause us to + * join leaf blocks; or by an insertion that causes a split. + * + * In the by-size portion, we never merge extents, so normally we just dirty + * a single item with a size insertion. But if we merged adjacent extents in + * the by-blkno portion of the tree, we might be working with three by-sizex + * items: removing the two old ones that were combined in the merge; and + * adding the new one for the larger, merged size. + * + * Finally, dirtying the paths to these leaves can grow the tree and grow/shrink + * neighbours at each level, so we multiply by the height of the tree after + * accounting for a possible new level. */ static u32 extent_mod_blocks(u32 height) { - return ((1 + height) * 2) * 3; + return ((1 + height) * 3) * 5; } /* From bb3e1f366565255331b4c21fb1f58446a5487e20 Mon Sep 17 00:00:00 2001 From: Chris Kirby Date: Thu, 5 Jun 2025 18:58:57 -0500 Subject: [PATCH 04/20] Fix commit budget calculation with multiple holders The try_drain_data_freed() path was generating errors about overrunning its commit budget: scoutfs f.2b8928.r.02689f error: 1 holders exceeded alloc budget av: bef 8185 now 8036, fr: bef 8185 now 7602 The budget overrun check was using the current number of commit holders (in this case one) instead of the the maximum number of concurrent holders (in this case two). 
So even well behaved paths like try_drain_data_freed() can appear to exceed their commit budget if other holders dirty some blocks and apply their commits before the try_drain_data_freed() thread does its final budget reconciliation. Signed-off-by: Chris Kirby --- kmod/src/scoutfs_trace.h | 52 ++++++++++++++++++++++++---------------- kmod/src/server.c | 22 +++++++++-------- 2 files changed, 44 insertions(+), 30 deletions(-) diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 930a275a..340c1f14 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -1966,15 +1966,17 @@ DEFINE_EVENT(scoutfs_server_client_count_class, scoutfs_server_client_down, ); DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class, - TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders, - u32 avail_before, u32 freed_before, int committing, int exceeded), - TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, - exceeded), + TP_PROTO(struct super_block *sb, int holding, int applying, + int nr_holders, u32 budget, + u32 avail_before, u32 freed_before, + int committing, int exceeded), + TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded), TP_STRUCT__entry( SCSB_TRACE_FIELDS __field(int, holding) __field(int, applying) __field(int, nr_holders) + __field(u32, budget) __field(__u32, avail_before) __field(__u32, freed_before) __field(int, committing) @@ -1985,35 +1987,45 @@ DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class, __entry->holding = !!holding; __entry->applying = !!applying; __entry->nr_holders = nr_holders; + __entry->budget = budget; __entry->avail_before = avail_before; __entry->freed_before = freed_before; __entry->committing = !!committing; __entry->exceeded = !!exceeded; ), - TP_printk(SCSBF" holding %u applying %u nr %u avail_before %u freed_before %u committing %u exceeded %u", - SCSB_TRACE_ARGS, __entry->holding, __entry->applying, 
__entry->nr_holders, - __entry->avail_before, __entry->freed_before, __entry->committing, - __entry->exceeded) + TP_printk(SCSBF" holding %u applying %u nr %u budget %u avail_before %u freed_before %u committing %u exceeded %u", + SCSB_TRACE_ARGS, __entry->holding, __entry->applying, + __entry->nr_holders, __entry->budget, + __entry->avail_before, __entry->freed_before, + __entry->committing, __entry->exceeded) ); DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_hold, - TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders, - u32 avail_before, u32 freed_before, int committing, int exceeded), - TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded) + TP_PROTO(struct super_block *sb, int holding, int applying, + int nr_holders, u32 budget, + u32 avail_before, u32 freed_before, + int committing, int exceeded), + TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded) ); DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_apply, - TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders, - u32 avail_before, u32 freed_before, int committing, int exceeded), - TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded) + TP_PROTO(struct super_block *sb, int holding, int applying, + int nr_holders, u32 budget, + u32 avail_before, u32 freed_before, + int committing, int exceeded), + TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded) ); DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_start, - TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders, - u32 avail_before, u32 freed_before, int committing, int exceeded), - TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded) + TP_PROTO(struct super_block *sb, int holding, int applying, + int nr_holders, u32 
budget, + u32 avail_before, u32 freed_before, + int committing, int exceeded), + TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded) ); DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_end, - TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders, - u32 avail_before, u32 freed_before, int committing, int exceeded), - TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded) + TP_PROTO(struct super_block *sb, int holding, int applying, + int nr_holders, u32 budget, + u32 avail_before, u32 freed_before, + int committing, int exceeded), + TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded) ); #define slt_symbolic(mode) \ diff --git a/kmod/src/server.c b/kmod/src/server.c index 1fc18025..22975f69 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -65,6 +65,7 @@ struct commit_users { struct list_head holding; struct list_head applying; unsigned int nr_holders; + u32 budget; u32 avail_before; u32 freed_before; bool committing; @@ -84,8 +85,9 @@ static void init_commit_users(struct commit_users *cusers) do { \ __typeof__(cusers) _cusers = (cusers); \ trace_scoutfs_server_commit_##which(sb, !list_empty(&_cusers->holding), \ - !list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->avail_before, \ - _cusers->freed_before, _cusers->committing, _cusers->exceeded); \ + !list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->budget, \ + _cusers->avail_before, _cusers->freed_before, _cusers->committing, \ + _cusers->exceeded); \ } while (0) struct server_info { @@ -303,7 +305,6 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv u32 freed_used; u32 avail_now; u32 freed_now; - u32 budget; assert_spin_locked(&cusers->lock); @@ -318,15 +319,14 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv else freed_used = 
SCOUTFS_ALLOC_LIST_MAX_BLOCKS - freed_now; - budget = cusers->nr_holders * COMMIT_HOLD_ALLOC_BUDGET; - if (avail_used <= budget && freed_used <= budget) + if (avail_used <= cusers->budget && freed_used <= cusers->budget) return; exceeded_once = true; cusers->exceeded = cusers->nr_holders; - scoutfs_err(sb, "%u holders exceeded alloc budget av: bef %u now %u, fr: bef %u now %u", - cusers->nr_holders, cusers->avail_before, avail_now, + scoutfs_err(sb, "holders exceeded alloc budget %u av: bef %u now %u, fr: bef %u now %u", + cusers->budget, cusers->avail_before, avail_now, cusers->freed_before, freed_now); list_for_each_entry(hold, &cusers->holding, entry) { @@ -349,7 +349,7 @@ static bool hold_commit(struct super_block *sb, struct server_info *server, { bool has_room; bool held; - u32 budget; + u32 new_budget; u32 av; u32 fr; @@ -367,8 +367,8 @@ static bool hold_commit(struct super_block *sb, struct server_info *server, } /* +2 for our additional hold and then for the final commit work the server does */ - budget = (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET; - has_room = av >= budget && fr >= budget; + new_budget = max(cusers->budget, (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET); + has_room = av >= new_budget && fr >= new_budget; /* checking applying so holders drain once an apply caller starts waiting */ held = !cusers->committing && has_room && list_empty(&cusers->applying); @@ -388,6 +388,7 @@ static bool hold_commit(struct super_block *sb, struct server_info *server, list_add_tail(&hold->entry, &cusers->holding); cusers->nr_holders++; + cusers->budget = new_budget; } else if (!has_room && cusers->nr_holders == 0 && !cusers->committing) { cusers->committing = true; @@ -516,6 +517,7 @@ static void commit_end(struct super_block *sb, struct commit_users *cusers, int list_for_each_entry_safe(hold, tmp, &cusers->applying, entry) list_del_init(&hold->entry); cusers->committing = false; + cusers->budget = 0; spin_unlock(&cusers->lock); 
wake_up(&cusers->waitq); From 669e37c63612930651c92d284bec6261f470767f Mon Sep 17 00:00:00 2001 From: Chris Kirby Date: Wed, 18 Jun 2025 13:05:25 -0500 Subject: [PATCH 05/20] Remove hung task workaround from large-fragmented-free test Adjusting hung_task_timeout_secs is still needed for this test to pass with a debug kernel. But the logic belongs on the platform side. Signed-off-by: Chris Kirby --- tests/golden/large-fragmented-free | 1 - tests/tests/large-fragmented-free.sh | 24 ------------------------ 2 files changed, 25 deletions(-) diff --git a/tests/golden/large-fragmented-free b/tests/golden/large-fragmented-free index 7740849c..58aea5f7 100644 --- a/tests/golden/large-fragmented-free +++ b/tests/golden/large-fragmented-free @@ -1,4 +1,3 @@ -== setting longer hung task timeout == creating fragmented extents == unlink file with moved extents to free extents per block == cleanup diff --git a/tests/tests/large-fragmented-free.sh b/tests/tests/large-fragmented-free.sh index 33f3bef7..e27d1bbe 100644 --- a/tests/tests/large-fragmented-free.sh +++ b/tests/tests/large-fragmented-free.sh @@ -10,30 +10,6 @@ EXTENTS_PER_BTREE_BLOCK=600 EXTENTS_PER_LIST_BLOCK=8192 FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK)) -# -# This test specifically creates a pathologically sparse file that will -# be as expensive as possible to free. This is usually fine on -# dedicated or reasonable hardware, but trying to run this in -# virtualized debug kernels can take a very long time. This test is -# about making sure that the server doesn't fail, not that the platform -# can handle the scale of work that our btree formats happen to require -# while execution is bogged down with use-after-free memory reference -# tracking. So we give the test a lot more breathing room before -# deciding that its hung. 
-# -echo "== setting longer hung task timeout" -if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then - secs=$(cat /proc/sys/kernel/hung_task_timeout_secs) - test "$secs" -gt 0 || \ - t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs" - restore_hung_task_timeout() - { - echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs - } - trap restore_hung_task_timeout EXIT - echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs -fi - echo "== creating fragmented extents" fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move" From 47af90d078a4ebf6b2168c44cce78ccb812d3fe1 Mon Sep 17 00:00:00 2001 From: Chris Kirby Date: Wed, 18 Jun 2025 16:22:44 -0500 Subject: [PATCH 06/20] Fix race in offline-extent-waiting test Before comparing file contents, wait for the background dd to complete. Also fix a typo. Signed-off-by: Chris Kirby --- tests/golden/offline-extent-waiting | 2 +- tests/tests/offline-extent-waiting.sh | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/golden/offline-extent-waiting b/tests/golden/offline-extent-waiting index 9ea47d4f..7fcd8f19 100644 --- a/tests/golden/offline-extent-waiting +++ b/tests/golden/offline-extent-waiting @@ -49,7 +49,7 @@ offline wating should be empty: 0 == truncating does wait truncate should be waiting for first block: -trunate should no longer be waiting: +truncate should no longer be waiting: 0 == writing waits should be waiting for write diff --git a/tests/tests/offline-extent-waiting.sh b/tests/tests/offline-extent-waiting.sh index 31b04b59..dda3b7d4 100644 --- a/tests/tests/offline-extent-waiting.sh +++ b/tests/tests/offline-extent-waiting.sh @@ -157,7 +157,7 @@ echo "truncate should be waiting for first block:" expect_wait "$DIR/file" "change_size" $ino 0 scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES sleep .1 -echo "trunate should no longer be waiting:" +echo "truncate should no longer be waiting:" scoutfs 
data-waiting -B 0 -I 0 -p "$DIR" | wc -l cat "$DIR/golden" > "$DIR/file" vers=$(scoutfs stat -s data_version "$DIR/file") @@ -168,10 +168,13 @@ scoutfs release "$DIR/file" -V "$vers" -o 0 -l $BYTES # overwrite, not truncate+write dd if="$DIR/other" of="$DIR/file" \ bs=$BS count=$BLOCKS conv=notrunc status=none & +pid="$!" sleep .1 echo "should be waiting for write" expect_wait "$DIR/file" "write" $ino 0 scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES +# wait for the background dd to complete +wait "$pid" 2> /dev/null cmp "$DIR/file" "$DIR/other" echo "== cleanup" From 96eb9662a1f87504df8449a333bcaf42911cee9c Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Tue, 3 Jun 2025 11:13:23 -0700 Subject: [PATCH 07/20] Revert "Extend orphan-inodes timeout." This reverts commit 138c7c6b498bd625a688b9f56214108ee2e6c122. The timeout value here is still exceeded by CI test jobs, and thus causing the test to fail. Signed-off-by: Auke Kok --- tests/tests/orphan-inodes.sh | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/tests/orphan-inodes.sh b/tests/tests/orphan-inodes.sh index 414c747b..756f041e 100644 --- a/tests/tests/orphan-inodes.sh +++ b/tests/tests/orphan-inodes.sh @@ -70,15 +70,7 @@ done # wait for orphan scans to run t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000 # also have to wait for delayed log merge work from mount -C=120 -while (( C-- )); do - brk=1 - for ino in $inos; do - inode_exists $ino && brk=0 - done - test $brk -eq 1 && break - sleep 1 -done +sleep 15 for ino in $inos; do inode_exists $ino && echo "$ino still exists" done From f86a7b4d3cd1697253baa27dcd6822ddcb0d31f6 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Thu, 5 Jun 2025 14:33:42 -0700 Subject: [PATCH 08/20] Fully wait for orphan inode scan to complete. The issue with the previous attempt to fix the orphan-inodes test was that we would regularly exceed the 120s timeout value put in there. 
Instead, in this commit, we change the code to add a new counter to indicate orphan deletion progress. When orphan inodes are deleted, the increment of this counter indicates progress happened. Inversely, every time the counter doesn't increment, and the orphan scan attempts counter increments, we know that there was no more work to be done. For safety, we wait until 2 consecutive scan attempts were made without forward progress in the test case. Signed-off-by: Auke Kok --- kmod/src/counters.h | 1 + kmod/src/inode.c | 3 +++ tests/tests/orphan-inodes.sh | 28 ++++++++++++++++++++++++++-- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/kmod/src/counters.h b/kmod/src/counters.h index 18e6daf1..c6d38775 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -90,6 +90,7 @@ EXPAND_COUNTER(forest_read_items) \ EXPAND_COUNTER(forest_roots_next_hint) \ EXPAND_COUNTER(forest_set_bloom_bits) \ + EXPAND_COUNTER(inode_deleted) \ EXPAND_COUNTER(item_cache_count_objects) \ EXPAND_COUNTER(item_cache_scan_objects) \ EXPAND_COUNTER(item_clear_dirty) \ diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 5e3f724f..61f06cd0 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -1854,6 +1854,9 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino) goto out; ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock); + if (ret == 0) + scoutfs_inc_counter(sb, inode_deleted); + out: if (clear_trying) clear_bit(bit_nr, ldata->trying); diff --git a/tests/tests/orphan-inodes.sh b/tests/tests/orphan-inodes.sh index 756f041e..fd0d1d75 100644 --- a/tests/tests/orphan-inodes.sh +++ b/tests/tests/orphan-inodes.sh @@ -69,8 +69,32 @@ while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do done # wait for orphan scans to run t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000 -# also have to wait for delayed log merge work from mount -sleep 15 +# wait until we see two consecutive orphan scan attempts without +# any inode deletion forward 
progress in each mount +for nr in $(t_fs_nrs); do + C=0 + LOSA=$(t_counter orphan_scan_attempts $nr) + LDOP=$(t_counter inode_deleted $nr) + + while [ $C -lt 2 ]; do + sleep 1 + + OSA=$(t_counter orphan_scan_attempts $nr) + DOP=$(t_counter inode_deleted $nr) + + if [ $OSA != $LOSA ]; then + if [ $DOP == $LDOP ]; then + (( C++ )) + else + C=0 + fi + fi + + LOSA=$OSA + LDOP=$DOP + done +done + for ino in $inos; do inode_exists $ino && echo "$ino still exists" done From 0b7b9d4a5e2aae827e4dc63ad3b82495e176b092 Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Fri, 20 Jun 2025 15:08:44 -0700 Subject: [PATCH 09/20] Avoid trigger munching of block_remove_stale trigger. It's entirely likely that the trigger here is munched by a read on a dirty block from any unrelated or background read. Avoid that by putting the trigger at the end of the condition list. Now that the order is swapped, we have to avoid a null deref in block_is_dirty(bp) here, as well. Signed-off-by: Auke Kok --- kmod/src/block.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kmod/src/block.c b/kmod/src/block.c index 9eaba500..e44a98a7 100644 --- a/kmod/src/block.c +++ b/kmod/src/block.c @@ -712,8 +712,8 @@ retry: ret = 0; out: - if ((ret == -ESTALE || scoutfs_trigger(sb, BLOCK_REMOVE_STALE)) && - !retried && !block_is_dirty(bp)) { + if (!retried && !IS_ERR_OR_NULL(bp) && !block_is_dirty(bp) && + (ret == -ESTALE || scoutfs_trigger(sb, BLOCK_REMOVE_STALE))) { retried = true; scoutfs_inc_counter(sb, block_cache_remove_stale); block_remove(sb, bp); From 35bcad91a6e34a81773a82dd71c68aa7083354d3 Mon Sep 17 00:00:00 2001 From: Chris Kirby Date: Mon, 7 Jul 2025 10:21:49 -0500 Subject: [PATCH 10/20] Close window where we can lose search items In finalize_and_start_log_merge(), we overwrite the server mount's log tree with its finalized form and then later write out its next open log tree. 
This leaves a window where the mount's srch_file is nulled out, causing us to lose any search items in that log tree. This shows up as intermittent failures in the srch-basic-functionality test. Eliminate this timing window by doing what unmount/reclaim does when it finalizes, by moving the resources from the item that we finalize into server trees/items as it finalizes. Then there is no window where those resources exist only in memory until we create another transaction. Signed-off-by: Chris Kirby --- kmod/src/format.h | 2 +- kmod/src/server.c | 125 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 100 insertions(+), 27 deletions(-) diff --git a/kmod/src/format.h b/kmod/src/format.h index 83e6015c..4eab7a7b 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -470,7 +470,7 @@ struct scoutfs_srch_compact { * @get_trans_seq, @commit_trans_seq: These pair of sequence numbers * determine if a transaction is currently open for the mount that owns * the log_trees struct. get_trans_seq is advanced by the server as the - * transaction is opened. The server sets comimt_trans_seq equal to + * transaction is opened. The server sets commit_trans_seq equal to * get_ as the transaction is committed. 
*/ struct scoutfs_log_trees { diff --git a/kmod/src/server.c b/kmod/src/server.c index 22975f69..9dbd1509 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -1040,6 +1040,101 @@ static int next_log_merge_item(struct super_block *sb, return next_log_merge_item_key(sb, root, zone, &key, val, val_len); } +static int do_finalize_ours(struct super_block *sb, + struct scoutfs_log_trees *lt, + struct commit_hold *hold) +{ + struct server_info *server = SCOUTFS_SB(sb)->server_info; + struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb); + struct scoutfs_key key; + char *err_str = NULL; + u64 rid = le64_to_cpu(lt->rid); + bool more; + int ret; + int err; + + mutex_lock(&server->srch_mutex); + ret = scoutfs_srch_rotate_log(sb, &server->alloc, &server->wri, + &super->srch_root, <->srch_file, true); + mutex_unlock(&server->srch_mutex); + if (ret < 0) { + scoutfs_err(sb, "error rotating srch log for rid %016llx: %d", + rid, ret); + return ret; + } + + do { + more = false; + + /* + * All of these can return errors, perhaps indicating successful + * partial progress, after having modified the allocator trees. + * We always have to update the roots in the log item. 
+ */ + mutex_lock(&server->alloc_mutex); + ret = (err_str = "splice meta_freed to other_freed", + scoutfs_alloc_splice_list(sb, &server->alloc, + &server->wri, server->other_freed, + <->meta_freed)) ?: + (err_str = "splice meta_avail", + scoutfs_alloc_splice_list(sb, &server->alloc, + &server->wri, server->other_freed, + <->meta_avail)) ?: + (err_str = "empty data_avail", + alloc_move_empty(sb, &super->data_alloc, + <->data_avail, + COMMIT_HOLD_ALLOC_BUDGET / 2)) ?: + (err_str = "empty data_freed", + alloc_move_empty(sb, &super->data_alloc, + <->data_freed, + COMMIT_HOLD_ALLOC_BUDGET / 2)); + mutex_unlock(&server->alloc_mutex); + + /* + * only finalize, allowing merging, once the allocators are + * fully freed + */ + if (ret == 0) { + /* the transaction is no longer open */ + le64_add_cpu(<->flags, SCOUTFS_LOG_TREES_FINALIZED); + lt->finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb)); + } + + scoutfs_key_init_log_trees(&key, rid, le64_to_cpu(lt->nr)); + + err = scoutfs_btree_update(sb, &server->alloc, &server->wri, + &super->logs_root, &key, lt, + sizeof(*lt)); + BUG_ON(err != 0); /* alloc, log, srch items out of sync */ + + if (ret == -EINPROGRESS) { + more = true; + mutex_unlock(&server->logs_mutex); + ret = server_apply_commit(sb, hold, 0); + if (ret < 0) + WARN_ON_ONCE(ret < 0); + server_hold_commit(sb, hold); + mutex_lock(&server->logs_mutex); + } else if (ret == 0) { + memset(<->item_root, 0, sizeof(lt->item_root)); + memset(<->bloom_ref, 0, sizeof(lt->bloom_ref)); + lt->inode_count_delta = 0; + lt->max_item_seq = 0; + lt->finalize_seq = 0; + le64_add_cpu(<->nr, 1); + lt->flags = 0; + } + } while (more); + + if (ret < 0) { + scoutfs_err(sb, + "error %d finalizing log trees for rid %016llx: %s", + ret, rid, err_str); + } + + return ret; +} + /* * Finalizing the log btrees for merging needs to be done carefully so * that items don't appear to go backwards in time. 
@@ -1091,7 +1186,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l struct scoutfs_log_merge_range rng; struct scoutfs_mount_options opts; struct scoutfs_log_trees each_lt; - struct scoutfs_log_trees fin; unsigned int delay_ms; unsigned long timeo; bool saw_finalized; @@ -1196,32 +1290,11 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l /* Finalize ours if it's visible to others */ if (ours_visible) { - fin = *lt; - memset(&fin.meta_avail, 0, sizeof(fin.meta_avail)); - memset(&fin.meta_freed, 0, sizeof(fin.meta_freed)); - memset(&fin.data_avail, 0, sizeof(fin.data_avail)); - memset(&fin.data_freed, 0, sizeof(fin.data_freed)); - memset(&fin.srch_file, 0, sizeof(fin.srch_file)); - le64_add_cpu(&fin.flags, SCOUTFS_LOG_TREES_FINALIZED); - fin.finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb)); - - scoutfs_key_init_log_trees(&key, le64_to_cpu(fin.rid), - le64_to_cpu(fin.nr)); - ret = scoutfs_btree_update(sb, &server->alloc, &server->wri, - &super->logs_root, &key, &fin, - sizeof(fin)); + ret = do_finalize_ours(sb, lt, hold); if (ret < 0) { - err_str = "updating finalized log_trees"; + err_str = "finalizing ours"; break; } - - memset(<->item_root, 0, sizeof(lt->item_root)); - memset(<->bloom_ref, 0, sizeof(lt->bloom_ref)); - lt->inode_count_delta = 0; - lt->max_item_seq = 0; - lt->finalize_seq = 0; - le64_add_cpu(<->nr, 1); - lt->flags = 0; } /* wait a bit for mounts to arrive */ @@ -1680,8 +1753,8 @@ unlock: ret = server_apply_commit(sb, &hold, ret); if (ret < 0) - scoutfs_err(sb, "server error %d committing client logs for rid %016llx: %s", - ret, rid, err_str); + scoutfs_err(sb, "server error %d committing client logs for rid %016llx, nr %llu: %s", + ret, rid, le64_to_cpu(lt.nr), err_str); out: WARN_ON_ONCE(ret < 0); return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0); From a896984f59d2a8d54ec8cff9314dd31463653b5f Mon Sep 17 00:00:00 2001 From: Chris Kirby Date: Wed, 16 Jul 2025 
14:09:07 -0500 Subject: [PATCH 11/20] Only start new quorum election after a receive failure It's possible for the quorum worker to be preempted for a long period, especially on debug kernels. Since we only check for how much time has passed, it's possible for a clean receive to inadvertently trigger an election. This can cause the quorum-heartbeat-timeout test to fail due to observed delays outside of the expected bounds. Instead, make sure we had a receive failure before comparing timestamps. Signed-off-by: Chris Kirby --- kmod/src/quorum.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kmod/src/quorum.c b/kmod/src/quorum.c index 7308b1dc..0e0fc7de 100644 --- a/kmod/src/quorum.c +++ b/kmod/src/quorum.c @@ -822,6 +822,7 @@ static void scoutfs_quorum_worker(struct work_struct *work) /* followers and candidates start new election on timeout */ if (qst.role != LEADER && + msg.type == SCOUTFS_QUORUM_MSG_INVALID && ktime_after(ktime_get(), qst.timeout)) { /* .. but only if their server has stopped */ if (!scoutfs_server_is_down(sb)) { From d38e41cb57e7a89554a6da89ea1224c4f974ef04 Mon Sep 17 00:00:00 2001 From: Chris Kirby Date: Fri, 29 Aug 2025 09:52:35 -0500 Subject: [PATCH 12/20] Add the inode number to scoutfs_xattr_set traces. 
Signed-off-by: Chris Kirby --- kmod/src/scoutfs_trace.h | 14 ++++++++------ kmod/src/xattr.c | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 340c1f14..8494503f 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -823,13 +823,14 @@ DEFINE_EVENT(scoutfs_lock_info_class, scoutfs_lock_destroy, ); TRACE_EVENT(scoutfs_xattr_set, - TP_PROTO(struct super_block *sb, size_t name_len, const void *value, - size_t size, int flags), + TP_PROTO(struct super_block *sb, __u64 ino, size_t name_len, + const void *value, size_t size, int flags), - TP_ARGS(sb, name_len, value, size, flags), + TP_ARGS(sb, ino, name_len, value, size, flags), TP_STRUCT__entry( SCSB_TRACE_FIELDS + __field(__u64, ino) __field(size_t, name_len) __field(const void *, value) __field(size_t, size) @@ -838,15 +839,16 @@ TRACE_EVENT(scoutfs_xattr_set, TP_fast_assign( SCSB_TRACE_ASSIGN(sb); + __entry->ino = ino; __entry->name_len = name_len; __entry->value = value; __entry->size = size; __entry->flags = flags; ), - TP_printk(SCSBF" name_len %zu value %p size %zu flags 0x%x", - SCSB_TRACE_ARGS, __entry->name_len, __entry->value, - __entry->size, __entry->flags) + TP_printk(SCSBF" ino %llu name_len %zu value %p size %zu flags 0x%x", + SCSB_TRACE_ARGS, __entry->ino, __entry->name_len, + __entry->value, __entry->size, __entry->flags) ); TRACE_EVENT(scoutfs_advance_dirty_super, diff --git a/kmod/src/xattr.c b/kmod/src/xattr.c index 09db9927..7805b364 100644 --- a/kmod/src/xattr.c +++ b/kmod/src/xattr.c @@ -742,7 +742,7 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_ int ret; int err; - trace_scoutfs_xattr_set(sb, name_len, value, size, flags); + trace_scoutfs_xattr_set(sb, ino, name_len, value, size, flags); if (WARN_ON_ONCE(tgs->totl && tgs->indx) || WARN_ON_ONCE((tgs->totl | tgs->indx) && !tag_lock)) From 84a48ed8e2eade97479075075367edae39477e71 Mon Sep 17 00:00:00 2001 From: Chris 
Kirby Date: Mon, 8 Sep 2025 16:30:24 -0500 Subject: [PATCH 13/20] Fix several cases in srch.c where the return value of EIO should have been -EIO. Signed-off-by: Chris Kirby --- kmod/src/srch.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kmod/src/srch.c b/kmod/src/srch.c index 99ea6448..39b0f595 100644 --- a/kmod/src/srch.c +++ b/kmod/src/srch.c @@ -62,7 +62,7 @@ * re-allocated and re-written. Search can restart by checking the * btree for the current set of files. Compaction reads log files which * are protected from other compactions by the persistent busy items - * created by the server. Compaction won't see it's blocks reused out + * created by the server. Compaction won't see its blocks reused out * from under it, but it can encounter stale cached blocks that need to * be invalidated. */ @@ -749,14 +749,14 @@ static int search_log_file(struct super_block *sb, for (i = 0; i < le32_to_cpu(srb->entry_nr); i++) { if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) { /* can only be inconsistency :/ */ - ret = EIO; + ret = -EIO; break; } ret = decode_entry(srb->entries + pos, &sre, &prev); if (ret <= 0) { /* can only be inconsistency :/ */ - ret = EIO; + ret = -EIO; break; } pos += ret; @@ -859,14 +859,14 @@ static int search_sorted_file(struct super_block *sb, if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) { /* can only be inconsistency :/ */ - ret = EIO; + ret = -EIO; break; } ret = decode_entry(srb->entries + pos, &sre, &prev); if (ret <= 0) { /* can only be inconsistency :/ */ - ret = EIO; + ret = -EIO; break; } pos += ret; @@ -1802,7 +1802,7 @@ static void swap_page_sre(void *A, void *B, int size) * typically, ~10x worst case). * * Because we read and sort all the input files we must perform the full - * compaction in one operation. The server must have given us a + * compaction in one operation. The server must have given us * sufficiently large avail/freed lists, otherwise we'll return ENOSPC. 
*/ static int compact_logs(struct super_block *sb, @@ -1866,14 +1866,14 @@ static int compact_logs(struct super_block *sb, if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) { /* can only be inconsistency :/ */ - ret = EIO; + ret = -EIO; break; } ret = decode_entry(srb->entries + pos, sre, &prev); if (ret <= 0) { /* can only be inconsistency :/ */ - ret = EIO; + ret = -EIO; goto out; } prev = *sre; From 01847d9fb63436916dad5f250294da779de566a8 Mon Sep 17 00:00:00 2001 From: Chris Kirby Date: Tue, 9 Sep 2025 13:38:52 -0500 Subject: [PATCH 14/20] Add tracing for get_file_block() and scoutfs_ioc_search_xattrs(). Signed-off-by: Chris Kirby --- kmod/src/scoutfs_trace.h | 42 ++++++++++++++++++++++++++++++++++++++++ kmod/src/srch.c | 6 ++++++ 2 files changed, 48 insertions(+) diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 8494503f..56da1bff 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -2465,6 +2465,27 @@ TRACE_EVENT(scoutfs_block_dirty_ref, __entry->block_blkno, __entry->block_seq) ); +TRACE_EVENT(scoutfs_get_file_block, + TP_PROTO(struct super_block *sb, u64 blkno, int flags), + + TP_ARGS(sb, blkno, flags), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(__u64, blkno) + __field(int, flags) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + __entry->blkno = blkno; + __entry->flags = flags; + ), + + TP_printk(SCSBF" blkno %llu flags 0x%x", + SCSB_TRACE_ARGS, __entry->blkno, __entry->flags) +); + TRACE_EVENT(scoutfs_block_stale, TP_PROTO(struct super_block *sb, struct scoutfs_block_ref *ref, struct scoutfs_block_header *hdr, u32 magic, u32 crc), @@ -3062,6 +3083,27 @@ DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv, TP_ARGS(sb, sc) ); +TRACE_EVENT(scoutfs_ioc_search_xattrs, + TP_PROTO(struct super_block *sb, u64 ino, u64 last_ino), + + TP_ARGS(sb, ino, last_ino), + + TP_STRUCT__entry( + SCSB_TRACE_FIELDS + __field(u64, ino) + __field(u64, last_ino) + ), + + TP_fast_assign( + SCSB_TRACE_ASSIGN(sb); + 
__entry->ino = ino; + __entry->last_ino = last_ino; + ), + + TP_printk(SCSBF" ino %llu last_ino %llu", SCSB_TRACE_ARGS, + __entry->ino, __entry->last_ino) +); + #endif /* _TRACE_SCOUTFS_H */ /* This part must be outside protection */ diff --git a/kmod/src/srch.c b/kmod/src/srch.c index 39b0f595..455a523f 100644 --- a/kmod/src/srch.c +++ b/kmod/src/srch.c @@ -442,6 +442,10 @@ out: if (ret == 0 && (flags & GFB_INSERT) && blk >= le64_to_cpu(sfl->blocks)) sfl->blocks = cpu_to_le64(blk + 1); + if (bl) { + trace_scoutfs_get_file_block(sb, bl->blkno, flags); + } + *bl_ret = bl; return ret; } @@ -972,6 +976,8 @@ int scoutfs_srch_search_xattrs(struct super_block *sb, scoutfs_inc_counter(sb, srch_search_xattrs); + trace_scoutfs_ioc_search_xattrs(sb, ino, last_ino); + *done = false; srch_init_rb_root(sroot); From c19280c83c7dd040f00657cbf61e12ca3b936b91 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 10 Sep 2025 10:17:40 -0700 Subject: [PATCH 15/20] Add cond_resched to iput worker The iput worker can accumulate quite a bit of pending work to do. We've seen hung task warnings while it's doing its work (admittedly in debug kernels). There's no harm in throwing in a cond_resched so other tasks get a chance to do work. 
Signed-off-by: Zach Brown --- kmod/src/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 61f06cd0..e1ac0e3e 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -1965,6 +1965,8 @@ static void iput_worker(struct work_struct *work) while (count-- > 0) iput(inode); + cond_resched(); + /* can't touch inode after final iput */ spin_lock(&inf->iput_lock); From c3e6f3cd54b8666f9ac11f6e2036e9aab706479f Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Thu, 12 Jun 2025 16:52:03 -0400 Subject: [PATCH 16/20] Don't run format-version-forward-back on el8, either This test compiles an earlier commit from the tree that is starting to fail due to various changes on the OS level, most recently due to sparse issues with newer kernel headers. This problem will likely increase in the future as we add more supported releases. We opt to only run this test on el7 for now. While we could have made this skip sparse checks that fail it on el8, it will suffice at this point if this just works on one of the supported OS versions during testing. Signed-off-by: Auke Kok --- tests/tests/format-version-forward-back.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests/format-version-forward-back.sh b/tests/tests/format-version-forward-back.sh index a5dde26b..04d6f907 100644 --- a/tests/tests/format-version-forward-back.sh +++ b/tests/tests/format-version-forward-back.sh @@ -11,8 +11,8 @@ # format version. # -# not supported on el9! 
-if [ $(source /etc/os-release ; echo ${VERSION_ID:0:1}) -gt 8 ]; then +# not supported on el8 or higher +if [ $(source /etc/os-release ; echo ${VERSION_ID:0:1}) -gt 7 ]; then t_skip_permitted "Unsupported OS version" fi From c72bf915ae25c9f785be318bfe4aab8468bfed8e Mon Sep 17 00:00:00 2001 From: Chris Kirby Date: Fri, 3 Oct 2025 14:01:39 -0500 Subject: [PATCH 17/20] Use ENOLINK as a special error code during forced unmount Tests such as quorum-heartbeat-timeout were failing with EIO messages in dmesg output due to expected errors during forced unmount. Use ENOLINK instead, and filter all errors from dmesg with this errno (67). Signed-off-by: Chris Kirby --- kmod/src/block.c | 4 ++-- kmod/src/format.h | 3 ++- kmod/src/net.c | 2 +- kmod/src/server.c | 2 +- kmod/src/trans.c | 4 ++-- tests/funcs/filter.sh | 3 +++ 6 files changed, 11 insertions(+), 7 deletions(-) diff --git a/kmod/src/block.c b/kmod/src/block.c index e44a98a7..0fa35dcc 100644 --- a/kmod/src/block.c +++ b/kmod/src/block.c @@ -488,7 +488,7 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp, int ret = 0; if (scoutfs_forcing_unmount(sb)) - return -EIO; + return -ENOLINK; sector = bp->bl.blkno << (SCOUTFS_BLOCK_LG_SHIFT - 9); @@ -1210,7 +1210,7 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, blk_op BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SM_SIZE); if (scoutfs_forcing_unmount(sb)) - return -EIO; + return -ENOLINK; if (WARN_ON_ONCE(len > SCOUTFS_BLOCK_SM_SIZE) || WARN_ON_ONCE(!op_is_write(opf) && !blk_crc)) diff --git a/kmod/src/format.h b/kmod/src/format.h index 4eab7a7b..07bb67aa 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -1091,7 +1091,8 @@ enum scoutfs_net_cmd { EXPAND_NET_ERRNO(ENOMEM) \ EXPAND_NET_ERRNO(EIO) \ EXPAND_NET_ERRNO(ENOSPC) \ - EXPAND_NET_ERRNO(EINVAL) + EXPAND_NET_ERRNO(EINVAL) \ + EXPAND_NET_ERRNO(ENOLINK) #undef EXPAND_NET_ERRNO #define EXPAND_NET_ERRNO(which) SCOUTFS_NET_ERR_##which, diff --git a/kmod/src/net.c 
b/kmod/src/net.c index 613cf169..8636452e 100644 --- a/kmod/src/net.c +++ b/kmod/src/net.c @@ -332,7 +332,7 @@ static int submit_send(struct super_block *sb, return -EINVAL; if (scoutfs_forcing_unmount(sb)) - return -EIO; + return -ENOLINK; msend = kmalloc(offsetof(struct message_send, nh.data[data_len]), GFP_NOFS); diff --git a/kmod/src/server.c b/kmod/src/server.c index 9dbd1509..cd978d27 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -610,7 +610,7 @@ static void scoutfs_server_commit_func(struct work_struct *work) goto out; if (scoutfs_forcing_unmount(sb)) { - ret = -EIO; + ret = -ENOLINK; goto out; } diff --git a/kmod/src/trans.c b/kmod/src/trans.c index 10494c43..d131bfa1 100644 --- a/kmod/src/trans.c +++ b/kmod/src/trans.c @@ -196,7 +196,7 @@ static int retry_forever(struct super_block *sb, int (*func)(struct super_block } if (scoutfs_forcing_unmount(sb)) { - ret = -EIO; + ret = -ENOLINK; break; } @@ -252,7 +252,7 @@ void scoutfs_trans_write_func(struct work_struct *work) } if (scoutfs_forcing_unmount(sb)) { - ret = -EIO; + ret = -ENOLINK; goto out; } diff --git a/tests/funcs/filter.sh b/tests/funcs/filter.sh index 294f9c72..b5a1a3cf 100644 --- a/tests/funcs/filter.sh +++ b/tests/funcs/filter.sh @@ -140,6 +140,9 @@ t_filter_dmesg() re="$re|scoutfs .* error.*server failed to bind to.*" re="$re|scoutfs .* critical transaction commit failure.*" + # ENOLINK (-67) indicates an expected forced unmount error + re="$re|scoutfs .* error -67 .*" + # change-devices causes loop device resizing re="$re|loop: module loaded" re="$re|loop[0-9].* detected capacity change from.*" From d277d7e95510adf4b2327d355455d9589879db82 Mon Sep 17 00:00:00 2001 From: Chris Kirby Date: Mon, 6 Oct 2025 10:09:37 -0500 Subject: [PATCH 18/20] Fix race condition in orphan-inodes test Make sure that the orphan scanners can see deletions after forced unmounts by waiting for reclaim_open_log_tree() to run on each mount; and waiting for finalize_and_start_log_merge() to run and not find 
any finalized trees. Do this by adding two new counters: reclaimed_open_logs and log_merge_no_finalized and fixing the orphan-inodes test to check those before waiting for the orphan scanners to complete. Signed-off-by: Chris Kirby --- kmod/src/counters.h | 2 ++ kmod/src/server.c | 4 ++++ tests/tests/orphan-inodes.sh | 15 +++++++++++++++ 3 files changed, 21 insertions(+) diff --git a/kmod/src/counters.h b/kmod/src/counters.h index c6d38775..c2bfa89f 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -146,6 +146,7 @@ EXPAND_COUNTER(lock_shrink_work) \ EXPAND_COUNTER(lock_unlock) \ EXPAND_COUNTER(lock_wait) \ + EXPAND_COUNTER(log_merge_no_finalized) \ EXPAND_COUNTER(log_merge_wait_timeout) \ EXPAND_COUNTER(net_dropped_response) \ EXPAND_COUNTER(net_send_bytes) \ @@ -182,6 +183,7 @@ EXPAND_COUNTER(quorum_send_vote) \ EXPAND_COUNTER(quorum_server_shutdown) \ EXPAND_COUNTER(quorum_term_follower) \ + EXPAND_COUNTER(reclaimed_open_logs) \ EXPAND_COUNTER(server_commit_hold) \ EXPAND_COUNTER(server_commit_queue) \ EXPAND_COUNTER(server_commit_worker) \ diff --git a/kmod/src/server.c b/kmod/src/server.c index cd978d27..4c9de90f 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -1256,6 +1256,7 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l /* done if we're not finalizing and there's no finalized */ if (!finalize_ours && !saw_finalized) { ret = 0; + scoutfs_inc_counter(sb, log_merge_no_finalized); break; } @@ -1889,6 +1890,9 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid) out: mutex_unlock(&server->logs_mutex); + if (ret == 0) + scoutfs_inc_counter(sb, reclaimed_open_logs); + if (ret < 0 && ret != -EINPROGRESS) scoutfs_err(sb, "server error %d reclaiming log trees for rid %016llx: %s", ret, rid, err_str); diff --git a/tests/tests/orphan-inodes.sh b/tests/tests/orphan-inodes.sh index fd0d1d75..2bfddacc 100644 --- a/tests/tests/orphan-inodes.sh +++ b/tests/tests/orphan-inodes.sh @@ -67,6 +67,21 @@ 
t_mount_all while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do sleep .5 done + + +sv=$(t_server_nr) + +# wait for reclaim_open_log_tree() to complete for each mount +while [ $(t_counter reclaimed_open_logs $sv) -lt $T_NR_MOUNTS ]; do + sleep 1 +done + +# wait for finalize_and_start_log_merge() to find no active merges in flight +# and not find any finalized trees +while [ $(t_counter log_merge_no_finalized $sv) -lt 1 ]; do + sleep 1 +done + # wait for orphan scans to run t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000 # wait until we see two consecutive orphan scan attempts without From aa48a8ccfc6e8710af1603bc52094a3ca33d4bb3 Mon Sep 17 00:00:00 2001 From: Chris Kirby Date: Fri, 10 Oct 2025 08:58:14 -0500 Subject: [PATCH 19/20] Generate sorted srch-safe entry pairs During log compaction, the SRCH_COMPACT_LOGS_PAD_SAFE trigger was generating inode numbers that were not in sorted order. This resulted in later failures during srch-basic-functionality, because we were winding up with out of order first/last pairs and merging incorrectly. Instead, reuse the single entry in the block repeatedly, generating zero-padded pairs of this entry that are interpreted as create/delete and vanish during searching and merging. These aren't encoded in the normal way, but the extra zeroes are ignored during the decoding phase. Signed-off-by: Chris Kirby --- kmod/src/srch.c | 100 ++++++++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 54 deletions(-) diff --git a/kmod/src/srch.c b/kmod/src/srch.c index 455a523f..a77c5eff 100644 --- a/kmod/src/srch.c +++ b/kmod/src/srch.c @@ -537,23 +537,35 @@ out: * the pairs cancel each other out by all readers (the second encoding * looks like deletion) so they aren't visible to the first/last bounds of * the block or file. + * + * We use the same entry repeatedly, so the diff between them will be empty. + * This lets us just emit the two-byte count word, leaving the other bytes + * as zero. 
+ * + * Split the desired total len into two pieces, adding any remainder to the + * first four-bit value. */ -static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk, - struct scoutfs_srch_block *srb, struct scoutfs_srch_entry *sre) +static void append_padded_entry(struct scoutfs_srch_file *sfl, + struct scoutfs_srch_block *srb, + int len) { - int ret; + int each; + int rem; + u16 lengths = 0; + u8 *buf = srb->entries + le32_to_cpu(srb->entry_bytes); - ret = encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes), - sre, &srb->tail); - if (ret > 0) { - srb->tail = *sre; - le32_add_cpu(&srb->entry_nr, 1); - le32_add_cpu(&srb->entry_bytes, ret); - le64_add_cpu(&sfl->entries, 1); - ret = 0; - } + each = (len - 2) >> 1; + rem = (len - 2) & 1; - return ret; + lengths |= each + rem; + lengths |= each << 4; + + memset(buf, 0, len); + put_unaligned_le16(lengths, buf); + + le32_add_cpu(&srb->entry_nr, 1); + le32_add_cpu(&srb->entry_bytes, len); + le64_add_cpu(&sfl->entries, 1); } /* @@ -564,61 +576,41 @@ static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk, * This is called when there is a single existing entry in the block. * We have the entire block to work with. We encode pairs of matching * entries. This hides them from readers (both searches and merging) as - * they're interpreted as creation and deletion and are deleted. We use - * the existing hash value of the first entry in the block but then set - * the inode to an impossibly large number so it doesn't interfere with - * anything. + * they're interpreted as creation and deletion and are deleted. * - * To hit the specific offset we very carefully manage the amount of - * bytes of change between fields in the entry. We know that if we - * change all the byte of the ino and id we end up with a 20 byte - * (2+8+8,2) encoding of the pair of entries. 
To have the last entry - * start at the _SAFE_POS offset we know that the final 20 byte pair - * encoding needs to end at 2 bytes (second entry encoding) after the - * _SAFE_POS offset. + * For simplicity and to maintain sort ordering within the block, we reuse + * the existing entry. This lets us skip the encoding step, because we know + * the diff will be zero. We can zero-pad the resulting entries to hit the + * target offset exactly. * - * So as we encode pairs we watch the delta of our current offset from - * that desired final offset of 2 past _SAFE_POS. If we're a multiple - * of 20 away then we encode the full 20 byte pairs. If we're not, then - * we drop a byte to encode 19 bytes. That'll slowly change the offset - * to be a multiple of 20 again while encoding large entries. + * Because we can't predict the exact number of entry_bytes when we start, + * we adjust the byte count of subsequent entries until we wind up at a + * multiple of 20 bytes away from our goal and then use that length for + * the remaining entries. + * + * We could just use a single pair of unnaturally large entries to consume + * the needed space, adjusting for an odd number of entry_bytes if necessary. + * The use of 19 or 20 bytes for the entry pair matches what we would see with + * real (non-zero) entries that vary from the existing entry. 
*/ -static void pad_entries_at_safe(struct scoutfs_srch_file *sfl, u64 blk, +static void pad_entries_at_safe(struct scoutfs_srch_file *sfl, struct scoutfs_srch_block *srb) { - struct scoutfs_srch_entry sre; u32 target; s32 diff; - u64 hash; - u64 ino; - u64 id; - int ret; - - hash = le64_to_cpu(srb->tail.hash); - ino = le64_to_cpu(srb->tail.ino) | (1ULL << 62); - id = le64_to_cpu(srb->tail.id); target = SCOUTFS_SRCH_BLOCK_SAFE_BYTES + 2; while ((diff = target - le32_to_cpu(srb->entry_bytes)) > 0) { - ino ^= 1ULL << (7 * 8); + append_padded_entry(sfl, srb, 10); if (diff % 20 == 0) { - id ^= 1ULL << (7 * 8); + append_padded_entry(sfl, srb, 10); } else { - id ^= 1ULL << (6 * 8); + append_padded_entry(sfl, srb, 9); } - - sre.hash = cpu_to_le64(hash); - sre.ino = cpu_to_le64(ino); - sre.id = cpu_to_le64(id); - - ret = append_padded_entry(sfl, blk, srb, &sre); - if (ret == 0) - ret = append_padded_entry(sfl, blk, srb, &sre); - BUG_ON(ret != 0); - - diff = target - le32_to_cpu(srb->entry_bytes); } + + WARN_ON_ONCE(diff != 0); } /* @@ -1675,7 +1667,7 @@ static int kway_merge(struct super_block *sb, /* end sorted block on _SAFE offset for testing */ if (bl && le32_to_cpu(srb->entry_nr) == 1 && logs_input && scoutfs_trigger(sb, SRCH_COMPACT_LOGS_PAD_SAFE)) { - pad_entries_at_safe(sfl, blk, srb); + pad_entries_at_safe(sfl, srb); scoutfs_block_put(sb, bl); bl = NULL; blk++; From b4d832375066187527c8b072c387d2803bc1053a Mon Sep 17 00:00:00 2001 From: Chris Kirby Date: Wed, 15 Oct 2025 17:37:05 -0500 Subject: [PATCH 20/20] Quorum message cleanup Make sure to log an error if the SCOUTFS_QUORUM_EVENT_END update_quorum_block() call fails in scoutfs_quorum_worker(). Correctly print if the reader or writer failed when logging errors in update_quorum_block(). 
Signed-off-by: Chris Kirby --- kmod/src/quorum.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kmod/src/quorum.c b/kmod/src/quorum.c index 0e0fc7de..1c42188e 100644 --- a/kmod/src/quorum.c +++ b/kmod/src/quorum.c @@ -520,10 +520,10 @@ static int update_quorum_block(struct super_block *sb, int event, u64 term, bool set_quorum_block_event(sb, &blk, event, term); ret = write_quorum_block(sb, blkno, &blk); if (ret < 0) - scoutfs_err(sb, "error %d reading quorum block %llu to update event %d term %llu", + scoutfs_err(sb, "error %d writing quorum block %llu after updating event %d term %llu", ret, blkno, event, term); } else { - scoutfs_err(sb, "error %d writing quorum block %llu after updating event %d term %llu", + scoutfs_err(sb, "error %d reading quorum block %llu to update event %d term %llu", ret, blkno, event, term); } @@ -983,7 +983,10 @@ static void scoutfs_quorum_worker(struct work_struct *work) } /* record that this slot no longer has an active quorum */ - update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_END, qst.term, true); + err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_END, qst.term, true); + if (err < 0 && ret == 0) + ret = err; + out: if (ret < 0) { scoutfs_err(sb, "quorum service saw error %d, shutting down. This mount is no longer participating in quorum. It should be remounted to restore service.",