Merge pull request #236 from versity/team/ci_green

Team/ci green
2026-04-28 08:56:55 +00:00 · 2025-10-22 11:05:08 -07:00
parent cad47ed1ed b4d8323750
commit 0ecaceba14
20 changed files with 380 additions and 188 deletions
--- a/kmod/src/alloc.c
+++ b/kmod/src/alloc.c
@@ -86,18 +86,47 @@ static u64 smallest_order_length(u64 len)
 }

 /*
- * An extent modification dirties three distinct leaves of an allocator
- * btree as it adds and removes the blkno and size sorted items for the
- * old and new lengths of the extent.  Dirtying the paths to these
- * leaves can grow the tree and grow/shrink neighbours at each level.
- * We over-estimate the number of blocks allocated and freed (the paths
- * share a root, growth doesn't free) to err on the simpler and safer
- * side.  The overhead is minimal given the relatively large list blocks
- * and relatively short allocator trees.
+ * Moving an extent between trees can dirty blocks in several ways. This
+ * function calculates the worst case number of blocks across these scenarios.
+ * We treat the alloc and free counts independently, so the values below are
+ * max(allocated, freed), not the sum.
+ *
+ * We track extents with two separate btree items: by block number and by size.
+ *
+ * If we're removing an extent from the btree (allocating), we can dirty
+ * two blocks if the keys are in different leaves. If we wind up merging
+ * leaves because we fall below the low water mark, we can wind up freeing
+ * three leaves.
+ *
+ * That sequence is as follows, assuming the original keys are removed from
+ * blocks A and B:
+ *
+ * Allocate new dirty A' and B'
+ * Free old stable A and B
+ * B' has fallen below the low water mark, so copy B' into A'
+ * Free B'
+ *
+ * An extent insertion (freeing an extent) can dirty up to five distinct items
+ * in the btree as it adds and removes the blkno and size sorted items for the
+ * old and new lengths of the extent:
+ *
+ * In the by-blkno portion of the btree, we can dirty (allocate for COW) up
+ * to two blocks: either by merging adjacent extents, which can cause us to
+ * join leaf blocks; or by an insertion that causes a split.
+ *
+ * In the by-size portion, we never merge extents, so normally we just dirty
+ * a single item with a size insertion. But if we merged adjacent extents in
+ * the by-blkno portion of the tree, we might be working with three by-size
+ * items: removing the two old ones that were combined in the merge; and
+ * adding the new one for the larger, merged size.
+ *
+ * Finally, dirtying the paths to these leaves can grow the tree and grow/shrink
+ * neighbours at each level, so we multiply by the height of the tree after
+ * accounting for a possible new level.
 */
 static u32 extent_mod_blocks(u32 height)
 {
-	return ((1 + height) * 2) * 3;
+	return ((1 + height) * 3) * 5;
 }

 /*
--- a/kmod/src/block.c
+++ b/kmod/src/block.c
@@ -488,7 +488,7 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
 	int ret = 0;

 	if (scoutfs_forcing_unmount(sb))
-		return -EIO;
+		return -ENOLINK;

 	sector = bp->bl.blkno << (SCOUTFS_BLOCK_LG_SHIFT - 9);

@@ -712,8 +712,8 @@ retry:

 	ret = 0;
 out:
-	if ((ret == -ESTALE || scoutfs_trigger(sb, BLOCK_REMOVE_STALE)) &&
-	    !retried && !block_is_dirty(bp)) {
+	if (!retried && !IS_ERR_OR_NULL(bp) && !block_is_dirty(bp) &&
+	    (ret == -ESTALE || scoutfs_trigger(sb, BLOCK_REMOVE_STALE))) {
 		retried = true;
 		scoutfs_inc_counter(sb, block_cache_remove_stale);
 		block_remove(sb, bp);
@@ -1210,7 +1210,7 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, blk_op
 	BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SM_SIZE);

 	if (scoutfs_forcing_unmount(sb))
-		return -EIO;
+		return -ENOLINK;

 	if (WARN_ON_ONCE(len > SCOUTFS_BLOCK_SM_SIZE) ||
 	    WARN_ON_ONCE(!op_is_write(opf) && !blk_crc))
--- a/kmod/src/counters.h
+++ b/kmod/src/counters.h
@@ -90,6 +90,7 @@
 	EXPAND_COUNTER(forest_read_items)			\
 	EXPAND_COUNTER(forest_roots_next_hint)			\
 	EXPAND_COUNTER(forest_set_bloom_bits)			\
+	EXPAND_COUNTER(inode_deleted)				\
 	EXPAND_COUNTER(item_cache_count_objects)		\
 	EXPAND_COUNTER(item_cache_scan_objects)			\
 	EXPAND_COUNTER(item_clear_dirty)			\
@@ -145,6 +146,7 @@
 	EXPAND_COUNTER(lock_shrink_work)			\
 	EXPAND_COUNTER(lock_unlock)				\
 	EXPAND_COUNTER(lock_wait)				\
+	EXPAND_COUNTER(log_merge_no_finalized)			\
 	EXPAND_COUNTER(log_merge_wait_timeout)			\
 	EXPAND_COUNTER(net_dropped_response)			\
 	EXPAND_COUNTER(net_send_bytes)				\
@@ -181,6 +183,7 @@
 	EXPAND_COUNTER(quorum_send_vote)			\
 	EXPAND_COUNTER(quorum_server_shutdown)			\
 	EXPAND_COUNTER(quorum_term_follower)			\
+	EXPAND_COUNTER(reclaimed_open_logs)			\
 	EXPAND_COUNTER(server_commit_hold)			\
 	EXPAND_COUNTER(server_commit_queue)			\
 	EXPAND_COUNTER(server_commit_worker)			\
--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -470,7 +470,7 @@ struct scoutfs_srch_compact {
 * @get_trans_seq, @commit_trans_seq: These pair of sequence numbers
 * determine if a transaction is currently open for the mount that owns
 * the log_trees struct.  get_trans_seq is advanced by the server as the
- * transaction is opened.   The server sets comimt_trans_seq equal to
+ * transaction is opened.   The server sets commit_trans_seq equal to
 * get_ as the transaction is committed.
 */
 struct scoutfs_log_trees {
@@ -1091,7 +1091,8 @@ enum scoutfs_net_cmd {
 	EXPAND_NET_ERRNO(ENOMEM)	\
 	EXPAND_NET_ERRNO(EIO)		\
 	EXPAND_NET_ERRNO(ENOSPC)	\
-	EXPAND_NET_ERRNO(EINVAL)
+	EXPAND_NET_ERRNO(EINVAL)	\
+	EXPAND_NET_ERRNO(ENOLINK)

 #undef EXPAND_NET_ERRNO
 #define EXPAND_NET_ERRNO(which) SCOUTFS_NET_ERR_##which,
--- a/kmod/src/inode.c
+++ b/kmod/src/inode.c
@@ -1854,6 +1854,9 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
 		goto out;

 	ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
+	if (ret == 0)
+		scoutfs_inc_counter(sb, inode_deleted);
+
 out:
 	if (clear_trying)
 		clear_bit(bit_nr, ldata->trying);
@@ -1962,6 +1965,8 @@ static void iput_worker(struct work_struct *work)
 		while (count-- > 0)
 			iput(inode);

+		cond_resched();
+
 		/* can't touch inode after final iput */

 		spin_lock(&inf->iput_lock);
--- a/kmod/src/net.c
+++ b/kmod/src/net.c
@@ -332,7 +332,7 @@ static int submit_send(struct super_block *sb,
 		return -EINVAL;

 	if (scoutfs_forcing_unmount(sb))
-		return -EIO;
+		return -ENOLINK;

 	msend = kmalloc(offsetof(struct message_send,
 				 nh.data[data_len]), GFP_NOFS);
--- a/kmod/src/quorum.c
+++ b/kmod/src/quorum.c
@@ -520,10 +520,10 @@ static int update_quorum_block(struct super_block *sb, int event, u64 term, bool
 		set_quorum_block_event(sb, &blk, event, term);
 		ret = write_quorum_block(sb, blkno, &blk);
 		if (ret < 0)
-			scoutfs_err(sb, "error %d reading quorum block %llu to update event %d term %llu",
+			scoutfs_err(sb, "error %d writing quorum block %llu after updating event %d term %llu",
 				    ret, blkno, event, term);
 	} else {
-		scoutfs_err(sb, "error %d writing quorum block %llu after updating event %d term %llu",
+		scoutfs_err(sb, "error %d reading quorum block %llu to update event %d term %llu",
 			    ret, blkno, event, term);
 	}

@@ -822,6 +822,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)

 		/* followers and candidates start new election on timeout */
 		if (qst.role != LEADER &&
+		    msg.type == SCOUTFS_QUORUM_MSG_INVALID &&
 		    ktime_after(ktime_get(), qst.timeout)) {
 			/* .. but only if their server has stopped */
 			if (!scoutfs_server_is_down(sb)) {
@@ -982,7 +983,10 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 	}

 	/* record that this slot no longer has an active quorum */
-	update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_END, qst.term, true);
+	err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_END, qst.term, true);
+	if (err < 0 && ret == 0)
+		ret = err;
+
 out:
 	if (ret < 0) {
 		scoutfs_err(sb, "quorum service saw error %d, shutting down.  This mount is no longer participating in quorum.  It should be remounted to restore service.",
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -823,13 +823,14 @@ DEFINE_EVENT(scoutfs_lock_info_class, scoutfs_lock_destroy,
 );

 TRACE_EVENT(scoutfs_xattr_set,
-	TP_PROTO(struct super_block *sb, size_t name_len, const void *value,
-		 size_t size, int flags),
+	TP_PROTO(struct super_block *sb, __u64 ino, size_t name_len,
+		 const void *value, size_t size, int flags),

-	TP_ARGS(sb, name_len, value, size, flags),
+	TP_ARGS(sb, ino, name_len, value, size, flags),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
+		__field(__u64, ino)
 		__field(size_t, name_len)
 		__field(const void *, value)
 		__field(size_t, size)
@@ -838,15 +839,16 @@ TRACE_EVENT(scoutfs_xattr_set,

 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
+		__entry->ino = ino;
 		__entry->name_len = name_len;
 		__entry->value = value;
 		__entry->size = size;
 		__entry->flags = flags;
 	),

-	TP_printk(SCSBF" name_len %zu value %p size %zu flags 0x%x",
-		  SCSB_TRACE_ARGS, __entry->name_len, __entry->value,
-		  __entry->size, __entry->flags)
+	TP_printk(SCSBF" ino %llu name_len %zu value %p size %zu flags 0x%x",
+		  SCSB_TRACE_ARGS, __entry->ino,  __entry->name_len,
+		  __entry->value, __entry->size, __entry->flags)
 );

 TRACE_EVENT(scoutfs_advance_dirty_super,
@@ -1966,15 +1968,17 @@ DEFINE_EVENT(scoutfs_server_client_count_class, scoutfs_server_client_down,
 );

 DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
-        TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
-		 u32 avail_before, u32 freed_before, int committing, int exceeded),
-        TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing,
-		exceeded),
+        TP_PROTO(struct super_block *sb, int holding, int applying,
+		 int nr_holders, u32 budget,
+		 u32 avail_before, u32 freed_before,
+		 int committing, int exceeded),
+        TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded),
        TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
 		__field(int, holding)
 		__field(int, applying)
 		__field(int, nr_holders)
+		__field(u32, budget)
 		__field(__u32, avail_before)
 		__field(__u32, freed_before)
 		__field(int, committing)
@@ -1985,35 +1989,45 @@ DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
 		__entry->holding = !!holding;
 		__entry->applying = !!applying;
 		__entry->nr_holders = nr_holders;
+		__entry->budget = budget;
 		__entry->avail_before = avail_before;
 		__entry->freed_before = freed_before;
 		__entry->committing = !!committing;
 		__entry->exceeded = !!exceeded;
        ),
-	TP_printk(SCSBF" holding %u applying %u nr %u avail_before %u freed_before %u committing %u exceeded %u",
-		  SCSB_TRACE_ARGS, __entry->holding, __entry->applying, __entry->nr_holders,
-		  __entry->avail_before, __entry->freed_before, __entry->committing,
-		  __entry->exceeded)
+	TP_printk(SCSBF" holding %u applying %u nr %u budget %u avail_before %u freed_before %u committing %u exceeded %u",
+		  SCSB_TRACE_ARGS, __entry->holding, __entry->applying,
+		  __entry->nr_holders, __entry->budget,
+		  __entry->avail_before, __entry->freed_before,
+		  __entry->committing, __entry->exceeded)
 );
 DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_hold,
-        TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
-		 u32 avail_before, u32 freed_before, int committing, int exceeded),
-        TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
+        TP_PROTO(struct super_block *sb, int holding, int applying,
+		 int nr_holders, u32 budget,
+		 u32 avail_before, u32 freed_before,
+		 int committing, int exceeded),
+        TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
 );
 DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_apply,
-        TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
-		 u32 avail_before, u32 freed_before, int committing, int exceeded),
-        TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
+        TP_PROTO(struct super_block *sb, int holding, int applying,
+		 int nr_holders, u32 budget,
+		 u32 avail_before, u32 freed_before,
+		 int committing, int exceeded),
+        TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
 );
 DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_start,
-        TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
-		 u32 avail_before, u32 freed_before, int committing, int exceeded),
-        TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
+        TP_PROTO(struct super_block *sb, int holding, int applying,
+		 int nr_holders, u32 budget,
+		 u32 avail_before, u32 freed_before,
+		 int committing, int exceeded),
+        TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
 );
 DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_end,
-        TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
-		 u32 avail_before, u32 freed_before, int committing, int exceeded),
-        TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
+        TP_PROTO(struct super_block *sb, int holding, int applying,
+		 int nr_holders, u32 budget,
+		 u32 avail_before, u32 freed_before,
+		 int committing, int exceeded),
+        TP_ARGS(sb, holding, applying, nr_holders, budget, avail_before, freed_before, committing, exceeded)
 );

 #define slt_symbolic(mode)						\
@@ -2451,6 +2465,27 @@ TRACE_EVENT(scoutfs_block_dirty_ref,
 		  __entry->block_blkno, __entry->block_seq)
 );

+TRACE_EVENT(scoutfs_get_file_block,
+	TP_PROTO(struct super_block *sb, u64 blkno, int flags),
+
+	TP_ARGS(sb, blkno, flags),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, blkno)
+		__field(int, flags)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->blkno = blkno;
+		__entry->flags = flags;
+	),
+
+	TP_printk(SCSBF" blkno %llu flags 0x%x",
+		  SCSB_TRACE_ARGS, __entry->blkno, __entry->flags)
+);
+
 TRACE_EVENT(scoutfs_block_stale,
 	TP_PROTO(struct super_block *sb, struct scoutfs_block_ref *ref,
 		 struct scoutfs_block_header *hdr, u32 magic, u32 crc),
@@ -3048,6 +3083,27 @@ DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv,
 	TP_ARGS(sb, sc)
 );

+TRACE_EVENT(scoutfs_ioc_search_xattrs,
+	TP_PROTO(struct super_block *sb, u64 ino, u64 last_ino),
+
+	TP_ARGS(sb, ino, last_ino),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(u64, ino)
+		__field(u64, last_ino)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->ino = ino;
+		__entry->last_ino = last_ino;
+	),
+
+	TP_printk(SCSBF" ino %llu last_ino %llu", SCSB_TRACE_ARGS,
+		  __entry->ino, __entry->last_ino)
+);
+
 #endif /* _TRACE_SCOUTFS_H */

 /* This part must be outside protection */
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -65,6 +65,7 @@ struct commit_users {
 	struct list_head holding;
 	struct list_head applying;
 	unsigned int nr_holders;
+	u32 budget;
 	u32 avail_before;
 	u32 freed_before;
 	bool committing;
@@ -84,8 +85,9 @@ static void init_commit_users(struct commit_users *cusers)
 do {												\
 	__typeof__(cusers) _cusers = (cusers);							\
 	trace_scoutfs_server_commit_##which(sb, !list_empty(&_cusers->holding),			\
-		!list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->avail_before,	\
-		_cusers->freed_before, _cusers->committing, _cusers->exceeded);			\
+		!list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->budget,		\
+		_cusers->avail_before, _cusers->freed_before, _cusers->committing,		\
+		_cusers->exceeded);								\
 } while (0)

 struct server_info {
@@ -303,7 +305,6 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
 	u32 freed_used;
 	u32 avail_now;
 	u32 freed_now;
-	u32 budget;

 	assert_spin_locked(&cusers->lock);

@@ -318,15 +319,14 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
 	else
 		freed_used = SCOUTFS_ALLOC_LIST_MAX_BLOCKS - freed_now;

-	budget = cusers->nr_holders * COMMIT_HOLD_ALLOC_BUDGET;
-	if (avail_used <= budget && freed_used <= budget)
+	if (avail_used <= cusers->budget && freed_used <= cusers->budget)
 		return;

 	exceeded_once = true;
 	cusers->exceeded = cusers->nr_holders;

-	scoutfs_err(sb, "%u holders exceeded alloc budget av: bef %u now %u, fr: bef %u now %u",
-		    cusers->nr_holders, cusers->avail_before, avail_now,
+	scoutfs_err(sb, "holders exceeded alloc budget %u av: bef %u now %u, fr: bef %u now %u",
+		    cusers->budget, cusers->avail_before, avail_now,
 		    cusers->freed_before, freed_now);

 	list_for_each_entry(hold, &cusers->holding, entry) {
@@ -349,7 +349,7 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
 {
 	bool has_room;
 	bool held;
-	u32 budget;
+	u32 new_budget;
 	u32 av;
 	u32 fr;

@@ -367,8 +367,8 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
 	}

 	/* +2 for our additional hold and then for the final commit work the server does */
-	budget = (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET;
-	has_room = av >= budget && fr >= budget;
+	new_budget = max(cusers->budget, (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET);
+	has_room = av >= new_budget && fr >= new_budget;
 	/* checking applying so holders drain once an apply caller starts waiting */
 	held = !cusers->committing && has_room && list_empty(&cusers->applying);

@@ -388,6 +388,7 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
 		list_add_tail(&hold->entry, &cusers->holding);

 		cusers->nr_holders++;
+		cusers->budget = new_budget;

 	} else if (!has_room && cusers->nr_holders == 0 && !cusers->committing) {
 		cusers->committing = true;
@@ -516,6 +517,7 @@ static void commit_end(struct super_block *sb, struct commit_users *cusers, int
 	list_for_each_entry_safe(hold, tmp, &cusers->applying, entry)
 		list_del_init(&hold->entry);
 	cusers->committing = false;
+	cusers->budget = 0;
 	spin_unlock(&cusers->lock);

 	wake_up(&cusers->waitq);
@@ -608,7 +610,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
 		goto out;

 	if (scoutfs_forcing_unmount(sb)) {
-		ret = -EIO;
+		ret = -ENOLINK;
 		goto out;
 	}

@@ -1038,6 +1040,101 @@ static int next_log_merge_item(struct super_block *sb,
 	return next_log_merge_item_key(sb, root, zone, &key, val, val_len);
 }

+static int do_finalize_ours(struct super_block *sb,
+			    struct scoutfs_log_trees *lt,
+			    struct commit_hold *hold)
+{
+	struct server_info *server = SCOUTFS_SB(sb)->server_info;
+	struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
+	struct scoutfs_key key;
+	char *err_str = NULL;
+	u64 rid = le64_to_cpu(lt->rid);
+	bool more;
+	int ret;
+	int err;
+
+	mutex_lock(&server->srch_mutex);
+	ret = scoutfs_srch_rotate_log(sb, &server->alloc, &server->wri,
+				      &super->srch_root, &lt->srch_file, true);
+	mutex_unlock(&server->srch_mutex);
+	if (ret < 0) {
+		scoutfs_err(sb, "error rotating srch log for rid %016llx: %d",
+			    rid, ret);
+		return ret;
+        }
+
+	do {
+		more = false;
+
+		/*
+		 * All of these can return errors, perhaps indicating successful
+		 * partial progress, after having modified the allocator trees.
+		 * We always have to update the roots in the log item.
+		 */
+		mutex_lock(&server->alloc_mutex);
+		ret = (err_str = "splice meta_freed to other_freed",
+				scoutfs_alloc_splice_list(sb, &server->alloc,
+					&server->wri, server->other_freed,
+					&lt->meta_freed)) ?:
+			(err_str = "splice meta_avail",
+			 scoutfs_alloc_splice_list(sb, &server->alloc,
+					&server->wri, server->other_freed,
+					&lt->meta_avail)) ?:
+			(err_str = "empty data_avail",
+			 alloc_move_empty(sb, &super->data_alloc,
+					  &lt->data_avail,
+					  COMMIT_HOLD_ALLOC_BUDGET / 2)) ?:
+			(err_str = "empty data_freed",
+			 alloc_move_empty(sb, &super->data_alloc,
+					  &lt->data_freed,
+					  COMMIT_HOLD_ALLOC_BUDGET / 2));
+		mutex_unlock(&server->alloc_mutex);
+
+		/*
+		 * only finalize, allowing merging, once the allocators are
+		 * fully freed
+		 */
+		if (ret == 0) {
+			/* the transaction is no longer open */
+			le64_add_cpu(&lt->flags, SCOUTFS_LOG_TREES_FINALIZED);
+			lt->finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb));
+		}
+
+		scoutfs_key_init_log_trees(&key, rid, le64_to_cpu(lt->nr));
+
+		err = scoutfs_btree_update(sb, &server->alloc, &server->wri,
+					   &super->logs_root, &key, lt,
+					   sizeof(*lt));
+		BUG_ON(err != 0); /* alloc, log, srch items out of sync */
+
+		if (ret == -EINPROGRESS) {
+			more = true;
+			mutex_unlock(&server->logs_mutex);
+			ret = server_apply_commit(sb, hold, 0);
+			if (ret < 0)
+				WARN_ON_ONCE(ret < 0);
+			server_hold_commit(sb, hold);
+			mutex_lock(&server->logs_mutex);
+		} else if (ret == 0) {
+			memset(&lt->item_root, 0, sizeof(lt->item_root));
+			memset(&lt->bloom_ref, 0, sizeof(lt->bloom_ref));
+			lt->inode_count_delta = 0;
+			lt->max_item_seq = 0;
+			lt->finalize_seq = 0;
+			le64_add_cpu(&lt->nr, 1);
+			lt->flags = 0;
+		}
+	} while (more);
+
+	if (ret < 0) {
+		scoutfs_err(sb,
+			    "error %d finalizing log trees for rid %016llx: %s",
+			    ret, rid, err_str);
+	}
+
+	return ret;
+}
+
 /*
 * Finalizing the log btrees for merging needs to be done carefully so
 * that items don't appear to go backwards in time.
@@ -1089,7 +1186,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 	struct scoutfs_log_merge_range rng;
 	struct scoutfs_mount_options opts;
 	struct scoutfs_log_trees each_lt;
-	struct scoutfs_log_trees fin;
 	unsigned int delay_ms;
 	unsigned long timeo;
 	bool saw_finalized;
@@ -1160,6 +1256,7 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 		/* done if we're not finalizing and there's no finalized */
 		if (!finalize_ours && !saw_finalized) {
 			ret = 0;
+			scoutfs_inc_counter(sb, log_merge_no_finalized);
 			break;
 		}

@@ -1194,32 +1291,11 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l

 		/* Finalize ours if it's visible to others */
 		if (ours_visible) {
-			fin = *lt;
-			memset(&fin.meta_avail, 0, sizeof(fin.meta_avail));
-			memset(&fin.meta_freed, 0, sizeof(fin.meta_freed));
-			memset(&fin.data_avail, 0, sizeof(fin.data_avail));
-			memset(&fin.data_freed, 0, sizeof(fin.data_freed));
-			memset(&fin.srch_file, 0, sizeof(fin.srch_file));
-			le64_add_cpu(&fin.flags, SCOUTFS_LOG_TREES_FINALIZED);
-			fin.finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb));
-
-			scoutfs_key_init_log_trees(&key, le64_to_cpu(fin.rid),
-						   le64_to_cpu(fin.nr));
-			ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
-						   &super->logs_root, &key, &fin,
-						   sizeof(fin));
+			ret = do_finalize_ours(sb, lt, hold);
 			if (ret < 0) {
-				err_str = "updating finalized log_trees";
+				err_str = "finalizing ours";
 				break;
 			}
-
-			memset(&lt->item_root, 0, sizeof(lt->item_root));
-			memset(&lt->bloom_ref, 0, sizeof(lt->bloom_ref));
-			lt->inode_count_delta = 0;
-			lt->max_item_seq = 0;
-			lt->finalize_seq = 0;
-			le64_add_cpu(&lt->nr, 1);
-			lt->flags = 0;
 		}

 		/* wait a bit for mounts to arrive */
@@ -1678,8 +1754,8 @@ unlock:

 	ret = server_apply_commit(sb, &hold, ret);
 	if (ret < 0)
-		scoutfs_err(sb, "server error %d committing client logs for rid %016llx: %s",
-			    ret, rid, err_str);
+		scoutfs_err(sb, "server error %d committing client logs for rid %016llx, nr %llu: %s",
+			    ret, rid, le64_to_cpu(lt.nr), err_str);
 out:
 	WARN_ON_ONCE(ret < 0);
 	return scoutfs_net_response(sb, conn, cmd, id, ret, NULL, 0);
@@ -1814,6 +1890,9 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
 out:
 	mutex_unlock(&server->logs_mutex);

+	if (ret == 0)
+		scoutfs_inc_counter(sb, reclaimed_open_logs);
+
 	if (ret < 0 && ret != -EINPROGRESS)
 		scoutfs_err(sb, "server error %d reclaiming log trees for rid %016llx: %s",
 			    ret, rid, err_str);
@@ -2531,7 +2610,7 @@ static void server_log_merge_free_work(struct work_struct *work)

 		ret = scoutfs_btree_free_blocks(sb, &server->alloc,
 						&server->wri, &fr.key,
-						&fr.root, COMMIT_HOLD_ALLOC_BUDGET / 2);
+						&fr.root, COMMIT_HOLD_ALLOC_BUDGET / 8);
 		if (ret < 0) {
 			err_str = "freeing log btree";
 			break;
@@ -2550,7 +2629,7 @@ static void server_log_merge_free_work(struct work_struct *work)
 		/* freed blocks are in allocator, we *have* to update fr */
 		BUG_ON(ret < 0);

-		if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
+		if (server_hold_alloc_used_since(sb, &hold) >= (COMMIT_HOLD_ALLOC_BUDGET * 3) / 4) {
 			mutex_unlock(&server->logs_mutex);
 			ret = server_apply_commit(sb, &hold, ret);
 			commit = false;
--- a/kmod/src/srch.c
+++ b/kmod/src/srch.c
@@ -62,7 +62,7 @@
 * re-allocated and re-written.  Search can restart by checking the
 * btree for the current set of files.  Compaction reads log files which
 * are protected from other compactions by the persistent busy items
- * created by the server.  Compaction won't see it's blocks reused out
+ * created by the server.  Compaction won't see its blocks reused out
 * from under it, but it can encounter stale cached blocks that need to
 * be invalidated.
 */
@@ -442,6 +442,10 @@ out:
 	if (ret == 0 && (flags & GFB_INSERT) && blk >= le64_to_cpu(sfl->blocks))
 		sfl->blocks = cpu_to_le64(blk + 1);

+	if (bl) {
+		trace_scoutfs_get_file_block(sb, bl->blkno, flags);
+	}
+
 	*bl_ret = bl;
 	return ret;
 }
@@ -533,23 +537,35 @@ out:
 * the pairs cancel each other out by all readers (the second encoding
 * looks like deletion) so they aren't visible to the first/last bounds of
 * the block or file.
+ *
+ * We use the same entry repeatedly, so the diff between them will be empty.
+ * This lets us just emit the two-byte count word, leaving the other bytes
+ * as zero.
+ *
+ * Split the desired total len into two pieces, adding any remainder to the
+ * first four-bit value.
 */
-static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk,
-			       struct scoutfs_srch_block *srb, struct scoutfs_srch_entry *sre)
+static void append_padded_entry(struct scoutfs_srch_file *sfl,
+				struct scoutfs_srch_block *srb,
+				int len)
 {
-	int ret;
+	int each;
+	int rem;
+	u16 lengths = 0;
+	u8 *buf = srb->entries + le32_to_cpu(srb->entry_bytes);

-	ret = encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes),
-			   sre, &srb->tail);
-	if (ret > 0) {
-		srb->tail = *sre;
-		le32_add_cpu(&srb->entry_nr, 1);
-		le32_add_cpu(&srb->entry_bytes, ret);
-		le64_add_cpu(&sfl->entries, 1);
-		ret = 0;
-	}
+	each = (len - 2) >> 1;
+	rem = (len - 2) & 1;

-	return ret;
+	lengths |= each + rem;
+	lengths |= each << 4;
+
+	memset(buf, 0, len);
+	put_unaligned_le16(lengths, buf);
+
+	le32_add_cpu(&srb->entry_nr, 1);
+	le32_add_cpu(&srb->entry_bytes, len);
+	le64_add_cpu(&sfl->entries, 1);
 }

 /*
@@ -560,61 +576,41 @@ static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk,
 * This is called when there is a single existing entry in the block.
 * We have the entire block to work with.  We encode pairs of matching
 * entries.  This hides them from readers (both searches and merging) as
- * they're interpreted as creation and deletion and are deleted.  We use
- * the existing hash value of the first entry in the block but then set
- * the inode to an impossibly large number so it doesn't interfere with
- * anything.
+ * they're interpreted as creation and deletion and are deleted.
 *
- * To hit the specific offset we very carefully manage the amount of
- * bytes of change between fields in the entry.  We know that if we
- * change all the byte of the ino and id we end up with a 20 byte
- * (2+8+8,2) encoding of the pair of entries.  To have the last entry
- * start at the _SAFE_POS offset we know that the final 20 byte pair
- * encoding needs to end at 2 bytes (second entry encoding) after the
- * _SAFE_POS offset.
+ * For simplicity and to maintain sort ordering within the block, we reuse
+ * the existing entry. This lets us skip the encoding step, because we know
+ * the diff will be zero. We can zero-pad the resulting entries to hit the
+ * target offset exactly.
 *
- * So as we encode pairs we watch the delta of our current offset from
- * that desired final offset of 2 past _SAFE_POS.  If we're a multiple
- * of 20 away then we encode the full 20 byte pairs.  If we're not, then
- * we drop a byte to encode 19 bytes.  That'll slowly change the offset
- * to be a multiple of 20 again while encoding large entries.
+ * Because we can't predict the exact number of entry_bytes when we start,
+ * we adjust the byte count of subsequent entries until we wind up at a
+ * multiple of 20 bytes away from our goal and then use that length for
+ * the remaining entries.
+ *
+ * We could just use a single pair of unnaturally large entries to consume
+ * the needed space, adjusting for an odd number of entry_bytes if necessary.
+ * The use of 19 or 20 bytes for the entry pair matches what we would see with
+ * real (non-zero) entries that vary from the existing entry.
 */
-static void pad_entries_at_safe(struct scoutfs_srch_file *sfl, u64 blk,
+static void pad_entries_at_safe(struct scoutfs_srch_file *sfl,
 				struct scoutfs_srch_block *srb)
 {
-	struct scoutfs_srch_entry sre;
 	u32 target;
 	s32 diff;
-	u64 hash;
-	u64 ino;
-	u64 id;
-	int ret;
-
-	hash = le64_to_cpu(srb->tail.hash);
-	ino = le64_to_cpu(srb->tail.ino) | (1ULL << 62);
-	id = le64_to_cpu(srb->tail.id);

 	target = SCOUTFS_SRCH_BLOCK_SAFE_BYTES + 2;

 	while ((diff = target - le32_to_cpu(srb->entry_bytes)) > 0) {
-		ino ^= 1ULL << (7 * 8);
+		append_padded_entry(sfl, srb, 10);
 		if (diff % 20 == 0) {
-			id ^= 1ULL << (7 * 8);
+			append_padded_entry(sfl, srb, 10);
 		} else {
-			id ^= 1ULL << (6 * 8);
+			append_padded_entry(sfl, srb, 9);
 		}
-
-		sre.hash = cpu_to_le64(hash);
-		sre.ino = cpu_to_le64(ino);
-		sre.id = cpu_to_le64(id);
-
-		ret = append_padded_entry(sfl, blk, srb, &sre);
-		if (ret == 0)
-			ret = append_padded_entry(sfl, blk, srb, &sre);
-		BUG_ON(ret != 0);
-
-		diff = target - le32_to_cpu(srb->entry_bytes);
 	}
+
+	WARN_ON_ONCE(diff != 0);
 }

 /*
@@ -749,14 +745,14 @@ static int search_log_file(struct super_block *sb,
 		for (i = 0; i < le32_to_cpu(srb->entry_nr); i++) {
 			if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
 				/* can only be inconsistency :/ */
-				ret = EIO;
+				ret = -EIO;
 				break;
 			}

 			ret = decode_entry(srb->entries + pos, &sre, &prev);
 			if (ret <= 0) {
 				/* can only be inconsistency :/ */
-				ret = EIO;
+				ret = -EIO;
 				break;
 			}
 			pos += ret;
@@ -859,14 +855,14 @@ static int search_sorted_file(struct super_block *sb,

 		if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
 			/* can only be inconsistency :/ */
-			ret = EIO;
+			ret = -EIO;
 			break;
 		}

 		ret = decode_entry(srb->entries + pos, &sre, &prev);
 		if (ret <= 0) {
 			/* can only be inconsistency :/ */
-			ret = EIO;
+			ret = -EIO;
 			break;
 		}
 		pos += ret;
@@ -972,6 +968,8 @@ int scoutfs_srch_search_xattrs(struct super_block *sb,

 	scoutfs_inc_counter(sb, srch_search_xattrs);

+	trace_scoutfs_ioc_search_xattrs(sb, ino, last_ino);
+
 	*done = false;
 	srch_init_rb_root(sroot);

@@ -1669,7 +1667,7 @@ static int kway_merge(struct super_block *sb,
 			/* end sorted block on _SAFE offset for testing */
 			if (bl && le32_to_cpu(srb->entry_nr) == 1 && logs_input &&
 			    scoutfs_trigger(sb, SRCH_COMPACT_LOGS_PAD_SAFE)) {
-				pad_entries_at_safe(sfl, blk, srb);
+				pad_entries_at_safe(sfl, srb);
 				scoutfs_block_put(sb, bl);
 				bl = NULL;
 				blk++;
@@ -1802,7 +1800,7 @@ static void swap_page_sre(void *A, void *B, int size)
 * typically, ~10x worst case).
 *
 * Because we read and sort all the input files we must perform the full
- * compaction in one operation.  The server must have given us a
+ * compaction in one operation.  The server must have given us
 * sufficiently large avail/freed lists, otherwise we'll return ENOSPC.
 */
 static int compact_logs(struct super_block *sb,
@@ -1866,14 +1864,14 @@ static int compact_logs(struct super_block *sb,

 		if (pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
 			/* can only be inconsistency :/ */
-			ret = EIO;
+			ret = -EIO;
 			break;
 		}

 		ret = decode_entry(srb->entries + pos, sre, &prev);
 		if (ret <= 0) {
 			/* can only be inconsistency :/ */
-			ret = EIO;
+			ret = -EIO;
 			goto out;
 		}
 		prev = *sre;
--- a/kmod/src/trans.c
+++ b/kmod/src/trans.c
@@ -196,7 +196,7 @@ static int retry_forever(struct super_block *sb, int (*func)(struct super_block
 			}

 			if (scoutfs_forcing_unmount(sb)) {
-				ret = -EIO;
+				ret = -ENOLINK;
 				break;
 			}

@@ -252,7 +252,7 @@ void scoutfs_trans_write_func(struct work_struct *work)
 	}

 	if (scoutfs_forcing_unmount(sb)) {
-		ret = -EIO;
+		ret = -ENOLINK;
 		goto out;
 	}

--- a/kmod/src/xattr.c
+++ b/kmod/src/xattr.c
@@ -742,7 +742,7 @@ int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_
 	int ret;
 	int err;

-	trace_scoutfs_xattr_set(sb, name_len, value, size, flags);
+	trace_scoutfs_xattr_set(sb, ino, name_len, value, size, flags);

 	if (WARN_ON_ONCE(tgs->totl && tgs->indx) ||
 	    WARN_ON_ONCE((tgs->totl | tgs->indx) && !tag_lock))
--- a/tests/funcs/filter.sh
+++ b/tests/funcs/filter.sh
@@ -140,6 +140,9 @@ t_filter_dmesg()
 	re="$re|scoutfs .* error.*server failed to bind to.*"
 	re="$re|scoutfs .* critical transaction commit failure.*"

+	# ENOLINK (-67) indicates an expected forced unmount error
+	re="$re|scoutfs .* error -67 .*"
+
 	# change-devices causes loop device resizing
 	re="$re|loop: module loaded"
 	re="$re|loop[0-9].* detected capacity change from.*"
--- a/tests/golden/large-fragmented-free
+++ b/tests/golden/large-fragmented-free
@@ -1,4 +1,3 @@
-== setting longer hung task timeout
 == creating fragmented extents
 == unlink file with moved extents to free extents per block
 == cleanup
--- a/tests/golden/offline-extent-waiting
+++ b/tests/golden/offline-extent-waiting
@@ -49,7 +49,7 @@ offline wating should be empty:
 0
 == truncating does wait
 truncate should be waiting for first block:
-trunate should no longer be waiting:
+truncate should no longer be waiting:
 0
 == writing waits
 should be waiting for write
--- a/tests/tests/format-version-forward-back.sh
+++ b/tests/tests/format-version-forward-back.sh
@@ -11,8 +11,8 @@
 # format version.
 #

-# not supported on el9!
-if [ $(source /etc/os-release ; echo ${VERSION_ID:0:1}) -gt 8 ]; then
+# not supported on el8 or higher (compare the full major version: el10+ too)
+if [ "$(source /etc/os-release ; echo "${VERSION_ID%%.*}")" -gt 7 ]; then
 	t_skip_permitted "Unsupported OS version"
 fi

--- a/tests/tests/large-fragmented-free.sh
+++ b/tests/tests/large-fragmented-free.sh
@@ -10,30 +10,6 @@ EXTENTS_PER_BTREE_BLOCK=600
 EXTENTS_PER_LIST_BLOCK=8192
 FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))

-#
-# This test specifically creates a pathologically sparse file that will
-# be as expensive as possible to free.  This is usually fine on
-# dedicated or reasonable hardware, but trying to run this in
-# virtualized debug kernels can take a very long time.  This test is
-# about making sure that the server doesn't fail, not that the platform
-# can handle the scale of work that our btree formats happen to require
-# while execution is bogged down with use-after-free memory reference
-# tracking.  So we give the test a lot more breathing room before
-# deciding that its hung.
-#
-echo "== setting longer hung task timeout"
-if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
-	secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
-	test "$secs" -gt 0 || \
-		t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
-	restore_hung_task_timeout()
-	{
-		echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
-	}
-	trap restore_hung_task_timeout EXIT
-	echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
-fi
-
 echo "== creating fragmented extents"
 fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"

--- a/tests/tests/offline-extent-waiting.sh
+++ b/tests/tests/offline-extent-waiting.sh
@@ -157,7 +157,7 @@ echo "truncate should be waiting for first block:"
 expect_wait "$DIR/file" "change_size" $ino 0
 scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES
 sleep .1
-echo "trunate should no longer be waiting:"
+echo "truncate should no longer be waiting:"
 scoutfs data-waiting -B 0 -I 0 -p "$DIR" | wc -l
 cat "$DIR/golden" > "$DIR/file"
 vers=$(scoutfs stat -s data_version "$DIR/file")
@@ -168,10 +168,13 @@ scoutfs release "$DIR/file" -V "$vers" -o 0 -l $BYTES
 # overwrite, not truncate+write
 dd if="$DIR/other" of="$DIR/file" \
 	bs=$BS count=$BLOCKS conv=notrunc status=none &
+pid="$!"
 sleep .1
 echo "should be waiting for write"
 expect_wait "$DIR/file" "write" $ino 0
 scoutfs stage "$DIR/golden" "$DIR/file" -V "$vers" -o 0 -l $BYTES
+# wait for the background dd to complete
+wait "$pid" 2> /dev/null
 cmp "$DIR/file" "$DIR/other"

 echo "== cleanup"
--- a/tests/tests/orphan-inodes.sh
+++ b/tests/tests/orphan-inodes.sh
@@ -67,18 +67,49 @@ t_mount_all
 while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
 	sleep .5
 done
-# wait for orphan scans to run
-t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000
-# also have to wait for delayed log merge work from mount
-C=120
-while (( C-- )); do
-	brk=1
-	for ino in $inos; do
-		inode_exists $ino && brk=0
-	done
-	test $brk -eq 1 && break
+
+
+sv=$(t_server_nr)
+
+# wait for reclaim_open_log_tree() to complete for each mount
+while [ $(t_counter reclaimed_open_logs $sv) -lt $T_NR_MOUNTS ]; do
 	sleep 1
 done
+
+# wait for finalize_and_start_log_merge() to find no active merges in flight
+# and not find any finalized trees
+while [ $(t_counter log_merge_no_finalized $sv) -lt 1 ]; do
+	sleep 1
+done
+
+# wait for orphan scans to run
+t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000
+# wait until we see two consecutive orphan scan attempts without
+# any inode deletion forward progress in each mount
+for nr in $(t_fs_nrs); do
+	C=0
+	LOSA=$(t_counter orphan_scan_attempts $nr)
+	LDOP=$(t_counter inode_deleted $nr)
+
+	while [ $C -lt 2 ]; do
+		sleep 1
+
+		OSA=$(t_counter orphan_scan_attempts $nr)
+		DOP=$(t_counter inode_deleted $nr)
+
+		if [ $OSA != $LOSA ]; then
+			if [ $DOP == $LDOP ]; then
+				(( C++ ))
+			else
+				C=0
+			fi
+		fi
+
+		LOSA=$OSA
+		LDOP=$DOP
+	done
+done
+
 for ino in $inos; do
 	inode_exists $ino && echo "$ino still exists"
 done
--- a/utils/sparse.sh
+++ b/utils/sparse.sh
@@ -22,6 +22,11 @@ RE="$RE|warning: memset with byte count of 4194304"
 # some sparse versions don't know about some builtins
 RE="$RE|error: undefined identifier '__builtin_fpclassify'"

+# on el8, sparse can't handle __has_include for some reason when _GNU_SOURCE
+# is defined, and we need that for O_DIRECT.
+RE="$RE|note: in included file .through /usr/include/sys/stat.h.:"
+RE="$RE|/usr/include/bits/statx.h:30:6: error: "
+
 #
 # don't filter out 'too many errors' here, it can signify that
 # sparse doesn't understand something and is throwing a *ton*