Compare commits


1 Commit

Author: Zach Brown
SHA1: a7ed6bf242
Message: Add force to prepare-empty-data-device
Signed-off-by: Zach Brown <zab@versity.com>
Date: 2023-11-02 18:05:51 -07:00
29 changed files with 392 additions and 1205 deletions

View File

@@ -1,41 +1,6 @@
Versity ScoutFS Release Notes
=============================
---
v1.20
\
*Apr 22, 2024*
Minor changes to packaging to better support "weak" module linking of
the kernel module, and to include git hashes in the built package. No
changes in runtime behaviour.
---
v1.19
\
*Jan 30, 2024*
Added the log\_merge\_wait\_timeout\_ms mount option to set the timeout
for creating log merge operations. The previous timeout, now the
default, was too short for some systems and resulted in consistent
timeouts, which created an excessive number of log trees waiting to be
merged.
Improved performance of many in-mount server operations when there are a
large number of log trees waiting to be merged.
---
v1.18
\
*Nov 7, 2023*
Fixed a bug where background srch file compaction could stop making
forward progress if a partial compaction operation was committed at a
specific byte offset in a block. This would cause srch file searches to
be progressively more expensive over time. Once this fix is running,
background compaction will resume, bringing the cost of searches back
down.
---
v1.17
\

View File

@@ -12,22 +12,17 @@ else
SP = @:
endif
SCOUTFS_GIT_DESCRIBE ?= \
SCOUTFS_GIT_DESCRIBE := \
$(shell git describe --all --abbrev=6 --long 2>/dev/null || \
echo no-git)
ESCAPED_GIT_DESCRIBE := \
$(shell echo $(SCOUTFS_GIT_DESCRIBE) |sed -e 's/\//\\\//g')
RPM_GITHASH ?= $(shell git rev-parse --short HEAD)
SCOUTFS_ARGS := SCOUTFS_GIT_DESCRIBE=$(SCOUTFS_GIT_DESCRIBE) \
RPM_GITHASH=$(RPM_GITHASH) \
CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(CURDIR)/src \
EXTRA_CFLAGS="-Werror"
# - We use the git describe from tags to set up the RPM versioning
RPM_VERSION := $(shell git describe --long --tags | awk -F '-' '{gsub(/^v/,""); print $$1}')
RPM_GITHASH := $(shell git rev-parse --short HEAD)
TARFILE = scoutfs-kmod-$(RPM_VERSION).tar
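For example (hypothetical tag state): git describe --long --tags might print v1.17-12-g3ab9f2; the awk split on '-' keeps the first field and strips the leading "v", so RPM_VERSION becomes 1.17, RPM_GITHASH is the short HEAD hash, and the tarball is scoutfs-kmod-1.17.tar.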
@@ -46,8 +41,7 @@ modules_install:
%.spec: %.spec.in .FORCE
sed -e 's/@@VERSION@@/$(RPM_VERSION)/g' \
-e 's/@@GITHASH@@/$(RPM_GITHASH)/g' \
-e 's/@@GITDESCRIBE@@/$(ESCAPED_GIT_DESCRIBE)/g' < $< > $@+
-e 's/@@GITHASH@@/$(RPM_GITHASH)/g' < $< > $@+
mv $@+ $@

View File

@@ -1,7 +1,6 @@
%define kmod_name scoutfs
%define kmod_version @@VERSION@@
%define kmod_git_hash @@GITHASH@@
%define kmod_git_describe @@GITDESCRIBE@@
%define pkg_date %(date +%%Y%%m%%d)
# Disable the building of the debug package(s).
@@ -76,7 +75,7 @@ echo "Building for kernel: %{kernel_version} flavors: '%{flavors_to_build}'"
for flavor in %flavors_to_build; do
rm -rf obj/$flavor
cp -r source obj/$flavor
make RPM_GITHASH=%{kmod_git_hash} SCOUTFS_GIT_DESCRIBE=%{kmod_git_describe} SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
make SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
done
%install
@@ -98,21 +97,10 @@ find %{buildroot} -type f -name \*.ko -exec %{__chmod} u+x \{\} \;
/lib/modules
%post
echo /lib/modules/%{kversion}/%{install_mod_dir}/scoutfs.ko | weak-modules --add-modules --no-initramfs
weak-modules --add-kernel --no-initramfs
depmod -a
%endif
%clean
rm -rf %{buildroot}
%preun
# stash our modules for postun cleanup
SCOUTFS_RPM_NAME=$(rpm -q %{name} | grep "%{version}-%{release}")
rpm -ql $SCOUTFS_RPM_NAME | grep '\.ko$' > /var/run/%{name}-modules-%{version}-%{release} || true
%postun
if [ -x /sbin/weak-modules ]; then
cat /var/run/%{name}-modules-%{version}-%{release} | /sbin/weak-modules --remove-modules --no-initramfs
fi
rm /var/run/%{name}-modules-%{version}-%{release} || true

View File

@@ -2029,253 +2029,187 @@ int scoutfs_btree_rebalance(struct super_block *sb,
key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL);
}
struct merged_range {
struct scoutfs_key start;
struct scoutfs_key end;
struct rb_root root;
int size;
};
struct merged_item {
struct merge_pos {
struct rb_node node;
struct scoutfs_key key;
struct scoutfs_btree_root *root;
struct scoutfs_block *bl;
struct scoutfs_btree_block *bt;
struct scoutfs_avl_node *avl;
struct scoutfs_key *key;
u64 seq;
u8 flags;
unsigned int val_len;
u8 val[0];
u8 *val;
};
static inline struct merged_item *mitem_container(struct rb_node *node)
static struct merge_pos *first_mpos(struct rb_root *root)
{
return node ? container_of(node, struct merged_item, node) : NULL;
}
static inline struct merged_item *first_mitem(struct rb_root *root)
{
return mitem_container(rb_first(root));
}
static inline struct merged_item *last_mitem(struct rb_root *root)
{
return mitem_container(rb_last(root));
}
static inline struct merged_item *next_mitem(struct merged_item *mitem)
{
return mitem_container(mitem ? rb_next(&mitem->node) : NULL);
}
static inline struct merged_item *prev_mitem(struct merged_item *mitem)
{
return mitem_container(mitem ? rb_prev(&mitem->node) : NULL);
}
static struct merged_item *find_mitem(struct rb_root *root, struct scoutfs_key *key,
struct rb_node **parent_ret, struct rb_node ***link_ret)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct merged_item *mitem;
int cmp;
while (*node) {
parent = *node;
mitem = container_of(*node, struct merged_item, node);
cmp = scoutfs_key_compare(key, &mitem->key);
if (cmp < 0) {
node = &(*node)->rb_left;
} else if (cmp > 0) {
node = &(*node)->rb_right;
} else {
*parent_ret = NULL;
*link_ret = NULL;
return mitem;
}
}
*parent_ret = parent;
*link_ret = node;
struct rb_node *node = rb_first(root);
if (node)
return container_of(node, struct merge_pos, node);
return NULL;
}
static void insert_mitem(struct merged_range *rng, struct merged_item *mitem,
struct rb_node *parent, struct rb_node **link)
static struct merge_pos *next_mpos(struct merge_pos *mpos)
{
rb_link_node(&mitem->node, parent, link);
rb_insert_color(&mitem->node, &rng->root);
rng->size += item_len_bytes(mitem->val_len);
struct rb_node *node;
if (mpos && (node = rb_next(&mpos->node)))
return container_of(node, struct merge_pos, node);
else
return NULL;
}
static void replace_mitem(struct merged_range *rng, struct merged_item *victim,
struct merged_item *new)
static void free_mpos(struct super_block *sb, struct merge_pos *mpos)
{
rb_replace_node(&victim->node, &new->node, &rng->root);
RB_CLEAR_NODE(&victim->node);
rng->size -= item_len_bytes(victim->val_len);
rng->size += item_len_bytes(new->val_len);
scoutfs_block_put(sb, mpos->bl);
kfree(mpos);
}
static void free_mitem(struct merged_range *rng, struct merged_item *mitem)
static void insert_mpos(struct rb_root *pos_root, struct merge_pos *ins)
{
if (IS_ERR_OR_NULL(mitem))
return;
struct rb_node **node = &pos_root->rb_node;
struct rb_node *parent = NULL;
struct merge_pos *mpos;
int cmp;
if (!RB_EMPTY_NODE(&mitem->node)) {
rng->size -= item_len_bytes(mitem->val_len);
rb_erase(&mitem->node, &rng->root);
parent = NULL;
while (*node) {
parent = *node;
mpos = container_of(*node, struct merge_pos, node);
/* sort merge items by key then newest to oldest */
cmp = scoutfs_key_compare(ins->key, mpos->key) ?:
-scoutfs_cmp(ins->seq, mpos->seq);
if (cmp < 0)
node = &(*node)->rb_left;
else
node = &(*node)->rb_right;
}
kfree(mitem);
}
static void trim_range_size(struct merged_range *rng, int merge_window)
{
struct merged_item *mitem;
struct merged_item *tmp;
mitem = last_mitem(&rng->root);
while (mitem && rng->size > merge_window) {
rng->end = mitem->key;
scoutfs_key_dec(&rng->end);
tmp = mitem;
mitem = prev_mitem(mitem);
free_mitem(rng, tmp);
}
}
static void trim_range_end(struct merged_range *rng)
{
struct merged_item *mitem;
struct merged_item *tmp;
mitem = last_mitem(&rng->root);
while (mitem && scoutfs_key_compare(&mitem->key, &rng->end) > 0) {
tmp = mitem;
mitem = prev_mitem(mitem);
free_mitem(rng, tmp);
}
rb_link_node(&ins->node, parent, node);
rb_insert_color(&ins->node, pos_root);
}
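For illustration, a sketch of the ordering insert_mpos() produces, assuming scoutfs_cmp() is an ordinary three-way compare (the keys and seqs below are made up):
/*
 * Example pos_root contents after inserting positions from three
 * input roots (hypothetical keys and seqs):
 *
 *   { key = A, seq = 7 }   <- first_mpos(): newest version of A
 *   { key = A, seq = 3 }
 *   { key = B, seq = 5 }
 *
 * Equal keys sort newest-first, so the merge always sees the newest
 * version of a key at the front and can combine or skip the older
 * versions that follow it.
 */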
/*
* Record and combine logged items from log roots for merging with the
* writable destination root. The caller is responsible for trimming
* the range if it gets too large or if the key range shrinks.
* Find the next item in the merge_pos root in the caller's range and
* insert it into the rbtree sorted by key and version so that merging
* can find the next newest item at the front of the rbtree. We free
* the mpos on error or if there are no more items in the range.
*/
static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags,
void *val, int val_len, void *arg)
static int reset_mpos(struct super_block *sb, struct rb_root *pos_root, struct merge_pos *mpos,
struct scoutfs_key *start, struct scoutfs_key *end)
{
struct merged_range *rng = arg;
struct merged_item *mitem;
struct merged_item *found;
struct rb_node *parent;
struct rb_node **link;
int ret;
struct scoutfs_btree_item *item;
struct scoutfs_avl_node *next;
struct btree_walk_key_range kr;
struct scoutfs_key walk_key;
int ret = 0;
found = find_mitem(&rng->root, key, &parent, &link);
if (found) {
ret = scoutfs_forest_combine_deltas(key, found->val, found->val_len, val, val_len);
if (ret < 0)
goto out;
if (ret > 0) {
if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
free_mitem(rng, found);
}
ret = 0;
goto out;
}
if (found->seq >= seq) {
ret = 0;
goto out;
}
/* always erase before freeing or inserting */
if (!RB_EMPTY_NODE(&mpos->node)) {
rb_erase(&mpos->node, pos_root);
RB_CLEAR_NODE(&mpos->node);
}
mitem = kmalloc(offsetof(struct merged_item, val[val_len]), GFP_NOFS);
if (!mitem) {
ret = -ENOMEM;
/*
* advance to next item via the avl tree. The caller's pos is
* only ever incremented past the last key so we can use next to
* iterate rather than using search to skip past multiple items.
*/
if (mpos->avl)
mpos->avl = scoutfs_avl_next(&mpos->bt->item_root, mpos->avl);
/* find the next leaf with the key if we run out of items */
walk_key = *start;
while (!mpos->avl && !scoutfs_key_is_zeros(&walk_key)) {
scoutfs_block_put(sb, mpos->bl);
mpos->bl = NULL;
ret = btree_walk(sb, NULL, NULL, mpos->root, BTW_NEXT, &walk_key,
0, &mpos->bl, &kr, NULL);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
free_mpos(sb, mpos);
goto out;
}
mpos->bt = mpos->bl->data;
mpos->avl = scoutfs_avl_search(&mpos->bt->item_root, cmp_key_item,
start, NULL, NULL, &next, NULL) ?: next;
if (mpos->avl == NULL)
walk_key = kr.iter_next;
}
/* see if we're out of items within the range */
item = node_item(mpos->avl);
if (!item || scoutfs_key_compare(item_key(item), end) > 0) {
free_mpos(sb, mpos);
ret = 0;
goto out;
}
mitem->key = *key;
mitem->seq = seq;
mitem->flags = flags;
mitem->val_len = val_len;
if (val_len)
memcpy(mitem->val, val, val_len);
if (found) {
replace_mitem(rng, found, mitem);
free_mitem(rng, found);
} else {
insert_mitem(rng, mitem, parent, link);
}
/* insert the next item within range at its version */
mpos->key = item_key(item);
mpos->seq = le64_to_cpu(item->seq);
mpos->flags = item->flags;
mpos->val_len = item_val_len(item);
mpos->val = item_val(mpos->bt, item);
insert_mpos(pos_root, mpos);
ret = 0;
out:
return ret;
}
/*
* Read a range of merged items. The caller has set the key bounds of
* the range. We read a merge window's worth of items from blocks in
* each input btree.
* The caller has reset all the merge positions for all the input log
* btree roots and wants the next logged item it should try to merge
* with the items in the fs_root.
*
* The caller can only use the smallest range that overlaps with all the
* blocks that we read. We start reading from the range's start key so
* it will always be present and we don't need to adjust it. The final
* block we read from each input might not cover the range's end so it
* needs to be adjusted.
*
* The end range can also shrink if we have to drop items because the
* items exceeded the merge window size.
* We look ahead in the logged item stream to see if we should merge any
* older logged delta items into one result for the caller. We also
* take this opportunity to skip and reset the mpos for any older
* versions of the first item.
*/
static int read_merged_range(struct super_block *sb, struct merged_range *rng,
struct list_head *inputs, int merge_window)
static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
struct scoutfs_key *end, struct merge_pos **mpos_ret)
{
struct scoutfs_btree_root_head *rhead;
struct scoutfs_key start;
struct scoutfs_key end;
struct merge_pos *mpos;
struct merge_pos *next;
struct scoutfs_key key;
int ret = 0;
int i;
list_for_each_entry(rhead, inputs, head) {
key = rng->start;
while ((mpos = first_mpos(pos_root)) && (next = next_mpos(mpos)) &&
!scoutfs_key_compare(mpos->key, next->key)) {
for (i = 0; i < merge_window; i += SCOUTFS_BLOCK_LG_SIZE) {
start = key;
end = rng->end;
ret = scoutfs_btree_read_items(sb, &rhead->root, &key, &start, &end,
merge_read_item, rng);
ret = scoutfs_forest_combine_deltas(mpos->key, mpos->val, mpos->val_len,
next->val, next->val_len);
if (ret < 0)
break;
/* reset advances to the next item */
key = *mpos->key;
scoutfs_key_inc(&key);
/* always skip next combined or older version */
ret = reset_mpos(sb, pos_root, next, &key, end);
if (ret < 0)
break;
if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
/* if merging resulted in no info, skip current */
ret = reset_mpos(sb, pos_root, mpos, &key, end);
if (ret < 0)
goto out;
if (scoutfs_key_compare(&end, &rng->end) >= 0)
break;
key = end;
scoutfs_key_inc(&key);
}
if (scoutfs_key_compare(&end, &rng->end) < 0) {
rng->end = end;
trim_range_end(rng);
}
if (rng->size > merge_window)
trim_range_size(rng, merge_window);
}
trace_scoutfs_btree_merge_read_range(sb, &rng->start, &rng->end, rng->size);
ret = 0;
out:
*mpos_ret = mpos;
return ret;
}
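As a rough walkthrough of that lookahead (illustrative values; the return codes are the ones used above):
/*
 * Hypothetical front of pos_root while resolving:
 *
 *   { key = A, seq = 9, delta }   <- mpos
 *   { key = A, seq = 4, delta }   <- next, same key
 *   { key = B, seq = 6 }
 *
 * scoutfs_forest_combine_deltas() combines the seq 4 delta into the
 * seq 9 value and the older position is reset past key A.  If the
 * combined result carries no information (SCOUTFS_DELTA_COMBINED_NULL)
 * the seq 9 position is reset past A as well, leaving key B at the
 * front for the caller.
 */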
@@ -2292,13 +2226,6 @@ out:
* to allocators running low or needing to join/split the parent.
* *next_ret is set to the next key which hasn't been merged so that the
* caller can retry with a new allocator and subtree.
*
* The number of input roots can be immense. The merge_window specifies
* the size of the set of merged items that we'll maintain as we iterate
* over all the input roots. Once we've merged items into the window
* from all the input roots the merged input items are then merged to
* the writable destination root. It may take multiple passes of
* windows of merged items to cover the input key range.
*/
int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_alloc *alloc,
@@ -2308,16 +2235,18 @@ int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_key *next_ret,
struct scoutfs_btree_root *root,
struct list_head *inputs,
bool subtree, int dirty_limit, int alloc_low, int merge_window)
bool subtree, int dirty_limit, int alloc_low)
{
struct scoutfs_btree_root_head *rhead;
struct rb_root pos_root = RB_ROOT;
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_block *bl = NULL;
struct btree_walk_key_range kr;
struct scoutfs_avl_node *par;
struct merged_item *mitem;
struct merged_item *tmp;
struct merged_range rng;
struct scoutfs_key next;
struct merge_pos *mpos;
struct merge_pos *tmp;
int walk_val_len;
int walk_flags;
bool is_del;
@@ -2328,59 +2257,49 @@ int scoutfs_btree_merge(struct super_block *sb,
trace_scoutfs_btree_merge(sb, root, start, end);
scoutfs_inc_counter(sb, btree_merge);
list_for_each_entry(rhead, inputs, head) {
mpos = kzalloc(sizeof(*mpos), GFP_NOFS);
if (!mpos) {
ret = -ENOMEM;
goto out;
}
RB_CLEAR_NODE(&mpos->node);
mpos->root = &rhead->root;
ret = reset_mpos(sb, &pos_root, mpos, start, end);
if (ret < 0)
goto out;
}
walk_flags = BTW_DIRTY;
if (subtree)
walk_flags |= BTW_SUBTREE;
walk_val_len = 0;
rng.start = *start;
rng.end = *end;
rng.root = RB_ROOT;
rng.size = 0;
ret = read_merged_range(sb, &rng, inputs, merge_window);
if (ret < 0)
goto out;
for (;;) {
/* read next window as it empties (and it is possible to read an empty range) */
mitem = first_mitem(&rng.root);
if (!mitem) {
/* done if the read range hit the end */
if (scoutfs_key_compare(&rng.end, end) >= 0)
break;
/* read next batch of merged items */
rng.start = rng.end;
scoutfs_key_inc(&rng.start);
rng.end = *end;
ret = read_merged_range(sb, &rng, inputs, merge_window);
if (ret < 0)
break;
continue;
}
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) {
scoutfs_inc_counter(sb, btree_merge_dirty_limit);
ret = -ERANGE;
*next_ret = mitem->key;
*next_ret = *mpos->key;
goto out;
}
if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) {
scoutfs_inc_counter(sb, btree_merge_alloc_low);
ret = -ERANGE;
*next_ret = mitem->key;
*next_ret = *mpos->key;
goto out;
}
scoutfs_block_put(sb, bl);
bl = NULL;
ret = btree_walk(sb, alloc, wri, root, walk_flags,
&mitem->key, walk_val_len, &bl, &kr, NULL);
mpos->key, walk_val_len, &bl, &kr, NULL);
if (ret < 0) {
if (ret == -ERANGE)
*next_ret = mitem->key;
*next_ret = *mpos->key;
goto out;
}
bt = bl->data;
@@ -2392,21 +2311,22 @@ int scoutfs_btree_merge(struct super_block *sb,
continue;
}
while (mitem) {
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
/* walk to new leaf if we exceed parent ref key */
if (scoutfs_key_compare(&mitem->key, &kr.end) > 0)
if (scoutfs_key_compare(mpos->key, &kr.end) > 0)
break;
/* see if there's an existing item */
item = leaf_item_hash_search(sb, bt, &mitem->key);
is_del = !!(mitem->flags & SCOUTFS_ITEM_FLAG_DELETION);
item = leaf_item_hash_search(sb, bt, mpos->key);
is_del = !!(mpos->flags & SCOUTFS_ITEM_FLAG_DELETION);
/* see if we're merging delta items */
if (item && !is_del)
delta = scoutfs_forest_combine_deltas(&mitem->key,
delta = scoutfs_forest_combine_deltas(mpos->key,
item_val(bt, item),
item_val_len(item),
mitem->val, mitem->val_len);
mpos->val, mpos->val_len);
else
delta = 0;
if (delta < 0) {
@@ -2418,38 +2338,40 @@ int scoutfs_btree_merge(struct super_block *sb,
scoutfs_inc_counter(sb, btree_merge_delta_null);
}
trace_scoutfs_btree_merge_items(sb, &mitem->key, mitem->val_len,
trace_scoutfs_btree_merge_items(sb, mpos->root,
mpos->key, mpos->val_len,
item ? root : NULL,
item ? item_key(item) : NULL,
item ? item_val_len(item) : 0, is_del);
/* rewalk and split if ins/update needs room */
if (!is_del && !delta && !mid_free_item_room(bt, mitem->val_len)) {
if (!is_del && !delta && !mid_free_item_room(bt, mpos->val_len)) {
walk_flags |= BTW_INSERT;
walk_val_len = mitem->val_len;
walk_val_len = mpos->val_len;
break;
}
/* insert missing non-deletion merge items */
if (!item && !is_del) {
scoutfs_avl_search(&bt->item_root, cmp_key_item, &mitem->key,
scoutfs_avl_search(&bt->item_root,
cmp_key_item, mpos->key,
&cmp, &par, NULL, NULL);
create_item(bt, &mitem->key, mitem->seq, mitem->flags,
mitem->val, mitem->val_len, par, cmp);
create_item(bt, mpos->key, mpos->seq, mpos->flags,
mpos->val, mpos->val_len, par, cmp);
scoutfs_inc_counter(sb, btree_merge_insert);
}
/* update existing items */
if (item && !is_del && !delta) {
item->seq = cpu_to_le64(mitem->seq);
item->flags = mitem->flags;
update_item_value(bt, item, mitem->val, mitem->val_len);
item->seq = cpu_to_le64(mpos->seq);
item->flags = mpos->flags;
update_item_value(bt, item, mpos->val, mpos->val_len);
scoutfs_inc_counter(sb, btree_merge_update);
}
/* update combined delta item seq */
if (delta == SCOUTFS_DELTA_COMBINED) {
item->seq = cpu_to_le64(mitem->seq);
item->seq = cpu_to_le64(mpos->seq);
}
/*
@@ -2481,18 +2403,21 @@ int scoutfs_btree_merge(struct super_block *sb,
walk_flags &= ~(BTW_INSERT | BTW_DELETE);
walk_val_len = 0;
/* finished with this merged item */
tmp = mitem;
mitem = next_mitem(mitem);
free_mitem(&rng, tmp);
/* finished with this key, skip any older items */
next = *mpos->key;
scoutfs_key_inc(&next);
ret = reset_mpos(sb, &pos_root, mpos, &next, end);
if (ret < 0)
goto out;
}
}
ret = 0;
out:
scoutfs_block_put(sb, bl);
rbtree_postorder_for_each_entry_safe(mitem, tmp, &rng.root, node)
free_mitem(&rng, mitem);
rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) {
free_mpos(sb, mpos);
}
return ret;
}
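A condensed sketch of the rewritten merge flow, paraphrasing the code above (error handling and the dirty/alloc limit checks are omitted):
/*
 * for each input root:
 *         allocate a merge_pos and reset_mpos() it to the start key
 *
 * while next_resolved_mpos() returns a position:
 *         btree_walk() to the dirty leaf covering its key
 *         while the position's key is within the leaf's parent range:
 *                 insert, update, delete, or combine against the leaf
 *                 reset_mpos() the position past the key just merged
 */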

View File

@@ -119,7 +119,7 @@ int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_key *next_ret,
struct scoutfs_btree_root *root,
struct list_head *input_list,
bool subtree, int dirty_limit, int alloc_low, int merge_window);
bool subtree, int dirty_limit, int alloc_low);
int scoutfs_btree_free_blocks(struct super_block *sb,
struct scoutfs_alloc *alloc,

View File

@@ -145,7 +145,6 @@
EXPAND_COUNTER(lock_shrink_work) \
EXPAND_COUNTER(lock_unlock) \
EXPAND_COUNTER(lock_wait) \
EXPAND_COUNTER(log_merge_wait_timeout) \
EXPAND_COUNTER(net_dropped_response) \
EXPAND_COUNTER(net_send_bytes) \
EXPAND_COUNTER(net_send_error) \

View File

@@ -721,8 +721,7 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work)
ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end,
&next, &comp.root, &inputs,
!!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)),
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10,
(2 * 1024 * 1024));
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10);
if (ret == -ERANGE) {
comp.remain = next;
le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN);

View File

@@ -33,7 +33,6 @@ enum {
Opt_acl,
Opt_data_prealloc_blocks,
Opt_data_prealloc_contig_only,
Opt_log_merge_wait_timeout_ms,
Opt_metadev_path,
Opt_noacl,
Opt_orphan_scan_delay_ms,
@@ -46,7 +45,6 @@ static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
{Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"},
{Opt_metadev_path, "metadev_path=%s"},
{Opt_noacl, "noacl"},
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
@@ -115,10 +113,6 @@ static void free_options(struct scoutfs_mount_options *opts)
kfree(opts->metadev_path);
}
#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS 100UL
#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS 500
#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS (60 * MSEC_PER_SEC)
#define MIN_ORPHAN_SCAN_DELAY_MS 100UL
#define DEFAULT_ORPHAN_SCAN_DELAY_MS (10 * MSEC_PER_SEC)
#define MAX_ORPHAN_SCAN_DELAY_MS (60 * MSEC_PER_SEC)
@@ -132,27 +126,11 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
opts->data_prealloc_contig_only = 1;
opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS;
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
opts->quorum_slot_nr = -1;
}
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
{
if (ret < 0) {
scoutfs_err(sb, "failed to parse log_merge_wait_timeout_ms value");
return -EINVAL;
}
if (val < MIN_LOG_MERGE_WAIT_TIMEOUT_MS || val > MAX_LOG_MERGE_WAIT_TIMEOUT_MS) {
scoutfs_err(sb, "invalid log_merge_wait_timeout_ms value %d, must be between %lu and %lu",
val, MIN_LOG_MERGE_WAIT_TIMEOUT_MS, MAX_LOG_MERGE_WAIT_TIMEOUT_MS);
return -EINVAL;
}
return 0;
}
static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
{
if (ret < 0) {
@@ -218,14 +196,6 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->data_prealloc_contig_only = nr;
break;
case Opt_log_merge_wait_timeout_ms:
ret = match_int(args, &nr);
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
if (ret < 0)
return ret;
opts->log_merge_wait_timeout_ms = nr;
break;
case Opt_metadev_path:
ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
if (ret < 0)
@@ -452,43 +422,6 @@ static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj
}
SCOUTFS_ATTR_RW(data_prealloc_contig_only);
static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
struct scoutfs_mount_options opts;
scoutfs_options_read(sb, &opts);
return snprintf(buf, PAGE_SIZE, "%u", opts.log_merge_wait_timeout_ms);
}
static ssize_t log_merge_wait_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
DECLARE_OPTIONS_INFO(sb, optinf);
char nullterm[30]; /* more than enough for octal -U64_MAX */
int val;
int len;
int ret;
len = min(count, sizeof(nullterm) - 1);
memcpy(nullterm, buf, len);
nullterm[len] = '\0';
ret = kstrtoint(nullterm, 0, &val);
ret = verify_log_merge_wait_timeout_ms(sb, ret, val);
if (ret == 0) {
write_seqlock(&optinf->seqlock);
optinf->opts.log_merge_wait_timeout_ms = val;
write_sequnlock(&optinf->seqlock);
ret = count;
}
return ret;
}
SCOUTFS_ATTR_RW(log_merge_wait_timeout_ms);
static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
@@ -592,7 +525,6 @@ SCOUTFS_ATTR_RO(quorum_slot_nr);
static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms),
SCOUTFS_ATTR_PTR(metadev_path),
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),

View File

@@ -8,7 +8,6 @@
struct scoutfs_mount_options {
u64 data_prealloc_blocks;
bool data_prealloc_contig_only;
unsigned int log_merge_wait_timeout_ms;
char *metadev_path;
unsigned int orphan_scan_delay_ms;
int quorum_slot_nr;

View File

@@ -439,7 +439,6 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
SCSB_TRACE_ASSIGN(sb);
__entry->journal_info = (unsigned long)journal_info;
__entry->holders = holders;
__entry->ret = ret;
),
TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d",
@@ -1747,41 +1746,21 @@ TRACE_EVENT(scoutfs_btree_merge,
sk_trace_args(end))
);
TRACE_EVENT(scoutfs_btree_merge_read_range,
TP_PROTO(struct super_block *sb, struct scoutfs_key *start, struct scoutfs_key *end,
int size),
TP_ARGS(sb, start, end, size),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
sk_trace_define(start)
sk_trace_define(end)
__field(int, size)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
sk_trace_assign(start, start);
sk_trace_assign(end, end);
__entry->size = size;
),
TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" size %d",
SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end), __entry->size)
);
TRACE_EVENT(scoutfs_btree_merge_items,
TP_PROTO(struct super_block *sb,
struct scoutfs_btree_root *m_root,
struct scoutfs_key *m_key, int m_val_len,
struct scoutfs_btree_root *f_root,
struct scoutfs_key *f_key, int f_val_len,
int is_del),
TP_ARGS(sb, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, m_root_blkno)
__field(__u64, m_root_seq)
__field(__u8, m_root_height)
sk_trace_define(m_key)
__field(int, m_val_len)
__field(__u64, f_root_blkno)
@@ -1794,6 +1773,10 @@ TRACE_EVENT(scoutfs_btree_merge_items,
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->m_root_blkno = m_root ?
le64_to_cpu(m_root->ref.blkno) : 0;
__entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0;
__entry->m_root_height = m_root ? m_root->height : 0;
sk_trace_assign(m_key, m_key);
__entry->m_val_len = m_val_len;
__entry->f_root_blkno = f_root ?
@@ -1805,9 +1788,11 @@ TRACE_EVENT(scoutfs_btree_merge_items,
__entry->is_del = !!is_del;
),
TP_printk(SCSBF" merge item key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
SCSB_TRACE_ARGS, sk_trace_args(m_key), __entry->m_val_len,
__entry->f_root_blkno, __entry->f_root_seq, __entry->f_root_height,
TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq,
__entry->m_root_height, sk_trace_args(m_key),
__entry->m_val_len, __entry->f_root_blkno,
__entry->f_root_seq, __entry->f_root_height,
sk_trace_args(f_key), __entry->f_val_len, __entry->is_del)
);
@@ -2090,71 +2075,6 @@ TRACE_EVENT(scoutfs_trans_seq_last,
SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
);
TRACE_EVENT(scoutfs_server_finalize_items,
TP_PROTO(struct super_block *sb, u64 rid, u64 item_rid, u64 item_nr, u64 item_flags,
u64 item_get_trans_seq),
TP_ARGS(sb, rid, item_rid, item_nr, item_flags, item_get_trans_seq),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(__u64, item_rid)
__field(__u64, item_nr)
__field(__u64, item_flags)
__field(__u64, item_get_trans_seq)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->item_rid = item_rid;
__entry->item_nr = item_nr;
__entry->item_flags = item_flags;
__entry->item_get_trans_seq = item_get_trans_seq;
),
TP_printk(SCSBF" rid %016llx item_rid %016llx item_nr %llu item_flags 0x%llx item_get_trans_seq %llu",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->item_rid, __entry->item_nr,
__entry->item_flags, __entry->item_get_trans_seq)
);
TRACE_EVENT(scoutfs_server_finalize_decision,
TP_PROTO(struct super_block *sb, u64 rid, bool saw_finalized, bool others_active,
bool ours_visible, bool finalize_ours, unsigned int delay_ms,
u64 finalize_sent_seq),
TP_ARGS(sb, rid, saw_finalized, others_active, ours_visible, finalize_ours, delay_ms,
finalize_sent_seq),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(bool, saw_finalized)
__field(bool, others_active)
__field(bool, ours_visible)
__field(bool, finalize_ours)
__field(unsigned int, delay_ms)
__field(__u64, finalize_sent_seq)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->saw_finalized = saw_finalized;
__entry->others_active = others_active;
__entry->ours_visible = ours_visible;
__entry->finalize_ours = finalize_ours;
__entry->delay_ms = delay_ms;
__entry->finalize_sent_seq = finalize_sent_seq;
),
TP_printk(SCSBF" rid %016llx saw_finalized %u others_active %u ours_visible %u finalize_ours %u delay_ms %u finalize_sent_seq %llu",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->saw_finalized, __entry->others_active,
__entry->ours_visible, __entry->finalize_ours, __entry->delay_ms,
__entry->finalize_sent_seq)
);
TRACE_EVENT(scoutfs_get_log_merge_status,
TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key,
u64 nr_requests, u64 nr_complete, u64 seq),
@@ -2879,81 +2799,6 @@ TRACE_EVENT(scoutfs_omap_should_delete,
SCSB_TRACE_ARGS, __entry->ino, __entry->nlink, __entry->ret)
);
#define SSCF_FMT "[bo %llu bs %llu es %llu]"
#define SSCF_FIELDS(pref) \
__field(__u64, pref##_blkno) \
__field(__u64, pref##_blocks) \
__field(__u64, pref##_entries)
#define SSCF_ASSIGN(pref, sfl) \
__entry->pref##_blkno = le64_to_cpu((sfl)->ref.blkno); \
__entry->pref##_blocks = le64_to_cpu((sfl)->blocks); \
__entry->pref##_entries = le64_to_cpu((sfl)->entries);
#define SSCF_ENTRY_ARGS(pref) \
__entry->pref##_blkno, \
__entry->pref##_blocks, \
__entry->pref##_entries
DECLARE_EVENT_CLASS(scoutfs_srch_compact_class,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, id)
__field(__u8, nr)
__field(__u8, flags)
SSCF_FIELDS(out)
__field(__u64, in0_blk)
__field(__u64, in0_pos)
SSCF_FIELDS(in0)
__field(__u64, in1_blk)
__field(__u64, in1_pos)
SSCF_FIELDS(in1)
__field(__u64, in2_blk)
__field(__u64, in2_pos)
SSCF_FIELDS(in2)
__field(__u64, in3_blk)
__field(__u64, in3_pos)
SSCF_FIELDS(in3)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->id = le64_to_cpu(sc->id);
__entry->nr = sc->nr;
__entry->flags = sc->flags;
SSCF_ASSIGN(out, &sc->out)
__entry->in0_blk = le64_to_cpu(sc->in[0].blk);
__entry->in0_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in0, &sc->in[0].sfl)
__entry->in1_blk = le64_to_cpu(sc->in[0].blk);
__entry->in1_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in1, &sc->in[1].sfl)
__entry->in2_blk = le64_to_cpu(sc->in[0].blk);
__entry->in2_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in2, &sc->in[2].sfl)
__entry->in3_blk = le64_to_cpu(sc->in[0].blk);
__entry->in3_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in3, &sc->in[3].sfl)
),
TP_printk(SCSBF" id %llu nr %u flags 0x%x out "SSCF_FMT" in0 b %llu p %llu "SSCF_FMT" in1 b %llu p %llu "SSCF_FMT" in2 b %llu p %llu "SSCF_FMT" in3 b %llu p %llu "SSCF_FMT,
SCSB_TRACE_ARGS, __entry->id, __entry->nr, __entry->flags, SSCF_ENTRY_ARGS(out),
__entry->in0_blk, __entry->in0_pos, SSCF_ENTRY_ARGS(in0),
__entry->in1_blk, __entry->in1_pos, SSCF_ENTRY_ARGS(in1),
__entry->in2_blk, __entry->in2_pos, SSCF_ENTRY_ARGS(in2),
__entry->in3_blk, __entry->in3_pos, SSCF_ENTRY_ARGS(in3))
);
DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_send,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc)
);
DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc)
);
#endif /* _TRACE_SCOUTFS_H */
/* This part must be outside protection */

View File

@@ -148,8 +148,6 @@ struct server_info {
struct scoutfs_quorum_config qconf;
/* a running server maintains a private dirty super */
struct scoutfs_super_block dirty_super;
u64 finalize_sent_seq;
};
#define DECLARE_SERVER_INFO(sb, name) \
@@ -415,27 +413,6 @@ static void server_hold_commit(struct super_block *sb, struct commit_hold *hold)
wait_event(cusers->waitq, hold_commit(sb, server, cusers, hold));
}
/*
* Return the higher of the avail or freed used by the active commit
* since this holder joined the commit. This is *not* the amount used
* by the holder, we don't track per-holder alloc use.
*/
static u32 server_hold_alloc_used_since(struct super_block *sb, struct commit_hold *hold)
{
DECLARE_SERVER_INFO(sb, server);
u32 avail_used;
u32 freed_used;
u32 avail_now;
u32 freed_now;
scoutfs_alloc_meta_remaining(&server->alloc, &avail_now, &freed_now);
avail_used = hold->avail - avail_now;
freed_used = hold->freed - freed_now;
return max(avail_used, freed_used);
}
/*
* This is called while holding the commit and returns once the commit
* is successfully written. Many holders can all wait for all holders
@@ -961,24 +938,22 @@ static int find_log_trees_item(struct super_block *sb,
}
/*
* Find the log_trees item with the greatest nr for each rid. Fills the
* caller's log_trees and sets the key before the returned log_trees for
* the next iteration. Returns 0 when done, > 0 for each item, and
* -errno on fatal errors.
* Find the next log_trees item from the key. Fills the caller's log_trees and sets
* the key past the returned log_trees for iteration. Returns 0 when done, > 0 for each
* item, and -errno on fatal errors.
*/
static int for_each_rid_last_lt(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
static int for_each_lt(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
{
SCOUTFS_BTREE_ITEM_REF(iref);
int ret;
ret = scoutfs_btree_prev(sb, root, key, &iref);
ret = scoutfs_btree_next(sb, root, key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
memcpy(lt, iref.val, iref.val_len);
*key = *iref.key;
key->sklt_nr = 0;
scoutfs_key_dec(key);
scoutfs_key_inc(key);
ret = 1;
} else {
ret = -EIO;
@@ -1073,13 +1048,21 @@ static int next_log_merge_item(struct super_block *sb,
* abandoned log btree finalized. If it takes too long, each client has
* a chance to make forward progress before being asked to commit again.
*
* We're waiting on heavy state that is protected by mutexes and
* transaction machinery. It's tricky to recreate that state for
* lightweight condition tests that don't change task state. Instead of
* trying to get that right, particularly as we unwind after success or
* after timeouts, waiters use an unsatisfying poll. Short enough to
* not add terrible latency, given how heavy and infrequent this already
* is, and long enough to not melt the cpu. This could be tuned if it
* becomes a problem.
*
* This can end up finalizing a new empty log btree if a new mount
* happens to arrive at just the right time. That's fine, merging will
* ignore and tear down the empty input.
*/
#define FINALIZE_POLL_MIN_DELAY_MS 5U
#define FINALIZE_POLL_MAX_DELAY_MS 100U
#define FINALIZE_POLL_DELAY_GROWTH_PCT 150U
#define FINALIZE_POLL_MS (11)
#define FINALIZE_TIMEOUT_MS (MSEC_PER_SEC / 2)
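Roughly, the resulting wait pattern is the sketch below; merge_can_proceed() is a hypothetical stand-in for the checks the real loop makes while holding the commit and logs_mutex:
	unsigned long timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS);

	for (;;) {
		if (merge_can_proceed())	/* hypothetical condition */
			break;
		msleep(FINALIZE_POLL_MS);
		if (time_after(jiffies, timeo))
			break;			/* timed out, give up for now */
	}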
static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_log_trees *lt,
u64 rid, struct commit_hold *hold)
{
@@ -1087,10 +1070,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_log_merge_status stat;
struct scoutfs_log_merge_range rng;
struct scoutfs_mount_options opts;
struct scoutfs_log_trees each_lt;
struct scoutfs_log_trees fin;
unsigned int delay_ms;
unsigned long timeo;
bool saw_finalized;
bool others_active;
@@ -1098,14 +1079,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
bool ours_visible;
struct scoutfs_key key;
char *err_str = NULL;
ktime_t start;
int ret;
int err;
scoutfs_options_read(sb, &opts);
timeo = jiffies + msecs_to_jiffies(opts.log_merge_wait_timeout_ms);
delay_ms = FINALIZE_POLL_MIN_DELAY_MS;
start = ktime_get_raw();
timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS);
for (;;) {
/* nothing to do if there's already a merge in flight */
@@ -1122,13 +1099,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
saw_finalized = false;
others_active = false;
ours_visible = false;
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
trace_scoutfs_server_finalize_items(sb, rid, le64_to_cpu(each_lt.rid),
le64_to_cpu(each_lt.nr),
le64_to_cpu(each_lt.flags),
le64_to_cpu(each_lt.get_trans_seq));
scoutfs_key_init_log_trees(&key, 0, 0);
while ((ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED))
saw_finalized = true;
@@ -1153,10 +1125,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
finalize_ours = (lt->item_root.height > 2) ||
(le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW);
trace_scoutfs_server_finalize_decision(sb, rid, saw_finalized, others_active,
ours_visible, finalize_ours, delay_ms,
server->finalize_sent_seq);
/* done if we're not finalizing and there's no finalized */
if (!finalize_ours && !saw_finalized) {
ret = 0;
@@ -1164,13 +1132,12 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
}
/* send sync requests soon to give time to commit */
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
scoutfs_key_init_log_trees(&key, 0, 0);
while (others_active &&
(ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
(ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
(le64_to_cpu(each_lt.rid) == rid) ||
(le64_to_cpu(each_lt.get_trans_seq) <= server->finalize_sent_seq))
(le64_to_cpu(each_lt.rid) == rid))
continue;
ret = scoutfs_net_submit_request_node(sb, server->conn,
@@ -1190,8 +1157,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
break;
}
server->finalize_sent_seq = scoutfs_server_seq(sb);
/* Finalize ours if it's visible to others */
if (ours_visible) {
fin = *lt;
@@ -1229,16 +1194,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
if (ret < 0)
err_str = "applying commit before waiting for finalized";
msleep(delay_ms);
delay_ms = min(delay_ms * FINALIZE_POLL_DELAY_GROWTH_PCT / 100,
FINALIZE_POLL_MAX_DELAY_MS);
msleep(FINALIZE_POLL_MS);
server_hold_commit(sb, hold);
mutex_lock(&server->logs_mutex);
/* done if we timed out */
if (time_after(jiffies, timeo)) {
scoutfs_inc_counter(sb, log_merge_wait_timeout);
ret = 0;
break;
}
@@ -1821,29 +1783,43 @@ out:
* Give the caller the last seq before outstanding client commits. All
* seqs up to and including this are stable, new client transactions can
* only have greater seqs.
*
* For each rid, only its greatest log trees nr can be an open commit.
* We look at the last log_trees item for each client rid and record its
* trans seq if it hasn't been committed.
*/
static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
{
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_log_trees lt;
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_log_trees *lt;
struct scoutfs_key key;
u64 last_seq = 0;
int ret;
last_seq = scoutfs_server_seq(sb) - 1;
scoutfs_key_init_log_trees(&key, 0, 0);
mutex_lock(&server->logs_mutex);
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
if ((le64_to_cpu(lt.get_trans_seq) > le64_to_cpu(lt.commit_trans_seq)) &&
le64_to_cpu(lt.get_trans_seq) <= last_seq) {
last_seq = le64_to_cpu(lt.get_trans_seq) - 1;
for (;; scoutfs_key_inc(&key)) {
ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(*lt)) {
lt = iref.val;
if ((le64_to_cpu(lt->get_trans_seq) >
le64_to_cpu(lt->commit_trans_seq)) &&
le64_to_cpu(lt->get_trans_seq) <= last_seq) {
last_seq = le64_to_cpu(lt->get_trans_seq) - 1;
}
key = *iref.key;
} else {
ret = -EIO;
}
scoutfs_btree_put_iref(&iref);
}
if (ret < 0) {
if (ret == -ENOENT) {
ret = 0;
break;
}
}
}
@@ -1990,7 +1966,9 @@ static int server_srch_get_compact(struct super_block *sb,
ret = scoutfs_srch_get_compact(sb, &server->alloc, &server->wri,
&super->srch_root, rid, sc);
mutex_unlock(&server->srch_mutex);
if (ret < 0 || (ret == 0 && sc->nr == 0))
if (ret == 0 && sc->nr == 0)
ret = -ENOENT;
if (ret < 0)
goto apply;
mutex_lock(&server->alloc_mutex);
@@ -2495,11 +2473,9 @@ static void server_log_merge_free_work(struct work_struct *work)
while (!server_is_stopping(server)) {
if (!commit) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
commit = true;
}
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
commit = true;
ret = next_log_merge_item(sb, &super->log_merge,
SCOUTFS_LOG_MERGE_FREEING_ZONE,
@@ -2546,14 +2522,12 @@ static void server_log_merge_free_work(struct work_struct *work)
/* freed blocks are in allocator, we *have* to update fr */
BUG_ON(ret < 0);
if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
commit = false;
if (ret < 0) {
err_str = "looping commit del/upd freeing item";
break;
}
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
commit = false;
if (ret < 0) {
err_str = "looping commit del/upd freeing item";
break;
}
}
@@ -4326,7 +4300,6 @@ static void scoutfs_server_worker(struct work_struct *work)
scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));
scoutfs_block_writer_init(sb, &server->wri);
server->finalize_sent_seq = 0;
/* first make sure no other servers are still running */
ret = scoutfs_quorum_fence_leaders(sb, &server->qconf, server->term);

View File

@@ -30,9 +30,6 @@
#include "client.h"
#include "counters.h"
#include "scoutfs_trace.h"
#include "triggers.h"
#include "sysfs.h"
#include "msg.h"
/*
* This srch subsystem gives us a way to find inodes that have a given
@@ -71,14 +68,10 @@ struct srch_info {
atomic_t shutdown;
struct workqueue_struct *workq;
struct delayed_work compact_dwork;
struct scoutfs_sysfs_attrs ssa;
atomic_t compact_delay_ms;
};
#define DECLARE_SRCH_INFO(sb, name) \
struct srch_info *name = SCOUTFS_SB(sb)->srch_info
#define DECLARE_SRCH_INFO_KOBJ(kobj, name) \
DECLARE_SRCH_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)
#define SRE_FMT "%016llx.%llu.%llu"
#define SRE_ARG(sre) \
@@ -527,95 +520,6 @@ out:
return ret;
}
/*
* Padded entries are encoded in pairs after an existing entry. All of
* the pairs cancel each other out for all readers (the second encoding
* looks like a deletion) so they aren't visible to the first/last bounds of
* the block or file.
*/
static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk,
struct scoutfs_srch_block *srb, struct scoutfs_srch_entry *sre)
{
int ret;
ret = encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes),
sre, &srb->tail);
if (ret > 0) {
srb->tail = *sre;
le32_add_cpu(&srb->entry_nr, 1);
le32_add_cpu(&srb->entry_bytes, ret);
le64_add_cpu(&sfl->entries, 1);
ret = 0;
}
return ret;
}
/*
* This is called by a testing trigger to create a very specific case of
* encoded entry offsets. We want the last entry in the block to start
* precisely at the _SAFE_BYTES offset.
*
* This is called when there is a single existing entry in the block.
* We have the entire block to work with. We encode pairs of matching
* entries. This hides them from readers (both searches and merging) as
* they're interpreted as creation and deletion and are deleted. We use
* the existing hash value of the first entry in the block but then set
* the inode to an impossibly large number so it doesn't interfere with
* anything.
*
* To hit the specific offset we very carefully manage the amount of
* bytes of change between fields in the entry. We know that if we
* change all the bytes of the ino and id we end up with a 20 byte
* (2+8+8,2) encoding of the pair of entries. To have the last entry
* start at the _SAFE_POS offset we know that the final 20 byte pair
* encoding needs to end at 2 bytes (second entry encoding) after the
* _SAFE_POS offset.
*
* So as we encode pairs we watch the delta of our current offset from
* that desired final offset of 2 past _SAFE_POS. If we're a multiple
* of 20 away then we encode the full 20 byte pairs. If we're not, then
* we drop a byte to encode 19 bytes. That'll slowly change the offset
* to be a multiple of 20 again while encoding large entries.
*/
static void pad_entries_at_safe(struct scoutfs_srch_file *sfl, u64 blk,
struct scoutfs_srch_block *srb)
{
struct scoutfs_srch_entry sre;
u32 target;
s32 diff;
u64 hash;
u64 ino;
u64 id;
int ret;
hash = le64_to_cpu(srb->tail.hash);
ino = le64_to_cpu(srb->tail.ino) | (1ULL << 62);
id = le64_to_cpu(srb->tail.id);
target = SCOUTFS_SRCH_BLOCK_SAFE_BYTES + 2;
while ((diff = target - le32_to_cpu(srb->entry_bytes)) > 0) {
ino ^= 1ULL << (7 * 8);
if (diff % 20 == 0) {
id ^= 1ULL << (7 * 8);
} else {
id ^= 1ULL << (6 * 8);
}
sre.hash = cpu_to_le64(hash);
sre.ino = cpu_to_le64(ino);
sre.id = cpu_to_le64(id);
ret = append_padded_entry(sfl, blk, srb, &sre);
if (ret == 0)
ret = append_padded_entry(sfl, blk, srb, &sre);
BUG_ON(ret != 0);
diff = target - le32_to_cpu(srb->entry_bytes);
}
}
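A worked example of that padding arithmetic (the numbers are illustrative, not taken from the code):
/*
 * Suppose target - entry_bytes starts at 99.  99 is not a multiple of
 * 20, so the first pair is encoded in 19 bytes, leaving a gap of 80.
 * 80, 60, 40, and 20 are multiples of 20, so four full 20 byte pairs
 * follow and the gap reaches exactly 0.  The final 2 byte entry then
 * ends at _SAFE_BYTES + 2, which puts its start precisely at the
 * _SAFE_BYTES offset that the trigger wants to test.
 */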
/*
* The caller is dropping an ino/id because the tracking rbtree is full.
* This loses information so we can't return any entries at or after the
@@ -1083,9 +987,6 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
struct scoutfs_key key;
int ret;
if (sfl->ref.blkno && !force && scoutfs_trigger(sb, SRCH_FORCE_LOG_ROTATE))
force = true;
if (sfl->ref.blkno == 0 ||
(!force && le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT))
return 0;
@@ -1561,7 +1462,7 @@ static int kway_merge(struct super_block *sb,
struct scoutfs_block_writer *wri,
struct scoutfs_srch_file *sfl,
kway_get_t kway_get, kway_advance_t kway_adv,
void **args, int nr, bool logs_input)
void **args, int nr)
{
DECLARE_SRCH_INFO(sb, srinf);
struct scoutfs_srch_block *srb = NULL;
@@ -1666,15 +1567,6 @@ static int kway_merge(struct super_block *sb,
blk++;
}
/* end sorted block on _SAFE offset for testing */
if (bl && le32_to_cpu(srb->entry_nr) == 1 && logs_input &&
scoutfs_trigger(sb, SRCH_COMPACT_LOGS_PAD_SAFE)) {
pad_entries_at_safe(sfl, blk, srb);
scoutfs_block_put(sb, bl);
bl = NULL;
blk++;
}
scoutfs_inc_counter(sb, srch_compact_entry);
} else {
@@ -1717,8 +1609,6 @@ static int kway_merge(struct super_block *sb,
empty++;
ret = 0;
} else if (ret < 0) {
if (ret == -ENOANO) /* just testing trigger */
ret = 0;
goto out;
}
@@ -1926,7 +1816,7 @@ static int compact_logs(struct super_block *sb,
}
ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_page, kway_adv_page,
args, nr_pages, true);
args, nr_pages);
if (ret < 0)
goto out;
@@ -1984,18 +1874,12 @@ static int kway_get_reader(struct super_block *sb,
srb = rdr->bl->data;
if (rdr->pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip >= SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip >= le32_to_cpu(srb->entry_bytes)) {
/* XXX inconsistency */
return -EIO;
}
if (rdr->decoded_bytes == 0 && rdr->pos == SCOUTFS_SRCH_BLOCK_SAFE_BYTES &&
scoutfs_trigger(sb, SRCH_MERGE_STOP_SAFE)) {
/* only used in testing */
return -ENOANO;
}
/* decode entry, possibly skipping start of the block */
while (rdr->decoded_bytes == 0 || rdr->pos < rdr->skip) {
ret = decode_entry(srb->entries + rdr->pos,
@@ -2085,7 +1969,7 @@ static int compact_sorted(struct super_block *sb,
}
ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_reader,
kway_adv_reader, args, nr, false);
kway_adv_reader, args, nr);
sc->flags |= SCOUTFS_SRCH_COMPACT_FLAG_DONE;
for (i = 0; i < nr; i++) {
@@ -2214,15 +2098,8 @@ static int delete_files(struct super_block *sb, struct scoutfs_alloc *alloc,
return ret;
}
static void queue_compact_work(struct srch_info *srinf, bool immediate)
{
unsigned long delay;
if (!atomic_read(&srinf->shutdown)) {
delay = immediate ? 0 : msecs_to_jiffies(atomic_read(&srinf->compact_delay_ms));
queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
}
}
/* wait 10s between compact attempts on error, immediate after success */
#define SRCH_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
/*
* Get a compaction operation from the server, sort the entries from the
@@ -2250,6 +2127,7 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
struct super_block *sb = srinf->sb;
struct scoutfs_block_writer wri;
struct scoutfs_alloc alloc;
unsigned long delay;
int ret;
int err;
@@ -2262,8 +2140,6 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
scoutfs_block_writer_init(sb, &wri);
ret = scoutfs_client_srch_get_compact(sb, sc);
if (ret >= 0)
trace_scoutfs_srch_compact_client_recv(sb, sc);
if (ret < 0 || sc->nr == 0)
goto out;
@@ -2292,7 +2168,6 @@ commit:
sc->meta_freed = alloc.freed;
sc->flags |= ret < 0 ? SCOUTFS_SRCH_COMPACT_FLAG_ERROR : 0;
trace_scoutfs_srch_compact_client_send(sb, sc);
err = scoutfs_client_srch_commit_compact(sb, sc);
if (err < 0 && ret == 0)
ret = err;
@@ -2303,56 +2178,14 @@ out:
scoutfs_inc_counter(sb, srch_compact_error);
scoutfs_block_writer_forget_all(sb, &wri);
queue_compact_work(srinf, sc->nr > 0 && ret == 0);
if (!atomic_read(&srinf->shutdown)) {
delay = ret == 0 ? 0 : msecs_to_jiffies(SRCH_COMPACT_DELAY_MS);
queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
}
kfree(sc);
}
static ssize_t compact_delay_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
DECLARE_SRCH_INFO_KOBJ(kobj, srinf);
return snprintf(buf, PAGE_SIZE, "%u", atomic_read(&srinf->compact_delay_ms));
}
#define MIN_COMPACT_DELAY_MS MSEC_PER_SEC
#define DEF_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
#define MAX_COMPACT_DELAY_MS (60 * MSEC_PER_SEC)
static ssize_t compact_delay_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
DECLARE_SRCH_INFO(sb, srinf);
char nullterm[30]; /* more than enough for octal -U64_MAX */
u64 val;
int len;
int ret;
len = min(count, sizeof(nullterm) - 1);
memcpy(nullterm, buf, len);
nullterm[len] = '\0';
ret = kstrtoll(nullterm, 0, &val);
if (ret < 0 || val < MIN_COMPACT_DELAY_MS || val > MAX_COMPACT_DELAY_MS) {
scoutfs_err(sb, "invalid compact_delay_ms value, must be between %lu and %lu",
MIN_COMPACT_DELAY_MS, MAX_COMPACT_DELAY_MS);
return -EINVAL;
}
atomic_set(&srinf->compact_delay_ms, val);
cancel_delayed_work(&srinf->compact_dwork);
queue_compact_work(srinf, false);
return count;
}
SCOUTFS_ATTR_RW(compact_delay_ms);
static struct attribute *srch_attrs[] = {
SCOUTFS_ATTR_PTR(compact_delay_ms),
NULL,
};
void scoutfs_srch_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -2369,8 +2202,6 @@ void scoutfs_srch_destroy(struct super_block *sb)
destroy_workqueue(srinf->workq);
}
scoutfs_sysfs_destroy_attrs(sb, &srinf->ssa);
kfree(srinf);
sbi->srch_info = NULL;
}
@@ -2388,15 +2219,8 @@ int scoutfs_srch_setup(struct super_block *sb)
srinf->sb = sb;
atomic_set(&srinf->shutdown, 0);
INIT_DELAYED_WORK(&srinf->compact_dwork, scoutfs_srch_compact_worker);
scoutfs_sysfs_init_attrs(sb, &srinf->ssa);
atomic_set(&srinf->compact_delay_ms, DEF_COMPACT_DELAY_MS);
sbi->srch_info = srinf;
ret = scoutfs_sysfs_create_attrs(sb, &srinf->ssa, srch_attrs, "srch");
if (ret < 0)
goto out;
srinf->workq = alloc_workqueue("scoutfs_srch_compact",
WQ_NON_REENTRANT | WQ_UNBOUND |
WQ_HIGHPRI, 0);
@@ -2405,7 +2229,8 @@ int scoutfs_srch_setup(struct super_block *sb)
goto out;
}
queue_compact_work(srinf, false);
queue_delayed_work(srinf->workq, &srinf->compact_dwork,
msecs_to_jiffies(SRCH_COMPACT_DELAY_MS));
ret = 0;
out:

View File

@@ -39,9 +39,6 @@ struct scoutfs_triggers {
static char *names[] = {
[SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE] = "block_remove_stale",
[SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE] = "srch_compact_logs_pad_safe",
[SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE] = "srch_force_log_rotate",
[SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe",
[SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge",
};

View File

@@ -3,9 +3,6 @@
enum scoutfs_trigger {
SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE,
SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE,
SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE,
SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE,
SCOUTFS_TRIGGER_STATFS_LOCK_PURGE,
SCOUTFS_TRIGGER_NR,
};

View File

@@ -25,9 +25,8 @@ All options can be seen by running with -h.
This script is built to test multi-node systems on one host by using
different mounts of the same devices. The script creates a fake block
device in front of each fs block device for each mount that will be
tested. It will create predictable device mapper devices and mount
them on /mnt/test.N. These static device names and mount paths limit
the script to a single execution per host.
tested. Currently it will set up free loop devices and mount them on
/mnt/test.[0-9].
All tests will be run by default. Particular tests can be included or
excluded by providing test name regular expressions with the -I and -E
@@ -105,8 +104,8 @@ used during the test.
| Variable | Description | Origin | Example |
| ---------------- | ------------------- | --------------- | ----------------- |
| T\_MB[0-9] | per-mount meta bdev | created per run | /dev/mapper/\_scoutfs\_test\_meta\_[0-9] |
| T\_DB[0-9] | per-mount data bdev | created per run | /dev/mapper/\_scoutfs\_test\_data\_[0-9] |
| T\_MB[0-9] | per-mount meta bdev | created per run | /dev/loop0 |
| T\_DB[0-9] | per-mount data bdev | created per run | /dev/loop1 |
| T\_D[0-9] | per-mount test dir | made for test | /mnt/test.[0-9]/t |
| T\_META\_DEVICE | main FS meta bdev | -M | /dev/vda |
| T\_DATA\_DEVICE | main FS data bdev | -D | /dev/vdb |

View File

@@ -6,61 +6,6 @@ t_filter_fs()
-e 's@Device: [a-fA-F0-9]*h/[0-9]*d@Device: 0h/0d@g'
}
#
# We can hit a spurious kasan warning that was fixed upstream:
#
# e504e74cc3a2 x86/unwind/orc: Disable KASAN checking in the ORC unwinder, part 2
#
# KASAN can get mad when the unwinder doesn't find ORC metadata and
# wanders up without using frames and hits the KASAN stack red zones.
# We can ignore these messages.
#
# They're bracketed by:
# [ 2687.690127] ==================================================================
# [ 2687.691366] BUG: KASAN: stack-out-of-bounds in get_reg+0x1bc/0x230
# ...
# [ 2687.706220] ==================================================================
# [ 2687.707284] Disabling lock debugging due to kernel taint
#
# That final lock debugging message may not be included.
#
ignore_harmless_unwind_kasan_stack_oob()
{
awk '
BEGIN {
in_soob = 0
soob_nr = 0
}
( !in_soob && $0 ~ /==================================================================/ ) {
in_soob = 1
soob_nr = NR
saved = $0
}
( in_soob == 1 && NR == (soob_nr + 1) ) {
if (match($0, /KASAN: stack-out-of-bounds in get_reg/) != 0) {
in_soob = 2
} else {
in_soob = 0
print saved
}
saved=""
}
( in_soob == 2 && $0 ~ /==================================================================/ ) {
in_soob = 3
soob_nr = NR
}
( in_soob == 3 && NR > soob_nr && $0 !~ /Disabling lock debugging/ ) {
in_soob = 0
}
( !in_soob ) { print $0 }
END {
if (saved) {
print saved
}
}
'
}
#
# Filter out expected messages. Putting messages here implies that
# tests aren't relying on messages to discover failures.. they're
@@ -141,12 +86,10 @@ t_filter_dmesg()
re="$re|scoutfs .* critical transaction commit failure.*"
# change-devices causes loop device resizing
re="$re|loop: module loaded"
re="$re|loop[0-9].* detected capacity change from.*"
# ignore systemd-journal rotating
re="$re|systemd-journald.*"
egrep -v "($re)" | \
ignore_harmless_unwind_kasan_stack_oob
egrep -v "($re)"
}

View File

@@ -265,15 +265,6 @@ t_trigger_get() {
cat "$(t_trigger_path "$nr")/$which"
}
t_trigger_set() {
local which="$1"
local nr="$2"
local val="$3"
local path=$(t_trigger_path "$nr")
echo "$val" > "$path/$which"
}
t_trigger_show() {
local which="$1"
local string="$2"
@@ -285,8 +276,9 @@ t_trigger_show() {
t_trigger_arm_silent() {
local which="$1"
local nr="$2"
local path=$(t_trigger_path "$nr")
t_trigger_set "$which" "$nr" 1
echo 1 > "$path/$which"
}
t_trigger_arm() {

View File

@@ -1,4 +1,3 @@
== measure initial createmany
== measure initial createmany
== measure two concurrent createmany runs
== cleanup

View File

@@ -1,4 +1,3 @@
== setting longer hung task timeout
== creating fragmented extents
== unlink file with moved extents to free extents per block
== cleanup

View File

@@ -1,37 +0,0 @@
== initialize per-mount values
== arm compaction triggers
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
== compact more often
== create padded sorted inputs by forcing log rotation
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
== compaction of padded should stop at safe
== verify no compaction errors
== cleanup

View File

@@ -326,10 +326,16 @@ unmount_all() {
cmd wait $p
done
# delete all temp devices
for dev in /dev/mapper/_scoutfs_test_*; do
if [ -b "$dev" ]; then
cmd dmsetup remove $dev
# delete all temp meta devices
for dev in $(losetup --associated "$T_META_DEVICE" | cut -d : -f 1); do
if [ -e "$dev" ]; then
cmd losetup -d "$dev"
fi
done
# delete all temp data devices
for dev in $(losetup --associated "$T_DATA_DEVICE" | cut -d : -f 1); do
if [ -e "$dev" ]; then
cmd losetup -d "$dev"
fi
done
}
@@ -428,12 +434,6 @@ $T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 &
fenced_pid=$!
fenced_log "started fenced pid $fenced_pid in the background"
# setup dm tables
echo "0 $(blockdev --getsz $T_META_DEVICE) linear $T_META_DEVICE 0" > \
$T_RESULTS/dmtable.meta
echo "0 $(blockdev --getsz $T_DATA_DEVICE) linear $T_DATA_DEVICE 0" > \
$T_RESULTS/dmtable.data
#
# mount concurrently so that a quorum is present to elect the leader and
# start a server.
@@ -442,13 +442,10 @@ msg "mounting $T_NR_MOUNTS mounts on meta $T_META_DEVICE data $T_DATA_DEVICE"
pids=""
for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
name="_scoutfs_test_meta_$i"
cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.meta)"
meta_dev="/dev/mapper/$name"
name="_scoutfs_test_data_$i"
cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.data)"
data_dev="/dev/mapper/$name"
meta_dev=$(losetup --find --show $T_META_DEVICE)
test -b "$meta_dev" || die "failed to create temp device $meta_dev"
data_dev=$(losetup --find --show $T_DATA_DEVICE)
test -b "$data_dev" || die "failed to create temp device $data_dev"
dir="/mnt/test.$i"
test -d "$dir" || cmd mkdir -p "$dir"

View File

@@ -14,7 +14,6 @@ offline-extent-waiting.sh
move-blocks.sh
large-fragmented-free.sh
enospc.sh
srch-safe-merge-pos.sh
srch-basic-functionality.sh
simple-xattr-unit.sh
totl-xattr-tag.sh

View File

@@ -1,7 +1,6 @@
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
#include <sys/stat.h>
@@ -36,10 +35,10 @@ struct opts {
unsigned int dry_run:1,
ls_output:1,
quiet:1,
xattr_set:1,
xattr_file:1,
xattr_group:1;
char *xattr_name;
user_xattr:1,
same_srch_xattr:1,
group_srch_xattr:1,
unique_srch_xattr:1;
};
struct stats {
@@ -150,31 +149,12 @@ static void free_dir(struct dir *dir)
free(dir);
}
static size_t snprintf_off(void *buf, size_t sz, size_t off, char *fmt, ...)
{
va_list ap;
int ret;
if (off >= sz)
return sz;
va_start(ap, fmt);
ret = vsnprintf(buf + off, sz - off, fmt, ap);
va_end(ap);
if (ret <= 0)
return sz;
return off + ret;
}
static void create_dir(struct dir *dir, struct opts *opts,
struct stats *stats)
{
struct str_list *s;
char name[256]; /* max len and null term */
char name[100];
char val = 'v';
size_t off;
int rc;
int i;
@@ -195,21 +175,29 @@ static void create_dir(struct dir *dir, struct opts *opts,
rc = mknod(s->str, S_IFREG | 0644, 0);
error_exit(rc, "mknod %s failed"ERRF, s->str, ERRA);
if (opts->xattr_set) {
off = snprintf_off(name, sizeof(name), 0, "%s", opts->xattr_name);
if (opts->xattr_file)
off = snprintf_off(name, sizeof(name), off,
"-f-%lu", stats->files);
if (opts->xattr_group)
off = snprintf_off(name, sizeof(name), off,
"-g-%lu", stats->files / 10000);
error_exit(off >= sizeof(name), "xattr name longer than 255 bytes");
rc = 0;
if (rc == 0 && opts->user_xattr) {
strcpy(name, "user.scoutfs_bcp");
rc = setxattr(s->str, name, &val, 1, 0);
}
if (rc == 0 && opts->same_srch_xattr) {
strcpy(name, "scoutfs.srch.scoutfs_bcp");
rc = setxattr(s->str, name, &val, 1, 0);
}
if (rc == 0 && opts->group_srch_xattr) {
snprintf(name, sizeof(name),
"scoutfs.srch.scoutfs_bcp.group.%lu",
stats->files / 10000);
rc = setxattr(s->str, name, &val, 1, 0);
}
if (rc == 0 && opts->unique_srch_xattr) {
snprintf(name, sizeof(name),
"scoutfs.srch.scoutfs_bcp.unique.%lu",
stats->files);
rc = setxattr(s->str, name, &val, 1, 0);
error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
}
error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
stats->files++;
rate_banner(opts, stats);
@@ -377,10 +365,11 @@ static void usage(void)
" -d DIR | create all files in DIR top level directory\n"
" -n | dry run, only parse, don't create any files\n"
" -q | quiet, don't regularly print rates\n"
" -F | append \"-f-NR\" file nr to xattr name, requires -X\n"
" -G | append \"-g-NR\" file nr/10000 to xattr name, requires -X\n"
" -L | parse ls output; only reg, skip meta, paths at ./\n"
" -X NAM | set named xattr in all files\n");
" -X | set the same user. xattr name in all files\n"
" -S | set the same .srch. xattr name in all files\n"
" -G | set a .srch. xattr name shared by groups of files\n"
" -U | set a unique .srch. xattr name in all files\n");
}
int main(int argc, char **argv)
@@ -397,7 +386,7 @@ int main(int argc, char **argv)
memset(&opts, 0, sizeof(opts));
while ((c = getopt(argc, argv, "d:nqFGLX:")) != -1) {
while ((c = getopt(argc, argv, "d:nqLXSGU")) != -1) {
switch(c) {
case 'd':
top_dir = strdup(optarg);
@@ -408,19 +397,20 @@ int main(int argc, char **argv)
case 'q':
opts.quiet = 1;
break;
case 'F':
opts.xattr_file = 1;
break;
case 'G':
opts.xattr_group = 1;
break;
case 'L':
opts.ls_output = 1;
break;
case 'X':
opts.xattr_set = 1;
opts.xattr_name = strdup(optarg);
error_exit(!opts.xattr_name, "error allocating xattr name");
opts.user_xattr = 1;
break;
case 'S':
opts.same_srch_xattr = 1;
break;
case 'G':
opts.group_srch_xattr = 1;
break;
case 'U':
opts.unique_srch_xattr = 1;
break;
case '?':
printf("Unknown option '%c'\n", optopt);
@@ -429,11 +419,6 @@ int main(int argc, char **argv)
}
}
error_exit(opts.xattr_file && !opts.xattr_set,
"must specify xattr -X when appending file nr with -F");
error_exit(opts.xattr_group && !opts.xattr_set,
"must specify xattr -X when appending file nr with -G");
if (!opts.dry_run) {
error_exit(!top_dir,
"must specify top level directory with -d");

View File

@@ -7,11 +7,9 @@ t_require_mounts 2
COUNT=50000
#
# Prep dirs for test. We have per-directory inode number allocators so
# by putting each createmany in a per-mount dir they get their own inode
# number region and cluster locks.
#
# Prep dirs for test. Each mount needs to make its own parent dir for
# the createmany run, otherwise both dirs will end up in the same inode
# group, causing updates to bounce that lock around.
echo "== measure initial createmany"
mkdir -p $T_D0/dir/0
mkdir $T_D1/dir/1
@@ -19,20 +17,18 @@ mkdir $T_D1/dir/1
echo "== measure initial createmany"
START=$SECONDS
createmany -o "$T_D0/file_" $COUNT >> $T_TMP.full
sync
SINGLE=$((SECONDS - START))
echo single $SINGLE >> $T_TMP.full
echo "== measure two concurrent createmany runs"
START=$SECONDS
(cd $T_D0/dir/0; createmany -o ./file_ $COUNT > /dev/null) &
createmany -o $T_D0/dir/0/file $COUNT > /dev/null &
pids="$!"
(cd $T_D1/dir/1; createmany -o ./file_ $COUNT > /dev/null) &
createmany -o $T_D1/dir/1/file $COUNT > /dev/null &
pids="$pids $!"
for p in $pids; do
wait $p
done
sync
BOTH=$((SECONDS - START))
echo both $BOTH >> $T_TMP.full
@@ -45,10 +41,7 @@ echo both $BOTH >> $T_TMP.full
# synchronized operation.
FACTOR=200
if [ "$BOTH" -gt $(($SINGLE*$FACTOR)) ]; then
t_fail "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
echo "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
fi
echo "== cleanup"
find $T_D0/dir -delete
t_pass

View File

@@ -10,30 +10,6 @@ EXTENTS_PER_BTREE_BLOCK=600
EXTENTS_PER_LIST_BLOCK=8192
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
#
# This test specifically creates a pathologically sparse file that will
# be as expensive as possible to free. This is usually fine on
# dedicated or reasonable hardware, but trying to run this in
# virtualized debug kernels can take a very long time. This test is
# about making sure that the server doesn't fail, not that the platform
# can handle the scale of work that our btree formats happen to require
# while execution is bogged down with use-after-free memory reference
# tracking. So we give the test a lot more breathing room before
# deciding that it's hung.
#
echo "== setting longer hung task timeout"
if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
test "$secs" -gt 0 || \
t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
restore_hung_task_timeout()
{
echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
}
trap restore_hung_task_timeout EXIT
echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
fi
echo "== creating fragmented extents"
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"

View File

@@ -9,7 +9,6 @@ LOG=340000
LIM=1000000
SEQF="%.20g"
SXA="scoutfs.srch.test-srch-basic-functionality"
t_require_commands touch rm setfattr scoutfs find_xattrs
@@ -28,20 +27,20 @@ diff_srch_find()
echo "== create new xattrs"
touch "$T_D0/"{create,update}
setfattr -n $SXA -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
diff_srch_find $SXA
setfattr -n scoutfs.srch.test -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
diff_srch_find scoutfs.srch.test
echo "== update existing xattr"
setfattr -n $SXA -v 2 "$T_D0/update" 2>&1 | t_filter_fs
diff_srch_find $SXA
setfattr -n scoutfs.srch.test -v 2 "$T_D0/update" 2>&1 | t_filter_fs
diff_srch_find scoutfs.srch.test
echo "== remove an xattr"
setfattr -x $SXA "$T_D0/create" 2>&1 | t_filter_fs
diff_srch_find $SXA
setfattr -x scoutfs.srch.test "$T_D0/create" 2>&1 | t_filter_fs
diff_srch_find scoutfs.srch.test
echo "== remove xattr with files"
rm -f "$T_D0/"{create,update}
diff_srch_find $SXA
diff_srch_find scoutfs.srch.test
echo "== trigger small log merges by rotating single block with unmount"
sv=$(t_server_nr)
@@ -57,7 +56,7 @@ while [ "$i" -lt "8" ]; do
eval path="\$T_D${nr}/single-block-$i"
touch "$path"
setfattr -n $SXA -v $i "$path"
setfattr -n scoutfs.srch.single-block-logs -v $i "$path"
t_umount $nr
t_mount $nr
@@ -66,51 +65,51 @@ while [ "$i" -lt "8" ]; do
done
# wait for srch compaction worker delay
sleep 10
find "$T_D0" -type f -name 'single-block-*' -delete
rm -rf "$T_D0/single-block-*"
echo "== create entries in current log"
DIR="$T_D0/dir"
NR=$((LOG / 4))
mkdir -p "$DIR"
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
diff_srch_find $SXA
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== delete small fraction"
seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x $SXA
diff_srch_find $SXA
seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== remove files"
rm -rf "$DIR"
diff_srch_find $SXA
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== create entries that exceed one log"
NR=$((LOG * 3 / 2))
mkdir -p "$DIR"
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
diff_srch_find $SXA
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== delete fractions in phases"
for i in $(seq 1 3); do
seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x $SXA
diff_srch_find $SXA
seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
diff_srch_find scoutfs.srch.scoutfs_bcp
done
echo "== remove files"
rm -rf "$DIR"
diff_srch_find $SXA
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== create entries for exceed search entry limit"
NR=$((LIM * 3 / 2))
mkdir -p "$DIR"
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
diff_srch_find $SXA
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== delete half"
seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x $SXA
diff_srch_find $SXA
seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== entirely remove third batch"
rm -rf "$DIR"
diff_srch_find $SXA
diff_srch_find scoutfs.srch.scoutfs_bcp
t_pass

View File

@@ -1,90 +0,0 @@
#
# There was a bug where srch file compaction could get stuck if a
# partial compaction finished at the specific _SAFE_BYTES offset in a
# block. Resuming from that position would return an error and
# compaction would stop making forward progress.
#
# We use triggers to pad the output of log compaction to end on the safe
# offset and then cause compaction of those padded inputs to stop at the
# safe offset. Continuation will either succeed or return errors.
#
# forcing rotation, so just a few
NR=10
SEQF="%.20g"
COMPACT_NR=4
echo "== initialize per-mount values"
declare -a err
declare -a compact_delay
for nr in $(t_fs_nrs); do
err[$nr]=$(t_counter srch_compact_error $nr)
compact_delay[$nr]=$(cat $(t_sysfs_path $nr)/srch/compact_delay_ms)
done
restore_compact_delay()
{
for nr in $(t_fs_nrs); do
echo ${compact_delay[$nr]} > $(t_sysfs_path $nr)/srch/compact_delay_ms
done
}
trap restore_compact_delay EXIT
echo "== arm compaction triggers"
for nr in $(t_fs_nrs); do
t_trigger_arm srch_compact_logs_pad_safe $nr
t_trigger_arm srch_merge_stop_safe $nr
done
echo "== compact more often"
for nr in $(t_fs_nrs); do
echo 1000 > $(t_sysfs_path $nr)/srch/compact_delay_ms
done
echo "== create padded sorted inputs by forcing log rotation"
sv=$(t_server_nr)
for i in $(seq 1 $COMPACT_NR); do
for j in $(seq 1 $COMPACT_NR); do
t_trigger_arm srch_force_log_rotate $sv
seq -f "f-$i-$j-$SEQF" 1 10 | \
bulk_create_paths -X "scoutfs.srch.t-srch-safe-merge-pos" -d "$T_D0" > \
/dev/null
sync
test "$(t_trigger_get srch_force_log_rotate $sv)" == "0" || \
t_fail "srch_force_log_rotate didn't trigger"
done
padded=0
while test $padded == 0 && sleep .5; do
for nr in $(t_fs_nrs); do
if [ "$(t_trigger_get srch_compact_logs_pad_safe $nr)" == "0" ]; then
t_trigger_arm srch_compact_logs_pad_safe $nr
padded=1
break
fi
test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
t_fail "srch_compact_error counter increased on mount $nr"
done
done
done
echo "== compaction of padded should stop at safe"
sleep 2
for nr in $(t_fs_nrs); do
if [ "$(t_trigger_get srch_merge_stop_safe $nr)" == "0" ]; then
break
fi
done
echo "== verify no compaction errors"
sleep 2
for nr in $(t_fs_nrs); do
test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
t_fail "srch_compact_error counter increased on mount $nr"
done
echo "== cleanup"
find "$T_D0" -type f -name 'f-*' -delete
t_pass

View File

@@ -55,19 +55,6 @@ with initial sparse regions (perhaps by multiple threads writing to
different regions) and wasted space isn't an issue (perhaps because the
file population contains few small files).
.TP
.B log_merge_wait_timeout_ms=<number>
This option sets the amount of time, in milliseconds, that log merge
creation can wait before timing out. This setting is per-mount, only
changes the behavior of that mount, and only affects the server when it
is running in that mount.
.sp
This determines how long it may take for mounts to synchronize
committing their log trees to create a log merge operation. Setting it
too high can create long latencies in the event that a mount takes a
long time to commit its log. Setting it too low can result in the
creation of excessive numbers of log trees that are never merged. The
default is 500 and it cannot be less than 100 or greater than 60000.
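For example, a mount willing to tolerate longer synchronization delays might use something like the following (illustrative devices and value, combined with the metadev_path option described below):

    mount -t scoutfs -o metadev_path=/dev/vda,log_merge_wait_timeout_ms=2000 /dev/vdb /mnt/test.0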
.TP
.B metadev_path=<device>
The metadev_path option specifies the path to the block device that
contains the filesystem's metadata.

View File

@@ -38,6 +38,7 @@ struct prepare_empty_data_dev_args {
char *meta_device;
char *data_device;
bool check;
bool force;
};
static int do_prepare_empty_data_dev(struct prepare_empty_data_dev_args *args)
@@ -77,20 +78,22 @@ static int do_prepare_empty_data_dev(struct prepare_empty_data_dev_args *args)
goto out;
}
ret = meta_super_in_use(meta_fd, meta_super);
if (ret < 0) {
if (ret == -EBUSY)
fprintf(stderr, "The filesystem must be fully recovered and cleanly unmounted to determine if the data device is empty.\n");
goto out;
}
if (!args->force) {
ret = meta_super_in_use(meta_fd, meta_super);
if (ret < 0) {
if (ret == -EBUSY)
fprintf(stderr, "The filesystem must be fully recovered and cleanly unmounted to determine if the data device is empty.\n");
goto out;
}
in_use = (le64_to_cpu(meta_super->total_data_blocks) - SCOUTFS_DATA_DEV_START_BLKNO) -
le64_to_cpu(meta_super->data_alloc.total_len);
if (in_use) {
fprintf(stderr, "Data block allocator metadata shows "SIZE_FMT" data blocks used by files. They must be removed, truncated, or released before a new empty data device can be used.\n",
SIZE_ARGS(in_use, SCOUTFS_BLOCK_SM_SIZE));
ret = -EINVAL;
goto out;
in_use = (le64_to_cpu(meta_super->total_data_blocks) - SCOUTFS_DATA_DEV_START_BLKNO) -
le64_to_cpu(meta_super->data_alloc.total_len);
if (in_use) {
fprintf(stderr, "Data block allocator metadata shows "SIZE_FMT" data blocks used by files. They must be removed, truncated, or released before a new empty data device can be used.\n",
SIZE_ARGS(in_use, SCOUTFS_BLOCK_SM_SIZE));
ret = -EINVAL;
goto out;
}
}
if (args->data_device) {
@@ -193,6 +196,9 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
case 'c':
args->check = true;
break;
case 'f':
args->force = true;
break;
case ARGP_KEY_ARG:
if (!args->meta_device)
args->meta_device = strdup_or_error(state, arg);
@@ -216,6 +222,7 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
static struct argp_option options[] = {
{ "check", 'c', NULL, 0, "Only check for errors and do not write", },
{ "force", 'f', NULL, 0, "Do not check that super is in use, nor if blocks are in use",},
{ NULL }
};
@@ -230,6 +237,7 @@ static int prepare_empty_data_dev_cmd(int argc, char *argv[])
{
struct prepare_empty_data_dev_args prepare_empty_data_dev_args = {
.check = false,
.force = false,
};
int ret;
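With the force flag set, both of the checks above are skipped so a replacement data device can be prepared even when the super appears in use or blocks look allocated. Assuming the subcommand spelling matches the function names here and that the meta device is the first positional argument (both inferred, not confirmed by this diff), an invocation would look roughly like:

    # illustrative placeholders; argument order inferred from parse_opt above
    scoutfs prepare-empty-data-device --force <meta_device> <new_data_device>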