Compare commits

..

34 Commits

Author SHA1 Message Date
Zach Brown
c3c4b08038 v1.20 Release
Finish the release notes for the 1.20 release.

Signed-off-by: Zach Brown <zab@versity.com>
2024-04-22 13:20:42 -07:00
Zach Brown
0519830229 Merge pull request #165 from versity/greg/kmod-uninstall-cleanup
More cleanly drive weak-modules on install/uninstall
2024-04-11 14:32:06 -07:00
Greg Cymbalski
4d6e1a14ae More safely install/uninstall with weak-modules
This addresses some minor issues with how we drive the weak-modules
infrastructure, which handles running on kernels the module was not
explicitly built for.

For one, we now drive weak-modules at install-time more explicitly (it
was adding symlinks for all modules into the right place for the running
kernel, whereas now it only handles that for scoutfs against all
installed kernels).

Also we no longer leave stale modules on the filesystem after an
uninstall/upgrade, similar to what's done for vsm's kmods right now.
RPM's pre/postinstall scriptlets are used to drive weak-modules to clean
things up.

Note that this (intentionally) does not (re)generate initrds of any
kind.

Finally, this was tested on both the native kernel version and on
updates that would need the migrated modules. As a result, installs are
a little quicker, the module still gets migrated successfully, and
uninstalls correctly remove (only) the packaged module.
2024-04-11 13:20:50 -07:00
Greg Cymbalski
fc3e061ea8 Merge pull request #164 from versity/greg/preserve-git-describe
Encode git info into spec to keep git info in final kmod
2024-03-29 13:48:33 -07:00
Greg Cymbalski
a4bc3fb27d Capture git info at spec creation time, pass into make
2024-02-05 15:44:10 -08:00
Zach Brown
67990a7007 Merge pull request #162 from versity/zab/v1.19
v1.19 Release
2024-01-30 15:46:49 -08:00
Zach Brown
ba819be8f9 v1.19 Release
Finish the release notes for the 1.19 release.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-30 12:11:23 -08:00
Zach Brown
1b103184ca Merge pull request #161 from versity/zab/merge_timeout_option_fix
Correctly set the log_merge_wait_timeout_ms option
2024-01-30 12:07:10 -08:00
Zach Brown
c3890abd7b Correctly set the log_merge_wait_timeout_ms option
The initial code for setting the timeout used the wrong parsed variable.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-30 12:01:35 -08:00
Zach Brown
5ab38bfa48 Merge pull request #160 from versity/zab/log_merging_speedups
Zab/log merging speedups
2024-01-29 12:26:55 -08:00
Zach Brown
e9ad61b444 Delete multiple log trees items per server commit
server_log_merge_free_work() is responsible for freeing all the input
log trees for a log merge operation that has finished.  It looks for the
next item to free, frees the log btree it references, and then deletes
the item.  It was doing this with a full server commit for each item,
which can take an agonizingly long time.

This changes it to perform multiple deletions in a commit as long as
there's plenty of alloc space.  The moment the commit gets low it
applies the commit and opens a new one.  This sped up the deletion of a
few hundred thousand log tree items from hours to seconds.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:30:17 -08:00
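
A rough userspace sketch of the batching described above; hold_commit(),
apply_commit(), alloc_is_low() and the budget are invented stand-ins for
the server's commit and allocator machinery, not the real API:

#include <stdbool.h>
#include <stdio.h>

/* invented stand-ins for the server's commit and allocator machinery */
static int commits;
static int budget;

static void hold_commit(void)      { budget = 8; }
static void apply_commit(void)     { commits++; }
static bool alloc_is_low(void)     { return budget <= 1; }
static void delete_one_item(void)  { budget--; }

int main(void)
{
        int nr_items = 20;
        int nr;

        hold_commit();
        for (nr = 0; nr < nr_items; nr++) {
                delete_one_item();
                /* only pay for a commit once the current one runs low */
                if (alloc_is_low()) {
                        apply_commit();
                        hold_commit();
                }
        }
        apply_commit();

        printf("%d items freed in %d commits\n", nr_items, commits);
        return 0;
}

The point is simply that the commit cost is paid once per batch of
deletions rather than once per deleted log trees item.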
Zach Brown
91bbf90f71 Don't pin input btrees when merging
The btree_merge code was pinning leaf blocks for all input btrees as it
iterated over them.  This doesn't work when there are a very large
number of input btrees.  It can run out of memory trying to hold a
reference to a 64KiB leaf block for each input root.

This reworks the btree merging code.  It reads a window of blocks from
all input trees to get a set of merged items.  It can take multiple
passes to complete the merge but by setting the merge window large
enough this overhead is reduced.  Merging now consumes a fixed amount of
memory rather than using memory proportional to the number of input
btrees.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:30:17 -08:00
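
A rough userspace illustration of the fixed-window merge; the integer
inputs, window_insert() and the tiny WINDOW constant are made up for the
sketch, but it shows how a bounded buffer plus multiple passes keeps
memory constant no matter how many sorted inputs there are:

#include <stdio.h>
#include <limits.h>

#define WINDOW 4   /* bounded number of merged items held at once */

/* three sorted "input btrees" */
static const int in_a[] = { 1, 4, 7, 10, 13 };
static const int in_b[] = { 2, 4, 8, 11 };
static const int in_c[] = { 3, 6, 9, 12, 14 };
static const struct { const int *v; int n; } inputs[] = {
        { in_a, 5 }, { in_b, 4 }, { in_c, 5 },
};

/* insert val into a sorted window of at most WINDOW entries, ignoring dups */
static int window_insert(int *win, int cnt, int val)
{
        int i, j;

        for (i = 0; i < cnt; i++) {
                if (win[i] == val)
                        return cnt;
                if (win[i] > val)
                        break;
        }
        if (cnt == WINDOW) {
                if (i == WINDOW)
                        return cnt;     /* larger than everything kept */
                cnt--;                  /* drop the largest, shrink the range */
        }
        for (j = cnt; j > i; j--)
                win[j] = win[j - 1];
        win[i] = val;
        return cnt + 1;
}

int main(void)
{
        int win[WINDOW];
        int start = INT_MIN;
        int cnt, i, k;

        for (;;) {
                /* read one window's worth of items >= start from every input */
                cnt = 0;
                for (i = 0; i < 3; i++)
                        for (k = 0; k < inputs[i].n; k++)
                                if (inputs[i].v[k] >= start)
                                        cnt = window_insert(win, cnt, inputs[i].v[k]);
                if (cnt == 0)
                        break;
                for (i = 0; i < cnt; i++)
                        printf("%d ", win[i]);  /* "merge" this window */
                start = win[cnt - 1] + 1;       /* next pass resumes past the window */
        }
        printf("\n");
        return 0;
}

Dropping the largest buffered value when the window is full shrinks the
effective end of the pass; the next pass re-reads from just past the
last emitted value, so nothing is lost, it just takes more passes.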
Zach Brown
b5630f540d Add tracing of the log merge finalizing decision
Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:30:17 -08:00
Zach Brown
90a4c82363 Make log merge wait timeout tunable
Add a mount option for the amount of time that log merge creation can
wait before giving up.  We add some counters so we can see how often
the timeout is being hit and what the average successful wait time is.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:25:56 -08:00
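
One plausible shape for the wait loop that this option bounds, as a
userspace approximation: merge_can_start(), the simulated millisecond
clock, and the example timeouts are assumptions, while the POLL_* values
mirror the FINALIZE_POLL_* defines in the server.c hunk further down.

#include <stdbool.h>
#include <stdio.h>

#define POLL_MIN_DELAY_MS       5
#define POLL_MAX_DELAY_MS       100
#define POLL_GROWTH_PCT         150

/* hypothetical condition we're polling for; flips true after ~60ms here */
static bool merge_can_start(long elapsed_ms) { return elapsed_ms >= 60; }

/* returns elapsed ms on success, -1 if wait_timeout_ms expired first */
static long wait_for_merge(long wait_timeout_ms)
{
        long elapsed = 0;
        long delay = POLL_MIN_DELAY_MS;

        while (!merge_can_start(elapsed)) {
                if (elapsed >= wait_timeout_ms)
                        return -1;      /* would bump the log_merge_wait_timeout counter */
                elapsed += delay;       /* stand-in for msleep(delay) */
                delay = delay * POLL_GROWTH_PCT / 100;
                if (delay > POLL_MAX_DELAY_MS)
                        delay = POLL_MAX_DELAY_MS;
        }
        return elapsed;
}

int main(void)
{
        printf("succeeded after %ld ms (timeout 500 ms)\n", wait_for_merge(500));
        printf("result with 20 ms timeout: %ld\n", wait_for_merge(20));
        return 0;
}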
Zach Brown
f654fa0fda Send syncs once when starting to merge
The server sends sync requests to clients when it sees that they have
open log trees that need to be committed for log merging to proceed.

These are currently sent in the context of each client's get_log_trees
request, so a single client can have sync requests queued to it from
every client's request.  Depending on message delivery and commit
latencies, this can create a sync storm.

The server's sends are reliable and the open commits are marked with the
seq when they opened.  It's easy for us to record having sent syncs to
all open commits so that future attempts can be avoided.  Later open
commits will have higher seqs and will get a new round of syncs sent.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:25:20 -08:00
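
A toy model of the send-once bookkeeping: the server remembers the seq
up to which syncs have already been requested (finalize_sent_seq in the
server.c hunk below) and skips clients whose open commit is at or below
it, so only newer open commits trigger another round.  The client struct
and names here are invented for the sketch.

#include <stdio.h>

/* hypothetical per-client state: seq of its currently open commit */
struct client {
        const char *name;
        unsigned long open_seq;
};

static unsigned long finalize_sent_seq; /* highest seq already covered by syncs */
static unsigned long server_seq = 100;

static void send_syncs(struct client *clients, int nr)
{
        int i;

        for (i = 0; i < nr; i++) {
                /* open commits at or below the recorded seq were already asked to sync */
                if (clients[i].open_seq <= finalize_sent_seq)
                        continue;
                printf("sync request -> %s (open seq %lu)\n",
                       clients[i].name, clients[i].open_seq);
        }
        finalize_sent_seq = server_seq;
}

int main(void)
{
        struct client clients[] = {
                { "mount-a", 90 }, { "mount-b", 95 },
        };

        send_syncs(clients, 2); /* first pass: both get a sync request */
        send_syncs(clients, 2); /* second pass: nothing resent */

        server_seq = 200;
        clients[1].open_seq = 150;      /* mount-b opened a newer commit */
        send_syncs(clients, 2);         /* only mount-b gets a new request */
        return 0;
}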
Zach Brown
50168a2d2a Check each client's last log item for stable seq
The server was checking all client log_trees items to search for the
lowest commit seq that was still open.  This can be expensive when there
are a lot of finalized log_trees items that won't have open seqs.  Only
the last log_trees item for each client rid can be open, and the items
are sorted by rid and nr, so we can easily check only the last item for
each client rid.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:24:50 -08:00
Zach Brown
3c0616524a Only search last log_trees per rid for finalizing
During get_log_trees the server checks log_trees items to see if it
should start a log merge operation.  It did this by iterating over all
log_trees items and there can be quite a lot of them.

It doesn't need to see all of the items.  It only needs to see the most
recent log_trees item for each mount.  That's enough to make the
decisions that start the log merging process.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:23:59 -08:00
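
Both this commit and the previous one rely on visiting only the newest
log_trees item per client rid.  A userspace sketch of that reverse
iteration, with a sorted array and prev_item() standing in for the btree
and the for_each_rid_last_lt() helper shown in the server.c hunk below:

#include <stdio.h>

/* log_trees items keyed by (rid, nr), sorted ascending as in the btree */
struct lt_item {
        unsigned long rid;
        unsigned long nr;
};

static const struct lt_item items[] = {
        { 1, 1 }, { 1, 2 }, { 1, 3 },
        { 2, 1 }, { 2, 2 },
        { 3, 1 }, { 3, 2 }, { 3, 3 }, { 3, 4 },
};
#define NR_ITEMS (sizeof(items) / sizeof(items[0]))

/* btree_prev stand-in: greatest item whose key is <= (rid, nr), or -1 */
static int prev_item(unsigned long rid, unsigned long nr)
{
        int i;

        for (i = NR_ITEMS - 1; i >= 0; i--) {
                if (items[i].rid < rid ||
                    (items[i].rid == rid && items[i].nr <= nr))
                        return i;
        }
        return -1;
}

int main(void)
{
        unsigned long rid = ~0UL, nr = ~0UL;
        int i;

        /* walk from the greatest key, visiting only the last item of each rid */
        while ((i = prev_item(rid, nr)) >= 0) {
                printf("rid %lu: last nr %lu\n", items[i].rid, items[i].nr);
                if (items[i].rid == 0)
                        break;
                /* jump just before (rid, 0) so the next step lands on the previous rid */
                rid = items[i].rid - 1;
                nr = ~0UL;
        }
        return 0;
}

Because the items sort by (rid, nr), stepping to just before (rid, 0)
after each hit lands on the previous rid's greatest nr, so the walk
touches one item per mount instead of every finalized item.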
Zach Brown
8d3e6883c6 Merge pull request #159 from versity/auke/trans_hold
Fix ret output for scoutfs_trans_hold trace pt.
2024-01-09 09:23:32 -08:00
Auke Kok
8747dae61c Fix ret output for scoutfs_trans_hold trace pt.
Signed-off-by: Auke Kok <auke.kok@versity.com>
2024-01-08 16:27:41 -08:00
Zach Brown
fffcf4a9bb Merge pull request #158 from versity/zab/kasan_stack_oob_get_reg
Ignore spurious KASAN unwind warning
2023-11-22 10:04:18 -08:00
Zach Brown
b552406427 Ignore spurious KASAN unwind warning
KASAN could raise a spurious warning if the unwinder started in code
without ORC metadata and tried to access the KASAN stack frame
redzones.  This was fixed upstream but we can still rarely see it in
older kernels.  We can ignore these messages.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-21 12:25:16 -08:00
Zach Brown
d812599e6b Merge pull request #157 from versity/zab/dmsetup_test_devices
Zab/dmsetup test devices
2023-11-21 10:13:02 -08:00
Zach Brown
03ab5cedb6 clean up createmany-parallel-mounts test
This test is trying to make sure that concurrent work isn't much, much
slower than individual work.  It does this by timing the creation of a
bunch of files in a dir on one mount and then timing the same work done
in two mounts concurrently.  But it messed up the concurrency pretty
badly.

It had the concurrent createmany tasks creating files with a full path.
That means that every create is trying to read all the parent
directories.  The way inode number allocation works means that one of
the mounts is likely to be getting a write lock that includes a shared
parent.  This created a ton of cluster lock contention between the two
tasks.

Then it didn't sync the creates between phases.  It could be
accidentally recording the time it took to write out the dirty single
creates as time taken during the parallel creates.

By syncing between phases and having the createmany tasks create files
relative to their per-mount directories we actually perform concurrent
work and test that we're not creating contention outside of the task
load.

This became a problem as we switched from loopback devices to device
mapper devices.  The loopback writers were using buffered writes so we
were masking the io cost of constantly invalidating and refilling the
item cache by turning the reads into memory copies out of the page
cache.

While we're in here we actually clean up the created files and then use
t_fail to fail the test while the files still exist so they can be
examined.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-15 15:12:57 -08:00
Zach Brown
2b94cd6468 Add loop module kernel message filter
Now that we're not setting up per-mount loopback devices, the loop
module might not be loaded until tests are running.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-15 13:39:38 -08:00
Zach Brown
5507ee5351 Use device-mapper for per-mount test devices
We don't directly mount the underlying devices for each mount because
the kernel notices multiple mounts and doesn't set up a new super block
for each.

Previously the script used loopback devices to create the local shared
block construct 'cause it was easy.  This introduced corruption of
blocks that saw concurrent read and write IOs.  The buffered kernel file
IO paths that loopback eventually degrades into by default (via splice)
could have buffered readers copying out of pages without the page lock
while writers modified the page.  This manifested as occasional crc
failures of blocks that we knowingly issue concurrent reads and writes
to from multiple mounts (the quorum and super blocks).

This changes the script to use device-mapper linear passthrough devices.
Their IOs don't hit a caching layer and don't provide an opportunity to
corrupt blocks.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-15 13:39:38 -08:00
Zach Brown
1600a121d9 Merge pull request #156 from versity/zab/large_fragmented_free_hung_task
Extend hung task timeout for large-fragmented-free
2023-11-15 09:49:13 -08:00
Zach Brown
6daf24ff37 Extend hung task timeout for large-fragmented-free
Our large fragmented free test creates pathologically fragmented file
extents which are as expensive as possible to free.  We know that
debugging kernels can take a long time to do this, so we extend the
hung task timeout.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-14 15:01:37 -08:00
Zach Brown
cd5d9ff3e0 Merge pull request #154 from versity/zab/srch_test_fixes
Zab/srch test fixes
2023-11-13 09:47:46 -08:00
Zach Brown
d94e49eb63 Fix quoted glob in srch-basic-functionality
One of the phases of this test wanted to delete files but got the glob
quoting wrong.  This didn't matter for the original test, but when we
changed the test to use its own xattr name, those existing undeleted
files got confused with other files in later phases of the test.

This changes the test to delete the files with a more reliable find
pattern instead of using shell glob expansion.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-09 14:16:36 -08:00
Zach Brown
1dbe408539 Add tracing of srch compact struct communication
Signed-off-by: Zach Brown <zab@versity.com>
2023-11-09 14:16:33 -08:00
Zach Brown
bf21699ad7 bulk_create_paths test tool takes xattr name
Previously the bulk_create_paths test tool used the same xattr name for
each category of xattrs it was creating.

This created a problem where two tests got their xattrs confused with
each other.  The first test created a bunch of srch xattrs, failed, and
didn't clean up after itself.  The second test saw these search xattrs
as its own and got very confused when there were far more srch xattrs
than it thought it had created.

This lets each test specify the srch xattr names that are created by
bulk_create_paths so that tests can work with their xattrs independent
of each other.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-09 14:15:44 -08:00
Zach Brown
c7c67a173d Specifically wait for compaction in srch test
We just added a test to try and get srch compaction stuck by having an
input file continue at a specific offset.  To exercise the bug the test
needs to perform 6 compactions.  It needs to merge 4 sets of logs into 4
sorted files, it needs to make partial progress merging those 4 sorted
files into another file, and then finally attempt to continue compacting
from the partial progress offset.

The first version of the test didn't necessarily ensure that these
compactions happened.  It created far too many log files and then just
waited for time to pass.  If the host was slow, the mounts might not
make it through the initial logs to try and compact the sorted files.
The triggers wouldn't fire and the test would fail.

These changes much more carefully orchestrate and watch the various
steps of compaction to make sure that we trigger the bug.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-09 14:13:13 -08:00
Zach Brown
0d10189f58 Make srch compact request delay tunable
Add a sysfs file for getting and setting the delay between srch
compaction requests from the client.  We'll use this in testing to
ensure compaction runs promptly.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-09 14:13:07 -08:00
Zach Brown
6b88f3268e Merge pull request #153 from versity/zab/v1.18
v1.18 Release
2023-11-08 10:57:56 -08:00
24 changed files with 981 additions and 445 deletions

View File

@@ -1,6 +1,29 @@
Versity ScoutFS Release Notes
=============================
---
v1.20
\
*Apr 22, 2024*
Minor changes to packaging to better support "weak" module linking of
the kernel module, and to include git hashes in the built package. No
changes in runtime behaviour.
---
v1.19
\
*Jan 30, 2024*
Added the log\_merge\_wait\_timeout\_ms mount option to set the timeout
for creating log merge operations. The previous timeout, now the
default, was too short for some systems and was resulting in consistent
timeouts which created an excessive number of log trees waiting to be
merged.
Improved performance of many in-mount server operations when there are a
large number of log trees waiting to be merged.
---
v1.18
\

View File

@@ -12,17 +12,22 @@ else
SP = @:
endif
SCOUTFS_GIT_DESCRIBE := \
SCOUTFS_GIT_DESCRIBE ?= \
$(shell git describe --all --abbrev=6 --long 2>/dev/null || \
echo no-git)
ESCAPED_GIT_DESCRIBE := \
$(shell echo $(SCOUTFS_GIT_DESCRIBE) |sed -e 's/\//\\\//g')
RPM_GITHASH ?= $(shell git rev-parse --short HEAD)
SCOUTFS_ARGS := SCOUTFS_GIT_DESCRIBE=$(SCOUTFS_GIT_DESCRIBE) \
RPM_GITHASH=$(RPM_GITHASH) \
CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(CURDIR)/src \
EXTRA_CFLAGS="-Werror"
# - We use the git describe from tags to set up the RPM versioning
RPM_VERSION := $(shell git describe --long --tags | awk -F '-' '{gsub(/^v/,""); print $$1}')
RPM_GITHASH := $(shell git rev-parse --short HEAD)
TARFILE = scoutfs-kmod-$(RPM_VERSION).tar
@@ -41,7 +46,8 @@ modules_install:
%.spec: %.spec.in .FORCE
sed -e 's/@@VERSION@@/$(RPM_VERSION)/g' \
-e 's/@@GITHASH@@/$(RPM_GITHASH)/g' < $< > $@+
-e 's/@@GITHASH@@/$(RPM_GITHASH)/g' \
-e 's/@@GITDESCRIBE@@/$(ESCAPED_GIT_DESCRIBE)/g' < $< > $@+
mv $@+ $@

View File

@@ -1,6 +1,7 @@
%define kmod_name scoutfs
%define kmod_version @@VERSION@@
%define kmod_git_hash @@GITHASH@@
%define kmod_git_describe @@GITDESCRIBE@@
%define pkg_date %(date +%%Y%%m%%d)
# Disable the building of the debug package(s).
@@ -75,7 +76,7 @@ echo "Building for kernel: %{kernel_version} flavors: '%{flavors_to_build}'"
for flavor in %flavors_to_build; do
rm -rf obj/$flavor
cp -r source obj/$flavor
make SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
make RPM_GITHASH=%{kmod_git_hash} SCOUTFS_GIT_DESCRIBE=%{kmod_git_describe} SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
done
%install
@@ -97,10 +98,21 @@ find %{buildroot} -type f -name \*.ko -exec %{__chmod} u+x \{\} \;
/lib/modules
%post
weak-modules --add-kernel --no-initramfs
echo /lib/modules/%{kversion}/%{install_mod_dir}/scoutfs.ko | weak-modules --add-modules --no-initramfs
depmod -a
%endif
%clean
rm -rf %{buildroot}
%preun
# stash our modules for postun cleanup
SCOUTFS_RPM_NAME=$(rpm -q %{name} | grep "%{version}-%{release}")
rpm -ql $SCOUTFS_RPM_NAME | grep '\.ko$' > /var/run/%{name}-modules-%{version}-%{release} || true
%postun
if [ -x /sbin/weak-modules ]; then
cat /var/run/%{name}-modules-%{version}-%{release} | /sbin/weak-modules --remove-modules --no-initramfs
fi
rm /var/run/%{name}-modules-%{version}-%{release} || true

View File

@@ -2029,187 +2029,253 @@ int scoutfs_btree_rebalance(struct super_block *sb,
key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL);
}
struct merge_pos {
struct merged_range {
struct scoutfs_key start;
struct scoutfs_key end;
struct rb_root root;
int size;
};
struct merged_item {
struct rb_node node;
struct scoutfs_btree_root *root;
struct scoutfs_block *bl;
struct scoutfs_btree_block *bt;
struct scoutfs_avl_node *avl;
struct scoutfs_key *key;
struct scoutfs_key key;
u64 seq;
u8 flags;
unsigned int val_len;
u8 *val;
u8 val[0];
};
static struct merge_pos *first_mpos(struct rb_root *root)
static inline struct merged_item *mitem_container(struct rb_node *node)
{
struct rb_node *node = rb_first(root);
if (node)
return container_of(node, struct merge_pos, node);
return node ? container_of(node, struct merged_item, node) : NULL;
}
static inline struct merged_item *first_mitem(struct rb_root *root)
{
return mitem_container(rb_first(root));
}
static inline struct merged_item *last_mitem(struct rb_root *root)
{
return mitem_container(rb_last(root));
}
static inline struct merged_item *next_mitem(struct merged_item *mitem)
{
return mitem_container(mitem ? rb_next(&mitem->node) : NULL);
}
static inline struct merged_item *prev_mitem(struct merged_item *mitem)
{
return mitem_container(mitem ? rb_prev(&mitem->node) : NULL);
}
static struct merged_item *find_mitem(struct rb_root *root, struct scoutfs_key *key,
struct rb_node **parent_ret, struct rb_node ***link_ret)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct merged_item *mitem;
int cmp;
while (*node) {
parent = *node;
mitem = container_of(*node, struct merged_item, node);
cmp = scoutfs_key_compare(key, &mitem->key);
if (cmp < 0) {
node = &(*node)->rb_left;
} else if (cmp > 0) {
node = &(*node)->rb_right;
} else {
*parent_ret = NULL;
*link_ret = NULL;
return mitem;
}
}
*parent_ret = parent;
*link_ret = node;
return NULL;
}
static struct merge_pos *next_mpos(struct merge_pos *mpos)
static void insert_mitem(struct merged_range *rng, struct merged_item *mitem,
struct rb_node *parent, struct rb_node **link)
{
struct rb_node *node;
if (mpos && (node = rb_next(&mpos->node)))
return container_of(node, struct merge_pos, node);
else
return NULL;
rb_link_node(&mitem->node, parent, link);
rb_insert_color(&mitem->node, &rng->root);
rng->size += item_len_bytes(mitem->val_len);
}
static void free_mpos(struct super_block *sb, struct merge_pos *mpos)
static void replace_mitem(struct merged_range *rng, struct merged_item *victim,
struct merged_item *new)
{
scoutfs_block_put(sb, mpos->bl);
kfree(mpos);
rb_replace_node(&victim->node, &new->node, &rng->root);
RB_CLEAR_NODE(&victim->node);
rng->size -= item_len_bytes(victim->val_len);
rng->size += item_len_bytes(new->val_len);
}
static void insert_mpos(struct rb_root *pos_root, struct merge_pos *ins)
static void free_mitem(struct merged_range *rng, struct merged_item *mitem)
{
struct rb_node **node = &pos_root->rb_node;
struct rb_node *parent = NULL;
struct merge_pos *mpos;
int cmp;
if (IS_ERR_OR_NULL(mitem))
return;
parent = NULL;
while (*node) {
parent = *node;
mpos = container_of(*node, struct merge_pos, node);
/* sort merge items by key then newest to oldest */
cmp = scoutfs_key_compare(ins->key, mpos->key) ?:
-scoutfs_cmp(ins->seq, mpos->seq);
if (cmp < 0)
node = &(*node)->rb_left;
else
node = &(*node)->rb_right;
if (!RB_EMPTY_NODE(&mitem->node)) {
rng->size -= item_len_bytes(mitem->val_len);
rb_erase(&mitem->node, &rng->root);
}
rb_link_node(&ins->node, parent, node);
rb_insert_color(&ins->node, pos_root);
kfree(mitem);
}
static void trim_range_size(struct merged_range *rng, int merge_window)
{
struct merged_item *mitem;
struct merged_item *tmp;
mitem = last_mitem(&rng->root);
while (mitem && rng->size > merge_window) {
rng->end = mitem->key;
scoutfs_key_dec(&rng->end);
tmp = mitem;
mitem = prev_mitem(mitem);
free_mitem(rng, tmp);
}
}
static void trim_range_end(struct merged_range *rng)
{
struct merged_item *mitem;
struct merged_item *tmp;
mitem = last_mitem(&rng->root);
while (mitem && scoutfs_key_compare(&mitem->key, &rng->end) > 0) {
tmp = mitem;
mitem = prev_mitem(mitem);
free_mitem(rng, tmp);
}
}
/*
* Find the next item in the merge_pos root in the caller's range and
* insert it into the rbtree sorted by key and version so that merging
* can find the next newest item at the front of the rbtree. We free
* the mpos on error or if there are no more items in the range.
* Record and combine logged items from log roots for merging with the
* writable destination root. The caller is responsible for trimming
* the range if it gets too large or if the key range shrinks.
*/
static int reset_mpos(struct super_block *sb, struct rb_root *pos_root, struct merge_pos *mpos,
struct scoutfs_key *start, struct scoutfs_key *end)
static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags,
void *val, int val_len, void *arg)
{
struct scoutfs_btree_item *item;
struct scoutfs_avl_node *next;
struct btree_walk_key_range kr;
struct scoutfs_key walk_key;
int ret = 0;
struct merged_range *rng = arg;
struct merged_item *mitem;
struct merged_item *found;
struct rb_node *parent;
struct rb_node **link;
int ret;
/* always erase before freeing or inserting */
if (!RB_EMPTY_NODE(&mpos->node)) {
rb_erase(&mpos->node, pos_root);
RB_CLEAR_NODE(&mpos->node);
}
/*
* advance to next item via the avl tree. The caller's pos is
* only ever incremented past the last key so we can use next to
* iterate rather than using search to skip past multiple items.
*/
if (mpos->avl)
mpos->avl = scoutfs_avl_next(&mpos->bt->item_root, mpos->avl);
/* find the next leaf with the key if we run out of items */
walk_key = *start;
while (!mpos->avl && !scoutfs_key_is_zeros(&walk_key)) {
scoutfs_block_put(sb, mpos->bl);
mpos->bl = NULL;
ret = btree_walk(sb, NULL, NULL, mpos->root, BTW_NEXT, &walk_key,
0, &mpos->bl, &kr, NULL);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
free_mpos(sb, mpos);
found = find_mitem(&rng->root, key, &parent, &link);
if (found) {
ret = scoutfs_forest_combine_deltas(key, found->val, found->val_len, val, val_len);
if (ret < 0)
goto out;
if (ret > 0) {
if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
free_mitem(rng, found);
}
ret = 0;
goto out;
}
mpos->bt = mpos->bl->data;
mpos->avl = scoutfs_avl_search(&mpos->bt->item_root, cmp_key_item,
start, NULL, NULL, &next, NULL) ?: next;
if (mpos->avl == NULL)
walk_key = kr.iter_next;
if (found->seq >= seq) {
ret = 0;
goto out;
}
}
/* see if we're out of items within the range */
item = node_item(mpos->avl);
if (!item || scoutfs_key_compare(item_key(item), end) > 0) {
free_mpos(sb, mpos);
ret = 0;
mitem = kmalloc(offsetof(struct merged_item, val[val_len]), GFP_NOFS);
if (!mitem) {
ret = -ENOMEM;
goto out;
}
/* insert the next item within range at its version */
mpos->key = item_key(item);
mpos->seq = le64_to_cpu(item->seq);
mpos->flags = item->flags;
mpos->val_len = item_val_len(item);
mpos->val = item_val(mpos->bt, item);
mitem->key = *key;
mitem->seq = seq;
mitem->flags = flags;
mitem->val_len = val_len;
if (val_len)
memcpy(mitem->val, val, val_len);
if (found) {
replace_mitem(rng, found, mitem);
free_mitem(rng, found);
} else {
insert_mitem(rng, mitem, parent, link);
}
insert_mpos(pos_root, mpos);
ret = 0;
out:
return ret;
}
/*
* The caller has reset all the merge positions for all the input log
* btree roots and wants the next logged item it should try and merge
* with the items in the fs_root.
* Read a range of merged items. The caller has set the key bounds of
* the range. We read a merge window's worth of items from blocks in
* each input btree.
*
* We look ahead in the logged item stream to see if we should merge any
* older logged delta items into one result for the caller. We also
* take this opportunity to skip and reset the mpos for any older
* versions of the first item.
* The caller can only use the smallest range that overlaps with all the
* blocks that we read. We start reading from the range's start key so
* it will always be present and we don't need to adjust it. The final
* block we read from each input might not cover the range's end so it
* needs to be adjusted.
*
* The end range can also shrink if we have to drop items because the
* items exceeded the merge window size.
*/
static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
struct scoutfs_key *end, struct merge_pos **mpos_ret)
static int read_merged_range(struct super_block *sb, struct merged_range *rng,
struct list_head *inputs, int merge_window)
{
struct merge_pos *mpos;
struct merge_pos *next;
struct scoutfs_btree_root_head *rhead;
struct scoutfs_key start;
struct scoutfs_key end;
struct scoutfs_key key;
int ret = 0;
int i;
while ((mpos = first_mpos(pos_root)) && (next = next_mpos(mpos)) &&
!scoutfs_key_compare(mpos->key, next->key)) {
list_for_each_entry(rhead, inputs, head) {
key = rng->start;
ret = scoutfs_forest_combine_deltas(mpos->key, mpos->val, mpos->val_len,
next->val, next->val_len);
if (ret < 0)
break;
/* reset advances to the next item */
key = *mpos->key;
scoutfs_key_inc(&key);
/* always skip next combined or older version */
ret = reset_mpos(sb, pos_root, next, &key, end);
if (ret < 0)
break;
if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
/* if merging resulted in no info, skip current */
ret = reset_mpos(sb, pos_root, mpos, &key, end);
for (i = 0; i < merge_window; i += SCOUTFS_BLOCK_LG_SIZE) {
start = key;
end = rng->end;
ret = scoutfs_btree_read_items(sb, &rhead->root, &key, &start, &end,
merge_read_item, rng);
if (ret < 0)
goto out;
if (scoutfs_key_compare(&end, &rng->end) >= 0)
break;
key = end;
scoutfs_key_inc(&key);
}
if (scoutfs_key_compare(&end, &rng->end) < 0) {
rng->end = end;
trim_range_end(rng);
}
if (rng->size > merge_window)
trim_range_size(rng, merge_window);
}
*mpos_ret = mpos;
trace_scoutfs_btree_merge_read_range(sb, &rng->start, &rng->end, rng->size);
ret = 0;
out:
return ret;
}
@@ -2226,6 +2292,13 @@ static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
* to allocators running low or needing to join/split the parent.
* *next_ret is set to the next key which hasn't been merged so that the
* caller can retry with a new allocator and subtree.
*
* The number of input roots can be immense. The merge_window specifies
* the size of the set of merged items that we'll maintain as we iterate
* over all the input roots. Once we've merged items into the window
* from all the input roots the merged input items are then merged to
* the writable destination root. It may take multiple passes of
* windows of merged items to cover the input key range.
*/
int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_alloc *alloc,
@@ -2235,18 +2308,16 @@ int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_key *next_ret,
struct scoutfs_btree_root *root,
struct list_head *inputs,
bool subtree, int dirty_limit, int alloc_low)
bool subtree, int dirty_limit, int alloc_low, int merge_window)
{
struct scoutfs_btree_root_head *rhead;
struct rb_root pos_root = RB_ROOT;
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_block *bl = NULL;
struct btree_walk_key_range kr;
struct scoutfs_avl_node *par;
struct scoutfs_key next;
struct merge_pos *mpos;
struct merge_pos *tmp;
struct merged_item *mitem;
struct merged_item *tmp;
struct merged_range rng;
int walk_val_len;
int walk_flags;
bool is_del;
@@ -2257,49 +2328,59 @@ int scoutfs_btree_merge(struct super_block *sb,
trace_scoutfs_btree_merge(sb, root, start, end);
scoutfs_inc_counter(sb, btree_merge);
list_for_each_entry(rhead, inputs, head) {
mpos = kzalloc(sizeof(*mpos), GFP_NOFS);
if (!mpos) {
ret = -ENOMEM;
goto out;
}
RB_CLEAR_NODE(&mpos->node);
mpos->root = &rhead->root;
ret = reset_mpos(sb, &pos_root, mpos, start, end);
if (ret < 0)
goto out;
}
walk_flags = BTW_DIRTY;
if (subtree)
walk_flags |= BTW_SUBTREE;
walk_val_len = 0;
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
rng.start = *start;
rng.end = *end;
rng.root = RB_ROOT;
rng.size = 0;
ret = read_merged_range(sb, &rng, inputs, merge_window);
if (ret < 0)
goto out;
for (;;) {
/* read next window as it empties (and it is possible to read an empty range) */
mitem = first_mitem(&rng.root);
if (!mitem) {
/* done if the read range hit the end */
if (scoutfs_key_compare(&rng.end, end) >= 0)
break;
/* read next batch of merged items */
rng.start = rng.end;
scoutfs_key_inc(&rng.start);
rng.end = *end;
ret = read_merged_range(sb, &rng, inputs, merge_window);
if (ret < 0)
break;
continue;
}
if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) {
scoutfs_inc_counter(sb, btree_merge_dirty_limit);
ret = -ERANGE;
*next_ret = *mpos->key;
*next_ret = mitem->key;
goto out;
}
if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) {
scoutfs_inc_counter(sb, btree_merge_alloc_low);
ret = -ERANGE;
*next_ret = *mpos->key;
*next_ret = mitem->key;
goto out;
}
scoutfs_block_put(sb, bl);
bl = NULL;
ret = btree_walk(sb, alloc, wri, root, walk_flags,
mpos->key, walk_val_len, &bl, &kr, NULL);
&mitem->key, walk_val_len, &bl, &kr, NULL);
if (ret < 0) {
if (ret == -ERANGE)
*next_ret = *mpos->key;
*next_ret = mitem->key;
goto out;
}
bt = bl->data;
@@ -2311,22 +2392,21 @@ int scoutfs_btree_merge(struct super_block *sb,
continue;
}
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
while (mitem) {
/* walk to new leaf if we exceed parent ref key */
if (scoutfs_key_compare(mpos->key, &kr.end) > 0)
if (scoutfs_key_compare(&mitem->key, &kr.end) > 0)
break;
/* see if there's an existing item */
item = leaf_item_hash_search(sb, bt, mpos->key);
is_del = !!(mpos->flags & SCOUTFS_ITEM_FLAG_DELETION);
item = leaf_item_hash_search(sb, bt, &mitem->key);
is_del = !!(mitem->flags & SCOUTFS_ITEM_FLAG_DELETION);
/* see if we're merging delta items */
if (item && !is_del)
delta = scoutfs_forest_combine_deltas(mpos->key,
delta = scoutfs_forest_combine_deltas(&mitem->key,
item_val(bt, item),
item_val_len(item),
mpos->val, mpos->val_len);
mitem->val, mitem->val_len);
else
delta = 0;
if (delta < 0) {
@@ -2338,40 +2418,38 @@ int scoutfs_btree_merge(struct super_block *sb,
scoutfs_inc_counter(sb, btree_merge_delta_null);
}
trace_scoutfs_btree_merge_items(sb, mpos->root,
mpos->key, mpos->val_len,
trace_scoutfs_btree_merge_items(sb, &mitem->key, mitem->val_len,
item ? root : NULL,
item ? item_key(item) : NULL,
item ? item_val_len(item) : 0, is_del);
/* rewalk and split if ins/update needs room */
if (!is_del && !delta && !mid_free_item_room(bt, mpos->val_len)) {
if (!is_del && !delta && !mid_free_item_room(bt, mitem->val_len)) {
walk_flags |= BTW_INSERT;
walk_val_len = mpos->val_len;
walk_val_len = mitem->val_len;
break;
}
/* insert missing non-deletion merge items */
if (!item && !is_del) {
scoutfs_avl_search(&bt->item_root,
cmp_key_item, mpos->key,
scoutfs_avl_search(&bt->item_root, cmp_key_item, &mitem->key,
&cmp, &par, NULL, NULL);
create_item(bt, mpos->key, mpos->seq, mpos->flags,
mpos->val, mpos->val_len, par, cmp);
create_item(bt, &mitem->key, mitem->seq, mitem->flags,
mitem->val, mitem->val_len, par, cmp);
scoutfs_inc_counter(sb, btree_merge_insert);
}
/* update existing items */
if (item && !is_del && !delta) {
item->seq = cpu_to_le64(mpos->seq);
item->flags = mpos->flags;
update_item_value(bt, item, mpos->val, mpos->val_len);
item->seq = cpu_to_le64(mitem->seq);
item->flags = mitem->flags;
update_item_value(bt, item, mitem->val, mitem->val_len);
scoutfs_inc_counter(sb, btree_merge_update);
}
/* update combined delta item seq */
if (delta == SCOUTFS_DELTA_COMBINED) {
item->seq = cpu_to_le64(mpos->seq);
item->seq = cpu_to_le64(mitem->seq);
}
/*
@@ -2403,21 +2481,18 @@ int scoutfs_btree_merge(struct super_block *sb,
walk_flags &= ~(BTW_INSERT | BTW_DELETE);
walk_val_len = 0;
/* finished with this key, skip any older items */
next = *mpos->key;
scoutfs_key_inc(&next);
ret = reset_mpos(sb, &pos_root, mpos, &next, end);
if (ret < 0)
goto out;
/* finished with this merged item */
tmp = mitem;
mitem = next_mitem(mitem);
free_mitem(&rng, tmp);
}
}
ret = 0;
out:
scoutfs_block_put(sb, bl);
rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) {
free_mpos(sb, mpos);
}
rbtree_postorder_for_each_entry_safe(mitem, tmp, &rng.root, node)
free_mitem(&rng, mitem);
return ret;
}

View File

@@ -119,7 +119,7 @@ int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_key *next_ret,
struct scoutfs_btree_root *root,
struct list_head *input_list,
bool subtree, int dirty_limit, int alloc_low);
bool subtree, int dirty_limit, int alloc_low, int merge_window);
int scoutfs_btree_free_blocks(struct super_block *sb,
struct scoutfs_alloc *alloc,

View File

@@ -145,6 +145,7 @@
EXPAND_COUNTER(lock_shrink_work) \
EXPAND_COUNTER(lock_unlock) \
EXPAND_COUNTER(lock_wait) \
EXPAND_COUNTER(log_merge_wait_timeout) \
EXPAND_COUNTER(net_dropped_response) \
EXPAND_COUNTER(net_send_bytes) \
EXPAND_COUNTER(net_send_error) \

View File

@@ -721,7 +721,8 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work)
ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end,
&next, &comp.root, &inputs,
!!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)),
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10);
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10,
(2 * 1024 * 1024));
if (ret == -ERANGE) {
comp.remain = next;
le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN);

View File

@@ -33,6 +33,7 @@ enum {
Opt_acl,
Opt_data_prealloc_blocks,
Opt_data_prealloc_contig_only,
Opt_log_merge_wait_timeout_ms,
Opt_metadev_path,
Opt_noacl,
Opt_orphan_scan_delay_ms,
@@ -45,6 +46,7 @@ static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
{Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"},
{Opt_metadev_path, "metadev_path=%s"},
{Opt_noacl, "noacl"},
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
@@ -113,6 +115,10 @@ static void free_options(struct scoutfs_mount_options *opts)
kfree(opts->metadev_path);
}
#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS 100UL
#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS 500
#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS (60 * MSEC_PER_SEC)
#define MIN_ORPHAN_SCAN_DELAY_MS 100UL
#define DEFAULT_ORPHAN_SCAN_DELAY_MS (10 * MSEC_PER_SEC)
#define MAX_ORPHAN_SCAN_DELAY_MS (60 * MSEC_PER_SEC)
@@ -126,11 +132,27 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
opts->data_prealloc_contig_only = 1;
opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS;
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
opts->quorum_slot_nr = -1;
}
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
{
if (ret < 0) {
scoutfs_err(sb, "failed to parse log_merge_wait_timeout_ms value");
return -EINVAL;
}
if (val < MIN_LOG_MERGE_WAIT_TIMEOUT_MS || val > MAX_LOG_MERGE_WAIT_TIMEOUT_MS) {
scoutfs_err(sb, "invalid log_merge_wait_timeout_ms value %d, must be between %lu and %lu",
val, MIN_LOG_MERGE_WAIT_TIMEOUT_MS, MAX_LOG_MERGE_WAIT_TIMEOUT_MS);
return -EINVAL;
}
return 0;
}
static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
{
if (ret < 0) {
@@ -196,6 +218,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->data_prealloc_contig_only = nr;
break;
case Opt_log_merge_wait_timeout_ms:
ret = match_int(args, &nr);
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
if (ret < 0)
return ret;
opts->log_merge_wait_timeout_ms = nr;
break;
case Opt_metadev_path:
ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
if (ret < 0)
@@ -422,6 +452,43 @@ static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj
}
SCOUTFS_ATTR_RW(data_prealloc_contig_only);
static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
struct scoutfs_mount_options opts;
scoutfs_options_read(sb, &opts);
return snprintf(buf, PAGE_SIZE, "%u", opts.log_merge_wait_timeout_ms);
}
static ssize_t log_merge_wait_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
DECLARE_OPTIONS_INFO(sb, optinf);
char nullterm[30]; /* more than enough for octal -U64_MAX */
int val;
int len;
int ret;
len = min(count, sizeof(nullterm) - 1);
memcpy(nullterm, buf, len);
nullterm[len] = '\0';
ret = kstrtoint(nullterm, 0, &val);
ret = verify_log_merge_wait_timeout_ms(sb, ret, val);
if (ret == 0) {
write_seqlock(&optinf->seqlock);
optinf->opts.log_merge_wait_timeout_ms = val;
write_sequnlock(&optinf->seqlock);
ret = count;
}
return ret;
}
SCOUTFS_ATTR_RW(log_merge_wait_timeout_ms);
static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
@@ -525,6 +592,7 @@ SCOUTFS_ATTR_RO(quorum_slot_nr);
static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms),
SCOUTFS_ATTR_PTR(metadev_path),
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),

View File

@@ -8,6 +8,7 @@
struct scoutfs_mount_options {
u64 data_prealloc_blocks;
bool data_prealloc_contig_only;
unsigned int log_merge_wait_timeout_ms;
char *metadev_path;
unsigned int orphan_scan_delay_ms;
int quorum_slot_nr;

View File

@@ -439,6 +439,7 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
SCSB_TRACE_ASSIGN(sb);
__entry->journal_info = (unsigned long)journal_info;
__entry->holders = holders;
__entry->ret = ret;
),
TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d",
@@ -1746,21 +1747,41 @@ TRACE_EVENT(scoutfs_btree_merge,
sk_trace_args(end))
);
TRACE_EVENT(scoutfs_btree_merge_read_range,
TP_PROTO(struct super_block *sb, struct scoutfs_key *start, struct scoutfs_key *end,
int size),
TP_ARGS(sb, start, end, size),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
sk_trace_define(start)
sk_trace_define(end)
__field(int, size)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
sk_trace_assign(start, start);
sk_trace_assign(end, end);
__entry->size = size;
),
TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" size %d",
SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end), __entry->size)
);
TRACE_EVENT(scoutfs_btree_merge_items,
TP_PROTO(struct super_block *sb,
struct scoutfs_btree_root *m_root,
struct scoutfs_key *m_key, int m_val_len,
struct scoutfs_btree_root *f_root,
struct scoutfs_key *f_key, int f_val_len,
int is_del),
TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
TP_ARGS(sb, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, m_root_blkno)
__field(__u64, m_root_seq)
__field(__u8, m_root_height)
sk_trace_define(m_key)
__field(int, m_val_len)
__field(__u64, f_root_blkno)
@@ -1773,10 +1794,6 @@ TRACE_EVENT(scoutfs_btree_merge_items,
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->m_root_blkno = m_root ?
le64_to_cpu(m_root->ref.blkno) : 0;
__entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0;
__entry->m_root_height = m_root ? m_root->height : 0;
sk_trace_assign(m_key, m_key);
__entry->m_val_len = m_val_len;
__entry->f_root_blkno = f_root ?
@@ -1788,11 +1805,9 @@ TRACE_EVENT(scoutfs_btree_merge_items,
__entry->is_del = !!is_del;
),
TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq,
__entry->m_root_height, sk_trace_args(m_key),
__entry->m_val_len, __entry->f_root_blkno,
__entry->f_root_seq, __entry->f_root_height,
TP_printk(SCSBF" merge item key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
SCSB_TRACE_ARGS, sk_trace_args(m_key), __entry->m_val_len,
__entry->f_root_blkno, __entry->f_root_seq, __entry->f_root_height,
sk_trace_args(f_key), __entry->f_val_len, __entry->is_del)
);
@@ -2075,6 +2090,71 @@ TRACE_EVENT(scoutfs_trans_seq_last,
SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
);
TRACE_EVENT(scoutfs_server_finalize_items,
TP_PROTO(struct super_block *sb, u64 rid, u64 item_rid, u64 item_nr, u64 item_flags,
u64 item_get_trans_seq),
TP_ARGS(sb, rid, item_rid, item_nr, item_flags, item_get_trans_seq),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(__u64, item_rid)
__field(__u64, item_nr)
__field(__u64, item_flags)
__field(__u64, item_get_trans_seq)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->item_rid = item_rid;
__entry->item_nr = item_nr;
__entry->item_flags = item_flags;
__entry->item_get_trans_seq = item_get_trans_seq;
),
TP_printk(SCSBF" rid %016llx item_rid %016llx item_nr %llu item_flags 0x%llx item_get_trans_seq %llu",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->item_rid, __entry->item_nr,
__entry->item_flags, __entry->item_get_trans_seq)
);
TRACE_EVENT(scoutfs_server_finalize_decision,
TP_PROTO(struct super_block *sb, u64 rid, bool saw_finalized, bool others_active,
bool ours_visible, bool finalize_ours, unsigned int delay_ms,
u64 finalize_sent_seq),
TP_ARGS(sb, rid, saw_finalized, others_active, ours_visible, finalize_ours, delay_ms,
finalize_sent_seq),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(bool, saw_finalized)
__field(bool, others_active)
__field(bool, ours_visible)
__field(bool, finalize_ours)
__field(unsigned int, delay_ms)
__field(__u64, finalize_sent_seq)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->saw_finalized = saw_finalized;
__entry->others_active = others_active;
__entry->ours_visible = ours_visible;
__entry->finalize_ours = finalize_ours;
__entry->delay_ms = delay_ms;
__entry->finalize_sent_seq = finalize_sent_seq;
),
TP_printk(SCSBF" rid %016llx saw_finalized %u others_active %u ours_visible %u finalize_ours %u delay_ms %u finalize_sent_seq %llu",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->saw_finalized, __entry->others_active,
__entry->ours_visible, __entry->finalize_ours, __entry->delay_ms,
__entry->finalize_sent_seq)
);
TRACE_EVENT(scoutfs_get_log_merge_status,
TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key,
u64 nr_requests, u64 nr_complete, u64 seq),
@@ -2799,6 +2879,81 @@ TRACE_EVENT(scoutfs_omap_should_delete,
SCSB_TRACE_ARGS, __entry->ino, __entry->nlink, __entry->ret)
);
#define SSCF_FMT "[bo %llu bs %llu es %llu]"
#define SSCF_FIELDS(pref) \
__field(__u64, pref##_blkno) \
__field(__u64, pref##_blocks) \
__field(__u64, pref##_entries)
#define SSCF_ASSIGN(pref, sfl) \
__entry->pref##_blkno = le64_to_cpu((sfl)->ref.blkno); \
__entry->pref##_blocks = le64_to_cpu((sfl)->blocks); \
__entry->pref##_entries = le64_to_cpu((sfl)->entries);
#define SSCF_ENTRY_ARGS(pref) \
__entry->pref##_blkno, \
__entry->pref##_blocks, \
__entry->pref##_entries
DECLARE_EVENT_CLASS(scoutfs_srch_compact_class,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, id)
__field(__u8, nr)
__field(__u8, flags)
SSCF_FIELDS(out)
__field(__u64, in0_blk)
__field(__u64, in0_pos)
SSCF_FIELDS(in0)
__field(__u64, in1_blk)
__field(__u64, in1_pos)
SSCF_FIELDS(in1)
__field(__u64, in2_blk)
__field(__u64, in2_pos)
SSCF_FIELDS(in2)
__field(__u64, in3_blk)
__field(__u64, in3_pos)
SSCF_FIELDS(in3)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->id = le64_to_cpu(sc->id);
__entry->nr = sc->nr;
__entry->flags = sc->flags;
SSCF_ASSIGN(out, &sc->out)
__entry->in0_blk = le64_to_cpu(sc->in[0].blk);
__entry->in0_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in0, &sc->in[0].sfl)
__entry->in1_blk = le64_to_cpu(sc->in[1].blk);
__entry->in1_pos = le64_to_cpu(sc->in[1].pos);
SSCF_ASSIGN(in1, &sc->in[1].sfl)
__entry->in2_blk = le64_to_cpu(sc->in[2].blk);
__entry->in2_pos = le64_to_cpu(sc->in[2].pos);
SSCF_ASSIGN(in2, &sc->in[2].sfl)
__entry->in3_blk = le64_to_cpu(sc->in[3].blk);
__entry->in3_pos = le64_to_cpu(sc->in[3].pos);
SSCF_ASSIGN(in3, &sc->in[3].sfl)
),
TP_printk(SCSBF" id %llu nr %u flags 0x%x out "SSCF_FMT" in0 b %llu p %llu "SSCF_FMT" in1 b %llu p %llu "SSCF_FMT" in2 b %llu p %llu "SSCF_FMT" in3 b %llu p %llu "SSCF_FMT,
SCSB_TRACE_ARGS, __entry->id, __entry->nr, __entry->flags, SSCF_ENTRY_ARGS(out),
__entry->in0_blk, __entry->in0_pos, SSCF_ENTRY_ARGS(in0),
__entry->in1_blk, __entry->in1_pos, SSCF_ENTRY_ARGS(in1),
__entry->in2_blk, __entry->in2_pos, SSCF_ENTRY_ARGS(in2),
__entry->in3_blk, __entry->in3_pos, SSCF_ENTRY_ARGS(in3))
);
DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_send,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc)
);
DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc)
);
#endif /* _TRACE_SCOUTFS_H */
/* This part must be outside protection */

View File

@@ -148,6 +148,8 @@ struct server_info {
struct scoutfs_quorum_config qconf;
/* a running server maintains a private dirty super */
struct scoutfs_super_block dirty_super;
u64 finalize_sent_seq;
};
#define DECLARE_SERVER_INFO(sb, name) \
@@ -413,6 +415,27 @@ static void server_hold_commit(struct super_block *sb, struct commit_hold *hold)
wait_event(cusers->waitq, hold_commit(sb, server, cusers, hold));
}
/*
* Return the higher of the avail or freed used by the active commit
* since this holder joined the commit. This is *not* the amount used
* by the holder, we don't track per-holder alloc use.
*/
static u32 server_hold_alloc_used_since(struct super_block *sb, struct commit_hold *hold)
{
DECLARE_SERVER_INFO(sb, server);
u32 avail_used;
u32 freed_used;
u32 avail_now;
u32 freed_now;
scoutfs_alloc_meta_remaining(&server->alloc, &avail_now, &freed_now);
avail_used = hold->avail - avail_now;
freed_used = hold->freed - freed_now;
return max(avail_used, freed_used);
}
/*
* This is called while holding the commit and returns once the commit
* is successfully written. Many holders can all wait for all holders
@@ -938,22 +961,24 @@ static int find_log_trees_item(struct super_block *sb,
}
/*
* Find the next log_trees item from the key. Fills the caller's log_trees and sets
* the key past the returned log_trees for iteration. Returns 0 when done, > 0 for each
* item, and -errno on fatal errors.
* Find the log_trees item with the greatest nr for each rid. Fills the
* caller's log_trees and sets the key before the returned log_trees for
* the next iteration. Returns 0 when done, > 0 for each item, and
* -errno on fatal errors.
*/
static int for_each_lt(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
static int for_each_rid_last_lt(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
{
SCOUTFS_BTREE_ITEM_REF(iref);
int ret;
ret = scoutfs_btree_next(sb, root, key, &iref);
ret = scoutfs_btree_prev(sb, root, key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
memcpy(lt, iref.val, iref.val_len);
*key = *iref.key;
scoutfs_key_inc(key);
key->sklt_nr = 0;
scoutfs_key_dec(key);
ret = 1;
} else {
ret = -EIO;
@@ -1048,21 +1073,13 @@ static int next_log_merge_item(struct super_block *sb,
* abandoned log btree finalized. If it takes too long each client has
* a change to make forward progress before being asked to commit again.
*
* We're waiting on heavy state that is protected by mutexes and
* transaction machinery. It's tricky to recreate that state for
* lightweight condition tests that don't change task state. Instead of
* trying to get that right, particularly as we unwind after success or
* after timeouts, waiters use an unsatisfying poll. Short enough to
* not add terrible latency, given how heavy and infrequent this already
* is, and long enough to not melt the cpu. This could be tuned if it
* becomes a problem.
*
* This can end up finalizing a new empty log btree if a new mount
* happens to arrive at just the right time. That's fine, merging will
* ignore and tear down the empty input.
*/
#define FINALIZE_POLL_MS (11)
#define FINALIZE_TIMEOUT_MS (MSEC_PER_SEC / 2)
#define FINALIZE_POLL_MIN_DELAY_MS 5U
#define FINALIZE_POLL_MAX_DELAY_MS 100U
#define FINALIZE_POLL_DELAY_GROWTH_PCT 150U
static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_log_trees *lt,
u64 rid, struct commit_hold *hold)
{
@@ -1070,8 +1087,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_log_merge_status stat;
struct scoutfs_log_merge_range rng;
struct scoutfs_mount_options opts;
struct scoutfs_log_trees each_lt;
struct scoutfs_log_trees fin;
unsigned int delay_ms;
unsigned long timeo;
bool saw_finalized;
bool others_active;
@@ -1079,10 +1098,14 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
bool ours_visible;
struct scoutfs_key key;
char *err_str = NULL;
ktime_t start;
int ret;
int err;
timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS);
scoutfs_options_read(sb, &opts);
timeo = jiffies + msecs_to_jiffies(opts.log_merge_wait_timeout_ms);
delay_ms = FINALIZE_POLL_MIN_DELAY_MS;
start = ktime_get_raw();
for (;;) {
/* nothing to do if there's already a merge in flight */
@@ -1099,8 +1122,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
saw_finalized = false;
others_active = false;
ours_visible = false;
scoutfs_key_init_log_trees(&key, 0, 0);
while ((ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
trace_scoutfs_server_finalize_items(sb, rid, le64_to_cpu(each_lt.rid),
le64_to_cpu(each_lt.nr),
le64_to_cpu(each_lt.flags),
le64_to_cpu(each_lt.get_trans_seq));
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED))
saw_finalized = true;
@@ -1125,6 +1153,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
finalize_ours = (lt->item_root.height > 2) ||
(le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW);
trace_scoutfs_server_finalize_decision(sb, rid, saw_finalized, others_active,
ours_visible, finalize_ours, delay_ms,
server->finalize_sent_seq);
/* done if we're not finalizing and there's no finalized */
if (!finalize_ours && !saw_finalized) {
ret = 0;
@@ -1132,12 +1164,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
}
/* send sync requests soon to give time to commit */
scoutfs_key_init_log_trees(&key, 0, 0);
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while (others_active &&
(ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
(ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
(le64_to_cpu(each_lt.rid) == rid))
(le64_to_cpu(each_lt.rid) == rid) ||
(le64_to_cpu(each_lt.get_trans_seq) <= server->finalize_sent_seq))
continue;
ret = scoutfs_net_submit_request_node(sb, server->conn,
@@ -1157,6 +1190,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
break;
}
server->finalize_sent_seq = scoutfs_server_seq(sb);
/* Finalize ours if it's visible to others */
if (ours_visible) {
fin = *lt;
@@ -1194,13 +1229,16 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
if (ret < 0)
err_str = "applying commit before waiting for finalized";
msleep(FINALIZE_POLL_MS);
msleep(delay_ms);
delay_ms = min(delay_ms * FINALIZE_POLL_DELAY_GROWTH_PCT / 100,
FINALIZE_POLL_MAX_DELAY_MS);
server_hold_commit(sb, hold);
mutex_lock(&server->logs_mutex);
/* done if we timed out */
if (time_after(jiffies, timeo)) {
scoutfs_inc_counter(sb, log_merge_wait_timeout);
ret = 0;
break;
}
@@ -1783,43 +1821,29 @@ out:
* Give the caller the last seq before outstanding client commits. All
* seqs up to and including this are stable, new client transactions can
* only have greater seqs.
*
* For each rid, only its greatest log trees nr can be an open commit.
* We look at the last log_trees item for each client rid and record its
* trans seq if it hasn't been committed.
*/
static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
{
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
DECLARE_SERVER_INFO(sb, server);
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_log_trees *lt;
struct scoutfs_log_trees lt;
struct scoutfs_key key;
u64 last_seq = 0;
int ret;
last_seq = scoutfs_server_seq(sb) - 1;
scoutfs_key_init_log_trees(&key, 0, 0);
mutex_lock(&server->logs_mutex);
for (;; scoutfs_key_inc(&key)) {
ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(*lt)) {
lt = iref.val;
if ((le64_to_cpu(lt->get_trans_seq) >
le64_to_cpu(lt->commit_trans_seq)) &&
le64_to_cpu(lt->get_trans_seq) <= last_seq) {
last_seq = le64_to_cpu(lt->get_trans_seq) - 1;
}
key = *iref.key;
} else {
ret = -EIO;
}
scoutfs_btree_put_iref(&iref);
}
if (ret < 0) {
if (ret == -ENOENT) {
ret = 0;
break;
}
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
if ((le64_to_cpu(lt.get_trans_seq) > le64_to_cpu(lt.commit_trans_seq)) &&
le64_to_cpu(lt.get_trans_seq) <= last_seq) {
last_seq = le64_to_cpu(lt.get_trans_seq) - 1;
}
}
@@ -2471,9 +2495,11 @@ static void server_log_merge_free_work(struct work_struct *work)
while (!server_is_stopping(server)) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
commit = true;
if (!commit) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
commit = true;
}
ret = next_log_merge_item(sb, &super->log_merge,
SCOUTFS_LOG_MERGE_FREEING_ZONE,
@@ -2520,12 +2546,14 @@ static void server_log_merge_free_work(struct work_struct *work)
/* freed blocks are in allocator, we *have* to update fr */
BUG_ON(ret < 0);
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
commit = false;
if (ret < 0) {
err_str = "looping commit del/upd freeing item";
break;
if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
commit = false;
if (ret < 0) {
err_str = "looping commit del/upd freeing item";
break;
}
}
}
@@ -4298,6 +4326,7 @@ static void scoutfs_server_worker(struct work_struct *work)
scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));
scoutfs_block_writer_init(sb, &server->wri);
server->finalize_sent_seq = 0;
/* first make sure no other servers are still running */
ret = scoutfs_quorum_fence_leaders(sb, &server->qconf, server->term);

View File

@@ -31,6 +31,8 @@
#include "counters.h"
#include "scoutfs_trace.h"
#include "triggers.h"
#include "sysfs.h"
#include "msg.h"
/*
* This srch subsystem gives us a way to find inodes that have a given
@@ -69,10 +71,14 @@ struct srch_info {
atomic_t shutdown;
struct workqueue_struct *workq;
struct delayed_work compact_dwork;
struct scoutfs_sysfs_attrs ssa;
atomic_t compact_delay_ms;
};
#define DECLARE_SRCH_INFO(sb, name) \
struct srch_info *name = SCOUTFS_SB(sb)->srch_info
#define DECLARE_SRCH_INFO_KOBJ(kobj, name) \
DECLARE_SRCH_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)
#define SRE_FMT "%016llx.%llu.%llu"
#define SRE_ARG(sre) \
@@ -2208,8 +2214,15 @@ static int delete_files(struct super_block *sb, struct scoutfs_alloc *alloc,
return ret;
}
/* wait 10s between compact attempts on error, immediate after success */
#define SRCH_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
static void queue_compact_work(struct srch_info *srinf, bool immediate)
{
unsigned long delay;
if (!atomic_read(&srinf->shutdown)) {
delay = immediate ? 0 : msecs_to_jiffies(atomic_read(&srinf->compact_delay_ms));
queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
}
}
/*
* Get a compaction operation from the server, sort the entries from the
@@ -2237,7 +2250,6 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
struct super_block *sb = srinf->sb;
struct scoutfs_block_writer wri;
struct scoutfs_alloc alloc;
unsigned long delay;
int ret;
int err;
@@ -2250,6 +2262,8 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
scoutfs_block_writer_init(sb, &wri);
ret = scoutfs_client_srch_get_compact(sb, sc);
if (ret >= 0)
trace_scoutfs_srch_compact_client_recv(sb, sc);
if (ret < 0 || sc->nr == 0)
goto out;
@@ -2278,6 +2292,7 @@ commit:
sc->meta_freed = alloc.freed;
sc->flags |= ret < 0 ? SCOUTFS_SRCH_COMPACT_FLAG_ERROR : 0;
trace_scoutfs_srch_compact_client_send(sb, sc);
err = scoutfs_client_srch_commit_compact(sb, sc);
if (err < 0 && ret == 0)
ret = err;
@@ -2288,14 +2303,56 @@ out:
scoutfs_inc_counter(sb, srch_compact_error);
scoutfs_block_writer_forget_all(sb, &wri);
if (!atomic_read(&srinf->shutdown)) {
delay = (sc->nr > 0 && ret == 0) ? 0 : msecs_to_jiffies(SRCH_COMPACT_DELAY_MS);
queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
}
queue_compact_work(srinf, sc->nr > 0 && ret == 0);
kfree(sc);
}
static ssize_t compact_delay_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
DECLARE_SRCH_INFO_KOBJ(kobj, srinf);
return snprintf(buf, PAGE_SIZE, "%u", atomic_read(&srinf->compact_delay_ms));
}
#define MIN_COMPACT_DELAY_MS MSEC_PER_SEC
#define DEF_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
#define MAX_COMPACT_DELAY_MS (60 * MSEC_PER_SEC)
static ssize_t compact_delay_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
DECLARE_SRCH_INFO(sb, srinf);
char nullterm[30]; /* more than enough for octal -U64_MAX */
u64 val;
int len;
int ret;
len = min(count, sizeof(nullterm) - 1);
memcpy(nullterm, buf, len);
nullterm[len] = '\0';
ret = kstrtoll(nullterm, 0, &val);
if (ret < 0 || val < MIN_COMPACT_DELAY_MS || val > MAX_COMPACT_DELAY_MS) {
scoutfs_err(sb, "invalid compact_delay_ms value, must be between %lu and %lu",
MIN_COMPACT_DELAY_MS, MAX_COMPACT_DELAY_MS);
return -EINVAL;
}
atomic_set(&srinf->compact_delay_ms, val);
cancel_delayed_work(&srinf->compact_dwork);
queue_compact_work(srinf, false);
return count;
}
SCOUTFS_ATTR_RW(compact_delay_ms);
static struct attribute *srch_attrs[] = {
SCOUTFS_ATTR_PTR(compact_delay_ms),
NULL,
};
void scoutfs_srch_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -2312,6 +2369,8 @@ void scoutfs_srch_destroy(struct super_block *sb)
destroy_workqueue(srinf->workq);
}
scoutfs_sysfs_destroy_attrs(sb, &srinf->ssa);
kfree(srinf);
sbi->srch_info = NULL;
}
@@ -2329,8 +2388,15 @@ int scoutfs_srch_setup(struct super_block *sb)
srinf->sb = sb;
atomic_set(&srinf->shutdown, 0);
INIT_DELAYED_WORK(&srinf->compact_dwork, scoutfs_srch_compact_worker);
scoutfs_sysfs_init_attrs(sb, &srinf->ssa);
atomic_set(&srinf->compact_delay_ms, DEF_COMPACT_DELAY_MS);
sbi->srch_info = srinf;
ret = scoutfs_sysfs_create_attrs(sb, &srinf->ssa, srch_attrs, "srch");
if (ret < 0)
goto out;
srinf->workq = alloc_workqueue("scoutfs_srch_compact",
WQ_NON_REENTRANT | WQ_UNBOUND |
WQ_HIGHPRI, 0);
@@ -2339,8 +2405,7 @@ int scoutfs_srch_setup(struct super_block *sb)
goto out;
}
queue_delayed_work(srinf->workq, &srinf->compact_dwork,
msecs_to_jiffies(SRCH_COMPACT_DELAY_MS));
queue_compact_work(srinf, false);
ret = 0;
out:
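The srch.c hunks above expose the compaction retry delay as a per-mount sysfs attribute, compact_delay_ms, registered under a "srch" directory and clamped to the 1000..60000 ms range with a 10000 ms default. A minimal sketch of tuning it, reusing the t_sysfs_path helper that the test hunks further below already use (the helper and the mount number 0 are assumptions outside this file):

    # read the current per-mount delay, then shorten it to 1s for a test run
    cat "$(t_sysfs_path 0)/srch/compact_delay_ms"
    echo 1000 > "$(t_sysfs_path 0)/srch/compact_delay_ms"   # out-of-range values return EINVAL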


@@ -25,8 +25,9 @@ All options can be seen by running with -h.
This script is built to test multi-node systems on one host by using
different mounts of the same devices. The script creates a fake block
device in front of each fs block device for each mount that will be
tested. Currently it will create free loop devices and will mount on
/mnt/test.[0-9].
tested. It will create predictable device mapper devices and mount
them on /mnt/test.N. These static device names and mount paths limit
the script to a single execution per host.
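A minimal sketch of the device setup this describes, assuming a dmsetup linear target in front of an example backing device (the device path is illustrative; the harness hunks further down drive this automatically):

    # map the whole backing device through a predictably named dm device
    table="0 $(blockdev --getsz /dev/vda) linear /dev/vda 0"
    dmsetup create _scoutfs_test_meta_0 --table "$table"
    ls -l /dev/mapper/_scoutfs_test_meta_0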
All tests will be run by default. Particular tests can be included or
excluded by providing test name regular expressions with the -I and -E
@@ -104,8 +105,8 @@ used during the test.
| Variable | Description | Origin | Example |
| ---------------- | ------------------- | --------------- | ----------------- |
| T\_MB[0-9] | per-mount meta bdev | created per run | /dev/loop0 |
| T\_DB[0-9] | per-mount data bdev | created per run | /dev/loop1 |
| T\_MB[0-9] | per-mount meta bdev | created per run | /dev/mapper/\_scoutfs\_test\_meta\_[0-9] |
| T\_DB[0-9] | per-mount data bdev | created per run | /dev/mapper/\_scoutfs\_test\_data\_[0-9] |
| T\_D[0-9] | per-mount test dir | made for test | /mnt/test.[0-9]/t |
| T\_META\_DEVICE | main FS meta bdev | -M | /dev/vda |
| T\_DATA\_DEVICE | main FS data bdev | -D | /dev/vdb |


@@ -6,6 +6,61 @@ t_filter_fs()
-e 's@Device: [a-fA-F0-9]*h/[0-9]*d@Device: 0h/0d@g'
}
#
# We can hit a spurious kasan warning that was fixed upstream:
#
# e504e74cc3a2 x86/unwind/orc: Disable KASAN checking in the ORC unwinder, part 2
#
# KASAN can get mad when the unwinder doesn't find ORC metadata and
# wanders up without using frames and hits the KASAN stack red zones.
# We can ignore these messages.
#
# They're bracketed by:
# [ 2687.690127] ==================================================================
# [ 2687.691366] BUG: KASAN: stack-out-of-bounds in get_reg+0x1bc/0x230
# ...
# [ 2687.706220] ==================================================================
# [ 2687.707284] Disabling lock debugging due to kernel taint
#
# That final lock debugging message may not be included.
#
ignore_harmless_unwind_kasan_stack_oob()
{
awk '
BEGIN {
in_soob = 0
soob_nr = 0
}
( !in_soob && $0 ~ /==================================================================/ ) {
in_soob = 1
soob_nr = NR
saved = $0
}
( in_soob == 1 && NR == (soob_nr + 1) ) {
if (match($0, /KASAN: stack-out-of-bounds in get_reg/) != 0) {
in_soob = 2
} else {
in_soob = 0
print saved
}
saved=""
}
( in_soob == 2 && $0 ~ /==================================================================/ ) {
in_soob = 3
soob_nr = NR
}
( in_soob == 3 && NR > soob_nr && $0 !~ /Disabling lock debugging/ ) {
in_soob = 0
}
( !in_soob ) { print $0 }
END {
if (saved) {
print saved
}
}
'
}
#
# Filter out expected messages. Putting messages here implies that
# tests aren't relying on messages to discover failures... they're
@@ -86,10 +141,12 @@ t_filter_dmesg()
re="$re|scoutfs .* critical transaction commit failure.*"
# change-devices causes loop device resizing
re="$re|loop: module loaded"
re="$re|loop[0-9].* detected capacity change from.*"
# ignore systemd-journal rotating
re="$re|systemd-journald.*"
egrep -v "($re)"
egrep -v "($re)" | \
ignore_harmless_unwind_kasan_stack_oob
}


@@ -1,3 +1,4 @@
== measure initial createmany
== measure initial createmany
== measure two concurrent createmany runs
== cleanup


@@ -1,3 +1,4 @@
== setting longer hung task timeout
== creating fragmented extents
== unlink file with moved extents to free extents per block
== cleanup


@@ -1,4 +1,4 @@
== snapshot errors
== initialize per-mount values
== arm compaction triggers
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
@@ -10,72 +10,28 @@ trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
== force lots of small rotated log files for compaction
== compact more often
== create padded sorted inputs by forcing log rotation
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
== wait for compaction
== test and disarm compaction triggers
== verify triggers and errors
trigger srch_compact_logs_pad_safe armed: 1
== compaction of padded should stop at safe
== verify no compaction errors
== cleanup


@@ -326,16 +326,10 @@ unmount_all() {
cmd wait $p
done
# delete all temp meta devices
for dev in $(losetup --associated "$T_META_DEVICE" | cut -d : -f 1); do
if [ -e "$dev" ]; then
cmd losetup -d "$dev"
fi
done
# delete all temp data devices
for dev in $(losetup --associated "$T_DATA_DEVICE" | cut -d : -f 1); do
if [ -e "$dev" ]; then
cmd losetup -d "$dev"
# delete all temp devices
for dev in /dev/mapper/_scoutfs_test_*; do
if [ -b "$dev" ]; then
cmd dmsetup remove $dev
fi
done
}
@@ -434,6 +428,12 @@ $T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 &
fenced_pid=$!
fenced_log "started fenced pid $fenced_pid in the background"
# setup dm tables
echo "0 $(blockdev --getsz $T_META_DEVICE) linear $T_META_DEVICE 0" > \
$T_RESULTS/dmtable.meta
echo "0 $(blockdev --getsz $T_DATA_DEVICE) linear $T_DATA_DEVICE 0" > \
$T_RESULTS/dmtable.data
#
# mount concurrently so that a quorum is present to elect the leader and
# start a server.
@@ -442,10 +442,13 @@ msg "mounting $T_NR_MOUNTS mounts on meta $T_META_DEVICE data $T_DATA_DEVICE"
pids=""
for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
meta_dev=$(losetup --find --show $T_META_DEVICE)
test -b "$meta_dev" || die "failed to create temp device $meta_dev"
data_dev=$(losetup --find --show $T_DATA_DEVICE)
test -b "$data_dev" || die "failed to create temp device $data_dev"
name="_scoutfs_test_meta_$i"
cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.meta)"
meta_dev="/dev/mapper/$name"
name="_scoutfs_test_data_$i"
cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.data)"
data_dev="/dev/mapper/$name"
dir="/mnt/test.$i"
test -d "$dir" || cmd mkdir -p "$dir"


@@ -1,6 +1,7 @@
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
#include <sys/stat.h>
@@ -35,10 +36,10 @@ struct opts {
unsigned int dry_run:1,
ls_output:1,
quiet:1,
user_xattr:1,
same_srch_xattr:1,
group_srch_xattr:1,
unique_srch_xattr:1;
xattr_set:1,
xattr_file:1,
xattr_group:1;
char *xattr_name;
};
struct stats {
@@ -149,12 +150,31 @@ static void free_dir(struct dir *dir)
free(dir);
}
static size_t snprintf_off(void *buf, size_t sz, size_t off, char *fmt, ...)
{
va_list ap;
int ret;
if (off >= sz)
return sz;
va_start(ap, fmt);
ret = vsnprintf(buf + off, sz - off, fmt, ap);
va_end(ap);
if (ret <= 0)
return sz;
return off + ret;
}
static void create_dir(struct dir *dir, struct opts *opts,
struct stats *stats)
{
struct str_list *s;
char name[100];
char name[256]; /* max len and null term */
char val = 'v';
size_t off;
int rc;
int i;
@@ -175,29 +195,21 @@ static void create_dir(struct dir *dir, struct opts *opts,
rc = mknod(s->str, S_IFREG | 0644, 0);
error_exit(rc, "mknod %s failed"ERRF, s->str, ERRA);
rc = 0;
if (rc == 0 && opts->user_xattr) {
strcpy(name, "user.scoutfs_bcp");
rc = setxattr(s->str, name, &val, 1, 0);
}
if (rc == 0 && opts->same_srch_xattr) {
strcpy(name, "scoutfs.srch.scoutfs_bcp");
rc = setxattr(s->str, name, &val, 1, 0);
}
if (rc == 0 && opts->group_srch_xattr) {
snprintf(name, sizeof(name),
"scoutfs.srch.scoutfs_bcp.group.%lu",
stats->files / 10000);
rc = setxattr(s->str, name, &val, 1, 0);
}
if (rc == 0 && opts->unique_srch_xattr) {
snprintf(name, sizeof(name),
"scoutfs.srch.scoutfs_bcp.unique.%lu",
stats->files);
if (opts->xattr_set) {
off = snprintf_off(name, sizeof(name), 0, "%s", opts->xattr_name);
if (opts->xattr_file)
off = snprintf_off(name, sizeof(name), off,
"-f-%lu", stats->files);
if (opts->xattr_group)
off = snprintf_off(name, sizeof(name), off,
"-g-%lu", stats->files / 10000);
error_exit(off >= sizeof(name), "xattr name longer than 255 bytes");
rc = setxattr(s->str, name, &val, 1, 0);
error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
}
error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
stats->files++;
rate_banner(opts, stats);
@@ -365,11 +377,10 @@ static void usage(void)
" -d DIR | create all files in DIR top level directory\n"
" -n | dry run, only parse, don't create any files\n"
" -q | quiet, don't regularly print rates\n"
" -F | append \"-f-NR\" file nr to xattr name, requires -X\n"
" -G | append \"-g-NR\" file nr/10000 to xattr name, requires -X\n"
" -L | parse ls output; only reg, skip meta, paths at ./\n"
" -X | set the same user. xattr name in all files\n"
" -S | set the same .srch. xattr name in all files\n"
" -G | set a .srch. xattr name shared by groups of files\n"
" -U | set a unique .srch. xattr name in all files\n");
" -X NAM | set named xattr in all files\n");
}
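As a hedged illustration of the reworked options (the directory and xattr name here are hypothetical), file names are read from standard input and the single -X name can be extended per file (-F) or per group of 10000 files (-G):

    # tag every created file with the named srch xattr plus a per-group suffix
    seq -f "f-%.20g" 1 1000 | \
        ./bulk_create_paths -d /mnt/test.0/dir -X scoutfs.srch.example -G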
int main(int argc, char **argv)
@@ -386,7 +397,7 @@ int main(int argc, char **argv)
memset(&opts, 0, sizeof(opts));
while ((c = getopt(argc, argv, "d:nqLXSGU")) != -1) {
while ((c = getopt(argc, argv, "d:nqFGLX:")) != -1) {
switch(c) {
case 'd':
top_dir = strdup(optarg);
@@ -397,20 +408,19 @@ int main(int argc, char **argv)
case 'q':
opts.quiet = 1;
break;
case 'F':
opts.xattr_file = 1;
break;
case 'G':
opts.xattr_group = 1;
break;
case 'L':
opts.ls_output = 1;
break;
case 'X':
opts.user_xattr = 1;
break;
case 'S':
opts.same_srch_xattr = 1;
break;
case 'G':
opts.group_srch_xattr = 1;
break;
case 'U':
opts.unique_srch_xattr = 1;
opts.xattr_set = 1;
opts.xattr_name = strdup(optarg);
error_exit(!opts.xattr_name, "error allocating xattr name");
break;
case '?':
printf("Unknown option '%c'\n", optopt);
@@ -419,6 +429,11 @@ int main(int argc, char **argv)
}
}
error_exit(opts.xattr_file && !opts.xattr_set,
"must specify xattr -X when appending file nr with -F");
error_exit(opts.xattr_group && !opts.xattr_set,
"must specify xattr -X when appending file nr with -G");
if (!opts.dry_run) {
error_exit(!top_dir,
"must specify top level directory with -d");


@@ -7,9 +7,11 @@ t_require_mounts 2
COUNT=50000
# Prep dirs for test. Each mount needs to make its own parent dir for
# the createmany run, otherwise both dirs will end up in the same inode
# group, causing updates to bounce that lock around.
#
# Prep dirs for test. We have per-directory inode number allocators so
# by putting each createmany in a per-mount dir they get their own inode
# number region and cluster locks.
#
echo "== measure initial createmany"
mkdir -p $T_D0/dir/0
mkdir $T_D1/dir/1
@@ -17,18 +19,20 @@ mkdir $T_D1/dir/1
echo "== measure initial createmany"
START=$SECONDS
createmany -o "$T_D0/file_" $COUNT >> $T_TMP.full
sync
SINGLE=$((SECONDS - START))
echo single $SINGLE >> $T_TMP.full
echo "== measure two concurrent createmany runs"
START=$SECONDS
createmany -o $T_D0/dir/0/file $COUNT > /dev/null &
(cd $T_D0/dir/0; createmany -o ./file_ $COUNT > /dev/null) &
pids="$!"
createmany -o $T_D1/dir/1/file $COUNT > /dev/null &
(cd $T_D1/dir/1; createmany -o ./file_ $COUNT > /dev/null) &
pids="$pids $!"
for p in $pids; do
wait $p
done
sync
BOTH=$((SECONDS - START))
echo both $BOTH >> $T_TMP.full
@@ -41,7 +45,10 @@ echo both $BOTH >> $T_TMP.full
# synchronized operation.
FACTOR=200
if [ "$BOTH" -gt $(($SINGLE*$FACTOR)) ]; then
echo "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
t_fail "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
fi
echo "== cleanup"
find $T_D0/dir -delete
t_pass


@@ -10,6 +10,30 @@ EXTENTS_PER_BTREE_BLOCK=600
EXTENTS_PER_LIST_BLOCK=8192
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
#
# This test specifically creates a pathologically sparse file that will
# be as expensive as possible to free. This is usually fine on
# dedicated or reasonable hardware, but trying to run this in
# virtualized debug kernels can take a very long time. This test is
# about making sure that the server doesn't fail, not that the platform
# can handle the scale of work that our btree formats happen to require
# while execution is bogged down with use-after-free memory reference
# tracking. So we give the test a lot more breathing room before
# deciding that it's hung.
#
echo "== setting longer hung task timeout"
if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
test "$secs" -gt 0 || \
t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
restore_hung_task_timeout()
{
echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
}
trap restore_hung_task_timeout EXIT
echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
fi
echo "== creating fragmented extents"
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"


@@ -9,6 +9,7 @@ LOG=340000
LIM=1000000
SEQF="%.20g"
SXA="scoutfs.srch.test-srch-basic-functionality"
t_require_commands touch rm setfattr scoutfs find_xattrs
@@ -27,20 +28,20 @@ diff_srch_find()
echo "== create new xattrs"
touch "$T_D0/"{create,update}
setfattr -n scoutfs.srch.test -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
diff_srch_find scoutfs.srch.test
setfattr -n $SXA -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
diff_srch_find $SXA
echo "== update existing xattr"
setfattr -n scoutfs.srch.test -v 2 "$T_D0/update" 2>&1 | t_filter_fs
diff_srch_find scoutfs.srch.test
setfattr -n $SXA -v 2 "$T_D0/update" 2>&1 | t_filter_fs
diff_srch_find $SXA
echo "== remove an xattr"
setfattr -x scoutfs.srch.test "$T_D0/create" 2>&1 | t_filter_fs
diff_srch_find scoutfs.srch.test
setfattr -x $SXA "$T_D0/create" 2>&1 | t_filter_fs
diff_srch_find $SXA
echo "== remove xattr with files"
rm -f "$T_D0/"{create,update}
diff_srch_find scoutfs.srch.test
diff_srch_find $SXA
echo "== trigger small log merges by rotating single block with unmount"
sv=$(t_server_nr)
@@ -56,7 +57,7 @@ while [ "$i" -lt "8" ]; do
eval path="\$T_D${nr}/single-block-$i"
touch "$path"
setfattr -n scoutfs.srch.single-block-logs -v $i "$path"
setfattr -n $SXA -v $i "$path"
t_umount $nr
t_mount $nr
@@ -65,51 +66,51 @@ while [ "$i" -lt "8" ]; do
done
# wait for srch compaction worker delay
sleep 10
rm -rf "$T_D0/single-block-*"
find "$T_D0" -type f -name 'single-block-*' -delete
echo "== create entries in current log"
DIR="$T_D0/dir"
NR=$((LOG / 4))
mkdir -p "$DIR"
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
diff_srch_find scoutfs.srch.scoutfs_bcp
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
diff_srch_find $SXA
echo "== delete small fraction"
seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
diff_srch_find scoutfs.srch.scoutfs_bcp
seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x $SXA
diff_srch_find $SXA
echo "== remove files"
rm -rf "$DIR"
diff_srch_find scoutfs.srch.scoutfs_bcp
diff_srch_find $SXA
echo "== create entries that exceed one log"
NR=$((LOG * 3 / 2))
mkdir -p "$DIR"
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
diff_srch_find scoutfs.srch.scoutfs_bcp
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
diff_srch_find $SXA
echo "== delete fractions in phases"
for i in $(seq 1 3); do
seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
diff_srch_find scoutfs.srch.scoutfs_bcp
seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x $SXA
diff_srch_find $SXA
done
echo "== remove files"
rm -rf "$DIR"
diff_srch_find scoutfs.srch.scoutfs_bcp
diff_srch_find $SXA
echo "== create entries for exceed search entry limit"
NR=$((LIM * 3 / 2))
mkdir -p "$DIR"
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
diff_srch_find scoutfs.srch.scoutfs_bcp
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
diff_srch_find $SXA
echo "== delete half"
seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
diff_srch_find scoutfs.srch.scoutfs_bcp
seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x $SXA
diff_srch_find $SXA
echo "== entirely remove third batch"
rm -rf "$DIR"
diff_srch_find scoutfs.srch.scoutfs_bcp
diff_srch_find $SXA
t_pass


@@ -4,10 +4,9 @@
# block. Resuming from that position would return an error and
# compaction would stop making forward progress.
#
# We use triggers to make sure that we create the circumstance where a
# sorted srch block ends at the _SAFE_BYTES offset and that a merge
# request stops with a partial block at that specific offset. We then
# watch error counters to make sure compaction doesn't get stuck.
# We use triggers to pad the output of log compaction to end on the safe
# offset and then cause compaction of those padded inputs to stop at the
# safe offset. Continuation will either succeed or return errors.
#
# forcing rotation, so just a few
@@ -15,11 +14,20 @@ NR=10
SEQF="%.20g"
COMPACT_NR=4
echo "== snapshot errors"
echo "== initialize per-mount values"
declare -a err
declare -a compact_delay
for nr in $(t_fs_nrs); do
err[$nr]=$(t_counter srch_compact_error $nr)
compact_delay[$nr]=$(cat $(t_sysfs_path $nr)/srch/compact_delay_ms)
done
restore_compact_delay()
{
for nr in $(t_fs_nrs); do
echo ${compact_delay[$nr]} > $(t_sysfs_path $nr)/srch/compact_delay_ms
done
}
trap restore_compact_delay EXIT
echo "== arm compaction triggers"
for nr in $(t_fs_nrs); do
@@ -27,37 +35,50 @@ for nr in $(t_fs_nrs); do
t_trigger_arm srch_merge_stop_safe $nr
done
echo "== force lots of small rotated log files for compaction"
sv=$(t_server_nr)
iter=1
while [ $iter -le $((COMPACT_NR * COMPACT_NR * COMPACT_NR)) ]; do
t_trigger_arm srch_force_log_rotate $sv
seq -f "f-$iter-$SEQF" 1 10 | src/bulk_create_paths -S -d "$T_D0" > /dev/null
sync
test "$(t_trigger_get srch_force_log_rotate $sv)" == "0" || \
t_fail "srch_force_log_rotate didn't trigger"
((iter++))
done
echo "== wait for compaction"
sleep 15
echo "== test and disarm compaction triggers"
pad=0
merge_stop=0
echo "== compact more often"
for nr in $(t_fs_nrs); do
test "$(t_trigger_get srch_compact_logs_pad_safe $nr)" == "0" && pad=1
t_trigger_set srch_compact_logs_pad_safe $nr 0
test "$(t_trigger_get srch_merge_stop_safe $nr)" == "0" && merge_stop=1
t_trigger_set srch_merge_stop_safe $nr 0
echo 1000 > $(t_sysfs_path $nr)/srch/compact_delay_ms
done
echo "== verify triggers and errors"
test $pad == 1 || t_fail "srch_compact_logs_pad_safe didn't trigger"
test $merge_stop == 1 || t_fail "srch_merge_stop_safe didn't trigger"
echo "== create padded sorted inputs by forcing log rotation"
sv=$(t_server_nr)
for i in $(seq 1 $COMPACT_NR); do
for j in $(seq 1 $COMPACT_NR); do
t_trigger_arm srch_force_log_rotate $sv
seq -f "f-$i-$j-$SEQF" 1 10 | \
bulk_create_paths -X "scoutfs.srch.t-srch-safe-merge-pos" -d "$T_D0" > \
/dev/null
sync
test "$(t_trigger_get srch_force_log_rotate $sv)" == "0" || \
t_fail "srch_force_log_rotate didn't trigger"
done
padded=0
while test $padded == 0 && sleep .5; do
for nr in $(t_fs_nrs); do
if [ "$(t_trigger_get srch_compact_logs_pad_safe $nr)" == "0" ]; then
t_trigger_arm srch_compact_logs_pad_safe $nr
padded=1
break
fi
test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
t_fail "srch_compact_error counter increased on mount $nr"
done
done
done
echo "== compaction of padded should stop at safe"
sleep 2
for nr in $(t_fs_nrs); do
if [ "$(t_trigger_get srch_merge_stop_safe $nr)" == "0" ]; then
break
fi
done
echo "== verify no compaction errors"
sleep 2
for nr in $(t_fs_nrs); do
test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
t_fail "srch_compact_error counter increased on mount $nr"


@@ -55,6 +55,19 @@ with initial sparse regions (perhaps by multiple threads writing to
different regions) and wasted space isn't an issue (perhaps because the
file population contains few small files).
.TP
.B log_merge_wait_timeout_ms=<number>
This option sets the amount of time, in milliseconds, that log merge
creation can wait before timing out. This setting is per-mount, only
changes the behavior of that mount, and only affects the server when it
is running in that mount.
.sp
This determines how long mounts may take to synchronize committing
their log trees to create a log merge operation. Setting it too high
can create long latencies when a mount takes a long time to commit its
log. Setting it too low can result in the creation of an excessive
number of log trees that are never merged. The default is 500, and the
value cannot be less than 100 or greater than 60000.
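.sp
As an illustrative example only (the device paths are placeholders), a
mount could raise the timeout to one second:
.sp
.nf
mount -t scoutfs -o metadev_path=/dev/vda,log_merge_wait_timeout_ms=1000 /dev/vdb /mnt/scoutfs
.fi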
.TP
.B metadev_path=<device>
The metadev_path option specifies the path to the block device that
contains the filesystem's metadata.