Compare commits


46 Commits

Author SHA1 Message Date
Ben McClelland
bcf559818b Add rpm spec file support for el8 builds
The rpmbuild support files no longer define the previously used kernel
module macros. This carves out the differences between el7 and el8 with
conditionals based on the distro we are building for.

Signed-off-by: Ben McClelland <ben.mcclelland@versity.com>
2023-08-25 15:40:05 -07:00
Auke Kok
36ee4d946b Ignore last flag output by filefrag.
New versions of filefrag will output the presence of the `last`
flag as well, but we don't care.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
dc57b34b8d Don't use static struct initializer.
In rhel7 this is a nested struct with ktime_t. However, in rhel8
ktime_t is a simple s64, and not a union, and thus we can't do
this as easily. Just memset it.
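
A minimal sketch of the shape of the change (the struct and field
names here are hypothetical):

    /* rhel7: ktime_t is a union, so a designated initializer works:
     *     struct foo zero = { .when = { .tv64 = 0 } };
     * rhel8: ktime_t is a plain s64, so just zero the whole struct: */
    struct foo zero;
    memset(&zero, 0, sizeof(zero));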

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
779b96df81 Allow the kernel to return -ESTALE from orphan-inode test
In newer kernels, we always get -ESTALE because the inode has been
marked immediately as deleting. Since this is expected behavior we
should not fail the test here on this error value.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
1ee0331b8b Skip userns based testing for RHEL8.
In RHEL7, this was skipped automatically. In RHEL8, we don't support
passing the actual user namespace through into our ACL set/get
handlers. Once we get to around v5.11 or so, the handlers are
automatically passed the namespace. Until then, skip this test.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
42acf01dce Use .prefix for POSIX acl instead of .name.
New kernels expect to do a partial match when a .prefix is used here,
and provide a .name member in case matching should look at the whole
string. This is what we want.
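
A sketch of what this looks like in a handler (the handler and
callback names are illustrative):

    static const struct xattr_handler scoutfs_xattr_handler = {
        .prefix = "scoutfs.",        /* partial (prefix) match */
        .get    = scoutfs_xattr_get, /* called with the name remainder */
        .set    = scoutfs_xattr_set,
    };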

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
ca4d463c75 Don't cache ACL's in newer kernels.
The caller takes care of caching for us. Doing our own caching
messes with the memory management of cached ACLs and breaks it.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
e7dadd09ae New versions of getfattr will quote empty attr values.
Instead of messing with quotes and using grep for the correct
xattr name, directly query the value of the xattr being tested
only, and compare that to the input.

Side effect is that this is significantly simpler and faster.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
954843d2ab Account for coreutils using statx() call instead of stat()
`stat` internally switched to using the new `statx` syscall, which
changes the subsequent perror() output. The underlying error is the
same as before (and expected).

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
5968265aad Account for e2fsprogs output format changes.
The filefrag program in e2fsprogs-v1.42.10-10-g29758d2f now includes
an extra flag, and changes how the `unknown` flag is output.

We essentially adjust for this "new" golden value on the fly if we
encounter it. We don't expect future changes to the output.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
f30c095e8e Account for quoting style changes in coreutils.
In older versions of coreutils, quoted strings are occasionally
output using utf-8 open/close single quotes.

New versions of coreutils will exclusively use the ASCII single quote
character "'" when the output is not a TTY - as is the case with
all test scripts.

However, we can avoid most of these problems by always setting
LC_ALL=C in testing.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
1578ded917 Ignore loop device resizing messages.
These occasionally trigger during tests.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
2c21f88f24 Support .read/write_iter callbacks in lieu of .aio_read/write
The aio_read and aio_write callbacks are no longer used by newer
kernels, which now use iter based readers and writers.

We can avoid implementing plain .read and .write, as an iter will be
generated for us automatically when needed.

We add a new data_wait_check_iter() function accordingly.

With these methods removed from the kernel, the el8 kernel no
longer uses the extended ops wrapper struct and is much closer now
to upstream. As a result, a lot of methods move between
inode_dir_operations and inode_file_operations etc., and perhaps
things will look a bit more structured.

Accordingly, we need a slightly different data_wait_check() that
accounts for the iter and offset properly.
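
A sketch of the resulting file_operations shape (the function names
are illustrative):

    static ssize_t scoutfs_file_read_iter(struct kiocb *iocb,
                                          struct iov_iter *to);
    static ssize_t scoutfs_file_write_iter(struct kiocb *iocb,
                                           struct iov_iter *from);

    static const struct file_operations scoutfs_file_fops = {
        .read_iter  = scoutfs_file_read_iter,
        .write_iter = scoutfs_file_write_iter,
        /* no .read/.write: the vfs falls back to the iter variants */
    };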

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
57356b57aa Implement .readahead for address_space_operations (aops).
.readpages is obsolete in el8 kernels. We implement the .readahead
method instead which is passed a struct readahead_control. We use
the readahead_page(rac) accessor to retrieve page by page from the
struct.
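
A minimal sketch of the new method (the per-page read helper is
hypothetical):

    static void scoutfs_readahead(struct readahead_control *rac)
    {
        struct page *page;

        /* readahead_page() hands us one locked, referenced page at
         * a time until the readahead window is consumed */
        while ((page = readahead_page(rac)))
            scoutfs_read_page(page);
    }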

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
b81e3bf421 implement generic_file_buffered_write()
This function is removed in el8, so we need to implement it
ourselves now. Copy it.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
5d1742a954 (un)register_hotcpu_notifier is obsolete
v4.9-12228-g530e9b76ae8f drops all (un)register_(hot)cpu_notifier()
API functions. From here on we need to use the new cpuhp_* API.

We avoid this entirely for now, at the cost of leaking pages until
the filesystem is unmounted.
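
For reference, a sketch of what the cpuhp_* replacement would look
like (the state name and callback are illustrative):

    static int scoutfs_cpu_dead(unsigned int cpu)
    {
        /* drain this cpu's cached pages */
        return 0;
    }

    static int setup_cpu_notifier(void)
    {
        /* dynamic state; returns the allocated state (> 0) or -errno */
        return cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN,
                                         "scoutfs:page_pool",
                                         NULL, scoutfs_cpu_dead);
    }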

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
22bd4c4493 Timespec64 changes for yr2038.
Provide a fallback `current_time(inode)` implementation for older
kernels.
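
A minimal fallback sketch, assuming a guard macro defined by our
compat detection (the guard name is hypothetical):

    #ifndef KC_HAVE_CURRENT_TIME
    static inline struct timespec current_time(struct inode *inode)
    {
        /* older kernels expose the same value via the superblock */
        return current_fs_time(inode->i_sb);
    }
    #endif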

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
371bff49af Adjust scoutfs_quorum_loop trace point.
Convert the timeout struct into a u64 nsecs value before passing it to
the trace point event, so as not to overflow the 64-bit limitation on args.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
3d43fdfeaa Initialize msg.msg_iter from iovec.
Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
6563f70a90 Handle net arg being added to sock_create_kern()
Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
a14da52cbb kernel_getsockname and kernel_getpeername dropped addrlen arg.
v4.16-rc1-1-g9b2c45d479d0

This interface now returns sizeof(addr) on success, instead of 0.
Therefore, we have to change the error condition detection.

The compat for older kernels handles the addrlen check internally.
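
A sketch of the new error check (the wrapper name is illustrative):

    static int local_addr(struct socket *sock, struct sockaddr_in *sin)
    {
        int ret;

        /* now returns the address length (> 0) on success */
        ret = kernel_getsockname(sock, (struct sockaddr *)sin);
        return ret < 0 ? ret : 0;    /* old code tested ret != 0 */
    }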

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
f367e485a6 xattr functions are now passed flags through struct xattr_handler
Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
8a7bc0cdfa Remove the use of the backing_dev_info pointer from address_space.
Instead, use the new inline inode_to_bdi from <backing-dev.h> to fill
in the task's backing_dev_info.
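
The shape of the change, as a sketch:

    #include <linux/backing-dev.h>

    /* was: current->backing_dev_info = mapping->backing_dev_info; */
    static void set_task_bdi(struct inode *inode)
    {
        current->backing_dev_info = inode_to_bdi(inode);
    }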

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
e81d16f8db Do not use MS_* flags anymore in kernel space.
MS_* flags from <linux/mount.h> should not be used in the kernel
anymore from 4.x onwards. Instead, we need to use the SB_* versions.
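
For example (a sketch):

    static bool fs_is_readonly(struct super_block *sb)
    {
        return (sb->s_flags & SB_RDONLY) != 0;    /* was MS_RDONLY */
    }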

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Zach Brown
bad0455e28 Use count/scan objects shrinking interface
Move to the more recent interfaces for counting and scanning cached
objects to shrink.
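
A sketch of the count/scan shape (the names are illustrative):

    static unsigned long scoutfs_count_objects(struct shrinker *shrink,
                                               struct shrink_control *sc)
    {
        return nr_cached_objects();    /* hypothetical counter */
    }

    static unsigned long scoutfs_scan_objects(struct shrinker *shrink,
                                              struct shrink_control *sc)
    {
        /* free up to sc->nr_to_scan objects and return the number
         * freed, or SHRINK_STOP if we can't make progress */
        return SHRINK_STOP;
    }

    static struct shrinker scoutfs_shrinker = {
        .count_objects = scoutfs_count_objects,
        .scan_objects  = scoutfs_scan_objects,
        .seeks         = DEFAULT_SEEKS,
    };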

Signed-off-by: Zach Brown <zab@versity.com>
Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
0a30c0b926 Use page->lru instead of page->list
With v3.14-rc1-10-g34bf6ef94a83, page->list is removed. Instead,
use the union member ->lru.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Zach Brown
84a4000c85 Use more modern bio interfaces
Move towards modern bio interfaces, while unfortunately carrying along a
bunch of compat functions that let us still work with the old
incompatible interfaces.

Signed-off-by: Zach Brown <zab@versity.com>
Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Zach Brown
859f63e49b Use memalloc_nofs_save
memalloc_nofs_save() was introduced in preference to trying to use GFP
flags to indicate that a task should not recurse during reclaim.  We use
it instead of the _noio_ variant we were using before.
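
The usage pattern, as a sketch:

    static void reclaim_safe_work(void)
    {
        unsigned int flags;

        flags = memalloc_nofs_save();  /* allocations now act as GFP_NOFS */
        /* ... allocate and work without recursing into the fs ... */
        memalloc_nofs_restore(flags);
    }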

Signed-off-by: Zach Brown <zab@versity.com>
2023-08-01 16:35:48 -04:00
Zach Brown
588bdb7969 Use percpu_counter_add_batch
__percpu_counter_add was renamed to percpu_counter_add_batch to make it
clear that the __ doesn't mean it's less safe, as it does in other calls
in the API, but just that it takes an additional parameter.
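
The call itself is unchanged apart from the name, e.g. (the batch
value here is illustrative):

    static void count_add(struct percpu_counter *counter, s64 nr)
    {
        /* was: __percpu_counter_add(counter, nr, 32); */
        percpu_counter_add_batch(counter, nr, 32);
    }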

Signed-off-by: Zach Brown <zab@versity.com>
Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:48 -04:00
Auke Kok
b894f6b04c Use __posix_acl_create/_chmod and add backwards compatibility
There are new interfaces available but the old one has been retained
for us to use. In case of older kernels, we will need to fall back
to the previous name of these functions.
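
A hedged compat sketch (the guard macro is hypothetical):

    #ifndef KC_HAVE___POSIX_ACL_CREATE
    /* older kernels expose the same helpers under the old names */
    #define __posix_acl_create(acl, gfp, mode) posix_acl_create(acl, gfp, mode)
    #define __posix_acl_chmod(acl, gfp, mode)  posix_acl_chmod(acl, gfp, mode)
    #endif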

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:35:46 -04:00
Auke Kok
e26573ae8e Fix argument test for __posix_acl_valid.
The argument is fixed to be user_namespace, instead of user_ns.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:34:50 -04:00
Auke Kok
3f6b98496f Use setattr_prepare() as inode_change_ok() was removed in v4.8-rc1
Instead, we can call setattr_prepare() directly. We provide a fallback
for older kernels.
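
A minimal fallback sketch (the guard macro is hypothetical):

    #ifndef KC_HAVE_SETATTR_PREPARE
    static inline int setattr_prepare(struct dentry *dentry,
                                      struct iattr *attr)
    {
        return inode_change_ok(d_inode(dentry), attr);
    }
    #endif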

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:34:49 -04:00
Auke Kok
b8a378ede7 Use the new inode->i_version manipulation methods.
Provide a fallback in degraded mode for kernels before v4.15-rc3 by
directly manipulating the member as needed.
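
A sketch of the two modes (the kc_ wrapper and guard names are
hypothetical):

    #ifdef KC_HAVE_IVERSION
    #include <linux/iversion.h>
    #define kc_inode_inc_iversion(inode)  inode_inc_iversion(inode)
    #else
    /* degraded fallback: manipulate the member directly */
    #define kc_inode_inc_iversion(inode)  ((inode)->i_version++)
    #endif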

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:33:28 -04:00
Auke Kok
4b08e79988 inode->i_mutex has been replaced with inode->i_rwsem.
Since v4.6-rc3-27-g9902af79c01a, inode->i_mutex has been replaced
with ->i_rwsem. However, inode_lock() and related functions have long
worked as intended and provided fully exclusive locking of the inode.

To avoid a name clash on pre-rhel8 kernels, we have to rename a
stack variable in `src/file.c`.
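
A minimal compat sketch for kernels that predate inode_lock() (the
guard macro is hypothetical):

    #ifndef KC_HAVE_INODE_LOCK
    static inline void inode_lock(struct inode *inode)
    {
        mutex_lock(&inode->i_mutex);
    }

    static inline void inode_unlock(struct inode *inode)
    {
        mutex_unlock(&inode->i_mutex);
    }
    #endif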

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:33:28 -04:00
Auke Kok
2ac28c4969 New inode->i_version API requires <iversion.h>
Since v4.15-rc3-4-gae5e165d855d, <linux/iversion.h> contains a new
inode->i_version API and it is not included by default.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:33:28 -04:00
Auke Kok
3608d1aae1 use $(MAKE) to allow passing jobserver flags.
With this, we can `make -jX` to speed up compiles a bit from
the kmod folder.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:33:28 -04:00
Auke Kok
f13757f0af module_init/_exit should have a semicolon at eol.
In the past this was not needed, but from el7 onwards these macros
require the semicolon.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:33:28 -04:00
Auke Kok
34e6efd39c Adjust for new augmented rbtree compute callback function signature
The new variant of the code that recomputes the augmented value is
designed to handle non-scalar types. To facilitate that, it has new
semantics for the _compute callback: it is now passed a boolean flag
`exit` indicating that, if the value is unchanged, it may stop and
halt propagation.

The callback now returns whether propagation should stop, rather than
the computed new value, and it updates the new computed value in the
node directly.
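
A sketch of the new callback shape (the node type, augmented field,
and helper are hypothetical):

    static bool compute_subtree_max(struct our_node *node, bool exit)
    {
        u64 max = max_of_children_and_self(node);  /* hypothetical */

        if (exit && node->subtree_max == max)
            return true;            /* unchanged: halt propagation */
        node->subtree_max = max;    /* update the node directly */
        return false;               /* keep propagating upward */
    }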

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 16:30:16 -04:00
Auke Kok
b452ca3d23 Add include <blkdev.h>.
Fixes: Error: implicit declaration of function ‘blkdev_put’

Previously this was an `extern` in <fs.h> and included implicitly,
hence the need to include it explicitly now.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 13:40:59 -04:00
Auke Kok
090c795b7e preempt_mask.h is removed entirely.
v4.1-rc4-22-g92cf211874e9 merges this into preempt.h, and on
rhel7 kernels we don't need this include anymore either.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 13:40:59 -04:00
Auke Kok
d9394cb084 page_cache_release() is removed. Use put_page() instead.
Even in 3.x, the two were already equivalent.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 13:40:59 -04:00
Auke Kok
67ae352618 flush_work_sync is equivalent to flush_work.
v3.15-rc1-6-g1a56f2aa4752 removes flush_work_sync entirely, but
ever since v3.6-rc1-25-g606a5020b9bd which made all workqueues
non-reentrant, it has been equivalent to flush_work.

This is safe because in all cases only one server->work can be
in flight at a time.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 13:40:59 -04:00
Auke Kok
38bb5a8254 d_materialise_unique replaced with d_splice_alias.
Note argument order reversal.
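
The shape of the change, as a sketch (the wrapper name is
illustrative):

    static struct dentry *splice_entry(struct inode *inode,
                                       struct dentry *dentry)
    {
        /* was d_materialise_unique(dentry, inode); note the swap */
        return d_splice_alias(inode, dentry);
    }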

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 13:40:59 -04:00
Auke Kok
2510688a36 READ_ONCE() replaces ACCESS_ONCE()
v3.18-rc3-2-g230fa253df63 forces us to replace ACCESS_ONCE() with
READ_ONCE(), but it is probably the better interface anyway and works
with non-scalar types.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 13:40:59 -04:00
Auke Kok
15a5dca8c6 PAGE_CACHE_SIZE was removed, replace with PAGE_SIZE.
PAGE_CACHE_SIZE was previously defined to be equivalent to PAGE_SIZE.

This symbol was removed in v4.6-rc1-32-g1fa64f198b9f.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 13:40:59 -04:00
Auke Kok
c3996cb021 Include kernel.h and fs.h at the top of kernelcompat.h
Because we `-include src/kernelcompat.h` from the command line,
this header gets included before any of the kernel includes in
most .c and .h files. We should at least make sure we pull in
<fs.h> and <kernel.h> since they're required.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2023-08-01 13:40:59 -04:00
37 changed files with 526 additions and 1337 deletions

View File

@@ -1,57 +1,6 @@
Versity ScoutFS Release Notes
=============================
---
v1.20
\
*Apr 22, 2024*
Minor changes to packaging to better support "weak" module linking of
the kernel module, and to including git hashes in the built package. No
changes in runtime behaviour.
---
v1.19
\
*Jan 30, 2024*
Added the log\_merge\_wait\_timeout\_ms mount option to set the timeout
for creating log merge operations. The previous timeout, now the
default, was too short for some systems and was resulting in consistent
timeouts which created an excessive number of log trees waiting to be
merged.
Improved performance of many in-mount server operations when there are a
large number of log trees waiting to be merged.
---
v1.18
\
*Nov 7, 2023*
Fixed a bug where background srch file compaction could stop making
forward progress if a partial compaction operation was committed at a
specific byte offset in a block. This would cause srch file searches to
be progressively more expensive over time. Once this fix is running
background compaction will resume, bringing the cost of searches back
down.
---
v1.17
\
*Oct 23, 2023*
Add support for EL8 generation kernels.
---
v1.16
\
*Oct 4, 2023*
Fix an issue where the server could hang on startup if its persistent
allocator structures were left in a specific degraded state by the
previously active server.
---
v1.15
\

View File

@@ -12,22 +12,17 @@ else
SP = @:
endif
SCOUTFS_GIT_DESCRIBE ?= \
SCOUTFS_GIT_DESCRIBE := \
$(shell git describe --all --abbrev=6 --long 2>/dev/null || \
echo no-git)
ESCAPED_GIT_DESCRIBE := \
$(shell echo $(SCOUTFS_GIT_DESCRIBE) |sed -e 's/\//\\\//g')
RPM_GITHASH ?= $(shell git rev-parse --short HEAD)
SCOUTFS_ARGS := SCOUTFS_GIT_DESCRIBE=$(SCOUTFS_GIT_DESCRIBE) \
RPM_GITHASH=$(RPM_GITHASH) \
CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(CURDIR)/src \
EXTRA_CFLAGS="-Werror"
# - We use the git describe from tags to set up the RPM versioning
RPM_VERSION := $(shell git describe --long --tags | awk -F '-' '{gsub(/^v/,""); print $$1}')
RPM_GITHASH := $(shell git rev-parse --short HEAD)
TARFILE = scoutfs-kmod-$(RPM_VERSION).tar
@@ -46,8 +41,7 @@ modules_install:
%.spec: %.spec.in .FORCE
sed -e 's/@@VERSION@@/$(RPM_VERSION)/g' \
-e 's/@@GITHASH@@/$(RPM_GITHASH)/g' \
-e 's/@@GITDESCRIBE@@/$(ESCAPED_GIT_DESCRIBE)/g' < $< > $@+
-e 's/@@GITHASH@@/$(RPM_GITHASH)/g' < $< > $@+
mv $@+ $@

View File

@@ -1,7 +1,6 @@
%define kmod_name scoutfs
%define kmod_version @@VERSION@@
%define kmod_git_hash @@GITHASH@@
%define kmod_git_describe @@GITDESCRIBE@@
%define pkg_date %(date +%%Y%%m%%d)
# Disable the building of the debug package(s).
@@ -76,7 +75,7 @@ echo "Building for kernel: %{kernel_version} flavors: '%{flavors_to_build}'"
for flavor in %flavors_to_build; do
rm -rf obj/$flavor
cp -r source obj/$flavor
make RPM_GITHASH=%{kmod_git_hash} SCOUTFS_GIT_DESCRIBE=%{kmod_git_describe} SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
make SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
done
%install
@@ -98,21 +97,10 @@ find %{buildroot} -type f -name \*.ko -exec %{__chmod} u+x \{\} \;
/lib/modules
%post
echo /lib/modules/%{kversion}/%{install_mod_dir}/scoutfs.ko | weak-modules --add-modules --no-initramfs
weak-modules --add-kernel --no-initramfs
depmod -a
%endif
%clean
rm -rf %{buildroot}
%preun
# stash our modules for postun cleanup
SCOUTFS_RPM_NAME=$(rpm -q %{name} | grep "%{version}-%{release}")
rpm -ql $SCOUTFS_RPM_NAME | grep '\.ko$' > /var/run/%{name}-modules-%{version}-%{release} || true
%postun
if [ -x /sbin/weak-modules ]; then
cat /var/run/%{name}-modules-%{version}-%{release} | /sbin/weak-modules --remove-modules --no-initramfs
fi
rm /var/run/%{name}-modules-%{version}-%{release} || true

View File

@@ -1078,7 +1078,7 @@ static unsigned long block_count_objects(struct shrinker *shrink, struct shrink_
scoutfs_inc_counter(sb, block_cache_count_objects);
return shrinker_min_long(atomic_read(&binf->total_inserted));
return shrinker_min_t_long((u64)atomic_read(&binf->total_inserted));
}
/*

View File

@@ -2029,253 +2029,187 @@ int scoutfs_btree_rebalance(struct super_block *sb,
key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL);
}
struct merged_range {
struct scoutfs_key start;
struct scoutfs_key end;
struct rb_root root;
int size;
};
struct merged_item {
struct merge_pos {
struct rb_node node;
struct scoutfs_key key;
struct scoutfs_btree_root *root;
struct scoutfs_block *bl;
struct scoutfs_btree_block *bt;
struct scoutfs_avl_node *avl;
struct scoutfs_key *key;
u64 seq;
u8 flags;
unsigned int val_len;
u8 val[0];
u8 *val;
};
static inline struct merged_item *mitem_container(struct rb_node *node)
static struct merge_pos *first_mpos(struct rb_root *root)
{
return node ? container_of(node, struct merged_item, node) : NULL;
}
static inline struct merged_item *first_mitem(struct rb_root *root)
{
return mitem_container(rb_first(root));
}
static inline struct merged_item *last_mitem(struct rb_root *root)
{
return mitem_container(rb_last(root));
}
static inline struct merged_item *next_mitem(struct merged_item *mitem)
{
return mitem_container(mitem ? rb_next(&mitem->node) : NULL);
}
static inline struct merged_item *prev_mitem(struct merged_item *mitem)
{
return mitem_container(mitem ? rb_prev(&mitem->node) : NULL);
}
static struct merged_item *find_mitem(struct rb_root *root, struct scoutfs_key *key,
struct rb_node **parent_ret, struct rb_node ***link_ret)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct merged_item *mitem;
int cmp;
while (*node) {
parent = *node;
mitem = container_of(*node, struct merged_item, node);
cmp = scoutfs_key_compare(key, &mitem->key);
if (cmp < 0) {
node = &(*node)->rb_left;
} else if (cmp > 0) {
node = &(*node)->rb_right;
} else {
*parent_ret = NULL;
*link_ret = NULL;
return mitem;
}
}
*parent_ret = parent;
*link_ret = node;
struct rb_node *node = rb_first(root);
if (node)
return container_of(node, struct merge_pos, node);
return NULL;
}
static void insert_mitem(struct merged_range *rng, struct merged_item *mitem,
struct rb_node *parent, struct rb_node **link)
static struct merge_pos *next_mpos(struct merge_pos *mpos)
{
rb_link_node(&mitem->node, parent, link);
rb_insert_color(&mitem->node, &rng->root);
rng->size += item_len_bytes(mitem->val_len);
struct rb_node *node;
if (mpos && (node = rb_next(&mpos->node)))
return container_of(node, struct merge_pos, node);
else
return NULL;
}
static void replace_mitem(struct merged_range *rng, struct merged_item *victim,
struct merged_item *new)
static void free_mpos(struct super_block *sb, struct merge_pos *mpos)
{
rb_replace_node(&victim->node, &new->node, &rng->root);
RB_CLEAR_NODE(&victim->node);
rng->size -= item_len_bytes(victim->val_len);
rng->size += item_len_bytes(new->val_len);
scoutfs_block_put(sb, mpos->bl);
kfree(mpos);
}
static void free_mitem(struct merged_range *rng, struct merged_item *mitem)
static void insert_mpos(struct rb_root *pos_root, struct merge_pos *ins)
{
if (IS_ERR_OR_NULL(mitem))
return;
struct rb_node **node = &pos_root->rb_node;
struct rb_node *parent = NULL;
struct merge_pos *mpos;
int cmp;
if (!RB_EMPTY_NODE(&mitem->node)) {
rng->size -= item_len_bytes(mitem->val_len);
rb_erase(&mitem->node, &rng->root);
parent = NULL;
while (*node) {
parent = *node;
mpos = container_of(*node, struct merge_pos, node);
/* sort merge items by key then newest to oldest */
cmp = scoutfs_key_compare(ins->key, mpos->key) ?:
-scoutfs_cmp(ins->seq, mpos->seq);
if (cmp < 0)
node = &(*node)->rb_left;
else
node = &(*node)->rb_right;
}
kfree(mitem);
}
static void trim_range_size(struct merged_range *rng, int merge_window)
{
struct merged_item *mitem;
struct merged_item *tmp;
mitem = last_mitem(&rng->root);
while (mitem && rng->size > merge_window) {
rng->end = mitem->key;
scoutfs_key_dec(&rng->end);
tmp = mitem;
mitem = prev_mitem(mitem);
free_mitem(rng, tmp);
}
}
static void trim_range_end(struct merged_range *rng)
{
struct merged_item *mitem;
struct merged_item *tmp;
mitem = last_mitem(&rng->root);
while (mitem && scoutfs_key_compare(&mitem->key, &rng->end) > 0) {
tmp = mitem;
mitem = prev_mitem(mitem);
free_mitem(rng, tmp);
}
rb_link_node(&ins->node, parent, node);
rb_insert_color(&ins->node, pos_root);
}
/*
* Record and combine logged items from log roots for merging with the
* writable destination root. The caller is responsible for trimming
* the range if it gets too large or if the key range shrinks.
* Find the next item in the merge_pos root in the caller's range and
* insert it into the rbtree sorted by key and version so that merging
* can find the next newest item at the front of the rbtree. We free
* the mpos on error or if there are no more items in the range.
*/
static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags,
void *val, int val_len, void *arg)
static int reset_mpos(struct super_block *sb, struct rb_root *pos_root, struct merge_pos *mpos,
struct scoutfs_key *start, struct scoutfs_key *end)
{
struct merged_range *rng = arg;
struct merged_item *mitem;
struct merged_item *found;
struct rb_node *parent;
struct rb_node **link;
int ret;
struct scoutfs_btree_item *item;
struct scoutfs_avl_node *next;
struct btree_walk_key_range kr;
struct scoutfs_key walk_key;
int ret = 0;
found = find_mitem(&rng->root, key, &parent, &link);
if (found) {
ret = scoutfs_forest_combine_deltas(key, found->val, found->val_len, val, val_len);
if (ret < 0)
goto out;
if (ret > 0) {
if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
free_mitem(rng, found);
}
ret = 0;
goto out;
}
if (found->seq >= seq) {
ret = 0;
goto out;
}
/* always erase before freeing or inserting */
if (!RB_EMPTY_NODE(&mpos->node)) {
rb_erase(&mpos->node, pos_root);
RB_CLEAR_NODE(&mpos->node);
}
mitem = kmalloc(offsetof(struct merged_item, val[val_len]), GFP_NOFS);
if (!mitem) {
ret = -ENOMEM;
/*
* advance to next item via the avl tree. The caller's pos is
* only ever incremented past the last key so we can use next to
* iterate rather than using search to skip past multiple items.
*/
if (mpos->avl)
mpos->avl = scoutfs_avl_next(&mpos->bt->item_root, mpos->avl);
/* find the next leaf with the key if we run out of items */
walk_key = *start;
while (!mpos->avl && !scoutfs_key_is_zeros(&walk_key)) {
scoutfs_block_put(sb, mpos->bl);
mpos->bl = NULL;
ret = btree_walk(sb, NULL, NULL, mpos->root, BTW_NEXT, &walk_key,
0, &mpos->bl, &kr, NULL);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
free_mpos(sb, mpos);
goto out;
}
mpos->bt = mpos->bl->data;
mpos->avl = scoutfs_avl_search(&mpos->bt->item_root, cmp_key_item,
start, NULL, NULL, &next, NULL) ?: next;
if (mpos->avl == NULL)
walk_key = kr.iter_next;
}
/* see if we're out of items within the range */
item = node_item(mpos->avl);
if (!item || scoutfs_key_compare(item_key(item), end) > 0) {
free_mpos(sb, mpos);
ret = 0;
goto out;
}
mitem->key = *key;
mitem->seq = seq;
mitem->flags = flags;
mitem->val_len = val_len;
if (val_len)
memcpy(mitem->val, val, val_len);
if (found) {
replace_mitem(rng, found, mitem);
free_mitem(rng, found);
} else {
insert_mitem(rng, mitem, parent, link);
}
/* insert the next item within range at its version */
mpos->key = item_key(item);
mpos->seq = le64_to_cpu(item->seq);
mpos->flags = item->flags;
mpos->val_len = item_val_len(item);
mpos->val = item_val(mpos->bt, item);
insert_mpos(pos_root, mpos);
ret = 0;
out:
return ret;
}
/*
* Read a range of merged items. The caller has set the key bounds of
* the range. We read a merge window's worth of items from blocks in
* each input btree.
* The caller has reset all the merge positions for all the input log
* btree roots and wants the next logged item it should try and merge
* with the items in the fs_root.
*
* The caller can only use the smallest range that overlaps with all the
* blocks that we read. We start reading from the range's start key so
* it will always be present and we don't need to adjust it. The final
* block we read from each input might not cover the range's end so it
* needs to be adjusted.
*
* The end range can also shrink if we have to drop items because the
* items exceeded the merge window size.
* We look ahead in the logged item stream to see if we should merge any
* older logged delta items into one result for the caller. We also
* take this opportunity to skip and reset the mpos for any older
* versions of the first item.
*/
static int read_merged_range(struct super_block *sb, struct merged_range *rng,
struct list_head *inputs, int merge_window)
static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
struct scoutfs_key *end, struct merge_pos **mpos_ret)
{
struct scoutfs_btree_root_head *rhead;
struct scoutfs_key start;
struct scoutfs_key end;
struct merge_pos *mpos;
struct merge_pos *next;
struct scoutfs_key key;
int ret = 0;
int i;
list_for_each_entry(rhead, inputs, head) {
key = rng->start;
while ((mpos = first_mpos(pos_root)) && (next = next_mpos(mpos)) &&
!scoutfs_key_compare(mpos->key, next->key)) {
for (i = 0; i < merge_window; i += SCOUTFS_BLOCK_LG_SIZE) {
start = key;
end = rng->end;
ret = scoutfs_btree_read_items(sb, &rhead->root, &key, &start, &end,
merge_read_item, rng);
ret = scoutfs_forest_combine_deltas(mpos->key, mpos->val, mpos->val_len,
next->val, next->val_len);
if (ret < 0)
break;
/* reset advances to the next item */
key = *mpos->key;
scoutfs_key_inc(&key);
/* always skip next combined or older version */
ret = reset_mpos(sb, pos_root, next, &key, end);
if (ret < 0)
break;
if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
/* if merging resulted in no info, skip current */
ret = reset_mpos(sb, pos_root, mpos, &key, end);
if (ret < 0)
goto out;
if (scoutfs_key_compare(&end, &rng->end) >= 0)
break;
key = end;
scoutfs_key_inc(&key);
}
if (scoutfs_key_compare(&end, &rng->end) < 0) {
rng->end = end;
trim_range_end(rng);
}
if (rng->size > merge_window)
trim_range_size(rng, merge_window);
}
trace_scoutfs_btree_merge_read_range(sb, &rng->start, &rng->end, rng->size);
ret = 0;
out:
*mpos_ret = mpos;
return ret;
}
@@ -2292,13 +2226,6 @@ out:
* to allocators running low or needing to join/split the parent.
* *next_ret is set to the next key which hasn't been merged so that the
* caller can retry with a new allocator and subtree.
*
* The number of input roots can be immense. The merge_window specifies
* the size of the set of merged items that we'll maintain as we iterate
* over all the input roots. Once we've merged items into the window
* from all the input roots the merged input items are then merged to
* the writable destination root. It may take multiple passes of
* windows of merged items to cover the input key range.
*/
int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_alloc *alloc,
@@ -2308,16 +2235,18 @@ int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_key *next_ret,
struct scoutfs_btree_root *root,
struct list_head *inputs,
bool subtree, int dirty_limit, int alloc_low, int merge_window)
bool subtree, int dirty_limit, int alloc_low)
{
struct scoutfs_btree_root_head *rhead;
struct rb_root pos_root = RB_ROOT;
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_block *bl = NULL;
struct btree_walk_key_range kr;
struct scoutfs_avl_node *par;
struct merged_item *mitem;
struct merged_item *tmp;
struct merged_range rng;
struct scoutfs_key next;
struct merge_pos *mpos;
struct merge_pos *tmp;
int walk_val_len;
int walk_flags;
bool is_del;
@@ -2328,59 +2257,49 @@ int scoutfs_btree_merge(struct super_block *sb,
trace_scoutfs_btree_merge(sb, root, start, end);
scoutfs_inc_counter(sb, btree_merge);
list_for_each_entry(rhead, inputs, head) {
mpos = kzalloc(sizeof(*mpos), GFP_NOFS);
if (!mpos) {
ret = -ENOMEM;
goto out;
}
RB_CLEAR_NODE(&mpos->node);
mpos->root = &rhead->root;
ret = reset_mpos(sb, &pos_root, mpos, start, end);
if (ret < 0)
goto out;
}
walk_flags = BTW_DIRTY;
if (subtree)
walk_flags |= BTW_SUBTREE;
walk_val_len = 0;
rng.start = *start;
rng.end = *end;
rng.root = RB_ROOT;
rng.size = 0;
ret = read_merged_range(sb, &rng, inputs, merge_window);
if (ret < 0)
goto out;
for (;;) {
/* read next window as it empties (and it is possible to read an empty range) */
mitem = first_mitem(&rng.root);
if (!mitem) {
/* done if the read range hit the end */
if (scoutfs_key_compare(&rng.end, end) >= 0)
break;
/* read next batch of merged items */
rng.start = rng.end;
scoutfs_key_inc(&rng.start);
rng.end = *end;
ret = read_merged_range(sb, &rng, inputs, merge_window);
if (ret < 0)
break;
continue;
}
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) {
scoutfs_inc_counter(sb, btree_merge_dirty_limit);
ret = -ERANGE;
*next_ret = mitem->key;
*next_ret = *mpos->key;
goto out;
}
if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) {
scoutfs_inc_counter(sb, btree_merge_alloc_low);
ret = -ERANGE;
*next_ret = mitem->key;
*next_ret = *mpos->key;
goto out;
}
scoutfs_block_put(sb, bl);
bl = NULL;
ret = btree_walk(sb, alloc, wri, root, walk_flags,
&mitem->key, walk_val_len, &bl, &kr, NULL);
mpos->key, walk_val_len, &bl, &kr, NULL);
if (ret < 0) {
if (ret == -ERANGE)
*next_ret = mitem->key;
*next_ret = *mpos->key;
goto out;
}
bt = bl->data;
@@ -2392,21 +2311,22 @@ int scoutfs_btree_merge(struct super_block *sb,
continue;
}
while (mitem) {
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
/* walk to new leaf if we exceed parent ref key */
if (scoutfs_key_compare(&mitem->key, &kr.end) > 0)
if (scoutfs_key_compare(mpos->key, &kr.end) > 0)
break;
/* see if there's an existing item */
item = leaf_item_hash_search(sb, bt, &mitem->key);
is_del = !!(mitem->flags & SCOUTFS_ITEM_FLAG_DELETION);
item = leaf_item_hash_search(sb, bt, mpos->key);
is_del = !!(mpos->flags & SCOUTFS_ITEM_FLAG_DELETION);
/* see if we're merging delta items */
if (item && !is_del)
delta = scoutfs_forest_combine_deltas(&mitem->key,
delta = scoutfs_forest_combine_deltas(mpos->key,
item_val(bt, item),
item_val_len(item),
mitem->val, mitem->val_len);
mpos->val, mpos->val_len);
else
delta = 0;
if (delta < 0) {
@@ -2418,38 +2338,40 @@ int scoutfs_btree_merge(struct super_block *sb,
scoutfs_inc_counter(sb, btree_merge_delta_null);
}
trace_scoutfs_btree_merge_items(sb, &mitem->key, mitem->val_len,
trace_scoutfs_btree_merge_items(sb, mpos->root,
mpos->key, mpos->val_len,
item ? root : NULL,
item ? item_key(item) : NULL,
item ? item_val_len(item) : 0, is_del);
/* rewalk and split if ins/update needs room */
if (!is_del && !delta && !mid_free_item_room(bt, mitem->val_len)) {
if (!is_del && !delta && !mid_free_item_room(bt, mpos->val_len)) {
walk_flags |= BTW_INSERT;
walk_val_len = mitem->val_len;
walk_val_len = mpos->val_len;
break;
}
/* insert missing non-deletion merge items */
if (!item && !is_del) {
scoutfs_avl_search(&bt->item_root, cmp_key_item, &mitem->key,
scoutfs_avl_search(&bt->item_root,
cmp_key_item, mpos->key,
&cmp, &par, NULL, NULL);
create_item(bt, &mitem->key, mitem->seq, mitem->flags,
mitem->val, mitem->val_len, par, cmp);
create_item(bt, mpos->key, mpos->seq, mpos->flags,
mpos->val, mpos->val_len, par, cmp);
scoutfs_inc_counter(sb, btree_merge_insert);
}
/* update existing items */
if (item && !is_del && !delta) {
item->seq = cpu_to_le64(mitem->seq);
item->flags = mitem->flags;
update_item_value(bt, item, mitem->val, mitem->val_len);
item->seq = cpu_to_le64(mpos->seq);
item->flags = mpos->flags;
update_item_value(bt, item, mpos->val, mpos->val_len);
scoutfs_inc_counter(sb, btree_merge_update);
}
/* update combined delta item seq */
if (delta == SCOUTFS_DELTA_COMBINED) {
item->seq = cpu_to_le64(mitem->seq);
item->seq = cpu_to_le64(mpos->seq);
}
/*
@@ -2481,18 +2403,21 @@ int scoutfs_btree_merge(struct super_block *sb,
walk_flags &= ~(BTW_INSERT | BTW_DELETE);
walk_val_len = 0;
/* finished with this merged item */
tmp = mitem;
mitem = next_mitem(mitem);
free_mitem(&rng, tmp);
/* finished with this key, skip any older items */
next = *mpos->key;
scoutfs_key_inc(&next);
ret = reset_mpos(sb, &pos_root, mpos, &next, end);
if (ret < 0)
goto out;
}
}
ret = 0;
out:
scoutfs_block_put(sb, bl);
rbtree_postorder_for_each_entry_safe(mitem, tmp, &rng.root, node)
free_mitem(&rng, mitem);
rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) {
free_mpos(sb, mpos);
}
return ret;
}

View File

@@ -119,7 +119,7 @@ int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_key *next_ret,
struct scoutfs_btree_root *root,
struct list_head *input_list,
bool subtree, int dirty_limit, int alloc_low, int merge_window);
bool subtree, int dirty_limit, int alloc_low);
int scoutfs_btree_free_blocks(struct super_block *sb,
struct scoutfs_alloc *alloc,

View File

@@ -145,7 +145,6 @@
EXPAND_COUNTER(lock_shrink_work) \
EXPAND_COUNTER(lock_unlock) \
EXPAND_COUNTER(lock_wait) \
EXPAND_COUNTER(log_merge_wait_timeout) \
EXPAND_COUNTER(net_dropped_response) \
EXPAND_COUNTER(net_send_bytes) \
EXPAND_COUNTER(net_send_error) \

View File

@@ -1058,15 +1058,16 @@ static int symlink_item_ops(struct super_block *sb, enum symlink_ops op, u64 ino
return ret;
}
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
/*
* Fill a buffer with the null terminated symlink, and return it
* so callers can free it once the vfs is done.
Fill a buffer with the null terminated symlink, point nd at it, and
* return it so put_link can free it once the vfs is done.
*
* We chose to pay the runtime cost of per-call allocation and copy
* overhead instead of wiring up symlinks to the page cache, storing
* each small link in a full page, and later having to reclaim them.
*/
static void *scoutfs_get_link_target(struct dentry *dentry)
static void *scoutfs_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct inode *inode = dentry->d_inode;
struct super_block *sb = inode->i_sb;
@@ -1125,20 +1126,10 @@ out:
if (ret < 0) {
kfree(path);
path = ERR_PTR(ret);
}
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
return path;
}
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER
static void *scoutfs_follow_link(struct dentry *dentry, struct nameidata *nd)
{
char *path;
path = scoutfs_get_link_target(dentry);
if (!IS_ERR_OR_NULL(path))
} else {
nd_set_link(nd, path);
}
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
return path;
}
@@ -1151,12 +1142,67 @@ static void scoutfs_put_link(struct dentry *dentry, struct nameidata *nd,
#else
static const char *scoutfs_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done)
{
char *path;
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *inode_lock = NULL;
char *path = NULL;
loff_t size;
int ret;
path = scoutfs_get_link_target(dentry);
if (!IS_ERR_OR_NULL(path))
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
if (ret)
return ERR_PTR(ret);
size = i_size_read(inode);
if (size == 0 || size > SCOUTFS_SYMLINK_MAX_SIZE) {
scoutfs_corruption(sb, SC_SYMLINK_INODE_SIZE,
corrupt_symlink_inode_size,
"ino %llu size %llu",
scoutfs_ino(inode), (u64)size);
ret = -EIO;
goto out;
}
/* unlikely, but possible I suppose */
if (size > PATH_MAX) {
ret = -ENAMETOOLONG;
goto out;
}
path = kmalloc(size, GFP_NOFS);
if (!path) {
ret = -ENOMEM;
goto out;
}
ret = symlink_item_ops(sb, SYM_LOOKUP, scoutfs_ino(inode), inode_lock,
path, size);
if (ret == -ENOENT) {
scoutfs_corruption(sb, SC_SYMLINK_MISSING_ITEM,
corrupt_symlink_missing_item,
"ino %llu size %llu", scoutfs_ino(inode),
size);
ret = -EIO;
} else if (ret == 0 && path[size - 1]) {
scoutfs_corruption(sb, SC_SYMLINK_NOT_NULL_TERM,
corrupt_symlink_not_null_term,
"ino %llu last %u",
scoutfs_ino(inode), path[size - 1]);
ret = -EIO;
}
if (ret != -EIO)
set_delayed_call(done, kfree_link, path);
out:
if (ret < 0) {
kfree(path);
path = ERR_PTR(ret);
}
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
return path;
}
#endif

View File

@@ -721,8 +721,7 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work)
ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end,
&next, &comp.root, &inputs,
!!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)),
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10,
(2 * 1024 * 1024));
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10);
if (ret == -ERANGE) {
comp.remain = next;
le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN);

View File

@@ -2541,7 +2541,7 @@ static unsigned long item_cache_count_objects(struct shrinker *shrink,
scoutfs_inc_counter(sb, item_cache_count_objects);
return shrinker_min_long(cinf->lru_pages);
return shrinker_min_t_long((u64)(cinf->lru_pages));
}
/*

View File

@@ -1409,7 +1409,7 @@ static unsigned long lock_count_objects(struct shrinker *shrink,
scoutfs_inc_counter(sb, lock_count_objects);
return shrinker_min_long(linfo->lru_nr);
return shrinker_min_t_long((u64)(linfo->lru_nr));
}
/*

View File

@@ -33,7 +33,6 @@ enum {
Opt_acl,
Opt_data_prealloc_blocks,
Opt_data_prealloc_contig_only,
Opt_log_merge_wait_timeout_ms,
Opt_metadev_path,
Opt_noacl,
Opt_orphan_scan_delay_ms,
@@ -46,7 +45,6 @@ static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
{Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"},
{Opt_metadev_path, "metadev_path=%s"},
{Opt_noacl, "noacl"},
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
@@ -115,10 +113,6 @@ static void free_options(struct scoutfs_mount_options *opts)
kfree(opts->metadev_path);
}
#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS 100UL
#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS 500
#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS (60 * MSEC_PER_SEC)
#define MIN_ORPHAN_SCAN_DELAY_MS 100UL
#define DEFAULT_ORPHAN_SCAN_DELAY_MS (10 * MSEC_PER_SEC)
#define MAX_ORPHAN_SCAN_DELAY_MS (60 * MSEC_PER_SEC)
@@ -132,27 +126,11 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
opts->data_prealloc_contig_only = 1;
opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS;
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
opts->quorum_slot_nr = -1;
}
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
{
if (ret < 0) {
scoutfs_err(sb, "failed to parse log_merge_wait_timeout_ms value");
return -EINVAL;
}
if (val < MIN_LOG_MERGE_WAIT_TIMEOUT_MS || val > MAX_LOG_MERGE_WAIT_TIMEOUT_MS) {
scoutfs_err(sb, "invalid log_merge_wait_timeout_ms value %d, must be between %lu and %lu",
val, MIN_LOG_MERGE_WAIT_TIMEOUT_MS, MAX_LOG_MERGE_WAIT_TIMEOUT_MS);
return -EINVAL;
}
return 0;
}
static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
{
if (ret < 0) {
@@ -218,14 +196,6 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->data_prealloc_contig_only = nr;
break;
case Opt_log_merge_wait_timeout_ms:
ret = match_int(args, &nr);
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
if (ret < 0)
return ret;
opts->log_merge_wait_timeout_ms = nr;
break;
case Opt_metadev_path:
ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
if (ret < 0)
@@ -452,43 +422,6 @@ static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj
}
SCOUTFS_ATTR_RW(data_prealloc_contig_only);
static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
struct scoutfs_mount_options opts;
scoutfs_options_read(sb, &opts);
return snprintf(buf, PAGE_SIZE, "%u", opts.log_merge_wait_timeout_ms);
}
static ssize_t log_merge_wait_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
DECLARE_OPTIONS_INFO(sb, optinf);
char nullterm[30]; /* more than enough for octal -U64_MAX */
int val;
int len;
int ret;
len = min(count, sizeof(nullterm) - 1);
memcpy(nullterm, buf, len);
nullterm[len] = '\0';
ret = kstrtoint(nullterm, 0, &val);
ret = verify_log_merge_wait_timeout_ms(sb, ret, val);
if (ret == 0) {
write_seqlock(&optinf->seqlock);
optinf->opts.log_merge_wait_timeout_ms = val;
write_sequnlock(&optinf->seqlock);
ret = count;
}
return ret;
}
SCOUTFS_ATTR_RW(log_merge_wait_timeout_ms);
static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
@@ -592,7 +525,6 @@ SCOUTFS_ATTR_RO(quorum_slot_nr);
static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms),
SCOUTFS_ATTR_PTR(metadev_path),
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),

View File

@@ -8,7 +8,6 @@
struct scoutfs_mount_options {
u64 data_prealloc_blocks;
bool data_prealloc_contig_only;
unsigned int log_merge_wait_timeout_ms;
char *metadev_path;
unsigned int orphan_scan_delay_ms;
int quorum_slot_nr;

View File

@@ -439,7 +439,6 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
SCSB_TRACE_ASSIGN(sb);
__entry->journal_info = (unsigned long)journal_info;
__entry->holders = holders;
__entry->ret = ret;
),
TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d",
@@ -1747,41 +1746,21 @@ TRACE_EVENT(scoutfs_btree_merge,
sk_trace_args(end))
);
TRACE_EVENT(scoutfs_btree_merge_read_range,
TP_PROTO(struct super_block *sb, struct scoutfs_key *start, struct scoutfs_key *end,
int size),
TP_ARGS(sb, start, end, size),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
sk_trace_define(start)
sk_trace_define(end)
__field(int, size)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
sk_trace_assign(start, start);
sk_trace_assign(end, end);
__entry->size = size;
),
TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" size %d",
SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end), __entry->size)
);
TRACE_EVENT(scoutfs_btree_merge_items,
TP_PROTO(struct super_block *sb,
struct scoutfs_btree_root *m_root,
struct scoutfs_key *m_key, int m_val_len,
struct scoutfs_btree_root *f_root,
struct scoutfs_key *f_key, int f_val_len,
int is_del),
TP_ARGS(sb, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, m_root_blkno)
__field(__u64, m_root_seq)
__field(__u8, m_root_height)
sk_trace_define(m_key)
__field(int, m_val_len)
__field(__u64, f_root_blkno)
@@ -1794,6 +1773,10 @@ TRACE_EVENT(scoutfs_btree_merge_items,
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->m_root_blkno = m_root ?
le64_to_cpu(m_root->ref.blkno) : 0;
__entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0;
__entry->m_root_height = m_root ? m_root->height : 0;
sk_trace_assign(m_key, m_key);
__entry->m_val_len = m_val_len;
__entry->f_root_blkno = f_root ?
@@ -1805,9 +1788,11 @@ TRACE_EVENT(scoutfs_btree_merge_items,
__entry->is_del = !!is_del;
),
TP_printk(SCSBF" merge item key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
SCSB_TRACE_ARGS, sk_trace_args(m_key), __entry->m_val_len,
__entry->f_root_blkno, __entry->f_root_seq, __entry->f_root_height,
TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq,
__entry->m_root_height, sk_trace_args(m_key),
__entry->m_val_len, __entry->f_root_blkno,
__entry->f_root_seq, __entry->f_root_height,
sk_trace_args(f_key), __entry->f_val_len, __entry->is_del)
);
@@ -1911,9 +1896,8 @@ DEFINE_EVENT(scoutfs_server_client_count_class, scoutfs_server_client_down,
DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
u32 avail_before, u32 freed_before, int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing,
exceeded),
u32 avail_before, u32 freed_before, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(int, holding)
@@ -1921,7 +1905,6 @@ DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
__field(int, nr_holders)
__field(__u32, avail_before)
__field(__u32, freed_before)
__field(int, committing)
__field(int, exceeded)
),
TP_fast_assign(
@@ -1931,33 +1914,31 @@ DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
__entry->nr_holders = nr_holders;
__entry->avail_before = avail_before;
__entry->freed_before = freed_before;
__entry->committing = !!committing;
__entry->exceeded = !!exceeded;
),
TP_printk(SCSBF" holding %u applying %u nr %u avail_before %u freed_before %u committing %u exceeded %u",
TP_printk(SCSBF" holding %u applying %u nr %u avail_before %u freed_before %u exceeded %u",
SCSB_TRACE_ARGS, __entry->holding, __entry->applying, __entry->nr_holders,
__entry->avail_before, __entry->freed_before, __entry->committing,
__entry->exceeded)
__entry->avail_before, __entry->freed_before, __entry->exceeded)
);
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_hold,
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
u32 avail_before, u32 freed_before, int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
u32 avail_before, u32 freed_before, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
);
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_apply,
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
u32 avail_before, u32 freed_before, int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
u32 avail_before, u32 freed_before, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
);
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_start,
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
u32 avail_before, u32 freed_before, int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
u32 avail_before, u32 freed_before, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
);
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_end,
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
u32 avail_before, u32 freed_before, int committing, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
u32 avail_before, u32 freed_before, int exceeded),
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
);
#define slt_symbolic(mode) \
@@ -2090,71 +2071,6 @@ TRACE_EVENT(scoutfs_trans_seq_last,
SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
);
TRACE_EVENT(scoutfs_server_finalize_items,
TP_PROTO(struct super_block *sb, u64 rid, u64 item_rid, u64 item_nr, u64 item_flags,
u64 item_get_trans_seq),
TP_ARGS(sb, rid, item_rid, item_nr, item_flags, item_get_trans_seq),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(__u64, item_rid)
__field(__u64, item_nr)
__field(__u64, item_flags)
__field(__u64, item_get_trans_seq)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->item_rid = item_rid;
__entry->item_nr = item_nr;
__entry->item_flags = item_flags;
__entry->item_get_trans_seq = item_get_trans_seq;
),
TP_printk(SCSBF" rid %016llx item_rid %016llx item_nr %llu item_flags 0x%llx item_get_trans_seq %llu",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->item_rid, __entry->item_nr,
__entry->item_flags, __entry->item_get_trans_seq)
);
TRACE_EVENT(scoutfs_server_finalize_decision,
TP_PROTO(struct super_block *sb, u64 rid, bool saw_finalized, bool others_active,
bool ours_visible, bool finalize_ours, unsigned int delay_ms,
u64 finalize_sent_seq),
TP_ARGS(sb, rid, saw_finalized, others_active, ours_visible, finalize_ours, delay_ms,
finalize_sent_seq),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(bool, saw_finalized)
__field(bool, others_active)
__field(bool, ours_visible)
__field(bool, finalize_ours)
__field(unsigned int, delay_ms)
__field(__u64, finalize_sent_seq)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->saw_finalized = saw_finalized;
__entry->others_active = others_active;
__entry->ours_visible = ours_visible;
__entry->finalize_ours = finalize_ours;
__entry->delay_ms = delay_ms;
__entry->finalize_sent_seq = finalize_sent_seq;
),
TP_printk(SCSBF" rid %016llx saw_finalized %u others_active %u ours_visible %u finalize_ours %u delay_ms %u finalize_sent_seq %llu",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->saw_finalized, __entry->others_active,
__entry->ours_visible, __entry->finalize_ours, __entry->delay_ms,
__entry->finalize_sent_seq)
);
TRACE_EVENT(scoutfs_get_log_merge_status,
TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key,
u64 nr_requests, u64 nr_complete, u64 seq),
@@ -2879,81 +2795,6 @@ TRACE_EVENT(scoutfs_omap_should_delete,
SCSB_TRACE_ARGS, __entry->ino, __entry->nlink, __entry->ret)
);
#define SSCF_FMT "[bo %llu bs %llu es %llu]"
#define SSCF_FIELDS(pref) \
__field(__u64, pref##_blkno) \
__field(__u64, pref##_blocks) \
__field(__u64, pref##_entries)
#define SSCF_ASSIGN(pref, sfl) \
__entry->pref##_blkno = le64_to_cpu((sfl)->ref.blkno); \
__entry->pref##_blocks = le64_to_cpu((sfl)->blocks); \
__entry->pref##_entries = le64_to_cpu((sfl)->entries);
#define SSCF_ENTRY_ARGS(pref) \
__entry->pref##_blkno, \
__entry->pref##_blocks, \
__entry->pref##_entries
DECLARE_EVENT_CLASS(scoutfs_srch_compact_class,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, id)
__field(__u8, nr)
__field(__u8, flags)
SSCF_FIELDS(out)
__field(__u64, in0_blk)
__field(__u64, in0_pos)
SSCF_FIELDS(in0)
__field(__u64, in1_blk)
__field(__u64, in1_pos)
SSCF_FIELDS(in1)
__field(__u64, in2_blk)
__field(__u64, in2_pos)
SSCF_FIELDS(in2)
__field(__u64, in3_blk)
__field(__u64, in3_pos)
SSCF_FIELDS(in3)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->id = le64_to_cpu(sc->id);
__entry->nr = sc->nr;
__entry->flags = sc->flags;
SSCF_ASSIGN(out, &sc->out)
__entry->in0_blk = le64_to_cpu(sc->in[0].blk);
__entry->in0_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in0, &sc->in[0].sfl)
__entry->in1_blk = le64_to_cpu(sc->in[0].blk);
__entry->in1_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in1, &sc->in[1].sfl)
__entry->in2_blk = le64_to_cpu(sc->in[0].blk);
__entry->in2_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in2, &sc->in[2].sfl)
__entry->in3_blk = le64_to_cpu(sc->in[0].blk);
__entry->in3_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in3, &sc->in[3].sfl)
),
TP_printk(SCSBF" id %llu nr %u flags 0x%x out "SSCF_FMT" in0 b %llu p %llu "SSCF_FMT" in1 b %llu p %llu "SSCF_FMT" in2 b %llu p %llu "SSCF_FMT" in3 b %llu p %llu "SSCF_FMT,
SCSB_TRACE_ARGS, __entry->id, __entry->nr, __entry->flags, SSCF_ENTRY_ARGS(out),
__entry->in0_blk, __entry->in0_pos, SSCF_ENTRY_ARGS(in0),
__entry->in1_blk, __entry->in1_pos, SSCF_ENTRY_ARGS(in1),
__entry->in2_blk, __entry->in2_pos, SSCF_ENTRY_ARGS(in2),
__entry->in3_blk, __entry->in3_pos, SSCF_ENTRY_ARGS(in3))
);
DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_send,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc)
);
DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc)
);
#endif /* _TRACE_SCOUTFS_H */
/* This part must be outside protection */

View File

@@ -67,7 +67,6 @@ struct commit_users {
unsigned int nr_holders;
u32 avail_before;
u32 freed_before;
bool committing;
bool exceeded;
};
@@ -85,13 +84,12 @@ do { \
__typeof__(cusers) _cusers = (cusers); \
trace_scoutfs_server_commit_##which(sb, !list_empty(&_cusers->holding), \
!list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->avail_before, \
_cusers->freed_before, _cusers->committing, _cusers->exceeded); \
_cusers->freed_before, _cusers->exceeded); \
} while (0)
struct server_info {
struct super_block *sb;
spinlock_t lock;
seqlock_t seqlock;
wait_queue_head_t waitq;
struct workqueue_struct *wq;
@@ -133,9 +131,11 @@ struct server_info {
struct mutex mounted_clients_mutex;
/* stable super stored from commits, given in locks and rpcs */
seqcount_t stable_seqcount;
struct scoutfs_super_block stable_super;
/* serializing and get and set volume options */
seqcount_t volopt_seqcount;
struct mutex volopt_mutex;
struct scoutfs_volume_options volopt;
@@ -148,8 +148,6 @@ struct server_info {
struct scoutfs_quorum_config qconf;
/* a running server maintains a private dirty super */
struct scoutfs_super_block dirty_super;
u64 finalize_sent_seq;
};
#define DECLARE_SERVER_INFO(sb, name) \
@@ -183,7 +181,7 @@ static bool get_volopt_val(struct server_info *server, int nr, u64 *val)
unsigned seq;
do {
seq = read_seqbegin(&server->seqlock);
seq = read_seqcount_begin(&server->volopt_seqcount);
if ((le64_to_cpu(server->volopt.set_bits) & bit)) {
is_set = true;
*val = le64_to_cpup(opt);
@@ -191,7 +189,7 @@ static bool get_volopt_val(struct server_info *server, int nr, u64 *val)
is_set = false;
*val = 0;
};
} while (read_seqretry(&server->seqlock, seq));
} while (read_seqcount_retry(&server->volopt_seqcount, seq));
return is_set;
}
@@ -284,14 +282,6 @@ struct commit_hold {
* per-holder allocation consumption tracking. The best we can do is
* flag all the current holders so that as they release we can see
* everyone involved in crossing the limit.
*
* The consumption of space to record freed blocks is tricky. The
* freed_before value was the space available as the holder started.
* But that happens before we actually dirty the first block in the
* freed list. If that block is too full then we just allocate a new
* empty first block. In that case the current remaining here can be a
* lot more than the initial freed_before. We account for that and
* treat freed_before as the maximum capacity.
*/
static void check_holder_budget(struct super_block *sb, struct server_info *server,
struct commit_users *cusers)
@@ -311,13 +301,8 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
return;
scoutfs_alloc_meta_remaining(&server->alloc, &avail_now, &freed_now);
avail_used = cusers->avail_before - avail_now;
if (freed_now < cusers->freed_before)
freed_used = cusers->freed_before - freed_now;
else
freed_used = SCOUTFS_ALLOC_LIST_MAX_BLOCKS - freed_now;
freed_used = cusers->freed_before - freed_now;
budget = cusers->nr_holders * COMMIT_HOLD_ALLOC_BUDGET;
if (avail_used <= budget && freed_used <= budget)
return;
@@ -340,18 +325,31 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
/*
* We don't have per-holder consumption. We allow commit holders as
* long as the total budget of all the holders doesn't exceed the alloc
* resources that were available. If a hold is waiting for budget
* availability in the allocators then we try and kick off a commit to
* fill and use the next allocators after the current transaction.
resources that were available.
*/
static bool commit_alloc_has_room(struct server_info *server, struct commit_users *cusers,
unsigned int more_holders)
{
u32 avail_before;
u32 freed_before;
u32 budget;
if (cusers->nr_holders > 0) {
avail_before = cusers->avail_before;
freed_before = cusers->freed_before;
} else {
scoutfs_alloc_meta_remaining(&server->alloc, &avail_before, &freed_before);
}
budget = (cusers->nr_holders + more_holders) * COMMIT_HOLD_ALLOC_BUDGET;
return avail_before >= budget && freed_before >= budget;
}
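A userspace sketch of the admission arithmetic in commit_alloc_has_room() above; the value given to COMMIT_HOLD_ALLOC_BUDGET here is assumed purely for illustration:

#include <stdbool.h>
#include <stdint.h>

#define COMMIT_HOLD_ALLOC_BUDGET 32u	/* assumed value, not from the source */

/* every holder, current and incoming, reserves a full budget from
 * both the avail and freed meta allocators */
static bool has_room(uint32_t avail_before, uint32_t freed_before,
		     unsigned int nr_holders, unsigned int more_holders)
{
	uint32_t budget = (nr_holders + more_holders) * COMMIT_HOLD_ALLOC_BUDGET;

	return avail_before >= budget && freed_before >= budget;
}

With two current holders asking to admit two more (the caller's hold plus the server's final commit work), both avail_before and freed_before must cover 4 * 32 = 128 blocks.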
static bool hold_commit(struct super_block *sb, struct server_info *server,
struct commit_users *cusers, struct commit_hold *hold)
{
bool has_room;
bool held;
u32 budget;
u32 av;
u32 fr;
bool held = false;
spin_lock(&cusers->lock);
@@ -359,39 +357,19 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
check_holder_budget(sb, server, cusers);
if (cusers->nr_holders == 0) {
scoutfs_alloc_meta_remaining(&server->alloc, &av, &fr);
} else {
av = cusers->avail_before;
fr = cusers->freed_before;
}
/* +2 for our additional hold and then for the final commit work the server does */
budget = (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET;
has_room = av >= budget && fr >= budget;
/* checking applying so holders drain once an apply caller starts waiting */
held = !cusers->committing && has_room && list_empty(&cusers->applying);
if (held) {
if (list_empty(&cusers->applying) && commit_alloc_has_room(server, cusers, 2)) {
scoutfs_alloc_meta_remaining(&server->alloc, &hold->avail, &hold->freed);
if (cusers->nr_holders == 0) {
cusers->avail_before = av;
cusers->freed_before = fr;
hold->avail = av;
hold->freed = fr;
cusers->avail_before = hold->avail;
cusers->freed_before = hold->freed;
cusers->exceeded = false;
} else {
scoutfs_alloc_meta_remaining(&server->alloc, &hold->avail, &hold->freed);
}
hold->exceeded = false;
hold->start = ktime_get();
list_add_tail(&hold->entry, &cusers->holding);
cusers->nr_holders++;
} else if (!has_room && cusers->nr_holders == 0 && !cusers->committing) {
cusers->committing = true;
queue_work(server->wq, &server->commit_work);
held = true;
}
spin_unlock(&cusers->lock);
@@ -415,27 +393,6 @@ static void server_hold_commit(struct super_block *sb, struct commit_hold *hold)
wait_event(cusers->waitq, hold_commit(sb, server, cusers, hold));
}
/*
* Return the higher of the avail or freed used by the active commit
* since this holder joined the commit. This is *not* the amount used
* by the holder, we don't track per-holder alloc use.
*/
static u32 server_hold_alloc_used_since(struct super_block *sb, struct commit_hold *hold)
{
DECLARE_SERVER_INFO(sb, server);
u32 avail_used;
u32 freed_used;
u32 avail_now;
u32 freed_now;
scoutfs_alloc_meta_remaining(&server->alloc, &avail_now, &freed_now);
avail_used = hold->avail - avail_now;
freed_used = hold->freed - freed_now;
return max(avail_used, freed_used);
}
/*
* This is called while holding the commit and returns once the commit
* is successfully written. Many holders can all wait for all holders
@@ -446,6 +403,7 @@ static int server_apply_commit(struct super_block *sb, struct commit_hold *hold,
DECLARE_SERVER_INFO(sb, server);
struct commit_users *cusers = &server->cusers;
struct timespec ts;
bool start_commit;
spin_lock(&cusers->lock);
@@ -466,15 +424,13 @@ static int server_apply_commit(struct super_block *sb, struct commit_hold *hold,
list_del_init(&hold->entry);
hold->ret = err;
}
cusers->nr_holders--;
if (cusers->nr_holders == 0 && !cusers->committing && !list_empty(&cusers->applying)) {
cusers->committing = true;
queue_work(server->wq, &server->commit_work);
}
start_commit = cusers->nr_holders == 0 && !list_empty(&cusers->applying);
spin_unlock(&cusers->lock);
if (start_commit)
queue_work(server->wq, &server->commit_work);
wait_event(cusers->waitq, list_empty_careful(&hold->entry));
smp_rmb(); /* entry load before ret */
return hold->ret;
@@ -482,8 +438,8 @@ static int server_apply_commit(struct super_block *sb, struct commit_hold *hold,
/*
* Start a commit from the commit work. We should only have been queued
* while there are no active holders and someone started the commit.
* There may or may not be blocked apply callers waiting for the result.
* while a holder is waiting to apply after all active holders have
* finished.
*/
static int commit_start(struct super_block *sb, struct commit_users *cusers)
{
@@ -492,7 +448,7 @@ static int commit_start(struct super_block *sb, struct commit_users *cusers)
/* make sure holders held off once commit started */
spin_lock(&cusers->lock);
TRACE_COMMIT_USERS(sb, cusers, start);
if (WARN_ON_ONCE(!cusers->committing || cusers->nr_holders != 0))
if (WARN_ON_ONCE(list_empty(&cusers->applying) || cusers->nr_holders != 0))
ret = -EINVAL;
spin_unlock(&cusers->lock);
@@ -515,7 +471,6 @@ static void commit_end(struct super_block *sb, struct commit_users *cusers, int
smp_wmb(); /* ret stores before list updates */
list_for_each_entry_safe(hold, tmp, &cusers->applying, entry)
list_del_init(&hold->entry);
cusers->committing = false;
spin_unlock(&cusers->lock);
wake_up(&cusers->waitq);
@@ -528,7 +483,7 @@ static void get_stable(struct super_block *sb, struct scoutfs_super_block *super
unsigned int seq;
do {
seq = read_seqbegin(&server->seqlock);
seq = read_seqcount_begin(&server->stable_seqcount);
if (super)
*super = server->stable_super;
if (roots) {
@@ -536,7 +491,7 @@ static void get_stable(struct super_block *sb, struct scoutfs_super_block *super
roots->logs_root = server->stable_super.logs_root;
roots->srch_root = server->stable_super.srch_root;
}
} while (read_seqretry(&server->seqlock, seq));
} while (read_seqcount_retry(&server->stable_seqcount, seq));
}
u64 scoutfs_server_seq(struct super_block *sb)
@@ -570,9 +525,11 @@ void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq)
static void set_stable_super(struct server_info *server, struct scoutfs_super_block *super)
{
write_seqlock(&server->seqlock);
preempt_disable();
write_seqcount_begin(&server->stable_seqcount);
server->stable_super = *super;
write_sequnlock(&server->seqlock);
write_seqcount_end(&server->stable_seqcount);
preempt_enable();
}
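A minimal sketch of the reader side that pairs with the seqcount writer above; copy_stable() is a hypothetical name standing in for the get_stable() reader shown earlier in this file:

static void copy_stable(struct server_info *server,
			struct scoutfs_super_block *out)
{
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&server->stable_seqcount);
		*out = server->stable_super;
	} while (read_seqcount_retry(&server->stable_seqcount, seq));
}

The preempt_disable() around the write side is what a bare seqcount_t requires: a writer preempted between begin and end would leave readers spinning until it runs again.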
/*
@@ -586,7 +543,7 @@ static void set_stable_super(struct server_info *server, struct scoutfs_super_bl
* implement commits with a single pending work func.
*
* Processing paths hold the commit while they're making multiple
* dependent changes. When they're done and want it persistent they
dependent changes. When they're done and want it persistent they add to the applying list and
* queue the commit work. This work runs, performs the commit, and
* wakes all the applying waiters with the result. Readers can run
* concurrently with these commits.
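A minimal sketch of the caller pattern this comment describes, using the hold/apply helpers from the hunks above; make_changes() is a hypothetical stand-in for the dependent modifications:

struct commit_hold hold;
int ret;

server_hold_commit(sb, &hold);

ret = make_changes(sb);		/* hypothetical dependent changes */

/* returns once the commit work has durably written the result */
ret = server_apply_commit(sb, &hold, ret);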
@@ -961,24 +918,22 @@ static int find_log_trees_item(struct super_block *sb,
}
/*
* Find the log_trees item with the greatest nr for each rid. Fills the
* caller's log_trees and sets the key before the returned log_trees for
* the next iteration. Returns 0 when done, > 0 for each item, and
* -errno on fatal errors.
* Find the next log_trees item from the key. Fills the caller's log_trees and sets
* the key past the returned log_trees for iteration. Returns 0 when done, > 0 for each
* item, and -errno on fatal errors.
*/
static int for_each_rid_last_lt(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
static int for_each_lt(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
{
SCOUTFS_BTREE_ITEM_REF(iref);
int ret;
ret = scoutfs_btree_prev(sb, root, key, &iref);
ret = scoutfs_btree_next(sb, root, key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
memcpy(lt, iref.val, iref.val_len);
*key = *iref.key;
key->sklt_nr = 0;
scoutfs_key_dec(key);
scoutfs_key_inc(key);
ret = 1;
} else {
ret = -EIO;
@@ -1073,13 +1028,21 @@ static int next_log_merge_item(struct super_block *sb,
* abandoned log btree finalized. If it takes too long each client has
a chance to make forward progress before being asked to commit again.
*
* We're waiting on heavy state that is protected by mutexes and
* transaction machinery. It's tricky to recreate that state for
* lightweight condition tests that don't change task state. Instead of
* trying to get that right, particularly as we unwind after success or
* after timeouts, waiters use an unsatisfying poll. Short enough to
* not add terrible latency, given how heavy and infrequent this already
* is, and long enough to not melt the cpu. This could be tuned if it
* becomes a problem.
*
* This can end up finalizing a new empty log btree if a new mount
* happens to arrive at just the right time. That's fine, merging will
* ignore and tear down the empty input.
*/
#define FINALIZE_POLL_MIN_DELAY_MS 5U
#define FINALIZE_POLL_MAX_DELAY_MS 100U
#define FINALIZE_POLL_DELAY_GROWTH_PCT 150U
#define FINALIZE_POLL_MS (11)
#define FINALIZE_TIMEOUT_MS (MSEC_PER_SEC / 2)
static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_log_trees *lt,
u64 rid, struct commit_hold *hold)
{
@@ -1087,10 +1050,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_log_merge_status stat;
struct scoutfs_log_merge_range rng;
struct scoutfs_mount_options opts;
struct scoutfs_log_trees each_lt;
struct scoutfs_log_trees fin;
unsigned int delay_ms;
unsigned long timeo;
bool saw_finalized;
bool others_active;
@@ -1098,14 +1059,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
bool ours_visible;
struct scoutfs_key key;
char *err_str = NULL;
ktime_t start;
int ret;
int err;
scoutfs_options_read(sb, &opts);
timeo = jiffies + msecs_to_jiffies(opts.log_merge_wait_timeout_ms);
delay_ms = FINALIZE_POLL_MIN_DELAY_MS;
start = ktime_get_raw();
timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS);
for (;;) {
/* nothing to do if there's already a merge in flight */
@@ -1122,13 +1079,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
saw_finalized = false;
others_active = false;
ours_visible = false;
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
trace_scoutfs_server_finalize_items(sb, rid, le64_to_cpu(each_lt.rid),
le64_to_cpu(each_lt.nr),
le64_to_cpu(each_lt.flags),
le64_to_cpu(each_lt.get_trans_seq));
scoutfs_key_init_log_trees(&key, 0, 0);
while ((ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED))
saw_finalized = true;
@@ -1153,10 +1105,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
finalize_ours = (lt->item_root.height > 2) ||
(le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW);
trace_scoutfs_server_finalize_decision(sb, rid, saw_finalized, others_active,
ours_visible, finalize_ours, delay_ms,
server->finalize_sent_seq);
/* done if we're not finalizing and there's no finalized */
if (!finalize_ours && !saw_finalized) {
ret = 0;
@@ -1164,13 +1112,12 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
}
/* send sync requests soon to give time to commit */
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
scoutfs_key_init_log_trees(&key, 0, 0);
while (others_active &&
(ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
(ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
(le64_to_cpu(each_lt.rid) == rid) ||
(le64_to_cpu(each_lt.get_trans_seq) <= server->finalize_sent_seq))
(le64_to_cpu(each_lt.rid) == rid))
continue;
ret = scoutfs_net_submit_request_node(sb, server->conn,
@@ -1190,8 +1137,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
break;
}
server->finalize_sent_seq = scoutfs_server_seq(sb);
/* Finalize ours if it's visible to others */
if (ours_visible) {
fin = *lt;
@@ -1229,16 +1174,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
if (ret < 0)
err_str = "applying commit before waiting for finalized";
msleep(delay_ms);
delay_ms = min(delay_ms * FINALIZE_POLL_DELAY_GROWTH_PCT / 100,
FINALIZE_POLL_MAX_DELAY_MS);
msleep(FINALIZE_POLL_MS);
server_hold_commit(sb, hold);
mutex_lock(&server->logs_mutex);
/* done if we timed out */
if (time_after(jiffies, timeo)) {
scoutfs_inc_counter(sb, log_merge_wait_timeout);
ret = 0;
break;
}
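Pulling the fragments of this function together, a condensed sketch of the poll shape the comment above describes, with the log_trees scan and error handling elided and the lock ordering simplified:

timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS);
for (;;) {
	/* ... scan log_trees items, maybe finalize and start a merge ... */

	mutex_unlock(&server->logs_mutex);
	ret = server_apply_commit(sb, hold, ret);

	msleep(FINALIZE_POLL_MS);

	server_hold_commit(sb, hold);
	mutex_lock(&server->logs_mutex);
	if (time_after(jiffies, timeo)) {
		ret = 0;
		break;
	}
}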
@@ -1821,29 +1763,43 @@ out:
* Give the caller the last seq before outstanding client commits. All
seqs up to and including this are stable; new client transactions can
* only have greater seqs.
*
* For each rid, only its greatest log trees nr can be an open commit.
* We look at the last log_trees item for each client rid and record its
* trans seq if it hasn't been committed.
*/
static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
{
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_log_trees lt;
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_log_trees *lt;
struct scoutfs_key key;
u64 last_seq = 0;
int ret;
last_seq = scoutfs_server_seq(sb) - 1;
scoutfs_key_init_log_trees(&key, 0, 0);
mutex_lock(&server->logs_mutex);
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
if ((le64_to_cpu(lt.get_trans_seq) > le64_to_cpu(lt.commit_trans_seq)) &&
le64_to_cpu(lt.get_trans_seq) <= last_seq) {
last_seq = le64_to_cpu(lt.get_trans_seq) - 1;
for (;; scoutfs_key_inc(&key)) {
ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(*lt)) {
lt = iref.val;
if ((le64_to_cpu(lt->get_trans_seq) >
le64_to_cpu(lt->commit_trans_seq)) &&
le64_to_cpu(lt->get_trans_seq) <= last_seq) {
last_seq = le64_to_cpu(lt->get_trans_seq) - 1;
}
key = *iref.key;
} else {
ret = -EIO;
}
scoutfs_btree_put_iref(&iref);
}
if (ret < 0) {
	if (ret == -ENOENT)
		ret = 0;
	break;
}
}
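A userspace rework of the scan above, with plain integers in place of the on-disk le64 fields so the arithmetic is easy to check; the struct and function names are hypothetical:

#include <stdint.h>

struct open_commit {
	uint64_t get_seq;	/* seq handed out to the client */
	uint64_t commit_seq;	/* last seq the client committed */
};

/* any seq that was handed out but not yet committed caps the
 * stable range below it */
static uint64_t last_stable_seq(uint64_t server_seq,
				const struct open_commit *c, int nr)
{
	uint64_t last_seq = server_seq - 1;
	int i;

	for (i = 0; i < nr; i++) {
		if (c[i].get_seq > c[i].commit_seq && c[i].get_seq <= last_seq)
			last_seq = c[i].get_seq - 1;
	}
	return last_seq;
}

With a server seq of 10 and one open commit at get_seq 7, last_seq drops from 9 to 6: everything at or below 6 is stable.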
@@ -1990,7 +1946,9 @@ static int server_srch_get_compact(struct super_block *sb,
ret = scoutfs_srch_get_compact(sb, &server->alloc, &server->wri,
&super->srch_root, rid, sc);
mutex_unlock(&server->srch_mutex);
if (ret < 0 || (ret == 0 && sc->nr == 0))
if (ret == 0 && sc->nr == 0)
ret = -ENOENT;
if (ret < 0)
goto apply;
mutex_lock(&server->alloc_mutex);
@@ -2495,11 +2453,9 @@ static void server_log_merge_free_work(struct work_struct *work)
while (!server_is_stopping(server)) {
if (!commit) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
commit = true;
}
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
commit = true;
ret = next_log_merge_item(sb, &super->log_merge,
SCOUTFS_LOG_MERGE_FREEING_ZONE,
@@ -2546,14 +2502,12 @@ static void server_log_merge_free_work(struct work_struct *work)
/* freed blocks are in allocator, we *have* to update fr */
BUG_ON(ret < 0);
if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
commit = false;
if (ret < 0) {
err_str = "looping commit del/upd freeing item";
break;
}
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
commit = false;
if (ret < 0) {
err_str = "looping commit del/upd freeing item";
break;
}
}
@@ -3096,9 +3050,9 @@ static int server_get_volopt(struct super_block *sb, struct scoutfs_net_connecti
}
do {
seq = read_seqbegin(&server->seqlock);
seq = read_seqcount_begin(&server->volopt_seqcount);
volopt = server->volopt;
} while (read_seqretry(&server->seqlock, seq));
} while (read_seqcount_retry(&server->volopt_seqcount, seq));
out:
return scoutfs_net_response(sb, conn, cmd, id, ret, &volopt, sizeof(volopt));
@@ -3167,12 +3121,12 @@ static int server_set_volopt(struct super_block *sb, struct scoutfs_net_connecti
apply:
ret = server_apply_commit(sb, &hold, ret);
write_seqlock(&server->seqlock);
write_seqcount_begin(&server->volopt_seqcount);
if (ret == 0)
server->volopt = super->volopt;
else
super->volopt = server->volopt;
write_sequnlock(&server->seqlock);
write_seqcount_end(&server->volopt_seqcount);
mutex_unlock(&server->volopt_mutex);
out:
@@ -3215,12 +3169,12 @@ static int server_clear_volopt(struct super_block *sb, struct scoutfs_net_connec
ret = server_apply_commit(sb, &hold, ret);
write_seqlock(&server->seqlock);
write_seqcount_begin(&server->volopt_seqcount);
if (ret == 0)
server->volopt = super->volopt;
else
super->volopt = server->volopt;
write_sequnlock(&server->seqlock);
write_seqcount_end(&server->volopt_seqcount);
mutex_unlock(&server->volopt_mutex);
out:
@@ -4326,7 +4280,6 @@ static void scoutfs_server_worker(struct work_struct *work)
scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));
scoutfs_block_writer_init(sb, &server->wri);
server->finalize_sent_seq = 0;
/* first make sure no other servers are still running */
ret = scoutfs_quorum_fence_leaders(sb, &server->qconf, server->term);
@@ -4360,9 +4313,9 @@ static void scoutfs_server_worker(struct work_struct *work)
}
/* update volume options early, possibly for use during startup */
write_seqlock(&server->seqlock);
write_seqcount_begin(&server->volopt_seqcount);
server->volopt = super->volopt;
write_sequnlock(&server->seqlock);
write_seqcount_end(&server->volopt_seqcount);
atomic64_set(&server->seq_atomic, le64_to_cpu(super->seq));
set_stable_super(server, super);
@@ -4502,7 +4455,6 @@ int scoutfs_server_setup(struct super_block *sb)
server->sb = sb;
spin_lock_init(&server->lock);
seqlock_init(&server->seqlock);
init_waitqueue_head(&server->waitq);
INIT_WORK(&server->work, scoutfs_server_worker);
server->status = SERVER_DOWN;
@@ -4517,6 +4469,8 @@ int scoutfs_server_setup(struct super_block *sb)
INIT_WORK(&server->log_merge_free_work, server_log_merge_free_work);
mutex_init(&server->srch_mutex);
mutex_init(&server->mounted_clients_mutex);
seqcount_init(&server->stable_seqcount);
seqcount_init(&server->volopt_seqcount);
mutex_init(&server->volopt_mutex);
INIT_WORK(&server->fence_pending_recov_work, fence_pending_recov_worker);
INIT_DELAYED_WORK(&server->reclaim_dwork, reclaim_worker);

View File

@@ -30,9 +30,6 @@
#include "client.h"
#include "counters.h"
#include "scoutfs_trace.h"
#include "triggers.h"
#include "sysfs.h"
#include "msg.h"
/*
* This srch subsystem gives us a way to find inodes that have a given
@@ -71,14 +68,10 @@ struct srch_info {
atomic_t shutdown;
struct workqueue_struct *workq;
struct delayed_work compact_dwork;
struct scoutfs_sysfs_attrs ssa;
atomic_t compact_delay_ms;
};
#define DECLARE_SRCH_INFO(sb, name) \
struct srch_info *name = SCOUTFS_SB(sb)->srch_info
#define DECLARE_SRCH_INFO_KOBJ(kobj, name) \
DECLARE_SRCH_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)
#define SRE_FMT "%016llx.%llu.%llu"
#define SRE_ARG(sre) \
@@ -527,95 +520,6 @@ out:
return ret;
}
/*
* Padded entries are encoded in pairs after an existing entry. All of
the pairs cancel each other out for all readers (the second encoding
looks like a deletion) so they aren't visible to the first/last bounds of
* the block or file.
*/
static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk,
struct scoutfs_srch_block *srb, struct scoutfs_srch_entry *sre)
{
int ret;
ret = encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes),
sre, &srb->tail);
if (ret > 0) {
srb->tail = *sre;
le32_add_cpu(&srb->entry_nr, 1);
le32_add_cpu(&srb->entry_bytes, ret);
le64_add_cpu(&sfl->entries, 1);
ret = 0;
}
return ret;
}
/*
* This is called by a testing trigger to create a very specific case of
* encoded entry offsets. We want the last entry in the block to start
* precisely at the _SAFE_BYTES offset.
*
* This is called when there is a single existing entry in the block.
* We have the entire block to work with. We encode pairs of matching
* entries. This hides them from readers (both searches and merging) as
* they're interpreted as creation and deletion and are deleted. We use
* the existing hash value of the first entry in the block but then set
* the inode to an impossibly large number so it doesn't interfere with
* anything.
*
* To hit the specific offset we very carefully manage the amount of
* bytes of change between fields in the entry. We know that if we
change all the bytes of the ino and id we end up with a 20 byte
* (2+8+8,2) encoding of the pair of entries. To have the last entry
start at the _SAFE_BYTES offset we know that the final 20 byte pair
encoding needs to end at 2 bytes (second entry encoding) after the
_SAFE_BYTES offset.
So as we encode pairs we watch the delta of our current offset from
that desired final offset of 2 past _SAFE_BYTES. If we're a multiple
* that desired final offset of 2 past _SAFE_POS. If we're a multiple
* of 20 away then we encode the full 20 byte pairs. If we're not, then
* we drop a byte to encode 19 bytes. That'll slowly change the offset
* to be a multiple of 20 again while encoding large entries.
*/
static void pad_entries_at_safe(struct scoutfs_srch_file *sfl, u64 blk,
struct scoutfs_srch_block *srb)
{
struct scoutfs_srch_entry sre;
u32 target;
s32 diff;
u64 hash;
u64 ino;
u64 id;
int ret;
hash = le64_to_cpu(srb->tail.hash);
ino = le64_to_cpu(srb->tail.ino) | (1ULL << 62);
id = le64_to_cpu(srb->tail.id);
target = SCOUTFS_SRCH_BLOCK_SAFE_BYTES + 2;
while ((diff = target - le32_to_cpu(srb->entry_bytes)) > 0) {
ino ^= 1ULL << (7 * 8);
if (diff % 20 == 0) {
id ^= 1ULL << (7 * 8);
} else {
id ^= 1ULL << (6 * 8);
}
sre.hash = cpu_to_le64(hash);
sre.ino = cpu_to_le64(ino);
sre.id = cpu_to_le64(id);
ret = append_padded_entry(sfl, blk, srb, &sre);
if (ret == 0)
ret = append_padded_entry(sfl, blk, srb, &sre);
BUG_ON(ret != 0);
diff = target - le32_to_cpu(srb->entry_bytes);
}
}
/*
* The caller is dropping an ino/id because the tracking rbtree is full.
* This loses information so we can't return any entries at or after the
@@ -1083,9 +987,6 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
struct scoutfs_key key;
int ret;
if (sfl->ref.blkno && !force && scoutfs_trigger(sb, SRCH_FORCE_LOG_ROTATE))
force = true;
if (sfl->ref.blkno == 0 ||
(!force && le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT))
return 0;
@@ -1561,7 +1462,7 @@ static int kway_merge(struct super_block *sb,
struct scoutfs_block_writer *wri,
struct scoutfs_srch_file *sfl,
kway_get_t kway_get, kway_advance_t kway_adv,
void **args, int nr, bool logs_input)
void **args, int nr)
{
DECLARE_SRCH_INFO(sb, srinf);
struct scoutfs_srch_block *srb = NULL;
@@ -1666,15 +1567,6 @@ static int kway_merge(struct super_block *sb,
blk++;
}
/* end sorted block on _SAFE offset for testing */
if (bl && le32_to_cpu(srb->entry_nr) == 1 && logs_input &&
scoutfs_trigger(sb, SRCH_COMPACT_LOGS_PAD_SAFE)) {
pad_entries_at_safe(sfl, blk, srb);
scoutfs_block_put(sb, bl);
bl = NULL;
blk++;
}
scoutfs_inc_counter(sb, srch_compact_entry);
} else {
@@ -1717,8 +1609,6 @@ static int kway_merge(struct super_block *sb,
empty++;
ret = 0;
} else if (ret < 0) {
if (ret == -ENOANO) /* just testing trigger */
ret = 0;
goto out;
}
@@ -1926,7 +1816,7 @@ static int compact_logs(struct super_block *sb,
}
ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_page, kway_adv_page,
args, nr_pages, true);
args, nr_pages);
if (ret < 0)
goto out;
@@ -1984,18 +1874,12 @@ static int kway_get_reader(struct super_block *sb,
srb = rdr->bl->data;
if (rdr->pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip >= SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip >= le32_to_cpu(srb->entry_bytes)) {
/* XXX inconsistency */
return -EIO;
}
if (rdr->decoded_bytes == 0 && rdr->pos == SCOUTFS_SRCH_BLOCK_SAFE_BYTES &&
scoutfs_trigger(sb, SRCH_MERGE_STOP_SAFE)) {
/* only used in testing */
return -ENOANO;
}
/* decode entry, possibly skipping start of the block */
while (rdr->decoded_bytes == 0 || rdr->pos < rdr->skip) {
ret = decode_entry(srb->entries + rdr->pos,
@@ -2085,7 +1969,7 @@ static int compact_sorted(struct super_block *sb,
}
ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_reader,
kway_adv_reader, args, nr, false);
kway_adv_reader, args, nr);
sc->flags |= SCOUTFS_SRCH_COMPACT_FLAG_DONE;
for (i = 0; i < nr; i++) {
@@ -2214,15 +2098,8 @@ static int delete_files(struct super_block *sb, struct scoutfs_alloc *alloc,
return ret;
}
static void queue_compact_work(struct srch_info *srinf, bool immediate)
{
unsigned long delay;
if (!atomic_read(&srinf->shutdown)) {
delay = immediate ? 0 : msecs_to_jiffies(atomic_read(&srinf->compact_delay_ms));
queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
}
}
/* wait 10s between compact attempts on error, immediate after success */
#define SRCH_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
/*
* Get a compaction operation from the server, sort the entries from the
@@ -2250,6 +2127,7 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
struct super_block *sb = srinf->sb;
struct scoutfs_block_writer wri;
struct scoutfs_alloc alloc;
unsigned long delay;
int ret;
int err;
@@ -2262,8 +2140,6 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
scoutfs_block_writer_init(sb, &wri);
ret = scoutfs_client_srch_get_compact(sb, sc);
if (ret >= 0)
trace_scoutfs_srch_compact_client_recv(sb, sc);
if (ret < 0 || sc->nr == 0)
goto out;
@@ -2292,7 +2168,6 @@ commit:
sc->meta_freed = alloc.freed;
sc->flags |= ret < 0 ? SCOUTFS_SRCH_COMPACT_FLAG_ERROR : 0;
trace_scoutfs_srch_compact_client_send(sb, sc);
err = scoutfs_client_srch_commit_compact(sb, sc);
if (err < 0 && ret == 0)
ret = err;
@@ -2303,56 +2178,14 @@ out:
scoutfs_inc_counter(sb, srch_compact_error);
scoutfs_block_writer_forget_all(sb, &wri);
queue_compact_work(srinf, sc->nr > 0 && ret == 0);
if (!atomic_read(&srinf->shutdown)) {
delay = ret == 0 ? 0 : msecs_to_jiffies(SRCH_COMPACT_DELAY_MS);
queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
}
kfree(sc);
}
static ssize_t compact_delay_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
DECLARE_SRCH_INFO_KOBJ(kobj, srinf);
return snprintf(buf, PAGE_SIZE, "%u", atomic_read(&srinf->compact_delay_ms));
}
#define MIN_COMPACT_DELAY_MS MSEC_PER_SEC
#define DEF_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
#define MAX_COMPACT_DELAY_MS (60 * MSEC_PER_SEC)
static ssize_t compact_delay_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
DECLARE_SRCH_INFO(sb, srinf);
char nullterm[30]; /* more than enough for octal -U64_MAX */
u64 val;
int len;
int ret;
len = min(count, sizeof(nullterm) - 1);
memcpy(nullterm, buf, len);
nullterm[len] = '\0';
ret = kstrtoll(nullterm, 0, &val);
if (ret < 0 || val < MIN_COMPACT_DELAY_MS || val > MAX_COMPACT_DELAY_MS) {
scoutfs_err(sb, "invalid compact_delay_ms value, must be between %lu and %lu",
MIN_COMPACT_DELAY_MS, MAX_COMPACT_DELAY_MS);
return -EINVAL;
}
atomic_set(&srinf->compact_delay_ms, val);
cancel_delayed_work(&srinf->compact_dwork);
queue_compact_work(srinf, false);
return count;
}
SCOUTFS_ATTR_RW(compact_delay_ms);
static struct attribute *srch_attrs[] = {
SCOUTFS_ATTR_PTR(compact_delay_ms),
NULL,
};
void scoutfs_srch_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -2369,8 +2202,6 @@ void scoutfs_srch_destroy(struct super_block *sb)
destroy_workqueue(srinf->workq);
}
scoutfs_sysfs_destroy_attrs(sb, &srinf->ssa);
kfree(srinf);
sbi->srch_info = NULL;
}
@@ -2388,15 +2219,8 @@ int scoutfs_srch_setup(struct super_block *sb)
srinf->sb = sb;
atomic_set(&srinf->shutdown, 0);
INIT_DELAYED_WORK(&srinf->compact_dwork, scoutfs_srch_compact_worker);
scoutfs_sysfs_init_attrs(sb, &srinf->ssa);
atomic_set(&srinf->compact_delay_ms, DEF_COMPACT_DELAY_MS);
sbi->srch_info = srinf;
ret = scoutfs_sysfs_create_attrs(sb, &srinf->ssa, srch_attrs, "srch");
if (ret < 0)
goto out;
srinf->workq = alloc_workqueue("scoutfs_srch_compact",
WQ_NON_REENTRANT | WQ_UNBOUND |
WQ_HIGHPRI, 0);
@@ -2405,7 +2229,8 @@ int scoutfs_srch_setup(struct super_block *sb)
goto out;
}
queue_compact_work(srinf, false);
queue_delayed_work(srinf->workq, &srinf->compact_dwork,
msecs_to_jiffies(SRCH_COMPACT_DELAY_MS));
ret = 0;
out:

View File

@@ -39,9 +39,6 @@ struct scoutfs_triggers {
static char *names[] = {
[SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE] = "block_remove_stale",
[SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE] = "srch_compact_logs_pad_safe",
[SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE] = "srch_force_log_rotate",
[SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe",
[SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge",
};

View File

@@ -3,9 +3,6 @@
enum scoutfs_trigger {
SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE,
SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE,
SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE,
SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE,
SCOUTFS_TRIGGER_STATFS_LOCK_PURGE,
SCOUTFS_TRIGGER_NR,
};

View File

@@ -23,9 +23,9 @@ static inline void down_write_two(struct rw_semaphore *a,
~0UL values. Hence, we cap count to LONG_MAX, which is arbitrarily high
* enough to avoid it.
*/
static inline long shrinker_min_long(long count)
static inline unsigned long shrinker_min_t_long(unsigned long count)
{
return min(count, LONG_MAX);
return min_t(u64, count, LONG_MAX);
}
#endif
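A quick userspace demonstration of why the helper compares as u64; this min_t() is a stand-in for the kernel macro, defined here only so the example compiles on its own:

#include <stdio.h>
#include <limits.h>
#include <stdint.h>

/* stand-in for the kernel's min_t(): cast both sides to the named
 * type before comparing */
#define min_t(type, a, b) \
	((type)(a) < (type)(b) ? (type)(a) : (type)(b))

int main(void)
{
	unsigned long count = ~0UL;	/* the sentinel value the comment warns about */

	/* as a signed long this is -1, so a plain signed min() would
	 * return it unchanged; comparing as u64 clamps it to LONG_MAX */
	printf("%llu\n", (unsigned long long)min_t(uint64_t, count, LONG_MAX));
	return 0;
}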

View File

@@ -25,9 +25,8 @@ All options can be seen by running with -h.
This script is built to test multi-node systems on one host by using
different mounts of the same devices. The script creates a fake block
device in front of each fs block device for each mount that will be
tested. It will create predictable device mapper devices and mounts
them on /mnt/test.N. These static device names and mount paths limit
the script to a single execution per host.
tested. Currently it finds free loop devices and mounts them on
/mnt/test.[0-9].
All tests will be run by default. Particular tests can be included or
excluded by providing test name regular expressions with the -I and -E
@@ -105,8 +104,8 @@ used during the test.
| Variable | Description | Origin | Example |
| ---------------- | ------------------- | --------------- | ----------------- |
| T\_MB[0-9] | per-mount meta bdev | created per run | /dev/mapper/\_scoutfs\_test\_meta\_[0-9] |
| T\_DB[0-9] | per-mount data bdev | created per run | /dev/mapper/\_scoutfs\_test\_data\_[0-9] |
| T\_MB[0-9] | per-mount meta bdev | created per run | /dev/loop0 |
| T\_DB[0-9] | per-mount data bdev | created per run | /dev/loop1 |
| T\_D[0-9] | per-mount test dir | made for test | /mnt/test.[0-9]/t |
| T\_META\_DEVICE | main FS meta bdev | -M | /dev/vda |
| T\_DATA\_DEVICE | main FS data bdev | -D | /dev/vdb |

View File

@@ -35,7 +35,7 @@ t_fail()
t_quiet()
{
echo "# $*" >> "$T_TMPDIR/quiet.log"
"$@" >> "$T_TMPDIR/quiet.log" 2>&1 || \
"$@" > "$T_TMPDIR/quiet.log" 2>&1 || \
t_fail "quiet command failed"
}

View File

@@ -6,61 +6,6 @@ t_filter_fs()
-e 's@Device: [a-fA-F0-9]*h/[0-9]*d@Device: 0h/0d@g'
}
#
# We can hit a spurious kasan warning that was fixed upstream:
#
# e504e74cc3a2 x86/unwind/orc: Disable KASAN checking in the ORC unwinder, part 2
#
# KASAN can get mad when the unwinder doesn't find ORC metadata and
# wanders up without using frames and hits the KASAN stack red zones.
# We can ignore these messages.
#
# They're bracketed by:
# [ 2687.690127] ==================================================================
# [ 2687.691366] BUG: KASAN: stack-out-of-bounds in get_reg+0x1bc/0x230
# ...
# [ 2687.706220] ==================================================================
# [ 2687.707284] Disabling lock debugging due to kernel taint
#
# That final lock debugging message may not be included.
#
ignore_harmless_unwind_kasan_stack_oob()
{
awk '
BEGIN {
in_soob = 0
soob_nr = 0
}
( !in_soob && $0 ~ /==================================================================/ ) {
in_soob = 1
soob_nr = NR
saved = $0
}
( in_soob == 1 && NR == (soob_nr + 1) ) {
if (match($0, /KASAN: stack-out-of-bounds in get_reg/) != 0) {
in_soob = 2
} else {
in_soob = 0
print saved
}
saved=""
}
( in_soob == 2 && $0 ~ /==================================================================/ ) {
in_soob = 3
soob_nr = NR
}
( in_soob == 3 && NR > soob_nr && $0 !~ /Disabling lock debugging/ ) {
in_soob = 0
}
( !in_soob ) { print $0 }
END {
if (saved) {
print saved
}
}
'
}
#
# Filter out expected messages. Putting messages here implies that
# tests aren't relying on messages to discover failures... they're
@@ -141,12 +86,7 @@ t_filter_dmesg()
re="$re|scoutfs .* critical transaction commit failure.*"
# change-devices causes loop device resizing
re="$re|loop: module loaded"
re="$re|loop[0-9].* detected capacity change from.*"
# ignore systemd-journal rotating
re="$re|systemd-journald.*"
egrep -v "($re)" | \
ignore_harmless_unwind_kasan_stack_oob
egrep -v "($re)"
}

View File

@@ -265,15 +265,6 @@ t_trigger_get() {
cat "$(t_trigger_path "$nr")/$which"
}
t_trigger_set() {
local which="$1"
local nr="$2"
local val="$3"
local path=$(t_trigger_path "$nr")
echo "$val" > "$path/$which"
}
t_trigger_show() {
local which="$1"
local string="$2"
@@ -285,8 +276,9 @@ t_trigger_show() {
t_trigger_arm_silent() {
local which="$1"
local nr="$2"
local path=$(t_trigger_path "$nr")
t_trigger_set "$which" "$nr" 1
echo 1 > "$path/$which"
}
t_trigger_arm() {

View File

@@ -1,4 +1,3 @@
== measure initial createmany
== measure initial createmany
== measure two concurrent createmany runs
== cleanup

View File

@@ -1,4 +1,3 @@
== setting longer hung task timeout
== creating fragmented extents
== unlink file with moved extents to free extents per block
== cleanup

View File

@@ -1,37 +0,0 @@
== initialize per-mount values
== arm compaction triggers
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
== compact more often
== create padded sorted inputs by forcing log rotation
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
== compaction of padded should stop at safe
== verify no compaction errors
== cleanup

View File

@@ -326,10 +326,16 @@ unmount_all() {
cmd wait $p
done
# delete all temp devices
for dev in /dev/mapper/_scoutfs_test_*; do
if [ -b "$dev" ]; then
cmd dmsetup remove $dev
# delete all temp meta devices
for dev in $(losetup --associated "$T_META_DEVICE" | cut -d : -f 1); do
if [ -e "$dev" ]; then
cmd losetup -d "$dev"
fi
done
# delete all temp data devices
for dev in $(losetup --associated "$T_DATA_DEVICE" | cut -d : -f 1); do
if [ -e "$dev" ]; then
cmd losetup -d "$dev"
fi
done
}
@@ -428,12 +434,6 @@ $T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 &
fenced_pid=$!
fenced_log "started fenced pid $fenced_pid in the background"
# setup dm tables
echo "0 $(blockdev --getsz $T_META_DEVICE) linear $T_META_DEVICE 0" > \
$T_RESULTS/dmtable.meta
echo "0 $(blockdev --getsz $T_DATA_DEVICE) linear $T_DATA_DEVICE 0" > \
$T_RESULTS/dmtable.data
#
# mount concurrently so that a quorum is present to elect the leader and
# start a server.
@@ -442,13 +442,10 @@ msg "mounting $T_NR_MOUNTS mounts on meta $T_META_DEVICE data $T_DATA_DEVICE"
pids=""
for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
name="_scoutfs_test_meta_$i"
cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.meta)"
meta_dev="/dev/mapper/$name"
name="_scoutfs_test_data_$i"
cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.data)"
data_dev="/dev/mapper/$name"
meta_dev=$(losetup --find --show $T_META_DEVICE)
test -b "$meta_dev" || die "failed to create temp device $meta_dev"
data_dev=$(losetup --find --show $T_DATA_DEVICE)
test -b "$data_dev" || die "failed to create temp device $data_dev"
dir="/mnt/test.$i"
test -d "$dir" || cmd mkdir -p "$dir"

View File

@@ -14,7 +14,6 @@ offline-extent-waiting.sh
move-blocks.sh
large-fragmented-free.sh
enospc.sh
srch-safe-merge-pos.sh
srch-basic-functionality.sh
simple-xattr-unit.sh
totl-xattr-tag.sh

View File

@@ -1,7 +1,6 @@
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
#include <sys/stat.h>
@@ -36,10 +35,10 @@ struct opts {
unsigned int dry_run:1,
ls_output:1,
quiet:1,
xattr_set:1,
xattr_file:1,
xattr_group:1;
char *xattr_name;
user_xattr:1,
same_srch_xattr:1,
group_srch_xattr:1,
unique_srch_xattr:1;
};
struct stats {
@@ -150,31 +149,12 @@ static void free_dir(struct dir *dir)
free(dir);
}
static size_t snprintf_off(void *buf, size_t sz, size_t off, char *fmt, ...)
{
va_list ap;
int ret;
if (off >= sz)
return sz;
va_start(ap, fmt);
ret = vsnprintf(buf + off, sz - off, fmt, ap);
va_end(ap);
if (ret <= 0)
return sz;
return off + ret;
}
static void create_dir(struct dir *dir, struct opts *opts,
struct stats *stats)
{
struct str_list *s;
char name[256]; /* max len and null term */
char name[100];
char val = 'v';
size_t off;
int rc;
int i;
@@ -195,21 +175,29 @@ static void create_dir(struct dir *dir, struct opts *opts,
rc = mknod(s->str, S_IFREG | 0644, 0);
error_exit(rc, "mknod %s failed"ERRF, s->str, ERRA);
if (opts->xattr_set) {
off = snprintf_off(name, sizeof(name), 0, "%s", opts->xattr_name);
if (opts->xattr_file)
off = snprintf_off(name, sizeof(name), off,
"-f-%lu", stats->files);
if (opts->xattr_group)
off = snprintf_off(name, sizeof(name), off,
"-g-%lu", stats->files / 10000);
error_exit(off >= sizeof(name), "xattr name longer than 255 bytes");
rc = 0;
if (rc == 0 && opts->user_xattr) {
strcpy(name, "user.scoutfs_bcp");
rc = setxattr(s->str, name, &val, 1, 0);
}
if (rc == 0 && opts->same_srch_xattr) {
strcpy(name, "scoutfs.srch.scoutfs_bcp");
rc = setxattr(s->str, name, &val, 1, 0);
}
if (rc == 0 && opts->group_srch_xattr) {
snprintf(name, sizeof(name),
"scoutfs.srch.scoutfs_bcp.group.%lu",
stats->files / 10000);
rc = setxattr(s->str, name, &val, 1, 0);
}
if (rc == 0 && opts->unique_srch_xattr) {
snprintf(name, sizeof(name),
"scoutfs.srch.scoutfs_bcp.unique.%lu",
stats->files);
rc = setxattr(s->str, name, &val, 1, 0);
error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
}
error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
stats->files++;
rate_banner(opts, stats);
@@ -377,10 +365,11 @@ static void usage(void)
" -d DIR | create all files in DIR top level directory\n"
" -n | dry run, only parse, don't create any files\n"
" -q | quiet, don't regularly print rates\n"
" -F | append \"-f-NR\" file nr to xattr name, requires -X\n"
" -G | append \"-g-NR\" file nr/10000 to xattr name, requires -X\n"
" -L | parse ls output; only reg, skip meta, paths at ./\n"
" -X NAM | set named xattr in all files\n");
" -X | set the same user. xattr name in all files\n"
" -S | set the same .srch. xattr name in all files\n"
" -G | set a .srch. xattr name shared by groups of files\n"
" -U | set a unique .srch. xattr name in all files\n");
}
int main(int argc, char **argv)
@@ -397,7 +386,7 @@ int main(int argc, char **argv)
memset(&opts, 0, sizeof(opts));
while ((c = getopt(argc, argv, "d:nqFGLX:")) != -1) {
while ((c = getopt(argc, argv, "d:nqLXSGU")) != -1) {
switch(c) {
case 'd':
top_dir = strdup(optarg);
@@ -408,19 +397,20 @@ int main(int argc, char **argv)
case 'q':
opts.quiet = 1;
break;
case 'F':
opts.xattr_file = 1;
break;
case 'G':
opts.xattr_group = 1;
break;
case 'L':
opts.ls_output = 1;
break;
case 'X':
opts.xattr_set = 1;
opts.xattr_name = strdup(optarg);
error_exit(!opts.xattr_name, "error allocating xattr name");
opts.user_xattr = 1;
break;
case 'S':
opts.same_srch_xattr = 1;
break;
case 'G':
opts.group_srch_xattr = 1;
break;
case 'U':
opts.unique_srch_xattr = 1;
break;
case '?':
printf("Unknown option '%c'\n", optopt);
@@ -429,11 +419,6 @@ int main(int argc, char **argv)
}
}
error_exit(opts.xattr_file && !opts.xattr_set,
"must specify xattr -X when appending file nr with -F");
error_exit(opts.xattr_group && !opts.xattr_set,
"must specify xattr -X when appending file nr with -G");
if (!opts.dry_run) {
error_exit(!top_dir,
"must specify top level directory with -d");

View File

@@ -11,13 +11,8 @@ FILE="$T_D0/file"
# final block as we truncated past it.
#
echo "== truncate writes zeroed partial end of file block"
yes | dd of="$FILE" bs=8K count=1 status=none iflag=fullblock
yes | dd of="$FILE" bs=8K count=1 status=none
sync
# not passing iflag=fullblock occasionally causes the file to be just
# 4K, so to be safe we should at least check the size once
test `stat --printf="%s\n" "$FILE"` -eq 8192 || t_fail "test file incorrect start size"
truncate -s 6K "$FILE"
truncate -s 12K "$FILE"
echo 3 > /proc/sys/vm/drop_caches

View File

@@ -7,11 +7,9 @@ t_require_mounts 2
COUNT=50000
#
# Prep dirs for test. We have per-directory inode number allocators so
# by putting each createmany in a per-mount dir they get their own inode
# number region and cluster locks.
#
# Prep dirs for test. Each mount needs to make its own parent dir for
# the createmany run, otherwise both dirs will end up in the same inode
# group, causing updates to bounce that lock around.
echo "== measure initial createmany"
mkdir -p $T_D0/dir/0
mkdir $T_D1/dir/1
@@ -19,20 +17,18 @@ mkdir $T_D1/dir/1
echo "== measure initial createmany"
START=$SECONDS
createmany -o "$T_D0/file_" $COUNT >> $T_TMP.full
sync
SINGLE=$((SECONDS - START))
echo single $SINGLE >> $T_TMP.full
echo "== measure two concurrent createmany runs"
START=$SECONDS
(cd $T_D0/dir/0; createmany -o ./file_ $COUNT > /dev/null) &
createmany -o $T_D0/dir/0/file $COUNT > /dev/null &
pids="$!"
(cd $T_D1/dir/1; createmany -o ./file_ $COUNT > /dev/null) &
createmany -o $T_D1/dir/1/file $COUNT > /dev/null &
pids="$pids $!"
for p in $pids; do
wait $p
done
sync
BOTH=$((SECONDS - START))
echo both $BOTH >> $T_TMP.full
@@ -45,10 +41,7 @@ echo both $BOTH >> $T_TMP.full
# synchronized operation.
FACTOR=200
if [ "$BOTH" -gt $(($SINGLE*$FACTOR)) ]; then
t_fail "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
echo "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
fi
echo "== cleanup"
find $T_D0/dir -delete
t_pass

View File

@@ -7,11 +7,14 @@ t_require_mounts 2
#
# Make sure that all mounts can read the results of a write from each
# mount.
# mount. And make sure that the greatest of all the written seqs is
# visible after the writes were committed by remote reads.
#
check_read_write()
{
local expected
local greatest=0
local seq
local path
local saw
local w
@@ -22,6 +25,11 @@ check_read_write()
eval path="\$T_D${w}/written"
echo "$expected" > "$path"
seq=$(scoutfs stat -s meta_seq $path)
if [ "$seq" -gt "$greatest" ]; then
greatest=$seq
fi
for r in $(t_fs_nrs); do
eval path="\$T_D${r}/written"
saw=$(cat "$path")
@@ -30,6 +38,11 @@ check_read_write()
fi
done
done
seq=$(scoutfs statfs -s committed_seq -p $T_D0)
if [ "$seq" -lt "$greatest" ]; then
echo "committed_seq $seq less than greatest $greatest"
fi
}
# verify that fenced ran our testing fence script

View File

@@ -10,30 +10,6 @@ EXTENTS_PER_BTREE_BLOCK=600
EXTENTS_PER_LIST_BLOCK=8192
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
#
# This test specifically creates a pathologically sparse file that will
# be as expensive as possible to free. This is usually fine on
# dedicated or reasonable hardware, but trying to run this in
# virtualized debug kernels can take a very long time. This test is
# about making sure that the server doesn't fail, not that the platform
# can handle the scale of work that our btree formats happen to require
# while execution is bogged down with use-after-free memory reference
# tracking. So we give the test a lot more breathing room before
# deciding that it's hung.
#
echo "== setting longer hung task timeout"
if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
test "$secs" -gt 0 || \
t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
restore_hung_task_timeout()
{
echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
}
trap restore_hung_task_timeout EXIT
echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
fi
echo "== creating fragmented extents"
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"

View File

@@ -2,8 +2,6 @@
# Some basic tests of online resizing of metadata and data devices.
#
t_require_commands bc
statfs_total() {
local single="total_$1_blocks"
local mnt="$2"

View File

@@ -9,7 +9,6 @@ LOG=340000
LIM=1000000
SEQF="%.20g"
SXA="scoutfs.srch.test-srch-basic-functionality"
t_require_commands touch rm setfattr scoutfs find_xattrs
@@ -28,20 +27,20 @@ diff_srch_find()
echo "== create new xattrs"
touch "$T_D0/"{create,update}
setfattr -n $SXA -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
diff_srch_find $SXA
setfattr -n scoutfs.srch.test -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
diff_srch_find scoutfs.srch.test
echo "== update existing xattr"
setfattr -n $SXA -v 2 "$T_D0/update" 2>&1 | t_filter_fs
diff_srch_find $SXA
setfattr -n scoutfs.srch.test -v 2 "$T_D0/update" 2>&1 | t_filter_fs
diff_srch_find scoutfs.srch.test
echo "== remove an xattr"
setfattr -x $SXA "$T_D0/create" 2>&1 | t_filter_fs
diff_srch_find $SXA
setfattr -x scoutfs.srch.test "$T_D0/create" 2>&1 | t_filter_fs
diff_srch_find scoutfs.srch.test
echo "== remove xattr with files"
rm -f "$T_D0/"{create,update}
diff_srch_find $SXA
diff_srch_find scoutfs.srch.test
echo "== trigger small log merges by rotating single block with unmount"
sv=$(t_server_nr)
@@ -57,7 +56,7 @@ while [ "$i" -lt "8" ]; do
eval path="\$T_D${nr}/single-block-$i"
touch "$path"
setfattr -n $SXA -v $i "$path"
setfattr -n scoutfs.srch.single-block-logs -v $i "$path"
t_umount $nr
t_mount $nr
@@ -66,51 +65,51 @@ while [ "$i" -lt "8" ]; do
done
# wait for srch compaction worker delay
sleep 10
find "$T_D0" -type f -name 'single-block-*' -delete
rm -rf "$T_D0"/single-block-*
echo "== create entries in current log"
DIR="$T_D0/dir"
NR=$((LOG / 4))
mkdir -p "$DIR"
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
diff_srch_find $SXA
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== delete small fraction"
seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x $SXA
diff_srch_find $SXA
seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== remove files"
rm -rf "$DIR"
diff_srch_find $SXA
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== create entries that exceed one log"
NR=$((LOG * 3 / 2))
mkdir -p "$DIR"
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
diff_srch_find $SXA
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== delete fractions in phases"
for i in $(seq 1 3); do
seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x $SXA
diff_srch_find $SXA
seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
diff_srch_find scoutfs.srch.scoutfs_bcp
done
echo "== remove files"
rm -rf "$DIR"
diff_srch_find $SXA
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== create entries to exceed search entry limit"
NR=$((LIM * 3 / 2))
mkdir -p "$DIR"
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
diff_srch_find $SXA
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== delete half"
seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x $SXA
diff_srch_find $SXA
seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
diff_srch_find scoutfs.srch.scoutfs_bcp
echo "== entirely remove third batch"
rm -rf "$DIR"
diff_srch_find $SXA
diff_srch_find scoutfs.srch.scoutfs_bcp
t_pass

View File

@@ -1,90 +0,0 @@
#
# There was a bug where srch file compaction could get stuck if a
# partial compaction finished at the specific _SAFE_BYTES offset in a
# block. Resuming from that position would return an error and
# compaction would stop making forward progress.
#
# We use triggers to pad the output of log compaction to end on the safe
# offset and then cause compaction of those padded inputs to stop at the
# safe offset. Continuation will either succeed or return errors.
#
# forcing rotation, so just a few
NR=10
SEQF="%.20g"
COMPACT_NR=4
echo "== initialize per-mount values"
declare -a err
declare -a compact_delay
for nr in $(t_fs_nrs); do
err[$nr]=$(t_counter srch_compact_error $nr)
compact_delay[$nr]=$(cat $(t_sysfs_path $nr)/srch/compact_delay_ms)
done
restore_compact_delay()
{
for nr in $(t_fs_nrs); do
echo ${compact_delay[$nr]} > $(t_sysfs_path $nr)/srch/compact_delay_ms
done
}
trap restore_compact_delay EXIT
echo "== arm compaction triggers"
for nr in $(t_fs_nrs); do
t_trigger_arm srch_compact_logs_pad_safe $nr
t_trigger_arm srch_merge_stop_safe $nr
done
echo "== compact more often"
for nr in $(t_fs_nrs); do
echo 1000 > $(t_sysfs_path $nr)/srch/compact_delay_ms
done
echo "== create padded sorted inputs by forcing log rotation"
sv=$(t_server_nr)
for i in $(seq 1 $COMPACT_NR); do
for j in $(seq 1 $COMPACT_NR); do
t_trigger_arm srch_force_log_rotate $sv
seq -f "f-$i-$j-$SEQF" 1 10 | \
bulk_create_paths -X "scoutfs.srch.t-srch-safe-merge-pos" -d "$T_D0" > \
/dev/null
sync
test "$(t_trigger_get srch_force_log_rotate $sv)" == "0" || \
t_fail "srch_force_log_rotate didn't trigger"
done
padded=0
while test $padded == 0 && sleep .5; do
for nr in $(t_fs_nrs); do
if [ "$(t_trigger_get srch_compact_logs_pad_safe $nr)" == "0" ]; then
t_trigger_arm srch_compact_logs_pad_safe $nr
padded=1
break
fi
test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
t_fail "srch_compact_error counter increased on mount $nr"
done
done
done
echo "== compaction of padded should stop at safe"
sleep 2
for nr in $(t_fs_nrs); do
if [ "$(t_trigger_get srch_merge_stop_safe $nr)" == "0" ]; then
break
fi
done
echo "== verify no compaction errors"
sleep 2
for nr in $(t_fs_nrs); do
test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
t_fail "srch_compact_error counter increased on mount $nr"
done
echo "== cleanup"
find "$T_D0" -type f -name 'f-*' -delete
t_pass

View File

@@ -55,19 +55,6 @@ with initial sparse regions (perhaps by multiple threads writing to
different regions) and wasted space isn't an issue (perhaps because the
file population contains few small files).
.TP
.B log_merge_wait_timeout_ms=<number>
This option sets the amount of time, in milliseconds, that log merge
creation can wait before timing out. This setting is per-mount, only
changes the behavior of that mount, and only affects the server when it
is running in that mount.
.sp
This determines how long it may take for mounts to synchronize
committing their log trees to create a log merge operation. Setting it
too high can create long latencies in the event that a mount takes a
long time to commit its log. Setting it too low can result in the
creation of excessive numbers of log trees that are never merged. The
default is 500 and it cannot be less than 100 nor greater than 60000.
.TP
.B metadev_path=<device>
The metadev_path option specifies the path to the block device that
contains the filesystem's metadata.