Add force to prepare-empty-data-device

Signed-off-by: Zach Brown <zab@versity.com>
2026-06-09 21:22:36 +00:00 · 2023-11-02 18:05:51 -07:00
50 changed files with 397 additions and 5449 deletions
@@ -1,32 +1,6 @@
 Versity ScoutFS Release Notes
 =============================

---
-v1.19
-\
-*Jan 30, 2024*
-
-Added the log\_merge\_wait\_timeout\_ms mount option to set the timeout
-for creating log merge operations.  The previous timeout, now the
-default, was too short for some systems and was resulting in consistent
-timeouts which created an excessive number of log trees waiting to be
-merged.
-
-Improved performance of many in-mount server operations when there are a
-large number of log trees waiting to be merged.
-
---
-v1.18
-\
-*Nov 7, 2023*
-
-Fixed a bug where background srch file compaction could stop making
-forward progress if a partial compaction operation was committed at a
-specific byte offset in a block.  This would cause srch file searches to
-be progressively more expensive over time.  Once this fix is running
-background compaction will resume, bringing the cost of searches back
-down.
-
 ---
 v1.17
 \
@@ -12,22 +12,17 @@ else
 SP = @:
 endif

-SCOUTFS_GIT_DESCRIBE ?= \
+SCOUTFS_GIT_DESCRIBE := \
 	$(shell git describe --all --abbrev=6 --long 2>/dev/null || \
 		echo no-git)

-ESCAPED_GIT_DESCRIBE := \
-	$(shell echo $(SCOUTFS_GIT_DESCRIBE) |sed -e 's/\//\\\//g')
-
-RPM_GITHASH ?= $(shell git rev-parse --short HEAD)
-
 SCOUTFS_ARGS := SCOUTFS_GIT_DESCRIBE=$(SCOUTFS_GIT_DESCRIBE) \
-		RPM_GITHASH=$(RPM_GITHASH) \
 		CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(CURDIR)/src \
 		EXTRA_CFLAGS="-Werror"

 # - We use the git describe from tags to set up the RPM versioning
 RPM_VERSION := $(shell git describe --long --tags | awk -F '-' '{gsub(/^v/,""); print $$1}')
+RPM_GITHASH := $(shell git rev-parse --short HEAD)
 TARFILE = scoutfs-kmod-$(RPM_VERSION).tar


@@ -46,8 +41,7 @@ modules_install:

 %.spec: %.spec.in .FORCE
 	sed -e 's/@@VERSION@@/$(RPM_VERSION)/g' \
-	    -e 's/@@GITHASH@@/$(RPM_GITHASH)/g' \
-	    -e 's/@@GITDESCRIBE@@/$(ESCAPED_GIT_DESCRIBE)/g' < $< > $@+
+	    -e 's/@@GITHASH@@/$(RPM_GITHASH)/g' < $< > $@+
 	mv $@+ $@


@@ -1,7 +1,6 @@
 %define kmod_name scoutfs
 %define kmod_version @@VERSION@@
 %define kmod_git_hash @@GITHASH@@
-%define kmod_git_describe @@GITDESCRIBE@@
 %define pkg_date %(date +%%Y%%m%%d)

 # Disable the building of the debug package(s).
@@ -76,7 +75,7 @@ echo "Building for kernel: %{kernel_version} flavors: '%{flavors_to_build}'"
 for flavor in %flavors_to_build; do
    rm -rf obj/$flavor
    cp -r source obj/$flavor
-    make RPM_GITHASH=%{kmod_git_hash} SCOUTFS_GIT_DESCRIBE=%{kmod_git_describe} SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
+    make SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
 done

 %install
@@ -98,21 +97,10 @@ find %{buildroot} -type f -name \*.ko -exec %{__chmod} u+x \{\} \;
 /lib/modules

 %post
-echo /lib/modules/%{kversion}/%{install_mod_dir}/scoutfs.ko | weak-modules --add-modules --no-initramfs
+weak-modules --add-kernel --no-initramfs
 depmod -a
 %endif

 %clean
 rm -rf %{buildroot}

-%preun
-# stash our modules for postun cleanup
-SCOUTFS_RPM_NAME=$(rpm -q %{name} | grep "%{version}-%{release}")
-rpm -ql $SCOUTFS_RPM_NAME | grep '\.ko$' > /var/run/%{name}-modules-%{version}-%{release} || true
-
-%postun
-if [ -x /sbin/weak-modules ]; then
-    cat /var/run/%{name}-modules-%{version}-%{release} | /sbin/weak-modules --remove-modules --no-initramfs
-fi
-
-rm /var/run/%{name}-modules-%{version}-%{release} || true
@@ -2029,253 +2029,187 @@ int scoutfs_btree_rebalance(struct super_block *sb,
 			  key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL);
 }

-struct merged_range {
-	struct scoutfs_key start;
-	struct scoutfs_key end;
-	struct rb_root root;
-	int size;
-};
-
-struct merged_item {
+struct merge_pos {
 	struct rb_node node;
-	struct scoutfs_key key;
+	struct scoutfs_btree_root *root;
+	struct scoutfs_block *bl;
+	struct scoutfs_btree_block *bt;
+	struct scoutfs_avl_node *avl;
+	struct scoutfs_key *key;
 	u64 seq;
 	u8 flags;
 	unsigned int val_len;
-	u8 val[0];
+	u8 *val;
 };

-static inline struct merged_item *mitem_container(struct rb_node *node)
+static struct merge_pos *first_mpos(struct rb_root *root)
 {
-	return node ? container_of(node, struct merged_item, node) : NULL;
-}
-
-static inline struct merged_item *first_mitem(struct rb_root *root)
-{
-	return mitem_container(rb_first(root));
-}
-
-static inline struct merged_item *last_mitem(struct rb_root *root)
-{
-	return mitem_container(rb_last(root));
-}
-
-static inline struct merged_item *next_mitem(struct merged_item *mitem)
-{
-	return mitem_container(mitem ? rb_next(&mitem->node) : NULL);
-}
-
-static inline struct merged_item *prev_mitem(struct merged_item *mitem)
-{
-	return mitem_container(mitem ? rb_prev(&mitem->node) : NULL);
-}
-
-static struct merged_item *find_mitem(struct rb_root *root, struct scoutfs_key *key,
-				      struct rb_node **parent_ret, struct rb_node ***link_ret)
-{
-	struct rb_node **node = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct merged_item *mitem;
-	int cmp;
-
-	while (*node) {
-		parent = *node;
-		mitem = container_of(*node, struct merged_item, node);
-
-		cmp = scoutfs_key_compare(key, &mitem->key);
-
-		if (cmp < 0) {
-			node = &(*node)->rb_left;
-		} else if (cmp > 0) {
-			node = &(*node)->rb_right;
-		} else {
-			*parent_ret = NULL;
-			*link_ret = NULL;
-			return mitem;
-		}
-	}
-
-	*parent_ret = parent;
-	*link_ret = node;
+	struct rb_node *node = rb_first(root);
+	if (node)
+		 return container_of(node, struct merge_pos, node);
 	return NULL;
 }

-static void insert_mitem(struct merged_range *rng, struct merged_item *mitem,
-			 struct rb_node *parent, struct rb_node **link)
+static struct merge_pos *next_mpos(struct merge_pos *mpos)
 {
-	rb_link_node(&mitem->node, parent, link);
-	rb_insert_color(&mitem->node, &rng->root);
-	rng->size += item_len_bytes(mitem->val_len);
+	struct rb_node *node;
+
+	if (mpos && (node = rb_next(&mpos->node)))
+		return container_of(node, struct merge_pos, node);
+	else
+		return NULL;
 }

-static void replace_mitem(struct merged_range *rng, struct merged_item *victim,
-				struct merged_item *new)
+static void free_mpos(struct super_block *sb, struct merge_pos *mpos)
 {
-	rb_replace_node(&victim->node, &new->node, &rng->root);
-	RB_CLEAR_NODE(&victim->node);
-	rng->size -= item_len_bytes(victim->val_len);
-	rng->size += item_len_bytes(new->val_len);
+	scoutfs_block_put(sb, mpos->bl);
+	kfree(mpos);
 }

-static void free_mitem(struct merged_range *rng, struct merged_item *mitem)
+static void insert_mpos(struct rb_root *pos_root, struct merge_pos *ins)
 {
-	if (IS_ERR_OR_NULL(mitem))
-		return;
+	struct rb_node **node = &pos_root->rb_node;
+	struct rb_node *parent = NULL;
+	struct merge_pos *mpos;
+	int cmp;

-	if (!RB_EMPTY_NODE(&mitem->node)) {
-		rng->size -= item_len_bytes(mitem->val_len);
-		rb_erase(&mitem->node, &rng->root);
+	parent = NULL;
+	while (*node) {
+		parent = *node;
+		mpos = container_of(*node, struct merge_pos, node);
+
+		/* sort merge items by key then newest to oldest */
+		cmp = scoutfs_key_compare(ins->key, mpos->key) ?:
+		      -scoutfs_cmp(ins->seq, mpos->seq);
+
+		if (cmp < 0)
+			node = &(*node)->rb_left;
+		else
+			node = &(*node)->rb_right;
 	}

-	kfree(mitem);
-}
-
-static void trim_range_size(struct merged_range *rng, int merge_window)
-{
-	struct merged_item *mitem;
-	struct merged_item *tmp;
-
-	mitem = last_mitem(&rng->root);
-	while (mitem && rng->size > merge_window) {
-
-		rng->end = mitem->key;
-		scoutfs_key_dec(&rng->end);
-
-		tmp = mitem;
-		mitem = prev_mitem(mitem);
-		free_mitem(rng, tmp);
-	}
-}
-
-static void trim_range_end(struct merged_range *rng)
-{
-	struct merged_item *mitem;
-	struct merged_item *tmp;
-
-	mitem = last_mitem(&rng->root);
-	while (mitem && scoutfs_key_compare(&mitem->key, &rng->end) > 0) {
-		tmp = mitem;
-		mitem = prev_mitem(mitem);
-		free_mitem(rng, tmp);
-	}
+	rb_link_node(&ins->node, parent, node);
+	rb_insert_color(&ins->node, pos_root);
 }

 /*
- * Record and combine logged items from log roots for merging with the
- * writable destination root.  The caller is responsible for trimming
- * the range if it gets too large or if the key range shrinks.
+ * Find the next item in the merge_pos root in the caller's range and
+ * insert it into the rbtree sorted by key and version so that merging
+ * can find the next newest item at the front of the rbtree.  We free
+ * the mpos on error or if there are no more items in the range.
 */
-static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags,
-			   void *val, int val_len, void *arg)
+static int reset_mpos(struct super_block *sb, struct rb_root *pos_root, struct merge_pos *mpos,
+		      struct scoutfs_key *start, struct scoutfs_key *end)
 {
-	struct merged_range *rng = arg;
-	struct merged_item *mitem;
-	struct merged_item *found;
-	struct rb_node *parent;
-	struct rb_node **link;
-	int ret;
+	struct scoutfs_btree_item *item;
+	struct scoutfs_avl_node *next;
+	struct btree_walk_key_range kr;
+	struct scoutfs_key walk_key;
+	int ret = 0;

-	found = find_mitem(&rng->root, key, &parent, &link);
-	if (found) {
-		ret = scoutfs_forest_combine_deltas(key, found->val, found->val_len, val, val_len);
-		if (ret < 0)
-			goto out;
-		if (ret > 0) {
-			if (ret == SCOUTFS_DELTA_COMBINED) {
-				scoutfs_inc_counter(sb, btree_merge_delta_combined);
-			} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
-				scoutfs_inc_counter(sb, btree_merge_delta_null);
-				free_mitem(rng, found);
-			}
-			ret = 0;
-			goto out;
-		}
-
-		if (found->seq >= seq) {
-			ret = 0;
-			goto out;
-		}
+	/* always erase before freeing or inserting */
+	if (!RB_EMPTY_NODE(&mpos->node)) {
+		rb_erase(&mpos->node, pos_root);
+		RB_CLEAR_NODE(&mpos->node);
 	}

-	mitem = kmalloc(offsetof(struct merged_item, val[val_len]), GFP_NOFS);
-	if (!mitem) {
-		ret = -ENOMEM;
+	/*
+	 * advance to next item via the avl tree.   The caller's pos is
+	 * only ever incremented past the last key so we can use next to
+	 * iterate rather than using search to skip past multiple items.
+	 */
+	if (mpos->avl)
+		mpos->avl = scoutfs_avl_next(&mpos->bt->item_root, mpos->avl);
+
+	/* find the next leaf with the key if we run out of items */
+	walk_key = *start;
+	while (!mpos->avl && !scoutfs_key_is_zeros(&walk_key)) {
+		scoutfs_block_put(sb, mpos->bl);
+		mpos->bl = NULL;
+		ret = btree_walk(sb, NULL, NULL, mpos->root, BTW_NEXT, &walk_key,
+				 0, &mpos->bl, &kr, NULL);
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				ret = 0;
+			free_mpos(sb, mpos);
+			goto out;
+		}
+		mpos->bt = mpos->bl->data;
+
+		mpos->avl = scoutfs_avl_search(&mpos->bt->item_root, cmp_key_item,
+					       start, NULL, NULL, &next, NULL) ?: next;
+		if (mpos->avl == NULL)
+			walk_key = kr.iter_next;
+	}
+
+	/* see if we're out of items within the range */
+	item = node_item(mpos->avl);
+	if (!item || scoutfs_key_compare(item_key(item), end) > 0) {
+		free_mpos(sb, mpos);
+		ret = 0;
 		goto out;
 	}

-	mitem->key = *key;
-	mitem->seq = seq;
-	mitem->flags = flags;
-	mitem->val_len = val_len;
-	if (val_len)
-		memcpy(mitem->val, val, val_len);
-
-	if (found) {
-		replace_mitem(rng, found, mitem);
-		free_mitem(rng, found);
-	} else {
-		insert_mitem(rng, mitem, parent, link);
-	}
+	/* insert the next item within range at its version */
+	mpos->key = item_key(item);
+	mpos->seq = le64_to_cpu(item->seq);
+	mpos->flags = item->flags;
+	mpos->val_len = item_val_len(item);
+	mpos->val = item_val(mpos->bt, item);

+	insert_mpos(pos_root, mpos);
 	ret = 0;
 out:
 	return ret;
 }

 /*
- * Read a range of merged items.  The caller has set the key bounds of
- * the range.  We read a merge window's worth of items from blocks in
- * each input btree.
+ * The caller has reset all the merge positions for all the input log
+ * btree roots and wants the next logged item it should try and merge
+ * with the items in the fs_root.
 *
- * The caller can only use the smallest range that overlaps with all the
- * blocks that we read.  We start reading from the range's start key so
- * it will always be present and we don't need to adjust it.  The final
- * block we read from each input might not cover the range's end so it
- * needs to be adjusted.
- *
- * The end range can also shrink if we have to drop items because the
- * items exceeded the merge window size.
+ * We look ahead in the logged item stream to see if we should merge any
+ * older logged delta items into one result for the caller.  We also
+ * take this opportunity to skip and reset the mpos for any older
+ * versions of the first item.
 */
-static int read_merged_range(struct super_block *sb, struct merged_range *rng,
-			     struct list_head *inputs, int merge_window)
+static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
+			      struct scoutfs_key *end, struct merge_pos **mpos_ret)
 {
-	struct scoutfs_btree_root_head *rhead;
-	struct scoutfs_key start;
-	struct scoutfs_key end;
+	struct merge_pos *mpos;
+	struct merge_pos *next;
 	struct scoutfs_key key;
 	int ret = 0;
-	int i;

-	list_for_each_entry(rhead, inputs, head) {
-		key = rng->start;
+	while ((mpos = first_mpos(pos_root)) && (next = next_mpos(mpos)) &&
+	       !scoutfs_key_compare(mpos->key, next->key)) {

-		for (i = 0; i < merge_window; i += SCOUTFS_BLOCK_LG_SIZE) {
-			start = key;
-			end = rng->end;
-			ret = scoutfs_btree_read_items(sb, &rhead->root, &key, &start, &end,
-						       merge_read_item, rng);
+		ret = scoutfs_forest_combine_deltas(mpos->key, mpos->val, mpos->val_len,
+						    next->val, next->val_len);
+		if (ret < 0)
+			break;
+
+		/* reset advances to the next item */
+		key = *mpos->key;
+		scoutfs_key_inc(&key);
+
+		/* always skip next combined or older version */
+		ret = reset_mpos(sb, pos_root, next, &key, end);
+		if (ret < 0)
+			break;
+
+		if (ret == SCOUTFS_DELTA_COMBINED) {
+			scoutfs_inc_counter(sb, btree_merge_delta_combined);
+		} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
+			scoutfs_inc_counter(sb, btree_merge_delta_null);
+			/* if merging resulted in no info, skip current */
+			ret = reset_mpos(sb, pos_root, mpos, &key, end);
 			if (ret < 0)
-				goto out;
-
-			if (scoutfs_key_compare(&end, &rng->end) >= 0)
 				break;
-
-			key = end;
-			scoutfs_key_inc(&key);
 		}
-
-		if (scoutfs_key_compare(&end, &rng->end) < 0) {
-			rng->end = end;
-			trim_range_end(rng);
-		}
-
-		if (rng->size > merge_window)
-			trim_range_size(rng, merge_window);
 	}

-	trace_scoutfs_btree_merge_read_range(sb, &rng->start, &rng->end, rng->size);
-	ret = 0;
-out:
+	*mpos_ret = mpos;
 	return ret;
 }

@@ -2292,13 +2226,6 @@ out:
 * to allocators running low or needing to join/split the parent.
 * *next_ret is set to the next key which hasn't been merged so that the
 * caller can retry with a new allocator and subtree.
- *
- * The number of input roots can be immense.  The merge_window specifies
- * the size of the set of merged items that we'll maintain as we iterate
- * over all the input roots.  Once we've merged items into the window
- * from all the input roots the merged input items are then merged to
- * the writable destination root.  It may take multiple passes of
- * windows of merged items to cover the input key range.
 */
 int scoutfs_btree_merge(struct super_block *sb,
 			struct scoutfs_alloc *alloc,
@@ -2308,16 +2235,18 @@ int scoutfs_btree_merge(struct super_block *sb,
 			struct scoutfs_key *next_ret,
 			struct scoutfs_btree_root *root,
 			struct list_head *inputs,
-			bool subtree, int dirty_limit, int alloc_low, int merge_window)
+			bool subtree, int dirty_limit, int alloc_low)
 {
+	struct scoutfs_btree_root_head *rhead;
+	struct rb_root pos_root = RB_ROOT;
 	struct scoutfs_btree_item *item;
 	struct scoutfs_btree_block *bt;
 	struct scoutfs_block *bl = NULL;
 	struct btree_walk_key_range kr;
 	struct scoutfs_avl_node *par;
-	struct merged_item *mitem;
-	struct merged_item *tmp;
-	struct merged_range rng;
+	struct scoutfs_key next;
+	struct merge_pos *mpos;
+	struct merge_pos *tmp;
 	int walk_val_len;
 	int walk_flags;
 	bool is_del;
@@ -2328,59 +2257,49 @@ int scoutfs_btree_merge(struct super_block *sb,
 	trace_scoutfs_btree_merge(sb, root, start, end);
 	scoutfs_inc_counter(sb, btree_merge);

+	list_for_each_entry(rhead, inputs, head) {
+		mpos = kzalloc(sizeof(*mpos), GFP_NOFS);
+		if (!mpos) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		RB_CLEAR_NODE(&mpos->node);
+		mpos->root = &rhead->root;
+
+		ret = reset_mpos(sb, &pos_root, mpos, start, end);
+		if (ret < 0)
+			goto out;
+	}
+
 	walk_flags = BTW_DIRTY;
 	if (subtree)
 		walk_flags |= BTW_SUBTREE;
 	walk_val_len = 0;

-	rng.start = *start;
-	rng.end = *end;
-	rng.root = RB_ROOT;
-	rng.size = 0;
-
-	ret = read_merged_range(sb, &rng, inputs, merge_window);
-	if (ret < 0)
-		goto out;
-
-	for (;;) {
-		/* read next window as it empties (and it is possible to read an empty range) */
-		mitem = first_mitem(&rng.root);
-		if (!mitem) {
-			/* done if the read range hit the end */
-			if (scoutfs_key_compare(&rng.end, end) >= 0)
-				break;
-
-			/* read next batch of merged items */
-			rng.start = rng.end;
-			scoutfs_key_inc(&rng.start);
-			rng.end = *end;
-			ret = read_merged_range(sb, &rng, inputs, merge_window);
-			if (ret < 0)
-				break;
-			continue;
-		}
+	while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {

 		if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) {
 			scoutfs_inc_counter(sb, btree_merge_dirty_limit);
 			ret = -ERANGE;
-			*next_ret = mitem->key;
+			*next_ret = *mpos->key;
 			goto out;
 		}

 		if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) {
 			scoutfs_inc_counter(sb, btree_merge_alloc_low);
 			ret = -ERANGE;
-			*next_ret = mitem->key;
+			*next_ret = *mpos->key;
 			goto out;
 		}

 		scoutfs_block_put(sb, bl);
 		bl = NULL;
 		ret = btree_walk(sb, alloc, wri, root, walk_flags,
-			         &mitem->key, walk_val_len, &bl, &kr, NULL);
+			         mpos->key, walk_val_len, &bl, &kr, NULL);
 		if (ret < 0) {
 			if (ret == -ERANGE)
-				*next_ret = mitem->key;
+				*next_ret = *mpos->key;
 			goto out;
 		}
 		bt = bl->data;
@@ -2392,21 +2311,22 @@ int scoutfs_btree_merge(struct super_block *sb,
 			continue;
 		}

-		while (mitem) {
+		while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
+
 			/* walk to new leaf if we exceed parent ref key */
-			if (scoutfs_key_compare(&mitem->key, &kr.end) > 0)
+			if (scoutfs_key_compare(mpos->key, &kr.end) > 0)
 				break;

 			/* see if there's an existing item */
-			item = leaf_item_hash_search(sb, bt, &mitem->key);
-			is_del = !!(mitem->flags & SCOUTFS_ITEM_FLAG_DELETION);
+			item = leaf_item_hash_search(sb, bt, mpos->key);
+			is_del = !!(mpos->flags & SCOUTFS_ITEM_FLAG_DELETION);

 			/* see if we're merging delta items */
 			if (item && !is_del)
-				delta = scoutfs_forest_combine_deltas(&mitem->key,
+				delta = scoutfs_forest_combine_deltas(mpos->key,
 								      item_val(bt, item),
 								      item_val_len(item),
-								      mitem->val, mitem->val_len);
+								      mpos->val, mpos->val_len);
 			else
 				delta = 0;
 			if (delta < 0) {
@@ -2418,38 +2338,40 @@ int scoutfs_btree_merge(struct super_block *sb,
 				scoutfs_inc_counter(sb, btree_merge_delta_null);
 			}

-			trace_scoutfs_btree_merge_items(sb, &mitem->key, mitem->val_len,
+			trace_scoutfs_btree_merge_items(sb, mpos->root,
+					mpos->key, mpos->val_len,
 					item ? root : NULL,
 					item ? item_key(item) : NULL,
 					item ? item_val_len(item) : 0, is_del);

 			/* rewalk and split if ins/update needs room */
-			if (!is_del && !delta && !mid_free_item_room(bt, mitem->val_len)) {
+			if (!is_del && !delta && !mid_free_item_room(bt, mpos->val_len)) {
 				walk_flags |= BTW_INSERT;
-				walk_val_len = mitem->val_len;
+				walk_val_len = mpos->val_len;
 				break;
 			}

 			/* insert missing non-deletion merge items */
 			if (!item && !is_del) {
-				scoutfs_avl_search(&bt->item_root, cmp_key_item, &mitem->key,
+				scoutfs_avl_search(&bt->item_root,
+						   cmp_key_item, mpos->key,
 						   &cmp, &par, NULL, NULL);
-				create_item(bt, &mitem->key, mitem->seq, mitem->flags,
-					    mitem->val, mitem->val_len, par, cmp);
+				create_item(bt, mpos->key, mpos->seq, mpos->flags,
+					    mpos->val, mpos->val_len, par, cmp);
 				scoutfs_inc_counter(sb, btree_merge_insert);
 			}

 			/* update existing items */
 			if (item && !is_del && !delta) {
-				item->seq = cpu_to_le64(mitem->seq);
-				item->flags = mitem->flags;
-				update_item_value(bt, item, mitem->val, mitem->val_len);
+				item->seq = cpu_to_le64(mpos->seq);
+				item->flags = mpos->flags;
+				update_item_value(bt, item, mpos->val, mpos->val_len);
 				scoutfs_inc_counter(sb, btree_merge_update);
 			}

 			/* update combined delta item seq */
 			if (delta == SCOUTFS_DELTA_COMBINED) {
-				item->seq = cpu_to_le64(mitem->seq);
+				item->seq = cpu_to_le64(mpos->seq);
 			}

 			/*
@@ -2481,18 +2403,21 @@ int scoutfs_btree_merge(struct super_block *sb,
 			walk_flags &= ~(BTW_INSERT | BTW_DELETE);
 			walk_val_len = 0;

-			/* finished with this merged item */
-			tmp = mitem;
-			mitem = next_mitem(mitem);
-			free_mitem(&rng, tmp);
+			/* finished with this key, skip any older items */
+			next = *mpos->key;
+			scoutfs_key_inc(&next);
+			ret = reset_mpos(sb, &pos_root, mpos, &next, end);
+			if (ret < 0)
+				goto out;
 		}
 	}

 	ret = 0;
 out:
 	scoutfs_block_put(sb, bl);
-	rbtree_postorder_for_each_entry_safe(mitem, tmp, &rng.root, node)
-		free_mitem(&rng, mitem);
+	rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) {
+		free_mpos(sb, mpos);
+	}

 	return ret;
 }
@@ -119,7 +119,7 @@ int scoutfs_btree_merge(struct super_block *sb,
 			struct scoutfs_key *next_ret,
 			struct scoutfs_btree_root *root,
 			struct list_head *input_list,
-			bool subtree, int dirty_limit, int alloc_low, int merge_window);
+			bool subtree, int dirty_limit, int alloc_low);

 int scoutfs_btree_free_blocks(struct super_block *sb,
 			      struct scoutfs_alloc *alloc,
@@ -145,7 +145,6 @@
 	EXPAND_COUNTER(lock_shrink_work)			\
 	EXPAND_COUNTER(lock_unlock)				\
 	EXPAND_COUNTER(lock_wait)				\
-	EXPAND_COUNTER(log_merge_wait_timeout)			\
 	EXPAND_COUNTER(net_dropped_response)			\
 	EXPAND_COUNTER(net_send_bytes)				\
 	EXPAND_COUNTER(net_send_error)				\
@@ -721,8 +721,7 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work)
 	ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end,
 				  &next, &comp.root, &inputs,
 				  !!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)),
-				  SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10,
-				  (2 * 1024 * 1024));
+				  SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10);
 	if (ret == -ERANGE) {
 		comp.remain = next;
 		le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN);
@@ -33,7 +33,6 @@ enum {
 	Opt_acl,
 	Opt_data_prealloc_blocks,
 	Opt_data_prealloc_contig_only,
-	Opt_log_merge_wait_timeout_ms,
 	Opt_metadev_path,
 	Opt_noacl,
 	Opt_orphan_scan_delay_ms,
@@ -46,7 +45,6 @@ static const match_table_t tokens = {
 	{Opt_acl, "acl"},
 	{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
 	{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
-	{Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"},
 	{Opt_metadev_path, "metadev_path=%s"},
 	{Opt_noacl, "noacl"},
 	{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
@@ -115,10 +113,6 @@ static void free_options(struct scoutfs_mount_options *opts)
 	kfree(opts->metadev_path);
 }

-#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS		100UL
-#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS	500
-#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS		(60 * MSEC_PER_SEC)
-
 #define MIN_ORPHAN_SCAN_DELAY_MS	100UL
 #define DEFAULT_ORPHAN_SCAN_DELAY_MS	(10 * MSEC_PER_SEC)
 #define MAX_ORPHAN_SCAN_DELAY_MS	(60 * MSEC_PER_SEC)
@@ -132,27 +126,11 @@ static void init_default_options(struct scoutfs_mount_options *opts)

 	opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
 	opts->data_prealloc_contig_only = 1;
-	opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS;
 	opts->orphan_scan_delay_ms = -1;
 	opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
 	opts->quorum_slot_nr = -1;
 }

-static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
-{
-	if (ret < 0) {
-		scoutfs_err(sb, "failed to parse log_merge_wait_timeout_ms value");
-		return -EINVAL;
-	}
-	if (val < MIN_LOG_MERGE_WAIT_TIMEOUT_MS || val > MAX_LOG_MERGE_WAIT_TIMEOUT_MS) {
-		scoutfs_err(sb, "invalid log_merge_wait_timeout_ms value %d, must be between %lu and %lu",
-			    val, MIN_LOG_MERGE_WAIT_TIMEOUT_MS, MAX_LOG_MERGE_WAIT_TIMEOUT_MS);
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
 static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
 {
 	if (ret < 0) {
@@ -218,14 +196,6 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
 			opts->data_prealloc_contig_only = nr;
 			break;

-		case Opt_log_merge_wait_timeout_ms:
-			ret = match_int(args, &nr);
-			ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
-			if (ret < 0)
-				return ret;
-			opts->log_merge_wait_timeout_ms = nr;
-			break;
-
 		case Opt_metadev_path:
 			ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
 			if (ret < 0)
@@ -452,43 +422,6 @@ static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj
 }
 SCOUTFS_ATTR_RW(data_prealloc_contig_only);

-static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
-						char *buf)
-{
-	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
-	struct scoutfs_mount_options opts;
-
-	scoutfs_options_read(sb, &opts);
-
-	return snprintf(buf, PAGE_SIZE, "%u", opts.log_merge_wait_timeout_ms);
-}
-static ssize_t log_merge_wait_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
-						 const char *buf, size_t count)
-{
-	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
-	DECLARE_OPTIONS_INFO(sb, optinf);
-	char nullterm[30]; /* more than enough for octal -U64_MAX */
-	int val;
-	int len;
-	int ret;
-
-	len = min(count, sizeof(nullterm) - 1);
-	memcpy(nullterm, buf, len);
-	nullterm[len] = '\0';
-
-	ret = kstrtoint(nullterm, 0, &val);
-	ret = verify_log_merge_wait_timeout_ms(sb, ret, val);
-	if (ret == 0) {
-		write_seqlock(&optinf->seqlock);
-		optinf->opts.log_merge_wait_timeout_ms = val;
-		write_sequnlock(&optinf->seqlock);
-		ret = count;
-	}
-
-	return ret;
-}
-SCOUTFS_ATTR_RW(log_merge_wait_timeout_ms);
-
 static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
 {
 	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
@@ -592,7 +525,6 @@ SCOUTFS_ATTR_RO(quorum_slot_nr);
 static struct attribute *options_attrs[] = {
 	SCOUTFS_ATTR_PTR(data_prealloc_blocks),
 	SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
-	SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms),
 	SCOUTFS_ATTR_PTR(metadev_path),
 	SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
 	SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),
@@ -8,7 +8,6 @@
 struct scoutfs_mount_options {
 	u64 data_prealloc_blocks;
 	bool data_prealloc_contig_only;
-	unsigned int log_merge_wait_timeout_ms;
 	char *metadev_path;
 	unsigned int orphan_scan_delay_ms;
 	int quorum_slot_nr;
@@ -439,7 +439,6 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
 		SCSB_TRACE_ASSIGN(sb);
 		__entry->journal_info = (unsigned long)journal_info;
 		__entry->holders = holders;
-		__entry->ret = ret;
 	),

 	TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d",
@@ -1747,41 +1746,21 @@ TRACE_EVENT(scoutfs_btree_merge,
 		  sk_trace_args(end))
 );

-TRACE_EVENT(scoutfs_btree_merge_read_range,
-	TP_PROTO(struct super_block *sb, struct scoutfs_key *start, struct scoutfs_key *end,
-		 int size),
-
-	TP_ARGS(sb, start, end, size),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		sk_trace_define(start)
-		sk_trace_define(end)
-		__field(int, size)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		sk_trace_assign(start, start);
-		sk_trace_assign(end, end);
-		__entry->size = size;
-	),
-
-	TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" size %d",
-		  SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end), __entry->size)
-);
-
 TRACE_EVENT(scoutfs_btree_merge_items,
 	TP_PROTO(struct super_block *sb,
+		 struct scoutfs_btree_root *m_root,
 		 struct scoutfs_key *m_key, int m_val_len,
 		 struct scoutfs_btree_root *f_root,
 		 struct scoutfs_key *f_key, int f_val_len,
 		 int is_del),

-	TP_ARGS(sb, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
+	TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, f_val_len, is_del),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
+		__field(__u64, m_root_blkno)
+		__field(__u64, m_root_seq)
+		__field(__u8, m_root_height)
 		sk_trace_define(m_key)
 		__field(int, m_val_len)
 		__field(__u64, f_root_blkno)
@@ -1794,6 +1773,10 @@ TRACE_EVENT(scoutfs_btree_merge_items,

 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
+		__entry->m_root_blkno = m_root ?
+					le64_to_cpu(m_root->ref.blkno) : 0;
+		__entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0;
+		__entry->m_root_height = m_root ? m_root->height : 0;
 		sk_trace_assign(m_key, m_key);
 		__entry->m_val_len = m_val_len;
 		__entry->f_root_blkno = f_root ?
@@ -1805,9 +1788,11 @@ TRACE_EVENT(scoutfs_btree_merge_items,
 		__entry->is_del = !!is_del;
 	),

-	TP_printk(SCSBF" merge item key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
-		  SCSB_TRACE_ARGS, sk_trace_args(m_key), __entry->m_val_len,
-		  __entry->f_root_blkno, __entry->f_root_seq, __entry->f_root_height,
+	TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
+		  SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq,
+		  __entry->m_root_height, sk_trace_args(m_key),
+		  __entry->m_val_len, __entry->f_root_blkno,
+		  __entry->f_root_seq, __entry->f_root_height,
 		  sk_trace_args(f_key), __entry->f_val_len, __entry->is_del)
 );

@@ -2090,71 +2075,6 @@ TRACE_EVENT(scoutfs_trans_seq_last,
 		  SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
 );

-TRACE_EVENT(scoutfs_server_finalize_items,
-	TP_PROTO(struct super_block *sb, u64 rid, u64 item_rid, u64 item_nr, u64 item_flags,
-		 u64 item_get_trans_seq),
-
-	TP_ARGS(sb, rid, item_rid, item_nr, item_flags, item_get_trans_seq),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, c_rid)
-		__field(__u64, item_rid)
-		__field(__u64, item_nr)
-		__field(__u64, item_flags)
-		__field(__u64, item_get_trans_seq)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->c_rid = rid;
-		__entry->item_rid = item_rid;
-		__entry->item_nr = item_nr;
-		__entry->item_flags = item_flags;
-		__entry->item_get_trans_seq = item_get_trans_seq;
-	),
-
-	TP_printk(SCSBF" rid %016llx item_rid %016llx item_nr %llu item_flags 0x%llx item_get_trans_seq %llu",
-		  SCSB_TRACE_ARGS, __entry->c_rid, __entry->item_rid, __entry->item_nr,
-		  __entry->item_flags, __entry->item_get_trans_seq)
-);
-
-TRACE_EVENT(scoutfs_server_finalize_decision,
-	TP_PROTO(struct super_block *sb, u64 rid, bool saw_finalized, bool others_active,
-		 bool ours_visible, bool finalize_ours, unsigned int delay_ms,
-		 u64 finalize_sent_seq),
-
-	TP_ARGS(sb, rid, saw_finalized, others_active, ours_visible, finalize_ours, delay_ms,
-		finalize_sent_seq),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, c_rid)
-		__field(bool, saw_finalized)
-		__field(bool, others_active)
-		__field(bool, ours_visible)
-		__field(bool, finalize_ours)
-		__field(unsigned int, delay_ms)
-		__field(__u64, finalize_sent_seq)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->c_rid = rid;
-		__entry->saw_finalized = saw_finalized;
-		__entry->others_active = others_active;
-		__entry->ours_visible = ours_visible;
-		__entry->finalize_ours = finalize_ours;
-		__entry->delay_ms = delay_ms;
-		__entry->finalize_sent_seq = finalize_sent_seq;
-	),
-
-	TP_printk(SCSBF" rid %016llx saw_finalized %u others_active %u ours_visible %u finalize_ours %u delay_ms %u finalize_sent_seq %llu",
-		  SCSB_TRACE_ARGS, __entry->c_rid, __entry->saw_finalized, __entry->others_active,
-		  __entry->ours_visible, __entry->finalize_ours, __entry->delay_ms,
-		  __entry->finalize_sent_seq)
-);
-
 TRACE_EVENT(scoutfs_get_log_merge_status,
 	TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key,
 		 u64 nr_requests, u64 nr_complete, u64 seq),
@@ -2879,81 +2799,6 @@ TRACE_EVENT(scoutfs_omap_should_delete,
 		  SCSB_TRACE_ARGS, __entry->ino, __entry->nlink, __entry->ret)
 );

-#define SSCF_FMT "[bo %llu bs %llu es %llu]"
-#define SSCF_FIELDS(pref)					\
-	__field(__u64, pref##_blkno)				\
-	__field(__u64, pref##_blocks)				\
-	__field(__u64, pref##_entries)
-#define SSCF_ASSIGN(pref, sfl)					\
-	__entry->pref##_blkno = le64_to_cpu((sfl)->ref.blkno);	\
-	__entry->pref##_blocks = le64_to_cpu((sfl)->blocks);	\
-	__entry->pref##_entries = le64_to_cpu((sfl)->entries);
-#define SSCF_ENTRY_ARGS(pref)					\
-	__entry->pref##_blkno,					\
-	__entry->pref##_blocks,					\
-	__entry->pref##_entries
-
-DECLARE_EVENT_CLASS(scoutfs_srch_compact_class,
-	TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
-
-	TP_ARGS(sb, sc),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, id)
-		__field(__u8, nr)
-		__field(__u8, flags)
-		SSCF_FIELDS(out)
-		__field(__u64, in0_blk)
-		__field(__u64, in0_pos)
-		SSCF_FIELDS(in0)
-		__field(__u64, in1_blk)
-		__field(__u64, in1_pos)
-		SSCF_FIELDS(in1)
-		__field(__u64, in2_blk)
-		__field(__u64, in2_pos)
-		SSCF_FIELDS(in2)
-		__field(__u64, in3_blk)
-		__field(__u64, in3_pos)
-		SSCF_FIELDS(in3)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->id = le64_to_cpu(sc->id);
-		__entry->nr = sc->nr;
-		__entry->flags = sc->flags;
-		SSCF_ASSIGN(out, &sc->out)
-		__entry->in0_blk = le64_to_cpu(sc->in[0].blk);
-		__entry->in0_pos = le64_to_cpu(sc->in[0].pos);
-		SSCF_ASSIGN(in0, &sc->in[0].sfl)
-		__entry->in1_blk = le64_to_cpu(sc->in[0].blk);
-		__entry->in1_pos = le64_to_cpu(sc->in[0].pos);
-		SSCF_ASSIGN(in1, &sc->in[1].sfl)
-		__entry->in2_blk = le64_to_cpu(sc->in[0].blk);
-		__entry->in2_pos = le64_to_cpu(sc->in[0].pos);
-		SSCF_ASSIGN(in2, &sc->in[2].sfl)
-		__entry->in3_blk = le64_to_cpu(sc->in[0].blk);
-		__entry->in3_pos = le64_to_cpu(sc->in[0].pos);
-		SSCF_ASSIGN(in3, &sc->in[3].sfl)
-	),
-
-	TP_printk(SCSBF" id %llu nr %u flags 0x%x out "SSCF_FMT" in0 b %llu p %llu "SSCF_FMT" in1 b %llu p %llu "SSCF_FMT" in2 b %llu p %llu "SSCF_FMT" in3 b %llu p %llu "SSCF_FMT,
-		  SCSB_TRACE_ARGS, __entry->id, __entry->nr, __entry->flags, SSCF_ENTRY_ARGS(out),
-		  __entry->in0_blk, __entry->in0_pos, SSCF_ENTRY_ARGS(in0),
-		  __entry->in1_blk, __entry->in1_pos, SSCF_ENTRY_ARGS(in1),
-		  __entry->in2_blk, __entry->in2_pos, SSCF_ENTRY_ARGS(in2),
-		  __entry->in3_blk, __entry->in3_pos, SSCF_ENTRY_ARGS(in3))
-);
-DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_send,
-	TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
-	TP_ARGS(sb, sc)
-);
-DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv,
-	TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
-	TP_ARGS(sb, sc)
-);
-
 #endif /* _TRACE_SCOUTFS_H */

 /* This part must be outside protection */
@@ -148,8 +148,6 @@ struct server_info {
 	struct scoutfs_quorum_config qconf;
 	/* a running server maintains a private dirty super */
 	struct scoutfs_super_block dirty_super;
-
-	u64 finalize_sent_seq;
 };

 #define DECLARE_SERVER_INFO(sb, name) \
@@ -415,27 +413,6 @@ static void server_hold_commit(struct super_block *sb, struct commit_hold *hold)
 	wait_event(cusers->waitq, hold_commit(sb, server, cusers, hold));
 }

-/*
- * Return the higher of the avail or freed used by the active commit
- * since this holder joined the commit.  This is *not* the amount used
- * by the holder, we don't track per-holder alloc use.
- */
-static u32 server_hold_alloc_used_since(struct super_block *sb, struct commit_hold *hold)
-{
-	DECLARE_SERVER_INFO(sb, server);
-	u32 avail_used;
-	u32 freed_used;
-	u32 avail_now;
-	u32 freed_now;
-
-	scoutfs_alloc_meta_remaining(&server->alloc, &avail_now, &freed_now);
-
-	avail_used = hold->avail - avail_now;
-	freed_used = hold->freed - freed_now;
-
-	return max(avail_used, freed_used);
-}
-
 /*
 * This is called while holding the commit and returns once the commit
 * is successfully written.  Many holders can all wait for all holders
@@ -961,24 +938,22 @@ static int find_log_trees_item(struct super_block *sb,
 }

 /*
- * Find the log_trees item with the greatest nr for each rid.  Fills the
- * caller's log_trees and sets the key before the returned log_trees for
- * the next iteration.  Returns 0 when done, > 0 for each item, and
- * -errno on fatal errors.
+ * Find the next log_trees item from the key.  Fills the caller's log_trees and sets
+ * the key past the returned log_trees for iteration.  Returns 0 when done, > 0 for each
+ * item, and -errno on fatal errors.
 */
-static int for_each_rid_last_lt(struct super_block *sb, struct scoutfs_btree_root *root,
-				struct scoutfs_key *key, struct scoutfs_log_trees *lt)
+static int for_each_lt(struct super_block *sb, struct scoutfs_btree_root *root,
+		       struct scoutfs_key *key, struct scoutfs_log_trees *lt)
 {
 	SCOUTFS_BTREE_ITEM_REF(iref);
 	int ret;

-	ret = scoutfs_btree_prev(sb, root, key, &iref);
+	ret = scoutfs_btree_next(sb, root, key, &iref);
 	if (ret == 0) {
 		if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
 			memcpy(lt, iref.val, iref.val_len);
 			*key = *iref.key;
-			key->sklt_nr = 0;
-			scoutfs_key_dec(key);
+			scoutfs_key_inc(key);
 			ret = 1;
 		} else {
 			ret = -EIO;
@@ -1073,13 +1048,21 @@ static int next_log_merge_item(struct super_block *sb,
 * abandoned log btree finalized.  If it takes too long each client has
 * a change to make forward progress before being asked to commit again.
 *
+ * We're waiting on heavy state that is protected by mutexes and
+ * transaction machinery.  It's tricky to recreate that state for
+ * lightweight condition tests that don't change task state.  Instead of
+ * trying to get that right, particularly as we unwind after success or
+ * after timeouts, waiters use an unsatisfying poll.   Short enough to
+ * not add terrible latency, given how heavy and infrequent this already
+ * is, and long enough to not melt the cpu.  This could be tuned if it
+ * becomes a problem.
+ *
 * This can end up finalizing a new empty log btree if a new mount
 * happens to arrive at just the right time.  That's fine, merging will
 * ignore and tear down the empty input.
 */
-#define FINALIZE_POLL_MIN_DELAY_MS	5U
-#define FINALIZE_POLL_MAX_DELAY_MS	100U
-#define FINALIZE_POLL_DELAY_GROWTH_PCT	150U
+#define FINALIZE_POLL_MS	(11)
+#define FINALIZE_TIMEOUT_MS	(MSEC_PER_SEC / 2)
 static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_log_trees *lt,
 					u64 rid, struct commit_hold *hold)
 {
@@ -1087,10 +1070,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 	struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
 	struct scoutfs_log_merge_status stat;
 	struct scoutfs_log_merge_range rng;
-	struct scoutfs_mount_options opts;
 	struct scoutfs_log_trees each_lt;
 	struct scoutfs_log_trees fin;
-	unsigned int delay_ms;
 	unsigned long timeo;
 	bool saw_finalized;
 	bool others_active;
@@ -1098,14 +1079,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 	bool ours_visible;
 	struct scoutfs_key key;
 	char *err_str = NULL;
-	ktime_t start;
 	int ret;
 	int err;

-	scoutfs_options_read(sb, &opts);
-	timeo = jiffies + msecs_to_jiffies(opts.log_merge_wait_timeout_ms);
-	delay_ms = FINALIZE_POLL_MIN_DELAY_MS;
-	start = ktime_get_raw();
+	timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS);

 	for (;;) {
 		/* nothing to do if there's already a merge in flight */
@@ -1122,13 +1099,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 		saw_finalized = false;
 		others_active = false;
 		ours_visible = false;
-		scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
-		while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
-
-			trace_scoutfs_server_finalize_items(sb, rid, le64_to_cpu(each_lt.rid),
-							    le64_to_cpu(each_lt.nr),
-							    le64_to_cpu(each_lt.flags),
-							    le64_to_cpu(each_lt.get_trans_seq));
+		scoutfs_key_init_log_trees(&key, 0, 0);
+		while ((ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {

 			if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED))
 				saw_finalized = true;
@@ -1153,10 +1125,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 		finalize_ours = (lt->item_root.height > 2) ||
 				(le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW);

-		trace_scoutfs_server_finalize_decision(sb, rid, saw_finalized, others_active,
-						       ours_visible, finalize_ours, delay_ms,
-						       server->finalize_sent_seq);
-
 		/* done if we're not finalizing and there's no finalized */
 		if (!finalize_ours && !saw_finalized) {
 			ret = 0;
@@ -1164,13 +1132,12 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 		}

 		/* send sync requests soon to give time to commit */
-		scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
+		scoutfs_key_init_log_trees(&key, 0, 0);
 		while (others_active &&
-		       (ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
+		       (ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {

 			if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
-			    (le64_to_cpu(each_lt.rid) == rid) ||
-			    (le64_to_cpu(each_lt.get_trans_seq) <= server->finalize_sent_seq))
+			    (le64_to_cpu(each_lt.rid) == rid))
 				continue;

 			ret = scoutfs_net_submit_request_node(sb, server->conn,
@@ -1190,8 +1157,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 			break;
 		}

-		server->finalize_sent_seq = scoutfs_server_seq(sb);
-
 		/* Finalize ours if it's visible to others */
 		if (ours_visible) {
 			fin = *lt;
@@ -1229,16 +1194,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 			if (ret < 0)
 				err_str = "applying commit before waiting for finalized";

-			msleep(delay_ms);
-			delay_ms = min(delay_ms * FINALIZE_POLL_DELAY_GROWTH_PCT / 100,
-				       FINALIZE_POLL_MAX_DELAY_MS);
+			msleep(FINALIZE_POLL_MS);

 			server_hold_commit(sb, hold);
 			mutex_lock(&server->logs_mutex);

 			/* done if we timed out */
 			if (time_after(jiffies, timeo)) {
-				scoutfs_inc_counter(sb, log_merge_wait_timeout);
 				ret = 0;
 				break;
 			}
@@ -1821,29 +1783,43 @@ out:
 * Give the caller the last seq before outstanding client commits.  All
 * seqs up to and including this are stable, new client transactions can
 * only have greater seqs.
- *
- * For each rid, only its greatest log trees nr can be an open commit.
- * We look at the last log_trees item for each client rid and record its
- * trans seq if it hasn't been committed.
 */
 static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
 {
 	struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
 	DECLARE_SERVER_INFO(sb, server);
-	struct scoutfs_log_trees lt;
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_log_trees *lt;
 	struct scoutfs_key key;
 	u64 last_seq = 0;
 	int ret;

 	last_seq = scoutfs_server_seq(sb) - 1;
+	scoutfs_key_init_log_trees(&key, 0, 0);

 	mutex_lock(&server->logs_mutex);

-	scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
-	while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
-		if ((le64_to_cpu(lt.get_trans_seq) > le64_to_cpu(lt.commit_trans_seq)) &&
-		     le64_to_cpu(lt.get_trans_seq) <= last_seq) {
-			last_seq = le64_to_cpu(lt.get_trans_seq) - 1;
+	for (;; scoutfs_key_inc(&key)) {
+		ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
+		if (ret == 0) {
+			if (iref.val_len == sizeof(*lt)) {
+				lt = iref.val;
+				if ((le64_to_cpu(lt->get_trans_seq) >
+				     le64_to_cpu(lt->commit_trans_seq)) &&
+				     le64_to_cpu(lt->get_trans_seq) <= last_seq) {
+					last_seq = le64_to_cpu(lt->get_trans_seq) - 1;
+				}
+				key = *iref.key;
+			} else {
+				ret = -EIO;
+			}
+			scoutfs_btree_put_iref(&iref);
+		}
+		if (ret < 0) {
+			if (ret == -ENOENT) {
+				ret = 0;
+				break;
+			}
 		}
 	}

@@ -1990,7 +1966,9 @@ static int server_srch_get_compact(struct super_block *sb,
 	ret = scoutfs_srch_get_compact(sb, &server->alloc, &server->wri,
 				       &super->srch_root, rid, sc);
 	mutex_unlock(&server->srch_mutex);
-	if (ret < 0 || (ret == 0 && sc->nr == 0))
+	if (ret == 0 && sc->nr == 0)
+		ret = -ENOENT;
+	if (ret < 0)
 		goto apply;

 	mutex_lock(&server->alloc_mutex);
@@ -2495,11 +2473,9 @@ static void server_log_merge_free_work(struct work_struct *work)

 	while (!server_is_stopping(server)) {

-		if (!commit) {
-			server_hold_commit(sb, &hold);
-			mutex_lock(&server->logs_mutex);
-			commit = true;
-		}
+		server_hold_commit(sb, &hold);
+		mutex_lock(&server->logs_mutex);
+		commit = true;

 		ret = next_log_merge_item(sb, &super->log_merge,
 					  SCOUTFS_LOG_MERGE_FREEING_ZONE,
@@ -2546,14 +2522,12 @@ static void server_log_merge_free_work(struct work_struct *work)
 		/* freed blocks are in allocator, we *have* to update fr */
 		BUG_ON(ret < 0);

-		if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
-			mutex_unlock(&server->logs_mutex);
-			ret = server_apply_commit(sb, &hold, ret);
-			commit = false;
-			if (ret < 0) {
-				err_str = "looping commit del/upd freeing item";
-				break;
-			}
+		mutex_unlock(&server->logs_mutex);
+		ret = server_apply_commit(sb, &hold, ret);
+		commit = false;
+		if (ret < 0) {
+			err_str = "looping commit del/upd freeing item";
+			break;
 		}
 	}

@@ -4326,7 +4300,6 @@ static void scoutfs_server_worker(struct work_struct *work)
 	scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));

 	scoutfs_block_writer_init(sb, &server->wri);
-	server->finalize_sent_seq = 0;

 	/* first make sure no other servers are still running */
 	ret = scoutfs_quorum_fence_leaders(sb, &server->qconf, server->term);
@@ -30,9 +30,6 @@
 #include "client.h"
 #include "counters.h"
 #include "scoutfs_trace.h"
-#include "triggers.h"
-#include "sysfs.h"
-#include "msg.h"

 /*
 * This srch subsystem gives us a way to find inodes that have a given
@@ -71,14 +68,10 @@ struct srch_info {
 	atomic_t shutdown;
 	struct workqueue_struct *workq;
 	struct delayed_work compact_dwork;
-	struct scoutfs_sysfs_attrs ssa;
-	atomic_t compact_delay_ms;
 };

 #define DECLARE_SRCH_INFO(sb, name) \
 	struct srch_info *name = SCOUTFS_SB(sb)->srch_info
-#define DECLARE_SRCH_INFO_KOBJ(kobj, name) \
-	DECLARE_SRCH_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)

 #define SRE_FMT "%016llx.%llu.%llu"
 #define SRE_ARG(sre)						\
@@ -527,95 +520,6 @@ out:
 	return ret;
 }

-/*
- * Padded entries are encoded in pairs after an existing entry.  All of
- * the pairs cancel each other out by all readers (the second encoding
- * looks like deletion) so they aren't visible to the first/last bounds of
- * the block or file.
- */
-static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk,
-			       struct scoutfs_srch_block *srb, struct scoutfs_srch_entry *sre)
-{
-	int ret;
-
-	ret = encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes),
-			   sre, &srb->tail);
-	if (ret > 0) {
-		srb->tail = *sre;
-		le32_add_cpu(&srb->entry_nr, 1);
-		le32_add_cpu(&srb->entry_bytes, ret);
-		le64_add_cpu(&sfl->entries, 1);
-		ret = 0;
-	}
-
-	return ret;
-}
-
-/*
- * This is called by a testing trigger to create a very specific case of
- * encoded entry offsets.  We want the last entry in the block to start
- * precisely at the _SAFE_BYTES offset.
- *
- * This is called when there is a single existing entry in the block.
- * We have the entire block to work with.  We encode pairs of matching
- * entries.  This hides them from readers (both searches and merging) as
- * they're interpreted as creation and deletion and are deleted.  We use
- * the existing hash value of the first entry in the block but then set
- * the inode to an impossibly large number so it doesn't interfere with
- * anything.
- *
- * To hit the specific offset we very carefully manage the amount of
- * bytes of change between fields in the entry.  We know that if we
- * change all the byte of the ino and id we end up with a 20 byte
- * (2+8+8,2) encoding of the pair of entries.  To have the last entry
- * start at the _SAFE_POS offset we know that the final 20 byte pair
- * encoding needs to end at 2 bytes (second entry encoding) after the
- * _SAFE_POS offset.
- *
- * So as we encode pairs we watch the delta of our current offset from
- * that desired final offset of 2 past _SAFE_POS.  If we're a multiple
- * of 20 away then we encode the full 20 byte pairs.  If we're not, then
- * we drop a byte to encode 19 bytes.  That'll slowly change the offset
- * to be a multiple of 20 again while encoding large entries.
- */
-static void pad_entries_at_safe(struct scoutfs_srch_file *sfl, u64 blk,
-				struct scoutfs_srch_block *srb)
-{
-	struct scoutfs_srch_entry sre;
-	u32 target;
-	s32 diff;
-	u64 hash;
-	u64 ino;
-	u64 id;
-	int ret;
-
-	hash = le64_to_cpu(srb->tail.hash);
-	ino = le64_to_cpu(srb->tail.ino) | (1ULL << 62);
-	id = le64_to_cpu(srb->tail.id);
-
-	target = SCOUTFS_SRCH_BLOCK_SAFE_BYTES + 2;
-
-	while ((diff = target - le32_to_cpu(srb->entry_bytes)) > 0) {
-		ino ^= 1ULL << (7 * 8);
-		if (diff % 20 == 0) {
-			id ^= 1ULL << (7 * 8);
-		} else {
-			id ^= 1ULL << (6 * 8);
-		}
-
-		sre.hash = cpu_to_le64(hash);
-		sre.ino = cpu_to_le64(ino);
-		sre.id = cpu_to_le64(id);
-
-		ret = append_padded_entry(sfl, blk, srb, &sre);
-		if (ret == 0)
-			ret = append_padded_entry(sfl, blk, srb, &sre);
-		BUG_ON(ret != 0);
-
-		diff = target - le32_to_cpu(srb->entry_bytes);
-	}
-}
-
 /*
 * The caller is dropping an ino/id because the tracking rbtree is full.
 * This loses information so we can't return any entries at or after the
@@ -1083,9 +987,6 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
 	struct scoutfs_key key;
 	int ret;

-	if (sfl->ref.blkno && !force && scoutfs_trigger(sb, SRCH_FORCE_LOG_ROTATE))
-		force = true;
-
 	if (sfl->ref.blkno == 0 ||
 	    (!force && le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT))
 		return 0;
@@ -1561,7 +1462,7 @@ static int kway_merge(struct super_block *sb,
 		      struct scoutfs_block_writer *wri,
 		      struct scoutfs_srch_file *sfl,
 		      kway_get_t kway_get, kway_advance_t kway_adv,
-		      void **args, int nr, bool logs_input)
+		      void **args, int nr)
 {
 	DECLARE_SRCH_INFO(sb, srinf);
 	struct scoutfs_srch_block *srb = NULL;
@@ -1666,15 +1567,6 @@ static int kway_merge(struct super_block *sb,
 				blk++;
 			}

-			/* end sorted block on _SAFE offset for testing */
-			if (bl && le32_to_cpu(srb->entry_nr) == 1 && logs_input &&
-			    scoutfs_trigger(sb, SRCH_COMPACT_LOGS_PAD_SAFE)) {
-				pad_entries_at_safe(sfl, blk, srb);
-				scoutfs_block_put(sb, bl);
-				bl = NULL;
-				blk++;
-			}
-
 			scoutfs_inc_counter(sb, srch_compact_entry);

 		} else {
@@ -1717,8 +1609,6 @@ static int kway_merge(struct super_block *sb,
 			empty++;
 			ret = 0;
 		} else if (ret < 0) {
-			if (ret == -ENOANO) /* just testing trigger */
-				ret = 0;
 			goto out;
 		}

@@ -1926,7 +1816,7 @@ static int compact_logs(struct super_block *sb,
 	}

 	ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_page, kway_adv_page,
-			 args, nr_pages, true);
+			 args, nr_pages);
 	if (ret < 0)
 		goto out;

@@ -1984,18 +1874,12 @@ static int kway_get_reader(struct super_block *sb,
 	srb = rdr->bl->data;

 	if (rdr->pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
-	    rdr->skip > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
+	    rdr->skip >= SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
 	    rdr->skip >= le32_to_cpu(srb->entry_bytes)) {
 		/* XXX inconsistency */
 		return -EIO;
 	}

-	if (rdr->decoded_bytes == 0 && rdr->pos == SCOUTFS_SRCH_BLOCK_SAFE_BYTES &&
-	    scoutfs_trigger(sb, SRCH_MERGE_STOP_SAFE)) {
-		/* only used in testing */
-		return -ENOANO;
-	}
-
 	/* decode entry, possibly skipping start of the block */
 	while (rdr->decoded_bytes == 0 || rdr->pos < rdr->skip) {
 		ret = decode_entry(srb->entries + rdr->pos,
@@ -2085,7 +1969,7 @@ static int compact_sorted(struct super_block *sb,
 	}

 	ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_reader,
-			 kway_adv_reader, args, nr, false);
+			 kway_adv_reader, args, nr);

 	sc->flags |= SCOUTFS_SRCH_COMPACT_FLAG_DONE;
 	for (i = 0; i < nr; i++) {
@@ -2214,15 +2098,8 @@ static int delete_files(struct super_block *sb, struct scoutfs_alloc *alloc,
 	return ret;
 }

-static void queue_compact_work(struct srch_info *srinf, bool immediate)
-{
-	unsigned long delay;
-
-	if (!atomic_read(&srinf->shutdown)) {
-		delay = immediate ? 0 : msecs_to_jiffies(atomic_read(&srinf->compact_delay_ms));
-		queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
-	}
-}
+/* wait 10s between compact attempts on error, immediate after success */
+#define SRCH_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)

 /*
 * Get a compaction operation from the server, sort the entries from the
@@ -2250,6 +2127,7 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
 	struct super_block *sb = srinf->sb;
 	struct scoutfs_block_writer wri;
 	struct scoutfs_alloc alloc;
+	unsigned long delay;
 	int ret;
 	int err;

@@ -2262,8 +2140,6 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
 	scoutfs_block_writer_init(sb, &wri);

 	ret = scoutfs_client_srch_get_compact(sb, sc);
-	if (ret >= 0)
-		trace_scoutfs_srch_compact_client_recv(sb, sc);
 	if (ret < 0 || sc->nr == 0)
 		goto out;

@@ -2292,7 +2168,6 @@ commit:
 	sc->meta_freed = alloc.freed;
 	sc->flags |= ret < 0 ? SCOUTFS_SRCH_COMPACT_FLAG_ERROR : 0;

-	trace_scoutfs_srch_compact_client_send(sb, sc);
 	err = scoutfs_client_srch_commit_compact(sb, sc);
 	if (err < 0 && ret == 0)
 		ret = err;
@@ -2303,56 +2178,14 @@ out:
 		scoutfs_inc_counter(sb, srch_compact_error);

 	scoutfs_block_writer_forget_all(sb, &wri);
-	queue_compact_work(srinf, sc->nr > 0 && ret == 0);
+	if (!atomic_read(&srinf->shutdown)) {
+		delay = ret == 0 ? 0 : msecs_to_jiffies(SRCH_COMPACT_DELAY_MS);
+		queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
+	}

 	kfree(sc);
 }

-static ssize_t compact_delay_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
-{
-	DECLARE_SRCH_INFO_KOBJ(kobj, srinf);
-
-	return snprintf(buf, PAGE_SIZE, "%u", atomic_read(&srinf->compact_delay_ms));
-}
-
-#define MIN_COMPACT_DELAY_MS MSEC_PER_SEC
-#define DEF_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
-#define MAX_COMPACT_DELAY_MS (60 * MSEC_PER_SEC)
-
-static ssize_t compact_delay_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
-				      const char *buf, size_t count)
-{
-	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
-	DECLARE_SRCH_INFO(sb, srinf);
-	char nullterm[30]; /* more than enough for octal -U64_MAX */
-	u64 val;
-	int len;
-	int ret;
-
-	len = min(count, sizeof(nullterm) - 1);
-	memcpy(nullterm, buf, len);
-	nullterm[len] = '\0';
-
-	ret = kstrtoll(nullterm, 0, &val);
-	if (ret < 0 || val < MIN_COMPACT_DELAY_MS || val > MAX_COMPACT_DELAY_MS) {
-		scoutfs_err(sb, "invalid compact_delay_ms value, must be between %lu and %lu",
-			    MIN_COMPACT_DELAY_MS, MAX_COMPACT_DELAY_MS);
-		return -EINVAL;
-	}
-
-	atomic_set(&srinf->compact_delay_ms, val);
-	cancel_delayed_work(&srinf->compact_dwork);
-	queue_compact_work(srinf, false);
-
-	return count;
-}
-SCOUTFS_ATTR_RW(compact_delay_ms);
-
-static struct attribute *srch_attrs[] = {
-	SCOUTFS_ATTR_PTR(compact_delay_ms),
-	NULL,
-};
-
 void scoutfs_srch_destroy(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -2369,8 +2202,6 @@ void scoutfs_srch_destroy(struct super_block *sb)
 		destroy_workqueue(srinf->workq);
 	}

-	scoutfs_sysfs_destroy_attrs(sb, &srinf->ssa);
-
 	kfree(srinf);
 	sbi->srch_info = NULL;
 }
@@ -2388,15 +2219,8 @@ int scoutfs_srch_setup(struct super_block *sb)
 	srinf->sb = sb;
 	atomic_set(&srinf->shutdown, 0);
 	INIT_DELAYED_WORK(&srinf->compact_dwork, scoutfs_srch_compact_worker);
-	scoutfs_sysfs_init_attrs(sb, &srinf->ssa);
-	atomic_set(&srinf->compact_delay_ms, DEF_COMPACT_DELAY_MS);
-
 	sbi->srch_info = srinf;

-	ret = scoutfs_sysfs_create_attrs(sb, &srinf->ssa, srch_attrs, "srch");
-	if (ret < 0)
-		goto out;
-
 	srinf->workq = alloc_workqueue("scoutfs_srch_compact",
 				       WQ_NON_REENTRANT | WQ_UNBOUND |
 				       WQ_HIGHPRI, 0);
@@ -2405,7 +2229,8 @@ int scoutfs_srch_setup(struct super_block *sb)
 		goto out;
 	}

-	queue_compact_work(srinf, false);
+	queue_delayed_work(srinf->workq, &srinf->compact_dwork,
+			   msecs_to_jiffies(SRCH_COMPACT_DELAY_MS));

 	ret = 0;
 out:
@@ -39,9 +39,6 @@ struct scoutfs_triggers {

 static char *names[] = {
 	[SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE] = "block_remove_stale",
-	[SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE] = "srch_compact_logs_pad_safe",
-	[SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE] = "srch_force_log_rotate",
-	[SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe",
 	[SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge",
 };

@@ -3,9 +3,6 @@

 enum scoutfs_trigger {
 	SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE,
-	SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE,
-	SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE,
-	SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE,
 	SCOUTFS_TRIGGER_STATFS_LOCK_PURGE,
 	SCOUTFS_TRIGGER_NR,
 };
@@ -12,8 +12,7 @@ BIN := src/createmany			\
 	src/find_xattrs			\
 	src/create_xattr_loop		\
 	src/fragmented_data_extents	\
-	src/o_tmpfile_umask		\
-	src/parallel_restore
+	src/o_tmpfile_umask

 DEPS := $(wildcard src/*.d)

@@ -23,11 +22,8 @@ ifneq ($(DEPS),)
 -include $(DEPS)
 endif

-src/parallel_restore_cflags := ../utils/src/scoutfs_parallel_restore.a -lm
-
 $(BIN): %: %.c Makefile
-	gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $($(@)_cflags)
-
+	gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@

 .PHONY: clean
 clean:
@@ -25,9 +25,8 @@ All options can be seen by running with -h.
 This script is built to test multi-node systems on one host by using
 different mounts of the same devices.  The script creates a fake block
 device in front of each fs block device for each mount that will be
-tested.  It will create predictable device mapper devices and mounts
-them on /mnt/test.N.  These static device names and mount paths limit
-the script to a single execution per host.
+tested.  Currently it will create free loop devices and will mount on
+/mnt/test.[0-9].

 All tests will be run by default.  Particular tests can be included or
 excluded by providing test name regular expressions with the -I and -E
@@ -105,8 +104,8 @@ used during the test.

 | Variable         | Description          | Origin          | Example           |
 | ---------------- | -------------------  | --------------- | ----------------- |
-| T\_MB[0-9]       | per-mount meta bdev  | created per run | /dev/mapper/\_scoutfs\_test\_meta\_[0-9]        |
-| T\_DB[0-9]       | per-mount data bdev  | created per run | /dev/mapper/\_scoutfs\_test\_data\_[0-9]        |
+| T\_MB[0-9]       | per-mount meta bdev  | created per run | /dev/loop0        |
+| T\_DB[0-9]       | per-mount data bdev  | created per run | /dev/loop1        |
 | T\_D[0-9]        | per-mount test dir   | made for test   | /mnt/test.[0-9]/t |
 | T\_META\_DEVICE  | main FS meta bdev    | -M              | /dev/vda          |
 | T\_DATA\_DEVICE  | main FS data bdev    | -D              | /dev/vdb          |
@@ -6,61 +6,6 @@ t_filter_fs()
 	    -e 's@Device: [a-fA-F0-9]*h/[0-9]*d@Device: 0h/0d@g'
 }

-#
-# We can hit a spurious kasan warning that was fixed upstream:
-#
-#  e504e74cc3a2 x86/unwind/orc: Disable KASAN checking in the ORC unwinder, part 2
-#
-# KASAN can get mad when the unwinder doesn't find ORC metadata and
-# wanders up without using frames and hits the KASAN stack red zones.
-# We can ignore these messages.
-#
-# They're bracketed by:
-# [ 2687.690127] ==================================================================
-# [ 2687.691366] BUG: KASAN: stack-out-of-bounds in get_reg+0x1bc/0x230
-# ...
-# [ 2687.706220] ==================================================================
-# [ 2687.707284] Disabling lock debugging due to kernel taint
-#
-# That final lock debugging message may not be included.
-#
-ignore_harmless_unwind_kasan_stack_oob()
-{
-awk '
-        BEGIN {
-                in_soob = 0
-                soob_nr = 0
-        }
-        ( !in_soob && $0 ~ /==================================================================/ ) {
-                in_soob = 1
-                soob_nr = NR
-                saved = $0
-        }
-        ( in_soob == 1 && NR == (soob_nr + 1) ) {
-                if (match($0, /KASAN: stack-out-of-bounds in get_reg/) != 0) {
-                        in_soob = 2
-                } else {
-                        in_soob = 0
-                        print saved
-                }
-		saved=""
-        }
-        ( in_soob == 2 && $0 ~ /==================================================================/ ) {
-                in_soob = 3
-                soob_nr = NR
-        }
-        ( in_soob == 3 && NR > soob_nr && $0 !~ /Disabling lock debugging/ ) {
-                in_soob = 0
-        }
-        ( !in_soob ) { print $0 }
-        END {
-                if (saved) {
-                        print saved
-                }
-        }
-'
-}
-
 #
 # Filter out expected messages.  Putting messages here implies that
 # tests aren't relying on messages to discover failures.. they're
@@ -141,12 +86,10 @@ t_filter_dmesg()
 	re="$re|scoutfs .* critical transaction commit failure.*"

 	# change-devices causes loop device resizing
-	re="$re|loop: module loaded"
 	re="$re|loop[0-9].* detected capacity change from.*"

 	# ignore systemd-journal rotating
 	re="$re|systemd-journald.*"

-	egrep -v "($re)" | \
-		ignore_harmless_unwind_kasan_stack_oob
+	egrep -v "($re)" 
 }
@@ -265,15 +265,6 @@ t_trigger_get() {
 	cat "$(t_trigger_path "$nr")/$which"
 }

-t_trigger_set() {
-	local which="$1"
-	local nr="$2"
-	local val="$3"
-	local path=$(t_trigger_path "$nr")
-
-	echo "$val" > "$path/$which"
-}
-
 t_trigger_show() {
 	local which="$1"
 	local string="$2"
@@ -285,8 +276,9 @@ t_trigger_show() {
 t_trigger_arm_silent() {
 	local which="$1"
 	local nr="$2"
+	local path=$(t_trigger_path "$nr")

-	t_trigger_set "$which" "$nr" 1
+	echo 1 > "$path/$which"
 }

 t_trigger_arm() {
@@ -1,4 +1,3 @@
 == measure initial createmany
 == measure initial createmany
 == measure two concurrent createmany runs
-== cleanup
@@ -1,4 +1,3 @@
-== setting longer hung task timeout
 == creating fragmented extents
 == unlink file with moved extents to free extents per block
 == cleanup
@@ -1,37 +0,0 @@
-== initialize per-mount values
-== arm compaction triggers
-trigger srch_compact_logs_pad_safe armed: 1
-trigger srch_merge_stop_safe armed: 1
-trigger srch_compact_logs_pad_safe armed: 1
-trigger srch_merge_stop_safe armed: 1
-trigger srch_compact_logs_pad_safe armed: 1
-trigger srch_merge_stop_safe armed: 1
-trigger srch_compact_logs_pad_safe armed: 1
-trigger srch_merge_stop_safe armed: 1
-trigger srch_compact_logs_pad_safe armed: 1
-trigger srch_merge_stop_safe armed: 1
-== compact more often
-== create padded sorted inputs by forcing log rotation
-trigger srch_force_log_rotate armed: 1
-trigger srch_force_log_rotate armed: 1
-trigger srch_force_log_rotate armed: 1
-trigger srch_force_log_rotate armed: 1
-trigger srch_compact_logs_pad_safe armed: 1
-trigger srch_force_log_rotate armed: 1
-trigger srch_force_log_rotate armed: 1
-trigger srch_force_log_rotate armed: 1
-trigger srch_force_log_rotate armed: 1
-trigger srch_compact_logs_pad_safe armed: 1
-trigger srch_force_log_rotate armed: 1
-trigger srch_force_log_rotate armed: 1
-trigger srch_force_log_rotate armed: 1
-trigger srch_force_log_rotate armed: 1
-trigger srch_compact_logs_pad_safe armed: 1
-trigger srch_force_log_rotate armed: 1
-trigger srch_force_log_rotate armed: 1
-trigger srch_force_log_rotate armed: 1
-trigger srch_force_log_rotate armed: 1
-trigger srch_compact_logs_pad_safe armed: 1
-== compaction of padded should stop at safe
-== verify no compaction errors
-== cleanup
@@ -326,10 +326,16 @@ unmount_all() {
 		cmd wait $p
 	done

-	# delete all temp devices
-	for dev in /dev/mapper/_scoutfs_test_*; do
-		if [ -b "$dev" ]; then
-			cmd dmsetup remove $dev
+	# delete all temp meta devices
+	for dev in $(losetup --associated "$T_META_DEVICE" | cut -d : -f 1); do
+		if [ -e "$dev" ]; then
+			cmd losetup -d "$dev"
+		fi
+	done
+	# delete all temp data devices
+	for dev in $(losetup --associated "$T_DATA_DEVICE" | cut -d : -f 1); do
+		if [ -e "$dev" ]; then
+			cmd losetup -d "$dev"
 		fi
 	done
 }
@@ -428,12 +434,6 @@ $T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 &
 fenced_pid=$!
 fenced_log "started fenced pid $fenced_pid in the background"

-# setup dm tables
-echo "0 $(blockdev --getsz $T_META_DEVICE) linear $T_META_DEVICE 0" > \
-	$T_RESULTS/dmtable.meta
-echo "0 $(blockdev --getsz $T_DATA_DEVICE) linear $T_DATA_DEVICE 0" > \
-	$T_RESULTS/dmtable.data
-
 #
 # mount concurrently so that a quorum is present to elect the leader and
 # start a server.
@@ -442,13 +442,10 @@ msg "mounting $T_NR_MOUNTS mounts on meta $T_META_DEVICE data $T_DATA_DEVICE"
 pids=""
 for i in $(seq 0 $((T_NR_MOUNTS - 1))); do

-	name="_scoutfs_test_meta_$i"
-	cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.meta)"
-	meta_dev="/dev/mapper/$name"
-
-	name="_scoutfs_test_data_$i"
-	cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.data)"
-	data_dev="/dev/mapper/$name"
+	meta_dev=$(losetup --find --show $T_META_DEVICE)
+	test -b "$meta_dev" || die "failed to create temp device $meta_dev"
+	data_dev=$(losetup --find --show $T_DATA_DEVICE)
+	test -b "$data_dev" || die "failed to create temp device $data_dev"

 	dir="/mnt/test.$i"
 	test -d "$dir" || cmd mkdir -p "$dir"
@@ -14,7 +14,6 @@ offline-extent-waiting.sh
 move-blocks.sh
 large-fragmented-free.sh
 enospc.sh
-srch-safe-merge-pos.sh
 srch-basic-functionality.sh
 simple-xattr-unit.sh
 totl-xattr-tag.sh
@@ -1,7 +1,6 @@
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdio.h>
-#include <stdarg.h>
 #include <errno.h>
 #include <string.h>
 #include <sys/stat.h>
@@ -36,10 +35,10 @@ struct opts {
 	unsigned int dry_run:1,
 		     ls_output:1,
 		     quiet:1,
-		     xattr_set:1,
-		     xattr_file:1,
-		     xattr_group:1;
-	char *xattr_name;
+		     user_xattr:1,
+		     same_srch_xattr:1,
+		     group_srch_xattr:1,
+		     unique_srch_xattr:1;
 };

 struct stats {
@@ -150,31 +149,12 @@ static void free_dir(struct dir *dir)
 	free(dir);
 }

-static size_t snprintf_off(void *buf, size_t sz, size_t off, char *fmt, ...)
-{
-	va_list ap;
-	int ret;
-
-	if (off >= sz)
-		return sz;
-
-	va_start(ap, fmt);
-	ret = vsnprintf(buf + off, sz - off, fmt, ap);
-	va_end(ap);
-
-	if (ret <= 0)
-		return sz;
-
-	return off + ret;
-}
-
 static void create_dir(struct dir *dir, struct opts *opts,
 		       struct stats *stats)
 {
 	struct str_list *s;
-	char name[256]; /* max len and null term */
+	char name[100];
 	char val = 'v';
-	size_t off;
 	int rc;
 	int i;

@@ -195,21 +175,29 @@ static void create_dir(struct dir *dir, struct opts *opts,
 		rc = mknod(s->str, S_IFREG | 0644, 0);
 		error_exit(rc, "mknod %s failed"ERRF, s->str, ERRA);

-		if (opts->xattr_set) {
-			off = snprintf_off(name, sizeof(name), 0, "%s", opts->xattr_name);
-			if (opts->xattr_file)
-				off = snprintf_off(name, sizeof(name), off,
-						   "-f-%lu", stats->files);
-			if (opts->xattr_group)
-				off = snprintf_off(name, sizeof(name), off,
-						   "-g-%lu", stats->files / 10000);
-
-			error_exit(off >= sizeof(name), "xattr name longer than 255 bytes");
-
+		rc = 0;
+		if (rc == 0 && opts->user_xattr) {
+			strcpy(name, "user.scoutfs_bcp");
+			rc = setxattr(s->str, name, &val, 1, 0);
+		}
+		if (rc == 0 && opts->same_srch_xattr) {
+			strcpy(name, "scoutfs.srch.scoutfs_bcp");
+			rc = setxattr(s->str, name, &val, 1, 0);
+		}
+		if (rc == 0 && opts->group_srch_xattr) {
+			snprintf(name, sizeof(name),
+				 "scoutfs.srch.scoutfs_bcp.group.%lu",
+				 stats->files / 10000);
+			rc = setxattr(s->str, name, &val, 1, 0);
+		}
+		if (rc == 0 && opts->unique_srch_xattr) {
+			snprintf(name, sizeof(name),
+				 "scoutfs.srch.scoutfs_bcp.unique.%lu",
+				 stats->files);
 			rc = setxattr(s->str, name, &val, 1, 0);
-			error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
 		}

+		error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);

 		stats->files++;
 		rate_banner(opts, stats);
@@ -377,10 +365,11 @@ static void usage(void)
 	       " -d DIR | create all files in DIR top level directory\n"
 	       " -n     | dry run, only parse, don't create any files\n"
 	       " -q     | quiet, don't regularly print rates\n"
-	       " -F     | append \"-f-NR\" file nr to xattr name, requires -X\n"
-	       " -G     | append \"-g-NR\" file nr/10000 to xattr name, requires -X\n"
 	       " -L     | parse ls output; only reg, skip meta, paths at ./\n"
-	       " -X NAM | set named xattr in all files\n");
+	       " -X     | set the same user. xattr name in all files\n"
+	       " -S     | set the same .srch. xattr name in all files\n"
+	       " -G     | set a .srch. xattr name shared by groups of files\n"
+	       " -U     | set a unique .srch. xattr name in all files\n");
 }

 int main(int argc, char **argv)
@@ -397,7 +386,7 @@ int main(int argc, char **argv)

 	memset(&opts, 0, sizeof(opts));

-        while ((c = getopt(argc, argv, "d:nqFGLX:")) != -1) {
+        while ((c = getopt(argc, argv, "d:nqLXSGU")) != -1) {
                switch(c) {
                case 'd':
                        top_dir = strdup(optarg);
@@ -408,19 +397,20 @@ int main(int argc, char **argv)
                case 'q':
                        opts.quiet = 1;
                        break;
-                case 'F':
-                        opts.xattr_file = 1;
-                        break;
-                case 'G':
-                        opts.xattr_group = 1;
-                        break;
                case 'L':
                        opts.ls_output = 1;
                        break;
                case 'X':
-			opts.xattr_set = 1;
-			opts.xattr_name = strdup(optarg);
-			error_exit(!opts.xattr_name, "error allocating xattr name");
+                        opts.user_xattr = 1;
+                        break;
+                case 'S':
+                        opts.same_srch_xattr = 1;
+                        break;
+                case 'G':
+                        opts.group_srch_xattr = 1;
+                        break;
+                case 'U':
+                        opts.unique_srch_xattr = 1;
                        break;
                case '?':
                        printf("Unknown option '%c'\n", optopt);
@@ -429,11 +419,6 @@ int main(int argc, char **argv)
                }
        }

-	error_exit(opts.xattr_file && !opts.xattr_set,
-		   "must specify xattr -X when appending file nr with -F");
-	error_exit(opts.xattr_group && !opts.xattr_set,
-		   "must specify xattr -X when appending file nr with -G");
-
 	if (!opts.dry_run) {
 		error_exit(!top_dir,
 			   "must specify top level directory with -d");
@@ -1,782 +0,0 @@
-#define _GNU_SOURCE /* O_DIRECT */
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdbool.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/xattr.h>
-#include <ctype.h>
-#include <string.h>
-#include <errno.h>
-#include <limits.h>
-#include <time.h>
-#include <sys/prctl.h>
-#include <signal.h>
-#include <sys/socket.h>
-
-#include "../../utils/src/sparse.h"
-#include "../../utils/src/util.h"
-#include "../../utils/src/list.h"
-#include "../../utils/src/parse.h"
-#include "../../kmod/src/format.h"
-#include "../../utils/src/parallel_restore.h"
-
-/*
- * XXX:
- *  - add a nice description of what's going on
- *  - mention allocator contention
- *  - test child process dying handling
- *  - root dir entry name length is wrong
- */
-
-#define ERRF " errno %d (%s)"
-#define ERRA errno, strerror(errno)
-
-#define error_exit(cond, fmt, args...)			\
-do {							\
-	if (cond) {					\
-		printf("error: "fmt"\n", ##args);	\
-		exit(1);				\
-	}						\
-} while (0)
-
-#define dprintf(fmt, args...)		\
-do {					\
-	if (0)				\
-		printf(fmt, ##args);	\
-} while (0)
-
-#define REG_MODE (S_IFREG | 0644)
-#define DIR_MODE (S_IFDIR | 0755)
-
-struct opts {
-	unsigned long long buf_size;
-
-	unsigned long long write_batch;
-	unsigned long long low_dirs;
-	unsigned long long high_dirs;
-	unsigned long long low_files;
-	unsigned long long high_files;
-	char *meta_path;
-	unsigned long long total_files;
-	bool read_only;
-	unsigned long long seed;
-	unsigned long long nr_writers;
-};
-
-static void usage(void)
-{
-	printf("usage:\n"
-	       " -b NR       | threads write blocks in batches files (100000)\n"
-	       " -d LOW:HIGH | range of subdirs per directory (5:10)\n"
-	       " -f LOW:HIGH | range of files per directory (10:20)\n"
-	       " -m PATH     | path to metadata device\n"
-	       " -n NR       | total number of files to create (100)\n"
-	       " -r          | read-only, all work except writing, measure cpu cost\n"
-	       " -s NR       | randomization seed (random)\n"
-	       " -w NR       | number of writing processes to fork (online cpus)\n"
-	       );
-}
-
-static size_t write_bufs(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
-			 void *buf, size_t buf_size, int dev_fd)
-{
-	size_t total = 0;
-	size_t count;
-	off_t off;
-	int ret;
-
-	do {
-		ret = scoutfs_parallel_restore_write_buf(wri, buf, buf_size, &off, &count);
-		error_exit(ret, "write buf %d", ret);
-
-		if (count > 0) {
-			if (!opts->read_only)
-				ret = pwrite(dev_fd, buf, count, off);
-			else
-				ret = count;
-			error_exit(ret != count, "pwrite count %zu ret %d", count, ret);
-			total += ret;
-		}
-	} while (count > 0);
-
-	return total;
-}
-
-struct gen_inode {
-	struct scoutfs_parallel_restore_inode inode;
-	struct scoutfs_parallel_restore_xattr **xattrs;
-	u64 nr_xattrs;
-	struct scoutfs_parallel_restore_entry **entries;
-	u64 nr_files;
-	u64 nr_entries;
-};
-
-static void free_gino(struct gen_inode *gino)
-{
-	u64 i;
-
-	if (gino) {
-		if (gino->entries) {
-			for (i = 0; i < gino->nr_entries; i++)
-				free(gino->entries[i]);
-			free(gino->entries);
-		}
-		if (gino->xattrs) {
-			for (i = 0; i < gino->nr_xattrs; i++)
-				free(gino->xattrs[i]);
-			free(gino->xattrs);
-		}
-		free(gino);
-	}
-}
-
-static struct scoutfs_parallel_restore_xattr *
-generate_xattr(struct opts *opts, u64 ino, u64 pos, char *name, int name_len, void *value,
-		int value_len)
-{
-	struct scoutfs_parallel_restore_xattr *xattr;
-
-	xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + name_len + value_len);
-	error_exit(!xattr, "error allocating generated xattr");
-
-	*xattr = (struct scoutfs_parallel_restore_xattr) {
-		.ino = ino,
-		.pos = pos,
-		.name_len = name_len,
-		.value_len = value_len,
-	};
-
-	xattr->name = (void *)(xattr + 1);
-	xattr->value = (void *)(xattr->name + name_len);
-
-	memcpy(xattr->name, name, name_len);
-	if (value_len)
-		memcpy(xattr->value, value, value_len);
-
-	return xattr;
-}
-
-static struct gen_inode *generate_inode(struct opts *opts, u64 ino, mode_t mode)
-{
-	struct gen_inode *gino;
-	struct timespec now;
-
-	clock_gettime(CLOCK_REALTIME, &now);
-
-	gino = calloc(1, sizeof(struct gen_inode));
-	error_exit(!gino, "failure allocating generated inode");
-
-	gino->inode = (struct scoutfs_parallel_restore_inode) {
-		.ino = ino,
-		.mode = mode,
-		.atime = now,
-		.ctime = now,
-		.mtime = now,
-		.crtime = now,
-	};
-
-	/*
-	 * hacky creation of a bunch of xattrs for now.
-	 */
-	if ((mode & S_IFMT) == S_IFREG) {
-		#define NV(n, v) { n, sizeof(n) - 1, v, sizeof(v) - 1, }
-		struct name_val {
-			char *name;
-			int len;
-			char *value;
-			int value_len;
-		} nv[] = {
-			NV("scoutfs.hide.totl.acct.8314611887310466424.2.0", "1"),
-			NV("scoutfs.hide.srch.sam_vol_E01001L6_4", ""),
-			NV("scoutfs.hide.sam_reqcopies", ""),
-			NV("scoutfs.hide.sam_copy_2", ""),
-			NV("scoutfs.hide.totl.acct.F01030L6.8314611887310466424.7.30", "1"),
-			NV("scoutfs.hide.sam_copy_1", ""),
-			NV("scoutfs.hide.srch.sam_vol_F01030L6_4", ""),
-			NV("scoutfs.hide.srch.sam_release_cand", ""),
-			NV("scoutfs.hide.sam_restime", ""),
-			NV("scoutfs.hide.sam_uuid", ""),
-			NV("scoutfs.hide.totl.acct.8314611887310466424.3.0", "1"),
-			NV("scoutfs.hide.srch.sam_vol_F01030L6", ""),
-			NV("scoutfs.hide.srch.sam_uuid_865939b7-24d6-472f-b85c-7ce7afeb813a", ""),
-			NV("scoutfs.hide.srch.sam_vol_E01001L6", ""),
-			NV("scoutfs.hide.totl.acct.E01001L6.8314611887310466424.7.1", "1"),
-			NV("scoutfs.hide.totl.acct.8314611887310466424.4.0", "1"),
-			NV("scoutfs.hide.totl.acct.8314611887310466424.11.0", "1"),
-			NV("scoutfs.hide.totl.acct.8314611887310466424.1.0", "1"),
-		};
-		unsigned int nr = array_size(nv);
-		int i;
-
-		gino->xattrs = calloc(nr, sizeof(struct scoutfs_parallel_restore_xattr *));
-
-		for (i = 0; i < nr; i++)
-			gino->xattrs[i] = generate_xattr(opts, ino, i, nv[i].name, nv[i].len,
-							 nv[i].value, nv[i].value_len);
-
-		gino->nr_xattrs = nr;
-		gino->inode.nr_xattrs = nr;
-	}
-
-	return gino;
-}
-
-static struct scoutfs_parallel_restore_entry *
-generate_entry(struct opts *opts, char *prefix, u64 nr, u64 dir_ino, u64 pos, u64 ino, mode_t mode)
-{
-	struct scoutfs_parallel_restore_entry *entry;
-	char buf[PATH_MAX];
-	int bytes;
-
-	bytes = snprintf(buf, sizeof(buf), "%s-%llu", prefix, nr);
-
-	entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + bytes);
-	error_exit(!entry, "error allocating generated entry");
-
-	*entry = (struct scoutfs_parallel_restore_entry) {
-		.dir_ino = dir_ino,
-		.pos = pos,
-		.ino = ino,
-		.mode = mode,
-		.name = (void *)(entry + 1),
-		.name_len = bytes,
-	};
-
-	memcpy(entry->name, buf, bytes);
-
-	return entry;
-}
-
-static u64 random64(void)
-{
-	return ((u64)lrand48() << 32) | lrand48();
-}
-
-static u64 random_range(u64 low, u64 high)
-{
-	return low + (random64() % (high - low + 1));
-}
-
-static struct gen_inode *generate_dir(struct opts *opts, u64 dir_ino, u64 ino_start, u64 ino_len,
-				      bool no_dirs)
-{
-	struct scoutfs_parallel_restore_entry *entry;
-	struct gen_inode *gino;
-	u64 nr_entries;
-	u64 nr_files;
-	u64 nr_dirs;
-	u64 ino;
-	char *prefix;
-	mode_t mode;
-	u64 i;
-
-	nr_dirs = no_dirs ? 0 : random_range(opts->low_dirs, opts->high_dirs);
-	nr_files = random_range(opts->low_files, opts->high_files);
-
-	if (1 + nr_dirs + nr_files > ino_len) {
-		nr_dirs = no_dirs ? 0 : (ino_len - 1) / 2;
-		nr_files = (ino_len - 1) - nr_dirs;
-	}
-
-	nr_entries = nr_dirs + nr_files;
-
-	gino = generate_inode(opts, dir_ino, DIR_MODE);
-	error_exit(!gino, "error allocating generated inode");
-
-	gino->inode.nr_subdirs = nr_dirs;
-	gino->nr_files = nr_files;
-
-	if (nr_entries) {
-		gino->entries = calloc(nr_entries, sizeof(struct scoutfs_parallel_restore_entry *));
-		error_exit(!gino->entries, "error allocating generated inode entries");
-
-		gino->nr_entries = nr_entries;
-	}
-
-	mode = DIR_MODE;
-	prefix = "dir";
-	for (i = 0; i < nr_entries; i++) {
-		if (i == nr_dirs) {
-			mode = REG_MODE;
-			prefix = "file";
-		}
-
-		ino = ino_start + i;
-		entry = generate_entry(opts, prefix, ino, gino->inode.ino,
-				       SCOUTFS_DIRENT_FIRST_POS + i, ino, mode);
-
-		gino->entries[i] = entry;
-		gino->inode.total_entry_name_bytes += entry->name_len;
-	}
-
-	return gino;
-}
-
-/*
- * Restore a generated inode.  If it's a directory then we also restore
- * all its entries.  The caller is going to descend into subdir entries and generate
- * those dir inodes.  We have to generate and restore all non-dir inodes referenced
- * by this inode's entries.
- */
-static void restore_inode(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
-			  struct gen_inode *gino)
-{
-	struct gen_inode *nondir;
-	int ret;
-	u64 i;
-
-	ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode);
-	error_exit(ret, "thread add root inode %d", ret);
-
-	for (i = 0; i < gino->nr_entries; i++) {
-		ret = scoutfs_parallel_restore_add_entry(wri, gino->entries[i]);
-		error_exit(ret, "thread add entry %d", ret);
-
-		/* caller only needs subdir entries, generate and free others */
-		if ((gino->entries[i]->mode & S_IFMT) != S_IFDIR) {
-
-			nondir = generate_inode(opts, gino->entries[i]->ino,
-						gino->entries[i]->mode);
-			restore_inode(opts, wri, nondir);
-			free_gino(nondir);
-
-			free(gino->entries[i]);
-			if (i != gino->nr_entries - 1)
-				gino->entries[i] = gino->entries[gino->nr_entries - 1];
-			gino->nr_entries--;
-			gino->nr_files--;
-			i--;
-		}
-	}
-
-	for (i = 0; i < gino->nr_xattrs; i++) {
-		ret = scoutfs_parallel_restore_add_xattr(wri, gino->xattrs[i]);
-		error_exit(ret, "thread add xattr %d", ret);
-	}
-}
-
-struct writer_args {
-	struct list_head head;
-
-	int dev_fd;
-	int pair_fd;
-
-	struct scoutfs_parallel_restore_slice slice;
-	u64 writer_nr;
-	u64 dir_height;
-	u64 ino_start;
-	u64 ino_len;
-};
-
-struct write_result {
-	struct scoutfs_parallel_restore_progress prog;
-	struct scoutfs_parallel_restore_slice slice;
-	__le64 files_created;
-	__le64 bytes_written;
-};
-
-static void write_bufs_and_send(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
-				  void *buf, size_t buf_size, int dev_fd,
-				  struct write_result *res, bool get_slice, int pair_fd)
-{
-	size_t total;
-	int ret;
-
-	total = write_bufs(opts, wri, buf, buf_size, dev_fd);
-	le64_add_cpu(&res->bytes_written, total);
-
-	ret = scoutfs_parallel_restore_get_progress(wri, &res->prog);
-	error_exit(ret, "get prog %d", ret);
-
-	if (get_slice) {
-		ret = scoutfs_parallel_restore_get_slice(wri, &res->slice);
-		error_exit(ret, "thread get slice %d", ret);
-	}
-
-	ret = write(pair_fd, res, sizeof(struct write_result));
-	error_exit(ret != sizeof(struct write_result), "result send error");
-
-	memset(res, 0, sizeof(struct write_result));
-}
-
-/*
- * Calculate the number of bytes in toplevel "dir-%llu" entry names for the given
- * number of writers.
- */
-static u64 topdir_entry_bytes(u64 nr_writers)
-{
-	u64 bytes = (3 + 1) * nr_writers;
-	u64 limit;
-	u64 done;
-	u64 wid;
-	u64 nr;
-
-	for (done = 0, wid = 1, limit = 10; done < nr_writers; done += nr, wid++, limit *= 10) {
-		nr = min(limit - done, nr_writers - done);
-		bytes += nr * wid;
-	}
-
-	return bytes;
-}
-
-struct dir_pos {
-	struct gen_inode *gino;
-	u64 pos;
-};
-
-static void writer_proc(struct opts *opts, struct writer_args *args)
-{
-	struct scoutfs_parallel_restore_writer *wri = NULL;
-	struct scoutfs_parallel_restore_entry *entry;
-	struct dir_pos *dirs = NULL;
-	struct write_result res;
-	struct gen_inode *gino;
-	void *buf = NULL;
-	u64 level;
-	u64 ino;
-	int ret;
-
-	memset(&res, 0, sizeof(res));
-
-	dirs = calloc(args->dir_height, sizeof(struct dir_pos));
-	error_exit(errno, "error allocating parent dirs "ERRF, ERRA);
-
-	errno = posix_memalign((void **)&buf, 4096, opts->buf_size);
-	error_exit(errno, "error allocating block buf "ERRF, ERRA);
-
-	ret = scoutfs_parallel_restore_create_writer(&wri);
-	error_exit(ret, "create writer %d", ret);
-
-	ret = scoutfs_parallel_restore_add_slice(wri, &args->slice);
-	error_exit(ret, "add slice %d", ret);
-
-	/* writer 0 creates the root dir */
-	if (args->writer_nr == 0) {
-		gino = generate_inode(opts, SCOUTFS_ROOT_INO, DIR_MODE);
-		gino->inode.nr_subdirs = opts->nr_writers;
-		gino->inode.total_entry_name_bytes = topdir_entry_bytes(opts->nr_writers);
-
-		ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode);
-		error_exit(ret, "thread add root inode %d", ret);
-		free_gino(gino);
-	}
-
-	/* create root entry for our top level dir */
-	ino = args->ino_start++;
-	args->ino_len--;
-
-	entry = generate_entry(opts, "top", args->writer_nr,
-			       SCOUTFS_ROOT_INO, SCOUTFS_DIRENT_FIRST_POS + args->writer_nr,
-			       ino, DIR_MODE);
-
-	ret = scoutfs_parallel_restore_add_entry(wri, entry);
-	error_exit(ret, "thread top entry %d", ret);
-	free(entry);
-
-	level = args->dir_height - 1;
-
-	while (args->ino_len > 0 && level < args->dir_height) {
-		gino = dirs[level].gino;
-
-		/* generate and restore if we follow entries */
-		if (!gino) {
-			gino = generate_dir(opts, ino, args->ino_start, args->ino_len, level == 0);
-			args->ino_start += gino->nr_entries;
-			args->ino_len -= gino->nr_entries;
-			le64_add_cpu(&res.files_created, gino->nr_files);
-
-			restore_inode(opts, wri, gino);
-			dirs[level].gino = gino;
-		}
-
-		if (dirs[level].pos == gino->nr_entries) {
-			/* ascend if we're done with this dir */
-			dirs[level].gino = NULL;
-			dirs[level].pos = 0;
-			free_gino(gino);
-			level++;
-
-		} else {
-			/* otherwise descend into subdir entry */
-			ino = gino->entries[dirs[level].pos]->ino;
-			dirs[level].pos++;
-			level--;
-		}
-
-		/* do a partial write at batch intervals when there's still more to do */
-		if (le64_to_cpu(res.files_created) >= opts->write_batch && args->ino_len > 0)
-			write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd,
-					    &res, false, args->pair_fd);
-	}
-
-	write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd,
-			    &res, true, args->pair_fd);
-
-	scoutfs_parallel_restore_destroy_writer(&wri);
-
-	free(dirs);
-	free(buf);
-}
-
-static void fork_writer(struct opts *opts, struct writer_args *args)
-{
-	pid_t parent = getpid();
-	pid_t pid;
-	int ret;
-
-	pid = fork();
-	error_exit(pid == -1, "fork error");
-
-	if (pid != 0)
-		return;
-
-	ret = prctl(PR_SET_PDEATHSIG, SIGHUP);
-	error_exit(ret < 0, "failed to set parent death sig");
-
-	printf("pid %u getpid() %u parent %u getppid() %u\n",
-		pid, getpid(), parent, getppid());
-	error_exit(getppid() != parent, "child parent already changed");
-
-	writer_proc(opts, args);
-	exit(0);
-}
-
-static int do_restore(struct opts *opts)
-{
-	struct scoutfs_parallel_restore_writer *wri = NULL;
-	struct scoutfs_parallel_restore_slice *slices = NULL;
-	struct scoutfs_super_block *super = NULL;
-	struct write_result res;
-	struct writer_args *args;
-	struct timespec begin;
-	struct timespec end;
-	LIST_HEAD(writers);
-	u64 next_ino;
-	u64 ino_per;
-	u64 avg_dirs;
-	u64 avg_files;
-	u64 dir_height;
-	u64 tot_files;
-	u64 tot_bytes;
-	int pair[2] = {-1, -1};
-	float secs;
-	void *buf = NULL;
-	int dev_fd = -1;
-	int ret;
-	int i;
-
-	ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair);
-	error_exit(ret, "socketpair error "ERRF, ERRA);
-
-	dev_fd = open(opts->meta_path, O_DIRECT | (opts->read_only ? O_RDONLY : (O_RDWR|O_EXCL)));
-	error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA);
-
-	errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?:
-		posix_memalign((void **)&buf, 4096, opts->buf_size);
-	error_exit(errno, "error allocating block bufs "ERRF, ERRA);
-
-	ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
-		    SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
-	error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret);
-
-	ret = scoutfs_parallel_restore_create_writer(&wri);
-	error_exit(ret, "create writer %d", ret);
-
-	ret = scoutfs_parallel_restore_import_super(wri, super);
-	error_exit(ret, "import super %d", ret);
-
-	slices = calloc(1 + opts->nr_writers, sizeof(struct scoutfs_parallel_restore_slice));
-	error_exit(!slices, "alloc slices");
-
-	scoutfs_parallel_restore_init_slices(wri, slices, 1 + opts->nr_writers);
-
-	ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]);
-	error_exit(ret, "add slices[0] %d", ret);
-
-	next_ino = (SCOUTFS_ROOT_INO | SCOUTFS_LOCK_INODE_GROUP_MASK) + 1;
-	ino_per = opts->total_files / opts->nr_writers;
-	avg_dirs = (opts->low_dirs + opts->high_dirs) / 2;
-	avg_files = (opts->low_files + opts->high_files) / 2;
-
-	dir_height = 1;
-	tot_files = avg_files * opts->nr_writers;
-
-	while (tot_files < opts->total_files) {
-		dir_height++;
-		tot_files *= avg_dirs;
-	}
-
-	dprintf("height %llu tot %llu total %llu\n", dir_height, tot_files, opts->total_files);
-
-	clock_gettime(CLOCK_MONOTONIC_RAW, &begin);
-
-	/* start each writing process */
-	for (i = 0; i < opts->nr_writers; i++) {
-		args = calloc(1, sizeof(struct writer_args));
-		error_exit(!args, "alloc writer args");
-
-		args->dev_fd = dev_fd;
-		args->pair_fd = pair[1];
-		args->slice = slices[1 + i];
-		args->writer_nr = i;
-		args->dir_height = dir_height;
-		args->ino_start = next_ino;
-		args->ino_len = ino_per;
-
-		list_add_tail(&args->head, &writers);
-		next_ino += ino_per;
-
-		fork_writer(opts, args);
-	}
-
-	/* read results and watch for writers to finish */
-	tot_files = 0;
-	tot_bytes = 0;
-	i = 0;
-	while (i < opts->nr_writers) {
-		ret = read(pair[0], &res, sizeof(struct write_result));
-		error_exit(ret != sizeof(struct write_result), "result read error");
-
-		ret = scoutfs_parallel_restore_add_progress(wri, &res.prog);
-		error_exit(ret, "add thr prog %d", ret);
-
-		if (res.slice.meta_len != 0) {
-			ret = scoutfs_parallel_restore_add_slice(wri, &res.slice);
-			error_exit(ret, "add thr slice %d", ret);
-			i++;
-		}
-
-		tot_files += le64_to_cpu(res.files_created);
-		tot_bytes += le64_to_cpu(res.bytes_written);
-	}
-
-	tot_bytes += write_bufs(opts, wri, buf, opts->buf_size, dev_fd);
-
-	ret = scoutfs_parallel_restore_export_super(wri, super);
-	error_exit(ret, "update super %d", ret);
-
-	if (!opts->read_only) {
-		ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
-			     SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
-		error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret);
-	}
-
-	clock_gettime(CLOCK_MONOTONIC_RAW, &end);
-
-	scoutfs_parallel_restore_destroy_writer(&wri);
-
-	secs = ((float)end.tv_sec + ((float)end.tv_nsec/NSEC_PER_SEC)) -
-	       ((float)begin.tv_sec + ((float)begin.tv_nsec/NSEC_PER_SEC));
-	printf("created %llu files in %llu bytes and %f secs => %f bytes/file, %f files/sec\n",
-		tot_files, tot_bytes, secs,
-		(float)tot_bytes / tot_files, (float)tot_files / secs);
-
-	if (dev_fd >= 0)
-		close(dev_fd);
-	if (pair[0] >= 0)
-		close(pair[0]);
-	if (pair[1] >= 0)
-		close(pair[1]);
-	free(super);
-	free(slices);
-	free(buf);
-
-	return 0;
-}
-
-static int parse_low_high(char *str, u64 *low_ret, u64 *high_ret)
-{
-	char *sep;
-	int ret = 0;
-
-	sep = index(str, ':');
-	if (sep) {
-		*sep = '\0';
-		ret = parse_u64(sep + 1, high_ret);
-	}
-
-	if (ret == 0)
-		ret = parse_u64(str, low_ret);
-
-	if (sep)
-		*sep = ':';
-
-	return ret;
-}
-
-int main(int argc, char **argv)
-{
-	struct opts opts = {
-		.buf_size = (32 * 1024 * 1024),
-
-		.write_batch = 1000000,
-		.low_dirs = 5,
-		.high_dirs = 10,
-		.low_files = 10,
-		.high_files = 20,
-		.total_files = 100,
-	};
-	int ret;
-	int c;
-
-	opts.seed = random64();
-	opts.nr_writers = sysconf(_SC_NPROCESSORS_ONLN);
-
-        while ((c = getopt(argc, argv, "b:d:f:m:n:rs:w:")) != -1) {
-                switch(c) {
-                case 'b':
-			ret = parse_u64(optarg, &opts.write_batch);
-			error_exit(ret, "error parsing -b '%s'\n", optarg);
-			error_exit(opts.write_batch == 0, "-b can't be 0");
-                        break;
-                case 'd':
-			ret = parse_low_high(optarg, &opts.low_dirs, &opts.high_dirs);
-			error_exit(ret, "error parsing -d '%s'\n", optarg);
-                        break;
-                case 'f':
-			ret = parse_low_high(optarg, &opts.low_files, &opts.high_files);
-			error_exit(ret, "error parsing -f '%s'\n", optarg);
-                        break;
-                case 'm':
-                        opts.meta_path = strdup(optarg);
-                        break;
-                case 'n':
-			ret = parse_u64(optarg, &opts.total_files);
-			error_exit(ret, "error parsing -n '%s'\n", optarg);
-                        break;
-                case 'r':
-			opts.read_only = true;
-			break;
-                case 's':
-			ret = parse_u64(optarg, &opts.seed);
-			error_exit(ret, "error parsing -s '%s'\n", optarg);
-                        break;
-                case 'w':
-			ret = parse_u64(optarg, &opts.nr_writers);
-			error_exit(ret, "error parsing -w '%s'\n", optarg);
-                        break;
-                case '?':
-                        printf("Unknown option '%c'\n", optopt);
-                        usage();
-			exit(1);
-                }
-        }
-
-	error_exit(opts.low_dirs > opts.high_dirs, "LOW > HIGH in -d %llu:%llu",
-		   opts.low_dirs, opts.high_dirs);
-	error_exit(opts.low_files > opts.high_files, "LOW > HIGH in -f %llu:%llu",
-		   opts.low_files, opts.high_files);
-	error_exit(!opts.meta_path, "must specify metadata device path with -m");
-
-	printf("recreate with: -d %llu:%llu -f %llu:%llu -n %llu -s %llu -w %llu\n",
-		opts.low_dirs, opts.high_dirs, opts.low_files, opts.high_files,
-		opts.total_files, opts.seed, opts.nr_writers);
-
-	ret = do_restore(&opts);
-
-	free(opts.meta_path);
-
-	return ret == 0 ? 0 : 1;
-}
@@ -7,11 +7,9 @@ t_require_mounts 2

 COUNT=50000

-#
-# Prep dirs for test.  We have per-directory inode number allocators so
-# by putting each createmany in a per-mount dir they get their own inode
-# number region and cluster locks.
-#
+# Prep dirs for test. Each mount needs to make their own parent dir for
+# the createmany run, otherwise both dirs will end up in the same inode
+# group, causing updates to bounce that lock around.
 echo "== measure initial createmany"
 mkdir -p $T_D0/dir/0
 mkdir $T_D1/dir/1
@@ -19,20 +17,18 @@ mkdir $T_D1/dir/1
 echo "== measure initial createmany"
 START=$SECONDS
 createmany -o "$T_D0/file_" $COUNT >> $T_TMP.full
-sync
 SINGLE=$((SECONDS - START))
 echo single $SINGLE >> $T_TMP.full

 echo "== measure two concurrent createmany runs"
 START=$SECONDS
-(cd $T_D0/dir/0; createmany -o ./file_ $COUNT > /dev/null) &
+createmany -o $T_D0/dir/0/file $COUNT > /dev/null &
 pids="$!"
-(cd $T_D1/dir/1; createmany -o ./file_ $COUNT > /dev/null) &
+createmany -o $T_D1/dir/1/file $COUNT > /dev/null &
 pids="$pids $!"
 for p in $pids; do
        wait $p
 done
-sync
 BOTH=$((SECONDS - START))
 echo both $BOTH >> $T_TMP.full

@@ -45,10 +41,7 @@ echo both $BOTH >> $T_TMP.full
 # synchronized operation.
 FACTOR=200
 if [ "$BOTH" -gt $(($SINGLE*$FACTOR)) ]; then
-	t_fail "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
+	echo "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
 fi

-echo "== cleanup"
-find $T_D0/dir -delete
-
 t_pass
@@ -10,30 +10,6 @@ EXTENTS_PER_BTREE_BLOCK=600
 EXTENTS_PER_LIST_BLOCK=8192
 FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))

-#
-# This test specifically creates a pathologically sparse file that will
-# be as expensive as possible to free.  This is usually fine on
-# dedicated or reasonable hardware, but trying to run this in
-# virtualized debug kernels can take a very long time.  This test is
-# about making sure that the server doesn't fail, not that the platform
-# can handle the scale of work that our btree formats happen to require
-# while execution is bogged down with use-after-free memory reference
-# tracking.  So we give the test a lot more breathing room before
-# deciding that its hung.
-#
-echo "== setting longer hung task timeout"
-if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
-	secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
-	test "$secs" -gt 0 || \
-		t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
-	restore_hung_task_timeout()
-	{
-		echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
-	}
-	trap restore_hung_task_timeout EXIT
-	echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
-fi
-
 echo "== creating fragmented extents"
 fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"

@@ -9,7 +9,6 @@ LOG=340000
 LIM=1000000

 SEQF="%.20g"
-SXA="scoutfs.srch.test-srch-basic-functionality"

 t_require_commands touch rm setfattr scoutfs find_xattrs

@@ -28,20 +27,20 @@ diff_srch_find()

 echo "== create new xattrs"
 touch "$T_D0/"{create,update}
-setfattr -n $SXA -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
-diff_srch_find $SXA
+setfattr -n scoutfs.srch.test -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
+diff_srch_find scoutfs.srch.test

 echo "== update existing xattr"
-setfattr -n $SXA -v 2 "$T_D0/update" 2>&1 | t_filter_fs
-diff_srch_find $SXA
+setfattr -n scoutfs.srch.test -v 2 "$T_D0/update" 2>&1 | t_filter_fs
+diff_srch_find scoutfs.srch.test

 echo "== remove an xattr"
-setfattr -x $SXA "$T_D0/create" 2>&1 | t_filter_fs
-diff_srch_find $SXA
+setfattr -x scoutfs.srch.test "$T_D0/create" 2>&1 | t_filter_fs
+diff_srch_find scoutfs.srch.test

 echo "== remove xattr with files"
 rm -f "$T_D0/"{create,update}
-diff_srch_find $SXA
+diff_srch_find scoutfs.srch.test

 echo "== trigger small log merges by rotating single block with unmount"
 sv=$(t_server_nr)
@@ -57,7 +56,7 @@ while [ "$i" -lt "8" ]; do

 		eval path="\$T_D${nr}/single-block-$i"
 		touch "$path"
-		setfattr -n $SXA -v $i "$path"
+		setfattr -n scoutfs.srch.single-block-logs -v $i "$path"
 		t_umount $nr
 		t_mount $nr

@@ -66,51 +65,51 @@ while [ "$i" -lt "8" ]; do
 done
 # wait for srch compaction worker delay
 sleep 10
-find "$T_D0" -type f -name 'single-block-*' -delete
+rm -rf "$T_D0/single-block-*"

 echo "== create entries in current log"
 DIR="$T_D0/dir"
 NR=$((LOG / 4))
 mkdir -p "$DIR"
-seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
-diff_srch_find $SXA
+seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
+diff_srch_find scoutfs.srch.scoutfs_bcp

 echo "== delete small fraction"
-seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x $SXA
-diff_srch_find $SXA
+seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
+diff_srch_find scoutfs.srch.scoutfs_bcp

 echo "== remove files"
 rm -rf "$DIR"
-diff_srch_find $SXA
+diff_srch_find scoutfs.srch.scoutfs_bcp

 echo "== create entries that exceed one log"
 NR=$((LOG * 3 / 2))
 mkdir -p "$DIR"
-seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
-diff_srch_find $SXA
+seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
+diff_srch_find scoutfs.srch.scoutfs_bcp

 echo "== delete fractions in phases"
 for i in $(seq 1 3); do
-	seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x $SXA
-	diff_srch_find $SXA
+	seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
+	diff_srch_find scoutfs.srch.scoutfs_bcp
 done

 echo "== remove files"
 rm -rf "$DIR"
-diff_srch_find $SXA
+diff_srch_find scoutfs.srch.scoutfs_bcp

 echo "== create entries for exceed search entry limit"
 NR=$((LIM * 3 / 2))
 mkdir -p "$DIR"
-seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
-diff_srch_find $SXA
+seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
+diff_srch_find scoutfs.srch.scoutfs_bcp

 echo "== delete half"
-seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x $SXA
-diff_srch_find $SXA
+seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
+diff_srch_find scoutfs.srch.scoutfs_bcp

 echo "== entirely remove third batch"
 rm -rf "$DIR"
-diff_srch_find $SXA
+diff_srch_find scoutfs.srch.scoutfs_bcp

 t_pass
@@ -1,90 +0,0 @@
-#
-# There was a bug where srch file compaction could get stuck if a
-# partial compaction finished at the specific _SAFE_BYTES offset in a
-# block.  Resuming from that position would return an error and
-# compaction would stop making forward progress.
-#
-# We use triggers to pad the output of log compaction to end on the safe
-# offset and then cause compaction of those padded inputs to stop at the
-# safe offset.  Continuation will either succeed or return errors.  
-#
-
-# forcing rotation, so just a few
-NR=10
-SEQF="%.20g"
-COMPACT_NR=4
-
-echo "== initialize per-mount values"
-declare -a err
-declare -a compact_delay
-for nr in $(t_fs_nrs); do
-	err[$nr]=$(t_counter srch_compact_error $nr)
-	compact_delay[$nr]=$(cat $(t_sysfs_path $nr)/srch/compact_delay_ms)
-done
-restore_compact_delay()
-{
-	for nr in $(t_fs_nrs); do
-		echo ${compact_delay[$nr]} > $(t_sysfs_path $nr)/srch/compact_delay_ms
-	done
-}
-trap restore_compact_delay EXIT
-
-echo "== arm compaction triggers"
-for nr in $(t_fs_nrs); do
-	t_trigger_arm srch_compact_logs_pad_safe $nr
-	t_trigger_arm srch_merge_stop_safe $nr
-done
-
-echo "== compact more often"
-for nr in $(t_fs_nrs); do
-	echo 1000 > $(t_sysfs_path $nr)/srch/compact_delay_ms
-done
-
-echo "== create padded sorted inputs by forcing log rotation"
-sv=$(t_server_nr)
-for i in $(seq 1 $COMPACT_NR); do
-	for j in $(seq 1 $COMPACT_NR); do
-		t_trigger_arm srch_force_log_rotate $sv
-
-		seq -f "f-$i-$j-$SEQF" 1 10 | \
-			bulk_create_paths -X "scoutfs.srch.t-srch-safe-merge-pos" -d "$T_D0" > \
-			/dev/null
-		sync
-
-		test "$(t_trigger_get srch_force_log_rotate $sv)" == "0" || \
-			t_fail "srch_force_log_rotate didn't trigger"
-	done
-
-	padded=0
-	while test $padded == 0 && sleep .5; do
-		for nr in $(t_fs_nrs); do
-			if [ "$(t_trigger_get srch_compact_logs_pad_safe $nr)" == "0" ]; then
-				t_trigger_arm srch_compact_logs_pad_safe $nr
-				padded=1
-				break
-			fi
-			test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
-				t_fail "srch_compact_error counter increased on mount $nr"
-		done
-	done
-done
-
-echo "== compaction of padded should stop at safe"
-sleep 2
-for nr in $(t_fs_nrs); do
-	if [ "$(t_trigger_get srch_merge_stop_safe $nr)" == "0" ]; then
-		break
-	fi
-done
-
-echo "== verify no compaction errors"
-sleep 2
-for nr in $(t_fs_nrs); do
-	test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
-		t_fail "srch_compact_error counter increased on mount $nr"
-done
-
-echo "== cleanup"
-find "$T_D0" -type f -name 'f-*' -delete
-
-t_pass
@@ -18,9 +18,7 @@ BIN := src/scoutfs
 OBJ := $(patsubst %.c,%.o,$(wildcard src/*.c))
 DEPS := $(wildcard */*.d)

-AR := src/scoutfs_parallel_restore.a
-
-all: $(BIN) $(AR)
+all: $(BIN)

 ifneq ($(DEPS),)
 -include $(DEPS)
@@ -38,10 +36,6 @@ $(BIN): $(OBJ)
 	$(QU)  [BIN $@]
 	$(VE)gcc -o $@ $^ -luuid -lm -lcrypto -lblkid

-$(AR): $(OBJ)
-	$(QU)  [AR $@]
-	$(VE)ar rcs $@ $^
-
 %.o %.d: %.c Makefile sparse.sh
 	$(QU)  [CC $<]
 	$(VE)gcc $(CFLAGS) -MD -MP -MF $*.d -c $< -o $*.o
@@ -55,19 +55,6 @@ with initial sparse regions (perhaps by multiple threads writing to
 different regions) and wasted space isn't an issue (perhaps because the
 file population contains few small files).
 .TP
-.B log_merge_wait_timeout_ms=<number>
-This option sets the amount of time, in milliseconds, that log merge
-creation can wait before timing out.  This setting is per-mount, only
-changes the behavior of that mount, and only affects the server when it
-is running in that mount.
-.sp
-This determines how long it may take for mounts to synchronize
-committing their log trees to create a log merge operation.  Setting it
-too high can create long latencies in the event that a mount takes a
-long time to commit their log.  Setting it too low can result in the
-creation of excessive numbers of log trees that are never merged.  The
-default is 500 and it can not be less than 100 nor greater than 60000.
-.TP
 .B metadev_path=<device>
 The metadev_path option specifies the path to the block device that
 contains the filesystem's metadata.
@@ -54,8 +54,6 @@ cp man/*.8.gz $RPM_BUILD_ROOT%{_mandir}/man8/.
 install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs
 install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h
 install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h
-install -m 644 -D src/parallel_restore.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/parallel_restore.h
-install -m 644 -D src/scoutfs_parallel_restore.a $RPM_BUILD_ROOT%{_libdir}/scoutfs/libscoutfs_parallel_restore.a
 install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced
 install -m 644 -D fenced/scoutfs-fenced.service $RPM_BUILD_ROOT%{_unitdir}/scoutfs-fenced.service
 install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/scoutfs/scoutfs-fenced.conf.example
@@ -72,7 +70,6 @@ install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdi
 %files -n scoutfs-devel
 %defattr(644,root,root,755)
 %{_includedir}/scoutfs
-%{_libdir}/scoutfs

 %clean
 rm -rf %{buildroot}
@@ -1,20 +0,0 @@
-#include <errno.h>
-
-#include "sparse.h"
-#include "util.h"
-#include "format.h"
-#include "hash.h"
-#include "bloom.h"
-
-void calc_bloom_nrs(struct scoutfs_key *key, unsigned int *nrs)
-{
-	u64 hash;
-	int i;
-
-	hash = scoutfs_hash64(key, sizeof(struct scoutfs_key));
-
-	for (i = 0; i < SCOUTFS_FOREST_BLOOM_NRS; i++) {
-		nrs[i] = (u32)hash % SCOUTFS_FOREST_BLOOM_BITS;
-		hash >>= SCOUTFS_FOREST_BLOOM_FUNC_BITS;
-	}
-}
@@ -1,6 +0,0 @@
-#ifndef _BLOOM_H_
-#define _BLOOM_H_
-
-void calc_bloom_nrs(struct scoutfs_key *key, unsigned int *nrs);
-
-#endif
@@ -8,7 +8,7 @@
 #include "leaf_item_hash.h"
 #include "btree.h"

-void btree_init_block(struct scoutfs_btree_block *bt, int level)
+static void init_block(struct scoutfs_btree_block *bt, int level)
 {
 	int free;

@@ -33,7 +33,7 @@ void btree_init_root_single(struct scoutfs_btree_root *root,

 	memset(bt, 0, SCOUTFS_BLOCK_LG_SIZE);

-	btree_init_block(bt, 0);
+	init_block(bt, 0);
 }

 static void *alloc_val(struct scoutfs_btree_block *bt, int len)
@@ -1,7 +1,6 @@
 #ifndef _BTREE_H_
 #define _BTREE_H_

-void btree_init_block(struct scoutfs_btree_block *bt, int level);
 void btree_init_root_single(struct scoutfs_btree_root *root,
 			    struct scoutfs_btree_block *bt,
 			    u64 seq, u64 blkno);
@@ -1,24 +0,0 @@
-#ifndef _LK_RBTREE_WRAPPER_H_
-#define _LK_RBTREE_WRAPPER_H_
-
-/*
- * We're using this lame hack to build and use the kernel's rbtree in
- * userspace.  We drop the kernel's rbtree*[ch] implementation in and
- * use them with this wrapper.  We only have to remove the kernel
- * includes from the imported files.
- */
-
-#include <stdbool.h>
-#include "util.h"
-
-#define rcu_assign_pointer(a, b)	do { a = b; } while (0)
-#define READ_ONCE(a)			({ a; })
-#define WRITE_ONCE(a, b)		do { a = b; } while (0)
-#define unlikely(a)			({ a; })
-#define EXPORT_SYMBOL(a)		/* nop */
-
-#include "rbtree_types.h"
-#include "rbtree.h"
-#include "rbtree_augmented.h"
-
-#endif
@@ -1,24 +0,0 @@
-#include <unistd.h>
-#include <sys/stat.h>
-
-#include "sparse.h"
-#include "util.h"
-#include "format.h"
-#include "mode_types.h"
-
-unsigned int mode_to_type(mode_t mode)
-{
-#define S_SHIFT 12
-	static unsigned char mode_types[S_IFMT >> S_SHIFT] = {
-		[S_IFIFO >> S_SHIFT]	= SCOUTFS_DT_FIFO,
-		[S_IFCHR >> S_SHIFT]	= SCOUTFS_DT_CHR,
-		[S_IFDIR >> S_SHIFT]	= SCOUTFS_DT_DIR,
-		[S_IFBLK >> S_SHIFT]	= SCOUTFS_DT_BLK,
-		[S_IFREG >> S_SHIFT]	= SCOUTFS_DT_REG,
-		[S_IFLNK >> S_SHIFT]	= SCOUTFS_DT_LNK,
-		[S_IFSOCK >> S_SHIFT]	= SCOUTFS_DT_SOCK,
-	};
-
-	return mode_types[(mode & S_IFMT) >> S_SHIFT];
-#undef S_SHIFT
-}
@@ -1,6 +0,0 @@
-#ifndef _MODE_TYPES_H_
-#define _MODE_TYPES_H_
-
-unsigned int mode_to_type(mode_t mode);
-
-#endif
@@ -1,46 +0,0 @@
-#ifndef _SCOUTFS_NAME_HASH_H_
-#define _SCOUTFS_NAME_HASH_H_
-
-#include "hash.h"
-
-/*
- * Test a bit number as though an array of bytes is a large len-bit
- * big-endian value.  nr 0 is the LSB of the final byte, nr (len - 1) is
- * the MSB of the first byte.
- */
-static int test_be_bytes_bit(int nr, const char *bytes, int len)
-{
-	return bytes[(len - 1 - nr) >> 3] & (1 << (nr & 7));
-}
-
-/*
- * Generate a 32bit "fingerprint" of the name by extracting 32 evenly
- * distributed bits from the name.  The intent is to have the sort order
- * of the fingerprints reflect the memcmp() sort order of the names
- * while mapping large names down to small fs keys.
- *
- * Names that are smaller than 32bits are biased towards the high bits
- * of the fingerprint so that most significant bits of the fingerprints
- * consistently reflect the initial characters of the names.
- */
-static inline u32 dirent_name_fingerprint(const char *name, unsigned int name_len)
-{
-	int name_bits = name_len * 8;
-	int skip = max(name_bits / 32, 1);
-	u32 fp = 0;
-	int f;
-	int n;
-
-	for (f = 31, n = name_bits - 1; f >= 0 && n >= 0; f--, n -= skip)
-		fp |= !!test_be_bytes_bit(n, name, name_bits) << f;
-
-	return fp;
-}
-
-static inline u64 dirent_name_hash(const char *name, unsigned int name_len)
-{
-       return scoutfs_hash32(name, name_len) |
-              ((u64)dirent_name_fingerprint(name, name_len) << 32);
-}
-
-#endif
@@ -1,103 +0,0 @@
-#ifndef _SCOUTFS_PARALLEL_RESTORE_H_
-#define _SCOUTFS_PARALLEL_RESTORE_H_
-
-#include <errno.h>
-
-struct scoutfs_parallel_restore_progress {
-	struct scoutfs_btree_root fs_items;
-	struct scoutfs_btree_root root_items;
-	struct scoutfs_srch_file sfl;
-	struct scoutfs_block_ref bloom_ref;
-	__le64 inode_count;
-	__le64 max_ino;
-};
-
-struct scoutfs_parallel_restore_slice {
-	__le64 fsid;
-	__le64 meta_start;
-	__le64 meta_len;
-};
-
-struct scoutfs_parallel_restore_entry {
-	u64 dir_ino;
-	u64 pos;
-	u64 ino;
-	mode_t mode;
-	char *name;
-	unsigned int name_len;
-};
-
-struct scoutfs_parallel_restore_xattr {
-	u64 ino;
-	u64 pos;
-	char *name;
-	unsigned int name_len;
-	void *value;
-	unsigned int value_len;
-};
-
-struct scoutfs_parallel_restore_inode {
-	/* all inodes */
-	u64 ino;
-	u64 nr_xattrs;
-	u32 uid;
-	u32 gid;
-	u32 mode;
-	u32 rdev;
-	struct timespec atime;
-	struct timespec ctime;
-	struct timespec mtime;
-	struct timespec crtime;
-
-	/* regular files */
-	u64 data_version;
-	u64 size;
-	bool offline;
-
-	/* only used for directories */
-	u64 nr_subdirs;
-	u64 total_entry_name_bytes;
-
-	/* only used for symlnks */
-	char *target;
-	unsigned int target_len; /* not including null terminator */
-};
-
-typedef __typeof__(EINVAL) spr_err_t;
-
-struct scoutfs_parallel_restore_writer;
-
-spr_err_t scoutfs_parallel_restore_create_writer(struct scoutfs_parallel_restore_writer **wrip);
-void scoutfs_parallel_restore_destroy_writer(struct scoutfs_parallel_restore_writer **wrip);
-
-spr_err_t scoutfs_parallel_restore_init_slices(struct scoutfs_parallel_restore_writer *wri,
-					       struct scoutfs_parallel_restore_slice *slices,
-					       int nr);
-spr_err_t scoutfs_parallel_restore_add_slice(struct scoutfs_parallel_restore_writer *wri,
-					    struct scoutfs_parallel_restore_slice *slice);
-spr_err_t scoutfs_parallel_restore_get_slice(struct scoutfs_parallel_restore_writer *wri,
-					    struct scoutfs_parallel_restore_slice *slice);
-
-spr_err_t scoutfs_parallel_restore_add_inode(struct scoutfs_parallel_restore_writer *wri,
-					     struct scoutfs_parallel_restore_inode *inode);
-spr_err_t scoutfs_parallel_restore_add_entry(struct scoutfs_parallel_restore_writer *wri,
-					     struct scoutfs_parallel_restore_entry *entry);
-spr_err_t scoutfs_parallel_restore_add_xattr(struct scoutfs_parallel_restore_writer *wri,
-					     struct scoutfs_parallel_restore_xattr *xattr);
-
-spr_err_t scoutfs_parallel_restore_get_progress(struct scoutfs_parallel_restore_writer *wri,
-						struct scoutfs_parallel_restore_progress *prog);
-spr_err_t scoutfs_parallel_restore_add_progress(struct scoutfs_parallel_restore_writer *wri,
-						struct scoutfs_parallel_restore_progress *prog);
-
-spr_err_t scoutfs_parallel_restore_write_buf(struct scoutfs_parallel_restore_writer *wri,
-					     void *buf, size_t len, off_t *off_ret,
-					     size_t *count_ret);
-
-spr_err_t scoutfs_parallel_restore_import_super(struct scoutfs_parallel_restore_writer *wri,
-						struct scoutfs_super_block *super);
-spr_err_t scoutfs_parallel_restore_export_super(struct scoutfs_parallel_restore_writer *wri,
-						struct scoutfs_super_block *super);
-
-
-#endif
@@ -38,6 +38,7 @@ struct prepare_empty_data_dev_args {
 	char *meta_device;
 	char *data_device;
 	bool check;
+	bool force;
 };

 static int do_prepare_empty_data_dev(struct prepare_empty_data_dev_args *args)
@@ -77,20 +78,22 @@ static int do_prepare_empty_data_dev(struct prepare_empty_data_dev_args *args)
 		goto out;
 	}

-	ret = meta_super_in_use(meta_fd, meta_super);
-	if (ret < 0) {
-		if (ret == -EBUSY)
-			fprintf(stderr, "The filesystem must be fully recovered and cleanly unmounted to determine if the data device is empty.\n");
-		goto out;
-	}
+	if (!args->force) {
+		ret = meta_super_in_use(meta_fd, meta_super);
+		if (ret < 0) {
+			if (ret == -EBUSY)
+				fprintf(stderr, "The filesystem must be fully recovered and cleanly unmounted to determine if the data device is empty.\n");
+			goto out;
+		}

-	in_use = (le64_to_cpu(meta_super->total_data_blocks) - SCOUTFS_DATA_DEV_START_BLKNO) -
-		 le64_to_cpu(meta_super->data_alloc.total_len);
-	if (in_use) {
-		fprintf(stderr, "Data block allocator metadata shows "SIZE_FMT" data blocks used by files.  They must be removed, truncated, or released before a new empty data device can be used.\n",
-		       SIZE_ARGS(in_use, SCOUTFS_BLOCK_SM_SIZE));
-		ret = -EINVAL;
-		goto out;
+		in_use = (le64_to_cpu(meta_super->total_data_blocks) - SCOUTFS_DATA_DEV_START_BLKNO) -
+			 le64_to_cpu(meta_super->data_alloc.total_len);
+		if (in_use) {
+			fprintf(stderr, "Data block allocator metadata shows "SIZE_FMT" data blocks used by files.  They must be removed, truncated, or released before a new empty data device can be used.\n",
+			       SIZE_ARGS(in_use, SCOUTFS_BLOCK_SM_SIZE));
+			ret = -EINVAL;
+			goto out;
+		}
 	}

 	if (args->data_device) {
@@ -193,6 +196,9 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
 	case 'c':
 		args->check = true;
 		break;
+	case 'f':
+		args->force = true;
+		break;
 	case ARGP_KEY_ARG:
 		if (!args->meta_device)
 			args->meta_device = strdup_or_error(state, arg);
@@ -216,6 +222,7 @@ static int parse_opt(int key, char *arg, struct argp_state *state)

 static struct argp_option options[] = {
 	{ "check", 'c', NULL, 0, "Only check for errors and do not write", },
+	{ "force", 'f', NULL, 0, "Do not check that super is in use, nor if blocks are in use",},
 	{ NULL }
 };

@@ -230,6 +237,7 @@ static int prepare_empty_data_dev_cmd(int argc, char *argv[])
 {
 	struct prepare_empty_data_dev_args prepare_empty_data_dev_args = { 
 		.check = false,
+		.force = false,
 	};
 	int ret;

@@ -1,629 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
-  Red Black Trees
-  (C) 1999  Andrea Arcangeli <andrea@suse.de>
-  (C) 2002  David Woodhouse <dwmw2@infradead.org>
-  (C) 2012  Michel Lespinasse <walken@google.com>
-
-
-  linux/lib/rbtree.c
-*/
-
-#include "lk_rbtree_wrapper.h"
-
-/*
- * red-black trees properties:  https://en.wikipedia.org/wiki/Rbtree
- *
- *  1) A node is either red or black
- *  2) The root is black
- *  3) All leaves (NULL) are black
- *  4) Both children of every red node are black
- *  5) Every simple path from root to leaves contains the same number
- *     of black nodes.
- *
- *  4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
- *  consecutive red nodes in a path and every red node is therefore followed by
- *  a black. So if B is the number of black nodes on every simple path (as per
- *  5), then the longest possible path due to 4 is 2B.
- *
- *  We shall indicate color with case, where black nodes are uppercase and red
- *  nodes will be lowercase. Unknown color nodes shall be drawn as red within
- *  parentheses and have some accompanying text comment.
- */
-
-/*
- * Notes on lockless lookups:
- *
- * All stores to the tree structure (rb_left and rb_right) must be done using
- * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
- * tree structure as seen in program order.
- *
- * These two requirements will allow lockless iteration of the tree -- not
- * correct iteration mind you, tree rotations are not atomic so a lookup might
- * miss entire subtrees.
- *
- * But they do guarantee that any such traversal will only see valid elements
- * and that it will indeed complete -- does not get stuck in a loop.
- *
- * It also guarantees that if the lookup returns an element it is the 'correct'
- * one. But not returning an element does _NOT_ mean it's not present.
- *
- * NOTE:
- *
- * Stores to __rb_parent_color are not important for simple lookups so those
- * are left undone as of now. Nor did I check for loops involving parent
- * pointers.
- */
-
-static inline void rb_set_black(struct rb_node *rb)
-{
-	rb->__rb_parent_color |= RB_BLACK;
-}
-
-static inline struct rb_node *rb_red_parent(struct rb_node *red)
-{
-	return (struct rb_node *)red->__rb_parent_color;
-}
-
-/*
- * Helper function for rotations:
- * - old's parent and color get assigned to new
- * - old gets assigned new as a parent and 'color' as a color.
- */
-static inline void
-__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
-			struct rb_root *root, int color)
-{
-	struct rb_node *parent = rb_parent(old);
-	new->__rb_parent_color = old->__rb_parent_color;
-	rb_set_parent_color(old, new, color);
-	__rb_change_child(old, new, parent, root);
-}
-
-static __always_inline void
-__rb_insert(struct rb_node *node, struct rb_root *root,
-	    void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
-{
-	struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
-
-	while (true) {
-		/*
-		 * Loop invariant: node is red.
-		 */
-		if (unlikely(!parent)) {
-			/*
-			 * The inserted node is root. Either this is the
-			 * first node, or we recursed at Case 1 below and
-			 * are no longer violating 4).
-			 */
-			rb_set_parent_color(node, NULL, RB_BLACK);
-			break;
-		}
-
-		/*
-		 * If there is a black parent, we are done.
-		 * Otherwise, take some corrective action as,
-		 * per 4), we don't want a red root or two
-		 * consecutive red nodes.
-		 */
-		if(rb_is_black(parent))
-			break;
-
-		gparent = rb_red_parent(parent);
-
-		tmp = gparent->rb_right;
-		if (parent != tmp) {	/* parent == gparent->rb_left */
-			if (tmp && rb_is_red(tmp)) {
-				/*
-				 * Case 1 - node's uncle is red (color flips).
-				 *
-				 *       G            g
-				 *      / \          / \
-				 *     p   u  -->   P   U
-				 *    /            /
-				 *   n            n
-				 *
-				 * However, since g's parent might be red, and
-				 * 4) does not allow this, we need to recurse
-				 * at g.
-				 */
-				rb_set_parent_color(tmp, gparent, RB_BLACK);
-				rb_set_parent_color(parent, gparent, RB_BLACK);
-				node = gparent;
-				parent = rb_parent(node);
-				rb_set_parent_color(node, parent, RB_RED);
-				continue;
-			}
-
-			tmp = parent->rb_right;
-			if (node == tmp) {
-				/*
-				 * Case 2 - node's uncle is black and node is
-				 * the parent's right child (left rotate at parent).
-				 *
-				 *      G             G
-				 *     / \           / \
-				 *    p   U  -->    n   U
-				 *     \           /
-				 *      n         p
-				 *
-				 * This still leaves us in violation of 4), the
-				 * continuation into Case 3 will fix that.
-				 */
-				tmp = node->rb_left;
-				WRITE_ONCE(parent->rb_right, tmp);
-				WRITE_ONCE(node->rb_left, parent);
-				if (tmp)
-					rb_set_parent_color(tmp, parent,
-							    RB_BLACK);
-				rb_set_parent_color(parent, node, RB_RED);
-				augment_rotate(parent, node);
-				parent = node;
-				tmp = node->rb_right;
-			}
-
-			/*
-			 * Case 3 - node's uncle is black and node is
-			 * the parent's left child (right rotate at gparent).
-			 *
-			 *        G           P
-			 *       / \         / \
-			 *      p   U  -->  n   g
-			 *     /                 \
-			 *    n                   U
-			 */
-			WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
-			WRITE_ONCE(parent->rb_right, gparent);
-			if (tmp)
-				rb_set_parent_color(tmp, gparent, RB_BLACK);
-			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
-			augment_rotate(gparent, parent);
-			break;
-		} else {
-			tmp = gparent->rb_left;
-			if (tmp && rb_is_red(tmp)) {
-				/* Case 1 - color flips */
-				rb_set_parent_color(tmp, gparent, RB_BLACK);
-				rb_set_parent_color(parent, gparent, RB_BLACK);
-				node = gparent;
-				parent = rb_parent(node);
-				rb_set_parent_color(node, parent, RB_RED);
-				continue;
-			}
-
-			tmp = parent->rb_left;
-			if (node == tmp) {
-				/* Case 2 - right rotate at parent */
-				tmp = node->rb_right;
-				WRITE_ONCE(parent->rb_left, tmp);
-				WRITE_ONCE(node->rb_right, parent);
-				if (tmp)
-					rb_set_parent_color(tmp, parent,
-							    RB_BLACK);
-				rb_set_parent_color(parent, node, RB_RED);
-				augment_rotate(parent, node);
-				parent = node;
-				tmp = node->rb_left;
-			}
-
-			/* Case 3 - left rotate at gparent */
-			WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
-			WRITE_ONCE(parent->rb_left, gparent);
-			if (tmp)
-				rb_set_parent_color(tmp, gparent, RB_BLACK);
-			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
-			augment_rotate(gparent, parent);
-			break;
-		}
-	}
-}
-
-/*
- * Inline version for rb_erase() use - we want to be able to inline
- * and eliminate the dummy_rotate callback there
- */
-static __always_inline void
-____rb_erase_color(struct rb_node *parent, struct rb_root *root,
-	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
-{
-	struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;
-
-	while (true) {
-		/*
-		 * Loop invariants:
-		 * - node is black (or NULL on first iteration)
-		 * - node is not the root (parent is not NULL)
-		 * - All leaf paths going through parent and node have a
-		 *   black node count that is 1 lower than other leaf paths.
-		 */
-		sibling = parent->rb_right;
-		if (node != sibling) {	/* node == parent->rb_left */
-			if (rb_is_red(sibling)) {
-				/*
-				 * Case 1 - left rotate at parent
-				 *
-				 *     P               S
-				 *    / \             / \
-				 *   N   s    -->    p   Sr
-				 *      / \         / \
-				 *     Sl  Sr      N   Sl
-				 */
-				tmp1 = sibling->rb_left;
-				WRITE_ONCE(parent->rb_right, tmp1);
-				WRITE_ONCE(sibling->rb_left, parent);
-				rb_set_parent_color(tmp1, parent, RB_BLACK);
-				__rb_rotate_set_parents(parent, sibling, root,
-							RB_RED);
-				augment_rotate(parent, sibling);
-				sibling = tmp1;
-			}
-			tmp1 = sibling->rb_right;
-			if (!tmp1 || rb_is_black(tmp1)) {
-				tmp2 = sibling->rb_left;
-				if (!tmp2 || rb_is_black(tmp2)) {
-					/*
-					 * Case 2 - sibling color flip
-					 * (p could be either color here)
-					 *
-					 *    (p)           (p)
-					 *    / \           / \
-					 *   N   S    -->  N   s
-					 *      / \           / \
-					 *     Sl  Sr        Sl  Sr
-					 *
-					 * This leaves us violating 5) which
-					 * can be fixed by flipping p to black
-					 * if it was red, or by recursing at p.
-					 * p is red when coming from Case 1.
-					 */
-					rb_set_parent_color(sibling, parent,
-							    RB_RED);
-					if (rb_is_red(parent))
-						rb_set_black(parent);
-					else {
-						node = parent;
-						parent = rb_parent(node);
-						if (parent)
-							continue;
-					}
-					break;
-				}
-				/*
-				 * Case 3 - right rotate at sibling
-				 * (p could be either color here)
-				 *
-				 *   (p)           (p)
-				 *   / \           / \
-				 *  N   S    -->  N   sl
-				 *     / \             \
-				 *    sl  Sr            S
-				 *                       \
-				 *                        Sr
-				 *
-				 * Note: p might be red, and then both
-				 * p and sl are red after rotation(which
-				 * breaks property 4). This is fixed in
-				 * Case 4 (in __rb_rotate_set_parents()
-				 *         which set sl the color of p
-				 *         and set p RB_BLACK)
-				 *
-				 *   (p)            (sl)
-				 *   / \            /  \
-				 *  N   sl   -->   P    S
-				 *       \        /      \
-				 *        S      N        Sr
-				 *         \
-				 *          Sr
-				 */
-				tmp1 = tmp2->rb_right;
-				WRITE_ONCE(sibling->rb_left, tmp1);
-				WRITE_ONCE(tmp2->rb_right, sibling);
-				WRITE_ONCE(parent->rb_right, tmp2);
-				if (tmp1)
-					rb_set_parent_color(tmp1, sibling,
-							    RB_BLACK);
-				augment_rotate(sibling, tmp2);
-				tmp1 = sibling;
-				sibling = tmp2;
-			}
-			/*
-			 * Case 4 - left rotate at parent + color flips
-			 * (p and sl could be either color here.
-			 *  After rotation, p becomes black, s acquires
-			 *  p's color, and sl keeps its color)
-			 *
-			 *      (p)             (s)
-			 *      / \             / \
-			 *     N   S     -->   P   Sr
-			 *        / \         / \
-			 *      (sl) sr      N  (sl)
-			 */
-			tmp2 = sibling->rb_left;
-			WRITE_ONCE(parent->rb_right, tmp2);
-			WRITE_ONCE(sibling->rb_left, parent);
-			rb_set_parent_color(tmp1, sibling, RB_BLACK);
-			if (tmp2)
-				rb_set_parent(tmp2, parent);
-			__rb_rotate_set_parents(parent, sibling, root,
-						RB_BLACK);
-			augment_rotate(parent, sibling);
-			break;
-		} else {
-			sibling = parent->rb_left;
-			if (rb_is_red(sibling)) {
-				/* Case 1 - right rotate at parent */
-				tmp1 = sibling->rb_right;
-				WRITE_ONCE(parent->rb_left, tmp1);
-				WRITE_ONCE(sibling->rb_right, parent);
-				rb_set_parent_color(tmp1, parent, RB_BLACK);
-				__rb_rotate_set_parents(parent, sibling, root,
-							RB_RED);
-				augment_rotate(parent, sibling);
-				sibling = tmp1;
-			}
-			tmp1 = sibling->rb_left;
-			if (!tmp1 || rb_is_black(tmp1)) {
-				tmp2 = sibling->rb_right;
-				if (!tmp2 || rb_is_black(tmp2)) {
-					/* Case 2 - sibling color flip */
-					rb_set_parent_color(sibling, parent,
-							    RB_RED);
-					if (rb_is_red(parent))
-						rb_set_black(parent);
-					else {
-						node = parent;
-						parent = rb_parent(node);
-						if (parent)
-							continue;
-					}
-					break;
-				}
-				/* Case 3 - left rotate at sibling */
-				tmp1 = tmp2->rb_left;
-				WRITE_ONCE(sibling->rb_right, tmp1);
-				WRITE_ONCE(tmp2->rb_left, sibling);
-				WRITE_ONCE(parent->rb_left, tmp2);
-				if (tmp1)
-					rb_set_parent_color(tmp1, sibling,
-							    RB_BLACK);
-				augment_rotate(sibling, tmp2);
-				tmp1 = sibling;
-				sibling = tmp2;
-			}
-			/* Case 4 - right rotate at parent + color flips */
-			tmp2 = sibling->rb_right;
-			WRITE_ONCE(parent->rb_left, tmp2);
-			WRITE_ONCE(sibling->rb_right, parent);
-			rb_set_parent_color(tmp1, sibling, RB_BLACK);
-			if (tmp2)
-				rb_set_parent(tmp2, parent);
-			__rb_rotate_set_parents(parent, sibling, root,
-						RB_BLACK);
-			augment_rotate(parent, sibling);
-			break;
-		}
-	}
-}
-
-/* Non-inline version for rb_erase_augmented() use */
-void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
-	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
-{
-	____rb_erase_color(parent, root, augment_rotate);
-}
-EXPORT_SYMBOL(__rb_erase_color);
-
-/*
- * Non-augmented rbtree manipulation functions.
- *
- * We use dummy augmented callbacks here, and have the compiler optimize them
- * out of the rb_insert_color() and rb_erase() function definitions.
- */
-
-static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
-static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
-static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
-
-static const struct rb_augment_callbacks dummy_callbacks = {
-	.propagate = dummy_propagate,
-	.copy = dummy_copy,
-	.rotate = dummy_rotate
-};
-
-void rb_insert_color(struct rb_node *node, struct rb_root *root)
-{
-	__rb_insert(node, root, dummy_rotate);
-}
-EXPORT_SYMBOL(rb_insert_color);
-
-void rb_erase(struct rb_node *node, struct rb_root *root)
-{
-	struct rb_node *rebalance;
-	rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
-	if (rebalance)
-		____rb_erase_color(rebalance, root, dummy_rotate);
-}
-EXPORT_SYMBOL(rb_erase);
-
-/*
- * Augmented rbtree manipulation functions.
- *
- * This instantiates the same __always_inline functions as in the non-augmented
- * case, but this time with user-defined callbacks.
- */
-
-void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
-	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
-{
-	__rb_insert(node, root, augment_rotate);
-}
-EXPORT_SYMBOL(__rb_insert_augmented);
-
-/*
- * This function returns the first node (in sort order) of the tree.
- */
-struct rb_node *rb_first(const struct rb_root *root)
-{
-	struct rb_node	*n;
-
-	n = root->rb_node;
-	if (!n)
-		return NULL;
-	while (n->rb_left)
-		n = n->rb_left;
-	return n;
-}
-EXPORT_SYMBOL(rb_first);
-
-struct rb_node *rb_last(const struct rb_root *root)
-{
-	struct rb_node	*n;
-
-	n = root->rb_node;
-	if (!n)
-		return NULL;
-	while (n->rb_right)
-		n = n->rb_right;
-	return n;
-}
-EXPORT_SYMBOL(rb_last);
-
-struct rb_node *rb_next(const struct rb_node *node)
-{
-	struct rb_node *parent;
-
-	if (RB_EMPTY_NODE(node))
-		return NULL;
-
-	/*
-	 * If we have a right-hand child, go down and then left as far
-	 * as we can.
-	 */
-	if (node->rb_right) {
-		node = node->rb_right;
-		while (node->rb_left)
-			node = node->rb_left;
-		return (struct rb_node *)node;
-	}
-
-	/*
-	 * No right-hand children. Everything down and left is smaller than us,
-	 * so any 'next' node must be in the general direction of our parent.
-	 * Go up the tree; any time the ancestor is a right-hand child of its
-	 * parent, keep going up. First time it's a left-hand child of its
-	 * parent, said parent is our 'next' node.
-	 */
-	while ((parent = rb_parent(node)) && node == parent->rb_right)
-		node = parent;
-
-	return parent;
-}
-EXPORT_SYMBOL(rb_next);
-
-struct rb_node *rb_prev(const struct rb_node *node)
-{
-	struct rb_node *parent;
-
-	if (RB_EMPTY_NODE(node))
-		return NULL;
-
-	/*
-	 * If we have a left-hand child, go down and then right as far
-	 * as we can.
-	 */
-	if (node->rb_left) {
-		node = node->rb_left;
-		while (node->rb_right)
-			node = node->rb_right;
-		return (struct rb_node *)node;
-	}
-
-	/*
-	 * No left-hand children. Go up till we find an ancestor which
-	 * is a right-hand child of its parent.
-	 */
-	while ((parent = rb_parent(node)) && node == parent->rb_left)
-		node = parent;
-
-	return parent;
-}
-EXPORT_SYMBOL(rb_prev);
-
-void rb_replace_node(struct rb_node *victim, struct rb_node *new,
-		     struct rb_root *root)
-{
-	struct rb_node *parent = rb_parent(victim);
-
-	/* Copy the pointers/colour from the victim to the replacement */
-	*new = *victim;
-
-	/* Set the surrounding nodes to point to the replacement */
-	if (victim->rb_left)
-		rb_set_parent(victim->rb_left, new);
-	if (victim->rb_right)
-		rb_set_parent(victim->rb_right, new);
-	__rb_change_child(victim, new, parent, root);
-}
-EXPORT_SYMBOL(rb_replace_node);
-
-void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
-			 struct rb_root *root)
-{
-	struct rb_node *parent = rb_parent(victim);
-
-	/* Copy the pointers/colour from the victim to the replacement */
-	*new = *victim;
-
-	/* Set the surrounding nodes to point to the replacement */
-	if (victim->rb_left)
-		rb_set_parent(victim->rb_left, new);
-	if (victim->rb_right)
-		rb_set_parent(victim->rb_right, new);
-
-	/* Set the parent's pointer to the new node last after an RCU barrier
-	 * so that the pointers onwards are seen to be set correctly when doing
-	 * an RCU walk over the tree.
-	 */
-	__rb_change_child_rcu(victim, new, parent, root);
-}
-EXPORT_SYMBOL(rb_replace_node_rcu);
-
-static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
-{
-	for (;;) {
-		if (node->rb_left)
-			node = node->rb_left;
-		else if (node->rb_right)
-			node = node->rb_right;
-		else
-			return (struct rb_node *)node;
-	}
-}
-
-struct rb_node *rb_next_postorder(const struct rb_node *node)
-{
-	const struct rb_node *parent;
-	if (!node)
-		return NULL;
-	parent = rb_parent(node);
-
-	/* If we're sitting on node, we've already seen our children */
-	if (parent && node == parent->rb_left && parent->rb_right) {
-		/* If we are the parent's left node, go to the parent's right
-		 * node then all the way down to the left */
-		return rb_left_deepest_node(parent->rb_right);
-	} else
-		/* Otherwise we are the parent's right node, and the parent
-		 * should be next */
-		return (struct rb_node *)parent;
-}
-EXPORT_SYMBOL(rb_next_postorder);
-
-struct rb_node *rb_first_postorder(const struct rb_root *root)
-{
-	if (!root->rb_node)
-		return NULL;
-
-	return rb_left_deepest_node(root->rb_node);
-}
-EXPORT_SYMBOL(rb_first_postorder);
@@ -1,328 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
-  Red Black Trees
-  (C) 1999  Andrea Arcangeli <andrea@suse.de>
-  
-
-  linux/include/linux/rbtree.h
-
-  To use rbtrees you'll have to implement your own insert and search cores.
-  This will avoid us to use callbacks and to drop drammatically performances.
-  I know it's not the cleaner way,  but in C (not in C++) to get
-  performances and genericity...
-
-  See Documentation/core-api/rbtree.rst for documentation and samples.
-*/
-
-#ifndef	_LINUX_RBTREE_H
-#define	_LINUX_RBTREE_H
-
-#define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))
-
-#define	rb_entry(ptr, type, member) container_of(ptr, type, member)
-
-#define RB_EMPTY_ROOT(root)  (READ_ONCE((root)->rb_node) == NULL)
-
-/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
-#define RB_EMPTY_NODE(node)  \
-	((node)->__rb_parent_color == (unsigned long)(node))
-#define RB_CLEAR_NODE(node)  \
-	((node)->__rb_parent_color = (unsigned long)(node))
-
-
-extern void rb_insert_color(struct rb_node *, struct rb_root *);
-extern void rb_erase(struct rb_node *, struct rb_root *);
-
-
-/* Find logical next and previous nodes in a tree */
-extern struct rb_node *rb_next(const struct rb_node *);
-extern struct rb_node *rb_prev(const struct rb_node *);
-extern struct rb_node *rb_first(const struct rb_root *);
-extern struct rb_node *rb_last(const struct rb_root *);
-
-/* Postorder iteration - always visit the parent after its children */
-extern struct rb_node *rb_first_postorder(const struct rb_root *);
-extern struct rb_node *rb_next_postorder(const struct rb_node *);
-
-/* Fast replacement of a single node without remove/rebalance/add/rebalance */
-extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
-			    struct rb_root *root);
-extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
-				struct rb_root *root);
-
-static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
-				struct rb_node **rb_link)
-{
-	node->__rb_parent_color = (unsigned long)parent;
-	node->rb_left = node->rb_right = NULL;
-
-	*rb_link = node;
-}
-
-static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
-				    struct rb_node **rb_link)
-{
-	node->__rb_parent_color = (unsigned long)parent;
-	node->rb_left = node->rb_right = NULL;
-
-	rcu_assign_pointer(*rb_link, node);
-}
-
-#define rb_entry_safe(ptr, type, member) \
-	({ typeof(ptr) ____ptr = (ptr); \
-	   ____ptr ? rb_entry(____ptr, type, member) : NULL; \
-	})
-
-/**
- * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
- * given type allowing the backing memory of @pos to be invalidated
- *
- * @pos:	the 'type *' to use as a loop cursor.
- * @n:		another 'type *' to use as temporary storage
- * @root:	'rb_root *' of the rbtree.
- * @field:	the name of the rb_node field within 'type'.
- *
- * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
- * list_for_each_entry_safe() and allows the iteration to continue independent
- * of changes to @pos by the body of the loop.
- *
- * Note, however, that it cannot handle other modifications that re-order the
- * rbtree it is iterating over. This includes calling rb_erase() on @pos, as
- * rb_erase() may rebalance the tree, causing us to miss some nodes.
- */
-#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
-	for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
-	     pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
-			typeof(*pos), field); 1; }); \
-	     pos = n)
-
-/* Same as rb_first(), but O(1) */
-#define rb_first_cached(root) (root)->rb_leftmost
-
-static inline void rb_insert_color_cached(struct rb_node *node,
-					  struct rb_root_cached *root,
-					  bool leftmost)
-{
-	if (leftmost)
-		root->rb_leftmost = node;
-	rb_insert_color(node, &root->rb_root);
-}
-
-
-static inline struct rb_node *
-rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
-{
-	struct rb_node *leftmost = NULL;
-
-	if (root->rb_leftmost == node)
-		leftmost = root->rb_leftmost = rb_next(node);
-
-	rb_erase(node, &root->rb_root);
-
-	return leftmost;
-}
-
-static inline void rb_replace_node_cached(struct rb_node *victim,
-					  struct rb_node *new,
-					  struct rb_root_cached *root)
-{
-	if (root->rb_leftmost == victim)
-		root->rb_leftmost = new;
-	rb_replace_node(victim, new, &root->rb_root);
-}
-
-/*
- * The below helper functions use 2 operators with 3 different
- * calling conventions. The operators are related like:
- *
- *	comp(a->key,b) < 0  := less(a,b)
- *	comp(a->key,b) > 0  := less(b,a)
- *	comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
- *
- * If these operators define a partial order on the elements we make no
- * guarantee on which of the elements matching the key is found. See
- * rb_find().
- *
- * The reason for this is to allow the find() interface without requiring an
- * on-stack dummy object, which might not be feasible due to object size.
- */
-
-/**
- * rb_add_cached() - insert @node into the leftmost cached tree @tree
- * @node: node to insert
- * @tree: leftmost cached tree to insert @node into
- * @less: operator defining the (partial) node order
- *
- * Returns @node when it is the new leftmost, or NULL.
- */
-static __always_inline struct rb_node *
-rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
-	      bool (*less)(struct rb_node *, const struct rb_node *))
-{
-	struct rb_node **link = &tree->rb_root.rb_node;
-	struct rb_node *parent = NULL;
-	bool leftmost = true;
-
-	while (*link) {
-		parent = *link;
-		if (less(node, parent)) {
-			link = &parent->rb_left;
-		} else {
-			link = &parent->rb_right;
-			leftmost = false;
-		}
-	}
-
-	rb_link_node(node, parent, link);
-	rb_insert_color_cached(node, tree, leftmost);
-
-	return leftmost ? node : NULL;
-}
-
-/**
- * rb_add() - insert @node into @tree
- * @node: node to insert
- * @tree: tree to insert @node into
- * @less: operator defining the (partial) node order
- */
-static __always_inline void
-rb_add(struct rb_node *node, struct rb_root *tree,
-       bool (*less)(struct rb_node *, const struct rb_node *))
-{
-	struct rb_node **link = &tree->rb_node;
-	struct rb_node *parent = NULL;
-
-	while (*link) {
-		parent = *link;
-		if (less(node, parent))
-			link = &parent->rb_left;
-		else
-			link = &parent->rb_right;
-	}
-
-	rb_link_node(node, parent, link);
-	rb_insert_color(node, tree);
-}
-
-/**
- * rb_find_add() - find equivalent @node in @tree, or add @node
- * @node: node to look-for / insert
- * @tree: tree to search / modify
- * @cmp: operator defining the node order
- *
- * Returns the rb_node matching @node, or NULL when no match is found and @node
- * is inserted.
- */
-static __always_inline struct rb_node *
-rb_find_add(struct rb_node *node, struct rb_root *tree,
-	    int (*cmp)(struct rb_node *, const struct rb_node *))
-{
-	struct rb_node **link = &tree->rb_node;
-	struct rb_node *parent = NULL;
-	int c;
-
-	while (*link) {
-		parent = *link;
-		c = cmp(node, parent);
-
-		if (c < 0)
-			link = &parent->rb_left;
-		else if (c > 0)
-			link = &parent->rb_right;
-		else
-			return parent;
-	}
-
-	rb_link_node(node, parent, link);
-	rb_insert_color(node, tree);
-	return NULL;
-}
-
-/**
- * rb_find() - find @key in tree @tree
- * @key: key to match
- * @tree: tree to search
- * @cmp: operator defining the node order
- *
- * Returns the rb_node matching @key or NULL.
- */
-static __always_inline struct rb_node *
-rb_find(const void *key, const struct rb_root *tree,
-	int (*cmp)(const void *key, const struct rb_node *))
-{
-	struct rb_node *node = tree->rb_node;
-
-	while (node) {
-		int c = cmp(key, node);
-
-		if (c < 0)
-			node = node->rb_left;
-		else if (c > 0)
-			node = node->rb_right;
-		else
-			return node;
-	}
-
-	return NULL;
-}
-
-/**
- * rb_find_first() - find the first @key in @tree
- * @key: key to match
- * @tree: tree to search
- * @cmp: operator defining node order
- *
- * Returns the leftmost node matching @key, or NULL.
- */
-static __always_inline struct rb_node *
-rb_find_first(const void *key, const struct rb_root *tree,
-	      int (*cmp)(const void *key, const struct rb_node *))
-{
-	struct rb_node *node = tree->rb_node;
-	struct rb_node *match = NULL;
-
-	while (node) {
-		int c = cmp(key, node);
-
-		if (c <= 0) {
-			if (!c)
-				match = node;
-			node = node->rb_left;
-		} else if (c > 0) {
-			node = node->rb_right;
-		}
-	}
-
-	return match;
-}
-
-/**
- * rb_next_match() - find the next @key in @tree
- * @key: key to match
- * @tree: tree to search
- * @cmp: operator defining node order
- *
- * Returns the next node matching @key, or NULL.
- */
-static __always_inline struct rb_node *
-rb_next_match(const void *key, struct rb_node *node,
-	      int (*cmp)(const void *key, const struct rb_node *))
-{
-	node = rb_next(node);
-	if (node && cmp(key, node))
-		node = NULL;
-	return node;
-}
-
-/**
- * rb_for_each() - iterates a subtree matching @key
- * @node: iterator
- * @key: key to match
- * @tree: tree to search
- * @cmp: operator defining node order
- */
-#define rb_for_each(node, key, tree, cmp) \
-	for ((node) = rb_find_first((key), (tree), (cmp)); \
-	     (node); (node) = rb_next_match((key), (node), (cmp)))
-
-#endif	/* _LINUX_RBTREE_H */
@@ -1,313 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
-  Red Black Trees
-  (C) 1999  Andrea Arcangeli <andrea@suse.de>
-  (C) 2002  David Woodhouse <dwmw2@infradead.org>
-  (C) 2012  Michel Lespinasse <walken@google.com>
-
-
-  linux/include/linux/rbtree_augmented.h
-*/
-
-#ifndef _LINUX_RBTREE_AUGMENTED_H
-#define _LINUX_RBTREE_AUGMENTED_H
-
-/*
- * Please note - only struct rb_augment_callbacks and the prototypes for
- * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
- * The rest are implementation details you are not expected to depend on.
- *
- * See Documentation/core-api/rbtree.rst for documentation and samples.
- */
-
-struct rb_augment_callbacks {
-	void (*propagate)(struct rb_node *node, struct rb_node *stop);
-	void (*copy)(struct rb_node *old, struct rb_node *new);
-	void (*rotate)(struct rb_node *old, struct rb_node *new);
-};
-
-extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
-	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
-
-/*
- * Fixup the rbtree and update the augmented information when rebalancing.
- *
- * On insertion, the user must update the augmented information on the path
- * leading to the inserted node, then call rb_link_node() as usual and
- * rb_insert_augmented() instead of the usual rb_insert_color() call.
- * If rb_insert_augmented() rebalances the rbtree, it will callback into
- * a user provided function to update the augmented information on the
- * affected subtrees.
- */
-static inline void
-rb_insert_augmented(struct rb_node *node, struct rb_root *root,
-		    const struct rb_augment_callbacks *augment)
-{
-	__rb_insert_augmented(node, root, augment->rotate);
-}
-
-static inline void
-rb_insert_augmented_cached(struct rb_node *node,
-			   struct rb_root_cached *root, bool newleft,
-			   const struct rb_augment_callbacks *augment)
-{
-	if (newleft)
-		root->rb_leftmost = node;
-	rb_insert_augmented(node, &root->rb_root, augment);
-}
-
-/*
- * Template for declaring augmented rbtree callbacks (generic case)
- *
- * RBSTATIC:    'static' or empty
- * RBNAME:      name of the rb_augment_callbacks structure
- * RBSTRUCT:    struct type of the tree nodes
- * RBFIELD:     name of struct rb_node field within RBSTRUCT
- * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
- * RBCOMPUTE:   name of function that recomputes the RBAUGMENTED data
- */
-
-#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,				\
-			     RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE)	\
-static inline void							\
-RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop)		\
-{									\
-	while (rb != stop) {						\
-		RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD);	\
-		if (RBCOMPUTE(node, true))				\
-			break;						\
-		rb = rb_parent(&node->RBFIELD);				\
-	}								\
-}									\
-static inline void							\
-RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)		\
-{									\
-	RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);		\
-	RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);		\
-	new->RBAUGMENTED = old->RBAUGMENTED;				\
-}									\
-static void								\
-RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)	\
-{									\
-	RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);		\
-	RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);		\
-	new->RBAUGMENTED = old->RBAUGMENTED;				\
-	RBCOMPUTE(old, false);						\
-}									\
-RBSTATIC const struct rb_augment_callbacks RBNAME = {			\
-	.propagate = RBNAME ## _propagate,				\
-	.copy = RBNAME ## _copy,					\
-	.rotate = RBNAME ## _rotate					\
-};
-
-/*
- * Template for declaring augmented rbtree callbacks,
- * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
- *
- * RBSTATIC:    'static' or empty
- * RBNAME:      name of the rb_augment_callbacks structure
- * RBSTRUCT:    struct type of the tree nodes
- * RBFIELD:     name of struct rb_node field within RBSTRUCT
- * RBTYPE:      type of the RBAUGMENTED field
- * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
- * RBCOMPUTE:   name of function that returns the per-node RBTYPE scalar
- */
-
-#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD,	      \
-				 RBTYPE, RBAUGMENTED, RBCOMPUTE)	      \
-static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit)	      \
-{									      \
-	RBSTRUCT *child;						      \
-	RBTYPE max = RBCOMPUTE(node);					      \
-	if (node->RBFIELD.rb_left) {					      \
-		child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD);   \
-		if (child->RBAUGMENTED > max)				      \
-			max = child->RBAUGMENTED;			      \
-	}								      \
-	if (node->RBFIELD.rb_right) {					      \
-		child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD);  \
-		if (child->RBAUGMENTED > max)				      \
-			max = child->RBAUGMENTED;			      \
-	}								      \
-	if (exit && node->RBAUGMENTED == max)				      \
-		return true;						      \
-	node->RBAUGMENTED = max;					      \
-	return false;							      \
-}									      \
-RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,					      \
-		     RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)
-
-
-#define	RB_RED		0
-#define	RB_BLACK	1
-
-#define __rb_parent(pc)    ((struct rb_node *)(pc & ~3))
-
-#define __rb_color(pc)     ((pc) & 1)
-#define __rb_is_black(pc)  __rb_color(pc)
-#define __rb_is_red(pc)    (!__rb_color(pc))
-#define rb_color(rb)       __rb_color((rb)->__rb_parent_color)
-#define rb_is_red(rb)      __rb_is_red((rb)->__rb_parent_color)
-#define rb_is_black(rb)    __rb_is_black((rb)->__rb_parent_color)
-
-static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
-{
-	rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
-}
-
-static inline void rb_set_parent_color(struct rb_node *rb,
-				       struct rb_node *p, int color)
-{
-	rb->__rb_parent_color = (unsigned long)p | color;
-}
-
-static inline void
-__rb_change_child(struct rb_node *old, struct rb_node *new,
-		  struct rb_node *parent, struct rb_root *root)
-{
-	if (parent) {
-		if (parent->rb_left == old)
-			WRITE_ONCE(parent->rb_left, new);
-		else
-			WRITE_ONCE(parent->rb_right, new);
-	} else
-		WRITE_ONCE(root->rb_node, new);
-}
-
-static inline void
-__rb_change_child_rcu(struct rb_node *old, struct rb_node *new,
-		      struct rb_node *parent, struct rb_root *root)
-{
-	if (parent) {
-		if (parent->rb_left == old)
-			rcu_assign_pointer(parent->rb_left, new);
-		else
-			rcu_assign_pointer(parent->rb_right, new);
-	} else
-		rcu_assign_pointer(root->rb_node, new);
-}
-
-extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
-	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
-
-static __always_inline struct rb_node *
-__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
-		     const struct rb_augment_callbacks *augment)
-{
-	struct rb_node *child = node->rb_right;
-	struct rb_node *tmp = node->rb_left;
-	struct rb_node *parent, *rebalance;
-	unsigned long pc;
-
-	if (!tmp) {
-		/*
-		 * Case 1: node to erase has no more than 1 child (easy!)
-		 *
-		 * Note that if there is one child it must be red due to 5)
-		 * and node must be black due to 4). We adjust colors locally
-		 * so as to bypass __rb_erase_color() later on.
-		 */
-		pc = node->__rb_parent_color;
-		parent = __rb_parent(pc);
-		__rb_change_child(node, child, parent, root);
-		if (child) {
-			child->__rb_parent_color = pc;
-			rebalance = NULL;
-		} else
-			rebalance = __rb_is_black(pc) ? parent : NULL;
-		tmp = parent;
-	} else if (!child) {
-		/* Still case 1, but this time the child is node->rb_left */
-		tmp->__rb_parent_color = pc = node->__rb_parent_color;
-		parent = __rb_parent(pc);
-		__rb_change_child(node, tmp, parent, root);
-		rebalance = NULL;
-		tmp = parent;
-	} else {
-		struct rb_node *successor = child, *child2;
-
-		tmp = child->rb_left;
-		if (!tmp) {
-			/*
-			 * Case 2: node's successor is its right child
-			 *
-			 *    (n)          (s)
-			 *    / \          / \
-			 *  (x) (s)  ->  (x) (c)
-			 *        \
-			 *        (c)
-			 */
-			parent = successor;
-			child2 = successor->rb_right;
-
-			augment->copy(node, successor);
-		} else {
-			/*
-			 * Case 3: node's successor is leftmost under
-			 * node's right child subtree
-			 *
-			 *    (n)          (s)
-			 *    / \          / \
-			 *  (x) (y)  ->  (x) (y)
-			 *      /            /
-			 *    (p)          (p)
-			 *    /            /
-			 *  (s)          (c)
-			 *    \
-			 *    (c)
-			 */
-			do {
-				parent = successor;
-				successor = tmp;
-				tmp = tmp->rb_left;
-			} while (tmp);
-			child2 = successor->rb_right;
-			WRITE_ONCE(parent->rb_left, child2);
-			WRITE_ONCE(successor->rb_right, child);
-			rb_set_parent(child, successor);
-
-			augment->copy(node, successor);
-			augment->propagate(parent, successor);
-		}
-
-		tmp = node->rb_left;
-		WRITE_ONCE(successor->rb_left, tmp);
-		rb_set_parent(tmp, successor);
-
-		pc = node->__rb_parent_color;
-		tmp = __rb_parent(pc);
-		__rb_change_child(node, successor, tmp, root);
-
-		if (child2) {
-			rb_set_parent_color(child2, parent, RB_BLACK);
-			rebalance = NULL;
-		} else {
-			rebalance = rb_is_black(successor) ? parent : NULL;
-		}
-		successor->__rb_parent_color = pc;
-		tmp = successor;
-	}
-
-	augment->propagate(tmp, NULL);
-	return rebalance;
-}
-
-static __always_inline void
-rb_erase_augmented(struct rb_node *node, struct rb_root *root,
-		   const struct rb_augment_callbacks *augment)
-{
-	struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
-	if (rebalance)
-		__rb_erase_color(rebalance, root, augment->rotate);
-}
-
-static __always_inline void
-rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
-			  const struct rb_augment_callbacks *augment)
-{
-	if (root->rb_leftmost == node)
-		root->rb_leftmost = rb_next(node);
-	rb_erase_augmented(node, &root->rb_root, augment);
-}
-
-#endif	/* _LINUX_RBTREE_AUGMENTED_H */
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _LINUX_RBTREE_TYPES_H
-#define _LINUX_RBTREE_TYPES_H
-
-struct rb_node {
-	unsigned long  __rb_parent_color;
-	struct rb_node *rb_right;
-	struct rb_node *rb_left;
-} __attribute__((aligned(sizeof(long))));
-/* The alignment might seem pointless, but allegedly CRIS needs it */
-
-struct rb_root {
-	struct rb_node *rb_node;
-};
-
-/*
- * Leftmost-cached rbtrees.
- *
- * We do not cache the rightmost node based on footprint
- * size vs number of potential users that could benefit
- * from O(1) rb_last(). Just not worth it, users that want
- * this feature can always implement the logic explicitly.
- * Furthermore, users that want to cache both pointers may
- * find it a bit asymmetric, but that's ok.
- */
-struct rb_root_cached {
-	struct rb_root rb_root;
-	struct rb_node *rb_leftmost;
-};
-
-#define RB_ROOT (struct rb_root) { NULL, }
-#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
-
-#endif
@@ -44,37 +44,3 @@ int srch_decode_entry(void *buf, struct scoutfs_srch_entry *sre,

 	return tot;
 }
-
-static int encode_u64(__le64 *buf, u64 val)
-{
-	int bytes;
-
-	val = (val << 1) ^ ((s64)val >> 63); /* shift sign extend */
-	bytes = (fls64(val) + 7) >> 3;
-
-	put_unaligned_le64(val, buf);
-	return bytes;
-}
-
-int srch_encode_entry(void *buf, struct scoutfs_srch_entry *sre, struct scoutfs_srch_entry *prev)
-{
-	u64 diffs[] = {
-		le64_to_cpu(sre->hash) - le64_to_cpu(prev->hash),
-		le64_to_cpu(sre->ino) - le64_to_cpu(prev->ino),
-		le64_to_cpu(sre->id) - le64_to_cpu(prev->id),
-	};
-	u16 lengths = 0;
-	int bytes;
-	int tot = 2;
-	int i;
-
-	for (i = 0; i < array_size(diffs); i++) {
-		bytes = encode_u64(buf + tot, diffs[i]);
-		lengths |= bytes << (i << 2);
-		tot += bytes;
-	}
-
-	put_unaligned_le16(lengths, buf);
-
-	return tot;
-}
@@ -3,6 +3,5 @@

 int srch_decode_entry(void *buf, struct scoutfs_srch_entry *sre,
 		      struct scoutfs_srch_entry *prev);
-int srch_encode_entry(void *buf, struct scoutfs_srch_entry *sre, struct scoutfs_srch_entry *prev);

 #endif
@@ -69,8 +69,6 @@ do {				\
 #define container_of(ptr, type, memb) \
 	((type *)((void *)(ptr) - offsetof(type, memb)))

-#define NSEC_PER_SEC 1000000000
-
 #define BITS_PER_LONG (sizeof(long) * 8)
 #define U8_MAX ((u8)~0ULL)
 #define U16_MAX ((u16)~0ULL)
@@ -83,7 +81,6 @@ do {				\
 							\
 	(_x == 0 ? 0 : 64 - __builtin_clzll(_x));	\
 })
-#define fls64(x) flsll(x)

 #define ilog2(x)					\
 ({							\
@@ -101,16 +98,6 @@ emit_get_unaligned_le(16)
 emit_get_unaligned_le(32)
 emit_get_unaligned_le(64)

-#define emit_put_unaligned_le(nr)				\
-static inline void put_unaligned_le##nr(u##nr val, void *buf)	\
-{								\
-	__le##nr x = cpu_to_le##nr(val);			\
-	memcpy(buf, &x, sizeof(x));				\
-}
-emit_put_unaligned_le(16)
-emit_put_unaligned_le(32)
-emit_put_unaligned_le(64)
-
 /*
 * return -1,0,+1 based on the memcmp comparison of the minimum of their
 * two lengths.  If their min shared bytes are equal but the lengths