Compare commits

...

27 Commits

Author SHA1 Message Date
Zach Brown
92ba4f5662 Add test for parallel restore
Signed-off-by: Zach Brown <zab@versity.com>
2024-04-15 11:06:24 -07:00
Zach Brown
7754feee3f Add parallel restore
Signed-off-by: Zach Brown <zab@versity.com>
2024-04-15 11:06:24 -07:00
Zach Brown
c4b00d059a Add userspace NSEC_PER_SEC
Signed-off-by: Zach Brown <zab@versity.com>
2024-04-15 11:06:24 -07:00
Zach Brown
42a60e31c5 Add bloom filter index calc for userspace utils
Signed-off-by: Zach Brown <zab@versity.com>
2024-04-15 11:06:24 -07:00
Zach Brown
ea7637b915 Add srch_encode_entry() for userspace utils
Signed-off-by: Zach Brown <zab@versity.com>
2024-04-15 11:06:24 -07:00
Zach Brown
1324c39372 Add put_unaligned_leXX() for userspace
Signed-off-by: Zach Brown <zab@versity.com>
2024-04-15 11:06:24 -07:00
Zach Brown
f41bd60ca4 Add fls64() alias for userspace flsll()
Signed-off-by: Zach Brown <zab@versity.com>
2024-04-15 11:06:24 -07:00
Zach Brown
70a932614a Promote userspace btree block initialization
Signed-off-by: Zach Brown <zab@versity.com>
2024-04-15 11:06:24 -07:00
Zach Brown
c6a11fcd6b Add userspace version of our mode to type
Signed-off-by: Zach Brown <zab@versity.com>
2024-04-15 11:06:24 -07:00
Zach Brown
3c67805004 Add userspace version of our dirent name hash
Signed-off-by: Zach Brown <zab@versity.com>
2024-04-15 11:06:24 -07:00
Zach Brown
02fe58b727 Add lk rbtree wrapper
Import the kernel's rbtree implementation with a wrapper so we can use
it from userspace.

Signed-off-by: Zach Brown <zab@versity.com>
2024-04-15 11:06:24 -07:00
Zach Brown
0519830229 Merge pull request #165 from versity/greg/kmod-uninstall-cleanup
More cleanly drive weak-modules on install/uninstall
2024-04-11 14:32:06 -07:00
Greg Cymbalski
4d6e1a14ae More safely install/uninstall with weak-modules
This addresses some minor issues with how we drive the weak-modules
infrastructure, which handles running modules on kernels they weren't
explicitly built for.

For one, we now drive weak-modules at install-time more explicitly (it
was adding symlinks for all modules into the right place for the running
kernel, whereas now it only handles that for scoutfs against all
installed kernels).

Also, we no longer leave stale modules on the filesystem after an
uninstall/upgrade, similar to what's done for vsm's kmods right now.
RPM's pre/post-uninstall scriptlets are used to drive weak-modules to
clean things up.

Note that this (intentionally) does not (re)generate initrds of any
kind.

Finally, this was tested on both the native kernel version and on
updates that would need the migrated modules. As a result, installs are
a little quicker, the module still gets migrated successfully, and
uninstalls correctly remove (only) the packaged module.
2024-04-11 13:20:50 -07:00
Greg Cymbalski
fc3e061ea8 Merge pull request #164 from versity/greg/preserve-git-describe
Encode git info into spec to keep git info in final kmod
2024-03-29 13:48:33 -07:00
Greg Cymbalski
a4bc3fb27d Capture git info at spec creation time, pass into make
2024-02-05 15:44:10 -08:00
Zach Brown
67990a7007 Merge pull request #162 from versity/zab/v1.19
v1.19 Release
2024-01-30 15:46:49 -08:00
Zach Brown
ba819be8f9 v1.19 Release
Finish the release notes for the 1.19 release.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-30 12:11:23 -08:00
Zach Brown
1b103184ca Merge pull request #161 from versity/zab/merge_timeout_option_fix
Correctly set the log_merge_wait_timeout_ms option
2024-01-30 12:07:10 -08:00
Zach Brown
c3890abd7b Correctly set the log_merge_wait_timeout_ms option
The initial code for setting the timeout used the wrong parsed variable.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-30 12:01:35 -08:00
Zach Brown
5ab38bfa48 Merge pull request #160 from versity/zab/log_merging_speedups
Zab/log merging speedups
2024-01-29 12:26:55 -08:00
Zach Brown
e9ad61b444 Delete multiple log trees items per server commit
server_log_merge_free_work() is responsible for freeing all the input
log trees for a log merge operation that has finished.  It looks for the
next item to free, frees the log btree it references, and then deletes
the item.  It was doing this with a full server commit for each item,
which can take an agonizingly long time.

This changes it to perform multiple deletions per commit as long as
there's plenty of alloc space.  The moment the commit's alloc space gets
low it applies the commit and opens a new one.  This sped up the
deletion of a few hundred thousand log tree items from taking hours to
seconds.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:30:17 -08:00
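
A minimal userspace sketch of the batching shape described above, assuming
hypothetical hold_commit(), apply_commit() and delete_log_trees_item()
stand-ins for the server's commit machinery and a made-up ALLOC_BUDGET; the
point is that a commit is only applied once the open commit's space runs
low, rather than once per deleted item:

#include <stdio.h>

#define ALLOC_BUDGET 64   /* hypothetical per-commit metadata allocation budget */

static int alloc_used;    /* stand-in for blocks dirtied by the open commit */

static void hold_commit(void)
{
	alloc_used = 0;
	printf("open commit\n");
}

static void apply_commit(void)
{
	printf("apply commit after using %d\n", alloc_used);
}

static void delete_log_trees_item(int nr)
{
	alloc_used += 2 + (nr % 3);   /* pretend each deletion dirties a few blocks */
}

int main(void)
{
	int nr;

	hold_commit();
	for (nr = 0; nr < 1000; nr++) {
		delete_log_trees_item(nr);
		/* instead of a commit per item, only commit when space runs low */
		if (alloc_used >= ALLOC_BUDGET / 2) {
			apply_commit();
			hold_commit();
		}
	}
	apply_commit();
	return 0;
}
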
Zach Brown
91bbf90f71 Don't pin input btrees when merging
The btree_merge code was pinning leaf blocks for all input btrees as it
iterated over them.  This doesn't work when there are a very large
number of input btrees.  It can run out of memory trying to hold a
reference to a 64KiB leaf block for each input root.

This reworks the btree merging code.  It reads a window of blocks from
all input trees to get a set of merged items.  It can take multiple
passes to complete the merge but by setting the merge window large
enough this overhead is reduced.  Merging now consumes a fixed amount of
memory rather than using memory proportional to the number of input
btrees.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:30:17 -08:00
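
A rough illustration of the fixed-memory idea, not the kernel's btree code:
repeatedly gather a bounded window of the smallest remaining keys from all
inputs, emit them, and advance the start key, so memory stays bounded by
the window no matter how many inputs there are.  WINDOW and the integer
inputs here are arbitrary stand-ins:

#include <stdio.h>
#include <stdlib.h>

#define WINDOW 4 /* deliberately tiny so the multi-pass behavior shows */

static int cmp_int(const void *a, const void *b)
{
	return *(const int *)a - *(const int *)b;
}

int main(void)
{
	int in0[] = { 1, 5, 9, 12, 17 };
	int in1[] = { 2, 6, 8, 15 };
	int in2[] = { 3, 10, 11, 18, 19, 25 };
	int *inputs[] = { in0, in1, in2 };
	int lens[] = { 5, 4, 6 };
	int start = 0;          /* merge everything with key >= start */
	int win[WINDOW];

	for (;;) {
		int n = 0, i, j, more = 0;

		/* gather up to WINDOW smallest keys >= start across all inputs */
		for (i = 0; i < 3; i++) {
			for (j = 0; j < lens[i]; j++) {
				if (inputs[i][j] < start)
					continue;
				if (n < WINDOW) {
					win[n++] = inputs[i][j];
				} else {
					qsort(win, n, sizeof(win[0]), cmp_int);
					if (inputs[i][j] < win[n - 1])
						win[n - 1] = inputs[i][j];
				}
				more = 1;
			}
		}
		if (!more)
			break;
		qsort(win, n, sizeof(win[0]), cmp_int);
		for (i = 0; i < n; i++)
			printf("%d ", win[i]);  /* "merge into the destination" */
		start = win[n - 1] + 1;         /* next pass resumes past the window */
	}
	printf("\n");
	return 0;
}
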
Zach Brown
b5630f540d Add tracing of the log merge finalizing decision
Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:30:17 -08:00
Zach Brown
90a4c82363 Make log merge wait timeout tunable
Add a mount option for the amount of time that log merge creation can
wait before giving up.  We add some counters so we can see how often
the timeout is being hit and what the average successful wait time is.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:25:56 -08:00
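
The server-side changes pair the new wait timeout with a growing poll delay
(see the FINALIZE_POLL_* constants in the diff below).  A small
self-contained sketch of that pattern, with made-up WAIT_TIMEOUT_MS,
MIN/MAX_DELAY_MS, DELAY_GROWTH_PCT and work_is_done() values standing in
for the option, the constants, and the condition being polled:

#include <stdio.h>
#include <stdbool.h>
#include <time.h>
#include <unistd.h>

#define WAIT_TIMEOUT_MS   500U
#define MIN_DELAY_MS        5U
#define MAX_DELAY_MS      100U
#define DELAY_GROWTH_PCT  150U

static bool work_is_done(void)
{
	return false; /* pretend the other mounts never finish, so we time out */
}

static long long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000LL;
}

int main(void)
{
	long long deadline = now_ms() + WAIT_TIMEOUT_MS;
	unsigned int delay = MIN_DELAY_MS;

	while (!work_is_done()) {
		if (now_ms() >= deadline) {
			printf("timed out, giving up\n"); /* counter bump in the real thing */
			break;
		}
		usleep(delay * 1000);
		/* grow the delay but keep it bounded */
		delay = delay * DELAY_GROWTH_PCT / 100;
		if (delay > MAX_DELAY_MS)
			delay = MAX_DELAY_MS;
	}
	return 0;
}
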
Zach Brown
f654fa0fda Send syncs once when starting to merge
The server sends sync requests to clients when it sees that they have
open log trees that need to be committed for log merging to proceed.

These are currently sent in the context of each client's get_log_trees
request, so each client can have sync requests queued by the processing
of every other client's request.  Depending on message delivery and
commit latencies, this can
create a sync storm.

The server's sends are reliable and the open commits are marked with
the seq at which they were opened.  It's easy for us to record having
sent syncs to
all open commits so that future attempts can be avoided.  Later open
commits will have higher seqs and will get a new round of syncs sent.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:25:20 -08:00
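
A tiny sketch of the record-what-we-sent idea, with hypothetical client and
seq types; after a round of syncs the server remembers the seq it had
reached, and only commits opened after that point trigger another send:

#include <stdio.h>

/* hypothetical client state: the seq at which its open commit started */
struct client {
	int rid;
	unsigned long long open_seq;
};

static unsigned long long server_seq = 100;   /* stand-in for the server's seq */
static unsigned long long sent_seq;           /* highest seq we've synced up to */

static void send_sync(const struct client *c)
{
	printf("sync rid %d (open_seq %llu)\n", c->rid, c->open_seq);
}

static void maybe_send_syncs(struct client *clients, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		/* commits opened at or before our last round already got a sync */
		if (clients[i].open_seq <= sent_seq)
			continue;
		send_sync(&clients[i]);
	}
	sent_seq = server_seq; /* remember how far we've synced */
}

int main(void)
{
	struct client clients[] = { {1, 90}, {2, 95}, {3, 99} };

	maybe_send_syncs(clients, 3);  /* first round: all three get a sync */
	maybe_send_syncs(clients, 3);  /* repeat: nothing is re-sent */

	clients[1].open_seq = 120;     /* client 2 opens a newer commit... */
	server_seq = 125;
	maybe_send_syncs(clients, 3);  /* ...and gets exactly one new sync */
	return 0;
}
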
Zach Brown
50168a2d2a Check each client's last log item for stable seq
The server was checking all client log_trees items to search for the
lowest commit seq that was still open.  This can be expensive when there
are a lot of finalized log_trees items that won't have open seqs.  Only
the last log_trees item for each client rid can be open, and the items
are sorted by rid and nr, so we can easily check only the last item for
each client rid.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:24:50 -08:00
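
A small sketch of scanning only the last item per rid by walking backwards,
mirroring the shape of the for_each_rid_last_lt() helper in the diff below
but over a plain sorted array with hypothetical (rid, nr) items: start from
the largest possible key, and after each hit step to (rid - 1, max) so the
next lookup lands on the previous rid's last item (rids here are nonzero):

#include <stdio.h>

/* items sorted by (rid, nr), as the log_trees items are in the btree */
struct item {
	unsigned long long rid;
	unsigned long long nr;
};

static struct item items[] = {
	{1, 1}, {1, 2}, {1, 3},
	{7, 1}, {7, 2},
	{9, 1},
};
#define NR_ITEMS (sizeof(items) / sizeof(items[0]))

/* "btree prev": greatest item <= (rid, nr), or -1 if none */
static int item_prev(unsigned long long rid, unsigned long long nr)
{
	int i;

	for (i = NR_ITEMS - 1; i >= 0; i--) {
		if (items[i].rid < rid ||
		    (items[i].rid == rid && items[i].nr <= nr))
			return i;
	}
	return -1;
}

int main(void)
{
	unsigned long long rid = ~0ULL, nr = ~0ULL;
	int i;

	/* walk backwards, visiting only the last (greatest nr) item per rid */
	while ((i = item_prev(rid, nr)) >= 0) {
		printf("rid %llu last nr %llu\n", items[i].rid, items[i].nr);
		/* like setting nr to 0 and decrementing the key: next stop is (rid - 1, max) */
		rid = items[i].rid - 1;
		nr = ~0ULL;
	}
	return 0;
}
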
Zach Brown
3c0616524a Only search last log_trees per rid for finalizing
During get_log_trees the server checks log_trees items to see if it
should start a log merge operation.  It did this by iterating over all
log_trees items and there can be quite a lot of them.

It doesn't need to see all of the items.  It only needs to see the most
recent log_trees item for each mount.  That's enough to make the
decisions that start the log merging process.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:23:59 -08:00
33 changed files with 4813 additions and 266 deletions

View File

@@ -1,6 +1,20 @@
Versity ScoutFS Release Notes
=============================
---
v1.19
\
*Jan 30, 2024*
Added the log\_merge\_wait\_timeout\_ms mount option to set the timeout
for creating log merge operations. The previous timeout, now the
default, was too short for some systems and was resulting in consistent
timeouts which created an excessive number of log trees waiting to be
merged.
Improved performance of many in-mount server operations when there are a
large number of log trees waiting to be merged.
---
v1.18
\

View File

@@ -12,17 +12,22 @@ else
SP = @:
endif
SCOUTFS_GIT_DESCRIBE := \
SCOUTFS_GIT_DESCRIBE ?= \
$(shell git describe --all --abbrev=6 --long 2>/dev/null || \
echo no-git)
ESCAPED_GIT_DESCRIBE := \
$(shell echo $(SCOUTFS_GIT_DESCRIBE) |sed -e 's/\//\\\//g')
RPM_GITHASH ?= $(shell git rev-parse --short HEAD)
SCOUTFS_ARGS := SCOUTFS_GIT_DESCRIBE=$(SCOUTFS_GIT_DESCRIBE) \
RPM_GITHASH=$(RPM_GITHASH) \
CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(CURDIR)/src \
EXTRA_CFLAGS="-Werror"
# - We use the git describe from tags to set up the RPM versioning
RPM_VERSION := $(shell git describe --long --tags | awk -F '-' '{gsub(/^v/,""); print $$1}')
RPM_GITHASH := $(shell git rev-parse --short HEAD)
TARFILE = scoutfs-kmod-$(RPM_VERSION).tar
@@ -41,7 +46,8 @@ modules_install:
%.spec: %.spec.in .FORCE
sed -e 's/@@VERSION@@/$(RPM_VERSION)/g' \
-e 's/@@GITHASH@@/$(RPM_GITHASH)/g' < $< > $@+
-e 's/@@GITHASH@@/$(RPM_GITHASH)/g' \
-e 's/@@GITDESCRIBE@@/$(ESCAPED_GIT_DESCRIBE)/g' < $< > $@+
mv $@+ $@

View File

@@ -1,6 +1,7 @@
%define kmod_name scoutfs
%define kmod_version @@VERSION@@
%define kmod_git_hash @@GITHASH@@
%define kmod_git_describe @@GITDESCRIBE@@
%define pkg_date %(date +%%Y%%m%%d)
# Disable the building of the debug package(s).
@@ -75,7 +76,7 @@ echo "Building for kernel: %{kernel_version} flavors: '%{flavors_to_build}'"
for flavor in %flavors_to_build; do
rm -rf obj/$flavor
cp -r source obj/$flavor
make SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
make RPM_GITHASH=%{kmod_git_hash} SCOUTFS_GIT_DESCRIBE=%{kmod_git_describe} SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
done
%install
@@ -97,10 +98,21 @@ find %{buildroot} -type f -name \*.ko -exec %{__chmod} u+x \{\} \;
/lib/modules
%post
weak-modules --add-kernel --no-initramfs
echo /lib/modules/%{kversion}/%{install_mod_dir}/scoutfs.ko | weak-modules --add-modules --no-initramfs
depmod -a
%endif
%clean
rm -rf %{buildroot}
%preun
# stash our modules for postun cleanup
SCOUTFS_RPM_NAME=$(rpm -q %{name} | grep "%{version}-%{release}")
rpm -ql $SCOUTFS_RPM_NAME | grep '\.ko$' > /var/run/%{name}-modules-%{version}-%{release} || true
%postun
if [ -x /sbin/weak-modules ]; then
cat /var/run/%{name}-modules-%{version}-%{release} | /sbin/weak-modules --remove-modules --no-initramfs
fi
rm /var/run/%{name}-modules-%{version}-%{release} || true

View File

@@ -2029,187 +2029,253 @@ int scoutfs_btree_rebalance(struct super_block *sb,
key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL);
}
struct merge_pos {
struct merged_range {
struct scoutfs_key start;
struct scoutfs_key end;
struct rb_root root;
int size;
};
struct merged_item {
struct rb_node node;
struct scoutfs_btree_root *root;
struct scoutfs_block *bl;
struct scoutfs_btree_block *bt;
struct scoutfs_avl_node *avl;
struct scoutfs_key *key;
struct scoutfs_key key;
u64 seq;
u8 flags;
unsigned int val_len;
u8 *val;
u8 val[0];
};
static struct merge_pos *first_mpos(struct rb_root *root)
static inline struct merged_item *mitem_container(struct rb_node *node)
{
struct rb_node *node = rb_first(root);
if (node)
return container_of(node, struct merge_pos, node);
return node ? container_of(node, struct merged_item, node) : NULL;
}
static inline struct merged_item *first_mitem(struct rb_root *root)
{
return mitem_container(rb_first(root));
}
static inline struct merged_item *last_mitem(struct rb_root *root)
{
return mitem_container(rb_last(root));
}
static inline struct merged_item *next_mitem(struct merged_item *mitem)
{
return mitem_container(mitem ? rb_next(&mitem->node) : NULL);
}
static inline struct merged_item *prev_mitem(struct merged_item *mitem)
{
return mitem_container(mitem ? rb_prev(&mitem->node) : NULL);
}
static struct merged_item *find_mitem(struct rb_root *root, struct scoutfs_key *key,
struct rb_node **parent_ret, struct rb_node ***link_ret)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct merged_item *mitem;
int cmp;
while (*node) {
parent = *node;
mitem = container_of(*node, struct merged_item, node);
cmp = scoutfs_key_compare(key, &mitem->key);
if (cmp < 0) {
node = &(*node)->rb_left;
} else if (cmp > 0) {
node = &(*node)->rb_right;
} else {
*parent_ret = NULL;
*link_ret = NULL;
return mitem;
}
}
*parent_ret = parent;
*link_ret = node;
return NULL;
}
static struct merge_pos *next_mpos(struct merge_pos *mpos)
static void insert_mitem(struct merged_range *rng, struct merged_item *mitem,
struct rb_node *parent, struct rb_node **link)
{
struct rb_node *node;
if (mpos && (node = rb_next(&mpos->node)))
return container_of(node, struct merge_pos, node);
else
return NULL;
rb_link_node(&mitem->node, parent, link);
rb_insert_color(&mitem->node, &rng->root);
rng->size += item_len_bytes(mitem->val_len);
}
static void free_mpos(struct super_block *sb, struct merge_pos *mpos)
static void replace_mitem(struct merged_range *rng, struct merged_item *victim,
struct merged_item *new)
{
scoutfs_block_put(sb, mpos->bl);
kfree(mpos);
rb_replace_node(&victim->node, &new->node, &rng->root);
RB_CLEAR_NODE(&victim->node);
rng->size -= item_len_bytes(victim->val_len);
rng->size += item_len_bytes(new->val_len);
}
static void insert_mpos(struct rb_root *pos_root, struct merge_pos *ins)
static void free_mitem(struct merged_range *rng, struct merged_item *mitem)
{
struct rb_node **node = &pos_root->rb_node;
struct rb_node *parent = NULL;
struct merge_pos *mpos;
int cmp;
if (IS_ERR_OR_NULL(mitem))
return;
parent = NULL;
while (*node) {
parent = *node;
mpos = container_of(*node, struct merge_pos, node);
/* sort merge items by key then newest to oldest */
cmp = scoutfs_key_compare(ins->key, mpos->key) ?:
-scoutfs_cmp(ins->seq, mpos->seq);
if (cmp < 0)
node = &(*node)->rb_left;
else
node = &(*node)->rb_right;
if (!RB_EMPTY_NODE(&mitem->node)) {
rng->size -= item_len_bytes(mitem->val_len);
rb_erase(&mitem->node, &rng->root);
}
rb_link_node(&ins->node, parent, node);
rb_insert_color(&ins->node, pos_root);
kfree(mitem);
}
static void trim_range_size(struct merged_range *rng, int merge_window)
{
struct merged_item *mitem;
struct merged_item *tmp;
mitem = last_mitem(&rng->root);
while (mitem && rng->size > merge_window) {
rng->end = mitem->key;
scoutfs_key_dec(&rng->end);
tmp = mitem;
mitem = prev_mitem(mitem);
free_mitem(rng, tmp);
}
}
static void trim_range_end(struct merged_range *rng)
{
struct merged_item *mitem;
struct merged_item *tmp;
mitem = last_mitem(&rng->root);
while (mitem && scoutfs_key_compare(&mitem->key, &rng->end) > 0) {
tmp = mitem;
mitem = prev_mitem(mitem);
free_mitem(rng, tmp);
}
}
/*
* Find the next item in the merge_pos root in the caller's range and
* insert it into the rbtree sorted by key and version so that merging
* can find the next newest item at the front of the rbtree. We free
* the mpos on error or if there are no more items in the range.
* Record and combine logged items from log roots for merging with the
* writable destination root. The caller is responsible for trimming
* the range if it gets too large or if the key range shrinks.
*/
static int reset_mpos(struct super_block *sb, struct rb_root *pos_root, struct merge_pos *mpos,
struct scoutfs_key *start, struct scoutfs_key *end)
static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags,
void *val, int val_len, void *arg)
{
struct scoutfs_btree_item *item;
struct scoutfs_avl_node *next;
struct btree_walk_key_range kr;
struct scoutfs_key walk_key;
int ret = 0;
struct merged_range *rng = arg;
struct merged_item *mitem;
struct merged_item *found;
struct rb_node *parent;
struct rb_node **link;
int ret;
/* always erase before freeing or inserting */
if (!RB_EMPTY_NODE(&mpos->node)) {
rb_erase(&mpos->node, pos_root);
RB_CLEAR_NODE(&mpos->node);
}
/*
* advance to next item via the avl tree. The caller's pos is
* only ever incremented past the last key so we can use next to
* iterate rather than using search to skip past multiple items.
*/
if (mpos->avl)
mpos->avl = scoutfs_avl_next(&mpos->bt->item_root, mpos->avl);
/* find the next leaf with the key if we run out of items */
walk_key = *start;
while (!mpos->avl && !scoutfs_key_is_zeros(&walk_key)) {
scoutfs_block_put(sb, mpos->bl);
mpos->bl = NULL;
ret = btree_walk(sb, NULL, NULL, mpos->root, BTW_NEXT, &walk_key,
0, &mpos->bl, &kr, NULL);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
free_mpos(sb, mpos);
found = find_mitem(&rng->root, key, &parent, &link);
if (found) {
ret = scoutfs_forest_combine_deltas(key, found->val, found->val_len, val, val_len);
if (ret < 0)
goto out;
if (ret > 0) {
if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
free_mitem(rng, found);
}
ret = 0;
goto out;
}
mpos->bt = mpos->bl->data;
mpos->avl = scoutfs_avl_search(&mpos->bt->item_root, cmp_key_item,
start, NULL, NULL, &next, NULL) ?: next;
if (mpos->avl == NULL)
walk_key = kr.iter_next;
if (found->seq >= seq) {
ret = 0;
goto out;
}
}
/* see if we're out of items within the range */
item = node_item(mpos->avl);
if (!item || scoutfs_key_compare(item_key(item), end) > 0) {
free_mpos(sb, mpos);
ret = 0;
mitem = kmalloc(offsetof(struct merged_item, val[val_len]), GFP_NOFS);
if (!mitem) {
ret = -ENOMEM;
goto out;
}
/* insert the next item within range at its version */
mpos->key = item_key(item);
mpos->seq = le64_to_cpu(item->seq);
mpos->flags = item->flags;
mpos->val_len = item_val_len(item);
mpos->val = item_val(mpos->bt, item);
mitem->key = *key;
mitem->seq = seq;
mitem->flags = flags;
mitem->val_len = val_len;
if (val_len)
memcpy(mitem->val, val, val_len);
if (found) {
replace_mitem(rng, found, mitem);
free_mitem(rng, found);
} else {
insert_mitem(rng, mitem, parent, link);
}
insert_mpos(pos_root, mpos);
ret = 0;
out:
return ret;
}
/*
* The caller has reset all the merge positions for all the input log
* btree roots and wants the next logged item it should try and merge
* with the items in the fs_root.
* Read a range of merged items. The caller has set the key bounds of
* the range. We read a merge window's worth of items from blocks in
* each input btree.
*
* We look ahead in the logged item stream to see if we should merge any
* older logged delta items into one result for the caller. We also
* take this opportunity to skip and reset the mpos for any older
* versions of the first item.
* The caller can only use the smallest range that overlaps with all the
* blocks that we read. We start reading from the range's start key so
* it will always be present and we don't need to adjust it. The final
* block we read from each input might not cover the range's end so it
* needs to be adjusted.
*
* The end range can also shrink if we have to drop items because the
* items exceeded the merge window size.
*/
static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
struct scoutfs_key *end, struct merge_pos **mpos_ret)
static int read_merged_range(struct super_block *sb, struct merged_range *rng,
struct list_head *inputs, int merge_window)
{
struct merge_pos *mpos;
struct merge_pos *next;
struct scoutfs_btree_root_head *rhead;
struct scoutfs_key start;
struct scoutfs_key end;
struct scoutfs_key key;
int ret = 0;
int i;
while ((mpos = first_mpos(pos_root)) && (next = next_mpos(mpos)) &&
!scoutfs_key_compare(mpos->key, next->key)) {
list_for_each_entry(rhead, inputs, head) {
key = rng->start;
ret = scoutfs_forest_combine_deltas(mpos->key, mpos->val, mpos->val_len,
next->val, next->val_len);
if (ret < 0)
break;
/* reset advances to the next item */
key = *mpos->key;
scoutfs_key_inc(&key);
/* always skip next combined or older version */
ret = reset_mpos(sb, pos_root, next, &key, end);
if (ret < 0)
break;
if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
/* if merging resulted in no info, skip current */
ret = reset_mpos(sb, pos_root, mpos, &key, end);
for (i = 0; i < merge_window; i += SCOUTFS_BLOCK_LG_SIZE) {
start = key;
end = rng->end;
ret = scoutfs_btree_read_items(sb, &rhead->root, &key, &start, &end,
merge_read_item, rng);
if (ret < 0)
goto out;
if (scoutfs_key_compare(&end, &rng->end) >= 0)
break;
key = end;
scoutfs_key_inc(&key);
}
if (scoutfs_key_compare(&end, &rng->end) < 0) {
rng->end = end;
trim_range_end(rng);
}
if (rng->size > merge_window)
trim_range_size(rng, merge_window);
}
*mpos_ret = mpos;
trace_scoutfs_btree_merge_read_range(sb, &rng->start, &rng->end, rng->size);
ret = 0;
out:
return ret;
}
@@ -2226,6 +2292,13 @@ static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
* to allocators running low or needing to join/split the parent.
* *next_ret is set to the next key which hasn't been merged so that the
* caller can retry with a new allocator and subtree.
*
* The number of input roots can be immense. The merge_window specifies
* the size of the set of merged items that we'll maintain as we iterate
* over all the input roots. Once we've merged items into the window
* from all the input roots the merged input items are then merged to
* the writable destination root. It may take multiple passes of
* windows of merged items to cover the input key range.
*/
int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_alloc *alloc,
@@ -2235,18 +2308,16 @@ int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_key *next_ret,
struct scoutfs_btree_root *root,
struct list_head *inputs,
bool subtree, int dirty_limit, int alloc_low)
bool subtree, int dirty_limit, int alloc_low, int merge_window)
{
struct scoutfs_btree_root_head *rhead;
struct rb_root pos_root = RB_ROOT;
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_block *bl = NULL;
struct btree_walk_key_range kr;
struct scoutfs_avl_node *par;
struct scoutfs_key next;
struct merge_pos *mpos;
struct merge_pos *tmp;
struct merged_item *mitem;
struct merged_item *tmp;
struct merged_range rng;
int walk_val_len;
int walk_flags;
bool is_del;
@@ -2257,49 +2328,59 @@ int scoutfs_btree_merge(struct super_block *sb,
trace_scoutfs_btree_merge(sb, root, start, end);
scoutfs_inc_counter(sb, btree_merge);
list_for_each_entry(rhead, inputs, head) {
mpos = kzalloc(sizeof(*mpos), GFP_NOFS);
if (!mpos) {
ret = -ENOMEM;
goto out;
}
RB_CLEAR_NODE(&mpos->node);
mpos->root = &rhead->root;
ret = reset_mpos(sb, &pos_root, mpos, start, end);
if (ret < 0)
goto out;
}
walk_flags = BTW_DIRTY;
if (subtree)
walk_flags |= BTW_SUBTREE;
walk_val_len = 0;
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
rng.start = *start;
rng.end = *end;
rng.root = RB_ROOT;
rng.size = 0;
ret = read_merged_range(sb, &rng, inputs, merge_window);
if (ret < 0)
goto out;
for (;;) {
/* read next window as it empties (and it is possible to read an empty range) */
mitem = first_mitem(&rng.root);
if (!mitem) {
/* done if the read range hit the end */
if (scoutfs_key_compare(&rng.end, end) >= 0)
break;
/* read next batch of merged items */
rng.start = rng.end;
scoutfs_key_inc(&rng.start);
rng.end = *end;
ret = read_merged_range(sb, &rng, inputs, merge_window);
if (ret < 0)
break;
continue;
}
if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) {
scoutfs_inc_counter(sb, btree_merge_dirty_limit);
ret = -ERANGE;
*next_ret = *mpos->key;
*next_ret = mitem->key;
goto out;
}
if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) {
scoutfs_inc_counter(sb, btree_merge_alloc_low);
ret = -ERANGE;
*next_ret = *mpos->key;
*next_ret = mitem->key;
goto out;
}
scoutfs_block_put(sb, bl);
bl = NULL;
ret = btree_walk(sb, alloc, wri, root, walk_flags,
mpos->key, walk_val_len, &bl, &kr, NULL);
&mitem->key, walk_val_len, &bl, &kr, NULL);
if (ret < 0) {
if (ret == -ERANGE)
*next_ret = *mpos->key;
*next_ret = mitem->key;
goto out;
}
bt = bl->data;
@@ -2311,22 +2392,21 @@ int scoutfs_btree_merge(struct super_block *sb,
continue;
}
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
while (mitem) {
/* walk to new leaf if we exceed parent ref key */
if (scoutfs_key_compare(mpos->key, &kr.end) > 0)
if (scoutfs_key_compare(&mitem->key, &kr.end) > 0)
break;
/* see if there's an existing item */
item = leaf_item_hash_search(sb, bt, mpos->key);
is_del = !!(mpos->flags & SCOUTFS_ITEM_FLAG_DELETION);
item = leaf_item_hash_search(sb, bt, &mitem->key);
is_del = !!(mitem->flags & SCOUTFS_ITEM_FLAG_DELETION);
/* see if we're merging delta items */
if (item && !is_del)
delta = scoutfs_forest_combine_deltas(mpos->key,
delta = scoutfs_forest_combine_deltas(&mitem->key,
item_val(bt, item),
item_val_len(item),
mpos->val, mpos->val_len);
mitem->val, mitem->val_len);
else
delta = 0;
if (delta < 0) {
@@ -2338,40 +2418,38 @@ int scoutfs_btree_merge(struct super_block *sb,
scoutfs_inc_counter(sb, btree_merge_delta_null);
}
trace_scoutfs_btree_merge_items(sb, mpos->root,
mpos->key, mpos->val_len,
trace_scoutfs_btree_merge_items(sb, &mitem->key, mitem->val_len,
item ? root : NULL,
item ? item_key(item) : NULL,
item ? item_val_len(item) : 0, is_del);
/* rewalk and split if ins/update needs room */
if (!is_del && !delta && !mid_free_item_room(bt, mpos->val_len)) {
if (!is_del && !delta && !mid_free_item_room(bt, mitem->val_len)) {
walk_flags |= BTW_INSERT;
walk_val_len = mpos->val_len;
walk_val_len = mitem->val_len;
break;
}
/* insert missing non-deletion merge items */
if (!item && !is_del) {
scoutfs_avl_search(&bt->item_root,
cmp_key_item, mpos->key,
scoutfs_avl_search(&bt->item_root, cmp_key_item, &mitem->key,
&cmp, &par, NULL, NULL);
create_item(bt, mpos->key, mpos->seq, mpos->flags,
mpos->val, mpos->val_len, par, cmp);
create_item(bt, &mitem->key, mitem->seq, mitem->flags,
mitem->val, mitem->val_len, par, cmp);
scoutfs_inc_counter(sb, btree_merge_insert);
}
/* update existing items */
if (item && !is_del && !delta) {
item->seq = cpu_to_le64(mpos->seq);
item->flags = mpos->flags;
update_item_value(bt, item, mpos->val, mpos->val_len);
item->seq = cpu_to_le64(mitem->seq);
item->flags = mitem->flags;
update_item_value(bt, item, mitem->val, mitem->val_len);
scoutfs_inc_counter(sb, btree_merge_update);
}
/* update combined delta item seq */
if (delta == SCOUTFS_DELTA_COMBINED) {
item->seq = cpu_to_le64(mpos->seq);
item->seq = cpu_to_le64(mitem->seq);
}
/*
@@ -2403,21 +2481,18 @@ int scoutfs_btree_merge(struct super_block *sb,
walk_flags &= ~(BTW_INSERT | BTW_DELETE);
walk_val_len = 0;
/* finished with this key, skip any older items */
next = *mpos->key;
scoutfs_key_inc(&next);
ret = reset_mpos(sb, &pos_root, mpos, &next, end);
if (ret < 0)
goto out;
/* finished with this merged item */
tmp = mitem;
mitem = next_mitem(mitem);
free_mitem(&rng, tmp);
}
}
ret = 0;
out:
scoutfs_block_put(sb, bl);
rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) {
free_mpos(sb, mpos);
}
rbtree_postorder_for_each_entry_safe(mitem, tmp, &rng.root, node)
free_mitem(&rng, mitem);
return ret;
}

View File

@@ -119,7 +119,7 @@ int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_key *next_ret,
struct scoutfs_btree_root *root,
struct list_head *input_list,
bool subtree, int dirty_limit, int alloc_low);
bool subtree, int dirty_limit, int alloc_low, int merge_window);
int scoutfs_btree_free_blocks(struct super_block *sb,
struct scoutfs_alloc *alloc,

View File

@@ -145,6 +145,7 @@
EXPAND_COUNTER(lock_shrink_work) \
EXPAND_COUNTER(lock_unlock) \
EXPAND_COUNTER(lock_wait) \
EXPAND_COUNTER(log_merge_wait_timeout) \
EXPAND_COUNTER(net_dropped_response) \
EXPAND_COUNTER(net_send_bytes) \
EXPAND_COUNTER(net_send_error) \

View File

@@ -721,7 +721,8 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work)
ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end,
&next, &comp.root, &inputs,
!!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)),
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10);
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10,
(2 * 1024 * 1024));
if (ret == -ERANGE) {
comp.remain = next;
le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN);

View File

@@ -33,6 +33,7 @@ enum {
Opt_acl,
Opt_data_prealloc_blocks,
Opt_data_prealloc_contig_only,
Opt_log_merge_wait_timeout_ms,
Opt_metadev_path,
Opt_noacl,
Opt_orphan_scan_delay_ms,
@@ -45,6 +46,7 @@ static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
{Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"},
{Opt_metadev_path, "metadev_path=%s"},
{Opt_noacl, "noacl"},
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
@@ -113,6 +115,10 @@ static void free_options(struct scoutfs_mount_options *opts)
kfree(opts->metadev_path);
}
#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS 100UL
#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS 500
#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS (60 * MSEC_PER_SEC)
#define MIN_ORPHAN_SCAN_DELAY_MS 100UL
#define DEFAULT_ORPHAN_SCAN_DELAY_MS (10 * MSEC_PER_SEC)
#define MAX_ORPHAN_SCAN_DELAY_MS (60 * MSEC_PER_SEC)
@@ -126,11 +132,27 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
opts->data_prealloc_contig_only = 1;
opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS;
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
opts->quorum_slot_nr = -1;
}
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
{
if (ret < 0) {
scoutfs_err(sb, "failed to parse log_merge_wait_timeout_ms value");
return -EINVAL;
}
if (val < MIN_LOG_MERGE_WAIT_TIMEOUT_MS || val > MAX_LOG_MERGE_WAIT_TIMEOUT_MS) {
scoutfs_err(sb, "invalid log_merge_wait_timeout_ms value %d, must be between %lu and %lu",
val, MIN_LOG_MERGE_WAIT_TIMEOUT_MS, MAX_LOG_MERGE_WAIT_TIMEOUT_MS);
return -EINVAL;
}
return 0;
}
static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
{
if (ret < 0) {
@@ -196,6 +218,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->data_prealloc_contig_only = nr;
break;
case Opt_log_merge_wait_timeout_ms:
ret = match_int(args, &nr);
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
if (ret < 0)
return ret;
opts->log_merge_wait_timeout_ms = nr;
break;
case Opt_metadev_path:
ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
if (ret < 0)
@@ -422,6 +452,43 @@ static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj
}
SCOUTFS_ATTR_RW(data_prealloc_contig_only);
static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
struct scoutfs_mount_options opts;
scoutfs_options_read(sb, &opts);
return snprintf(buf, PAGE_SIZE, "%u", opts.log_merge_wait_timeout_ms);
}
static ssize_t log_merge_wait_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
DECLARE_OPTIONS_INFO(sb, optinf);
char nullterm[30]; /* more than enough for octal -U64_MAX */
int val;
int len;
int ret;
len = min(count, sizeof(nullterm) - 1);
memcpy(nullterm, buf, len);
nullterm[len] = '\0';
ret = kstrtoint(nullterm, 0, &val);
ret = verify_log_merge_wait_timeout_ms(sb, ret, val);
if (ret == 0) {
write_seqlock(&optinf->seqlock);
optinf->opts.log_merge_wait_timeout_ms = val;
write_sequnlock(&optinf->seqlock);
ret = count;
}
return ret;
}
SCOUTFS_ATTR_RW(log_merge_wait_timeout_ms);
static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
@@ -525,6 +592,7 @@ SCOUTFS_ATTR_RO(quorum_slot_nr);
static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms),
SCOUTFS_ATTR_PTR(metadev_path),
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),

View File

@@ -8,6 +8,7 @@
struct scoutfs_mount_options {
u64 data_prealloc_blocks;
bool data_prealloc_contig_only;
unsigned int log_merge_wait_timeout_ms;
char *metadev_path;
unsigned int orphan_scan_delay_ms;
int quorum_slot_nr;

View File

@@ -1747,21 +1747,41 @@ TRACE_EVENT(scoutfs_btree_merge,
sk_trace_args(end))
);
TRACE_EVENT(scoutfs_btree_merge_read_range,
TP_PROTO(struct super_block *sb, struct scoutfs_key *start, struct scoutfs_key *end,
int size),
TP_ARGS(sb, start, end, size),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
sk_trace_define(start)
sk_trace_define(end)
__field(int, size)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
sk_trace_assign(start, start);
sk_trace_assign(end, end);
__entry->size = size;
),
TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" size %d",
SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end), __entry->size)
);
TRACE_EVENT(scoutfs_btree_merge_items,
TP_PROTO(struct super_block *sb,
struct scoutfs_btree_root *m_root,
struct scoutfs_key *m_key, int m_val_len,
struct scoutfs_btree_root *f_root,
struct scoutfs_key *f_key, int f_val_len,
int is_del),
TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
TP_ARGS(sb, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, m_root_blkno)
__field(__u64, m_root_seq)
__field(__u8, m_root_height)
sk_trace_define(m_key)
__field(int, m_val_len)
__field(__u64, f_root_blkno)
@@ -1774,10 +1794,6 @@ TRACE_EVENT(scoutfs_btree_merge_items,
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->m_root_blkno = m_root ?
le64_to_cpu(m_root->ref.blkno) : 0;
__entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0;
__entry->m_root_height = m_root ? m_root->height : 0;
sk_trace_assign(m_key, m_key);
__entry->m_val_len = m_val_len;
__entry->f_root_blkno = f_root ?
@@ -1789,11 +1805,9 @@ TRACE_EVENT(scoutfs_btree_merge_items,
__entry->is_del = !!is_del;
),
TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq,
__entry->m_root_height, sk_trace_args(m_key),
__entry->m_val_len, __entry->f_root_blkno,
__entry->f_root_seq, __entry->f_root_height,
TP_printk(SCSBF" merge item key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
SCSB_TRACE_ARGS, sk_trace_args(m_key), __entry->m_val_len,
__entry->f_root_blkno, __entry->f_root_seq, __entry->f_root_height,
sk_trace_args(f_key), __entry->f_val_len, __entry->is_del)
);
@@ -2076,6 +2090,71 @@ TRACE_EVENT(scoutfs_trans_seq_last,
SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
);
TRACE_EVENT(scoutfs_server_finalize_items,
TP_PROTO(struct super_block *sb, u64 rid, u64 item_rid, u64 item_nr, u64 item_flags,
u64 item_get_trans_seq),
TP_ARGS(sb, rid, item_rid, item_nr, item_flags, item_get_trans_seq),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(__u64, item_rid)
__field(__u64, item_nr)
__field(__u64, item_flags)
__field(__u64, item_get_trans_seq)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->item_rid = item_rid;
__entry->item_nr = item_nr;
__entry->item_flags = item_flags;
__entry->item_get_trans_seq = item_get_trans_seq;
),
TP_printk(SCSBF" rid %016llx item_rid %016llx item_nr %llu item_flags 0x%llx item_get_trans_seq %llu",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->item_rid, __entry->item_nr,
__entry->item_flags, __entry->item_get_trans_seq)
);
TRACE_EVENT(scoutfs_server_finalize_decision,
TP_PROTO(struct super_block *sb, u64 rid, bool saw_finalized, bool others_active,
bool ours_visible, bool finalize_ours, unsigned int delay_ms,
u64 finalize_sent_seq),
TP_ARGS(sb, rid, saw_finalized, others_active, ours_visible, finalize_ours, delay_ms,
finalize_sent_seq),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(bool, saw_finalized)
__field(bool, others_active)
__field(bool, ours_visible)
__field(bool, finalize_ours)
__field(unsigned int, delay_ms)
__field(__u64, finalize_sent_seq)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->saw_finalized = saw_finalized;
__entry->others_active = others_active;
__entry->ours_visible = ours_visible;
__entry->finalize_ours = finalize_ours;
__entry->delay_ms = delay_ms;
__entry->finalize_sent_seq = finalize_sent_seq;
),
TP_printk(SCSBF" rid %016llx saw_finalized %u others_active %u ours_visible %u finalize_ours %u delay_ms %u finalize_sent_seq %llu",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->saw_finalized, __entry->others_active,
__entry->ours_visible, __entry->finalize_ours, __entry->delay_ms,
__entry->finalize_sent_seq)
);
TRACE_EVENT(scoutfs_get_log_merge_status,
TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key,
u64 nr_requests, u64 nr_complete, u64 seq),

View File

@@ -148,6 +148,8 @@ struct server_info {
struct scoutfs_quorum_config qconf;
/* a running server maintains a private dirty super */
struct scoutfs_super_block dirty_super;
u64 finalize_sent_seq;
};
#define DECLARE_SERVER_INFO(sb, name) \
@@ -413,6 +415,27 @@ static void server_hold_commit(struct super_block *sb, struct commit_hold *hold)
wait_event(cusers->waitq, hold_commit(sb, server, cusers, hold));
}
/*
* Return the higher of the avail or freed used by the active commit
* since this holder joined the commit. This is *not* the amount used
* by the holder, we don't track per-holder alloc use.
*/
static u32 server_hold_alloc_used_since(struct super_block *sb, struct commit_hold *hold)
{
DECLARE_SERVER_INFO(sb, server);
u32 avail_used;
u32 freed_used;
u32 avail_now;
u32 freed_now;
scoutfs_alloc_meta_remaining(&server->alloc, &avail_now, &freed_now);
avail_used = hold->avail - avail_now;
freed_used = hold->freed - freed_now;
return max(avail_used, freed_used);
}
/*
* This is called while holding the commit and returns once the commit
* is successfully written. Many holders can all wait for all holders
@@ -938,22 +961,24 @@ static int find_log_trees_item(struct super_block *sb,
}
/*
* Find the next log_trees item from the key. Fills the caller's log_trees and sets
* the key past the returned log_trees for iteration. Returns 0 when done, > 0 for each
* item, and -errno on fatal errors.
* Find the log_trees item with the greatest nr for each rid. Fills the
* caller's log_trees and sets the key before the returned log_trees for
* the next iteration. Returns 0 when done, > 0 for each item, and
* -errno on fatal errors.
*/
static int for_each_lt(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
static int for_each_rid_last_lt(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
{
SCOUTFS_BTREE_ITEM_REF(iref);
int ret;
ret = scoutfs_btree_next(sb, root, key, &iref);
ret = scoutfs_btree_prev(sb, root, key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
memcpy(lt, iref.val, iref.val_len);
*key = *iref.key;
scoutfs_key_inc(key);
key->sklt_nr = 0;
scoutfs_key_dec(key);
ret = 1;
} else {
ret = -EIO;
@@ -1048,21 +1073,13 @@ static int next_log_merge_item(struct super_block *sb,
* abandoned log btree finalized. If it takes too long each client has
a chance to make forward progress before being asked to commit again.
*
* We're waiting on heavy state that is protected by mutexes and
* transaction machinery. It's tricky to recreate that state for
* lightweight condition tests that don't change task state. Instead of
* trying to get that right, particularly as we unwind after success or
* after timeouts, waiters use an unsatisfying poll. Short enough to
* not add terrible latency, given how heavy and infrequent this already
* is, and long enough to not melt the cpu. This could be tuned if it
* becomes a problem.
*
* This can end up finalizing a new empty log btree if a new mount
* happens to arrive at just the right time. That's fine, merging will
* ignore and tear down the empty input.
*/
#define FINALIZE_POLL_MS (11)
#define FINALIZE_TIMEOUT_MS (MSEC_PER_SEC / 2)
#define FINALIZE_POLL_MIN_DELAY_MS 5U
#define FINALIZE_POLL_MAX_DELAY_MS 100U
#define FINALIZE_POLL_DELAY_GROWTH_PCT 150U
static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_log_trees *lt,
u64 rid, struct commit_hold *hold)
{
@@ -1070,8 +1087,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_log_merge_status stat;
struct scoutfs_log_merge_range rng;
struct scoutfs_mount_options opts;
struct scoutfs_log_trees each_lt;
struct scoutfs_log_trees fin;
unsigned int delay_ms;
unsigned long timeo;
bool saw_finalized;
bool others_active;
@@ -1079,10 +1098,14 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
bool ours_visible;
struct scoutfs_key key;
char *err_str = NULL;
ktime_t start;
int ret;
int err;
timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS);
scoutfs_options_read(sb, &opts);
timeo = jiffies + msecs_to_jiffies(opts.log_merge_wait_timeout_ms);
delay_ms = FINALIZE_POLL_MIN_DELAY_MS;
start = ktime_get_raw();
for (;;) {
/* nothing to do if there's already a merge in flight */
@@ -1099,8 +1122,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
saw_finalized = false;
others_active = false;
ours_visible = false;
scoutfs_key_init_log_trees(&key, 0, 0);
while ((ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
trace_scoutfs_server_finalize_items(sb, rid, le64_to_cpu(each_lt.rid),
le64_to_cpu(each_lt.nr),
le64_to_cpu(each_lt.flags),
le64_to_cpu(each_lt.get_trans_seq));
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED))
saw_finalized = true;
@@ -1125,6 +1153,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
finalize_ours = (lt->item_root.height > 2) ||
(le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW);
trace_scoutfs_server_finalize_decision(sb, rid, saw_finalized, others_active,
ours_visible, finalize_ours, delay_ms,
server->finalize_sent_seq);
/* done if we're not finalizing and there's no finalized */
if (!finalize_ours && !saw_finalized) {
ret = 0;
@@ -1132,12 +1164,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
}
/* send sync requests soon to give time to commit */
scoutfs_key_init_log_trees(&key, 0, 0);
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while (others_active &&
(ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
(ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
(le64_to_cpu(each_lt.rid) == rid))
(le64_to_cpu(each_lt.rid) == rid) ||
(le64_to_cpu(each_lt.get_trans_seq) <= server->finalize_sent_seq))
continue;
ret = scoutfs_net_submit_request_node(sb, server->conn,
@@ -1157,6 +1190,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
break;
}
server->finalize_sent_seq = scoutfs_server_seq(sb);
/* Finalize ours if it's visible to others */
if (ours_visible) {
fin = *lt;
@@ -1194,13 +1229,16 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
if (ret < 0)
err_str = "applying commit before waiting for finalized";
msleep(FINALIZE_POLL_MS);
msleep(delay_ms);
delay_ms = min(delay_ms * FINALIZE_POLL_DELAY_GROWTH_PCT / 100,
FINALIZE_POLL_MAX_DELAY_MS);
server_hold_commit(sb, hold);
mutex_lock(&server->logs_mutex);
/* done if we timed out */
if (time_after(jiffies, timeo)) {
scoutfs_inc_counter(sb, log_merge_wait_timeout);
ret = 0;
break;
}
@@ -1783,43 +1821,29 @@ out:
* Give the caller the last seq before outstanding client commits. All
* seqs up to and including this are stable, new client transactions can
* only have greater seqs.
*
* For each rid, only its greatest log trees nr can be an open commit.
* We look at the last log_trees item for each client rid and record its
* trans seq if it hasn't been committed.
*/
static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
{
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
DECLARE_SERVER_INFO(sb, server);
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_log_trees *lt;
struct scoutfs_log_trees lt;
struct scoutfs_key key;
u64 last_seq = 0;
int ret;
last_seq = scoutfs_server_seq(sb) - 1;
scoutfs_key_init_log_trees(&key, 0, 0);
mutex_lock(&server->logs_mutex);
for (;; scoutfs_key_inc(&key)) {
ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(*lt)) {
lt = iref.val;
if ((le64_to_cpu(lt->get_trans_seq) >
le64_to_cpu(lt->commit_trans_seq)) &&
le64_to_cpu(lt->get_trans_seq) <= last_seq) {
last_seq = le64_to_cpu(lt->get_trans_seq) - 1;
}
key = *iref.key;
} else {
ret = -EIO;
}
scoutfs_btree_put_iref(&iref);
}
if (ret < 0) {
if (ret == -ENOENT) {
ret = 0;
break;
}
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
if ((le64_to_cpu(lt.get_trans_seq) > le64_to_cpu(lt.commit_trans_seq)) &&
le64_to_cpu(lt.get_trans_seq) <= last_seq) {
last_seq = le64_to_cpu(lt.get_trans_seq) - 1;
}
}
@@ -2471,9 +2495,11 @@ static void server_log_merge_free_work(struct work_struct *work)
while (!server_is_stopping(server)) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
commit = true;
if (!commit) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
commit = true;
}
ret = next_log_merge_item(sb, &super->log_merge,
SCOUTFS_LOG_MERGE_FREEING_ZONE,
@@ -2520,12 +2546,14 @@ static void server_log_merge_free_work(struct work_struct *work)
/* freed blocks are in allocator, we *have* to update fr */
BUG_ON(ret < 0);
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
commit = false;
if (ret < 0) {
err_str = "looping commit del/upd freeing item";
break;
if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
commit = false;
if (ret < 0) {
err_str = "looping commit del/upd freeing item";
break;
}
}
}
@@ -4298,6 +4326,7 @@ static void scoutfs_server_worker(struct work_struct *work)
scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));
scoutfs_block_writer_init(sb, &server->wri);
server->finalize_sent_seq = 0;
/* first make sure no other servers are still running */
ret = scoutfs_quorum_fence_leaders(sb, &server->qconf, server->term);

View File

@@ -12,7 +12,8 @@ BIN := src/createmany \
src/find_xattrs \
src/create_xattr_loop \
src/fragmented_data_extents \
src/o_tmpfile_umask
src/o_tmpfile_umask \
src/parallel_restore
DEPS := $(wildcard src/*.d)
@@ -22,8 +23,11 @@ ifneq ($(DEPS),)
-include $(DEPS)
endif
src/parallel_restore_cflags := ../utils/src/scoutfs_parallel_restore.a -lm
$(BIN): %: %.c Makefile
gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@
gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $($(@)_cflags)
.PHONY: clean
clean:

View File

@@ -0,0 +1,782 @@
#define _GNU_SOURCE /* O_DIRECT */
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/xattr.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <time.h>
#include <sys/prctl.h>
#include <signal.h>
#include <sys/socket.h>
#include "../../utils/src/sparse.h"
#include "../../utils/src/util.h"
#include "../../utils/src/list.h"
#include "../../utils/src/parse.h"
#include "../../kmod/src/format.h"
#include "../../utils/src/parallel_restore.h"
/*
* XXX:
* - add a nice description of what's going on
* - mention allocator contention
* - test child process dying handling
* - root dir entry name length is wrong
*/
#define ERRF " errno %d (%s)"
#define ERRA errno, strerror(errno)
#define error_exit(cond, fmt, args...) \
do { \
if (cond) { \
printf("error: "fmt"\n", ##args); \
exit(1); \
} \
} while (0)
#define dprintf(fmt, args...) \
do { \
if (0) \
printf(fmt, ##args); \
} while (0)
#define REG_MODE (S_IFREG | 0644)
#define DIR_MODE (S_IFDIR | 0755)
struct opts {
unsigned long long buf_size;
unsigned long long write_batch;
unsigned long long low_dirs;
unsigned long long high_dirs;
unsigned long long low_files;
unsigned long long high_files;
char *meta_path;
unsigned long long total_files;
bool read_only;
unsigned long long seed;
unsigned long long nr_writers;
};
static void usage(void)
{
printf("usage:\n"
" -b NR | threads write blocks in batches files (100000)\n"
" -d LOW:HIGH | range of subdirs per directory (5:10)\n"
" -f LOW:HIGH | range of files per directory (10:20)\n"
" -m PATH | path to metadata device\n"
" -n NR | total number of files to create (100)\n"
" -r | read-only, all work except writing, measure cpu cost\n"
" -s NR | randomization seed (random)\n"
" -w NR | number of writing processes to fork (online cpus)\n"
);
}
static size_t write_bufs(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
void *buf, size_t buf_size, int dev_fd)
{
size_t total = 0;
size_t count;
off_t off;
int ret;
do {
ret = scoutfs_parallel_restore_write_buf(wri, buf, buf_size, &off, &count);
error_exit(ret, "write buf %d", ret);
if (count > 0) {
if (!opts->read_only)
ret = pwrite(dev_fd, buf, count, off);
else
ret = count;
error_exit(ret != count, "pwrite count %zu ret %d", count, ret);
total += ret;
}
} while (count > 0);
return total;
}
struct gen_inode {
struct scoutfs_parallel_restore_inode inode;
struct scoutfs_parallel_restore_xattr **xattrs;
u64 nr_xattrs;
struct scoutfs_parallel_restore_entry **entries;
u64 nr_files;
u64 nr_entries;
};
static void free_gino(struct gen_inode *gino)
{
u64 i;
if (gino) {
if (gino->entries) {
for (i = 0; i < gino->nr_entries; i++)
free(gino->entries[i]);
free(gino->entries);
}
if (gino->xattrs) {
for (i = 0; i < gino->nr_xattrs; i++)
free(gino->xattrs[i]);
free(gino->xattrs);
}
free(gino);
}
}
static struct scoutfs_parallel_restore_xattr *
generate_xattr(struct opts *opts, u64 ino, u64 pos, char *name, int name_len, void *value,
int value_len)
{
struct scoutfs_parallel_restore_xattr *xattr;
xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + name_len + value_len);
error_exit(!xattr, "error allocating generated xattr");
*xattr = (struct scoutfs_parallel_restore_xattr) {
.ino = ino,
.pos = pos,
.name_len = name_len,
.value_len = value_len,
};
xattr->name = (void *)(xattr + 1);
xattr->value = (void *)(xattr->name + name_len);
memcpy(xattr->name, name, name_len);
if (value_len)
memcpy(xattr->value, value, value_len);
return xattr;
}
static struct gen_inode *generate_inode(struct opts *opts, u64 ino, mode_t mode)
{
struct gen_inode *gino;
struct timespec now;
clock_gettime(CLOCK_REALTIME, &now);
gino = calloc(1, sizeof(struct gen_inode));
error_exit(!gino, "failure allocating generated inode");
gino->inode = (struct scoutfs_parallel_restore_inode) {
.ino = ino,
.mode = mode,
.atime = now,
.ctime = now,
.mtime = now,
.crtime = now,
};
/*
* hacky creation of a bunch of xattrs for now.
*/
if ((mode & S_IFMT) == S_IFREG) {
#define NV(n, v) { n, sizeof(n) - 1, v, sizeof(v) - 1, }
struct name_val {
char *name;
int len;
char *value;
int value_len;
} nv[] = {
NV("scoutfs.hide.totl.acct.8314611887310466424.2.0", "1"),
NV("scoutfs.hide.srch.sam_vol_E01001L6_4", ""),
NV("scoutfs.hide.sam_reqcopies", ""),
NV("scoutfs.hide.sam_copy_2", ""),
NV("scoutfs.hide.totl.acct.F01030L6.8314611887310466424.7.30", "1"),
NV("scoutfs.hide.sam_copy_1", ""),
NV("scoutfs.hide.srch.sam_vol_F01030L6_4", ""),
NV("scoutfs.hide.srch.sam_release_cand", ""),
NV("scoutfs.hide.sam_restime", ""),
NV("scoutfs.hide.sam_uuid", ""),
NV("scoutfs.hide.totl.acct.8314611887310466424.3.0", "1"),
NV("scoutfs.hide.srch.sam_vol_F01030L6", ""),
NV("scoutfs.hide.srch.sam_uuid_865939b7-24d6-472f-b85c-7ce7afeb813a", ""),
NV("scoutfs.hide.srch.sam_vol_E01001L6", ""),
NV("scoutfs.hide.totl.acct.E01001L6.8314611887310466424.7.1", "1"),
NV("scoutfs.hide.totl.acct.8314611887310466424.4.0", "1"),
NV("scoutfs.hide.totl.acct.8314611887310466424.11.0", "1"),
NV("scoutfs.hide.totl.acct.8314611887310466424.1.0", "1"),
};
unsigned int nr = array_size(nv);
int i;
gino->xattrs = calloc(nr, sizeof(struct scoutfs_parallel_restore_xattr *));
for (i = 0; i < nr; i++)
gino->xattrs[i] = generate_xattr(opts, ino, i, nv[i].name, nv[i].len,
nv[i].value, nv[i].value_len);
gino->nr_xattrs = nr;
gino->inode.nr_xattrs = nr;
}
return gino;
}
static struct scoutfs_parallel_restore_entry *
generate_entry(struct opts *opts, char *prefix, u64 nr, u64 dir_ino, u64 pos, u64 ino, mode_t mode)
{
struct scoutfs_parallel_restore_entry *entry;
char buf[PATH_MAX];
int bytes;
bytes = snprintf(buf, sizeof(buf), "%s-%llu", prefix, nr);
entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + bytes);
error_exit(!entry, "error allocating generated entry");
*entry = (struct scoutfs_parallel_restore_entry) {
.dir_ino = dir_ino,
.pos = pos,
.ino = ino,
.mode = mode,
.name = (void *)(entry + 1),
.name_len = bytes,
};
memcpy(entry->name, buf, bytes);
return entry;
}
static u64 random64(void)
{
return ((u64)lrand48() << 32) | lrand48();
}
static u64 random_range(u64 low, u64 high)
{
return low + (random64() % (high - low + 1));
}
static struct gen_inode *generate_dir(struct opts *opts, u64 dir_ino, u64 ino_start, u64 ino_len,
bool no_dirs)
{
struct scoutfs_parallel_restore_entry *entry;
struct gen_inode *gino;
u64 nr_entries;
u64 nr_files;
u64 nr_dirs;
u64 ino;
char *prefix;
mode_t mode;
u64 i;
nr_dirs = no_dirs ? 0 : random_range(opts->low_dirs, opts->high_dirs);
nr_files = random_range(opts->low_files, opts->high_files);
if (1 + nr_dirs + nr_files > ino_len) {
nr_dirs = no_dirs ? 0 : (ino_len - 1) / 2;
nr_files = (ino_len - 1) - nr_dirs;
}
nr_entries = nr_dirs + nr_files;
gino = generate_inode(opts, dir_ino, DIR_MODE);
error_exit(!gino, "error allocating generated inode");
gino->inode.nr_subdirs = nr_dirs;
gino->nr_files = nr_files;
if (nr_entries) {
gino->entries = calloc(nr_entries, sizeof(struct scoutfs_parallel_restore_entry *));
error_exit(!gino->entries, "error allocating generated inode entries");
gino->nr_entries = nr_entries;
}
mode = DIR_MODE;
prefix = "dir";
for (i = 0; i < nr_entries; i++) {
if (i == nr_dirs) {
mode = REG_MODE;
prefix = "file";
}
ino = ino_start + i;
entry = generate_entry(opts, prefix, ino, gino->inode.ino,
SCOUTFS_DIRENT_FIRST_POS + i, ino, mode);
gino->entries[i] = entry;
gino->inode.total_entry_name_bytes += entry->name_len;
}
return gino;
}
/*
* Restore a generated inode. If it's a directory then we also restore
* all its entries. The caller is going to descend into subdir entries and generate
* those dir inodes. We have to generate and restore all non-dir inodes referenced
* by this inode's entries.
*/
static void restore_inode(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
struct gen_inode *gino)
{
struct gen_inode *nondir;
int ret;
u64 i;
ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode);
error_exit(ret, "thread add root inode %d", ret);
for (i = 0; i < gino->nr_entries; i++) {
ret = scoutfs_parallel_restore_add_entry(wri, gino->entries[i]);
error_exit(ret, "thread add entry %d", ret);
/* caller only needs subdir entries, generate and free others */
if ((gino->entries[i]->mode & S_IFMT) != S_IFDIR) {
nondir = generate_inode(opts, gino->entries[i]->ino,
gino->entries[i]->mode);
restore_inode(opts, wri, nondir);
free_gino(nondir);
free(gino->entries[i]);
if (i != gino->nr_entries - 1)
gino->entries[i] = gino->entries[gino->nr_entries - 1];
gino->nr_entries--;
gino->nr_files--;
i--;
}
}
for (i = 0; i < gino->nr_xattrs; i++) {
ret = scoutfs_parallel_restore_add_xattr(wri, gino->xattrs[i]);
error_exit(ret, "thread add xattr %d", ret);
}
}
struct writer_args {
struct list_head head;
int dev_fd;
int pair_fd;
struct scoutfs_parallel_restore_slice slice;
u64 writer_nr;
u64 dir_height;
u64 ino_start;
u64 ino_len;
};
struct write_result {
struct scoutfs_parallel_restore_progress prog;
struct scoutfs_parallel_restore_slice slice;
__le64 files_created;
__le64 bytes_written;
};
static void write_bufs_and_send(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
void *buf, size_t buf_size, int dev_fd,
struct write_result *res, bool get_slice, int pair_fd)
{
size_t total;
int ret;
total = write_bufs(opts, wri, buf, buf_size, dev_fd);
le64_add_cpu(&res->bytes_written, total);
ret = scoutfs_parallel_restore_get_progress(wri, &res->prog);
error_exit(ret, "get prog %d", ret);
if (get_slice) {
ret = scoutfs_parallel_restore_get_slice(wri, &res->slice);
error_exit(ret, "thread get slice %d", ret);
}
ret = write(pair_fd, res, sizeof(struct write_result));
error_exit(ret != sizeof(struct write_result), "result send error");
memset(res, 0, sizeof(struct write_result));
}
/*
* Calculate the number of bytes in toplevel "dir-%llu" entry names for the given
* number of writers.
*/
static u64 topdir_entry_bytes(u64 nr_writers)
{
u64 bytes = (3 + 1) * nr_writers;
u64 limit;
u64 done;
u64 wid;
u64 nr;
for (done = 0, wid = 1, limit = 10; done < nr_writers; done += nr, wid++, limit *= 10) {
nr = min(limit - done, nr_writers - done);
bytes += nr * wid;
}
return bytes;
}
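/*
 * For example, topdir_entry_bytes(12) counts (3 + 1) * 12 = 48 bytes of
 * prefix and separator, plus 1 byte for each of the ten 1-digit writer
 * numbers (0-9) and 2 bytes for each of the two 2-digit numbers (10-11),
 * for a total of 48 + 10 + 4 = 62.
 */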
struct dir_pos {
struct gen_inode *gino;
u64 pos;
};
static void writer_proc(struct opts *opts, struct writer_args *args)
{
struct scoutfs_parallel_restore_writer *wri = NULL;
struct scoutfs_parallel_restore_entry *entry;
struct dir_pos *dirs = NULL;
struct write_result res;
struct gen_inode *gino;
void *buf = NULL;
u64 level;
u64 ino;
int ret;
memset(&res, 0, sizeof(res));
dirs = calloc(args->dir_height, sizeof(struct dir_pos));
error_exit(errno, "error allocating parent dirs "ERRF, ERRA);
errno = posix_memalign((void **)&buf, 4096, opts->buf_size);
error_exit(errno, "error allocating block buf "ERRF, ERRA);
ret = scoutfs_parallel_restore_create_writer(&wri);
error_exit(ret, "create writer %d", ret);
ret = scoutfs_parallel_restore_add_slice(wri, &args->slice);
error_exit(ret, "add slice %d", ret);
/* writer 0 creates the root dir */
if (args->writer_nr == 0) {
gino = generate_inode(opts, SCOUTFS_ROOT_INO, DIR_MODE);
gino->inode.nr_subdirs = opts->nr_writers;
gino->inode.total_entry_name_bytes = topdir_entry_bytes(opts->nr_writers);
ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode);
error_exit(ret, "thread add root inode %d", ret);
free_gino(gino);
}
/* create root entry for our top level dir */
ino = args->ino_start++;
args->ino_len--;
entry = generate_entry(opts, "top", args->writer_nr,
SCOUTFS_ROOT_INO, SCOUTFS_DIRENT_FIRST_POS + args->writer_nr,
ino, DIR_MODE);
ret = scoutfs_parallel_restore_add_entry(wri, entry);
error_exit(ret, "thread top entry %d", ret);
free(entry);
level = args->dir_height - 1;
while (args->ino_len > 0 && level < args->dir_height) {
gino = dirs[level].gino;
/* generate and restore if we follow entries */
if (!gino) {
gino = generate_dir(opts, ino, args->ino_start, args->ino_len, level == 0);
args->ino_start += gino->nr_entries;
args->ino_len -= gino->nr_entries;
le64_add_cpu(&res.files_created, gino->nr_files);
restore_inode(opts, wri, gino);
dirs[level].gino = gino;
}
if (dirs[level].pos == gino->nr_entries) {
/* ascend if we're done with this dir */
dirs[level].gino = NULL;
dirs[level].pos = 0;
free_gino(gino);
level++;
} else {
/* otherwise descend into subdir entry */
ino = gino->entries[dirs[level].pos]->ino;
dirs[level].pos++;
level--;
}
/* do a partial write at batch intervals when there's still more to do */
if (le64_to_cpu(res.files_created) >= opts->write_batch && args->ino_len > 0)
write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd,
&res, false, args->pair_fd);
}
write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd,
&res, true, args->pair_fd);
scoutfs_parallel_restore_destroy_writer(&wri);
free(dirs);
free(buf);
}
static void fork_writer(struct opts *opts, struct writer_args *args)
{
pid_t parent = getpid();
pid_t pid;
int ret;
pid = fork();
error_exit(pid == -1, "fork error");
if (pid != 0)
return;
ret = prctl(PR_SET_PDEATHSIG, SIGHUP);
error_exit(ret < 0, "failed to set parent death sig");
printf("pid %u getpid() %u parent %u getppid() %u\n",
pid, getpid(), parent, getppid());
error_exit(getppid() != parent, "child parent already changed");
writer_proc(opts, args);
exit(0);
}
static int do_restore(struct opts *opts)
{
struct scoutfs_parallel_restore_writer *wri = NULL;
struct scoutfs_parallel_restore_slice *slices = NULL;
struct scoutfs_super_block *super = NULL;
struct write_result res;
struct writer_args *args;
struct timespec begin;
struct timespec end;
LIST_HEAD(writers);
u64 next_ino;
u64 ino_per;
u64 avg_dirs;
u64 avg_files;
u64 dir_height;
u64 tot_files;
u64 tot_bytes;
int pair[2] = {-1, -1};
float secs;
void *buf = NULL;
int dev_fd = -1;
int ret;
int i;
ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair);
error_exit(ret, "socketpair error "ERRF, ERRA);
dev_fd = open(opts->meta_path, O_DIRECT | (opts->read_only ? O_RDONLY : (O_RDWR|O_EXCL)));
error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA);
errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?:
posix_memalign((void **)&buf, 4096, opts->buf_size);
error_exit(errno, "error allocating block bufs "ERRF, ERRA);
ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret);
ret = scoutfs_parallel_restore_create_writer(&wri);
error_exit(ret, "create writer %d", ret);
ret = scoutfs_parallel_restore_import_super(wri, super);
error_exit(ret, "import super %d", ret);
slices = calloc(1 + opts->nr_writers, sizeof(struct scoutfs_parallel_restore_slice));
error_exit(!slices, "alloc slices");
scoutfs_parallel_restore_init_slices(wri, slices, 1 + opts->nr_writers);
ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]);
error_exit(ret, "add slices[0] %d", ret);
next_ino = (SCOUTFS_ROOT_INO | SCOUTFS_LOCK_INODE_GROUP_MASK) + 1;
ino_per = opts->total_files / opts->nr_writers;
avg_dirs = (opts->low_dirs + opts->high_dirs) / 2;
avg_files = (opts->low_files + opts->high_files) / 2;
dir_height = 1;
tot_files = avg_files * opts->nr_writers;
while (tot_files < opts->total_files) {
dir_height++;
tot_files *= avg_dirs;
}
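/*
 * For example, the default -d 5:10 -f 10:20 gives avg_dirs = 7 and
 * avg_files = 15; with, say, 8 writers tot_files starts at 120 and grows
 * 840, 5880, 41160, 288120, 2016840, so -n 1000000 settles on a
 * dir_height of 6.
 */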
dprintf("height %llu tot %llu total %llu\n", dir_height, tot_files, opts->total_files);
clock_gettime(CLOCK_MONOTONIC_RAW, &begin);
/* start each writing process */
for (i = 0; i < opts->nr_writers; i++) {
args = calloc(1, sizeof(struct writer_args));
error_exit(!args, "alloc writer args");
args->dev_fd = dev_fd;
args->pair_fd = pair[1];
args->slice = slices[1 + i];
args->writer_nr = i;
args->dir_height = dir_height;
args->ino_start = next_ino;
args->ino_len = ino_per;
list_add_tail(&args->head, &writers);
next_ino += ino_per;
fork_writer(opts, args);
}
/* read results and watch for writers to finish */
tot_files = 0;
tot_bytes = 0;
i = 0;
while (i < opts->nr_writers) {
ret = read(pair[0], &res, sizeof(struct write_result));
error_exit(ret != sizeof(struct write_result), "result read error");
ret = scoutfs_parallel_restore_add_progress(wri, &res.prog);
error_exit(ret, "add thr prog %d", ret);
if (res.slice.meta_len != 0) {
ret = scoutfs_parallel_restore_add_slice(wri, &res.slice);
error_exit(ret, "add thr slice %d", ret);
i++;
}
tot_files += le64_to_cpu(res.files_created);
tot_bytes += le64_to_cpu(res.bytes_written);
}
tot_bytes += write_bufs(opts, wri, buf, opts->buf_size, dev_fd);
ret = scoutfs_parallel_restore_export_super(wri, super);
error_exit(ret, "update super %d", ret);
if (!opts->read_only) {
ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret);
}
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
scoutfs_parallel_restore_destroy_writer(&wri);
secs = ((float)end.tv_sec + ((float)end.tv_nsec/NSEC_PER_SEC)) -
((float)begin.tv_sec + ((float)begin.tv_nsec/NSEC_PER_SEC));
printf("created %llu files in %llu bytes and %f secs => %f bytes/file, %f files/sec\n",
tot_files, tot_bytes, secs,
(float)tot_bytes / tot_files, (float)tot_files / secs);
if (dev_fd >= 0)
close(dev_fd);
if (pair[0] >= 0)
close(pair[0]);
if (pair[1] >= 0)
close(pair[1]);
free(super);
free(slices);
free(buf);
return 0;
}
static int parse_low_high(char *str, u64 *low_ret, u64 *high_ret)
{
char *sep;
int ret = 0;
sep = index(str, ':');
if (sep) {
*sep = '\0';
ret = parse_u64(sep + 1, high_ret);
}
if (ret == 0)
ret = parse_u64(str, low_ret);
if (sep)
*sep = ':';
return ret;
}
int main(int argc, char **argv)
{
struct opts opts = {
.buf_size = (32 * 1024 * 1024),
.write_batch = 1000000,
.low_dirs = 5,
.high_dirs = 10,
.low_files = 10,
.high_files = 20,
.total_files = 100,
};
int ret;
int c;
opts.seed = random64();
opts.nr_writers = sysconf(_SC_NPROCESSORS_ONLN);
while ((c = getopt(argc, argv, "b:d:f:m:n:rs:w:")) != -1) {
switch(c) {
case 'b':
ret = parse_u64(optarg, &opts.write_batch);
error_exit(ret, "error parsing -b '%s'\n", optarg);
error_exit(opts.write_batch == 0, "-b can't be 0");
break;
case 'd':
ret = parse_low_high(optarg, &opts.low_dirs, &opts.high_dirs);
error_exit(ret, "error parsing -d '%s'\n", optarg);
break;
case 'f':
ret = parse_low_high(optarg, &opts.low_files, &opts.high_files);
error_exit(ret, "error parsing -f '%s'\n", optarg);
break;
case 'm':
opts.meta_path = strdup(optarg);
break;
case 'n':
ret = parse_u64(optarg, &opts.total_files);
error_exit(ret, "error parsing -n '%s'\n", optarg);
break;
case 'r':
opts.read_only = true;
break;
case 's':
ret = parse_u64(optarg, &opts.seed);
error_exit(ret, "error parsing -s '%s'\n", optarg);
break;
case 'w':
ret = parse_u64(optarg, &opts.nr_writers);
error_exit(ret, "error parsing -w '%s'\n", optarg);
break;
case '?':
printf("Unknown option '%c'\n", optopt);
usage();
exit(1);
}
}
error_exit(opts.low_dirs > opts.high_dirs, "LOW > HIGH in -d %llu:%llu",
opts.low_dirs, opts.high_dirs);
error_exit(opts.low_files > opts.high_files, "LOW > HIGH in -f %llu:%llu",
opts.low_files, opts.high_files);
error_exit(!opts.meta_path, "must specify metadata device path with -m");
printf("recreate with: -d %llu:%llu -f %llu:%llu -n %llu -s %llu -w %llu\n",
opts.low_dirs, opts.high_dirs, opts.low_files, opts.high_files,
opts.total_files, opts.seed, opts.nr_writers);
ret = do_restore(&opts);
free(opts.meta_path);
return ret == 0 ? 0 : 1;
}


@@ -18,7 +18,9 @@ BIN := src/scoutfs
OBJ := $(patsubst %.c,%.o,$(wildcard src/*.c))
DEPS := $(wildcard */*.d)
all: $(BIN)
AR := src/scoutfs_parallel_restore.a
all: $(BIN) $(AR)
ifneq ($(DEPS),)
-include $(DEPS)
@@ -36,6 +38,10 @@ $(BIN): $(OBJ)
$(QU) [BIN $@]
$(VE)gcc -o $@ $^ -luuid -lm -lcrypto -lblkid
$(AR): $(OBJ)
$(QU) [AR $@]
$(VE)ar rcs $@ $^
%.o %.d: %.c Makefile sparse.sh
$(QU) [CC $<]
$(VE)gcc $(CFLAGS) -MD -MP -MF $*.d -c $< -o $*.o


@@ -55,6 +55,19 @@ with initial sparse regions (perhaps by multiple threads writing to
different regions) and wasted space isn't an issue (perhaps because the
file population contains few small files).
.TP
.B log_merge_wait_timeout_ms=<number>
This option sets the amount of time, in milliseconds, that log merge
creation can wait before timing out. This setting is per-mount, only
changes the behavior of that mount, and only affects the server when it
is running in that mount.
.sp
This determines how long mounts may take to synchronize committing their
log trees to create a log merge operation. Setting it too high can create
long latencies when a mount takes a long time to commit its log. Setting
it too low can result in the creation of excessive numbers of log trees
that are never merged. The default is 500, and the value cannot be less
than 100 or greater than 60000.
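.sp
For example,
.B log_merge_wait_timeout_ms=2000
allows mounts up to two seconds to finish committing their log trees
before log merge creation times out.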
.TP
.B metadev_path=<device>
The metadev_path option specifies the path to the block device that
contains the filesystem's metadata.


@@ -54,6 +54,8 @@ cp man/*.8.gz $RPM_BUILD_ROOT%{_mandir}/man8/.
install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs
install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h
install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h
install -m 644 -D src/parallel_restore.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/parallel_restore.h
install -m 644 -D src/scoutfs_parallel_restore.a $RPM_BUILD_ROOT%{_libdir}/scoutfs/libscoutfs_parallel_restore.a
install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced
install -m 644 -D fenced/scoutfs-fenced.service $RPM_BUILD_ROOT%{_unitdir}/scoutfs-fenced.service
install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/scoutfs/scoutfs-fenced.conf.example
@@ -70,6 +72,7 @@ install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdi
%files -n scoutfs-devel
%defattr(644,root,root,755)
%{_includedir}/scoutfs
%{_libdir}/scoutfs
%clean
rm -rf %{buildroot}

utils/src/bloom.c (new file, 20 lines)

@@ -0,0 +1,20 @@
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "hash.h"
#include "bloom.h"
void calc_bloom_nrs(struct scoutfs_key *key, unsigned int *nrs)
{
u64 hash;
int i;
hash = scoutfs_hash64(key, sizeof(struct scoutfs_key));
for (i = 0; i < SCOUTFS_FOREST_BLOOM_NRS; i++) {
nrs[i] = (u32)hash % SCOUTFS_FOREST_BLOOM_BITS;
hash >>= SCOUTFS_FOREST_BLOOM_FUNC_BITS;
}
}

utils/src/bloom.h (new file, 6 lines)

@@ -0,0 +1,6 @@
#ifndef _BLOOM_H_
#define _BLOOM_H_
void calc_bloom_nrs(struct scoutfs_key *key, unsigned int *nrs);
#endif


@@ -8,7 +8,7 @@
#include "leaf_item_hash.h"
#include "btree.h"
static void init_block(struct scoutfs_btree_block *bt, int level)
void btree_init_block(struct scoutfs_btree_block *bt, int level)
{
int free;
@@ -33,7 +33,7 @@ void btree_init_root_single(struct scoutfs_btree_root *root,
memset(bt, 0, SCOUTFS_BLOCK_LG_SIZE);
init_block(bt, 0);
btree_init_block(bt, 0);
}
static void *alloc_val(struct scoutfs_btree_block *bt, int len)


@@ -1,6 +1,7 @@
#ifndef _BTREE_H_
#define _BTREE_H_
void btree_init_block(struct scoutfs_btree_block *bt, int level);
void btree_init_root_single(struct scoutfs_btree_root *root,
struct scoutfs_btree_block *bt,
u64 seq, u64 blkno);


@@ -0,0 +1,24 @@
#ifndef _LK_RBTREE_WRAPPER_H_
#define _LK_RBTREE_WRAPPER_H_
/*
* We're using this lame hack to build and use the kernel's rbtree in
* userspace. We drop the kernel's rbtree*[ch] implementation in and
* use them with this wrapper. We only have to remove the kernel
* includes from the imported files.
*/
#include <stdbool.h>
#include "util.h"
#define rcu_assign_pointer(a, b) do { a = b; } while (0)
#define READ_ONCE(a) ({ a; })
#define WRITE_ONCE(a, b) do { a = b; } while (0)
#define unlikely(a) ({ a; })
#define EXPORT_SYMBOL(a) /* nop */
#include "rbtree_types.h"
#include "rbtree.h"
#include "rbtree_augmented.h"
#endif

utils/src/mode_types.c (new file, 24 lines)

@@ -0,0 +1,24 @@
#include <unistd.h>
#include <sys/stat.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "mode_types.h"
unsigned int mode_to_type(mode_t mode)
{
#define S_SHIFT 12
static unsigned char mode_types[S_IFMT >> S_SHIFT] = {
[S_IFIFO >> S_SHIFT] = SCOUTFS_DT_FIFO,
[S_IFCHR >> S_SHIFT] = SCOUTFS_DT_CHR,
[S_IFDIR >> S_SHIFT] = SCOUTFS_DT_DIR,
[S_IFBLK >> S_SHIFT] = SCOUTFS_DT_BLK,
[S_IFREG >> S_SHIFT] = SCOUTFS_DT_REG,
[S_IFLNK >> S_SHIFT] = SCOUTFS_DT_LNK,
[S_IFSOCK >> S_SHIFT] = SCOUTFS_DT_SOCK,
};
return mode_types[(mode & S_IFMT) >> S_SHIFT];
#undef S_SHIFT
}

utils/src/mode_types.h (new file, 6 lines)

@@ -0,0 +1,6 @@
#ifndef _MODE_TYPES_H_
#define _MODE_TYPES_H_
unsigned int mode_to_type(mode_t mode);
#endif

utils/src/name_hash.h (new file, 46 lines)

@@ -0,0 +1,46 @@
#ifndef _SCOUTFS_NAME_HASH_H_
#define _SCOUTFS_NAME_HASH_H_
#include "hash.h"
/*
* Test a bit number as though an array of bytes is a large len-bit
* big-endian value. nr 0 is the LSB of the final byte, nr (len - 1) is
* the MSB of the first byte.
*/
static int test_be_bytes_bit(int nr, const char *bytes, int len)
{
return bytes[(len - 1 - nr) >> 3] & (1 << (nr & 7));
}
/*
* Generate a 32bit "fingerprint" of the name by extracting 32 evenly
* distributed bits from the name. The intent is to have the sort order
* of the fingerprints reflect the memcmp() sort order of the names
* while mapping large names down to small fs keys.
*
* Names that are smaller than 32bits are biased towards the high bits
* of the fingerprint so that most significant bits of the fingerprints
* consistently reflect the initial characters of the names.
*/
static inline u32 dirent_name_fingerprint(const char *name, unsigned int name_len)
{
int name_bits = name_len * 8;
int skip = max(name_bits / 32, 1);
u32 fp = 0;
int f;
int n;
for (f = 31, n = name_bits - 1; f >= 0 && n >= 0; f--, n -= skip)
fp |= !!test_be_bytes_bit(n, name, name_bits) << f;
return fp;
}
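/*
 * For example, a one-byte name has name_bits = 8 and skip = 1, so its
 * eight bits land in fp bits 31..24 and the fingerprint is the byte value
 * shifted into the top byte: dirent_name_fingerprint("a", 1) ==
 * 0x61000000 and dirent_name_fingerprint("b", 1) == 0x62000000, which
 * preserves memcmp() ordering for these short names.
 */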
static inline u64 dirent_name_hash(const char *name, unsigned int name_len)
{
return scoutfs_hash32(name, name_len) |
((u64)dirent_name_fingerprint(name, name_len) << 32);
}
#endif

utils/src/parallel_restore.c (new file, 1871 lines; diff suppressed because it is too large)


@@ -0,0 +1,103 @@
#ifndef _SCOUTFS_PARALLEL_RESTORE_H_
#define _SCOUTFS_PARALLEL_RESTORE_H_
#include <errno.h>
struct scoutfs_parallel_restore_progress {
struct scoutfs_btree_root fs_items;
struct scoutfs_btree_root root_items;
struct scoutfs_srch_file sfl;
struct scoutfs_block_ref bloom_ref;
__le64 inode_count;
__le64 max_ino;
};
struct scoutfs_parallel_restore_slice {
__le64 fsid;
__le64 meta_start;
__le64 meta_len;
};
struct scoutfs_parallel_restore_entry {
u64 dir_ino;
u64 pos;
u64 ino;
mode_t mode;
char *name;
unsigned int name_len;
};
struct scoutfs_parallel_restore_xattr {
u64 ino;
u64 pos;
char *name;
unsigned int name_len;
void *value;
unsigned int value_len;
};
struct scoutfs_parallel_restore_inode {
/* all inodes */
u64 ino;
u64 nr_xattrs;
u32 uid;
u32 gid;
u32 mode;
u32 rdev;
struct timespec atime;
struct timespec ctime;
struct timespec mtime;
struct timespec crtime;
/* regular files */
u64 data_version;
u64 size;
bool offline;
/* only used for directories */
u64 nr_subdirs;
u64 total_entry_name_bytes;
/* only used for symlnks */
char *target;
unsigned int target_len; /* not including null terminator */
};
typedef __typeof__(EINVAL) spr_err_t;
struct scoutfs_parallel_restore_writer;
spr_err_t scoutfs_parallel_restore_create_writer(struct scoutfs_parallel_restore_writer **wrip);
void scoutfs_parallel_restore_destroy_writer(struct scoutfs_parallel_restore_writer **wrip);
spr_err_t scoutfs_parallel_restore_init_slices(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_slice *slices,
int nr);
spr_err_t scoutfs_parallel_restore_add_slice(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_slice *slice);
spr_err_t scoutfs_parallel_restore_get_slice(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_slice *slice);
spr_err_t scoutfs_parallel_restore_add_inode(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_inode *inode);
spr_err_t scoutfs_parallel_restore_add_entry(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_entry *entry);
spr_err_t scoutfs_parallel_restore_add_xattr(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_xattr *xattr);
spr_err_t scoutfs_parallel_restore_get_progress(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_progress *prog);
spr_err_t scoutfs_parallel_restore_add_progress(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_progress *prog);
spr_err_t scoutfs_parallel_restore_write_buf(struct scoutfs_parallel_restore_writer *wri,
void *buf, size_t len, off_t *off_ret,
size_t *count_ret);
spr_err_t scoutfs_parallel_restore_import_super(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_super_block *super);
spr_err_t scoutfs_parallel_restore_export_super(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_super_block *super);
#endif
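A minimal single-writer sketch of how these calls fit together, condensed from the do_restore() and writer_proc() flow in the test program above. The buffer size and the drain-until-empty loop shape are assumptions based on the test's defaults and the write_buf() signature; a real caller also reads the super block from the metadata device first, writes the exported super back, and splits slices and inode ranges across multiple writers.

#include <errno.h>
#include <stdlib.h>
#include <unistd.h>
#include "parallel_restore.h"

#define SKETCH_BUF_SIZE (32 * 1024 * 1024)	/* the test program's default buf_size */

/* restore a single inode into one slice and flush the built metadata blocks */
static spr_err_t restore_sketch(int dev_fd, struct scoutfs_super_block *super,
				struct scoutfs_parallel_restore_inode *inode)
{
	struct scoutfs_parallel_restore_writer *wri = NULL;
	struct scoutfs_parallel_restore_slice slice;
	spr_err_t err;
	size_t count;
	off_t off;
	void *buf;

	if (posix_memalign(&buf, 4096, SKETCH_BUF_SIZE))
		return ENOMEM;

	err = scoutfs_parallel_restore_create_writer(&wri) ?:
	      scoutfs_parallel_restore_import_super(wri, super) ?:
	      scoutfs_parallel_restore_init_slices(wri, &slice, 1) ?:
	      scoutfs_parallel_restore_add_slice(wri, &slice) ?:
	      scoutfs_parallel_restore_add_inode(wri, inode);

	/* drain built blocks to the metadata device until the writer is empty */
	while (!err) {
		err = scoutfs_parallel_restore_write_buf(wri, buf, SKETCH_BUF_SIZE, &off, &count);
		if (err || count == 0)
			break;
		if (pwrite(dev_fd, buf, count, off) != (ssize_t)count)
			err = EIO;
	}

	/* the caller is responsible for writing the exported super back to the device */
	if (!err)
		err = scoutfs_parallel_restore_export_super(wri, super);

	scoutfs_parallel_restore_destroy_writer(&wri);
	free(buf);
	return err;
}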

utils/src/rbtree.c (new file, 629 lines)

@@ -0,0 +1,629 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
(C) 2002 David Woodhouse <dwmw2@infradead.org>
(C) 2012 Michel Lespinasse <walken@google.com>
linux/lib/rbtree.c
*/
#include "lk_rbtree_wrapper.h"
/*
* red-black trees properties: https://en.wikipedia.org/wiki/Rbtree
*
* 1) A node is either red or black
* 2) The root is black
* 3) All leaves (NULL) are black
* 4) Both children of every red node are black
* 5) Every simple path from root to leaves contains the same number
* of black nodes.
*
* 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
* consecutive red nodes in a path and every red node is therefore followed by
* a black. So if B is the number of black nodes on every simple path (as per
* 5), then the longest possible path due to 4 is 2B.
*
* We shall indicate color with case, where black nodes are uppercase and red
* nodes will be lowercase. Unknown color nodes shall be drawn as red within
* parentheses and have some accompanying text comment.
*/
/*
* Notes on lockless lookups:
*
* All stores to the tree structure (rb_left and rb_right) must be done using
* WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
* tree structure as seen in program order.
*
* These two requirements will allow lockless iteration of the tree -- not
* correct iteration mind you, tree rotations are not atomic so a lookup might
* miss entire subtrees.
*
* But they do guarantee that any such traversal will only see valid elements
* and that it will indeed complete -- does not get stuck in a loop.
*
* It also guarantees that if the lookup returns an element it is the 'correct'
* one. But not returning an element does _NOT_ mean it's not present.
*
* NOTE:
*
* Stores to __rb_parent_color are not important for simple lookups so those
* are left undone as of now. Nor did I check for loops involving parent
* pointers.
*/
static inline void rb_set_black(struct rb_node *rb)
{
rb->__rb_parent_color |= RB_BLACK;
}
static inline struct rb_node *rb_red_parent(struct rb_node *red)
{
return (struct rb_node *)red->__rb_parent_color;
}
/*
* Helper function for rotations:
* - old's parent and color get assigned to new
* - old gets assigned new as a parent and 'color' as a color.
*/
static inline void
__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
struct rb_root *root, int color)
{
struct rb_node *parent = rb_parent(old);
new->__rb_parent_color = old->__rb_parent_color;
rb_set_parent_color(old, new, color);
__rb_change_child(old, new, parent, root);
}
static __always_inline void
__rb_insert(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
while (true) {
/*
* Loop invariant: node is red.
*/
if (unlikely(!parent)) {
/*
* The inserted node is root. Either this is the
* first node, or we recursed at Case 1 below and
* are no longer violating 4).
*/
rb_set_parent_color(node, NULL, RB_BLACK);
break;
}
/*
* If there is a black parent, we are done.
* Otherwise, take some corrective action as,
* per 4), we don't want a red root or two
* consecutive red nodes.
*/
if(rb_is_black(parent))
break;
gparent = rb_red_parent(parent);
tmp = gparent->rb_right;
if (parent != tmp) { /* parent == gparent->rb_left */
if (tmp && rb_is_red(tmp)) {
/*
* Case 1 - node's uncle is red (color flips).
*
* G g
* / \ / \
* p u --> P U
* / /
* n n
*
* However, since g's parent might be red, and
* 4) does not allow this, we need to recurse
* at g.
*/
rb_set_parent_color(tmp, gparent, RB_BLACK);
rb_set_parent_color(parent, gparent, RB_BLACK);
node = gparent;
parent = rb_parent(node);
rb_set_parent_color(node, parent, RB_RED);
continue;
}
tmp = parent->rb_right;
if (node == tmp) {
/*
* Case 2 - node's uncle is black and node is
* the parent's right child (left rotate at parent).
*
* G G
* / \ / \
* p U --> n U
* \ /
* n p
*
* This still leaves us in violation of 4), the
* continuation into Case 3 will fix that.
*/
tmp = node->rb_left;
WRITE_ONCE(parent->rb_right, tmp);
WRITE_ONCE(node->rb_left, parent);
if (tmp)
rb_set_parent_color(tmp, parent,
RB_BLACK);
rb_set_parent_color(parent, node, RB_RED);
augment_rotate(parent, node);
parent = node;
tmp = node->rb_right;
}
/*
* Case 3 - node's uncle is black and node is
* the parent's left child (right rotate at gparent).
*
* G P
* / \ / \
* p U --> n g
* / \
* n U
*/
WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
WRITE_ONCE(parent->rb_right, gparent);
if (tmp)
rb_set_parent_color(tmp, gparent, RB_BLACK);
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
augment_rotate(gparent, parent);
break;
} else {
tmp = gparent->rb_left;
if (tmp && rb_is_red(tmp)) {
/* Case 1 - color flips */
rb_set_parent_color(tmp, gparent, RB_BLACK);
rb_set_parent_color(parent, gparent, RB_BLACK);
node = gparent;
parent = rb_parent(node);
rb_set_parent_color(node, parent, RB_RED);
continue;
}
tmp = parent->rb_left;
if (node == tmp) {
/* Case 2 - right rotate at parent */
tmp = node->rb_right;
WRITE_ONCE(parent->rb_left, tmp);
WRITE_ONCE(node->rb_right, parent);
if (tmp)
rb_set_parent_color(tmp, parent,
RB_BLACK);
rb_set_parent_color(parent, node, RB_RED);
augment_rotate(parent, node);
parent = node;
tmp = node->rb_left;
}
/* Case 3 - left rotate at gparent */
WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
WRITE_ONCE(parent->rb_left, gparent);
if (tmp)
rb_set_parent_color(tmp, gparent, RB_BLACK);
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
augment_rotate(gparent, parent);
break;
}
}
}
/*
* Inline version for rb_erase() use - we want to be able to inline
* and eliminate the dummy_rotate callback there
*/
static __always_inline void
____rb_erase_color(struct rb_node *parent, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;
while (true) {
/*
* Loop invariants:
* - node is black (or NULL on first iteration)
* - node is not the root (parent is not NULL)
* - All leaf paths going through parent and node have a
* black node count that is 1 lower than other leaf paths.
*/
sibling = parent->rb_right;
if (node != sibling) { /* node == parent->rb_left */
if (rb_is_red(sibling)) {
/*
* Case 1 - left rotate at parent
*
* P S
* / \ / \
* N s --> p Sr
* / \ / \
* Sl Sr N Sl
*/
tmp1 = sibling->rb_left;
WRITE_ONCE(parent->rb_right, tmp1);
WRITE_ONCE(sibling->rb_left, parent);
rb_set_parent_color(tmp1, parent, RB_BLACK);
__rb_rotate_set_parents(parent, sibling, root,
RB_RED);
augment_rotate(parent, sibling);
sibling = tmp1;
}
tmp1 = sibling->rb_right;
if (!tmp1 || rb_is_black(tmp1)) {
tmp2 = sibling->rb_left;
if (!tmp2 || rb_is_black(tmp2)) {
/*
* Case 2 - sibling color flip
* (p could be either color here)
*
* (p) (p)
* / \ / \
* N S --> N s
* / \ / \
* Sl Sr Sl Sr
*
* This leaves us violating 5) which
* can be fixed by flipping p to black
* if it was red, or by recursing at p.
* p is red when coming from Case 1.
*/
rb_set_parent_color(sibling, parent,
RB_RED);
if (rb_is_red(parent))
rb_set_black(parent);
else {
node = parent;
parent = rb_parent(node);
if (parent)
continue;
}
break;
}
/*
* Case 3 - right rotate at sibling
* (p could be either color here)
*
* (p) (p)
* / \ / \
* N S --> N sl
* / \ \
* sl Sr S
* \
* Sr
*
* Note: p might be red, and then both
* p and sl are red after rotation(which
* breaks property 4). This is fixed in
* Case 4 (in __rb_rotate_set_parents()
* which set sl the color of p
* and set p RB_BLACK)
*
* (p) (sl)
* / \ / \
* N sl --> P S
* \ / \
* S N Sr
* \
* Sr
*/
tmp1 = tmp2->rb_right;
WRITE_ONCE(sibling->rb_left, tmp1);
WRITE_ONCE(tmp2->rb_right, sibling);
WRITE_ONCE(parent->rb_right, tmp2);
if (tmp1)
rb_set_parent_color(tmp1, sibling,
RB_BLACK);
augment_rotate(sibling, tmp2);
tmp1 = sibling;
sibling = tmp2;
}
/*
* Case 4 - left rotate at parent + color flips
* (p and sl could be either color here.
* After rotation, p becomes black, s acquires
* p's color, and sl keeps its color)
*
* (p) (s)
* / \ / \
* N S --> P Sr
* / \ / \
* (sl) sr N (sl)
*/
tmp2 = sibling->rb_left;
WRITE_ONCE(parent->rb_right, tmp2);
WRITE_ONCE(sibling->rb_left, parent);
rb_set_parent_color(tmp1, sibling, RB_BLACK);
if (tmp2)
rb_set_parent(tmp2, parent);
__rb_rotate_set_parents(parent, sibling, root,
RB_BLACK);
augment_rotate(parent, sibling);
break;
} else {
sibling = parent->rb_left;
if (rb_is_red(sibling)) {
/* Case 1 - right rotate at parent */
tmp1 = sibling->rb_right;
WRITE_ONCE(parent->rb_left, tmp1);
WRITE_ONCE(sibling->rb_right, parent);
rb_set_parent_color(tmp1, parent, RB_BLACK);
__rb_rotate_set_parents(parent, sibling, root,
RB_RED);
augment_rotate(parent, sibling);
sibling = tmp1;
}
tmp1 = sibling->rb_left;
if (!tmp1 || rb_is_black(tmp1)) {
tmp2 = sibling->rb_right;
if (!tmp2 || rb_is_black(tmp2)) {
/* Case 2 - sibling color flip */
rb_set_parent_color(sibling, parent,
RB_RED);
if (rb_is_red(parent))
rb_set_black(parent);
else {
node = parent;
parent = rb_parent(node);
if (parent)
continue;
}
break;
}
/* Case 3 - left rotate at sibling */
tmp1 = tmp2->rb_left;
WRITE_ONCE(sibling->rb_right, tmp1);
WRITE_ONCE(tmp2->rb_left, sibling);
WRITE_ONCE(parent->rb_left, tmp2);
if (tmp1)
rb_set_parent_color(tmp1, sibling,
RB_BLACK);
augment_rotate(sibling, tmp2);
tmp1 = sibling;
sibling = tmp2;
}
/* Case 4 - right rotate at parent + color flips */
tmp2 = sibling->rb_right;
WRITE_ONCE(parent->rb_left, tmp2);
WRITE_ONCE(sibling->rb_right, parent);
rb_set_parent_color(tmp1, sibling, RB_BLACK);
if (tmp2)
rb_set_parent(tmp2, parent);
__rb_rotate_set_parents(parent, sibling, root,
RB_BLACK);
augment_rotate(parent, sibling);
break;
}
}
}
/* Non-inline version for rb_erase_augmented() use */
void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
____rb_erase_color(parent, root, augment_rotate);
}
EXPORT_SYMBOL(__rb_erase_color);
/*
* Non-augmented rbtree manipulation functions.
*
* We use dummy augmented callbacks here, and have the compiler optimize them
* out of the rb_insert_color() and rb_erase() function definitions.
*/
static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
static const struct rb_augment_callbacks dummy_callbacks = {
.propagate = dummy_propagate,
.copy = dummy_copy,
.rotate = dummy_rotate
};
void rb_insert_color(struct rb_node *node, struct rb_root *root)
{
__rb_insert(node, root, dummy_rotate);
}
EXPORT_SYMBOL(rb_insert_color);
void rb_erase(struct rb_node *node, struct rb_root *root)
{
struct rb_node *rebalance;
rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
if (rebalance)
____rb_erase_color(rebalance, root, dummy_rotate);
}
EXPORT_SYMBOL(rb_erase);
/*
* Augmented rbtree manipulation functions.
*
* This instantiates the same __always_inline functions as in the non-augmented
* case, but this time with user-defined callbacks.
*/
void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
__rb_insert(node, root, augment_rotate);
}
EXPORT_SYMBOL(__rb_insert_augmented);
/*
* This function returns the first node (in sort order) of the tree.
*/
struct rb_node *rb_first(const struct rb_root *root)
{
struct rb_node *n;
n = root->rb_node;
if (!n)
return NULL;
while (n->rb_left)
n = n->rb_left;
return n;
}
EXPORT_SYMBOL(rb_first);
struct rb_node *rb_last(const struct rb_root *root)
{
struct rb_node *n;
n = root->rb_node;
if (!n)
return NULL;
while (n->rb_right)
n = n->rb_right;
return n;
}
EXPORT_SYMBOL(rb_last);
struct rb_node *rb_next(const struct rb_node *node)
{
struct rb_node *parent;
if (RB_EMPTY_NODE(node))
return NULL;
/*
* If we have a right-hand child, go down and then left as far
* as we can.
*/
if (node->rb_right) {
node = node->rb_right;
while (node->rb_left)
node = node->rb_left;
return (struct rb_node *)node;
}
/*
* No right-hand children. Everything down and left is smaller than us,
* so any 'next' node must be in the general direction of our parent.
* Go up the tree; any time the ancestor is a right-hand child of its
* parent, keep going up. First time it's a left-hand child of its
* parent, said parent is our 'next' node.
*/
while ((parent = rb_parent(node)) && node == parent->rb_right)
node = parent;
return parent;
}
EXPORT_SYMBOL(rb_next);
struct rb_node *rb_prev(const struct rb_node *node)
{
struct rb_node *parent;
if (RB_EMPTY_NODE(node))
return NULL;
/*
* If we have a left-hand child, go down and then right as far
* as we can.
*/
if (node->rb_left) {
node = node->rb_left;
while (node->rb_right)
node = node->rb_right;
return (struct rb_node *)node;
}
/*
* No left-hand children. Go up till we find an ancestor which
* is a right-hand child of its parent.
*/
while ((parent = rb_parent(node)) && node == parent->rb_left)
node = parent;
return parent;
}
EXPORT_SYMBOL(rb_prev);
void rb_replace_node(struct rb_node *victim, struct rb_node *new,
struct rb_root *root)
{
struct rb_node *parent = rb_parent(victim);
/* Copy the pointers/colour from the victim to the replacement */
*new = *victim;
/* Set the surrounding nodes to point to the replacement */
if (victim->rb_left)
rb_set_parent(victim->rb_left, new);
if (victim->rb_right)
rb_set_parent(victim->rb_right, new);
__rb_change_child(victim, new, parent, root);
}
EXPORT_SYMBOL(rb_replace_node);
void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
struct rb_root *root)
{
struct rb_node *parent = rb_parent(victim);
/* Copy the pointers/colour from the victim to the replacement */
*new = *victim;
/* Set the surrounding nodes to point to the replacement */
if (victim->rb_left)
rb_set_parent(victim->rb_left, new);
if (victim->rb_right)
rb_set_parent(victim->rb_right, new);
/* Set the parent's pointer to the new node last after an RCU barrier
* so that the pointers onwards are seen to be set correctly when doing
* an RCU walk over the tree.
*/
__rb_change_child_rcu(victim, new, parent, root);
}
EXPORT_SYMBOL(rb_replace_node_rcu);
static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
{
for (;;) {
if (node->rb_left)
node = node->rb_left;
else if (node->rb_right)
node = node->rb_right;
else
return (struct rb_node *)node;
}
}
struct rb_node *rb_next_postorder(const struct rb_node *node)
{
const struct rb_node *parent;
if (!node)
return NULL;
parent = rb_parent(node);
/* If we're sitting on node, we've already seen our children */
if (parent && node == parent->rb_left && parent->rb_right) {
/* If we are the parent's left node, go to the parent's right
* node then all the way down to the left */
return rb_left_deepest_node(parent->rb_right);
} else
/* Otherwise we are the parent's right node, and the parent
* should be next */
return (struct rb_node *)parent;
}
EXPORT_SYMBOL(rb_next_postorder);
struct rb_node *rb_first_postorder(const struct rb_root *root)
{
if (!root->rb_node)
return NULL;
return rb_left_deepest_node(root->rb_node);
}
EXPORT_SYMBOL(rb_first_postorder);

utils/src/rbtree.h (new file, 328 lines)

@@ -0,0 +1,328 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
linux/include/linux/rbtree.h
To use rbtrees you'll have to implement your own insert and search cores.
This avoids the need for callbacks and the dramatic performance drop they
would cause. It's not the cleanest way, but it's how C (as opposed to
C++) gets both performance and genericity...
See Documentation/core-api/rbtree.rst for documentation and samples.
*/
#ifndef _LINUX_RBTREE_H
#define _LINUX_RBTREE_H
#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
#define rb_entry(ptr, type, member) container_of(ptr, type, member)
#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL)
/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
#define RB_EMPTY_NODE(node) \
((node)->__rb_parent_color == (unsigned long)(node))
#define RB_CLEAR_NODE(node) \
((node)->__rb_parent_color = (unsigned long)(node))
extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);
/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);
/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);
/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
struct rb_node **rb_link)
{
node->__rb_parent_color = (unsigned long)parent;
node->rb_left = node->rb_right = NULL;
*rb_link = node;
}
static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
struct rb_node **rb_link)
{
node->__rb_parent_color = (unsigned long)parent;
node->rb_left = node->rb_right = NULL;
rcu_assign_pointer(*rb_link, node);
}
#define rb_entry_safe(ptr, type, member) \
({ typeof(ptr) ____ptr = (ptr); \
____ptr ? rb_entry(____ptr, type, member) : NULL; \
})
/**
* rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
* given type allowing the backing memory of @pos to be invalidated
*
* @pos: the 'type *' to use as a loop cursor.
* @n: another 'type *' to use as temporary storage
* @root: 'rb_root *' of the rbtree.
* @field: the name of the rb_node field within 'type'.
*
* rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
* list_for_each_entry_safe() and allows the iteration to continue independent
* of changes to @pos by the body of the loop.
*
* Note, however, that it cannot handle other modifications that re-order the
* rbtree it is iterating over. This includes calling rb_erase() on @pos, as
* rb_erase() may rebalance the tree, causing us to miss some nodes.
*/
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
typeof(*pos), field); 1; }); \
pos = n)
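/*
 * For example, freeing every node of a tree without rebalancing after
 * each removal (struct item and its rb_node member "node" are
 * illustrative):
 *
 *	struct item *pos, *n;
 *
 *	rbtree_postorder_for_each_entry_safe(pos, n, &root, node)
 *		free(pos);
 *	root = RB_ROOT;
 */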
/* Same as rb_first(), but O(1) */
#define rb_first_cached(root) (root)->rb_leftmost
static inline void rb_insert_color_cached(struct rb_node *node,
struct rb_root_cached *root,
bool leftmost)
{
if (leftmost)
root->rb_leftmost = node;
rb_insert_color(node, &root->rb_root);
}
static inline struct rb_node *
rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
{
struct rb_node *leftmost = NULL;
if (root->rb_leftmost == node)
leftmost = root->rb_leftmost = rb_next(node);
rb_erase(node, &root->rb_root);
return leftmost;
}
static inline void rb_replace_node_cached(struct rb_node *victim,
struct rb_node *new,
struct rb_root_cached *root)
{
if (root->rb_leftmost == victim)
root->rb_leftmost = new;
rb_replace_node(victim, new, &root->rb_root);
}
/*
* The below helper functions use 2 operators with 3 different
* calling conventions. The operators are related like:
*
* comp(a->key,b) < 0 := less(a,b)
* comp(a->key,b) > 0 := less(b,a)
* comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
*
* If these operators define a partial order on the elements we make no
* guarantee on which of the elements matching the key is found. See
* rb_find().
*
* The reason for this is to allow the find() interface without requiring an
* on-stack dummy object, which might not be feasible due to object size.
*/
/**
* rb_add_cached() - insert @node into the leftmost cached tree @tree
* @node: node to insert
* @tree: leftmost cached tree to insert @node into
* @less: operator defining the (partial) node order
*
* Returns @node when it is the new leftmost, or NULL.
*/
static __always_inline struct rb_node *
rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
bool (*less)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_root.rb_node;
struct rb_node *parent = NULL;
bool leftmost = true;
while (*link) {
parent = *link;
if (less(node, parent)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = false;
}
}
rb_link_node(node, parent, link);
rb_insert_color_cached(node, tree, leftmost);
return leftmost ? node : NULL;
}
/**
* rb_add() - insert @node into @tree
* @node: node to insert
* @tree: tree to insert @node into
* @less: operator defining the (partial) node order
*/
static __always_inline void
rb_add(struct rb_node *node, struct rb_root *tree,
bool (*less)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_node;
struct rb_node *parent = NULL;
while (*link) {
parent = *link;
if (less(node, parent))
link = &parent->rb_left;
else
link = &parent->rb_right;
}
rb_link_node(node, parent, link);
rb_insert_color(node, tree);
}
/**
* rb_find_add() - find equivalent @node in @tree, or add @node
* @node: node to look-for / insert
* @tree: tree to search / modify
* @cmp: operator defining the node order
*
* Returns the rb_node matching @node, or NULL when no match is found and @node
* is inserted.
*/
static __always_inline struct rb_node *
rb_find_add(struct rb_node *node, struct rb_root *tree,
int (*cmp)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_node;
struct rb_node *parent = NULL;
int c;
while (*link) {
parent = *link;
c = cmp(node, parent);
if (c < 0)
link = &parent->rb_left;
else if (c > 0)
link = &parent->rb_right;
else
return parent;
}
rb_link_node(node, parent, link);
rb_insert_color(node, tree);
return NULL;
}
/**
* rb_find() - find @key in tree @tree
* @key: key to match
* @tree: tree to search
* @cmp: operator defining the node order
*
* Returns the rb_node matching @key or NULL.
*/
static __always_inline struct rb_node *
rb_find(const void *key, const struct rb_root *tree,
int (*cmp)(const void *key, const struct rb_node *))
{
struct rb_node *node = tree->rb_node;
while (node) {
int c = cmp(key, node);
if (c < 0)
node = node->rb_left;
else if (c > 0)
node = node->rb_right;
else
return node;
}
return NULL;
}
/**
* rb_find_first() - find the first @key in @tree
* @key: key to match
* @tree: tree to search
* @cmp: operator defining node order
*
* Returns the leftmost node matching @key, or NULL.
*/
static __always_inline struct rb_node *
rb_find_first(const void *key, const struct rb_root *tree,
int (*cmp)(const void *key, const struct rb_node *))
{
struct rb_node *node = tree->rb_node;
struct rb_node *match = NULL;
while (node) {
int c = cmp(key, node);
if (c <= 0) {
if (!c)
match = node;
node = node->rb_left;
} else if (c > 0) {
node = node->rb_right;
}
}
return match;
}
/**
* rb_next_match() - find the next @key in @tree
* @key: key to match
* @tree: tree to search
* @cmp: operator defining node order
*
* Returns the next node matching @key, or NULL.
*/
static __always_inline struct rb_node *
rb_next_match(const void *key, struct rb_node *node,
int (*cmp)(const void *key, const struct rb_node *))
{
node = rb_next(node);
if (node && cmp(key, node))
node = NULL;
return node;
}
/**
* rb_for_each() - iterates a subtree matching @key
* @node: iterator
* @key: key to match
* @tree: tree to search
* @cmp: operator defining node order
*/
#define rb_for_each(node, key, tree, cmp) \
for ((node) = rb_find_first((key), (tree), (cmp)); \
(node); (node) = rb_next_match((key), (node), (cmp)))
#endif /* _LINUX_RBTREE_H */
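A small usage sketch of the comparator conventions above; the item struct and key are illustrative, not part of scoutfs, and u64 is assumed to come from the utils' util.h.

struct item {
	struct rb_node node;
	u64 key;
};

static bool item_less(struct rb_node *a, const struct rb_node *b)
{
	return rb_entry(a, struct item, node)->key <
	       rb_entry(b, struct item, node)->key;
}

static int item_key_cmp(const void *key, const struct rb_node *n)
{
	u64 k = *(const u64 *)key;
	u64 nk = rb_entry(n, struct item, node)->key;

	return k < nk ? -1 : (k > nk ? 1 : 0);
}

/* insert in key order */
static void item_insert(struct rb_root *root, struct item *it)
{
	rb_add(&it->node, root, item_less);
}

/* look an item up by key, returning NULL when nothing matches */
static struct item *item_lookup(struct rb_root *root, u64 key)
{
	struct rb_node *n = rb_find(&key, root, item_key_cmp);

	return n ? rb_entry(n, struct item, node) : NULL;
}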


@@ -0,0 +1,313 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
(C) 2002 David Woodhouse <dwmw2@infradead.org>
(C) 2012 Michel Lespinasse <walken@google.com>
linux/include/linux/rbtree_augmented.h
*/
#ifndef _LINUX_RBTREE_AUGMENTED_H
#define _LINUX_RBTREE_AUGMENTED_H
/*
* Please note - only struct rb_augment_callbacks and the prototypes for
* rb_insert_augmented() and rb_erase_augmented() are intended to be public.
* The rest are implementation details you are not expected to depend on.
*
* See Documentation/core-api/rbtree.rst for documentation and samples.
*/
struct rb_augment_callbacks {
void (*propagate)(struct rb_node *node, struct rb_node *stop);
void (*copy)(struct rb_node *old, struct rb_node *new);
void (*rotate)(struct rb_node *old, struct rb_node *new);
};
extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
/*
* Fixup the rbtree and update the augmented information when rebalancing.
*
* On insertion, the user must update the augmented information on the path
* leading to the inserted node, then call rb_link_node() as usual and
* rb_insert_augmented() instead of the usual rb_insert_color() call.
* If rb_insert_augmented() rebalances the rbtree, it will callback into
* a user provided function to update the augmented information on the
* affected subtrees.
*/
static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
__rb_insert_augmented(node, root, augment->rotate);
}
static inline void
rb_insert_augmented_cached(struct rb_node *node,
struct rb_root_cached *root, bool newleft,
const struct rb_augment_callbacks *augment)
{
if (newleft)
root->rb_leftmost = node;
rb_insert_augmented(node, &root->rb_root, augment);
}
/*
* Template for declaring augmented rbtree callbacks (generic case)
*
* RBSTATIC: 'static' or empty
* RBNAME: name of the rb_augment_callbacks structure
* RBSTRUCT: struct type of the tree nodes
* RBFIELD: name of struct rb_node field within RBSTRUCT
* RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
* RBCOMPUTE: name of function that recomputes the RBAUGMENTED data
*/
#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \
RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \
static inline void \
RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \
{ \
while (rb != stop) { \
RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \
if (RBCOMPUTE(node, true)) \
break; \
rb = rb_parent(&node->RBFIELD); \
} \
} \
static inline void \
RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \
{ \
RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \
RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \
new->RBAUGMENTED = old->RBAUGMENTED; \
} \
static void \
RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \
{ \
RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \
RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \
new->RBAUGMENTED = old->RBAUGMENTED; \
RBCOMPUTE(old, false); \
} \
RBSTATIC const struct rb_augment_callbacks RBNAME = { \
.propagate = RBNAME ## _propagate, \
.copy = RBNAME ## _copy, \
.rotate = RBNAME ## _rotate \
};
/*
* Template for declaring augmented rbtree callbacks,
* computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
*
* RBSTATIC: 'static' or empty
* RBNAME: name of the rb_augment_callbacks structure
* RBSTRUCT: struct type of the tree nodes
* RBFIELD: name of struct rb_node field within RBSTRUCT
* RBTYPE: type of the RBAUGMENTED field
* RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
* RBCOMPUTE: name of function that returns the per-node RBTYPE scalar
*/
#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \
RBTYPE, RBAUGMENTED, RBCOMPUTE) \
static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \
{ \
RBSTRUCT *child; \
RBTYPE max = RBCOMPUTE(node); \
if (node->RBFIELD.rb_left) { \
child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \
if (child->RBAUGMENTED > max) \
max = child->RBAUGMENTED; \
} \
if (node->RBFIELD.rb_right) { \
child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \
if (child->RBAUGMENTED > max) \
max = child->RBAUGMENTED; \
} \
if (exit && node->RBAUGMENTED == max) \
return true; \
node->RBAUGMENTED = max; \
return false; \
} \
RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \
RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)
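/*
 * For example (names illustrative), a tree of
 *
 *	struct item { struct rb_node node; u64 val; u64 subtree_max; };
 *
 * that keeps subtree_max equal to the largest val in each subtree would
 * declare
 *
 *	static inline u64 item_val(struct item *it) { return it->val; }
 *	RB_DECLARE_CALLBACKS_MAX(static, item_cbs,
 *				 struct item, node, u64, subtree_max, item_val)
 *
 * and then pass &item_cbs to rb_insert_augmented() and rb_erase_augmented().
 */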
#define RB_RED 0
#define RB_BLACK 1
#define __rb_parent(pc) ((struct rb_node *)(pc & ~3))
#define __rb_color(pc) ((pc) & 1)
#define __rb_is_black(pc) __rb_color(pc)
#define __rb_is_red(pc) (!__rb_color(pc))
#define rb_color(rb) __rb_color((rb)->__rb_parent_color)
#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color)
#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color)
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
}
static inline void rb_set_parent_color(struct rb_node *rb,
struct rb_node *p, int color)
{
rb->__rb_parent_color = (unsigned long)p | color;
}
static inline void
__rb_change_child(struct rb_node *old, struct rb_node *new,
struct rb_node *parent, struct rb_root *root)
{
if (parent) {
if (parent->rb_left == old)
WRITE_ONCE(parent->rb_left, new);
else
WRITE_ONCE(parent->rb_right, new);
} else
WRITE_ONCE(root->rb_node, new);
}
static inline void
__rb_change_child_rcu(struct rb_node *old, struct rb_node *new,
struct rb_node *parent, struct rb_root *root)
{
if (parent) {
if (parent->rb_left == old)
rcu_assign_pointer(parent->rb_left, new);
else
rcu_assign_pointer(parent->rb_right, new);
} else
rcu_assign_pointer(root->rb_node, new);
}
extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
struct rb_node *child = node->rb_right;
struct rb_node *tmp = node->rb_left;
struct rb_node *parent, *rebalance;
unsigned long pc;
if (!tmp) {
/*
* Case 1: node to erase has no more than 1 child (easy!)
*
* Note that if there is one child it must be red due to 5)
* and node must be black due to 4). We adjust colors locally
* so as to bypass __rb_erase_color() later on.
*/
pc = node->__rb_parent_color;
parent = __rb_parent(pc);
__rb_change_child(node, child, parent, root);
if (child) {
child->__rb_parent_color = pc;
rebalance = NULL;
} else
rebalance = __rb_is_black(pc) ? parent : NULL;
tmp = parent;
} else if (!child) {
/* Still case 1, but this time the child is node->rb_left */
tmp->__rb_parent_color = pc = node->__rb_parent_color;
parent = __rb_parent(pc);
__rb_change_child(node, tmp, parent, root);
rebalance = NULL;
tmp = parent;
} else {
struct rb_node *successor = child, *child2;
tmp = child->rb_left;
if (!tmp) {
/*
* Case 2: node's successor is its right child
*
* (n) (s)
* / \ / \
* (x) (s) -> (x) (c)
* \
* (c)
*/
parent = successor;
child2 = successor->rb_right;
augment->copy(node, successor);
} else {
/*
* Case 3: node's successor is leftmost under
* node's right child subtree
*
* (n) (s)
* / \ / \
* (x) (y) -> (x) (y)
* / /
* (p) (p)
* / /
* (s) (c)
* \
* (c)
*/
do {
parent = successor;
successor = tmp;
tmp = tmp->rb_left;
} while (tmp);
child2 = successor->rb_right;
WRITE_ONCE(parent->rb_left, child2);
WRITE_ONCE(successor->rb_right, child);
rb_set_parent(child, successor);
augment->copy(node, successor);
augment->propagate(parent, successor);
}
tmp = node->rb_left;
WRITE_ONCE(successor->rb_left, tmp);
rb_set_parent(tmp, successor);
pc = node->__rb_parent_color;
tmp = __rb_parent(pc);
__rb_change_child(node, successor, tmp, root);
if (child2) {
rb_set_parent_color(child2, parent, RB_BLACK);
rebalance = NULL;
} else {
rebalance = rb_is_black(successor) ? parent : NULL;
}
successor->__rb_parent_color = pc;
tmp = successor;
}
augment->propagate(tmp, NULL);
return rebalance;
}
static __always_inline void
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
if (rebalance)
__rb_erase_color(rebalance, root, augment->rotate);
}
static __always_inline void
rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
const struct rb_augment_callbacks *augment)
{
if (root->rb_leftmost == node)
root->rb_leftmost = rb_next(node);
rb_erase_augmented(node, &root->rb_root, augment);
}
#endif /* _LINUX_RBTREE_AUGMENTED_H */

utils/src/rbtree_types.h (new file, 34 lines)

@@ -0,0 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_RBTREE_TYPES_H
#define _LINUX_RBTREE_TYPES_H
struct rb_node {
unsigned long __rb_parent_color;
struct rb_node *rb_right;
struct rb_node *rb_left;
} __attribute__((aligned(sizeof(long))));
/* The alignment might seem pointless, but allegedly CRIS needs it */
struct rb_root {
struct rb_node *rb_node;
};
/*
* Leftmost-cached rbtrees.
*
* We do not cache the rightmost node based on footprint
* size vs number of potential users that could benefit
* from O(1) rb_last(). Just not worth it, users that want
* this feature can always implement the logic explicitly.
* Furthermore, users that want to cache both pointers may
* find it a bit asymmetric, but that's ok.
*/
struct rb_root_cached {
struct rb_root rb_root;
struct rb_node *rb_leftmost;
};
#define RB_ROOT (struct rb_root) { NULL, }
#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
#endif


@@ -44,3 +44,37 @@ int srch_decode_entry(void *buf, struct scoutfs_srch_entry *sre,
return tot;
}
static int encode_u64(__le64 *buf, u64 val)
{
int bytes;
val = (val << 1) ^ ((s64)val >> 63); /* shift sign extend */
bytes = (fls64(val) + 7) >> 3;
put_unaligned_le64(val, buf);
return bytes;
}
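/*
 * For example, a delta of -1 zig-zags to 1 and a delta of +1 zig-zags to
 * 2, each encoding in a single byte, while a delta of 0 encodes in zero
 * bytes because fls64(0) == 0.
 */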
int srch_encode_entry(void *buf, struct scoutfs_srch_entry *sre, struct scoutfs_srch_entry *prev)
{
u64 diffs[] = {
le64_to_cpu(sre->hash) - le64_to_cpu(prev->hash),
le64_to_cpu(sre->ino) - le64_to_cpu(prev->ino),
le64_to_cpu(sre->id) - le64_to_cpu(prev->id),
};
u16 lengths = 0;
int bytes;
int tot = 2;
int i;
for (i = 0; i < array_size(diffs); i++) {
bytes = encode_u64(buf + tot, diffs[i]);
lengths |= bytes << (i << 2);
tot += bytes;
}
put_unaligned_le16(lengths, buf);
return tot;
}


@@ -3,5 +3,6 @@
int srch_decode_entry(void *buf, struct scoutfs_srch_entry *sre,
struct scoutfs_srch_entry *prev);
int srch_encode_entry(void *buf, struct scoutfs_srch_entry *sre, struct scoutfs_srch_entry *prev);
#endif


@@ -69,6 +69,8 @@ do { \
#define container_of(ptr, type, memb) \
((type *)((void *)(ptr) - offsetof(type, memb)))
#define NSEC_PER_SEC 1000000000
#define BITS_PER_LONG (sizeof(long) * 8)
#define U8_MAX ((u8)~0ULL)
#define U16_MAX ((u16)~0ULL)
@@ -81,6 +83,7 @@ do { \
\
(_x == 0 ? 0 : 64 - __builtin_clzll(_x)); \
})
#define fls64(x) flsll(x)
#define ilog2(x) \
({ \
@@ -98,6 +101,16 @@ emit_get_unaligned_le(16)
emit_get_unaligned_le(32)
emit_get_unaligned_le(64)
#define emit_put_unaligned_le(nr) \
static inline void put_unaligned_le##nr(u##nr val, void *buf) \
{ \
__le##nr x = cpu_to_le##nr(val); \
memcpy(buf, &x, sizeof(x)); \
}
emit_put_unaligned_le(16)
emit_put_unaligned_le(32)
emit_put_unaligned_le(64)
/*
* return -1,0,+1 based on the memcmp comparison of the minimum of their
* two lengths. If their min shared bytes are equal but the lengths