Compare commits

...

42 Commits

Author SHA1 Message Date
Zach Brown
6541ccfdd0 Add test_bit to utils bitmap
Add test_bit() to the trivial utils bitmap.c implementation.

Signed-off-by: Zach Brown <zab@versity.com>
2024-03-06 17:18:57 -08:00
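As a rough illustration of the kind of helper being added, here is a minimal standalone sketch of a test_bit() over a u64 word array; the actual utils bitmap.c word size and naming are assumptions, not the committed code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BITS_PER_U64 64

/* return true if bit nr is set in the bitmap's word array */
static bool test_bit(const uint64_t *map, unsigned long nr)
{
	return (map[nr / BITS_PER_U64] >> (nr % BITS_PER_U64)) & 1;
}

int main(void)
{
	uint64_t map[2] = { 0, 1ULL << 3 };	/* bit 67 set */

	printf("%d %d\n", test_bit(map, 67), test_bit(map, 66));
	return 0;
}
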
Zach Brown
b56836b395 Add {read,write}-metadata-image scoutfs commands
Signed-off-by: Zach Brown <zab@versity.com>
2024-03-06 17:18:57 -08:00
Zach Brown
de8395a9cf Fix partial rename to check_meta_alloc
As I was committing the initial check command I had only partially
completed a rename of the function that checks the metadata allocators.

Signed-off-by: Zach Brown <zab@versity.com>
2024-03-06 17:18:01 -08:00
Zach Brown
94c608f281 (wip) add check command 2024-03-04 15:13:46 -08:00
Zach Brown
84c1460e4f Fix printing alloc list block extents
The list alloc blocks have an array of blknos that are offset by a start
field in the block header.  The print code wasn't using that and was
always referencing the beginning of the array, which could miss blocks.

Signed-off-by: Zach Brown <zab@versity.com>
2024-03-04 15:13:46 -08:00
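A hedged sketch of the indexing fix the message describes: valid entries live at blknos[start + i], not blknos[i]. The struct layout and names below are assumptions for illustration, not the scoutfs definitions.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* assumed shape of an alloc list block header, for illustration only */
struct alloc_list_block {
	uint32_t start;		/* first valid index into blknos[] */
	uint32_t nr;		/* number of valid entries */
	uint64_t blknos[];
};

static void print_list_blknos(const struct alloc_list_block *lblk)
{
	uint32_t i;

	/* the bug was indexing from 0 and ignoring ->start, missing blocks */
	for (i = 0; i < lblk->nr; i++)
		printf("blkno %llu\n",
		       (unsigned long long)lblk->blknos[lblk->start + i]);
}

int main(void)
{
	struct alloc_list_block *lblk;

	lblk = calloc(1, sizeof(*lblk) + 4 * sizeof(uint64_t));
	lblk->start = 2;
	lblk->nr = 2;
	lblk->blknos[2] = 100;
	lblk->blknos[3] = 101;
	print_list_blknos(lblk);
	free(lblk);
	return 0;
}
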
Zach Brown
e407f67fcc Import a few more functions to our list.h
Import a few more functions from the kernel's list.h into our imported
copy.

Signed-off-by: Zach Brown <zab@versity.com>
2024-03-04 15:13:46 -08:00
Zach Brown
13c4b35bed Add lk rbtree wrapper
Import the kernel's rbtree implementation with a wrapper so we can use
it from userspace.

Signed-off-by: Zach Brown <zab@versity.com>
2024-03-04 15:13:46 -08:00
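A sketch, under assumptions, of the kind of compat shim such a wrapper tends to need: the kernel's rbtree sources depend on only a handful of macros and annotations, so a small userspace header can satisfy them and then use the imported sources unchanged. The names below are illustrative, not the actual scoutfs-utils wrapper.

/* lk_rbtree.h: illustrative userspace shim around an imported kernel rbtree */
#ifndef LK_RBTREE_H
#define LK_RBTREE_H

#include <stddef.h>

#ifndef container_of
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#endif

/* stub out kernel-only annotations the imported code may expect */
#define __always_inline	inline
#define EXPORT_SYMBOL(sym)

/* then the imported kernel headers/sources are used unchanged, e.g.: */
/* #include "rbtree.h" */

#endif
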
Zach Brown
67990a7007 Merge pull request #162 from versity/zab/v1.19
v1.19 Release
2024-01-30 15:46:49 -08:00
Zach Brown
ba819be8f9 v1.19 Release
Finish the release notes for the 1.19 release.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-30 12:11:23 -08:00
Zach Brown
1b103184ca Merge pull request #161 from versity/zab/merge_timeout_option_fix
Correctly set the log_merge_wait_timeout_ms option
2024-01-30 12:07:10 -08:00
Zach Brown
c3890abd7b Correctly set the log_merge_wait_timeout_ms option
The initial code for setting the timeout used the wrong parsed variable.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-30 12:01:35 -08:00
Zach Brown
5ab38bfa48 Merge pull request #160 from versity/zab/log_merging_speedups
Zab/log merging speedups
2024-01-29 12:26:55 -08:00
Zach Brown
e9ad61b444 Delete multiple log trees items per server commit
server_log_merge_free_work() is responsible for freeing all the input
log trees for a log merge operation that has finished.  It looks for the
next item to free, frees the log btree it references, and then deletes
the item.  It was doing this with a full server commit for each item
which can take an agonizingly long time.

This changes it to perform multiple deletions in a commit as long as
there's plenty of alloc space.  The moment the commit's allocation runs
low it applies the commit and opens a new one.  This sped up the
deletion of a few hundred thousand log tree items from hours to
seconds.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:30:17 -08:00
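A standalone model of the batching idea described above (not the scoutfs code): keep deleting items inside one open server commit and only apply the commit and open a new one once a notional allocation budget runs low. The budget and per-deletion cost here are illustrative numbers.

#include <stdio.h>

#define COMMIT_ALLOC_BUDGET	1024	/* illustrative per-commit budget */
#define COST_PER_DELETION	3	/* illustrative metadata cost */

int main(void)
{
	int items = 100000;		/* pretend log_trees items to free */
	int used = 0;
	int commits = 0;
	int i;

	for (i = 0; i < items; i++) {
		used += COST_PER_DELETION;	/* delete an item in the open commit */

		/* once the commit's allocation gets low, apply it and open a new one */
		if (used >= COMMIT_ALLOC_BUDGET / 2) {
			commits++;
			used = 0;
		}
	}
	if (used)
		commits++;

	printf("%d deletions batched into %d commits\n", items, commits);
	return 0;
}
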
Zach Brown
91bbf90f71 Don't pin input btrees when merging
The btree_merge code was pinning leaf blocks for all input btrees as it
iterated over them.  This doesn't work when there are a very large
number of input btrees.  It can run out of memory trying to hold a
reference to a 64KiB leaf block for each input root.

This reworks the btree merging code.  It reads a window of blocks from
all input trees to get a set of merged items.  It can take multiple
passes to complete the merge but by setting the merge window large
enough this overhead is reduced.  Merging now consumes a fixed amount of
memory rather than using memory proportional to the number of input
btrees.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:30:17 -08:00
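A standalone model of the fixed-memory approach (the real code keeps a byte-sized window of merged items in an rbtree and reads btree leaf blocks): each pass merges only the keys inside the current window, taking the newest seq per key across all inputs, then the window advances until the whole key range is covered.

#include <stdio.h>

struct item { int key; int seq; };

#define WINDOW 4	/* illustrative: keys merged per pass */

int main(void)
{
	/* two "input trees" as key-sorted (key, seq) arrays */
	struct item a[] = { {1, 10}, {3, 12}, {7, 15} };
	struct item b[] = { {1, 11}, {4, 13}, {7, 14} };
	struct item *in[] = { a, b };
	int nr[] = { 3, 3 };
	int max_key = 7;
	int start, key, i, j;

	for (start = 1; start <= max_key; start += WINDOW) {
		/* one pass: merge only keys inside the current window */
		for (key = start; key < start + WINDOW && key <= max_key; key++) {
			int newest = -1;

			for (i = 0; i < 2; i++)
				for (j = 0; j < nr[i]; j++)
					if (in[i][j].key == key && in[i][j].seq > newest)
						newest = in[i][j].seq;
			if (newest >= 0)
				printf("merge key %d newest seq %d\n", key, newest);
		}
		/* window memory is released/reused before the next pass */
	}
	return 0;
}
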
Zach Brown
b5630f540d Add tracing of the log merge finalizing decision
Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:30:17 -08:00
Zach Brown
90a4c82363 Make log merge wait timeout tunable
Add a mount option for the amount of time that log merge creation can
wait before giving up.  We add some counters so we can see how often
the timeout is being hit and what the average successful wait time is.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:25:56 -08:00
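A standalone model of the wait loop the option controls: poll with a growing delay until the configured timeout elapses, bumping a counter on timeout. The constants mirror the shape of the kernel change, not its exact behavior.

#include <stdio.h>

#define MIN_DELAY_MS	5
#define MAX_DELAY_MS	100
#define GROWTH_PCT	150

int main(void)
{
	unsigned int timeout_ms = 500;	/* log_merge_wait_timeout_ms */
	unsigned int waited_ms = 0;
	unsigned int delay_ms = MIN_DELAY_MS;
	unsigned int timeouts = 0;	/* models the log_merge_wait_timeout counter */

	while (waited_ms < timeout_ms) {
		/* poll the finalize condition here; pretend it never becomes true */
		waited_ms += delay_ms;
		delay_ms = delay_ms * GROWTH_PCT / 100;
		if (delay_ms > MAX_DELAY_MS)
			delay_ms = MAX_DELAY_MS;
	}
	timeouts++;

	printf("gave up after %u ms, timeouts %u\n", waited_ms, timeouts);
	return 0;
}
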
Zach Brown
f654fa0fda Send syncs once when starting to merge
The server sends sync requests to clients when it sees that they have
open log trees that need to be committed for log merging to proceed.

These are currently sent in the context of each client's get_log_trees
request, so every client's request can end up queueing sync requests to
all of the clients.  Depending on message delivery and commit latencies,
this can create a sync storm.

The server's sends are reliable and the open commits are marked with the
seq when they opened.  It's easy for us to record having sent syncs to
all open commits so that future attempts can be avoided.  Later open
commits will have higher seqs and will get a new round of syncs sent.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:25:20 -08:00
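A standalone model of the idea (not the server code): remember the server seq covered by the last round of syncs, and only send to clients whose open commit seq is newer than that.

#include <stdio.h>

struct client { int rid; int open_seq; };

static int finalize_sent_seq;	/* seq covered by the last round of syncs */

static void send_syncs(struct client *c, int nr, int server_seq)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (c[i].open_seq <= finalize_sent_seq)
			continue;	/* this open commit was already asked to sync */
		printf("sync request -> rid %d (open seq %d)\n",
		       c[i].rid, c[i].open_seq);
	}
	finalize_sent_seq = server_seq;	/* future attempts skip these commits */
}

int main(void)
{
	struct client c[] = { { 1, 5 }, { 2, 6 } };

	send_syncs(c, 2, 7);	/* first round: both clients get a sync */
	send_syncs(c, 2, 8);	/* retry: no duplicate syncs for the same commits */
	c[1].open_seq = 9;	/* client 2 opened a newer commit */
	send_syncs(c, 2, 10);	/* only the newer open commit gets a sync */
	return 0;
}
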
Zach Brown
50168a2d2a Check each client's last log item for stable seq
The server was checking all client log_trees items to search for the
lowest commit seq that was still open.  This can be expensive when there
are a lot of finalized log_trees items that won't have open seqs.  Only
the last log_trees item for each client rid can be open, and the items
are sorted by rid and nr, so we can easily only check the last item for
each client rid.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:24:50 -08:00
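A standalone model of the per-rid shortcut: with items sorted by (rid, nr), walking backwards and skipping to the previous rid after each hit touches exactly one log_trees item per client. The item fields and seq values below are illustrative.

#include <stdio.h>

struct lt_item { int rid; int nr; int get_seq; int commit_seq; };

int main(void)
{
	/* sorted by rid then nr; only the highest nr per rid can be open */
	struct lt_item items[] = {
		{ 1, 1, 10, 10 }, { 1, 2, 14, 0 },
		{ 2, 1, 11, 11 }, { 2, 2, 12, 12 }, { 2, 3, 16, 0 },
	};
	int last_stable = 20 - 1;	/* pretend server seq - 1 */
	int i = 5 - 1;

	while (i >= 0) {
		struct lt_item *lt = &items[i];

		/* the last item for this rid is the only one that can be open */
		if (lt->get_seq > lt->commit_seq && lt->get_seq <= last_stable)
			last_stable = lt->get_seq - 1;

		/* skip every older nr for the same rid */
		while (i >= 0 && items[i].rid == lt->rid)
			i--;
	}

	printf("stable seq %d\n", last_stable);
	return 0;
}
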
Zach Brown
3c0616524a Only search last log_trees per rid for finalizing
During get_log_trees the server checks log_trees items to see if it
should start a log merge operation.  It did this by iterating over all
log_trees items and there can be quite a lot of them.

It doesn't need to see all of the items.  It only needs to see the most
recent log_trees item for each mount.  That's enough to make the
decisions that start the log merging process.

Signed-off-by: Zach Brown <zab@versity.com>
2024-01-25 11:23:59 -08:00
Zach Brown
8d3e6883c6 Merge pull request #159 from versity/auke/trans_hold
Fix ret output for scoutfs_trans_hold trace pt.
2024-01-09 09:23:32 -08:00
Auke Kok
8747dae61c Fix ret output for scoutfs_trans_hold trace pt.
Signed-off-by: Auke Kok <auke.kok@versity.com>
2024-01-08 16:27:41 -08:00
Zach Brown
fffcf4a9bb Merge pull request #158 from versity/zab/kasan_stack_oob_get_reg
Ignore spurious KASAN unwind warning
2023-11-22 10:04:18 -08:00
Zach Brown
b552406427 Ignore spurious KASAN unwind warning
KASAN could raise a spurious warning if the unwinder started in code
without ORC metadata and tried to access the KASAN stack frame
redzones.  This was fixed upstream but we can still rarely see it in
older kernels.  We can ignore these messages.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-21 12:25:16 -08:00
Zach Brown
d812599e6b Merge pull request #157 from versity/zab/dmsetup_test_devices
Zab/dmsetup test devices
2023-11-21 10:13:02 -08:00
Zach Brown
03ab5cedb6 clean up createmany-parallel-mounts test
This test is trying to make sure that concurrent work isn't much, much
slower than individual work.  It does this by timing the creation of a
bunch of files in a dir on a mount and then timing the same work in two
mounts concurrently.  But it messed up the concurrency pretty badly.

It had the concurrent createmany tasks creating files with a full path.
That means that every create is trying to read all the parent
directories.  The way inode number allocation works means that one of
the mounts is likely to be getting a write lock that includes a shared
parent.  This created a ton of cluster lock contention between the two
tasks.

Then it didn't sync the creates between phases.  It could accidentally
record the time it took to write out the dirty single-mount creates as
time taken during the parallel creates.

By syncing between phases and having the createmany tasks create files
relative to their per-mount directories we actually perform concurrent
work and test that we're not creating contention outside of the task
load.

This became a problem as we switched from loopback devices to device
mapper devices.  The loopback writers were using buffered writes so we
were masking the IO cost of constantly invalidating and refilling the
item cache by turning the reads into memory copies out of the page
cache.

While we're in here we actually clean up the created files and then use
t_fail to fail the test while the files still exist so they can be
examined.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-15 15:12:57 -08:00
Zach Brown
2b94cd6468 Add loop module kernel message filter
Now that we're not setting up per-mount loopback devices we might not
have the loop module loaded until tests are running.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-15 13:39:38 -08:00
Zach Brown
5507ee5351 Use device-mapper for per-mount test devices
We don't directly mount the underlying devices for each mount because
the kernel notices multiple mounts and doesn't set up a new super block
for each.

Previously the script used loopback devices to create the local shared
block construct because it was easy.  This introduced corruption of
blocks that saw concurrent read and write IOs.  The buffered kernel file
IO paths that loopback eventually degrades into by default (via splice)
could have buffered readers copying out of pages without the page lock
while writers modified the page.  This manifested as occasional crc
failures of blocks that we knowingly issue concurrent reads and writes
to from multiple mounts (the quorum and super blocks).

This changes the script to use device-mapper linear passthrough devices.
Their IOs don't hit a caching layer and don't provide an opportunity to
corrupt blocks.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-15 13:39:38 -08:00
Zach Brown
1600a121d9 Merge pull request #156 from versity/zab/large_fragmented_free_hung_task
Extend hung task timeout for large-fragmented-free
2023-11-15 09:49:13 -08:00
Zach Brown
6daf24ff37 Extend hung task timeout for large-fragmented-free
Our large fragmented free test creates pathologically fragmented file
extents which are as expensive as possible to free.  We know that
debugging kernels can take a long time to free them, so we extend the
hung task timeout.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-14 15:01:37 -08:00
Zach Brown
cd5d9ff3e0 Merge pull request #154 from versity/zab/srch_test_fixes
Zab/srch test fixes
2023-11-13 09:47:46 -08:00
Zach Brown
d94e49eb63 Fix quoted glob in srch-basic-functionality
One of the phases of this test wanted to delete files but got the glob
quoting wrong.  This didn't matter for the original test but when we
changed the test to use its own xattr name then those existing undeleted
files got confused with other files in later phases of the test.

This changes the test to delete the files with a more reliable find
pattern instead of using shell glob expansion.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-09 14:16:36 -08:00
Zach Brown
1dbe408539 Add tracing of srch compact struct communication
Signed-off-by: Zach Brown <zab@versity.com>
2023-11-09 14:16:33 -08:00
Zach Brown
bf21699ad7 bulk_create_paths test tool takes xattr name
Previously the bulk_create_paths test tool used the same xattr name for
each category of xattrs it was creating.

This created a problem where two tests got their xattrs confused with
each other.  The first test created a bunch of srch xattrs, failed, and
didn't clean up after itself.  The second test saw these srch xattrs
as its own and got very confused when there were far more srch xattrs
than it thought it had created.

This lets each test specify the srch xattr names that are created by
bulk_create_paths so that tests can work with their xattrs independent
of each other.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-09 14:15:44 -08:00
Zach Brown
c7c67a173d Specifically wait for compaction in srch test
We just added a test to try and get srch compaction stuck by having an
input file continue at a specific offset.  To exercise the bug the test
needs to perform 6 compactions.  It needs to merge 4 sets of logs into 4
sorted files, it needs to make partial progress merging those 4 sorted
files into another file, and then finally attempt to continue compacting
from the partial progress offset.

The first version of the test didn't necessarily ensure that these
compactions happened.  It created far too many log files then just
waited for time to pass.  If the host was slow then the mounts may not
make it through the initial logs to try and compact the sorted files.
The triggers wouldn't fire and the test would fail.

These changes much more carefully orchestrate and watch the various
steps of compaction to make sure that we trigger the bug.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-09 14:13:13 -08:00
Zach Brown
0d10189f58 Make srch compact request delay tunable
Add a sysfs file for getting and setting the delay between srch
compaction requests from the client.  We'll use this in testing to
ensure compaction runs promptly.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-09 14:13:07 -08:00
Zach Brown
6b88f3268e Merge pull request #153 from versity/zab/v1.18
v1.18 Release
2023-11-08 10:57:56 -08:00
Zach Brown
4b2afa61b8 v1.18 Release
Finish the release notes for the 1.18 release.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-07 16:01:59 -08:00
Zach Brown
222ba2cede Merge pull request #152 from versity/zab/stuck_srch_compact
Zab/stuck srch compact
2023-11-07 15:56:39 -08:00
Zach Brown
c7e97eeb1f Allow srch compaction from _SAFE_BYTES
Compacting sorted srch files can take multiple transactions because they
can be very large.  Each transaction resumes at a byte offset in a block
where the previous transaction stopped.

The resuming code tests that the byte offsets are sane but had a mistake
in testing the offset to skip to.  It returned an error if the
compaction resumed from the last possible safe offset for decoding
entries.

If a system is unlucky enough to have a compaction transaction stop at
just this offset then compaction stops making forward progress as each
attempt to resume returns an error.

The fix allows continuation from this last safe offset while returning
errors for attempts to continue *past* that offset.  This matches all
the encoding code which allows encoding the last entry in the block at
this offset.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-07 12:34:00 -08:00
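A minimal sketch of the off-by-one being fixed: resuming at exactly the last safe decode offset must be allowed, and only resuming past it is an error. The constant value below is illustrative, not the real _SAFE_BYTES.

#include <stdio.h>

#define SRCH_BLOCK_SAFE_BYTES 65000	/* illustrative value only */

static int check_resume_pos(unsigned int pos)
{
	/*
	 * The buggy check rejected pos == SRCH_BLOCK_SAFE_BYTES:
	 *	if (pos >= SRCH_BLOCK_SAFE_BYTES) return -1;
	 * An entry may legally start at the last safe offset, so only
	 * positions past it are invalid.
	 */
	if (pos > SRCH_BLOCK_SAFE_BYTES)
		return -1;
	return 0;
}

int main(void)
{
	printf("%d %d\n", check_resume_pos(SRCH_BLOCK_SAFE_BYTES),
	       check_resume_pos(SRCH_BLOCK_SAFE_BYTES + 1));
	return 0;
}
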
Zach Brown
21c070b42d Add test for srch continuation safe pos errors
Add a test for srch compaction getting stuck hitting errors continuing a
partial operation.  It ensures that a block has an encoded entry at
the _SAFE_BYTES offset, that an operation stops precisely at that
offset, and then watches for errors.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-07 12:34:00 -08:00
Zach Brown
77fbf92968 Add t_trigger_set helper
Add a helper to arm or disarm a trigger with a value argument.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-07 12:12:10 -08:00
Zach Brown
d5c699c3b4 Don't respond with ENOENT for no srch compaction
The srch compaction request building function and the srch compaction
worker both have logic to recognize a valid response with no input files
indicating that there's no work to do.  The server unfortunately
translated nr == 0 into ENOENT and sent that error response to the
client.  This caused the client to increment error counters in the
common case when there's no compaction work to perform.  We'd like the
error counter to reflect actual errors (we're about to check it in a
test), so let's fix this up so that the server sends a successful
response with nr == 0 to indicate that there's no work to do.

Signed-off-by: Zach Brown <zab@versity.com>
2023-11-07 10:30:38 -08:00
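A standalone model of the behavior change: "no compaction work" becomes a successful response carrying nr == 0 rather than an ENOENT error that bumps the client's error counters. The structures and function below are illustrative only.

#include <errno.h>
#include <stdio.h>

struct compact_resp { int nr; };	/* number of input files to compact */

static int server_get_compact(struct compact_resp *resp, int have_work)
{
	resp->nr = have_work ? 4 : 0;
	/* old behavior: return have_work ? 0 : -ENOENT; */
	return 0;	/* nr == 0 already tells the client there's nothing to do */
}

int main(void)
{
	struct compact_resp resp;
	int ret = server_get_compact(&resp, 0);

	if (ret < 0)
		printf("error %d: would increment error counters\n", ret);
	else if (resp.nr == 0)
		printf("no work to do, not an error\n");
	return 0;
}
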
62 changed files with 5416 additions and 385 deletions

View File

@@ -1,6 +1,32 @@
Versity ScoutFS Release Notes
=============================
---
v1.19
\
*Jan 30, 2024*
Added the log\_merge\_wait\_timeout\_ms mount option to set the timeout
for creating log merge operations. The previous hard-coded timeout, now
the default, was too short for some systems and resulted in consistent
timeouts, which created an excessive number of log trees waiting to be
merged.
Improved performance of many in-mount server operations when there are a
large number of log trees waiting to be merged.
---
v1.18
\
*Nov 7, 2023*
Fixed a bug where background srch file compaction could stop making
forward progress if a partial compaction operation was committed at a
specific byte offset in a block. This would cause srch file searches to
be progressively more expensive over time. Once this fix is running,
background compaction will resume, bringing the cost of searches back
down.
---
v1.17
\

View File

@@ -2029,187 +2029,253 @@ int scoutfs_btree_rebalance(struct super_block *sb,
key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL);
}
struct merge_pos {
struct merged_range {
struct scoutfs_key start;
struct scoutfs_key end;
struct rb_root root;
int size;
};
struct merged_item {
struct rb_node node;
struct scoutfs_btree_root *root;
struct scoutfs_block *bl;
struct scoutfs_btree_block *bt;
struct scoutfs_avl_node *avl;
struct scoutfs_key *key;
struct scoutfs_key key;
u64 seq;
u8 flags;
unsigned int val_len;
u8 *val;
u8 val[0];
};
static struct merge_pos *first_mpos(struct rb_root *root)
static inline struct merged_item *mitem_container(struct rb_node *node)
{
struct rb_node *node = rb_first(root);
if (node)
return container_of(node, struct merge_pos, node);
return node ? container_of(node, struct merged_item, node) : NULL;
}
static inline struct merged_item *first_mitem(struct rb_root *root)
{
return mitem_container(rb_first(root));
}
static inline struct merged_item *last_mitem(struct rb_root *root)
{
return mitem_container(rb_last(root));
}
static inline struct merged_item *next_mitem(struct merged_item *mitem)
{
return mitem_container(mitem ? rb_next(&mitem->node) : NULL);
}
static inline struct merged_item *prev_mitem(struct merged_item *mitem)
{
return mitem_container(mitem ? rb_prev(&mitem->node) : NULL);
}
static struct merged_item *find_mitem(struct rb_root *root, struct scoutfs_key *key,
struct rb_node **parent_ret, struct rb_node ***link_ret)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct merged_item *mitem;
int cmp;
while (*node) {
parent = *node;
mitem = container_of(*node, struct merged_item, node);
cmp = scoutfs_key_compare(key, &mitem->key);
if (cmp < 0) {
node = &(*node)->rb_left;
} else if (cmp > 0) {
node = &(*node)->rb_right;
} else {
*parent_ret = NULL;
*link_ret = NULL;
return mitem;
}
}
*parent_ret = parent;
*link_ret = node;
return NULL;
}
static struct merge_pos *next_mpos(struct merge_pos *mpos)
static void insert_mitem(struct merged_range *rng, struct merged_item *mitem,
struct rb_node *parent, struct rb_node **link)
{
struct rb_node *node;
if (mpos && (node = rb_next(&mpos->node)))
return container_of(node, struct merge_pos, node);
else
return NULL;
rb_link_node(&mitem->node, parent, link);
rb_insert_color(&mitem->node, &rng->root);
rng->size += item_len_bytes(mitem->val_len);
}
static void free_mpos(struct super_block *sb, struct merge_pos *mpos)
static void replace_mitem(struct merged_range *rng, struct merged_item *victim,
struct merged_item *new)
{
scoutfs_block_put(sb, mpos->bl);
kfree(mpos);
rb_replace_node(&victim->node, &new->node, &rng->root);
RB_CLEAR_NODE(&victim->node);
rng->size -= item_len_bytes(victim->val_len);
rng->size += item_len_bytes(new->val_len);
}
static void insert_mpos(struct rb_root *pos_root, struct merge_pos *ins)
static void free_mitem(struct merged_range *rng, struct merged_item *mitem)
{
struct rb_node **node = &pos_root->rb_node;
struct rb_node *parent = NULL;
struct merge_pos *mpos;
int cmp;
if (IS_ERR_OR_NULL(mitem))
return;
parent = NULL;
while (*node) {
parent = *node;
mpos = container_of(*node, struct merge_pos, node);
/* sort merge items by key then newest to oldest */
cmp = scoutfs_key_compare(ins->key, mpos->key) ?:
-scoutfs_cmp(ins->seq, mpos->seq);
if (cmp < 0)
node = &(*node)->rb_left;
else
node = &(*node)->rb_right;
if (!RB_EMPTY_NODE(&mitem->node)) {
rng->size -= item_len_bytes(mitem->val_len);
rb_erase(&mitem->node, &rng->root);
}
rb_link_node(&ins->node, parent, node);
rb_insert_color(&ins->node, pos_root);
kfree(mitem);
}
static void trim_range_size(struct merged_range *rng, int merge_window)
{
struct merged_item *mitem;
struct merged_item *tmp;
mitem = last_mitem(&rng->root);
while (mitem && rng->size > merge_window) {
rng->end = mitem->key;
scoutfs_key_dec(&rng->end);
tmp = mitem;
mitem = prev_mitem(mitem);
free_mitem(rng, tmp);
}
}
static void trim_range_end(struct merged_range *rng)
{
struct merged_item *mitem;
struct merged_item *tmp;
mitem = last_mitem(&rng->root);
while (mitem && scoutfs_key_compare(&mitem->key, &rng->end) > 0) {
tmp = mitem;
mitem = prev_mitem(mitem);
free_mitem(rng, tmp);
}
}
/*
* Find the next item in the merge_pos root in the caller's range and
* insert it into the rbtree sorted by key and version so that merging
* can find the next newest item at the front of the rbtree. We free
* the mpos on error or if there are no more items in the range.
* Record and combine logged items from log roots for merging with the
* writable destination root. The caller is responsible for trimming
* the range if it gets too large or if the key range shrinks.
*/
static int reset_mpos(struct super_block *sb, struct rb_root *pos_root, struct merge_pos *mpos,
struct scoutfs_key *start, struct scoutfs_key *end)
static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags,
void *val, int val_len, void *arg)
{
struct scoutfs_btree_item *item;
struct scoutfs_avl_node *next;
struct btree_walk_key_range kr;
struct scoutfs_key walk_key;
int ret = 0;
struct merged_range *rng = arg;
struct merged_item *mitem;
struct merged_item *found;
struct rb_node *parent;
struct rb_node **link;
int ret;
/* always erase before freeing or inserting */
if (!RB_EMPTY_NODE(&mpos->node)) {
rb_erase(&mpos->node, pos_root);
RB_CLEAR_NODE(&mpos->node);
}
/*
* advance to next item via the avl tree. The caller's pos is
* only ever incremented past the last key so we can use next to
* iterate rather than using search to skip past multiple items.
*/
if (mpos->avl)
mpos->avl = scoutfs_avl_next(&mpos->bt->item_root, mpos->avl);
/* find the next leaf with the key if we run out of items */
walk_key = *start;
while (!mpos->avl && !scoutfs_key_is_zeros(&walk_key)) {
scoutfs_block_put(sb, mpos->bl);
mpos->bl = NULL;
ret = btree_walk(sb, NULL, NULL, mpos->root, BTW_NEXT, &walk_key,
0, &mpos->bl, &kr, NULL);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
free_mpos(sb, mpos);
found = find_mitem(&rng->root, key, &parent, &link);
if (found) {
ret = scoutfs_forest_combine_deltas(key, found->val, found->val_len, val, val_len);
if (ret < 0)
goto out;
if (ret > 0) {
if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
free_mitem(rng, found);
}
ret = 0;
goto out;
}
mpos->bt = mpos->bl->data;
mpos->avl = scoutfs_avl_search(&mpos->bt->item_root, cmp_key_item,
start, NULL, NULL, &next, NULL) ?: next;
if (mpos->avl == NULL)
walk_key = kr.iter_next;
if (found->seq >= seq) {
ret = 0;
goto out;
}
}
/* see if we're out of items within the range */
item = node_item(mpos->avl);
if (!item || scoutfs_key_compare(item_key(item), end) > 0) {
free_mpos(sb, mpos);
ret = 0;
mitem = kmalloc(offsetof(struct merged_item, val[val_len]), GFP_NOFS);
if (!mitem) {
ret = -ENOMEM;
goto out;
}
/* insert the next item within range at its version */
mpos->key = item_key(item);
mpos->seq = le64_to_cpu(item->seq);
mpos->flags = item->flags;
mpos->val_len = item_val_len(item);
mpos->val = item_val(mpos->bt, item);
mitem->key = *key;
mitem->seq = seq;
mitem->flags = flags;
mitem->val_len = val_len;
if (val_len)
memcpy(mitem->val, val, val_len);
if (found) {
replace_mitem(rng, found, mitem);
free_mitem(rng, found);
} else {
insert_mitem(rng, mitem, parent, link);
}
insert_mpos(pos_root, mpos);
ret = 0;
out:
return ret;
}
/*
* The caller has reset all the merge positions for all the input log
* btree roots and wants the next logged item it should try and merge
* with the items in the fs_root.
* Read a range of merged items. The caller has set the key bounds of
* the range. We read a merge window's worth of items from blocks in
* each input btree.
*
* We look ahead in the logged item stream to see if we should merge any
* older logged delta items into one result for the caller. We also
* take this opportunity to skip and reset the mpos for any older
* versions of the first item.
* The caller can only use the smallest range that overlaps with all the
* blocks that we read. We start reading from the range's start key so
* it will always be present and we don't need to adjust it. The final
* block we read from each input might not cover the range's end so it
* needs to be adjusted.
*
* The end range can also shrink if we have to drop items because the
* items exceeded the merge window size.
*/
static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
struct scoutfs_key *end, struct merge_pos **mpos_ret)
static int read_merged_range(struct super_block *sb, struct merged_range *rng,
struct list_head *inputs, int merge_window)
{
struct merge_pos *mpos;
struct merge_pos *next;
struct scoutfs_btree_root_head *rhead;
struct scoutfs_key start;
struct scoutfs_key end;
struct scoutfs_key key;
int ret = 0;
int i;
while ((mpos = first_mpos(pos_root)) && (next = next_mpos(mpos)) &&
!scoutfs_key_compare(mpos->key, next->key)) {
list_for_each_entry(rhead, inputs, head) {
key = rng->start;
ret = scoutfs_forest_combine_deltas(mpos->key, mpos->val, mpos->val_len,
next->val, next->val_len);
if (ret < 0)
break;
/* reset advances to the next item */
key = *mpos->key;
scoutfs_key_inc(&key);
/* always skip next combined or older version */
ret = reset_mpos(sb, pos_root, next, &key, end);
if (ret < 0)
break;
if (ret == SCOUTFS_DELTA_COMBINED) {
scoutfs_inc_counter(sb, btree_merge_delta_combined);
} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
scoutfs_inc_counter(sb, btree_merge_delta_null);
/* if merging resulted in no info, skip current */
ret = reset_mpos(sb, pos_root, mpos, &key, end);
for (i = 0; i < merge_window; i += SCOUTFS_BLOCK_LG_SIZE) {
start = key;
end = rng->end;
ret = scoutfs_btree_read_items(sb, &rhead->root, &key, &start, &end,
merge_read_item, rng);
if (ret < 0)
goto out;
if (scoutfs_key_compare(&end, &rng->end) >= 0)
break;
key = end;
scoutfs_key_inc(&key);
}
if (scoutfs_key_compare(&end, &rng->end) < 0) {
rng->end = end;
trim_range_end(rng);
}
if (rng->size > merge_window)
trim_range_size(rng, merge_window);
}
*mpos_ret = mpos;
trace_scoutfs_btree_merge_read_range(sb, &rng->start, &rng->end, rng->size);
ret = 0;
out:
return ret;
}
@@ -2226,6 +2292,13 @@ static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
* to allocators running low or needing to join/split the parent.
* *next_ret is set to the next key which hasn't been merged so that the
* caller can retry with a new allocator and subtree.
*
* The number of input roots can be immense. The merge_window specifies
* the size of the set of merged items that we'll maintain as we iterate
* over all the input roots. Once we've merged items into the window
* from all the input roots the merged input items are then merged to
* the writable destination root. It may take multiple passes of
* windows of merged items to cover the input key range.
*/
int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_alloc *alloc,
@@ -2235,18 +2308,16 @@ int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_key *next_ret,
struct scoutfs_btree_root *root,
struct list_head *inputs,
bool subtree, int dirty_limit, int alloc_low)
bool subtree, int dirty_limit, int alloc_low, int merge_window)
{
struct scoutfs_btree_root_head *rhead;
struct rb_root pos_root = RB_ROOT;
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_block *bl = NULL;
struct btree_walk_key_range kr;
struct scoutfs_avl_node *par;
struct scoutfs_key next;
struct merge_pos *mpos;
struct merge_pos *tmp;
struct merged_item *mitem;
struct merged_item *tmp;
struct merged_range rng;
int walk_val_len;
int walk_flags;
bool is_del;
@@ -2257,49 +2328,59 @@ int scoutfs_btree_merge(struct super_block *sb,
trace_scoutfs_btree_merge(sb, root, start, end);
scoutfs_inc_counter(sb, btree_merge);
list_for_each_entry(rhead, inputs, head) {
mpos = kzalloc(sizeof(*mpos), GFP_NOFS);
if (!mpos) {
ret = -ENOMEM;
goto out;
}
RB_CLEAR_NODE(&mpos->node);
mpos->root = &rhead->root;
ret = reset_mpos(sb, &pos_root, mpos, start, end);
if (ret < 0)
goto out;
}
walk_flags = BTW_DIRTY;
if (subtree)
walk_flags |= BTW_SUBTREE;
walk_val_len = 0;
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
rng.start = *start;
rng.end = *end;
rng.root = RB_ROOT;
rng.size = 0;
ret = read_merged_range(sb, &rng, inputs, merge_window);
if (ret < 0)
goto out;
for (;;) {
/* read next window as it empties (and it is possible to read an empty range) */
mitem = first_mitem(&rng.root);
if (!mitem) {
/* done if the read range hit the end */
if (scoutfs_key_compare(&rng.end, end) >= 0)
break;
/* read next batch of merged items */
rng.start = rng.end;
scoutfs_key_inc(&rng.start);
rng.end = *end;
ret = read_merged_range(sb, &rng, inputs, merge_window);
if (ret < 0)
break;
continue;
}
if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) {
scoutfs_inc_counter(sb, btree_merge_dirty_limit);
ret = -ERANGE;
*next_ret = *mpos->key;
*next_ret = mitem->key;
goto out;
}
if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) {
scoutfs_inc_counter(sb, btree_merge_alloc_low);
ret = -ERANGE;
*next_ret = *mpos->key;
*next_ret = mitem->key;
goto out;
}
scoutfs_block_put(sb, bl);
bl = NULL;
ret = btree_walk(sb, alloc, wri, root, walk_flags,
mpos->key, walk_val_len, &bl, &kr, NULL);
&mitem->key, walk_val_len, &bl, &kr, NULL);
if (ret < 0) {
if (ret == -ERANGE)
*next_ret = *mpos->key;
*next_ret = mitem->key;
goto out;
}
bt = bl->data;
@@ -2311,22 +2392,21 @@ int scoutfs_btree_merge(struct super_block *sb,
continue;
}
while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
while (mitem) {
/* walk to new leaf if we exceed parent ref key */
if (scoutfs_key_compare(mpos->key, &kr.end) > 0)
if (scoutfs_key_compare(&mitem->key, &kr.end) > 0)
break;
/* see if there's an existing item */
item = leaf_item_hash_search(sb, bt, mpos->key);
is_del = !!(mpos->flags & SCOUTFS_ITEM_FLAG_DELETION);
item = leaf_item_hash_search(sb, bt, &mitem->key);
is_del = !!(mitem->flags & SCOUTFS_ITEM_FLAG_DELETION);
/* see if we're merging delta items */
if (item && !is_del)
delta = scoutfs_forest_combine_deltas(mpos->key,
delta = scoutfs_forest_combine_deltas(&mitem->key,
item_val(bt, item),
item_val_len(item),
mpos->val, mpos->val_len);
mitem->val, mitem->val_len);
else
delta = 0;
if (delta < 0) {
@@ -2338,40 +2418,38 @@ int scoutfs_btree_merge(struct super_block *sb,
scoutfs_inc_counter(sb, btree_merge_delta_null);
}
trace_scoutfs_btree_merge_items(sb, mpos->root,
mpos->key, mpos->val_len,
trace_scoutfs_btree_merge_items(sb, &mitem->key, mitem->val_len,
item ? root : NULL,
item ? item_key(item) : NULL,
item ? item_val_len(item) : 0, is_del);
/* rewalk and split if ins/update needs room */
if (!is_del && !delta && !mid_free_item_room(bt, mpos->val_len)) {
if (!is_del && !delta && !mid_free_item_room(bt, mitem->val_len)) {
walk_flags |= BTW_INSERT;
walk_val_len = mpos->val_len;
walk_val_len = mitem->val_len;
break;
}
/* insert missing non-deletion merge items */
if (!item && !is_del) {
scoutfs_avl_search(&bt->item_root,
cmp_key_item, mpos->key,
scoutfs_avl_search(&bt->item_root, cmp_key_item, &mitem->key,
&cmp, &par, NULL, NULL);
create_item(bt, mpos->key, mpos->seq, mpos->flags,
mpos->val, mpos->val_len, par, cmp);
create_item(bt, &mitem->key, mitem->seq, mitem->flags,
mitem->val, mitem->val_len, par, cmp);
scoutfs_inc_counter(sb, btree_merge_insert);
}
/* update existing items */
if (item && !is_del && !delta) {
item->seq = cpu_to_le64(mpos->seq);
item->flags = mpos->flags;
update_item_value(bt, item, mpos->val, mpos->val_len);
item->seq = cpu_to_le64(mitem->seq);
item->flags = mitem->flags;
update_item_value(bt, item, mitem->val, mitem->val_len);
scoutfs_inc_counter(sb, btree_merge_update);
}
/* update combined delta item seq */
if (delta == SCOUTFS_DELTA_COMBINED) {
item->seq = cpu_to_le64(mpos->seq);
item->seq = cpu_to_le64(mitem->seq);
}
/*
@@ -2403,21 +2481,18 @@ int scoutfs_btree_merge(struct super_block *sb,
walk_flags &= ~(BTW_INSERT | BTW_DELETE);
walk_val_len = 0;
/* finished with this key, skip any older items */
next = *mpos->key;
scoutfs_key_inc(&next);
ret = reset_mpos(sb, &pos_root, mpos, &next, end);
if (ret < 0)
goto out;
/* finished with this merged item */
tmp = mitem;
mitem = next_mitem(mitem);
free_mitem(&rng, tmp);
}
}
ret = 0;
out:
scoutfs_block_put(sb, bl);
rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) {
free_mpos(sb, mpos);
}
rbtree_postorder_for_each_entry_safe(mitem, tmp, &rng.root, node)
free_mitem(&rng, mitem);
return ret;
}

View File

@@ -119,7 +119,7 @@ int scoutfs_btree_merge(struct super_block *sb,
struct scoutfs_key *next_ret,
struct scoutfs_btree_root *root,
struct list_head *input_list,
bool subtree, int dirty_limit, int alloc_low);
bool subtree, int dirty_limit, int alloc_low, int merge_window);
int scoutfs_btree_free_blocks(struct super_block *sb,
struct scoutfs_alloc *alloc,

View File

@@ -145,6 +145,7 @@
EXPAND_COUNTER(lock_shrink_work) \
EXPAND_COUNTER(lock_unlock) \
EXPAND_COUNTER(lock_wait) \
EXPAND_COUNTER(log_merge_wait_timeout) \
EXPAND_COUNTER(net_dropped_response) \
EXPAND_COUNTER(net_send_bytes) \
EXPAND_COUNTER(net_send_error) \

View File

@@ -721,7 +721,8 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work)
ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end,
&next, &comp.root, &inputs,
!!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)),
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10);
SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10,
(2 * 1024 * 1024));
if (ret == -ERANGE) {
comp.remain = next;
le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN);

View File

@@ -33,6 +33,7 @@ enum {
Opt_acl,
Opt_data_prealloc_blocks,
Opt_data_prealloc_contig_only,
Opt_log_merge_wait_timeout_ms,
Opt_metadev_path,
Opt_noacl,
Opt_orphan_scan_delay_ms,
@@ -45,6 +46,7 @@ static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
{Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"},
{Opt_metadev_path, "metadev_path=%s"},
{Opt_noacl, "noacl"},
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
@@ -113,6 +115,10 @@ static void free_options(struct scoutfs_mount_options *opts)
kfree(opts->metadev_path);
}
#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS 100UL
#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS 500
#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS (60 * MSEC_PER_SEC)
#define MIN_ORPHAN_SCAN_DELAY_MS 100UL
#define DEFAULT_ORPHAN_SCAN_DELAY_MS (10 * MSEC_PER_SEC)
#define MAX_ORPHAN_SCAN_DELAY_MS (60 * MSEC_PER_SEC)
@@ -126,11 +132,27 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
opts->data_prealloc_contig_only = 1;
opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS;
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
opts->quorum_slot_nr = -1;
}
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
{
if (ret < 0) {
scoutfs_err(sb, "failed to parse log_merge_wait_timeout_ms value");
return -EINVAL;
}
if (val < MIN_LOG_MERGE_WAIT_TIMEOUT_MS || val > MAX_LOG_MERGE_WAIT_TIMEOUT_MS) {
scoutfs_err(sb, "invalid log_merge_wait_timeout_ms value %d, must be between %lu and %lu",
val, MIN_LOG_MERGE_WAIT_TIMEOUT_MS, MAX_LOG_MERGE_WAIT_TIMEOUT_MS);
return -EINVAL;
}
return 0;
}
static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
{
if (ret < 0) {
@@ -196,6 +218,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->data_prealloc_contig_only = nr;
break;
case Opt_log_merge_wait_timeout_ms:
ret = match_int(args, &nr);
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
if (ret < 0)
return ret;
opts->log_merge_wait_timeout_ms = nr;
break;
case Opt_metadev_path:
ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
if (ret < 0)
@@ -422,6 +452,43 @@ static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj
}
SCOUTFS_ATTR_RW(data_prealloc_contig_only);
static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
struct scoutfs_mount_options opts;
scoutfs_options_read(sb, &opts);
return snprintf(buf, PAGE_SIZE, "%u", opts.log_merge_wait_timeout_ms);
}
static ssize_t log_merge_wait_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
DECLARE_OPTIONS_INFO(sb, optinf);
char nullterm[30]; /* more than enough for octal -U64_MAX */
int val;
int len;
int ret;
len = min(count, sizeof(nullterm) - 1);
memcpy(nullterm, buf, len);
nullterm[len] = '\0';
ret = kstrtoint(nullterm, 0, &val);
ret = verify_log_merge_wait_timeout_ms(sb, ret, val);
if (ret == 0) {
write_seqlock(&optinf->seqlock);
optinf->opts.log_merge_wait_timeout_ms = val;
write_sequnlock(&optinf->seqlock);
ret = count;
}
return ret;
}
SCOUTFS_ATTR_RW(log_merge_wait_timeout_ms);
static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
@@ -525,6 +592,7 @@ SCOUTFS_ATTR_RO(quorum_slot_nr);
static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms),
SCOUTFS_ATTR_PTR(metadev_path),
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),

View File

@@ -8,6 +8,7 @@
struct scoutfs_mount_options {
u64 data_prealloc_blocks;
bool data_prealloc_contig_only;
unsigned int log_merge_wait_timeout_ms;
char *metadev_path;
unsigned int orphan_scan_delay_ms;
int quorum_slot_nr;

View File

@@ -439,6 +439,7 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
SCSB_TRACE_ASSIGN(sb);
__entry->journal_info = (unsigned long)journal_info;
__entry->holders = holders;
__entry->ret = ret;
),
TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d",
@@ -1746,21 +1747,41 @@ TRACE_EVENT(scoutfs_btree_merge,
sk_trace_args(end))
);
TRACE_EVENT(scoutfs_btree_merge_read_range,
TP_PROTO(struct super_block *sb, struct scoutfs_key *start, struct scoutfs_key *end,
int size),
TP_ARGS(sb, start, end, size),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
sk_trace_define(start)
sk_trace_define(end)
__field(int, size)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
sk_trace_assign(start, start);
sk_trace_assign(end, end);
__entry->size = size;
),
TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" size %d",
SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end), __entry->size)
);
TRACE_EVENT(scoutfs_btree_merge_items,
TP_PROTO(struct super_block *sb,
struct scoutfs_btree_root *m_root,
struct scoutfs_key *m_key, int m_val_len,
struct scoutfs_btree_root *f_root,
struct scoutfs_key *f_key, int f_val_len,
int is_del),
TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
TP_ARGS(sb, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, m_root_blkno)
__field(__u64, m_root_seq)
__field(__u8, m_root_height)
sk_trace_define(m_key)
__field(int, m_val_len)
__field(__u64, f_root_blkno)
@@ -1773,10 +1794,6 @@ TRACE_EVENT(scoutfs_btree_merge_items,
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->m_root_blkno = m_root ?
le64_to_cpu(m_root->ref.blkno) : 0;
__entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0;
__entry->m_root_height = m_root ? m_root->height : 0;
sk_trace_assign(m_key, m_key);
__entry->m_val_len = m_val_len;
__entry->f_root_blkno = f_root ?
@@ -1788,11 +1805,9 @@ TRACE_EVENT(scoutfs_btree_merge_items,
__entry->is_del = !!is_del;
),
TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq,
__entry->m_root_height, sk_trace_args(m_key),
__entry->m_val_len, __entry->f_root_blkno,
__entry->f_root_seq, __entry->f_root_height,
TP_printk(SCSBF" merge item key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
SCSB_TRACE_ARGS, sk_trace_args(m_key), __entry->m_val_len,
__entry->f_root_blkno, __entry->f_root_seq, __entry->f_root_height,
sk_trace_args(f_key), __entry->f_val_len, __entry->is_del)
);
@@ -2075,6 +2090,71 @@ TRACE_EVENT(scoutfs_trans_seq_last,
SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
);
TRACE_EVENT(scoutfs_server_finalize_items,
TP_PROTO(struct super_block *sb, u64 rid, u64 item_rid, u64 item_nr, u64 item_flags,
u64 item_get_trans_seq),
TP_ARGS(sb, rid, item_rid, item_nr, item_flags, item_get_trans_seq),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(__u64, item_rid)
__field(__u64, item_nr)
__field(__u64, item_flags)
__field(__u64, item_get_trans_seq)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->item_rid = item_rid;
__entry->item_nr = item_nr;
__entry->item_flags = item_flags;
__entry->item_get_trans_seq = item_get_trans_seq;
),
TP_printk(SCSBF" rid %016llx item_rid %016llx item_nr %llu item_flags 0x%llx item_get_trans_seq %llu",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->item_rid, __entry->item_nr,
__entry->item_flags, __entry->item_get_trans_seq)
);
TRACE_EVENT(scoutfs_server_finalize_decision,
TP_PROTO(struct super_block *sb, u64 rid, bool saw_finalized, bool others_active,
bool ours_visible, bool finalize_ours, unsigned int delay_ms,
u64 finalize_sent_seq),
TP_ARGS(sb, rid, saw_finalized, others_active, ours_visible, finalize_ours, delay_ms,
finalize_sent_seq),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, c_rid)
__field(bool, saw_finalized)
__field(bool, others_active)
__field(bool, ours_visible)
__field(bool, finalize_ours)
__field(unsigned int, delay_ms)
__field(__u64, finalize_sent_seq)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->c_rid = rid;
__entry->saw_finalized = saw_finalized;
__entry->others_active = others_active;
__entry->ours_visible = ours_visible;
__entry->finalize_ours = finalize_ours;
__entry->delay_ms = delay_ms;
__entry->finalize_sent_seq = finalize_sent_seq;
),
TP_printk(SCSBF" rid %016llx saw_finalized %u others_active %u ours_visible %u finalize_ours %u delay_ms %u finalize_sent_seq %llu",
SCSB_TRACE_ARGS, __entry->c_rid, __entry->saw_finalized, __entry->others_active,
__entry->ours_visible, __entry->finalize_ours, __entry->delay_ms,
__entry->finalize_sent_seq)
);
TRACE_EVENT(scoutfs_get_log_merge_status,
TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key,
u64 nr_requests, u64 nr_complete, u64 seq),
@@ -2799,6 +2879,81 @@ TRACE_EVENT(scoutfs_omap_should_delete,
SCSB_TRACE_ARGS, __entry->ino, __entry->nlink, __entry->ret)
);
#define SSCF_FMT "[bo %llu bs %llu es %llu]"
#define SSCF_FIELDS(pref) \
__field(__u64, pref##_blkno) \
__field(__u64, pref##_blocks) \
__field(__u64, pref##_entries)
#define SSCF_ASSIGN(pref, sfl) \
__entry->pref##_blkno = le64_to_cpu((sfl)->ref.blkno); \
__entry->pref##_blocks = le64_to_cpu((sfl)->blocks); \
__entry->pref##_entries = le64_to_cpu((sfl)->entries);
#define SSCF_ENTRY_ARGS(pref) \
__entry->pref##_blkno, \
__entry->pref##_blocks, \
__entry->pref##_entries
DECLARE_EVENT_CLASS(scoutfs_srch_compact_class,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, id)
__field(__u8, nr)
__field(__u8, flags)
SSCF_FIELDS(out)
__field(__u64, in0_blk)
__field(__u64, in0_pos)
SSCF_FIELDS(in0)
__field(__u64, in1_blk)
__field(__u64, in1_pos)
SSCF_FIELDS(in1)
__field(__u64, in2_blk)
__field(__u64, in2_pos)
SSCF_FIELDS(in2)
__field(__u64, in3_blk)
__field(__u64, in3_pos)
SSCF_FIELDS(in3)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->id = le64_to_cpu(sc->id);
__entry->nr = sc->nr;
__entry->flags = sc->flags;
SSCF_ASSIGN(out, &sc->out)
__entry->in0_blk = le64_to_cpu(sc->in[0].blk);
__entry->in0_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in0, &sc->in[0].sfl)
__entry->in1_blk = le64_to_cpu(sc->in[0].blk);
__entry->in1_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in1, &sc->in[1].sfl)
__entry->in2_blk = le64_to_cpu(sc->in[0].blk);
__entry->in2_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in2, &sc->in[2].sfl)
__entry->in3_blk = le64_to_cpu(sc->in[0].blk);
__entry->in3_pos = le64_to_cpu(sc->in[0].pos);
SSCF_ASSIGN(in3, &sc->in[3].sfl)
),
TP_printk(SCSBF" id %llu nr %u flags 0x%x out "SSCF_FMT" in0 b %llu p %llu "SSCF_FMT" in1 b %llu p %llu "SSCF_FMT" in2 b %llu p %llu "SSCF_FMT" in3 b %llu p %llu "SSCF_FMT,
SCSB_TRACE_ARGS, __entry->id, __entry->nr, __entry->flags, SSCF_ENTRY_ARGS(out),
__entry->in0_blk, __entry->in0_pos, SSCF_ENTRY_ARGS(in0),
__entry->in1_blk, __entry->in1_pos, SSCF_ENTRY_ARGS(in1),
__entry->in2_blk, __entry->in2_pos, SSCF_ENTRY_ARGS(in2),
__entry->in3_blk, __entry->in3_pos, SSCF_ENTRY_ARGS(in3))
);
DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_send,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc)
);
DEFINE_EVENT(scoutfs_srch_compact_class, scoutfs_srch_compact_client_recv,
TP_PROTO(struct super_block *sb, struct scoutfs_srch_compact *sc),
TP_ARGS(sb, sc)
);
#endif /* _TRACE_SCOUTFS_H */
/* This part must be outside protection */

View File

@@ -148,6 +148,8 @@ struct server_info {
struct scoutfs_quorum_config qconf;
/* a running server maintains a private dirty super */
struct scoutfs_super_block dirty_super;
u64 finalize_sent_seq;
};
#define DECLARE_SERVER_INFO(sb, name) \
@@ -413,6 +415,27 @@ static void server_hold_commit(struct super_block *sb, struct commit_hold *hold)
wait_event(cusers->waitq, hold_commit(sb, server, cusers, hold));
}
/*
* Return the higher of the avail or freed used by the active commit
* since this holder joined the commit. This is *not* the amount used
* by the holder, we don't track per-holder alloc use.
*/
static u32 server_hold_alloc_used_since(struct super_block *sb, struct commit_hold *hold)
{
DECLARE_SERVER_INFO(sb, server);
u32 avail_used;
u32 freed_used;
u32 avail_now;
u32 freed_now;
scoutfs_alloc_meta_remaining(&server->alloc, &avail_now, &freed_now);
avail_used = hold->avail - avail_now;
freed_used = hold->freed - freed_now;
return max(avail_used, freed_used);
}
/*
* This is called while holding the commit and returns once the commit
* is successfully written. Many holders can all wait for all holders
@@ -938,22 +961,24 @@ static int find_log_trees_item(struct super_block *sb,
}
/*
* Find the next log_trees item from the key. Fills the caller's log_trees and sets
* the key past the returned log_trees for iteration. Returns 0 when done, > 0 for each
* item, and -errno on fatal errors.
* Find the log_trees item with the greatest nr for each rid. Fills the
* caller's log_trees and sets the key before the returned log_trees for
* the next iteration. Returns 0 when done, > 0 for each item, and
* -errno on fatal errors.
*/
static int for_each_lt(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
static int for_each_rid_last_lt(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *key, struct scoutfs_log_trees *lt)
{
SCOUTFS_BTREE_ITEM_REF(iref);
int ret;
ret = scoutfs_btree_next(sb, root, key, &iref);
ret = scoutfs_btree_prev(sb, root, key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
memcpy(lt, iref.val, iref.val_len);
*key = *iref.key;
scoutfs_key_inc(key);
key->sklt_nr = 0;
scoutfs_key_dec(key);
ret = 1;
} else {
ret = -EIO;
@@ -1048,21 +1073,13 @@ static int next_log_merge_item(struct super_block *sb,
* abandoned log btree finalized. If it takes too long each client has
a chance to make forward progress before being asked to commit again.
*
* We're waiting on heavy state that is protected by mutexes and
* transaction machinery. It's tricky to recreate that state for
* lightweight condition tests that don't change task state. Instead of
* trying to get that right, particularly as we unwind after success or
* after timeouts, waiters use an unsatisfying poll. Short enough to
* not add terrible latency, given how heavy and infrequent this already
* is, and long enough to not melt the cpu. This could be tuned if it
* becomes a problem.
*
* This can end up finalizing a new empty log btree if a new mount
* happens to arrive at just the right time. That's fine, merging will
* ignore and tear down the empty input.
*/
#define FINALIZE_POLL_MS (11)
#define FINALIZE_TIMEOUT_MS (MSEC_PER_SEC / 2)
#define FINALIZE_POLL_MIN_DELAY_MS 5U
#define FINALIZE_POLL_MAX_DELAY_MS 100U
#define FINALIZE_POLL_DELAY_GROWTH_PCT 150U
static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_log_trees *lt,
u64 rid, struct commit_hold *hold)
{
@@ -1070,8 +1087,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
struct scoutfs_log_merge_status stat;
struct scoutfs_log_merge_range rng;
struct scoutfs_mount_options opts;
struct scoutfs_log_trees each_lt;
struct scoutfs_log_trees fin;
unsigned int delay_ms;
unsigned long timeo;
bool saw_finalized;
bool others_active;
@@ -1079,10 +1098,14 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
bool ours_visible;
struct scoutfs_key key;
char *err_str = NULL;
ktime_t start;
int ret;
int err;
timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS);
scoutfs_options_read(sb, &opts);
timeo = jiffies + msecs_to_jiffies(opts.log_merge_wait_timeout_ms);
delay_ms = FINALIZE_POLL_MIN_DELAY_MS;
start = ktime_get_raw();
for (;;) {
/* nothing to do if there's already a merge in flight */
@@ -1099,8 +1122,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
saw_finalized = false;
others_active = false;
ours_visible = false;
scoutfs_key_init_log_trees(&key, 0, 0);
while ((ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
trace_scoutfs_server_finalize_items(sb, rid, le64_to_cpu(each_lt.rid),
le64_to_cpu(each_lt.nr),
le64_to_cpu(each_lt.flags),
le64_to_cpu(each_lt.get_trans_seq));
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED))
saw_finalized = true;
@@ -1125,6 +1153,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
finalize_ours = (lt->item_root.height > 2) ||
(le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW);
trace_scoutfs_server_finalize_decision(sb, rid, saw_finalized, others_active,
ours_visible, finalize_ours, delay_ms,
server->finalize_sent_seq);
/* done if we're not finalizing and there's no finalized */
if (!finalize_ours && !saw_finalized) {
ret = 0;
@@ -1132,12 +1164,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
}
/* send sync requests soon to give time to commit */
scoutfs_key_init_log_trees(&key, 0, 0);
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while (others_active &&
(ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
(ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
(le64_to_cpu(each_lt.rid) == rid))
(le64_to_cpu(each_lt.rid) == rid) ||
(le64_to_cpu(each_lt.get_trans_seq) <= server->finalize_sent_seq))
continue;
ret = scoutfs_net_submit_request_node(sb, server->conn,
@@ -1157,6 +1190,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
break;
}
server->finalize_sent_seq = scoutfs_server_seq(sb);
/* Finalize ours if it's visible to others */
if (ours_visible) {
fin = *lt;
@@ -1194,13 +1229,16 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
if (ret < 0)
err_str = "applying commit before waiting for finalized";
msleep(FINALIZE_POLL_MS);
msleep(delay_ms);
delay_ms = min(delay_ms * FINALIZE_POLL_DELAY_GROWTH_PCT / 100,
FINALIZE_POLL_MAX_DELAY_MS);
server_hold_commit(sb, hold);
mutex_lock(&server->logs_mutex);
/* done if we timed out */
if (time_after(jiffies, timeo)) {
scoutfs_inc_counter(sb, log_merge_wait_timeout);
ret = 0;
break;
}
@@ -1783,43 +1821,29 @@ out:
* Give the caller the last seq before outstanding client commits. All
* seqs up to and including this are stable, new client transactions can
* only have greater seqs.
*
* For each rid, only its greatest log trees nr can be an open commit.
* We look at the last log_trees item for each client rid and record its
* trans seq if it hasn't been committed.
*/
static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
{
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
DECLARE_SERVER_INFO(sb, server);
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_log_trees *lt;
struct scoutfs_log_trees lt;
struct scoutfs_key key;
u64 last_seq = 0;
int ret;
last_seq = scoutfs_server_seq(sb) - 1;
scoutfs_key_init_log_trees(&key, 0, 0);
mutex_lock(&server->logs_mutex);
for (;; scoutfs_key_inc(&key)) {
ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
if (ret == 0) {
if (iref.val_len == sizeof(*lt)) {
lt = iref.val;
if ((le64_to_cpu(lt->get_trans_seq) >
le64_to_cpu(lt->commit_trans_seq)) &&
le64_to_cpu(lt->get_trans_seq) <= last_seq) {
last_seq = le64_to_cpu(lt->get_trans_seq) - 1;
}
key = *iref.key;
} else {
ret = -EIO;
}
scoutfs_btree_put_iref(&iref);
}
if (ret < 0) {
if (ret == -ENOENT) {
ret = 0;
break;
}
scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
if ((le64_to_cpu(lt.get_trans_seq) > le64_to_cpu(lt.commit_trans_seq)) &&
le64_to_cpu(lt.get_trans_seq) <= last_seq) {
last_seq = le64_to_cpu(lt.get_trans_seq) - 1;
}
}
@@ -1966,9 +1990,7 @@ static int server_srch_get_compact(struct super_block *sb,
ret = scoutfs_srch_get_compact(sb, &server->alloc, &server->wri,
&super->srch_root, rid, sc);
mutex_unlock(&server->srch_mutex);
if (ret == 0 && sc->nr == 0)
ret = -ENOENT;
if (ret < 0)
if (ret < 0 || (ret == 0 && sc->nr == 0))
goto apply;
mutex_lock(&server->alloc_mutex);
@@ -2473,9 +2495,11 @@ static void server_log_merge_free_work(struct work_struct *work)
while (!server_is_stopping(server)) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
commit = true;
if (!commit) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
commit = true;
}
ret = next_log_merge_item(sb, &super->log_merge,
SCOUTFS_LOG_MERGE_FREEING_ZONE,
@@ -2522,12 +2546,14 @@ static void server_log_merge_free_work(struct work_struct *work)
/* freed blocks are in allocator, we *have* to update fr */
BUG_ON(ret < 0);
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
commit = false;
if (ret < 0) {
err_str = "looping commit del/upd freeing item";
break;
if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
commit = false;
if (ret < 0) {
err_str = "looping commit del/upd freeing item";
break;
}
}
}
@@ -4300,6 +4326,7 @@ static void scoutfs_server_worker(struct work_struct *work)
scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));
scoutfs_block_writer_init(sb, &server->wri);
server->finalize_sent_seq = 0;
/* first make sure no other servers are still running */
ret = scoutfs_quorum_fence_leaders(sb, &server->qconf, server->term);

View File

@@ -30,6 +30,9 @@
#include "client.h"
#include "counters.h"
#include "scoutfs_trace.h"
#include "triggers.h"
#include "sysfs.h"
#include "msg.h"
/*
* This srch subsystem gives us a way to find inodes that have a given
@@ -68,10 +71,14 @@ struct srch_info {
atomic_t shutdown;
struct workqueue_struct *workq;
struct delayed_work compact_dwork;
struct scoutfs_sysfs_attrs ssa;
atomic_t compact_delay_ms;
};
#define DECLARE_SRCH_INFO(sb, name) \
struct srch_info *name = SCOUTFS_SB(sb)->srch_info
#define DECLARE_SRCH_INFO_KOBJ(kobj, name) \
DECLARE_SRCH_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)
#define SRE_FMT "%016llx.%llu.%llu"
#define SRE_ARG(sre) \
@@ -520,6 +527,95 @@ out:
return ret;
}
/*
* Padded entries are encoded in pairs after an existing entry. Each
* pair cancels itself out for all readers (the second encoding looks
* like a deletion) so the padding isn't visible to the first/last
* bounds of the block or file.
*/
static int append_padded_entry(struct scoutfs_srch_file *sfl, u64 blk,
struct scoutfs_srch_block *srb, struct scoutfs_srch_entry *sre)
{
int ret;
ret = encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes),
sre, &srb->tail);
if (ret > 0) {
srb->tail = *sre;
le32_add_cpu(&srb->entry_nr, 1);
le32_add_cpu(&srb->entry_bytes, ret);
le64_add_cpu(&sfl->entries, 1);
ret = 0;
}
return ret;
}
/*
* This is called by a testing trigger to create a very specific case of
* encoded entry offsets. We want the last entry in the block to start
* precisely at the _SAFE_BYTES offset.
*
* This is called when there is a single existing entry in the block.
* We have the entire block to work with. We encode pairs of matching
* entries. This hides them from readers (both searches and merging) as
* they're interpreted as creation and deletion and are deleted. We use
* the existing hash value of the first entry in the block but then set
* the inode to an impossibly large number so it doesn't interfere with
* anything.
*
* To hit the specific offset we very carefully manage the number of
* bytes of change between fields in the entry. We know that if we
* change all the bytes of the ino and id we end up with a 20 byte
* (2+8+8,2) encoding of the pair of entries. To have the last entry
* start at the _SAFE_BYTES offset we know that the final 20 byte pair
* encoding needs to end 2 bytes (the second entry encoding) after the
* _SAFE_BYTES offset.
*
* So as we encode pairs we watch the delta of our current offset from
* that desired final offset of 2 past _SAFE_BYTES. If we're a multiple
* of 20 away then we encode the full 20 byte pairs. If we're not, then
* we drop a byte to encode 19 bytes. That slowly brings us back to a
* multiple of 20 away while still encoding large entries.
*/
static void pad_entries_at_safe(struct scoutfs_srch_file *sfl, u64 blk,
struct scoutfs_srch_block *srb)
{
struct scoutfs_srch_entry sre;
u32 target;
s32 diff;
u64 hash;
u64 ino;
u64 id;
int ret;
hash = le64_to_cpu(srb->tail.hash);
ino = le64_to_cpu(srb->tail.ino) | (1ULL << 62);
id = le64_to_cpu(srb->tail.id);
target = SCOUTFS_SRCH_BLOCK_SAFE_BYTES + 2;
while ((diff = target - le32_to_cpu(srb->entry_bytes)) > 0) {
ino ^= 1ULL << (7 * 8);
if (diff % 20 == 0) {
id ^= 1ULL << (7 * 8);
} else {
id ^= 1ULL << (6 * 8);
}
sre.hash = cpu_to_le64(hash);
sre.ino = cpu_to_le64(ino);
sre.id = cpu_to_le64(id);
ret = append_padded_entry(sfl, blk, srb, &sre);
if (ret == 0)
ret = append_padded_entry(sfl, blk, srb, &sre);
BUG_ON(ret != 0);
diff = target - le32_to_cpu(srb->entry_bytes);
}
}
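A worked example makes the 19/20-byte bookkeeping above easier to follow. This is a standalone sketch, not part of the patch; the starting offset and the stand-in for SCOUTFS_SRCH_BLOCK_SAFE_BYTES are made-up values, and only the 20- and 19-byte pair sizes come from the comment above.

#include <stdio.h>

#define SAFE_BYTES 65000	/* made-up stand-in for SCOUTFS_SRCH_BLOCK_SAFE_BYTES */

int main(void)
{
	int entry_bytes = 17;		/* arbitrary: bytes already encoded in the block */
	int target = SAFE_BYTES + 2;	/* the final pair must end 2 bytes past the safe offset */
	int pairs_19 = 0;
	int pairs_20 = 0;
	int diff;

	while ((diff = target - entry_bytes) > 0) {
		if (diff % 20 == 0) {
			entry_bytes += 20;	/* a fully changed pair encodes to 20 bytes */
			pairs_20++;
		} else {
			entry_bytes += 19;	/* drop one changed byte to nudge the delta */
			pairs_19++;
		}
	}

	/* at most 19 short pairs are needed before the delta is a multiple of 20 */
	printf("%d 19-byte pairs, %d 20-byte pairs, ended at %d (target %d)\n",
	       pairs_19, pairs_20, entry_bytes, target);
	return 0;
}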
/*
* The caller is dropping an ino/id because the tracking rbtree is full.
* This loses information so we can't return any entries at or after the
@@ -987,6 +1083,9 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
struct scoutfs_key key;
int ret;
if (sfl->ref.blkno && !force && scoutfs_trigger(sb, SRCH_FORCE_LOG_ROTATE))
force = true;
if (sfl->ref.blkno == 0 ||
(!force && le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT))
return 0;
@@ -1462,7 +1561,7 @@ static int kway_merge(struct super_block *sb,
struct scoutfs_block_writer *wri,
struct scoutfs_srch_file *sfl,
kway_get_t kway_get, kway_advance_t kway_adv,
void **args, int nr)
void **args, int nr, bool logs_input)
{
DECLARE_SRCH_INFO(sb, srinf);
struct scoutfs_srch_block *srb = NULL;
@@ -1567,6 +1666,15 @@ static int kway_merge(struct super_block *sb,
blk++;
}
/* end sorted block on _SAFE offset for testing */
if (bl && le32_to_cpu(srb->entry_nr) == 1 && logs_input &&
scoutfs_trigger(sb, SRCH_COMPACT_LOGS_PAD_SAFE)) {
pad_entries_at_safe(sfl, blk, srb);
scoutfs_block_put(sb, bl);
bl = NULL;
blk++;
}
scoutfs_inc_counter(sb, srch_compact_entry);
} else {
@@ -1609,6 +1717,8 @@ static int kway_merge(struct super_block *sb,
empty++;
ret = 0;
} else if (ret < 0) {
if (ret == -ENOANO) /* just testing trigger */
ret = 0;
goto out;
}
@@ -1816,7 +1926,7 @@ static int compact_logs(struct super_block *sb,
}
ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_page, kway_adv_page,
args, nr_pages);
args, nr_pages, true);
if (ret < 0)
goto out;
@@ -1874,12 +1984,18 @@ static int kway_get_reader(struct super_block *sb,
srb = rdr->bl->data;
if (rdr->pos > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip >= SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip >= le32_to_cpu(srb->entry_bytes)) {
/* XXX inconsistency */
return -EIO;
}
if (rdr->decoded_bytes == 0 && rdr->pos == SCOUTFS_SRCH_BLOCK_SAFE_BYTES &&
scoutfs_trigger(sb, SRCH_MERGE_STOP_SAFE)) {
/* only used in testing */
return -ENOANO;
}
/* decode entry, possibly skipping start of the block */
while (rdr->decoded_bytes == 0 || rdr->pos < rdr->skip) {
ret = decode_entry(srb->entries + rdr->pos,
@@ -1969,7 +2085,7 @@ static int compact_sorted(struct super_block *sb,
}
ret = kway_merge(sb, alloc, wri, &sc->out, kway_get_reader,
kway_adv_reader, args, nr);
kway_adv_reader, args, nr, false);
sc->flags |= SCOUTFS_SRCH_COMPACT_FLAG_DONE;
for (i = 0; i < nr; i++) {
@@ -2098,8 +2214,15 @@ static int delete_files(struct super_block *sb, struct scoutfs_alloc *alloc,
return ret;
}
/* wait 10s between compact attempts on error, immediate after success */
#define SRCH_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
static void queue_compact_work(struct srch_info *srinf, bool immediate)
{
unsigned long delay;
if (!atomic_read(&srinf->shutdown)) {
delay = immediate ? 0 : msecs_to_jiffies(atomic_read(&srinf->compact_delay_ms));
queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
}
}
/*
* Get a compaction operation from the server, sort the entries from the
@@ -2127,7 +2250,6 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
struct super_block *sb = srinf->sb;
struct scoutfs_block_writer wri;
struct scoutfs_alloc alloc;
unsigned long delay;
int ret;
int err;
@@ -2140,6 +2262,8 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
scoutfs_block_writer_init(sb, &wri);
ret = scoutfs_client_srch_get_compact(sb, sc);
if (ret >= 0)
trace_scoutfs_srch_compact_client_recv(sb, sc);
if (ret < 0 || sc->nr == 0)
goto out;
@@ -2168,6 +2292,7 @@ commit:
sc->meta_freed = alloc.freed;
sc->flags |= ret < 0 ? SCOUTFS_SRCH_COMPACT_FLAG_ERROR : 0;
trace_scoutfs_srch_compact_client_send(sb, sc);
err = scoutfs_client_srch_commit_compact(sb, sc);
if (err < 0 && ret == 0)
ret = err;
@@ -2178,14 +2303,56 @@ out:
scoutfs_inc_counter(sb, srch_compact_error);
scoutfs_block_writer_forget_all(sb, &wri);
if (!atomic_read(&srinf->shutdown)) {
delay = ret == 0 ? 0 : msecs_to_jiffies(SRCH_COMPACT_DELAY_MS);
queue_delayed_work(srinf->workq, &srinf->compact_dwork, delay);
}
queue_compact_work(srinf, sc->nr > 0 && ret == 0);
kfree(sc);
}
static ssize_t compact_delay_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
DECLARE_SRCH_INFO_KOBJ(kobj, srinf);
return snprintf(buf, PAGE_SIZE, "%u", atomic_read(&srinf->compact_delay_ms));
}
#define MIN_COMPACT_DELAY_MS MSEC_PER_SEC
#define DEF_COMPACT_DELAY_MS (10 * MSEC_PER_SEC)
#define MAX_COMPACT_DELAY_MS (60 * MSEC_PER_SEC)
static ssize_t compact_delay_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
DECLARE_SRCH_INFO(sb, srinf);
char nullterm[30]; /* more than enough for octal -U64_MAX */
u64 val;
int len;
int ret;
len = min(count, sizeof(nullterm) - 1);
memcpy(nullterm, buf, len);
nullterm[len] = '\0';
ret = kstrtoll(nullterm, 0, &val);
if (ret < 0 || val < MIN_COMPACT_DELAY_MS || val > MAX_COMPACT_DELAY_MS) {
scoutfs_err(sb, "invalid compact_delay_ms value, must be between %lu and %lu",
MIN_COMPACT_DELAY_MS, MAX_COMPACT_DELAY_MS);
return -EINVAL;
}
atomic_set(&srinf->compact_delay_ms, val);
cancel_delayed_work(&srinf->compact_dwork);
queue_compact_work(srinf, false);
return count;
}
SCOUTFS_ATTR_RW(compact_delay_ms);
static struct attribute *srch_attrs[] = {
SCOUTFS_ATTR_PTR(compact_delay_ms),
NULL,
};
void scoutfs_srch_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -2202,6 +2369,8 @@ void scoutfs_srch_destroy(struct super_block *sb)
destroy_workqueue(srinf->workq);
}
scoutfs_sysfs_destroy_attrs(sb, &srinf->ssa);
kfree(srinf);
sbi->srch_info = NULL;
}
@@ -2219,8 +2388,15 @@ int scoutfs_srch_setup(struct super_block *sb)
srinf->sb = sb;
atomic_set(&srinf->shutdown, 0);
INIT_DELAYED_WORK(&srinf->compact_dwork, scoutfs_srch_compact_worker);
scoutfs_sysfs_init_attrs(sb, &srinf->ssa);
atomic_set(&srinf->compact_delay_ms, DEF_COMPACT_DELAY_MS);
sbi->srch_info = srinf;
ret = scoutfs_sysfs_create_attrs(sb, &srinf->ssa, srch_attrs, "srch");
if (ret < 0)
goto out;
srinf->workq = alloc_workqueue("scoutfs_srch_compact",
WQ_NON_REENTRANT | WQ_UNBOUND |
WQ_HIGHPRI, 0);
@@ -2229,8 +2405,7 @@ int scoutfs_srch_setup(struct super_block *sb)
goto out;
}
queue_delayed_work(srinf->workq, &srinf->compact_dwork,
msecs_to_jiffies(SRCH_COMPACT_DELAY_MS));
queue_compact_work(srinf, false);
ret = 0;
out:


@@ -39,6 +39,9 @@ struct scoutfs_triggers {
static char *names[] = {
[SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE] = "block_remove_stale",
[SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE] = "srch_compact_logs_pad_safe",
[SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE] = "srch_force_log_rotate",
[SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe",
[SCOUTFS_TRIGGER_STATFS_LOCK_PURGE] = "statfs_lock_purge",
};


@@ -3,6 +3,9 @@
enum scoutfs_trigger {
SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE,
SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE,
SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE,
SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE,
SCOUTFS_TRIGGER_STATFS_LOCK_PURGE,
SCOUTFS_TRIGGER_NR,
};


@@ -25,8 +25,9 @@ All options can be seen by running with -h.
This script is built to test multi-node systems on one host by using
different mounts of the same devices. The script creates a fake block
device in front of each fs block device for each mount that will be
tested. Currently it will create free loop devices and will mount on
/mnt/test.[0-9].
tested. It will create predictable device mapper devices and mount
them on /mnt/test.N. These static device names and mount paths limit
the script to a single execution per host.
All tests will be run by default. Particular tests can be included or
excluded by providing test name regular expressions with the -I and -E
@@ -104,8 +105,8 @@ used during the test.
| Variable | Description | Origin | Example |
| ---------------- | ------------------- | --------------- | ----------------- |
| T\_MB[0-9] | per-mount meta bdev | created per run | /dev/loop0 |
| T\_DB[0-9] | per-mount data bdev | created per run | /dev/loop1 |
| T\_MB[0-9] | per-mount meta bdev | created per run | /dev/mapper/\_scoutfs\_test\_meta\_[0-9] |
| T\_DB[0-9] | per-mount data bdev | created per run | /dev/mapper/\_scoutfs\_test\_data\_[0-9] |
| T\_D[0-9] | per-mount test dir | made for test | /mnt/test.[0-9]/t |
| T\_META\_DEVICE | main FS meta bdev | -M | /dev/vda |
| T\_DATA\_DEVICE | main FS data bdev | -D | /dev/vdb |


@@ -6,6 +6,61 @@ t_filter_fs()
-e 's@Device: [a-fA-F0-9]*h/[0-9]*d@Device: 0h/0d@g'
}
#
# We can hit a spurious kasan warning that was fixed upstream:
#
# e504e74cc3a2 x86/unwind/orc: Disable KASAN checking in the ORC unwinder, part 2
#
# KASAN can get mad when the unwinder doesn't find ORC metadata and
# wanders up without using frames and hits the KASAN stack red zones.
# We can ignore these messages.
#
# They're bracketed by:
# [ 2687.690127] ==================================================================
# [ 2687.691366] BUG: KASAN: stack-out-of-bounds in get_reg+0x1bc/0x230
# ...
# [ 2687.706220] ==================================================================
# [ 2687.707284] Disabling lock debugging due to kernel taint
#
# That final lock debugging message may not be included.
#
ignore_harmless_unwind_kasan_stack_oob()
{
awk '
BEGIN {
in_soob = 0
soob_nr = 0
}
( !in_soob && $0 ~ /==================================================================/ ) {
in_soob = 1
soob_nr = NR
saved = $0
}
( in_soob == 1 && NR == (soob_nr + 1) ) {
if (match($0, /KASAN: stack-out-of-bounds in get_reg/) != 0) {
in_soob = 2
} else {
in_soob = 0
print saved
}
saved=""
}
( in_soob == 2 && $0 ~ /==================================================================/ ) {
in_soob = 3
soob_nr = NR
}
( in_soob == 3 && NR > soob_nr && $0 !~ /Disabling lock debugging/ ) {
in_soob = 0
}
( !in_soob ) { print $0 }
END {
if (saved) {
print saved
}
}
'
}
#
# Filter out expected messages. Putting messages here implies that
# tests aren't relying on messages to discover failures.. they're
@@ -86,10 +141,12 @@ t_filter_dmesg()
re="$re|scoutfs .* critical transaction commit failure.*"
# change-devices causes loop device resizing
re="$re|loop: module loaded"
re="$re|loop[0-9].* detected capacity change from.*"
# ignore systemd-journal rotating
re="$re|systemd-journald.*"
egrep -v "($re)"
egrep -v "($re)" | \
ignore_harmless_unwind_kasan_stack_oob
}


@@ -265,6 +265,15 @@ t_trigger_get() {
cat "$(t_trigger_path "$nr")/$which"
}
t_trigger_set() {
local which="$1"
local nr="$2"
local val="$3"
local path=$(t_trigger_path "$nr")
echo "$val" > "$path/$which"
}
t_trigger_show() {
local which="$1"
local string="$2"
@@ -276,9 +285,8 @@ t_trigger_show() {
t_trigger_arm_silent() {
local which="$1"
local nr="$2"
local path=$(t_trigger_path "$nr")
echo 1 > "$path/$which"
t_trigger_set "$which" "$nr" 1
}
t_trigger_arm() {


@@ -1,3 +1,4 @@
== measure initial createmany
== measure initial createmany
== measure two concurrent createmany runs
== cleanup


@@ -1,3 +1,4 @@
== setting longer hung task timeout
== creating fragmented extents
== unlink file with moved extents to free extents per block
== cleanup


@@ -0,0 +1,37 @@
== initialize per-mount values
== arm compaction triggers
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_merge_stop_safe armed: 1
== compact more often
== create padded sorted inputs by forcing log rotation
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_force_log_rotate armed: 1
trigger srch_compact_logs_pad_safe armed: 1
== compaction of padded should stop at safe
== verify no compaction errors
== cleanup


@@ -326,16 +326,10 @@ unmount_all() {
cmd wait $p
done
# delete all temp meta devices
for dev in $(losetup --associated "$T_META_DEVICE" | cut -d : -f 1); do
if [ -e "$dev" ]; then
cmd losetup -d "$dev"
fi
done
# delete all temp data devices
for dev in $(losetup --associated "$T_DATA_DEVICE" | cut -d : -f 1); do
if [ -e "$dev" ]; then
cmd losetup -d "$dev"
# delete all temp devices
for dev in /dev/mapper/_scoutfs_test_*; do
if [ -b "$dev" ]; then
cmd dmsetup remove $dev
fi
done
}
@@ -434,6 +428,12 @@ $T_UTILS/fenced/scoutfs-fenced > "$T_FENCED_LOG" 2>&1 &
fenced_pid=$!
fenced_log "started fenced pid $fenced_pid in the background"
# setup dm tables
echo "0 $(blockdev --getsz $T_META_DEVICE) linear $T_META_DEVICE 0" > \
$T_RESULTS/dmtable.meta
echo "0 $(blockdev --getsz $T_DATA_DEVICE) linear $T_DATA_DEVICE 0" > \
$T_RESULTS/dmtable.data
#
# mount concurrently so that a quorum is present to elect the leader and
# start a server.
@@ -442,10 +442,13 @@ msg "mounting $T_NR_MOUNTS mounts on meta $T_META_DEVICE data $T_DATA_DEVICE"
pids=""
for i in $(seq 0 $((T_NR_MOUNTS - 1))); do
meta_dev=$(losetup --find --show $T_META_DEVICE)
test -b "$meta_dev" || die "failed to create temp device $meta_dev"
data_dev=$(losetup --find --show $T_DATA_DEVICE)
test -b "$data_dev" || die "failed to create temp device $data_dev"
name="_scoutfs_test_meta_$i"
cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.meta)"
meta_dev="/dev/mapper/$name"
name="_scoutfs_test_data_$i"
cmd dmsetup create "$name" --table "$(cat $T_RESULTS/dmtable.data)"
data_dev="/dev/mapper/$name"
dir="/mnt/test.$i"
test -d "$dir" || cmd mkdir -p "$dir"


@@ -14,6 +14,7 @@ offline-extent-waiting.sh
move-blocks.sh
large-fragmented-free.sh
enospc.sh
srch-safe-merge-pos.sh
srch-basic-functionality.sh
simple-xattr-unit.sh
totl-xattr-tag.sh


@@ -1,6 +1,7 @@
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
#include <sys/stat.h>
@@ -35,10 +36,10 @@ struct opts {
unsigned int dry_run:1,
ls_output:1,
quiet:1,
user_xattr:1,
same_srch_xattr:1,
group_srch_xattr:1,
unique_srch_xattr:1;
xattr_set:1,
xattr_file:1,
xattr_group:1;
char *xattr_name;
};
struct stats {
@@ -149,12 +150,31 @@ static void free_dir(struct dir *dir)
free(dir);
}
static size_t snprintf_off(void *buf, size_t sz, size_t off, char *fmt, ...)
{
va_list ap;
int ret;
if (off >= sz)
return sz;
va_start(ap, fmt);
ret = vsnprintf(buf + off, sz - off, fmt, ap);
va_end(ap);
if (ret <= 0)
return sz;
return off + ret;
}
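Because snprintf_off() saturates at the buffer size, callers can chain several appends and perform one overflow check at the end. A small fragment showing that pattern (it relies on snprintf_off() above; the xattr prefix and file number are made up for illustration):

static void example_name(unsigned long file_nr)
{
	char name[256];
	size_t off;

	off = snprintf_off(name, sizeof(name), 0, "%s", "scoutfs.srch.example");
	off = snprintf_off(name, sizeof(name), off, "-f-%lu", file_nr);
	off = snprintf_off(name, sizeof(name), off, "-g-%lu", file_nr / 10000);

	/* one test covers every append above because off sticks at sizeof(name) */
	if (off >= sizeof(name))
		fprintf(stderr, "xattr name longer than %zu bytes\n", sizeof(name) - 1);
}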
static void create_dir(struct dir *dir, struct opts *opts,
struct stats *stats)
{
struct str_list *s;
char name[100];
char name[256]; /* max len and null term */
char val = 'v';
size_t off;
int rc;
int i;
@@ -175,29 +195,21 @@ static void create_dir(struct dir *dir, struct opts *opts,
rc = mknod(s->str, S_IFREG | 0644, 0);
error_exit(rc, "mknod %s failed"ERRF, s->str, ERRA);
rc = 0;
if (rc == 0 && opts->user_xattr) {
strcpy(name, "user.scoutfs_bcp");
rc = setxattr(s->str, name, &val, 1, 0);
}
if (rc == 0 && opts->same_srch_xattr) {
strcpy(name, "scoutfs.srch.scoutfs_bcp");
rc = setxattr(s->str, name, &val, 1, 0);
}
if (rc == 0 && opts->group_srch_xattr) {
snprintf(name, sizeof(name),
"scoutfs.srch.scoutfs_bcp.group.%lu",
stats->files / 10000);
rc = setxattr(s->str, name, &val, 1, 0);
}
if (rc == 0 && opts->unique_srch_xattr) {
snprintf(name, sizeof(name),
"scoutfs.srch.scoutfs_bcp.unique.%lu",
stats->files);
if (opts->xattr_set) {
off = snprintf_off(name, sizeof(name), 0, "%s", opts->xattr_name);
if (opts->xattr_file)
off = snprintf_off(name, sizeof(name), off,
"-f-%lu", stats->files);
if (opts->xattr_group)
off = snprintf_off(name, sizeof(name), off,
"-g-%lu", stats->files / 10000);
error_exit(off >= sizeof(name), "xattr name longer than 255 bytes");
rc = setxattr(s->str, name, &val, 1, 0);
error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
}
error_exit(rc, "setxattr %s %s failed"ERRF, s->str, name, ERRA);
stats->files++;
rate_banner(opts, stats);
@@ -365,11 +377,10 @@ static void usage(void)
" -d DIR | create all files in DIR top level directory\n"
" -n | dry run, only parse, don't create any files\n"
" -q | quiet, don't regularly print rates\n"
" -F | append \"-f-NR\" file nr to xattr name, requires -X\n"
" -G | append \"-g-NR\" file nr/10000 to xattr name, requires -X\n"
" -L | parse ls output; only reg, skip meta, paths at ./\n"
" -X | set the same user. xattr name in all files\n"
" -S | set the same .srch. xattr name in all files\n"
" -G | set a .srch. xattr name shared by groups of files\n"
" -U | set a unique .srch. xattr name in all files\n");
" -X NAM | set named xattr in all files\n");
}
int main(int argc, char **argv)
@@ -386,7 +397,7 @@ int main(int argc, char **argv)
memset(&opts, 0, sizeof(opts));
while ((c = getopt(argc, argv, "d:nqLXSGU")) != -1) {
while ((c = getopt(argc, argv, "d:nqFGLX:")) != -1) {
switch(c) {
case 'd':
top_dir = strdup(optarg);
@@ -397,20 +408,19 @@ int main(int argc, char **argv)
case 'q':
opts.quiet = 1;
break;
case 'F':
opts.xattr_file = 1;
break;
case 'G':
opts.xattr_group = 1;
break;
case 'L':
opts.ls_output = 1;
break;
case 'X':
opts.user_xattr = 1;
break;
case 'S':
opts.same_srch_xattr = 1;
break;
case 'G':
opts.group_srch_xattr = 1;
break;
case 'U':
opts.unique_srch_xattr = 1;
opts.xattr_set = 1;
opts.xattr_name = strdup(optarg);
error_exit(!opts.xattr_name, "error allocating xattr name");
break;
case '?':
printf("Unknown option '%c'\n", optopt);
@@ -419,6 +429,11 @@ int main(int argc, char **argv)
}
}
error_exit(opts.xattr_file && !opts.xattr_set,
"must specify xattr -X when appending file nr with -F");
error_exit(opts.xattr_group && !opts.xattr_set,
"must specify xattr -X when appending file nr with -G");
if (!opts.dry_run) {
error_exit(!top_dir,
"must specify top level directory with -d");


@@ -7,9 +7,11 @@ t_require_mounts 2
COUNT=50000
# Prep dirs for test. Each mount needs to make their own parent dir for
# the createmany run, otherwise both dirs will end up in the same inode
# group, causing updates to bounce that lock around.
#
# Prep dirs for test. We have per-directory inode number allocators so
# by putting each createmany in a per-mount dir they get their own inode
# number region and cluster locks.
#
echo "== measure initial createmany"
mkdir -p $T_D0/dir/0
mkdir $T_D1/dir/1
@@ -17,18 +19,20 @@ mkdir $T_D1/dir/1
echo "== measure initial createmany"
START=$SECONDS
createmany -o "$T_D0/file_" $COUNT >> $T_TMP.full
sync
SINGLE=$((SECONDS - START))
echo single $SINGLE >> $T_TMP.full
echo "== measure two concurrent createmany runs"
START=$SECONDS
createmany -o $T_D0/dir/0/file $COUNT > /dev/null &
(cd $T_D0/dir/0; createmany -o ./file_ $COUNT > /dev/null) &
pids="$!"
createmany -o $T_D1/dir/1/file $COUNT > /dev/null &
(cd $T_D1/dir/1; createmany -o ./file_ $COUNT > /dev/null) &
pids="$pids $!"
for p in $pids; do
wait $p
done
sync
BOTH=$((SECONDS - START))
echo both $BOTH >> $T_TMP.full
@@ -41,7 +45,10 @@ echo both $BOTH >> $T_TMP.full
# synchronized operation.
FACTOR=200
if [ "$BOTH" -gt $(($SINGLE*$FACTOR)) ]; then
echo "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
t_fail "both createmany took $BOTH sec, more than $FACTOR x single $SINGLE sec"
fi
echo "== cleanup"
find $T_D0/dir -delete
t_pass


@@ -10,6 +10,30 @@ EXTENTS_PER_BTREE_BLOCK=600
EXTENTS_PER_LIST_BLOCK=8192
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
#
# This test specifically creates a pathologically sparse file that will
# be as expensive as possible to free. This is usually fine on
# dedicated or reasonable hardware, but trying to run this in
# virtualized debug kernels can take a very long time. This test is
# about making sure that the server doesn't fail, not that the platform
# can handle the scale of work that our btree formats happen to require
# while execution is bogged down with use-after-free memory reference
# tracking. So we give the test a lot more breathing room before
# deciding that it's hung.
#
echo "== setting longer hung task timeout"
if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
test "$secs" -gt 0 || \
t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
restore_hung_task_timeout()
{
echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
}
trap restore_hung_task_timeout EXIT
echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
fi
echo "== creating fragmented extents"
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"


@@ -9,6 +9,7 @@ LOG=340000
LIM=1000000
SEQF="%.20g"
SXA="scoutfs.srch.test-srch-basic-functionality"
t_require_commands touch rm setfattr scoutfs find_xattrs
@@ -27,20 +28,20 @@ diff_srch_find()
echo "== create new xattrs"
touch "$T_D0/"{create,update}
setfattr -n scoutfs.srch.test -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
diff_srch_find scoutfs.srch.test
setfattr -n $SXA -v 1 "$T_D0/"{create,update} 2>&1 | t_filter_fs
diff_srch_find $SXA
echo "== update existing xattr"
setfattr -n scoutfs.srch.test -v 2 "$T_D0/update" 2>&1 | t_filter_fs
diff_srch_find scoutfs.srch.test
setfattr -n $SXA -v 2 "$T_D0/update" 2>&1 | t_filter_fs
diff_srch_find $SXA
echo "== remove an xattr"
setfattr -x scoutfs.srch.test "$T_D0/create" 2>&1 | t_filter_fs
diff_srch_find scoutfs.srch.test
setfattr -x $SXA "$T_D0/create" 2>&1 | t_filter_fs
diff_srch_find $SXA
echo "== remove xattr with files"
rm -f "$T_D0/"{create,update}
diff_srch_find scoutfs.srch.test
diff_srch_find $SXA
echo "== trigger small log merges by rotating single block with unmount"
sv=$(t_server_nr)
@@ -56,7 +57,7 @@ while [ "$i" -lt "8" ]; do
eval path="\$T_D${nr}/single-block-$i"
touch "$path"
setfattr -n scoutfs.srch.single-block-logs -v $i "$path"
setfattr -n $SXA -v $i "$path"
t_umount $nr
t_mount $nr
@@ -65,51 +66,51 @@ while [ "$i" -lt "8" ]; do
done
# wait for srch compaction worker delay
sleep 10
rm -rf "$T_D0/single-block-*"
find "$T_D0" -type f -name 'single-block-*' -delete
echo "== create entries in current log"
DIR="$T_D0/dir"
NR=$((LOG / 4))
mkdir -p "$DIR"
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
diff_srch_find scoutfs.srch.scoutfs_bcp
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
diff_srch_find $SXA
echo "== delete small fraction"
seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
diff_srch_find scoutfs.srch.scoutfs_bcp
seq -f "$DIR/f-$SEQF" 1 7 $NR | xargs setfattr -x $SXA
diff_srch_find $SXA
echo "== remove files"
rm -rf "$DIR"
diff_srch_find scoutfs.srch.scoutfs_bcp
diff_srch_find $SXA
echo "== create entries that exceed one log"
NR=$((LOG * 3 / 2))
mkdir -p "$DIR"
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
diff_srch_find scoutfs.srch.scoutfs_bcp
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
diff_srch_find $SXA
echo "== delete fractions in phases"
for i in $(seq 1 3); do
seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
diff_srch_find scoutfs.srch.scoutfs_bcp
seq -f "$DIR/f-$SEQF" $i 3 $NR | xargs setfattr -x $SXA
diff_srch_find $SXA
done
echo "== remove files"
rm -rf "$DIR"
diff_srch_find scoutfs.srch.scoutfs_bcp
diff_srch_find $SXA
echo "== create entries for exceed search entry limit"
NR=$((LIM * 3 / 2))
mkdir -p "$DIR"
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -S -d "$DIR" > /dev/null
diff_srch_find scoutfs.srch.scoutfs_bcp
seq -f "f-$SEQF" 1 $NR | src/bulk_create_paths -X $SXA -d "$DIR" > /dev/null
diff_srch_find $SXA
echo "== delete half"
seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x scoutfs.srch.scoutfs_bcp
diff_srch_find scoutfs.srch.scoutfs_bcp
seq -f "$DIR/f-$SEQF" 1 2 $NR | xargs setfattr -x $SXA
diff_srch_find $SXA
echo "== entirely remove third batch"
rm -rf "$DIR"
diff_srch_find scoutfs.srch.scoutfs_bcp
diff_srch_find $SXA
t_pass


@@ -0,0 +1,90 @@
#
# There was a bug where srch file compaction could get stuck if a
# partial compaction finished at the specific _SAFE_BYTES offset in a
# block. Resuming from that position would return an error and
# compaction would stop making forward progress.
#
# We use triggers to pad the output of log compaction to end on the safe
# offset and then cause compaction of those padded inputs to stop at the
# safe offset. Continuation will either succeed or return errors.
#
# forcing rotation, so just a few
NR=10
SEQF="%.20g"
COMPACT_NR=4
echo "== initialize per-mount values"
declare -a err
declare -a compact_delay
for nr in $(t_fs_nrs); do
err[$nr]=$(t_counter srch_compact_error $nr)
compact_delay[$nr]=$(cat $(t_sysfs_path $nr)/srch/compact_delay_ms)
done
restore_compact_delay()
{
for nr in $(t_fs_nrs); do
echo ${compact_delay[$nr]} > $(t_sysfs_path $nr)/srch/compact_delay_ms
done
}
trap restore_compact_delay EXIT
echo "== arm compaction triggers"
for nr in $(t_fs_nrs); do
t_trigger_arm srch_compact_logs_pad_safe $nr
t_trigger_arm srch_merge_stop_safe $nr
done
echo "== compact more often"
for nr in $(t_fs_nrs); do
echo 1000 > $(t_sysfs_path $nr)/srch/compact_delay_ms
done
echo "== create padded sorted inputs by forcing log rotation"
sv=$(t_server_nr)
for i in $(seq 1 $COMPACT_NR); do
for j in $(seq 1 $COMPACT_NR); do
t_trigger_arm srch_force_log_rotate $sv
seq -f "f-$i-$j-$SEQF" 1 10 | \
bulk_create_paths -X "scoutfs.srch.t-srch-safe-merge-pos" -d "$T_D0" > \
/dev/null
sync
test "$(t_trigger_get srch_force_log_rotate $sv)" == "0" || \
t_fail "srch_force_log_rotate didn't trigger"
done
padded=0
while test $padded == 0 && sleep .5; do
for nr in $(t_fs_nrs); do
if [ "$(t_trigger_get srch_compact_logs_pad_safe $nr)" == "0" ]; then
t_trigger_arm srch_compact_logs_pad_safe $nr
padded=1
break
fi
test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
t_fail "srch_compact_error counter increased on mount $nr"
done
done
done
echo "== compaction of padded should stop at safe"
sleep 2
for nr in $(t_fs_nrs); do
if [ "$(t_trigger_get srch_merge_stop_safe $nr)" == "0" ]; then
break
fi
done
echo "== verify no compaction errors"
sleep 2
for nr in $(t_fs_nrs); do
test "$(t_counter srch_compact_error $nr)" == "${err[$nr]}" || \
t_fail "srch_compact_error counter increased on mount $nr"
done
echo "== cleanup"
find "$T_D0" -type f -name 'f-*' -delete
t_pass


@@ -7,7 +7,7 @@ FMTIOC_H := format.h ioctl.h
FMTIOC_KMOD := $(addprefix ../kmod/src/,$(FMTIOC_H))
CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -g -msse4.2 \
-fno-strict-aliasing \
-I src/ -fno-strict-aliasing \
-DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU
ifneq ($(wildcard $(firstword $(FMTIOC_KMOD))),)
@@ -15,8 +15,9 @@ CFLAGS += -I../kmod/src
endif
BIN := src/scoutfs
OBJ := $(patsubst %.c,%.o,$(wildcard src/*.c))
DEPS := $(wildcard */*.d)
OBJ_DIRS := src src/check
OBJ := $(foreach dir,$(OBJ_DIRS),$(patsubst %.c,%.o,$(wildcard $(dir)/*.c)))
DEPS := $(foreach dir,$(OBJ_DIRS),$(wildcard $(dir)/*.d))
all: $(BIN)


@@ -55,6 +55,19 @@ with initial sparse regions (perhaps by multiple threads writing to
different regions) and wasted space isn't an issue (perhaps because the
file population contains few small files).
.TP
.B log_merge_wait_timeout_ms=<number>
This option sets the amount of time, in milliseconds, that log merge
creation can wait before timing out. This setting is per-mount, only
changes the behavior of that mount, and only affects the server when it
is running in that mount.
.sp
This determines how long it may take for mounts to synchronize
committing their log trees to create a log merge operation. Setting it
too high can create long latencies in the event that a mount takes a
long time to commit its log. Setting it too low can result in the
creation of excessive numbers of log trees that are never merged. The
default is 500 and it cannot be less than 100 or greater than 60000.
.TP
.B metadev_path=<device>
The metadev_path option specifies the path to the block device that
contains the filesystem's metadata.


@@ -10,6 +10,11 @@
* Just a quick simple native bitmap.
*/
int test_bit(unsigned long *bits, u64 nr)
{
return !!(bits[nr / BITS_PER_LONG] & (1UL << (nr & (BITS_PER_LONG - 1))));
}
void set_bit(unsigned long *bits, u64 nr)
{
bits[nr / BITS_PER_LONG] |= 1UL << (nr & (BITS_PER_LONG - 1));


@@ -1,6 +1,7 @@
#ifndef _BITMAP_H_
#define _BITMAP_H_
int test_bit(unsigned long *bits, u64 nr);
void set_bit(unsigned long *bits, u64 nr);
void clear_bit(unsigned long *bits, u64 nr);
u64 find_next_set_bit(unsigned long *start, u64 from, u64 total);

159
utils/src/check/alloc.c Normal file

@@ -0,0 +1,159 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "bitmap.h"
#include "key.h"
#include "alloc.h"
#include "block.h"
#include "btree.h"
#include "extent.h"
#include "iter.h"
#include "sns.h"
/*
* We check the list blocks serially.
*
* XXX:
* - compare ref seqs
* - detect cycles?
*/
int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
{
struct scoutfs_alloc_list_block *lblk;
struct scoutfs_block_ref ref;
struct block *blk = NULL;
u64 blkno;
int ret;
ref = lhead->ref;
while (ref.blkno) {
blkno = le64_to_cpu(ref.blkno);
ret = cb(blkno, 1, cb_arg);
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
ret = block_get(&blk, blkno, 0);
if (ret < 0)
goto out;
lblk = block_buf(blk);
/* XXX verify block */
/* XXX sort? maybe */
ref = lblk->next;
block_put(&blk);
}
ret = 0;
out:
return ret;
}
int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
{
return btree_meta_iter(&root->root, cb, cb_arg);
}
int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
{
struct scoutfs_alloc_list_block *lblk;
struct scoutfs_block_ref ref;
struct block *blk = NULL;
u64 blkno;
int ret;
int i;
ref = lhead->ref;
while (ref.blkno) {
blkno = le64_to_cpu(ref.blkno);
ret = block_get(&blk, blkno, 0);
if (ret < 0)
goto out;
sns_push("alloc_list_block", blkno, 0);
lblk = block_buf(blk);
/* XXX verify block */
/* XXX sort? maybe */
ret = 0;
for (i = 0; i < le32_to_cpu(lblk->nr); i++) {
blkno = le64_to_cpu(lblk->blknos[le32_to_cpu(lblk->start) + i]);
ret = cb(blkno, 1, cb_arg);
if (ret < 0)
break;
}
ref = lblk->next;
block_put(&blk);
sns_pop();
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
}
ret = 0;
out:
return ret;
}
static bool valid_free_extent_key(struct scoutfs_key *key)
{
return (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE ||
key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) &&
(!key->_sk_fourth && !key->sk_type &&
(key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE || !key->_sk_third));
}
static int free_item_cb(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
{
struct extent_cb_arg_t *ecba = cb_arg;
u64 start;
u64 len;
/* XXX not sure these eios are what we want */
if (val_len != 0)
return -EIO;
if (!valid_free_extent_key(key))
return -EIO;
if (key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE)
return -ECHECK_ITER_DONE;
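/*
* skfb_end is the final blkno covered by the extent: for example,
* end 109 with len 10 decodes to a start of blkno 100.
*/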
start = le64_to_cpu(key->skfb_end) - le64_to_cpu(key->skfb_len) + 1;
len = le64_to_cpu(key->skfb_len);
return ecba->cb(start, len, ecba->cb_arg);
}
/*
* Call the callback with each of the primary BLKNO free extents stored
* in items in the given alloc root. It doesn't visit the secondary
* ORDER extents.
*/
int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
{
struct extent_cb_arg_t ecba = { .cb = cb, .cb_arg = cb_arg };
return btree_item_iter(&root->root, free_item_cb, &ecba);
}

12
utils/src/check/alloc.h Normal file

@@ -0,0 +1,12 @@
#ifndef _SCOUTFS_UTILS_CHECK_ALLOC_H
#define _SCOUTFS_UTILS_CHECK_ALLOC_H
#include "extent.h"
int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
#endif

564
utils/src/check/block.c Normal file

@@ -0,0 +1,564 @@
#define _ISOC11_SOURCE /* aligned_alloc */
#define _DEFAULT_SOURCE /* syscall() */
#include <stdlib.h>
#include <unistd.h>
#include <stdbool.h>
#include <stdio.h>
#include <errno.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "list.h"
#include "cmp.h"
#include "hash.h"
#include "block.h"
#include "debug.h"
#include "eno.h"
static struct block_data {
struct list_head *hash_lists;
size_t hash_nr;
struct list_head active_head;
struct list_head inactive_head;
struct list_head dirty_list;
size_t nr_active;
size_t nr_inactive;
size_t nr_dirty;
int meta_fd;
size_t max_cached;
size_t nr_events;
aio_context_t ctx;
struct iocb *iocbs;
struct iocb **iocbps;
struct io_event *events;
} global_bdat;
struct block {
struct list_head hash_head;
struct list_head lru_head;
struct list_head dirty_head;
struct list_head submit_head;
unsigned long refcount;
unsigned long uptodate:1,
active:1;
u64 blkno;
void *buf;
size_t size;
};
#define BLK_FMT \
"blkno %llu rc %ld d %u a %u"
#define BLK_ARG(blk) \
(blk)->blkno, (blk)->refcount, !list_empty(&(blk)->dirty_head), blk->active
#define debug_blk(blk, fmt, args...) \
debug(fmt " " BLK_FMT, ##args, BLK_ARG(blk))
/*
* This just allocates and initializes the block. The caller is
* responsible for putting it on the appropriate initial lists and
* managing refcounts.
*/
static struct block *alloc_block(struct block_data *bdat, u64 blkno, size_t size)
{
struct block *blk;
blk = calloc(1, sizeof(struct block));
if (blk) {
blk->buf = aligned_alloc(4096, size); /* XXX static alignment :/ */
if (!blk->buf) {
free(blk);
blk = NULL;
} else {
INIT_LIST_HEAD(&blk->hash_head);
INIT_LIST_HEAD(&blk->lru_head);
INIT_LIST_HEAD(&blk->dirty_head);
INIT_LIST_HEAD(&blk->submit_head);
blk->blkno = blkno;
blk->size = size;
}
}
return blk;
}
static void free_block(struct block_data *bdat, struct block *blk)
{
debug_blk(blk, "free");
if (!list_empty(&blk->lru_head)) {
if (blk->active)
bdat->nr_active--;
else
bdat->nr_inactive--;
list_del(&blk->lru_head);
}
if (!list_empty(&blk->dirty_head)) {
bdat->nr_dirty--;
list_del(&blk->dirty_head);
}
if (!list_empty(&blk->hash_head))
list_del(&blk->hash_head);
if (!list_empty(&blk->submit_head))
list_del(&blk->submit_head);
free(blk->buf);
free(blk);
}
static bool blk_is_dirty(struct block *blk)
{
return !list_empty(&blk->dirty_head);
}
/*
* Rebalance the cache.
*
* First we shrink the cache to limit it to max_cached blocks.
* Logically, we walk from oldest to newest in the inactive list and
* then in the active list. Since these lists are physically one
* list_head list we achieve this with a reverse walk starting from the
* active head.
*
* Then we rebalance the size of the two lists. The constraint is that
* we don't let the active list grow larger than the inactive list. We
* move blocks from the oldest tail of the active list to the newest
* head of the inactive list.
*
* <- [active head] <-> [ .. active list .. ] <-> [inactive head] <-> [ .. inactive list .. ] ->
*/
static void rebalance_cache(struct block_data *bdat)
{
struct block *blk;
struct block *blk_;
list_for_each_entry_safe_reverse(blk, blk_, &bdat->active_head, lru_head) {
if ((bdat->nr_active + bdat->nr_inactive) < bdat->max_cached)
break;
if (&blk->lru_head == &bdat->inactive_head || blk->refcount > 0 ||
blk_is_dirty(blk))
continue;
free_block(bdat, blk);
}
list_for_each_entry_safe_reverse(blk, blk_, &bdat->inactive_head, lru_head) {
if (bdat->nr_active <= bdat->nr_inactive || &blk->lru_head == &bdat->active_head)
break;
list_move(&blk->lru_head, &bdat->inactive_head);
blk->active = 0;
bdat->nr_active--;
bdat->nr_inactive++;
}
}
static void make_active(struct block_data *bdat, struct block *blk)
{
if (!blk->active) {
if (!list_empty(&blk->lru_head)) {
list_move(&blk->lru_head, &bdat->active_head);
bdat->nr_inactive--;
} else {
list_add(&blk->lru_head, &bdat->active_head);
}
blk->active = 1;
bdat->nr_active++;
}
}
static int compar_iocbp(const void *A, const void *B)
{
struct iocb *a = *(struct iocb **)A;
struct iocb *b = *(struct iocb **)B;
return scoutfs_cmp(a->aio_offset, b->aio_offset);
}
static int submit_and_wait(struct block_data *bdat, struct list_head *list)
{
struct io_event *event;
struct iocb *iocb;
struct block *blk;
int ret;
int err;
int nr;
int i;
err = 0;
nr = 0;
list_for_each_entry(blk, list, submit_head) {
iocb = &bdat->iocbs[nr];
bdat->iocbps[nr] = iocb;
memset(iocb, 0, sizeof(struct iocb));
iocb->aio_data = (intptr_t)blk;
iocb->aio_lio_opcode = blk_is_dirty(blk) ? IOCB_CMD_PWRITE : IOCB_CMD_PREAD;
iocb->aio_fildes = bdat->meta_fd;
iocb->aio_buf = (intptr_t)blk->buf;
iocb->aio_nbytes = blk->size;
iocb->aio_offset = blk->blkno * blk->size;
nr++;
debug_blk(blk, "submit");
if ((nr < bdat->nr_events) && blk->submit_head.next != list)
continue;
qsort(bdat->iocbps, nr, sizeof(bdat->iocbps[0]), compar_iocbp);
ret = syscall(__NR_io_submit, bdat->ctx, nr, bdat->iocbps);
if (ret != nr) {
if (ret >= 0)
errno = EIO;
ret = -errno;
printf("fatal system error submitting async IO: "ENO_FMT"\n",
ENO_ARG(-ret));
goto out;
}
ret = syscall(__NR_io_getevents, bdat->ctx, nr, nr, bdat->events, NULL);
if (ret != nr) {
if (ret >= 0)
errno = EIO;
ret = -errno;
printf("fatal system error getting IO events: "ENO_FMT"\n",
ENO_ARG(-ret));
goto out;
}
ret = 0;
for (i = 0; i < nr; i++) {
event = &bdat->events[i];
iocb = (struct iocb *)(intptr_t)event->obj;
blk = (struct block *)(intptr_t)event->data;
debug_blk(blk, "complete res %lld", (long long)event->res);
if (event->res >= 0 && event->res != blk->size)
event->res = -EIO;
/* io errors are fatal */
if (event->res < 0) {
ret = event->res;
goto out;
}
if (iocb->aio_lio_opcode == IOCB_CMD_PREAD) {
blk->uptodate = 1;
} else {
list_del_init(&blk->dirty_head);
bdat->nr_dirty--;
}
}
nr = 0;
}
ret = 0;
out:
return ret ?: err;
}
static void inc_refcount(struct block *blk)
{
blk->refcount++;
}
void block_put(struct block **blkp)
{
struct block_data *bdat = &global_bdat;
struct block *blk = *blkp;
if (blk) {
blk->refcount--;
*blkp = NULL;
rebalance_cache(bdat);
}
}
static struct list_head *hash_bucket(struct block_data *bdat, u64 blkno)
{
u32 hash = scoutfs_hash32(&blkno, sizeof(blkno));
return &bdat->hash_lists[hash % bdat->hash_nr];
}
static struct block *get_or_alloc(struct block_data *bdat, u64 blkno, int bf)
{
struct list_head *bucket = hash_bucket(bdat, blkno);
struct block *search;
struct block *blk;
size_t size;
size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE;
blk = NULL;
list_for_each_entry(search, bucket, hash_head) {
if (search->blkno == blkno && search->size == size) {
blk = search;
break;
}
}
if (!blk) {
blk = alloc_block(bdat, blkno, size);
if (blk) {
list_add(&blk->hash_head, bucket);
list_add(&blk->lru_head, &bdat->inactive_head);
bdat->nr_inactive++;
}
}
if (blk)
inc_refcount(blk);
return blk;
}
/*
* Get a block.
*
* The caller holds a refcount to the block while it's in use that
* prevents it from being removed from the cache. It must be dropped
* with block_put();
*/
int block_get(struct block **blk_ret, u64 blkno, int bf)
{
struct block_data *bdat = &global_bdat;
struct block *blk;
LIST_HEAD(list);
int ret;
blk = get_or_alloc(bdat, blkno, bf);
if (!blk) {
ret = -ENOMEM;
goto out;
}
if ((bf & BF_ZERO)) {
memset(blk->buf, 0, blk->size);
blk->uptodate = 1;
}
if (bf & BF_OVERWRITE)
blk->uptodate = 1;
if (!blk->uptodate) {
list_add(&blk->submit_head, &list);
ret = submit_and_wait(bdat, &list);
list_del_init(&blk->submit_head);
if (ret < 0)
goto out;
}
if ((bf & BF_DIRTY) && !blk_is_dirty(blk)) {
list_add_tail(&blk->dirty_head, &bdat->dirty_list);
bdat->nr_dirty++;
}
make_active(bdat, blk);
rebalance_cache(bdat);
ret = 0;
out:
if (ret < 0)
block_put(&blk);
*blk_ret = blk;
return ret;
}
void *block_buf(struct block *blk)
{
return blk->buf;
}
size_t block_size(struct block *blk)
{
return blk->size;
}
/*
* Drop the block from the cache whether it is dirty or not.
* This is used to avoid writing blocks which were dirtied but then
* later freed.
*
* The block is immediately freed and can't be referenced after this
* returns.
*/
void block_drop(struct block **blkp)
{
struct block_data *bdat = &global_bdat;
free_block(bdat, *blkp);
*blkp = NULL;
rebalance_cache(bdat);
}
/*
* This doesn't quite work for mixing large and small blocks, but that's
* fine, we never do that.
*/
static int compar_u64(const void *A, const void *B)
{
u64 a = *((u64 *)A);
u64 b = *((u64 *)B);
return scoutfs_cmp(a, b);
}
/*
* This read-ahead is synchronous and errors are ignored. If any of the
* blknos aren't present in the cache then we issue concurrent reads for
* them and wait. Any existing cached blocks will be left as is.
*
* We might be trying to read a lot more blocks than the number of events so we
* sort the caller's blknos before iterating over them rather than
* relying on submission sorting the blocks in each submitted set.
*/
void block_readahead(u64 *blknos, size_t nr)
{
struct block_data *bdat = &global_bdat;
struct block *blk;
struct block *blk_;
LIST_HEAD(list);
size_t i;
if (nr == 0)
return;
qsort(blknos, nr, sizeof(blknos[0]), compar_u64);
for (i = 0; i < nr; i++) {
blk = get_or_alloc(bdat, blknos[i], 0);
if (blk) {
if (!blk->uptodate)
list_add_tail(&blk->submit_head, &list);
else
block_put(&blk);
}
}
(void)submit_and_wait(bdat, &list);
list_for_each_entry_safe(blk, blk_, &list, submit_head) {
list_del_init(&blk->submit_head);
block_put(&blk);
}
rebalance_cache(bdat);
}
/*
* The caller's block changes form a consistent transaction. If the number of dirty
* blocks is large enough we issue a write.
*/
int block_try_commit(bool force)
{
struct block_data *bdat = &global_bdat;
struct block *blk;
struct block *blk_;
LIST_HEAD(list);
int ret;
if (!force && bdat->nr_dirty < bdat->nr_events)
return 0;
list_for_each_entry(blk, &bdat->dirty_list, dirty_head) {
list_add_tail(&blk->submit_head, &list);
inc_refcount(blk);
}
ret = submit_and_wait(bdat, &list);
list_for_each_entry_safe(blk, blk_, &list, submit_head) {
list_del_init(&blk->submit_head);
block_put(&blk);
}
if (ret < 0) {
printf("error writing dirty transaction blocks\n");
goto out;
}
ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_OVERWRITE | BF_DIRTY);
if (ret == 0) {
list_add(&blk->submit_head, &list);
ret = submit_and_wait(bdat, &list);
list_del_init(&blk->submit_head);
block_put(&blk);
} else {
ret = -ENOMEM;
}
if (ret < 0)
printf("error writing super block to commit transaction\n");
out:
rebalance_cache(bdat);
return ret;
}
int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes)
{
struct block_data *bdat = &global_bdat;
size_t i;
int ret;
bdat->max_cached = DIV_ROUND_UP(max_cached_bytes, SCOUTFS_BLOCK_LG_SIZE);
bdat->hash_nr = bdat->max_cached / 4;
bdat->nr_events = DIV_ROUND_UP(max_dirty_bytes, SCOUTFS_BLOCK_LG_SIZE);
bdat->iocbs = calloc(bdat->nr_events, sizeof(bdat->iocbs[0]));
bdat->iocbps = calloc(bdat->nr_events, sizeof(bdat->iocbps[0]));
bdat->events = calloc(bdat->nr_events, sizeof(bdat->events[0]));
bdat->hash_lists = calloc(bdat->hash_nr, sizeof(bdat->hash_lists[0]));
if (!bdat->iocbs || !bdat->iocbps || !bdat->events || !bdat->hash_lists) {
ret = -ENOMEM;
goto out;
}
INIT_LIST_HEAD(&bdat->active_head);
INIT_LIST_HEAD(&bdat->inactive_head);
INIT_LIST_HEAD(&bdat->dirty_list);
bdat->meta_fd = meta_fd;
list_add(&bdat->inactive_head, &bdat->active_head);
for (i = 0; i < bdat->hash_nr; i++)
INIT_LIST_HEAD(&bdat->hash_lists[i]);
ret = syscall(__NR_io_setup, bdat->nr_events, &bdat->ctx);
out:
if (ret < 0) {
free(bdat->iocbs);
free(bdat->iocbps);
free(bdat->events);
free(bdat->hash_lists);
}
return ret;
}
void block_shutdown(void)
{
struct block_data *bdat = &global_bdat;
syscall(SYS_io_destroy, bdat->ctx);
free(bdat->iocbs);
free(bdat->iocbps);
free(bdat->events);
free(bdat->hash_lists);
}

32
utils/src/check/block.h Normal file

@@ -0,0 +1,32 @@
#ifndef _SCOUTFS_UTILS_CHECK_BLOCK_H_
#define _SCOUTFS_UTILS_CHECK_BLOCK_H_
#include <unistd.h>
#include <stdbool.h>
struct block;
#include "sparse.h"
/* block flags passed to block_get() */
enum {
BF_ZERO = (1 << 0), /* zero contents buf as block is returned */
BF_DIRTY = (1 << 1), /* block will be written with transaction */
BF_SM = (1 << 2), /* small 4k block instead of large 64k block */
BF_OVERWRITE = (1 << 3), /* caller will overwrite contents, don't read */
};
int block_get(struct block **blk_ret, u64 blkno, int bf);
void block_put(struct block **blkp);
void *block_buf(struct block *blk);
size_t block_size(struct block *blk);
void block_drop(struct block **blkp);
void block_readahead(u64 *blknos, size_t nr);
int block_try_commit(bool force);
int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes);
void block_shutdown(void);
#endif
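A usage sketch for the flags above, only to illustrate the intended call pattern; the blknos are placeholders and error handling is minimal:

#include "sparse.h"
#include "block.h"

/* read one existing metadata block, then dirty a brand new one for the next commit */
static int example(u64 existing_blkno, u64 new_blkno)
{
	struct block *blk = NULL;
	int ret;

	/* no flags: the block is read from the meta device if it isn't cached */
	ret = block_get(&blk, existing_blkno, 0);
	if (ret < 0)
		return ret;
	/* ... inspect block_buf(blk), which is block_size(blk) bytes ... */
	block_put(&blk);

	/* BF_ZERO skips the read and hands back zeroed contents, BF_DIRTY queues it */
	ret = block_get(&blk, new_blkno, BF_ZERO | BF_DIRTY);
	if (ret < 0)
		return ret;
	/* ... fill in block_buf(blk) ... */
	block_put(&blk);

	/* force out the dirty blocks and the super to finish the transaction */
	return block_try_commit(true);
}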

209
utils/src/check/btree.c Normal file

@@ -0,0 +1,209 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "key.h"
#include "avl.h"
#include "block.h"
#include "btree.h"
#include "extent.h"
#include "iter.h"
#include "sns.h"
#include "meta.h"
#include "problem.h"
static inline void *item_val(struct scoutfs_btree_block *bt, struct scoutfs_btree_item *item)
{
return (void *)bt + le16_to_cpu(item->val_off);
}
static void readahead_refs(struct scoutfs_btree_block *bt)
{
struct scoutfs_btree_item *item;
struct scoutfs_avl_node *node;
struct scoutfs_block_ref *ref;
u64 *blknos;
u64 blkno;
u16 valid = 0;
u16 nr = le16_to_cpu(bt->nr_items);
int i;
blknos = calloc(nr, sizeof(blknos[0]));
if (!blknos)
return;
node = avl_first(&bt->item_root);
for (i = 0; i < nr; i++) {
item = container_of(node, struct scoutfs_btree_item, node);
ref = item_val(bt, item);
blkno = le64_to_cpu(ref->blkno);
if (valid_meta_blkno(blkno))
blknos[valid++] = blkno;
node = avl_next(&bt->item_root, &item->node);
}
if (valid > 0)
block_readahead(blknos, valid);
free(blknos);
}
/*
* Call the callback on the referenced block. Then if the block
* contains references, read it and recurse into all its references.
*/
static int btree_ref_meta_iter(struct scoutfs_block_ref *ref, unsigned level, extent_cb_t cb,
void *cb_arg)
{
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_avl_node *node;
struct block *blk = NULL;
u64 blkno;
int ret;
int i;
blkno = le64_to_cpu(ref->blkno);
if (!blkno)
return 0;
ret = cb(blkno, 1, cb_arg);
if (ret < 0) {
ret = xlate_iter_errno(ret);
return 0;
}
if (level == 0)
return 0;
ret = block_get(&blk, blkno, 0);
if (ret < 0)
return ret;
sns_push("btree_parent", blkno, 0);
bt = block_buf(blk);
/* XXX integrate verification with block cache */
if (bt->level != level) {
problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
ret = -EINVAL;
goto out;
}
/* read-ahead last level of parents */
if (level == 2)
readahead_refs(bt);
node = avl_first(&bt->item_root);
for (i = 0; i < le16_to_cpu(bt->nr_items); i++) {
item = container_of(node, struct scoutfs_btree_item, node);
ref = item_val(bt, item);
ret = btree_ref_meta_iter(ref, level - 1, cb, cb_arg);
if (ret < 0)
goto out;
node = avl_next(&bt->item_root, &item->node);
}
ret = 0;
out:
block_put(&blk);
sns_pop();
return ret;
}
int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg)
{
/* XXX check root */
if (root->height == 0)
return 0;
return btree_ref_meta_iter(&root->ref, root->height - 1, cb, cb_arg);
}
static int btree_ref_item_iter(struct scoutfs_block_ref *ref, unsigned level,
btree_item_cb_t cb, void *cb_arg)
{
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_avl_node *node;
struct block *blk = NULL;
u64 blkno;
int ret;
int i;
blkno = le64_to_cpu(ref->blkno);
if (!blkno)
return 0;
ret = block_get(&blk, blkno, 0);
if (ret < 0)
return ret;
if (level)
sns_push("btree_parent", blkno, 0);
else
sns_push("btree_leaf", blkno, 0);
bt = block_buf(blk);
/* XXX integrate verification with block cache */
if (bt->level != level) {
problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
ret = -EINVAL;
goto out;
}
/* read-ahead leaves that contain items */
if (level == 1)
readahead_refs(bt);
node = avl_first(&bt->item_root);
for (i = 0; i < le16_to_cpu(bt->nr_items); i++) {
item = container_of(node, struct scoutfs_btree_item, node);
if (level) {
ref = item_val(bt, item);
ret = btree_ref_item_iter(ref, level - 1, cb, cb_arg);
} else {
ret = cb(&item->key, item_val(bt, item),
le16_to_cpu(item->val_len), cb_arg);
debug("free item key "SK_FMT" ret %d", SK_ARG(&item->key), ret);
}
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
node = avl_next(&bt->item_root, &item->node);
}
ret = 0;
out:
block_put(&blk);
sns_pop();
return ret;
}
int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg)
{
/* XXX check root */
if (root->height == 0)
return 0;
return btree_ref_item_iter(&root->ref, root->height - 1, cb, cb_arg);
}

14
utils/src/check/btree.h Normal file

@@ -0,0 +1,14 @@
#ifndef _SCOUTFS_UTILS_CHECK_BTREE_H_
#define _SCOUTFS_UTILS_CHECK_BTREE_H_
#include "util.h"
#include "format.h"
#include "extent.h"
typedef int (*btree_item_cb_t)(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg);
int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg);
int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg);
#endif
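As a quick illustration of the callback shape, here is a hypothetical leaf-item counter built on btree_item_iter(); it's a sketch, not code from the patch:

#include "sparse.h"
#include "btree.h"

static int count_item_cb(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
{
	unsigned long *count = cb_arg;

	(*count)++;
	return 0;		/* any negative return stops the walk */
}

static unsigned long count_leaf_items(struct scoutfs_btree_root *root)
{
	unsigned long count = 0;

	/* descends parent refs and calls the callback for every leaf item */
	if (btree_item_iter(root, count_item_cb, &count) < 0)
		return 0;	/* illustration only; real callers should report the error */

	return count;
}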

149
utils/src/check/check.c Normal file

@@ -0,0 +1,149 @@
#define _GNU_SOURCE /* O_DIRECT */
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <stdbool.h>
#include <argp.h>
#include "sparse.h"
#include "parse.h"
#include "util.h"
#include "format.h"
#include "ioctl.h"
#include "cmd.h"
#include "dev.h"
#include "alloc.h"
#include "block.h"
#include "debug.h"
#include "meta.h"
#include "super.h"
struct check_args {
char *meta_device;
char *data_device;
char *debug_path;
};
static int do_check(struct check_args *args)
{
int debug_fd = -1;
int meta_fd = -1;
int data_fd = -1;
int ret;
if (args->debug_path) {
debug_fd = open(args->debug_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
if (debug_fd < 0) {
ret = -errno;
fprintf(stderr, "error opening debug output file '%s': %s (%d)\n",
args->debug_path, strerror(errno), errno);
goto out;
}
debug_enable(debug_fd);
}
meta_fd = open(args->meta_device, O_DIRECT | O_RDWR | O_EXCL);
if (meta_fd < 0) {
ret = -errno;
fprintf(stderr, "failed to open meta device '%s': %s (%d)\n",
args->meta_device, strerror(errno), errno);
goto out;
}
data_fd = open(args->data_device, O_DIRECT | O_RDWR | O_EXCL);
if (data_fd < 0) {
ret = -errno;
fprintf(stderr, "failed to open data device '%s': %s (%d)\n",
args->data_device, strerror(errno), errno);
goto out;
}
ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024);
if (ret < 0)
goto out;
ret = check_supers() ?:
check_meta_alloc();
out:
/* and tear it all down */
block_shutdown();
super_shutdown();
debug_disable();
if (meta_fd >= 0)
close(meta_fd);
if (data_fd >= 0)
close(data_fd);
if (debug_fd >= 0)
close(debug_fd);
return ret;
}
static int parse_opt(int key, char *arg, struct argp_state *state)
{
struct check_args *args = state->input;
switch (key) {
case 'd':
args->debug_path = strdup_or_error(state, arg);
break;
case 'e':
case ARGP_KEY_ARG:
if (!args->meta_device)
args->meta_device = strdup_or_error(state, arg);
else if (!args->data_device)
args->data_device = strdup_or_error(state, arg);
else
argp_error(state, "more than two device arguments given");
break;
case ARGP_KEY_FINI:
if (!args->meta_device)
argp_error(state, "no metadata device argument given");
if (!args->data_device)
argp_error(state, "no data device argument given");
break;
default:
break;
}
return 0;
}
static struct argp_option options[] = {
{ "debug", 'd', "FILE_PATH", 0, "Path to debug output file, will be created or truncated"},
{ NULL }
};
static struct argp argp = {
options,
parse_opt,
"META-DEVICE DATA-DEVICE",
"Check filesystem consistency"
};
static int check_cmd(int argc, char **argv)
{
struct check_args check_args = {NULL};
int ret;
ret = argp_parse(&argp, argc, argv, 0, NULL, &check_args);
if (ret)
return ret;
return do_check(&check_args);
}
static void __attribute__((constructor)) check_ctor(void)
{
cmd_register_argp("check", &argp, GROUP_CORE, check_cmd);
}

16
utils/src/check/debug.c Normal file

@@ -0,0 +1,16 @@
#include <stdlib.h>
#include "debug.h"
int debug_fd = -1;
void debug_enable(int fd)
{
debug_fd = fd;
}
void debug_disable(void)
{
if (debug_fd >= 0)
debug_fd = -1;
}

17
utils/src/check/debug.h Normal file
@@ -0,0 +1,17 @@
#ifndef _SCOUTFS_UTILS_CHECK_DEBUG_H_
#define _SCOUTFS_UTILS_CHECK_DEBUG_H_
#include <stdio.h>
#define debug(fmt, args...) \
do { \
if (debug_fd >= 0) \
dprintf(debug_fd, fmt"\n", ##args); \
} while (0)
extern int debug_fd;
void debug_enable(int fd);
void debug_disable(void);
#endif
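A minimal sketch of how the debug pieces fit together, assuming a caller that opens its own output file (the path is illustrative):

    #include <fcntl.h>
    #include <unistd.h>
    #include "debug.h"

    static void debug_example(void)
    {
        int fd = open("/tmp/check.dbg", O_WRONLY | O_CREAT | O_TRUNC, 0644);

        if (fd < 0)
            return;

        debug_enable(fd);
        debug("starting example, debug fd %d", fd); /* only emitted while a debug fd is set */
        debug_disable();
        close(fd);
    }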

9
utils/src/check/eno.h Normal file
@@ -0,0 +1,9 @@
#ifndef _SCOUTFS_UTILS_CHECK_ENO_H_
#define _SCOUTFS_UTILS_CHECK_ENO_H_
#include <errno.h>
#include <string.h>
#define ENO_FMT "%d (%s)"
#define ENO_ARG(eno) eno, strerror(eno)
#endif
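ENO_FMT/ENO_ARG are just printf helpers for an errno value; a hedged sketch of the intended pattern (the message text is illustrative):

    #include <stdio.h>
    #include "eno.h"

    static void report_open_failure(int eno)
    {
        /* prints e.g. "open failed: 2 (No such file or directory)" */
        fprintf(stderr, "open failed: "ENO_FMT"\n", ENO_ARG(eno));
    }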

312
utils/src/check/extent.c Normal file
@@ -0,0 +1,312 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <errno.h>
#include "util.h"
#include "lk_rbtree_wrapper.h"
#include "debug.h"
#include "extent.h"
/*
* In-memory extent management in rbtree nodes.
*/
bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len)
{
u64 a_end = a_start + a_len;
u64 b_end = b_start + b_len;
return !((a_end <= b_start) || (b_end <= a_start));
}
static int ext_contains(struct extent_node *ext, u64 start, u64 len)
{
return ext->start <= start && ext->start + ext->len >= start + len;
}
/*
* True if the given extent is bisected by the given range; leftover
* pieces of the containing extent remain on both the left and right
* sides of the range.
*/
static int ext_bisected(struct extent_node *ext, u64 start, u64 len)
{
return ext->start < start && ext->start + ext->len > start + len;
}
static struct extent_node *ext_from_rbnode(struct rb_node *rbnode)
{
return rbnode ? container_of(rbnode, struct extent_node, rbnode) : NULL;
}
static struct extent_node *next_ext(struct extent_node *ext)
{
return ext ? ext_from_rbnode(rb_next(&ext->rbnode)) : NULL;
}
static struct extent_node *prev_ext(struct extent_node *ext)
{
return ext ? ext_from_rbnode(rb_prev(&ext->rbnode)) : NULL;
}
struct walk_results {
unsigned bisect_to_leaf:1;
struct extent_node *found;
struct extent_node *next;
struct rb_node *parent;
struct rb_node **node;
};
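/*
* walk_extents() descends the rbtree looking for an extent that
* overlaps [start, start + len).  On return ->found points at an
* overlapping extent if one exists, and ->next points at the first
* extent that starts at or past the end of the range (extent_first()
* relies on this).  ->parent and ->node record the link that the walk
* ended on, which callers use to insert a new node when nothing
* overlapped or, with ->bisect_to_leaf set, to insert the right-hand
* remainder of an extent bisected by extent_remove().
*/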
static void walk_extents(struct extent_root *root, u64 start, u64 len, struct walk_results *wlk)
{
struct rb_node **node = &root->rbroot.rb_node;
struct extent_node *ext;
u64 end = start + len;
int cmp;
wlk->found = NULL;
wlk->next = NULL;
wlk->parent = NULL;
while (*node) {
wlk->parent = *node;
ext = ext_from_rbnode(*node);
cmp = end <= ext->start ? -1 :
start >= ext->start + ext->len ? 1 : 0;
if (cmp < 0) {
node = &ext->rbnode.rb_left;
wlk->next = ext;
} else if (cmp > 0) {
node = &ext->rbnode.rb_right;
} else {
wlk->found = ext;
if (!(wlk->bisect_to_leaf && ext_bisected(ext, start, len)))
break;
/* walk right so we can insert greater right from bisection */
node = &ext->rbnode.rb_right;
}
}
wlk->node = node;
}
/*
* Return an extent that overlaps with the given range.
*/
int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found)
{
struct walk_results wlk = { 0, };
int ret;
walk_extents(root, start, len, &wlk);
if (wlk.found) {
memset(found, 0, sizeof(struct extent_node));
found->start = wlk.found->start;
found->len = wlk.found->len;
ret = 0;
} else {
ret = -ENOENT;
}
return ret;
}
/*
* Callers can iterate through direct node references and are entirely
* responsible for consistency when doing so.
*/
struct extent_node *extent_first(struct extent_root *root)
{
struct walk_results wlk = { 0, };
walk_extents(root, 0, 1, &wlk);
return wlk.found ?: wlk.next;
}
struct extent_node *extent_next(struct extent_node *ext)
{
return next_ext(ext);
}
struct extent_node *extent_prev(struct extent_node *ext)
{
return prev_ext(ext);
}
/*
* Insert a new extent into the tree. We can extend existing nodes,
* merge with neighbours, or remove existing extents entirely if we
* insert a range that fully spans existing nodes.
*/
static int walk_insert(struct extent_root *root, u64 start, u64 len, int found_err)
{
struct walk_results wlk = { 0, };
struct extent_node *ext;
struct extent_node *nei;
int ret;
walk_extents(root, start, len, &wlk);
ext = wlk.found;
if (ext && found_err) {
ret = found_err;
goto out;
}
if (!ext) {
ext = malloc(sizeof(struct extent_node));
if (!ext) {
ret = -ENOMEM;
goto out;
}
ext->start = start;
ext->len = len;
rb_link_node(&ext->rbnode, wlk.parent, wlk.node);
rb_insert_color(&ext->rbnode, &root->rbroot);
}
/* start by expanding an existing extent if our range is larger */
if (start < ext->start) {
ext->len += ext->start - start;
ext->start = start;
}
if (ext->start + ext->len < start + len)
ext->len += (start + len) - (ext->start + ext->len);
/* drop any fully spanned neighbors, possibly merging with a final adjacent one */
while ((nei = prev_ext(ext))) {
if (nei->start + nei->len < ext->start)
break;
if (nei->start < ext->start) {
ext->len += ext->start - nei->start;
ext->start = nei->start;
}
rb_erase(&nei->rbnode, &root->rbroot);
free(nei);
}
while ((nei = next_ext(ext))) {
if (ext->start + ext->len < nei->start)
break;
if (ext->start + ext->len < nei->start + nei->len)
ext->len += (nei->start + nei->len) - (ext->start + ext->len);
rb_erase(&nei->rbnode, &root->rbroot);
free(nei);
}
ret = 0;
out:
debug("start %llu len %llu ret %d", start, len, ret);
return ret;
}
/*
* Insert a new extent. The specified extent must not overlap with any
* existing extents or -EEXIST is returned.
*/
int extent_insert_new(struct extent_root *root, u64 start, u64 len)
{
return walk_insert(root, start, len, -EEXIST);
}
/*
* Insert an extent, extending any existing extents that may overlap.
*/
int extent_insert_extend(struct extent_root *root, u64 start, u64 len)
{
return walk_insert(root, start, len, false);
}
/*
* Remove the specified extent from an existing node. The given extent must be fully
* contained in a single node or -ENOENT is returned.
*/
int extent_remove(struct extent_root *root, u64 start, u64 len)
{
struct extent_node *ext;
struct extent_node *ins;
struct walk_results wlk = {
.bisect_to_leaf = 1,
};
int ret;
walk_extents(root, start, len, &wlk);
if (!(ext = wlk.found) || !ext_contains(ext, start, len)) {
ret = -ENOENT;
goto out;
}
if (ext_bisected(ext, start, len)) {
debug("found bisected start %llu len %llu", ext->start, ext->len);
ins = malloc(sizeof(struct extent_node));
if (!ins) {
ret = -ENOMEM;
goto out;
}
ins->start = start + len;
ins->len = (ext->start + ext->len) - ins->start;
rb_link_node(&ins->rbnode, wlk.parent, wlk.node);
rb_insert_color(&ins->rbnode, &root->rbroot);
}
if (start > ext->start) {
ext->len = start - ext->start;
} else if (len < ext->len) {
ext->start += len;
ext->len -= len;
} else {
rb_erase(&ext->rbnode, &root->rbroot);
free(ext);
}
ret = 0;
out:
debug("start %llu len %llu ret %d", start, len, ret);
return ret;
}
void extent_root_init(struct extent_root *root)
{
root->rbroot = RB_ROOT;
root->total = 0;
}
void extent_root_free(struct extent_root *root)
{
struct extent_node *ext;
struct rb_node *node;
struct rb_node *tmp;
for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) {
ext = rb_entry(node, struct extent_node, rbnode);
rb_erase(&ext->rbnode, &root->rbroot);
free(ext);
}
}
void extent_root_print(struct extent_root *root)
{
struct extent_node *ext;
struct rb_node *node;
struct rb_node *tmp;
for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) {
ext = rb_entry(node, struct extent_node, rbnode);
debug(" start %llu len %llu", ext->start, ext->len);
}
}

38
utils/src/check/extent.h Normal file
@@ -0,0 +1,38 @@
#ifndef _SCOUTFS_UTILS_CHECK_EXTENT_H_
#define _SCOUTFS_UTILS_CHECK_EXTENT_H_
#include "lk_rbtree_wrapper.h"
struct extent_root {
struct rb_root rbroot;
u64 total;
};
struct extent_node {
struct rb_node rbnode;
u64 start;
u64 len;
};
typedef int (*extent_cb_t)(u64 start, u64 len, void *arg);
struct extent_cb_arg_t {
extent_cb_t cb;
void *cb_arg;
};
bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len);
int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found);
struct extent_node *extent_first(struct extent_root *root);
struct extent_node *extent_next(struct extent_node *ext);
struct extent_node *extent_prev(struct extent_node *ext);
int extent_insert_new(struct extent_root *root, u64 start, u64 len);
int extent_insert_extend(struct extent_root *root, u64 start, u64 len);
int extent_remove(struct extent_root *root, u64 start, u64 len);
void extent_root_init(struct extent_root *root);
void extent_root_free(struct extent_root *root);
void extent_root_print(struct extent_root *root);
#endif
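A rough usage sketch of the in-memory extent tree, assuming it's built inside utils/src/check (values are illustrative and error handling is trimmed):

    #include <assert.h>
    #include "extent.h"

    static void extent_example(void)
    {
        struct extent_root root;
        struct extent_node found;

        extent_root_init(&root);

        /* [10,15) and [20,25) as two separate regions */
        extent_insert_new(&root, 10, 5);
        extent_insert_new(&root, 20, 5);

        /* extending across the gap merges everything into [10,25) */
        extent_insert_extend(&root, 14, 8);
        assert(extent_lookup(&root, 12, 1, &found) == 0);
        assert(found.start == 10 && found.len == 15);

        /* removing the middle bisects it into [10,12) and [18,25) */
        extent_remove(&root, 12, 6);

        extent_root_free(&root);
    }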

540
utils/src/check/image.c Normal file
@@ -0,0 +1,540 @@
#define _GNU_SOURCE /* O_DIRECT */
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <stdbool.h>
#include <argp.h>
#include "sparse.h"
#include "bitmap.h"
#include "parse.h"
#include "util.h"
#include "format.h"
#include "crc.h"
#include "cmd.h"
#include "dev.h"
#include "alloc.h"
#include "block.h"
#include "btree.h"
#include "log_trees.h"
#include "super.h"
/* huh. */
#define OFF_MAX (off_t)((u64)((off_t)~0ULL) >> 1)
#define SCOUTFS_META_IMAGE_HEADER_MAGIC 0x8aee00d098fa60c5ULL
#define SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC 0x70bd5e9269effd86ULL
struct scoutfs_meta_image_header {
__le64 magic;
__le64 total_bytes;
__le32 version;
} __packed;
struct scoutfs_meta_image_block_header {
__le64 magic;
__le64 offset;
__le32 size;
__le32 crc;
} __packed;
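/*
* The image stream is a scoutfs_meta_image_header followed by one
* scoutfs_meta_image_block_header and its block payload for every
* referenced metadata block.  total_bytes covers the whole stream, so
* with N referenced large blocks it is:
*
*   sizeof(header) + N * (sizeof(block header) + SCOUTFS_BLOCK_LG_SIZE)
*
* The block header crc covers the header itself (with the crc field
* zeroed) and the payload that follows it.
*/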
struct image_args {
char *meta_device;
bool is_read;
bool show_header;
u64 ra_window;
};
struct block_bitmaps {
unsigned long *bits;
u64 size;
u64 count;
};
#define errf(fmt, args...) \
dprintf(STDERR_FILENO, fmt, ##args)
static int set_meta_bit(u64 start, u64 len, void *arg)
{
struct block_bitmaps *bm = arg;
int ret;
if (len != 1) {
ret = -EINVAL;
} else {
if (!test_bit(bm->bits, start)) {
set_bit(bm->bits, start);
bm->count++;
}
ret = 0;
}
return ret;
}
static int get_ref_bits(struct block_bitmaps *bm)
{
struct scoutfs_super_block *super = global_super;
int ret;
u64 i;
/*
* There are almost no small blocks we need to read, so we read
* them as the large blocks that contain them to simplify the
* block reading process.
*/
set_meta_bit(SCOUTFS_SUPER_BLKNO >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++)
set_meta_bit((SCOUTFS_QUORUM_BLKNO + i) >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
ret = alloc_root_meta_iter(&super->meta_alloc[0], set_meta_bit, bm) ?:
alloc_root_meta_iter(&super->meta_alloc[1], set_meta_bit, bm) ?:
alloc_root_meta_iter(&super->data_alloc, set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_avail[0], set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_avail[1], set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_freed[0], set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_freed[1], set_meta_bit, bm) ?:
btree_meta_iter(&super->fs_root, set_meta_bit, bm) ?:
btree_meta_iter(&super->logs_root, set_meta_bit, bm) ?:
btree_meta_iter(&super->log_merge, set_meta_bit, bm) ?:
btree_meta_iter(&super->mounted_clients, set_meta_bit, bm) ?:
btree_meta_iter(&super->srch_root, set_meta_bit, bm) ?:
log_trees_meta_iter(set_meta_bit, bm);
return ret;
}
/*
* Note that this temporarily modifies the header that it's given.
*/
static __le32 calc_crc(struct scoutfs_meta_image_block_header *bh, void *buf, size_t size)
{
__le32 saved = bh->crc;
u32 crc = ~0;
bh->crc = 0;
crc = crc32c(crc, bh, sizeof(*bh));
crc = crc32c(crc, buf, size);
bh->crc = saved;
return cpu_to_le32(crc);
}
static void printf_header(struct scoutfs_meta_image_header *hdr)
{
errf("magic: 0x%016llx\n"
"total_bytes: %llu\n"
"version: %u\n",
le64_to_cpu(hdr->magic),
le64_to_cpu(hdr->total_bytes),
le32_to_cpu(hdr->version));
}
typedef ssize_t (*rw_func_t)(int fd, void *buf, size_t count, off_t offset);
static inline ssize_t rw_read(int fd, void *buf, size_t count, off_t offset)
{
return read(fd, buf, count);
}
static inline ssize_t rw_pread(int fd, void *buf, size_t count, off_t offset)
{
return pread(fd, buf, count, offset);
}
static inline ssize_t rw_write(int fd, void *buf, size_t count, off_t offset)
{
return write(fd, buf, count);
}
static inline ssize_t rw_pwrite(int fd, void *buf, size_t count, off_t offset)
{
return pwrite(fd, buf, count, offset);
}
static int rw_full_count(rw_func_t func, u64 *tot, int fd, void *buf, size_t count, off_t offset)
{
ssize_t sret;
while (count > 0) {
sret = func(fd, buf, count, offset);
if (sret <= 0 || sret > count) {
if (sret < 0)
return -errno;
else
return -EIO;
}
if (tot)
*tot += sret;
buf += sret;
count -= sret;
}
return 0;
}
static int read_image(struct image_args *args, int fd, struct block_bitmaps *bm)
{
struct scoutfs_meta_image_block_header bh;
struct scoutfs_meta_image_header hdr;
u64 opening;
void *buf;
off_t off;
u64 bit;
u64 ra;
int ret;
buf = malloc(SCOUTFS_BLOCK_LG_SIZE);
if (!buf) {
ret = -ENOMEM;
goto out;
}
hdr.magic = cpu_to_le64(SCOUTFS_META_IMAGE_HEADER_MAGIC);
hdr.total_bytes = cpu_to_le64(sizeof(hdr) +
(bm->count * (SCOUTFS_BLOCK_LG_SIZE + sizeof(bh))));
hdr.version = cpu_to_le32(1);
if (args->show_header) {
printf_header(&hdr);
ret = 0;
goto out;
}
ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &hdr, sizeof(hdr), 0);
if (ret < 0)
goto out;
opening = args->ra_window;
ra = 0;
bit = 0;
for (bit = 0; (bit = find_next_set_bit(bm->bits, bit, bm->size)) < bm->size; bit++) {
/* readahead to open the full window, then a block at a time */
do {
ra = find_next_set_bit(bm->bits, ra, bm->size);
if (ra >= bm->size) {
/* no more blocks to read ahead, don't spin forever */
break;
}
off = ra << SCOUTFS_BLOCK_LG_SHIFT;
posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_WILLNEED);
ra++;
if (opening)
opening -= min(opening, SCOUTFS_BLOCK_LG_SIZE);
} while (opening > 0);
off = bit << SCOUTFS_BLOCK_LG_SHIFT;
ret = rw_full_count(rw_pread, NULL, fd, buf, SCOUTFS_BLOCK_LG_SIZE, off);
if (ret < 0)
goto out;
/*
* Might as well try to drop the pages we've used to
* reduce memory pressure on our read-ahead pages that
* are waiting.
*/
posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_DONTNEED);
bh.magic = cpu_to_le64(SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC);
bh.offset = cpu_to_le64(off);
bh.size = cpu_to_le32(SCOUTFS_BLOCK_LG_SIZE);
bh.crc = calc_crc(&bh, buf, SCOUTFS_BLOCK_LG_SIZE);
ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &bh, sizeof(bh), 0) ?:
rw_full_count(rw_write, NULL, STDOUT_FILENO, buf, SCOUTFS_BLOCK_LG_SIZE, 0);
if (ret < 0)
goto out;
}
out:
free(buf);
return ret;
}
static int invalid_header(struct scoutfs_meta_image_header *hdr)
{
if (le64_to_cpu(hdr->magic) != SCOUTFS_META_IMAGE_HEADER_MAGIC) {
errf("bad image header magic 0x%016llx (!= expected %016llx)\n",
le64_to_cpu(hdr->magic), SCOUTFS_META_IMAGE_HEADER_MAGIC);
} else if (le32_to_cpu(hdr->version) != 1) {
errf("unknown image header version %u\n", le32_to_cpu(hdr->version));
} else {
return 0;
}
return -EIO;
}
/*
* Doesn't catch offset+size overflowing, presumes pwrite() will return
* an error.
*/
static int invalid_block_header(struct scoutfs_meta_image_block_header *bh)
{
if (le64_to_cpu(bh->magic) != SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC) {
errf("bad block header magic 0x%016llx (!= expected %016llx)\n",
le64_to_cpu(bh->magic), SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC);
} else if (le32_to_cpu(bh->size) == 0) {
errf("invalid block header size %u\n", le32_to_cpu(bh->size));
} else if (le32_to_cpu(bh->size) > SIZE_MAX) {
errf("block header size %u too large for size_t (> %zu)\n",
le32_to_cpu(bh->size), (size_t)SIZE_MAX);
} else if (le64_to_cpu(bh->offset) > OFF_MAX) {
errf("block header offset %llu too large for off_t (> %llu)\n",
le64_to_cpu(bh->offset), (u64)OFF_MAX);
} else {
return 0;
}
return -EIO;
}
static int write_image(struct image_args *args, int fd, struct block_bitmaps *bm)
{
struct scoutfs_meta_image_block_header bh;
struct scoutfs_meta_image_header hdr;
size_t writeback_batch = (2 * 1024 * 1024);
size_t buf_size;
size_t dirty;
size_t size;
off_t first;
off_t last;
off_t off;
__le32 calc;
void *buf = NULL;
u64 tot;
int ret;
tot = 0;
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &hdr, sizeof(hdr), 0);
if (ret < 0)
goto out;
if (args->show_header) {
printf_header(&hdr);
ret = 0;
goto out;
}
ret = invalid_header(&hdr);
if (ret < 0)
goto out;
dirty = 0;
first = OFF_MAX;
last = 0;
buf = NULL;
buf_size = 0;
while (tot < le64_to_cpu(hdr.total_bytes)) {
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &bh, sizeof(bh), 0);
if (ret < 0)
goto out;
ret = invalid_block_header(&bh);
if (ret < 0)
goto out;
size = le32_to_cpu(bh.size);
if (buf_size < size) {
void *nbuf = realloc(buf, size);
if (!nbuf) {
/* don't lose the old buffer if realloc fails */
ret = -ENOMEM;
goto out;
}
buf = nbuf;
buf_size = size;
}
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, buf, size, 0);
if (ret < 0)
goto out;
calc = calc_crc(&bh, buf, size);
if (calc != bh.crc) {
errf("crc mismatch for block at offset %llu\n", le64_to_cpu(bh.offset));
ret = -EIO;
goto out;
}
off = le64_to_cpu(bh.offset);
ret = rw_full_count(rw_pwrite, NULL, fd, buf, size, off);
if (ret < 0)
goto out;
dirty += size;
first = min(first, off);
last = max(last, off);
if (dirty >= writeback_batch) {
posix_fadvise(fd, first, (last - first) + size, POSIX_FADV_DONTNEED);
dirty = 0;
first = OFF_MAX;
last = 0;
}
}
ret = fsync(fd);
if (ret < 0) {
ret = -errno;
goto out;
}
out:
free(buf);
return ret;
}
static int do_image(struct image_args *args)
{
struct block_bitmaps bm = { .bits = NULL };
int meta_fd = -1;
u64 dev_size;
mode_t mode;
int ret;
mode = args->is_read ? O_RDONLY : O_RDWR;
meta_fd = open(args->meta_device, mode);
if (meta_fd < 0) {
ret = -errno;
errf("failed to open meta device '%s': %s (%d)\n",
args->meta_device, strerror(errno), errno);
goto out;
}
if (args->is_read) {
ret = flush_device(meta_fd);
if (ret < 0)
goto out;
ret = get_device_size(args->meta_device, meta_fd, &dev_size);
if (ret < 0)
goto out;
bm.size = DIV_ROUND_UP(dev_size, SCOUTFS_BLOCK_LG_SIZE);
bm.bits = calloc(1, round_up(bm.size, BITS_PER_LONG) / 8);
if (!bm.bits) {
ret = -ENOMEM;
goto out;
}
ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024) ?:
check_supers() ?:
get_ref_bits(&bm) ?:
read_image(args, meta_fd, &bm);
block_shutdown();
} else {
ret = write_image(args, meta_fd, &bm);
}
out:
free(bm.bits);
if (meta_fd >= 0)
close(meta_fd);
return ret;
}
static int parse_opt(int key, char *arg, struct argp_state *state)
{
struct image_args *args = state->input;
int ret;
switch (key) {
case 'h':
args->show_header = true;
break;
case 'r':
ret = parse_u64(arg, &args->ra_window);
if (ret)
argp_error(state, "readahead winddoe parse error");
break;
case ARGP_KEY_ARG:
if (!args->meta_device)
args->meta_device = strdup_or_error(state, arg);
else
argp_error(state, "more than two device arguments given");
break;
case ARGP_KEY_FINI:
if (!args->meta_device)
argp_error(state, "no metadata device argument given");
break;
default:
break;
}
return 0;
}
static struct argp_option options[] = {
{ "show-header", 'h', NULL, 0, "Print image header and exit without processing stream" },
{ "readahead", 'r', "NR", 0, "Maintain read-ahead window of NR blocks" },
{ NULL }
};
static struct argp read_image_argp = {
options,
parse_opt,
"META-DEVICE",
"Read metadata image stream from metadata device file"
};
#define DEFAULT_RA_WINDOW (512 * 1024)
static int read_image_cmd(int argc, char **argv)
{
struct image_args image_args = {
.is_read = true,
.ra_window = DEFAULT_RA_WINDOW,
};
int ret;
ret = argp_parse(&read_image_argp, argc, argv, 0, NULL, &image_args);
if (ret)
return ret;
return do_image(&image_args);
}
static struct argp write_image_argp = {
options,
parse_opt,
"META-DEVICE",
"Write metadata image stream to metadata device file"
};
static int write_image_cmd(int argc, char **argv)
{
struct image_args image_args = {
.is_read = false,
.ra_window = DEFAULT_RA_WINDOW,
};
int ret;
ret = argp_parse(&write_image_argp, argc, argv, 0, NULL, &image_args);
if (ret)
return ret;
return do_image(&image_args);
}
static void __attribute__((constructor)) image_ctor(void)
{
cmd_register_argp("read-metadata-image", &read_image_argp, GROUP_CORE, read_image_cmd);
cmd_register_argp("write-metadata-image", &write_image_argp, GROUP_CORE, write_image_cmd);
}

15
utils/src/check/iter.h Normal file
@@ -0,0 +1,15 @@
#ifndef _SCOUTFS_UTILS_CHECK_ITER_H_
#define _SCOUTFS_UTILS_CHECK_ITER_H_
#include <errno.h>
/*
* Callbacks can return an -errno that we'll never otherwise use to
* indicate that iteration should stop early; xlate_iter_errno()
* translates it to 0 so callers see success.
*/
#define ECHECK_ITER_DONE EL2HLT
static inline int xlate_iter_errno(int ret)
{
return ret == -ECHECK_ITER_DONE ? 0 : ret;
}
#endif
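A small sketch of the convention, with a hypothetical extent callback that only wants the first extent it is handed (sparse.h is assumed to provide u64, and the iterator name in the comment is a placeholder):

    #include "sparse.h"
    #include "iter.h"

    /* hypothetical callback: remember the first extent, then stop iterating */
    static int remember_first(u64 start, u64 len, void *arg)
    {
        u64 *first = arg;

        *first = start;
        return -ECHECK_ITER_DONE; /* stop early, not an error */
    }

    /*
     * The iterator propagates the callback's return value; the caller
     * maps the early-stop errno back to 0:
     *
     *     ret = xlate_iter_errno(some_meta_iter(remember_first, &first));
     */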

@@ -0,0 +1,98 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "key.h"
#include "alloc.h"
#include "btree.h"
#include "debug.h"
#include "extent.h"
#include "iter.h"
#include "sns.h"
#include "log_trees.h"
#include "super.h"
struct iter_args {
extent_cb_t cb;
void *cb_arg;
};
static int lt_meta_iter(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
{
struct iter_args *ia = cb_arg;
struct scoutfs_log_trees *lt;
int ret;
if (val_len != sizeof(struct scoutfs_log_trees))
return -EINVAL; /* XXX should record a problem instead */
lt = val;
sns_push("log_trees", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
debug("lt rid 0x%16llx nr %llu", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
sns_push("meta_avail", 0, 0);
ret = alloc_list_meta_iter(&lt->meta_avail, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
sns_push("meta_freed", 0, 0);
ret = alloc_list_meta_iter(&lt->meta_freed, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
sns_push("item_root", 0, 0);
ret = btree_meta_iter(&lt->item_root, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
if (lt->bloom_ref.blkno) {
sns_push("bloom_ref", 0, 0);
ret = ia->cb(le64_to_cpu(lt->bloom_ref.blkno), 1, ia->cb_arg);
sns_pop();
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
}
sns_push("data_avail", 0, 0);
ret = alloc_root_meta_iter(&lt->data_avail, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
sns_push("data_freed", 0, 0);
ret = alloc_root_meta_iter(&lt->data_freed, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
ret = 0;
out:
sns_pop();
return ret;
}
/*
* Call the caller's callback with the extents of all the metadata block references contained
* in log btrees. We walk the logs_root btree items and walk all the metadata structures
* they reference.
*/
int log_trees_meta_iter(extent_cb_t cb, void *cb_arg)
{
struct scoutfs_super_block *super = global_super;
struct iter_args ia = { .cb = cb, .cb_arg = cb_arg };
return btree_item_iter(&super->logs_root, lt_meta_iter, &ia);
}

@@ -0,0 +1,8 @@
#ifndef _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
#define _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
#include "extent.h"
int log_trees_meta_iter(extent_cb_t cb, void *cb_arg);
#endif

367
utils/src/check/meta.c Normal file
@@ -0,0 +1,367 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "bitmap.h"
#include "key.h"
#include "alloc.h"
#include "btree.h"
#include "debug.h"
#include "extent.h"
#include "sns.h"
#include "log_trees.h"
#include "meta.h"
#include "problem.h"
#include "super.h"
static struct meta_data {
struct extent_root meta_refed;
struct extent_root meta_free;
struct {
u64 ref_blocks;
u64 free_extents;
u64 free_blocks;
} stats;
} global_mdat;
bool valid_meta_blkno(u64 blkno)
{
u64 tot = le64_to_cpu(global_super->total_meta_blocks);
return blkno >= SCOUTFS_META_DEV_START_BLKNO && blkno < tot;
}
static bool valid_meta_extent(u64 start, u64 len)
{
u64 tot = le64_to_cpu(global_super->total_meta_blocks);
bool valid;
valid = len > 0 &&
start >= SCOUTFS_META_DEV_START_BLKNO &&
start < tot &&
len <= tot &&
((start + len) <= tot) &&
((start + len) > start);
debug("start %llu len %llu valid %u", start, len, !!valid);
if (!valid)
problem(PB_META_EXTENT_INVALID, "start %llu len %llu", start, len);
return valid;
}
/*
* Track references to individual metadata blocks. This uses the extent
* callback type but is only ever called for single block references.
* Any reference to a block that has already been referenced is
* considered invalid and is ignored. Later repair will resolve
* duplicate references.
*/
static int insert_meta_ref(u64 start, u64 len, void *arg)
{
struct meta_data *mdat = &global_mdat;
struct extent_root *root = arg;
int ret = 0;
/* this is tracking single metadata block references */
if (len != 1) {
ret = -EINVAL;
goto out;
}
if (valid_meta_blkno(start)) {
ret = extent_insert_new(root, start, len);
if (ret == 0)
mdat->stats.ref_blocks++;
else if (ret == -EEXIST)
problem(PB_META_REF_OVERLAPS_EXISTING, "blkno %llu", start);
}
out:
return ret;
}
static int insert_meta_free(u64 start, u64 len, void *arg)
{
struct meta_data *mdat = &global_mdat;
struct extent_root *root = arg;
int ret = 0;
if (valid_meta_extent(start, len)) {
ret = extent_insert_new(root, start, len);
if (ret == 0) {
mdat->stats.free_extents++;
mdat->stats.free_blocks += len;
} else if (ret == -EEXIST) {
problem(PB_META_FREE_OVERLAPS_EXISTING,
"start %llu llen %llu", start, len);
}
}
return ret;
}
/*
* Walk all metadata references in the system. This walk doesn't need
* to read metadata that doesn't contain any metadata references so it
* can skip the bulk of metadata blocks. This gives us the set of
* referenced metadata blocks which we can then use to repair metadata
* allocator structures.
*/
static int get_meta_refs(void)
{
struct meta_data *mdat = &global_mdat;
struct scoutfs_super_block *super = global_super;
int ret;
extent_root_init(&mdat->meta_refed);
/* XXX record reserved blocks around super as referenced */
sns_push("meta_alloc", 0, 0);
ret = alloc_root_meta_iter(&super->meta_alloc[0], insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("meta_alloc", 1, 0);
ret = alloc_root_meta_iter(&super->meta_alloc[1], insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("data_alloc", 1, 0);
ret = alloc_root_meta_iter(&super->data_alloc, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 0, 0);
ret = alloc_list_meta_iter(&super->server_meta_avail[0],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 1, 0);
ret = alloc_list_meta_iter(&super->server_meta_avail[1],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 0, 0);
ret = alloc_list_meta_iter(&super->server_meta_freed[0],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 1, 0);
ret = alloc_list_meta_iter(&super->server_meta_freed[1],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("fs_root", 0, 0);
ret = btree_meta_iter(&super->fs_root, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("logs_root", 0, 0);
ret = btree_meta_iter(&super->logs_root, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("log_merge", 0, 0);
ret = btree_meta_iter(&super->log_merge, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("mounted_clients", 0, 0);
ret = btree_meta_iter(&super->mounted_clients, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("srch_root", 0, 0);
ret = btree_meta_iter(&super->srch_root, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
ret = log_trees_meta_iter(insert_meta_ref, &mdat->meta_refed);
if (ret < 0)
goto out;
printf("found %llu referenced metadata blocks\n", mdat->stats.ref_blocks);
ret = 0;
out:
return ret;
}
static int get_meta_free(void)
{
struct meta_data *mdat = &global_mdat;
struct scoutfs_super_block *super = global_super;
int ret;
extent_root_init(&mdat->meta_free);
sns_push("meta_alloc", 0, 0);
ret = alloc_root_extent_iter(&super->meta_alloc[0], insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("meta_alloc", 1, 0);
ret = alloc_root_extent_iter(&super->meta_alloc[1], insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 0, 0);
ret = alloc_list_extent_iter(&super->server_meta_avail[0],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 1, 0);
ret = alloc_list_extent_iter(&super->server_meta_avail[1],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 0, 0);
ret = alloc_list_extent_iter(&super->server_meta_freed[0],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 1, 0);
ret = alloc_list_extent_iter(&super->server_meta_freed[1],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
printf("found %llu free metadata blocks in %llu extents\n",
mdat->stats.free_blocks, mdat->stats.free_extents);
ret = 0;
out:
return ret;
}
/*
* All the space between referenced blocks must be recorded in the free
* extents. The free extent walk didn't check that the extents
* overlapped with references, so we do that here. Remember that the
* metadata block references were merged into extents, so the refed
* extents aren't necessarily single blocks.
*/
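/*
* As a small worked example: with referenced extent {0,5} and free
* extents {5,3} and {10,2}, expect advances 0 -> 5 past the refs and
* 5 -> 8 past the first free extent, and the gap before {10,2} is
* reported as a missing free extent {8,2}.  Had a free extent
* overlapped the refs, the overlapping piece would have been trimmed
* out of the free tree before the comparison continued.
*/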
static int compare_refs_and_free(void)
{
struct meta_data *mdat = &global_mdat;
struct extent_node *ref;
struct extent_node *free;
struct extent_node *next;
struct extent_node *prev;
u64 expect;
u64 start;
u64 end;
expect = 0;
ref = extent_first(&mdat->meta_refed);
free = extent_first(&mdat->meta_free);
while (ref || free) {
printf("exp %llu ref %llu.%llu free %llu.%llu\n",
expect, ref ? ref->start : 0, ref ? ref->len : 0,
free ? free->start : 0, free ? free->len : 0);
/* referenced marked free, remove ref from free and continue from same point */
if (ref && free && extents_overlap(ref->start, ref->len, free->start, free->len)) {
printf("ref extent %llu.%llu overlaps free %llu %llu\n",
ref->start, ref->len, free->start, free->len);
start = max(ref->start, free->start);
end = min(ref->start + ref->len, free->start + free->len);
prev = extent_prev(free);
extent_remove(&mdat->meta_free, start, end - start);
if (prev)
free = extent_next(prev);
else
free = extent_first(&mdat->meta_free);
continue;
}
/* see which extent starts earlier */
if (!free || (ref && ref->start <= free->start))
next = ref;
else
next = free;
/* untracked region before next extent */
if (expect < next->start) {
printf("missing free extent %llu.%llu\n", expect, next->start - expect);
expect = next->start;
continue;
}
/* didn't overlap, advance past next extent */
expect = next->start + next->len;
if (next == ref)
ref = extent_next(ref);
else
free = extent_next(free);
}
return 0;
}
/*
* Check the metadata allocators by comparing the set of referenced
* blocks with the set of free blocks that are stored in free btree
* items and alloc list blocks.
*/
int check_meta_alloc(void)
{
int ret;
ret = get_meta_refs();
if (ret < 0)
goto out;
ret = get_meta_free();
if (ret < 0)
goto out;
ret = compare_refs_and_free();
if (ret < 0)
goto out;
ret = 0;
out:
return ret;
}

9
utils/src/check/meta.h Normal file
@@ -0,0 +1,9 @@
#ifndef _SCOUTFS_UTILS_CHECK_META_H_
#define _SCOUTFS_UTILS_CHECK_META_H_
bool valid_meta_blkno(u64 blkno);
int check_meta_alloc(void);
#endif

23
utils/src/check/padding.c Normal file
@@ -0,0 +1,23 @@
#include <string.h>
#include <stdbool.h>
#include "util.h"
#include "padding.h"
bool padding_is_zeros(const void *data, size_t sz)
{
static char zeros[32] = {0,};
const size_t batch = array_size(zeros);
while (sz >= batch) {
if (memcmp(data, zeros, batch))
return false;
data += batch;
sz -= batch;
}
if (sz > 0 && memcmp(data, zeros, sz))
return false;
return true;
}
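A quick sketch of the intended use, checking that reserved space in an on-disk structure is zeroed (the structure here is hypothetical):

    #include <stdbool.h>
    #include <stddef.h>
    #include "padding.h"

    struct example_block {
        unsigned long long valid;
        unsigned char __pad[48]; /* hypothetical reserved space */
    };

    static bool example_padding_ok(struct example_block *blk)
    {
        return padding_is_zeros(blk->__pad, sizeof(blk->__pad));
    }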

@@ -0,0 +1,6 @@
#ifndef _SCOUTFS_UTILS_CHECK_PADDING_H_
#define _SCOUTFS_UTILS_CHECK_PADDING_H_
bool padding_is_zeros(const void *data, size_t sz);
#endif

23
utils/src/check/problem.c Normal file
@@ -0,0 +1,23 @@
#include <stdio.h>
#include <stdint.h>
#include "problem.h"
#if 0
#define PROB_STR(pb) [pb] = #pb
static char *prob_strs[] = {
PROB_STR(PB_META_EXTENT_INVALID),
PROB_STR(PB_META_REF_OVERLAPS_EXISTING),
PROB_STR(PB_META_FREE_OVERLAPS_EXISTING),
PROB_STR(PB_BTREE_BLOCK_BAD_LEVEL),
};
#endif
static struct problem_data {
uint64_t counts[PB__NR];
} global_pdat;
void problem_record(prob_t pb)
{
struct problem_data *pdat = &global_pdat;
pdat->counts[pb]++;
}

23
utils/src/check/problem.h Normal file
@@ -0,0 +1,23 @@
#ifndef _SCOUTFS_UTILS_CHECK_PROBLEM_H_
#define _SCOUTFS_UTILS_CHECK_PROBLEM_H_
#include "debug.h"
#include "sns.h"
typedef enum {
PB_META_EXTENT_INVALID,
PB_META_REF_OVERLAPS_EXISTING,
PB_META_FREE_OVERLAPS_EXISTING,
PB_BTREE_BLOCK_BAD_LEVEL,
PB__NR,
} prob_t;
#define problem(pb, fmt, ...) \
do { \
debug("problem found: "#pb": %s: "fmt, sns_str(), __VA_ARGS__); \
problem_record(pb); \
} while (0)
void problem_record(prob_t pb);
#endif
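A sketch of how a new check might record a problem, assuming the caller has already pushed its location with sns_push() (the function and values are illustrative):

    #include "sparse.h"
    #include "problem.h"

    static void example_check_level(u64 blkno, int level, int expected)
    {
        if (level != expected)
            problem(PB_BTREE_BLOCK_BAD_LEVEL, "blkno %llu level %d expected %d",
                blkno, level, expected);
    }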

118
utils/src/check/sns.c Normal file
@@ -0,0 +1,118 @@
#include <stdlib.h>
#include <string.h>
#include "sns.h"
/*
* This "str num stack" is used to describe our location in metadata at
* any given time.
*
* As we descend into structures we push a string describing them,
* perhaps with associated numbers. Pushing and popping is very cheap
* and only rarely do we format the stack into a string, as an arbitrary
* example:
* super.fs_root.btree_parent:1231.btree_leaf:3231
*/
#define SNS_MAX_DEPTH 1000
#define SNS_STR_SIZE (SNS_MAX_DEPTH * (1 + SNS_MAX_STR_LEN + 2 * (1 + 16)) + 1)
static struct sns_data {
unsigned int depth;
struct sns_entry {
char *str;
size_t len;
u64 a;
u64 b;
} ents[SNS_MAX_DEPTH];
char str[SNS_STR_SIZE];
} global_lsdat;
void _sns_push(char *str, size_t len, u64 a, u64 b)
{
struct sns_data *lsdat = &global_lsdat;
if (lsdat->depth < SNS_MAX_DEPTH) {
lsdat->ents[lsdat->depth++] = (struct sns_entry) {
.str = str,
.len = len,
.a = a,
.b = b,
};
}
}
void sns_pop(void)
{
struct sns_data *lsdat = &global_lsdat;
if (lsdat->depth > 0)
lsdat->depth--;
}
static char *append_str(char *pos, char *str, size_t len)
{
memcpy(pos, str, len);
return pos + len;
}
/*
* This is not called for x == 0 so we don't need to emit an initial 0.
* We could handle that by using a do {} while loop instead of while {}.
*/
static char *append_u64x(char *pos, u64 x)
{
static char hex[] = "0123456789abcdef";
char nibbles[16];
int i = 0;
while (x) {
nibbles[i++] = hex[x & 0xf];
x >>= 4;
}
/* nibbles were gathered least significant first, emit them in order */
while (i > 0)
*pos++ = nibbles[--i];
return pos;
}
static char *append_char(char *pos, char c)
{
*(pos++) = c;
return pos;
}
/*
* Return a pointer to a null terminated string that describes the
* current location stack. The string buffer is global.
*/
char *sns_str(void)
{
struct sns_data *lsdat = &global_lsdat;
struct sns_entry *ent;
char *pos;
int i;
pos = lsdat->str;
for (i = 0; i < lsdat->depth; i++) {
ent = &lsdat->ents[i];
if (i)
pos = append_char(pos, '.');
pos = append_str(pos, ent->str, ent->len);
if (ent->a) {
pos = append_char(pos, ':');
pos = append_u64x(pos, ent->a);
}
if (ent->b) {
pos = append_char(pos, ':');
pos = append_u64x(pos, ent->b);
}
}
*pos = '\0';
return lsdat->str;
}

20
utils/src/check/sns.h Normal file
@@ -0,0 +1,20 @@
#ifndef _SCOUTFS_UTILS_CHECK_SNS_H_
#define _SCOUTFS_UTILS_CHECK_SNS_H_
#include <assert.h>
#include "sparse.h"
#define SNS_MAX_STR_LEN 20
#define sns_push(str, a, b) \
do { \
build_assert(sizeof(str) - 1 <= SNS_MAX_STR_LEN); \
_sns_push((str), sizeof(str) - 1, a, b); \
} while (0)
void _sns_push(char *str, size_t len, u64 a, u64 b);
void sns_pop(void);
char *sns_str(void);
#endif
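A sketch of the push/pop pattern, mirroring how the callers above use it (the structure name is illustrative; note sns_str() renders the numbers in hex):

    #include "sparse.h"
    #include "sns.h"
    #include "debug.h"

    static void example_walk_block(u64 blkno)
    {
        sns_push("example_block", blkno, 0);
        debug("visiting %s", sns_str()); /* e.g. "...example_block:<blkno in hex>" */
        sns_pop();
    }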

57
utils/src/check/super.c Normal file
@@ -0,0 +1,57 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "block.h"
#include "super.h"
/*
* After we check the super blocks we provide a global buffer to track
* the current super block. It is referenced to get static information
* about the system and is also modified and written as part of
* transactions.
*/
struct scoutfs_super_block *global_super;
/*
* Read the super block from the device and save a copy of it in the
* global buffer for the rest of check to reference.
*/
int check_supers(void)
{
struct scoutfs_super_block *super = NULL;
struct block *blk = NULL;
int ret;
global_super = malloc(sizeof(struct scoutfs_super_block));
if (!global_super) {
printf("error allocating super block buffer\n");
ret = -ENOMEM;
goto out;
}
ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM);
if (ret < 0) {
printf("error reading super block\n");
goto out;
}
super = block_buf(blk);
memcpy(global_super, super, sizeof(struct scoutfs_super_block));
ret = 0;
out:
block_put(&blk);
return ret;
}
void super_shutdown(void)
{
free(global_super);
}

9
utils/src/check/super.h Normal file
@@ -0,0 +1,9 @@
#ifndef _SCOUTFS_UTILS_CHECK_SUPER_H_
#define _SCOUTFS_UTILS_CHECK_SUPER_H_
extern struct scoutfs_super_block *global_super;
int check_supers(void);
void super_shutdown(void);
#endif

@@ -156,6 +156,16 @@ static inline void list_move_tail(struct list_head *list,
list_add_tail(list, head);
}
/**
* list_is_head - tests whether @list is the list @head
* @list: the entry to test
* @head: the head of the list
*/
static inline int list_is_head(const struct list_head *list, const struct list_head *head)
{
return list == head;
}
/**
* list_empty - tests whether a list is empty
* @head: the list to test.
@@ -242,6 +252,15 @@ static inline void list_splice_init(struct list_head *list,
for (pos = (head)->next, n = pos->next; pos != (head); \
pos = n, n = pos->next)
/**
* list_entry_is_head - test if the entry points to the head of the list
* @pos: the type * to cursor
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*/
#define list_entry_is_head(pos, head, member) \
(&pos->member == (head))
/**
* list_for_each_entry - iterate over list of given type
* @pos: the type * to use as a loop counter.
@@ -307,4 +326,28 @@ static inline void list_splice_init(struct list_head *list,
#define list_next_entry(pos, member) \
list_entry((pos)->member.next, typeof(*(pos)), member)
/**
* list_prev_entry - get the prev element in list
* @pos: the type * to cursor
* @member: the name of the list_head within the struct.
*/
#define list_prev_entry(pos, member) \
list_entry((pos)->member.prev, typeof(*(pos)), member)
/**
* list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
* @pos: the type * to use as a loop cursor.
* @n: another type * to use as temporary storage
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*
* Iterate backwards over list of given type, safe against removal
* of list entry.
*/
#define list_for_each_entry_safe_reverse(pos, n, head, member) \
for (pos = list_last_entry(head, typeof(*pos), member), \
n = list_prev_entry(pos, member); \
!list_entry_is_head(pos, head, member); \
pos = n, n = list_prev_entry(n, member))
#endif
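A small usage sketch of the newly imported reverse-safe iterator, freeing entries from the tail of a list (the entry type is illustrative and "list.h" is assumed to be the utils' imported copy):

    #include <stdlib.h>
    #include "list.h"

    struct example_entry {
        struct list_head head;
        int value;
    };

    static void free_all_from_tail(struct list_head *list)
    {
        struct example_entry *ent;
        struct example_entry *tmp;

        /* safe against freeing the current entry while walking backwards */
        list_for_each_entry_safe_reverse(ent, tmp, list, head) {
            list_del(&ent->head);
            free(ent);
        }
    }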

@@ -0,0 +1,24 @@
#ifndef _LK_RBTREE_WRAPPER_H_
#define _LK_RBTREE_WRAPPER_H_
/*
* We're using this lame hack to build and use the kernel's rbtree in
* userspace. We drop the kernel's rbtree*[ch] implementation in and
* use them with this wrapper. We only have to remove the kernel
* includes from the imported files.
*/
#include <stdbool.h>
#include "util.h"
#define rcu_assign_pointer(a, b) do { a = b; } while (0)
#define READ_ONCE(a) ({ a; })
#define WRITE_ONCE(a, b) do { a = b; } while (0)
#define unlikely(a) ({ a; })
#define EXPORT_SYMBOL(a) /* nop */
#include "rbtree_types.h"
#include "rbtree.h"
#include "rbtree_augmented.h"
#endif
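Because the kernel rbtree leaves insert and search to the caller, each user open-codes the descent; extent.c above is the real in-tree user, and the following is a minimal sketch keyed by an integer:

    #include <stdlib.h>
    #include "lk_rbtree_wrapper.h"

    struct example_node {
        struct rb_node rbnode;
        unsigned long long key;
    };

    /* returns 0 on success, -1 if the key is already present */
    static int example_insert(struct rb_root *root, struct example_node *ins)
    {
        struct rb_node **node = &root->rb_node;
        struct rb_node *parent = NULL;

        while (*node) {
            struct example_node *cur = container_of(*node, struct example_node, rbnode);

            parent = *node;
            if (ins->key < cur->key)
                node = &cur->rbnode.rb_left;
            else if (ins->key > cur->key)
                node = &cur->rbnode.rb_right;
            else
                return -1;
        }

        rb_link_node(&ins->rbnode, parent, node);
        rb_insert_color(&ins->rbnode, root);
        return 0;
    }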

@@ -609,6 +609,8 @@ static int print_alloc_list_block(int fd, char *str, struct scoutfs_block_ref *r
u64 blkno;
u64 start;
u64 len;
u64 st;
u64 nr;
int wid;
int ret;
int i;
@@ -627,27 +629,37 @@ static int print_alloc_list_block(int fd, char *str, struct scoutfs_block_ref *r
AL_REF_A(&lblk->next), le32_to_cpu(lblk->start),
le32_to_cpu(lblk->nr));
if (lblk->nr) {
wid = printf(" exts: ");
start = 0;
len = 0;
for (i = 0; i < le32_to_cpu(lblk->nr); i++) {
if (len == 0)
start = le64_to_cpu(lblk->blknos[i]);
len++;
if (i == (le32_to_cpu(lblk->nr) - 1) ||
start + len != le64_to_cpu(lblk->blknos[i + 1])) {
if (wid >= 72)
wid = printf("\n ");
wid += printf("%llu,%llu ", start, len);
len = 0;
}
}
printf("\n");
st = le32_to_cpu(lblk->start);
nr = le32_to_cpu(lblk->nr);
if (st >= SCOUTFS_ALLOC_LIST_MAX_BLOCKS ||
nr > SCOUTFS_ALLOC_LIST_MAX_BLOCKS ||
(st + nr) > SCOUTFS_ALLOC_LIST_MAX_BLOCKS) {
printf(" (invalid start and nr fields)\n");
goto out;
}
if (lblk->nr == 0)
goto out;
wid = printf(" exts: ");
start = 0;
len = 0;
for (i = 0; i < nr; i++) {
if (len == 0)
start = le64_to_cpu(lblk->blknos[st + i]);
len++;
if (i == (nr - 1) || (start + len) != le64_to_cpu(lblk->blknos[st + i + 1])) {
if (wid >= 72)
wid = printf("\n ");
wid += printf("%llu,%llu ", start, len);
len = 0;
}
}
printf("\n");
out:
next = lblk->next;
free(lblk);
return print_alloc_list_block(fd, str, &next);

629
utils/src/rbtree.c Normal file
@@ -0,0 +1,629 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
(C) 2002 David Woodhouse <dwmw2@infradead.org>
(C) 2012 Michel Lespinasse <walken@google.com>
linux/lib/rbtree.c
*/
#include "lk_rbtree_wrapper.h"
/*
* red-black trees properties: https://en.wikipedia.org/wiki/Rbtree
*
* 1) A node is either red or black
* 2) The root is black
* 3) All leaves (NULL) are black
* 4) Both children of every red node are black
* 5) Every simple path from root to leaves contains the same number
* of black nodes.
*
* 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
* consecutive red nodes in a path and every red node is therefore followed by
* a black. So if B is the number of black nodes on every simple path (as per
* 5), then the longest possible path due to 4 is 2B.
*
* We shall indicate color with case, where black nodes are uppercase and red
* nodes will be lowercase. Unknown color nodes shall be drawn as red within
* parentheses and have some accompanying text comment.
*/
/*
* Notes on lockless lookups:
*
* All stores to the tree structure (rb_left and rb_right) must be done using
* WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
* tree structure as seen in program order.
*
* These two requirements will allow lockless iteration of the tree -- not
* correct iteration mind you, tree rotations are not atomic so a lookup might
* miss entire subtrees.
*
* But they do guarantee that any such traversal will only see valid elements
* and that it will indeed complete -- does not get stuck in a loop.
*
* It also guarantees that if the lookup returns an element it is the 'correct'
* one. But not returning an element does _NOT_ mean it's not present.
*
* NOTE:
*
* Stores to __rb_parent_color are not important for simple lookups so those
* are left undone as of now. Nor did I check for loops involving parent
* pointers.
*/
static inline void rb_set_black(struct rb_node *rb)
{
rb->__rb_parent_color |= RB_BLACK;
}
static inline struct rb_node *rb_red_parent(struct rb_node *red)
{
return (struct rb_node *)red->__rb_parent_color;
}
/*
* Helper function for rotations:
* - old's parent and color get assigned to new
* - old gets assigned new as a parent and 'color' as a color.
*/
static inline void
__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
struct rb_root *root, int color)
{
struct rb_node *parent = rb_parent(old);
new->__rb_parent_color = old->__rb_parent_color;
rb_set_parent_color(old, new, color);
__rb_change_child(old, new, parent, root);
}
static __always_inline void
__rb_insert(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
while (true) {
/*
* Loop invariant: node is red.
*/
if (unlikely(!parent)) {
/*
* The inserted node is root. Either this is the
* first node, or we recursed at Case 1 below and
* are no longer violating 4).
*/
rb_set_parent_color(node, NULL, RB_BLACK);
break;
}
/*
* If there is a black parent, we are done.
* Otherwise, take some corrective action as,
* per 4), we don't want a red root or two
* consecutive red nodes.
*/
if(rb_is_black(parent))
break;
gparent = rb_red_parent(parent);
tmp = gparent->rb_right;
if (parent != tmp) { /* parent == gparent->rb_left */
if (tmp && rb_is_red(tmp)) {
/*
* Case 1 - node's uncle is red (color flips).
*
* G g
* / \ / \
* p u --> P U
* / /
* n n
*
* However, since g's parent might be red, and
* 4) does not allow this, we need to recurse
* at g.
*/
rb_set_parent_color(tmp, gparent, RB_BLACK);
rb_set_parent_color(parent, gparent, RB_BLACK);
node = gparent;
parent = rb_parent(node);
rb_set_parent_color(node, parent, RB_RED);
continue;
}
tmp = parent->rb_right;
if (node == tmp) {
/*
* Case 2 - node's uncle is black and node is
* the parent's right child (left rotate at parent).
*
* G G
* / \ / \
* p U --> n U
* \ /
* n p
*
* This still leaves us in violation of 4), the
* continuation into Case 3 will fix that.
*/
tmp = node->rb_left;
WRITE_ONCE(parent->rb_right, tmp);
WRITE_ONCE(node->rb_left, parent);
if (tmp)
rb_set_parent_color(tmp, parent,
RB_BLACK);
rb_set_parent_color(parent, node, RB_RED);
augment_rotate(parent, node);
parent = node;
tmp = node->rb_right;
}
/*
* Case 3 - node's uncle is black and node is
* the parent's left child (right rotate at gparent).
*
* G P
* / \ / \
* p U --> n g
* / \
* n U
*/
WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
WRITE_ONCE(parent->rb_right, gparent);
if (tmp)
rb_set_parent_color(tmp, gparent, RB_BLACK);
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
augment_rotate(gparent, parent);
break;
} else {
tmp = gparent->rb_left;
if (tmp && rb_is_red(tmp)) {
/* Case 1 - color flips */
rb_set_parent_color(tmp, gparent, RB_BLACK);
rb_set_parent_color(parent, gparent, RB_BLACK);
node = gparent;
parent = rb_parent(node);
rb_set_parent_color(node, parent, RB_RED);
continue;
}
tmp = parent->rb_left;
if (node == tmp) {
/* Case 2 - right rotate at parent */
tmp = node->rb_right;
WRITE_ONCE(parent->rb_left, tmp);
WRITE_ONCE(node->rb_right, parent);
if (tmp)
rb_set_parent_color(tmp, parent,
RB_BLACK);
rb_set_parent_color(parent, node, RB_RED);
augment_rotate(parent, node);
parent = node;
tmp = node->rb_left;
}
/* Case 3 - left rotate at gparent */
WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
WRITE_ONCE(parent->rb_left, gparent);
if (tmp)
rb_set_parent_color(tmp, gparent, RB_BLACK);
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
augment_rotate(gparent, parent);
break;
}
}
}
/*
* Inline version for rb_erase() use - we want to be able to inline
* and eliminate the dummy_rotate callback there
*/
static __always_inline void
____rb_erase_color(struct rb_node *parent, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;
while (true) {
/*
* Loop invariants:
* - node is black (or NULL on first iteration)
* - node is not the root (parent is not NULL)
* - All leaf paths going through parent and node have a
* black node count that is 1 lower than other leaf paths.
*/
sibling = parent->rb_right;
if (node != sibling) { /* node == parent->rb_left */
if (rb_is_red(sibling)) {
/*
* Case 1 - left rotate at parent
*
* P S
* / \ / \
* N s --> p Sr
* / \ / \
* Sl Sr N Sl
*/
tmp1 = sibling->rb_left;
WRITE_ONCE(parent->rb_right, tmp1);
WRITE_ONCE(sibling->rb_left, parent);
rb_set_parent_color(tmp1, parent, RB_BLACK);
__rb_rotate_set_parents(parent, sibling, root,
RB_RED);
augment_rotate(parent, sibling);
sibling = tmp1;
}
tmp1 = sibling->rb_right;
if (!tmp1 || rb_is_black(tmp1)) {
tmp2 = sibling->rb_left;
if (!tmp2 || rb_is_black(tmp2)) {
/*
* Case 2 - sibling color flip
* (p could be either color here)
*
* (p) (p)
* / \ / \
* N S --> N s
* / \ / \
* Sl Sr Sl Sr
*
* This leaves us violating 5) which
* can be fixed by flipping p to black
* if it was red, or by recursing at p.
* p is red when coming from Case 1.
*/
rb_set_parent_color(sibling, parent,
RB_RED);
if (rb_is_red(parent))
rb_set_black(parent);
else {
node = parent;
parent = rb_parent(node);
if (parent)
continue;
}
break;
}
/*
* Case 3 - right rotate at sibling
* (p could be either color here)
*
* (p) (p)
* / \ / \
* N S --> N sl
* / \ \
* sl Sr S
* \
* Sr
*
* Note: p might be red, and then both
* p and sl are red after rotation(which
* breaks property 4). This is fixed in
* Case 4 (in __rb_rotate_set_parents()
* which set sl the color of p
* and set p RB_BLACK)
*
* (p) (sl)
* / \ / \
* N sl --> P S
* \ / \
* S N Sr
* \
* Sr
*/
tmp1 = tmp2->rb_right;
WRITE_ONCE(sibling->rb_left, tmp1);
WRITE_ONCE(tmp2->rb_right, sibling);
WRITE_ONCE(parent->rb_right, tmp2);
if (tmp1)
rb_set_parent_color(tmp1, sibling,
RB_BLACK);
augment_rotate(sibling, tmp2);
tmp1 = sibling;
sibling = tmp2;
}
/*
* Case 4 - left rotate at parent + color flips
* (p and sl could be either color here.
* After rotation, p becomes black, s acquires
* p's color, and sl keeps its color)
*
* (p) (s)
* / \ / \
* N S --> P Sr
* / \ / \
* (sl) sr N (sl)
*/
tmp2 = sibling->rb_left;
WRITE_ONCE(parent->rb_right, tmp2);
WRITE_ONCE(sibling->rb_left, parent);
rb_set_parent_color(tmp1, sibling, RB_BLACK);
if (tmp2)
rb_set_parent(tmp2, parent);
__rb_rotate_set_parents(parent, sibling, root,
RB_BLACK);
augment_rotate(parent, sibling);
break;
} else {
sibling = parent->rb_left;
if (rb_is_red(sibling)) {
/* Case 1 - right rotate at parent */
tmp1 = sibling->rb_right;
WRITE_ONCE(parent->rb_left, tmp1);
WRITE_ONCE(sibling->rb_right, parent);
rb_set_parent_color(tmp1, parent, RB_BLACK);
__rb_rotate_set_parents(parent, sibling, root,
RB_RED);
augment_rotate(parent, sibling);
sibling = tmp1;
}
tmp1 = sibling->rb_left;
if (!tmp1 || rb_is_black(tmp1)) {
tmp2 = sibling->rb_right;
if (!tmp2 || rb_is_black(tmp2)) {
/* Case 2 - sibling color flip */
rb_set_parent_color(sibling, parent,
RB_RED);
if (rb_is_red(parent))
rb_set_black(parent);
else {
node = parent;
parent = rb_parent(node);
if (parent)
continue;
}
break;
}
/* Case 3 - left rotate at sibling */
tmp1 = tmp2->rb_left;
WRITE_ONCE(sibling->rb_right, tmp1);
WRITE_ONCE(tmp2->rb_left, sibling);
WRITE_ONCE(parent->rb_left, tmp2);
if (tmp1)
rb_set_parent_color(tmp1, sibling,
RB_BLACK);
augment_rotate(sibling, tmp2);
tmp1 = sibling;
sibling = tmp2;
}
/* Case 4 - right rotate at parent + color flips */
tmp2 = sibling->rb_right;
WRITE_ONCE(parent->rb_left, tmp2);
WRITE_ONCE(sibling->rb_right, parent);
rb_set_parent_color(tmp1, sibling, RB_BLACK);
if (tmp2)
rb_set_parent(tmp2, parent);
__rb_rotate_set_parents(parent, sibling, root,
RB_BLACK);
augment_rotate(parent, sibling);
break;
}
}
}
/* Non-inline version for rb_erase_augmented() use */
void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
____rb_erase_color(parent, root, augment_rotate);
}
EXPORT_SYMBOL(__rb_erase_color);
/*
* Non-augmented rbtree manipulation functions.
*
* We use dummy augmented callbacks here, and have the compiler optimize them
* out of the rb_insert_color() and rb_erase() function definitions.
*/
static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
static const struct rb_augment_callbacks dummy_callbacks = {
.propagate = dummy_propagate,
.copy = dummy_copy,
.rotate = dummy_rotate
};
void rb_insert_color(struct rb_node *node, struct rb_root *root)
{
__rb_insert(node, root, dummy_rotate);
}
EXPORT_SYMBOL(rb_insert_color);
void rb_erase(struct rb_node *node, struct rb_root *root)
{
struct rb_node *rebalance;
rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
if (rebalance)
____rb_erase_color(rebalance, root, dummy_rotate);
}
EXPORT_SYMBOL(rb_erase);
/*
* Augmented rbtree manipulation functions.
*
* This instantiates the same __always_inline functions as in the non-augmented
* case, but this time with user-defined callbacks.
*/
void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
__rb_insert(node, root, augment_rotate);
}
EXPORT_SYMBOL(__rb_insert_augmented);
/*
* This function returns the first node (in sort order) of the tree.
*/
struct rb_node *rb_first(const struct rb_root *root)
{
struct rb_node *n;
n = root->rb_node;
if (!n)
return NULL;
while (n->rb_left)
n = n->rb_left;
return n;
}
EXPORT_SYMBOL(rb_first);
struct rb_node *rb_last(const struct rb_root *root)
{
struct rb_node *n;
n = root->rb_node;
if (!n)
return NULL;
while (n->rb_right)
n = n->rb_right;
return n;
}
EXPORT_SYMBOL(rb_last);
struct rb_node *rb_next(const struct rb_node *node)
{
struct rb_node *parent;
if (RB_EMPTY_NODE(node))
return NULL;
/*
* If we have a right-hand child, go down and then left as far
* as we can.
*/
if (node->rb_right) {
node = node->rb_right;
while (node->rb_left)
node = node->rb_left;
return (struct rb_node *)node;
}
/*
* No right-hand children. Everything down and left is smaller than us,
* so any 'next' node must be in the general direction of our parent.
* Go up the tree; any time the ancestor is a right-hand child of its
* parent, keep going up. First time it's a left-hand child of its
* parent, said parent is our 'next' node.
*/
while ((parent = rb_parent(node)) && node == parent->rb_right)
node = parent;
return parent;
}
EXPORT_SYMBOL(rb_next);
struct rb_node *rb_prev(const struct rb_node *node)
{
struct rb_node *parent;
if (RB_EMPTY_NODE(node))
return NULL;
/*
* If we have a left-hand child, go down and then right as far
* as we can.
*/
if (node->rb_left) {
node = node->rb_left;
while (node->rb_right)
node = node->rb_right;
return (struct rb_node *)node;
}
/*
* No left-hand children. Go up till we find an ancestor which
* is a right-hand child of its parent.
*/
while ((parent = rb_parent(node)) && node == parent->rb_left)
node = parent;
return parent;
}
EXPORT_SYMBOL(rb_prev);
void rb_replace_node(struct rb_node *victim, struct rb_node *new,
struct rb_root *root)
{
struct rb_node *parent = rb_parent(victim);
/* Copy the pointers/colour from the victim to the replacement */
*new = *victim;
/* Set the surrounding nodes to point to the replacement */
if (victim->rb_left)
rb_set_parent(victim->rb_left, new);
if (victim->rb_right)
rb_set_parent(victim->rb_right, new);
__rb_change_child(victim, new, parent, root);
}
EXPORT_SYMBOL(rb_replace_node);
void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
struct rb_root *root)
{
struct rb_node *parent = rb_parent(victim);
/* Copy the pointers/colour from the victim to the replacement */
*new = *victim;
/* Set the surrounding nodes to point to the replacement */
if (victim->rb_left)
rb_set_parent(victim->rb_left, new);
if (victim->rb_right)
rb_set_parent(victim->rb_right, new);
/* Set the parent's pointer to the new node last after an RCU barrier
* so that the pointers onwards are seen to be set correctly when doing
* an RCU walk over the tree.
*/
__rb_change_child_rcu(victim, new, parent, root);
}
EXPORT_SYMBOL(rb_replace_node_rcu);
static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
{
for (;;) {
if (node->rb_left)
node = node->rb_left;
else if (node->rb_right)
node = node->rb_right;
else
return (struct rb_node *)node;
}
}
struct rb_node *rb_next_postorder(const struct rb_node *node)
{
const struct rb_node *parent;
if (!node)
return NULL;
parent = rb_parent(node);
/* If we're sitting on node, we've already seen our children */
if (parent && node == parent->rb_left && parent->rb_right) {
/* If we are the parent's left node, go to the parent's right
* node then all the way down to the left */
return rb_left_deepest_node(parent->rb_right);
} else
/* Otherwise we are the parent's right node, and the parent
* should be next */
return (struct rb_node *)parent;
}
EXPORT_SYMBOL(rb_next_postorder);
struct rb_node *rb_first_postorder(const struct rb_root *root)
{
if (!root->rb_node)
return NULL;
return rb_left_deepest_node(root->rb_node);
}
EXPORT_SYMBOL(rb_first_postorder);

328
utils/src/rbtree.h Normal file
@@ -0,0 +1,328 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
linux/include/linux/rbtree.h
To use rbtrees you'll have to implement your own insert and search cores.
This will avoid us to use callbacks and to drop drammatically performances.
I know it's not the cleaner way, but in C (not in C++) to get
performances and genericity...
See Documentation/core-api/rbtree.rst for documentation and samples.
*/
#ifndef _LINUX_RBTREE_H
#define _LINUX_RBTREE_H
#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
#define rb_entry(ptr, type, member) container_of(ptr, type, member)
#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL)
/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
#define RB_EMPTY_NODE(node) \
((node)->__rb_parent_color == (unsigned long)(node))
#define RB_CLEAR_NODE(node) \
((node)->__rb_parent_color = (unsigned long)(node))
extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);
/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);
/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);
/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
struct rb_node **rb_link)
{
node->__rb_parent_color = (unsigned long)parent;
node->rb_left = node->rb_right = NULL;
*rb_link = node;
}
static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
struct rb_node **rb_link)
{
node->__rb_parent_color = (unsigned long)parent;
node->rb_left = node->rb_right = NULL;
rcu_assign_pointer(*rb_link, node);
}
#define rb_entry_safe(ptr, type, member) \
({ typeof(ptr) ____ptr = (ptr); \
____ptr ? rb_entry(____ptr, type, member) : NULL; \
})
/**
* rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
* given type allowing the backing memory of @pos to be invalidated
*
* @pos: the 'type *' to use as a loop cursor.
* @n: another 'type *' to use as temporary storage
* @root: 'rb_root *' of the rbtree.
* @field: the name of the rb_node field within 'type'.
*
* rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
* list_for_each_entry_safe() and allows the iteration to continue independent
* of changes to @pos by the body of the loop.
*
* Note, however, that it cannot handle other modifications that re-order the
* rbtree it is iterating over. This includes calling rb_erase() on @pos, as
* rb_erase() may rebalance the tree, causing us to miss some nodes.
*/
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
typeof(*pos), field); 1; }); \
pos = n)
/* Same as rb_first(), but O(1) */
#define rb_first_cached(root) (root)->rb_leftmost
static inline void rb_insert_color_cached(struct rb_node *node,
struct rb_root_cached *root,
bool leftmost)
{
if (leftmost)
root->rb_leftmost = node;
rb_insert_color(node, &root->rb_root);
}
static inline struct rb_node *
rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
{
struct rb_node *leftmost = NULL;
if (root->rb_leftmost == node)
leftmost = root->rb_leftmost = rb_next(node);
rb_erase(node, &root->rb_root);
return leftmost;
}
static inline void rb_replace_node_cached(struct rb_node *victim,
struct rb_node *new,
struct rb_root_cached *root)
{
if (root->rb_leftmost == victim)
root->rb_leftmost = new;
rb_replace_node(victim, new, &root->rb_root);
}
/*
* The below helper functions use 2 operators with 3 different
* calling conventions. The operators are related like:
*
* comp(a->key,b) < 0 := less(a,b)
* comp(a->key,b) > 0 := less(b,a)
* comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
*
* If these operators define a partial order on the elements we make no
* guarantee on which of the elements matching the key is found. See
* rb_find().
*
* The reason for this is to allow the find() interface without requiring an
* on-stack dummy object, which might not be feasible due to object size.
*/
/**
* rb_add_cached() - insert @node into the leftmost cached tree @tree
* @node: node to insert
* @tree: leftmost cached tree to insert @node into
* @less: operator defining the (partial) node order
*
* Returns @node when it is the new leftmost, or NULL.
*/
static __always_inline struct rb_node *
rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
bool (*less)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_root.rb_node;
struct rb_node *parent = NULL;
bool leftmost = true;
while (*link) {
parent = *link;
if (less(node, parent)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = false;
}
}
rb_link_node(node, parent, link);
rb_insert_color_cached(node, tree, leftmost);
return leftmost ? node : NULL;
}
/**
* rb_add() - insert @node into @tree
* @node: node to insert
* @tree: tree to insert @node into
* @less: operator defining the (partial) node order
*/
static __always_inline void
rb_add(struct rb_node *node, struct rb_root *tree,
bool (*less)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_node;
struct rb_node *parent = NULL;
while (*link) {
parent = *link;
if (less(node, parent))
link = &parent->rb_left;
else
link = &parent->rb_right;
}
rb_link_node(node, parent, link);
rb_insert_color(node, tree);
}
/**
* rb_find_add() - find equivalent @node in @tree, or add @node
* @node: node to look-for / insert
* @tree: tree to search / modify
* @cmp: operator defining the node order
*
* Returns the rb_node matching @node, or NULL when no match is found and @node
* is inserted.
*/
static __always_inline struct rb_node *
rb_find_add(struct rb_node *node, struct rb_root *tree,
int (*cmp)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_node;
struct rb_node *parent = NULL;
int c;
while (*link) {
parent = *link;
c = cmp(node, parent);
if (c < 0)
link = &parent->rb_left;
else if (c > 0)
link = &parent->rb_right;
else
return parent;
}
rb_link_node(node, parent, link);
rb_insert_color(node, tree);
return NULL;
}
/**
* rb_find() - find @key in tree @tree
* @key: key to match
* @tree: tree to search
* @cmp: operator defining the node order
*
* Returns the rb_node matching @key or NULL.
*/
static __always_inline struct rb_node *
rb_find(const void *key, const struct rb_root *tree,
int (*cmp)(const void *key, const struct rb_node *))
{
struct rb_node *node = tree->rb_node;
while (node) {
int c = cmp(key, node);
if (c < 0)
node = node->rb_left;
else if (c > 0)
node = node->rb_right;
else
return node;
}
return NULL;
}
/**
* rb_find_first() - find the first @key in @tree
* @key: key to match
* @tree: tree to search
* @cmp: operator defining node order
*
* Returns the leftmost node matching @key, or NULL.
*/
static __always_inline struct rb_node *
rb_find_first(const void *key, const struct rb_root *tree,
int (*cmp)(const void *key, const struct rb_node *))
{
struct rb_node *node = tree->rb_node;
struct rb_node *match = NULL;
while (node) {
int c = cmp(key, node);
if (c <= 0) {
if (!c)
match = node;
node = node->rb_left;
} else if (c > 0) {
node = node->rb_right;
}
}
return match;
}
/**
* rb_next_match() - find the next node matching @key after @node
* @key: key to match
* @node: current node matching @key
* @cmp: operator defining node order
*
* Returns the next node matching @key, or NULL.
*/
static __always_inline struct rb_node *
rb_next_match(const void *key, struct rb_node *node,
int (*cmp)(const void *key, const struct rb_node *))
{
node = rb_next(node);
if (node && cmp(key, node))
node = NULL;
return node;
}
/**
* rb_for_each() - iterates a subtree matching @key
* @node: iterator
* @key: key to match
* @tree: tree to search
* @cmp: operator defining node order
*/
#define rb_for_each(node, key, tree, cmp) \
for ((node) = rb_find_first((key), (tree), (cmp)); \
(node); (node) = rb_next_match((key), (node), (cmp)))
#endif /* _LINUX_RBTREE_H */
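Illustrative only, not from this change: the comment block above describes the two comparison conventions, less(a, b) for linking two nodes at insert time and cmp(key, node) for lookups, so callers don't need an on-stack dummy object. A minimal sketch under those conventions, with a hypothetical struct extent keyed by its start offset:

#include <stdbool.h>

#include "rbtree.h"

/* hypothetical example type keyed by start, for illustration only */
struct extent {
	unsigned long long start;
	struct rb_node node;
};

/* less(a, b): used by rb_add() while linking two nodes */
static bool extent_less(struct rb_node *a, const struct rb_node *b)
{
	return rb_entry(a, struct extent, node)->start <
	       rb_entry(b, struct extent, node)->start;
}

/* cmp(key, node): used by rb_find(), no dummy node needed */
static int extent_cmp_key(const void *key, const struct rb_node *node)
{
	unsigned long long start = *(const unsigned long long *)key;
	const struct extent *ext = rb_entry(node, struct extent, node);

	if (start < ext->start)
		return -1;
	if (start > ext->start)
		return 1;
	return 0;
}

static void extent_insert(struct rb_root *tree, struct extent *ext)
{
	rb_add(&ext->node, tree, extent_less);
}

static struct extent *extent_lookup(struct rb_root *tree,
				    unsigned long long start)
{
	struct rb_node *found = rb_find(&start, tree, extent_cmp_key);

	return found ? rb_entry(found, struct extent, node) : NULL;
}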

313
utils/src/rbtree_augmented.h Normal file

@@ -0,0 +1,313 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
(C) 2002 David Woodhouse <dwmw2@infradead.org>
(C) 2012 Michel Lespinasse <walken@google.com>
linux/include/linux/rbtree_augmented.h
*/
#ifndef _LINUX_RBTREE_AUGMENTED_H
#define _LINUX_RBTREE_AUGMENTED_H
/*
* Please note - only struct rb_augment_callbacks and the prototypes for
* rb_insert_augmented() and rb_erase_augmented() are intended to be public.
* The rest are implementation details you are not expected to depend on.
*
* See Documentation/core-api/rbtree.rst for documentation and samples.
*/
struct rb_augment_callbacks {
void (*propagate)(struct rb_node *node, struct rb_node *stop);
void (*copy)(struct rb_node *old, struct rb_node *new);
void (*rotate)(struct rb_node *old, struct rb_node *new);
};
extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
/*
* Fixup the rbtree and update the augmented information when rebalancing.
*
* On insertion, the user must update the augmented information on the path
* leading to the inserted node, then call rb_link_node() as usual and
* rb_insert_augmented() instead of the usual rb_insert_color() call.
* If rb_insert_augmented() rebalances the rbtree, it will callback into
* a user provided function to update the augmented information on the
* affected subtrees.
*/
static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
__rb_insert_augmented(node, root, augment->rotate);
}
static inline void
rb_insert_augmented_cached(struct rb_node *node,
struct rb_root_cached *root, bool newleft,
const struct rb_augment_callbacks *augment)
{
if (newleft)
root->rb_leftmost = node;
rb_insert_augmented(node, &root->rb_root, augment);
}
/*
* Template for declaring augmented rbtree callbacks (generic case)
*
* RBSTATIC: 'static' or empty
* RBNAME: name of the rb_augment_callbacks structure
* RBSTRUCT: struct type of the tree nodes
* RBFIELD: name of struct rb_node field within RBSTRUCT
* RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
* RBCOMPUTE: name of function that recomputes the RBAUGMENTED data
*/
#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \
RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \
static inline void \
RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \
{ \
while (rb != stop) { \
RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \
if (RBCOMPUTE(node, true)) \
break; \
rb = rb_parent(&node->RBFIELD); \
} \
} \
static inline void \
RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \
{ \
RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \
RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \
new->RBAUGMENTED = old->RBAUGMENTED; \
} \
static void \
RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \
{ \
RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \
RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \
new->RBAUGMENTED = old->RBAUGMENTED; \
RBCOMPUTE(old, false); \
} \
RBSTATIC const struct rb_augment_callbacks RBNAME = { \
.propagate = RBNAME ## _propagate, \
.copy = RBNAME ## _copy, \
.rotate = RBNAME ## _rotate \
};
/*
* Template for declaring augmented rbtree callbacks,
* computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
*
* RBSTATIC: 'static' or empty
* RBNAME: name of the rb_augment_callbacks structure
* RBSTRUCT: struct type of the tree nodes
* RBFIELD: name of struct rb_node field within RBSTRUCT
* RBTYPE: type of the RBAUGMENTED field
* RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
* RBCOMPUTE: name of function that returns the per-node RBTYPE scalar
*/
#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \
RBTYPE, RBAUGMENTED, RBCOMPUTE) \
static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \
{ \
RBSTRUCT *child; \
RBTYPE max = RBCOMPUTE(node); \
if (node->RBFIELD.rb_left) { \
child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \
if (child->RBAUGMENTED > max) \
max = child->RBAUGMENTED; \
} \
if (node->RBFIELD.rb_right) { \
child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \
if (child->RBAUGMENTED > max) \
max = child->RBAUGMENTED; \
} \
if (exit && node->RBAUGMENTED == max) \
return true; \
node->RBAUGMENTED = max; \
return false; \
} \
RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \
RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)
#define RB_RED 0
#define RB_BLACK 1
#define __rb_parent(pc) ((struct rb_node *)(pc & ~3))
#define __rb_color(pc) ((pc) & 1)
#define __rb_is_black(pc) __rb_color(pc)
#define __rb_is_red(pc) (!__rb_color(pc))
#define rb_color(rb) __rb_color((rb)->__rb_parent_color)
#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color)
#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color)
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
}
static inline void rb_set_parent_color(struct rb_node *rb,
struct rb_node *p, int color)
{
rb->__rb_parent_color = (unsigned long)p | color;
}
static inline void
__rb_change_child(struct rb_node *old, struct rb_node *new,
struct rb_node *parent, struct rb_root *root)
{
if (parent) {
if (parent->rb_left == old)
WRITE_ONCE(parent->rb_left, new);
else
WRITE_ONCE(parent->rb_right, new);
} else
WRITE_ONCE(root->rb_node, new);
}
static inline void
__rb_change_child_rcu(struct rb_node *old, struct rb_node *new,
struct rb_node *parent, struct rb_root *root)
{
if (parent) {
if (parent->rb_left == old)
rcu_assign_pointer(parent->rb_left, new);
else
rcu_assign_pointer(parent->rb_right, new);
} else
rcu_assign_pointer(root->rb_node, new);
}
extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
struct rb_node *child = node->rb_right;
struct rb_node *tmp = node->rb_left;
struct rb_node *parent, *rebalance;
unsigned long pc;
if (!tmp) {
/*
* Case 1: node to erase has no more than 1 child (easy!)
*
* Note that if there is one child it must be red due to 5)
* and node must be black due to 4). We adjust colors locally
* so as to bypass __rb_erase_color() later on.
*/
pc = node->__rb_parent_color;
parent = __rb_parent(pc);
__rb_change_child(node, child, parent, root);
if (child) {
child->__rb_parent_color = pc;
rebalance = NULL;
} else
rebalance = __rb_is_black(pc) ? parent : NULL;
tmp = parent;
} else if (!child) {
/* Still case 1, but this time the child is node->rb_left */
tmp->__rb_parent_color = pc = node->__rb_parent_color;
parent = __rb_parent(pc);
__rb_change_child(node, tmp, parent, root);
rebalance = NULL;
tmp = parent;
} else {
struct rb_node *successor = child, *child2;
tmp = child->rb_left;
if (!tmp) {
/*
* Case 2: node's successor is its right child
*
*    (n)          (s)
*    / \          / \
*  (x) (s)  ->  (x) (c)
*        \
*        (c)
*/
parent = successor;
child2 = successor->rb_right;
augment->copy(node, successor);
} else {
/*
* Case 3: node's successor is leftmost under
* node's right child subtree
*
*    (n)          (s)
*    / \          / \
*  (x) (y)  ->  (x) (y)
*      /            /
*    (p)          (p)
*    /            /
*  (s)          (c)
*    \
*    (c)
*/
do {
parent = successor;
successor = tmp;
tmp = tmp->rb_left;
} while (tmp);
child2 = successor->rb_right;
WRITE_ONCE(parent->rb_left, child2);
WRITE_ONCE(successor->rb_right, child);
rb_set_parent(child, successor);
augment->copy(node, successor);
augment->propagate(parent, successor);
}
tmp = node->rb_left;
WRITE_ONCE(successor->rb_left, tmp);
rb_set_parent(tmp, successor);
pc = node->__rb_parent_color;
tmp = __rb_parent(pc);
__rb_change_child(node, successor, tmp, root);
if (child2) {
rb_set_parent_color(child2, parent, RB_BLACK);
rebalance = NULL;
} else {
rebalance = rb_is_black(successor) ? parent : NULL;
}
successor->__rb_parent_color = pc;
tmp = successor;
}
augment->propagate(tmp, NULL);
return rebalance;
}
static __always_inline void
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
if (rebalance)
__rb_erase_color(rebalance, root, augment->rotate);
}
static __always_inline void
rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
const struct rb_augment_callbacks *augment)
{
if (root->rb_leftmost == node)
root->rb_leftmost = rb_next(node);
rb_erase_augmented(node, &root->rb_root, augment);
}
#endif /* _LINUX_RBTREE_AUGMENTED_H */
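Illustrative only, not from this change: RB_DECLARE_CALLBACKS_MAX() above generates the propagate/copy/rotate callbacks that keep a per-subtree maximum correct across rebalancing; the caller updates the value on the way down, then links and calls rb_insert_augmented(). A minimal sketch with a hypothetical struct range_node whose subtrees track the largest 'last' value, in the style of the kernel's interval trees:

#include <stdbool.h>

#include "rbtree.h"
#include "rbtree_augmented.h"

/* hypothetical augmented node: max_last caches the subtree's largest 'last' */
struct range_node {
	unsigned long long start;
	unsigned long long last;
	unsigned long long max_last;
	struct rb_node node;
};

static unsigned long long range_node_last(struct range_node *rn)
{
	return rn->last;
}

RB_DECLARE_CALLBACKS_MAX(static, range_callbacks,
			 struct range_node, node,
			 unsigned long long, max_last, range_node_last)

/* update max_last on the path down, then link and insert with the callbacks */
static void range_insert(struct range_node *rn, struct rb_root *root)
{
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = NULL;
	struct range_node *cur;

	while (*link) {
		parent = *link;
		cur = rb_entry(parent, struct range_node, node);
		if (cur->max_last < rn->last)
			cur->max_last = rn->last;
		if (rn->start < cur->start)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}

	rn->max_last = rn->last;
	rb_link_node(&rn->node, parent, link);
	rb_insert_augmented(&rn->node, root, &range_callbacks);
}

static void range_erase(struct range_node *rn, struct rb_root *root)
{
	rb_erase_augmented(&rn->node, root, &range_callbacks);
}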

34
utils/src/rbtree_types.h Normal file

@@ -0,0 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_RBTREE_TYPES_H
#define _LINUX_RBTREE_TYPES_H
struct rb_node {
unsigned long __rb_parent_color;
struct rb_node *rb_right;
struct rb_node *rb_left;
} __attribute__((aligned(sizeof(long))));
/* The alignment might seem pointless, but allegedly CRIS needs it */
struct rb_root {
struct rb_node *rb_node;
};
/*
* Leftmost-cached rbtrees.
*
* We do not cache the rightmost node based on footprint
* size vs number of potential users that could benefit
* from O(1) rb_last(). Just not worth it, users that want
* this feature can always implement the logic explicitly.
* Furthermore, users that want to cache both pointers may
* find it a bit asymmetric, but that's ok.
*/
struct rb_root_cached {
struct rb_root rb_root;
struct rb_node *rb_leftmost;
};
#define RB_ROOT (struct rb_root) { NULL, }
#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
#endif
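Illustrative only, not from this change: struct rb_root_cached pairs the root with an rb_leftmost pointer, and the *_cached helpers in rbtree.h keep that pointer current, so peeking at the smallest element is O(1). A minimal sketch, assuming a hypothetical deadline-ordered queue:

#include <stdbool.h>
#include <stddef.h>

#include "rbtree_types.h"
#include "rbtree.h"

/* hypothetical item ordered by deadline, for illustration only */
struct timer_item {
	unsigned long long deadline;
	struct rb_node node;
};

static struct rb_root_cached timer_queue = RB_ROOT_CACHED;

static bool timer_less(struct rb_node *a, const struct rb_node *b)
{
	return rb_entry(a, struct timer_item, node)->deadline <
	       rb_entry(b, struct timer_item, node)->deadline;
}

/* rb_add_cached() updates rb_leftmost when the new node sorts first */
static void timer_add(struct timer_item *ti)
{
	rb_add_cached(&ti->node, &timer_queue, timer_less);
}

/* O(1) peek at the earliest deadline via the cached leftmost pointer */
static struct timer_item *timer_peek(void)
{
	struct rb_node *first = rb_first_cached(&timer_queue);

	return first ? rb_entry(first, struct timer_item, node) : NULL;
}

static void timer_del(struct timer_item *ti)
{
	rb_erase_cached(&ti->node, &timer_queue);
}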