Add nr_log_trees debugfs counter

Signed-off-by: Zach Brown <zab@versity.com>
Force merge creation timeout
2026-06-09 21:22:36 +00:00 · 2024-01-25 12:48:27 -08:00 · 2024-01-25 12:46:46 -08:00
46 changed files with 344 additions and 4797 deletions
@@ -1,20 +1,6 @@
 Versity ScoutFS Release Notes
 =============================

---
-v1.19
-\
-*Jan 30, 2024*
-
-Added the log\_merge\_wait\_timeout\_ms mount option to set the timeout
-for creating log merge operations.  The previous timeout, now the
-default, was too short for some systems and was resulting in consistent
-timeouts which created an excessive number of log trees waiting to be
-merged.
-
-Improved performance of many in-mount server operations when there are a
-large number of log trees waiting to be merged.
-
 ---
 v1.18
 \
@@ -2029,253 +2029,187 @@ int scoutfs_btree_rebalance(struct super_block *sb,
 			  key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL);
 }

-struct merged_range {
-	struct scoutfs_key start;
-	struct scoutfs_key end;
-	struct rb_root root;
-	int size;
-};
-
-struct merged_item {
+struct merge_pos {
 	struct rb_node node;
-	struct scoutfs_key key;
+	struct scoutfs_btree_root *root;
+	struct scoutfs_block *bl;
+	struct scoutfs_btree_block *bt;
+	struct scoutfs_avl_node *avl;
+	struct scoutfs_key *key;
 	u64 seq;
 	u8 flags;
 	unsigned int val_len;
-	u8 val[0];
+	u8 *val;
 };

-static inline struct merged_item *mitem_container(struct rb_node *node)
+static struct merge_pos *first_mpos(struct rb_root *root)
 {
-	return node ? container_of(node, struct merged_item, node) : NULL;
-}
-
-static inline struct merged_item *first_mitem(struct rb_root *root)
-{
-	return mitem_container(rb_first(root));
-}
-
-static inline struct merged_item *last_mitem(struct rb_root *root)
-{
-	return mitem_container(rb_last(root));
-}
-
-static inline struct merged_item *next_mitem(struct merged_item *mitem)
-{
-	return mitem_container(mitem ? rb_next(&mitem->node) : NULL);
-}
-
-static inline struct merged_item *prev_mitem(struct merged_item *mitem)
-{
-	return mitem_container(mitem ? rb_prev(&mitem->node) : NULL);
-}
-
-static struct merged_item *find_mitem(struct rb_root *root, struct scoutfs_key *key,
-				      struct rb_node **parent_ret, struct rb_node ***link_ret)
-{
-	struct rb_node **node = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct merged_item *mitem;
-	int cmp;
-
-	while (*node) {
-		parent = *node;
-		mitem = container_of(*node, struct merged_item, node);
-
-		cmp = scoutfs_key_compare(key, &mitem->key);
-
-		if (cmp < 0) {
-			node = &(*node)->rb_left;
-		} else if (cmp > 0) {
-			node = &(*node)->rb_right;
-		} else {
-			*parent_ret = NULL;
-			*link_ret = NULL;
-			return mitem;
-		}
-	}
-
-	*parent_ret = parent;
-	*link_ret = node;
+	struct rb_node *node = rb_first(root);
+	if (node)
+		 return container_of(node, struct merge_pos, node);
 	return NULL;
 }

-static void insert_mitem(struct merged_range *rng, struct merged_item *mitem,
-			 struct rb_node *parent, struct rb_node **link)
+static struct merge_pos *next_mpos(struct merge_pos *mpos)
 {
-	rb_link_node(&mitem->node, parent, link);
-	rb_insert_color(&mitem->node, &rng->root);
-	rng->size += item_len_bytes(mitem->val_len);
+	struct rb_node *node;
+
+	if (mpos && (node = rb_next(&mpos->node)))
+		return container_of(node, struct merge_pos, node);
+	else
+		return NULL;
 }

-static void replace_mitem(struct merged_range *rng, struct merged_item *victim,
-				struct merged_item *new)
+static void free_mpos(struct super_block *sb, struct merge_pos *mpos)
 {
-	rb_replace_node(&victim->node, &new->node, &rng->root);
-	RB_CLEAR_NODE(&victim->node);
-	rng->size -= item_len_bytes(victim->val_len);
-	rng->size += item_len_bytes(new->val_len);
+	scoutfs_block_put(sb, mpos->bl);
+	kfree(mpos);
 }

-static void free_mitem(struct merged_range *rng, struct merged_item *mitem)
+static void insert_mpos(struct rb_root *pos_root, struct merge_pos *ins)
 {
-	if (IS_ERR_OR_NULL(mitem))
-		return;
+	struct rb_node **node = &pos_root->rb_node;
+	struct rb_node *parent = NULL;
+	struct merge_pos *mpos;
+	int cmp;

-	if (!RB_EMPTY_NODE(&mitem->node)) {
-		rng->size -= item_len_bytes(mitem->val_len);
-		rb_erase(&mitem->node, &rng->root);
+	parent = NULL;
+	while (*node) {
+		parent = *node;
+		mpos = container_of(*node, struct merge_pos, node);
+
+		/* sort merge items by key then newest to oldest */
+		cmp = scoutfs_key_compare(ins->key, mpos->key) ?:
+		      -scoutfs_cmp(ins->seq, mpos->seq);
+
+		if (cmp < 0)
+			node = &(*node)->rb_left;
+		else
+			node = &(*node)->rb_right;
 	}

-	kfree(mitem);
-}
-
-static void trim_range_size(struct merged_range *rng, int merge_window)
-{
-	struct merged_item *mitem;
-	struct merged_item *tmp;
-
-	mitem = last_mitem(&rng->root);
-	while (mitem && rng->size > merge_window) {
-
-		rng->end = mitem->key;
-		scoutfs_key_dec(&rng->end);
-
-		tmp = mitem;
-		mitem = prev_mitem(mitem);
-		free_mitem(rng, tmp);
-	}
-}
-
-static void trim_range_end(struct merged_range *rng)
-{
-	struct merged_item *mitem;
-	struct merged_item *tmp;
-
-	mitem = last_mitem(&rng->root);
-	while (mitem && scoutfs_key_compare(&mitem->key, &rng->end) > 0) {
-		tmp = mitem;
-		mitem = prev_mitem(mitem);
-		free_mitem(rng, tmp);
-	}
+	rb_link_node(&ins->node, parent, node);
+	rb_insert_color(&ins->node, pos_root);
 }

 /*
- * Record and combine logged items from log roots for merging with the
- * writable destination root.  The caller is responsible for trimming
- * the range if it gets too large or if the key range shrinks.
+ * Find the next item in the merge_pos root in the caller's range and
+ * insert it into the rbtree sorted by key and version so that merging
+ * can find the next newest item at the front of the rbtree.  We free
+ * the mpos on error or if there are no more items in the range.
 */
-static int merge_read_item(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags,
-			   void *val, int val_len, void *arg)
+static int reset_mpos(struct super_block *sb, struct rb_root *pos_root, struct merge_pos *mpos,
+		      struct scoutfs_key *start, struct scoutfs_key *end)
 {
-	struct merged_range *rng = arg;
-	struct merged_item *mitem;
-	struct merged_item *found;
-	struct rb_node *parent;
-	struct rb_node **link;
-	int ret;
+	struct scoutfs_btree_item *item;
+	struct scoutfs_avl_node *next;
+	struct btree_walk_key_range kr;
+	struct scoutfs_key walk_key;
+	int ret = 0;

-	found = find_mitem(&rng->root, key, &parent, &link);
-	if (found) {
-		ret = scoutfs_forest_combine_deltas(key, found->val, found->val_len, val, val_len);
-		if (ret < 0)
-			goto out;
-		if (ret > 0) {
-			if (ret == SCOUTFS_DELTA_COMBINED) {
-				scoutfs_inc_counter(sb, btree_merge_delta_combined);
-			} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
-				scoutfs_inc_counter(sb, btree_merge_delta_null);
-				free_mitem(rng, found);
-			}
-			ret = 0;
-			goto out;
-		}
-
-		if (found->seq >= seq) {
-			ret = 0;
-			goto out;
-		}
+	/* always erase before freeing or inserting */
+	if (!RB_EMPTY_NODE(&mpos->node)) {
+		rb_erase(&mpos->node, pos_root);
+		RB_CLEAR_NODE(&mpos->node);
 	}

-	mitem = kmalloc(offsetof(struct merged_item, val[val_len]), GFP_NOFS);
-	if (!mitem) {
-		ret = -ENOMEM;
+	/*
+	 * advance to next item via the avl tree.  The caller's pos is
+	 * only ever incremented past the last key so we can use next to
+	 * iterate rather than using search to skip past multiple items.
+	 */
+	if (mpos->avl)
+		mpos->avl = scoutfs_avl_next(&mpos->bt->item_root, mpos->avl);
+
+	/* find the next leaf with the key if we run out of items */
+	walk_key = *start;
+	while (!mpos->avl && !scoutfs_key_is_zeros(&walk_key)) {
+		scoutfs_block_put(sb, mpos->bl);
+		mpos->bl = NULL;
+		ret = btree_walk(sb, NULL, NULL, mpos->root, BTW_NEXT, &walk_key,
+				 0, &mpos->bl, &kr, NULL);
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				ret = 0;
+			free_mpos(sb, mpos);
+			goto out;
+		}
+		mpos->bt = mpos->bl->data;
+
+		mpos->avl = scoutfs_avl_search(&mpos->bt->item_root, cmp_key_item,
+					       start, NULL, NULL, &next, NULL) ?: next;
+		if (mpos->avl == NULL)
+			walk_key = kr.iter_next;
+	}
+
+	/* see if we're out of items within the range */
+	item = node_item(mpos->avl);
+	if (!item || scoutfs_key_compare(item_key(item), end) > 0) {
+		free_mpos(sb, mpos);
+		ret = 0;
 		goto out;
 	}

-	mitem->key = *key;
-	mitem->seq = seq;
-	mitem->flags = flags;
-	mitem->val_len = val_len;
-	if (val_len)
-		memcpy(mitem->val, val, val_len);
-
-	if (found) {
-		replace_mitem(rng, found, mitem);
-		free_mitem(rng, found);
-	} else {
-		insert_mitem(rng, mitem, parent, link);
-	}
+	/* insert the next item within range at its version */
+	mpos->key = item_key(item);
+	mpos->seq = le64_to_cpu(item->seq);
+	mpos->flags = item->flags;
+	mpos->val_len = item_val_len(item);
+	mpos->val = item_val(mpos->bt, item);

+	insert_mpos(pos_root, mpos);
 	ret = 0;
 out:
 	return ret;
 }

 /*
- * Read a range of merged items.  The caller has set the key bounds of
- * the range.  We read a merge window's worth of items from blocks in
- * each input btree.
+ * The caller has reset all the merge positions for all the input log
+ * btree roots and wants the next logged item it should try to merge
+ * with the items in the fs_root.
 *
- * The caller can only use the smallest range that overlaps with all the
- * blocks that we read.  We start reading from the range's start key so
- * it will always be present and we don't need to adjust it.  The final
- * block we read from each input might not cover the range's end so it
- * needs to be adjusted.
- *
- * The end range can also shrink if we have to drop items because the
- * items exceeded the merge window size.
+ * We look ahead in the logged item stream to see if we should merge any
+ * older logged delta items into one result for the caller.  We also
+ * take this opportunity to skip and reset the mpos for any older
+ * versions of the first item.
 */
-static int read_merged_range(struct super_block *sb, struct merged_range *rng,
-			     struct list_head *inputs, int merge_window)
+static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root,
+			      struct scoutfs_key *end, struct merge_pos **mpos_ret)
 {
-	struct scoutfs_btree_root_head *rhead;
-	struct scoutfs_key start;
-	struct scoutfs_key end;
+	struct merge_pos *mpos;
+	struct merge_pos *next;
 	struct scoutfs_key key;
 	int ret = 0;
-	int i;

-	list_for_each_entry(rhead, inputs, head) {
-		key = rng->start;
+	while ((mpos = first_mpos(pos_root)) && (next = next_mpos(mpos)) &&
+	       !scoutfs_key_compare(mpos->key, next->key)) {

-		for (i = 0; i < merge_window; i += SCOUTFS_BLOCK_LG_SIZE) {
-			start = key;
-			end = rng->end;
-			ret = scoutfs_btree_read_items(sb, &rhead->root, &key, &start, &end,
-						       merge_read_item, rng);
+		ret = scoutfs_forest_combine_deltas(mpos->key, mpos->val, mpos->val_len,
+						    next->val, next->val_len);
+		if (ret < 0)
+			break;
+
+		/* reset advances to the next item */
+		key = *mpos->key;
+		scoutfs_key_inc(&key);
+
+		/* always skip next combined or older version */
+		ret = reset_mpos(sb, pos_root, next, &key, end);
+		if (ret < 0)
+			break;
+
+		if (ret == SCOUTFS_DELTA_COMBINED) {
+			scoutfs_inc_counter(sb, btree_merge_delta_combined);
+		} else if (ret == SCOUTFS_DELTA_COMBINED_NULL) {
+			scoutfs_inc_counter(sb, btree_merge_delta_null);
+			/* if merging resulted in no info, skip current */
+			ret = reset_mpos(sb, pos_root, mpos, &key, end);
 			if (ret < 0)
-				goto out;
-
-			if (scoutfs_key_compare(&end, &rng->end) >= 0)
 				break;
-
-			key = end;
-			scoutfs_key_inc(&key);
 		}
-
-		if (scoutfs_key_compare(&end, &rng->end) < 0) {
-			rng->end = end;
-			trim_range_end(rng);
-		}
-
-		if (rng->size > merge_window)
-			trim_range_size(rng, merge_window);
 	}

-	trace_scoutfs_btree_merge_read_range(sb, &rng->start, &rng->end, rng->size);
-	ret = 0;
-out:
+	*mpos_ret = mpos;
 	return ret;
 }

@@ -2292,13 +2226,6 @@ out:
 * to allocators running low or needing to join/split the parent.
 * *next_ret is set to the next key which hasn't been merged so that the
 * caller can retry with a new allocator and subtree.
- *
- * The number of input roots can be immense.  The merge_window specifies
- * the size of the set of merged items that we'll maintain as we iterate
- * over all the input roots.  Once we've merged items into the window
- * from all the input roots the merged input items are then merged to
- * the writable destination root.  It may take multiple passes of
- * windows of merged items to cover the input key range.
 */
 int scoutfs_btree_merge(struct super_block *sb,
 			struct scoutfs_alloc *alloc,
@@ -2308,16 +2235,18 @@ int scoutfs_btree_merge(struct super_block *sb,
 			struct scoutfs_key *next_ret,
 			struct scoutfs_btree_root *root,
 			struct list_head *inputs,
-			bool subtree, int dirty_limit, int alloc_low, int merge_window)
+			bool subtree, int dirty_limit, int alloc_low)
 {
+	struct scoutfs_btree_root_head *rhead;
+	struct rb_root pos_root = RB_ROOT;
 	struct scoutfs_btree_item *item;
 	struct scoutfs_btree_block *bt;
 	struct scoutfs_block *bl = NULL;
 	struct btree_walk_key_range kr;
 	struct scoutfs_avl_node *par;
-	struct merged_item *mitem;
-	struct merged_item *tmp;
-	struct merged_range rng;
+	struct scoutfs_key next;
+	struct merge_pos *mpos;
+	struct merge_pos *tmp;
 	int walk_val_len;
 	int walk_flags;
 	bool is_del;
@@ -2328,59 +2257,49 @@ int scoutfs_btree_merge(struct super_block *sb,
 	trace_scoutfs_btree_merge(sb, root, start, end);
 	scoutfs_inc_counter(sb, btree_merge);

+	list_for_each_entry(rhead, inputs, head) {
+		mpos = kzalloc(sizeof(*mpos), GFP_NOFS);
+		if (!mpos) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		RB_CLEAR_NODE(&mpos->node);
+		mpos->root = &rhead->root;
+
+		ret = reset_mpos(sb, &pos_root, mpos, start, end);
+		if (ret < 0)
+			goto out;
+	}
+
 	walk_flags = BTW_DIRTY;
 	if (subtree)
 		walk_flags |= BTW_SUBTREE;
 	walk_val_len = 0;

-	rng.start = *start;
-	rng.end = *end;
-	rng.root = RB_ROOT;
-	rng.size = 0;
-
-	ret = read_merged_range(sb, &rng, inputs, merge_window);
-	if (ret < 0)
-		goto out;
-
-	for (;;) {
-		/* read next window as it empties (and it is possible to read an empty range) */
-		mitem = first_mitem(&rng.root);
-		if (!mitem) {
-			/* done if the read range hit the end */
-			if (scoutfs_key_compare(&rng.end, end) >= 0)
-				break;
-
-			/* read next batch of merged items */
-			rng.start = rng.end;
-			scoutfs_key_inc(&rng.start);
-			rng.end = *end;
-			ret = read_merged_range(sb, &rng, inputs, merge_window);
-			if (ret < 0)
-				break;
-			continue;
-		}
+	while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {

 		if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) {
 			scoutfs_inc_counter(sb, btree_merge_dirty_limit);
 			ret = -ERANGE;
-			*next_ret = mitem->key;
+			*next_ret = *mpos->key;
 			goto out;
 		}

 		if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) {
 			scoutfs_inc_counter(sb, btree_merge_alloc_low);
 			ret = -ERANGE;
-			*next_ret = mitem->key;
+			*next_ret = *mpos->key;
 			goto out;
 		}

 		scoutfs_block_put(sb, bl);
 		bl = NULL;
 		ret = btree_walk(sb, alloc, wri, root, walk_flags,
-			         &mitem->key, walk_val_len, &bl, &kr, NULL);
+			         mpos->key, walk_val_len, &bl, &kr, NULL);
 		if (ret < 0) {
 			if (ret == -ERANGE)
-				*next_ret = mitem->key;
+				*next_ret = *mpos->key;
 			goto out;
 		}
 		bt = bl->data;
@@ -2392,21 +2311,22 @@ int scoutfs_btree_merge(struct super_block *sb,
 			continue;
 		}

-		while (mitem) {
+		while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) {
+
 			/* walk to new leaf if we exceed parent ref key */
-			if (scoutfs_key_compare(&mitem->key, &kr.end) > 0)
+			if (scoutfs_key_compare(mpos->key, &kr.end) > 0)
 				break;

 			/* see if there's an existing item */
-			item = leaf_item_hash_search(sb, bt, &mitem->key);
-			is_del = !!(mitem->flags & SCOUTFS_ITEM_FLAG_DELETION);
+			item = leaf_item_hash_search(sb, bt, mpos->key);
+			is_del = !!(mpos->flags & SCOUTFS_ITEM_FLAG_DELETION);

 			/* see if we're merging delta items */
 			if (item && !is_del)
-				delta = scoutfs_forest_combine_deltas(&mitem->key,
+				delta = scoutfs_forest_combine_deltas(mpos->key,
 								      item_val(bt, item),
 								      item_val_len(item),
-								      mitem->val, mitem->val_len);
+								      mpos->val, mpos->val_len);
 			else
 				delta = 0;
 			if (delta < 0) {
@@ -2418,38 +2338,40 @@ int scoutfs_btree_merge(struct super_block *sb,
 				scoutfs_inc_counter(sb, btree_merge_delta_null);
 			}

-			trace_scoutfs_btree_merge_items(sb, &mitem->key, mitem->val_len,
+			trace_scoutfs_btree_merge_items(sb, mpos->root,
+					mpos->key, mpos->val_len,
 					item ? root : NULL,
 					item ? item_key(item) : NULL,
 					item ? item_val_len(item) : 0, is_del);

 			/* rewalk and split if ins/update needs room */
-			if (!is_del && !delta && !mid_free_item_room(bt, mitem->val_len)) {
+			if (!is_del && !delta && !mid_free_item_room(bt, mpos->val_len)) {
 				walk_flags |= BTW_INSERT;
-				walk_val_len = mitem->val_len;
+				walk_val_len = mpos->val_len;
 				break;
 			}

 			/* insert missing non-deletion merge items */
 			if (!item && !is_del) {
-				scoutfs_avl_search(&bt->item_root, cmp_key_item, &mitem->key,
+				scoutfs_avl_search(&bt->item_root,
+						   cmp_key_item, mpos->key,
 						   &cmp, &par, NULL, NULL);
-				create_item(bt, &mitem->key, mitem->seq, mitem->flags,
-					    mitem->val, mitem->val_len, par, cmp);
+				create_item(bt, mpos->key, mpos->seq, mpos->flags,
+					    mpos->val, mpos->val_len, par, cmp);
 				scoutfs_inc_counter(sb, btree_merge_insert);
 			}

 			/* update existing items */
 			if (item && !is_del && !delta) {
-				item->seq = cpu_to_le64(mitem->seq);
-				item->flags = mitem->flags;
-				update_item_value(bt, item, mitem->val, mitem->val_len);
+				item->seq = cpu_to_le64(mpos->seq);
+				item->flags = mpos->flags;
+				update_item_value(bt, item, mpos->val, mpos->val_len);
 				scoutfs_inc_counter(sb, btree_merge_update);
 			}

 			/* update combined delta item seq */
 			if (delta == SCOUTFS_DELTA_COMBINED) {
-				item->seq = cpu_to_le64(mitem->seq);
+				item->seq = cpu_to_le64(mpos->seq);
 			}

 			/*
@@ -2481,18 +2403,21 @@ int scoutfs_btree_merge(struct super_block *sb,
 			walk_flags &= ~(BTW_INSERT | BTW_DELETE);
 			walk_val_len = 0;

-			/* finished with this merged item */
-			tmp = mitem;
-			mitem = next_mitem(mitem);
-			free_mitem(&rng, tmp);
+			/* finished with this key, skip any older items */
+			next = *mpos->key;
+			scoutfs_key_inc(&next);
+			ret = reset_mpos(sb, &pos_root, mpos, &next, end);
+			if (ret < 0)
+				goto out;
 		}
 	}

 	ret = 0;
 out:
 	scoutfs_block_put(sb, bl);
-	rbtree_postorder_for_each_entry_safe(mitem, tmp, &rng.root, node)
-		free_mitem(&rng, mitem);
+	rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) {
+		free_mpos(sb, mpos);
+	}

 	return ret;
 }
@@ -119,7 +119,7 @@ int scoutfs_btree_merge(struct super_block *sb,
 			struct scoutfs_key *next_ret,
 			struct scoutfs_btree_root *root,
 			struct list_head *input_list,
-			bool subtree, int dirty_limit, int alloc_low, int merge_window);
+			bool subtree, int dirty_limit, int alloc_low);

 int scoutfs_btree_free_blocks(struct super_block *sb,
 			      struct scoutfs_alloc *alloc,
@@ -145,7 +145,6 @@
 	EXPAND_COUNTER(lock_shrink_work)			\
 	EXPAND_COUNTER(lock_unlock)				\
 	EXPAND_COUNTER(lock_wait)				\
-	EXPAND_COUNTER(log_merge_wait_timeout)			\
 	EXPAND_COUNTER(net_dropped_response)			\
 	EXPAND_COUNTER(net_send_bytes)				\
 	EXPAND_COUNTER(net_send_error)				\
@@ -68,6 +68,8 @@ struct forest_info {
 	struct delayed_work log_merge_dwork;

 	atomic64_t inode_count_delta;
+
+	struct dentry *dent;
 };

 #define DECLARE_FOREST_INFO(sb, name) \
@@ -721,8 +723,7 @@ static void scoutfs_forest_log_merge_worker(struct work_struct *work)
 	ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end,
 				  &next, &comp.root, &inputs,
 				  !!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)),
-				  SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10,
-				  (2 * 1024 * 1024));
+				  SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10);
 	if (ret == -ERANGE) {
 		comp.remain = next;
 		le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN);
@@ -751,6 +752,60 @@ resched:
 	queue_delayed_work(finf->workq, &finf->log_merge_dwork, delay);
 }

+static int count_log_trees(struct super_block *sb, struct scoutfs_key *key, u64 seq,
+			   u8 flags, void *val, int val_len, void *arg)
+{
+	u64 *count = arg;
+
+	(*count)++;
+	return 0;
+}
+
+static int debugfs_nr_log_trees_get(void *data, u64 *val)
+{
+	struct super_block *sb = data;
+	struct scoutfs_super_block *super = NULL;
+	struct scoutfs_key start;
+	struct scoutfs_key end;
+	struct scoutfs_key key;
+	u64 count;
+	int ret;
+
+	super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
+	if (!super) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = scoutfs_read_super(sb, super);
+	if (ret < 0)
+		goto out;
+
+	scoutfs_key_init_log_trees(&key, 0, 0);
+	count = 0;
+	for (;;) {
+		scoutfs_key_set_zeros(&start);
+		scoutfs_key_set_ones(&end);
+		ret = scoutfs_btree_read_items(sb, &super->logs_root, &key, &start, &end,
+					       count_log_trees, &count);
+		if (ret == -ENOENT || scoutfs_key_is_ones(&end))
+			break;
+		if (ret < 0)
+			goto out;
+
+		key = end;
+		scoutfs_key_inc(&key);
+	}
+
+	*val = count;
+	ret = 0;
+out:
+	kfree(super);
+	return ret ? -EIO : 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_nr_log_trees, debugfs_nr_log_trees_get, NULL, "%llu\n");
+
 int scoutfs_forest_setup(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -771,6 +826,13 @@ int scoutfs_forest_setup(struct super_block *sb)
 			  scoutfs_forest_log_merge_worker);
 	sbi->forest_info = finf;

+	finf->dent = debugfs_create_file("nr_log_trees", S_IFREG|S_IRUSR, sbi->debug_root, sb,
+					 &fops_nr_log_trees);
+	if (IS_ERR(finf->dent)) {
+		ret = PTR_ERR(finf->dent);
+		goto out;
+	}
+
 	finf->workq = alloc_workqueue("scoutfs_log_merge", WQ_NON_REENTRANT |
 				      WQ_UNBOUND | WQ_HIGHPRI, 0);
 	if (!finf->workq) {
@@ -800,6 +862,8 @@ void scoutfs_forest_stop(struct super_block *sb)

 	if (finf && finf->workq) {
 		cancel_delayed_work_sync(&finf->log_merge_dwork);
+		if (!IS_ERR_OR_NULL(finf->dent))
+			debugfs_remove(finf->dent);
 		destroy_workqueue(finf->workq);
 	}
 }
@@ -33,7 +33,6 @@ enum {
 	Opt_acl,
 	Opt_data_prealloc_blocks,
 	Opt_data_prealloc_contig_only,
-	Opt_log_merge_wait_timeout_ms,
 	Opt_metadev_path,
 	Opt_noacl,
 	Opt_orphan_scan_delay_ms,
@@ -46,7 +45,6 @@ static const match_table_t tokens = {
 	{Opt_acl, "acl"},
 	{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
 	{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
-	{Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"},
 	{Opt_metadev_path, "metadev_path=%s"},
 	{Opt_noacl, "noacl"},
 	{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
@@ -115,10 +113,6 @@ static void free_options(struct scoutfs_mount_options *opts)
 	kfree(opts->metadev_path);
 }

-#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS		100UL
-#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS	500
-#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS		(60 * MSEC_PER_SEC)
-
 #define MIN_ORPHAN_SCAN_DELAY_MS	100UL
 #define DEFAULT_ORPHAN_SCAN_DELAY_MS	(10 * MSEC_PER_SEC)
 #define MAX_ORPHAN_SCAN_DELAY_MS	(60 * MSEC_PER_SEC)
@@ -132,27 +126,11 @@ static void init_default_options(struct scoutfs_mount_options *opts)

 	opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
 	opts->data_prealloc_contig_only = 1;
-	opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS;
 	opts->orphan_scan_delay_ms = -1;
 	opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
 	opts->quorum_slot_nr = -1;
 }

-static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
-{
-	if (ret < 0) {
-		scoutfs_err(sb, "failed to parse log_merge_wait_timeout_ms value");
-		return -EINVAL;
-	}
-	if (val < MIN_LOG_MERGE_WAIT_TIMEOUT_MS || val > MAX_LOG_MERGE_WAIT_TIMEOUT_MS) {
-		scoutfs_err(sb, "invalid log_merge_wait_timeout_ms value %d, must be between %lu and %lu",
-			    val, MIN_LOG_MERGE_WAIT_TIMEOUT_MS, MAX_LOG_MERGE_WAIT_TIMEOUT_MS);
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
 static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
 {
 	if (ret < 0) {
@@ -218,14 +196,6 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
 			opts->data_prealloc_contig_only = nr;
 			break;

-		case Opt_log_merge_wait_timeout_ms:
-			ret = match_int(args, &nr);
-			ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
-			if (ret < 0)
-				return ret;
-			opts->log_merge_wait_timeout_ms = nr;
-			break;
-
 		case Opt_metadev_path:
 			ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
 			if (ret < 0)
@@ -452,43 +422,6 @@ static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj
 }
 SCOUTFS_ATTR_RW(data_prealloc_contig_only);

-static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
-						char *buf)
-{
-	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
-	struct scoutfs_mount_options opts;
-
-	scoutfs_options_read(sb, &opts);
-
-	return snprintf(buf, PAGE_SIZE, "%u", opts.log_merge_wait_timeout_ms);
-}
-static ssize_t log_merge_wait_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
-						 const char *buf, size_t count)
-{
-	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
-	DECLARE_OPTIONS_INFO(sb, optinf);
-	char nullterm[30]; /* more than enough for octal -U64_MAX */
-	int val;
-	int len;
-	int ret;
-
-	len = min(count, sizeof(nullterm) - 1);
-	memcpy(nullterm, buf, len);
-	nullterm[len] = '\0';
-
-	ret = kstrtoint(nullterm, 0, &val);
-	ret = verify_log_merge_wait_timeout_ms(sb, ret, val);
-	if (ret == 0) {
-		write_seqlock(&optinf->seqlock);
-		optinf->opts.log_merge_wait_timeout_ms = val;
-		write_sequnlock(&optinf->seqlock);
-		ret = count;
-	}
-
-	return ret;
-}
-SCOUTFS_ATTR_RW(log_merge_wait_timeout_ms);
-
 static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
 {
 	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
@@ -592,7 +525,6 @@ SCOUTFS_ATTR_RO(quorum_slot_nr);
 static struct attribute *options_attrs[] = {
 	SCOUTFS_ATTR_PTR(data_prealloc_blocks),
 	SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
-	SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms),
 	SCOUTFS_ATTR_PTR(metadev_path),
 	SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
 	SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),
@@ -8,7 +8,6 @@
 struct scoutfs_mount_options {
 	u64 data_prealloc_blocks;
 	bool data_prealloc_contig_only;
-	unsigned int log_merge_wait_timeout_ms;
 	char *metadev_path;
 	unsigned int orphan_scan_delay_ms;
 	int quorum_slot_nr;
@@ -1747,41 +1747,21 @@ TRACE_EVENT(scoutfs_btree_merge,
 		  sk_trace_args(end))
 );

-TRACE_EVENT(scoutfs_btree_merge_read_range,
-	TP_PROTO(struct super_block *sb, struct scoutfs_key *start, struct scoutfs_key *end,
-		 int size),
-
-	TP_ARGS(sb, start, end, size),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		sk_trace_define(start)
-		sk_trace_define(end)
-		__field(int, size)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		sk_trace_assign(start, start);
-		sk_trace_assign(end, end);
-		__entry->size = size;
-	),
-
-	TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" size %d",
-		  SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end), __entry->size)
-);
-
 TRACE_EVENT(scoutfs_btree_merge_items,
 	TP_PROTO(struct super_block *sb,
+		 struct scoutfs_btree_root *m_root,
 		 struct scoutfs_key *m_key, int m_val_len,
 		 struct scoutfs_btree_root *f_root,
 		 struct scoutfs_key *f_key, int f_val_len,
 		 int is_del),

-	TP_ARGS(sb, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
+	TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, f_val_len, is_del),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
+		__field(__u64, m_root_blkno)
+		__field(__u64, m_root_seq)
+		__field(__u8, m_root_height)
 		sk_trace_define(m_key)
 		__field(int, m_val_len)
 		__field(__u64, f_root_blkno)
@@ -1794,6 +1774,10 @@ TRACE_EVENT(scoutfs_btree_merge_items,

 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
+		__entry->m_root_blkno = m_root ?
+					le64_to_cpu(m_root->ref.blkno) : 0;
+		__entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0;
+		__entry->m_root_height = m_root ? m_root->height : 0;
 		sk_trace_assign(m_key, m_key);
 		__entry->m_val_len = m_val_len;
 		__entry->f_root_blkno = f_root ?
@@ -1805,9 +1789,11 @@ TRACE_EVENT(scoutfs_btree_merge_items,
 		__entry->is_del = !!is_del;
 	),

-	TP_printk(SCSBF" merge item key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
-		  SCSB_TRACE_ARGS, sk_trace_args(m_key), __entry->m_val_len,
-		  __entry->f_root_blkno, __entry->f_root_seq, __entry->f_root_height,
+	TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
+		  SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq,
+		  __entry->m_root_height, sk_trace_args(m_key),
+		  __entry->m_val_len, __entry->f_root_blkno,
+		  __entry->f_root_seq, __entry->f_root_height,
 		  sk_trace_args(f_key), __entry->f_val_len, __entry->is_del)
 );

@@ -2090,71 +2076,6 @@ TRACE_EVENT(scoutfs_trans_seq_last,
 		  SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
 );

-TRACE_EVENT(scoutfs_server_finalize_items,
-	TP_PROTO(struct super_block *sb, u64 rid, u64 item_rid, u64 item_nr, u64 item_flags,
-		 u64 item_get_trans_seq),
-
-	TP_ARGS(sb, rid, item_rid, item_nr, item_flags, item_get_trans_seq),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, c_rid)
-		__field(__u64, item_rid)
-		__field(__u64, item_nr)
-		__field(__u64, item_flags)
-		__field(__u64, item_get_trans_seq)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->c_rid = rid;
-		__entry->item_rid = item_rid;
-		__entry->item_nr = item_nr;
-		__entry->item_flags = item_flags;
-		__entry->item_get_trans_seq = item_get_trans_seq;
-	),
-
-	TP_printk(SCSBF" rid %016llx item_rid %016llx item_nr %llu item_flags 0x%llx item_get_trans_seq %llu",
-		  SCSB_TRACE_ARGS, __entry->c_rid, __entry->item_rid, __entry->item_nr,
-		  __entry->item_flags, __entry->item_get_trans_seq)
-);
-
-TRACE_EVENT(scoutfs_server_finalize_decision,
-	TP_PROTO(struct super_block *sb, u64 rid, bool saw_finalized, bool others_active,
-		 bool ours_visible, bool finalize_ours, unsigned int delay_ms,
-		 u64 finalize_sent_seq),
-
-	TP_ARGS(sb, rid, saw_finalized, others_active, ours_visible, finalize_ours, delay_ms,
-		finalize_sent_seq),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, c_rid)
-		__field(bool, saw_finalized)
-		__field(bool, others_active)
-		__field(bool, ours_visible)
-		__field(bool, finalize_ours)
-		__field(unsigned int, delay_ms)
-		__field(__u64, finalize_sent_seq)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->c_rid = rid;
-		__entry->saw_finalized = saw_finalized;
-		__entry->others_active = others_active;
-		__entry->ours_visible = ours_visible;
-		__entry->finalize_ours = finalize_ours;
-		__entry->delay_ms = delay_ms;
-		__entry->finalize_sent_seq = finalize_sent_seq;
-	),
-
-	TP_printk(SCSBF" rid %016llx saw_finalized %u others_active %u ours_visible %u finalize_ours %u delay_ms %u finalize_sent_seq %llu",
-		  SCSB_TRACE_ARGS, __entry->c_rid, __entry->saw_finalized, __entry->others_active,
-		  __entry->ours_visible, __entry->finalize_ours, __entry->delay_ms,
-		  __entry->finalize_sent_seq)
-);
-
 TRACE_EVENT(scoutfs_get_log_merge_status,
 	TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key,
 		 u64 nr_requests, u64 nr_complete, u64 seq),
@@ -148,8 +148,6 @@ struct server_info {
 	struct scoutfs_quorum_config qconf;
 	/* a running server maintains a private dirty super */
 	struct scoutfs_super_block dirty_super;
-
-	u64 finalize_sent_seq;
 };

 #define DECLARE_SERVER_INFO(sb, name) \
@@ -415,27 +413,6 @@ static void server_hold_commit(struct super_block *sb, struct commit_hold *hold)
 	wait_event(cusers->waitq, hold_commit(sb, server, cusers, hold));
 }

-/*
- * Return the higher of the avail or freed used by the active commit
- * since this holder joined the commit.  This is *not* the amount used
- * by the holder, we don't track per-holder alloc use.
- */
-static u32 server_hold_alloc_used_since(struct super_block *sb, struct commit_hold *hold)
-{
-	DECLARE_SERVER_INFO(sb, server);
-	u32 avail_used;
-	u32 freed_used;
-	u32 avail_now;
-	u32 freed_now;
-
-	scoutfs_alloc_meta_remaining(&server->alloc, &avail_now, &freed_now);
-
-	avail_used = hold->avail - avail_now;
-	freed_used = hold->freed - freed_now;
-
-	return max(avail_used, freed_used);
-}
-
 /*
 * This is called while holding the commit and returns once the commit
 * is successfully written.  Many holders can all wait for all holders
@@ -961,24 +938,22 @@ static int find_log_trees_item(struct super_block *sb,
 }

 /*
- * Find the log_trees item with the greatest nr for each rid.  Fills the
- * caller's log_trees and sets the key before the returned log_trees for
- * the next iteration.  Returns 0 when done, > 0 for each item, and
- * -errno on fatal errors.
+ * Find the next log_trees item from the key.  Fills the caller's log_trees and sets
+ * the key past the returned log_trees for iteration.  Returns 0 when done, > 0 for each
+ * item, and -errno on fatal errors.
 */
-static int for_each_rid_last_lt(struct super_block *sb, struct scoutfs_btree_root *root,
-				struct scoutfs_key *key, struct scoutfs_log_trees *lt)
+static int for_each_lt(struct super_block *sb, struct scoutfs_btree_root *root,
+		       struct scoutfs_key *key, struct scoutfs_log_trees *lt)
 {
 	SCOUTFS_BTREE_ITEM_REF(iref);
 	int ret;

-	ret = scoutfs_btree_prev(sb, root, key, &iref);
+	ret = scoutfs_btree_next(sb, root, key, &iref);
 	if (ret == 0) {
 		if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
 			memcpy(lt, iref.val, iref.val_len);
 			*key = *iref.key;
-			key->sklt_nr = 0;
-			scoutfs_key_dec(key);
+			scoutfs_key_inc(key);
 			ret = 1;
 		} else {
 			ret = -EIO;
@@ -1073,13 +1048,21 @@ static int next_log_merge_item(struct super_block *sb,
 * abandoned log btree finalized.  If it takes too long each client has
 * a chance to make forward progress before being asked to commit again.
 *
+ * We're waiting on heavy state that is protected by mutexes and
+ * transaction machinery.  It's tricky to recreate that state for
+ * lightweight condition tests that don't change task state.  Instead of
+ * trying to get that right, particularly as we unwind after success or
+ * after timeouts, waiters use an unsatisfying poll.   Short enough to
+ * not add terrible latency, given how heavy and infrequent this already
+ * is, and long enough to not melt the cpu.  This could be tuned if it
+ * becomes a problem.
+ *
 * This can end up finalizing a new empty log btree if a new mount
 * happens to arrive at just the right time.  That's fine, merging will
 * ignore and tear down the empty input.
 */
-#define FINALIZE_POLL_MIN_DELAY_MS	5U
-#define FINALIZE_POLL_MAX_DELAY_MS	100U
-#define FINALIZE_POLL_DELAY_GROWTH_PCT	150U
+#define FINALIZE_POLL_MS	(11)
+#define FINALIZE_TIMEOUT_MS	(MSEC_PER_SEC / 2)
 static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_log_trees *lt,
 					u64 rid, struct commit_hold *hold)
 {
@@ -1087,10 +1070,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 	struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
 	struct scoutfs_log_merge_status stat;
 	struct scoutfs_log_merge_range rng;
-	struct scoutfs_mount_options opts;
 	struct scoutfs_log_trees each_lt;
 	struct scoutfs_log_trees fin;
-	unsigned int delay_ms;
 	unsigned long timeo;
 	bool saw_finalized;
 	bool others_active;
@@ -1098,14 +1079,10 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 	bool ours_visible;
 	struct scoutfs_key key;
 	char *err_str = NULL;
-	ktime_t start;
 	int ret;
 	int err;

-	scoutfs_options_read(sb, &opts);
-	timeo = jiffies + msecs_to_jiffies(opts.log_merge_wait_timeout_ms);
-	delay_ms = FINALIZE_POLL_MIN_DELAY_MS;
-	start = ktime_get_raw();
+	timeo = jiffies + msecs_to_jiffies(FINALIZE_TIMEOUT_MS);

 	for (;;) {
 		/* nothing to do if there's already a merge in flight */
@@ -1122,13 +1099,8 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 		saw_finalized = false;
 		others_active = false;
 		ours_visible = false;
-		scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
-		while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
-
-			trace_scoutfs_server_finalize_items(sb, rid, le64_to_cpu(each_lt.rid),
-							    le64_to_cpu(each_lt.nr),
-							    le64_to_cpu(each_lt.flags),
-							    le64_to_cpu(each_lt.get_trans_seq));
+		scoutfs_key_init_log_trees(&key, 0, 0);
+		while ((ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {

 			if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED))
 				saw_finalized = true;
@@ -1153,10 +1125,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 		finalize_ours = (lt->item_root.height > 2) ||
 				(le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW);

-		trace_scoutfs_server_finalize_decision(sb, rid, saw_finalized, others_active,
-						       ours_visible, finalize_ours, delay_ms,
-						       server->finalize_sent_seq);
-
 		/* done if we're not finalizing and there's no finalized */
 		if (!finalize_ours && !saw_finalized) {
 			ret = 0;
@@ -1164,13 +1132,12 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 		}

 		/* send sync requests soon to give time to commit */
-		scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
+		scoutfs_key_init_log_trees(&key, 0, 0);
 		while (others_active &&
-		       (ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {
+		       (ret = for_each_lt(sb, &super->logs_root, &key, &each_lt)) > 0) {

 			if ((le64_to_cpu(each_lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) ||
-			    (le64_to_cpu(each_lt.rid) == rid) ||
-			    (le64_to_cpu(each_lt.get_trans_seq) <= server->finalize_sent_seq))
+			    (le64_to_cpu(each_lt.rid) == rid))
 				continue;

 			ret = scoutfs_net_submit_request_node(sb, server->conn,
@@ -1190,8 +1157,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 			break;
 		}

-		server->finalize_sent_seq = scoutfs_server_seq(sb);
-
 		/* Finalize ours if it's visible to others */
 		if (ours_visible) {
 			fin = *lt;
@@ -1229,16 +1194,13 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
 			if (ret < 0)
 				err_str = "applying commit before waiting for finalized";

-			msleep(delay_ms);
-			delay_ms = min(delay_ms * FINALIZE_POLL_DELAY_GROWTH_PCT / 100,
-				       FINALIZE_POLL_MAX_DELAY_MS);
+			msleep(FINALIZE_POLL_MS);

 			server_hold_commit(sb, hold);
 			mutex_lock(&server->logs_mutex);

 			/* done if we timed out */
-			if (time_after(jiffies, timeo)) {
-				scoutfs_inc_counter(sb, log_merge_wait_timeout);
+			if (1 || time_after(jiffies, timeo)) {
 				ret = 0;
 				break;
 			}
@@ -1821,29 +1783,43 @@ out:
 * Give the caller the last seq before outstanding client commits.  All
 * seqs up to and including this are stable, new client transactions can
 * only have greater seqs.
- *
- * For each rid, only its greatest log trees nr can be an open commit.
- * We look at the last log_trees item for each client rid and record its
- * trans seq if it hasn't been committed.
 */
 static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
 {
 	struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
 	DECLARE_SERVER_INFO(sb, server);
-	struct scoutfs_log_trees lt;
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_log_trees *lt;
 	struct scoutfs_key key;
 	u64 last_seq = 0;
 	int ret;

 	last_seq = scoutfs_server_seq(sb) - 1;
+	scoutfs_key_init_log_trees(&key, 0, 0);

 	mutex_lock(&server->logs_mutex);

-	scoutfs_key_init_log_trees(&key, U64_MAX, U64_MAX);
-	while ((ret = for_each_rid_last_lt(sb, &super->logs_root, &key, &lt)) > 0) {
-		if ((le64_to_cpu(lt.get_trans_seq) > le64_to_cpu(lt.commit_trans_seq)) &&
-		     le64_to_cpu(lt.get_trans_seq) <= last_seq) {
-			last_seq = le64_to_cpu(lt.get_trans_seq) - 1;
+	for (;; scoutfs_key_inc(&key)) {
+		ret = scoutfs_btree_next(sb, &super->logs_root, &key, &iref);
+		if (ret == 0) {
+			if (iref.val_len == sizeof(*lt)) {
+				lt = iref.val;
+				if ((le64_to_cpu(lt->get_trans_seq) >
+				     le64_to_cpu(lt->commit_trans_seq)) &&
+				     le64_to_cpu(lt->get_trans_seq) <= last_seq) {
+					last_seq = le64_to_cpu(lt->get_trans_seq) - 1;
+				}
+				key = *iref.key;
+			} else {
+				ret = -EIO;
+			}
+			scoutfs_btree_put_iref(&iref);
+		}
+		if (ret < 0) {
+			if (ret == -ENOENT) {
+				ret = 0;
+				break;
+			}
 		}
 	}

@@ -2495,11 +2471,9 @@ static void server_log_merge_free_work(struct work_struct *work)

 	while (!server_is_stopping(server)) {

-		if (!commit) {
-			server_hold_commit(sb, &hold);
-			mutex_lock(&server->logs_mutex);
-			commit = true;
-		}
+		server_hold_commit(sb, &hold);
+		mutex_lock(&server->logs_mutex);
+		commit = true;

 		ret = next_log_merge_item(sb, &super->log_merge,
 					  SCOUTFS_LOG_MERGE_FREEING_ZONE,
@@ -2546,14 +2520,12 @@ static void server_log_merge_free_work(struct work_struct *work)
 		/* freed blocks are in allocator, we *have* to update fr */
 		BUG_ON(ret < 0);

-		if (server_hold_alloc_used_since(sb, &hold) >= COMMIT_HOLD_ALLOC_BUDGET / 2) {
-			mutex_unlock(&server->logs_mutex);
-			ret = server_apply_commit(sb, &hold, ret);
-			commit = false;
-			if (ret < 0) {
-				err_str = "looping commit del/upd freeing item";
-				break;
-			}
+		mutex_unlock(&server->logs_mutex);
+		ret = server_apply_commit(sb, &hold, ret);
+		commit = false;
+		if (ret < 0) {
+			err_str = "looping commit del/upd freeing item";
+			break;
 		}
 	}

@@ -4326,7 +4298,6 @@ static void scoutfs_server_worker(struct work_struct *work)
 	scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));

 	scoutfs_block_writer_init(sb, &server->wri);
-	server->finalize_sent_seq = 0;

 	/* first make sure no other servers are still running */
 	ret = scoutfs_quorum_fence_leaders(sb, &server->qconf, server->term);
@@ -7,7 +7,7 @@ FMTIOC_H := format.h ioctl.h
 FMTIOC_KMOD := $(addprefix ../kmod/src/,$(FMTIOC_H))

 CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -g -msse4.2 \
-	-I src/ -fno-strict-aliasing \
+	-fno-strict-aliasing \
 	-DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU

 ifneq ($(wildcard $(firstword $(FMTIOC_KMOD))),)
@@ -15,9 +15,8 @@ CFLAGS += -I../kmod/src
 endif

 BIN := src/scoutfs
-OBJ_DIRS := src src/check
-OBJ := $(foreach dir,$(OBJ_DIRS),$(patsubst %.c,%.o,$(wildcard $(dir)/*.c)))
-DEPS := $(foreach dir,$(OBJ_DIRS),$(wildcard $(dir)/*.d))
+OBJ := $(patsubst %.c,%.o,$(wildcard src/*.c))
+DEPS := $(wildcard */*.d)

 all: $(BIN)

@@ -55,19 +55,6 @@ with initial sparse regions (perhaps by multiple threads writing to
 different regions) and wasted space isn't an issue (perhaps because the
 file population contains few small files).
 .TP
-.B log_merge_wait_timeout_ms=<number>
-This option sets the amount of time, in milliseconds, that log merge
-creation can wait before timing out.  This setting is per-mount, only
-changes the behavior of that mount, and only affects the server when it
-is running in that mount.
-.sp
-This determines how long it may take for mounts to synchronize
-committing their log trees to create a log merge operation.  Setting it
-too high can create long latencies in the event that a mount takes a
-long time to commit their log.  Setting it too low can result in the
-creation of excessive numbers of log trees that are never merged.  The
-default is 500 and it can not be less than 100 nor greater than 60000.
-.TP
 .B metadev_path=<device>
 The metadev_path option specifies the path to the block device that
 contains the filesystem's metadata.
@@ -10,11 +10,6 @@
 * Just a quick simple native bitmap.
 */

-int test_bit(unsigned long *bits, u64 nr)
-{
-	return !!(bits[nr / BITS_PER_LONG] & (1UL << (nr & (BITS_PER_LONG - 1))));
-}
-
 void set_bit(unsigned long *bits, u64 nr)
 {
 	bits[nr / BITS_PER_LONG] |= 1UL << (nr & (BITS_PER_LONG - 1));
@@ -1,7 +1,6 @@
 #ifndef _BITMAP_H_
 #define _BITMAP_H_

-int test_bit(unsigned long *bits, u64 nr);
 void set_bit(unsigned long *bits, u64 nr);
 void clear_bit(unsigned long *bits, u64 nr);
 u64 find_next_set_bit(unsigned long *start, u64 from, u64 total);
@@ -1,159 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <sys/mman.h>
-#include <errno.h>
-
-#include "sparse.h"
-#include "util.h"
-#include "format.h"
-#include "bitmap.h"
-#include "key.h"
-
-#include "alloc.h"
-#include "block.h"
-#include "btree.h"
-#include "extent.h"
-#include "iter.h"
-#include "sns.h"
-
-/*
- * We check the list blocks serially.
- *
- * XXX:
- *  - compare ref seqs
- *  - detect cycles?
- */
-int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
-{
-	struct scoutfs_alloc_list_block *lblk;
-	struct scoutfs_block_ref ref;
-	struct block *blk = NULL;
-	u64 blkno;
-	int ret;
-
-	ref = lhead->ref;
-
-	while (ref.blkno) {
-		blkno = le64_to_cpu(ref.blkno);
-
-		ret = cb(blkno, 1, cb_arg);
-		if (ret < 0) {
-			ret = xlate_iter_errno(ret);
-			goto out;
-		}
-
-		ret = block_get(&blk, blkno, 0);
-		if (ret < 0)
-			goto out;
-
-		lblk = block_buf(blk);
-		/* XXX verify block */
-		/* XXX sort?   maybe */
-
-		ref = lblk->next;
-
-		block_put(&blk);
-	}
-
-	ret = 0;
-out:
-	return ret;
-}
-
-int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
-{
-	return btree_meta_iter(&root->root, cb, cb_arg);
-}
-
-int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
-{
-	struct scoutfs_alloc_list_block *lblk;
-	struct scoutfs_block_ref ref;
-	struct block *blk = NULL;
-	u64 blkno;
-	int ret;
-	int i;
-
-	ref = lhead->ref;
-
-	while (ref.blkno) {
-		blkno = le64_to_cpu(ref.blkno);
-
-		ret = block_get(&blk, blkno, 0);
-		if (ret < 0)
-			goto out;
-
-		sns_push("alloc_list_block", blkno, 0);
-
-		lblk = block_buf(blk);
-		/* XXX verify block */
-		/* XXX sort?   maybe */
-
-		ret = 0;
-		for (i = 0; i < le32_to_cpu(lblk->nr); i++) {
-			blkno = le64_to_cpu(lblk->blknos[le32_to_cpu(lblk->start) + i]);
-
-			ret = cb(blkno, 1, cb_arg);
-			if (ret < 0)
-				break;
-		}
-
-		ref = lblk->next;
-
-		block_put(&blk);
-		sns_pop();
-		if (ret < 0) {
-			ret = xlate_iter_errno(ret);
-			goto out;
-		}
-	}
-
-	ret = 0;
-out:
-	return ret;
-}
-
-static bool valid_free_extent_key(struct scoutfs_key *key)
-{
-	return (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE ||
-	        key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) &&
-	       (!key->_sk_fourth && !key->sk_type &&
-		(key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE || !key->_sk_third));
-}
-
-static int free_item_cb(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
-{
-	struct extent_cb_arg_t *ecba = cb_arg;
-	u64 start;
-	u64 len;
-
-	/* XXX not sure these eios are what we want */
-
-	if (val_len != 0)
-		return -EIO;
-
-	if (!valid_free_extent_key(key))
-		return -EIO;
-
-	if (key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE)
-		return -ECHECK_ITER_DONE;
-
-	start = le64_to_cpu(key->skfb_end) - le64_to_cpu(key->skfb_len) + 1;
-	len = le64_to_cpu(key->skfb_len);
-
-	return ecba->cb(start, len, ecba->cb_arg);
-}
-
-/*
- * Call the callback with each of the primary BLKNO free extents stored
- * in item in the given alloc root.  It doesn't visit the secondary
- * ORDER extents.
- */
-int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
-{
-	struct extent_cb_arg_t ecba = { .cb = cb, .cb_arg = cb_arg };
-
-	return btree_item_iter(&root->root, free_item_cb, &ecba);
-}
@@ -1,12 +0,0 @@
-#ifndef _SCOUTFS_UTILS_CHECK_ALLOC_H
-#define _SCOUTFS_UTILS_CHECK_ALLOC_H
-
-#include "extent.h"
-
-int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
-int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
-
-int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
-int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
-
-#endif
@@ -1,564 +0,0 @@
-#define _ISOC11_SOURCE /* aligned_alloc */
-#define _DEFAULT_SOURCE /* syscall() */
-#include <stdlib.h>
-#include <unistd.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <errno.h>
-#include <sys/syscall.h>
-#include <linux/aio_abi.h>
-
-#include "sparse.h"
-#include "util.h"
-#include "format.h"
-#include "list.h"
-#include "cmp.h"
-#include "hash.h"
-
-#include "block.h"
-#include "debug.h"
-#include "eno.h"
-
-static struct block_data {
-	struct list_head *hash_lists;
-	size_t hash_nr;
-
-	struct list_head active_head;
-	struct list_head inactive_head;
-	struct list_head dirty_list;
-	size_t nr_active;
-	size_t nr_inactive;
-	size_t nr_dirty;
-
-	int meta_fd;
-	size_t max_cached;
-	size_t nr_events;
-
-	aio_context_t ctx;
-	struct iocb *iocbs;
-	struct iocb **iocbps;
-	struct io_event *events;
-} global_bdat;
-
-struct block {
-	struct list_head hash_head;
-	struct list_head lru_head;
-	struct list_head dirty_head;
-	struct list_head submit_head;
-	unsigned long refcount;
-	unsigned long uptodate:1,
-		      active:1;
-	u64 blkno;
-	void *buf;
-	size_t size;
-};
-
-#define BLK_FMT \
-	"blkno %llu rc %ld d %u a %u"
-#define BLK_ARG(blk) \
-	(blk)->blkno, (blk)->refcount, !list_empty(&(blk)->dirty_head), blk->active
-#define debug_blk(blk, fmt, args...) \
-	debug(fmt " " BLK_FMT, ##args, BLK_ARG(blk))
-
-/*
- * This just allocates and initializes the block.  The caller is
- * responsible for putting it on the appropriate initial lists and
- * managing refcounts.
- */
-static struct block *alloc_block(struct block_data *bdat, u64 blkno, size_t size)
-{
-	struct block *blk;
-
-	blk = calloc(1, sizeof(struct block));
-	if (blk) {
-		blk->buf = aligned_alloc(4096, size); /* XXX static alignment :/ */
-		if (!blk->buf) {
-			free(blk);
-			blk = NULL;
-		} else {
-			INIT_LIST_HEAD(&blk->hash_head);
-			INIT_LIST_HEAD(&blk->lru_head);
-			INIT_LIST_HEAD(&blk->dirty_head);
-			INIT_LIST_HEAD(&blk->submit_head);
-			blk->blkno = blkno;
-			blk->size = size;
-		}
-	}
-
-	return blk;
-}
-
-static void free_block(struct block_data *bdat, struct block *blk)
-{
-	debug_blk(blk, "free");
-
-	if (!list_empty(&blk->lru_head)) {
-		if (blk->active)
-			bdat->nr_active--;
-		else
-			bdat->nr_inactive--;
-		list_del(&blk->lru_head);
-	}
-
-	if (!list_empty(&blk->dirty_head)) {
-		bdat->nr_dirty--;
-		list_del(&blk->dirty_head);
-	}
-
-	if (!list_empty(&blk->hash_head))
-		list_del(&blk->hash_head);
-
-	if (!list_empty(&blk->submit_head))
-		list_del(&blk->submit_head);
-
-	free(blk->buf);
-	free(blk);
-}
-
-static bool blk_is_dirty(struct block *blk)
-{
-	return !list_empty(&blk->dirty_head);
-}
-
-/*
- * Rebalance the cache.
- *
- * First we shrink the cache to limit it to max_cached blocks.
- * Logically, we walk from oldest to newest in the inactive list and
- * then in the active list.  Since these lists are physically one
- * list_head list we achieve this with a reverse walk starting from the
- * active head.
- *
- * Then we rebalance the size of the two lists.  The constraint is that
- * we don't let the active list grow larger than the inactive list.  We
- * move blocks from the oldest tail of the active list to the newest
- * head of the inactive list.
- *
- * <- [active head] <-> [ .. active list .. ] <-> [inactive head] <-> [ .. inactive list .. ] ->
- */
-static void rebalance_cache(struct block_data *bdat)
-{
-	struct block *blk;
-	struct block *blk_;
-
-	list_for_each_entry_safe_reverse(blk, blk_, &bdat->active_head, lru_head) {
-		if ((bdat->nr_active + bdat->nr_inactive) < bdat->max_cached)
-			break;
-
-		if (&blk->lru_head == &bdat->inactive_head || blk->refcount > 0 ||
-		    blk_is_dirty(blk))
-			continue;
-
-		free_block(bdat, blk);
-	}
-
-	list_for_each_entry_safe_reverse(blk, blk_, &bdat->inactive_head, lru_head) {
-		if (bdat->nr_active <= bdat->nr_inactive || &blk->lru_head == &bdat->active_head)
-			break;
-
-		list_move(&blk->lru_head, &bdat->inactive_head);
-		blk->active = 0;
-		bdat->nr_active--;
-		bdat->nr_inactive++;
-	}
-}
-
-static void make_active(struct block_data *bdat, struct block *blk)
-{
-	if (!blk->active) {
-		if (!list_empty(&blk->lru_head)) {
-			list_move(&blk->lru_head, &bdat->active_head);
-			bdat->nr_inactive--;
-		} else {
-			list_add(&blk->lru_head, &bdat->active_head);
-		}
-
-		blk->active = 1;
-		bdat->nr_active++;
-	}
-}
-
-static int compar_iocbp(const void *A, const void *B)
-{
-	struct iocb *a = *(struct iocb **)A;
-	struct iocb *b = *(struct iocb **)B;
-
-	return scoutfs_cmp(a->aio_offset, b->aio_offset);
-}
-
-static int submit_and_wait(struct block_data *bdat, struct list_head *list)
-{
-	struct io_event *event;
-	struct iocb *iocb;
-	struct block *blk;
-	int ret;
-	int err;
-	int nr;
-	int i;
-
-	err = 0;
-	nr = 0;
-	list_for_each_entry(blk, list, submit_head) {
-		iocb = &bdat->iocbs[nr];
-		bdat->iocbps[nr] = iocb;
-
-		memset(iocb, 0, sizeof(struct iocb));
-
-		iocb->aio_data = (intptr_t)blk;
-		iocb->aio_lio_opcode = blk_is_dirty(blk) ? IOCB_CMD_PWRITE : IOCB_CMD_PREAD;
-		iocb->aio_fildes = bdat->meta_fd;
-		iocb->aio_buf = (intptr_t)blk->buf;
-		iocb->aio_nbytes = blk->size;
-		iocb->aio_offset = blk->blkno * blk->size;
-
-		nr++;
-
-		debug_blk(blk, "submit");
-
-		if ((nr < bdat->nr_events) && blk->submit_head.next != list)
-			continue;
-
-		qsort(bdat->iocbps, nr, sizeof(bdat->iocbps[0]), compar_iocbp);
-
-		ret = syscall(__NR_io_submit, bdat->ctx, nr, bdat->iocbps);
-		if (ret != nr) {
-			if (ret >= 0)
-				errno = EIO;
-			ret = -errno;
-			printf("fatal system error submitting async IO: "ENO_FMT"\n",
-				ENO_ARG(-ret));
-			goto out;
-		}
-
-		ret = syscall(__NR_io_getevents, bdat->ctx, nr, nr, bdat->events, NULL);
-		if (ret != nr) {
-			if (ret >= 0)
-				errno = EIO;
-			ret = -errno;
-			printf("fatal system error getting IO events: "ENO_FMT"\n",
-				ENO_ARG(-ret));
-			goto out;
-		}
-
-		ret = 0;
-		for (i = 0; i < nr; i++) {
-			event = &bdat->events[i];
-			iocb = (struct iocb *)(intptr_t)event->obj;
-			blk = (struct block *)(intptr_t)event->data;
-
-			debug_blk(blk, "complete res %lld", (long long)event->res);
-
-			if (event->res >= 0 && event->res != blk->size)
-				event->res = -EIO;
-
-			/* io errors are fatal */
-			if (event->res < 0) {
-				ret = event->res;
-				goto out;
-			}
-
-			if (iocb->aio_lio_opcode == IOCB_CMD_PREAD) {
-				blk->uptodate = 1;
-			} else {
-				list_del_init(&blk->dirty_head);
-				bdat->nr_dirty--;
-			}
-		}
-		nr = 0;
-	}
-
-	ret = 0;
-out:
-	return ret ?: err;
-}
-
-static void inc_refcount(struct block *blk)
-{
-	blk->refcount++;
-}
-
-void block_put(struct block **blkp)
-{
-	struct block_data *bdat = &global_bdat;
-	struct block *blk = *blkp;
-
-	if (blk) {
-		blk->refcount--;
-		*blkp = NULL;
-
-		rebalance_cache(bdat);
-	}
-}
-
-static struct list_head *hash_bucket(struct block_data *bdat, u64 blkno)
-{
-	u32 hash = scoutfs_hash32(&blkno, sizeof(blkno));
-
-	return &bdat->hash_lists[hash % bdat->hash_nr];
-}
-
-static struct block *get_or_alloc(struct block_data *bdat, u64 blkno, int bf)
-{
-	struct list_head *bucket = hash_bucket(bdat, blkno);
-	struct block *search;
-	struct block *blk;
-	size_t size;
-
-	size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE;
-
-	blk = NULL;
-	list_for_each_entry(search, bucket, hash_head) {
-		if (search->blkno && blkno && search->size == size) {
-			blk = search;
-			break;
-		}
-	}
-
-	if (!blk) {
-		blk = alloc_block(bdat, blkno, size);
-		if (blk) {
-			list_add(&blk->hash_head, bucket);
-			list_add(&blk->lru_head, &bdat->inactive_head);
-			bdat->nr_inactive++;
-		}
-	}
-	if (blk)
-		inc_refcount(blk);
-
-	return blk;
-}
-
-/*
- * Get a block.
- *
- * The caller holds a refcount to the block while it's in use that
- * prevents it from being removed from the cache.  It must be dropped
- * with block_put();
- */
-int block_get(struct block **blk_ret, u64 blkno, int bf)
-{
-	struct block_data *bdat = &global_bdat;
-	struct block *blk;
-	LIST_HEAD(list);
-	int ret;
-
-	blk = get_or_alloc(bdat, blkno, bf);
-	if (!blk) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	if ((bf & BF_ZERO)) {
-		memset(blk->buf, 0, blk->size);
-		blk->uptodate = 1;
-	}
-
-	if (bf & BF_OVERWRITE)
-		blk->uptodate = 1;
-
-	if (!blk->uptodate) {
-		list_add(&blk->submit_head, &list);
-		ret = submit_and_wait(bdat, &list);
-		list_del_init(&blk->submit_head);
-		if (ret < 0)
-			goto out;
-	}
-
-	if ((bf & BF_DIRTY) && !blk_is_dirty(blk)) {
-		list_add_tail(&bdat->dirty_list, &blk->dirty_head);
-		bdat->nr_dirty++;
-	}
-
-	make_active(bdat, blk);
-
-	rebalance_cache(bdat);
-	ret = 0;
-out:
-	if (ret < 0)
-		block_put(&blk);
-	*blk_ret = blk;
-	return ret;
-}
-
-void *block_buf(struct block *blk)
-{
-	return blk->buf;
-}
-
-size_t block_size(struct block *blk)
-{
-	return blk->size;
-}
-
-/*
- * Drop the block from the cache, regardless of if it was free or not.
- * This is used to avoid writing blocks which were dirtied but then
- * later freed.
- *
- * The block is immediately freed and can't be referenced after this
- * returns.
- */
-void block_drop(struct block **blkp)
-{
-	struct block_data *bdat = &global_bdat;
-
-	free_block(bdat, *blkp);
-	*blkp = NULL;
-	rebalance_cache(bdat);
-}
-
-/*
- * This doesn't quite work for mixing large and small blocks, but that's
- * fine, we never do that.
- */
-static int compar_u64(const void *A, const void *B)
-{
-	u64 a = *((u64 *)A);
-	u64 b = *((u64 *)B);
-
-	return scoutfs_cmp(a, b);
-}
-
-/*
- * This read-ahead is synchronous and errors are ignored.  If any of the
- * blknos aren't present in the cache then we issue concurrent reads for
- * them and wait.  Any existing cached blocks will be left as is.
- *
- * We might be trying to read a lot more than the number of events so we
- * sort the caller's blknos before iterating over them rather than
- * relying on submission sorting the blocks in each submitted set.
- */
-void block_readahead(u64 *blknos, size_t nr)
-{
-	struct block_data *bdat = &global_bdat;
-	struct block *blk;
-	struct block *blk_;
-	LIST_HEAD(list);
-	size_t i;
-
-	if (nr == 0)
-		return;
-
-	qsort(blknos, nr, sizeof(blknos[0]), compar_u64);
-
-	for (i = 0; i < nr; i++) {
-		blk = get_or_alloc(bdat, blknos[i], 0);
-		if (blk) {
-			if (!blk->uptodate)
-				list_add_tail(&blk->submit_head, &list);
-			else
-				block_put(&blk);
-		}
-	}
-
-	(void)submit_and_wait(bdat, &list);
-
-	list_for_each_entry_safe(blk, blk_, &list, submit_head) {
-		list_del_init(&blk->submit_head);
-		block_put(&blk);
-	}
-
-	rebalance_cache(bdat);
-}
-
-/*
- * The caller's block changes form a consistent transaction.  If the amount of dirty
- * blocks is large enough we issue a write.
- */
-int block_try_commit(bool force)
-{
-	struct block_data *bdat = &global_bdat;
-	struct block *blk;
-	struct block *blk_;
-	LIST_HEAD(list);
-	int ret;
-
-	if (!force && bdat->nr_dirty < bdat->nr_events)
-		return 0;
-
-	list_for_each_entry(blk, &bdat->dirty_list, dirty_head) {
-		list_add_tail(&blk->submit_head, &list);
-		inc_refcount(blk);
-	}
-
-	ret = submit_and_wait(bdat, &list);
-
-	list_for_each_entry_safe(blk, blk_, &list, submit_head) {
-		list_del_init(&blk->submit_head);
-		block_put(&blk);
-	}
-
-	if (ret < 0) {
-		printf("error writing dirty transaction blocks\n");
-		goto out;
-	}
-
-	ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_OVERWRITE | BF_DIRTY);
-	if (ret == 0) {
-		list_add(&blk->submit_head, &list);
-		ret = submit_and_wait(bdat, &list);
-		list_del_init(&blk->submit_head);
-		block_put(&blk);
-	} else {
-		ret = -ENOMEM;
-	}
-	if (ret < 0)
-		printf("error writing super block to commit transaction\n");
-
-out:
-	rebalance_cache(bdat);
-	return ret;
-}
-
-int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes)
-{
-	struct block_data *bdat = &global_bdat;
-	size_t i;
-	int ret;
-
-	bdat->max_cached = DIV_ROUND_UP(max_cached_bytes, SCOUTFS_BLOCK_LG_SIZE);
-	bdat->hash_nr = bdat->max_cached / 4;
-	bdat->nr_events = DIV_ROUND_UP(max_dirty_bytes, SCOUTFS_BLOCK_LG_SIZE);
-
-	bdat->iocbs = calloc(bdat->nr_events, sizeof(bdat->iocbs[0]));
-	bdat->iocbps = calloc(bdat->nr_events, sizeof(bdat->iocbps[0]));
-	bdat->events = calloc(bdat->nr_events, sizeof(bdat->events[0]));
-	bdat->hash_lists = calloc(bdat->hash_nr, sizeof(bdat->hash_lists[0]));
-	if (!bdat->iocbs || !bdat->iocbps || !bdat->events || !bdat->hash_lists) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	INIT_LIST_HEAD(&bdat->active_head);
-	INIT_LIST_HEAD(&bdat->inactive_head);
-	INIT_LIST_HEAD(&bdat->dirty_list);
-	bdat->meta_fd = meta_fd;
-	list_add(&bdat->inactive_head, &bdat->active_head);
-
-	for (i = 0; i < bdat->hash_nr; i++)
-		INIT_LIST_HEAD(&bdat->hash_lists[i]);
-
-	ret = syscall(__NR_io_setup, bdat->nr_events, &bdat->ctx);
-
-out:
-	if (ret < 0) {
-		free(bdat->iocbs);
-		free(bdat->iocbps);
-		free(bdat->events);
-		free(bdat->hash_lists);
-	}
-
-	return ret;
-}
-
-void block_shutdown(void)
-{
-	struct block_data *bdat = &global_bdat;
-
-	syscall(SYS_io_destroy, bdat->ctx);
-
-	free(bdat->iocbs);
-	free(bdat->iocbps);
-	free(bdat->events);
-	free(bdat->hash_lists);
-}
@@ -1,32 +0,0 @@
-#ifndef _SCOUTFS_UTILS_CHECK_BLOCK_H_
-#define _SCOUTFS_UTILS_CHECK_BLOCK_H_
-
-#include <unistd.h>
-#include <stdbool.h>
-
-struct block;
-
-#include "sparse.h"
-
-/* block flags passed to block_get() */
-enum {
-	BF_ZERO      = (1 << 0), /* zero contents buf as block is returned */
-	BF_DIRTY     = (1 << 1), /* block will be written with transaction */
-	BF_SM        = (1 << 2), /* small 4k block instead of large 64k block */
-	BF_OVERWRITE = (1 << 3), /* caller will overwrite contents, don't read */
-};
-
-int block_get(struct block **blk_ret, u64 blkno, int bf);
-void block_put(struct block **blkp);
-
-void *block_buf(struct block *blk);
-size_t block_size(struct block *blk);
-void block_drop(struct block **blkp);
-
-void block_readahead(u64 *blknos, size_t nr);
-int block_try_commit(bool force);
-
-int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes);
-void block_shutdown(void);
-
-#endif
@@ -1,209 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <errno.h>
-
-#include "sparse.h"
-#include "util.h"
-#include "format.h"
-#include "key.h"
-#include "avl.h"
-
-#include "block.h"
-#include "btree.h"
-#include "extent.h"
-#include "iter.h"
-#include "sns.h"
-#include "meta.h"
-#include "problem.h"
-
-/* Return a pointer to an item's value, stored val_off bytes from the start of its block. */
-static inline void *item_val(struct scoutfs_btree_block *bt, struct scoutfs_btree_item *item)
-{
-	u16 off = le16_to_cpu(item->val_off);
-
-	return (char *)bt + off;
-}
-
-/*
- * Issue readahead for every valid metadata block referenced by the
- * items of a parent block.  This is best effort: allocation failure
- * silently skips the readahead.
- *
- * The walk is bounded by nr_items but also stops as soon as the item
- * tree runs out of nodes, so a block whose nr_items overstates its
- * contents can't make us dereference a missing node.
- */
-static void readahead_refs(struct scoutfs_btree_block *bt)
-{
-	struct scoutfs_btree_item *item;
-	struct scoutfs_avl_node *node;
-	struct scoutfs_block_ref *ref;
-	u64 *blknos;
-	u64 blkno;
-	u16 valid = 0;
-	u16 nr = le16_to_cpu(bt->nr_items);
-	int i;
-
-	blknos = calloc(nr, sizeof(blknos[0]));
-	if (!blknos)
-		return;
-
-	node = avl_first(&bt->item_root);
-
-	for (i = 0; i < nr && node; i++) {
-		item = container_of(node, struct scoutfs_btree_item, node);
-		ref = item_val(bt, item);
-		blkno = le64_to_cpu(ref->blkno);
-
-		/* only ask for blocks that are inside the metadata device */
-		if (valid_meta_blkno(blkno))
-			blknos[valid++] = blkno;
-
-		node = avl_next(&bt->item_root, &item->node);
-	}
-
-	if (valid > 0)
-		block_readahead(blknos, valid);
-	free(blknos);
-}
-
-/*
- * Call the callback on the referenced block.  Then if the block
- * contains references read it and recurse into all its references.
- *
- * Returns 0 on success or a negative errno.  A callback that returns
- * -ECHECK_ITER_DONE stops the descent through this reference and is
- * reported as success; any other callback error is returned to the
- * caller rather than being dropped.
- */
-static int btree_ref_meta_iter(struct scoutfs_block_ref *ref, unsigned level, extent_cb_t cb,
-			       void *cb_arg)
-{
-	struct scoutfs_btree_item *item;
-	struct scoutfs_btree_block *bt;
-	struct scoutfs_avl_node *node;
-	struct block *blk = NULL;
-	u64 blkno;
-	int ret;
-	int i;
-
-	blkno = le64_to_cpu(ref->blkno);
-	if (!blkno)
-		return 0;
-
-	ret = cb(blkno, 1, cb_arg);
-	if (ret < 0)
-		return xlate_iter_errno(ret);
-
-	/* leaves hold items, not block references, so there's nothing to descend into */
-	if (level == 0)
-		return 0;
-
-	ret = block_get(&blk, blkno, 0);
-	if (ret < 0)
-		return ret;
-
-	sns_push("btree_parent", blkno, 0);
-
-	bt = block_buf(blk);
-
-	/* XXX integrate verification with block cache */
-	if (bt->level != level) {
-		problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/* read-ahead last level of parents */
-	if (level == 2)
-		readahead_refs(bt);
-
-	node = avl_first(&bt->item_root);
-
-	/* stop early if nr_items claims more items than the tree holds */
-	for (i = 0; i < le16_to_cpu(bt->nr_items) && node; i++) {
-		item = container_of(node, struct scoutfs_btree_item, node);
-		ref = item_val(bt, item);
-
-		ret = btree_ref_meta_iter(ref, level - 1, cb, cb_arg);
-		if (ret < 0)
-			goto out;
-
-		node = avl_next(&bt->item_root, &item->node);
-	}
-
-	ret = 0;
-out:
-	block_put(&blk);
-	sns_pop();
-
-	return ret;
-}
-
-/*
- * Call the callback for every metadata block that makes up the btree
- * under the given root.  An empty tree (height 0) has no blocks.
- */
-int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg)
-{
-	/* XXX check root */
-	if (root->height != 0)
-		return btree_ref_meta_iter(&root->ref, root->height - 1, cb, cb_arg);
-
-	return 0;
-}
-
-/*
- * Read the referenced block and either recurse into its child
- * references (parent blocks) or call the callback on each of its items
- * (leaf blocks).
- *
- * Returns 0 on success or a negative errno.  A callback return of
- * -ECHECK_ITER_DONE stops the walk of this block and is reported as
- * success.
- */
-static int btree_ref_item_iter(struct scoutfs_block_ref *ref, unsigned level,
-			       btree_item_cb_t cb, void *cb_arg)
-{
-	struct scoutfs_btree_item *item;
-	struct scoutfs_btree_block *bt;
-	struct scoutfs_avl_node *node;
-	struct block *blk = NULL;
-	u64 blkno;
-	int ret;
-	int i;
-
-	blkno = le64_to_cpu(ref->blkno);
-	if (!blkno)
-		return 0;
-
-	ret = block_get(&blk, blkno, 0);
-	if (ret < 0)
-		return ret;
-
-	if (level)
-		sns_push("btree_parent", blkno, 0);
-	else
-		sns_push("btree_leaf", blkno, 0);
-
-	bt = block_buf(blk);
-
-	/* XXX integrate verification with block cache */
-	if (bt->level != level) {
-		problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/* read-ahead leaves that contain items */
-	if (level == 1)
-		readahead_refs(bt);
-
-	node = avl_first(&bt->item_root);
-
-	/* stop early if nr_items claims more items than the tree holds */
-	for (i = 0; i < le16_to_cpu(bt->nr_items) && node; i++) {
-		item = container_of(node, struct scoutfs_btree_item, node);
-
-		if (level) {
-			ref = item_val(bt, item);
-			ret = btree_ref_item_iter(ref, level - 1, cb, cb_arg);
-		} else {
-			ret = cb(&item->key, item_val(bt, item),
-				 le16_to_cpu(item->val_len), cb_arg);
-			debug("free item key "SK_FMT" ret %d", SK_ARG(&item->key), ret);
-		}
-		if (ret < 0) {
-			ret = xlate_iter_errno(ret);
-			goto out;
-		}
-
-		node = avl_next(&bt->item_root, &item->node);
-	}
-
-	ret = 0;
-out:
-	block_put(&blk);
-	sns_pop();
-
-	return ret;
-}
-
-/*
- * Call the callback on every item stored in the leaves of the btree
- * under the given root.  An empty tree (height 0) has no items.
- */
-int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg)
-{
-	/* XXX check root */
-	if (root->height != 0)
-		return btree_ref_item_iter(&root->ref, root->height - 1, cb, cb_arg);
-
-	return 0;
-}
@@ -1,14 +0,0 @@
-#ifndef _SCOUTFS_UTILS_CHECK_BTREE_H_
-#define _SCOUTFS_UTILS_CHECK_BTREE_H_
-
-#include "util.h"
-#include "format.h"
-
-#include "extent.h"
-
-typedef int (*btree_item_cb_t)(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg);
-
-int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg);
-int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg);
-
-#endif
@@ -1,149 +0,0 @@
-#define _GNU_SOURCE /* O_DIRECT */
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/ioctl.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <string.h>
-#include <assert.h>
-#include <stdbool.h>
-#include <argp.h>
-
-#include "sparse.h"
-#include "parse.h"
-#include "util.h"
-#include "format.h"
-#include "ioctl.h"
-#include "cmd.h"
-#include "dev.h"
-
-#include "alloc.h"
-#include "block.h"
-#include "debug.h"
-#include "meta.h"
-#include "super.h"
-
-/* Command line arguments of the check command, filled in by parse_opt(). */
-struct check_args {
-	char *meta_device;	/* first positional argument */
-	char *data_device;	/* second positional argument */
-	char *debug_path;	/* optional --debug output file, NULL when not given */
-};
-
-/*
- * Open the devices, set up the block layer, and run the checks.
- * Returns 0 on success or a negative errno.
- *
- * The block layer is only shut down if it was successfully set up:
- * block_setup() releases its own allocations when it fails, so calling
- * block_shutdown() after a failed or skipped setup would free them a
- * second time.
- */
-static int do_check(struct check_args *args)
-{
-	bool block_ready = false;
-	int debug_fd = -1;
-	int meta_fd = -1;
-	int data_fd = -1;
-	int ret;
-
-	if (args->debug_path) {
-		debug_fd = open(args->debug_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
-		if (debug_fd < 0) {
-			ret = -errno;
-			fprintf(stderr, "error opening debug output file '%s': %s (%d)\n",
-				args->debug_path, strerror(errno), errno);
-			goto out;
-		}
-
-		debug_enable(debug_fd);
-	}
-
-	meta_fd = open(args->meta_device, O_DIRECT | O_RDWR | O_EXCL);
-	if (meta_fd < 0) {
-		ret = -errno;
-		fprintf(stderr, "failed to open meta device '%s': %s (%d)\n",
-			args->meta_device, strerror(errno), errno);
-		goto out;
-	}
-
-	data_fd = open(args->data_device, O_DIRECT | O_RDWR | O_EXCL);
-	if (data_fd < 0) {
-		ret = -errno;
-		fprintf(stderr, "failed to open data device '%s': %s (%d)\n",
-			args->data_device, strerror(errno), errno);
-		goto out;
-	}
-
-	ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024);
-	if (ret < 0)
-		goto out;
-	block_ready = true;
-
-	ret = check_supers() ?:
-	      check_meta_alloc();
-out:
-	/* and tear it all down */
-	if (block_ready)
-		block_shutdown();
-	super_shutdown();
-	debug_disable();
-
-	if (meta_fd >= 0)
-		close(meta_fd);
-	if (data_fd >= 0)
-		close(data_fd);
-	if (debug_fd >= 0)
-		close(debug_fd);
-
-	return ret;
-}
-
-/*
- * argp parser for the check command.  The first positional argument is
- * the metadata device and the second is the data device; both are
- * required.  Always returns 0, problems are reported via argp_error().
- */
-static int parse_opt(int key, char *arg, struct argp_state *state)
-{
-	struct check_args *args = state->input;
-
-	switch (key) {
-	case 'd':
-		args->debug_path = strdup_or_error(state, arg);
-		break;
-	case ARGP_KEY_ARG:
-		if (!args->meta_device)
-			args->meta_device = strdup_or_error(state, arg);
-		else if (!args->data_device)
-			args->data_device = strdup_or_error(state, arg);
-		else
-			argp_error(state, "more than two device arguments given");
-		break;
-	case ARGP_KEY_FINI:
-		if (!args->meta_device)
-			argp_error(state, "no metadata device argument given");
-		if (!args->data_device)
-			argp_error(state, "no data device argument given");
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-static struct argp_option options[] = {
-	{ "debug", 'd', "FILE_PATH", 0, "Path to debug output file, will be created or truncated"},
-	{ NULL }
-};
-
-static struct argp argp = {
-	options,
-	parse_opt,
-	"META-DEVICE DATA-DEVICE",
-	"Check filesystem consistency"
-};
-
-/* Entry point of the check command: parse the arguments and then run the check. */
-static int check_cmd(int argc, char **argv)
-{
-	struct check_args check_args = {NULL};
-	int err = argp_parse(&argp, argc, argv, 0, NULL, &check_args);
-
-	return err ? err : do_check(&check_args);
-}
-
-/* Runs before main() via the constructor attribute to register the "check" command. */
-static void __attribute__((constructor)) check_ctor(void)
-{
-	cmd_register_argp("check", &argp, GROUP_CORE, check_cmd);
-}
@@ -1,16 +0,0 @@
-#include <stdlib.h>
-
-#include "debug.h"
-
-int debug_fd = -1;
-
-/* Record the fd for debug output.  The caller keeps ownership of the fd. */
-void debug_enable(int fd)
-{
-	debug_fd = fd;
-}
-
-/* Forget the debug output fd.  The fd itself is not closed here. */
-void debug_disable(void)
-{
-	if (debug_fd >= 0)
-		debug_fd = -1;
-}
@@ -1,17 +0,0 @@
-#ifndef _SCOUTFS_UTILS_CHECK_DEBUG_H_
-#define _SCOUTFS_UTILS_CHECK_DEBUG_H_
-
-#include <stdio.h>
-
-/*
- * Write a formatted line to the debug fd, if one is set.  A newline is
- * appended to the format, which must be a string literal so that it
- * can be concatenated.
- */
-#define debug(fmt, args...)				\
-do {							\
-	if (debug_fd >= 0)				\
-		dprintf(debug_fd, fmt"\n", ##args);	\
-} while (0)
-
-extern int debug_fd;
-
-void debug_enable(int fd);
-void debug_disable(void);
-
-#endif
@@ -1,9 +0,0 @@
-#ifndef _SCOUTFS_UTILS_CHECK_ENO_H_
-#define _SCOUTFS_UTILS_CHECK_ENO_H_
-
-#include <errno.h>
-
-/*
- * Format and matching arguments for printing an errno as "number (message)".
- * ENO_ARG() expands to two comma separated arguments and evaluates its
- * argument twice, so don't pass an expression with side effects.  Users
- * need <string.h> for strerror().
- */
-#define ENO_FMT		"%d (%s)"
-#define ENO_ARG(eno)	(eno), strerror(eno)
-
-#endif
@@ -1,312 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <errno.h>
-
-#include "util.h"
-#include "lk_rbtree_wrapper.h"
-
-#include "debug.h"
-#include "extent.h"
-
-/*
- * In-memory extent management in rbtree nodes.
- */
-
-/* True if the two ranges share at least one block.  Ends are exclusive. */
-bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len)
-{
-	u64 a_end = a_start + a_len;
-	u64 b_end = b_start + b_len;
-
-	/* each range has to end after the other one starts */
-	return a_end > b_start && b_end > a_start;
-}
-
-/* True if the given range lies entirely within the extent. */
-static int ext_contains(struct extent_node *ext, u64 start, u64 len)
-{
-	u64 ext_end = ext->start + ext->len;
-	u64 end = start + len;
-
-	return ext->start <= start && ext_end >= end;
-}
-
-/*
- * True if the range sits strictly inside the extent, so that removing
- * it would leave a piece of the extent on both the left and the right
- * of the range.
- */
-static int ext_bisected(struct extent_node *ext, u64 start, u64 len)
-{
-	u64 ext_end = ext->start + ext->len;
-	u64 end = start + len;
-
-	return ext->start < start && ext_end > end;
-}
-
-/* Map an rbtree node to the extent that embeds it, passing NULL through. */
-static struct extent_node *ext_from_rbnode(struct rb_node *rbnode)
-{
-	if (!rbnode)
-		return NULL;
-
-	return container_of(rbnode, struct extent_node, rbnode);
-}
-
-/* Return the extent that follows the given one in the tree, or NULL. */
-static struct extent_node *next_ext(struct extent_node *ext)
-{
-	if (!ext)
-		return NULL;
-
-	return ext_from_rbnode(rb_next(&ext->rbnode));
-}
-
-/* Return the extent that precedes the given one in the tree, or NULL. */
-static struct extent_node *prev_ext(struct extent_node *ext)
-{
-	if (!ext)
-		return NULL;
-
-	return ext_from_rbnode(rb_prev(&ext->rbnode));
-}
-
-/*
- * Input flag and outputs of walk_extents().
- */
-struct walk_results {
-	/* input: keep descending right past a found extent that the range bisects */
-	unsigned bisect_to_leaf:1;
-	/* last extent visited that overlaps the range, NULL if none */
-	struct extent_node *found;
-	/* last extent the walk went left at; it starts at or after the end of the range */
-	struct extent_node *next;
-	/* last node visited, and the link the walk stopped at; passed to rb_link_node() */
-	struct rb_node *parent;
-	struct rb_node **node;
-};
-
-/*
- * Descend the extent tree looking for an extent that overlaps the
- * range.  The walk stops at the first overlapping extent unless the
- * caller set bisect_to_leaf and the range bisects that extent, in which
- * case it keeps going right until it falls off the tree so that the
- * caller can insert the right-hand remainder there.
- */
-static void walk_extents(struct extent_root *root, u64 start, u64 len, struct walk_results *wlk)
-{
-	struct rb_node **link = &root->rbroot.rb_node;
-	struct extent_node *cur;
-	u64 end = start + len;
-
-	wlk->found = NULL;
-	wlk->next = NULL;
-	wlk->parent = NULL;
-
-	while (*link) {
-		wlk->parent = *link;
-		cur = ext_from_rbnode(*link);
-
-		/* range is entirely before this extent */
-		if (end <= cur->start) {
-			link = &cur->rbnode.rb_left;
-			wlk->next = cur;
-			continue;
-		}
-
-		/* range is entirely after this extent */
-		if (start >= cur->start + cur->len) {
-			link = &cur->rbnode.rb_right;
-			continue;
-		}
-
-		wlk->found = cur;
-		if (!wlk->bisect_to_leaf || !ext_bisected(cur, start, len))
-			break;
-
-		/* walk right so we can insert greater right from bisection */
-		link = &cur->rbnode.rb_right;
-	}
-
-	wlk->node = link;
-}
-
-/*
- * Find an extent that overlaps with the given range.  On success its
- * start and len are copied into the caller's node, whose other fields
- * are zeroed, and 0 is returned.  -ENOENT is returned if nothing
- * overlaps.
- */
-int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found)
-{
-	struct walk_results wlk = { 0, };
-
-	walk_extents(root, start, len, &wlk);
-	if (!wlk.found)
-		return -ENOENT;
-
-	memset(found, 0, sizeof(struct extent_node));
-	found->start = wlk.found->start;
-	found->len = wlk.found->len;
-
-	return 0;
-}
-
-/*
- * Callers can iterate through direct node references and are entirely
- * responsible for consistency when doing so.
- *
- * The first extent is either the one that covers block 0 or, failing
- * that, the extent the walk for block 0 last went left at.
- */
-struct extent_node *extent_first(struct extent_root *root)
-{
-	struct walk_results wlk = { 0, };
-
-	walk_extents(root, 0, 1, &wlk);
-	if (wlk.found)
-		return wlk.found;
-
-	return wlk.next;
-}
-
-/* Return the extent after the given one, or NULL at the end or when given NULL. */
-struct extent_node *extent_next(struct extent_node *ext)
-{
-	return next_ext(ext);
-}
-
-/* Return the extent before the given one, or NULL at the start or when given NULL. */
-struct extent_node *extent_prev(struct extent_node *ext)
-{
-	return prev_ext(ext);
-}
-
-/*
- * Add a range to the tree.  If the range overlaps an existing extent
- * then either found_err is returned (when it's non-zero) or that extent
- * grows to cover the range.  Otherwise a new node is allocated for it.
- * Either way the resulting extent then absorbs every neighbour that it
- * overlaps or touches.
- */
-static int walk_insert(struct extent_root *root, u64 start, u64 len, int found_err)
-{
-	struct walk_results wlk = { 0, };
-	struct extent_node *cur;
-	struct extent_node *adj;
-	u64 end = start + len;
-	int ret;
-
-	walk_extents(root, start, len, &wlk);
-	cur = wlk.found;
-
-	if (cur && found_err) {
-		ret = found_err;
-		goto out;
-	}
-
-	if (!cur) {
-		cur = malloc(sizeof(struct extent_node));
-		if (!cur) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		cur->start = start;
-		cur->len = len;
-
-		rb_link_node(&cur->rbnode, wlk.parent, wlk.node);
-		rb_insert_color(&cur->rbnode, &root->rbroot);
-	}
-
-	/* grow an existing extent to the left and right to cover our range */
-	if (start < cur->start) {
-		cur->len += cur->start - start;
-		cur->start = start;
-	}
-	if (cur->start + cur->len < end)
-		cur->len += end - (cur->start + cur->len);
-
-	/* absorb previous neighbours that overlap or touch our start */
-	for (adj = prev_ext(cur); adj; adj = prev_ext(cur)) {
-		if (adj->start + adj->len < cur->start)
-			break;
-
-		if (adj->start < cur->start) {
-			cur->len += cur->start - adj->start;
-			cur->start = adj->start;
-		}
-
-		rb_erase(&adj->rbnode, &root->rbroot);
-		free(adj);
-	}
-
-	/* absorb following neighbours that overlap or touch our end */
-	for (adj = next_ext(cur); adj; adj = next_ext(cur)) {
-		if (cur->start + cur->len < adj->start)
-			break;
-
-		if (cur->start + cur->len < adj->start + adj->len)
-			cur->len += (adj->start + adj->len) - (cur->start + cur->len);
-
-		rb_erase(&adj->rbnode, &root->rbroot);
-		free(adj);
-	}
-
-	ret = 0;
-out:
-	debug("start %llu len %llu ret %d", start, len, ret);
-	return ret;
-}
-
-/*
- * Insert a new extent.  The specified extent must not overlap with any
- * existing extents or -EEXIST is returned.  -ENOMEM is returned if the
- * new node can't be allocated.
- */
-int extent_insert_new(struct extent_root *root, u64 start, u64 len)
-{
-	/* found_err is returned as is, so it has to be the negative errno */
-	return walk_insert(root, start, len, -EEXIST);
-}
-
-/*
- * Insert an extent.  Overlapping an existing extent is not an error,
- * the existing extent is grown to cover the new range instead.
- */
-int extent_insert_extend(struct extent_root *root, u64 start, u64 len)
-{
-	/* a found_err of 0 tells walk_insert() that overlap is allowed */
-	return walk_insert(root, start, len, 0);
-}
-
-/*
- * Remove the specified extent from an existing node.  The given extent must be fully
- * contained in a single node or -ENOENT is returned.  -ENOMEM is returned if the
- * range bisects the node and the node for the right remainder can't be allocated.
- */
-int extent_remove(struct extent_root *root, u64 start, u64 len)
-{
-	struct extent_node *ext;
-	struct extent_node *ins;
-	struct walk_results wlk = {
-		.bisect_to_leaf = 1,
-	};
-	int ret;
-
-	walk_extents(root, start, len, &wlk);
-
-	if (!(ext = wlk.found) || !ext_contains(ext, start, len)) {
-		ret = -ENOENT;
-		goto out;
-	}
-
-	if (ext_bisected(ext, start, len)) {
-		debug("found bisected start %llu len %llu", ext->start, ext->len);
-		ins = malloc(sizeof(struct extent_node));
-		if (!ins) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		/* the new node takes what's left to the right of the range */
-		ins->start = start + len;
-		ins->len = (ext->start + ext->len) - ins->start;
-
-		rb_link_node(&ins->rbnode, wlk.parent, wlk.node);
-		rb_insert_color(&ins->rbnode, &root->rbroot);
-	}
-
-	if (start > ext->start) {
-		/* trim the tail, any right remainder was inserted above */
-		ext->len = start - ext->start;
-	} else if (len < ext->len) {
-		/* trim the head */
-		ext->start += len;
-		ext->len -= len;
-	} else {
-		/* the whole node is removed, it's ours to free once unlinked */
-		rb_erase(&ext->rbnode, &root->rbroot);
-		free(ext);
-	}
-
-	ret = 0;
-out:
-	debug("start %llu len %llu ret %d", start, len, ret);
-
-	return ret;
-}
-
-/* Initialize a root to an empty tree with a total of 0. */
-void extent_root_init(struct extent_root *root)
-{
-	root->rbroot = RB_ROOT;
-	root->total = 0;
-}
-
-/* Unlink and free every extent in the tree, leaving it empty.  The total is not touched. */
-void extent_root_free(struct extent_root *root)
-{
-	struct extent_node *ext;
-	struct rb_node *node;
-
-	while ((node = rb_first(&root->rbroot))) {
-		ext = rb_entry(node, struct extent_node, rbnode);
-		rb_erase(&ext->rbnode, &root->rbroot);
-		free(ext);
-	}
-}
-
-/* Write every extent in the tree to the debug output, in sorted order. */
-void extent_root_print(struct extent_root *root)
-{
-	struct extent_node *ext;
-	struct rb_node *node;
-
-	for (node = rb_first(&root->rbroot); node; node = rb_next(node)) {
-		ext = rb_entry(node, struct extent_node, rbnode);
-		debug("  start %llu len %llu", ext->start, ext->len);
-	}
-}
@@ -1,38 +0,0 @@
-#ifndef _SCOUTFS_UTILS_CHECK_EXTENT_H_
-#define _SCOUTFS_UTILS_CHECK_EXTENT_H_
-
-#include "lk_rbtree_wrapper.h"
-
-/* A set of non-overlapping extents, kept in an rbtree of extent_node. */
-struct extent_root {
-	struct rb_root rbroot;
-	/* zeroed by extent_root_init(); presumably maintained by users of the tree -- confirm */
-	u64 total;
-};
-
-/* One extent: the blocks from start up to, but not including, start + len. */
-struct extent_node {
-	struct rb_node rbnode;
-	u64 start;
-	u64 len;
-};
-
-typedef int (*extent_cb_t)(u64 start, u64 len, void *arg);
-
-struct extent_cb_arg_t {
-	extent_cb_t cb;
-	void *cb_arg;
-};
-
-bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len);
-
-int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found);
-struct extent_node *extent_first(struct extent_root *root);
-struct extent_node *extent_next(struct extent_node *ext);
-struct extent_node *extent_prev(struct extent_node *ext);
-int extent_insert_new(struct extent_root *root, u64 start, u64 len);
-int extent_insert_extend(struct extent_root *root, u64 start, u64 len);
-int extent_remove(struct extent_root *root, u64 start, u64 len);
-
-void extent_root_init(struct extent_root *root);
-void extent_root_free(struct extent_root *root);
-void extent_root_print(struct extent_root *root);
-
-#endif
@@ -1,540 +0,0 @@
-#define _GNU_SOURCE /* O_DIRECT */
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <string.h>
-#include <stdbool.h>
-#include <argp.h>
-
-#include "sparse.h"
-#include "bitmap.h"
-#include "parse.h"
-#include "util.h"
-#include "format.h"
-#include "crc.h"
-#include "cmd.h"
-#include "dev.h"
-
-#include "alloc.h"
-#include "block.h"
-#include "btree.h"
-#include "log_trees.h"
-#include "super.h"
-
-/* huh. */
-#define OFF_MAX (off_t)((u64)((off_t)~0ULL) >> 1)
-
-#define SCOUTFS_META_IMAGE_HEADER_MAGIC		0x8aee00d098fa60c5ULL
-#define SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC	0x70bd5e9269effd86ULL
-
-struct scoutfs_meta_image_header {
-	__le64 magic;
-	__le64 total_bytes;
-	__le32 version;
-} __packed;
-
-struct scoutfs_meta_image_block_header {
-	__le64 magic;
-	__le64 offset;
-	__le32 size;
-	__le32 crc;
-} __packed;
-
-/* Arguments shared by the read-metadata-image and write-metadata-image commands. */
-struct image_args {
-	char *meta_device;	/* the single positional argument */
-	bool is_read;		/* true for read-metadata-image, false for write-metadata-image */
-	bool show_header;	/* print the image header and return without processing blocks */
-	u64 ra_window;		/* readahead window; read_image() consumes it in bytes */
-};
-
-/* Bitmap of the large metadata blocks that are referenced and so belong in the image. */
-struct block_bitmaps {
-	unsigned long *bits;	/* one bit per large block of the device */
-	u64 size;		/* number of bits */
-	u64 count;		/* number of bits that are set */
-};
-
-#define errf(fmt, args...) \
-	dprintf(STDERR_FILENO, fmt, ##args)
-
-/*
- * Extent callback which records a single referenced block in the
- * bitmap, counting it the first time it's seen.
- *
- * Block numbers come from references read off of the device, so they
- * are checked against the size of the bitmap before being used as a bit
- * index.  Returns -EINVAL for anything but an in-range single block.
- */
-static int set_meta_bit(u64 start, u64 len, void *arg)
-{
-	struct block_bitmaps *bm = arg;
-
-	if (len != 1 || start >= bm->size)
-		return -EINVAL;
-
-	if (!test_bit(bm->bits, start)) {
-		set_bit(bm->bits, start);
-		bm->count++;
-	}
-
-	return 0;
-}
-
-/*
- * Set a bit for every metadata block that is referenced from the super
- * block.  The walks run in order and stop at the first one that returns
- * non-zero, which is then returned.
- */
-static int get_ref_bits(struct block_bitmaps *bm)
-{
-	struct scoutfs_super_block *super = global_super;
-	int ret;
-	u64 q;
-
-	/*
-	 * There are almost no small blocks we need to read, so we read
-	 * them as the large blocks that contain them to simplify the
-	 * block reading process.
-	 */
-	set_meta_bit(SCOUTFS_SUPER_BLKNO >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
-
-	for (q = 0; q < SCOUTFS_QUORUM_BLOCKS; q++)
-		set_meta_bit((SCOUTFS_QUORUM_BLKNO + q) >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
-
-	ret = alloc_root_meta_iter(&super->meta_alloc[0], set_meta_bit, bm);
-	if (!ret)
-		ret = alloc_root_meta_iter(&super->meta_alloc[1], set_meta_bit, bm);
-	if (!ret)
-		ret = alloc_root_meta_iter(&super->data_alloc, set_meta_bit, bm);
-	if (!ret)
-		ret = alloc_list_meta_iter(&super->server_meta_avail[0], set_meta_bit, bm);
-	if (!ret)
-		ret = alloc_list_meta_iter(&super->server_meta_avail[1], set_meta_bit, bm);
-	if (!ret)
-		ret = alloc_list_meta_iter(&super->server_meta_freed[0], set_meta_bit, bm);
-	if (!ret)
-		ret = alloc_list_meta_iter(&super->server_meta_freed[1], set_meta_bit, bm);
-	if (!ret)
-		ret = btree_meta_iter(&super->fs_root, set_meta_bit, bm);
-	if (!ret)
-		ret = btree_meta_iter(&super->logs_root, set_meta_bit, bm);
-	if (!ret)
-		ret = btree_meta_iter(&super->log_merge, set_meta_bit, bm);
-	if (!ret)
-		ret = btree_meta_iter(&super->mounted_clients, set_meta_bit, bm);
-	if (!ret)
-		ret = btree_meta_iter(&super->srch_root, set_meta_bit, bm);
-	if (!ret)
-		ret = log_trees_meta_iter(set_meta_bit, bm);
-
-	return ret;
-}
-
-/*
- * Calculate the crc of a block header followed by its block contents.
- * The header is covered with its crc field zeroed; that's done on a
- * copy so the caller's header is left alone.
- */
-static __le32 calc_crc(struct scoutfs_meta_image_block_header *bh, void *buf, size_t size)
-{
-	struct scoutfs_meta_image_block_header zeroed = *bh;
-	u32 sum = ~0;
-
-	zeroed.crc = 0;
-	sum = crc32c(sum, &zeroed, sizeof(zeroed));
-	sum = crc32c(sum, buf, size);
-
-	return cpu_to_le32(sum);
-}
-
-/* Print the fields of an image header to stderr; stdout may be carrying the image stream. */
-static void printf_header(struct scoutfs_meta_image_header *hdr)
-{
-	errf("magic: 0x%016llx\n"
-	     "total_bytes: %llu\n"
-	     "version: %u\n",
-	       le64_to_cpu(hdr->magic),
-	       le64_to_cpu(hdr->total_bytes),
-	       le32_to_cpu(hdr->version));
-}
-
-/*
- * A common signature for the four io calls so that rw_full_count() can
- * drive any of them.  The streaming read() and write() wrappers ignore
- * the offset argument.
- */
-typedef ssize_t (*rw_func_t)(int fd, void *buf, size_t count, off_t offset);
-
-/* read() from the fd's current position, offset is unused */
-static inline ssize_t rw_read(int fd, void *buf, size_t count, off_t offset)
-{
-	return read(fd, buf, count);
-}
-
-/* pread() at the given offset */
-static inline ssize_t rw_pread(int fd, void *buf, size_t count, off_t offset)
-{
-	return pread(fd, buf, count, offset);
-}
-
-/* write() at the fd's current position, offset is unused */
-static inline ssize_t rw_write(int fd, void *buf, size_t count, off_t offset)
-{
-	return write(fd, buf, count);
-}
-
-/* pwrite() at the given offset */
-static inline ssize_t rw_pwrite(int fd, void *buf, size_t count, off_t offset)
-{
-	return pwrite(fd, buf, count, offset);
-}
-
-/*
- * Call the io function until the full count has been transferred,
- * continuing after short reads and writes.  Both the buffer position
- * and the file offset advance past each partial transfer so that the
- * positional calls carry on from where the last one stopped; the
- * streaming calls ignore the offset.
- *
- * If tot is given then it's increased by the number of bytes
- * transferred.  Returns 0 on success, -errno if the io call fails, or
- * -EIO if it returns 0 or more than was asked for.
- */
-static int rw_full_count(rw_func_t func, u64 *tot, int fd, void *buf, size_t count, off_t offset)
-{
-	char *pos = buf;
-	ssize_t sret;
-
-	while (count > 0) {
-		sret = func(fd, pos, count, offset);
-		if (sret < 0)
-			return -errno;
-		if (sret == 0 || (size_t)sret > count)
-			return -EIO;
-
-		if (tot)
-			*tot += sret;
-		pos += sret;
-		offset += sret;
-		count -= sret;
-	}
-
-	return 0;
-}
-
-/*
- * Write the image stream to stdout: the image header followed by a
- * block header and the block contents for each set bit in the bitmap.
- * With show_header set only the header is printed, to stderr.
- *
- * Readahead is issued ahead of the reads.  It first opens up a window
- * of ra_window bytes and from then on advances one block for each block
- * read.
- *
- * Returns 0 on success or a negative errno.
- */
-static int read_image(struct image_args *args, int fd, struct block_bitmaps *bm)
-{
-	struct scoutfs_meta_image_block_header bh;
-	struct scoutfs_meta_image_header hdr;
-	u64 opening;
-	void *buf;
-	off_t off;
-	u64 bit;
-	u64 ra;
-	int ret;
-
-	buf = malloc(SCOUTFS_BLOCK_LG_SIZE);
-	if (!buf) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	hdr.magic = cpu_to_le64(SCOUTFS_META_IMAGE_HEADER_MAGIC);
-	hdr.total_bytes = cpu_to_le64(sizeof(hdr) +
-				      (bm->count * (SCOUTFS_BLOCK_LG_SIZE + sizeof(bh))));
-	hdr.version = cpu_to_le32(1);
-
-	if (args->show_header) {
-		printf_header(&hdr);
-		ret = 0;
-		goto out;
-	}
-
-	ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &hdr, sizeof(hdr), 0);
-	if (ret < 0)
-		goto out;
-
-	opening = args->ra_window;
-	ra = 0;
-
-	for (bit = 0; (bit = find_next_set_bit(bm->bits, bit, bm->size)) < bm->size; bit++) {
-
-		/* readahead to open the full window, then a block at a time */
-		do {
-			ra = find_next_set_bit(bm->bits, ra, bm->size);
-			if (ra >= bm->size) {
-				/*
-				 * Out of blocks to read ahead.  Give up on
-				 * opening the window, it can never fill and
-				 * we'd spin here forever.
-				 */
-				opening = 0;
-				break;
-			}
-
-			off = ra << SCOUTFS_BLOCK_LG_SHIFT;
-			posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_WILLNEED);
-			ra++;
-			if (opening)
-				opening -= min(opening, SCOUTFS_BLOCK_LG_SIZE);
-		} while (opening > 0);
-
-		off = bit << SCOUTFS_BLOCK_LG_SHIFT;
-		ret = rw_full_count(rw_pread, NULL, fd, buf, SCOUTFS_BLOCK_LG_SIZE, off);
-		if (ret < 0)
-			goto out;
-
-		/*
-		 * Might as well try to drop the pages we've used to
-		 * reduce memory pressure on our read-ahead pages that
-		 * are waiting.
-		 */
-		posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_DONTNEED);
-
-		/* the magic is little endian in the stream, like the other header fields */
-		bh.magic = cpu_to_le64(SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC);
-		bh.offset = cpu_to_le64(off);
-		bh.size = cpu_to_le32(SCOUTFS_BLOCK_LG_SIZE);
-		bh.crc = calc_crc(&bh, buf, SCOUTFS_BLOCK_LG_SIZE);
-
-		ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &bh, sizeof(bh), 0) ?:
-		      rw_full_count(rw_write, NULL, STDOUT_FILENO, buf, SCOUTFS_BLOCK_LG_SIZE, 0);
-		if (ret < 0)
-			goto out;
-	}
-
-out:
-	free(buf);
-
-	return ret;
-}
-
-/*
- * Check the magic and version of an image header.  Returns 0 if the
- * header is acceptable, otherwise prints the reason and returns -EIO.
- */
-static int invalid_header(struct scoutfs_meta_image_header *hdr)
-{
-	if (le64_to_cpu(hdr->magic) != SCOUTFS_META_IMAGE_HEADER_MAGIC) {
-		errf("bad image header magic 0x%016llx (!= expected %016llx)\n",
-		       le64_to_cpu(hdr->magic), SCOUTFS_META_IMAGE_HEADER_MAGIC);
-		return -EIO;
-	}
-
-	if (le32_to_cpu(hdr->version) != 1) {
-		errf("unknown image header version %u\n", le32_to_cpu(hdr->version));
-		return -EIO;
-	}
-
-	return 0;
-}
-
-/*
- * Check a block header read from the stream.  Returns 0 if it's
- * acceptable, otherwise prints the reason and returns -EIO.
- *
- * Doesn't catch offset+size overflowing, presumes pwrite() will return
- * an error.
- */
-static int invalid_block_header(struct scoutfs_meta_image_block_header *bh)
-{
-	if (le64_to_cpu(bh->magic) != SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC) {
-		errf("bad block header magic 0x%016llx (!= expected %016llx)\n",
-		       le64_to_cpu(bh->magic), SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC);
-		return -EIO;
-	}
-
-	if (le32_to_cpu(bh->size) == 0) {
-		errf("invalid block header size %u\n", le32_to_cpu(bh->size));
-		return -EIO;
-	}
-
-	if (le32_to_cpu(bh->size) > SIZE_MAX) {
-		errf("block header size %u too large for size_t (> %zu)\n",
-		       le32_to_cpu(bh->size), (size_t)SIZE_MAX);
-		return -EIO;
-	}
-
-	if (le64_to_cpu(bh->offset) > OFF_MAX) {
-		errf("block header offset %llu too large for off_t (> %llu)\n",
-		       le64_to_cpu(bh->offset), (u64)OFF_MAX);
-		return -EIO;
-	}
-
-	return 0;
-}
-
-/*
- * Read an image stream from stdin and write each of its blocks to the
- * device at the offset recorded in its block header.  With show_header
- * set only the image header is read and printed.
- *
- * The block buffer grows to fit the largest block seen and is freed on
- * every return path.  After each batch of writes the written byte range
- * is handed to posix_fadvise(DONTNEED) to limit page cache use.
- *
- * Returns 0 on success or a negative errno.
- */
-static int write_image(struct image_args *args, int fd, struct block_bitmaps *bm)
-{
-	struct scoutfs_meta_image_block_header bh;
-	struct scoutfs_meta_image_header hdr;
-	size_t writeback_batch = (2 * 1024 * 1024);
-	size_t buf_size = 0;
-	size_t dirty;
-	size_t size;
-	off_t first;
-	off_t last;
-	off_t off;
-	__le32 calc;
-	void *buf = NULL;
-	void *grown;
-	u64 tot;
-	int ret;
-
-	tot = 0;
-
-	ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &hdr, sizeof(hdr), 0);
-	if (ret < 0)
-		goto out;
-
-	if (args->show_header) {
-		printf_header(&hdr);
-		ret = 0;
-		goto out;
-	}
-
-	ret = invalid_header(&hdr);
-	if (ret < 0)
-		goto out;
-
-	dirty = 0;
-	first = OFF_MAX;
-	last = 0;
-
-	while (tot < le64_to_cpu(hdr.total_bytes)) {
-
-		ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &bh, sizeof(bh), 0);
-		if (ret < 0)
-			goto out;
-
-		ret = invalid_block_header(&bh);
-		if (ret < 0)
-			goto out;
-
-		size = le32_to_cpu(bh.size);
-		if (buf_size < size) {
-			/* keep the old buffer if realloc fails so that it's still freed */
-			grown = realloc(buf, size);
-			if (!grown) {
-				ret = -ENOMEM;
-				goto out;
-			}
-
-			buf = grown;
-			buf_size = size;
-		}
-
-		ret = rw_full_count(rw_read, &tot, STDIN_FILENO, buf, size, 0);
-		if (ret < 0)
-			goto out;
-
-		calc = calc_crc(&bh, buf, size);
-		if (calc != bh.crc) {
-			errf("crc err");
-			ret = -EIO;
-			goto out;
-		}
-
-		off = le64_to_cpu(bh.offset);
-
-		ret = rw_full_count(rw_pwrite, NULL, fd, buf, size, off);
-		if (ret < 0)
-			goto out;
-
-		/* track [first, last) as the byte range written in this batch */
-		dirty += size;
-		first = min(first, off);
-		last = max(last, off + (off_t)size);
-		if (dirty >= writeback_batch) {
-			/* posix_fadvise() takes an offset and a length, not an end */
-			posix_fadvise(fd, first, last - first, POSIX_FADV_DONTNEED);
-			dirty = 0;
-			first = OFF_MAX;
-			last = 0;
-		}
-	}
-
-	ret = fsync(fd);
-	if (ret < 0) {
-		ret = -errno;
-		goto out;
-	}
-
-out:
-	free(buf);
-
-	return ret;
-}
-
-/*
- * Open the metadata device and either stream its referenced blocks out
- * as an image (read) or write an image stream into it (write).
- * Returns 0 on success or a negative errno.
- *
- * block_shutdown() is only called once block_setup() has succeeded:
- * block_setup() releases its own allocations when it fails, so shutting
- * down after a failed setup would free them a second time.
- */
-static int do_image(struct image_args *args)
-{
-	struct block_bitmaps bm = { .bits = NULL };
-	int meta_fd = -1;
-	u64 dev_size;
-	int flags;
-	int ret;
-
-	flags = args->is_read ? O_RDONLY : O_RDWR;
-
-	meta_fd = open(args->meta_device, flags);
-	if (meta_fd < 0) {
-		ret = -errno;
-		errf("failed to open meta device '%s': %s (%d)\n",
-		     args->meta_device, strerror(errno), errno);
-		goto out;
-	}
-
-	if (args->is_read) {
-		ret = flush_device(meta_fd);
-		if (ret < 0)
-			goto out;
-
-		ret = get_device_size(args->meta_device, meta_fd, &dev_size);
-		if (ret < 0)
-			goto out;
-
-		/* one bit for each large block of the device */
-		bm.size = DIV_ROUND_UP(dev_size, SCOUTFS_BLOCK_LG_SIZE);
-		bm.bits = calloc(1, round_up(bm.size, BITS_PER_LONG) / 8);
-		if (!bm.bits) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024);
-		if (ret < 0)
-			goto out;
-
-		ret = check_supers() ?:
-		      get_ref_bits(&bm) ?:
-		      read_image(args, meta_fd, &bm);
-		block_shutdown();
-	} else {
-		ret = write_image(args, meta_fd, &bm);
-	}
-out:
-	free(bm.bits);
-
-	if (meta_fd >= 0)
-		close(meta_fd);
-
-	return ret;
-}
-
-/*
- * argp parser shared by the two image commands.  Exactly one
- * positional argument, the metadata device, is accepted.  Always
- * returns 0, problems are reported via argp_error().
- */
-static int parse_opt(int key, char *arg, struct argp_state *state)
-{
-	struct image_args *args = state->input;
-	int ret;
-
-	switch (key) {
-	case 'h':
-		args->show_header = true;
-		break;
-	case 'r':
-		ret = parse_u64(arg, &args->ra_window);
-		if (ret)
-			argp_error(state, "readahead window parse error");
-		break;
-	case ARGP_KEY_ARG:
-		if (!args->meta_device)
-			args->meta_device = strdup_or_error(state, arg);
-		else
-			argp_error(state, "more than one device argument given");
-		break;
-	case ARGP_KEY_FINI:
-		if (!args->meta_device)
-			argp_error(state, "no metadata device argument given");
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-/*
- * Options shared by both image commands.  The readahead window is
- * consumed in bytes by read_image(), a large block at a time.
- */
-static struct argp_option options[] = {
-	{ "show-header", 'h', NULL, 0, "Print image header and exit without processing stream" },
-	{ "readahead", 'r', "NR", 0, "Maintain read-ahead window of NR bytes" },
-	{ NULL }
-};
-
-static struct argp read_image_argp = {
-	options,
-	parse_opt,
-	"META-DEVICE",
-	"Read metadata image stream from metadata device file"
-};
-
-#define DEFAULT_RA_WINDOW (512 * 1024)
-
-/* Entry point of read-metadata-image: parse the arguments and stream the image out. */
-static int read_image_cmd(int argc, char **argv)
-{
-	struct image_args image_args = {
-		.is_read = true,
-		.ra_window = DEFAULT_RA_WINDOW,
-	};
-	int err = argp_parse(&read_image_argp, argc, argv, 0, NULL, &image_args);
-
-	return err ? err : do_image(&image_args);
-}
-
-static struct argp write_image_argp = {
-	options,
-	parse_opt,
-	"META-DEVICE",
-	"Write metadata image stream to metadata device file"
-};
-
-/* Entry point of write-metadata-image: parse the arguments and write the image in. */
-static int write_image_cmd(int argc, char **argv)
-{
-	struct image_args image_args = {
-		.is_read = false,
-		.ra_window = DEFAULT_RA_WINDOW,
-	};
-	int err = argp_parse(&write_image_argp, argc, argv, 0, NULL, &image_args);
-
-	return err ? err : do_image(&image_args);
-}
-
-/* Runs before main() via the constructor attribute to register both image commands. */
-static void __attribute__((constructor)) image_ctor(void)
-{
-	cmd_register_argp("read-metadata-image", &read_image_argp, GROUP_CORE, read_image_cmd);
-	cmd_register_argp("write-metadata-image", &write_image_argp, GROUP_CORE, write_image_cmd);
-}
@@ -1,15 +0,0 @@
-#ifndef _SCOUTFS_UTILS_CHECK_ITER_H_
-#define _SCOUTFS_UTILS_CHECK_ITER_H_
-
-/*
- * Callbacks can return a weird -errno that we'll never use to indicate
- * that iteration can stop and return 0 for success.
- */
-#define ECHECK_ITER_DONE EL2HLT
-
-/* Turn the "iteration is done" pseudo-error into success, passing everything else through. */
-static inline int xlate_iter_errno(int ret)
-{
-	if (ret == -ECHECK_ITER_DONE)
-		return 0;
-
-	return ret;
-}
-
-#endif
@@ -1,98 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-
-#include "sparse.h"
-#include "util.h"
-#include "format.h"
-#include "key.h"
-
-#include "alloc.h"
-#include "btree.h"
-#include "debug.h"
-#include "extent.h"
-#include "iter.h"
-#include "sns.h"
-#include "log_trees.h"
-#include "super.h"
-
-struct iter_args {
-	extent_cb_t cb;
-	void *cb_arg;
-};
-
-/*
- * btree item callback for the items in the logs_root btree.  Each
- * value is a log_trees struct; call the caller's extent callback for
- * the metadata blocks referenced by each of the structures within it.
- *
- * A value that's too short to hold a log_trees struct is refused with
- * -EINVAL instead of being read past its end.
- */
-static int lt_meta_iter(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
-{
-	struct iter_args *ia = cb_arg;
-	struct scoutfs_log_trees *lt;
-	int ret;
-
-	/* XXX should be reported as a problem once there's a code for it */
-	if (val_len < sizeof(struct scoutfs_log_trees))
-		return -EINVAL;
-
-	lt = val;
-
-	sns_push("log_trees", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
-
-	debug("lt rid 0x%16llx nr %llu", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
-
-	sns_push("meta_avail", 0, 0);
-	ret = alloc_list_meta_iter(&lt->meta_avail, ia->cb, ia->cb_arg);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("meta_freed", 0, 0);
-	ret = alloc_list_meta_iter(&lt->meta_freed, ia->cb, ia->cb_arg);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("item_root", 0, 0);
-	ret = btree_meta_iter(&lt->item_root, ia->cb, ia->cb_arg);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	/* the bloom block is a single direct block reference */
-	if (lt->bloom_ref.blkno) {
-		sns_push("bloom_ref", 0, 0);
-		ret = ia->cb(le64_to_cpu(lt->bloom_ref.blkno), 1, ia->cb_arg);
-		sns_pop();
-		if (ret < 0) {
-			ret = xlate_iter_errno(ret);
-			goto out;
-		}
-	}
-
-	sns_push("data_avail", 0, 0);
-	ret = alloc_root_meta_iter(&lt->data_avail, ia->cb, ia->cb_arg);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("data_freed", 0, 0);
-	ret = alloc_root_meta_iter(&lt->data_freed, ia->cb, ia->cb_arg);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	ret = 0;
-out:
-	/* pops the "log_trees" entry pushed at the top */
-	sns_pop();
-
-	return ret;
-}
-
-/*
- * Call the caller's callback with the extent of every metadata block
- * reference contained in the log btrees.  We walk the items in the
- * logs_root btree and then all the metadata structures that each item
- * references.
- */
-int log_trees_meta_iter(extent_cb_t cb, void *cb_arg)
-{
-	struct iter_args ia = { .cb = cb, .cb_arg = cb_arg };
-
-	return btree_item_iter(&global_super->logs_root, lt_meta_iter, &ia);
-}
@@ -1,8 +0,0 @@
-#ifndef _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
-#define _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
-
-#include "extent.h"
-
-int log_trees_meta_iter(extent_cb_t cb, void *cb_arg);
-
-#endif
@@ -1,367 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <sys/mman.h>
-#include <errno.h>
-
-#include "sparse.h"
-#include "util.h"
-#include "format.h"
-#include "bitmap.h"
-#include "key.h"
-
-#include "alloc.h"
-#include "btree.h"
-#include "debug.h"
-#include "extent.h"
-#include "sns.h"
-#include "log_trees.h"
-#include "meta.h"
-#include "problem.h"
-#include "super.h"
-
-static struct meta_data {
-	struct extent_root meta_refed;
-	struct extent_root meta_free;
-	struct {
-		u64 ref_blocks;
-		u64 free_extents;
-		u64 free_blocks;
-	} stats;
-} global_mdat;
-
-bool valid_meta_blkno(u64 blkno)
-{
-	u64 tot = le64_to_cpu(global_super->total_meta_blocks);
-
-	return blkno >= SCOUTFS_META_DEV_START_BLKNO && blkno < tot;
-}
-
-static bool valid_meta_extent(u64 start, u64 len)
-{
-	u64 tot = le64_to_cpu(global_super->total_meta_blocks);
-	bool valid;
-
-	valid = len > 0 &&
-		start >= SCOUTFS_META_DEV_START_BLKNO &&
-		start < tot &&
-		len <= tot &&
-		((start + len) <= tot) &&
-		((start + len) > start);
-
-	debug("start %llu len %llu valid %u", start, len, !!valid);
-
-	if (!valid)
-		problem(PB_META_EXTENT_INVALID, "start %llu len %llu", start, len);
-
-	return valid;
-}
-
-/*
- * Track references to individual metadata blocks.  This uses the extent
- * callback type but is only ever called for single block references.
- * Any reference to a block that has already been referenced is
- * considered invalid and is ignored.  Later repair will resolve
- * duplicate references.
- */
-static int insert_meta_ref(u64 start, u64 len, void *arg)
-{
-	struct meta_data *mdat = &global_mdat;
-	struct extent_root *root = arg;
-	int ret = 0;
-
-	/* this is tracking single metadata block references */
-	if (len != 1) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (valid_meta_blkno(start)) {
-		ret = extent_insert_new(root, start, len);
-		if (ret == 0)
-			mdat->stats.ref_blocks++;
-		else if (ret == -EEXIST)
-			problem(PB_META_REF_OVERLAPS_EXISTING, "blkno %llu", start);
-	}
-
-out:
-	return ret;
-}
-
-static int insert_meta_free(u64 start, u64 len, void *arg)
-{
-	struct meta_data *mdat = &global_mdat;
-	struct extent_root *root = arg;
-	int ret = 0;
-
-	if (valid_meta_extent(start, len)) {
-		ret = extent_insert_new(root, start, len);
-		if (ret == 0) {
-			mdat->stats.free_extents++;
-			mdat->stats.free_blocks++;
-
-		} else if (ret == -EEXIST) {
-			problem(PB_META_FREE_OVERLAPS_EXISTING,
-				"start %llu llen %llu", start, len);
-		}
-
-	}
-
-	return ret;
-}
-
-/*
- * Walk all metadata references in the system.  This walk doesn't need
- * to read metadata that doesn't contain any metadata references so it
- * can skip the bulk of metadata blocks.  This gives us the set of
- * referenced metadata blocks which we can then use to repair metadata
- * allocator structures.
- */
-static int get_meta_refs(void)
-{
-	struct meta_data *mdat = &global_mdat;
-	struct scoutfs_super_block *super = global_super;
-	int ret;
-
-	extent_root_init(&mdat->meta_refed);
-
-	/* XXX record reserved blocks around super as referenced */
-
-	sns_push("meta_alloc", 0, 0);
-	ret = alloc_root_meta_iter(&super->meta_alloc[0], insert_meta_ref, &mdat->meta_refed);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("meta_alloc", 1, 0);
-	ret = alloc_root_meta_iter(&super->meta_alloc[1], insert_meta_ref, &mdat->meta_refed);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("data_alloc", 1, 0);
-	ret = alloc_root_meta_iter(&super->data_alloc, insert_meta_ref, &mdat->meta_refed);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("server_meta_avail", 0, 0);
-	ret = alloc_list_meta_iter(&super->server_meta_avail[0],
-				   insert_meta_ref, &mdat->meta_refed);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("server_meta_avail", 1, 0);
-	ret = alloc_list_meta_iter(&super->server_meta_avail[1],
-				   insert_meta_ref, &mdat->meta_refed);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("server_meta_freed", 0, 0);
-	ret = alloc_list_meta_iter(&super->server_meta_freed[0],
-				   insert_meta_ref, &mdat->meta_refed);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("server_meta_freed", 1, 0);
-	ret = alloc_list_meta_iter(&super->server_meta_freed[1],
-				   insert_meta_ref, &mdat->meta_refed);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("fs_root", 0, 0);
-	ret = btree_meta_iter(&super->fs_root, insert_meta_ref, &mdat->meta_refed);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("logs_root", 0, 0);
-	ret = btree_meta_iter(&super->logs_root, insert_meta_ref, &mdat->meta_refed);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("log_merge", 0, 0);
-	ret = btree_meta_iter(&super->log_merge, insert_meta_ref, &mdat->meta_refed);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("mounted_clients", 0, 0);
-	ret = btree_meta_iter(&super->mounted_clients, insert_meta_ref, &mdat->meta_refed);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("srch_root", 0, 0);
-	ret = btree_meta_iter(&super->srch_root, insert_meta_ref, &mdat->meta_refed);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	ret = log_trees_meta_iter(insert_meta_ref, &mdat->meta_refed);
-	if (ret < 0)
-		goto out;
-
-	printf("found %llu referenced metadata blocks\n", mdat->stats.ref_blocks);
-	ret = 0;
-out:
-	return ret;
-}
-
-static int get_meta_free(void)
-{
-	struct meta_data *mdat = &global_mdat;
-	struct scoutfs_super_block *super = global_super;
-	int ret;
-
-	extent_root_init(&mdat->meta_free);
-
-	sns_push("meta_alloc", 0, 0);
-	ret = alloc_root_extent_iter(&super->meta_alloc[0], insert_meta_free, &mdat->meta_free);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("meta_alloc", 1, 0);
-	ret = alloc_root_extent_iter(&super->meta_alloc[1], insert_meta_free, &mdat->meta_free);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("server_meta_avail", 0, 0);
-	ret = alloc_list_extent_iter(&super->server_meta_avail[0],
-				     insert_meta_free, &mdat->meta_free);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("server_meta_avail", 1, 0);
-	ret = alloc_list_extent_iter(&super->server_meta_avail[1],
-				     insert_meta_free, &mdat->meta_free);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("server_meta_freed", 0, 0);
-	ret = alloc_list_extent_iter(&super->server_meta_freed[0],
-				     insert_meta_free, &mdat->meta_free);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	sns_push("server_meta_freed", 1, 0);
-	ret = alloc_list_extent_iter(&super->server_meta_freed[1],
-				     insert_meta_free, &mdat->meta_free);
-	sns_pop();
-	if (ret < 0)
-		goto out;
-
-	printf("found %llu free metadata blocks in %llu extents\n",
-	       mdat->stats.free_blocks, mdat->stats.free_extents);
-	ret = 0;
-out:
-	return ret;
-}
-
-/*
- * All the space between referenced blocks must be recorded in the free
- * extents.  The free extent walk didn't check that the extents
- * overlapped with references, we do that here.  Remember that metadata
- * block references were merged into extents here, the refed extents
- * aren't necessarily all a single block.
- */
-static int compare_refs_and_free(void)
-{
-	struct meta_data *mdat = &global_mdat;
-	struct extent_node *ref;
-	struct extent_node *free;
-	struct extent_node *next;
-	struct extent_node *prev;
-	u64 expect;
-	u64 start;
-	u64 end;
-
-	expect = 0;
-	ref = extent_first(&mdat->meta_refed);
-	free = extent_first(&mdat->meta_free);
-	while (ref || free) {
-
-		printf("exp %llu ref %llu.%llu free %llu.%llu\n",
-			expect, ref ? ref->start : 0, ref ? ref->len : 0,
-			free ? free->start : 0, free ? free->len : 0);
-
-		/* referenced marked free, remove ref from free and continue from same point */
-		if (ref && free && extents_overlap(ref->start, ref->len, free->start, free->len)) {
-			printf("ref extent %llu.%llu overlaps free %llu %llu\n",
-				ref->start, ref->len, free->start, free->len);
-
-			start = max(ref->start, free->start);
-			end = min(ref->start + ref->len, free->start + free->len);
-
-			prev = extent_prev(free);
-
-			extent_remove(&mdat->meta_free, start, end - start);
-
-			if (prev)
-				free = extent_next(prev);
-			else
-				free = extent_first(&mdat->meta_free);
-			continue;
-		}
-
-		/* see which extent starts earlier */
-		if (!free || (ref && ref->start <= free->start))
-			next = ref;
-		else
-			next = free;
-
-		/* untracked region before next extent */
-		if (expect < next->start) {
-			printf("missing free extent %llu.%llu\n", expect, next->start - expect);
-			expect = next->start;
-			continue;
-		}
-
-
-		/* didn't overlap, advance past next extent */
-		expect = next->start + next->len;
-		if (next == ref)
-			ref = extent_next(ref);
-		else
-			free = extent_next(free);
-	}
-
-	return 0;
-}
-
-/*
- * Check the metadata allocators by comparing the set of referenced
- * blocks with the set of free blocks that are stored in free btree
- * items and alloc list blocks.
- */
-int check_meta_alloc(void)
-{
-	int ret;
-
-	ret = get_meta_refs();
-	if (ret < 0)
-		goto out;
-
-	ret = get_meta_free();
-	if (ret < 0)
-		goto out;
-
-	ret = compare_refs_and_free();
-	if (ret < 0)
-		goto out;
-
-	ret = 0;
-out:
-	return ret;
-}
@@ -1,9 +0,0 @@
-#ifndef _SCOUTFS_UTILS_CHECK_META_H_
-#define _SCOUTFS_UTILS_CHECK_META_H_
-
-bool valid_meta_blkno(u64 blkno);
-
-int check_meta_alloc(void);
-
-#endif
-
@@ -1,23 +0,0 @@
-#include <string.h>
-#include <stdbool.h>
-
-#include "util.h"
-#include "padding.h"
-
-bool padding_is_zeros(const void *data, size_t sz)
-{
-	static char zeros[32] = {0,};
-	const size_t batch = array_size(zeros);
-
-	while (sz >= batch) {
-		if (memcmp(data, zeros, batch))
-			return false;
-		data += batch;
-		sz -= batch;
-	}
-
-	if (sz > 0 && memcmp(data, zeros, sz))
-		return false;
-
-	return true;
-}
@@ -1,6 +0,0 @@
-#ifndef _SCOUTFS_UTILS_CHECK_PADDING_H_
-#define _SCOUTFS_UTILS_CHECK_PADDING_H_
-
-bool padding_is_zeros(const void *data, size_t sz);
-
-#endif
@@ -1,23 +0,0 @@
-#include <stdio.h>
-#include <stdint.h>
-
-#include "problem.h"
-
-#if 0
-#define PROB_STR(pb) [pb] = #pb
-static char *prob_strs[] = {
-	PROB_STR(PB_META_EXTENT_INVALID),
-	PROB_STR(PB_META_EXTENT_OVERLAPS_EXISTING),
-};
-#endif
-
-static struct problem_data {
-	uint64_t counts[PB__NR];
-} global_pdat;
-
-void problem_record(prob_t pb)
-{
-	struct problem_data *pdat = &global_pdat;
-
-	pdat->counts[pb]++;
-}
@@ -1,23 +0,0 @@
-#ifndef _SCOUTFS_UTILS_CHECK_PROBLEM_H_
-#define _SCOUTFS_UTILS_CHECK_PROBLEM_H_
-
-#include "debug.h"
-#include "sns.h"
-
-typedef enum {
-	PB_META_EXTENT_INVALID,
-	PB_META_REF_OVERLAPS_EXISTING,
-	PB_META_FREE_OVERLAPS_EXISTING,
-	PB_BTREE_BLOCK_BAD_LEVEL,
-	PB__NR,
-} prob_t;
-
-#define problem(pb, fmt, ...)							\
-do {										\
-	debug("problem found: "#pb": %s: "fmt, sns_str(), __VA_ARGS__);	\
-	problem_record(pb);							\
-} while (0)
-
-void problem_record(prob_t pb);
-
-#endif
@@ -1,118 +0,0 @@
-#include <stdlib.h>
-#include <string.h>
-
-#include "sns.h"
-
-/*
- * This "str num stack" is used to describe our location in metadata at
- * any given time.
- *
- * As we descend into structures we pop a string on decribing them,
- * perhaps with associated numbers.  Pushing and popping is very cheap
- * and only rarely do we format the stack into a string, as an arbitrary
- * example:
- *   super.fs_root.btree_parent:1231.btree_leaf:3231"
- */
-
-#define SNS_MAX_DEPTH	1000
-#define SNS_STR_SIZE	(SNS_MAX_DEPTH * (SNS_MAX_STR_LEN + 1 + 16 + 1))
-
-static struct sns_data {
-	unsigned int depth;
-
-	struct sns_entry {
-		char *str;
-		size_t len;
-		u64 a;
-		u64 b;
-	} ents[SNS_MAX_DEPTH];
-
-	char str[SNS_STR_SIZE];
-
-} global_lsdat;
-
-void _sns_push(char *str, size_t len, u64 a, u64 b)
-{
-	struct sns_data *lsdat = &global_lsdat;
-
-	if (lsdat->depth < SNS_MAX_DEPTH) {
-		lsdat->ents[lsdat->depth++] = (struct sns_entry) {
-			.str = str,
-			.len = len,
-			.a = a,
-			.b = b,
-		};
-	}
-}
-
-void sns_pop(void)
-{
-	struct sns_data *lsdat = &global_lsdat;
-
-	if (lsdat->depth > 0)
-		lsdat->depth--;
-}
-
-static char *append_str(char *pos, char *str, size_t len)
-{
-	memcpy(pos, str, len);
-	return pos + len;
-}
-
-/*
- * This is not called for x = 0 so we don't need to emit an initial 0.
- * We could by using do {} while instead of while {}.
- */
-static char *append_u64x(char *pos, u64 x)
-{
-	static char hex[] = "0123456789abcdef";
-
-	while (x) {
-		*pos++ = hex[x & 0xf];
-		x >>= 4;
-	}
-
-	return pos;
-}
-
-static char *append_char(char *pos, char c)
-{
-	*(pos++) = c;
-	return pos;
-}
-
-/*
- * Return a pointer to a null terminated string that describes the
- * current location stack.  The string buffer is global.
- */
-char *sns_str(void)
-{
-	struct sns_data *lsdat = &global_lsdat;
-	struct sns_entry *ent;
-	char *pos;
-	int i;
-
-	pos = lsdat->str;
-	for (i = 0; i < lsdat->depth; i++) {
-		ent = &lsdat->ents[i];
-
-		if (i)
-			pos = append_char(pos, '.');
-
-		pos = append_str(pos, ent->str, ent->len);
-
-		if (ent->a) {
-			pos = append_char(pos, ':');
-			pos = append_u64x(pos, ent->a);
-		}
-
-		if (ent->b) {
-			pos = append_char(pos, ':');
-			pos = append_u64x(pos, ent->b);
-		}
-	}
-
-	*pos = '\0';
-
-	return lsdat->str;
-}
@@ -1,20 +0,0 @@
-#ifndef _SCOUTFS_UTILS_CHECK_SNS_H_
-#define _SCOUTFS_UTILS_CHECK_SNS_H_
-
-#include <assert.h>
-
-#include "sparse.h"
-
-#define SNS_MAX_STR_LEN 20
-
-#define sns_push(str, a, b)					\
-do {								\
-	build_assert(sizeof(str) - 1 <= SNS_MAX_STR_LEN);	\
-	_sns_push((str), sizeof(str) - 1, a, b);		\
-} while (0)
-
-void _sns_push(char *str, size_t len, u64 a, u64 b);
-void sns_pop(void);
-char *sns_str(void);
-
-#endif
@@ -1,57 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-
-#include "sparse.h"
-#include "util.h"
-#include "format.h"
-
-#include "block.h"
-#include "super.h"
-
-/*
- * After we check the super blocks we provide a global buffer to track
- * the current super block.  It is referenced to get static information
- * about the system and is also modified and written as part of
- * transactions.
- */
-struct scoutfs_super_block *global_super;
-
-/*
- * After checking the supers we save a copy of it in a global buffer that's used by
- * other modules to track the current super.  It can be modified and written during commits.
- */
-int check_supers(void)
-{
-	struct scoutfs_super_block *super = NULL;
-	struct block *blk = NULL;
-	int ret;
-
-	global_super = malloc(sizeof(struct scoutfs_super_block));
-	if (!global_super) {
-		printf("error allocating super block buffer\n");
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM);
-	if (ret < 0) {
-		printf("error reading super block\n");
-		goto out;
-	}
-
-	super = block_buf(blk);
-
-	memcpy(global_super, super, sizeof(struct scoutfs_super_block));
-	ret = 0;
-out:
-	block_put(&blk);
-
-	return ret;
-}
-
-void super_shutdown(void)
-{
-	free(global_super);
-}
@@ -1,9 +0,0 @@
-#ifndef _SCOUTFS_UTILS_CHECK_SUPER_H_
-#define _SCOUTFS_UTILS_CHECK_SUPER_H_
-
-extern struct scoutfs_super_block *global_super;
-
-int check_supers(void);
-void super_shutdown(void);
-
-#endif
@@ -156,16 +156,6 @@ static inline void list_move_tail(struct list_head *list,
        list_add_tail(list, head);
 }

-/**
- * list_is_head - tests whether @list is the list @head
- * @list: the entry to test
- * @head: the head of the list
- */
-static inline int list_is_head(const struct list_head *list, const struct list_head *head)
-{
-	return list == head;
-}
-
 /**
 * list_empty - tests whether a list is empty
 * @head: the list to test.
@@ -252,15 +242,6 @@ static inline void list_splice_init(struct list_head *list,
        for (pos = (head)->next, n = pos->next; pos != (head); \
                pos = n, n = pos->next)

-/**
- * list_entry_is_head - test if the entry points to the head of the list
- * @pos:	the type * to cursor
- * @head:	the head for your list.
- * @member:	the name of the list_head within the struct.
- */
-#define list_entry_is_head(pos, head, member)				\
-	(&pos->member == (head))
-
 /**
 * list_for_each_entry        -        iterate over list of given type
 * @pos:        the type * to use as a loop counter.
@@ -326,28 +307,4 @@ static inline void list_splice_init(struct list_head *list,
 #define list_next_entry(pos, member) \
        list_entry((pos)->member.next, typeof(*(pos)), member)

-/**
- * list_prev_entry - get the prev element in list
- * @pos:	the type * to cursor
- * @member:	the name of the list_head within the struct.
- */
-#define list_prev_entry(pos, member) \
-	list_entry((pos)->member.prev, typeof(*(pos)), member)
-
-/**
- * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
- * @pos:	the type * to use as a loop cursor.
- * @n:		another type * to use as temporary storage
- * @head:	the head for your list.
- * @member:	the name of the list_head within the struct.
- *
- * Iterate backwards over list of given type, safe against removal
- * of list entry.
- */
-#define list_for_each_entry_safe_reverse(pos, n, head, member)		\
-	for (pos = list_last_entry(head, typeof(*pos), member),		\
-		n = list_prev_entry(pos, member);			\
-	     !list_entry_is_head(pos, head, member); 			\
-	     pos = n, n = list_prev_entry(n, member))
-
 #endif
@@ -1,24 +0,0 @@
-#ifndef _LK_RBTREE_WRAPPER_H_
-#define _LK_RBTREE_WRAPPER_H_
-
-/*
- * We're using this lame hack to build and use the kernel's rbtree in
- * userspace.  We drop the kernel's rbtree*[ch] implementation in and
- * use them with this wrapper.  We only have to remove the kernel
- * includes from the imported files.
- */
-
-#include <stdbool.h>
-#include "util.h"
-
-#define rcu_assign_pointer(a, b)	do { a = b; } while (0)
-#define READ_ONCE(a)			({ a; })
-#define WRITE_ONCE(a, b)		do { a = b; } while (0)
-#define unlikely(a)			({ a; })
-#define EXPORT_SYMBOL(a)		/* nop */
-
-#include "rbtree_types.h"
-#include "rbtree.h"
-#include "rbtree_augmented.h"
-
-#endif
@@ -609,8 +609,6 @@ static int print_alloc_list_block(int fd, char *str, struct scoutfs_block_ref *r
 	u64 blkno;
 	u64 start;
 	u64 len;
-	u64 st;
-	u64 nr;
 	int wid;
 	int ret;
 	int i;
@@ -629,37 +627,27 @@ static int print_alloc_list_block(int fd, char *str, struct scoutfs_block_ref *r
 	       AL_REF_A(&lblk->next), le32_to_cpu(lblk->start),
 	       le32_to_cpu(lblk->nr));

-	st = le32_to_cpu(lblk->start);
-	nr = le32_to_cpu(lblk->nr);
-	if (st >= SCOUTFS_ALLOC_LIST_MAX_BLOCKS ||
-	    nr > SCOUTFS_ALLOC_LIST_MAX_BLOCKS ||
-	    (st + nr) > SCOUTFS_ALLOC_LIST_MAX_BLOCKS) {
-		printf("  (invalid start and nr fields)\n");
-		goto out;
-	}
+	if (lblk->nr) {
+		wid = printf("  exts: ");
+		start = 0;
+		len = 0;
+		for (i = 0; i < le32_to_cpu(lblk->nr); i++) {
+			if (len == 0)
+				start = le64_to_cpu(lblk->blknos[i]);
+			len++;

-	if (lblk->nr == 0)
-		goto out;
+			if (i == (le32_to_cpu(lblk->nr) - 1) ||
+			    start + len != le64_to_cpu(lblk->blknos[i + 1])) {
+				if (wid >= 72)
+					wid = printf("\n        ");

-	wid = printf("  exts: ");
-	start = 0;
-	len = 0;
-	for (i = 0; i < nr; i++) {
-		if (len == 0)
-			start = le64_to_cpu(lblk->blknos[st + i]);
-		len++;
-
-		if (i == (nr - 1) || (start + len) != le64_to_cpu(lblk->blknos[st + i + 1])) {
-			if (wid >= 72)
-				wid = printf("\n        ");
-
-			wid += printf("%llu,%llu ", start, len);
-			len = 0;
+				wid += printf("%llu,%llu ", start, len);
+				len = 0;
+			}
 		}
+		printf("\n");
 	}
-	printf("\n");

-out:
 	next = lblk->next;
 	free(lblk);
 	return print_alloc_list_block(fd, str, &next);
@@ -1,629 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
-  Red Black Trees
-  (C) 1999  Andrea Arcangeli <andrea@suse.de>
-  (C) 2002  David Woodhouse <dwmw2@infradead.org>
-  (C) 2012  Michel Lespinasse <walken@google.com>
-
-
-  linux/lib/rbtree.c
-*/
-
-#include "lk_rbtree_wrapper.h"
-
-/*
- * red-black trees properties:  https://en.wikipedia.org/wiki/Rbtree
- *
- *  1) A node is either red or black
- *  2) The root is black
- *  3) All leaves (NULL) are black
- *  4) Both children of every red node are black
- *  5) Every simple path from root to leaves contains the same number
- *     of black nodes.
- *
- *  4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
- *  consecutive red nodes in a path and every red node is therefore followed by
- *  a black. So if B is the number of black nodes on every simple path (as per
- *  5), then the longest possible path due to 4 is 2B.
- *
- *  We shall indicate color with case, where black nodes are uppercase and red
- *  nodes will be lowercase. Unknown color nodes shall be drawn as red within
- *  parentheses and have some accompanying text comment.
- */
-
-/*
- * Notes on lockless lookups:
- *
- * All stores to the tree structure (rb_left and rb_right) must be done using
- * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
- * tree structure as seen in program order.
- *
- * These two requirements will allow lockless iteration of the tree -- not
- * correct iteration mind you, tree rotations are not atomic so a lookup might
- * miss entire subtrees.
- *
- * But they do guarantee that any such traversal will only see valid elements
- * and that it will indeed complete -- does not get stuck in a loop.
- *
- * It also guarantees that if the lookup returns an element it is the 'correct'
- * one. But not returning an element does _NOT_ mean it's not present.
- *
- * NOTE:
- *
- * Stores to __rb_parent_color are not important for simple lookups so those
- * are left undone as of now. Nor did I check for loops involving parent
- * pointers.
- */
-
-static inline void rb_set_black(struct rb_node *rb)
-{
-	rb->__rb_parent_color |= RB_BLACK;
-}
-
-static inline struct rb_node *rb_red_parent(struct rb_node *red)
-{
-	return (struct rb_node *)red->__rb_parent_color;
-}
-
-/*
- * Helper function for rotations:
- * - old's parent and color get assigned to new
- * - old gets assigned new as a parent and 'color' as a color.
- */
-static inline void
-__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
-			struct rb_root *root, int color)
-{
-	struct rb_node *parent = rb_parent(old);
-	new->__rb_parent_color = old->__rb_parent_color;
-	rb_set_parent_color(old, new, color);
-	__rb_change_child(old, new, parent, root);
-}
-
-static __always_inline void
-__rb_insert(struct rb_node *node, struct rb_root *root,
-	    void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
-{
-	struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
-
-	while (true) {
-		/*
-		 * Loop invariant: node is red.
-		 */
-		if (unlikely(!parent)) {
-			/*
-			 * The inserted node is root. Either this is the
-			 * first node, or we recursed at Case 1 below and
-			 * are no longer violating 4).
-			 */
-			rb_set_parent_color(node, NULL, RB_BLACK);
-			break;
-		}
-
-		/*
-		 * If there is a black parent, we are done.
-		 * Otherwise, take some corrective action as,
-		 * per 4), we don't want a red root or two
-		 * consecutive red nodes.
-		 */
-		if(rb_is_black(parent))
-			break;
-
-		gparent = rb_red_parent(parent);
-
-		tmp = gparent->rb_right;
-		if (parent != tmp) {	/* parent == gparent->rb_left */
-			if (tmp && rb_is_red(tmp)) {
-				/*
-				 * Case 1 - node's uncle is red (color flips).
-				 *
-				 *       G            g
-				 *      / \          / \
-				 *     p   u  -->   P   U
-				 *    /            /
-				 *   n            n
-				 *
-				 * However, since g's parent might be red, and
-				 * 4) does not allow this, we need to recurse
-				 * at g.
-				 */
-				rb_set_parent_color(tmp, gparent, RB_BLACK);
-				rb_set_parent_color(parent, gparent, RB_BLACK);
-				node = gparent;
-				parent = rb_parent(node);
-				rb_set_parent_color(node, parent, RB_RED);
-				continue;
-			}
-
-			tmp = parent->rb_right;
-			if (node == tmp) {
-				/*
-				 * Case 2 - node's uncle is black and node is
-				 * the parent's right child (left rotate at parent).
-				 *
-				 *      G             G
-				 *     / \           / \
-				 *    p   U  -->    n   U
-				 *     \           /
-				 *      n         p
-				 *
-				 * This still leaves us in violation of 4), the
-				 * continuation into Case 3 will fix that.
-				 */
-				tmp = node->rb_left;
-				WRITE_ONCE(parent->rb_right, tmp);
-				WRITE_ONCE(node->rb_left, parent);
-				if (tmp)
-					rb_set_parent_color(tmp, parent,
-							    RB_BLACK);
-				rb_set_parent_color(parent, node, RB_RED);
-				augment_rotate(parent, node);
-				parent = node;
-				tmp = node->rb_right;
-			}
-
-			/*
-			 * Case 3 - node's uncle is black and node is
-			 * the parent's left child (right rotate at gparent).
-			 *
-			 *        G           P
-			 *       / \         / \
-			 *      p   U  -->  n   g
-			 *     /                 \
-			 *    n                   U
-			 */
-			WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
-			WRITE_ONCE(parent->rb_right, gparent);
-			if (tmp)
-				rb_set_parent_color(tmp, gparent, RB_BLACK);
-			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
-			augment_rotate(gparent, parent);
-			break;
-		} else {
-			tmp = gparent->rb_left;
-			if (tmp && rb_is_red(tmp)) {
-				/* Case 1 - color flips */
-				rb_set_parent_color(tmp, gparent, RB_BLACK);
-				rb_set_parent_color(parent, gparent, RB_BLACK);
-				node = gparent;
-				parent = rb_parent(node);
-				rb_set_parent_color(node, parent, RB_RED);
-				continue;
-			}
-
-			tmp = parent->rb_left;
-			if (node == tmp) {
-				/* Case 2 - right rotate at parent */
-				tmp = node->rb_right;
-				WRITE_ONCE(parent->rb_left, tmp);
-				WRITE_ONCE(node->rb_right, parent);
-				if (tmp)
-					rb_set_parent_color(tmp, parent,
-							    RB_BLACK);
-				rb_set_parent_color(parent, node, RB_RED);
-				augment_rotate(parent, node);
-				parent = node;
-				tmp = node->rb_left;
-			}
-
-			/* Case 3 - left rotate at gparent */
-			WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
-			WRITE_ONCE(parent->rb_left, gparent);
-			if (tmp)
-				rb_set_parent_color(tmp, gparent, RB_BLACK);
-			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
-			augment_rotate(gparent, parent);
-			break;
-		}
-	}
-}
-
-/*
- * Inline version for rb_erase() use - we want to be able to inline
- * and eliminate the dummy_rotate callback there
- */
-static __always_inline void
-____rb_erase_color(struct rb_node *parent, struct rb_root *root,
-	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
-{
-	struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;
-
-	while (true) {
-		/*
-		 * Loop invariants:
-		 * - node is black (or NULL on first iteration)
-		 * - node is not the root (parent is not NULL)
-		 * - All leaf paths going through parent and node have a
-		 *   black node count that is 1 lower than other leaf paths.
-		 */
-		sibling = parent->rb_right;
-		if (node != sibling) {	/* node == parent->rb_left */
-			if (rb_is_red(sibling)) {
-				/*
-				 * Case 1 - left rotate at parent
-				 *
-				 *     P               S
-				 *    / \             / \
-				 *   N   s    -->    p   Sr
-				 *      / \         / \
-				 *     Sl  Sr      N   Sl
-				 */
-				tmp1 = sibling->rb_left;
-				WRITE_ONCE(parent->rb_right, tmp1);
-				WRITE_ONCE(sibling->rb_left, parent);
-				rb_set_parent_color(tmp1, parent, RB_BLACK);
-				__rb_rotate_set_parents(parent, sibling, root,
-							RB_RED);
-				augment_rotate(parent, sibling);
-				sibling = tmp1;
-			}
-			tmp1 = sibling->rb_right;
-			if (!tmp1 || rb_is_black(tmp1)) {
-				tmp2 = sibling->rb_left;
-				if (!tmp2 || rb_is_black(tmp2)) {
-					/*
-					 * Case 2 - sibling color flip
-					 * (p could be either color here)
-					 *
-					 *    (p)           (p)
-					 *    / \           / \
-					 *   N   S    -->  N   s
-					 *      / \           / \
-					 *     Sl  Sr        Sl  Sr
-					 *
-					 * This leaves us violating 5) which
-					 * can be fixed by flipping p to black
-					 * if it was red, or by recursing at p.
-					 * p is red when coming from Case 1.
-					 */
-					rb_set_parent_color(sibling, parent,
-							    RB_RED);
-					if (rb_is_red(parent))
-						rb_set_black(parent);
-					else {
-						node = parent;
-						parent = rb_parent(node);
-						if (parent)
-							continue;
-					}
-					break;
-				}
-				/*
-				 * Case 3 - right rotate at sibling
-				 * (p could be either color here)
-				 *
-				 *   (p)           (p)
-				 *   / \           / \
-				 *  N   S    -->  N   sl
-				 *     / \             \
-				 *    sl  Sr            S
-				 *                       \
-				 *                        Sr
-				 *
-				 * Note: p might be red, and then both
-				 * p and sl are red after rotation(which
-				 * breaks property 4). This is fixed in
-				 * Case 4 (in __rb_rotate_set_parents()
-				 *         which set sl the color of p
-				 *         and set p RB_BLACK)
-				 *
-				 *   (p)            (sl)
-				 *   / \            /  \
-				 *  N   sl   -->   P    S
-				 *       \        /      \
-				 *        S      N        Sr
-				 *         \
-				 *          Sr
-				 */
-				tmp1 = tmp2->rb_right;
-				WRITE_ONCE(sibling->rb_left, tmp1);
-				WRITE_ONCE(tmp2->rb_right, sibling);
-				WRITE_ONCE(parent->rb_right, tmp2);
-				if (tmp1)
-					rb_set_parent_color(tmp1, sibling,
-							    RB_BLACK);
-				augment_rotate(sibling, tmp2);
-				tmp1 = sibling;
-				sibling = tmp2;
-			}
-			/*
-			 * Case 4 - left rotate at parent + color flips
-			 * (p and sl could be either color here.
-			 *  After rotation, p becomes black, s acquires
-			 *  p's color, and sl keeps its color)
-			 *
-			 *      (p)             (s)
-			 *      / \             / \
-			 *     N   S     -->   P   Sr
-			 *        / \         / \
-			 *      (sl) sr      N  (sl)
-			 */
-			tmp2 = sibling->rb_left;
-			WRITE_ONCE(parent->rb_right, tmp2);
-			WRITE_ONCE(sibling->rb_left, parent);
-			rb_set_parent_color(tmp1, sibling, RB_BLACK);
-			if (tmp2)
-				rb_set_parent(tmp2, parent);
-			__rb_rotate_set_parents(parent, sibling, root,
-						RB_BLACK);
-			augment_rotate(parent, sibling);
-			break;
-		} else {
-			sibling = parent->rb_left;
-			if (rb_is_red(sibling)) {
-				/* Case 1 - right rotate at parent */
-				tmp1 = sibling->rb_right;
-				WRITE_ONCE(parent->rb_left, tmp1);
-				WRITE_ONCE(sibling->rb_right, parent);
-				rb_set_parent_color(tmp1, parent, RB_BLACK);
-				__rb_rotate_set_parents(parent, sibling, root,
-							RB_RED);
-				augment_rotate(parent, sibling);
-				sibling = tmp1;
-			}
-			tmp1 = sibling->rb_left;
-			if (!tmp1 || rb_is_black(tmp1)) {
-				tmp2 = sibling->rb_right;
-				if (!tmp2 || rb_is_black(tmp2)) {
-					/* Case 2 - sibling color flip */
-					rb_set_parent_color(sibling, parent,
-							    RB_RED);
-					if (rb_is_red(parent))
-						rb_set_black(parent);
-					else {
-						node = parent;
-						parent = rb_parent(node);
-						if (parent)
-							continue;
-					}
-					break;
-				}
-				/* Case 3 - left rotate at sibling */
-				tmp1 = tmp2->rb_left;
-				WRITE_ONCE(sibling->rb_right, tmp1);
-				WRITE_ONCE(tmp2->rb_left, sibling);
-				WRITE_ONCE(parent->rb_left, tmp2);
-				if (tmp1)
-					rb_set_parent_color(tmp1, sibling,
-							    RB_BLACK);
-				augment_rotate(sibling, tmp2);
-				tmp1 = sibling;
-				sibling = tmp2;
-			}
-			/* Case 4 - right rotate at parent + color flips */
-			tmp2 = sibling->rb_right;
-			WRITE_ONCE(parent->rb_left, tmp2);
-			WRITE_ONCE(sibling->rb_right, parent);
-			rb_set_parent_color(tmp1, sibling, RB_BLACK);
-			if (tmp2)
-				rb_set_parent(tmp2, parent);
-			__rb_rotate_set_parents(parent, sibling, root,
-						RB_BLACK);
-			augment_rotate(parent, sibling);
-			break;
-		}
-	}
-}
-
-/* Non-inline version for rb_erase_augmented() use */
-void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
-	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
-{
-	____rb_erase_color(parent, root, augment_rotate);
-}
-EXPORT_SYMBOL(__rb_erase_color);
-
-/*
- * Non-augmented rbtree manipulation functions.
- *
- * We use dummy augmented callbacks here, and have the compiler optimize them
- * out of the rb_insert_color() and rb_erase() function definitions.
- */
-
-static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
-static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
-static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
-
-static const struct rb_augment_callbacks dummy_callbacks = {
-	.propagate = dummy_propagate,
-	.copy = dummy_copy,
-	.rotate = dummy_rotate
-};
-
-void rb_insert_color(struct rb_node *node, struct rb_root *root)
-{
-	__rb_insert(node, root, dummy_rotate);
-}
-EXPORT_SYMBOL(rb_insert_color);
-
-void rb_erase(struct rb_node *node, struct rb_root *root)
-{
-	struct rb_node *rebalance;
-	rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
-	if (rebalance)
-		____rb_erase_color(rebalance, root, dummy_rotate);
-}
-EXPORT_SYMBOL(rb_erase);
-
-/*
- * Augmented rbtree manipulation functions.
- *
- * This instantiates the same __always_inline functions as in the non-augmented
- * case, but this time with user-defined callbacks.
- */
-
-void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
-	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
-{
-	__rb_insert(node, root, augment_rotate);
-}
-EXPORT_SYMBOL(__rb_insert_augmented);
-
-/*
- * This function returns the first node (in sort order) of the tree.
- */
-struct rb_node *rb_first(const struct rb_root *root)
-{
-	struct rb_node	*n;
-
-	n = root->rb_node;
-	if (!n)
-		return NULL;
-	while (n->rb_left)
-		n = n->rb_left;
-	return n;
-}
-EXPORT_SYMBOL(rb_first);
-
-struct rb_node *rb_last(const struct rb_root *root)
-{
-	struct rb_node	*n;
-
-	n = root->rb_node;
-	if (!n)
-		return NULL;
-	while (n->rb_right)
-		n = n->rb_right;
-	return n;
-}
-EXPORT_SYMBOL(rb_last);
-
-struct rb_node *rb_next(const struct rb_node *node)
-{
-	struct rb_node *parent;
-
-	if (RB_EMPTY_NODE(node))
-		return NULL;
-
-	/*
-	 * If we have a right-hand child, go down and then left as far
-	 * as we can.
-	 */
-	if (node->rb_right) {
-		node = node->rb_right;
-		while (node->rb_left)
-			node = node->rb_left;
-		return (struct rb_node *)node;
-	}
-
-	/*
-	 * No right-hand children. Everything down and left is smaller than us,
-	 * so any 'next' node must be in the general direction of our parent.
-	 * Go up the tree; any time the ancestor is a right-hand child of its
-	 * parent, keep going up. First time it's a left-hand child of its
-	 * parent, said parent is our 'next' node.
-	 */
-	while ((parent = rb_parent(node)) && node == parent->rb_right)
-		node = parent;
-
-	return parent;
-}
-EXPORT_SYMBOL(rb_next);
-
-struct rb_node *rb_prev(const struct rb_node *node)
-{
-	struct rb_node *parent;
-
-	if (RB_EMPTY_NODE(node))
-		return NULL;
-
-	/*
-	 * If we have a left-hand child, go down and then right as far
-	 * as we can.
-	 */
-	if (node->rb_left) {
-		node = node->rb_left;
-		while (node->rb_right)
-			node = node->rb_right;
-		return (struct rb_node *)node;
-	}
-
-	/*
-	 * No left-hand children. Go up till we find an ancestor which
-	 * is a right-hand child of its parent.
-	 */
-	while ((parent = rb_parent(node)) && node == parent->rb_left)
-		node = parent;
-
-	return parent;
-}
-EXPORT_SYMBOL(rb_prev);
-
-void rb_replace_node(struct rb_node *victim, struct rb_node *new,
-		     struct rb_root *root)
-{
-	struct rb_node *parent = rb_parent(victim);
-
-	/* Copy the pointers/colour from the victim to the replacement */
-	*new = *victim;
-
-	/* Set the surrounding nodes to point to the replacement */
-	if (victim->rb_left)
-		rb_set_parent(victim->rb_left, new);
-	if (victim->rb_right)
-		rb_set_parent(victim->rb_right, new);
-	__rb_change_child(victim, new, parent, root);
-}
-EXPORT_SYMBOL(rb_replace_node);
-
-void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
-			 struct rb_root *root)
-{
-	struct rb_node *parent = rb_parent(victim);
-
-	/* Copy the pointers/colour from the victim to the replacement */
-	*new = *victim;
-
-	/* Set the surrounding nodes to point to the replacement */
-	if (victim->rb_left)
-		rb_set_parent(victim->rb_left, new);
-	if (victim->rb_right)
-		rb_set_parent(victim->rb_right, new);
-
-	/* Set the parent's pointer to the new node last after an RCU barrier
-	 * so that the pointers onwards are seen to be set correctly when doing
-	 * an RCU walk over the tree.
-	 */
-	__rb_change_child_rcu(victim, new, parent, root);
-}
-EXPORT_SYMBOL(rb_replace_node_rcu);
-
-static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
-{
-	for (;;) {
-		if (node->rb_left)
-			node = node->rb_left;
-		else if (node->rb_right)
-			node = node->rb_right;
-		else
-			return (struct rb_node *)node;
-	}
-}
-
-struct rb_node *rb_next_postorder(const struct rb_node *node)
-{
-	const struct rb_node *parent;
-	if (!node)
-		return NULL;
-	parent = rb_parent(node);
-
-	/* If we're sitting on node, we've already seen our children */
-	if (parent && node == parent->rb_left && parent->rb_right) {
-		/* If we are the parent's left node, go to the parent's right
-		 * node then all the way down to the left */
-		return rb_left_deepest_node(parent->rb_right);
-	} else
-		/* Otherwise we are the parent's right node, and the parent
-		 * should be next */
-		return (struct rb_node *)parent;
-}
-EXPORT_SYMBOL(rb_next_postorder);
-
-struct rb_node *rb_first_postorder(const struct rb_root *root)
-{
-	if (!root->rb_node)
-		return NULL;
-
-	return rb_left_deepest_node(root->rb_node);
-}
-EXPORT_SYMBOL(rb_first_postorder);
@@ -1,328 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
-  Red Black Trees
-  (C) 1999  Andrea Arcangeli <andrea@suse.de>
-  
-
-  linux/include/linux/rbtree.h
-
-  To use rbtrees you'll have to implement your own insert and search cores.
-  This will avoid us to use callbacks and to drop drammatically performances.
-  I know it's not the cleaner way,  but in C (not in C++) to get
-  performances and genericity...
-
-  See Documentation/core-api/rbtree.rst for documentation and samples.
-*/
-
-#ifndef	_LINUX_RBTREE_H
-#define	_LINUX_RBTREE_H
-
-#define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))
-
-#define	rb_entry(ptr, type, member) container_of(ptr, type, member)
-
-#define RB_EMPTY_ROOT(root)  (READ_ONCE((root)->rb_node) == NULL)
-
-/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
-#define RB_EMPTY_NODE(node)  \
-	((node)->__rb_parent_color == (unsigned long)(node))
-#define RB_CLEAR_NODE(node)  \
-	((node)->__rb_parent_color = (unsigned long)(node))
-
-
-extern void rb_insert_color(struct rb_node *, struct rb_root *);
-extern void rb_erase(struct rb_node *, struct rb_root *);
-
-
-/* Find logical next and previous nodes in a tree */
-extern struct rb_node *rb_next(const struct rb_node *);
-extern struct rb_node *rb_prev(const struct rb_node *);
-extern struct rb_node *rb_first(const struct rb_root *);
-extern struct rb_node *rb_last(const struct rb_root *);
-
-/* Postorder iteration - always visit the parent after its children */
-extern struct rb_node *rb_first_postorder(const struct rb_root *);
-extern struct rb_node *rb_next_postorder(const struct rb_node *);
-
-/* Fast replacement of a single node without remove/rebalance/add/rebalance */
-extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
-			    struct rb_root *root);
-extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
-				struct rb_root *root);
-
-static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
-				struct rb_node **rb_link)
-{
-	node->__rb_parent_color = (unsigned long)parent;
-	node->rb_left = node->rb_right = NULL;
-
-	*rb_link = node;
-}
-
-static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
-				    struct rb_node **rb_link)
-{
-	node->__rb_parent_color = (unsigned long)parent;
-	node->rb_left = node->rb_right = NULL;
-
-	rcu_assign_pointer(*rb_link, node);
-}
-
-#define rb_entry_safe(ptr, type, member) \
-	({ typeof(ptr) ____ptr = (ptr); \
-	   ____ptr ? rb_entry(____ptr, type, member) : NULL; \
-	})
-
-/**
- * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
- * given type allowing the backing memory of @pos to be invalidated
- *
- * @pos:	the 'type *' to use as a loop cursor.
- * @n:		another 'type *' to use as temporary storage
- * @root:	'rb_root *' of the rbtree.
- * @field:	the name of the rb_node field within 'type'.
- *
- * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
- * list_for_each_entry_safe() and allows the iteration to continue independent
- * of changes to @pos by the body of the loop.
- *
- * Note, however, that it cannot handle other modifications that re-order the
- * rbtree it is iterating over. This includes calling rb_erase() on @pos, as
- * rb_erase() may rebalance the tree, causing us to miss some nodes.
- */
-#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
-	for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
-	     pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
-			typeof(*pos), field); 1; }); \
-	     pos = n)
-
-/* Same as rb_first(), but O(1) */
-#define rb_first_cached(root) (root)->rb_leftmost
-
-static inline void rb_insert_color_cached(struct rb_node *node,
-					  struct rb_root_cached *root,
-					  bool leftmost)
-{
-	if (leftmost)
-		root->rb_leftmost = node;
-	rb_insert_color(node, &root->rb_root);
-}
-
-
-static inline struct rb_node *
-rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
-{
-	struct rb_node *leftmost = NULL;
-
-	if (root->rb_leftmost == node)
-		leftmost = root->rb_leftmost = rb_next(node);
-
-	rb_erase(node, &root->rb_root);
-
-	return leftmost;
-}
-
-static inline void rb_replace_node_cached(struct rb_node *victim,
-					  struct rb_node *new,
-					  struct rb_root_cached *root)
-{
-	if (root->rb_leftmost == victim)
-		root->rb_leftmost = new;
-	rb_replace_node(victim, new, &root->rb_root);
-}
-
-/*
- * The below helper functions use 2 operators with 3 different
- * calling conventions. The operators are related like:
- *
- *	comp(a->key,b) < 0  := less(a,b)
- *	comp(a->key,b) > 0  := less(b,a)
- *	comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
- *
- * If these operators define a partial order on the elements we make no
- * guarantee on which of the elements matching the key is found. See
- * rb_find().
- *
- * The reason for this is to allow the find() interface without requiring an
- * on-stack dummy object, which might not be feasible due to object size.
- */
-
-/**
- * rb_add_cached() - insert @node into the leftmost cached tree @tree
- * @node: node to insert
- * @tree: leftmost cached tree to insert @node into
- * @less: operator defining the (partial) node order
- *
- * Returns @node when it is the new leftmost, or NULL.
- */
-static __always_inline struct rb_node *
-rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
-	      bool (*less)(struct rb_node *, const struct rb_node *))
-{
-	struct rb_node **link = &tree->rb_root.rb_node;
-	struct rb_node *parent = NULL;
-	bool leftmost = true;
-
-	while (*link) {
-		parent = *link;
-		if (less(node, parent)) {
-			link = &parent->rb_left;
-		} else {
-			link = &parent->rb_right;
-			leftmost = false;
-		}
-	}
-
-	rb_link_node(node, parent, link);
-	rb_insert_color_cached(node, tree, leftmost);
-
-	return leftmost ? node : NULL;
-}
-
-/**
- * rb_add() - insert @node into @tree
- * @node: node to insert
- * @tree: tree to insert @node into
- * @less: operator defining the (partial) node order
- */
-static __always_inline void
-rb_add(struct rb_node *node, struct rb_root *tree,
-       bool (*less)(struct rb_node *, const struct rb_node *))
-{
-	struct rb_node **link = &tree->rb_node;
-	struct rb_node *parent = NULL;
-
-	while (*link) {
-		parent = *link;
-		if (less(node, parent))
-			link = &parent->rb_left;
-		else
-			link = &parent->rb_right;
-	}
-
-	rb_link_node(node, parent, link);
-	rb_insert_color(node, tree);
-}
-
-/**
- * rb_find_add() - find equivalent @node in @tree, or add @node
- * @node: node to look-for / insert
- * @tree: tree to search / modify
- * @cmp: operator defining the node order
- *
- * Returns the rb_node matching @node, or NULL when no match is found and @node
- * is inserted.
- */
-static __always_inline struct rb_node *
-rb_find_add(struct rb_node *node, struct rb_root *tree,
-	    int (*cmp)(struct rb_node *, const struct rb_node *))
-{
-	struct rb_node **link = &tree->rb_node;
-	struct rb_node *parent = NULL;
-	int c;
-
-	while (*link) {
-		parent = *link;
-		c = cmp(node, parent);
-
-		if (c < 0)
-			link = &parent->rb_left;
-		else if (c > 0)
-			link = &parent->rb_right;
-		else
-			return parent;
-	}
-
-	rb_link_node(node, parent, link);
-	rb_insert_color(node, tree);
-	return NULL;
-}
-
-/**
- * rb_find() - find @key in tree @tree
- * @key: key to match
- * @tree: tree to search
- * @cmp: operator defining the node order
- *
- * Returns the rb_node matching @key or NULL.
- */
-static __always_inline struct rb_node *
-rb_find(const void *key, const struct rb_root *tree,
-	int (*cmp)(const void *key, const struct rb_node *))
-{
-	struct rb_node *node = tree->rb_node;
-
-	while (node) {
-		int c = cmp(key, node);
-
-		if (c < 0)
-			node = node->rb_left;
-		else if (c > 0)
-			node = node->rb_right;
-		else
-			return node;
-	}
-
-	return NULL;
-}
-
-/**
- * rb_find_first() - find the first @key in @tree
- * @key: key to match
- * @tree: tree to search
- * @cmp: operator defining node order
- *
- * Returns the leftmost node matching @key, or NULL.
- */
-static __always_inline struct rb_node *
-rb_find_first(const void *key, const struct rb_root *tree,
-	      int (*cmp)(const void *key, const struct rb_node *))
-{
-	struct rb_node *node = tree->rb_node;
-	struct rb_node *match = NULL;
-
-	while (node) {
-		int c = cmp(key, node);
-
-		if (c <= 0) {
-			if (!c)
-				match = node;
-			node = node->rb_left;
-		} else if (c > 0) {
-			node = node->rb_right;
-		}
-	}
-
-	return match;
-}
-
-/**
- * rb_next_match() - find the next @key in @tree
- * @key: key to match
- * @tree: tree to search
- * @cmp: operator defining node order
- *
- * Returns the next node matching @key, or NULL.
- */
-static __always_inline struct rb_node *
-rb_next_match(const void *key, struct rb_node *node,
-	      int (*cmp)(const void *key, const struct rb_node *))
-{
-	node = rb_next(node);
-	if (node && cmp(key, node))
-		node = NULL;
-	return node;
-}
-
-/**
- * rb_for_each() - iterates a subtree matching @key
- * @node: iterator
- * @key: key to match
- * @tree: tree to search
- * @cmp: operator defining node order
- */
-#define rb_for_each(node, key, tree, cmp) \
-	for ((node) = rb_find_first((key), (tree), (cmp)); \
-	     (node); (node) = rb_next_match((key), (node), (cmp)))
-
-#endif	/* _LINUX_RBTREE_H */
@@ -1,313 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
-  Red Black Trees
-  (C) 1999  Andrea Arcangeli <andrea@suse.de>
-  (C) 2002  David Woodhouse <dwmw2@infradead.org>
-  (C) 2012  Michel Lespinasse <walken@google.com>
-
-
-  linux/include/linux/rbtree_augmented.h
-*/
-
-#ifndef _LINUX_RBTREE_AUGMENTED_H
-#define _LINUX_RBTREE_AUGMENTED_H
-
-/*
- * Please note - only struct rb_augment_callbacks and the prototypes for
- * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
- * The rest are implementation details you are not expected to depend on.
- *
- * See Documentation/core-api/rbtree.rst for documentation and samples.
- */
-
-struct rb_augment_callbacks {
-	void (*propagate)(struct rb_node *node, struct rb_node *stop);
-	void (*copy)(struct rb_node *old, struct rb_node *new);
-	void (*rotate)(struct rb_node *old, struct rb_node *new);
-};
-
-extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
-	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
-
-/*
- * Fixup the rbtree and update the augmented information when rebalancing.
- *
- * On insertion, the user must update the augmented information on the path
- * leading to the inserted node, then call rb_link_node() as usual and
- * rb_insert_augmented() instead of the usual rb_insert_color() call.
- * If rb_insert_augmented() rebalances the rbtree, it will callback into
- * a user provided function to update the augmented information on the
- * affected subtrees.
- */
-static inline void
-rb_insert_augmented(struct rb_node *node, struct rb_root *root,
-		    const struct rb_augment_callbacks *augment)
-{
-	__rb_insert_augmented(node, root, augment->rotate);
-}
-
-static inline void
-rb_insert_augmented_cached(struct rb_node *node,
-			   struct rb_root_cached *root, bool newleft,
-			   const struct rb_augment_callbacks *augment)
-{
-	if (newleft)
-		root->rb_leftmost = node;
-	rb_insert_augmented(node, &root->rb_root, augment);
-}
-
-/*
- * Template for declaring augmented rbtree callbacks (generic case)
- *
- * RBSTATIC:    'static' or empty
- * RBNAME:      name of the rb_augment_callbacks structure
- * RBSTRUCT:    struct type of the tree nodes
- * RBFIELD:     name of struct rb_node field within RBSTRUCT
- * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
- * RBCOMPUTE:   name of function that recomputes the RBAUGMENTED data
- */
-
-#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,				\
-			     RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE)	\
-static inline void							\
-RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop)		\
-{									\
-	while (rb != stop) {						\
-		RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD);	\
-		if (RBCOMPUTE(node, true))				\
-			break;						\
-		rb = rb_parent(&node->RBFIELD);				\
-	}								\
-}									\
-static inline void							\
-RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)		\
-{									\
-	RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);		\
-	RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);		\
-	new->RBAUGMENTED = old->RBAUGMENTED;				\
-}									\
-static void								\
-RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)	\
-{									\
-	RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);		\
-	RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);		\
-	new->RBAUGMENTED = old->RBAUGMENTED;				\
-	RBCOMPUTE(old, false);						\
-}									\
-RBSTATIC const struct rb_augment_callbacks RBNAME = {			\
-	.propagate = RBNAME ## _propagate,				\
-	.copy = RBNAME ## _copy,					\
-	.rotate = RBNAME ## _rotate					\
-};
-
-/*
- * Template for declaring augmented rbtree callbacks,
- * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
- *
- * RBSTATIC:    'static' or empty
- * RBNAME:      name of the rb_augment_callbacks structure
- * RBSTRUCT:    struct type of the tree nodes
- * RBFIELD:     name of struct rb_node field within RBSTRUCT
- * RBTYPE:      type of the RBAUGMENTED field
- * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
- * RBCOMPUTE:   name of function that returns the per-node RBTYPE scalar
- */
-
-#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD,	      \
-				 RBTYPE, RBAUGMENTED, RBCOMPUTE)	      \
-static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit)	      \
-{									      \
-	RBSTRUCT *child;						      \
-	RBTYPE max = RBCOMPUTE(node);					      \
-	if (node->RBFIELD.rb_left) {					      \
-		child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD);   \
-		if (child->RBAUGMENTED > max)				      \
-			max = child->RBAUGMENTED;			      \
-	}								      \
-	if (node->RBFIELD.rb_right) {					      \
-		child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD);  \
-		if (child->RBAUGMENTED > max)				      \
-			max = child->RBAUGMENTED;			      \
-	}								      \
-	if (exit && node->RBAUGMENTED == max)				      \
-		return true;						      \
-	node->RBAUGMENTED = max;					      \
-	return false;							      \
-}									      \
-RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,					      \
-		     RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)
-
-
-#define	RB_RED		0
-#define	RB_BLACK	1
-
-#define __rb_parent(pc)    ((struct rb_node *)(pc & ~3))
-
-#define __rb_color(pc)     ((pc) & 1)
-#define __rb_is_black(pc)  __rb_color(pc)
-#define __rb_is_red(pc)    (!__rb_color(pc))
-#define rb_color(rb)       __rb_color((rb)->__rb_parent_color)
-#define rb_is_red(rb)      __rb_is_red((rb)->__rb_parent_color)
-#define rb_is_black(rb)    __rb_is_black((rb)->__rb_parent_color)
-
-static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
-{
-	rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
-}
-
-static inline void rb_set_parent_color(struct rb_node *rb,
-				       struct rb_node *p, int color)
-{
-	rb->__rb_parent_color = (unsigned long)p | color;
-}
-
-static inline void
-__rb_change_child(struct rb_node *old, struct rb_node *new,
-		  struct rb_node *parent, struct rb_root *root)
-{
-	if (parent) {
-		if (parent->rb_left == old)
-			WRITE_ONCE(parent->rb_left, new);
-		else
-			WRITE_ONCE(parent->rb_right, new);
-	} else
-		WRITE_ONCE(root->rb_node, new);
-}
-
-static inline void
-__rb_change_child_rcu(struct rb_node *old, struct rb_node *new,
-		      struct rb_node *parent, struct rb_root *root)
-{
-	if (parent) {
-		if (parent->rb_left == old)
-			rcu_assign_pointer(parent->rb_left, new);
-		else
-			rcu_assign_pointer(parent->rb_right, new);
-	} else
-		rcu_assign_pointer(root->rb_node, new);
-}
-
-extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
-	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
-
-static __always_inline struct rb_node *
-__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
-		     const struct rb_augment_callbacks *augment)
-{
-	struct rb_node *child = node->rb_right;
-	struct rb_node *tmp = node->rb_left;
-	struct rb_node *parent, *rebalance;
-	unsigned long pc;
-
-	if (!tmp) {
-		/*
-		 * Case 1: node to erase has no more than 1 child (easy!)
-		 *
-		 * Note that if there is one child it must be red due to 5)
-		 * and node must be black due to 4). We adjust colors locally
-		 * so as to bypass __rb_erase_color() later on.
-		 */
-		pc = node->__rb_parent_color;
-		parent = __rb_parent(pc);
-		__rb_change_child(node, child, parent, root);
-		if (child) {
-			child->__rb_parent_color = pc;
-			rebalance = NULL;
-		} else
-			rebalance = __rb_is_black(pc) ? parent : NULL;
-		tmp = parent;
-	} else if (!child) {
-		/* Still case 1, but this time the child is node->rb_left */
-		tmp->__rb_parent_color = pc = node->__rb_parent_color;
-		parent = __rb_parent(pc);
-		__rb_change_child(node, tmp, parent, root);
-		rebalance = NULL;
-		tmp = parent;
-	} else {
-		struct rb_node *successor = child, *child2;
-
-		tmp = child->rb_left;
-		if (!tmp) {
-			/*
-			 * Case 2: node's successor is its right child
-			 *
-			 *    (n)          (s)
-			 *    / \          / \
-			 *  (x) (s)  ->  (x) (c)
-			 *        \
-			 *        (c)
-			 */
-			parent = successor;
-			child2 = successor->rb_right;
-
-			augment->copy(node, successor);
-		} else {
-			/*
-			 * Case 3: node's successor is leftmost under
-			 * node's right child subtree
-			 *
-			 *    (n)          (s)
-			 *    / \          / \
-			 *  (x) (y)  ->  (x) (y)
-			 *      /            /
-			 *    (p)          (p)
-			 *    /            /
-			 *  (s)          (c)
-			 *    \
-			 *    (c)
-			 */
-			do {
-				parent = successor;
-				successor = tmp;
-				tmp = tmp->rb_left;
-			} while (tmp);
-			child2 = successor->rb_right;
-			WRITE_ONCE(parent->rb_left, child2);
-			WRITE_ONCE(successor->rb_right, child);
-			rb_set_parent(child, successor);
-
-			augment->copy(node, successor);
-			augment->propagate(parent, successor);
-		}
-
-		tmp = node->rb_left;
-		WRITE_ONCE(successor->rb_left, tmp);
-		rb_set_parent(tmp, successor);
-
-		pc = node->__rb_parent_color;
-		tmp = __rb_parent(pc);
-		__rb_change_child(node, successor, tmp, root);
-
-		if (child2) {
-			rb_set_parent_color(child2, parent, RB_BLACK);
-			rebalance = NULL;
-		} else {
-			rebalance = rb_is_black(successor) ? parent : NULL;
-		}
-		successor->__rb_parent_color = pc;
-		tmp = successor;
-	}
-
-	augment->propagate(tmp, NULL);
-	return rebalance;
-}
-
-static __always_inline void
-rb_erase_augmented(struct rb_node *node, struct rb_root *root,
-		   const struct rb_augment_callbacks *augment)
-{
-	struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
-	if (rebalance)
-		__rb_erase_color(rebalance, root, augment->rotate);
-}
-
-static __always_inline void
-rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
-			  const struct rb_augment_callbacks *augment)
-{
-	if (root->rb_leftmost == node)
-		root->rb_leftmost = rb_next(node);
-	rb_erase_augmented(node, &root->rb_root, augment);
-}
-
-#endif	/* _LINUX_RBTREE_AUGMENTED_H */
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _LINUX_RBTREE_TYPES_H
-#define _LINUX_RBTREE_TYPES_H
-
-struct rb_node {
-	unsigned long  __rb_parent_color;
-	struct rb_node *rb_right;
-	struct rb_node *rb_left;
-} __attribute__((aligned(sizeof(long))));
-/* The alignment might seem pointless, but allegedly CRIS needs it */
-
-struct rb_root {
-	struct rb_node *rb_node;
-};
-
-/*
- * Leftmost-cached rbtrees.
- *
- * We do not cache the rightmost node based on footprint
- * size vs number of potential users that could benefit
- * from O(1) rb_last(). Just not worth it, users that want
- * this feature can always implement the logic explicitly.
- * Furthermore, users that want to cache both pointers may
- * find it a bit asymmetric, but that's ok.
- */
-struct rb_root_cached {
-	struct rb_root rb_root;
-	struct rb_node *rb_leftmost;
-};
-
-#define RB_ROOT (struct rb_root) { NULL, }
-#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
-
-#endif
Author	SHA1	Message	Date
Zach Brown	55f0a0ded4	Add nr_log_trees debugfs counter Signed-off-by: Zach Brown <zab@versity.com>	2024-01-25 12:48:27 -08:00
Zach Brown	dcfd22e4b1	Force merge creation timeout Signed-off-by: Zach Brown <zab@versity.com>	2024-01-25 12:46:46 -08:00