mirror of https://github.com/versity/scoutfs.git
synced 2026-05-01 10:25:43 +00:00

Compare commits: zab/crtime...zab/read_m

2 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | 8efb30afbc |  |
|  | df90b3eb90 |  |
@@ -18,7 +18,6 @@ scoutfs-y += \
	dir.o \
	export.o \
	ext.o \
	fence.o \
	file.o \
	forest.o \
	inode.o \
@@ -43,7 +42,6 @@ scoutfs-y += \
	trans.o \
	triggers.o \
	tseq.o \
	volopt.o \
	xattr.o

#

405 kmod/src/alloc.c
@@ -29,8 +29,8 @@
 * The core allocator uses extent items in btrees rooted in the super.
 * Each free extent is stored in two items.  The first item is indexed
 * by block location and is used to merge adjacent extents when freeing.
 * The second item is indexed by the order of the length and is used to
 * find large extents to allocate from.
 * The second item is indexed by length and is used to find large
 * extents to allocate from.
 *
 * Allocation always consumes the front of the largest extent.  This
 * attempts to discourage fragmentation by giving smaller freed extents
@@ -66,53 +66,26 @@
 * blocks to modify the next blocks, and swaps them at each transaction.
 */

/*
 * Return the order of the length of a free extent, which we define as
 * floor(log_8_(len)): 0..7 = 0, 8..63 = 1, etc.
 */
static u64 free_extent_order(u64 len)
{
	return (fls64(len | 1) - 1) / 3;
}

/*
 * The smallest (non-zero) length that will be mapped to the same order
 * as the given length.
 */
static u64 smallest_order_length(u64 len)
{
	return 1ULL << (free_extent_order(len) * 3);
}
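As a quick sketch of the order math above (an assumed userspace stand-in, with __builtin_clzll() in place of the kernel's fls64()), the mapping 0..7 -> 0, 8..63 -> 1, ... falls out of floor(log2(len) / 3):

#include <stdio.h>
#include <stdint.h>

/* fls64(x) == 64 - __builtin_clzll(x) for x != 0; len | 1 avoids clz(0) */
static uint64_t order_of(uint64_t len)
{
	return (uint64_t)(63 - __builtin_clzll(len | 1)) / 3;
}

static uint64_t smallest_len(uint64_t len)
{
	return 1ULL << (order_of(len) * 3);
}

int main(void)
{
	uint64_t lens[] = { 1, 7, 8, 63, 64, 511, 512 };
	int i;

	/* prints orders 0, 0, 1, 1, 2, 2, 3 and smallest lengths 1, 1, 8, 8, 64, 64, 512 */
	for (i = 0; i < 7; i++)
		printf("len %llu: order %llu, smallest %llu\n",
		       (unsigned long long)lens[i],
		       (unsigned long long)order_of(lens[i]),
		       (unsigned long long)smallest_len(lens[i]));
	return 0;
}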
/*
 * Free extents don't have flags and are stored in two indexes sorted by
 * block location and by length order, largest first.  The location key
 * field is set to the final block in the extent so that we can find
 * intersections by calling _next() with the start of the range we're
 * searching for.
 *
 * We never store 0 length extents but we do build keys for searching
 * the order index from 0,0 without having to map it to a real extent.
 * block location and by length, largest first.  The block location key
 * is set to the final block in the extent so that we can find
 * intersections by calling _next() iterators starting with the block
 * we're searching for.
 */
static void init_ext_key(struct scoutfs_key *key, int zone, u64 start, u64 len)
static void init_ext_key(struct scoutfs_key *key, int type, u64 start, u64 len)
{
	*key = (struct scoutfs_key) {
		.sk_zone = zone,
		.sk_zone = SCOUTFS_FREE_EXTENT_ZONE,
		.sk_type = type,
	};

	if (len == 0) {
		/* we only use 0 len extents for magic 0,0 order lookups */
		WARN_ON_ONCE(zone != SCOUTFS_FREE_EXTENT_ORDER_ZONE || start != 0);
		return;
	}

	if (zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
	if (type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) {
		key->skfb_end = cpu_to_le64(start + len - 1);
		key->skfb_len = cpu_to_le64(len);
	} else if (zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) {
		key->skfo_revord = cpu_to_le64(U64_MAX - free_extent_order(len));
		key->skfo_end = cpu_to_le64(start + len - 1);
		key->skfo_len = cpu_to_le64(len);
	} else if (type == SCOUTFS_FREE_EXTENT_LEN_TYPE) {
		key->skfl_neglen = cpu_to_le64(-len);
		key->skfl_blkno = cpu_to_le64(start);
	} else {
		BUG();
	}
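Why the negated length in the new key sorts largest-first (a sketch of the arithmetic, assuming keys compare ascending by the CPU-order value of each field): storing -len as an unsigned 64-bit value yields 2^64 - len, so a longer extent produces a numerically smaller key and a _next() search from 0,0 finds the largest extent first:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t lens[] = { 1, 8, 4096 };
	int i;

	/* neglen shrinks as len grows: ...ffff, ...fff8, ...f000 */
	for (i = 0; i < 3; i++)
		printf("len %-5llu neglen %016llx\n",
		       (unsigned long long)lens[i],
		       (unsigned long long)(0 - lens[i]));
	return 0;
}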
@@ -120,27 +93,23 @@ static void init_ext_key(struct scoutfs_key *key, int zone, u64 start, u64 len)

static void ext_from_key(struct scoutfs_extent *ext, struct scoutfs_key *key)
{
	if (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
	if (key->sk_type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) {
		ext->start = le64_to_cpu(key->skfb_end) -
			     le64_to_cpu(key->skfb_len) + 1;
		ext->len = le64_to_cpu(key->skfb_len);
	} else {
		ext->start = le64_to_cpu(key->skfo_end) -
			     le64_to_cpu(key->skfo_len) + 1;
		ext->len = le64_to_cpu(key->skfo_len);
		ext->start = le64_to_cpu(key->skfl_blkno);
		ext->len = -le64_to_cpu(key->skfl_neglen);
	}
	ext->map = 0;
	ext->flags = 0;

	/* we never store 0 length extents */
	WARN_ON_ONCE(ext->len == 0);
}

struct alloc_ext_args {
	struct scoutfs_alloc *alloc;
	struct scoutfs_block_writer *wri;
	struct scoutfs_alloc_root *root;
	int zone;
	int type;
};

static int alloc_ext_next(struct super_block *sb, void *arg,
@@ -151,13 +120,13 @@ static int alloc_ext_next(struct super_block *sb, void *arg,
	struct scoutfs_key key;
	int ret;

	init_ext_key(&key, args->zone, start, len);
	init_ext_key(&key, args->type, start, len);

	ret = scoutfs_btree_next(sb, &args->root->root, &key, &iref);
	if (ret == 0) {
		if (iref.val_len != 0)
			ret = -EIO;
		else if (iref.key->sk_zone != args->zone)
		else if (iref.key->sk_type != args->type)
			ret = -ENOENT;
		else
			ext_from_key(ext, iref.key);
@@ -170,19 +139,19 @@ static int alloc_ext_next(struct super_block *sb, void *arg,
	return ret;
}

static int other_zone(int zone)
static int other_type(int type)
{
	if (zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
		return SCOUTFS_FREE_EXTENT_ORDER_ZONE;
	else if (zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE)
		return SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
	if (type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE)
		return SCOUTFS_FREE_EXTENT_LEN_TYPE;
	else if (type == SCOUTFS_FREE_EXTENT_LEN_TYPE)
		return SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
	else
		BUG();
}

/*
 * Insert an extent along with its matching item which is indexed by
 * opposite of its order or blkno.  If we succeed we update the root's
 * opposite of its len or blkno.  If we succeed we update the root's
 * record of the total length of all the stored extents.
 */
static int alloc_ext_insert(struct super_block *sb, void *arg,
@@ -198,8 +167,8 @@ static int alloc_ext_insert(struct super_block *sb, void *arg,
	if (WARN_ON_ONCE(map || flags))
		return -EINVAL;

	init_ext_key(&key, args->zone, start, len);
	init_ext_key(&other, other_zone(args->zone), start, len);
	init_ext_key(&key, args->type, start, len);
	init_ext_key(&other, other_type(args->type), start, len);

	ret = scoutfs_btree_insert(sb, args->alloc, args->wri,
				   &args->root->root, &key, NULL, 0);
@@ -227,8 +196,8 @@ static int alloc_ext_remove(struct super_block *sb, void *arg,
	int ret;
	int err;

	init_ext_key(&key, args->zone, start, len);
	init_ext_key(&other, other_zone(args->zone), start, len);
	init_ext_key(&key, args->type, start, len);
	init_ext_key(&other, other_type(args->type), start, len);

	ret = scoutfs_btree_delete(sb, args->alloc, args->wri,
				   &args->root->root, &key);
@@ -650,7 +619,7 @@ int scoutfs_dalloc_return_cached(struct super_block *sb,
		.alloc = alloc,
		.wri = wri,
		.root = &dalloc->root,
		.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
		.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
	};
	int ret = 0;

@@ -676,14 +645,6 @@ int scoutfs_dalloc_return_cached(struct super_block *sb,
 *
 * Unlike meta allocations, the caller is expected to serialize
 * allocations from the root.
 *
 * ENOBUFS is returned if the data allocator ran out of space and we can
 * probably refill it from the server.  The caller is expected to back
 * out, commit the transaction, and try again.
 *
 * ENOSPC is returned if the data allocator ran out of space but we have
 * a flag from the server telling us that there's no more space
 * available.  This is a hard error and should be returned.
 */
int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
		       struct scoutfs_block_writer *wri,
@@ -694,7 +655,7 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
		.alloc = alloc,
		.wri = wri,
		.root = &dalloc->root,
		.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
		.type = SCOUTFS_FREE_EXTENT_LEN_TYPE,
	};
	struct scoutfs_extent ext;
	u64 len;
@@ -732,13 +693,13 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
	ret = 0;
out:
	if (ret < 0) {
		if (ret == -ENOENT) {
			if (le32_to_cpu(dalloc->root.flags) & SCOUTFS_ALLOC_FLAG_LOW)
				ret = -ENOSPC;
			else
				ret = -ENOBUFS;
		}

		/*
		 * Special retval meaning there wasn't space to alloc from
		 * this txn.  Doesn't mean filesystem is completely full.
		 * Maybe upper layers want to try again.
		 */
		if (ret == -ENOENT)
			ret = -ENOBUFS;
		*blkno_ret = 0;
		*count_ret = 0;
	} else {
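A hypothetical caller sketch of the ENOBUFS/ENOSPC contract described above (commit_and_refill() and the exact argument list are illustrative assumptions, not scoutfs code):

	for (;;) {
		ret = scoutfs_alloc_data(sb, alloc, wri, dalloc, count,
					 &blkno, &count_ret);
		if (ret != -ENOBUFS)
			break;	/* 0 on success; -ENOSPC is a hard error */

		/* back out, commit the transaction to refill, and retry */
		ret = commit_and_refill(sb);	/* hypothetical helper */
		if (ret < 0)
			break;
	}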
@@ -767,7 +728,7 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
		.alloc = alloc,
		.wri = wri,
		.root = root,
		.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
		.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
	};
	int ret;

@@ -780,95 +741,6 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
	return ret;
}

/*
 * Return the first zone bit that the extent intersects with.
 */
static int first_extent_zone(struct scoutfs_extent *ext, __le64 *zones, u64 zone_blocks)
{
	int first;
	int last;
	int nr;

	first = div64_u64(ext->start, zone_blocks);
	last = div64_u64(ext->start + ext->len - 1, zone_blocks);

	nr = find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, first);
	if (nr <= last)
		return nr;

	return SCOUTFS_DATA_ALLOC_MAX_ZONES;
}

/*
 * Find an extent in specific zones to satisfy an allocation.  We use
 * the order items to search for the largest extent that intersects with
 * the zones whose bits are set in the caller's bitmap.
 */
static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *root,
			    __le64 *zones, u64 zone_blocks,
			    struct scoutfs_extent *found_ret, u64 count,
			    struct scoutfs_extent *ext_ret)
{
	struct alloc_ext_args args = {
		.root = root,
		.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
	};
	struct scoutfs_extent found;
	struct scoutfs_extent ext;
	u64 start;
	u64 len;
	int nr;
	int ret;

	/* don't bother when there are no bits set */
	if (find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, 0) ==
	    SCOUTFS_DATA_ALLOC_MAX_ZONES)
		return -ENOENT;

	/* start searching for largest extent from the first zone */
	len = smallest_order_length(SCOUTFS_BLOCK_SM_MAX);
	nr = 0;

	for (;;) {
		/* search for extents in the next zone at our order */
		nr = find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, nr);
		if (nr >= SCOUTFS_DATA_ALLOC_MAX_ZONES) {
			/* wrap down to next smaller order if we run out of bits */
			len >>= 3;
			if (len == 0) {
				ret = -ENOENT;
				break;
			}
			nr = find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, 0);
		}

		start = (u64)nr * zone_blocks;

		ret = scoutfs_ext_next(sb, &alloc_ext_ops, &args, start, len, &found);
		if (ret < 0)
			break;

		/* see if the next extent intersects any zones */
		nr = first_extent_zone(&found, zones, zone_blocks);
		if (nr < SCOUTFS_DATA_ALLOC_MAX_ZONES) {
			start = (u64)nr * zone_blocks;

			ext.start = max(start, found.start);
			ext.len = min(count, found.start + found.len - ext.start);

			*found_ret = found;
			*ext_ret = ext;
			ret = 0;
			break;
		}

		/* continue searching past extent */
		nr = div64_u64(found.start + found.len - 1, zone_blocks) + 1;
		len = smallest_order_length(found.len);
	}

	return ret;
}

/*
 * Move extent items adding up to the requested total length from the
@@ -879,11 +751,6 @@ static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *r
 * -ENOENT is returned if we run out of extents in the source tree
 * before moving the total.
 *
 * The caller can specify that extents in the source tree should first
 * be found based on their zone bitmaps.  We'll first try to find
 * extents in the exclusive zones, then vacant zones, and then we'll
 * fall back to normal allocation that ignores zones.
 *
 * This first pass is not optimal because it performs full btree walks
 * per extent.  We could optimize this with more clever btree item
 * manipulation functions which can iterate through src and dst blocks
@@ -892,77 +759,32 @@ static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *r
int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
		       struct scoutfs_block_writer *wri,
		       struct scoutfs_alloc_root *dst,
		       struct scoutfs_alloc_root *src, u64 total,
		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks)
		       struct scoutfs_alloc_root *src, u64 total)
{
	struct alloc_ext_args args = {
		.alloc = alloc,
		.wri = wri,
	};
	struct scoutfs_extent found;
	struct scoutfs_extent ext;
	u64 moved = 0;
	u64 count;
	int ret = 0;
	int err;

	if (zone_blocks == 0) {
		exclusive = NULL;
		vacant = NULL;
	}

	while (moved < total) {
		count = total - moved;

		if (exclusive) {
			/* first try to find extents in our exclusive zones */
			ret = find_zone_extent(sb, src, exclusive, zone_blocks,
					       &found, count, &ext);
			if (ret == -ENOENT) {
				exclusive = NULL;
				continue;
			}
		} else if (vacant) {
			/* then try to find extents in vacant zones */
			ret = find_zone_extent(sb, src, vacant, zone_blocks,
					       &found, count, &ext);
			if (ret == -ENOENT) {
				vacant = NULL;
				continue;
			}
		} else {
			/* otherwise fall back to finding extents anywhere */
			args.root = src;
			args.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE;
			ret = scoutfs_ext_next(sb, &alloc_ext_ops, &args, 0, 0, &found);
			if (ret == 0) {
				ext.start = found.start;
				ext.len = min(count, found.len);
			}
		}
		if (ret < 0)
			break;

		/* searching set start/len, finish initializing alloced extent */
		ext.map = found.map ? ext.start - found.start + found.map : 0;
		ext.flags = found.flags;

		/* remove the allocation from the found extent */
		args.root = src;
		args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
		ret = scoutfs_ext_remove(sb, &alloc_ext_ops, &args, ext.start, ext.len);
		args.type = SCOUTFS_FREE_EXTENT_LEN_TYPE;
		ret = scoutfs_ext_alloc(sb, &alloc_ext_ops, &args,
					0, 0, total - moved, &ext);
		if (ret < 0)
			break;

		/* insert the allocated extent into the dest */
		args.root = dst;
		args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
		args.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
		ret = scoutfs_ext_insert(sb, &alloc_ext_ops, &args, ext.start,
					 ext.len, ext.map, ext.flags);
		if (ret < 0) {
			/* and put it back in src if insertion failed */
			args.root = src;
			args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
			args.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
			err = scoutfs_ext_insert(sb, &alloc_ext_ops, &args,
						 ext.start, ext.len, ext.map,
						 ext.flags);
@@ -1030,7 +852,7 @@ out:
 * a list block and all the btree blocks that store extent items.
 *
 * At most, an extent operation can dirty down three paths of the tree
 * to modify a blkno item and two distant order items.  We can grow and
 * to modify a blkno item and two distant len items.  We can grow and
 * split the root, and then those three paths could share blocks but each
 * modify two leaf blocks.
 */
@@ -1079,7 +901,7 @@ int scoutfs_alloc_fill_list(struct super_block *sb,
		.alloc = alloc,
		.wri = wri,
		.root = root,
		.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
		.type = SCOUTFS_FREE_EXTENT_LEN_TYPE,
	};
	struct scoutfs_alloc_list_block *lblk;
	struct scoutfs_block *bl = NULL;
@@ -1136,7 +958,7 @@ int scoutfs_alloc_empty_list(struct super_block *sb,
		.alloc = alloc,
		.wri = wri,
		.root = root,
		.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
		.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
	};
	struct scoutfs_alloc_list_block *lblk = NULL;
	struct scoutfs_block *bl = NULL;
@@ -1269,20 +1091,6 @@ bool scoutfs_alloc_meta_low(struct super_block *sb,
	return lo;
}

bool scoutfs_alloc_test_flag(struct super_block *sb,
			     struct scoutfs_alloc *alloc, u32 flag)
{
	unsigned int seq;
	bool set;

	do {
		seq = read_seqbegin(&alloc->seqlock);
		set = !!(le32_to_cpu(alloc->avail.flags) & flag);
	} while (read_seqretry(&alloc->seqlock, seq));

	return set;
}

/*
 * Call the caller's callback for every persistent allocator structure
 * we can find.
@@ -1294,15 +1102,9 @@ int scoutfs_alloc_foreach(struct super_block *sb,
	struct scoutfs_block_ref refs[2] = {{0,}};
	struct scoutfs_super_block *super = NULL;
	struct scoutfs_srch_compact *sc;
	struct scoutfs_log_merge_request *lmreq;
	struct scoutfs_log_merge_complete *lmcomp;
	struct scoutfs_log_trees lt;
	SCOUTFS_BTREE_ITEM_REF(iref);
	struct scoutfs_key key;
	int expected;
	u64 avail_tot;
	u64 freed_tot;
	u64 id;
	int ret;

	super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
@@ -1409,57 +1211,6 @@ retry:
		scoutfs_key_inc(&key);
	}

	/* log merge allocators */
	memset(&key, 0, sizeof(key));
	key.sk_zone = SCOUTFS_LOG_MERGE_REQUEST_ZONE;
	expected = sizeof(*lmreq);
	id = 0;
	avail_tot = 0;
	freed_tot = 0;

	for (;;) {
		ret = scoutfs_btree_next(sb, &super->log_merge, &key, &iref);
		if (ret == 0) {
			if (iref.key->sk_zone != key.sk_zone) {
				ret = -ENOENT;
			} else if (iref.val_len == expected) {
				key = *iref.key;
				if (key.sk_zone == SCOUTFS_LOG_MERGE_REQUEST_ZONE) {
					lmreq = iref.val;
					id = le64_to_cpu(lmreq->rid);
					avail_tot = le64_to_cpu(lmreq->meta_avail.total_nr);
					freed_tot = le64_to_cpu(lmreq->meta_freed.total_nr);
				} else {
					lmcomp = iref.val;
					id = le64_to_cpu(lmcomp->rid);
					avail_tot = le64_to_cpu(lmcomp->meta_avail.total_nr);
					freed_tot = le64_to_cpu(lmcomp->meta_freed.total_nr);
				}
			} else {
				ret = -EIO;
			}
			scoutfs_btree_put_iref(&iref);
		}
		if (ret == -ENOENT) {
			if (key.sk_zone == SCOUTFS_LOG_MERGE_REQUEST_ZONE) {
				memset(&key, 0, sizeof(key));
				key.sk_zone = SCOUTFS_LOG_MERGE_COMPLETE_ZONE;
				expected = sizeof(*lmcomp);
				continue;
			}
			break;
		}
		if (ret < 0)
			goto out;

		ret = cb(sb, arg, SCOUTFS_ALLOC_OWNER_LOG_MERGE, id, true, true, avail_tot) ?:
		      cb(sb, arg, SCOUTFS_ALLOC_OWNER_LOG_MERGE, id, true, false, freed_tot);
		if (ret < 0)
			goto out;

		scoutfs_key_inc(&key);
	}

	ret = 0;
out:
	if (ret == -ESTALE) {
@@ -1476,63 +1227,3 @@ out:
	kfree(sc);
	return ret;
}

struct foreach_cb_args {
	scoutfs_alloc_extent_cb_t cb;
	void *cb_arg;
};

static int alloc_btree_extent_item_cb(struct super_block *sb, struct scoutfs_key *key,
				      void *val, int val_len, void *arg)
{
	struct foreach_cb_args *cba = arg;
	struct scoutfs_extent ext;

	if (key->sk_zone != SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
		return -ENOENT;

	ext_from_key(&ext, key);
	cba->cb(sb, cba->cb_arg, &ext);

	return 0;
}

/*
 * Call the caller's callback on each extent stored in the allocator's
 * btree.  The callback sees extents called in order by starting blkno.
 */
int scoutfs_alloc_extents_cb(struct super_block *sb, struct scoutfs_alloc_root *root,
			     scoutfs_alloc_extent_cb_t cb, void *cb_arg)
{
	struct foreach_cb_args cba = {
		.cb = cb,
		.cb_arg = cb_arg,
	};
	struct scoutfs_key start;
	struct scoutfs_key end;
	struct scoutfs_key key;
	int ret;

	init_ext_key(&key, SCOUTFS_FREE_EXTENT_BLKNO_ZONE, 0, 1);

	for (;;) {
		/* will stop at order items before getting stuck in final block */
		BUILD_BUG_ON(SCOUTFS_FREE_EXTENT_BLKNO_ZONE > SCOUTFS_FREE_EXTENT_ORDER_ZONE);
		init_ext_key(&start, SCOUTFS_FREE_EXTENT_BLKNO_ZONE, 0, 1);
		init_ext_key(&end, SCOUTFS_FREE_EXTENT_ORDER_ZONE, 0, 1);

		ret = scoutfs_btree_read_items(sb, &root->root, &key, &start, &end,
					       alloc_btree_extent_item_cb, &cba);
		if (ret < 0 || end.sk_zone != SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
			if (ret == -ENOENT)
				ret = 0;
			break;
		}

		key = end;
		scoutfs_key_inc(&key);
	}

	return ret;
}
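A hypothetical usage sketch of the extent callback iterator above (count_arg and count_ext are illustrative names, not scoutfs code), matching the scoutfs_alloc_extent_cb_t typedef declared in alloc.h below:

	struct count_arg {
		u64 nr;
		u64 blocks;
	};

	static void count_ext(struct super_block *sb, void *cb_arg,
			      struct scoutfs_extent *ext)
	{
		struct count_arg *ca = cb_arg;

		ca->nr++;
		ca->blocks += ext->len;
	}

	/* caller: tally every free extent in the allocator btree */
	struct count_arg ca = { 0, };
	ret = scoutfs_alloc_extents_cb(sb, root, count_ext, &ca);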
kmod/src/alloc.h

@@ -38,10 +38,6 @@
#define SCOUTFS_ALLOC_DATA_LG_THRESH \
	(8ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)

/* the client will force commits if data allocators get too low */
#define SCOUTFS_ALLOC_DATA_REFILL_THRESH \
	((256ULL * 1024 * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)

/*
 * Fill client alloc roots to the target when they fall below the lo
 * threshold.
@@ -59,16 +55,15 @@
#define SCOUTFS_SERVER_DATA_FILL_LO \
	(1ULL * 1024 * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)

/*
 * Log merge meta allocations are only used for one request and will
 * never use more than the dirty limit.
 * Each of the server meta_alloc roots will try to keep a minimum amount
 * of free blocks.  The server will swap roots when its current avail
 * falls below the threshold while the freed root is still above it.  It
 * must have room for the largest allocation attempted in a
 * transaction on the server.
 */
#define SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT (64ULL * 1024 * 1024)
/* a few extra blocks for alloc blocks */
#define SCOUTFS_SERVER_MERGE_FILL_TARGET \
	((SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT >> SCOUTFS_BLOCK_LG_SHIFT) + 4)
#define SCOUTFS_SERVER_MERGE_FILL_LO SCOUTFS_SERVER_MERGE_FILL_TARGET
#define SCOUTFS_SERVER_META_ALLOC_MIN \
	(SCOUTFS_SERVER_META_FILL_TARGET * 2)
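As a worked instance of the fill target arithmetic above (assuming SCOUTFS_BLOCK_LG_SHIFT is 16, i.e. 64KB metadata blocks; the shift value is an assumption here): 64MB >> 16 = 1024 blocks, so SCOUTFS_SERVER_MERGE_FILL_TARGET comes to 1024 + 4 = 1028 blocks, a dirty limit's worth of blocks plus a few spares for the allocator's own list blocks.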
/*
 * A run-time use of a pair of persistent avail/freed roots as a
@@ -130,8 +125,7 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
		       struct scoutfs_block_writer *wri,
		       struct scoutfs_alloc_root *dst,
		       struct scoutfs_alloc_root *src, u64 total,
		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks);
		       struct scoutfs_alloc_root *src, u64 total);

int scoutfs_alloc_fill_list(struct super_block *sb,
			    struct scoutfs_alloc *alloc,
@@ -152,8 +146,6 @@ int scoutfs_alloc_splice_list(struct super_block *sb,

bool scoutfs_alloc_meta_low(struct super_block *sb,
			    struct scoutfs_alloc *alloc, u32 nr);
bool scoutfs_alloc_test_flag(struct super_block *sb,
			     struct scoutfs_alloc *alloc, u32 flag);

typedef int (*scoutfs_alloc_foreach_cb_t)(struct super_block *sb, void *arg,
					  int owner, u64 id,
@@ -161,9 +153,4 @@ typedef int (*scoutfs_alloc_foreach_cb_t)(struct super_block *sb, void *arg,
int scoutfs_alloc_foreach(struct super_block *sb,
			  scoutfs_alloc_foreach_cb_t cb, void *arg);

typedef void (*scoutfs_alloc_extent_cb_t)(struct super_block *sb, void *cb_arg,
					  struct scoutfs_extent *ext);
int scoutfs_alloc_extents_cb(struct super_block *sb, struct scoutfs_alloc_root *root,
			     scoutfs_alloc_extent_cb_t cb, void *cb_arg);

#endif

kmod/src/block.c

@@ -200,9 +200,7 @@ static void block_free(struct super_block *sb, struct block_private *bp)
	else
		BUG();

	/* ok to tear down dirty blocks when forcing unmount */
	WARN_ON_ONCE(!scoutfs_forcing_unmount(sb) && !list_empty(&bp->dirty_entry));

	WARN_ON_ONCE(!list_empty(&bp->dirty_entry));
	WARN_ON_ONCE(atomic_read(&bp->refcount));
	WARN_ON_ONCE(atomic_read(&bp->io_count));
	kfree(bp);
@@ -487,9 +485,6 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
	sector_t sector;
	int ret = 0;

	if (scoutfs_forcing_unmount(sb))
		return -EIO;

	sector = bp->bl.blkno << (SCOUTFS_BLOCK_LG_SHIFT - 9);

	WARN_ON_ONCE(bp->bl.blkno == U64_MAX);
@@ -1153,7 +1148,7 @@ static void sm_block_bio_end_io(struct bio *bio, int err)
 * only layer that sees the full block buffer so we pass the calculated
 * crc to the caller for them to check in their context.
 */
static int sm_block_io(struct super_block *sb, struct block_device *bdev, int rw, u64 blkno,
static int sm_block_io(struct block_device *bdev, int rw, u64 blkno,
		       struct scoutfs_block_header *hdr, size_t len,
		       __le32 *blk_crc)
{
@@ -1165,9 +1160,6 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, int rw

	BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SM_SIZE);

	if (scoutfs_forcing_unmount(sb))
		return -EIO;

	if (WARN_ON_ONCE(len > SCOUTFS_BLOCK_SM_SIZE) ||
	    WARN_ON_ONCE(!(rw & WRITE) && !blk_crc))
		return -EINVAL;
@@ -1220,14 +1212,14 @@ int scoutfs_block_read_sm(struct super_block *sb,
			  struct scoutfs_block_header *hdr, size_t len,
			  __le32 *blk_crc)
{
	return sm_block_io(sb, bdev, READ, blkno, hdr, len, blk_crc);
	return sm_block_io(bdev, READ, blkno, hdr, len, blk_crc);
}

int scoutfs_block_write_sm(struct super_block *sb,
			   struct block_device *bdev, u64 blkno,
			   struct scoutfs_block_header *hdr, size_t len)
{
	return sm_block_io(sb, bdev, WRITE, blkno, hdr, len, NULL);
	return sm_block_io(bdev, WRITE, blkno, hdr, len, NULL);
}

int scoutfs_block_setup(struct super_block *sb)

790 kmod/src/btree.c

@@ -83,10 +83,6 @@ enum btree_walk_flags {
	BTW_ALLOC = (1 << 3),	/* allocate a new block for 0 ref, requires dirty */
	BTW_INSERT = (1 << 4),	/* walking to insert, try splitting */
	BTW_DELETE = (1 << 5),	/* walking to delete, try joining */
	BTW_PAR_RNG = (1 << 6),	/* return range through final parent */
	BTW_GET_PAR = (1 << 7),	/* get reference to final parent */
	BTW_SET_PAR = (1 << 8),	/* override reference to final parent */
	BTW_SUBTREE = (1 << 9),	/* root is parent subtree, return -ERANGE if split/join */
};

/* total length of the value payload */
@@ -108,22 +104,16 @@ static inline unsigned int item_bytes(struct scoutfs_btree_item *item)
}

/*
 * Refill blocks from their siblings when they're under 1/4 full.  This
 * puts some distance between the join threshold and the full threshold
 * for splitting.  Blocks that just split or joined need to undergo a
 * reasonable amount of item modification before they'll split or join
 * again.
 * Join blocks when they both are 1/4 full.  This puts some distance
 * between the join threshold and the full threshold for splitting.
 * Blocks that just split or joined need to undergo a reasonable amount
 * of item modification before they'll split or join again.
 */
static unsigned int join_low_watermark(void)
{
	return (SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_btree_block)) / 4;
}

static bool total_above_join_low_water(struct scoutfs_btree_block *bt)
{
	return le16_to_cpu(bt->total_item_bytes) >= join_low_watermark();
}
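A worked instance of the watermark above, assuming 64KB large blocks (a SCOUTFS_BLOCK_LG_SHIFT of 16 and a header of roughly 100 bytes are assumptions here): join_low_watermark() is about (65536 - 100) / 4, roughly 16KB, so a block only becomes a join candidate once its items occupy less than a quarter of the usable space.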
/*
|
||||
* return the integer percentages of total space the block could have
|
||||
* consumed by items that is currently consumed.
|
||||
@@ -522,7 +512,6 @@ static void create_item(struct scoutfs_btree_block *bt,
|
||||
|
||||
item->val_off = insert_value(bt, ptr_off(bt, item), val, val_len);
|
||||
item->val_len = cpu_to_le16(val_len);
|
||||
memset(item->__pad, 0, sizeof(item->__pad));
|
||||
|
||||
le16_add_cpu(&bt->total_item_bytes, item_bytes(item));
|
||||
}
|
||||
@@ -816,13 +805,12 @@ static int try_join(struct super_block *sb,
|
||||
struct scoutfs_btree_block *sib;
|
||||
struct scoutfs_block *sib_bl;
|
||||
struct scoutfs_block_ref *ref;
|
||||
const unsigned int lwm = join_low_watermark();
|
||||
unsigned int sib_tot;
|
||||
bool move_right;
|
||||
int to_move;
|
||||
int ret;
|
||||
|
||||
if (total_above_join_low_water(bt))
|
||||
if (le16_to_cpu(bt->total_item_bytes) >= join_low_watermark())
|
||||
return 0;
|
||||
|
||||
scoutfs_inc_counter(sb, btree_join);
|
||||
@@ -842,23 +830,18 @@ static int try_join(struct super_block *sb,
|
||||
return ret;
|
||||
sib = sib_bl->data;
|
||||
|
||||
/* combine if resulting block would be up to 75% full, move big chunk otherwise */
|
||||
sib_tot = le16_to_cpu(sib->total_item_bytes);
|
||||
if (sib_tot <= lwm * 2)
|
||||
sib_tot = le16_to_cpu(bt->total_item_bytes);
|
||||
if (sib_tot < join_low_watermark())
|
||||
to_move = sib_tot;
|
||||
else
|
||||
to_move = lwm;
|
||||
to_move = sib_tot - join_low_watermark();
|
||||
|
||||
/* compact to make room for over-estimate of worst case move overrun */
|
||||
if (le16_to_cpu(bt->mid_free_len) <
|
||||
(to_move + item_len_bytes(SCOUTFS_BTREE_MAX_VAL_LEN))) {
|
||||
if (le16_to_cpu(bt->mid_free_len) < to_move) {
|
||||
ret = compact_values(sb, bt);
|
||||
if (ret < 0) {
|
||||
if (ret < 0)
|
||||
scoutfs_block_put(sb, sib_bl);
|
||||
return ret;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
move_items(bt, sib, move_right, to_move);
|
||||
|
||||
/* update our parent's item */
|
||||
@@ -921,21 +904,20 @@ static bool bad_avl_node_off(__le16 node_off, int nr)
|
||||
* - call after leaf modification
|
||||
* - padding is zero
|
||||
*/
|
||||
__attribute__((unused))
|
||||
static void verify_btree_block(struct super_block *sb, char *str,
|
||||
static void verify_btree_block(struct super_block *sb,
|
||||
struct scoutfs_btree_block *bt, int level,
|
||||
bool last_ref, struct scoutfs_key *start,
|
||||
struct scoutfs_key *start,
|
||||
struct scoutfs_key *end)
|
||||
{
|
||||
__le16 *buckets = leaf_item_hash_buckets(bt);
|
||||
struct scoutfs_btree_item *item;
|
||||
struct scoutfs_avl_node *node;
|
||||
char *reason = NULL;
|
||||
int first_val = 0;
|
||||
int hashed = 0;
|
||||
int end_off;
|
||||
int tot = 0;
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
int nr;
|
||||
|
||||
if (bt->level != level) {
|
||||
@@ -974,9 +956,8 @@ static void verify_btree_block(struct super_block *sb, char *str,
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (memchr_inv(item->__pad, '\0', sizeof(item->__pad))) {
|
||||
reason = "item struct __pad isn't zero";
|
||||
goto out;
|
||||
for (j = 0; j < sizeof(item->__pad); j++) {
|
||||
WARN_ON_ONCE(item->__pad[j] != 0);
|
||||
}
|
||||
|
||||
if (scoutfs_key_compare(&item->key, start) < 0 ||
|
||||
@@ -991,29 +972,19 @@ static void verify_btree_block(struct super_block *sb, char *str,
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (level > 0 && le16_to_cpu(item->val_len) !=
|
||||
sizeof(struct scoutfs_block_ref)) {
|
||||
reason = "parent item val not sizeof ref";
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (le16_to_cpu(item->val_len) > SCOUTFS_BTREE_MAX_VAL_LEN) {
|
||||
reason = "bad item val len";
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (le16_to_cpu(item->val_off) % SCOUTFS_BTREE_VALUE_ALIGN) {
|
||||
reason = "item value not aligned";
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (((int)le16_to_cpu(item->val_off) +
|
||||
le16_to_cpu(item->val_len)) > end_off) {
|
||||
reason = "item value outside valid";
|
||||
goto out;
|
||||
}
|
||||
|
||||
tot += item_len_bytes(le16_to_cpu(item->val_len));
|
||||
tot += sizeof(struct scoutfs_btree_item) +
|
||||
le16_to_cpu(item->val_len);
|
||||
|
||||
if (item->val_len != 0) {
|
||||
first_val = min_t(int, first_val,
|
||||
@@ -1021,15 +992,6 @@ static void verify_btree_block(struct super_block *sb, char *str,
|
||||
}
|
||||
}
|
||||
|
||||
if (last_ref && level > 0 &&
|
||||
(node = scoutfs_avl_last(&bt->item_root)) != NULL) {
|
||||
item = node_item(node);
|
||||
if (scoutfs_key_compare(&item->key, end) != 0) {
|
||||
reason = "final ref item key not range end";
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; level == 0 && i < SCOUTFS_BTREE_LEAF_ITEM_HASH_NR; i++) {
|
||||
if (buckets[i] == 0)
|
||||
continue;
|
||||
@@ -1062,18 +1024,17 @@ out:
|
||||
if (!reason)
|
||||
return;
|
||||
|
||||
printk("verifying btree %s: %s\n", str, reason);
|
||||
printk("args: level %u last_ref %u start "SK_FMT" end "SK_FMT"\n",
|
||||
level, last_ref, SK_ARG(start), SK_ARG(end));
|
||||
printk("found btree block inconsistency: %s\n", reason);
|
||||
printk("start "SK_FMT" end "SK_FMT"\n", SK_ARG(start), SK_ARG(end));
|
||||
printk("calced: i %u tot %u hashed %u fv %u\n",
|
||||
i, tot, hashed, first_val);
|
||||
|
||||
printk("bt hdr: crc %x magic %x fsid %llx seq %llx blkno %llu\n",
|
||||
printk("hdr: crc %x magic %x fsid %llx seq %llx blkno %llu\n",
|
||||
le32_to_cpu(bt->hdr.crc), le32_to_cpu(bt->hdr.magic),
|
||||
le64_to_cpu(bt->hdr.fsid), le64_to_cpu(bt->hdr.seq),
|
||||
le64_to_cpu(bt->hdr.blkno));
|
||||
printk("item_root: node %u\n", le16_to_cpu(bt->item_root.node));
|
||||
printk("bt: nr %u tib %u mfl %u lvl %u\n",
|
||||
printk("nr %u tib %u mfl %u lvl %u\n",
|
||||
le16_to_cpu(bt->nr_items), le16_to_cpu(bt->total_item_bytes),
|
||||
le16_to_cpu(bt->mid_free_len), bt->level);
|
||||
|
||||
@@ -1090,92 +1051,6 @@ out:
|
||||
BUG();
|
||||
}
|
||||
|
||||
/*
|
||||
* Walk from the root to the leaf, verifying the blocks traversed.
|
||||
*/
|
||||
__attribute__((unused))
|
||||
static void verify_btree_walk(struct super_block *sb, char *str,
|
||||
struct scoutfs_btree_root *root,
|
||||
struct scoutfs_key *key)
|
||||
{
|
||||
struct scoutfs_avl_node *next_node;
|
||||
struct scoutfs_avl_node *node;
|
||||
struct scoutfs_btree_item *item;
|
||||
struct scoutfs_btree_item *prev;
|
||||
struct scoutfs_block *bl = NULL;
|
||||
struct scoutfs_btree_block *bt;
|
||||
struct scoutfs_block_ref ref;
|
||||
struct scoutfs_key start;
|
||||
struct scoutfs_key end;
|
||||
bool last_ref;
|
||||
int level;
|
||||
int ret;
|
||||
|
||||
if (root->height == 0 && root->ref.blkno != 0) {
|
||||
WARN_ONCE(1, "invalid btree root height %u blkno %llu seq %016llx\n",
|
||||
root->height, le64_to_cpu(root->ref.blkno),
|
||||
le64_to_cpu(root->ref.seq));
|
||||
return;
|
||||
}
|
||||
|
||||
if (root->height == 0)
|
||||
return;
|
||||
|
||||
scoutfs_key_set_zeros(&start);
|
||||
scoutfs_key_set_ones(&end);
|
||||
level = root->height;
|
||||
ref = root->ref;
|
||||
/* first parent last ref isn't all ones in subtrees */
|
||||
last_ref = false;
|
||||
|
||||
while(level-- > 0) {
|
||||
scoutfs_block_put(sb, bl);
|
||||
bl = NULL;
|
||||
ret = get_ref_block(sb, NULL, NULL, 0, &ref, &bl);
|
||||
if (ret) {
|
||||
printk("verifying btree %s: read error %d\n",
|
||||
str, ret);
|
||||
break;
|
||||
}
|
||||
bt = bl->data;
|
||||
|
||||
verify_btree_block(sb, str, bt, level, last_ref, &start, &end);
|
||||
|
||||
if (level == 0)
|
||||
break;
|
||||
|
||||
node = scoutfs_avl_search(&bt->item_root, cmp_key_item, key,
|
||||
NULL, NULL, &next_node, NULL);
|
||||
item = node_item(node ?: next_node);
|
||||
|
||||
if (item == NULL) {
|
||||
printk("verifying btree %s: no ref item\n", str);
|
||||
printk("root: height %u blkno %llu seq %016llx\n",
|
||||
root->height, le64_to_cpu(root->ref.blkno),
|
||||
le64_to_cpu(root->ref.seq));
|
||||
printk("walk level %u start "SK_FMT" end "SK_FMT"\n",
|
||||
level, SK_ARG(&start), SK_ARG(&end));
|
||||
|
||||
printk("block: level %u blkno %llu seq %016llx\n",
|
||||
bt->level, le64_to_cpu(bt->hdr.blkno),
|
||||
le64_to_cpu(bt->hdr.seq));
|
||||
printk("key: "SK_FMT"\n", SK_ARG(key));
|
||||
BUG();
|
||||
}
|
||||
|
||||
if ((prev = prev_item(bt, item))) {
|
||||
start = *item_key(prev);
|
||||
scoutfs_key_inc(&start);
|
||||
}
|
||||
end = *item_key(item);
|
||||
|
||||
memcpy(&ref, item_val(bt, item), sizeof(ref));
|
||||
last_ref = !next_item(bt, item);
|
||||
}
|
||||
|
||||
scoutfs_block_put(sb, bl);
|
||||
}
|
||||
|
||||
struct btree_walk_key_range {
|
||||
struct scoutfs_key start;
|
||||
struct scoutfs_key end;
|
||||
@@ -1207,8 +1082,7 @@ static int btree_walk(struct super_block *sb,
|
||||
int flags, struct scoutfs_key *key,
|
||||
unsigned int val_len,
|
||||
struct scoutfs_block **bl_ret,
|
||||
struct btree_walk_key_range *kr,
|
||||
struct scoutfs_btree_root *par_root)
|
||||
struct btree_walk_key_range *kr)
|
||||
{
|
||||
struct scoutfs_block *par_bl = NULL;
|
||||
struct scoutfs_block *bl = NULL;
|
||||
@@ -1224,15 +1098,9 @@ static int btree_walk(struct super_block *sb,
|
||||
unsigned int nr;
|
||||
int ret;
|
||||
|
||||
if (WARN_ON_ONCE((flags & BTW_DIRTY) && (!alloc || !wri)) ||
|
||||
WARN_ON_ONCE((flags & BTW_PAR_RNG) && !kr) ||
|
||||
WARN_ON_ONCE((flags & (BTW_GET_PAR|BTW_SET_PAR)) && !par_root))
|
||||
if (WARN_ON_ONCE((flags & BTW_DIRTY) && (!alloc || !wri)))
|
||||
return -EINVAL;
|
||||
|
||||
/* all ops come through walk and walk calls all reads */
|
||||
if (scoutfs_forcing_unmount(sb))
|
||||
return -EIO;
|
||||
|
||||
scoutfs_inc_counter(sb, btree_walk);
|
||||
|
||||
restart:
|
||||
@@ -1253,14 +1121,7 @@ restart:
|
||||
ret = 0;
|
||||
|
||||
if (!root->height) {
|
||||
if (flags & BTW_GET_PAR) {
|
||||
memset(par_root, 0, sizeof(*par_root));
|
||||
*root = *par_root;
|
||||
ret = 0;
|
||||
} else if (flags & BTW_SET_PAR) {
|
||||
*root = *par_root;
|
||||
ret = 0;
|
||||
} else if (!(flags & BTW_INSERT)) {
|
||||
if (!(flags & BTW_INSERT)) {
|
||||
ret = -ENOENT;
|
||||
} else {
|
||||
ret = get_ref_block(sb, alloc, wri, BTW_ALLOC | BTW_DIRTY, &root->ref, &bl);
|
||||
@@ -1279,40 +1140,14 @@ restart:
|
||||
|
||||
trace_scoutfs_btree_walk(sb, root, key, flags, level, ref);
|
||||
|
||||
/* par range set by ref to last parent block */
|
||||
if (level < 2 && (flags & BTW_PAR_RNG)) {
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
if (level < 2 && (flags & BTW_GET_PAR)) {
|
||||
par_root->ref = *ref;
|
||||
par_root->height = level + 1;
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
if (level < 2 && (flags & BTW_SET_PAR)) {
|
||||
if (ref == &root->ref) {
|
||||
/* single parent block is replaced, can shrink/grow */
|
||||
*root = *par_root;
|
||||
} else {
|
||||
/* subtree replacing one of parents must match height */
|
||||
if (par_root->height != level + 1) {
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
*ref = par_root->ref;
|
||||
}
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
ret = get_ref_block(sb, alloc, wri, flags, ref, &bl);
|
||||
if (ret)
|
||||
break;
|
||||
bt = bl->data;
|
||||
|
||||
if (0 && kr)
|
||||
verify_btree_block(sb, bt, level, &kr->start, &kr->end);
|
||||
|
||||
/* XXX more aggressive block verification, before ref updates? */
|
||||
if (bt->level != level) {
|
||||
scoutfs_corruption(sb, SC_BTREE_BLOCK_LEVEL,
|
||||
@@ -1328,17 +1163,6 @@ restart:
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* join/split won't check subtree parent root, let
|
||||
* caller know when it needs to be split/join.
|
||||
*/
|
||||
if ((flags & BTW_SUBTREE) && level == 1 &&
|
||||
(!total_above_join_low_water(bt) ||
|
||||
!mid_free_item_room(bt, sizeof(struct scoutfs_block_ref)))) {
|
||||
ret = -ERANGE;
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Splitting and joining can add or remove parents or
|
||||
* change the parent item we use to reach the child
|
||||
@@ -1464,7 +1288,7 @@ int scoutfs_btree_lookup(struct super_block *sb,
|
||||
if (WARN_ON_ONCE(iref->key))
|
||||
return -EINVAL;
|
||||
|
||||
ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, NULL, NULL);
|
||||
ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, NULL);
|
||||
if (ret == 0) {
|
||||
bt = bl->data;
|
||||
|
||||
@@ -1516,7 +1340,7 @@ int scoutfs_btree_insert(struct super_block *sb,
|
||||
return -EINVAL;
|
||||
|
||||
ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT, key,
|
||||
val_len, &bl, NULL, NULL);
|
||||
val_len, &bl, NULL);
|
||||
if (ret == 0) {
|
||||
bt = bl->data;
|
||||
|
||||
@@ -1578,7 +1402,7 @@ int scoutfs_btree_update(struct super_block *sb,
|
||||
return -EINVAL;
|
||||
|
||||
ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT, key,
|
||||
val_len, &bl, NULL, NULL);
|
||||
val_len, &bl, NULL);
|
||||
if (ret == 0) {
|
||||
bt = bl->data;
|
||||
|
||||
@@ -1620,7 +1444,7 @@ int scoutfs_btree_force(struct super_block *sb,
|
||||
return -EINVAL;
|
||||
|
||||
ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT, key,
|
||||
val_len, &bl, NULL, NULL);
|
||||
val_len, &bl, NULL);
|
||||
if (ret == 0) {
|
||||
bt = bl->data;
|
||||
|
||||
@@ -1658,7 +1482,7 @@ int scoutfs_btree_delete(struct super_block *sb,
|
||||
scoutfs_inc_counter(sb, btree_delete);
|
||||
|
||||
ret = btree_walk(sb, alloc, wri, root, BTW_DELETE | BTW_DIRTY, key,
|
||||
0, &bl, NULL, NULL);
|
||||
0, &bl, NULL);
|
||||
if (ret == 0) {
|
||||
bt = bl->data;
|
||||
|
||||
@@ -1722,7 +1546,7 @@ static int btree_iter(struct super_block *sb,struct scoutfs_btree_root *root,
|
||||
|
||||
for (;;) {
|
||||
ret = btree_walk(sb, NULL, NULL, root, flags, &walk_key,
|
||||
0, &bl, &kr, NULL);
|
||||
0, &bl, &kr);
|
||||
if (ret < 0)
|
||||
break;
|
||||
bt = bl->data;
|
||||
@@ -1795,8 +1619,7 @@ int scoutfs_btree_dirty(struct super_block *sb,
|
||||
|
||||
scoutfs_inc_counter(sb, btree_dirty);
|
||||
|
||||
ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY, key, 0, &bl,
|
||||
NULL, NULL);
|
||||
ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY, key, 0, &bl, NULL);
|
||||
if (ret == 0) {
|
||||
bt = bl->data;
|
||||
|
||||
@@ -1832,7 +1655,7 @@ int scoutfs_btree_read_items(struct super_block *sb,
|
||||
struct scoutfs_block *bl;
|
||||
int ret;
|
||||
|
||||
ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, &kr, NULL);
|
||||
ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, &kr);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
bt = bl->data;
|
||||
@@ -1887,7 +1710,7 @@ int scoutfs_btree_insert_list(struct super_block *sb,
|
||||
|
||||
while (lst) {
|
||||
ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT,
|
||||
&lst->key, lst->val_len, &bl, &kr, NULL);
|
||||
&lst->key, lst->val_len, &bl, &kr);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
bt = bl->data;
|
||||
@@ -1915,542 +1738,3 @@ int scoutfs_btree_insert_list(struct super_block *sb,
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Descend towards the leaf that would contain the key. As we arrive at
|
||||
* the last parent block, set start and end to the range of keys that
|
||||
* could be found through traversal of that last parent.
|
||||
*
|
||||
* If the tree is too short for parent blocks then the max key range
|
||||
* is returned.
|
||||
*/
|
||||
int scoutfs_btree_parent_range(struct super_block *sb,
|
||||
struct scoutfs_btree_root *root,
|
||||
struct scoutfs_key *key,
|
||||
struct scoutfs_key *start,
|
||||
struct scoutfs_key *end)
|
||||
{
|
||||
struct btree_walk_key_range kr;
|
||||
int ret;
|
||||
|
||||
ret = btree_walk(sb, NULL, NULL, root, BTW_PAR_RNG, key, 0, NULL,
|
||||
&kr, NULL);
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
|
||||
*start = kr.start;
|
||||
*end = kr.end;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the caller's root as a subtree whose ref points to the
|
||||
* last parent found as we traverse towards the leaf containing the key.
|
||||
* If the tree is too small to have multiple blocks at the final parent
|
||||
* level then the caller's root will be initialized to equal full input
|
||||
* root. If the tree is empty then the par root will also be empty.
|
||||
*/
|
||||
int scoutfs_btree_get_parent(struct super_block *sb,
|
||||
struct scoutfs_btree_root *root,
|
||||
struct scoutfs_key *key,
|
||||
struct scoutfs_btree_root *par_root)
|
||||
{
|
||||
return btree_walk(sb, NULL, NULL, root, BTW_GET_PAR, key, 0, NULL,
|
||||
NULL, par_root);
|
||||
}
|
||||
|
||||
/*
|
||||
* Dirty a path towards the leaf block containing the key. As we reach
|
||||
* the reference to the final parent block override it with the ref in
|
||||
* the caller's block. If the tree only has a single block at the final
|
||||
* parent level, or a single leaf block, then the entire tree is
|
||||
* replaced with the caller's root.
|
||||
*
|
||||
* This manages allocs and frees while dirtying blocks in the path to
|
||||
* the ref, but it doesn't account for allocating the blocks that are
|
||||
* referenced by the ref nor freeing blocks referenced by the old ref
|
||||
* that's overwritten. Keeping allocators in sync with the result of
|
||||
* the ref override is the responsibility of the caller.
|
||||
*/
|
||||
int scoutfs_btree_set_parent(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_btree_root *root,
|
||||
struct scoutfs_key *key,
|
||||
struct scoutfs_btree_root *par_root)
|
||||
{
|
||||
|
||||
trace_scoutfs_btree_set_parent(sb, root, key, par_root);
|
||||
|
||||
return btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_SET_PAR,
|
||||
key, 0, NULL, NULL, par_root);
|
||||
}
|
||||
|
||||
/*
|
||||
* Descend to the leaf, making sure that all the blocks conform to the
|
||||
* balance constraints. Blocks below the low threshold will be joined.
|
||||
* This is called to split blocks that were too large for insertions,
|
||||
* but those insertions were in a distant context and we don't bother
|
||||
* communicating the val_len back here. We just try to insert a max
|
||||
* value.
|
||||
*
|
||||
* This always dirties all the way to the leaf. It could be made more
|
||||
* efficient with more btree walk flags to walk and check for blocks
|
||||
* that need balancing, and then walks that don't dirty unless they need
|
||||
* to join/split.
|
||||
*/
|
||||
int scoutfs_btree_rebalance(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_btree_root *root,
|
||||
struct scoutfs_key *key)
|
||||
{
|
||||
return btree_walk(sb, alloc, wri, root,
|
||||
BTW_DIRTY | BTW_INSERT | BTW_DELETE,
|
||||
key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL);
|
||||
}
|
||||
|
||||
struct merge_pos {
|
||||
struct rb_node node;
|
||||
struct scoutfs_btree_root *root;
|
||||
struct scoutfs_key key;
|
||||
unsigned int val_len;
|
||||
u8 val[SCOUTFS_BTREE_MAX_VAL_LEN];
|
||||
};
|
||||
|
||||
/*
|
||||
* Find the next item in the mpos's root after its key and make sure
|
||||
* that it's in its sorted position in the rbtree. We're responsible
|
||||
* for freeing the mpos if we don't put it back in the pos_root. This
|
||||
* happens naturally naturally when its item_root has no more items to
|
||||
* merge.
|
||||
*/
|
||||
static int reset_mpos(struct super_block *sb, struct rb_root *pos_root,
|
||||
struct merge_pos *mpos, struct scoutfs_key *end,
|
||||
scoutfs_btree_merge_cmp_t merge_cmp)
|
||||
{
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
struct merge_pos *walk;
|
||||
struct rb_node *parent;
|
||||
struct rb_node **node;
|
||||
int key_cmp;
|
||||
int val_cmp;
|
||||
int ret;
|
||||
|
||||
restart:
|
||||
if (!RB_EMPTY_NODE(&mpos->node)) {
|
||||
rb_erase(&mpos->node, pos_root);
|
||||
RB_CLEAR_NODE(&mpos->node);
|
||||
}
|
||||
|
||||
/* find the next item in the root within end */
|
||||
ret = scoutfs_btree_next(sb, mpos->root, &mpos->key, &iref);
|
||||
if (ret == 0) {
|
||||
if (scoutfs_key_compare(iref.key, end) > 0) {
|
||||
ret = -ENOENT;
|
||||
} else {
|
||||
mpos->key = *iref.key;
|
||||
mpos->val_len = iref.val_len;
|
||||
memcpy(mpos->val, iref.val, iref.val_len);
|
||||
}
|
||||
scoutfs_btree_put_iref(&iref);
|
||||
}
|
||||
if (ret < 0) {
|
||||
kfree(mpos);
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
rewalk:
|
||||
/* sort merge items by key then oldest to newest */
|
||||
node = &pos_root->rb_node;
|
||||
parent = NULL;
|
||||
while (*node) {
|
||||
parent = *node;
|
||||
walk = container_of(*node, struct merge_pos, node);
|
||||
|
||||
key_cmp = scoutfs_key_compare(&mpos->key, &walk->key);
|
||||
val_cmp = merge_cmp(mpos->val, mpos->val_len,
|
||||
walk->val, walk->val_len);
|
||||
|
||||
/* drop old versions of logged keys as we discover them */
|
||||
if (key_cmp == 0) {
|
||||
scoutfs_inc_counter(sb, btree_merge_drop_old);
|
||||
if (val_cmp < 0) {
|
||||
scoutfs_key_inc(&mpos->key);
|
||||
goto restart;
|
||||
} else {
|
||||
BUG_ON(val_cmp == 0);
|
||||
rb_erase(&walk->node, pos_root);
|
||||
kfree(walk);
|
||||
goto rewalk;
|
||||
}
|
||||
}
|
||||
|
||||
if ((key_cmp ?: val_cmp) < 0)
|
||||
node = &(*node)->rb_left;
|
||||
else
|
||||
node = &(*node)->rb_right;
|
||||
}
|
||||
|
||||
rb_link_node(&mpos->node, parent, node);
|
||||
rb_insert_color(&mpos->node, pos_root);
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct merge_pos *first_mpos(struct rb_root *root)
|
||||
{
|
||||
struct rb_node *node = rb_first(root);
|
||||
if (node)
|
||||
return container_of(node, struct merge_pos, node);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Merge items from a number of read-only input roots into a writable
|
||||
* destination root. The order of the input roots doesn't matter, the
|
||||
* items are merged in sorted key order.
|
||||
*
|
||||
* The merge_cmp callback determines the order that the input items are
|
||||
* merged in. The is_del callback determines if a merging item should
|
||||
* be removed from the destination.
|
||||
*
|
||||
* subtree indicates that the destination root is in fact one of many
|
||||
* parent blocks and shouldn't be split or allowed to fall below the
|
||||
* join low water mark.
|
||||
*
|
||||
* drop_val indicates the initial length of the value that should be
|
||||
* dropped when merging items into destination items.
|
||||
*
|
||||
* -ERANGE is returned if the merge doesn't fully exhaust the range, due
|
||||
* to allocators running low or needing to join/split the parent.
|
||||
* *next_ret is set to the next key which hasn't been merged so that the
|
||||
* caller can retry with a new allocator and subtree.
|
||||
*/
|
||||
int scoutfs_btree_merge(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_key *start,
|
||||
struct scoutfs_key *end,
|
||||
struct scoutfs_key *next_ret,
|
||||
struct scoutfs_btree_root *root,
|
||||
struct list_head *inputs,
|
||||
scoutfs_btree_merge_cmp_t merge_cmp,
|
||||
scoutfs_btree_merge_is_del_t merge_is_del, bool subtree,
|
||||
int drop_val, int dirty_limit, int alloc_low)
|
||||
{
|
||||
struct scoutfs_btree_root_head *rhead;
|
||||
struct rb_root pos_root = RB_ROOT;
|
||||
struct scoutfs_btree_item *item;
|
||||
struct scoutfs_btree_block *bt;
|
||||
struct scoutfs_block *bl = NULL;
|
||||
struct btree_walk_key_range kr;
|
||||
struct scoutfs_avl_node *par;
|
||||
struct merge_pos *mpos;
|
||||
struct merge_pos *tmp;
|
||||
int walk_val_len;
|
||||
int walk_flags;
|
||||
bool is_del;
|
||||
int cmp;
|
||||
int ret;
|
||||
|
||||
trace_scoutfs_btree_merge(sb, root, start, end);
|
||||
	scoutfs_inc_counter(sb, btree_merge);

	list_for_each_entry(rhead, inputs, head) {
		mpos = kmalloc(sizeof(*mpos), GFP_NOFS);
		if (!mpos) {
			ret = -ENOMEM;
			goto out;
		}

		RB_CLEAR_NODE(&mpos->node);
		mpos->key = *start;
		mpos->root = &rhead->root;

		ret = reset_mpos(sb, &pos_root, mpos, end, merge_cmp);
		if (ret < 0)
			goto out;
	}

	walk_flags = BTW_DIRTY;
	if (subtree)
		walk_flags |= BTW_SUBTREE;
	walk_val_len = 0;

	while ((mpos = first_mpos(&pos_root))) {

		if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) {
			scoutfs_inc_counter(sb, btree_merge_dirty_limit);
			ret = -ERANGE;
			*next_ret = mpos->key;
			goto out;
		}

		if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) {
			scoutfs_inc_counter(sb, btree_merge_alloc_low);
			ret = -ERANGE;
			*next_ret = mpos->key;
			goto out;
		}

		scoutfs_block_put(sb, bl);
		bl = NULL;
		ret = btree_walk(sb, alloc, wri, root, walk_flags,
				 &mpos->key, walk_val_len, &bl, &kr, NULL);
		if (ret < 0) {
			if (ret == -ERANGE)
				*next_ret = mpos->key;
			goto out;
		}
		bt = bl->data;
		scoutfs_inc_counter(sb, btree_merge_walk);

		for (; mpos; mpos = first_mpos(&pos_root)) {

			/* val must have at least what we need to drop */
			if (mpos->val_len < drop_val) {
				ret = -EIO;
				goto out;
			}

			/* walk to new leaf if we exceed parent ref key */
			if (scoutfs_key_compare(&mpos->key, &kr.end) > 0)
				break;

			/* see if there's an existing item */
			item = leaf_item_hash_search(sb, bt, &mpos->key);
			is_del = merge_is_del(mpos->val, mpos->val_len);

			trace_scoutfs_btree_merge_items(sb, mpos->root,
					&mpos->key, mpos->val_len,
					item ? root : NULL,
					item ? item_key(item) : NULL,
					item ? item_val_len(item) : 0, is_del);

			/* rewalk and split if ins/update needs room */
			if (!is_del && !mid_free_item_room(bt, mpos->val_len)) {
				walk_flags |= BTW_INSERT;
				walk_val_len = mpos->val_len;
				break;
			}

			/* insert missing non-deletion merge items */
			if (!item && !is_del) {
				scoutfs_avl_search(&bt->item_root,
						   cmp_key_item, &mpos->key,
						   &cmp, &par, NULL, NULL);
				create_item(bt, &mpos->key,
					    mpos->val + drop_val,
					    mpos->val_len - drop_val, par, cmp);
				scoutfs_inc_counter(sb, btree_merge_insert);
			}

			/* update existing items */
			if (item && !is_del) {
				update_item_value(bt, item,
						  mpos->val + drop_val,
						  mpos->val_len - drop_val);
				scoutfs_inc_counter(sb, btree_merge_update);
			}

			/* delete if merge item was deletion */
			if (item && is_del) {
				/* rewalk and join if non-root falls under low water mark */
				if (root->ref.blkno != bt->hdr.blkno &&
				    !total_above_join_low_water(bt)) {
					walk_flags |= BTW_DELETE;
					break;
				}
				delete_item(bt, item, NULL);
				scoutfs_inc_counter(sb, btree_merge_delete);
			}

			/* reset walk args now that we're not split/join */
			walk_flags &= ~(BTW_INSERT | BTW_DELETE);
			walk_val_len = 0;

			/* finished with this merge item */
			scoutfs_key_inc(&mpos->key);
			ret = reset_mpos(sb, &pos_root, mpos, end, merge_cmp);
			if (ret < 0)
				goto out;
			mpos = NULL;
		}
	}

	ret = 0;
out:
	scoutfs_block_put(sb, bl);
	rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) {
		kfree(mpos);
	}

	return ret;
}
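
/*
 * A sketch, not part of this diff, of how a caller might drive the
 * merge above to completion.  Returning -ERANGE with *next_ret set
 * means the merge stopped early at the dirty byte limit or a low meta
 * allocator; the caller persists its dirty blocks and resumes from the
 * returned key.  The commit_dirty_blocks() helper, the example_
 * callbacks, and the limit constants are assumed names, not part of
 * the codebase.
 */
#define EXAMPLE_DIRTY_LIMIT	(8 * 1024 * 1024)	/* assumed */
#define EXAMPLE_ALLOC_LOW	10			/* assumed */

extern int example_merge_cmp(void *a_val, int a_val_len, void *b_val, int b_val_len);
extern bool example_merge_is_del(void *val, int val_len);
extern int commit_dirty_blocks(struct super_block *sb, struct scoutfs_block_writer *wri);

static int merge_until_done(struct super_block *sb, struct scoutfs_alloc *alloc,
			    struct scoutfs_block_writer *wri,
			    struct scoutfs_btree_root *root,
			    struct list_head *inputs,
			    struct scoutfs_key *start, struct scoutfs_key *end)
{
	struct scoutfs_key pos = *start;
	struct scoutfs_key next;
	int ret;

	for (;;) {
		ret = scoutfs_btree_merge(sb, alloc, wri, &pos, end, &next,
					  root, inputs, example_merge_cmp,
					  example_merge_is_del, false,
					  sizeof(struct scoutfs_log_item_value),
					  EXAMPLE_DIRTY_LIMIT,
					  EXAMPLE_ALLOC_LOW);
		if (ret != -ERANGE)
			return ret;

		/* partial progress: persist dirty blocks, then resume */
		ret = commit_dirty_blocks(sb, wri);
		if (ret < 0)
			return ret;
		pos = next;
	}
}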

/*
 * Free all the blocks referenced by a btree. The btree is only read;
 * this does not update the blocks as it frees. The caller ensures that
 * these btrees aren't being modified.
 *
 * The caller's key tracks which blocks have been freed. It must be
 * initialized to zeros before the first call to start freeing blocks.
 * Once a block is freed the key is updated such that the freed block
 * will not be read again.
 *
 * Returns 0 when progress has been made successfully, which includes
 * partial progress. The key is set to all ones once we've freed all
 * the blocks.
 *
 * This works by descending to the last parent block and freeing all its
 * leaf blocks without reading them. As it descends it remembers the
 * number of parent blocks which were traversed through their final
 * child ref. If we free all the leaf blocks then all these parent
 * blocks are no longer needed and can be freed. The caller's key is
 * updated to past the subtree that we just freed and we retry the
 * descent from the root through the next set of parents to the next set
 * of leaf blocks to free.
 */
int scoutfs_btree_free_blocks(struct super_block *sb,
			      struct scoutfs_alloc *alloc,
			      struct scoutfs_block_writer *wri,
			      struct scoutfs_key *key,
			      struct scoutfs_btree_root *root, int alloc_low)
{
	u64 blknos[SCOUTFS_BTREE_MAX_HEIGHT];
	struct scoutfs_block *bl = NULL;
	struct scoutfs_btree_item *item;
	struct scoutfs_btree_block *bt;
	struct scoutfs_block_ref ref;
	struct scoutfs_avl_node *node;
	struct scoutfs_avl_node *next;
	struct scoutfs_key par_next;
	int nr_par;
	int level;
	int ret;
	int i;

	if (WARN_ON_ONCE(root->height > ARRAY_SIZE(blknos)))
		return -EIO; /* XXX corruption */

	if (root->height == 0) {
		scoutfs_key_set_ones(key);
		return 0;
	}

	if (scoutfs_key_is_ones(key))
		return 0;

	/* just free a single leaf block */
	if (root->height == 1) {
		ret = scoutfs_free_meta(sb, alloc, wri,
					le64_to_cpu(root->ref.blkno));
		if (ret == 0) {
			trace_scoutfs_btree_free_blocks_single(sb, root,
					le64_to_cpu(root->ref.blkno));
			scoutfs_key_set_ones(key);
		}
		goto out;
	}

	for (;;) {
		/* start the walk at the root block */
		level = root->height - 1;
		ref = root->ref;
		scoutfs_key_set_ones(&par_next);
		nr_par = 0;

		/* read blocks until we read the last parent */
		for (;;) {
			scoutfs_block_put(sb, bl);
			bl = NULL;
			ret = get_ref_block(sb, alloc, wri, 0, &ref, &bl);
			if (ret < 0)
				goto out;
			bt = bl->data;

			node = scoutfs_avl_search(&bt->item_root, cmp_key_item,
						  key, NULL, NULL, &next, NULL);
			if (node == NULL)
				node = next;

			/* should never descend into parent with no more refs */
			if (WARN_ON_ONCE(node == NULL)) {
				ret = -EIO;
				goto out;
			}

			/* we'll free refs in the last parent */
			if (level == 1)
				break;

			item = node_item(node);
			next = scoutfs_avl_next(&bt->item_root, node);
			if (next) {
				/* didn't take last ref, still need parents */
				nr_par = 0;
				par_next = *item_key(item);
				scoutfs_key_inc(&par_next);
			} else {
				/* final ref, could free after all leaves */
				blknos[nr_par++] = le64_to_cpu(bt->hdr.blkno);
			}

			memcpy(&ref, item_val(bt, item), sizeof(ref));
			level--;
		}

		/* free all leaf block refs in last parent */
		while (node) {

			/* make sure we can always free parents after leaves */
			if (scoutfs_alloc_meta_low(sb, alloc,
						   alloc_low + nr_par + 1)) {
				ret = 0;
				goto out;
			}

			item = node_item(node);
			memcpy(&ref, item_val(bt, item), sizeof(ref));

			trace_scoutfs_btree_free_blocks_leaf(sb, root,
					le64_to_cpu(ref.blkno));
			ret = scoutfs_free_meta(sb, alloc, wri,
						le64_to_cpu(ref.blkno));
			if (ret < 0)
				goto out;

			node = scoutfs_avl_next(&bt->item_root, node);
			if (node) {
				/* done with keys in child we just freed */
				*key = *item_key(item);
				scoutfs_key_inc(key);
			}
		}

		/* now that leaves are freed, free any empty parents */
		for (i = 0; i < nr_par; i++) {
			trace_scoutfs_btree_free_blocks_parent(sb, root,
							       blknos[i]);
			ret = scoutfs_free_meta(sb, alloc, wri, blknos[i]);
			BUG_ON(ret); /* checked meta low, freed should fit */
		}

		/* restart walk past the subtree we just freed */
		*key = par_next;

		/* but done if we just freed all parents down right spine */
		if (scoutfs_key_is_ones(&par_next)) {
			ret = 0;
			goto out;
		}
	}

out:
	scoutfs_block_put(sb, bl);
	return ret;
}
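
/*
 * A sketch, not part of this diff, of the calling convention described
 * above: the position key starts at zeros and the function is called
 * repeatedly, committing dirty blocks between calls, until the key
 * reads back as all ones.  commit_dirty_blocks() and the alloc_low
 * value of 16 are assumptions for illustration.
 */
static int free_whole_btree(struct super_block *sb, struct scoutfs_alloc *alloc,
			    struct scoutfs_block_writer *wri,
			    struct scoutfs_btree_root *root)
{
	struct scoutfs_key key;
	int ret;

	memset(&key, 0, sizeof(key));

	while (!scoutfs_key_is_ones(&key)) {
		ret = scoutfs_btree_free_blocks(sb, alloc, wri, &key, root, 16);
		if (ret < 0)
			return ret;

		/* write freed state so the next pass has allocator room */
		ret = commit_dirty_blocks(sb, wri);
		if (ret < 0)
			return ret;
	}

	return 0;
}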

@@ -82,58 +82,6 @@ int scoutfs_btree_insert_list(struct super_block *sb,
			      struct scoutfs_btree_root *root,
			      struct scoutfs_btree_item_list *lst);

int scoutfs_btree_parent_range(struct super_block *sb,
			       struct scoutfs_btree_root *root,
			       struct scoutfs_key *key,
			       struct scoutfs_key *start,
			       struct scoutfs_key *end);
int scoutfs_btree_get_parent(struct super_block *sb,
			     struct scoutfs_btree_root *root,
			     struct scoutfs_key *key,
			     struct scoutfs_btree_root *par_root);
int scoutfs_btree_set_parent(struct super_block *sb,
			     struct scoutfs_alloc *alloc,
			     struct scoutfs_block_writer *wri,
			     struct scoutfs_btree_root *root,
			     struct scoutfs_key *key,
			     struct scoutfs_btree_root *par_root);
int scoutfs_btree_rebalance(struct super_block *sb,
			    struct scoutfs_alloc *alloc,
			    struct scoutfs_block_writer *wri,
			    struct scoutfs_btree_root *root,
			    struct scoutfs_key *key);

/* merge input is a list of roots */
struct scoutfs_btree_root_head {
	struct list_head head;
	struct scoutfs_btree_root root;
};
/*
 * Compare the values of merge input items whose keys are equal to
 * determine their merge order.
 */
typedef int (*scoutfs_btree_merge_cmp_t)(void *a_val, int a_val_len,
					 void *b_val, int b_val_len);
/* whether merging item should be removed from destination */
typedef bool (*scoutfs_btree_merge_is_del_t)(void *val, int val_len);
int scoutfs_btree_merge(struct super_block *sb,
			struct scoutfs_alloc *alloc,
			struct scoutfs_block_writer *wri,
			struct scoutfs_key *start,
			struct scoutfs_key *end,
			struct scoutfs_key *next_ret,
			struct scoutfs_btree_root *root,
			struct list_head *input_list,
			scoutfs_btree_merge_cmp_t merge_cmp,
			scoutfs_btree_merge_is_del_t merge_is_del, bool subtree,
			int drop_val, int dirty_limit, int alloc_low);
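
/*
 * A sketch, not part of this header, of the shape the two merge
 * callbacks typically take.  It assumes values that begin with a
 * scoutfs_log_item_value, as the log merge in forest.c further below
 * uses: the seq field orders duplicate keys and a flag marks
 * deletions.  forest.c provides the real implementations.
 */
static int example_merge_cmp(void *a_val, int a_val_len,
			     void *b_val, int b_val_len)
{
	struct scoutfs_log_item_value *a = a_val;
	struct scoutfs_log_item_value *b = b_val;

	/* newer (greater seq) items win when keys collide */
	if (le64_to_cpu(a->seq) < le64_to_cpu(b->seq))
		return -1;
	if (le64_to_cpu(a->seq) > le64_to_cpu(b->seq))
		return 1;
	return 0;
}

static bool example_merge_is_del(void *val, int val_len)
{
	struct scoutfs_log_item_value *liv = val;

	return !!(liv->flags & SCOUTFS_LOG_ITEM_FLAG_DELETION);
}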

int scoutfs_btree_free_blocks(struct super_block *sb,
			      struct scoutfs_alloc *alloc,
			      struct scoutfs_block_writer *wri,
			      struct scoutfs_key *key,
			      struct scoutfs_btree_root *root, int alloc_low);

void scoutfs_btree_put_iref(struct scoutfs_btree_item_ref *iref);

#endif

@@ -48,7 +48,6 @@ struct client_info {

	struct workqueue_struct *workq;
	struct delayed_work connect_dwork;
	unsigned long connect_delay_jiffies;

	u64 server_term;

@@ -217,26 +216,6 @@ int scoutfs_client_srch_commit_compact(struct super_block *sb,
				res, sizeof(*res), NULL, 0);
}

int scoutfs_client_get_log_merge(struct super_block *sb,
				 struct scoutfs_log_merge_request *req)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;

	return scoutfs_net_sync_request(sb, client->conn,
					SCOUTFS_NET_CMD_GET_LOG_MERGE,
					NULL, 0, req, sizeof(*req));
}

int scoutfs_client_commit_log_merge(struct super_block *sb,
				    struct scoutfs_log_merge_complete *comp)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;

	return scoutfs_net_sync_request(sb, client->conn,
					SCOUTFS_NET_CMD_COMMIT_LOG_MERGE,
					comp, sizeof(*comp), NULL, 0);
}

int scoutfs_client_send_omap_response(struct super_block *sb, u64 id,
				      struct scoutfs_open_ino_map *map)
{
@@ -270,33 +249,6 @@ int scoutfs_client_open_ino_map(struct super_block *sb, u64 group_nr,
				&args, sizeof(args), map, sizeof(*map));
}

/* The client is asking the server for the current volume options */
int scoutfs_client_get_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;

	return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_GET_VOLOPT,
					NULL, 0, volopt, sizeof(*volopt));
}

/* The client is asking the server to update volume options */
int scoutfs_client_set_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;

	return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_SET_VOLOPT,
					volopt, sizeof(*volopt), NULL, 0);
}

/* The client is asking the server to clear volume options */
int scoutfs_client_clear_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;

	return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_CLEAR_VOLOPT,
					volopt, sizeof(*volopt), NULL, 0);
}

/* The client is receiving an invalidation request from the server */
static int client_lock(struct super_block *sb,
		       struct scoutfs_net_connection *conn, u8 cmd, u64 id,
@@ -370,7 +322,6 @@ static int client_greeting(struct super_block *sb,
	scoutfs_net_client_greeting(sb, conn, new_server);

	client->server_term = le64_to_cpu(gr->server_term);
	client->connect_delay_jiffies = 0;
	ret = 0;
out:
	return ret;
@@ -420,20 +371,6 @@ out:
	return ret;
}

/*
 * If we're not seeing successful connections we want to back off. Each
 * connection attempt starts by setting a long connection work delay.
 * We only set a shorter delay if we see a greeting response from the
 * server. At that point we'll try to immediately reconnect if the
 * connection is broken.
 */
static void queue_connect_dwork(struct super_block *sb, struct client_info *client)
{
	if (!atomic_read(&client->shutting_down) && !scoutfs_forcing_unmount(sb))
		queue_delayed_work(client->workq, &client->connect_dwork,
				   client->connect_delay_jiffies);
}

/*
 * This work is responsible for maintaining a connection from the client
 * to the server. It's queued on mount and disconnect and we requeue
@@ -473,9 +410,6 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
		goto out;
	}

	/* always wait a bit until a greeting response sets a lower delay */
	client->connect_delay_jiffies = msecs_to_jiffies(CLIENT_CONNECT_DELAY_MS);

	ret = scoutfs_quorum_server_sin(sb, &sin);
	if (ret < 0)
		goto out;
@@ -503,8 +437,11 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
	if (ret)
		scoutfs_net_shutdown(sb, client->conn);
out:
	if (ret)
		queue_connect_dwork(sb, client);

	/* always have a small delay before retrying to avoid storms */
	if (ret && !atomic_read(&client->shutting_down))
		queue_delayed_work(client->workq, &client->connect_dwork,
				   msecs_to_jiffies(CLIENT_CONNECT_DELAY_MS));
}

static scoutfs_net_request_t client_req_funcs[] = {
@@ -523,7 +460,8 @@ static void client_notify_down(struct super_block *sb,
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;

	queue_connect_dwork(sb, client);
	if (!atomic_read(&client->shutting_down))
		queue_delayed_work(client->workq, &client->connect_dwork, 0);
}

int scoutfs_client_setup(struct super_block *sb)
@@ -558,7 +496,7 @@ int scoutfs_client_setup(struct super_block *sb)
		goto out;
	}

	queue_connect_dwork(sb, client);
	queue_delayed_work(client->workq, &client->connect_dwork, 0);
	ret = 0;

out:
@@ -615,7 +553,7 @@ void scoutfs_client_destroy(struct super_block *sb)
	if (client == NULL)
		return;

	if (client->server_term != 0 && !scoutfs_forcing_unmount(sb)) {
	if (client->server_term != 0) {
		client->sending_farewell = true;
		ret = scoutfs_net_submit_request(sb, client->conn,
						 SCOUTFS_NET_CMD_FAREWELL,

@@ -22,17 +22,10 @@ int scoutfs_client_srch_get_compact(struct super_block *sb,
				    struct scoutfs_srch_compact *sc);
int scoutfs_client_srch_commit_compact(struct super_block *sb,
				       struct scoutfs_srch_compact *res);
int scoutfs_client_get_log_merge(struct super_block *sb,
				 struct scoutfs_log_merge_request *req);
int scoutfs_client_commit_log_merge(struct super_block *sb,
				    struct scoutfs_log_merge_complete *comp);
int scoutfs_client_send_omap_response(struct super_block *sb, u64 id,
				      struct scoutfs_open_ino_map *map);
int scoutfs_client_open_ino_map(struct super_block *sb, u64 group_nr,
				struct scoutfs_open_ino_map *map);
int scoutfs_client_get_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
int scoutfs_client_set_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
int scoutfs_client_clear_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);

int scoutfs_client_setup(struct super_block *sb);
void scoutfs_client_destroy(struct super_block *sb);

@@ -44,14 +44,6 @@
	EXPAND_COUNTER(btree_insert) \
	EXPAND_COUNTER(btree_leaf_item_hash_search) \
	EXPAND_COUNTER(btree_lookup) \
	EXPAND_COUNTER(btree_merge) \
	EXPAND_COUNTER(btree_merge_alloc_low) \
	EXPAND_COUNTER(btree_merge_delete) \
	EXPAND_COUNTER(btree_merge_dirty_limit) \
	EXPAND_COUNTER(btree_merge_drop_old) \
	EXPAND_COUNTER(btree_merge_insert) \
	EXPAND_COUNTER(btree_merge_update) \
	EXPAND_COUNTER(btree_merge_walk) \
	EXPAND_COUNTER(btree_next) \
	EXPAND_COUNTER(btree_prev) \
	EXPAND_COUNTER(btree_split) \
@@ -88,7 +80,6 @@
	EXPAND_COUNTER(forest_read_items) \
	EXPAND_COUNTER(forest_roots_next_hint) \
	EXPAND_COUNTER(forest_set_bloom_bits) \
	EXPAND_COUNTER(inode_evict_intr) \
	EXPAND_COUNTER(item_clear_dirty) \
	EXPAND_COUNTER(item_create) \
	EXPAND_COUNTER(item_delete) \
@@ -152,12 +143,6 @@
	EXPAND_COUNTER(net_recv_invalid_message) \
	EXPAND_COUNTER(net_recv_messages) \
	EXPAND_COUNTER(net_unknown_request) \
	EXPAND_COUNTER(orphan_scan) \
	EXPAND_COUNTER(orphan_scan_cached) \
	EXPAND_COUNTER(orphan_scan_error) \
	EXPAND_COUNTER(orphan_scan_item) \
	EXPAND_COUNTER(orphan_scan_omap_set) \
	EXPAND_COUNTER(orphan_scan_read) \
	EXPAND_COUNTER(quorum_elected) \
	EXPAND_COUNTER(quorum_fence_error) \
	EXPAND_COUNTER(quorum_fence_leader) \

@@ -312,9 +312,10 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,

	while (iblock <= last) {
		if (inode)
			ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
			ret = scoutfs_inode_index_lock_hold(inode, &ind_locks,
							    true);
		else
			ret = scoutfs_hold_trans(sb, false);
			ret = scoutfs_hold_trans(sb);
		if (ret)
			break;

@@ -755,7 +756,8 @@ retry:
		ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
		      scoutfs_inode_index_prepare(sb, &wbd->ind_locks, inode,
						  true) ?:
		      scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks, ind_seq, true);
		      scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks,
							ind_seq);
	} while (ret > 0);
	if (ret < 0)
		goto out;
@@ -1008,7 +1010,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)

	while (iblock <= last) {

		ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
		ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
		if (ret)
			goto out;

@@ -1084,7 +1086,7 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
	}

	/* we're updating meta_seq with offline block count */
	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
	if (ret < 0)
		goto out;

@@ -1236,7 +1238,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
		ret = scoutfs_inode_index_start(sb, &seq) ?:
		      scoutfs_inode_index_prepare(sb, &locks, from, true) ?:
		      scoutfs_inode_index_prepare(sb, &locks, to, true) ?:
		      scoutfs_inode_index_try_lock_hold(sb, &locks, seq, false);
		      scoutfs_inode_index_try_lock_hold(sb, &locks, seq);
		if (ret > 0)
			continue;
		if (ret < 0)
@@ -1842,17 +1844,13 @@ int scoutfs_data_prepare_commit(struct super_block *sb)
	return ret;
}

/*
 * Return true if the data allocator is lower than the caller's
 * requirement and we haven't been told by the server that we're out of
 * free extents.
 */
bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks)
u64 scoutfs_data_alloc_free_bytes(struct super_block *sb)
{
	DECLARE_DATA_INFO(sb, datinf);

	return (scoutfs_dalloc_total_len(&datinf->dalloc) < blocks) &&
	       !(le32_to_cpu(datinf->dalloc.root.flags) & SCOUTFS_ALLOC_FLAG_LOW);
	return scoutfs_dalloc_total_len(&datinf->dalloc) <<
	       SCOUTFS_BLOCK_SM_SHIFT;

}

int scoutfs_data_setup(struct super_block *sb)

@@ -86,7 +86,7 @@ void scoutfs_data_init_btrees(struct super_block *sb,
void scoutfs_data_get_btrees(struct super_block *sb,
			     struct scoutfs_log_trees *lt);
int scoutfs_data_prepare_commit(struct super_block *sb);
bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks);
u64 scoutfs_data_alloc_free_bytes(struct super_block *sb);

int scoutfs_data_setup(struct super_block *sb);
void scoutfs_data_destroy(struct super_block *sb);

118
kmod/src/dir.c
@@ -669,7 +669,6 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
				      umode_t mode, dev_t rdev,
				      struct scoutfs_lock **dir_lock,
				      struct scoutfs_lock **inode_lock,
				      struct scoutfs_lock **orph_lock,
				      struct list_head *ind_locks)
{
	struct super_block *sb = dir->i_sb;
@@ -702,17 +701,11 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
	if (ret)
		goto out_unlock;

	if (orph_lock) {
		ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, orph_lock);
		if (ret < 0)
			goto out_unlock;
	}

retry:
	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
	      scoutfs_inode_index_prepare(sb, ind_locks, dir, true) ?:
	      scoutfs_inode_index_prepare_ino(sb, ind_locks, ino, mode) ?:
	      scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq, true);
	      scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq);
	if (ret > 0)
		goto retry;
	if (ret)
@@ -732,13 +725,9 @@ out_unlock:
	if (ret) {
		scoutfs_inode_index_unlock(sb, ind_locks);
		scoutfs_unlock(sb, *dir_lock, SCOUTFS_LOCK_WRITE);
		*dir_lock = NULL;
		scoutfs_unlock(sb, *inode_lock, SCOUTFS_LOCK_WRITE);
		*dir_lock = NULL;
		*inode_lock = NULL;
		if (orph_lock) {
			scoutfs_unlock(sb, *orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
			*orph_lock = NULL;
		}

		inode = ERR_PTR(ret);
	}
@@ -753,7 +742,6 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
	struct inode *inode = NULL;
	struct scoutfs_lock *dir_lock = NULL;
	struct scoutfs_lock *inode_lock = NULL;
	struct scoutfs_inode_info *si;
	LIST_HEAD(ind_locks);
	u64 hash;
	u64 pos;
@@ -764,10 +752,9 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,

	hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
	inode = lock_hold_create(dir, dentry, mode, rdev,
				 &dir_lock, &inode_lock, NULL, &ind_locks);
				 &dir_lock, &inode_lock, &ind_locks);
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	si = SCOUTFS_I(inode);

	pos = SCOUTFS_I(dir)->next_readdir_pos++;

@@ -783,7 +770,6 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
	i_size_write(dir, i_size_read(dir) + dentry->d_name.len);
	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
	inode->i_mtime = inode->i_atime = inode->i_ctime = dir->i_mtime;
	si->crtime = inode->i_mtime;

	if (S_ISDIR(mode)) {
		inc_nlink(inode);
@@ -827,15 +813,13 @@ static int scoutfs_link(struct dentry *old_dentry,
	struct super_block *sb = dir->i_sb;
	struct scoutfs_lock *dir_lock;
	struct scoutfs_lock *inode_lock = NULL;
	struct scoutfs_lock *orph_lock = NULL;
	LIST_HEAD(ind_locks);
	bool del_orphan = false;
	bool del_orphan;
	u64 dir_size;
	u64 ind_seq;
	u64 hash;
	u64 pos;
	int ret;
	int err;

	hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);

@@ -859,20 +843,13 @@ static int scoutfs_link(struct dentry *old_dentry,
		goto out_unlock;

	dir_size = i_size_read(dir) + dentry->d_name.len;

	if (inode->i_nlink == 0) {
		del_orphan = true;
		ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
					  &orph_lock);
		if (ret < 0)
			goto out_unlock;
	}
	del_orphan = (inode->i_nlink == 0);

retry:
	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
	      scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
	if (ret > 0)
		goto retry;
	if (ret)
@@ -883,7 +860,7 @@ retry:
		goto out;

	if (del_orphan) {
		ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock);
		ret = scoutfs_orphan_dirty(sb, scoutfs_ino(inode));
		if (ret)
			goto out;
	}
@@ -894,11 +871,8 @@ retry:
			      dentry->d_name.name, dentry->d_name.len,
			      scoutfs_ino(inode), inode->i_mode, dir_lock,
			      inode_lock);
	if (ret) {
		err = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
		WARN_ON_ONCE(err); /* no orphan, might not scan and delete after crash */
		if (ret)
			goto out;
	}
	update_dentry_info(sb, dentry, hash, pos, dir_lock);

	i_size_write(dir, dir_size);
@@ -906,6 +880,11 @@ retry:
	inode->i_ctime = dir->i_mtime;
	inc_nlink(inode);

	if (del_orphan) {
		ret = scoutfs_orphan_delete(sb, scoutfs_ino(inode));
		WARN_ON_ONCE(ret);
	}

	scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
	scoutfs_update_inode_item(dir, dir_lock, &ind_locks);

@@ -917,8 +896,6 @@ out_unlock:
	scoutfs_inode_index_unlock(sb, &ind_locks);
	scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);

	return ret;
}

@@ -943,7 +920,6 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
	struct inode *inode = dentry->d_inode;
	struct timespec ts = current_kernel_time();
	struct scoutfs_lock *inode_lock = NULL;
	struct scoutfs_lock *orph_lock = NULL;
	struct scoutfs_lock *dir_lock = NULL;
	LIST_HEAD(ind_locks);
	u64 ind_seq;
@@ -961,36 +937,32 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
		goto unlock;
	}

	if (should_orphan(inode)) {
		ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
					  &orph_lock);
		if (ret < 0)
			goto unlock;
	}

retry:
	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
	      scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
	if (ret > 0)
		goto retry;
	if (ret)
		goto unlock;

	if (should_orphan(inode)) {
		ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
		if (ret < 0)
			goto out;
	}

	ret = del_entry_items(sb, scoutfs_ino(dir), dentry_info_hash(dentry),
			      dentry_info_pos(dentry), scoutfs_ino(inode),
			      dir_lock, inode_lock);
	if (ret) {
		ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock);
		WARN_ON_ONCE(ret); /* should have been dirty */
		if (ret)
			goto out;

	if (should_orphan(inode)) {
		/*
		 * Insert the orphan item before we modify any inode
		 * metadata so we can gracefully exit should it
		 * fail.
		 */
		ret = scoutfs_orphan_inode(inode);
		WARN_ON_ONCE(ret); /* XXX returning error but items deleted */
		if (ret)
			goto out;
	}

	dir->i_ctime = ts;
@@ -1012,7 +984,6 @@ unlock:
	scoutfs_inode_index_unlock(sb, &ind_locks);
	scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);

	return ret;
}
@@ -1188,7 +1159,6 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
	struct inode *inode = NULL;
	struct scoutfs_lock *dir_lock = NULL;
	struct scoutfs_lock *inode_lock = NULL;
	struct scoutfs_inode_info *si;
	LIST_HEAD(ind_locks);
	u64 hash;
	u64 pos;
@@ -1206,10 +1176,9 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
		return ret;

	inode = lock_hold_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0,
				 &dir_lock, &inode_lock, NULL, &ind_locks);
				 &dir_lock, &inode_lock, &ind_locks);
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	si = SCOUTFS_I(inode);

	ret = symlink_item_ops(sb, SYM_CREATE, scoutfs_ino(inode), inode_lock,
			       symname, name_len);
@@ -1231,7 +1200,6 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
	dir->i_mtime = dir->i_ctime = CURRENT_TIME;

	inode->i_ctime = dir->i_mtime;
	si->crtime = inode->i_ctime;
	i_size_write(inode, name_len);

	scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
@@ -1567,7 +1535,6 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry,
	struct scoutfs_lock *new_dir_lock = NULL;
	struct scoutfs_lock *old_inode_lock = NULL;
	struct scoutfs_lock *new_inode_lock = NULL;
	struct scoutfs_lock *orph_lock = NULL;
	struct timespec now;
	bool ins_new = false;
	bool del_new = false;
@@ -1632,13 +1599,6 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry,
	if (ret)
		goto out_unlock;

	if (should_orphan(new_inode)) {
		ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(new_inode),
					  &orph_lock);
		if (ret < 0)
			goto out_unlock;
	}

retry:
	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
	      scoutfs_inode_index_prepare(sb, &ind_locks, old_dir, false) ?:
@@ -1647,7 +1607,7 @@ retry:
	      scoutfs_inode_index_prepare(sb, &ind_locks, new_dir, false)) ?:
	      (new_inode == NULL ? 0 :
	      scoutfs_inode_index_prepare(sb, &ind_locks, new_inode, false)) ?:
	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
	if (ret > 0)
		goto retry;
	if (ret)
@@ -1698,7 +1658,7 @@ retry:
	ins_old = true;

	if (should_orphan(new_inode)) {
		ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(new_inode), orph_lock);
		ret = scoutfs_orphan_inode(new_inode);
		if (ret)
			goto out;
	}
@@ -1802,7 +1762,6 @@ out_unlock:
	scoutfs_unlock(sb, old_dir_lock, SCOUTFS_LOCK_WRITE);
	scoutfs_unlock(sb, new_dir_lock, SCOUTFS_LOCK_WRITE);
	scoutfs_unlock(sb, rename_lock, SCOUTFS_LOCK_WRITE);
	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);

	return ret;
}
@@ -1822,8 +1781,6 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
	struct inode *inode = NULL;
	struct scoutfs_lock *dir_lock = NULL;
	struct scoutfs_lock *inode_lock = NULL;
	struct scoutfs_lock *orph_lock = NULL;
	struct scoutfs_inode_info *si;
	LIST_HEAD(ind_locks);
	int ret;

@@ -1831,34 +1788,25 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
		return -ENAMETOOLONG;

	inode = lock_hold_create(dir, dentry, mode, 0,
				 &dir_lock, &inode_lock, &orph_lock, &ind_locks);
				 &dir_lock, &inode_lock, &ind_locks);
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	si = SCOUTFS_I(inode);

	ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
	if (ret < 0) {
		iput(inode);
		goto out; /* XXX returning error but items created */
	}

	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
	si->crtime = inode->i_mtime;
	insert_inode_hash(inode);
	ihold(inode); /* need to update inode modifications in d_tmpfile */
	d_tmpfile(dentry, inode);

	scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
	scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
	scoutfs_inode_index_unlock(sb, &ind_locks);
	iput(inode);

out:
	ret = scoutfs_orphan_inode(inode);
	WARN_ON_ONCE(ret); /* XXX returning error but items deleted */

	scoutfs_release_trans(sb);
	scoutfs_inode_index_unlock(sb, &ind_locks);
	scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);

	return ret;
}

480
kmod/src/fence.c
@@ -1,480 +0,0 @@
/*
 * Copyright (C) 2019 Versity Software, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/device.h>
#include <linux/timer.h>
#include <asm/barrier.h>

#include "super.h"
#include "msg.h"
#include "sysfs.h"
#include "server.h"
#include "fence.h"

/*
 * Fencing ensures that a given mount can no longer write to the
 * metadata or data devices. It's necessary to ensure that it's safe to
 * give another mount access to a resource that is currently owned by a
 * mount that has stopped responding.
 *
 * Fencing is performed in collaboration between the currently elected
 * quorum leader mount and userspace running on its host. The kernel
 * creates fencing requests as it notices that mounts have stopped
 * participating. The fence requests are published as directories in
 * sysfs. Userspace agents watch for directories, take action, and
 * write to files in the directory to indicate that the mount has been
 * fenced. Once the mount is fenced the server can reclaim the
 * resources previously held by the fenced mount.
 *
 * The fence requests contain metadata identifying the specific instance
 * of the mount that needs to be fenced. This lets a fencing agent
 * ensure that a specific mount has been fenced without necessarily
 * destroying the node that was hosting it. Maybe the node had rebooted
 * and the mount is no longer there, maybe the mount can be force
 * unmounted, maybe the node can be configured to isolate the mount from
 * the devices.
 *
 * The fencing mechanism is asynchronous and can fail but the server
 * cannot make progress until it completes. If a fence request times
 * out the server shuts down in the hope that another instance of a
 * server might have more luck fencing a non-responsive mount.
 *
 * Sources of fencing are fundamentally anchored in shared persistent
 * state. It is possible, though unlikely, that servers can fence a
 * node and then themselves fail, leaving the next server to try and
 * fence the mount again.
 */
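
/*
 * A sketch, not part of this file, of the userspace half of the
 * protocol described above.  It assumes a per-rid request directory
 * under the "fence" kset containing the attribute files defined below
 * (rid, ipv4_addr, reason, fenced, error); the directory path handed
 * to this function and the fence_host() isolation helper are
 * hypothetical, site-specific pieces.
 */
#include <stdio.h>

extern int fence_host(const char *ipv4_addr);	/* site-specific, assumed */

static int handle_fence_request(const char *dir)
{
	char path[4096];
	char addr[64];
	FILE *f;

	/* read the address of the mount that must be isolated */
	snprintf(path, sizeof(path), "%s/ipv4_addr", dir);
	f = fopen(path, "r");
	if (!f || !fgets(addr, sizeof(addr), f)) {
		if (f)
			fclose(f);
		return -1;
	}
	fclose(f);

	if (fence_host(addr) != 0) {
		/* any write to "error" tells the server fencing failed */
		snprintf(path, sizeof(path), "%s/error", dir);
		f = fopen(path, "w");
		if (f) {
			fputs("1", f);
			fclose(f);
		}
		return -1;
	}

	/* any write to "fenced" tells the server it can reclaim resources */
	snprintf(path, sizeof(path), "%s/fenced", dir);
	f = fopen(path, "w");
	if (f) {
		fputs("1", f);
		fclose(f);
	}
	return 0;
}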

struct fence_info {
	struct kset *kset;
	struct kobject fence_dir_kobj;
	struct workqueue_struct *wq;
	wait_queue_head_t waitq;
	spinlock_t lock;
	struct list_head list;
};

#define DECLARE_FENCE_INFO(sb, name) \
	struct fence_info *name = SCOUTFS_SB(sb)->fence_info

struct pending_fence {
	struct super_block *sb;
	struct scoutfs_sysfs_attrs ssa;
	struct list_head entry;
	struct timer_list timer;

	ktime_t start_kt;
	__be32 ipv4_addr;
	bool fenced;
	bool error;
	int reason;
	u64 rid;
};

#define FENCE_FROM_KOBJ(kobj) \
	container_of(SCOUTFS_SYSFS_ATTRS(kobj), struct pending_fence, ssa)
#define DECLARE_FENCE_FROM_KOBJ(name, kobj) \
	struct pending_fence *name = FENCE_FROM_KOBJ(kobj)

static void destroy_fence(struct pending_fence *fence)
{
	struct super_block *sb = fence->sb;

	scoutfs_sysfs_destroy_attrs(sb, &fence->ssa);
	del_timer_sync(&fence->timer);
	kfree(fence);
}

static ssize_t elapsed_secs_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	DECLARE_FENCE_FROM_KOBJ(fence, kobj);
	ktime_t now = ktime_get();
	struct timeval tv = { 0, };

	if (ktime_after(now, fence->start_kt))
		tv = ktime_to_timeval(ktime_sub(now, fence->start_kt));

	return snprintf(buf, PAGE_SIZE, "%llu", (long long)tv.tv_sec);
}
SCOUTFS_ATTR_RO(elapsed_secs);

static ssize_t fenced_show(struct kobject *kobj, struct kobj_attribute *attr,
			   char *buf)
{
	DECLARE_FENCE_FROM_KOBJ(fence, kobj);

	return snprintf(buf, PAGE_SIZE, "%u", !!fence->fenced);
}

/*
 * any write to the fenced file from userspace indicates that the mount
 * has been safely fenced and can no longer write to the shared device.
 */
static ssize_t fenced_store(struct kobject *kobj, struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	DECLARE_FENCE_FROM_KOBJ(fence, kobj);
	DECLARE_FENCE_INFO(fence->sb, fi);

	if (!fence->fenced) {
		del_timer_sync(&fence->timer);
		fence->fenced = true;
		wake_up(&fi->waitq);
	}

	return count;
}
SCOUTFS_ATTR_RW(fenced);

static ssize_t error_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
	DECLARE_FENCE_FROM_KOBJ(fence, kobj);

	return snprintf(buf, PAGE_SIZE, "%u", !!fence->error);
}

/*
 * The fencing agent can tell us that it was unable to fence the given
 * mount. We can't continue if the mount can't be isolated so we shut
 * down the server.
 */
static ssize_t error_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf,
			   size_t count)
{
	DECLARE_FENCE_FROM_KOBJ(fence, kobj);
	struct super_block *sb = fence->sb;
	DECLARE_FENCE_INFO(fence->sb, fi);

	if (!fence->error) {
		fence->error = true;
		scoutfs_err(sb, "error indicated by fence action for rid %016llx", fence->rid);
		wake_up(&fi->waitq);
	}

	return count;
}
SCOUTFS_ATTR_RW(error);

static ssize_t ipv4_addr_show(struct kobject *kobj,
			      struct kobj_attribute *attr, char *buf)
{
	DECLARE_FENCE_FROM_KOBJ(fence, kobj);

	return snprintf(buf, PAGE_SIZE, "%pI4", &fence->ipv4_addr);
}
SCOUTFS_ATTR_RO(ipv4_addr);

static ssize_t reason_show(struct kobject *kobj, struct kobj_attribute *attr,
			   char *buf)
{
	DECLARE_FENCE_FROM_KOBJ(fence, kobj);
	unsigned r = fence->reason;
	char *str = "unknown";
	static char *reasons[] = {
		[SCOUTFS_FENCE_CLIENT_RECOVERY] = "client_recovery",
		[SCOUTFS_FENCE_CLIENT_RECONNECT] = "client_reconnect",
		[SCOUTFS_FENCE_QUORUM_BLOCK_LEADER] = "quorum_block_leader",
	};

	if (r < ARRAY_SIZE(reasons) && reasons[r])
		str = reasons[r];

	return snprintf(buf, PAGE_SIZE, "%s", str);
}
SCOUTFS_ATTR_RO(reason);

static ssize_t rid_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	DECLARE_FENCE_FROM_KOBJ(fence, kobj);

	return snprintf(buf, PAGE_SIZE, "%016llx", fence->rid);
}
SCOUTFS_ATTR_RO(rid);

static struct attribute *fence_attrs[] = {
	SCOUTFS_ATTR_PTR(elapsed_secs),
	SCOUTFS_ATTR_PTR(fenced),
	SCOUTFS_ATTR_PTR(error),
	SCOUTFS_ATTR_PTR(ipv4_addr),
	SCOUTFS_ATTR_PTR(reason),
	SCOUTFS_ATTR_PTR(rid),
	NULL,
};

#define FENCE_TIMEOUT_MS (MSEC_PER_SEC * 30)

static void fence_timeout(struct timer_list *timer)
{
	struct pending_fence *fence = from_timer(fence, timer, timer);
	struct super_block *sb = fence->sb;
	DECLARE_FENCE_INFO(sb, fi);

	fence->error = true;
	scoutfs_err(sb, "fence request for rid %016llx was not serviced in %lums, raising error",
		    fence->rid, FENCE_TIMEOUT_MS);
	wake_up(&fi->waitq);
}

int scoutfs_fence_start(struct super_block *sb, u64 rid, __be32 ipv4_addr, int reason)
{
	DECLARE_FENCE_INFO(sb, fi);
	struct pending_fence *fence;
	int ret;

	fence = kzalloc(sizeof(struct pending_fence), GFP_NOFS);
	if (!fence) {
		ret = -ENOMEM;
		goto out;
	}

	fence->sb = sb;
	scoutfs_sysfs_init_attrs(sb, &fence->ssa);

	fence->start_kt = ktime_get();
	fence->ipv4_addr = ipv4_addr;
	fence->fenced = false;
	fence->error = false;
	fence->reason = reason;
	fence->rid = rid;

	ret = scoutfs_sysfs_create_attrs_parent(sb, &fi->kset->kobj,
						&fence->ssa, fence_attrs,
						"%016llx", rid);
	if (ret < 0) {
		kfree(fence);
		goto out;
	}

	timer_setup(&fence->timer, fence_timeout, 0);
	fence->timer.expires = jiffies + msecs_to_jiffies(FENCE_TIMEOUT_MS);
	add_timer(&fence->timer);

	spin_lock(&fi->lock);
	list_add_tail(&fence->entry, &fi->list);
	spin_unlock(&fi->lock);
out:
	return ret;
}

/*
 * Give the caller the rid of the next fence request which has been
 * fenced. This doesn't have a position from which to return the next
 * because the caller either frees the fence request it's given or shuts
 * down.
 */
int scoutfs_fence_next(struct super_block *sb, u64 *rid, int *reason, bool *error)
{
	DECLARE_FENCE_INFO(sb, fi);
	struct pending_fence *fence;
	int ret = -ENOENT;

	spin_lock(&fi->lock);
	list_for_each_entry(fence, &fi->list, entry) {
		if (fence->fenced || fence->error) {
			*rid = fence->rid;
			*reason = fence->reason;
			*error = fence->error;
			ret = 0;
			break;
		}
	}
	spin_unlock(&fi->lock);

	return ret;
}

int scoutfs_fence_reason_pending(struct super_block *sb, int reason)
{
	DECLARE_FENCE_INFO(sb, fi);
	struct pending_fence *fence;
	bool pending = false;

	spin_lock(&fi->lock);
	list_for_each_entry(fence, &fi->list, entry) {
		if (fence->reason == reason) {
			pending = true;
			break;
		}
	}
	spin_unlock(&fi->lock);

	return pending;
}

int scoutfs_fence_free(struct super_block *sb, u64 rid)
{
	DECLARE_FENCE_INFO(sb, fi);
	struct pending_fence *fence;
	int ret = -ENOENT;

	spin_lock(&fi->lock);
	list_for_each_entry(fence, &fi->list, entry) {
		if (fence->rid == rid) {
			list_del_init(&fence->entry);
			ret = 0;
			break;
		}
	}
	spin_unlock(&fi->lock);

	if (ret == 0) {
		destroy_fence(fence);
		wake_up(&fi->waitq);
	}

	return ret;
}

static bool all_fenced(struct fence_info *fi, bool *error)
{
	struct pending_fence *fence;
	bool all = true;

	*error = false;

	spin_lock(&fi->lock);
	list_for_each_entry(fence, &fi->list, entry) {
		if (fence->error) {
			*error = true;
			all = true;
			break;
		}
		if (!fence->fenced) {
			all = false;
			break;
		}
	}
	spin_unlock(&fi->lock);

	return all;
}

/*
 * The caller waits for all the current requests to be fenced, but not
 * necessarily reclaimed.
 */
int scoutfs_fence_wait_fenced(struct super_block *sb, long timeout_jiffies)
{
	DECLARE_FENCE_INFO(sb, fi);
	bool error;
	long ret;

	ret = wait_event_interruptible_timeout(fi->waitq, all_fenced(fi, &error), timeout_jiffies);
	if (ret == 0)
		ret = -ETIMEDOUT;
	else if (ret > 0)
		ret = 0;
	else if (error)
		ret = -EIO;

	return ret;
}

/*
 * This must be called early during startup so that it is guaranteed that
 * no other subsystems will try and call fence_start while we're waiting
 * for testing fence requests to complete.
 */
int scoutfs_fence_setup(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct mount_options *opts = &sbi->opts;
	struct fence_info *fi;
	int ret;

	/* can only fence if we can be elected by quorum */
	if (opts->quorum_slot_nr == -1) {
		ret = 0;
		goto out;
	}

	fi = kzalloc(sizeof(struct fence_info), GFP_KERNEL);
	if (!fi) {
		ret = -ENOMEM;
		goto out;
	}

	init_waitqueue_head(&fi->waitq);
	spin_lock_init(&fi->lock);
	INIT_LIST_HEAD(&fi->list);

	sbi->fence_info = fi;

	fi->kset = kset_create_and_add("fence", NULL, scoutfs_sysfs_sb_dir(sb));
	if (!fi->kset) {
		ret = -ENOMEM;
		goto out;
	}

	fi->wq = alloc_workqueue("scoutfs_fence",
				 WQ_UNBOUND | WQ_NON_REENTRANT, 0);
	if (!fi->wq) {
		ret = -ENOMEM;
		goto out;
	}

	ret = 0;
out:
	if (ret)
		scoutfs_fence_destroy(sb);

	return ret;
}

/*
 * Tear down all pending fence requests because the server is shutting down.
 */
void scoutfs_fence_stop(struct super_block *sb)
{
	DECLARE_FENCE_INFO(sb, fi);
	struct pending_fence *fence;

	do {
		spin_lock(&fi->lock);
		fence = list_first_entry_or_null(&fi->list, struct pending_fence, entry);
		if (fence)
			list_del_init(&fence->entry);
		spin_unlock(&fi->lock);

		if (fence) {
			destroy_fence(fence);
			wake_up(&fi->waitq);
		}
	} while (fence);
}

void scoutfs_fence_destroy(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct fence_info *fi = SCOUTFS_SB(sb)->fence_info;
	struct pending_fence *fence;
	struct pending_fence *tmp;

	if (fi) {
		if (fi->wq)
			destroy_workqueue(fi->wq);
		list_for_each_entry_safe(fence, tmp, &fi->list, entry)
			destroy_fence(fence);
		if (fi->kset)
			kset_unregister(fi->kset);
		kfree(fi);
		sbi->fence_info = NULL;
	}
}
@@ -1,20 +0,0 @@
#ifndef _SCOUTFS_FENCE_H_
#define _SCOUTFS_FENCE_H_

enum {
	SCOUTFS_FENCE_CLIENT_RECOVERY,
	SCOUTFS_FENCE_CLIENT_RECONNECT,
	SCOUTFS_FENCE_QUORUM_BLOCK_LEADER,
};

int scoutfs_fence_start(struct super_block *sb, u64 rid, __be32 ipv4_addr, int reason);
int scoutfs_fence_next(struct super_block *sb, u64 *rid, int *reason, bool *error);
int scoutfs_fence_reason_pending(struct super_block *sb, int reason);
int scoutfs_fence_free(struct super_block *sb, u64 rid);
int scoutfs_fence_wait_fenced(struct super_block *sb, long timeout_jiffies);

int scoutfs_fence_setup(struct super_block *sb);
void scoutfs_fence_stop(struct super_block *sb);
void scoutfs_fence_destroy(struct super_block *sb);

#endif
@@ -37,9 +37,9 @@
 *
 * The log btrees are modified by multiple transactions over time so
 * there is no consistent ordering relationship between the items in
 * different btrees. Each item in a log btree stores a seq for the
 * item. Readers check log btrees for the most recent seq that they
 * should use.
 * different btrees. Each item in a log btree stores a version number
 * for the item. Readers check log btrees for the most recent version
 * that they should use.
 *
 * The item cache reads items in bulk from stable btrees, and writes a
 * transaction's worth of dirty items into the item log btree.
@@ -52,8 +52,6 @@
 */

struct forest_info {
	struct super_block *sb;

	struct mutex mutex;
	struct scoutfs_alloc *alloc;
	struct scoutfs_block_writer *wri;
@@ -62,9 +60,6 @@ struct forest_info {
	struct mutex srch_mutex;
	struct scoutfs_srch_file srch_file;
	struct scoutfs_block *srch_bl;

	struct workqueue_struct *workq;
	struct delayed_work log_merge_dwork;
};

#define DECLARE_FOREST_INFO(sb, name) \
@@ -254,7 +249,7 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key,
 * If we hit stale blocks and retry we can call the callback for
 * duplicate items. This is harmless because the items are stable while
 * the caller holds their cluster lock and the caller has to filter out
 * item seqs anyway.
 * item versions anyway.
 */
int scoutfs_forest_read_items(struct super_block *sb,
			      struct scoutfs_lock *lock,
@@ -431,29 +426,29 @@

/*
 * The caller is committing items in the transaction and has found the
 * greatest item seq amongst them. We store it in the log_trees root
 * greatest item version amongst them. We store it in the log_trees root
 * to send to the server.
 */
|
||||
void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq)
|
||||
void scoutfs_forest_set_max_vers(struct super_block *sb, u64 max_vers)
|
||||
{
|
||||
DECLARE_FOREST_INFO(sb, finf);
|
||||
|
||||
finf->our_log.max_item_seq = cpu_to_le64(max_seq);
|
||||
finf->our_log.max_item_vers = cpu_to_le64(max_vers);
|
||||
}
|
||||
|
||||
/*
|
||||
* The server is calling during setup to find the greatest item seq
|
||||
* The server is calling during setup to find the greatest item version
|
||||
* amongst all the log tree roots. They have the authoritative current
|
||||
* super.
|
||||
*
|
||||
* Item seqs are only used to compare items in log trees, not in the
|
||||
* main fs tree. All we have to do is find the greatest seq amongst the
|
||||
* log_trees so that the core seq will have a greater seq than all the
|
||||
* items in the log_trees.
|
||||
* Item versions are only used to compare items in log trees, not in the
|
||||
* main fs tree. All we have to do is find the greatest version amongst
|
||||
* the log_trees so that new locks will have a write_version greater
|
||||
* than all the items in the log_trees.
|
||||
*/
|
||||
int scoutfs_forest_get_max_seq(struct super_block *sb,
|
||||
struct scoutfs_super_block *super,
|
||||
u64 *seq)
|
||||
int scoutfs_forest_get_max_vers(struct super_block *sb,
|
||||
struct scoutfs_super_block *super,
|
||||
u64 *vers)
|
||||
{
|
||||
struct scoutfs_log_trees *lt;
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
@@ -461,7 +456,7 @@ int scoutfs_forest_get_max_seq(struct super_block *sb,
|
||||
int ret;
|
||||
|
||||
scoutfs_key_init_log_trees(<k, 0, 0);
|
||||
*seq = 0;
|
||||
*vers = 0;
|
||||
|
||||
for (;; scoutfs_key_inc(<k)) {
|
||||
ret = scoutfs_btree_next(sb, &super->logs_root, <k, &iref);
|
||||
@@ -469,7 +464,8 @@ int scoutfs_forest_get_max_seq(struct super_block *sb,
|
||||
if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
|
||||
ltk = *iref.key;
|
||||
lt = iref.val;
|
||||
*seq = max(*seq, le64_to_cpu(lt->max_item_seq));
|
||||
*vers = max(*vers,
|
||||
le64_to_cpu(lt->max_item_vers));
|
||||
} else {
|
||||
ret = -EIO;
|
||||
}
|
||||
@@ -538,7 +534,7 @@ void scoutfs_forest_init_btrees(struct super_block *sb,
|
||||
memset(&finf->our_log, 0, sizeof(finf->our_log));
|
||||
finf->our_log.item_root = lt->item_root;
|
||||
finf->our_log.bloom_ref = lt->bloom_ref;
|
||||
finf->our_log.max_item_seq = lt->max_item_seq;
|
||||
finf->our_log.max_item_vers = lt->max_item_vers;
|
||||
finf->our_log.rid = lt->rid;
|
||||
finf->our_log.nr = lt->nr;
|
||||
finf->srch_file = lt->srch_file;
|
||||
@@ -568,7 +564,7 @@ void scoutfs_forest_get_btrees(struct super_block *sb,
|
||||
lt->item_root = finf->our_log.item_root;
|
||||
lt->bloom_ref = finf->our_log.bloom_ref;
|
||||
lt->srch_file = finf->srch_file;
|
||||
lt->max_item_seq = finf->our_log.max_item_seq;
|
||||
lt->max_item_vers = finf->our_log.max_item_vers;
|
||||
|
||||
scoutfs_block_put(sb, finf->srch_bl);
|
||||
finf->srch_bl = NULL;
|
||||
@@ -577,149 +573,6 @@ void scoutfs_forest_get_btrees(struct super_block *sb,
|
||||
<->bloom_ref);
|
||||
}
|
||||
|
||||
/*
|
||||
* Compare input items to merge by their log item value seq when their
|
||||
* keys match.
|
||||
*/
|
||||
static int merge_cmp(void *a_val, int a_val_len, void *b_val, int b_val_len)
|
||||
{
|
||||
struct scoutfs_log_item_value *a = a_val;
|
||||
struct scoutfs_log_item_value *b = b_val;
|
||||
|
||||
/* sort merge item by seq */
|
||||
return scoutfs_cmp(le64_to_cpu(a->seq), le64_to_cpu(b->seq));
|
||||
}
|
||||
|
||||
static bool merge_is_del(void *val, int val_len)
|
||||
{
|
||||
struct scoutfs_log_item_value *liv = val;
|
||||
|
||||
return !!(liv->flags & SCOUTFS_LOG_ITEM_FLAG_DELETION);
|
||||
}

#define LOG_MERGE_DELAY_MS (5 * MSEC_PER_SEC)

/*
 * Regularly try to get a log merge request from the server. If we get
 * a request we walk the log_trees items to find input trees and pass
 * them to btree_merge. All of our work is done in dirty blocks
 * allocated from available free blocks that the server gave us. If we
 * hit an error then we drop our dirty blocks without writing them and
 * send an error flag to the server so they can reclaim our allocators
 * and ignore the rest of our work.
 */
static void scoutfs_forest_log_merge_worker(struct work_struct *work)
{
	struct forest_info *finf = container_of(work, struct forest_info,
						log_merge_dwork.work);
	struct super_block *sb = finf->sb;
	struct scoutfs_btree_root_head *rhead = NULL;
	struct scoutfs_btree_root_head *tmp;
	struct scoutfs_log_merge_complete comp;
	struct scoutfs_log_merge_request req;
	struct scoutfs_log_trees *lt;
	struct scoutfs_block_writer wri;
	struct scoutfs_alloc alloc;
	SCOUTFS_BTREE_ITEM_REF(iref);
	struct scoutfs_key next;
	struct scoutfs_key key;
	unsigned long delay;
	LIST_HEAD(inputs);
	int ret;

	ret = scoutfs_client_get_log_merge(sb, &req);
	if (ret < 0)
		goto resched;

	comp.root = req.root;
	comp.start = req.start;
	comp.end = req.end;
	comp.remain = req.end;
	comp.rid = req.rid;
	comp.seq = req.seq;
	comp.flags = 0;

	scoutfs_alloc_init(&alloc, &req.meta_avail, &req.meta_freed);
	scoutfs_block_writer_init(sb, &wri);

	/* find finalized input log trees up to last_seq */
	for (scoutfs_key_init_log_trees(&key, 0, 0); ; scoutfs_key_inc(&key)) {

		if (!rhead) {
			rhead = kmalloc(sizeof(*rhead), GFP_NOFS);
			if (!rhead) {
				ret = -ENOMEM;
				goto out;
			}
		}

		ret = scoutfs_btree_next(sb, &req.logs_root, &key, &iref);
		if (ret == 0) {
			if (iref.val_len == sizeof(*lt)) {
				key = *iref.key;
				lt = iref.val;
				if ((le64_to_cpu(lt->flags) &
				     SCOUTFS_LOG_TREES_FINALIZED) &&
				    (le64_to_cpu(lt->max_item_seq) <=
				     le64_to_cpu(req.last_seq))) {
					rhead->root = lt->item_root;
					list_add_tail(&rhead->head, &inputs);
					rhead = NULL;
				}
			} else {
				ret = -EIO;
			}
			scoutfs_btree_put_iref(&iref);
		}
		if (ret < 0) {
			if (ret == -ENOENT) {
				ret = 0;
				break;
			}
			goto out;
		}
	}

	/* shouldn't be possible, but it's harmless */
	if (list_empty(&inputs)) {
		ret = 0;
		goto out;
	}

	ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end,
				  &next, &comp.root, &inputs, merge_cmp,
				  merge_is_del,
				  !!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)),
				  sizeof(struct scoutfs_log_item_value),
				  SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10);
	if (ret == -ERANGE) {
		comp.remain = next;
		le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN);
		ret = 0;
	}

out:
	scoutfs_alloc_prepare_commit(sb, &alloc, &wri);
	if (ret == 0)
		ret = scoutfs_block_writer_write(sb, &wri);
	scoutfs_block_writer_forget_all(sb, &wri);

	comp.meta_avail = alloc.avail;
	comp.meta_freed = alloc.freed;
	if (ret < 0)
		le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_ERROR);

	ret = scoutfs_client_commit_log_merge(sb, &comp);

	kfree(rhead);
	list_for_each_entry_safe(rhead, tmp, &inputs, head)
		kfree(rhead);

resched:
	delay = ret == 0 ? 0 : msecs_to_jiffies(LOG_MERGE_DELAY_MS);
	queue_delayed_work(finf->workq, &finf->log_merge_dwork, delay);
}
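
To illustrate the remain/requeue protocol above: when the merge runs out of its dirty block budget it returns -ERANGE along with the next unmerged key, the completion carries that key as remain with the REMAIN flag set, and the server can re-issue the range from there. A hedged userspace sketch, with bounded_merge() as a hypothetical stand-in for the budgeted btree merge:

#include <errno.h>
#include <stdio.h>

/* hypothetical: merge keys in [start, end], stop early when a dirty
 * block budget is hit, and report the first unmerged key via *next */
static int bounded_merge(unsigned start, unsigned end, unsigned *next)
{
	unsigned budget = 4;
	unsigned k;

	for (k = start; k <= end; k++) {
		if (budget-- == 0) {
			*next = k;	/* first key we didn't merge */
			return -ERANGE;
		}
		printf("merged key %u\n", k);
	}
	return 0;
}

int main(void)
{
	unsigned start = 0, next;
	int ret;

	/* re-issue from 'remain' until the whole range is consumed,
	 * like COMP_REMAIN completions re-queue the unmerged tail */
	while ((ret = bounded_merge(start, 9, &next)) == -ERANGE) {
		printf("remain at %u, requeueing\n", next);
		start = next;
	}
	return ret;
}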

int scoutfs_forest_setup(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -733,23 +586,10 @@ int scoutfs_forest_setup(struct super_block *sb)
	}

	/* the finf fields will be setup as we open a transaction */
	finf->sb = sb;
	mutex_init(&finf->mutex);
	mutex_init(&finf->srch_mutex);
	INIT_DELAYED_WORK(&finf->log_merge_dwork,
			  scoutfs_forest_log_merge_worker);

	sbi->forest_info = finf;

	finf->workq = alloc_workqueue("scoutfs_log_merge", WQ_NON_REENTRANT |
				      WQ_UNBOUND | WQ_HIGHPRI, 0);
	if (!finf->workq) {
		ret = -ENOMEM;
		goto out;
	}

	queue_delayed_work(finf->workq, &finf->log_merge_dwork,
			   msecs_to_jiffies(LOG_MERGE_DELAY_MS));

	ret = 0;
out:
	if (ret)
@@ -758,16 +598,6 @@ out:
	return 0;
}

void scoutfs_forest_stop(struct super_block *sb)
{
	DECLARE_FOREST_INFO(sb, finf);

	if (finf && finf->workq) {
		cancel_delayed_work_sync(&finf->log_merge_dwork);
		destroy_workqueue(finf->workq);
	}
}

void scoutfs_forest_destroy(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -775,7 +605,6 @@ void scoutfs_forest_destroy(struct super_block *sb)

	if (finf) {
		scoutfs_block_put(sb, finf->srch_bl);

		kfree(finf);
		sbi->forest_info = NULL;
	}

@@ -23,10 +23,10 @@ int scoutfs_forest_read_items(struct super_block *sb,
			      scoutfs_forest_item_cb cb, void *arg);
int scoutfs_forest_set_bloom_bits(struct super_block *sb,
				  struct scoutfs_lock *lock);
void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq);
int scoutfs_forest_get_max_seq(struct super_block *sb,
			       struct scoutfs_super_block *super,
			       u64 *seq);
void scoutfs_forest_set_max_vers(struct super_block *sb, u64 max_vers);
int scoutfs_forest_get_max_vers(struct super_block *sb,
				struct scoutfs_super_block *super,
				u64 *vers);
int scoutfs_forest_insert_list(struct super_block *sb,
			       struct scoutfs_btree_item_list *lst);
int scoutfs_forest_srch_add(struct super_block *sb, u64 hash, u64 ino, u64 id);
@@ -39,7 +39,6 @@ void scoutfs_forest_get_btrees(struct super_block *sb,
			       struct scoutfs_log_trees *lt);

int scoutfs_forest_setup(struct super_block *sb);
void scoutfs_forest_stop(struct super_block *sb);
void scoutfs_forest_destroy(struct super_block *sb);

#endif

@@ -203,12 +203,11 @@ struct scoutfs_key {
#define skmc_rid _sk_first

/* free extents by blkno */
#define skfb_end _sk_first
#define skfb_len _sk_second
/* free extents by order */
#define skfo_revord _sk_first
#define skfo_end _sk_second
#define skfo_len _sk_third
#define skfb_end _sk_second
#define skfb_len _sk_third
/* free extents by len */
#define skfl_neglen _sk_second
#define skfl_blkno _sk_third

struct scoutfs_avl_root {
	__le16 node;
@@ -286,10 +285,9 @@ struct scoutfs_alloc_list_head {
	struct scoutfs_block_ref ref;
	__le64 total_nr;
	__le32 first_nr;
	__le32 flags;
	__u8 __pad[4];
};

/*
 * While the main allocator uses extent items in btree blocks, metadata
 * allocations for a single transaction are recorded in arrays in
@@ -318,25 +316,17 @@ struct scoutfs_alloc_list_block {
 */
struct scoutfs_alloc_root {
	__le64 total_len;
	__le32 flags;
	__le32 _pad;
	struct scoutfs_btree_root root;
};

/* Shared by _alloc_list_head and _alloc_root */
#define SCOUTFS_ALLOC_FLAG_LOW (1U << 0)

/* types of allocators, exposed to alloc_detail ioctl */
#define SCOUTFS_ALLOC_OWNER_NONE 0
#define SCOUTFS_ALLOC_OWNER_SERVER 1
#define SCOUTFS_ALLOC_OWNER_MOUNT 2
#define SCOUTFS_ALLOC_OWNER_SRCH 3
#define SCOUTFS_ALLOC_OWNER_LOG_MERGE 4

struct scoutfs_mounted_client_btree_val {
	union scoutfs_inet_addr addr;
	__u8 flags;
	__u8 __pad[7];
};

#define SCOUTFS_MOUNTED_CLIENT_QUORUM (1 << 0)
@@ -437,10 +427,6 @@ struct scoutfs_srch_compact {
/* client -> server: compaction failed */
#define SCOUTFS_SRCH_COMPACT_FLAG_ERROR (1 << 5)

#define SCOUTFS_DATA_ALLOC_MAX_ZONES 1024
#define SCOUTFS_DATA_ALLOC_ZONE_BYTES DIV_ROUND_UP(SCOUTFS_DATA_ALLOC_MAX_ZONES, 8)
#define SCOUTFS_DATA_ALLOC_ZONE_LE64S DIV_ROUND_UP(SCOUTFS_DATA_ALLOC_MAX_ZONES, 64)
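
The arithmetic behind those bitmap sizes, checked at compile time in a small standalone sketch: 1024 zones need a 128 byte bitmap, carried as 16 64bit words.

#include <assert.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

#define MAX_ZONES 1024
#define ZONE_BYTES DIV_ROUND_UP(MAX_ZONES, 8)	/* one bit per zone */
#define ZONE_LE64S DIV_ROUND_UP(MAX_ZONES, 64)	/* same bitmap as 64bit words */

int main(void)
{
	static_assert(ZONE_BYTES == 128, "bitmap bytes");
	static_assert(ZONE_LE64S == 16, "bitmap words");
	static_assert(ZONE_LE64S * 8 == ZONE_BYTES, "words cover the bytes");
	return 0;
}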

/*
 * XXX I imagine we should rename these now that they've evolved to track
 * all the btrees that clients use during a transaction. It's not just
@@ -454,18 +440,13 @@ struct scoutfs_log_trees {
	struct scoutfs_alloc_root data_avail;
	struct scoutfs_alloc_root data_freed;
	struct scoutfs_srch_file srch_file;
	__le64 data_alloc_zone_blocks;
	__le64 data_alloc_zones[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
	__le64 max_item_seq;
	__le64 max_item_vers;
	__le64 rid;
	__le64 nr;
	__le64 flags;
};

#define SCOUTFS_LOG_TREES_FINALIZED (1ULL << 0)

struct scoutfs_log_item_value {
	__le64 seq;
	__le64 vers;
	__u8 flags;
	__u8 __pad[7];
	__u8 data[];
@@ -500,83 +481,11 @@ struct scoutfs_bloom_block {
	 member_sizeof(struct scoutfs_bloom_block, bits[0]) * 8)
#define SCOUTFS_FOREST_BLOOM_FUNC_BITS (SCOUTFS_BLOCK_LG_SHIFT + 3)

/*
 * A private server btree item which records the status of a log merge
 * operation that is in progress.
 */
struct scoutfs_log_merge_status {
	struct scoutfs_key next_range_key;
	__le64 nr_requests;
	__le64 nr_complete;
	__le64 last_seq;
	__le64 seq;
};

/*
 * A request is sent to the client and stored in a server btree item to
 * record resources that would be reclaimed if the client failed. It
 * has all the inputs needed for the client to perform its portion of a
 * merge.
 */
struct scoutfs_log_merge_request {
	struct scoutfs_alloc_list_head meta_avail;
	struct scoutfs_alloc_list_head meta_freed;
	struct scoutfs_btree_root logs_root;
	struct scoutfs_btree_root root;
	struct scoutfs_key start;
	struct scoutfs_key end;
	__le64 last_seq;
	__le64 rid;
	__le64 seq;
	__le64 flags;
};

/* the request root is a subtree of the fs root at a parent, restricting merge modifications */
#define SCOUTFS_LOG_MERGE_REQUEST_SUBTREE (1ULL << 0)

/*
 * The output of a client's merge of log btree items into a subtree
 * rooted at a parent in the fs_root. The client sends it to the
 * server, who stores it in a btree item for later splicing/rebalancing.
 */
struct scoutfs_log_merge_complete {
	struct scoutfs_alloc_list_head meta_avail;
	struct scoutfs_alloc_list_head meta_freed;
	struct scoutfs_btree_root root;
	struct scoutfs_key start;
	struct scoutfs_key end;
	struct scoutfs_key remain;
	__le64 rid;
	__le64 seq;
	__le64 flags;
};

/* merge failed, ignore completion and reclaim stored request */
#define SCOUTFS_LOG_MERGE_COMP_ERROR (1ULL << 0)
/* merge didn't complete range, restart from remain */
#define SCOUTFS_LOG_MERGE_COMP_REMAIN (1ULL << 1)

/*
 * Range items record the ranges of the fs keyspace that still need to
 * be merged. They're added as a merge starts, removed as requests are
 * sent and added back if the request didn't consume its entire range.
 */
struct scoutfs_log_merge_range {
	struct scoutfs_key start;
	struct scoutfs_key end;
};

struct scoutfs_log_merge_freeing {
	struct scoutfs_btree_root root;
	struct scoutfs_key key;
	__le64 seq;
};

/*
 * Keys are first sorted by major key zones.
 */
#define SCOUTFS_INODE_INDEX_ZONE 1
#define SCOUTFS_ORPHAN_ZONE 2
#define SCOUTFS_RID_ZONE 2
#define SCOUTFS_FS_ZONE 3
#define SCOUTFS_LOCK_ZONE 4
/* Items only stored in server btrees */
@@ -584,21 +493,14 @@ struct scoutfs_log_merge_freeing {
#define SCOUTFS_TRANS_SEQ_ZONE 7
#define SCOUTFS_MOUNTED_CLIENT_ZONE 8
#define SCOUTFS_SRCH_ZONE 9
#define SCOUTFS_FREE_EXTENT_BLKNO_ZONE 10
#define SCOUTFS_FREE_EXTENT_ORDER_ZONE 11
/* Items only stored in log merge server btrees */
#define SCOUTFS_LOG_MERGE_STATUS_ZONE 12
#define SCOUTFS_LOG_MERGE_RANGE_ZONE 13
#define SCOUTFS_LOG_MERGE_REQUEST_ZONE 14
#define SCOUTFS_LOG_MERGE_COMPLETE_ZONE 15
#define SCOUTFS_LOG_MERGE_FREEING_ZONE 16
#define SCOUTFS_FREE_EXTENT_ZONE 10

/* inode index zone */
#define SCOUTFS_INODE_INDEX_META_SEQ_TYPE 1
#define SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE 2
#define SCOUTFS_INODE_INDEX_NR 3 /* don't forget to update */

/* orphan zone, redundant type used for clarity */
/* rid zone (also used in server alloc btree) */
#define SCOUTFS_ORPHAN_TYPE 1

/* fs zone */
@@ -619,6 +521,10 @@ struct scoutfs_log_merge_freeing {
#define SCOUTFS_SRCH_PENDING_TYPE 3
#define SCOUTFS_SRCH_BUSY_TYPE 4

/* free extents in allocator btrees in client and server, by blkno or len */
#define SCOUTFS_FREE_EXTENT_BLKNO_TYPE 1
#define SCOUTFS_FREE_EXTENT_LEN_TYPE 2
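
The new length-sorted index appears to key on a negated length (skfl_neglen) so that plain ascending key order visits the largest extents first. A sketch of that encoding, assuming two's-complement negation of the u64 length; the actual on-disk encoding may differ:

#include <stdint.h>
#include <stdio.h>

/* encode a length so ascending key order visits largest extents first */
static uint64_t neglen(uint64_t len)
{
	return -len;	/* 2^64 - len for len > 0: bigger len, smaller key */
}

int main(void)
{
	uint64_t lens[] = { 8, 4096, 64 };
	int i;

	/* 4096 encodes to the smallest key and so sorts first */
	for (i = 0; i < 3; i++)
		printf("len %4llu -> key %020llu\n",
		       (unsigned long long)lens[i],
		       (unsigned long long)neglen(lens[i]));
	return 0;
}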

/* file data extents have start and len in key */
struct scoutfs_data_extent_val {
	__le64 blkno;
@@ -676,12 +582,6 @@ struct scoutfs_xattr {
#define SCOUTFS_QUORUM_HB_IVAL_MS 100
#define SCOUTFS_QUORUM_HB_TIMEO_MS (5 * MSEC_PER_SEC)

/*
 * A newly elected leader will give fencing some time before giving up and
 * shutting down.
 */
#define SCOUTFS_QUORUM_FENCE_TO_MS (15 * MSEC_PER_SEC)

struct scoutfs_quorum_message {
	__le64 fsid;
	__le64 version;
@@ -713,60 +613,18 @@ struct scoutfs_quorum_config {
	} slots[SCOUTFS_QUORUM_MAX_SLOTS];
};

enum {
	SCOUTFS_QUORUM_EVENT_BEGIN,	/* quorum service starting up */
	SCOUTFS_QUORUM_EVENT_TERM,	/* updated persistent term */
	SCOUTFS_QUORUM_EVENT_ELECT,	/* won election */
	SCOUTFS_QUORUM_EVENT_FENCE,	/* server fenced others */
	SCOUTFS_QUORUM_EVENT_STOP,	/* server stopped */
	SCOUTFS_QUORUM_EVENT_END,	/* quorum service shutting down */
	SCOUTFS_QUORUM_EVENT_NR,
};

struct scoutfs_quorum_block {
	struct scoutfs_block_header hdr;
	__le64 term;
	__le64 random_write_mark;
	__le64 flags;
	struct scoutfs_quorum_block_event {
		__le64 rid;
		__le64 term;
		struct scoutfs_timespec ts;
	} events[SCOUTFS_QUORUM_EVENT_NR];
	} write, update_term, set_leader, clear_leader, fenced;
};

/*
 * Tunable options that apply to the entire system. They can be set in
 * mkfs or in sysfs files which send an rpc to the server to make the
 * change. The super version defines the options that exist.
 *
 * @set_bits: bit N indicates that the option stored in the Nth 64bit
 * field after set_bits has been set.
 *
 * @data_alloc_zone_blocks: if set, the data device is logically divided
 * into contiguous zones of this many blocks. Data allocation will try
 * to isolate allocated extents for each mount to their own zone. The
 * zone size must be larger than the data alloc high water mark and
 * large enough such that the number of zones is kept within its static
 * limit.
 */
struct scoutfs_volume_options {
	__le64 set_bits;
	__le64 data_alloc_zone_blocks;
	__le64 __future_expansion[63];
};

#define scoutfs_volopt_nr(field) \
	((offsetof(struct scoutfs_volume_options, field) - \
	  (offsetof(struct scoutfs_volume_options, set_bits) + \
	   member_sizeof(struct scoutfs_volume_options, set_bits))) / sizeof(__le64))
#define scoutfs_volopt_bit(field) \
	(1ULL << scoutfs_volopt_nr(field))

#define SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR \
	scoutfs_volopt_nr(data_alloc_zone_blocks)
#define SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT \
	scoutfs_volopt_bit(data_alloc_zone_blocks)

#define SCOUTFS_VOLOPT_EXPANSION_BITS \
	(~(scoutfs_volopt_bit(__future_expansion) - 1))
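
A worked example of the volopt macros: data_alloc_zone_blocks is the first field after set_bits, so its option number is 0 and its bit is 1ULL << 0. A standalone sketch with member_sizeof replaced by a plain sizeof:

#include <assert.h>
#include <stdint.h>
#include <stddef.h>

struct volume_options {
	uint64_t set_bits;
	uint64_t data_alloc_zone_blocks;
	uint64_t future_expansion[63];
};

#define volopt_nr(field)						\
	((offsetof(struct volume_options, field) -			\
	  (offsetof(struct volume_options, set_bits) +			\
	   sizeof(uint64_t))) / sizeof(uint64_t))
#define volopt_bit(field) (1ULL << volopt_nr(field))

int main(void)
{
	/* the first option after set_bits lands on bit 0 */
	static_assert(volopt_nr(data_alloc_zone_blocks) == 0, "nr");
	static_assert(volopt_bit(data_alloc_zone_blocks) == 1ULL, "bit");
	/* every bit at or above the expansion array is reserved */
	static_assert(volopt_nr(future_expansion) == 1, "expansion nr");
	return 0;
}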
#define SCOUTFS_QUORUM_BLOCK_LEADER (1 << 0)

#define SCOUTFS_FLAG_IS_META_BDEV 0x01

@@ -776,8 +634,8 @@ struct scoutfs_super_block {
	__le64 version;
	__le64 flags;
	__u8 uuid[SCOUTFS_UUID_BYTES];
	__le64 seq;
	__le64 next_ino;
	__le64 next_trans_seq;
	__le64 total_meta_blocks;	/* both static and dynamic */
	__le64 first_meta_blkno;	/* first dynamically allocated */
	__le64 last_meta_blkno;
@@ -791,11 +649,9 @@ struct scoutfs_super_block {
	struct scoutfs_alloc_list_head server_meta_freed[2];
	struct scoutfs_btree_root fs_root;
	struct scoutfs_btree_root logs_root;
	struct scoutfs_btree_root log_merge;
	struct scoutfs_btree_root trans_seqs;
	struct scoutfs_btree_root mounted_clients;
	struct scoutfs_btree_root srch_root;
	struct scoutfs_volume_options volopt;
};

#define SCOUTFS_ROOT_INO 1

@@ -821,6 +677,7 @@ struct scoutfs_super_block {
 * online by staging.
 *
 * XXX
 * - otime?
 * - compat flags?
 * - version?
 * - generation?
@@ -844,7 +701,6 @@ struct scoutfs_inode {
	struct scoutfs_timespec atime;
	struct scoutfs_timespec ctime;
	struct scoutfs_timespec mtime;
	struct scoutfs_timespec crtime;
};

#define SCOUTFS_INO_FLAG_TRUNCATE 0x1
@@ -984,12 +840,7 @@ enum scoutfs_net_cmd {
	SCOUTFS_NET_CMD_LOCK_RECOVER,
	SCOUTFS_NET_CMD_SRCH_GET_COMPACT,
	SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT,
	SCOUTFS_NET_CMD_GET_LOG_MERGE,
	SCOUTFS_NET_CMD_COMMIT_LOG_MERGE,
	SCOUTFS_NET_CMD_OPEN_INO_MAP,
	SCOUTFS_NET_CMD_GET_VOLOPT,
	SCOUTFS_NET_CMD_SET_VOLOPT,
	SCOUTFS_NET_CMD_CLEAR_VOLOPT,
	SCOUTFS_NET_CMD_FAREWELL,
	SCOUTFS_NET_CMD_UNKNOWN,
};
@@ -1034,7 +885,7 @@ struct scoutfs_net_roots {

struct scoutfs_net_lock {
	struct scoutfs_key key;
	__le64 write_seq;
	__le64 write_version;
	__u8 old_mode;
	__u8 new_mode;
	__u8 __pad[6];

368
kmod/src/inode.c
@@ -34,7 +34,6 @@
#include "client.h"
#include "cmp.h"
#include "omap.h"
#include "forest.h"

/*
 * XXX
@@ -55,19 +54,10 @@ struct inode_allocator {
};

struct inode_sb_info {
	struct super_block *sb;
	bool stopped;

	spinlock_t writeback_lock;
	struct rb_root writeback_inodes;
	struct inode_allocator dir_ino_alloc;
	struct inode_allocator ino_alloc;

	struct delayed_work orphan_scan_dwork;

	/* serialize multiple inode ->evict trying to delete same ino's items */
	spinlock_t deleting_items_lock;
	struct list_head deleting_items_list;
};

#define DECLARE_INODE_SB_INFO(sb, name) \
@@ -262,8 +252,6 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
	si->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos);
	si->next_xattr_id = le64_to_cpu(cinode->next_xattr_id);
	si->flags = le32_to_cpu(cinode->flags);
	si->crtime.tv_sec = le64_to_cpu(cinode->crtime.sec);
	si->crtime.tv_nsec = le32_to_cpu(cinode->crtime.nsec);

	/*
	 * i_blocks is initialized from online and offline and is then
@@ -364,7 +352,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
	if (!S_ISREG(inode->i_mode))
		return 0;

	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true);
	if (ret)
		return ret;

@@ -391,7 +379,7 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)
	LIST_HEAD(ind_locks);
	int ret;

	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false);
	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
	if (ret)
		return ret;

@@ -506,7 +494,7 @@ retry:
		}
	}

	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false);
	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
	if (ret)
		goto out;

@@ -660,22 +648,12 @@ void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off)
	} while (read_seqcount_retry(&si->seqcount, seq));
}

/*
 * We have inversions between getting cluster locks while performing
 * final deletion on a freeing inode and waiting on a freeing inode
 * while holding a cluster lock.
 *
 * We can avoid these deadlocks by hiding freeing inodes in our hash
 * lookup function. We're fine with either returning null or populating
 * a new inode overlapping with eviction freeing a previous instance of
 * the inode.
 */
static int scoutfs_iget_test(struct inode *inode, void *arg)
{
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	u64 *ino = arg;

	return (si->ino == *ino) && !(inode->i_state & I_FREEING);
	return si->ino == *ino;
}

static int scoutfs_iget_set(struct inode *inode, void *arg)
@@ -694,6 +672,28 @@ struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino)
	return ilookup5(sb, ino, scoutfs_iget_test, &ino);
}

static int iget_test_nofreeing(struct inode *inode, void *arg)
{
	return !(inode->i_state & I_FREEING) && scoutfs_iget_test(inode, arg);
}

/*
 * There's a natural risk of a deadlock between lock invalidation and
 * eviction. Invalidation blocks locks while looking up inodes and
 * invalidating local caches. Inode eviction gets a lock to check final
 * inode deletion while the inode is marked FREEING which blocks
 * lookups.
 *
 * We have a lookup variant which doesn't return I_FREEING inodes
 * instead of waiting on them. If an inode has made it to I_FREEING
 * then it doesn't have any local caches that are reachable and the lock
 * invalidation promise is kept.
 */
struct inode *scoutfs_ilookup_nofreeing(struct super_block *sb, u64 ino)
{
	return ilookup5(sb, ino, iget_test_nofreeing, &ino);
}

struct inode *scoutfs_iget(struct super_block *sb, u64 ino)
{
	struct scoutfs_lock *lock = NULL;
@@ -766,9 +766,6 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
	cinode->next_readdir_pos = cpu_to_le64(si->next_readdir_pos);
	cinode->next_xattr_id = cpu_to_le64(si->next_xattr_id);
	cinode->flags = cpu_to_le32(si->flags);
	cinode->crtime.sec = cpu_to_le64(si->crtime.tv_sec);
	cinode->crtime.nsec = cpu_to_le32(si->crtime.tv_nsec);
	memset(cinode->crtime.__pad, 0, sizeof(cinode->crtime.__pad));
}

/*
@@ -1222,7 +1219,7 @@ int scoutfs_inode_index_start(struct super_block *sb, u64 *seq)
 * Returns > 0 if the seq changed and the locks should be retried.
 */
int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
				      struct list_head *list, u64 seq, bool allocing)
				      struct list_head *list, u64 seq)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct index_lock *ind_lock;
@@ -1238,7 +1235,7 @@ int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
		goto out;
	}

	ret = scoutfs_hold_trans(sb, allocing);
	ret = scoutfs_hold_trans(sb);
	if (ret == 0 && seq != sbi->trans_seq) {
		scoutfs_release_trans(sb);
		ret = 1;
@@ -1252,7 +1249,7 @@ out:
}

int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
				  bool set_data_seq, bool allocing)
				  bool set_data_seq)
{
	struct super_block *sb = inode->i_sb;
	int ret;
@@ -1262,7 +1259,7 @@ int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
		ret = scoutfs_inode_index_start(sb, &seq) ?:
		      scoutfs_inode_index_prepare(sb, list, inode,
						  set_data_seq) ?:
		      scoutfs_inode_index_try_lock_hold(sb, list, seq, allocing);
		      scoutfs_inode_index_try_lock_hold(sb, list, seq);
	} while (ret > 0);

	return ret;
@@ -1452,74 +1449,41 @@ out:
	return inode;
}

static void init_orphan_key(struct scoutfs_key *key, u64 ino)
static void init_orphan_key(struct scoutfs_key *key, u64 rid, u64 ino)
{
	*key = (struct scoutfs_key) {
		.sk_zone = SCOUTFS_ORPHAN_ZONE,
		.sko_ino = cpu_to_le64(ino),
		.sk_zone = SCOUTFS_RID_ZONE,
		.sko_rid = cpu_to_le64(rid),
		.sk_type = SCOUTFS_ORPHAN_TYPE,
		.sko_ino = cpu_to_le64(ino),
	};
}
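
A sketch of what the rid-based orphan key buys: keys sort by (zone, rid, type, ino), so one mount's orphans are contiguous and can be scanned with a [0, ~0] ino range under its own rid. The field widths below are simplified stand-ins for the real packed key:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* simplified stand-in for the rid zone orphan key */
struct okey {
	uint8_t zone;
	uint64_t rid;
	uint8_t type;
	uint64_t ino;
};

#define RID_ZONE	2
#define ORPHAN_TYPE	1

static void init_orphan_key(struct okey *key, uint64_t rid, uint64_t ino)
{
	memset(key, 0, sizeof(*key));
	key->zone = RID_ZONE;
	key->rid = rid;
	key->type = ORPHAN_TYPE;
	key->ino = ino;
}

int main(void)
{
	struct okey first, last;

	/* a mount scans only its own orphans by iterating its rid's range */
	init_orphan_key(&first, 1001, 0);
	init_orphan_key(&last, 1001, ~0ULL);
	printf("scan rid %llu from ino %llu to ino %llu\n",
	       (unsigned long long)first.rid,
	       (unsigned long long)first.ino,
	       (unsigned long long)last.ino);
	return 0;
}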

/*
 * Create an orphan item. The orphan items are maintained in their own
 * zone under a write only lock while the caller has the inode protected
 * by a write lock.
 */
int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
int scoutfs_orphan_dirty(struct super_block *sb, u64 ino)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_lock *lock = sbi->rid_lock;
	struct scoutfs_key key;

	init_orphan_key(&key, ino);
	init_orphan_key(&key, sbi->rid, ino);

	return scoutfs_item_create_force(sb, &key, NULL, 0, lock);
	return scoutfs_item_dirty(sb, &key, lock);
}

int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
int scoutfs_orphan_delete(struct super_block *sb, u64 ino)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_lock *lock = sbi->rid_lock;
	struct scoutfs_key key;
	int ret;

	init_orphan_key(&key, ino);
	init_orphan_key(&key, sbi->rid, ino);

	return scoutfs_item_delete_force(sb, &key, lock);
}
	ret = scoutfs_item_delete(sb, &key, lock);
	if (ret == -ENOENT)
		ret = 0;

struct deleting_ino_entry {
	struct list_head head;
	u64 ino;
};

static bool added_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del, u64 ino)
{
	struct deleting_ino_entry *tmp;
	bool added = true;

	spin_lock(&inf->deleting_items_lock);

	list_for_each_entry(tmp, &inf->deleting_items_list, head) {
		if (tmp->ino == ino) {
			added = false;
			break;
		}
	}

	if (added) {
		del->ino = ino;
		list_add_tail(&del->head, &inf->deleting_items_list);
	}

	spin_unlock(&inf->deleting_items_lock);

	return added;
}
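
The claim-before-delete pattern above, sketched in userspace with a mutex and a singly linked list; duplicate claimers just see false and treat the deletion as already in progress:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct claim {
	struct claim *next;
	uint64_t ino;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct claim *claims;

/* true if we claimed ino; false if another task is already deleting it */
static bool claim_ino(struct claim *c, uint64_t ino)
{
	struct claim *tmp;
	bool added = true;

	pthread_mutex_lock(&lock);
	for (tmp = claims; tmp; tmp = tmp->next) {
		if (tmp->ino == ino) {
			added = false;
			break;
		}
	}
	if (added) {
		c->ino = ino;
		c->next = claims;
		claims = c;
	}
	pthread_mutex_unlock(&lock);
	return added;
}

int main(void)
{
	struct claim a, b;

	printf("first claim: %d\n", claim_ino(&a, 42));	/* 1 */
	printf("duplicate:   %d\n", claim_ino(&b, 42));	/* 0 */
	return 0;
}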

static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del)
{
	if (del->ino) {
		spin_lock(&inf->deleting_items_lock);
		list_del_init(&del->head);
		spin_unlock(&inf->deleting_items_lock);
	}
	return ret;
}

/*
@@ -1530,21 +1494,9 @@ static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entr
 * orphan item will continue triggering attempts to finish previous
 * partial deletion until all deletion is complete and the orphan item
 * is removed.
 *
 * Currently this can be called multiple times for multiple cached
 * inodes for a given ino number (ilookup avoids freeing inodes to avoid
 * cluster lock<->inode flag waiting inversions). Some items are not
 * safe to delete concurrently, for example concurrent data truncation
 * could free extents multiple times. We use a very silly list of inos
 * being deleted. Duplicates just return success. If the first
 * deletion ends up failing orphan deletion will come back around later
 * and retry.
 */
static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock,
			      struct scoutfs_lock *orph_lock)
static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
{
	DECLARE_INODE_SB_INFO(sb, inf);
	struct deleting_ino_entry del = {{NULL, }};
	struct scoutfs_inode sinode;
	struct scoutfs_key key;
	LIST_HEAD(ind_locks);
@@ -1554,11 +1506,6 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
	u64 size;
	int ret;

	if (!added_deleting_ino(inf, &del, ino)) {
		ret = 0;
		goto out;
	}

	init_inode_key(&key, ino);

	ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode),
@@ -1596,7 +1543,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
retry:
	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
	      prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?:
	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
	if (ret > 0)
		goto retry;
	if (ret)
@@ -1618,9 +1565,8 @@ retry:
	if (ret)
		goto out;

	ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock);
	ret = scoutfs_orphan_delete(sb, ino);
out:
	del_deleting_ino(inf, &del);
	if (release)
		scoutfs_release_trans(sb);
	scoutfs_inode_index_unlock(sb, &ind_locks);
@@ -1634,17 +1580,11 @@ out:
 * tear down. We use locking and open inode number bitmaps to decide if
 * we should finally destroy an inode that is no longer open nor
 * reachable through directory entries.
 *
 * Because lookup ignores freeing inodes we can get here from multiple
 * instances of an inode that is being deleted. Orphan scanning in
 * particular can race with deletion. delete_inode_items() resolves
 * concurrent attempts.
 */
void scoutfs_evict_inode(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	const u64 ino = scoutfs_ino(inode);
	struct scoutfs_lock *orph_lock;
	struct scoutfs_lock *lock;
	int ret;

@@ -1656,21 +1596,14 @@ void scoutfs_evict_inode(struct inode *inode)

	truncate_inode_pages_final(&inode->i_data);

	ret = scoutfs_omap_should_delete(sb, inode, &lock, &orph_lock);
	ret = scoutfs_omap_should_delete(sb, inode, &lock);
	if (ret > 0) {
		ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock, orph_lock);
		ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock);
		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
		scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
	}
	if (ret == -ERESTARTSYS) {
		/* can be in a task with a signal pending, could be found as orphan */
		scoutfs_inc_counter(sb, inode_evict_intr);
		ret = 0;
	}
	if (ret < 0) {
	if (ret < 0)
		scoutfs_err(sb, "error %d while checking to delete inode nr %llu, it might linger.",
			    ret, ino);
	}

	scoutfs_omap_dec(sb, ino);

@@ -1705,141 +1638,75 @@ int scoutfs_drop_inode(struct inode *inode)
}

/*
 * All mounts are performing this work concurrently. We introduce
 * significant jitter between them to try to keep them from all
 * bunching up and working on the same inodes.
 * Find orphan items and process each one.
 *
 * Runtime of this will be bounded by the number of orphans, which could
 * theoretically be very large. If that becomes a problem we might want to push
 * this work off to a thread.
 *
 * This only scans orphans for this node. This will need to be covered by
 * the rest of node zone cleanup.
 */
static void schedule_orphan_dwork(struct inode_sb_info *inf)
int scoutfs_scan_orphans(struct super_block *sb)
{
#define ORPHAN_SCAN_MIN_MS (10 * MSEC_PER_SEC)
#define ORPHAN_SCAN_JITTER_MS (40 * MSEC_PER_SEC)
	unsigned long delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS +
					       prandom_u32_max(ORPHAN_SCAN_JITTER_MS));
	if (!inf->stopped) {
		delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS +
					 prandom_u32_max(ORPHAN_SCAN_JITTER_MS));
		schedule_delayed_work(&inf->orphan_scan_dwork, delay);
	}
}
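
The jittered reschedule in the removed worker spreads the mounts' scans over a window. A userspace sketch of the same arithmetic, with rand() standing in for prandom_u32_max():

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define SCAN_MIN_MS	(10 * 1000)
#define SCAN_JITTER_MS	(40 * 1000)

int main(void)
{
	int i;

	srand((unsigned)time(NULL));
	/* each mount reschedules at a different point in [10s, 50s) so the
	 * scanners spread out instead of bunching up on the same orphans */
	for (i = 0; i < 3; i++)
		printf("next scan in %d ms\n",
		       SCAN_MIN_MS + rand() % SCAN_JITTER_MS);
	return 0;
}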

/*
 * Find and delete inodes whose only remaining reference is the
 * persistent orphan item that was created as they were unlinked.
 *
 * Orphan items are created as the final directory entry referring to an
 * inode is deleted. They're deleted as the final cached inode is
 * evicted and the inode items are destroyed. They can linger if all
 * the cached inodes pinning the inode fail to delete as they are
 * evicted from the cache -- either through crashing or errors.
 *
 * This work runs in all mounts in the background looking for orphaned
 * inodes that should be deleted.
 *
 * We use the forest hint call to read the persistent forest trees
 * looking for orphan items without creating lock contention. Orphan
 * items exist for O_TMPFILE users and we don't want to force them to
 * commit by trying to acquire a conflicting read lock on the orphan
 * zone. There's no rush to reclaim deleted items, eventually they will
 * be found in the persistent item btrees.
 *
 * Once we find candidate orphan items we can first check our local
 * inode cache for inodes that are already on their way to eviction and
 * can be skipped. Then we ask the server for the open map containing
 * the inode. Only if we don't have it cached, and no one else does, do
 * we try to read it into our cache and evict it to trigger the final
 * inode deletion process.
 *
 * Orphaned items that make it that far should be very rare. They can
 * only exist if all the mounts that were using an inode after it had
 * been unlinked (or created with o_tmpfile) didn't unmount cleanly.
 */
static void inode_orphan_scan_worker(struct work_struct *work)
{
	struct inode_sb_info *inf = container_of(work, struct inode_sb_info,
						 orphan_scan_dwork.work);
	struct super_block *sb = inf->sb;
	struct scoutfs_open_ino_map omap;
	struct scoutfs_key last;
	struct scoutfs_key next;
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_lock *lock = sbi->rid_lock;
	struct scoutfs_lock *inode_lock = NULL;
	struct scoutfs_key key;
	struct inode *inode;
	u64 group_nr;
	int bit_nr;
	struct scoutfs_key last;
	u64 ino;
	int err = 0;
	int ret;

	scoutfs_inc_counter(sb, orphan_scan);
	trace_scoutfs_scan_orphans(sb);

	init_orphan_key(&last, U64_MAX);
	omap.args.group_nr = cpu_to_le64(U64_MAX);
	init_orphan_key(&key, sbi->rid, 0);
	init_orphan_key(&last, sbi->rid, ~0ULL);

	for (ino = SCOUTFS_ROOT_INO + 1; ino != 0; ino++) {
		if (inf->stopped) {
			ret = 0;
			goto out;
		}

		/* find the next orphan item */
		init_orphan_key(&key, ino);
		ret = scoutfs_forest_next_hint(sb, &key, &next);
		if (ret < 0) {
			if (ret == -ENOENT)
				break;
			goto out;
		}

		if (scoutfs_key_compare(&next, &last) > 0)
	while (1) {
		ret = scoutfs_item_next(sb, &key, &last, NULL, 0, lock);
		if (ret == -ENOENT) /* No more orphan items */
			break;
		if (ret < 0)
			goto out;

		scoutfs_inc_counter(sb, orphan_scan_item);
		ino = le64_to_cpu(next.sko_ino);
		ino = le64_to_cpu(key.sko_ino);

		/* locally cached inodes will already be deleted */
		inode = scoutfs_ilookup(sb, ino);
		if (inode) {
			scoutfs_inc_counter(sb, orphan_scan_cached);
			iput(inode);
			continue;
		ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &inode_lock);
		if (ret == 0) {
			ret = delete_inode_items(sb, le64_to_cpu(key.sko_ino), inode_lock);
			scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
		}
		if (ret && ret != -ENOENT && !err)
			err = ret;

		/* get an omap that covers the orphaned ino */
		group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT;
		bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK;

		if (le64_to_cpu(omap.args.group_nr) != group_nr) {
			ret = scoutfs_client_open_ino_map(sb, group_nr, &omap);
			if (ret < 0)
				goto out;
		if (le64_to_cpu(key.sko_ino) == U64_MAX) {
			ret = -ENOENT;
			break;
		}

		/* don't need to evict if someone else has it open (cached) */
		if (test_bit_le(bit_nr, omap.bits)) {
			scoutfs_inc_counter(sb, orphan_scan_omap_set);
			continue;
		}

		/* try to cache and evict an unused inode to delete, can be racing */
		inode = scoutfs_iget(sb, ino);
		if (IS_ERR(inode)) {
			ret = PTR_ERR(inode);
			if (ret == -ENOENT)
				continue;
			else
				goto out;
		}

		scoutfs_inc_counter(sb, orphan_scan_read);
		SCOUTFS_I(inode)->drop_invalidated = true;
		iput(inode);
		le64_add_cpu(&key.sko_ino, 1);
	}

	ret = 0;

out:
	if (ret < 0)
		scoutfs_inc_counter(sb, orphan_scan_error);
	return err ? err : ret;
}

	schedule_orphan_dwork(inf);
int scoutfs_orphan_inode(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct scoutfs_lock *lock = sbi->rid_lock;
	struct scoutfs_key key;
	int ret;

	trace_scoutfs_orphan_inode(sb, inode);

	init_orphan_key(&key, sbi->rid, scoutfs_ino(inode));

	ret = scoutfs_item_create(sb, &key, NULL, 0, lock);

	return ret;
}
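
The old scan asked the server for open ino maps by splitting the inode number with a shift and mask. A sketch of that decomposition; the shift of 10 is an arbitrary example, not the kernel's real SCOUTFS_OPEN_INO_MAP_SHIFT value:

#include <stdint.h>
#include <stdio.h>

/* example values only; the real shift constant may differ */
#define MAP_SHIFT 10
#define MAP_MASK  ((1ULL << MAP_SHIFT) - 1)

int main(void)
{
	uint64_t ino = 123456;
	uint64_t group_nr = ino >> MAP_SHIFT;	/* which map to ask the server for */
	unsigned bit_nr = ino & MAP_MASK;	/* which bit covers this ino */

	printf("ino %llu -> group %llu, bit %u\n",
	       (unsigned long long)ino,
	       (unsigned long long)group_nr, bit_nr);
	return 0;
}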

/*
@@ -1948,43 +1815,16 @@ int scoutfs_inode_setup(struct super_block *sb)
	if (!inf)
		return -ENOMEM;

	inf->sb = sb;
	spin_lock_init(&inf->writeback_lock);
	inf->writeback_inodes = RB_ROOT;
	spin_lock_init(&inf->dir_ino_alloc.lock);
	spin_lock_init(&inf->ino_alloc.lock);
	INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker);
	spin_lock_init(&inf->deleting_items_lock);
	INIT_LIST_HEAD(&inf->deleting_items_list);

	sbi->inode_sb_info = inf;

	return 0;
}

/*
 * Our inode subsystem is setup pretty early but orphan scanning uses
 * many other subsystems like networking and the server. We only kick
 * it off once everything is ready.
 */
int scoutfs_inode_start(struct super_block *sb)
{
	DECLARE_INODE_SB_INFO(sb, inf);

	schedule_orphan_dwork(inf);
	return 0;
}

void scoutfs_inode_stop(struct super_block *sb)
{
	DECLARE_INODE_SB_INFO(sb, inf);

	if (inf) {
		inf->stopped = true;
		cancel_delayed_work_sync(&inf->orphan_scan_dwork);
	}
}

void scoutfs_inode_destroy(struct super_block *sb)
{
	struct inode_sb_info *inf = SCOUTFS_SB(sb)->inode_sb_info;

@@ -20,7 +20,6 @@ struct scoutfs_inode_info {
	u64 online_blocks;
	u64 offline_blocks;
	u32 flags;
	struct timespec crtime;

	/*
	 * Protects per-inode extent items, most particularly readers
@@ -76,9 +75,11 @@ struct inode *scoutfs_alloc_inode(struct super_block *sb);
void scoutfs_destroy_inode(struct inode *inode);
int scoutfs_drop_inode(struct inode *inode);
void scoutfs_evict_inode(struct inode *inode);
int scoutfs_orphan_inode(struct inode *inode);

struct inode *scoutfs_iget(struct super_block *sb, u64 ino);
struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino);
struct inode *scoutfs_ilookup_nofreeing(struct super_block *sb, u64 ino);

void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major,
				  u32 minor, u64 ino);
@@ -89,9 +90,9 @@ int scoutfs_inode_index_prepare_ino(struct super_block *sb,
				    struct list_head *list, u64 ino,
				    umode_t mode);
int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
				      struct list_head *list, u64 seq, bool allocing);
				      struct list_head *list, u64 seq);
int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
				  bool set_data_seq, bool allocing);
				  bool set_data_seq);
void scoutfs_inode_index_unlock(struct super_block *sb, struct list_head *list);

int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock);
@@ -120,8 +121,9 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
		    struct kstat *stat);
int scoutfs_setattr(struct dentry *dentry, struct iattr *attr);

int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
int scoutfs_scan_orphans(struct super_block *sb);
int scoutfs_orphan_dirty(struct super_block *sb, u64 ino);
int scoutfs_orphan_delete(struct super_block *sb, u64 ino);

void scoutfs_inode_queue_writeback(struct inode *inode);
int scoutfs_inode_walk_writeback(struct super_block *sb, bool write);
@@ -132,8 +134,6 @@ void scoutfs_inode_exit(void);
int scoutfs_inode_init(void);

int scoutfs_inode_setup(struct super_block *sb);
int scoutfs_inode_start(struct super_block *sb);
void scoutfs_inode_stop(struct super_block *sb);
void scoutfs_inode_destroy(struct super_block *sb);

#endif

@@ -38,7 +38,6 @@
#include "hash.h"
#include "srch.h"
#include "alloc.h"
#include "server.h"
#include "scoutfs_trace.h"

/*
@@ -541,7 +540,6 @@ out:
static long scoutfs_ioc_stat_more(struct file *file, unsigned long arg)
{
	struct inode *inode = file_inode(file);
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	struct scoutfs_ioctl_stat_more stm;

	if (get_user(stm.valid_bytes, (__u64 __user *)arg))
@@ -553,8 +551,6 @@ static long scoutfs_ioc_stat_more(struct file *file, unsigned long arg)
	stm.data_seq = scoutfs_inode_data_seq(inode);
	stm.data_version = scoutfs_inode_data_version(inode);
	scoutfs_inode_get_onoff(inode, &stm.online_blocks, &stm.offline_blocks);
	stm.crtime_sec = si->crtime.tv_sec;
	stm.crtime_nsec = si->crtime.tv_nsec;

	if (copy_to_user((void __user *)arg, &stm, stm.valid_bytes))
		return -EFAULT;
@@ -620,7 +616,6 @@ static long scoutfs_ioc_data_waiting(struct file *file, unsigned long arg)
static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
{
	struct inode *inode = file->f_inode;
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	struct super_block *sb = inode->i_sb;
	struct scoutfs_ioctl_setattr_more __user *usm = (void __user *)arg;
	struct scoutfs_ioctl_setattr_more sm;
@@ -679,7 +674,7 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)

	/* setting only so we don't see 0 data seq with nonzero data_version */
	set_data_seq = sm.data_version != 0 ? true : false;
	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq, false);
	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq);
	if (ret)
		goto unlock;

@@ -689,8 +684,6 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
	i_size_write(inode, sm.i_size);
	inode->i_ctime.tv_sec = sm.ctime_sec;
	inode->i_ctime.tv_nsec = sm.ctime_nsec;
	si->crtime.tv_sec = sm.crtime_sec;
	si->crtime.tv_nsec = sm.crtime_nsec;

	scoutfs_update_inode_item(inode, lock, &ind_locks);
	ret = 0;
@@ -886,7 +879,6 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg)
	sfm.rid = sbi->rid;
	sfm.total_meta_blocks = le64_to_cpu(super->total_meta_blocks);
	sfm.total_data_blocks = le64_to_cpu(super->total_data_blocks);
	sfm.reserved_meta_blocks = scoutfs_server_reserved_meta_blocks(sb);

	ret = scoutfs_client_get_last_seq(sb, &sfm.committed_seq);
	if (ret)

@@ -232,9 +232,6 @@ struct scoutfs_ioctl_stat_more {
	__u64 data_version;
	__u64 online_blocks;
	__u64 offline_blocks;
	__u64 crtime_sec;
	__u32 crtime_nsec;
	__u8 _pad[4];
};

#define SCOUTFS_IOC_STAT_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 5, \
@@ -278,8 +275,7 @@ struct scoutfs_ioctl_setattr_more {
	__u64 flags;
	__u64 ctime_sec;
	__u32 ctime_nsec;
	__u32 crtime_nsec;
	__u64 crtime_sec;
	__u8 _pad[4];
};

#define SCOUTFS_IOC_SETATTR_MORE_OFFLINE (1 << 0)
@@ -375,7 +371,6 @@ struct scoutfs_ioctl_statfs_more {
	__u64 committed_seq;
	__u64 total_meta_blocks;
	__u64 total_data_blocks;
	__u64 reserved_meta_blocks;
};

#define SCOUTFS_IOC_STATFS_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 10, \

225
kmod/src/item.c
@@ -95,7 +95,7 @@ struct item_cache_info {

	/* written by page readers, read by shrink */
	spinlock_t active_lock;
	struct list_head active_list;
	struct rb_root active_root;
};

#define DECLARE_ITEM_CACHE_INFO(sb, name) \
@@ -127,7 +127,6 @@ struct cached_page {
	unsigned long lru_time;
	struct list_head dirty_list;
	struct list_head dirty_head;
	u64 max_liv_seq;
	struct page *page;
	unsigned int page_off;
	unsigned int erased_bytes;
@@ -150,8 +149,7 @@ struct cached_item {

static int item_val_bytes(int val_len)
{
	return round_up(offsetof(struct cached_item, val[val_len]),
			CACHED_ITEM_ALIGN);
	return round_up(offsetof(struct cached_item, val[val_len]), CACHED_ITEM_ALIGN);
}
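
Both versions of item_val_bytes() charge an item for its header plus value rounded up to the cache alignment, so the byte counts used by erase and compaction agree. A standalone sketch with an 8 byte alignment stand-in:

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

#define ALIGN_TO 8	/* stand-in for CACHED_ITEM_ALIGN */
#define ROUND_UP(x, a) (((x) + (a) - 1) / (a) * (a))

struct item {
	uint64_t seq;
	uint16_t val_len;
	uint8_t val[];
};

static size_t item_bytes(int val_len)
{
	/* header plus value, rounded up so the next item stays aligned */
	return ROUND_UP(offsetof(struct item, val) + (size_t)val_len, ALIGN_TO);
}

int main(void)
{
	int len;

	for (len = 0; len <= 24; len += 6)
		printf("val_len %2d -> %zu bytes\n", len, item_bytes(len));
	return 0;
}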

/*
@@ -347,8 +345,7 @@ static struct cached_page *alloc_pg(struct super_block *sb, gfp_t gfp)
	page = alloc_page(GFP_NOFS | gfp);
	if (!page || !pg) {
		kfree(pg);
		if (page)
			__free_page(page);
		__free_page(page);
		return NULL;
	}

@@ -386,14 +383,6 @@ static void put_pg(struct super_block *sb, struct cached_page *pg)
	}
}

static void update_pg_max_liv_seq(struct cached_page *pg, struct cached_item *item)
{
	u64 liv_seq = le64_to_cpu(item->liv.seq);

	if (liv_seq > pg->max_liv_seq)
		pg->max_liv_seq = liv_seq;
}

/*
 * Allocate space for a new item from the free offset at the end of a
 * cached page. This isn't a blocking allocation, and it's likely that
@@ -425,15 +414,14 @@ static struct cached_item *alloc_item(struct cached_page *pg,
	if (val_len)
		memcpy(item->val, val, val_len);

	update_pg_max_liv_seq(pg, item);

	return item;
}

static void erase_item(struct cached_page *pg, struct cached_item *item)
{
	rbtree_erase(&item->node, &pg->item_root);
	pg->erased_bytes += item_val_bytes(item->val_len);
	pg->erased_bytes += round_up(item_val_bytes(item->val_len),
				     CACHED_ITEM_ALIGN);
}

static void lru_add(struct super_block *sb, struct item_cache_info *cinf,
@@ -633,8 +621,6 @@ static void mark_item_dirty(struct super_block *sb,
		list_add_tail(&item->dirty_head, &pg->dirty_list);
		item->dirty = 1;
	}

	update_pg_max_liv_seq(pg, item);
}

static void clear_item_dirty(struct super_block *sb,
@@ -866,7 +852,8 @@ static void compact_page_items(struct super_block *sb,

	for (from = first_item(&pg->item_root); from; from = next_item(from)) {
		to = page_address(empty->page) + page_off;
		page_off += item_val_bytes(from->val_len);
		page_off += round_up(item_val_bytes(from->val_len),
				     CACHED_ITEM_ALIGN);

		/* copy the entire item, struct members and all */
		memcpy(to, from, item_val_bytes(from->val_len));
@@ -1273,76 +1260,46 @@ static int cache_empty_page(struct super_block *sb,
	return 0;
}

/*
 * Readers operate independently from dirty items and transactions.
 * They read a set of persistent items and insert them into the cache
 * when there aren't already pages whose key range contains the items.
 * This naturally prefers cached dirty items over stale read items.
 *
 * We have to deal with the case where dirty items are written and
 * invalidated while a read is in flight. The reader won't have seen
 * the items that were dirty in their persistent roots as they started
 * reading. By the time they insert their read pages the previously
 * dirty items have been reclaimed and are not in the cache. The old
 * stale items will be inserted in their place, effectively corrupting
 * by having the dirty items disappear.
 *
 * We fix this by tracking the max seq of items in pages. As readers
 * start they record the current transaction seq. Invalidation skips
 * pages with a max seq greater than the first reader seq because the
 * items in the page have to stick around to prevent the reader's stale
 * items from being inserted.
 *
 * This naturally only affects a small set of pages with items that were
 * written relatively recently. If we're under memory pressure then we
 * probably have a lot of pages and they'll naturally have items that
 * were visible to any readers. We don't bother with the complicated and
 * expensive further refinement of tracking the ranges that are being
 * read and comparing those with pages to invalidate.
 */
struct active_reader {
	struct list_head head;
	u64 seq;
	struct rb_node node;
	struct scoutfs_key start;
	struct scoutfs_key end;
};

#define INIT_ACTIVE_READER(rdr) \
	struct active_reader rdr = { .head = LIST_HEAD_INIT(rdr.head) }

static void add_active_reader(struct super_block *sb, struct active_reader *active)
{
	DECLARE_ITEM_CACHE_INFO(sb, cinf);

	BUG_ON(!list_empty(&active->head));

	active->seq = scoutfs_trans_sample_seq(sb);

	spin_lock(&cinf->active_lock);
	list_add_tail(&active->head, &cinf->active_list);
	spin_unlock(&cinf->active_lock);
}

static u64 first_active_reader_seq(struct item_cache_info *cinf)
static struct active_reader *active_rbtree_walk(struct rb_root *root,
						struct scoutfs_key *start,
						struct scoutfs_key *end,
						struct rb_node **par,
						struct rb_node ***pnode)
{
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;
	struct active_reader *ret = NULL;
	struct active_reader *active;
	u64 first;
	int cmp;

	/* only the calling task adds or deletes this active */
	spin_lock(&cinf->active_lock);
	active = list_first_entry_or_null(&cinf->active_list, struct active_reader, head);
	first = active ? active->seq : U64_MAX;
	spin_unlock(&cinf->active_lock);
	while (*node) {
		parent = *node;
		active = container_of(*node, struct active_reader, node);

	return first;
}

static void del_active_reader(struct item_cache_info *cinf, struct active_reader *active)
{
	/* only the calling task adds or deletes this active */
	if (!list_empty(&active->head)) {
		spin_lock(&cinf->active_lock);
		list_del_init(&active->head);
		spin_unlock(&cinf->active_lock);
		cmp = scoutfs_key_compare_ranges(start, end, &active->start,
						 &active->end);
		if (cmp < 0) {
			node = &(*node)->rb_left;
		} else if (cmp > 0) {
			node = &(*node)->rb_right;
		} else {
			ret = active;
			node = &(*node)->rb_left;
		}
	}

	if (par)
		*par = parent;
	if (pnode)
		*pnode = node;

	return ret;
}
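
The new walk is the classic leftmost-overlap search: descend left or right while the ranges are disjoint, and on overlap record the match but keep going left in case a smaller overlapping entry exists. The same shape over a plain binary tree of integer ranges:

#include <stdio.h>

struct node {
	struct node *left, *right;
	int start, end;	/* inclusive range, tree ordered by range */
};

/* -1: a entirely before b, 1: entirely after, 0: the ranges overlap */
static int cmp_ranges(int as, int ae, int bs, int be)
{
	if (ae < bs)
		return -1;
	if (as > be)
		return 1;
	return 0;
}

/* find the leftmost node whose range intersects [start, end] */
static struct node *walk(struct node *root, int start, int end)
{
	struct node *found = NULL;

	while (root) {
		int cmp = cmp_ranges(start, end, root->start, root->end);

		if (cmp < 0) {
			root = root->left;
		} else if (cmp > 0) {
			root = root->right;
		} else {
			found = root;		/* overlap, but a smaller */
			root = root->left;	/* match may sit to the left */
		}
	}
	return found;
}

int main(void)
{
	struct node a = { NULL, NULL, 0, 9 };
	struct node c = { NULL, NULL, 40, 49 };
	struct node b = { &a, &c, 20, 29 };	/* root */
	struct node *hit = walk(&b, 5, 45);	/* overlaps all three */

	printf("leftmost overlap: [%d, %d]\n", hit->start, hit->end);
	return 0;
}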

/*
@@ -1351,10 +1308,10 @@ static void del_active_reader(struct item_cache_info *cinf, struct active_reader
 * on our root and aren't in dirty or lru lists.
 *
 * We need to store deletion items here as we read items from all the
 * btrees so that they can override older items. The deletion items
 * will be deleted before we insert the pages into the cache. We don't
 * insert old versions of items into the tree here so that the trees
 * don't have to compare seqs.
 * btrees so that they can override older versions of the items. The
 * deletion items will be deleted before we insert the pages into the
 * cache. We don't insert old versions of items into the tree here so
 * that the trees don't have to compare versions.
 */
static int read_page_item(struct super_block *sb, struct scoutfs_key *key,
			  struct scoutfs_log_item_value *liv, void *val,
@@ -1374,7 +1331,7 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key,

	pg = page_rbtree_walk(sb, root, key, key, NULL, NULL, &p_par, &p_pnode);
	found = item_rbtree_walk(&pg->item_root, key, NULL, &par, &pnode);
	if (found && (le64_to_cpu(found->liv.seq) >= le64_to_cpu(liv->seq)))
	if (found && (le64_to_cpu(found->liv.vers) >= le64_to_cpu(liv->vers)))
		return 0;

	if (!page_has_room(pg, val_len)) {
@@ -1442,15 +1399,22 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key,
 * locks held, but without locking the cache. The regions we read can
 * be stale with respect to the current cache, which can be read and
 * dirtied by other cluster lock holders on our node, but the cluster
 * locks protect the stable items we read. Invalidation is careful not
 * to drop pages that have items that we couldn't see because they were
 * dirty when we started reading.
 * locks protect the stable items we read.
 *
 * There's also the exciting case where a reader can populate the cache
 * with stale old persistent data which was read before another local
 * cluster lock holder was able to read, dirty, write, and then shrink
 * the cache. In this case the cache couldn't be cleared by lock
 * invalidation because the caller is actively holding the lock. But
 * shrinking could evict the cache within the held lock. So we record
 * that we're an active reader in the range covered by the lock and
 * shrink will refuse to reclaim any pages that intersect with our read.
 */
static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
		      struct scoutfs_key *key, struct scoutfs_lock *lock)
{
	struct rb_root root = RB_ROOT;
	INIT_ACTIVE_READER(active);
	struct active_reader active;
	struct cached_page *right = NULL;
	struct cached_page *pg;
	struct cached_page *rd;
@@ -1466,6 +1430,15 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
	int pgi;
	int ret;

	/* stop shrink from freeing new clean data, would let us cache stale */
	active.start = lock->start;
	active.end = lock->end;
	spin_lock(&cinf->active_lock);
	active_rbtree_walk(&cinf->active_root, &active.start, &active.end,
			   &par, &pnode);
	rbtree_insert(&active.node, par, pnode, &cinf->active_root);
	spin_unlock(&cinf->active_lock);

	/* start with an empty page that covers the whole lock */
	pg = alloc_pg(sb, 0);
	if (!pg) {
@@ -1476,9 +1449,6 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
	pg->end = lock->end;
	rbtree_insert(&pg->node, NULL, &root.rb_node, &root);

	/* set active reader seq before reading persistent roots */
	add_active_reader(sb, &active);

	ret = scoutfs_forest_read_items(sb, lock, key, &start, &end,
					read_page_item, &root);
	if (ret < 0)
@@ -1556,7 +1526,9 @@ retry:

	ret = 0;
out:
	del_active_reader(cinf, &active);
	spin_lock(&cinf->active_lock);
	rbtree_erase(&active.node, &cinf->active_root);
	spin_unlock(&cinf->active_lock);

	/* free any pages we left dangling on error */
	for_each_page_safe(&root, rd, pg_tmp) {
@@ -1811,21 +1783,6 @@ out:
	return ret;
}

/*
 * An item's seq is the greater of the client transaction's seq and the
 * lock's write_seq. This ensures that multiple commits in one lock
 * grant will have increasing seqs, and new locks in open commits will
 * also increase the seqs. It lets us limit the inputs of item merging
 * to the last stable seq and ensure that all the items in open
 * transactions and granted locks will have greater seqs.
 */
static __le64 item_seq(struct super_block *sb, struct scoutfs_lock *lock)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);

	return cpu_to_le64(max(sbi->trans_seq, lock->write_seq));
}

/*
* Mark the item dirty. Dirtying while holding a transaction pins the
* page holding the item and guarantees that the item can be deleted or
@@ -1858,8 +1815,8 @@ int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key,
if (!item || item->deletion) {
ret = -ENOENT;
} else {
item->liv.seq = item_seq(sb, lock);
mark_item_dirty(sb, cinf, pg, NULL, item);
item->liv.vers = cpu_to_le64(lock->write_version);
ret = 0;
}

@@ -1879,7 +1836,7 @@ static int item_create(struct super_block *sb, struct scoutfs_key *key,
{
DECLARE_ITEM_CACHE_INFO(sb, cinf);
struct scoutfs_log_item_value liv = {
.seq = item_seq(sb, lock),
.vers = cpu_to_le64(lock->write_version),
};
struct cached_item *found;
struct cached_item *item;
@@ -1954,7 +1911,7 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key,
{
DECLARE_ITEM_CACHE_INFO(sb, cinf);
struct scoutfs_log_item_value liv = {
.seq = item_seq(sb, lock),
.vers = cpu_to_le64(lock->write_version),
};
struct cached_item *item;
struct cached_item *found;
@@ -1987,10 +1944,9 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key,
if (val_len)
memcpy(found->val, val, val_len);
if (val_len < found->val_len)
pg->erased_bytes += item_val_bytes(found->val_len) -
item_val_bytes(val_len);
pg->erased_bytes += found->val_len - val_len;
found->val_len = val_len;
found->liv.seq = liv.seq;
found->liv.vers = liv.vers;
mark_item_dirty(sb, cinf, pg, NULL, found);
} else {
item = alloc_item(pg, key, &liv, val, val_len);
@@ -2022,7 +1978,7 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key,
{
DECLARE_ITEM_CACHE_INFO(sb, cinf);
struct scoutfs_log_item_value liv = {
.seq = item_seq(sb, lock),
.vers = cpu_to_le64(lock->write_version),
};
struct cached_item *item;
struct cached_page *pg;
@@ -2064,11 +2020,10 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key,
erase_item(pg, item);
} else {
/* must emit deletion to clobber old persistent item */
item->liv.seq = liv.seq;
item->liv.vers = cpu_to_le64(lock->write_version);
item->liv.flags |= SCOUTFS_LOG_ITEM_FLAG_DELETION;
item->deletion = 1;
pg->erased_bytes += item_val_bytes(item->val_len) -
item_val_bytes(0);
pg->erased_bytes += item->val_len;
item->val_len = 0;
mark_item_dirty(sb, cinf, pg, NULL, item);
}
@@ -2151,7 +2106,7 @@ int scoutfs_item_write_dirty(struct super_block *sb)
struct page *page;
LIST_HEAD(pages);
LIST_HEAD(pos);
u64 max_seq = 0;
u64 max_vers = 0;
int val_len;
int bytes;
int off;
@@ -2216,7 +2171,7 @@ int scoutfs_item_write_dirty(struct super_block *sb)
val_len = sizeof(item->liv) + item->val_len;
bytes = offsetof(struct scoutfs_btree_item_list,
val[val_len]);
max_seq = max(max_seq, le64_to_cpu(item->liv.seq));
max_vers = max(max_vers, le64_to_cpu(item->liv.vers));

if (off + bytes > PAGE_SIZE) {
page = second;
@@ -2246,8 +2201,8 @@ int scoutfs_item_write_dirty(struct super_block *sb)
read_unlock(&pg->rwlock);
}

/* store max item seq in forest's log_trees */
scoutfs_forest_set_max_seq(sb, max_seq);
/* store max item vers in forest's log_trees */
scoutfs_forest_set_max_vers(sb, max_vers);

/* write all the dirty items into log btree blocks */
ret = scoutfs_forest_insert_list(sb, first);
@@ -2434,9 +2389,9 @@ retry:

/*
* Shrink the size of the item cache. We're operating against the fast
* path lock ordering and we skip pages if we can't acquire locks. We
* can run into dirty pages or pages with items that weren't visible to
* the earliest active reader which must be skipped.
* path lock ordering and we skip pages if we can't acquire locks.
* Similarly, we can run into dirty pages or pages which intersect with
* active readers that we can't shrink and also choose to skip.
*/
static int item_lru_shrink(struct shrinker *shrink,
struct shrink_control *sc)
@@ -2445,24 +2400,26 @@ static int item_lru_shrink(struct shrinker *shrink,
struct item_cache_info,
shrinker);
struct super_block *sb = cinf->sb;
struct active_reader *active;
struct cached_page *tmp;
struct cached_page *pg;
u64 first_reader_seq;
int nr;

if (sc->nr_to_scan == 0)
goto out;
nr = sc->nr_to_scan;

/* can't invalidate pages with items that weren't visible to first reader */
first_reader_seq = first_active_reader_seq(cinf);

write_lock(&cinf->rwlock);
spin_lock(&cinf->lru_lock);

list_for_each_entry_safe(pg, tmp, &cinf->lru_list, lru_head) {

if (first_reader_seq <= pg->max_liv_seq) {
/* can't invalidate ranges being read, reader might be stale */
spin_lock(&cinf->active_lock);
active = active_rbtree_walk(&cinf->active_root, &pg->start,
&pg->end, NULL, NULL);
spin_unlock(&cinf->active_lock);
if (active) {
scoutfs_inc_counter(sb, item_shrink_page_reader);
continue;
}
@@ -2531,7 +2488,7 @@ int scoutfs_item_setup(struct super_block *sb)
spin_lock_init(&cinf->lru_lock);
INIT_LIST_HEAD(&cinf->lru_list);
spin_lock_init(&cinf->active_lock);
INIT_LIST_HEAD(&cinf->active_list);
cinf->active_root = RB_ROOT;

cinf->pcpu_pages = alloc_percpu(struct item_percpu_pages);
if (!cinf->pcpu_pages)
@@ -2562,7 +2519,7 @@ void scoutfs_item_destroy(struct super_block *sb)
int cpu;

if (cinf) {
BUG_ON(!list_empty(&cinf->active_list));
BUG_ON(!RB_EMPTY_ROOT(&cinf->active_root));

unregister_hotcpu_notifier(&cinf->notifier);
unregister_shrinker(&cinf->shrinker);

@@ -108,16 +108,6 @@ static inline void scoutfs_key_set_ones(struct scoutfs_key *key)
memset(key->__pad, 0, sizeof(key->__pad));
}

static inline bool scoutfs_key_is_ones(struct scoutfs_key *key)
{
return key->sk_zone == U8_MAX &&
key->_sk_first == cpu_to_le64(U64_MAX) &&
key->sk_type == U8_MAX &&
key->_sk_second == cpu_to_le64(U64_MAX) &&
key->_sk_third == cpu_to_le64(U64_MAX) &&
key->_sk_fourth == U8_MAX;
}

/*
* Return a -1/0/1 comparison of keys.
*

@@ -156,7 +156,9 @@ static void lock_inv_iput_worker(struct work_struct *work)

/*
* Invalidate cached data associated with an inode whose lock is going
* away.
* away. We ignore inodes with I_FREEING instead of waiting on them to
* avoid a deadlock; if they're freeing then they won't be visible to
* future lock users and we don't need to invalidate them.
*
* We try to drop cached dentries and inodes covered by the lock if they
* aren't referenced. This removes them from the mount's open map and
@@ -176,7 +178,7 @@ static void invalidate_inode(struct super_block *sb, u64 ino)
struct scoutfs_inode_info *si;
struct inode *inode;

inode = scoutfs_ilookup(sb, ino);
inode = scoutfs_ilookup_nofreeing(sb, ino);
if (inode) {
si = SCOUTFS_I(inode);
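
/*
 * Editor's sketch of one plausible shape for the _nofreeing lookup,
 * assuming inodes can be found with ilookup5() keyed by ino; the
 * helper in this tree may differ. The test callback runs under
 * inode->i_lock, so checking i_state here is safe, and returning 0
 * for a freeing inode turns the lookup into a miss instead of a wait.
 */
static int sketch_test_not_freeing(struct inode *inode, void *data)
{
	u64 ino = *(u64 *)data;

	return scoutfs_ino(inode) == ino &&
	       !(inode->i_state & (I_FREEING | I_WILL_FREE));
}

static struct inode *sketch_ilookup_nofreeing(struct super_block *sb, u64 ino)
{
	return ilookup5(sb, ino, sketch_test_not_freeing, &ino);
}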

@@ -730,7 +732,7 @@ static void lock_grant_worker(struct work_struct *work)

lock->request_pending = 0;
lock->mode = nl->new_mode;
lock->write_seq = le64_to_cpu(nl->write_seq);
lock->write_version = le64_to_cpu(nl->write_version);

if (lock_count_match_exists(nl->new_mode, lock->waiters))
extend_grace(sb, lock);
@@ -894,9 +896,6 @@ static void lock_invalidate_worker(struct work_struct *work)
list_del_init(&lock->inv_head);
lock->invalidate_pending = 0;
wake_up(&lock->waitq);
} else {
/* another request filled nl/net_id, put it back on the list */
list_move_tail(&lock->inv_head, &linfo->inv_list);
}
put_lock(linfo, lock);
}
@@ -988,7 +987,7 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
for (i = 0; lock && i < SCOUTFS_NET_LOCK_MAX_RECOVER_NR; i++) {

nlr->locks[i].key = lock->start;
nlr->locks[i].write_seq = cpu_to_le64(lock->write_seq);
nlr->locks[i].write_version = cpu_to_le64(lock->write_version);
nlr->locks[i].old_mode = lock->mode;
nlr->locks[i].new_mode = lock->mode;

@@ -1347,28 +1346,29 @@ int scoutfs_lock_inode_index(struct super_block *sb, enum scoutfs_lock_mode mode
}

/*
* Orphan items are stored in their own zone which are modified with
* shared write_only locks and are read inconsistently without locks by
* background scanning work.
* The rid lock protects a mount's private persistent items in the rid
* zone. It's held for the duration of the mount. It lets the mount
* modify the rid items at will and signals to other mounts that we're
* still alive and our rid items shouldn't be reclaimed.
*
* Since we only use write_only locks we just lock the entire zone, but
* the api provides the inode in case we ever change the locking scheme.
* Being held for the entire mount prevents other nodes from reclaiming
* our items, like free blocks, when it would make sense for them to be
* able to. Maybe we have a bunch free and they're trying to allocate
* and are getting ENOSPC.
*/
int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,
struct scoutfs_lock **lock)
int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
u64 rid, struct scoutfs_lock **lock)
{
struct scoutfs_key start;
struct scoutfs_key end;

scoutfs_key_set_zeros(&start);
start.sk_zone = SCOUTFS_ORPHAN_ZONE;
start.sko_ino = 0;
start.sk_type = SCOUTFS_ORPHAN_TYPE;
start.sk_zone = SCOUTFS_RID_ZONE;
start.sko_rid = cpu_to_le64(rid);

scoutfs_key_set_zeros(&end);
end.sk_zone = SCOUTFS_ORPHAN_ZONE;
end.sko_ino = cpu_to_le64(U64_MAX);
end.sk_type = SCOUTFS_ORPHAN_TYPE;
scoutfs_key_set_ones(&end);
end.sk_zone = SCOUTFS_RID_ZONE;
end.sko_rid = cpu_to_le64(rid);

return lock_key_range(sb, mode, flags, &start, &end, lock);
}
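
/*
 * Editor's usage sketch with a hypothetical caller: the rid lock is
 * acquired once in write_only mode as the mount starts and is only
 * unlocked at unmount, which is what tells other mounts that our rid
 * items are still in use and shouldn't be reclaimed.
 */
static int sketch_hold_rid_lock(struct super_block *sb, u64 rid,
				struct scoutfs_lock **rid_lock)
{
	return scoutfs_lock_rid(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, rid, rid_lock);
}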

@@ -13,7 +13,7 @@
struct scoutfs_omap_lock;

/*
* A few fields (start, end, refresh_gen, write_seq, granted_mode)
* A few fields (start, end, refresh_gen, write_version, granted_mode)
* are referenced by code outside lock.c.
*/
struct scoutfs_lock {
@@ -23,7 +23,7 @@ struct scoutfs_lock {
struct rb_node node;
struct rb_node range_node;
u64 refresh_gen;
u64 write_seq;
u64 write_version;
u64 dirty_trans_seq;
struct list_head lru_head;
wait_queue_head_t waitq;
@@ -85,8 +85,8 @@ int scoutfs_lock_inodes(struct super_block *sb, enum scoutfs_lock_mode mode, int
struct inode *d, struct scoutfs_lock **D_lock);
int scoutfs_lock_rename(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
struct scoutfs_lock **lock);
int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
u64 ino, struct scoutfs_lock **lock);
int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
u64 rid, struct scoutfs_lock **lock);
void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock,
enum scoutfs_lock_mode mode);


@@ -81,6 +81,8 @@ struct lock_server_info {

struct scoutfs_alloc *alloc;
struct scoutfs_block_writer *wri;

atomic64_t write_version;
};

#define DECLARE_LOCK_SERVER_INFO(sb, name) \
@@ -477,14 +479,14 @@ static int process_waiting_requests(struct super_block *sb,
struct client_lock_entry *req_tmp;
struct client_lock_entry *gr;
struct client_lock_entry *gr_tmp;
u64 seq;
u64 wv;
int ret;

BUG_ON(!mutex_is_locked(&snode->mutex));

/* processing waits for all invalidation responses or recovery */
if (!list_empty(&snode->invalidated) ||
scoutfs_recov_next_pending(sb, 0, SCOUTFS_RECOV_LOCKS) != 0) {
scoutfs_recov_next_pending(sb, SCOUTFS_RECOV_LOCKS) != 0) {
ret = 0;
goto out;
}
@@ -518,7 +520,6 @@ static int process_waiting_requests(struct super_block *sb,

nl.key = snode->key;
nl.new_mode = req->mode;
nl.write_seq = 0;

/* see if there's an existing compatible grant to replace */
gr = find_entry(snode, &snode->granted, req->rid);
@@ -531,9 +532,8 @@ static int process_waiting_requests(struct super_block *sb,

if (nl.new_mode == SCOUTFS_LOCK_WRITE ||
nl.new_mode == SCOUTFS_LOCK_WRITE_ONLY) {
/* doesn't commit seq update, recovered with locks */
seq = scoutfs_server_next_seq(sb);
nl.write_seq = cpu_to_le64(seq);
wv = atomic64_inc_return(&inf->write_version);
nl.write_version = cpu_to_le64(wv);
}

ret = scoutfs_server_lock_response(sb, req->rid,
@@ -609,6 +609,14 @@ int scoutfs_lock_server_finished_recovery(struct super_block *sb)
return ret;
}

static void set_max_write_version(struct lock_server_info *inf, u64 new)
{
u64 old;

while (new > (old = atomic64_read(&inf->write_version)) &&
(atomic64_cmpxchg(&inf->write_version, old, new) != old));
}
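
/*
 * Editor's note: this is the usual lock-free "monotonic max" loop.
 * With made-up values: if write_version holds 7 and recovered locks
 * report 5 and then 9, the 5 leaves it untouched (new > old fails) and
 * the 9 either swaps 7 -> 9 or retries if another cpu raced in between
 * the read and the cmpxchg. The value only ever increases.
 */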

/*
* We sent a lock recover request to the client when we received its
* greeting while in recovery. Here we instantiate all the locks it
@@ -672,9 +680,9 @@ int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid,

put_server_lock(inf, snode);

/* make sure next core seq is greater than all lock write seq */
scoutfs_server_set_seq_if_greater(sb,
le64_to_cpu(nlr->locks[i].write_seq));
/* make sure next write lock is greater than all recovered */
set_max_write_version(inf,
le64_to_cpu(nlr->locks[i].write_version));
}

/* send request for next batch of keys */
@@ -792,7 +800,7 @@ static void lock_server_tseq_show(struct seq_file *m,
*/
int scoutfs_lock_server_setup(struct super_block *sb,
struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri)
struct scoutfs_block_writer *wri, u64 max_vers)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct lock_server_info *inf;
@@ -807,6 +815,7 @@ int scoutfs_lock_server_setup(struct super_block *sb,
scoutfs_tseq_tree_init(&inf->tseq_tree, lock_server_tseq_show);
inf->alloc = alloc;
inf->wri = wri;
atomic64_set(&inf->write_version, max_vers); /* inc_return gives +1 */

inf->tseq_dentry = scoutfs_tseq_create("server_locks", sbi->debug_root,
&inf->tseq_tree);

@@ -13,7 +13,7 @@ int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid);

int scoutfs_lock_server_setup(struct super_block *sb,
struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri);
struct scoutfs_block_writer *wri, u64 max_vers);
void scoutfs_lock_server_destroy(struct super_block *sb);

#endif

@@ -30,7 +30,6 @@
#include "net.h"
#include "endian_swap.h"
#include "tseq.h"
#include "fence.h"

/*
* scoutfs networking delivers requests and responses between nodes.
@@ -331,9 +330,6 @@ static int submit_send(struct super_block *sb,
WARN_ON_ONCE(id == 0 && (flags & SCOUTFS_NET_FLAG_RESPONSE)))
return -EINVAL;

if (scoutfs_forcing_unmount(sb))
return -EIO;

msend = kmalloc(offsetof(struct message_send,
nh.data[data_len]), GFP_NOFS);
if (!msend)
@@ -424,16 +420,6 @@ static int process_request(struct scoutfs_net_connection *conn,
mrecv->nh.data, le16_to_cpu(mrecv->nh.data_len));
}

static int call_resp_func(struct super_block *sb, struct scoutfs_net_connection *conn,
scoutfs_net_response_t resp_func, void *resp_data,
void *resp, unsigned int resp_len, int error)
{
if (resp_func)
return resp_func(sb, conn, resp, resp_len, error, resp_data);
else
return 0;
}

/*
* An incoming response finds the queued request and calls its response
* function. The response function for a given request will only be
@@ -448,6 +434,7 @@ static int process_response(struct scoutfs_net_connection *conn,
struct message_send *msend;
scoutfs_net_response_t resp_func = NULL;
void *resp_data;
int ret = 0;

spin_lock(&conn->lock);

@@ -462,8 +449,11 @@ static int process_response(struct scoutfs_net_connection *conn,

spin_unlock(&conn->lock);

return call_resp_func(sb, conn, resp_func, resp_data, mrecv->nh.data,
le16_to_cpu(mrecv->nh.data_len), net_err_to_host(mrecv->nh.error));
if (resp_func)
ret = resp_func(sb, conn, mrecv->nh.data,
le16_to_cpu(mrecv->nh.data_len),
net_err_to_host(mrecv->nh.error), resp_data);
return ret;
}

/*
@@ -833,15 +823,9 @@ static void scoutfs_net_destroy_worker(struct work_struct *work)
if (conn->listening_conn && conn->notify_down)
conn->notify_down(sb, conn, conn->info, conn->rid);

/*
* Usually networking is idle and we destroy pending sends, but when forcing unmount
* we may have to wake up waiters by failing pending sends.
*/
/* free all messages, refactor and complete for forced unmount? */
list_splice_init(&conn->resend_queue, &conn->send_queue);
list_for_each_entry_safe(msend, tmp, &conn->send_queue, head) {
if (scoutfs_forcing_unmount(sb))
call_resp_func(sb, conn, msend->resp_func, msend->resp_data,
NULL, 0, -ECONNABORTED);
free_msend(ninf, msend);
}

@@ -941,8 +925,6 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
ret = -EAFNOSUPPORT;
if (ret)
goto out;

conn->last_peername = conn->peername;
out:
return ret;
}
@@ -1223,7 +1205,6 @@ static void scoutfs_net_reconn_free_worker(struct work_struct *work)
unsigned long now = jiffies;
unsigned long deadline = 0;
bool requeue = false;
int ret;

trace_scoutfs_net_reconn_free_work_enter(sb, 0, 0);

@@ -1237,18 +1218,10 @@ restart:
time_after_eq(now, acc->reconn_deadline))) {
set_conn_fl(acc, reconn_freeing);
spin_unlock(&conn->lock);
if (!test_conn_fl(conn, shutting_down)) {
scoutfs_info(sb, "client "SIN_FMT" reconnect timed out, fencing",
SIN_ARG(&acc->last_peername));
ret = scoutfs_fence_start(sb, acc->rid,
acc->last_peername.sin_addr.s_addr,
SCOUTFS_FENCE_CLIENT_RECONNECT);
if (ret) {
scoutfs_err(sb, "client fence returned err %d, shutting down server",
ret);
scoutfs_server_abort(sb);
}
}
if (!test_conn_fl(conn, shutting_down))
scoutfs_info(sb, "client timed out "SIN_FMT" -> "SIN_FMT", can not reconnect",
SIN_ARG(&acc->sockname),
SIN_ARG(&acc->peername));
destroy_conn(acc);
goto restart;
}
@@ -1319,7 +1292,6 @@ scoutfs_net_alloc_conn(struct super_block *sb,
init_waitqueue_head(&conn->waitq);
conn->sockname.sin_family = AF_INET;
conn->peername.sin_family = AF_INET;
conn->last_peername.sin_family = AF_INET;
INIT_LIST_HEAD(&conn->accepted_head);
INIT_LIST_HEAD(&conn->accepted_list);
conn->next_send_seq = 1;

@@ -49,7 +49,6 @@ struct scoutfs_net_connection {
u64 greeting_id;
struct sockaddr_in sockname;
struct sockaddr_in peername;
struct sockaddr_in last_peername;

struct list_head accepted_head;
struct scoutfs_net_connection *listening_conn;
@@ -100,16 +99,6 @@ static inline void scoutfs_addr_to_sin(struct sockaddr_in *sin,
sin->sin_port = cpu_to_be16(le16_to_cpu(addr->v4.port));
}

static inline void scoutfs_sin_to_addr(union scoutfs_inet_addr *addr, struct sockaddr_in *sin)
{
BUG_ON(sin->sin_family != AF_INET);

memset(addr, 0, sizeof(union scoutfs_inet_addr));
addr->v4.family = cpu_to_le16(SCOUTFS_AF_IPV4);
addr->v4.addr = be32_to_le32(sin->sin_addr.s_addr);
addr->v4.port = be16_to_le16(sin->sin_port);
}

struct scoutfs_net_connection *
scoutfs_net_alloc_conn(struct super_block *sb,
scoutfs_net_notify_t notify_up,

@@ -137,10 +137,11 @@ struct omap_request {
/*
* In each inode group cluster lock we store data to track the open ino
* map which tracks all the inodes that the cluster lock covers. When
* the seq shows that the map is stale we send a request to update it.
* the version shows that the map is stale we send a request to update
* it.
*/
struct scoutfs_omap_lock_data {
u64 seq;
u64 version;
bool req_in_flight;
wait_queue_head_t waitq;
struct scoutfs_open_ino_map map;
@@ -484,10 +485,6 @@ static int remove_rid_from_reqs(struct omap_info *ominf, u64 rid, u64 *resp_rid,
* response if it was the last rid waiting for a response.
*
* If this returns an error then the server will shut down.
*
* This can be called multiple times by different servers if there are
* errors reclaiming an evicted mount, so we allow asking to remove a
* rid that hasn't been added.
*/
int scoutfs_omap_remove_rid(struct super_block *sb, u64 rid)
{
@@ -498,20 +495,21 @@ int scoutfs_omap_remove_rid(struct super_block *sb, u64 rid)
u64 resp_id = 0;
int ret;

map = kmalloc(sizeof(struct scoutfs_open_ino_map), GFP_NOFS);
if (!map) {
ret = -ENOMEM;
goto out;
}

spin_lock(&ominf->lock);
entry = find_rid(&ominf->rids, rid);
if (entry)
free_rid(&ominf->rids, entry);
spin_unlock(&ominf->lock);

if (!entry) {
ret = 0;
goto out;
}

map = kmalloc(sizeof(struct scoutfs_open_ino_map), GFP_NOFS);
if (!map) {
ret = -ENOMEM;
/* the server really shouldn't be removing a rid it never added */
if (WARN_ON_ONCE(!entry)) {
ret = -ENOENT;
goto out;
}
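
/*
 * Editor's note on the reordering above: kmalloc(GFP_NOFS) can sleep,
 * and sleeping isn't allowed under a spinlock, so the allocation has
 * to happen before ominf->lock is taken; the new version also turns an
 * unexpected missing rid into a WARN and -ENOENT instead of silently
 * succeeding.
 */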

@@ -595,6 +593,10 @@ out:
free_req(req);
}

/* it's fine if we couldn't send to a client that left */
if (ret == -ENOTCONN)
ret = 0;

return ret;
}

@@ -614,7 +616,7 @@ static int handle_requests(struct super_block *sb)
int ret;
int err;

if (scoutfs_recov_next_pending(sb, 0, SCOUTFS_RECOV_GREETING))
if (scoutfs_recov_next_pending(sb, SCOUTFS_RECOV_GREETING))
return 0;

ret = 0;
@@ -828,7 +830,8 @@ static bool omap_req_in_flight(struct scoutfs_lock *lock, struct scoutfs_omap_lo
/*
* Make sure the map covered by the cluster lock is current. The caller
* holds the cluster lock so once we store lock_data on the cluster lock
* it won't be freed and the write_seq in the cluster lock won't change.
* it won't be freed and the write_version in the cluster lock won't
* change.
*
* The omap_spinlock protects the omap_data in the cluster lock. We
* have to drop it if we have to block to allocate lock_data, send a
@@ -855,7 +858,7 @@ static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lo
}

if (lock->omap_data == NULL) {
ldata->seq = lock->write_seq - 1; /* ensure refresh */
ldata->version = lock->write_version - 1; /* ensure refresh */
init_waitqueue_head(&ldata->waitq);

lock->omap_data = ldata;
@@ -865,7 +868,7 @@ static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lo
}
}

while (ldata->seq != lock->write_seq) {
while (ldata->version != lock->write_version) {
/* only one waiter sends a request at a time */
if (!ldata->req_in_flight) {
ldata->req_in_flight = true;
@@ -885,7 +888,7 @@ static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lo
if (send_req) {
ldata->req_in_flight = false;
if (ret == 0)
ldata->seq = lock->write_seq;
ldata->version = lock->write_version;
wake_up(&ldata->waitq);
if (ret < 0)
goto out;
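
/*
 * Editor's sketch of the coalescing pattern above in isolation, with
 * hypothetical names and the locking around the fields omitted: one
 * waiter marks the request in flight and issues it, everyone else
 * sleeps on the waitq until the version catches up.
 */
struct sketch_refresh {
	u64 version;
	bool req_in_flight;
	wait_queue_head_t waitq;
};

static void sketch_wait_or_send(struct sketch_refresh *st, u64 want_version)
{
	while (st->version != want_version) {
		if (!st->req_in_flight) {
			st->req_in_flight = true;
			/* send the request; then, on its reply: */
			st->version = want_version;
			st->req_in_flight = false;
			wake_up(&st->waitq);
		} else {
			wait_event(st->waitq, !st->req_in_flight);
		}
	}
}
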
@@ -904,9 +907,9 @@ out:
}

/*
* Return 1 and give the caller their locks when they should delete the
* inode items. It's safe to delete the inode items when it is no
* longer reachable and nothing is referencing it.
* Return 1 and give the caller a write inode lock if it is safe to be
* deleted. It's safe to be deleted when it is no longer reachable and
* nothing is referencing it.
*
* The inode is unreachable when nlink hits zero. Cluster locks protect
* modification and testing of nlink. We use the ino_lock_cov coverage
@@ -921,17 +924,15 @@ out:
* increase nlink from zero and let people get a reference to the inode.
*/
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret)
struct scoutfs_lock **lock_ret)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_lock *lock = NULL;
const u64 ino = scoutfs_ino(inode);
struct scoutfs_omap_lock_data *ldata;
u64 group_nr;
int bit_nr;
int ret;
int err;

/* lock group and omap constants are defined independently */
BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR);
@@ -962,19 +963,12 @@ int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
out:
trace_scoutfs_omap_should_delete(sb, ino, inode->i_nlink, ret);

if (ret > 0) {
err = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock);
if (err < 0)
ret = err;
}

if (ret <= 0) {
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
lock = NULL;
}

*lock_ret = lock;
*orph_lock_ret = orph_lock;
return ret;
}


@@ -4,7 +4,7 @@
int scoutfs_omap_inc(struct super_block *sb, u64 ino);
void scoutfs_omap_dec(struct super_block *sb, u64 ino);
int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret);
struct scoutfs_lock **lock_ret);
void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata);
int scoutfs_omap_client_handle_request(struct super_block *sb, u64 id,
struct scoutfs_open_ino_map_args *args);

@@ -32,7 +32,6 @@
#include "block.h"
#include "net.h"
#include "sysfs.h"
#include "fence.h"
#include "scoutfs_trace.h"

/*
@@ -61,9 +60,10 @@
* running (maybe they've deadlocked, or lost network communications).
* In addition to a configuration slot in the super block, each quorum
* member also has a known block location that represents their slot.
* The block contains an array of events which are updated during the life
* time of the quorum agent. The elected leader sets its elected event
* and can then start the server.
* They set a flag in their block indicating that they've been elected
* leader, then read slots for all the other blocks looking for
* previously active leaders to fence. After that it can start the
* server.
*
* It's critical to raft elections that a participant's term not go
* backwards in time so each mount also uses its quorum block to store
@@ -334,18 +334,17 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
}

/*
* Read and verify block fields before giving it to the caller. We
* should have exclusive write access to the block. We know that
* something has gone horribly wrong if we don't see our rid in the
* begin event after we've written it as we started up.
* The caller can provide a mark that they're using to track their
* written blocks. It's updated as they write the block and we can
* compare it with what we read to see if there have been unexpected
* intervening writes to the block -- the caller is supposed to have
* exclusive access to the block (or was fenced).
*/
static int read_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_quorum_block *blk,
bool check_rid)
static int read_quorum_block(struct super_block *sb, u64 blkno,
struct scoutfs_quorum_block *blk, __le64 *mark)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
const u64 rid = sbi->rid;
char msg[150];
__le32 crc;
int ret;

@@ -356,208 +355,162 @@ static int read_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_q

ret = scoutfs_block_read_sm(sb, sbi->meta_bdev, blkno,
&blk->hdr, sizeof(*blk), &crc);
if (ret < 0) {
scoutfs_err(sb, "quorum block read error %d", ret);
goto out;
}

/* detect invalid blocks */
if (blk->hdr.crc != crc)
snprintf(msg, sizeof(msg), "blk crc %08x != %08x",
le32_to_cpu(blk->hdr.crc), le32_to_cpu(crc));
else if (le32_to_cpu(blk->hdr.magic) != SCOUTFS_BLOCK_MAGIC_QUORUM)
snprintf(msg, sizeof(msg), "blk magic %08x != %08x",
le32_to_cpu(blk->hdr.magic), SCOUTFS_BLOCK_MAGIC_QUORUM);
else if (blk->hdr.fsid != super->hdr.fsid)
snprintf(msg, sizeof(msg), "blk fsid %016llx != %016llx",
le64_to_cpu(blk->hdr.fsid), le64_to_cpu(super->hdr.fsid));
else if (le64_to_cpu(blk->hdr.blkno) != blkno)
snprintf(msg, sizeof(msg), "blk blkno %llu != %llu",
le64_to_cpu(blk->hdr.blkno), blkno);
else if (check_rid && le64_to_cpu(blk->events[SCOUTFS_QUORUM_EVENT_BEGIN].rid) != rid)
snprintf(msg, sizeof(msg), "quorum block begin rid %016llx != our rid %016llx, are multiple mounts configured with this slot?",
le64_to_cpu(blk->events[SCOUTFS_QUORUM_EVENT_BEGIN].rid), rid);
else
msg[0] = '\0';

if (msg[0] != '\0') {
scoutfs_err(sb, "read invalid quorum block, %s", msg);
if (ret == 0 &&
((blk->hdr.crc != crc) ||
(le32_to_cpu(blk->hdr.magic) != SCOUTFS_BLOCK_MAGIC_QUORUM) ||
(blk->hdr.fsid != super->hdr.fsid) ||
(le64_to_cpu(blk->hdr.blkno) != blkno))) {
scoutfs_inc_counter(sb, quorum_read_invalid_block);
ret = -EIO;
goto out;
}

out:
if (mark && *mark != 0 && blk->random_write_mark != *mark) {
scoutfs_err(sb, "read unexpected quorum block write mark, are multiple mounts configured with the same slot?");
ret = -EIO;
}

if (ret < 0)
scoutfs_err(sb, "quorum block read error %d", ret);

return ret;
}

static void set_quorum_block_event(struct super_block *sb, struct scoutfs_quorum_block *blk,
int event, u64 term)
static void set_quorum_block_event(struct super_block *sb,
struct scoutfs_quorum_block *blk,
struct scoutfs_quorum_block_event *ev)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_quorum_block_event *ev;
struct timespec64 ts;

if (WARN_ON_ONCE(event < 0 || event >= SCOUTFS_QUORUM_EVENT_NR))
return;

getnstimeofday64(&ts);

ev = &blk->events[event];
ev->rid = cpu_to_le64(sbi->rid);
ev->term = cpu_to_le64(term);
ev->ts.sec = cpu_to_le64(ts.tv_sec);
ev->ts.nsec = cpu_to_le32(ts.tv_nsec);
}

static int write_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_quorum_block *blk)
/*
* Every time we write a block we update the write stamp and random
* write mark so readers can see our write.
*/
static int write_quorum_block(struct super_block *sb, u64 blkno,
struct scoutfs_quorum_block *blk, __le64 *mark)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
int ret;

if (WARN_ON_ONCE(blkno < SCOUTFS_QUORUM_BLKNO) ||
WARN_ON_ONCE(blkno >= (SCOUTFS_QUORUM_BLKNO +
SCOUTFS_QUORUM_BLOCKS)))
return -EINVAL;

return scoutfs_block_write_sm(sb, sbi->meta_bdev, blkno, &blk->hdr, sizeof(*blk));
}
do {
get_random_bytes(&blk->random_write_mark,
sizeof(blk->random_write_mark));
} while (blk->random_write_mark == 0);

/*
* Read the caller's slot's quorum block, make a change, and write it
* back out.
*/
static int update_quorum_block(struct super_block *sb, int event, u64 term, bool check_rid)
{
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
u64 blkno = SCOUTFS_QUORUM_BLKNO + opts->quorum_slot_nr;
struct scoutfs_quorum_block blk;
int ret;
if (mark)
*mark = blk->random_write_mark;

ret = read_quorum_block(sb, blkno, &blk, check_rid);
if (ret == 0) {
set_quorum_block_event(sb, &blk, event, term);
ret = write_quorum_block(sb, blkno, &blk);
if (ret < 0)
scoutfs_err(sb, "error %d reading quorum block %llu to update event %d term %llu",
ret, blkno, event, term);
} else {
scoutfs_err(sb, "error %d writing quorum block %llu after updating event %d term %llu",
ret, blkno, event, term);
}
set_quorum_block_event(sb, blk, &blk->write);

ret = scoutfs_block_write_sm(sb, sbi->meta_bdev, blkno,
&blk->hdr, sizeof(*blk));
if (ret < 0)
scoutfs_err(sb, "quorum block write error %d", ret);

return ret;
}
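
/*
 * Editor's sketch of the mark round-trip using the two functions above:
 * the writer remembers the random mark of its last write, and a later
 * read that finds a different mark in our own slot means another mount
 * wrote it, which read_quorum_block() reports as -EIO.
 */
static int sketch_write_then_verify(struct super_block *sb, u64 blkno,
				    struct scoutfs_quorum_block *blk)
{
	__le64 mark = 0;
	int ret;

	ret = write_quorum_block(sb, blkno, blk, &mark);
	if (ret == 0)
		ret = read_quorum_block(sb, blkno, blk, &mark);

	return ret;
}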

/*
* The calling server has fenced previous leaders and reclaimed their
* resources. We can now update our fence event with a greater term to
* stop future leaders from doing the same.
* Read the caller's slot's current quorum block, make a change, and
* write it back out. If the caller provides a mark it can cause read
* errors if we read a mark that doesn't match the last mark that the
* caller wrote.
*/
int scoutfs_quorum_fence_complete(struct super_block *sb, u64 term)
static int update_quorum_block(struct super_block *sb, u64 blkno,
__le64 *mark, int role, u64 term)
{
return update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_FENCE, term, true);
struct scoutfs_quorum_block blk;
u64 flags;
u64 bits;
u64 set;
int ret;

ret = read_quorum_block(sb, blkno, &blk, mark);
if (ret == 0) {
if (blk.term != cpu_to_le64(term)) {
blk.term = cpu_to_le64(term);
set_quorum_block_event(sb, &blk, &blk.update_term);
}

flags = le64_to_cpu(blk.flags);
bits = SCOUTFS_QUORUM_BLOCK_LEADER;
set = role == LEADER ? SCOUTFS_QUORUM_BLOCK_LEADER : 0;
if ((flags & bits) != set)
set_quorum_block_event(sb, &blk,
set ? &blk.set_leader :
&blk.clear_leader);
blk.flags = cpu_to_le64((flags & ~bits) | set);

ret = write_quorum_block(sb, blkno, &blk, mark);
}

return ret;
}

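/*
 * Editor's note on the flag math above, assuming for illustration that
 * SCOUTFS_QUORUM_BLOCK_LEADER is bit 0x1: with flags 0x5 and set == 0,
 * "(flags & ~bits) | set" yields 0x4, clearing only the leader bit and
 * preserving the other flags; with set == 0x1 it yields 0x5, setting
 * the bit idempotently.
 */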

/*
* The calling server has been elected and has started running but can't
* yet assume that it has exclusive access to the metadata device. We
* read all the quorum blocks looking for previously elected leaders to
* fence so that we're the only leader running.
*
* We're relying on the invariant that there can't be two mounts running
* with the same slot nr at the same time. With this constraint there
* can be at most two previous leaders per slot that need to be fenced:
* a persistent record of an old mount on the slot, and an active mount.
*
* If we start fence requests then we only wait for them to complete
* before returning. The server will reclaim their resources once it is
* up and running and will call us to update the fence event. If we
* don't start fence requests then we update the fence event
* immediately, the server has nothing more to do.
*
* Quorum will be sending heartbeats while we wait for fencing. That
* keeps us from being fenced while we allow userspace fencing to take a
* reasonably long time. We still want to timeout eventually.
* The calling server has been elected and updated their block, but
* can't yet assume that it has exclusive access to the metadata device.
* We read all the quorum blocks looking for previously elected leaders
* to fence so that we're the only leader running.
*/
int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term)
static int fence_leader_blocks(struct super_block *sb)
{
#define NR_OLD 2
struct scoutfs_quorum_block_event old[SCOUTFS_QUORUM_MAX_SLOTS][NR_OLD] = {{{0,}}};
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct mount_options *opts = &sbi->opts;
struct scoutfs_quorum_block blk;
struct sockaddr_in sin;
const u64 rid = sbi->rid;
bool fence_started = false;
u64 fenced = 0;
__le64 fence_rid;
u64 blkno;
int ret = 0;
int err;
int i;
int j;

BUILD_BUG_ON(SCOUTFS_QUORUM_BLOCKS < SCOUTFS_QUORUM_MAX_SLOTS);

for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
if (!quorum_slot_present(super, i))
if (i == opts->quorum_slot_nr)
continue;

ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk, false);
blkno = SCOUTFS_QUORUM_BLKNO + i;
ret = read_quorum_block(sb, blkno, &blk, NULL);
if (ret < 0)
goto out;

/* elected leader still running */
if (le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_ELECT].term) >
le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_STOP].term))
old[i][0] = blk.events[SCOUTFS_QUORUM_EVENT_ELECT];
if (!(le64_to_cpu(blk.flags) & SCOUTFS_QUORUM_BLOCK_LEADER))
continue;

/* persistent record of previous server before elected */
if ((le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_FENCE].term) >
le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_STOP].term)) &&
(le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_FENCE].term) <
le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_ELECT].term)))
old[i][1] = blk.events[SCOUTFS_QUORUM_EVENT_FENCE];
scoutfs_inc_counter(sb, quorum_fence_leader);
scoutfs_quorum_slot_sin(super, i, &sin);

/* find greatest term that has fenced everything before it */
fenced = max(fenced, le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_FENCE].term));
}
scoutfs_err(sb, "fencing "SCSBF" at "SIN_FMT,
SCSB_LEFR_ARGS(super->hdr.fsid, blk.set_leader.rid),
SIN_ARG(&sin));

/* now actually fence any old leaders which haven't been fenced yet */
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
for (j = 0; j < NR_OLD; j++) {
if (le64_to_cpu(old[i][j].term) == 0 || /* uninitialized */
le64_to_cpu(old[i][j].term) < fenced || /* already fenced */
le64_to_cpu(old[i][j].term) > term || /* newer than us */
le64_to_cpu(old[i][j].rid) == rid) /* us */
continue;
blk.flags &= ~cpu_to_le64(SCOUTFS_QUORUM_BLOCK_LEADER);
set_quorum_block_event(sb, &blk, &blk.fenced);

scoutfs_inc_counter(sb, quorum_fence_leader);
scoutfs_quorum_slot_sin(super, i, &sin);
fence_rid = old[i][j].rid;

scoutfs_info(sb, "fencing previous leader "SCSBF" at term %llu in slot %u with address "SIN_FMT,
SCSB_LEFR_ARGS(super->hdr.fsid, fence_rid),
le64_to_cpu(old[i][j].term), i, SIN_ARG(&sin));
ret = scoutfs_fence_start(sb, le64_to_cpu(fence_rid), sin.sin_addr.s_addr,
SCOUTFS_FENCE_QUORUM_BLOCK_LEADER);
if (ret < 0)
goto out;
fence_started = true;
}
ret = write_quorum_block(sb, blkno, &blk, NULL);
if (ret < 0)
goto out;
}

out:
if (fence_started) {
err = scoutfs_fence_wait_fenced(sb, msecs_to_jiffies(SCOUTFS_QUORUM_FENCE_TO_MS));
if (ret == 0)
ret = err;
} else {
err = scoutfs_quorum_fence_complete(sb, term);
if (ret == 0)
ret = err;
}

if (ret < 0) {
scoutfs_err(sb, "error %d attempting to find and fence previous leaders", ret);
scoutfs_err(sb, "error %d fencing active", ret);
scoutfs_inc_counter(sb, quorum_fence_error);
}

@@ -580,22 +533,23 @@ static void scoutfs_quorum_worker(struct work_struct *work)
struct sockaddr_in unused;
struct quorum_host_msg msg;
struct quorum_status qst;
__le64 mark;
u64 blkno;
int ret;
int err;

/* recording votes from slots as native single word bitmap */
BUILD_BUG_ON(SCOUTFS_QUORUM_MAX_SLOTS > BITS_PER_LONG);

/* get our starting term from our persistent block */
mark = 0;
blkno = SCOUTFS_QUORUM_BLKNO + opts->quorum_slot_nr;
ret = read_quorum_block(sb, blkno, &blk, false);
ret = read_quorum_block(sb, blkno, &blk, &mark);
if (ret < 0)
goto out;

/* start out as a follower */
qst.role = FOLLOWER;
qst.term = le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_TERM].term);
qst.term = le64_to_cpu(blk.term);
qst.vote_for = -1;
qst.vote_bits = 0;

@@ -605,11 +559,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)
else
qst.timeout = election_timeout();

/* record that we're up and running, readers check that it isn't updated */
ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_BEGIN, qst.term, false);
if (ret < 0)
goto out;

while (!qinf->shutdown) {

ret = recv_msg(sb, &msg, qst.timeout);
@@ -640,6 +589,11 @@ static void scoutfs_quorum_worker(struct work_struct *work)
send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
qst.term);
scoutfs_inc_counter(sb, quorum_send_resignation);

ret = update_quorum_block(sb, blkno, &mark,
qst.role, qst.term);
if (ret < 0)
goto out;
}

spin_lock(&qinf->show_lock);
@@ -670,7 +624,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
qst.timeout = election_timeout();

/* store our increased term */
ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_TERM, qst.term, true);
ret = update_quorum_block(sb, blkno, &mark,
qst.role, qst.term);
if (ret < 0)
goto out;
}
@@ -687,11 +642,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)
qst.term);
qst.timeout = election_timeout();
scoutfs_inc_counter(sb, quorum_send_request);

/* store our increased term */
ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_TERM, qst.term, true);
if (ret < 0)
goto out;
}

/* candidates count votes in their term */
@@ -720,8 +670,10 @@ static void scoutfs_quorum_worker(struct work_struct *work)
qst.term);
qst.timeout = heartbeat_interval();

/* record that we've been elected before starting up server */
ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_ELECT, qst.term, true);
/* set our leader flag and fence */
ret = update_quorum_block(sb, blkno, &mark,
qst.role, qst.term) ?:
fence_leader_blocks(sb);
if (ret < 0)
goto out;

@@ -732,13 +684,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)

ret = scoutfs_server_start(sb, qst.term);
if (ret < 0) {
clear_bit(QINF_FLAG_SERVER, &qinf->flags);
scoutfs_err(sb, "server startup failed with %d", ret);
/* store our increased term */
err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, qst.term,
true);
if (err < 0 && ret == 0)
ret = err;
scoutfs_err(sb, "server startup failed with %d",
ret);
goto out;
}
}
@@ -780,13 +727,17 @@ static void scoutfs_quorum_worker(struct work_struct *work)
/* always try to stop a running server as we stop */
if (test_bit(QINF_FLAG_SERVER, &qinf->flags)) {
scoutfs_server_stop(sb);
scoutfs_fence_stop(sb);
send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
qst.term);
}

/* informational event that we're shutting down, nothing relies on it */
update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_END, qst.term, true);
/* always try to clear leader block as we stop to avoid fencing */
if (qst.role == LEADER) {
ret = update_quorum_block(sb, blkno, &mark,
FOLLOWER, qst.term);
if (ret < 0)
goto out;
}
out:
if (ret < 0) {
scoutfs_err(sb, "quorum service saw error %d, shutting down. Cluster will be degraded until this slot is remounted to restart the quorum service",
@@ -795,60 +746,58 @@ out:
}

/*
* The calling server has shutdown and is no longer using shared
* resources. Clear the bit so that we stop sending heartbeats and
* allow the next server to be elected. Update the stop event so that
* it won't be considered available by clients or fenced by the next
* leader.
* Set a flag for the quorum work's next iteration to indicate that the
* server has shutdown and that it should step down as leader, update
* quorum blocks, and stop sending heartbeats.
*/
void scoutfs_quorum_server_shutdown(struct super_block *sb, u64 term)
void scoutfs_quorum_server_shutdown(struct super_block *sb)
{
DECLARE_QUORUM_INFO(sb, qinf);

clear_bit(QINF_FLAG_SERVER, &qinf->flags);
update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, term, true);
set_bit(QINF_FLAG_SERVER, &qinf->flags);
}

/*
* Clients read quorum blocks looking for the leader with a server whose
* address it can try and connect to.
*
* There can be records of multiple previous elected leaders if the
* current server hasn't yet fenced any old servers. We use the elected
* leader with the greatest elected term. If we get it wrong the
* connection will timeout and the client will try again.
* There can be multiple running servers if a client checks before a
* server has had a chance to fence any old servers. We try to use the
* block with the most recent timestamp. If we get it wrong the
* connection will timeout and the client will try again, presumably
* finding a single server block.
*/
int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_super_block *super = &sbi->super;
struct scoutfs_quorum_block blk;
u64 elect_term;
u64 term = 0;
int ret = 0;
struct timespec64 recent = {0,};
struct timespec64 ts;
int ret;
int i;

for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
if (!quorum_slot_present(super, i))
continue;

ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk, false);
ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk,
NULL);
if (ret < 0) {
scoutfs_err(sb, "error reading quorum block nr %u: %d",
i, ret);
goto out;
}

elect_term = le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_ELECT].term);
if (elect_term > term &&
elect_term > le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_STOP].term)) {
term = elect_term;
ts.tv_sec = le64_to_cpu(blk.set_leader.ts.sec);
ts.tv_nsec = le32_to_cpu(blk.set_leader.ts.nsec);

if ((le64_to_cpu(blk.flags) & SCOUTFS_QUORUM_BLOCK_LEADER) &&
(timespec64_to_ns(&ts) > timespec64_to_ns(&recent))) {
recent = ts;
scoutfs_quorum_slot_sin(super, i, sin);
continue;
}
}

if (term == 0)
if (timespec64_to_ns(&recent) == 0)
ret = -ENOENT;

out:

@@ -2,15 +2,12 @@
#define _SCOUTFS_QUORUM_H_

int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin);
void scoutfs_quorum_server_shutdown(struct super_block *sb, u64 term);
void scoutfs_quorum_server_shutdown(struct super_block *sb);

u8 scoutfs_quorum_votes_needed(struct super_block *sb);
void scoutfs_quorum_slot_sin(struct scoutfs_super_block *super, int i,
struct sockaddr_in *sin);

int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term);
int scoutfs_quorum_fence_complete(struct super_block *sb, u64 term);

int scoutfs_quorum_setup(struct super_block *sb);
void scoutfs_quorum_shutdown(struct super_block *sb);
void scoutfs_quorum_destroy(struct super_block *sb);

@@ -16,11 +16,9 @@
#include <linux/sched.h>
#include <linux/rhashtable.h>
#include <linux/rcupdate.h>
#include <linux/list_sort.h>

#include "super.h"
#include "recov.h"
#include "cmp.h"

/*
* There are a few server messages which can't be processed until they
@@ -49,41 +47,18 @@ struct recov_pending {
int which;
};

static struct recov_pending *next_pending(struct recov_info *recinf, u64 rid, int which)
static struct recov_pending *find_pending(struct recov_info *recinf, u64 rid, int which)
{
struct recov_pending *pend;

list_for_each_entry(pend, &recinf->pending, head) {
if (pend->rid > rid && pend->which & which)
if ((rid == 0 || pend->rid == rid) && (pend->which & which))
return pend;
}

return NULL;
}

static struct recov_pending *lookup_pending(struct recov_info *recinf, u64 rid, int which)
{
struct recov_pending *pend;

pend = next_pending(recinf, rid - 1, which);
if (pend && pend->rid == rid)
return pend;

return NULL;
}

/*
* We keep the pending list sorted by rid so that we can iterate over
* them. The list should be small and shouldn't be used often.
*/
static int cmp_pending_rid(void *priv, struct list_head *A, struct list_head *B)
{
struct recov_pending *a = list_entry(A, struct recov_pending, head);
struct recov_pending *b = list_entry(B, struct recov_pending, head);

return scoutfs_cmp_u64s(a->rid, b->rid);
}

/*
* Record that we'll be waiting for a client to recover something.
* _finished will eventually be called for every _prepare, either
@@ -105,15 +80,14 @@ int scoutfs_recov_prepare(struct super_block *sb, u64 rid, int which)

spin_lock(&recinf->lock);

pend = lookup_pending(recinf, rid, SCOUTFS_RECOV_ALL);
pend = find_pending(recinf, rid, SCOUTFS_RECOV_ALL);
if (pend) {
pend->which |= which;
} else {
swap(pend, alloc);
pend->rid = rid;
pend->which = which;
list_add_tail(&pend->head, &recinf->pending);
list_sort(NULL, &recinf->pending, cmp_pending_rid);
list_add(&pend->head, &recinf->pending);
}

spin_unlock(&recinf->lock);
@@ -185,7 +159,7 @@ int scoutfs_recov_finish(struct super_block *sb, u64 rid, int which)

spin_lock(&recinf->lock);

pend = lookup_pending(recinf, rid, which);
pend = find_pending(recinf, rid, which);
if (pend) {
pend->which &= ~which;
if (pend->which) {
@@ -216,28 +190,29 @@ bool scoutfs_recov_is_pending(struct super_block *sb, u64 rid, int which)
bool is_pending;

spin_lock(&recinf->lock);
is_pending = lookup_pending(recinf, rid, which) != NULL;
is_pending = find_pending(recinf, rid, which) != NULL;
spin_unlock(&recinf->lock);

return is_pending;
}

/*
* Return the next rid after the given rid of a client waiting for the
* given state to be recovered. Start with rid 0, returns 0 when there
* are no more clients waiting for recovery.
* Returns 0 if there are no rids waiting for the given state to be
* recovered. Returns the rid of a client still waiting if there are
* any, in no specified order.
*
* This is inherently racy. Callers are responsible for resolving any
* actions taken based on pending with the recovery finishing, perhaps
* before we return.
*/
u64 scoutfs_recov_next_pending(struct super_block *sb, u64 rid, int which)
u64 scoutfs_recov_next_pending(struct super_block *sb, int which)
{
DECLARE_RECOV_INFO(sb, recinf);
struct recov_pending *pend;
u64 rid;

spin_lock(&recinf->lock);
pend = next_pending(recinf, rid, which);
pend = find_pending(recinf, 0, which);
rid = pend ? pend->rid : 0;
spin_unlock(&recinf->lock);

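/*
 * Editor's usage sketch mirroring the callers shown earlier: with the
 * rid argument gone, servers just gate work on whether any client at
 * all is still pending.
 */
static bool sketch_lock_recovery_done(struct super_block *sb)
{
	return scoutfs_recov_next_pending(sb, SCOUTFS_RECOV_LOCKS) == 0;
}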
||||
|
||||
@@ -14,7 +14,7 @@ int scoutfs_recov_begin(struct super_block *sb, void (*timeout_fn)(struct super_
|
||||
unsigned int timeout_ms);
|
||||
int scoutfs_recov_finish(struct super_block *sb, u64 rid, int which);
|
||||
bool scoutfs_recov_is_pending(struct super_block *sb, u64 rid, int which);
|
||||
u64 scoutfs_recov_next_pending(struct super_block *sb, u64 rid, int which);
|
||||
u64 scoutfs_recov_next_pending(struct super_block *sb, int which);
|
||||
void scoutfs_recov_shutdown(struct super_block *sb);
|
||||
|
||||
int scoutfs_recov_setup(struct super_block *sb);
|
||||
|
||||
@@ -424,15 +424,14 @@ TRACE_EVENT(scoutfs_trans_write_func,
);

DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
TP_PROTO(struct super_block *sb, void *journal_info, int holders),

TP_ARGS(sb, journal_info, holders, ret),
TP_ARGS(sb, journal_info, holders),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(unsigned long, journal_info)
__field(int, holders)
__field(int, ret)
),

TP_fast_assign(
@@ -441,17 +440,17 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
__entry->holders = holders;
),

TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d",
SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders, __entry->ret)
TP_printk(SCSBF" journal_info 0x%0lx holders %d",
SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders)
);

DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_hold_trans,
TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
TP_ARGS(sb, journal_info, holders, ret)
DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_trans_acquired_hold,
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_ARGS(sb, journal_info, holders)
);
DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_release_trans,
TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
TP_ARGS(sb, journal_info, holders, ret)
TP_PROTO(struct super_block *sb, void *journal_info, int holders),
TP_ARGS(sb, journal_info, holders)
);

TRACE_EVENT(scoutfs_ioc_release,
@@ -986,6 +985,22 @@ TRACE_EVENT(scoutfs_delete_inode,
__entry->mode, __entry->size)
);

TRACE_EVENT(scoutfs_scan_orphans,
TP_PROTO(struct super_block *sb),

TP_ARGS(sb),

TP_STRUCT__entry(
__field(dev_t, dev)
),

TP_fast_assign(
__entry->dev = sb->s_dev;
),

TP_printk("dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
);

DECLARE_EVENT_CLASS(scoutfs_key_class,
TP_PROTO(struct super_block *sb, struct scoutfs_key *key),
TP_ARGS(sb, key),
@@ -1629,164 +1644,6 @@ TRACE_EVENT(scoutfs_btree_walk,
__entry->level, __entry->ref_blkno, __entry->ref_seq)
);

TRACE_EVENT(scoutfs_btree_set_parent,
TP_PROTO(struct super_block *sb,
struct scoutfs_btree_root *root, struct scoutfs_key *key,
struct scoutfs_btree_root *par_root),

TP_ARGS(sb, root, key, par_root),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, root_blkno)
__field(__u64, root_seq)
__field(__u8, root_height)
sk_trace_define(key)
__field(__u64, par_root_blkno)
__field(__u64, par_root_seq)
__field(__u8, par_root_height)
),

TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->root_blkno = le64_to_cpu(root->ref.blkno);
__entry->root_seq = le64_to_cpu(root->ref.seq);
__entry->root_height = root->height;
sk_trace_assign(key, key);
__entry->par_root_blkno = le64_to_cpu(par_root->ref.blkno);
__entry->par_root_seq = le64_to_cpu(par_root->ref.seq);
__entry->par_root_height = par_root->height;
),

TP_printk(SCSBF" root blkno %llu seq %llu height %u, key "SK_FMT", par_root blkno %llu seq %llu height %u",
SCSB_TRACE_ARGS, __entry->root_blkno, __entry->root_seq,
__entry->root_height, sk_trace_args(key),
__entry->par_root_blkno, __entry->par_root_seq,
__entry->par_root_height)
);

TRACE_EVENT(scoutfs_btree_merge,
TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root,
struct scoutfs_key *start, struct scoutfs_key *end),

TP_ARGS(sb, root, start, end),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, root_blkno)
__field(__u64, root_seq)
__field(__u8, root_height)
sk_trace_define(start)
sk_trace_define(end)
),

TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->root_blkno = le64_to_cpu(root->ref.blkno);
__entry->root_seq = le64_to_cpu(root->ref.seq);
__entry->root_height = root->height;
sk_trace_assign(start, start);
sk_trace_assign(end, end);
),

TP_printk(SCSBF" root blkno %llu seq %llu height %u start "SK_FMT" end "SK_FMT,
SCSB_TRACE_ARGS, __entry->root_blkno, __entry->root_seq,
__entry->root_height, sk_trace_args(start),
sk_trace_args(end))
);

TRACE_EVENT(scoutfs_btree_merge_items,
TP_PROTO(struct super_block *sb,
struct scoutfs_btree_root *m_root,
struct scoutfs_key *m_key, int m_val_len,
struct scoutfs_btree_root *f_root,
struct scoutfs_key *f_key, int f_val_len,
int is_del),

TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, f_val_len, is_del),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, m_root_blkno)
__field(__u64, m_root_seq)
__field(__u8, m_root_height)
sk_trace_define(m_key)
__field(int, m_val_len)
__field(__u64, f_root_blkno)
__field(__u64, f_root_seq)
__field(__u8, f_root_height)
sk_trace_define(f_key)
__field(int, f_val_len)
__field(int, is_del)
),

TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->m_root_blkno = m_root ?
le64_to_cpu(m_root->ref.blkno) : 0;
__entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0;
__entry->m_root_height = m_root ? m_root->height : 0;
sk_trace_assign(m_key, m_key);
__entry->m_val_len = m_val_len;
__entry->f_root_blkno = f_root ?
le64_to_cpu(f_root->ref.blkno) : 0;
__entry->f_root_seq = f_root ? le64_to_cpu(f_root->ref.seq) : 0;
__entry->f_root_height = f_root ? f_root->height : 0;
sk_trace_assign(f_key, f_key);
__entry->f_val_len = f_val_len;
__entry->is_del = !!is_del;
),

TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq,
__entry->m_root_height, sk_trace_args(m_key),
__entry->m_val_len, __entry->f_root_blkno,
__entry->f_root_seq, __entry->f_root_height,
sk_trace_args(f_key), __entry->f_val_len, __entry->is_del)
);

DECLARE_EVENT_CLASS(scoutfs_btree_free_blocks,
TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root,
u64 blkno),

TP_ARGS(sb, root, blkno),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, root_blkno)
__field(__u64, root_seq)
__field(__u8, root_height)
__field(__u64, blkno)
),

TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->root_blkno = le64_to_cpu(root->ref.blkno);
__entry->root_seq = le64_to_cpu(root->ref.seq);
__entry->root_height = root->height;
__entry->blkno = blkno;
),

TP_printk(SCSBF" root blkno %llu seq %llu height %u, free blkno %llu",
SCSB_TRACE_ARGS, __entry->root_blkno, __entry->root_seq,
__entry->root_height, __entry->blkno)
);
DEFINE_EVENT(scoutfs_btree_free_blocks, scoutfs_btree_free_blocks_single,
TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root,
u64 blkno),
TP_ARGS(sb, root, blkno)
);
DEFINE_EVENT(scoutfs_btree_free_blocks, scoutfs_btree_free_blocks_leaf,
TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root,
u64 blkno),
TP_ARGS(sb, root, blkno)
);
DEFINE_EVENT(scoutfs_btree_free_blocks, scoutfs_btree_free_blocks_parent,
TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root,
u64 blkno),
TP_ARGS(sb, root, blkno)
);

TRACE_EVENT(scoutfs_online_offline_blocks,
TP_PROTO(struct inode *inode, s64 on_delta, s64 off_delta,
u64 on_now, u64 off_now),
@@ -2043,116 +1900,6 @@ TRACE_EVENT(scoutfs_trans_seq_last,
SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
);

TRACE_EVENT(scoutfs_get_log_merge_status,
TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key,
u64 nr_requests, u64 nr_complete, u64 last_seq, u64 seq),

TP_ARGS(sb, rid, next_range_key, nr_requests, nr_complete, last_seq, seq),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, s_rid)
sk_trace_define(next_range_key)
__field(__u64, nr_requests)
__field(__u64, nr_complete)
__field(__u64, last_seq)
__field(__u64, seq)
),

TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->s_rid = rid;
sk_trace_assign(next_range_key, next_range_key);
__entry->nr_requests = nr_requests;
__entry->nr_complete = nr_complete;
__entry->last_seq = last_seq;
__entry->seq = seq;
),

TP_printk(SCSBF" rid %016llx next_range_key "SK_FMT" nr_requests %llu nr_complete %llu last_seq %llu seq %llu",
SCSB_TRACE_ARGS, __entry->s_rid, sk_trace_args(next_range_key),
__entry->nr_requests, __entry->nr_complete, __entry->last_seq, __entry->seq)
);

TRACE_EVENT(scoutfs_get_log_merge_request,
TP_PROTO(struct super_block *sb, u64 rid,
struct scoutfs_btree_root *root, struct scoutfs_key *start,
struct scoutfs_key *end, u64 last_seq, u64 seq),

TP_ARGS(sb, rid, root, start, end, last_seq, seq),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, s_rid)
__field(__u64, root_blkno)
__field(__u64, root_seq)
__field(__u8, root_height)
sk_trace_define(start)
sk_trace_define(end)
__field(__u64, last_seq)
__field(__u64, seq)
),

TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->s_rid = rid;
__entry->root_blkno = le64_to_cpu(root->ref.blkno);
__entry->root_seq = le64_to_cpu(root->ref.seq);
__entry->root_height = root->height;
sk_trace_assign(start, start);
sk_trace_assign(end, end);
__entry->last_seq = last_seq;
__entry->seq = seq;
),

TP_printk(SCSBF" rid %016llx root blkno %llu seq %llu height %u start "SK_FMT" end "SK_FMT" last_seq %llu seq %llu",
SCSB_TRACE_ARGS, __entry->s_rid, __entry->root_blkno,
__entry->root_seq, __entry->root_height,
sk_trace_args(start), sk_trace_args(end), __entry->last_seq,
__entry->seq)
);

TRACE_EVENT(scoutfs_get_log_merge_complete,
TP_PROTO(struct super_block *sb, u64 rid,
struct scoutfs_btree_root *root, struct scoutfs_key *start,
struct scoutfs_key *end, struct scoutfs_key *remain,
u64 seq, u64 flags),

TP_ARGS(sb, rid, root, start, end, remain, seq, flags),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, s_rid)
__field(__u64, root_blkno)
__field(__u64, root_seq)
__field(__u8, root_height)
sk_trace_define(start)
sk_trace_define(end)
sk_trace_define(remain)
__field(__u64, seq)
__field(__u64, flags)
),

TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->s_rid = rid;
__entry->root_blkno = le64_to_cpu(root->ref.blkno);
__entry->root_seq = le64_to_cpu(root->ref.seq);
__entry->root_height = root->height;
sk_trace_assign(start, start);
sk_trace_assign(end, end);
sk_trace_assign(remain, remain);
__entry->seq = seq;
__entry->flags = flags;
),

TP_printk(SCSBF" rid %016llx root blkno %llu seq %llu height %u start "SK_FMT" end "SK_FMT" remain "SK_FMT" seq %llu flags 0x%llx",
SCSB_TRACE_ARGS, __entry->s_rid, __entry->root_blkno,
__entry->root_seq, __entry->root_height,
sk_trace_args(start), sk_trace_args(end),
sk_trace_args(remain), __entry->seq, __entry->flags)
);

DECLARE_EVENT_CLASS(scoutfs_forest_bloom_class,
TP_PROTO(struct super_block *sb, struct scoutfs_key *key,
u64 rid, u64 nr, u64 blkno, u64 seq, unsigned int count),

2013	kmod/src/server.c	(file diff suppressed because it is too large)
@@ -56,15 +56,13 @@ do { \
__entry->name##_data_len, __entry->name##_cmd, __entry->name##_flags, \
__entry->name##_error

u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb);

int scoutfs_server_lock_request(struct super_block *sb, u64 rid,
struct scoutfs_net_lock *nl);
int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
struct scoutfs_net_lock *nl);
int scoutfs_server_lock_recover_request(struct super_block *sb, u64 rid,
struct scoutfs_key *key);
void scoutfs_server_hold_commit(struct super_block *sb);
int scoutfs_server_hold_commit(struct super_block *sb);
int scoutfs_server_apply_commit(struct super_block *sb, int err);
void scoutfs_server_recov_finish(struct super_block *sb, u64 rid, int which);

@@ -73,10 +71,8 @@ int scoutfs_server_send_omap_request(struct super_block *sb, u64 rid,
int scoutfs_server_send_omap_response(struct super_block *sb, u64 rid, u64 id,
struct scoutfs_open_ino_map *map, int err);

u64 scoutfs_server_seq(struct super_block *sb);
u64 scoutfs_server_next_seq(struct super_block *sb);
void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq);

struct sockaddr_in;
struct scoutfs_quorum_elected_info;
int scoutfs_server_start(struct super_block *sb, u64 term);
void scoutfs_server_abort(struct super_block *sb);
void scoutfs_server_stop(struct super_block *sb);

@@ -989,13 +989,12 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
struct scoutfs_btree_root *root,
struct scoutfs_srch_file *sfl, bool force)
struct scoutfs_srch_file *sfl)
{
struct scoutfs_key key;
int ret;

if (sfl->ref.blkno == 0 ||
(!force && le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT))
if (le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT)
return 0;

init_srch_key(&key, SCOUTFS_SRCH_LOG_TYPE,

@@ -37,7 +37,7 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
struct scoutfs_btree_root *root,
struct scoutfs_srch_file *sfl, bool force);
struct scoutfs_srch_file *sfl);
int scoutfs_srch_get_compact(struct super_block *sb,
struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,

@@ -46,8 +46,6 @@
#include "alloc.h"
#include "recov.h"
#include "omap.h"
#include "volopt.h"
#include "fence.h"
#include "scoutfs_trace.h"

static struct dentry *scoutfs_debugfs_root;
@@ -247,14 +245,14 @@ static void scoutfs_put_super(struct super_block *sb)

trace_scoutfs_put_super(sb);

scoutfs_inode_stop(sb);
scoutfs_forest_stop(sb);
scoutfs_srch_destroy(sb);

scoutfs_unlock(sb, sbi->rid_lock, SCOUTFS_LOCK_WRITE);
sbi->rid_lock = NULL;

scoutfs_lock_shutdown(sb);

scoutfs_shutdown_trans(sb);
scoutfs_volopt_destroy(sb);
scoutfs_client_destroy(sb);
scoutfs_inode_destroy(sb);
scoutfs_item_destroy(sb);
@@ -270,7 +268,6 @@ static void scoutfs_put_super(struct super_block *sb)

scoutfs_block_destroy(sb);
scoutfs_destroy_triggers(sb);
scoutfs_fence_destroy(sb);
scoutfs_options_destroy(sb);
scoutfs_sysfs_destroy_attrs(sb, &sbi->mopts_ssa);
debugfs_remove(sbi->debug_root);
@@ -284,21 +281,6 @@ static void scoutfs_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
}

/*
* Record that we're performing a forced unmount.  As put_super drives
* destruction of the filesystem we won't issue more network or storage
* operations because we assume that they'll hang.  Pending operations
* can return errors when it's possible to do so.  We may be racing with
* pending operations which can't be canceled.
*/
static void scoutfs_umount_begin(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);

scoutfs_warn(sb, "forcing unmount, can return errors and lose unsynced data");
sbi->forced_unmount = true;
}

static const struct super_operations scoutfs_super_ops = {
.alloc_inode = scoutfs_alloc_inode,
.drop_inode = scoutfs_drop_inode,
@@ -308,7 +290,6 @@ static const struct super_operations scoutfs_super_ops = {
.statfs = scoutfs_statfs,
.show_options = scoutfs_show_options,
.put_super = scoutfs_put_super,
.umount_begin = scoutfs_umount_begin,
};

/*
@@ -607,7 +588,6 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
scoutfs_sysfs_create_attrs(sb, &sbi->mopts_ssa,
mount_options_attrs, "mount_options") ?:
scoutfs_setup_triggers(sb) ?:
scoutfs_fence_setup(sb) ?:
scoutfs_block_setup(sb) ?:
scoutfs_forest_setup(sb) ?:
scoutfs_item_setup(sb) ?:
@@ -621,10 +601,10 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
scoutfs_server_setup(sb) ?:
scoutfs_quorum_setup(sb) ?:
scoutfs_client_setup(sb) ?:
scoutfs_volopt_setup(sb) ?:
scoutfs_lock_rid(sb, SCOUTFS_LOCK_WRITE, 0, sbi->rid,
&sbi->rid_lock) ?:
scoutfs_trans_get_log_trees(sb) ?:
scoutfs_srch_setup(sb) ?:
scoutfs_inode_start(sb);
scoutfs_srch_setup(sb);
if (ret)
goto out;

@@ -645,6 +625,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
goto out;

scoutfs_trans_restart_sync_deadline(sb);
// scoutfs_scan_orphans(sb);
ret = 0;
out:
/* on error, generic_shutdown_super calls put_super if s_root */

@@ -28,14 +28,13 @@ struct forest_info;
struct srch_info;
struct recov_info;
struct omap_info;
struct volopt_info;
struct fence_info;

struct scoutfs_sb_info {
struct super_block *sb;

/* assigned once at the start of each mount, read-only */
u64 rid;
struct scoutfs_lock *rid_lock;

struct scoutfs_super_block super;

@@ -52,9 +51,7 @@ struct scoutfs_sb_info {
struct forest_info *forest_info;
struct srch_info *srch_info;
struct omap_info *omap_info;
struct volopt_info *volopt_info;
struct item_cache_info *item_cache_info;
struct fence_info *fence_info;

wait_queue_head_t trans_hold_wq;
struct task_struct *trans_task;
@@ -88,8 +85,6 @@ struct scoutfs_sb_info {

struct dentry *debug_root;

bool forced_unmount;

unsigned long corruption_messages_once[SC_NR_LONGS];
};

@@ -110,13 +105,6 @@ static inline bool SCOUTFS_IS_META_BDEV(struct scoutfs_super_block *super_block)

#define SCOUTFS_META_BDEV_MODE (FMODE_READ | FMODE_WRITE | FMODE_EXCL)

static inline bool scoutfs_forcing_unmount(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);

return sbi->forced_unmount;
}

/*
* A small string embedded in messages that's used to identify a
* specific mount.  It's the three most significant bytes of the fsid

@@ -131,10 +131,9 @@ void scoutfs_sysfs_init_attrs(struct super_block *sb,
* If this returns success then the file will be visible and show can
* be called until unmount.
*/
int scoutfs_sysfs_create_attrs_parent(struct super_block *sb,
struct kobject *parent,
struct scoutfs_sysfs_attrs *ssa,
struct attribute **attrs, char *fmt, ...)
int scoutfs_sysfs_create_attrs(struct super_block *sb,
struct scoutfs_sysfs_attrs *ssa,
struct attribute **attrs, char *fmt, ...)
{
va_list args;
size_t name_len;
@@ -175,8 +174,8 @@ int scoutfs_sysfs_create_attrs_parent(struct super_block *sb,
goto out;
}

ret = kobject_init_and_add(&ssa->kobj, &ssa->ktype, parent,
"%s", ssa->name);
ret = kobject_init_and_add(&ssa->kobj, &ssa->ktype,
scoutfs_sysfs_sb_dir(sb), "%s", ssa->name);
out:
if (ret) {
kfree(ssa->name);

@@ -10,8 +10,6 @@

#define SCOUTFS_ATTR_RO(_name) \
static struct kobj_attribute scoutfs_attr_##_name = __ATTR_RO(_name)
#define SCOUTFS_ATTR_RW(_name) \
static struct kobj_attribute scoutfs_attr_##_name = __ATTR_RW(_name)

#define SCOUTFS_ATTR_PTR(_name) \
&scoutfs_attr_##_name.attr
@@ -36,14 +34,9 @@ struct scoutfs_sysfs_attrs {

void scoutfs_sysfs_init_attrs(struct super_block *sb,
struct scoutfs_sysfs_attrs *ssa);
int scoutfs_sysfs_create_attrs_parent(struct super_block *sb,
struct kobject *parent,
struct scoutfs_sysfs_attrs *ssa,
struct attribute **attrs, char *fmt, ...);
#define scoutfs_sysfs_create_attrs(sb, ssa, attrs, fmt, args...) \
scoutfs_sysfs_create_attrs_parent(sb, scoutfs_sysfs_sb_dir(sb), \
ssa, attrs, fmt, ##args)

int scoutfs_sysfs_create_attrs(struct super_block *sb,
struct scoutfs_sysfs_attrs *ssa,
struct attribute **attrs, char *fmt, ...);
void scoutfs_sysfs_destroy_attrs(struct super_block *sb,
struct scoutfs_sysfs_attrs *ssa);

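A hedged sketch of a caller of the restored scoutfs_sysfs_create_attrs() function above, built from the SCOUTFS_ATTR_RO()/SCOUTFS_ATTR_PTR() macros shown in this header; the "example" attribute and its constant value are invented for illustration:

static ssize_t example_show(struct kobject *kobj, struct kobj_attribute *attr,
			    char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%d\n", 1);
}
SCOUTFS_ATTR_RO(example);

static struct attribute *example_attrs[] = {
	SCOUTFS_ATTR_PTR(example),
	NULL,
};

/* during setup: ret = scoutfs_sysfs_create_attrs(sb, &ssa, example_attrs, "example"); */
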
108	kmod/src/trans.c
@@ -185,11 +185,6 @@ void scoutfs_trans_write_func(struct work_struct *work)

wait_event(sbi->trans_hold_wq, drained_holders(tri));

if (scoutfs_forcing_unmount(sb)) {
ret = -EIO;
goto out;
}

trace_scoutfs_trans_write_func(sb,
scoutfs_block_writer_dirty_bytes(sb, &tri->wri));

@@ -207,7 +202,7 @@ void scoutfs_trans_write_func(struct work_struct *work)
if (ret < 0)
s = "clean advance seq";
}
goto err;
goto out;
}

if (sbi->trans_deadline_expired)
@@ -227,12 +222,11 @@ void scoutfs_trans_write_func(struct work_struct *work)
scoutfs_item_write_done(sb) ?:
(s = "advance seq", scoutfs_client_advance_seq(sb, &trans_seq)) ?:
(s = "get log trees", scoutfs_trans_get_log_trees(sb));
err:
out:
if (ret < 0)
scoutfs_err(sb, "critical transaction commit failure: %s, %d",
s, ret);

out:
spin_lock(&sbi->trans_write_lock);
sbi->trans_write_count++;
sbi->trans_write_ret = ret;
@@ -436,8 +430,8 @@ static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
return true;
}

/* if we're low and can't refill then alloc could empty and return enospc */
if (scoutfs_data_alloc_should_refill(sb, SCOUTFS_ALLOC_DATA_REFILL_THRESH)) {
/* Try to refill data allocator before premature enospc */
if (scoutfs_data_alloc_free_bytes(sb) <= SCOUTFS_TRANS_DATA_ALLOC_LWM) {
scoutfs_inc_counter(sb, trans_commit_data_alloc_low);
return true;
}
@@ -445,15 +439,38 @@ static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
return false;
}

/*
* called as a wait_event condition, needs to be careful to not change
* task state and is racing with waking paths that sub_return, test, and
* wake.
*/
static bool holders_no_writer(struct trans_info *tri)
static bool acquired_hold(struct super_block *sb)
{
smp_mb(); /* make sure task in wait_event queue before atomic read */
return !(atomic_read(&tri->holders) & TRANS_HOLDERS_WRITE_FUNC_BIT);
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_TRANS_INFO(sb, tri);
bool acquired;

/* if a caller already has a hold we acquire unconditionally */
if (inc_journal_info_holders()) {
atomic_inc(&tri->holders);
acquired = true;
goto out;
}

/* wait if the writer is blocking holds */
if (!inc_holders_unless_writer(tri)) {
dec_journal_info_holders();
acquired = false;
goto out;
}

/* wait if we're triggering another commit */
if (commit_before_hold(sb, tri)) {
release_holders(sb);
queue_trans_work(sbi);
acquired = false;
goto out;
}

trace_scoutfs_trans_acquired_hold(sb, current->journal_info, atomic_read(&tri->holders));
acquired = true;
out:
return acquired;
}

/*
@@ -469,64 +486,15 @@ static bool holders_no_writer(struct trans_info *tri)
* The writing thread marks itself as a global trans_task which
* short-circuits all the hold machinery so it can call code that would
* otherwise try to hold transactions while it is writing.
*
* If the caller is adding metadata items that will eventually consume
* free space -- not dirtying existing items or adding deletion items --
* then we can return enospc if our metadata allocator indicates that
* we're low on space.
*/
int scoutfs_hold_trans(struct super_block *sb, bool allocing)
int scoutfs_hold_trans(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_TRANS_INFO(sb, tri);
u64 seq;
int ret;

if (current == sbi->trans_task)
return 0;

for (;;) {
/* if a caller already has a hold we acquire unconditionally */
if (inc_journal_info_holders()) {
atomic_inc(&tri->holders);
ret = 0;
break;
}

/* wait until the writer work is finished */
if (!inc_holders_unless_writer(tri)) {
dec_journal_info_holders();
ret = wait_event_interruptible(sbi->trans_hold_wq, holders_no_writer(tri));
if (ret < 0)
break;
continue;
}

/* return enospc if server is into reserved blocks and we're allocating */
if (allocing && scoutfs_alloc_test_flag(sb, &tri->alloc, SCOUTFS_ALLOC_FLAG_LOW)) {
release_holders(sb);
ret = -ENOSPC;
break;
}

/* see if we need to trigger and wait for a commit before holding */
if (commit_before_hold(sb, tri)) {
seq = scoutfs_trans_sample_seq(sb);
release_holders(sb);
queue_trans_work(sbi);
ret = wait_event_interruptible(sbi->trans_hold_wq,
scoutfs_trans_sample_seq(sb) != seq);
if (ret < 0)
break;
continue;
}

ret = 0;
break;
}

trace_scoutfs_hold_trans(sb, current->journal_info, atomic_read(&tri->holders), ret);
return ret;
return wait_event_interruptible(sbi->trans_hold_wq, acquired_hold(sb));
}

/*
@@ -551,7 +519,7 @@ void scoutfs_release_trans(struct super_block *sb)

release_holders(sb);

trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders), 0);
trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders));
}

/*

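The rework above folds every acquisition attempt into acquired_hold() so it can serve directly as the wait_event_interruptible() condition: each evaluation either takes the hold or leaves the task queued for the next wake. A toy illustration of that pattern with assumed names, not scoutfs code:

#include <linux/wait.h>
#include <linux/atomic.h>

struct gate {
	wait_queue_head_t waitq;
	atomic_t busy;			/* non-zero while entry is blocked */
};

/* the condition has the side effect of acquiring on success */
static bool gate_try_enter(struct gate *g)
{
	return atomic_cmpxchg(&g->busy, 0, 1) == 0;
}

static int gate_enter(struct gate *g)
{
	/* re-evaluated after each wake_up(); a wakeup between a failed
	 * attempt and sleeping can't be lost */
	return wait_event_interruptible(g->waitq, gate_try_enter(g));
}

static void gate_exit(struct gate *g)
{
	atomic_set(&g->busy, 0);
	wake_up(&g->waitq);
}
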
@@ -1,13 +1,18 @@
#ifndef _SCOUTFS_TRANS_H_
#define _SCOUTFS_TRANS_H_

/* the server will attempt to fill data allocs for each trans */
#define SCOUTFS_TRANS_DATA_ALLOC_HWM (2ULL * 1024 * 1024 * 1024)
/* the client will force commits if data allocators get too low */
#define SCOUTFS_TRANS_DATA_ALLOC_LWM (256ULL * 1024 * 1024)

void scoutfs_trans_write_func(struct work_struct *work);
int scoutfs_trans_sync(struct super_block *sb, int wait);
int scoutfs_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
void scoutfs_trans_restart_sync_deadline(struct super_block *sb);

int scoutfs_hold_trans(struct super_block *sb, bool allocing);
int scoutfs_hold_trans(struct super_block *sb);
bool scoutfs_trans_held(void);
void scoutfs_release_trans(struct super_block *sb);
u64 scoutfs_trans_sample_seq(struct super_block *sb);

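With the allocing argument gone, holding a transaction becomes a plain acquire/modify/release bracket. A hedged usage sketch of the declarations above; modify_items() stands in for whatever item updates a caller performs:

static int update_under_trans(struct super_block *sb)
{
	int ret;

	ret = scoutfs_hold_trans(sb);
	if (ret < 0)		/* e.g. -ERESTARTSYS while waiting */
		return ret;

	ret = modify_items(sb);

	scoutfs_release_trans(sb);
	return ret;
}
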
@@ -1,188 +0,0 @@
/*
* Copyright (C) 2021 Versity Software, Inc.  All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

#include "super.h"
#include "client.h"
#include "volopt.h"

/*
* Volume options are exposed through a sysfs directory.  Getting and
* setting the values sends rpcs to the server who owns the options in
* the super block.
*/

struct volopt_info {
struct super_block *sb;
struct scoutfs_sysfs_attrs ssa;
};

#define DECLARE_VOLOPT_INFO(sb, name) \
struct volopt_info *name = SCOUTFS_SB(sb)->volopt_info
#define DECLARE_VOLOPT_INFO_KOBJ(kobj, name) \
DECLARE_VOLOPT_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)

/*
* attribute arrays need to be dense but the options we export could
* well become sparse over time.  .store and .load are generic and we
* have a lookup table to map the attributes array indexes to the number
* and name of the option.
*/
static struct volopt_nr_name {
int nr;
char *name;
} volopt_table[] = {
{ SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR, "data_alloc_zone_blocks" },
};

/* initialized by setup, pointer array is null terminated */
static struct kobj_attribute volopt_attrs[ARRAY_SIZE(volopt_table)];
static struct attribute *volopt_attr_ptrs[ARRAY_SIZE(volopt_table) + 1];

static void get_opt_data(struct kobj_attribute *attr, struct scoutfs_volume_options *volopt,
u64 *bit, __le64 **opt)
{
size_t index = attr - &volopt_attrs[0];
int nr = volopt_table[index].nr;

*bit = 1ULL << nr;
*opt = &volopt->set_bits + 1 + nr;
}

static ssize_t volopt_attr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
DECLARE_VOLOPT_INFO_KOBJ(kobj, vinf);
struct super_block *sb = vinf->sb;
struct scoutfs_volume_options volopt;
__le64 *opt;
u64 bit;
int ret;

ret = scoutfs_client_get_volopt(sb, &volopt);
if (ret < 0)
return ret;

get_opt_data(attr, &volopt, &bit, &opt);

if (le64_to_cpu(volopt.set_bits) & bit) {
return snprintf(buf, PAGE_SIZE, "%llu", le64_to_cpup(opt));
} else {
buf[0] = '\0';
return 0;
}
}

static ssize_t volopt_attr_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
DECLARE_VOLOPT_INFO_KOBJ(kobj, vinf);
struct super_block *sb = vinf->sb;
struct scoutfs_volume_options volopt = {0,};
u8 chars[32];
__le64 *opt;
u64 bit;
u64 val;
int ret;

if (count == 0)
return 0;
if (count > sizeof(chars) - 1)
return -ERANGE;

get_opt_data(attr, &volopt, &bit, &opt);

if (buf[0] == '\n' || buf[0] == '\r') {
volopt.set_bits = cpu_to_le64(bit);

ret = scoutfs_client_clear_volopt(sb, &volopt);
} else {
memcpy(chars, buf, count);
chars[count] = '\0';
ret = kstrtoull(chars, 0, &val);
if (ret < 0)
return ret;

volopt.set_bits = cpu_to_le64(bit);
*opt = cpu_to_le64(val);

ret = scoutfs_client_set_volopt(sb, &volopt);
}

if (ret == 0)
ret = count;
return ret;
}

/*
* The volume option sysfs files are slim shims around RPCs so this
* should be called after the client is setup and before it is torn
* down.
*/
int scoutfs_volopt_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct volopt_info *vinf;
int ret;
int i;

/* persistent volume options are always a bitmap u64 then the 64 options */
BUILD_BUG_ON(sizeof(struct scoutfs_volume_options) != (1 + 64) * 8);

vinf = kzalloc(sizeof(struct volopt_info), GFP_KERNEL);
if (!vinf) {
ret = -ENOMEM;
goto out;
}

scoutfs_sysfs_init_attrs(sb, &vinf->ssa);
vinf->sb = sb;
sbi->volopt_info = vinf;

for (i = 0; i < ARRAY_SIZE(volopt_table); i++) {
volopt_attrs[i] = (struct kobj_attribute) {
.attr = { .name = volopt_table[i].name, .mode = S_IWUSR | S_IRUGO },
.show = volopt_attr_show,
.store = volopt_attr_store,
};
volopt_attr_ptrs[i] = &volopt_attrs[i].attr;
}

BUILD_BUG_ON(ARRAY_SIZE(volopt_table) != ARRAY_SIZE(volopt_attr_ptrs) - 1);
volopt_attr_ptrs[i] = NULL;

ret = scoutfs_sysfs_create_attrs(sb, &vinf->ssa, volopt_attr_ptrs, "volume_options");
if (ret < 0)
goto out;

out:
if (ret)
scoutfs_volopt_destroy(sb);

return ret;
}

void scoutfs_volopt_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct volopt_info *vinf = SCOUTFS_SB(sb)->volopt_info;

if (vinf) {
scoutfs_sysfs_destroy_attrs(sb, &vinf->ssa);
kfree(vinf);
sbi->volopt_info = NULL;
}
}
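For reference, the pointer arithmetic in the deleted get_opt_data() depends on the persistent layout that its BUILD_BUG_ON() checks: one set_bits bitmap u64 followed by 64 option u64s. A small sketch of that mapping, for illustration only:

/* option nr N occupies the (1 + N)'th __le64 of the struct */
static __le64 *volopt_slot(struct scoutfs_volume_options *volopt, int nr)
{
	return &volopt->set_bits + 1 + nr;
}

static bool volopt_is_set(struct scoutfs_volume_options *volopt, int nr)
{
	return le64_to_cpu(volopt->set_bits) & (1ULL << nr);
}
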
@@ -1,7 +0,0 @@
#ifndef _SCOUTFS_VOLOPT_H_
#define _SCOUTFS_VOLOPT_H_

int scoutfs_volopt_setup(struct super_block *sb);
void scoutfs_volopt_destroy(struct super_block *sb);

#endif
@@ -577,7 +577,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
retry:
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
if (ret > 0)
goto retry;
if (ret)
@@ -778,7 +778,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
&tgs) != 0)
memset(&tgs, 0, sizeof(tgs));

ret = scoutfs_hold_trans(sb, false);
ret = scoutfs_hold_trans(sb);
if (ret < 0)
break;
release = true;

2	tests/.gitignore	(vendored)
@@ -4,5 +4,3 @@ src/dumb_setxattr
src/handle_cat
src/bulk_create_paths
src/find_xattrs
src/stage_tmpfile
src/create_xattr_loop

@@ -7,8 +7,7 @@ BIN := src/createmany \
src/handle_cat \
src/bulk_create_paths \
src/stage_tmpfile \
src/find_xattrs \
src/create_xattr_loop
src/find_xattrs

DEPS := $(wildcard src/*.d)

@@ -62,16 +62,5 @@ t_filter_dmesg()
# in debugging kernels we can slow things down a bit
re="$re|hrtimer: interrupt took .*"

# fencing tests force unmounts and trigger timeouts
re="$re|scoutfs .* forcing unmount"
re="$re|scoutfs .* reconnect timed out"
re="$re|scoutfs .* recovery timeout expired"
re="$re|scoutfs .* fencing previous leader"
re="$re|scoutfs .* reclaimed resources"
re="$re|scoutfs .* quorum .* error"
re="$re|scoutfs .* error reading quorum block"
re="$re|scoutfs .* error .* writing quorum block"
re="$re|scoutfs .* error .* while checking to delete inode"

egrep -v "($re)"
}

@@ -17,17 +17,6 @@ t_sync_seq_index()
t_quiet sync
}

t_mount_rid()
{
local nr="${1:-0}"
local mnt="$(eval echo \$T_M$nr)"
local rid

rid=$(scoutfs statfs -s rid -p "$mnt")

echo "$rid"
}

#
# Output the "f.$fsid.r.$rid" identifier string for the given mount
# number, 0 is used by default if none is specified.
@@ -143,16 +132,6 @@ t_umount()
eval t_quiet umount \$T_M$nr
}

t_force_umount()
{
local nr="$1"

test "$nr" -lt "$T_NR_MOUNTS" || \
t_fail "fs nr $nr invalid"

eval t_quiet umount -f \$T_M$nr
}

#
# Attempt to mount all the configured mounts, assuming that they're
# not already mounted.
@@ -298,67 +277,3 @@ t_counter_diff_changed() {
echo "counter $which didn't change" ||
echo "counter $which changed"
}

#
# See if we can find a local mount with the caller's rid.
#
t_rid_is_mounted() {
local rid="$1"
local fr="$1"

for fr in /sys/fs/scoutfs/*; do
if [ "$(cat $fr/rid)" == "$rid" ]; then
return 0
fi
done

return 1
}

#
# A given mount is being fenced if any mount has a fence request pending
# for it which hasn't finished and been removed.
#
t_rid_is_fencing() {
local rid="$1"
local fr

for fr in /sys/fs/scoutfs/*; do
if [ -d "$fr/fence/$rid" ]; then
return 0
fi
done

return 1
}

#
# Wait until the mount identified by the first rid arg is not in any
# states specified by the remaining state description word args.
#
t_wait_if_rid_is() {
local rid="$1"

while ( [[ $* =~ mounted ]] && t_rid_is_mounted $rid ) ||
( [[ $* =~ fencing ]] && t_rid_is_fencing $rid ) ; do
sleep .5
done
}

#
# Wait until any mount identifies itself as the elected leader.  We can
# be waiting while tests mount and unmount so mounts may not be mounted
# at the test's expected mount points.
#
t_wait_for_leader() {
local i

while sleep .25; do
for i in $(t_fs_nrs); do
local ldr="$(t_sysfs_path $i 2>/dev/null)/quorum/is_leader"
if [ "$(cat $ldr 2>/dev/null)" == "1" ]; then
return
fi
done
done
}

@@ -1,8 +0,0 @@
== prepare directories and files
== fallocate until enospc
== remove all the files and verify free data blocks
== make small meta fs
== create large xattrs until we fill up metadata
== remove files with xattrs after enospc
== make sure we can create again
== cleanup small meta fs
@@ -1,5 +0,0 @@
== make sure all mounts can see each other
== force unmount one client, connection timeout, fence nop, mount
== force unmount all non-server, connection timeout, fence nop, mount
== force unmount server, quorum elects new leader, fence nop, mount
== force unmount everything, new server fences all previous
@@ -1,4 +0,0 @@
== test our inode existence function
== unlinked and opened inodes still exist
== orphan from failed evict deletion is picked up
== orphaned inos in all mounts all deleted
@@ -18,15 +18,10 @@ die() {
exit 1
}

timestamp()
{
date '+%F %T.%N'
}

# output a message with a timestamp to the run.log
log()
{
echo "[$(timestamp)] $*" >> "$T_RESULTS/run.log"
echo "[$(date '+%F %T.%N')] $*" >> "$T_RESULTS/run.log"
}

# run a logged command, exiting if it fails
@@ -71,7 +66,6 @@ $(basename $0) options:
-X | xfstests git repo.  Used by tests/xfstests.sh.
-x | xfstests git branch to checkout and track.
-y | xfstests ./check additional args
-z <nr> | set data-alloc-zone-blocks in mkfs
EOF
}

@@ -175,11 +169,6 @@ while true; do
T_XFSTESTS_ARGS="$2"
shift
;;
-z)
test -n "$2" || die "-z must have nr mounts argument"
T_DATA_ALLOC_ZONE_BLOCKS="-z $2"
shift
;;
-h|-\?|--help)
show_help
exit 1
@@ -330,8 +319,7 @@ if [ -n "$T_MKFS" ]; then
done

msg "making new filesystem with $T_QUORUM quorum members"
cmd scoutfs mkfs -f $quo $T_DATA_ALLOC_ZONE_BLOCKS \
"$T_META_DEVICE" "$T_DATA_DEVICE"
cmd scoutfs mkfs -f $quo "$T_META_DEVICE" "$T_DATA_DEVICE"
fi

if [ -n "$T_INSMOD" ]; then
@@ -372,39 +360,6 @@ cmd cat /sys/kernel/debug/tracing/set_event
cmd grep . /sys/kernel/debug/tracing/options/trace_printk \
/proc/sys/kernel/ftrace_dump_on_oops

#
# Build a fenced config that runs scripts out of the repository rather
# than the default system directory
#
conf="$T_RESULTS/scoutfs-fencd.conf"
cat > $conf << EOF
SCOUTFS_FENCED_DELAY=1
SCOUTFS_FENCED_RUN=$T_UTILS/fenced/local-force-unmount
SCOUTFS_FENCED_RUN_ARGS=""
EOF
export SCOUTFS_FENCED_CONFIG_FILE="$conf"

#
# Run the agent in the background, log its output, and kill it if we
# exit
#
fenced_log()
{
echo "[$(timestamp)] $*" >> "$T_RESULTS/fenced.stdout.log"
}
fenced_pid=""
kill_fenced()
{
if test -n "$fenced_pid" -a -d "/proc/$fenced_pid" ; then
fenced_log "killing fenced pid $fenced_pid"
kill "$fenced_pid"
fi
}
trap kill_fenced EXIT
$T_UTILS/fenced/scoutfs-fenced > "$T_RESULTS/fenced.stdout.log" 2> "$T_RESULTS/fenced.stderr.log" &
fenced_pid=$!
fenced_log "started fenced pid $fenced_pid in the background"

#
# mount concurrently so that a quorum is present to elect the leader and
# start a server.

@@ -7,7 +7,6 @@ simple-release-extents.sh
setattr_more.sh
offline-extent-waiting.sh
move-blocks.sh
enospc.sh
srch-basic-functionality.sh
simple-xattr-unit.sh
lock-refleak.sh
@@ -23,14 +22,11 @@ stage-multi-part.sh
stage-tmpfile.sh
basic-posix-consistency.sh
dirent-consistency.sh
mkdir-rename-rmdir.sh
lock-ex-race-processes.sh
lock-conflicting-batch-commit.sh
cross-mount-data-free.sh
persistent-item-vers.sh
setup-error-teardown.sh
fence-and-reclaim.sh
orphan-inodes.sh
mount-unmount-race.sh
createmany-parallel-mounts.sh
archive-light-cycle.sh

@@ -1,113 +0,0 @@
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/xattr.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <limits.h>

static void exit_usage(void)
{
printf(" -h/-? output this usage message and exit\n"
" -c <count> number of xattrs to create\n"
" -n <string> xattr name prefix, -NR is appended\n"
" -p <path> string with path to file with xattrs\n"
" -s <size> xattr value size\n");
exit(1);
}

int main(int argc, char **argv)
{
char *pref = NULL;
char *path = NULL;
char *val;
char *name;
unsigned long long count = 0;
unsigned long long size = 0;
unsigned long long i;
int ret;
int c;

while ((c = getopt(argc, argv, "+c:n:p:s:")) != -1) {

switch (c) {
case 'c':
count = strtoull(optarg, NULL, 0);
break;
case 'n':
pref = strdup(optarg);
break;
case 'p':
path = strdup(optarg);
break;
case 's':
size = strtoull(optarg, NULL, 0);
break;
case '?':
printf("unknown argument: %c\n", optind);
case 'h':
exit_usage();
}
}

if (count == 0) {
printf("specify count of xattrs to create with -c\n");
exit(1);
}

if (count == ULLONG_MAX) {
printf("invalid -c count\n");
exit(1);
}

if (size == 0) {
printf("specify xattrs value size with -s\n");
exit(1);
}

if (size == ULLONG_MAX || size < 2) {
printf("invalid -s size\n");
exit(1);
}

if (path == NULL) {
printf("specify path to file with -p\n");
exit(1);
}

if (pref == NULL) {
printf("specify xattr name prefix string with -n\n");
exit(1);
}

ret = snprintf(NULL, 0, "%s-%llu", pref, ULLONG_MAX) + 1;
name = malloc(ret);
if (!name) {
printf("couldn't allocate xattr name buffer\n");
exit(1);
}

val = malloc(size);
if (!val) {
printf("couldn't allocate xattr value buffer\n");
exit(1);
}

memset(val, 'a', size - 1);
val[size - 1] = '\0';

for (i = 0; i < count; i++) {
sprintf(name, "%s-%llu", pref, i);

ret = setxattr(path, name, val, size, 0);
if (ret) {
printf("returned %d errno %d (%s)\n",
ret, errno, strerror(errno));
return 1;
}
}

return 0;
}
@@ -1,100 +0,0 @@
|
||||
#
|
||||
# test hititng enospc by filling with data or metadata and
|
||||
# then recovering by removing what we filled.
|
||||
#
|
||||
|
||||
# Type Size Total Used Free Use%
|
||||
#MetaData 64KB 1048576 32782 1015794 3
|
||||
# Data 4KB 16777152 0 16777152 0
|
||||
free_blocks() {
|
||||
local md="$1"
|
||||
local mnt="$2"
|
||||
scoutfs df -p "$mnt" | awk '($1 == "'$md'") { print $5; exit }'
|
||||
}
|
||||
|
||||
t_require_commands scoutfs stat fallocate createmany
|
||||
|
||||
echo "== prepare directories and files"
|
||||
for n in $(t_fs_nrs); do
|
||||
eval path="\$T_D${n}/dir-$n/file-$n"
|
||||
mkdir -p $(dirname $path)
|
||||
touch $path
|
||||
done
|
||||
sync
|
||||
|
||||
echo "== fallocate until enospc"
|
||||
before=$(free_blocks Data "$T_M0")
|
||||
finished=0
|
||||
while [ $finished != 1 ]; do
|
||||
for n in $(t_fs_nrs); do
|
||||
eval path="\$T_D${n}/dir-$n/file-$n"
|
||||
off=$(stat -c "%s" "$path")
|
||||
|
||||
LC_ALL=C fallocate -o $off -l 128MiB "$path" > $T_TMP.fallocate 2>&1
|
||||
err="$?"
|
||||
|
||||
if grep -qi "no space" $T_TMP.fallocate; then
|
||||
finished=1
|
||||
break
|
||||
fi
|
||||
if [ "$err" != "0" ]; then
|
||||
t_fail "fallocate failed with $err"
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
echo "== remove all the files and verify free data blocks"
|
||||
for n in $(t_fs_nrs); do
|
||||
eval dir="\$T_D${n}/dir-$n"
|
||||
rm -rf "$dir"
|
||||
done
|
||||
sync
|
||||
after=$(free_blocks Data "$T_M0")
|
||||
# nothing else should be modifying data blocks
|
||||
test "$before" == "$after" || \
|
||||
t_fail "$after free data blocks after rm, expected $before"
|
||||
|
||||
# XXX this is all pretty manual, would be nice to have helpers
|
||||
echo "== make small meta fs"
|
||||
# meta device just big enough for reserves and the metadata we'll fill
|
||||
scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
|
||||
t_fail "mkfs failed"
|
||||
SCR="/mnt/scoutfs.enospc"
|
||||
mkdir -p "$SCR"
|
||||
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
|
||||
"$T_EX_DATA_DEV" "$SCR"
|
||||
|
||||
echo "== create large xattrs until we fill up metadata"
|
||||
mkdir -p "$SCR/xattrs"
|
||||
|
||||
for f in $(seq 1 100000); do
|
||||
file="$SCR/xattrs/file-$f"
|
||||
touch "$file"
|
||||
|
||||
LC_ALL=C create_xattr_loop -c 1000 -n user.scoutfs-enospc -p "$file" -s 65535 > $T_TMP.cxl 2>&1
|
||||
err="$?"
|
||||
|
||||
if grep -qi "no space" $T_TMP.cxl; then
|
||||
echo "enospc at f $f" >> $T_TMP.cxl
|
||||
break
|
||||
fi
|
||||
if [ "$err" != "0" ]; then
|
||||
t_fail "create_xattr_loop failed with $err"
|
||||
fi
|
||||
done
|
||||
|
||||
echo "== remove files with xattrs after enospc"
|
||||
rm -rf "$SCR/xattrs"
|
||||
|
||||
echo "== make sure we can create again"
|
||||
file="$SCR/file-after"
|
||||
touch $file
|
||||
setfattr -n user.scoutfs-enospc -v 1 "$file"
|
||||
sync
|
||||
rm -f "$file"
|
||||
|
||||
echo "== cleanup small meta fs"
|
||||
umount "$SCR"
|
||||
rmdir "$SCR"
|
||||
|
||||
t_pass
|
||||
@@ -1,127 +0,0 @@
|
||||
#
|
||||
# Fence nodes and reclaim their resources.
|
||||
#
|
||||
|
||||
t_require_commands sleep touch grep sync scoutfs
|
||||
t_require_mounts 2
|
||||
|
||||
#
|
||||
# Make sure that all mounts can read the results of a write from each
|
||||
# mount. And make sure that the greatest of all the written seqs is
|
||||
# visible after the writes were commited by remote reads.
|
||||
#
|
||||
check_read_write()
|
||||
{
|
||||
local expected
|
||||
local greatest=0
|
||||
local seq
|
||||
local path
|
||||
local saw
|
||||
local w
|
||||
local r
|
||||
|
||||
for w in $(t_fs_nrs); do
|
||||
expected="$w wrote at $(date --rfc-3339=ns)"
|
||||
eval path="\$T_D${w}/written"
|
||||
echo "$expected" > "$path"
|
||||
|
||||
seq=$(scoutfs stat -s meta_seq $path)
|
||||
if [ "$seq" -gt "$greatest" ]; then
|
||||
greatest=$seq
|
||||
fi
|
||||
|
||||
for r in $(t_fs_nrs); do
|
||||
eval path="\$T_D${r}/written"
|
||||
saw=$(cat "$path")
|
||||
if [ "$saw" != "$expected" ]; then
|
||||
echo "mount $r read '$saw' after mount $w wrote '$expected'"
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
seq=$(scoutfs statfs -s committed_seq -p $T_D0)
|
||||
if [ "$seq" -lt "$greatest" ]; then
|
||||
echo "committed_seq $seq less than greatest $greatest"
|
||||
fi
|
||||
}
|
||||
|
||||
echo "== make sure all mounts can see each other"
|
||||
check_read_write
|
||||
|
||||
echo "== force unmount one client, connection timeout, fence nop, mount"
|
||||
cl=$(t_first_client_nr)
|
||||
sv=$(t_server_nr)
|
||||
rid=$(t_mount_rid $cl)
|
||||
echo "cl $cl sv $sv rid $rid" >> "$T_TMP.log"
|
||||
sync
|
||||
t_force_umount $cl
|
||||
# wait for client reconnection to timeout
|
||||
while grep -q $rid $(t_debugfs_path $sv)/connections; do
|
||||
sleep .5
|
||||
done
|
||||
while t_rid_is_fencing $rid; do
|
||||
sleep .5
|
||||
done
|
||||
t_mount $cl
|
||||
check_read_write
|
||||
|
||||
echo "== force unmount all non-server, connection timeout, fence nop, mount"
|
||||
sv=$(t_server_nr)
|
||||
pattern="nonsense"
|
||||
sync
|
||||
for cl in $(t_fs_nrs); do
|
||||
if [ $cl == $sv ]; then
|
||||
continue;
|
||||
fi
|
||||
|
||||
rid=$(t_mount_rid $cl)
|
||||
pattern="$pattern|$rid"
|
||||
echo "cl $cl sv $sv rid $rid" >> "$T_TMP.log"
|
||||
|
||||
t_force_umount $cl
|
||||
done
|
||||
|
||||
# wait for all client reconnections to timeout
|
||||
while egrep -q "($pattern)" $(t_debugfs_path $sv)/connections; do
|
||||
sleep .5
|
||||
done
|
||||
# wait for all fence requests to complete
|
||||
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
|
||||
sleep .5
|
||||
done
|
||||
# remount all the clients
|
||||
for cl in $(t_fs_nrs); do
|
||||
if [ $cl == $sv ]; then
|
||||
continue;
|
||||
fi
|
||||
t_mount $cl
|
||||
done
|
||||
check_read_write
|
||||
|
||||
echo "== force unmount server, quorum elects new leader, fence nop, mount"
|
||||
sv=$(t_server_nr)
|
||||
rid=$(t_mount_rid $sv)
|
||||
echo "sv $sv rid $rid" >> "$T_TMP.log"
|
||||
sync
|
||||
t_force_umount $sv
|
||||
t_wait_for_leader
|
||||
# wait until new server is done fencing unmounted leader rid
|
||||
while t_rid_is_fencing $rid; do
|
||||
sleep .5
|
||||
done
|
||||
t_mount $sv
|
||||
check_read_write
|
||||
|
||||
echo "== force unmount everything, new server fences all previous"
|
||||
sync
|
||||
for nr in $(t_fs_nrs); do
|
||||
t_force_umount $nr
|
||||
done
|
||||
t_mount_all
|
||||
# wait for all fence requests to complete
|
||||
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
|
||||
sleep .5
|
||||
done
|
||||
check_read_write
|
||||
|
||||
t_pass
|
||||
@@ -1,59 +0,0 @@
|
||||
#
|
||||
# Sequentially perform operations on a dir (mkdir; rename*2; rmdir) on
|
||||
# all possible combinations of different mounts that could perform the
|
||||
# operations.
|
||||
#
|
||||
# We're testing that the tracking of the entry key in our cached dirents
|
||||
# stays consitent with the persistent entry items as they're modified
|
||||
# around the cluster.
|
||||
#
|
||||
|
||||
t_require_commands mkdir mv rmdir
|
||||
|
||||
NR_OPS=4
|
||||
|
||||
unset op_mnt
|
||||
for op in $(seq 0 $NR_OPS); do
|
||||
op_mnt[$op]=0
|
||||
done
|
||||
|
||||
if [ $T_NR_MOUNTS -gt $NR_OPS ]; then
|
||||
NR_MNTS=$NR_OPS
|
||||
else
|
||||
NR_MNTS=$T_NR_MOUNTS
|
||||
fi
|
||||
|
||||
# test until final op mount dir wraps
|
||||
while [ ${op_mnt[$NR_OPS]} == 0 ]; do
|
||||
|
||||
# sequentially perform each op from its mount dir
|
||||
for op in $(seq 0 $((NR_OPS - 1))); do
|
||||
m=${op_mnt[$op]}
|
||||
eval dir="\$T_D${m}/dir"
|
||||
|
||||
case "$op" in
|
||||
0) mkdir "$dir" ;;
|
||||
1) mv "$dir" "$dir-1" ;;
|
||||
2) mv "$dir-1" "$dir-2" ;;
|
||||
3) rmdir "$dir-2" ;;
|
||||
esac
|
||||
|
||||
if [ $? != 0 ]; then
|
||||
t_fail "${op_mnt[*]} failed at op $op"
|
||||
fi
|
||||
done
|
||||
|
||||
# advance through mnt nrs for each op
|
||||
i=0
|
||||
while [ ${op_mnt[$NR_OPS]} == 0 ]; do
|
||||
((op_mnt[$i]++))
|
||||
if [ ${op_mnt[$i]} -ge $NR_MNTS ]; then
|
||||
op_mnt[$i]=0
|
||||
((i++))
|
||||
else
|
||||
break
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
t_pass
|
||||
@@ -1,77 +0,0 @@
#
# make sure we clean up orphaned inodes
#

t_require_commands sleep touch sync stat handle_cat kill rm
t_require_mounts 2

#
# usually bash prints an annoying output message when jobs
# are killed.  We can avoid that by redirecting stderr for
# the bash process when it reaps the jobs that are killed.
#
silent_kill() {
        exec {ERR}>&2 2>/dev/null
        kill "$@"
        wait "$@"
        exec 2>&$ERR {ERR}>&-
}

#
# We don't have a great way to test that inode items still exist.  We
# don't prevent opening handles with nlink 0 today, so we'll use that.
# This would have to change to some other method.
#
inode_exists()
{
        local ino="$1"

        handle_cat "$T_M0" "$ino" > "$T_TMP.handle_cat.log" 2>&1
}

echo "== test our inode existence function"
path="$T_D0/file"
touch "$path"
ino=$(stat -c "%i" "$path")
inode_exists $ino || echo "$ino didn't exist"

echo "== unlinked and opened inodes still exist"
sleep 1000000 < "$path" &
pid="$!"
rm -f "$path"
inode_exists $ino || echo "$ino didn't exist"

echo "== orphan from failed evict deletion is picked up"
# pending kill signal stops evict from getting locks and deleting
silent_kill $pid
sleep 55
inode_exists $ino && echo "$ino still exists"

echo "== orphaned inos in all mounts all deleted"
pids=""
inos=""
for nr in $(t_fs_nrs); do
        eval path="\$T_D${nr}/file-$nr"
        touch "$path"
        inos="$inos $(stat -c %i $path)"
        sleep 1000000 < "$path" &
        pids="$pids $!"
        rm -f "$path"
done
sync
silent_kill $pids
for nr in $(t_fs_nrs); do
        t_force_umount $nr
done
t_mount_all
# wait for all fence requests to complete
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
        sleep .5
done
# wait for orphan scans to run
sleep 55
for ino in $inos; do
        inode_exists $ino && echo "$ino still exists"
done

t_pass
@@ -1,35 +0,0 @@
#!/usr/bin/bash

echo_fail() {
        echo "$@" > /dev/stderr
        exit 1
}

rid="$SCOUTFS_FENCED_REQ_RID"

#
# Look for a local mount with the rid to fence.  Typically we'll at
# least find the mount with the server that requested the fence that
# we're processing.  But it's possible that mounts are unmounted
# before, or while, we're running.
#
mnts=$(findmnt -l -n -t scoutfs -o TARGET) || \
        echo_fail "findmnt -t scoutfs failed"

for mnt in $mnts; do
        mnt_rid=$(scoutfs statfs -p "$mnt" -s rid) || \
                echo_fail "scoutfs statfs $mnt failed"

        if [ "$mnt_rid" == "$rid" ]; then
                umount -f "$mnt" || \
                        echo_fail "umount -f $mnt failed"

                exit 0
        fi
done

#
# If the mount doesn't exist on this host then it can't access the
# devices by definition and can be considered fenced.
#
exit 0
@@ -1,94 +0,0 @@
#!/usr/bin/bash

message_output()
{
        printf "[%s] %s\n" "$(date '+%F %T.%N')" "$@"
}

error_message()
{
        message_output "$@" >> /dev/stderr
}

error_exit()
{
        error_message "$@, exiting"
        exit 1
}

log_message()
{
        message_output "$@" >> /dev/stdout
}

# restart if we catch hup to re-read the config
hup_restart()
{
        log_message "caught SIGHUP, restarting"
        exec "$@"
}
trap hup_restart SIGHUP

# defaults
SCOUTFS_FENCED_CONFIG_FILE=${SCOUTFS_FENCED_CONFIG_FILE:-/etc/scoutfs/scoutfs-fenced.conf}
SCOUTFS_FENCED_DELAY=2
#SCOUTFS_FENCED_RUN
#SCOUTFS_FENCED_RUN_ARGS

test -n "$SCOUTFS_FENCED_CONFIG_FILE" || \
        error_exit "SCOUTFS_FENCED_CONFIG_FILE isn't set"
test -r "$SCOUTFS_FENCED_CONFIG_FILE" || \
        error_exit "SCOUTFS_FENCED_CONFIG_FILE isn't a readable file"

log_message "reading config file $SCOUTFS_FENCED_CONFIG_FILE"

. "$SCOUTFS_FENCED_CONFIG_FILE" || \
        error_exit "error sourcing $SCOUTFS_FENCED_CONFIG_FILE as bash script"

for conf in "${!SCOUTFS_FENCED_@}"; do
        log_message " config var $conf=${!conf}"
done

test -n "$SCOUTFS_FENCED_RUN" || \
        error_exit "SCOUTFS_FENCED_RUN must be set"
test -x "$SCOUTFS_FENCED_RUN" || \
        error_exit "SCOUTFS_FENCED_RUN '$SCOUTFS_FENCED_RUN' isn't executable"

# the loop below invokes the validated run command as $run
run="$SCOUTFS_FENCED_RUN"

#
# main loop watching for fence requests across all filesystems
#
while sleep $SCOUTFS_FENCED_DELAY; do
        for fence in /sys/fs/scoutfs/*/fence/*; do
                # catches the unexpanded glob when there are no dirs
                if [ ! -d "$fence" ]; then
                        continue
                fi

                # skip requests that have been handled
                if [ $(cat "$fence/fenced") == 1 -o $(cat "$fence/error") == 1 ]; then
                        continue
                fi

                srv=$(basename $(dirname $(dirname $fence)))
                rid="$(cat $fence/rid)"
                ip="$(cat $fence/ipv4_addr)"
                reason="$(cat $fence/reason)"

                log_message "server $srv fencing rid $rid at IP $ip for $reason"

                # export _REQ_ vars for run to use
                export SCOUTFS_FENCED_REQ_RID="$rid"
                export SCOUTFS_FENCED_REQ_IP="$ip"

                $run $SCOUTFS_FENCED_RUN_ARGS
                rc=$?
                if [ "$rc" != 0 ]; then
                        log_message "server $srv fencing rid $rid saw error status $rc from $run"
                        echo 1 > "$fence/error"
                        continue
                fi

                echo 1 > "$fence/fenced"
        done
done
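# Editorial sketch of the sysfs layout the loop above scans; the file
# names match the reads in this script, while the per-volume directory
# name is hypothetical:
#
#   /sys/fs/scoutfs/<volume>/fence/<rid>/
#       rid   ipv4_addr   reason   fenced   error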
@@ -1,66 +0,0 @@
.TH scoutfs-fenced 8
.SH NAME
scoutfs-fenced \- scoutfs fence request monitoring and dispatch daemon
.SH DESCRIPTION
The
.B scoutfs-fenced
daemon runs on hosts with mounts that are configured as quorum members
and could create fence requests.  It watches the sysfs directories of
mounted scoutfs volumes for the directories that store requests
to fence a mount.

.SH ENVIRONMENT
scoutfs-fenced reads the
.I SCOUTFS_FENCED_CONFIG_FILE
environment variable for the path to the config file that contains its
configuration.  The file must be readable; it is sourced as a bash
script and is expected to set the following configuration variables.

.SH CONFIGURATION

.TP
.B SCOUTFS_FENCED_DELAY
The number of seconds to wait between checks for fence request
directories in the sysfs directories of all mounts on the host.

.TP
.B SCOUTFS_FENCED_RUN
The path to the command to execute for each fence request.  The file at
the path must be executable.

.TP
.B SCOUTFS_FENCED_RUN_ARGS
The arguments that are unconditionally passed through to the run
command.
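For illustration, a minimal config file might look like the following
sketch; the run path assumes the packaging shown later in this
comparison installs the bundled agent under a typical libexecdir, and
the values are only examples:

        SCOUTFS_FENCED_DELAY=2
        SCOUTFS_FENCED_RUN=/usr/libexec/scoutfs-fenced/run/local-force-unmount
        SCOUTFS_FENCED_RUN_ARGS=""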

.SH DAEMONIZING AND LOGGING

scoutfs-fenced runs in the foreground and writes to stderr and stdout.
Disconnecting it from parents and redirecting its output are the
responsibility of the host environment.

.SH RUN COMMAND INTERFACE

scoutfs-fenced sets environment variables for the run command with
information about the mount that must be fenced:

.TP
.B SCOUTFS_FENCED_REQ_RID
The RID of the mount to be fenced.
.TP
.B SCOUTFS_FENCED_REQ_IP
The dotted quad IPv4 address of the last connection from the mount.

.RE
The return status of the run command indicates whether the mount was
fenced.  If the mount was successfully fenced then the command
should return a 0 success status.  If the run command returns a non-zero
failure status then the request will be set as errored and the server
will shut down.  The next server that starts will create another fence
request for the mount.
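The bundled local-force-unmount script implements this interface; a
minimal skeleton honoring the same contract might look like this
sketch, where the fencing action itself is a hypothetical helper:

        #!/usr/bin/bash
        # the daemon exports the request variables before running us
        rid="$SCOUTFS_FENCED_REQ_RID"
        ip="$SCOUTFS_FENCED_REQ_IP"
        # isolate the host that owns $rid; hypothetical placeholder
        isolate_host "$ip" || exit 1
        # exit status 0 tells the server the mount was fenced
        exit 0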

.SH SEE ALSO
.BR scoutfs (5)

.SH AUTHORS
Zach Brown <zab@versity.com>
@@ -1,6 +1,6 @@
.TH scoutfs 5
.SH NAME
scoutfs \- high level overview of the scoutfs filesystem
scoutfs \- overview and mount options for the scoutfs filesystem
.SH DESCRIPTION
A scoutfs filesystem is stored on two block devices.  Multiple mounts of
the filesystem are supported between hosts that share access to the
@@ -34,116 +34,7 @@ the server for the filesystem if it is elected leader.
The assigned number must match one of the slots defined with \-Q options
when the filesystem was created with mkfs.  If the number assigned
doesn't match a number created during mkfs then the mount will fail.
.SH VOLUME OPTIONS
Volume options are persistent options which are stored in the super
block in the metadata device and which apply to all mounts of the volume.
.sp
Volume options may be initially specified as the volume is created,
as described in the mkfs command in
.BR scoutfs (8).
.sp
Volume options may be changed at runtime by writing to files in sysfs
while the volume is mounted.  Volume options are found in the
volume_options/ directory with a file for each option.  Reading the
file provides the current setting of the option; an empty string
is returned if the option is not set.  To set the option, write
the new value of the option to the file.  To clear the option, write
a blank line with a newline to the file.  The write syscall will
return an error if the set operation fails and a message will be written
to the console.
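For example, a session reading, setting, and clearing an option might
look like this sketch, with the volume's sysfs directory abbreviated by
a glob and assuming a single mounted volume:

        # read the current setting; prints an empty string if unset
        cat /sys/fs/scoutfs/*/volume_options/data_alloc_zone_blocks
        # set the option
        echo 2097152 > /sys/fs/scoutfs/*/volume_options/data_alloc_zone_blocks
        # clear the option by writing a blank line
        echo > /sys/fs/scoutfs/*/volume_options/data_alloc_zone_blocks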
.sp
The following volume options are supported:
.TP
.B data_alloc_zone_blocks=<zone size in 4KiB blocks>
When the data_alloc_zone_blocks option is set the data device is
logically divided into zones of equal length as specified by the value
of the option.  The size of the zones must be greater than a minimum
allocation pool size, large enough to result in no more than 1024 zones,
and not more than the total number of blocks in the data device.
.sp
When set, the server will try to provide each mount with free data
extents that don't share a zone with other mounts.  When a mount has free
extents in a given zone the server will try to find more free extents
in that zone.  When the mount is not in a zone, or its zone has no more
free extents, the server will try to find free extents in a zone that
no other mount currently occupies.  The result is to try to produce
write streams where only one mount is writing into each zone.
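As a worked sketch of these constraints, assuming 4KiB blocks, a
hypothetical 16TiB data device, and the 4GiB per-mount allocation
target used by the mkfs check later in this comparison:

        total=$(( 16 * 2**40 / 4096 ))  # 4294967296 4KiB data blocks
        fill=$(( 4 * 2**30 / 4096 ))    # 1048576 block minimum pool size
        floor=$(( total / 1024 ))       # 4194304 blocks keeps zones <= 1024
        # valid data_alloc_zone_blocks here: 4194304 .. 4294967296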
.SH FENCING
.B scoutfs
mounts coordinate exclusive access to shared resources through
communication with the mount that was elected leader.
A mount can malfunction and stop participating, at which point it needs
to be safely isolated ("fenced off") from shared resources before other mounts can
have their turn at exclusive access.
.sp
Only the elected leader can fence mounts.  As the leader decides that a
mount must be fenced, typically by timeouts expiring without
communication from the mount, it creates a fence request.  Fence
requests are visible as directories in the leader mount's sysfs
directory.  The fence request directory is named for the RID of the
mount being fenced.  The directory contains the following files:

.RS
.TP
.B elapsed_secs
Reading this file gives the number of seconds that have passed since
this fence request was created.
.TP
.B error
This file contains 0 when the fence request is created.  Userspace
fencing agents write 1 into this file if they are unable to fence the
mount.  The volume cannot make progress until the mount is fenced, so
this will cause the server to stop and another mount will be elected
leader.
.TP
.B fenced
This file contains 0 when the fence request is created.  Userspace
fencing agents write 1 into this file once the mount has been fenced.
.TP
.B ipv4_addr
This file contains the dotted quad IPv4 peer address of the last
connected socket from the mount.  Userspace fencing agents can use this
to find the host that contains the mount.
.TP
.B reason
This file contains a text string that indicates the reason that the
mount is being fenced:

.B client_recovery
- During startup the server found persistent items recording the presence
of a mount that didn't reconnect to the server in time.
.sp
.B client_reconnect
- A mount disconnected from the server and didn't reconnect in time.
.sp
.B quorum_block_leader
- As a leader was elected it read persistent blocks that indicated that
a previous leader had not shut down and cleared their quorum block.
.TP
.B rid
This file contains the hex string of the RID of the mount to be fenced.
.RE

The request directories enable userspace processes to gather the
information to find the host with the mount to fence, isolate the mount
by whatever means are appropriate (f.e. cut off network and storage
communication, force unmount the mount, isolate storage fabric ports,
reboot the host) and write to the
.I fenced
file.
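For example, an agent that has isolated a mount might complete its
request with a sketch like this; the request directory path follows the
layout described above and the isolation step is hypothetical:

        req=$(echo /sys/fs/scoutfs/*/fence/$rid)
        cat $req/reason $req/ipv4_addr     # gather details for the fence action
        if isolate_host_somehow; then      # hypothetical isolation step
                echo 1 > $req/fenced       # server reclaims and resumes
        else
                echo 1 > $req/error        # server will shut down
        fi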
.sp
Once the
.I fenced
file is written to, the server reclaims the resources
associated with the fenced mount and resumes normal operations.
.sp
If the
.I error
file is written to then the server cannot make forward progress and
shuts down.  The request can similarly enter an errored state if enough
time passes before userspace completes the request.

.SH CORRUPTION DETECTION
.SH FURTHER READING
A
.B scoutfs
filesystem can detect corruption at runtime.  A catalog of kernel log
@@ -32,18 +32,10 @@ A path within a ScoutFS filesystem.
.PD

.TP
.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-z|--data-alloc-zone-blocks BLOCKS] [-f|--force] [-A|--allow-small-size]"
.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-f|--force]"
.sp
Initialize a new ScoutFS filesystem on the target devices.  Since ScoutFS uses
separate block devices for its metadata and data storage, two are required.
The internal structures and nature of metadata and data transactions
lead to minimum viable device sizes.
.B mkfs
will check both devices and fail with an error if either is under the
minimum size.  If
.B --allow-small-size
is given then sizes under the minimum size will be
allowed after printing an informational warning.
.sp
If
.B --force
@@ -89,14 +81,6 @@ kibibytes, mebibytes, etc.
.B "-d, --max-data-size SIZE"
Same as previous, but for limiting the size of the data device.
.TP
.B "-A, --allow-small-size"
Allows use of specified device sizes less than the minimum.  This can
result in bad behaviour and is only intended for testing.
.TP
.B "-z, --data-alloc-zone-blocks BLOCKS"
Set the data_alloc_zone_blocks volume option, as described in
.BR scoutfs (5).
.TP
.B "-f, --force"
Ignore presence of existing data on the data and metadata devices.
.RE
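A representative invocation, with hypothetical devices, addresses, and
zone size, might look like:

        scoutfs mkfs -Q 0,10.0.0.1,12345 -Q 1,10.0.0.2,12345 \
                -Q 2,10.0.0.3,12345 -z 2097152 \
                /dev/mapper/meta /dev/mapper/data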

@@ -54,15 +54,12 @@ cp man/*.8.gz $RPM_BUILD_ROOT%{_mandir}/man8/.
install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs
install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h
install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h
install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced
install -m 755 -D fenced/local-force-unmount $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/run/local-force-unmount

%files
%defattr(644,root,root,755)
%{_mandir}/man*/scoutfs*.gz
%defattr(755,root,root,755)
%{_sbindir}/scoutfs
%{_libexecdir}/scoutfs-fenced

%files -n scoutfs-devel
%defattr(644,root,root,755)
@@ -40,7 +40,7 @@ static void *alloc_val(struct scoutfs_btree_block *bt, int len)
{
        le16_add_cpu(&bt->mid_free_len, -len);
        le16_add_cpu(&bt->total_item_bytes, len);
        return (void *)&bt->items[le16_to_cpu(bt->nr_items)] + le16_to_cpu(bt->mid_free_len);
        return (void *)bt + le16_to_cpu(bt->mid_free_len);
}

/*
@@ -6,13 +6,12 @@
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <errno.h>
#include <stdbool.h>

#include "sparse.h"
#include "dev.h"

int device_size(char *path, int fd,
                u64 min_size, u64 max_size, bool allow_small_size,
                u64 min_size, u64 max_size,
                char *use_type, u64 *size_ret)
{
        struct stat st;
@@ -64,13 +63,10 @@ int device_size(char *path, int fd,
        if (size < min_size) {
                fprintf(stderr,
                        BASE_SIZE_FMT" %s too small for min "
                        BASE_SIZE_FMT" %s device%s\n",
                        BASE_SIZE_FMT" %s device\n",
                        BASE_SIZE_ARGS(size), target_type,
                        BASE_SIZE_ARGS(min_size), use_type,
                        allow_small_size ? ", allowing with -A" : "");

                if (!allow_small_size)
                        return -EINVAL;
                        BASE_SIZE_ARGS(min_size), use_type);
                return -EINVAL;
        }

        *size_ret = size;
@@ -1,8 +1,6 @@
#ifndef _DEV_H_
#define _DEV_H_

#include <stdbool.h>

#define BASE_SIZE_FMT "%.2f%s"
#define BASE_SIZE_ARGS(sz) size_flt(sz, 1), size_str(sz, 1)

@@ -10,7 +8,7 @@
#define SIZE_ARGS(nr, sz) (nr), size_flt(nr, sz), size_str(nr, sz)

int device_size(char *path, int fd,
                u64 min_size, u64 max_size, bool allow_small_size,
                u64 min_size, u64 max_size,
                char *use_type, u64 *size_ret);
float size_flt(u64 nr, unsigned size);
char *size_str(u64 nr, unsigned size);
@@ -86,11 +86,6 @@ static int do_df(struct df_args *args)
                data_free += ade[i].blocks;
        }

        if (meta_free >= sfm.reserved_meta_blocks)
                meta_free -= sfm.reserved_meta_blocks;
        else
                meta_free = 0;

        snprintf(cells[0][0], CHARS, "Type");
        snprintf(cells[0][1], CHARS, "Size");
        snprintf(cells[0][2], CHARS, "Total");
@@ -57,15 +57,6 @@ static int write_block(int fd, u32 magic, __le64 fsid, u64 seq, u64 blkno,
        return 0;
}

/*
 * Return the order of the length of a free extent, which we define as
 * floor(log_8_(len)): 0..7 = 0, 8..63 = 1, etc.
 */
static u64 free_extent_order(u64 len)
{
        return (flsll(len | 1) - 1) / 3;
}
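/*
 * Editorial worked example of the math above: for len = 100,
 * flsll(100 | 1) is 7 (the highest set bit of 0b1100100), so the order
 * is (7 - 1) / 3 = 2, matching floor(log_8(100)) since
 * 8^2 = 64 <= 100 < 512 = 8^3.
 */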

/*
 * Write the single btree block that contains the blkno and len indexed
 * items to store the given extent, and update the root to point to it.
@@ -81,61 +72,31 @@ static int write_alloc_root(int fd, __le64 fsid,
        root->total_len = cpu_to_le64(len);

        memset(&key, 0, sizeof(key));
        key.sk_zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
        key.sk_zone = SCOUTFS_FREE_EXTENT_ZONE;
        key.sk_type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
        key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO);
        key.skfb_end = cpu_to_le64(start + len - 1);
        key.skfb_len = cpu_to_le64(len);
        btree_append_item(bt, &key, NULL, 0);

        memset(&key, 0, sizeof(key));
        key.sk_zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE;
        key.skfo_revord = cpu_to_le64(U64_MAX - free_extent_order(len));
        key.skfo_end = cpu_to_le64(start + len - 1);
        key.skfo_len = cpu_to_le64(len);
        key.sk_zone = SCOUTFS_FREE_EXTENT_ZONE;
        key.sk_type = SCOUTFS_FREE_EXTENT_LEN_TYPE;
        key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO);
        key.skfl_neglen = cpu_to_le64(-len);
        key.skfl_blkno = cpu_to_le64(start);
        btree_append_item(bt, &key, NULL, 0);

        return write_block(fd, SCOUTFS_BLOCK_MAGIC_BTREE, fsid, seq, blkno,
                           SCOUTFS_BLOCK_LG_SHIFT, &bt->hdr);
}
#define SCOUTFS_SERVER_DATA_FILL_TARGET \
        ((4ULL * 1024 * 1024 * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
static bool invalid_data_alloc_zone_blocks(u64 total_data_blocks, u64 zone_blocks)
{
        u64 nr;

        if (zone_blocks == 0)
                return false;

        if (zone_blocks < SCOUTFS_SERVER_DATA_FILL_TARGET) {
                fprintf(stderr, "setting data_alloc_zone_blocks to '%llu' failed, must be at least %llu mount data allocation target blocks",
                        zone_blocks, SCOUTFS_SERVER_DATA_FILL_TARGET);
                return true;
        }

        nr = total_data_blocks / SCOUTFS_DATA_ALLOC_MAX_ZONES;
        if (zone_blocks < nr) {
                fprintf(stderr, "setting data_alloc_zone_blocks to '%llu' failed, must be greater than %llu blocks which results in max %u zones",
                        zone_blocks, nr, SCOUTFS_DATA_ALLOC_MAX_ZONES);
                return true;
        }

        if (zone_blocks > total_data_blocks) {
                fprintf(stderr, "setting data_alloc_zone_blocks to '%llu' failed, must be at most %llu total data device blocks",
                        zone_blocks, total_data_blocks);
                return true;
        }

        return false;
}

struct mkfs_args {
        char *meta_device;
        char *data_device;
        unsigned long long max_meta_size;
        unsigned long long max_data_size;
        u64 data_alloc_zone_blocks;
        bool force;
        bool allow_small_size;
        int nr_slots;
        struct scoutfs_quorum_slot slots[SCOUTFS_QUORUM_MAX_SLOTS];
};
@@ -216,15 +177,13 @@ static int do_mkfs(struct mkfs_args *args)
                goto out;
        }

        /* minimum meta device size to make reserved blocks reasonably large */
        ret = device_size(args->meta_device, meta_fd, 64ULL * (1024 * 1024 * 1024),
                          args->max_meta_size, args->allow_small_size, "meta", &meta_size);
        ret = device_size(args->meta_device, meta_fd, 2ULL * (1024 * 1024 * 1024),
                          args->max_meta_size, "meta", &meta_size);
        if (ret)
                goto out;

        /* .. then arbitrarily the same minimum data device size */
        ret = device_size(args->data_device, data_fd, 64ULL * (1024 * 1024 * 1024),
                          args->max_data_size, args->allow_small_size, "data", &data_size);
        ret = device_size(args->data_device, data_fd, 8ULL * (1024 * 1024 * 1024),
                          args->max_data_size, "data", &data_size);
        if (ret)
                goto out;
@@ -239,7 +198,7 @@ static int do_mkfs(struct mkfs_args *args)
        super->version = cpu_to_le64(SCOUTFS_INTEROP_VERSION);
        uuid_generate(super->uuid);
        super->next_ino = cpu_to_le64(SCOUTFS_ROOT_INO + 1);
        super->seq = cpu_to_le64(1);
        super->next_trans_seq = cpu_to_le64(1);
        super->total_meta_blocks = cpu_to_le64(last_meta + 1);
        super->first_meta_blkno = cpu_to_le64(next_meta);
        super->last_meta_blkno = cpu_to_le64(last_meta);
@@ -251,17 +210,6 @@ static int do_mkfs(struct mkfs_args *args)
                                        member_sizeof(struct scoutfs_super_block, qconf.slots));
        memcpy(super->qconf.slots, args->slots, sizeof(args->slots));

        if (invalid_data_alloc_zone_blocks(le64_to_cpu(super->total_data_blocks),
                                           args->data_alloc_zone_blocks)) {
                ret = -EINVAL;
                goto out;
        }

        if (args->data_alloc_zone_blocks) {
                super->volopt.set_bits |= cpu_to_le64(SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT);
                super->volopt.data_alloc_zone_blocks = cpu_to_le64(args->data_alloc_zone_blocks);
        }

        /* fs root starts with root inode and its index items */
        blkno = next_meta++;
        btree_init_root_single(&super->fs_root, bt, 1, blkno);
@@ -523,20 +471,6 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
                           prev_val, args->max_data_size);
                break;
        }
        case 'A':
                args->allow_small_size = true;
                break;
        case 'z': /* data-alloc-zone-blocks */
        {
                ret = parse_u64(arg, &args->data_alloc_zone_blocks);
                if (ret)
                        return ret;

                if (args->data_alloc_zone_blocks == 0)
                        argp_error(state, "must provide non-zero data-alloc-zone-blocks");

                break;
        }
        case ARGP_KEY_ARG:
                if (!args->meta_device)
                        args->meta_device = strdup_or_error(state, arg);
@@ -565,10 +499,8 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
static struct argp_option options[] = {
        { "quorum-slot", 'Q', "NR,ADDR,PORT", 0, "Specify quorum slot addresses [Required]"},
        { "force", 'f', NULL, 0, "Overwrite existing data on block devices"},
        { "allow-small-size", 'A', NULL, 0, "Allow specified meta/data devices less than minimum, still warns"},
        { "max-meta-size", 'm', "SIZE", 0, "Use a size less than the base metadata device size (bytes or KMGTP units)"},
        { "max-data-size", 'd', "SIZE", 0, "Use a size less than the base data device size (bytes or KMGTP units)"},
        { "data-alloc-zone-blocks", 'z', "BLOCKS", 0, "Divide data device into block zones so each mount writes to a zone (4KB blocks)"},
        { NULL }
};
@@ -1,4 +1,3 @@
#define _GNU_SOURCE /* ffsll for glibc < 2.27 */
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
@@ -158,7 +157,7 @@ static print_func_t find_printer(u8 zone, u8 type)
            type <= SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE)
                return print_inode_index;

        if (zone == SCOUTFS_ORPHAN_ZONE) {
        if (zone == SCOUTFS_RID_ZONE) {
                if (type == SCOUTFS_ORPHAN_TYPE)
                        return print_orphan;
        }
@@ -210,8 +209,8 @@ static int print_logs_item(struct scoutfs_key *key, void *val,
        /* only items in leaf blocks have values */
        if (val) {
                liv = val;
                printf(" log_item_value: seq %llu flags %x\n",
                       le64_to_cpu(liv->seq), liv->flags);
                printf(" log_item_value: vers %llu flags %x\n",
                       le64_to_cpu(liv->vers), liv->flags);

                /* deletion items don't have values */
                if (!(liv->flags & SCOUTFS_LOG_ITEM_FLAG_DELETION)) {
@@ -245,15 +244,15 @@ static int print_logs_item(struct scoutfs_key *key, void *val,
        le64_to_cpu((p)->blkno), le64_to_cpu((p)->seq)

#define AL_HEAD_F \
        AL_REF_F" total_nr %llu first_nr %u flags 0x%x"
        AL_REF_F" total_nr %llu first_nr %u"
#define AL_HEAD_A(p) \
        AL_REF_A(&(p)->ref), le64_to_cpu((p)->total_nr),\
        le32_to_cpu((p)->first_nr), le32_to_cpu((p)->flags)
        le32_to_cpu((p)->first_nr)

#define ALCROOT_F \
        BTROOT_F" total_len %llu flags 0x%x"
        BTROOT_F" total_len %llu"
#define ALCROOT_A(ar) \
        BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len), le32_to_cpu((ar)->flags)
        BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len)

#define SRE_FMT "%016llx.%llu.%llu"
#define SRE_A(sre) \
@@ -273,9 +272,6 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
                                unsigned val_len, void *arg)
{
        struct scoutfs_log_trees *lt = val;
        u64 zones;
        int bit;
        int i;

        printf(" rid %llu nr %llu\n",
               le64_to_cpu(key->sklt_rid), le64_to_cpu(key->sklt_nr));
@@ -289,12 +285,9 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
               " data_avail: "ALCROOT_F"\n"
               " data_freed: "ALCROOT_F"\n"
               " srch_file: "SRF_FMT"\n"
               " max_item_seq: %llu\n"
               " max_item_vers: %llu\n"
               " rid: %016llx\n"
               " nr: %llu\n"
               " flags: %llx\n"
               " data_alloc_zone_blocks: %llu\n"
               " data_alloc_zones: ",
               " nr: %llu\n",
               AL_HEAD_A(&lt->meta_avail),
               AL_HEAD_A(&lt->meta_freed),
               lt->item_root.height,
@@ -305,24 +298,9 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
               ALCROOT_A(&lt->data_avail),
               ALCROOT_A(&lt->data_freed),
               SRF_A(&lt->srch_file),
               le64_to_cpu(lt->max_item_seq),
               le64_to_cpu(lt->max_item_vers),
               le64_to_cpu(lt->rid),
               le64_to_cpu(lt->nr),
               le64_to_cpu(lt->flags),
               le64_to_cpu(lt->data_alloc_zone_blocks));

        for (i = 0; i < SCOUTFS_DATA_ALLOC_ZONE_LE64S; i++) {
                if (lt->data_alloc_zones[i] == 0)
                        continue;

                zones = le64_to_cpu(lt->data_alloc_zones[i]);
                while (zones) {
                        bit = ffsll(zones) - 1;
                        printf("%u ", (i * 64) + bit);
                        zones ^= (1ULL << bit);
                }
        }
        printf("\n");
               le64_to_cpu(lt->nr));
        }

        return 0;
@@ -374,79 +352,9 @@ static int print_mounted_client_entry(struct scoutfs_key *key, void *val,
                                      unsigned val_len, void *arg)
{
        struct scoutfs_mounted_client_btree_val *mcv = val;
        struct in_addr in;

        memset(&in, 0, sizeof(in));
        in.s_addr = htonl(le32_to_cpu(mcv->addr.v4.addr));

        printf(" rid %016llx ipv4_addr %s flags 0x%x\n",
               le64_to_cpu(key->skmc_rid), inet_ntoa(in), mcv->flags);

        return 0;
}

static int print_log_merge_item(struct scoutfs_key *key, void *val,
                                unsigned val_len, void *arg)
{
        struct scoutfs_log_merge_status *stat;
        struct scoutfs_log_merge_range *rng;
        struct scoutfs_log_merge_request *req;
        struct scoutfs_log_merge_complete *comp;
        struct scoutfs_log_merge_freeing *fr;

        switch (key->sk_zone) {
        case SCOUTFS_LOG_MERGE_STATUS_ZONE:
                stat = val;
                printf(" status: next_range_key "SK_FMT" nr_req %llu nr_comp %llu"
                       " last_seq %llu seq %llu\n",
                       SK_ARG(&stat->next_range_key),
                       le64_to_cpu(stat->nr_requests),
                       le64_to_cpu(stat->nr_complete),
                       le64_to_cpu(stat->last_seq),
                       le64_to_cpu(stat->seq));
                break;
        case SCOUTFS_LOG_MERGE_RANGE_ZONE:
                rng = val;
                printf(" range: start "SK_FMT" end "SK_FMT"\n",
                       SK_ARG(&rng->start),
                       SK_ARG(&rng->end));
                break;
        case SCOUTFS_LOG_MERGE_REQUEST_ZONE:
                req = val;
                printf(" request: logs_root "BTROOT_F" logs_root "BTROOT_F" start "SK_FMT
                       " end "SK_FMT" last_seq %llu rid %016llx seq %llu flags 0x%llx\n",
                       BTROOT_A(&req->logs_root),
                       BTROOT_A(&req->root),
                       SK_ARG(&req->start),
                       SK_ARG(&req->end),
                       le64_to_cpu(req->last_seq),
                       le64_to_cpu(req->rid),
                       le64_to_cpu(req->seq),
                       le64_to_cpu(req->flags));
                break;
        case SCOUTFS_LOG_MERGE_COMPLETE_ZONE:
                comp = val;
                printf(" complete: root "BTROOT_F" start "SK_FMT" end "SK_FMT
                       " remain "SK_FMT" rid %016llx seq %llu flags %llx\n",
                       BTROOT_A(&comp->root),
                       SK_ARG(&comp->start),
                       SK_ARG(&comp->end),
                       SK_ARG(&comp->remain),
                       le64_to_cpu(comp->rid),
                       le64_to_cpu(comp->seq),
                       le64_to_cpu(comp->flags));
                break;
        case SCOUTFS_LOG_MERGE_FREEING_ZONE:
                fr = val;
                printf(" freeing: root "BTROOT_F" key "SK_FMT" seq %llu\n",
                       BTROOT_A(&fr->root),
                       SK_ARG(&fr->key),
                       le64_to_cpu(fr->seq));
                break;
        default:
                printf(" (unknown log merge key zone %u)\n", key->sk_zone);
                break;
        }
        printf(" rid %016llx flags 0x%x\n",
               le64_to_cpu(key->skmc_rid), mcv->flags);

        return 0;
}
@@ -454,17 +362,17 @@ static int print_log_merge_item(struct scoutfs_key *key, void *val,
static int print_alloc_item(struct scoutfs_key *key, void *val,
                            unsigned val_len, void *arg)
{
        if (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
        if (key->sk_type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE)
                printf(" free extent: blkno %llu len %llu end %llu\n",
                       le64_to_cpu(key->skfb_end) -
                       le64_to_cpu(key->skfb_len) + 1,
                       le64_to_cpu(key->skfb_len),
                       le64_to_cpu(key->skfb_end));
        else
                printf(" free extent: blkno %llu len %llu order %llu\n",
                       le64_to_cpu(key->skfo_end) - le64_to_cpu(key->skfo_len) + 1,
                       le64_to_cpu(key->skfo_len),
                       (long long)(U64_MAX - le64_to_cpu(key->skfo_revord)));
                printf(" free extent: blkno %llu len %llu neglen %lld\n",
                       le64_to_cpu(key->skfl_blkno),
                       -le64_to_cpu(key->skfl_neglen),
                       (long long)le64_to_cpu(key->skfl_neglen));

        return 0;
}
@@ -884,16 +792,16 @@ static char *alloc_addr_str(union scoutfs_inet_addr *ia)

static int print_quorum_blocks(int fd, struct scoutfs_super_block *super)
{
        const static char *event_names[] = {
                [SCOUTFS_QUORUM_EVENT_BEGIN] = "begin",
                [SCOUTFS_QUORUM_EVENT_TERM] = "term",
                [SCOUTFS_QUORUM_EVENT_ELECT] = "elect",
                [SCOUTFS_QUORUM_EVENT_FENCE] = "fence",
                [SCOUTFS_QUORUM_EVENT_STOP] = "stop",
                [SCOUTFS_QUORUM_EVENT_END] = "end",
        struct print_events {
                size_t offset;
                char *name;
        } events[] = {
                OFF_NAME(write), OFF_NAME(update_term), OFF_NAME(set_leader),
                OFF_NAME(clear_leader), OFF_NAME(fenced),
        };
        struct scoutfs_quorum_block *blk = NULL;
        struct scoutfs_quorum_block_event *ev;
        char *log_addr = NULL;
        u64 blkno;
        int ret;
        int i;
@@ -902,7 +810,6 @@ static int print_quorum_blocks(int fd, struct scoutfs_super_block *super)
        for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
                blkno = SCOUTFS_QUORUM_BLKNO + i;
                free(blk);
                blk = NULL;
                ret = read_block(fd, blkno, SCOUTFS_BLOCK_SM_SHIFT, (void **)&blk);
                if (ret)
                        goto out;
@@ -910,27 +817,28 @@ static int print_quorum_blocks(int fd, struct scoutfs_super_block *super)
                printf("quorum blkno %llu (slot %llu)\n",
                       blkno, blkno - SCOUTFS_QUORUM_BLKNO);
                print_block_header(&blk->hdr, SCOUTFS_BLOCK_SM_SIZE);
                printf(" term %llu random_write_mark 0x%llx flags 0x%llx\n",
                       le64_to_cpu(blk->term),
                       le64_to_cpu(blk->random_write_mark),
                       le64_to_cpu(blk->flags));

                for (e = 0; e < array_size(event_names); e++) {
                        ev = &blk->events[e];
                for (e = 0; e < array_size(events); e++) {
                        ev = (void *)blk + events[e].offset;

                        printf(" %12s: rid %016llx term %llu ts %llu.%08u\n",
                               event_names[e], le64_to_cpu(ev->rid), le64_to_cpu(ev->term),
                               le64_to_cpu(ev->ts.sec), le32_to_cpu(ev->ts.nsec));
                        printf(" %12s: rid %016llx ts %llu.%08u\n",
                               events[e].name, le64_to_cpu(ev->rid),
                               le64_to_cpu(ev->ts.sec),
                               le32_to_cpu(ev->ts.nsec));
                }
        }

        ret = 0;
out:
        free(blk);
        free(log_addr);

        return ret;
}
#define BTR_FMT "blkno %llu seq %016llx height %u"
#define BTR_ARG(rt) \
        le64_to_cpu((rt)->ref.blkno), le64_to_cpu((rt)->ref.seq), (rt)->height

static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
{
        char uuid_str[37];
@@ -950,7 +858,7 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
        printf(" flags: 0x%016llx\n", le64_to_cpu(super->flags));

        /* XXX these are all in a crazy order */
        printf(" next_ino %llu seq %llu\n"
        printf(" next_ino %llu next_trans_seq %llu\n"
               " total_meta_blocks %llu first_meta_blkno %llu last_meta_blkno %llu\n"
               " total_data_blocks %llu first_data_blkno %llu last_data_blkno %llu\n"
               " meta_alloc[0]: "ALCROOT_F"\n"
@@ -960,14 +868,12 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
               " server_meta_avail[1]: "AL_HEAD_F"\n"
               " server_meta_freed[0]: "AL_HEAD_F"\n"
               " server_meta_freed[1]: "AL_HEAD_F"\n"
               " fs_root: "BTR_FMT"\n"
               " logs_root: "BTR_FMT"\n"
               " log_merge: "BTR_FMT"\n"
               " trans_seqs: "BTR_FMT"\n"
               " mounted_clients: "BTR_FMT"\n"
               " srch_root: "BTR_FMT"\n",
               " mounted_clients root: height %u blkno %llu seq %llu\n"
               " srch_root root: height %u blkno %llu seq %llu\n"
               " trans_seqs root: height %u blkno %llu seq %llu\n"
               " fs_root btree root: height %u blkno %llu seq %llu\n",
               le64_to_cpu(super->next_ino),
               le64_to_cpu(super->seq),
               le64_to_cpu(super->next_trans_seq),
               le64_to_cpu(super->total_meta_blocks),
               le64_to_cpu(super->first_meta_blkno),
               le64_to_cpu(super->last_meta_blkno),
@@ -981,20 +887,18 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
               AL_HEAD_A(&super->server_meta_avail[1]),
               AL_HEAD_A(&super->server_meta_freed[0]),
               AL_HEAD_A(&super->server_meta_freed[1]),
               BTR_ARG(&super->fs_root),
               BTR_ARG(&super->logs_root),
               BTR_ARG(&super->log_merge),
               BTR_ARG(&super->trans_seqs),
               BTR_ARG(&super->mounted_clients),
               BTR_ARG(&super->srch_root));

        printf(" volume options:\n"
               " set_bits: %016llx\n",
               le64_to_cpu(super->volopt.set_bits));
        if (le64_to_cpu(super->volopt.set_bits) & SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT) {
                printf(" data_alloc_zone_blocks: %llu\n",
                       le64_to_cpu(super->volopt.data_alloc_zone_blocks));
        }
               super->mounted_clients.height,
               le64_to_cpu(super->mounted_clients.ref.blkno),
               le64_to_cpu(super->mounted_clients.ref.seq),
               super->srch_root.height,
               le64_to_cpu(super->srch_root.ref.blkno),
               le64_to_cpu(super->srch_root.ref.seq),
               super->trans_seqs.height,
               le64_to_cpu(super->trans_seqs.ref.blkno),
               le64_to_cpu(super->trans_seqs.ref.seq),
               super->fs_root.height,
               le64_to_cpu(super->fs_root.ref.blkno),
               le64_to_cpu(super->fs_root.ref.seq));

        printf(" quorum config version %llu\n",
               le64_to_cpu(super->qconf.version));
@@ -1041,11 +945,6 @@ static int print_volume(int fd)
        if (err && !ret)
                ret = err;

        err = print_btree(fd, super, "log_merge", &super->log_merge,
                          print_log_merge_item, NULL);
        if (err && !ret)
                ret = err;

        for (i = 0; i < array_size(super->server_meta_avail); i++) {
                snprintf(str, sizeof(str), "server_meta_avail[%u]", i);
                err = print_alloc_list_block(fd, str,
@@ -37,7 +37,6 @@ static struct stat_more_field inode_fields[] = {
        INODE_FIELD(data_version),
        INODE_FIELD(online_blocks),
        INODE_FIELD(offline_blocks),
        { .name = "crtime", .offset = INODE_FIELD_OFF(crtime_sec) },
        { NULL, }
};

@@ -61,9 +60,6 @@ static void print_inode_field(void *st, size_t off)
        case INODE_FIELD_OFF(offline_blocks):
                printf("%llu", stm->offline_blocks);
                break;
        case INODE_FIELD_OFF(crtime_sec):
                printf("%llu.%09u", stm->crtime_sec, stm->crtime_nsec);
                break;
        };
}