Fix net BUG_ON if reconnection farewell send races

When a client socket disconnects we save the connection state to re-use later if the client reconnects. A newly accepted connection finds the old connection associated with the reconnecting client and migrates state from the old idle connection to the newly accepted connection. While moving messages between the old and new send and resend queues the code had an aggressive BUG_ON that was asserting that the newly accepted connection couldn't have any messages in its resend queue. This BUG can be tripped due to the ordering of greeting processing and connection state migration. The server greeting processing path sends the farewell response to the client before it calls the net code to migrate connection state. When it "sends" the farewell response it puts the message on the send queue and kicks the send work. It's possible for the send work to execute and move the farewell response to the resend queue and trip the BUG_ON. This is harmless. The sent greeting response is going to end up on the resend queue either way, there's no reason for the reconnection migration to assert that it can't have happened yet. It is going to be dropped the moment we get a message from the client with a recv_seq that is necessarily past the greeting response which always gets a seq of 1 from the newly accepted connection. We remove the BUG_ON and try to splice the old resend queue after the possible response at the head of the resend_queue so that it is the first to be dropped. Signed-off-by: Zach Brown <zab@versity.com>
Fix alloc list exhaustion calculation
2026-05-03 19:35:43 +00:00 · 2021-08-02 11:15:57 -07:00 · 2021-08-01 14:31:57 -07:00 · 2021-07-30 13:26:32 -07:00 · 2021-07-30 13:26:32 -07:00 · 2021-07-30 13:26:32 -07:00
83 changed files with 7670 additions and 1248 deletions
--- a/kmod/src/Makefile
+++ b/kmod/src/Makefile
@@ -18,6 +18,7 @@ scoutfs-y +=			\
 	dir.o			\
 	export.o		\
 	ext.o			\
+	fence.o			\
 	file.o			\
 	forest.o		\
 	inode.o			\
@@ -42,6 +43,7 @@ scoutfs-y +=			\
 	trans.o			\
 	triggers.o		\
 	tseq.o			\
+	volopt.o		\
 	xattr.o

 #
--- a/kmod/src/alloc.c
+++ b/kmod/src/alloc.c
@@ -29,8 +29,8 @@
 * The core allocator uses extent items in btrees rooted in the super.
 * Each free extent is stored in two items.  The first item is indexed
 * by block location and is used to merge adjacent extents when freeing.
- * The second item is indexed by length and is used to find large
- * extents to allocate from.
+ * The second item is indexed by the order of the length and is used to
+ * find large extents to allocate from.
 *
 * Free extent always consumes the front of the largest extent.  This
 * attempts to discourage fragmentation by given smaller freed extents
@@ -67,25 +67,52 @@
 */

 /*
- * Free extents don't have flags and are stored in two indexes sorted by
- * block location and by length, largest first.  The block location key
- * is set to the final block in the extent so that we can find
- * intersections by calling _next() iterators starting with the block
- * we're searching for.
+ * Return the order of the length of a free extent, which we define as
+ * floor(log_8_(len)): 0..7 = 0, 8..63 = 1, etc.
 */
-static void init_ext_key(struct scoutfs_key *key, int type, u64 start, u64 len)
+static u64 free_extent_order(u64 len)
+{
+	return (fls64(len | 1) - 1) / 3;
+}
+
+/*
+ * The smallest (non-zero) length that will be mapped to the same order
+ * as the given length.
+ */
+static u64 smallest_order_length(u64 len)
+{
+	return 1ULL << (free_extent_order(len) * 3);
+}
+
+/*
+ * Free extents don't have flags and are stored in two indexes sorted by
+ * block location and by length order, largest first.  The location key
+ * field is set to the final block in the extent so that we can find
+ * intersections by calling _next() with the start of the range we're
+ * searching for.
+ *
+ * We never store 0 length extents but we do build keys for searching
+ * the order index from 0,0 without having to map it to a real extent.
+ */
+static void init_ext_key(struct scoutfs_key *key, int zone, u64 start, u64 len)
 {
 	*key = (struct scoutfs_key) {
-		.sk_zone = SCOUTFS_FREE_EXTENT_ZONE,
-		.sk_type = type,
+		.sk_zone = zone,
 	};

-	if (type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) {
+	if (len == 0) {
+		/* we only use 0 len extents for magic 0,0 order lookups */
+		WARN_ON_ONCE(zone != SCOUTFS_FREE_EXTENT_ORDER_ZONE || start != 0);
+		return;
+	}
+
+	if (zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
 		key->skfb_end = cpu_to_le64(start + len - 1);
 		key->skfb_len = cpu_to_le64(len);
-	} else if (type == SCOUTFS_FREE_EXTENT_LEN_TYPE) {
-		key->skfl_neglen = cpu_to_le64(-len);
-		key->skfl_blkno = cpu_to_le64(start);
+	} else if (zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) {
+		key->skfo_revord = cpu_to_le64(U64_MAX - free_extent_order(len));
+		key->skfo_end = cpu_to_le64(start + len - 1);
+		key->skfo_len = cpu_to_le64(len);
 	} else {
 		BUG();
 	}
@@ -93,23 +120,27 @@ static void init_ext_key(struct scoutfs_key *key, int type, u64 start, u64 len)

 static void ext_from_key(struct scoutfs_extent *ext, struct scoutfs_key *key)
 {
-	if (key->sk_type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) {
+	if (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
 		ext->start = le64_to_cpu(key->skfb_end) -
 			     le64_to_cpu(key->skfb_len) + 1;
 		ext->len = le64_to_cpu(key->skfb_len);
 	} else {
-		ext->start = le64_to_cpu(key->skfl_blkno);
-		ext->len = -le64_to_cpu(key->skfl_neglen);
+		ext->start = le64_to_cpu(key->skfo_end) -
+			     le64_to_cpu(key->skfo_len) + 1;
+		ext->len = le64_to_cpu(key->skfo_len);
 	}
 	ext->map = 0;
 	ext->flags = 0;
+
+	/* we never store 0 length extents */
+	WARN_ON_ONCE(ext->len == 0);
 }

 struct alloc_ext_args {
 	struct scoutfs_alloc *alloc;
 	struct scoutfs_block_writer *wri;
 	struct scoutfs_alloc_root *root;
-	int type;
+	int zone;
 };

 static int alloc_ext_next(struct super_block *sb, void *arg,
@@ -120,13 +151,13 @@ static int alloc_ext_next(struct super_block *sb, void *arg,
 	struct scoutfs_key key;
 	int ret;

-	init_ext_key(&key, args->type, start, len);
+	init_ext_key(&key, args->zone, start, len);

 	ret = scoutfs_btree_next(sb, &args->root->root, &key, &iref);
 	if (ret == 0) {
 		if (iref.val_len != 0)
 			ret = -EIO;
-		else if (iref.key->sk_type != args->type)
+		else if (iref.key->sk_zone != args->zone)
 			ret = -ENOENT;
 		else
 			ext_from_key(ext, iref.key);
@@ -139,19 +170,19 @@ static int alloc_ext_next(struct super_block *sb, void *arg,
 	return ret;
 }

-static int other_type(int type)
+static int other_zone(int zone)
 {
-	if (type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE)
-		return SCOUTFS_FREE_EXTENT_LEN_TYPE;
-	else if (type == SCOUTFS_FREE_EXTENT_LEN_TYPE)
-		return SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
+	if (zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
+		return SCOUTFS_FREE_EXTENT_ORDER_ZONE;
+	else if (zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE)
+		return SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
 	else
 		BUG();
 }

 /*
 * Insert an extent along with its matching item which is indexed by
- * opposite of its len or blkno.  If we succeed we update the root's
+ * opposite of its order or blkno.  If we succeed we update the root's
 * record of the total length of all the stored extents.
 */
 static int alloc_ext_insert(struct super_block *sb, void *arg,
@@ -167,8 +198,8 @@ static int alloc_ext_insert(struct super_block *sb, void *arg,
 	if (WARN_ON_ONCE(map || flags))
 		return -EINVAL;

-	init_ext_key(&key, args->type, start, len);
-	init_ext_key(&other, other_type(args->type), start, len);
+	init_ext_key(&key, args->zone, start, len);
+	init_ext_key(&other, other_zone(args->zone), start, len);

 	ret = scoutfs_btree_insert(sb, args->alloc, args->wri,
 				   &args->root->root, &key, NULL, 0);
@@ -196,8 +227,8 @@ static int alloc_ext_remove(struct super_block *sb, void *arg,
 	int ret;
 	int err;

-	init_ext_key(&key, args->type, start, len);
-	init_ext_key(&other, other_type(args->type), start, len);
+	init_ext_key(&key, args->zone, start, len);
+	init_ext_key(&other, other_zone(args->zone), start, len);

 	ret = scoutfs_btree_delete(sb, args->alloc, args->wri,
 				   &args->root->root, &key);
@@ -230,20 +261,17 @@ static bool invalid_extent(u64 start, u64 end, u64 first, u64 last)

 static bool invalid_meta_blkno(struct super_block *sb, u64 blkno)
 {
-	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	u64 last_meta = (i_size_read(sbi->meta_bdev->bd_inode) >> SCOUTFS_BLOCK_LG_SHIFT) - 1;

-	return invalid_extent(blkno, blkno,
-			      le64_to_cpu(super->first_meta_blkno),
-			      le64_to_cpu(super->last_meta_blkno));
+	return invalid_extent(blkno, blkno, SCOUTFS_META_DEV_START_BLKNO, last_meta);
 }

 static bool invalid_data_extent(struct super_block *sb, u64 start, u64 len)
 {
-	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
+	u64 last_data = (i_size_read(sb->s_bdev->bd_inode) >> SCOUTFS_BLOCK_SM_SHIFT) - 1;

-	return invalid_extent(start, start + len - 1,
-			      le64_to_cpu(super->first_data_blkno),
-			      le64_to_cpu(super->last_data_blkno));
+	return invalid_extent(start, start + len - 1, SCOUTFS_DATA_DEV_START_BLKNO, last_data);
 }

 void scoutfs_alloc_init(struct scoutfs_alloc *alloc,
@@ -619,7 +647,7 @@ int scoutfs_dalloc_return_cached(struct super_block *sb,
 		.alloc = alloc,
 		.wri = wri,
 		.root = &dalloc->root,
-		.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
+		.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
 	};
 	int ret = 0;

@@ -645,6 +673,14 @@ int scoutfs_dalloc_return_cached(struct super_block *sb,
 *
 * Unlike meta allocations, the caller is expected to serialize
 * allocations from the root.
+ *
+ * ENOBUFS is returned if the data allocator ran out of space and we can
+ * probably refill it from the server.  The caller is expected to back
+ * out, commit the transaction, and try again.
+ *
+ * ENOSPC is returned if the data allocator ran out of space but we have
+ * a flag from the server telling us that there's no more space
+ * available.  This is a hard error and should be returned.
 */
 int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 		       struct scoutfs_block_writer *wri,
@@ -655,7 +691,7 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 		.alloc = alloc,
 		.wri = wri,
 		.root = &dalloc->root,
-		.type = SCOUTFS_FREE_EXTENT_LEN_TYPE,
+		.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
 	};
 	struct scoutfs_extent ext;
 	u64 len;
@@ -693,13 +729,13 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 	ret = 0;
 out:
 	if (ret < 0) {
-		/*
-		 * Special retval meaning there wasn't space to alloc from
-		 * this txn. Doesn't mean filesystem is completely full.
-		 * Maybe upper layers want to try again.
-		 */
-		if (ret == -ENOENT)
-			ret = -ENOBUFS;
+		if (ret == -ENOENT) {
+			if (le32_to_cpu(dalloc->root.flags) & SCOUTFS_ALLOC_FLAG_LOW)
+				ret = -ENOSPC;
+			else
+				ret = -ENOBUFS;
+		}
+
 		*blkno_ret = 0;
 		*count_ret = 0;
 	} else {
@@ -728,7 +764,7 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 		.alloc = alloc,
 		.wri = wri,
 		.root = root,
-		.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
+		.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
 	};
 	int ret;

@@ -741,6 +777,95 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 	return ret;
 }

+/*
+ * Return the first zone bit that the extent intersects with.
+ */
+static int first_extent_zone(struct scoutfs_extent *ext,  __le64 *zones, u64 zone_blocks)
+{
+	int first;
+	int last;
+	int nr;
+
+	first = div64_u64(ext->start, zone_blocks);
+	last = div64_u64(ext->start + ext->len - 1, zone_blocks);
+
+	nr = find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, first);
+	if (nr <= last)
+		return nr;
+
+	return SCOUTFS_DATA_ALLOC_MAX_ZONES;
+}
+
+/*
+ * Find an extent in specific zones to satisfy an allocation.  We use
+ * the order items to search for the largest extent that intersects with
+ * the zones whose bits are set in the caller's bitmap.
+ */
+static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *root,
+			    __le64 *zones, u64 zone_blocks,
+			    struct scoutfs_extent *found_ret, u64 count,
+			    struct scoutfs_extent *ext_ret)
+{
+	struct alloc_ext_args args = {
+		.root = root,
+		.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
+	};
+	struct scoutfs_extent found;
+	struct scoutfs_extent ext;
+	u64 start;
+	u64 len;
+	int nr;
+	int ret;
+
+	/* don't bother when there are no bits set */
+	if (find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, 0) ==
+	    SCOUTFS_DATA_ALLOC_MAX_ZONES)
+		return -ENOENT;
+
+	/* start searching for largest extent from the first zone */
+	len = smallest_order_length(SCOUTFS_BLOCK_SM_MAX);
+	nr = 0;
+
+	for (;;) {
+		/* search for extents in the next zone at our order */
+		nr = find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, nr);
+		if (nr >= SCOUTFS_DATA_ALLOC_MAX_ZONES) {
+			/* wrap down to next smaller order if we run out of bits */
+			len >>= 3;
+			if (len == 0) {
+				ret = -ENOENT;
+				break;
+			}
+			nr = find_next_bit_le(zones, SCOUTFS_DATA_ALLOC_MAX_ZONES, 0);
+		}
+
+		start = (u64)nr * zone_blocks;
+
+		ret = scoutfs_ext_next(sb, &alloc_ext_ops, &args, start, len, &found);
+		if (ret < 0)
+			break;
+
+		/* see if the next extent intersects any zones */
+		nr = first_extent_zone(&found, zones, zone_blocks);
+		if (nr < SCOUTFS_DATA_ALLOC_MAX_ZONES) {
+			start = (u64)nr * zone_blocks;
+
+			ext.start = max(start, found.start);
+			ext.len = min(count, found.start + found.len - ext.start);
+
+			*found_ret = found;
+			*ext_ret = ext;
+			ret = 0;
+			break;
+		}
+
+		/* continue searching past extent */
+		nr = div64_u64(found.start + found.len - 1, zone_blocks) + 1;
+		len = smallest_order_length(found.len);
+	}
+
+	return ret;
+}

 /*
 * Move extent items adding up to the requested total length from the
@@ -751,6 +876,11 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 * -ENOENT is returned if we run out of extents in the source tree
 * before moving the total.
 *
+ * The caller can specify that extents in the source tree should first
+ * be found based on their zone bitmaps.  We'll first try to find
+ * extents in the exclusive zones, then vacant zones, and then we'll
+ * fall back to normal allocation that ignores zones.
+ *
 * This first pass is not optimal because it performs full btree walks
 * per extent.  We could optimize this with more clever btree item
 * manipulation functions which can iterate through src and dst blocks
@@ -759,32 +889,77 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 		       struct scoutfs_block_writer *wri,
 		       struct scoutfs_alloc_root *dst,
-		       struct scoutfs_alloc_root *src, u64 total)
+		       struct scoutfs_alloc_root *src, u64 total,
+		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks)
 {
 	struct alloc_ext_args args = {
 		.alloc = alloc,
 		.wri = wri,
 	};
+	struct scoutfs_extent found;
 	struct scoutfs_extent ext;
 	u64 moved = 0;
+	u64 count;
 	int ret = 0;
 	int err;

+	if (zone_blocks == 0) {
+		exclusive = NULL;
+		vacant = NULL;
+	}
+
 	while (moved < total) {
-		args.root = src;
-		args.type = SCOUTFS_FREE_EXTENT_LEN_TYPE;
-		ret = scoutfs_ext_alloc(sb, &alloc_ext_ops, &args,
-					0, 0, total - moved, &ext);
+		count = total - moved;
+
+		if (exclusive) {
+			/* first try to find extents in our exclusive zones */
+			ret = find_zone_extent(sb, src, exclusive, zone_blocks,
+					       &found, count, &ext);
+			if (ret == -ENOENT) {
+				exclusive = NULL;
+				continue;
+			}
+		} else if (vacant) {
+			/* then try to find extents in vacant zones */
+			ret = find_zone_extent(sb, src, vacant, zone_blocks,
+					       &found, count, &ext);
+			if (ret == -ENOENT) {
+				vacant = NULL;
+				continue;
+			}
+		} else {
+			/* otherwise fall back to finding extents anywhere */
+			args.root = src;
+			args.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE;
+			ret = scoutfs_ext_next(sb, &alloc_ext_ops, &args, 0, 0, &found);
+			if (ret == 0) {
+				ext.start = found.start;
+				ext.len = min(count, found.len);
+			}
+		}
 		if (ret < 0)
 			break;

+		/* searching set start/len, finish initializing alloced extent */
+		ext.map = found.map ? ext.start - found.start + found.map : 0;
+		ext.flags = found.flags;
+
+		/* remove the allocation from the found extent */
+		args.root = src;
+		args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
+		ret = scoutfs_ext_remove(sb, &alloc_ext_ops, &args, ext.start, ext.len);
+		if (ret < 0)
+			break;
+
+		/* insert the allocated extent into the dest */
 		args.root = dst;
-		args.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
+		args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
 		ret = scoutfs_ext_insert(sb, &alloc_ext_ops, &args, ext.start,
 					 ext.len, ext.map, ext.flags);
 		if (ret < 0) {
+			/* and put it back in src if insertion failed */
 			args.root = src;
-			args.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
+			args.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
 			err = scoutfs_ext_insert(sb, &alloc_ext_ops, &args,
 						 ext.start, ext.len, ext.map,
 						 ext.flags);
@@ -802,6 +977,39 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 	return ret;
 }

+/*
+ * Add new free space to an allocator.  _ext_insert will make sure that it doesn't
+ * overlap with any existing extents.  This is done by the server in a transaction that
+ * also updates total_*_blocks in the super so we don't verify.
+ */
+int scoutfs_alloc_insert(struct super_block *sb, struct scoutfs_alloc *alloc,
+			 struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
+			 u64 start, u64 len)
+{
+	struct alloc_ext_args args = {
+		.alloc = alloc,
+		.wri = wri,
+		.root = root,
+		.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
+	};
+
+	return scoutfs_ext_insert(sb, &alloc_ext_ops, &args, start, len, 0, 0);
+}
+
+int scoutfs_alloc_remove(struct super_block *sb, struct scoutfs_alloc *alloc,
+			 struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
+			 u64 start, u64 len)
+{
+	struct alloc_ext_args args = {
+		.alloc = alloc,
+		.wri = wri,
+		.root = root,
+		.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
+	};
+
+	return scoutfs_ext_remove(sb, &alloc_ext_ops, &args, start, len);
+}
+
 /*
 * We only trim one block, instead of looping trimming all, because the
 * caller is assuming that we do a fixed amount of work when they check
@@ -848,18 +1056,31 @@ out:
 }

 /*
- * True if the allocator has enough free blocks to cow (alloc and free)
- * a list block and all the btree blocks that store extent items.
+ * True if the allocator has enough blocks in the avail list and space
+ * in the freed list to be able to perform the caller's operations.  If
+ * false the caller should back off and return partial progress rather
+ * than completely exhausting the avail list or overflowing the freed
+ * list.
 *
- * At most, an extent operation can dirty down three paths of the tree
- * to modify a blkno item and two distant len items.  We can grow and
- * split the root, and then those three paths could share blocks but each
- * modify two leaf blocks.
+ * An extent modification dirties three distinct leaves of an allocator
+ * btree as it adds and removes the blkno and size sorted items for the
+ * old and new lengths of the extent.  Dirtying the paths to these
+ * leaves can grow the tree and grow/shrink neighbours at each level.
+ * We over-estimate the number of blocks allocated and freed (the paths
+ * share a root, growth doesn't free) to err on the simpler and safer
+ * side.  The overhead is minimal given the relatively large list blocks
+ * and relatively short allocator trees.
+ *
+ * The caller tells us how many extents they're about to modify and how
+ * many other additional blocks they may cow manually.  And finally, the
+ * caller could be the first to dirty the avail and freed blocks in the
+ * allocator.
 */
-static bool list_can_cow(struct super_block *sb, struct scoutfs_alloc *alloc,
-			 struct scoutfs_alloc_root *root)
+static bool list_has_blocks(struct super_block *sb, struct scoutfs_alloc *alloc,
+			    struct scoutfs_alloc_root *root, u32 extents, u32 addl_blocks)
 {
-	u32 most = 1 + (1 + 1 + (3 * (1 - root->root.height + 1)));
+	u32 tree_blocks = (((1 + root->root.height) * 2) * 3) * extents;
+	u32 most = 1 + tree_blocks + addl_blocks;

 	if (le32_to_cpu(alloc->avail.first_nr) < most) {
 		scoutfs_inc_counter(sb, alloc_list_avail_lo);
@@ -901,7 +1122,7 @@ int scoutfs_alloc_fill_list(struct super_block *sb,
 		.alloc = alloc,
 		.wri = wri,
 		.root = root,
-		.type = SCOUTFS_FREE_EXTENT_LEN_TYPE,
+		.zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE,
 	};
 	struct scoutfs_alloc_list_block *lblk;
 	struct scoutfs_block *bl = NULL;
@@ -923,8 +1144,7 @@ int scoutfs_alloc_fill_list(struct super_block *sb,
 		goto out;
 	lblk = bl->data;

-	while (le32_to_cpu(lblk->nr) < target &&
-	       list_can_cow(sb, alloc, root)) {
+	while (le32_to_cpu(lblk->nr) < target && list_has_blocks(sb, alloc, root, 1, 0)) {

 		ret = scoutfs_ext_alloc(sb, &alloc_ext_ops, &args, 0, 0,
 					target - le32_to_cpu(lblk->nr), &ext);
@@ -958,7 +1178,7 @@ int scoutfs_alloc_empty_list(struct super_block *sb,
 		.alloc = alloc,
 		.wri = wri,
 		.root = root,
-		.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
+		.zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE,
 	};
 	struct scoutfs_alloc_list_block *lblk = NULL;
 	struct scoutfs_block *bl = NULL;
@@ -968,7 +1188,7 @@ int scoutfs_alloc_empty_list(struct super_block *sb,
 	if (WARN_ON_ONCE(lhead_in_alloc(alloc, lhead)))
 		return -EINVAL;

-	while (lhead->ref.blkno && list_can_cow(sb, alloc, args.root)) {
+	while (lhead->ref.blkno && list_has_blocks(sb, alloc, args.root, 1, 1)) {

 		if (lhead->first_nr == 0) {
 			ret = trim_empty_first_block(sb, alloc, wri, lhead);
@@ -1091,6 +1311,20 @@ bool scoutfs_alloc_meta_low(struct super_block *sb,
 	return lo;
 }

+bool scoutfs_alloc_test_flag(struct super_block *sb,
+			    struct scoutfs_alloc *alloc, u32 flag)
+{
+	unsigned int seq;
+	bool set;
+
+	do {
+		seq = read_seqbegin(&alloc->seqlock);
+		set = !!(le32_to_cpu(alloc->avail.flags) & flag);
+	} while (read_seqretry(&alloc->seqlock, seq));
+
+	return set;
+}
+
 /*
 * Call the callers callback for every persistent allocator structure
 * we can find.
@@ -1102,9 +1336,15 @@ int scoutfs_alloc_foreach(struct super_block *sb,
 	struct scoutfs_block_ref refs[2] = {{0,}};
 	struct scoutfs_super_block *super = NULL;
 	struct scoutfs_srch_compact *sc;
+	struct scoutfs_log_merge_request *lmreq;
+	struct scoutfs_log_merge_complete *lmcomp;
 	struct scoutfs_log_trees lt;
 	SCOUTFS_BTREE_ITEM_REF(iref);
 	struct scoutfs_key key;
+	int expected;
+	u64 avail_tot;
+	u64 freed_tot;
+	u64 id;
 	int ret;

 	super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
@@ -1211,6 +1451,57 @@ retry:
 		scoutfs_key_inc(&key);
 	}

+	/* log merge allocators */
+	memset(&key, 0, sizeof(key));
+	key.sk_zone = SCOUTFS_LOG_MERGE_REQUEST_ZONE;
+	expected = sizeof(*lmreq);
+	id = 0;
+	avail_tot = 0;
+	freed_tot = 0;
+
+	for (;;) {
+		ret = scoutfs_btree_next(sb, &super->log_merge, &key, &iref);
+		if (ret == 0) {
+			if (iref.key->sk_zone != key.sk_zone) {
+				ret = -ENOENT;
+			} else if (iref.val_len == expected) {
+				key = *iref.key;
+				if (key.sk_zone == SCOUTFS_LOG_MERGE_REQUEST_ZONE) {
+					lmreq = iref.val;
+					id = le64_to_cpu(lmreq->rid);
+					avail_tot = le64_to_cpu(lmreq->meta_avail.total_nr);
+					freed_tot = le64_to_cpu(lmreq->meta_freed.total_nr);
+				} else {
+					lmcomp = iref.val;
+					id = le64_to_cpu(lmcomp->rid);
+					avail_tot = le64_to_cpu(lmcomp->meta_avail.total_nr);
+					freed_tot = le64_to_cpu(lmcomp->meta_freed.total_nr);
+				}
+			} else {
+				ret = -EIO;
+			}
+			scoutfs_btree_put_iref(&iref);
+		}
+		if (ret == -ENOENT) {
+			if (key.sk_zone == SCOUTFS_LOG_MERGE_REQUEST_ZONE) {
+				memset(&key, 0, sizeof(key));
+				key.sk_zone = SCOUTFS_LOG_MERGE_COMPLETE_ZONE;
+				expected = sizeof(*lmcomp);
+				continue;
+			}
+			break;
+		}
+		if (ret < 0)
+			goto out;
+
+		ret = cb(sb, arg, SCOUTFS_ALLOC_OWNER_LOG_MERGE, id, true, true, avail_tot) ?:
+		      cb(sb, arg, SCOUTFS_ALLOC_OWNER_LOG_MERGE, id, true, false, freed_tot);
+		if (ret < 0)
+			goto out;
+
+		scoutfs_key_inc(&key);
+	}
+
 	ret = 0;
 out:
 	if (ret == -ESTALE) {
@@ -1227,3 +1518,63 @@ out:
 	kfree(sc);
 	return ret;
 }
+
+
+struct foreach_cb_args {
+	scoutfs_alloc_extent_cb_t cb;
+	void *cb_arg;
+};
+
+static int alloc_btree_extent_item_cb(struct super_block *sb, struct scoutfs_key *key,
+				      void *val, int val_len, void *arg)
+{
+	struct foreach_cb_args *cba = arg;
+	struct scoutfs_extent ext;
+
+	if (key->sk_zone != SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
+		return -ENOENT;
+
+	ext_from_key(&ext, key);
+	cba->cb(sb, cba->cb_arg, &ext);
+
+	return 0;
+}
+
+/*
+ * Call the caller's callback on each extent stored in the allocator's
+ * btree.  The callback is called on extents in order of starting blkno.
+ */
+int scoutfs_alloc_extents_cb(struct super_block *sb, struct scoutfs_alloc_root *root,
+			     scoutfs_alloc_extent_cb_t cb, void *cb_arg)
+{
+	struct foreach_cb_args cba = {
+		.cb = cb,
+		.cb_arg = cb_arg,
+	};
+	struct scoutfs_key start;
+	struct scoutfs_key end;
+	struct scoutfs_key key;
+	int ret;
+
+	init_ext_key(&key, SCOUTFS_FREE_EXTENT_BLKNO_ZONE, 0, 1);
+
+	for (;;) {
+		/* will stop at order items before getting stuck in final block */
+		BUILD_BUG_ON(SCOUTFS_FREE_EXTENT_BLKNO_ZONE > SCOUTFS_FREE_EXTENT_ORDER_ZONE);
+		init_ext_key(&start, SCOUTFS_FREE_EXTENT_BLKNO_ZONE, 0, 1);
+		init_ext_key(&end, SCOUTFS_FREE_EXTENT_ORDER_ZONE, 0, 1);
+
+		ret = scoutfs_btree_read_items(sb, &root->root, &key, &start, &end,
+					       alloc_btree_extent_item_cb, &cba);
+		if (ret < 0 || end.sk_zone != SCOUTFS_FREE_EXTENT_BLKNO_ZONE) {
+			if (ret == -ENOENT)
+				ret = 0;
+			break;
+		}
+
+		key = end;
+		scoutfs_key_inc(&key);
+	}
+
+	return ret;
+}
--- a/kmod/src/alloc.h
+++ b/kmod/src/alloc.h
@@ -38,6 +38,10 @@
 #define SCOUTFS_ALLOC_DATA_LG_THRESH \
 	(8ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)

+/* the client will force commits if data allocators get too low */
+#define SCOUTFS_ALLOC_DATA_REFILL_THRESH \
+	((256ULL * 1024 * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
+
 /*
 * Fill client alloc roots to the target when they fall below the lo
 * threshold.
@@ -55,15 +59,16 @@
 #define SCOUTFS_SERVER_DATA_FILL_LO \
 	(1ULL * 1024 * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)

+
 /*
- * Each of the server meta_alloc roots will try to keep a minimum amount
- * of free blocks.  The server will swap roots when its current avail
- * falls below the threshold while the freed root is still above it.  It
- * must have room for all the largest allocation attempted in a
- * transaction on the server.
+ * Log merge meta allocations are only used for one request and will
+ * never use more than the dirty limit.
 */
-#define SCOUTFS_SERVER_META_ALLOC_MIN \
-	(SCOUTFS_SERVER_META_FILL_TARGET * 2)
+#define SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT	(64ULL * 1024 * 1024)
+/* a few extra blocks for alloc blocks */
+#define SCOUTFS_SERVER_MERGE_FILL_TARGET	\
+	((SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT >> SCOUTFS_BLOCK_LG_SHIFT) + 4)
+#define SCOUTFS_SERVER_MERGE_FILL_LO		SCOUTFS_SERVER_MERGE_FILL_TARGET

 /*
 * A run-time use of a pair of persistent avail/freed roots as a
@@ -125,7 +130,14 @@ int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
 int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 		       struct scoutfs_block_writer *wri,
 		       struct scoutfs_alloc_root *dst,
-		       struct scoutfs_alloc_root *src, u64 total);
+		       struct scoutfs_alloc_root *src, u64 total,
+		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks);
+int scoutfs_alloc_insert(struct super_block *sb, struct scoutfs_alloc *alloc,
+			 struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
+			 u64 start, u64 len);
+int scoutfs_alloc_remove(struct super_block *sb, struct scoutfs_alloc *alloc,
+			 struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
+			 u64 start, u64 len);

 int scoutfs_alloc_fill_list(struct super_block *sb,
 			    struct scoutfs_alloc *alloc,
@@ -146,6 +158,8 @@ int scoutfs_alloc_splice_list(struct super_block *sb,

 bool scoutfs_alloc_meta_low(struct super_block *sb,
 			    struct scoutfs_alloc *alloc, u32 nr);
+bool scoutfs_alloc_test_flag(struct super_block *sb,
+			    struct scoutfs_alloc *alloc, u32 flag);

 typedef int (*scoutfs_alloc_foreach_cb_t)(struct super_block *sb, void *arg,
 					  int owner, u64 id,
@@ -153,4 +167,9 @@ typedef int (*scoutfs_alloc_foreach_cb_t)(struct super_block *sb, void *arg,
 int scoutfs_alloc_foreach(struct super_block *sb,
 			  scoutfs_alloc_foreach_cb_t cb, void *arg);

+typedef void (*scoutfs_alloc_extent_cb_t)(struct super_block *sb, void *cb_arg,
+					  struct scoutfs_extent *ext);
+int scoutfs_alloc_extents_cb(struct super_block *sb, struct scoutfs_alloc_root *root,
+			     scoutfs_alloc_extent_cb_t cb, void *cb_arg);
+
 #endif
--- a/kmod/src/block.c
+++ b/kmod/src/block.c
@@ -200,7 +200,9 @@ static void block_free(struct super_block *sb, struct block_private *bp)
 	else
 		BUG();

-	WARN_ON_ONCE(!list_empty(&bp->dirty_entry));
+	/* ok to tear down dirty blocks when forcing unmount */
+	WARN_ON_ONCE(!scoutfs_forcing_unmount(sb) && !list_empty(&bp->dirty_entry));
+
 	WARN_ON_ONCE(atomic_read(&bp->refcount));
 	WARN_ON_ONCE(atomic_read(&bp->io_count));
 	kfree(bp);
@@ -485,6 +487,9 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
 	sector_t sector;
 	int ret = 0;

+	if (scoutfs_forcing_unmount(sb))
+		return -EIO;
+
 	sector = bp->bl.blkno << (SCOUTFS_BLOCK_LG_SHIFT - 9);

 	WARN_ON_ONCE(bp->bl.blkno == U64_MAX);
@@ -640,9 +645,11 @@ static struct block_private *block_read(struct super_block *sb, u64 blkno)
 			goto out;
 	}

-	ret = wait_event_interruptible(binf->waitq, uptodate_or_error(bp));
-	if (ret == 0 && test_bit(BLOCK_BIT_ERROR, &bp->bits))
+	wait_event(binf->waitq, uptodate_or_error(bp));
+	if (test_bit(BLOCK_BIT_ERROR, &bp->bits))
 		ret = -EIO;
+	else
+		ret = 0;

 out:
 	if (ret < 0) {
@@ -1148,7 +1155,7 @@ static void sm_block_bio_end_io(struct bio *bio, int err)
 * only layer that sees the full block buffer so we pass the calculated
 * crc to the caller for them to check in their context.
 */
-static int sm_block_io(struct block_device *bdev, int rw, u64 blkno,
+static int sm_block_io(struct super_block *sb, struct block_device *bdev, int rw, u64 blkno,
 		       struct scoutfs_block_header *hdr, size_t len,
 		       __le32 *blk_crc)
 {
@@ -1160,6 +1167,9 @@ static int sm_block_io(struct block_device *bdev, int rw, u64 blkno,

 	BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SM_SIZE);

+	if (scoutfs_forcing_unmount(sb))
+		return -EIO;
+
 	if (WARN_ON_ONCE(len > SCOUTFS_BLOCK_SM_SIZE) ||
 	    WARN_ON_ONCE(!(rw & WRITE) && !blk_crc))
 		return -EINVAL;
@@ -1212,14 +1222,14 @@ int scoutfs_block_read_sm(struct super_block *sb,
 			  struct scoutfs_block_header *hdr, size_t len,
 			  __le32 *blk_crc)
 {
-	return sm_block_io(bdev, READ, blkno, hdr, len, blk_crc);
+	return sm_block_io(sb, bdev, READ, blkno, hdr, len, blk_crc);
 }

 int scoutfs_block_write_sm(struct super_block *sb,
 			   struct block_device *bdev, u64 blkno,
 			   struct scoutfs_block_header *hdr, size_t len)
 {
-	return sm_block_io(bdev, WRITE, blkno, hdr, len, NULL);
+	return sm_block_io(sb, bdev, WRITE, blkno, hdr, len, NULL);
 }

 int scoutfs_block_setup(struct super_block *sb)
--- a/kmod/src/btree.c
+++ b/kmod/src/btree.c
@@ -83,6 +83,10 @@ enum btree_walk_flags {
 	 BTW_ALLOC	= (1 <<  3), /* allocate a new block for 0 ref, requires dirty */
 	 BTW_INSERT	= (1 <<  4), /* walking to insert, try splitting */
 	 BTW_DELETE	= (1 <<  5), /* walking to delete, try joining */
+	 BTW_PAR_RNG	= (1 <<  6), /* return range through final parent */
+	 BTW_GET_PAR	= (1 <<  7), /* get reference to final parent */
+	 BTW_SET_PAR	= (1 <<  8), /* override reference to final parent */
+	 BTW_SUBTREE	= (1 <<  9), /* root is parent subtree, return -ERANGE if split/join */
 };

 /* total length of the value payload */
@@ -104,16 +108,22 @@ static inline unsigned int item_bytes(struct scoutfs_btree_item *item)
 }

 /*
- * Join blocks when they both are 1/4 full.  This puts some distance
- * between the join threshold and the full threshold for splitting.
- * Blocks that just split or joined need to undergo a reasonable amount
- * of item modification before they'll split or join again.
+ * Refill blocks from their siblings when they're under 1/4 full.  This
+ * puts some distance between the join threshold and the full threshold
+ * for splitting.  Blocks that just split or joined need to undergo a
+ * reasonable amount of item modification before they'll split or join
+ * again.
 */
 static unsigned int join_low_watermark(void)
 {
 	return (SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_btree_block)) / 4;
 }

+static bool total_above_join_low_water(struct scoutfs_btree_block *bt)
+{
+	return le16_to_cpu(bt->total_item_bytes) >= join_low_watermark();
+}
+
 /*
 * return the integer percentages of total space the block could have
 * consumed by items that is currently consumed.
@@ -512,6 +522,7 @@ static void create_item(struct scoutfs_btree_block *bt,

 	item->val_off = insert_value(bt, ptr_off(bt, item), val, val_len);
 	item->val_len = cpu_to_le16(val_len);
+	memset(item->__pad, 0, sizeof(item->__pad));

 	le16_add_cpu(&bt->total_item_bytes, item_bytes(item));
 }
@@ -805,12 +816,13 @@ static int try_join(struct super_block *sb,
 	struct scoutfs_btree_block *sib;
 	struct scoutfs_block *sib_bl;
 	struct scoutfs_block_ref *ref;
+	const unsigned int lwm = join_low_watermark();
 	unsigned int sib_tot;
 	bool move_right;
 	int to_move;
 	int ret;

-	if (le16_to_cpu(bt->total_item_bytes) >= join_low_watermark())
+	if (total_above_join_low_water(bt))
 		return 0;

 	scoutfs_inc_counter(sb, btree_join);
@@ -830,18 +842,23 @@ static int try_join(struct super_block *sb,
 		return ret;
 	sib = sib_bl->data;

-	sib_tot = le16_to_cpu(bt->total_item_bytes);
-	if (sib_tot < join_low_watermark())
+	/* combine if resulting block would be up to 75% full, move big chunk otherwise */
+	sib_tot = le16_to_cpu(sib->total_item_bytes);
+	if (sib_tot <= lwm * 2)
 		to_move = sib_tot;
 	else
-		to_move = sib_tot - join_low_watermark();
+		to_move = lwm;

-	if (le16_to_cpu(bt->mid_free_len) < to_move) {
+	/* compact to make room for over-estimate of worst case move overrun */
+	if (le16_to_cpu(bt->mid_free_len) <
+	    (to_move + item_len_bytes(SCOUTFS_BTREE_MAX_VAL_LEN))) {
 		ret = compact_values(sb, bt);
-		if (ret < 0)
+		if (ret < 0) {
 			scoutfs_block_put(sb, sib_bl);
-		return ret;
+			return ret;
+		}
 	}
+
 	move_items(bt, sib, move_right, to_move);

 	/* update our parent's item */
@@ -904,20 +921,21 @@ static bool bad_avl_node_off(__le16 node_off, int nr)
 *  - call after leaf modification
 *  - padding is zero
 */
-static void verify_btree_block(struct super_block *sb,
+__attribute__((unused))
+static void verify_btree_block(struct super_block *sb, char *str,
 			       struct scoutfs_btree_block *bt, int level,
-			       struct scoutfs_key *start,
+			       bool last_ref, struct scoutfs_key *start,
 			       struct scoutfs_key *end)
 {
 	__le16 *buckets = leaf_item_hash_buckets(bt);
 	struct scoutfs_btree_item *item;
+	struct scoutfs_avl_node *node;
 	char *reason = NULL;
 	int first_val = 0;
 	int hashed = 0;
 	int end_off;
 	int tot = 0;
 	int i = 0;
-	int j = 0;
 	int nr;

 	if (bt->level != level) {
@@ -956,8 +974,9 @@ static void verify_btree_block(struct super_block *sb,
 			goto out;
 		}

-		for (j = 0; j < sizeof(item->__pad); j++) {
-			WARN_ON_ONCE(item->__pad[j] != 0);
+		if (memchr_inv(item->__pad, '\0', sizeof(item->__pad))) {
+			reason = "item struct __pad isn't zero";
+			goto out;
 		}

 		if (scoutfs_key_compare(&item->key, start) < 0 ||
@@ -972,19 +991,29 @@ static void verify_btree_block(struct super_block *sb,
 			goto out;
 		}

+		if (level > 0 && le16_to_cpu(item->val_len) !=
+				 sizeof(struct scoutfs_block_ref)) {
+			reason = "parent item val not sizeof ref";
+			goto out;
+		}
+
 		if (le16_to_cpu(item->val_len) > SCOUTFS_BTREE_MAX_VAL_LEN) {
 			reason = "bad item val len";
 			goto out;
 		}

+		if (le16_to_cpu(item->val_off) % SCOUTFS_BTREE_VALUE_ALIGN) {
+			reason = "item value not aligned";
+			goto out;
+		}
+
 		if (((int)le16_to_cpu(item->val_off) +
 		     le16_to_cpu(item->val_len)) > end_off) {
 			reason = "item value outside valid";
 			goto out;
 		}

-		tot += sizeof(struct scoutfs_btree_item) +
-		       le16_to_cpu(item->val_len);
+		tot += item_len_bytes(le16_to_cpu(item->val_len));

 		if (item->val_len != 0) {
 			first_val = min_t(int, first_val,
@@ -992,6 +1021,15 @@ static void verify_btree_block(struct super_block *sb,
 		}
 	}

+	if (last_ref && level > 0 &&
+	    (node = scoutfs_avl_last(&bt->item_root)) != NULL) {
+		item = node_item(node);
+		if (scoutfs_key_compare(&item->key, end) != 0) {
+			reason = "final ref item key not range end";
+			goto out;
+		}
+	}
+
 	for (i = 0; level == 0 && i < SCOUTFS_BTREE_LEAF_ITEM_HASH_NR; i++) {
 		if (buckets[i] == 0)
 			continue;
@@ -1024,17 +1062,18 @@ out:
 	if (!reason)
 		return;

-	printk("found btree block inconsistency: %s\n", reason);
-	printk("start "SK_FMT" end "SK_FMT"\n", SK_ARG(start), SK_ARG(end));
+	printk("verifying btree %s: %s\n", str, reason);
+	printk("args: level %u last_ref %u start "SK_FMT" end "SK_FMT"\n",
+		level, last_ref, SK_ARG(start), SK_ARG(end));
 	printk("calced: i %u tot %u hashed %u fv %u\n",
 	       i, tot, hashed, first_val);

-	printk("hdr: crc %x magic %x fsid %llx seq %llx blkno %llu\n", 
+	printk("bt hdr: crc %x magic %x fsid %llx seq %llx blkno %llu\n", 
 		le32_to_cpu(bt->hdr.crc), le32_to_cpu(bt->hdr.magic),
 		le64_to_cpu(bt->hdr.fsid), le64_to_cpu(bt->hdr.seq),
 		le64_to_cpu(bt->hdr.blkno));
 	printk("item_root: node %u\n", le16_to_cpu(bt->item_root.node));
-	printk("nr %u tib %u mfl %u lvl %u\n",
+	printk("bt: nr %u tib %u mfl %u lvl %u\n",
 		le16_to_cpu(bt->nr_items), le16_to_cpu(bt->total_item_bytes),
 		le16_to_cpu(bt->mid_free_len), bt->level);

@@ -1051,6 +1090,92 @@ out:
 	BUG();
 }

+/*
+ * Walk from the root to the leaf, verifying the blocks traversed.
+ */
+__attribute__((unused))
+static void verify_btree_walk(struct super_block *sb, char *str,
+			      struct scoutfs_btree_root *root,
+			      struct scoutfs_key *key)
+{
+	struct scoutfs_avl_node *next_node;
+	struct scoutfs_avl_node *node;
+	struct scoutfs_btree_item *item;
+	struct scoutfs_btree_item *prev;
+	struct scoutfs_block *bl = NULL;
+	struct scoutfs_btree_block *bt;
+	struct scoutfs_block_ref ref;
+	struct scoutfs_key start;
+	struct scoutfs_key end;
+	bool last_ref;
+	int level;
+	int ret;
+
+	if (root->height == 0 && root->ref.blkno != 0) {
+		WARN_ONCE(1, "invalid btree root height %u blkno %llu seq %016llx\n",
+			root->height, le64_to_cpu(root->ref.blkno),
+			le64_to_cpu(root->ref.seq));
+		return;
+	}
+
+	if (root->height == 0)
+		return;
+
+	scoutfs_key_set_zeros(&start);
+	scoutfs_key_set_ones(&end);
+	level = root->height;
+	ref = root->ref;
+	/* first parent last ref isn't all ones in subtrees */
+	last_ref = false;
+
+	while(level-- > 0) {
+		scoutfs_block_put(sb, bl);
+		bl = NULL;
+		ret = get_ref_block(sb, NULL, NULL, 0, &ref, &bl);
+		if (ret) {
+			printk("verifying  btree %s: read error %d\n",
+			       str, ret);
+			break;
+		}
+		bt = bl->data;
+
+		verify_btree_block(sb, str, bt, level, last_ref, &start, &end);
+
+		if (level == 0)
+			break;
+
+		node = scoutfs_avl_search(&bt->item_root, cmp_key_item, key,
+					  NULL, NULL, &next_node, NULL);
+		item = node_item(node ?: next_node);
+
+		if (item == NULL) {
+			printk("verifying btree %s: no ref item\n", str);
+			printk("root: height %u blkno %llu seq %016llx\n",
+			       root->height, le64_to_cpu(root->ref.blkno),
+			       le64_to_cpu(root->ref.seq));
+			printk("walk level %u start "SK_FMT" end "SK_FMT"\n",
+				level, SK_ARG(&start), SK_ARG(&end));
+
+			printk("block: level %u blkno %llu seq %016llx\n",
+			       bt->level, le64_to_cpu(bt->hdr.blkno),
+			       le64_to_cpu(bt->hdr.seq));
+			printk("key: "SK_FMT"\n", SK_ARG(key));
+			BUG();
+		}
+
+		if ((prev = prev_item(bt, item))) {
+			start = *item_key(prev);
+			scoutfs_key_inc(&start);
+		}
+		end = *item_key(item);
+
+		memcpy(&ref, item_val(bt, item), sizeof(ref));
+		last_ref = !next_item(bt, item);
+	}
+
+	scoutfs_block_put(sb, bl);
+}
+
 struct btree_walk_key_range {
 	struct scoutfs_key start;
 	struct scoutfs_key end;
@@ -1082,7 +1207,8 @@ static int btree_walk(struct super_block *sb,
 		      int flags, struct scoutfs_key *key,
 		      unsigned int val_len,
 		      struct scoutfs_block **bl_ret,
-		      struct btree_walk_key_range *kr)
+		      struct btree_walk_key_range *kr,
+		      struct scoutfs_btree_root *par_root)
 {
 	struct scoutfs_block *par_bl = NULL;
 	struct scoutfs_block *bl = NULL;
@@ -1098,9 +1224,15 @@ static int btree_walk(struct super_block *sb,
 	unsigned int nr;
 	int ret;

-	if (WARN_ON_ONCE((flags & BTW_DIRTY) && (!alloc || !wri)))
+	if (WARN_ON_ONCE((flags & BTW_DIRTY) && (!alloc || !wri)) ||
+	    WARN_ON_ONCE((flags & BTW_PAR_RNG) && !kr) ||
+	    WARN_ON_ONCE((flags & (BTW_GET_PAR|BTW_SET_PAR)) && !par_root))
 		return -EINVAL;

+	/* all ops come through walk and walk calls all reads */
+	if (scoutfs_forcing_unmount(sb))
+		return -EIO;
+
 	scoutfs_inc_counter(sb, btree_walk);

 restart:
@@ -1121,7 +1253,14 @@ restart:
 	ret = 0;

 	if (!root->height) {
-		if (!(flags & BTW_INSERT)) {
+		if (flags & BTW_GET_PAR) {
+			memset(par_root, 0, sizeof(*par_root));
+			*root = *par_root;
+			ret = 0;
+		} else if (flags & BTW_SET_PAR) {
+			*root = *par_root;
+			ret = 0;
+		} else if (!(flags & BTW_INSERT)) {
 			ret = -ENOENT;
 		} else {
 			ret = get_ref_block(sb, alloc, wri, BTW_ALLOC | BTW_DIRTY, &root->ref, &bl);
@@ -1140,14 +1279,40 @@ restart:

 		trace_scoutfs_btree_walk(sb, root, key, flags, level, ref);

+		/* par range set by ref to last parent block */
+		if (level < 2 && (flags & BTW_PAR_RNG)) {
+			ret = 0;
+			break;
+		}
+
+		if (level < 2 && (flags & BTW_GET_PAR)) {
+			par_root->ref = *ref;
+			par_root->height = level + 1;
+			ret = 0;
+			break;
+		}
+
+		if (level < 2 && (flags & BTW_SET_PAR)) {
+			if (ref == &root->ref) {
+				/* single parent block is replaced, can shrink/grow */
+				*root = *par_root;
+			} else {
+				/* subtree replacing one of parents must match height */
+				if (par_root->height != level + 1) {
+					ret = -EINVAL;
+					break;
+				}
+				*ref = par_root->ref;
+			}
+			ret = 0;
+			break;
+		}
+
 		ret = get_ref_block(sb, alloc, wri, flags, ref, &bl);
 		if (ret)
 			break;
 		bt = bl->data;

-		if (0 && kr)
-			verify_btree_block(sb, bt, level, &kr->start, &kr->end);
-
 		/* XXX more aggressive block verification, before ref updates? */
 		if (bt->level != level) {
 			scoutfs_corruption(sb, SC_BTREE_BLOCK_LEVEL,
@@ -1163,6 +1328,17 @@ restart:
 			break;
 		}

+		/*
+		 * join/split won't check subtree parent root, let
+		 * caller know when it needs to be split/joined.
+		 */
+		if ((flags & BTW_SUBTREE) && level == 1 &&
+		    (!total_above_join_low_water(bt) ||
+		     !mid_free_item_room(bt, sizeof(struct scoutfs_block_ref)))) {
+			ret = -ERANGE;
+			break;
+		}
+
 		/*
 		 * Splitting and joining can add or remove parents or
 		 * change the parent item we use to reach the child
@@ -1288,7 +1464,7 @@ int scoutfs_btree_lookup(struct super_block *sb,
 	if (WARN_ON_ONCE(iref->key))
 		return -EINVAL;

-	ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, NULL);
+	ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, NULL, NULL);
 	if (ret == 0) {
 		bt = bl->data;

@@ -1340,7 +1516,7 @@ int scoutfs_btree_insert(struct super_block *sb,
 		return -EINVAL;

 	ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT, key,
-			 val_len, &bl, NULL);
+			 val_len, &bl, NULL, NULL);
 	if (ret == 0) {
 		bt = bl->data;

@@ -1402,7 +1578,7 @@ int scoutfs_btree_update(struct super_block *sb,
 		return -EINVAL;

 	ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT, key,
-			 val_len, &bl, NULL);
+			 val_len, &bl, NULL, NULL);
 	if (ret == 0) {
 		bt = bl->data;

@@ -1444,7 +1620,7 @@ int scoutfs_btree_force(struct super_block *sb,
 		return -EINVAL;

 	ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT, key,
-			 val_len, &bl, NULL);
+			 val_len, &bl, NULL, NULL);
 	if (ret == 0) {
 		bt = bl->data;

@@ -1482,7 +1658,7 @@ int scoutfs_btree_delete(struct super_block *sb,
 	scoutfs_inc_counter(sb, btree_delete);

 	ret = btree_walk(sb, alloc, wri, root, BTW_DELETE | BTW_DIRTY, key,
-			 0, &bl, NULL);
+			 0, &bl, NULL, NULL);
 	if (ret == 0) {
 		bt = bl->data;

@@ -1546,7 +1722,7 @@ static int btree_iter(struct super_block *sb,struct scoutfs_btree_root *root,

 	for (;;) {
 		ret = btree_walk(sb, NULL, NULL, root, flags, &walk_key,
-				 0, &bl, &kr);
+				 0, &bl, &kr, NULL);
 		if (ret < 0)
 			break;
 		bt = bl->data;
@@ -1619,7 +1795,8 @@ int scoutfs_btree_dirty(struct super_block *sb,

 	scoutfs_inc_counter(sb, btree_dirty);

-	ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY, key, 0, &bl, NULL);
+	ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY, key, 0, &bl,
+			 NULL, NULL);
 	if (ret == 0) {
 		bt = bl->data;

@@ -1655,7 +1832,7 @@ int scoutfs_btree_read_items(struct super_block *sb,
 	struct scoutfs_block *bl;
 	int ret;

-	ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, &kr);
+	ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, &kr, NULL);
 	if (ret < 0)
 		goto out;
 	bt = bl->data;
@@ -1710,7 +1887,7 @@ int scoutfs_btree_insert_list(struct super_block *sb,

 	while (lst) {
 		ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_INSERT,
-				 &lst->key, lst->val_len, &bl, &kr);
+				 &lst->key, lst->val_len, &bl, &kr, NULL);
 		if (ret < 0)
 			goto out;
 		bt = bl->data;
@@ -1738,3 +1915,542 @@ int scoutfs_btree_insert_list(struct super_block *sb,
 out:
 	return ret;
 }
+
+/*
+ * Descend towards the leaf that would contain the key.  As we arrive at
+ * the last parent block, set start and end to the range of keys that
+ * could be found through traversal of that last parent.
+ *
+ * If the tree is too short for parent blocks then the max key range
+ * is returned.
+ */
+int scoutfs_btree_parent_range(struct super_block *sb,
+			       struct scoutfs_btree_root *root,
+			       struct scoutfs_key *key,
+			       struct scoutfs_key *start,
+			       struct scoutfs_key *end)
+{
+	struct btree_walk_key_range kr;
+	int ret;
+
+	ret = btree_walk(sb, NULL, NULL, root, BTW_PAR_RNG, key, 0, NULL,
+			 &kr, NULL);
+	if (ret == -ENOENT)
+		ret = 0;
+
+	*start = kr.start;
+	*end = kr.end;
+	return ret;
+}
+
+/*
+ * Initialize the caller's root as a subtree whose ref points to the
+ * last parent found as we traverse towards the leaf containing the key.
+ * If the tree is too small to have multiple blocks at the final parent
+ * level then the caller's root will be initialized to equal the full input
+ * root.  If the tree is empty then the par root will also be empty.
+ */
+int scoutfs_btree_get_parent(struct super_block *sb,
+			     struct scoutfs_btree_root *root,
+			     struct scoutfs_key *key,
+			     struct scoutfs_btree_root *par_root)
+{
+	return btree_walk(sb, NULL, NULL, root, BTW_GET_PAR, key, 0, NULL,
+			  NULL, par_root);
+}
+
+/*
+ * Dirty a path towards the leaf block containing the key.  As we reach
+ * the reference to the final parent block override it with the ref in
+ * the caller's par_root.  If the tree only has a single block at the final
+ * parent level, or a single leaf block, then the entire tree is
+ * replaced with the caller's root.
+ *
+ * This manages allocs and frees while dirtying blocks in the path to
+ * the ref, but it doesn't account for allocating the blocks that are
+ * referenced by the ref nor freeing blocks referenced by the old ref
+ * that's overwritten.  Keeping allocators in sync with the result of
+ * the ref override is the responsibility of the caller.
+ */
+int scoutfs_btree_set_parent(struct super_block *sb,
+			     struct scoutfs_alloc *alloc,
+			     struct scoutfs_block_writer *wri,
+			     struct scoutfs_btree_root *root,
+			     struct scoutfs_key *key,
+			     struct scoutfs_btree_root *par_root)
+{
+
+	trace_scoutfs_btree_set_parent(sb, root, key, par_root);
+
+	return btree_walk(sb, alloc, wri, root, BTW_DIRTY | BTW_SET_PAR,
+			  key, 0, NULL, NULL, par_root);
+}
+
+/*
+ * Descend to the leaf, making sure that all the blocks conform to the
+ * balance constraints.  Blocks below the low threshold will be joined.
+ * This is called to split blocks that were too large for insertions,
+ * but those insertions were in a distant context and we don't bother
+ * communicating the val_len back here.  We just try to insert a max
+ * value.
+ *
+ * This always dirties all the way to the leaf.  It could be made more
+ * efficient with more btree walk flags to walk and check for blocks
+ * that need balancing, and then walks that don't dirty unless they need
+ * to join/split.
+ */
+int scoutfs_btree_rebalance(struct super_block *sb,
+			    struct scoutfs_alloc *alloc,
+			    struct scoutfs_block_writer *wri,
+			    struct scoutfs_btree_root *root,
+			    struct scoutfs_key *key)
+{
+	return btree_walk(sb, alloc, wri, root,
+			  BTW_DIRTY | BTW_INSERT | BTW_DELETE,
+			  key, SCOUTFS_BTREE_MAX_VAL_LEN, NULL, NULL, NULL);
+}
+
+struct merge_pos {
+	struct rb_node node;
+	struct scoutfs_btree_root *root;
+	struct scoutfs_key key;
+	unsigned int val_len;
+	u8 val[SCOUTFS_BTREE_MAX_VAL_LEN];
+};
+
+/*
+ * Find the next item in the mpos's root after its key and make sure
+ * that it's in its sorted position in the rbtree.  We're responsible
+ * for freeing the mpos if we don't put it back in the pos_root.  This
+ * happens naturally when its item_root has no more items to
+ * merge.
+ */
+static int reset_mpos(struct super_block *sb, struct rb_root *pos_root,
+		      struct merge_pos *mpos, struct scoutfs_key *end,
+		      scoutfs_btree_merge_cmp_t merge_cmp)
+{
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct merge_pos *walk;
+	struct rb_node *parent;
+	struct rb_node **node;
+	int key_cmp;
+	int val_cmp;
+	int ret;
+
+restart:
+	if (!RB_EMPTY_NODE(&mpos->node)) {
+		rb_erase(&mpos->node, pos_root);
+		RB_CLEAR_NODE(&mpos->node);
+	}
+
+	/* find the next item in the root within end */
+	ret = scoutfs_btree_next(sb, mpos->root, &mpos->key, &iref);
+	if (ret == 0) {
+		if (scoutfs_key_compare(iref.key, end) > 0) {
+			ret = -ENOENT;
+		} else {
+			mpos->key = *iref.key;
+			mpos->val_len = iref.val_len;
+			memcpy(mpos->val, iref.val, iref.val_len);
+		}
+		scoutfs_btree_put_iref(&iref);
+	}
+	if (ret < 0) {
+		kfree(mpos);
+		if (ret == -ENOENT)
+			ret = 0;
+		goto out;
+	}
+
+rewalk:
+	/* sort merge items by key then oldest to newest */
+	node = &pos_root->rb_node;
+	parent = NULL;
+	while (*node) {
+		parent = *node;
+		walk = container_of(*node, struct merge_pos, node);
+
+		key_cmp = scoutfs_key_compare(&mpos->key, &walk->key);
+		val_cmp = merge_cmp(mpos->val, mpos->val_len,
+				    walk->val, walk->val_len);
+
+		/* drop old versions of logged keys as we discover them */
+		if (key_cmp == 0) {
+			scoutfs_inc_counter(sb, btree_merge_drop_old);
+			if (val_cmp < 0)  {
+				scoutfs_key_inc(&mpos->key);
+				goto restart;
+			} else {
+				BUG_ON(val_cmp == 0);
+				rb_erase(&walk->node, pos_root);
+				kfree(walk);
+				goto rewalk;
+			}
+		}
+
+		if ((key_cmp ?: val_cmp) < 0)
+			node = &(*node)->rb_left;
+		else
+			node = &(*node)->rb_right;
+	}
+
+	rb_link_node(&mpos->node, parent, node);
+	rb_insert_color(&mpos->node, pos_root);
+	ret = 0;
+out:
+	return ret;
+}
+
+static struct merge_pos *first_mpos(struct rb_root *root)
+{
+	struct rb_node *node = rb_first(root);
+	if (node)
+		 return container_of(node, struct merge_pos, node);
+	return NULL;
+}
+
+/*
+ * Merge items from a number of read-only input roots into a writable
+ * destination root.  The order of the input roots doesn't matter, the
+ * items are merged in sorted key order.
+ *
+ * The merge_cmp callback determines the order that the input items are
+ * merged in.  The is_del callback determines if a merging item should
+ * be removed from the destination.
+ *
+ * subtree indicates that the destination root is in fact one of many
+ * parent blocks and shouldn't be split or allowed to fall below the
+ * join low water mark.
+ *
+ * drop_val indicates the initial length of the value that should be
+ * dropped when merging items into destination items.
+ *
+ * -ERANGE is returned if the merge doesn't fully exhaust the range, due
+ * to allocators running low or needing to join/split the parent.
+ * *next_ret is set to the next key which hasn't been merged so that the
+ * caller can retry with a new allocator and subtree.
+ */
+int scoutfs_btree_merge(struct super_block *sb,
+			struct scoutfs_alloc *alloc,
+			struct scoutfs_block_writer *wri,
+			struct scoutfs_key *start,
+			struct scoutfs_key *end,
+			struct scoutfs_key *next_ret,
+			struct scoutfs_btree_root *root,
+			struct list_head *inputs,
+			scoutfs_btree_merge_cmp_t merge_cmp,
+			scoutfs_btree_merge_is_del_t merge_is_del, bool subtree,
+			int drop_val, int dirty_limit, int alloc_low)
+{
+	struct scoutfs_btree_root_head *rhead;
+	struct rb_root pos_root = RB_ROOT;
+	struct scoutfs_btree_item *item;
+	struct scoutfs_btree_block *bt;
+	struct scoutfs_block *bl = NULL;
+	struct btree_walk_key_range kr;
+	struct scoutfs_avl_node *par;
+	struct merge_pos *mpos;
+	struct merge_pos *tmp;
+	int walk_val_len;
+	int walk_flags;
+	bool is_del;
+	int cmp;
+	int ret;
+
+	trace_scoutfs_btree_merge(sb, root, start, end);
+	scoutfs_inc_counter(sb, btree_merge);
+
+	list_for_each_entry(rhead, inputs, head) {
+		mpos = kmalloc(sizeof(*mpos), GFP_NOFS);
+		if (!mpos) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		RB_CLEAR_NODE(&mpos->node);
+		mpos->key = *start;
+		mpos->root = &rhead->root;
+
+		ret = reset_mpos(sb, &pos_root, mpos, end, merge_cmp);
+		if (ret < 0)
+			goto out;
+	}
+
+	walk_flags = BTW_DIRTY;
+	if (subtree)
+		walk_flags |= BTW_SUBTREE;
+	walk_val_len = 0;
+
+	while ((mpos = first_mpos(&pos_root))) {
+
+		if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) {
+			scoutfs_inc_counter(sb, btree_merge_dirty_limit);
+			ret = -ERANGE;
+			*next_ret = mpos->key;
+			goto out;
+		}
+
+		if (scoutfs_alloc_meta_low(sb, alloc, alloc_low)) {
+			scoutfs_inc_counter(sb, btree_merge_alloc_low);
+			ret = -ERANGE;
+			*next_ret = mpos->key;
+			goto out;
+		}
+
+		scoutfs_block_put(sb, bl);
+		bl = NULL;
+		ret = btree_walk(sb, alloc, wri, root, walk_flags,
+				 &mpos->key, walk_val_len, &bl, &kr, NULL);
+		if (ret < 0) {
+			if (ret == -ERANGE)
+				*next_ret = mpos->key;
+			goto out;
+		}
+		bt = bl->data;
+		scoutfs_inc_counter(sb, btree_merge_walk);
+
+		for (; mpos; mpos = first_mpos(&pos_root)) {
+
+			/* val must have at least what we need to drop */
+			if (mpos->val_len < drop_val) {
+				ret = -EIO;
+				goto out;
+			}
+
+			/* walk to new leaf if we exceed parent ref key */
+			if (scoutfs_key_compare(&mpos->key, &kr.end) > 0)
+				break;
+
+			/* see if there's an existing item */
+			item = leaf_item_hash_search(sb, bt, &mpos->key);
+			is_del = merge_is_del(mpos->val, mpos->val_len);
+
+			trace_scoutfs_btree_merge_items(sb, mpos->root,
+					&mpos->key, mpos->val_len,
+					item ? root : NULL,
+					item ? item_key(item) : NULL,
+					item ? item_val_len(item) : 0, is_del);
+
+			/* rewalk and split if ins/update needs room */
+			if (!is_del && !mid_free_item_room(bt, mpos->val_len)) {
+				walk_flags |= BTW_INSERT;
+				walk_val_len = mpos->val_len;
+				break;
+			}
+
+			/* insert missing non-deletion merge items */
+			if (!item && !is_del) {
+				scoutfs_avl_search(&bt->item_root,
+						   cmp_key_item, &mpos->key,
+						   &cmp, &par, NULL, NULL);
+				create_item(bt, &mpos->key,
+					    mpos->val + drop_val,
+					    mpos->val_len - drop_val, par, cmp);
+				scoutfs_inc_counter(sb, btree_merge_insert);
+			}
+
+			/* update existing items */
+			if (item && !is_del) {
+				update_item_value(bt, item,
+						  mpos->val + drop_val,
+						  mpos->val_len - drop_val);
+				scoutfs_inc_counter(sb, btree_merge_update);
+			}
+
+			/* delete if merge item was deletion */
+			if (item && is_del) {
+				/* rewalk and join if non-root falls under low water mark */
+				if (root->ref.blkno != bt->hdr.blkno &&
+				    !total_above_join_low_water(bt)) {
+					walk_flags |= BTW_DELETE;
+					break;
+				}
+				delete_item(bt, item, NULL);
+				scoutfs_inc_counter(sb, btree_merge_delete);
+			}
+
+			/* reset walk args now that we're not split/join */
+			walk_flags &= ~(BTW_INSERT | BTW_DELETE);
+			walk_val_len = 0;
+
+			/* finished with this merge item */
+			scoutfs_key_inc(&mpos->key);
+			ret = reset_mpos(sb, &pos_root, mpos, end, merge_cmp);
+			if (ret < 0)
+				goto out;
+			mpos = NULL;
+		}
+	}
+
+	ret = 0;
+out:
+	scoutfs_block_put(sb, bl);
+	rbtree_postorder_for_each_entry_safe(mpos, tmp, &pos_root, node) {
+		kfree(mpos);
+	}
+
+	return ret;
+}
+
+/*
+ * Free all the blocks referenced by a btree.  The btree is only read,
+ * this does not update the blocks as it frees.  The caller ensures that
+ * these btrees aren't being modified.
+ *
+ * The caller's key tracks which blocks have been freed.  It must be
+ * initialized to zeros before the first call to start freeing blocks.
+ * Once a block is freed the key is updated such that the freed block
+ * will not be read again.
+ *
+ * Returns 0 when progress has been made successfully, which includes
+ * partial progress.  The key is set to all ones once we've freed all
+ * the blocks.
+ *
+ * This works by descending to the last parent block and freeing all its
+ * leaf blocks without reading them.  As it descends it remembers the
+ * number of parent blocks which were traversed through their final
+ * child ref.  If we free all the leaf blocks then all these parent
+ * blocks are no longer needed and can be freed.  The caller's key is
+ * updated to point past the subtree that we just freed and we retry the
+ * descent from the root through the next set of parents to the next set
+ * of leaf blocks to free.
+ */
+int scoutfs_btree_free_blocks(struct super_block *sb,
+			      struct scoutfs_alloc *alloc,
+			      struct scoutfs_block_writer *wri,
+			      struct scoutfs_key *key,
+			      struct scoutfs_btree_root *root, int alloc_low)
+{
+	u64 blknos[SCOUTFS_BTREE_MAX_HEIGHT];
+	struct scoutfs_block *bl = NULL;
+	struct scoutfs_btree_item *item;
+	struct scoutfs_btree_block *bt;
+	struct scoutfs_block_ref ref;
+	struct scoutfs_avl_node *node;
+	struct scoutfs_avl_node *next;
+	struct scoutfs_key par_next;
+	int nr_par;
+	int level;
+	int ret;
+	int i;
+
+	if (WARN_ON_ONCE(root->height > ARRAY_SIZE(blknos)))
+		return -EIO; /* XXX corruption */
+
+	if (root->height == 0) {
+		scoutfs_key_set_ones(key);
+		return 0;
+	}
+
+	if (scoutfs_key_is_ones(key))
+		return 0;
+
+	/* just free a single leaf block */
+	if (root->height == 1) {
+		ret = scoutfs_free_meta(sb, alloc, wri,
+					le64_to_cpu(root->ref.blkno));
+		if (ret == 0) {
+			trace_scoutfs_btree_free_blocks_single(sb, root,
+						le64_to_cpu(root->ref.blkno));
+			scoutfs_key_set_ones(key);
+		}
+		goto out;
+	}
+
+	for (;;) {
+		/* start the walk at the root block */
+		level = root->height - 1;
+		ref = root->ref;
+		scoutfs_key_set_ones(&par_next);
+		nr_par = 0;
+
+		/* read blocks until we read the last parent */
+		for (;;) {
+			scoutfs_block_put(sb, bl);
+			bl = NULL;
+			ret = get_ref_block(sb, alloc, wri, 0, &ref, &bl);
+			if (ret < 0)
+				goto out;
+			bt = bl->data;
+
+			node = scoutfs_avl_search(&bt->item_root, cmp_key_item,
+						  key, NULL, NULL, &next, NULL);
+			if (node == NULL)
+				node = next;
+
+			/* should never descend into parent with no more refs */
+			if (WARN_ON_ONCE(node == NULL)) {
+				ret = -EIO;
+				goto out;
+			}
+
+			/* we'll free refs in the last parent */
+			if (level == 1)
+				break;
+
+			item = node_item(node);
+			next = scoutfs_avl_next(&bt->item_root, node);
+			if (next) {
+				/* didn't take last ref, still need parents */
+				nr_par = 0;
+				par_next = *item_key(item);
+				scoutfs_key_inc(&par_next);
+			} else {
+				/* final ref, could free after all leaves */
+				blknos[nr_par++] = le64_to_cpu(bt->hdr.blkno);
+			}
+
+			memcpy(&ref, item_val(bt, item), sizeof(ref));
+			level--;
+		}
+
+		/* free all leaf block refs in last parent */
+		while (node) {
+
+			/* make sure we can always free parents after leaves */
+			if (scoutfs_alloc_meta_low(sb, alloc,
+						   alloc_low + nr_par + 1)) {
+				ret = 0;
+				goto out;
+			}
+
+			item = node_item(node);
+			memcpy(&ref, item_val(bt, item), sizeof(ref));
+
+			trace_scoutfs_btree_free_blocks_leaf(sb, root,
+							le64_to_cpu(ref.blkno));
+			ret = scoutfs_free_meta(sb, alloc, wri,
+						le64_to_cpu(ref.blkno));
+			if (ret < 0)
+				goto out;
+
+			node = scoutfs_avl_next(&bt->item_root, node);
+			if (node) {
+				/* done with keys in child we just freed */
+				*key = *item_key(item);
+				scoutfs_key_inc(key);
+			}
+		}
+
+		/* now that leaves are freed, free any empty parents */
+		for (i = 0; i < nr_par; i++) {
+			trace_scoutfs_btree_free_blocks_parent(sb, root,
+							       blknos[i]);
+			ret = scoutfs_free_meta(sb, alloc, wri, blknos[i]);
+			BUG_ON(ret); /* checked meta low, freed should fit */
+		}
+
+		/* restart walk past the subtree we just freed */
+		*key = par_next;
+
+		/* but done if we just freed all parents down right spine */
+		if (scoutfs_key_is_ones(&par_next)) {
+			ret = 0;
+			goto out;
+		}
+	}
+
+out:
+	scoutfs_block_put(sb, bl);
+	return ret;
+}
--- a/kmod/src/btree.h
+++ b/kmod/src/btree.h
@@ -82,6 +82,58 @@ int scoutfs_btree_insert_list(struct super_block *sb,
 			      struct scoutfs_btree_root *root,
 			      struct scoutfs_btree_item_list *lst);

+int scoutfs_btree_parent_range(struct super_block *sb,
+			       struct scoutfs_btree_root *root,
+			       struct scoutfs_key *key,
+			       struct scoutfs_key *start,
+			       struct scoutfs_key *end);
+int scoutfs_btree_get_parent(struct super_block *sb,
+			     struct scoutfs_btree_root *root,
+			     struct scoutfs_key *key,
+			     struct scoutfs_btree_root *par_root);
+int scoutfs_btree_set_parent(struct super_block *sb,
+			     struct scoutfs_alloc *alloc,
+			     struct scoutfs_block_writer *wri,
+			     struct scoutfs_btree_root *root,
+			     struct scoutfs_key *key,
+			     struct scoutfs_btree_root *par_root);
+int scoutfs_btree_rebalance(struct super_block *sb,
+			    struct scoutfs_alloc *alloc,
+			    struct scoutfs_block_writer *wri,
+			    struct scoutfs_btree_root *root,
+			    struct scoutfs_key *key);
+
+/* merge input is a list of roots */
+struct scoutfs_btree_root_head {
+	struct list_head head;
+	struct scoutfs_btree_root root;
+};
+/*
+ * Compare the values of merge input items whose keys are equal to
+ * determine their merge order.
+ */
+typedef int (*scoutfs_btree_merge_cmp_t)(void *a_val, int a_val_len,
+					 void *b_val, int b_val_len);
+/* whether merging item should be removed from destination */
+typedef bool (*scoutfs_btree_merge_is_del_t)(void *val, int val_len);
+int scoutfs_btree_merge(struct super_block *sb,
+			struct scoutfs_alloc *alloc,
+			struct scoutfs_block_writer *wri,
+			struct scoutfs_key *start,
+			struct scoutfs_key *end,
+			struct scoutfs_key *next_ret,
+			struct scoutfs_btree_root *root,
+			struct list_head *input_list,
+			scoutfs_btree_merge_cmp_t merge_cmp,
+			scoutfs_btree_merge_is_del_t merge_is_del, bool subtree,
+			int drop_val, int dirty_limit, int alloc_low);
+
+int scoutfs_btree_free_blocks(struct super_block *sb,
+			      struct scoutfs_alloc *alloc,
+			      struct scoutfs_block_writer *wri,
+			      struct scoutfs_key *key,
+			      struct scoutfs_btree_root *root, int alloc_low);
+
 void scoutfs_btree_put_iref(struct scoutfs_btree_item_ref *iref);

 #endif
--- a/kmod/src/client.c
+++ b/kmod/src/client.c
@@ -48,6 +48,7 @@ struct client_info {

 	struct workqueue_struct *workq;
 	struct delayed_work connect_dwork;
+	unsigned long connect_delay_jiffies;

 	u64 server_term;

@@ -216,6 +217,26 @@ int scoutfs_client_srch_commit_compact(struct super_block *sb,
 					res, sizeof(*res), NULL, 0);
 }

+/*
+ * Synchronously send a GET_LOG_MERGE request over the client's
+ * connection.  No request payload is given; req and its size are
+ * passed as the trailing buffer (presumably where the reply lands --
+ * confirm against scoutfs_net_sync_request()).
+ */
+int scoutfs_client_get_log_merge(struct super_block *sb,
+				 struct scoutfs_log_merge_request *req)
+{
+	struct client_info *client = SCOUTFS_SB(sb)->client_info;
+	int ret;
+
+	ret = scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_GET_LOG_MERGE,
+				       NULL, 0, req, sizeof(*req));
+	return ret;
+}
+
+/*
+ * Synchronously send a COMMIT_LOG_MERGE request over the client's
+ * connection with comp as the leading buffer (presumably the request
+ * payload -- confirm).  No trailing buffer is supplied.
+ */
+int scoutfs_client_commit_log_merge(struct super_block *sb,
+				    struct scoutfs_log_merge_complete *comp)
+{
+	struct client_info *client = SCOUTFS_SB(sb)->client_info;
+	int ret;
+
+	ret = scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_COMMIT_LOG_MERGE,
+				       comp, sizeof(*comp), NULL, 0);
+	return ret;
+}
+
 int scoutfs_client_send_omap_response(struct super_block *sb, u64 id,
 				      struct scoutfs_open_ino_map *map)
 {
@@ -249,6 +270,41 @@ int scoutfs_client_open_ino_map(struct super_block *sb, u64 group_nr,
 					&args, sizeof(args), map, sizeof(*map));
 }

+/*
+ * Ask the server for the current volume options.  Nothing is passed
+ * as the leading buffer; volopt and its size are the trailing buffer.
+ */
+int scoutfs_client_get_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt)
+{
+	struct client_info *client = SCOUTFS_SB(sb)->client_info;
+	int ret;
+
+	ret = scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_GET_VOLOPT,
+				       NULL, 0, volopt, sizeof(*volopt));
+	return ret;
+}
+
+/*
+ * Ask the server to update volume options.  volopt is passed as the
+ * leading buffer and no trailing buffer is supplied.
+ */
+int scoutfs_client_set_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt)
+{
+	struct client_info *client = SCOUTFS_SB(sb)->client_info;
+	int ret;
+
+	ret = scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_SET_VOLOPT,
+				       volopt, sizeof(*volopt), NULL, 0);
+	return ret;
+}
+
+/*
+ * Ask the server to clear volume options.  volopt is passed as the
+ * leading buffer and no trailing buffer is supplied.
+ */
+int scoutfs_client_clear_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt)
+{
+	struct client_info *client = SCOUTFS_SB(sb)->client_info;
+	int ret;
+
+	ret = scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_CLEAR_VOLOPT,
+				       volopt, sizeof(*volopt), NULL, 0);
+	return ret;
+}
+
+/*
+ * Synchronously send a RESIZE_DEVICES request over the client's
+ * connection with nrd as the leading buffer and no trailing buffer.
+ */
+int scoutfs_client_resize_devices(struct super_block *sb, struct scoutfs_net_resize_devices *nrd)
+{
+	struct client_info *client = SCOUTFS_SB(sb)->client_info;
+	int ret;
+
+	ret = scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_RESIZE_DEVICES,
+				       nrd, sizeof(*nrd), NULL, 0);
+	return ret;
+}
+
 /* The client is receiving a invalidation request from the server */
 static int client_lock(struct super_block *sb,
 		       struct scoutfs_net_connection *conn, u8 cmd, u64 id,
@@ -322,6 +378,7 @@ static int client_greeting(struct super_block *sb,
 	scoutfs_net_client_greeting(sb, conn, new_server);

 	client->server_term = le64_to_cpu(gr->server_term);
+	client->connect_delay_jiffies = 0;
 	ret = 0;
 out:
 	return ret;
@@ -371,6 +428,20 @@ out:
 	return ret;
 }

+/*
+ * Queue the connect work after the client's current
+ * connect_delay_jiffies, unless the client is shutting down or the
+ * mount is being forcibly unmounted.
+ *
+ * The delay implements back off when connections aren't succeeding:
+ * each connection attempt starts by setting a long delay and only a
+ * greeting response from the server sets a shorter one.  Once we've
+ * seen a greeting a broken connection is retried immediately.
+ */
+static void queue_connect_dwork(struct super_block *sb, struct client_info *client)
+{
+	if (atomic_read(&client->shutting_down) || scoutfs_forcing_unmount(sb))
+		return;
+
+	queue_delayed_work(client->workq, &client->connect_dwork,
+			   client->connect_delay_jiffies);
+}
+
 /*
 * This work is responsible for maintaining a connection from the client
 * to the server.  It's queued on mount and disconnect and we requeue
@@ -410,6 +481,9 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
 		goto out;
 	}

+	/* always wait a bit until a greeting response sets a lower delay */
+	client->connect_delay_jiffies = msecs_to_jiffies(CLIENT_CONNECT_DELAY_MS);
+
 	ret = scoutfs_quorum_server_sin(sb, &sin);
 	if (ret < 0)
 		goto out;
@@ -437,11 +511,8 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
 	if (ret)
 		scoutfs_net_shutdown(sb, client->conn);
 out:
-
-	/* always have a small delay before retrying to avoid storms */
-	if (ret && !atomic_read(&client->shutting_down))
-		queue_delayed_work(client->workq, &client->connect_dwork,
-				   msecs_to_jiffies(CLIENT_CONNECT_DELAY_MS));
+	if (ret)
+		queue_connect_dwork(sb, client);
 }

 static scoutfs_net_request_t client_req_funcs[] = {
@@ -460,8 +531,7 @@ static void client_notify_down(struct super_block *sb,
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;

-	if (!atomic_read(&client->shutting_down))
-		queue_delayed_work(client->workq, &client->connect_dwork, 0);
+	queue_connect_dwork(sb, client);
 }

 int scoutfs_client_setup(struct super_block *sb)
@@ -496,7 +566,7 @@ int scoutfs_client_setup(struct super_block *sb)
 		goto out;
 	}

-	queue_delayed_work(client->workq, &client->connect_dwork, 0);
+	queue_connect_dwork(sb, client);
 	ret = 0;

 out:
@@ -553,7 +623,7 @@ void scoutfs_client_destroy(struct super_block *sb)
 	if (client == NULL)
 		return;

-	if (client->server_term != 0) {
+	if (client->server_term != 0 && !scoutfs_forcing_unmount(sb)) {
 		client->sending_farewell = true;
 		ret = scoutfs_net_submit_request(sb, client->conn,
 						 SCOUTFS_NET_CMD_FAREWELL,
@@ -561,10 +631,8 @@ void scoutfs_client_destroy(struct super_block *sb)
 						 client_farewell_response,
 						 NULL, NULL);
 		if (ret == 0) {
-			ret = wait_for_completion_interruptible(
-							&client->farewell_comp);
-			if (ret == 0)
-				ret = client->farewell_error;
+			wait_for_completion(&client->farewell_comp);
+			ret = client->farewell_error;
 		}
 		if (ret) {
 			scoutfs_inc_counter(sb, client_farewell_error);
--- a/kmod/src/client.h
+++ b/kmod/src/client.h
@@ -22,10 +22,18 @@ int scoutfs_client_srch_get_compact(struct super_block *sb,
 				    struct scoutfs_srch_compact *sc);
 int scoutfs_client_srch_commit_compact(struct super_block *sb,
 				       struct scoutfs_srch_compact *res);
+int scoutfs_client_get_log_merge(struct super_block *sb,
+				 struct scoutfs_log_merge_request *req);
+int scoutfs_client_commit_log_merge(struct super_block *sb,
+				    struct scoutfs_log_merge_complete *comp);
 int scoutfs_client_send_omap_response(struct super_block *sb, u64 id,
 				      struct scoutfs_open_ino_map *map);
 int scoutfs_client_open_ino_map(struct super_block *sb, u64 group_nr,
 				struct scoutfs_open_ino_map *map);
+int scoutfs_client_get_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
+int scoutfs_client_set_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
+int scoutfs_client_clear_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
+int scoutfs_client_resize_devices(struct super_block *sb, struct scoutfs_net_resize_devices *nrd);

 int scoutfs_client_setup(struct super_block *sb);
 void scoutfs_client_destroy(struct super_block *sb);
--- a/kmod/src/counters.h
+++ b/kmod/src/counters.h
@@ -44,6 +44,14 @@
 	EXPAND_COUNTER(btree_insert)				\
 	EXPAND_COUNTER(btree_leaf_item_hash_search)		\
 	EXPAND_COUNTER(btree_lookup)				\
+	EXPAND_COUNTER(btree_merge)				\
+	EXPAND_COUNTER(btree_merge_alloc_low)			\
+	EXPAND_COUNTER(btree_merge_delete)			\
+	EXPAND_COUNTER(btree_merge_dirty_limit)			\
+	EXPAND_COUNTER(btree_merge_drop_old)			\
+	EXPAND_COUNTER(btree_merge_insert)			\
+	EXPAND_COUNTER(btree_merge_update)			\
+	EXPAND_COUNTER(btree_merge_walk)			\
 	EXPAND_COUNTER(btree_next)				\
 	EXPAND_COUNTER(btree_prev)				\
 	EXPAND_COUNTER(btree_split)				\
@@ -112,6 +120,9 @@
 	EXPAND_COUNTER(item_write_dirty)			\
 	EXPAND_COUNTER(lock_alloc)				\
 	EXPAND_COUNTER(lock_free)				\
+	EXPAND_COUNTER(lock_grace_extended)			\
+	EXPAND_COUNTER(lock_grace_set)				\
+	EXPAND_COUNTER(lock_grace_wait)				\
 	EXPAND_COUNTER(lock_grant_request)			\
 	EXPAND_COUNTER(lock_grant_response)			\
 	EXPAND_COUNTER(lock_grant_work)				\
@@ -140,6 +151,12 @@
 	EXPAND_COUNTER(net_recv_invalid_message)		\
 	EXPAND_COUNTER(net_recv_messages)			\
 	EXPAND_COUNTER(net_unknown_request)			\
+	EXPAND_COUNTER(orphan_scan)				\
+	EXPAND_COUNTER(orphan_scan_cached)			\
+	EXPAND_COUNTER(orphan_scan_error)			\
+	EXPAND_COUNTER(orphan_scan_item)			\
+	EXPAND_COUNTER(orphan_scan_omap_set)			\
+	EXPAND_COUNTER(orphan_scan_read)			\
 	EXPAND_COUNTER(quorum_elected)				\
 	EXPAND_COUNTER(quorum_fence_error)			\
 	EXPAND_COUNTER(quorum_fence_leader)			\
--- a/kmod/src/data.c
+++ b/kmod/src/data.c
@@ -207,6 +207,7 @@ static s64 truncate_extents(struct super_block *sb, struct inode *inode,
 	u64 offset;
 	s64 ret;
 	u8 flags;
+	int err;
 	int i;

 	flags = offline ? SEF_OFFLINE : 0;
@@ -246,6 +247,18 @@ static s64 truncate_extents(struct super_block *sb, struct inode *inode,
 		tr.len = min(ext.len - offset, last - iblock + 1);
 		tr.flags = ext.flags;

+		trace_scoutfs_data_extent_truncated(sb, ino, &tr);
+
+		ret = scoutfs_ext_set(sb, &data_ext_ops, &args,
+				      tr.start, tr.len, 0, flags);
+		if (ret < 0) {
+			if (WARN_ON_ONCE(ret == -EINVAL)) {
+				scoutfs_err(sb, "unexpected truncate inconsistency: ino %llu iblock %llu last %llu, start %llu len %llu",
+					    ino, iblock, last, tr.start, tr.len);
+			}
+			break;
+		}
+
 		if (tr.map) {
 			mutex_lock(&datinf->mutex);
 			ret = scoutfs_free_data(sb, datinf->alloc,
@@ -253,16 +266,16 @@ static s64 truncate_extents(struct super_block *sb, struct inode *inode,
 						&datinf->data_freed,
 						tr.map, tr.len);
 			mutex_unlock(&datinf->mutex);
-			if (ret < 0)
+			if (ret < 0) {
+				err = scoutfs_ext_set(sb, &data_ext_ops, &args,
+						      tr.start, tr.len, tr.map, tr.flags);
+				if (err < 0)
+					scoutfs_err(sb, "truncate err %d restoring extent after error %lld: ino %llu start %llu len %llu",
+						    err, ret, ino, tr.start, tr.len);
 				break;
+			}
 		}

-		trace_scoutfs_data_extent_truncated(sb, ino, &tr);
-
-		ret = scoutfs_ext_set(sb, &data_ext_ops, &args,
-				      tr.start, tr.len, 0, flags);
-		BUG_ON(ret);  /* inconsistent, could prealloc items */
-
 		iblock += tr.len;
 	}

@@ -312,10 +325,9 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,

 	while (iblock <= last) {
 		if (inode)
-			ret = scoutfs_inode_index_lock_hold(inode, &ind_locks,
-							    true);
+			ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
 		else
-			ret = scoutfs_hold_trans(sb);
+			ret = scoutfs_hold_trans(sb, false);
 		if (ret)
 			break;

@@ -756,8 +768,7 @@ retry:
 		ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 		      scoutfs_inode_index_prepare(sb, &wbd->ind_locks, inode,
 						  true) ?:
-		      scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks,
-							ind_seq);
+		      scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks, ind_seq, true);
 	} while (ret > 0);
 	if (ret < 0)
 		goto out;
@@ -1010,7 +1021,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)

 	while(iblock <= last) {

-		ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
+		ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
 		if (ret)
 			goto out;

@@ -1020,8 +1031,10 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 			end = (iblock + ret) << SCOUTFS_BLOCK_SM_SHIFT;
 			if (end > offset + len)
 				end = offset + len;
-			if (end > i_size_read(inode))
+			if (end > i_size_read(inode)) {
 				i_size_write(inode, end);
+				scoutfs_inode_inc_data_version(inode);
+			}
 		}
 		if (ret >= 0)
 			scoutfs_update_inode_item(inode, lock, &ind_locks);
@@ -1086,7 +1099,7 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
 	}

 	/* we're updating meta_seq with offline block count */
-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
 	if (ret < 0)
 		goto out;

@@ -1238,7 +1251,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
 		ret = scoutfs_inode_index_start(sb, &seq) ?:
 		      scoutfs_inode_index_prepare(sb, &locks, from, true) ?:
 		      scoutfs_inode_index_prepare(sb, &locks, to, true) ?:
-		      scoutfs_inode_index_try_lock_hold(sb, &locks, seq);
+		      scoutfs_inode_index_try_lock_hold(sb, &locks, seq, false);
 		if (ret > 0)
 			continue;
 		if (ret < 0)
@@ -1844,13 +1857,17 @@ int scoutfs_data_prepare_commit(struct super_block *sb)
 	return ret;
 }

-u64 scoutfs_data_alloc_free_bytes(struct super_block *sb)
+/*
+ * Return true if the data allocator is lower than the caller's
+ * requirement and we haven't been told by the server that we're out of
+ * free extents.
+ */
+bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks)
 {
 	DECLARE_DATA_INFO(sb, datinf);

-	return scoutfs_dalloc_total_len(&datinf->dalloc) <<
-		SCOUTFS_BLOCK_SM_SHIFT;
-
+	return (scoutfs_dalloc_total_len(&datinf->dalloc) < blocks) &&
+	       !(le32_to_cpu(datinf->dalloc.root.flags) & SCOUTFS_ALLOC_FLAG_LOW);
 }

 int scoutfs_data_setup(struct super_block *sb)
--- a/kmod/src/data.h
+++ b/kmod/src/data.h
@@ -86,7 +86,7 @@ void scoutfs_data_init_btrees(struct super_block *sb,
 void scoutfs_data_get_btrees(struct super_block *sb,
 			     struct scoutfs_log_trees *lt);
 int scoutfs_data_prepare_commit(struct super_block *sb);
-u64 scoutfs_data_alloc_free_bytes(struct super_block *sb);
+bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks);

 int scoutfs_data_setup(struct super_block *sb);
 void scoutfs_data_destroy(struct super_block *sb);
--- a/kmod/src/dir.c
+++ b/kmod/src/dir.c
@@ -253,7 +253,7 @@ static u64 dirent_name_hash(const char *name, unsigned int name_len)
              ((u64)dirent_name_fingerprint(name, name_len) << 32);
 }

-static u64 dirent_names_equal(const char *a_name, unsigned int a_len,
+static bool dirent_names_equal(const char *a_name, unsigned int a_len,
 			      const char *b_name, unsigned int b_len)
 {
 	return a_len == b_len && memcmp(a_name, b_name, a_len) == 0;
@@ -462,7 +462,7 @@ out:
 	else if (ino == 0)
 		inode = NULL;
 	else
-		inode = scoutfs_iget(sb, ino);
+		inode = scoutfs_iget(sb, ino, 0);

 	/*
 	 * We can't splice dir aliases into the dcache.  dir entries
@@ -669,6 +669,7 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
 				      umode_t mode, dev_t rdev,
 				      struct scoutfs_lock **dir_lock,
 				      struct scoutfs_lock **inode_lock,
+				      struct scoutfs_lock **orph_lock,
 				      struct list_head *ind_locks)
 {
 	struct super_block *sb = dir->i_sb;
@@ -701,11 +702,17 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
 	if (ret)
 		goto out_unlock;

+	if (orph_lock) {
+		ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, orph_lock);
+		if (ret < 0)
+			goto out_unlock;
+	}
+
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, ind_locks, dir, true) ?:
 	      scoutfs_inode_index_prepare_ino(sb, ind_locks, ino, mode) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq, true);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -725,9 +732,13 @@ out_unlock:
 	if (ret) {
 		scoutfs_inode_index_unlock(sb, ind_locks);
 		scoutfs_unlock(sb, *dir_lock, SCOUTFS_LOCK_WRITE);
-		scoutfs_unlock(sb, *inode_lock, SCOUTFS_LOCK_WRITE);
 		*dir_lock = NULL;
+		scoutfs_unlock(sb, *inode_lock, SCOUTFS_LOCK_WRITE);
 		*inode_lock = NULL;
+		if (orph_lock) {
+			scoutfs_unlock(sb, *orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
+			*orph_lock = NULL;
+		}

 		inode = ERR_PTR(ret);
 	}
@@ -752,7 +763,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,

 	hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
 	inode = lock_hold_create(dir, dentry, mode, rdev,
-				 &dir_lock, &inode_lock, &ind_locks);
+				 &dir_lock, &inode_lock, NULL, &ind_locks);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);

@@ -813,13 +824,15 @@ static int scoutfs_link(struct dentry *old_dentry,
 	struct super_block *sb = dir->i_sb;
 	struct scoutfs_lock *dir_lock;
 	struct scoutfs_lock *inode_lock = NULL;
+	struct scoutfs_lock *orph_lock = NULL;
 	LIST_HEAD(ind_locks);
-	bool del_orphan;
+	bool del_orphan = false;
 	u64 dir_size;
 	u64 ind_seq;
 	u64 hash;
 	u64 pos;
 	int ret;
+	int err;

 	hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);

@@ -843,13 +856,20 @@ static int scoutfs_link(struct dentry *old_dentry,
 		goto out_unlock;

 	dir_size = i_size_read(dir) + dentry->d_name.len;
-	del_orphan = (inode->i_nlink == 0);
+
+	if (inode->i_nlink == 0) {
+		del_orphan = true;
+		ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
+					  &orph_lock);
+		if (ret < 0)
+			goto out_unlock;
+	}

 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -860,7 +880,7 @@ retry:
 		goto out;

 	if (del_orphan) {
-		ret = scoutfs_orphan_dirty(sb, scoutfs_ino(inode));
+		ret = scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock);
 		if (ret)
 			goto out;
 	}
@@ -871,8 +891,11 @@ retry:
 			      dentry->d_name.name, dentry->d_name.len,
 			      scoutfs_ino(inode), inode->i_mode, dir_lock,
 			      inode_lock);
-	if (ret)
+	if (ret) { /* only restore the orphan item if this call deleted it above */
+		err = del_orphan ? scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock) : 0;
+		WARN_ON_ONCE(err); /* no orphan, might not scan and delete after crash */
 		goto out;
+	}
 	update_dentry_info(sb, dentry, hash, pos, dir_lock);

 	i_size_write(dir, dir_size);
@@ -880,11 +903,6 @@ retry:
 	inode->i_ctime = dir->i_mtime;
 	inc_nlink(inode);

-	if (del_orphan) {
-		ret = scoutfs_orphan_delete(sb, scoutfs_ino(inode));
-		WARN_ON_ONCE(ret);
-	}
-
 	scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
 	scoutfs_update_inode_item(dir, dir_lock, &ind_locks);

@@ -896,6 +914,8 @@ out_unlock:
 	scoutfs_inode_index_unlock(sb, &ind_locks);
 	scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
+	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
+
 	return ret;
 }

@@ -920,6 +940,7 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	struct timespec ts = current_kernel_time();
 	struct scoutfs_lock *inode_lock = NULL;
+	struct scoutfs_lock *orph_lock = NULL;
 	struct scoutfs_lock *dir_lock = NULL;
 	LIST_HEAD(ind_locks);
 	u64 ind_seq;
@@ -937,32 +958,36 @@ static int scoutfs_unlink(struct inode *dir, struct dentry *dentry)
 		goto unlock;
 	}

+	if (should_orphan(inode)) {
+		ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(inode),
+					  &orph_lock);
+		if (ret < 0)
+			goto unlock;
+	}
+
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
 	if (ret > 0)
 		goto retry;
 	if (ret)
 		goto unlock;

+	if (should_orphan(inode)) {
+		ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
+		if (ret < 0)
+			goto out;
+	}
+
 	ret = del_entry_items(sb, scoutfs_ino(dir), dentry_info_hash(dentry),
 			      dentry_info_pos(dentry), scoutfs_ino(inode),
 			      dir_lock, inode_lock);
-	if (ret)
+	if (ret) { /* undo the orphan create above, keeping the del_entry_items error in ret */
+		if (should_orphan(inode)) /* should have been dirty */
+			WARN_ON_ONCE(scoutfs_inode_orphan_delete(sb, scoutfs_ino(inode), orph_lock));
 		goto out;
-
-	if (should_orphan(inode)) {
-		/*
-		 * Insert the orphan item before we modify any inode
-		 * metadata so we can gracefully exit should it
-		 * fail.
-		 */
-		ret = scoutfs_orphan_inode(inode);
-		WARN_ON_ONCE(ret); /* XXX returning error but items deleted */
-		if (ret)
-			goto out;
 	}

 	dir->i_ctime = ts;
@@ -984,6 +1009,7 @@ unlock:
 	scoutfs_inode_index_unlock(sb, &ind_locks);
 	scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
+	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);

 	return ret;
 }
@@ -1176,7 +1202,7 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
 		return ret;

 	inode = lock_hold_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0,
-				 &dir_lock, &inode_lock, &ind_locks);
+				 &dir_lock, &inode_lock, NULL, &ind_locks);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);

@@ -1535,6 +1561,7 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct scoutfs_lock *new_dir_lock = NULL;
 	struct scoutfs_lock *old_inode_lock = NULL;
 	struct scoutfs_lock *new_inode_lock = NULL;
+	struct scoutfs_lock *orph_lock = NULL;
 	struct timespec now;
 	bool ins_new = false;
 	bool del_new = false;
@@ -1599,6 +1626,13 @@ static int scoutfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (ret)
 		goto out_unlock;

+	if (should_orphan(new_inode)) {
+		ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, scoutfs_ino(new_inode),
+					  &orph_lock);
+		if (ret < 0)
+			goto out_unlock;
+	}
+
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, old_dir, false) ?:
@@ -1607,7 +1641,7 @@ retry:
 	       scoutfs_inode_index_prepare(sb, &ind_locks, new_dir, false)) ?:
 	      (new_inode == NULL ? 0 :
 	       scoutfs_inode_index_prepare(sb, &ind_locks, new_inode, false)) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -1658,7 +1692,7 @@ retry:
 	ins_old = true;

 	if (should_orphan(new_inode)) {
-		ret = scoutfs_orphan_inode(new_inode);
+		ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(new_inode), orph_lock);
 		if (ret)
 			goto out;
 	}
@@ -1762,6 +1796,7 @@ out_unlock:
 	scoutfs_unlock(sb, old_dir_lock, SCOUTFS_LOCK_WRITE);
 	scoutfs_unlock(sb, new_dir_lock, SCOUTFS_LOCK_WRITE);
 	scoutfs_unlock(sb, rename_lock, SCOUTFS_LOCK_WRITE);
+	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);

 	return ret;
 }
@@ -1781,6 +1816,7 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
 	struct inode *inode = NULL;
 	struct scoutfs_lock *dir_lock = NULL;
 	struct scoutfs_lock *inode_lock = NULL;
+	struct scoutfs_lock *orph_lock = NULL;
 	LIST_HEAD(ind_locks);
 	int ret;

@@ -1788,25 +1824,32 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod
 		return -ENAMETOOLONG;

 	inode = lock_hold_create(dir, dentry, mode, 0,
-				 &dir_lock, &inode_lock, &ind_locks);
+				 &dir_lock, &inode_lock, &orph_lock, &ind_locks);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);

+	ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock);
+	if (ret < 0) {
+		iput(inode);
+		goto out; /* XXX returning error but items created */
+	}
+
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	insert_inode_hash(inode);
+	ihold(inode); /* need to update inode modifications in d_tmpfile */
 	d_tmpfile(dentry, inode);

 	scoutfs_update_inode_item(inode, inode_lock, &ind_locks);
 	scoutfs_update_inode_item(dir, dir_lock, &ind_locks);
 	scoutfs_inode_index_unlock(sb, &ind_locks);
+	iput(inode);

-	ret = scoutfs_orphan_inode(inode);
-	WARN_ON_ONCE(ret); /* XXX returning error but items deleted */
-
+out:
 	scoutfs_release_trans(sb);
 	scoutfs_inode_index_unlock(sb, &ind_locks);
 	scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE);
 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
+	scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);

 	return ret;
 }
--- a/kmod/src/export.c
+++ b/kmod/src/export.c
@@ -81,7 +81,7 @@ static struct dentry *scoutfs_fh_to_dentry(struct super_block *sb,
 	trace_scoutfs_fh_to_dentry(sb, fh_type, sfid);

 	if (scoutfs_valid_fileid(fh_type))
-		inode = scoutfs_iget(sb, le64_to_cpu(sfid->ino));
+		inode = scoutfs_iget(sb, le64_to_cpu(sfid->ino), 0);

 	return d_obtain_alias(inode);
 }
@@ -100,7 +100,7 @@ static struct dentry *scoutfs_fh_to_parent(struct super_block *sb,

 	if (scoutfs_valid_fileid(fh_type) &&
 	    fh_type == FILEID_SCOUTFS_WITH_PARENT)
-		inode = scoutfs_iget(sb, le64_to_cpu(sfid->parent_ino));
+		inode = scoutfs_iget(sb, le64_to_cpu(sfid->parent_ino), 0);

 	return d_obtain_alias(inode);
 }
@@ -123,7 +123,7 @@ static struct dentry *scoutfs_get_parent(struct dentry *child)
 	scoutfs_dir_free_backref_path(sb, &list);
 	trace_scoutfs_get_parent(sb, inode, ino);

-	inode = scoutfs_iget(sb, ino);
+	inode = scoutfs_iget(sb, ino, 0);

 	return d_obtain_alias(inode);
 }
--- a/kmod/src/fence.c
+++ b/kmod/src/fence.c
@@ -0,0 +1,480 @@
+/*
+ * Copyright (C) 2019 Versity Software, Inc.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/device.h>
+#include <linux/timer.h>
+#include <asm/barrier.h>
+
+#include "super.h"
+#include "msg.h"
+#include "sysfs.h"
+#include "server.h"
+#include "fence.h"
+
+/*
+ * Fencing ensures that a given mount can no longer write to the
+ * metadata or data devices.  It's necessary to ensure that it's safe to
+ * give another mount access to a resource that is currently owned by a
+ * mount that has stopped responding.
+ *
+ * Fencing is performed in collaboration between the currently elected
+ * quorum leader mount and userspace running on its host.  The kernel
+ * creates fencing requests as it notices that mounts have stopped
+ * participating.  The fence requests are published as directories in
+ * sysfs.  Userspace agents watch for directories, take action, and
+ * write to files in the directory to indicate that the mount has been
+ * fenced.  Once the mount is fenced the server can reclaim the
+ * resources previously held by the fenced mount.
+ *
+ * The fence requests contain metadata identifying the specific instance
+ * of the mount that needs to be fenced.  This lets a fencing agent
+ * ensure that a specific mount has been fenced without necessarily
+ * destroying the node that was hosting it.  Maybe the node had rebooted
+ * and the mount is no longer there, maybe the mount can be force
+ * unmounted, maybe the node can be configured to isolate the mount from
+ * the devices.
+ *
+ * The fencing mechanism is asynchronous and can fail but the server
+ * cannot make progress until it completes.  If a fence request times
+ * out the server shuts down in the hope that another instance of a
+ * server might have more luck fencing a non-responsive mount.
+ *
+ * Sources of fencing are fundamentally anchored in shared persistent
+ * state.  It is possible, though unlikely, that servers can fence a
+ * node and then themselves fail, leaving the next server to try and
+ * fence the mount again.
+ */
+
+/* Per-super fencing state, reached through SCOUTFS_SB(sb)->fence_info. */
+struct fence_info {
+	struct kset *kset;		/* its kobj parents each request's sysfs directory */
+	struct kobject fence_dir_kobj;
+	struct workqueue_struct *wq;
+	wait_queue_head_t waitq;	/* woken when a request is marked fenced or errored */
+	spinlock_t lock;		/* held while adding to and walking list */
+	struct list_head list;		/* pending_fence structs linked by ->entry */
+};
+
+#define DECLARE_FENCE_INFO(sb, name) \
+	struct fence_info *name = SCOUTFS_SB(sb)->fence_info
+
+/* One outstanding request to fence the mount identified by rid. */
+struct pending_fence {
+	struct super_block *sb;
+	struct scoutfs_sysfs_attrs ssa;	/* sysfs dir; FENCE_FROM_KOBJ() maps back from it */
+	struct list_head entry;		/* link in fence_info.list */
+	struct timer_list timer;	/* runs fence_timeout() if not cancelled in time */
+
+	ktime_t start_kt;		/* ktime_get() when the request was started */
+	__be32 ipv4_addr;		/* reported through the ipv4_addr file */
+	bool fenced;			/* set by a write to the fenced file */
+	bool error;			/* set by a write to the error file or by the timeout */
+	int reason;			/* SCOUTFS_FENCE_* value, shown as a string by reason */
+	u64 rid;			/* also names the sysfs directory as %016llx */
+};
+
+#define FENCE_FROM_KOBJ(kobj)					\
+	container_of(SCOUTFS_SYSFS_ATTRS(kobj), struct pending_fence, ssa)
+#define DECLARE_FENCE_FROM_KOBJ(name, kobj)				\
+	struct pending_fence *name = FENCE_FROM_KOBJ(kobj)
+
+/*
+ * Tear down a fence request: remove its sysfs attributes, stop its
+ * timer (waiting for a running callback to finish), and free it.  The
+ * request is not removed from the fence list here.
+ */
+static void destroy_fence(struct pending_fence *fence)
+{
+	scoutfs_sysfs_destroy_attrs(fence->sb, &fence->ssa);
+	del_timer_sync(&fence->timer);
+	kfree(fence);
+}
+
+/*
+ * sysfs read of elapsed_secs: the whole seconds between the request's
+ * start time and now, or 0 if now isn't after the start time.
+ */
+static ssize_t elapsed_secs_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *buf)
+{
+	DECLARE_FENCE_FROM_KOBJ(fence, kobj);
+	ktime_t now = ktime_get();
+	struct timeval tv = { 0, };
+
+	if (ktime_after(now, fence->start_kt))
+		tv = ktime_to_timeval(ktime_sub(now, fence->start_kt));
+
+	/* the cast has to match the unsigned %llu conversion */
+	return snprintf(buf, PAGE_SIZE, "%llu", (unsigned long long)tv.tv_sec);
+}
+SCOUTFS_ATTR_RO(elapsed_secs);
+
+/* sysfs read of fenced: "1" once the request has been marked fenced, else "0". */
+static ssize_t fenced_show(struct kobject *kobj, struct kobj_attribute *attr,
+			   char *buf)
+{
+	DECLARE_FENCE_FROM_KOBJ(fence, kobj);
+	unsigned int val = fence->fenced ? 1 : 0;
+
+	return snprintf(buf, PAGE_SIZE, "%u", val);
+}
+
+/*
+ * Any write to the fenced file from userspace indicates that the mount
+ * has been safely fenced and can no longer write to the shared device.
+ * The first such write cancels the timeout timer, marks the request
+ * fenced, and wakes waiters; the written bytes are ignored and always
+ * reported as consumed.
+ */
+static ssize_t fenced_store(struct kobject *kobj, struct kobj_attribute *attr,
+			    const char *buf, size_t count)
+{
+	DECLARE_FENCE_FROM_KOBJ(fence, kobj);
+	DECLARE_FENCE_INFO(fence->sb, fi);
+
+	if (fence->fenced)
+		return count;
+
+	del_timer_sync(&fence->timer);
+	fence->fenced = true;
+	wake_up(&fi->waitq);
+
+	return count;
+}
+SCOUTFS_ATTR_RW(fenced);
+
+/* sysfs read of error: "1" once the request has been marked errored, else "0". */
+static ssize_t error_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	DECLARE_FENCE_FROM_KOBJ(fence, kobj);
+	unsigned int val = fence->error ? 1 : 0;
+
+	return snprintf(buf, PAGE_SIZE, "%u", val);
+}
+
+/*
+ * A write to the error file is how the fencing agent reports that it
+ * was unable to fence the given mount.  The first such write marks the
+ * request errored, logs the rid, and wakes waiters.  We can't continue
+ * if the mount can't be isolated so the server is expected to shut
+ * down in response.  The written bytes are ignored.
+ */
+static ssize_t error_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf,
+			   size_t count)
+{
+	DECLARE_FENCE_FROM_KOBJ(fence, kobj);
+	struct super_block *sb = fence->sb;
+	DECLARE_FENCE_INFO(sb, fi);
+
+	if (fence->error)
+		return count;
+
+	fence->error = true;
+	scoutfs_err(sb, "error indicated by fence action for rid %016llx", fence->rid);
+	wake_up(&fi->waitq);
+
+	return count;
+}
+SCOUTFS_ATTR_RW(error);
+
+/* sysfs read of ipv4_addr: the request's address formatted with %pI4. */
+static ssize_t ipv4_addr_show(struct kobject *kobj,
+			      struct kobj_attribute *attr, char *buf)
+{
+	DECLARE_FENCE_FROM_KOBJ(fence, kobj);
+	__be32 *addr = &fence->ipv4_addr;
+
+	return snprintf(buf, PAGE_SIZE, "%pI4", addr);
+}
+SCOUTFS_ATTR_RO(ipv4_addr);
+
+/*
+ * sysfs read of reason: the name of the request's SCOUTFS_FENCE_*
+ * reason, or "unknown" if the value is out of range or has no name.
+ */
+static ssize_t reason_show(struct kobject *kobj, struct kobj_attribute *attr,
+			   char *buf)
+{
+	static const char * const names[] = {
+		[SCOUTFS_FENCE_CLIENT_RECOVERY] = "client_recovery",
+		[SCOUTFS_FENCE_CLIENT_RECONNECT] = "client_reconnect",
+		[SCOUTFS_FENCE_QUORUM_BLOCK_LEADER] = "quorum_block_leader",
+	};
+	DECLARE_FENCE_FROM_KOBJ(fence, kobj);
+	/* unsigned so that a negative reason fails the bounds check */
+	unsigned int idx = fence->reason;
+	const char *name = "unknown";
+
+	if (idx < ARRAY_SIZE(names) && names[idx] != NULL)
+		name = names[idx];
+
+	return snprintf(buf, PAGE_SIZE, "%s", name);
+}
+SCOUTFS_ATTR_RO(reason);
+
+/* sysfs read of rid: the request's rid as 16 zero-padded hex digits. */
+static ssize_t rid_show(struct kobject *kobj, struct kobj_attribute *attr,
+			char *buf)
+{
+	DECLARE_FENCE_FROM_KOBJ(fence, kobj);
+	u64 rid = fence->rid;
+
+	return snprintf(buf, PAGE_SIZE, "%016llx", rid);
+}
+SCOUTFS_ATTR_RO(rid);
+
+/* The attribute files given to sysfs for each fence request's directory. */
+static struct attribute *fence_attrs[] = {
+	SCOUTFS_ATTR_PTR(elapsed_secs),
+	SCOUTFS_ATTR_PTR(fenced),
+	SCOUTFS_ATTR_PTR(error),
+	SCOUTFS_ATTR_PTR(ipv4_addr),
+	SCOUTFS_ATTR_PTR(reason),
+	SCOUTFS_ATTR_PTR(rid),
+	NULL,
+};
+
+/* how long a request may go without being marked fenced before it errors */
+#define FENCE_TIMEOUT_MS (MSEC_PER_SEC * 30)
+
+/*
+ * Timer callback for a fence request whose timer wasn't cancelled in
+ * time.  The request is marked errored, the timeout is logged, and
+ * waiters are woken so they can see the error.
+ *
+ * NOTE(review): %lu assumes FENCE_TIMEOUT_MS has type unsigned long;
+ * the type of MSEC_PER_SEC isn't visible here -- confirm.
+ */
+static void fence_timeout(struct timer_list *timer)
+{
+	struct pending_fence *fence = from_timer(fence, timer, timer);
+	struct super_block *sb = fence->sb;
+	DECLARE_FENCE_INFO(sb, fi);
+
+	fence->error = true;
+	scoutfs_err(sb, "fence request for rid %016llx was not serviced in %lums, raising error",
+		    fence->rid, FENCE_TIMEOUT_MS);
+	wake_up(&fi->waitq);
+}
+
+/*
+ * Start a fence request for the mount with the given rid: create its sysfs
+ * attrs under the fence kset (named from the rid), arm its timeout timer,
+ * and add it to the pending list.  Returns a negative errno on failure.
+ */
+int scoutfs_fence_start(struct super_block *sb, u64 rid, __be32 ipv4_addr, int reason)
+{
+	DECLARE_FENCE_INFO(sb, fi);
+	struct pending_fence *fence;
+	int ret;
+
+	fence = kzalloc(sizeof(*fence), GFP_NOFS);
+	if (!fence)
+		return -ENOMEM;
+
+	/* kzalloc already left ->fenced and ->error false */
+	fence->sb = sb;
+	scoutfs_sysfs_init_attrs(sb, &fence->ssa);
+	fence->start_kt = ktime_get();
+	fence->ipv4_addr = ipv4_addr;
+	fence->reason = reason;
+	fence->rid = rid;
+
+	ret = scoutfs_sysfs_create_attrs_parent(sb, &fi->kset->kobj, &fence->ssa, fence_attrs,
+						"%016llx", rid);
+	if (ret < 0) {
+		kfree(fence);
+		return ret;
+	}
+
+	timer_setup(&fence->timer, fence_timeout, 0);
+	fence->timer.expires = jiffies + msecs_to_jiffies(FENCE_TIMEOUT_MS);
+	add_timer(&fence->timer);
+
+	spin_lock(&fi->lock);
+	list_add_tail(&fence->entry, &fi->list);
+	spin_unlock(&fi->lock);
+
+	return ret;
+}
+
+/*
+ * Report the first pending request that has either been fenced or hit an
+ * error, filling in its rid, reason and error state.  There's no position
+ * to continue from because the caller either frees the request it's given
+ * or shuts down.  Returns -ENOENT, with the outputs untouched, if none.
+ */
+int scoutfs_fence_next(struct super_block *sb, u64 *rid, int *reason, bool *error)
+{
+	DECLARE_FENCE_INFO(sb, fi);
+	struct pending_fence *fence;
+	int ret = -ENOENT;
+
+	spin_lock(&fi->lock);
+	list_for_each_entry(fence, &fi->list, entry) {
+		if (!fence->fenced && !fence->error)
+			continue;
+		*rid = fence->rid;
+		*reason = fence->reason;
+		*error = fence->error;
+		ret = 0;
+		break;
+	}
+	spin_unlock(&fi->lock);
+
+	return ret;
+}
+
+/* Returns nonzero if any request on the pending list was started with the given reason. */
+int scoutfs_fence_reason_pending(struct super_block *sb, int reason)
+{
+	DECLARE_FENCE_INFO(sb, fi);
+	struct pending_fence *fence;
+	int found = 0;
+
+	spin_lock(&fi->lock);
+	list_for_each_entry(fence, &fi->list, entry) {
+		found = (fence->reason == reason);
+		if (found)
+			break;
+	}
+	spin_unlock(&fi->lock);
+
+	return found;
+}
+
+/* Unlink and destroy the request for rid and wake waiters, or return -ENOENT. */
+int scoutfs_fence_free(struct super_block *sb, u64 rid)
+{
+	DECLARE_FENCE_INFO(sb, fi);
+	struct pending_fence *found = NULL;
+	struct pending_fence *fence;
+
+	spin_lock(&fi->lock);
+	list_for_each_entry(fence, &fi->list, entry) {
+		if (fence->rid == rid) {
+			list_del_init(&fence->entry);
+			found = fence;
+			break;
+		}
+	}
+	spin_unlock(&fi->lock);
+	if (!found)
+		return -ENOENT;
+
+	destroy_fence(found);
+	wake_up(&fi->waitq);
+	return 0;
+}
+
+/* True once waiting should stop: a request raised an error or none are left unfenced. */
+static bool all_fenced(struct fence_info *fi, bool *error)
+{
+	struct pending_fence *pf;
+	bool saw_error = false;
+	bool done = true;
+
+	spin_lock(&fi->lock);
+	list_for_each_entry(pf, &fi->list, entry) {
+		if (pf->error) {
+			saw_error = true;
+			break;
+		}
+		if (!pf->fenced) {
+			done = false;
+			break;
+		}
+	}
+	spin_unlock(&fi->lock);
+
+	*error = saw_error;
+	return done;
+}
+
+/*
+ * Wait for all the current requests to be fenced, but not necessarily
+ * reclaimed.  Returns 0, -ETIMEDOUT, or -EIO if a request raised an error.
+ */
+int scoutfs_fence_wait_fenced(struct super_block *sb, long timeout_jiffies)
+{
+	DECLARE_FENCE_INFO(sb, fi);
+	bool error;
+	long ret;
+
+	ret = wait_event_timeout(fi->waitq, all_fenced(fi, &error), timeout_jiffies);
+	if (ret == 0)
+		ret = -ETIMEDOUT;	/* condition still false when the timeout ran out */
+	else if (error)
+		ret = -EIO;		/* all_fenced() also ends the wait on an error */
+	else
+		ret = 0;		/* positive return: every request was fenced */
+
+	return ret;
+}
+
+/*
+ * Set up the fence info, its "fence" sysfs kset, and its workqueue.  Mounts
+ * without a quorum slot skip all of this and no fence info is allocated.
+ *
+ * This must be called early during startup so that it is guaranteed that
+ * no other subsystems will try and call fence_start while we're waiting
+ * for testing fence requests to complete.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+int scoutfs_fence_setup(struct super_block *sb)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct fence_info *fi;
+	int ret;
+
+	/* can only fence if we can be elected by quorum */
+	if (sbi->opts.quorum_slot_nr == -1)
+		return 0;
+
+	fi = kzalloc(sizeof(*fi), GFP_KERNEL);
+	if (!fi) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	init_waitqueue_head(&fi->waitq);
+	spin_lock_init(&fi->lock);
+	INIT_LIST_HEAD(&fi->list);
+
+	/* published before the rest is built so the error path can find it */
+	sbi->fence_info = fi;
+
+	fi->kset = kset_create_and_add("fence", NULL, scoutfs_sysfs_sb_dir(sb));
+	if (!fi->kset) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	fi->wq = alloc_workqueue("scoutfs_fence",
+				 WQ_UNBOUND | WQ_NON_REENTRANT, 0);
+	ret = fi->wq ? 0 : -ENOMEM;
+
+out:
+	/* destroy copes with whatever subset of the setup completed */
+	if (ret)
+		scoutfs_fence_destroy(sb);
+
+	return ret;
+}
+
+/*
+ * Server shutdown: unlink, destroy and wake waiters for each pending request.
+ */
+void scoutfs_fence_stop(struct super_block *sb)
+{
+	DECLARE_FENCE_INFO(sb, fi);
+	struct pending_fence *fence;
+
+	for (;;) {
+		spin_lock(&fi->lock);
+		fence = list_first_entry_or_null(&fi->list, struct pending_fence, entry);
+		if (fence)
+			list_del_init(&fence->entry);
+		spin_unlock(&fi->lock);
+		if (!fence)
+			break;
+
+		destroy_fence(fence);
+		wake_up(&fi->waitq);
+	}
+}
+
+/* Free the fence info and all that hangs off it; safe after a partial or skipped setup. */
+void scoutfs_fence_destroy(struct super_block *sb)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct fence_info *fi = sbi->fence_info;
+	struct pending_fence *fence, *tmp;
+
+	if (!fi)
+		return;
+	if (fi->wq)
+		destroy_workqueue(fi->wq);
+	list_for_each_entry_safe(fence, tmp, &fi->list, entry)
+		destroy_fence(fence);
+	if (fi->kset)
+		kset_unregister(fi->kset);
+	kfree(fi);
+	sbi->fence_info = NULL;
+}
--- a/kmod/src/fence.h
+++ b/kmod/src/fence.h
@@ -0,0 +1,20 @@
+#ifndef _SCOUTFS_FENCE_H_
+#define _SCOUTFS_FENCE_H_
+
+enum {
+	SCOUTFS_FENCE_CLIENT_RECOVERY,		/* sysfs reason "client_recovery" */
+	SCOUTFS_FENCE_CLIENT_RECONNECT,		/* sysfs reason "client_reconnect" */
+	SCOUTFS_FENCE_QUORUM_BLOCK_LEADER,	/* sysfs reason "quorum_block_leader" */
+};
+
+int scoutfs_fence_start(struct super_block *sb, u64 rid, __be32 ipv4_addr, int reason);
+int scoutfs_fence_next(struct super_block *sb, u64 *rid, int *reason, bool *error);
+int scoutfs_fence_reason_pending(struct super_block *sb, int reason);
+int scoutfs_fence_free(struct super_block *sb, u64 rid);
+int scoutfs_fence_wait_fenced(struct super_block *sb, long timeout_jiffies);
+
+int scoutfs_fence_setup(struct super_block *sb);
+void scoutfs_fence_stop(struct super_block *sb);
+void scoutfs_fence_destroy(struct super_block *sb);
+
+#endif
--- a/kmod/src/forest.c
+++ b/kmod/src/forest.c
@@ -37,9 +37,9 @@
 *
 * The log btrees are modified by multiple transactions over time so
 * there is no consistent ordering relationship between the items in
- * different btrees.  Each item in a log btree stores a version number
- * for the item.  Readers check log btrees for the most recent version
- * that it should use.
+ * different btrees.  Each item in a log btree stores a seq for the
+ * item.  Readers check log btrees for the most recent seq that they
+ * should use.
 *
 * The item cache reads items in bulk from stable btrees, and writes a
 * transaction's worth of dirty items into the item log btree.
@@ -52,6 +52,8 @@
 */

 struct forest_info {
+	struct super_block *sb;
+
 	struct mutex mutex;
 	struct scoutfs_alloc *alloc;
 	struct scoutfs_block_writer *wri;
@@ -60,6 +62,9 @@ struct forest_info {
 	struct mutex srch_mutex;
 	struct scoutfs_srch_file srch_file;
 	struct scoutfs_block *srch_bl;
+
+	struct workqueue_struct *workq;
+	struct delayed_work log_merge_dwork;
 };

 #define DECLARE_FOREST_INFO(sb, name) \
@@ -249,7 +254,7 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key,
 * If we hit stale blocks and retry we can call the callback for
 * duplicate items.  This is harmless because the items are stable while
 * the caller holds their cluster lock and the caller has to filter out
- * item versions anyway.
+ * item seqs anyway.
 */
 int scoutfs_forest_read_items(struct super_block *sb,
 			      struct scoutfs_lock *lock,
@@ -426,29 +431,29 @@ out:

 /*
 * The caller is commiting items in the transaction and has found the
- * greatest item version amongst them.  We store it in the log_trees root
+ * greatest item seq amongst them.  We store it in the log_trees root
 * to send to the server.
 */
-void scoutfs_forest_set_max_vers(struct super_block *sb, u64 max_vers)
+void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq)
 {
 	DECLARE_FOREST_INFO(sb, finf);

-	finf->our_log.max_item_vers = cpu_to_le64(max_vers);
+	finf->our_log.max_item_seq = cpu_to_le64(max_seq);
 }

 /*
- * The server is calling during setup to find the greatest item version
+ * The server is calling during setup to find the greatest item seq
 * amongst all the log tree roots.  They have the authoritative current
 * super.
 *
- * Item versions are only used to compare items in log trees, not in the
- * main fs tree.  All we have to do is find the greatest version amongst
- * the log_trees so that new locks will have a write_version greater
- * than all the items in the log_trees.
+ * Item seqs are only used to compare items in log trees, not in the
+ * main fs tree.  All we have to do is find the greatest seq amongst the
+ * log_trees so that the core seq will have a greater seq than all the
+ * items in the log_trees.
 */
-int scoutfs_forest_get_max_vers(struct super_block *sb,
-				struct scoutfs_super_block *super,
-				u64 *vers)
+int scoutfs_forest_get_max_seq(struct super_block *sb,
+			       struct scoutfs_super_block *super,
+			       u64 *seq)
 {
 	struct scoutfs_log_trees *lt;
 	SCOUTFS_BTREE_ITEM_REF(iref);
@@ -456,7 +461,7 @@ int scoutfs_forest_get_max_vers(struct super_block *sb,
 	int ret;

 	scoutfs_key_init_log_trees(&ltk, 0, 0);
-	*vers = 0;
+	*seq = 0;

 	for (;; scoutfs_key_inc(&ltk)) {
 		ret = scoutfs_btree_next(sb, &super->logs_root, &ltk, &iref);
@@ -464,8 +469,7 @@ int scoutfs_forest_get_max_vers(struct super_block *sb,
 			if (iref.val_len == sizeof(struct scoutfs_log_trees)) {
 				ltk = *iref.key;
 				lt = iref.val;
-				*vers = max(*vers,
-					    le64_to_cpu(lt->max_item_vers));
+				*seq = max(*seq, le64_to_cpu(lt->max_item_seq));
 			} else {
 				ret = -EIO;
 			}
@@ -534,7 +538,7 @@ void scoutfs_forest_init_btrees(struct super_block *sb,
 	memset(&finf->our_log, 0, sizeof(finf->our_log));
 	finf->our_log.item_root = lt->item_root;
 	finf->our_log.bloom_ref = lt->bloom_ref;
-	finf->our_log.max_item_vers = lt->max_item_vers;
+	finf->our_log.max_item_seq = lt->max_item_seq;
 	finf->our_log.rid = lt->rid;
 	finf->our_log.nr = lt->nr;
 	finf->srch_file = lt->srch_file;
@@ -564,7 +568,7 @@ void scoutfs_forest_get_btrees(struct super_block *sb,
 	lt->item_root = finf->our_log.item_root;
 	lt->bloom_ref = finf->our_log.bloom_ref;
 	lt->srch_file = finf->srch_file;
-	lt->max_item_vers = finf->our_log.max_item_vers;
+	lt->max_item_seq = finf->our_log.max_item_seq;

 	scoutfs_block_put(sb, finf->srch_bl);
 	finf->srch_bl = NULL;
@@ -573,6 +577,149 @@ void scoutfs_forest_get_btrees(struct super_block *sb,
 					    &lt->bloom_ref);
 }

+/* Merge comparison callback for input items whose keys match. */
+static int merge_cmp(void *a_val, int a_val_len, void *b_val, int b_val_len)
+{
+	const struct scoutfs_log_item_value *liv_a = a_val;
+	const struct scoutfs_log_item_value *liv_b = b_val;
+	u64 seq_a = le64_to_cpu(liv_a->seq);
+	u64 seq_b = le64_to_cpu(liv_b->seq);
+
+	/* the seq in each log item value decides the order */
+	return scoutfs_cmp(seq_a, seq_b);
+}
+
+/* Merge callback: true when the log item value is flagged as a deletion. */
+static bool merge_is_del(void *val, int val_len)
+{
+	const struct scoutfs_log_item_value *liv = val;
+
+	return (liv->flags & SCOUTFS_LOG_ITEM_FLAG_DELETION) != 0;
+}
+
+#define LOG_MERGE_DELAY_MS (5 * MSEC_PER_SEC)
+
+/*
+ * Regularly try to get a log merge request from the server.  If we get
+ * a request we walk the log_trees items to find input trees and pass
+ * them to btree_merge.  All of our work is done in dirty blocks
+ * allocated from free blocks that the server gave us.  On error we drop
+ * our dirty blocks without writing them and send an error flag so the
+ * server can reclaim our allocators and ignore the rest of our work.
+ * The work always requeues itself, with no delay after a clean commit.
+ */
+static void scoutfs_forest_log_merge_worker(struct work_struct *work)
+{
+	struct forest_info *finf = container_of(work, struct forest_info,
+						log_merge_dwork.work);
+	struct super_block *sb = finf->sb;
+	struct scoutfs_btree_root_head *rhead = NULL;
+	struct scoutfs_btree_root_head *tmp;
+	struct scoutfs_log_merge_complete comp;
+	struct scoutfs_log_merge_request req;
+	struct scoutfs_log_trees *lt;
+	struct scoutfs_block_writer wri;
+	struct scoutfs_alloc alloc;
+	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_key next;
+	struct scoutfs_key key;
+	unsigned long delay;
+	LIST_HEAD(inputs);
+	int ret;
+
+	ret = scoutfs_client_get_log_merge(sb, &req);
+	if (ret < 0)
+		goto resched;	/* a negative ret makes resched wait out the delay */
+
+	comp.root = req.root;
+	comp.start = req.start;
+	comp.end = req.end;
+	comp.remain = req.end;
+	comp.rid = req.rid;
+	comp.seq = req.seq;
+	comp.flags = 0;
+
+	scoutfs_alloc_init(&alloc, &req.meta_avail, &req.meta_freed);
+	scoutfs_block_writer_init(sb, &wri);
+
+	/* collect roots of finalized log trees whose items are no newer than last_seq */
+	for (scoutfs_key_init_log_trees(&key, 0, 0); ; scoutfs_key_inc(&key)) {
+
+		if (!rhead) {	/* keep one spare node ready for the next match */
+			rhead = kmalloc(sizeof(*rhead), GFP_NOFS);
+			if (!rhead) {
+				ret = -ENOMEM;
+				goto out;
+			}
+		}
+
+		ret = scoutfs_btree_next(sb, &req.logs_root, &key, &iref);
+		if (ret == 0) {
+			if (iref.val_len == sizeof(*lt)) {
+				key = *iref.key;
+				lt = iref.val;
+				if ((le64_to_cpu(lt->flags) &
+				     SCOUTFS_LOG_TREES_FINALIZED) &&
+				    (le64_to_cpu(lt->max_item_seq) <=
+				     le64_to_cpu(req.last_seq))) {
+					rhead->root = lt->item_root;
+					list_add_tail(&rhead->head, &inputs);
+					rhead = NULL;	/* the inputs list owns it now */
+				}
+			} else {
+				ret = -EIO;
+			}
+			scoutfs_btree_put_iref(&iref);
+		}
+		if (ret < 0) {
+			if (ret == -ENOENT) {	/* presumably no more log_trees items */
+				ret = 0;
+				break;
+			}
+			goto out;
+		}
+	}
+
+	/* shouldn't be possible, but it's harmless: the request's root is sent back as is */
+	if (list_empty(&inputs)) {
+		ret = 0;
+		goto out;
+	}
+
+	ret = scoutfs_btree_merge(sb, &alloc, &wri, &req.start, &req.end,
+				  &next, &comp.root, &inputs, merge_cmp,
+				  merge_is_del,
+				  !!(req.flags & cpu_to_le64(SCOUTFS_LOG_MERGE_REQUEST_SUBTREE)),
+				  sizeof(struct scoutfs_log_item_value),
+				  SCOUTFS_LOG_MERGE_DIRTY_BYTE_LIMIT, 10);
+	if (ret == -ERANGE) {	/* presumably stopped short of end; next is where to resume */
+		comp.remain = next;
+		le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_REMAIN);
+		ret = 0;
+	}
+
+out:
+	scoutfs_alloc_prepare_commit(sb, &alloc, &wri);
+	if (ret == 0)
+	      ret = scoutfs_block_writer_write(sb, &wri);
+	scoutfs_block_writer_forget_all(sb, &wri);
+
+	comp.meta_avail = alloc.avail;
+	comp.meta_freed = alloc.freed;
+	if (ret < 0)
+		le64_add_cpu(&comp.flags, SCOUTFS_LOG_MERGE_COMP_ERROR);
+
+	ret = scoutfs_client_commit_log_merge(sb, &comp);	/* sent on error too */
+
+	kfree(rhead);
+	list_for_each_entry_safe(rhead, tmp, &inputs, head)
+		kfree(rhead);
+
+resched:
+	delay = ret == 0 ? 0 : msecs_to_jiffies(LOG_MERGE_DELAY_MS);	/* ret: last rpc result */
+	queue_delayed_work(finf->workq, &finf->log_merge_dwork, delay);
+}
+
 int scoutfs_forest_setup(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -586,10 +733,20 @@ int scoutfs_forest_setup(struct super_block *sb)
 	}

 	/* the finf fields will be setup as we open a transaction */
+	finf->sb = sb;
 	mutex_init(&finf->mutex);
 	mutex_init(&finf->srch_mutex);
-
+	INIT_DELAYED_WORK(&finf->log_merge_dwork,
+			  scoutfs_forest_log_merge_worker);
 	sbi->forest_info = finf;
+
+	finf->workq = alloc_workqueue("scoutfs_log_merge", WQ_NON_REENTRANT |
+				      WQ_UNBOUND | WQ_HIGHPRI, 0);
+	if (!finf->workq) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	ret = 0;
 out:
 	if (ret)
@@ -598,6 +755,24 @@ out:
 	return 0;
 }

+void scoutfs_forest_start(struct super_block *sb)
+{
+	DECLARE_FOREST_INFO(sb, finf);
+
+	queue_delayed_work(finf->workq, &finf->log_merge_dwork,
+			   msecs_to_jiffies(LOG_MERGE_DELAY_MS));	/* first poll is delayed */
+}
+
+void scoutfs_forest_stop(struct super_block *sb)
+{
+	DECLARE_FOREST_INFO(sb, finf);
+
+	if (finf && finf->workq) {	/* presumably guards a failed or skipped setup */
+		cancel_delayed_work_sync(&finf->log_merge_dwork);
+		destroy_workqueue(finf->workq);	/* NOTE(review): workq is left dangling */
+	}
+}
+
 void scoutfs_forest_destroy(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -605,6 +780,7 @@ void scoutfs_forest_destroy(struct super_block *sb)

 	if (finf) {
 		scoutfs_block_put(sb, finf->srch_bl);
+
 		kfree(finf);
 		sbi->forest_info = NULL;
 	}
--- a/kmod/src/forest.h
+++ b/kmod/src/forest.h
@@ -23,10 +23,10 @@ int scoutfs_forest_read_items(struct super_block *sb,
 			      scoutfs_forest_item_cb cb, void *arg);
 int scoutfs_forest_set_bloom_bits(struct super_block *sb,
 				  struct scoutfs_lock *lock);
-void scoutfs_forest_set_max_vers(struct super_block *sb, u64 max_vers);
-int scoutfs_forest_get_max_vers(struct super_block *sb,
-				struct scoutfs_super_block *super,
-				u64 *vers);
+void scoutfs_forest_set_max_seq(struct super_block *sb, u64 max_seq);
+int scoutfs_forest_get_max_seq(struct super_block *sb,
+			       struct scoutfs_super_block *super,
+			       u64 *seq);
 int scoutfs_forest_insert_list(struct super_block *sb,
 			       struct scoutfs_btree_item_list *lst);
 int scoutfs_forest_srch_add(struct super_block *sb, u64 hash, u64 ino, u64 id);
@@ -39,6 +39,8 @@ void scoutfs_forest_get_btrees(struct super_block *sb,
 			       struct scoutfs_log_trees *lt);

 int scoutfs_forest_setup(struct super_block *sb);
+void scoutfs_forest_start(struct super_block *sb);
+void scoutfs_forest_stop(struct super_block *sb);
 void scoutfs_forest_destroy(struct super_block *sb);

 #endif
--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -203,11 +203,12 @@ struct scoutfs_key {
 #define skmc_rid	_sk_first

 /* free extents by blkno */
-#define skfb_end	_sk_second
-#define skfb_len	_sk_third
-/* free extents by len */
-#define skfl_neglen	_sk_second
-#define skfl_blkno	_sk_third
+#define skfb_end	_sk_first
+#define skfb_len	_sk_second
+/* free extents by order */
+#define skfo_revord	_sk_first
+#define skfo_end	_sk_second
+#define skfo_len	_sk_third

 struct scoutfs_avl_root {
 	__le16 node;
@@ -285,9 +286,10 @@ struct scoutfs_alloc_list_head {
 	struct scoutfs_block_ref ref;
 	__le64 total_nr;
 	__le32 first_nr;
-	__u8 __pad[4];
+	__le32 flags;
 };

+
 /*
 * While the main allocator uses extent items in btree blocks, metadata
 * allocations for a single transaction are recorded in arrays in
@@ -316,17 +318,25 @@ struct scoutfs_alloc_list_block {
 */
 struct scoutfs_alloc_root {
 	__le64 total_len;
+	__le32 flags;
+	__le32 _pad;
 	struct scoutfs_btree_root root;
 };

+/* Shared by _alloc_list_head and _alloc_root */
+#define SCOUTFS_ALLOC_FLAG_LOW	(1U << 0)
+
 /* types of allocators, exposed to alloc_detail ioctl */
 #define SCOUTFS_ALLOC_OWNER_NONE	0
 #define SCOUTFS_ALLOC_OWNER_SERVER	1
 #define SCOUTFS_ALLOC_OWNER_MOUNT	2
 #define SCOUTFS_ALLOC_OWNER_SRCH	3
+#define SCOUTFS_ALLOC_OWNER_LOG_MERGE	4

 struct scoutfs_mounted_client_btree_val {
+	union scoutfs_inet_addr addr;
 	__u8 flags;
+	__u8 __pad[7];
 };

 #define SCOUTFS_MOUNTED_CLIENT_QUORUM	(1 << 0)
@@ -427,6 +437,10 @@ struct scoutfs_srch_compact {
 /* client -> server: compaction failed */
 #define SCOUTFS_SRCH_COMPACT_FLAG_ERROR		(1 << 5)

+#define SCOUTFS_DATA_ALLOC_MAX_ZONES	1024
+#define SCOUTFS_DATA_ALLOC_ZONE_BYTES	DIV_ROUND_UP(SCOUTFS_DATA_ALLOC_MAX_ZONES, 8)
+#define SCOUTFS_DATA_ALLOC_ZONE_LE64S	DIV_ROUND_UP(SCOUTFS_DATA_ALLOC_MAX_ZONES, 64)
+
 /*
 * XXX I imagine we should rename these now that they've evolved to track
 * all the btrees that clients use during a transaction.  It's not just
@@ -440,13 +454,18 @@ struct scoutfs_log_trees {
 	struct scoutfs_alloc_root data_avail;
 	struct scoutfs_alloc_root data_freed;
 	struct scoutfs_srch_file srch_file;
-	__le64 max_item_vers;
+	__le64 data_alloc_zone_blocks;
+	__le64 data_alloc_zones[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
+	__le64 max_item_seq;
 	__le64 rid;
 	__le64 nr;
+	__le64 flags;
 };

+#define SCOUTFS_LOG_TREES_FINALIZED	(1ULL << 0)
+
 struct scoutfs_log_item_value {
-	__le64 vers;
+	__le64 seq;
 	__u8 flags;
 	__u8 __pad[7];
 	__u8 data[];
@@ -481,11 +500,83 @@ struct scoutfs_bloom_block {
 	 member_sizeof(struct scoutfs_bloom_block, bits[0]) * 8)
 #define SCOUTFS_FOREST_BLOOM_FUNC_BITS		(SCOUTFS_BLOCK_LG_SHIFT + 3)

+/*
+ * A private server btree item which records the status of a log merge
+ * operation that is in progress.
+ */
+struct scoutfs_log_merge_status {
+	struct scoutfs_key next_range_key;
+	__le64 nr_requests;
+	__le64 nr_complete;
+	__le64 last_seq;
+	__le64 seq;
+};
+
+/*
+ * A request is sent to the client and stored in a server btree item to
+ * record resources that would be reclaimed if the client failed.  It
+ * has all the inputs needed for the client to perform its portion of a
+ * merge.
+ */
+struct scoutfs_log_merge_request {
+	struct scoutfs_alloc_list_head meta_avail;
+	struct scoutfs_alloc_list_head meta_freed;
+	struct scoutfs_btree_root logs_root;
+	struct scoutfs_btree_root root;
+	struct scoutfs_key start;
+	struct scoutfs_key end;
+	__le64 last_seq;
+	__le64 rid;
+	__le64 seq;
+	__le64 flags;
+};
+
+/* request root is subtree of fs root at parent, restricted merging modifications */
+#define SCOUTFS_LOG_MERGE_REQUEST_SUBTREE	(1ULL << 0)
+
+/*
+ * The output of a client's merge of log btree items into a subtree
+ * rooted at a parent in the fs_root.  The client sends it to the
+ * server, who stores it in a btree item for later splicing/rebalancing.
+ */
+struct scoutfs_log_merge_complete {
+	struct scoutfs_alloc_list_head meta_avail;
+	struct scoutfs_alloc_list_head meta_freed;
+	struct scoutfs_btree_root root;
+	struct scoutfs_key start;
+	struct scoutfs_key end;
+	struct scoutfs_key remain;
+	__le64 rid;
+	__le64 seq;
+	__le64 flags;
+};
+
+/* merge failed, ignore completion and reclaim stored request */
+#define SCOUTFS_LOG_MERGE_COMP_ERROR	(1ULL << 0)
+/* merge didn't complete range, restart from remain */
+#define SCOUTFS_LOG_MERGE_COMP_REMAIN	(1ULL << 1)
+
+/*
+ * Range items record the ranges of the fs keyspace that still need to
+ * be merged.  They're added as a merge starts, removed as requests are
+ * sent and added back if the request didn't consume its entire range.
+ */
+struct scoutfs_log_merge_range {
+	struct scoutfs_key start;
+	struct scoutfs_key end;
+};
+
+struct scoutfs_log_merge_freeing {
+	struct scoutfs_btree_root root;
+	struct scoutfs_key key;
+	__le64 seq;
+};
+
 /*
 * Keys are first sorted by major key zones.
 */
 #define SCOUTFS_INODE_INDEX_ZONE		1
-#define SCOUTFS_RID_ZONE			2
+#define SCOUTFS_ORPHAN_ZONE			2
 #define SCOUTFS_FS_ZONE				3
 #define SCOUTFS_LOCK_ZONE			4
 /* Items only stored in server btrees */
@@ -493,14 +584,21 @@ struct scoutfs_bloom_block {
 #define SCOUTFS_TRANS_SEQ_ZONE			7
 #define SCOUTFS_MOUNTED_CLIENT_ZONE		8
 #define SCOUTFS_SRCH_ZONE			9
-#define SCOUTFS_FREE_EXTENT_ZONE		10
+#define SCOUTFS_FREE_EXTENT_BLKNO_ZONE		10
+#define SCOUTFS_FREE_EXTENT_ORDER_ZONE		11
+/* Items only stored in log merge server btrees */
+#define SCOUTFS_LOG_MERGE_STATUS_ZONE		12
+#define SCOUTFS_LOG_MERGE_RANGE_ZONE		13
+#define SCOUTFS_LOG_MERGE_REQUEST_ZONE		14
+#define SCOUTFS_LOG_MERGE_COMPLETE_ZONE		15
+#define SCOUTFS_LOG_MERGE_FREEING_ZONE		16

 /* inode index zone */
 #define SCOUTFS_INODE_INDEX_META_SEQ_TYPE	1
 #define SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE	2
 #define SCOUTFS_INODE_INDEX_NR			3 /* don't forget to update */

-/* rid zone (also used in server alloc btree) */
+/* orphan zone, redundant type used for clarity */
 #define SCOUTFS_ORPHAN_TYPE			1

 /* fs zone */
@@ -521,10 +619,6 @@ struct scoutfs_bloom_block {
 #define SCOUTFS_SRCH_PENDING_TYPE	3
 #define SCOUTFS_SRCH_BUSY_TYPE		4

-/* free extents in allocator btrees in client and server, by blkno or len */
-#define SCOUTFS_FREE_EXTENT_BLKNO_TYPE	1
-#define SCOUTFS_FREE_EXTENT_LEN_TYPE	2
-
 /* file data extents have start and len in key */
 struct scoutfs_data_extent_val {
 	__le64 blkno;
@@ -582,6 +676,12 @@ struct scoutfs_xattr {
 #define SCOUTFS_QUORUM_HB_IVAL_MS	100
 #define SCOUTFS_QUORUM_HB_TIMEO_MS	(5 * MSEC_PER_SEC)

+/*
+ * A newly elected leader will give fencing some time before giving up and
+ * shutting down.
+ */
+#define SCOUTFS_QUORUM_FENCE_TO_MS	(15 * MSEC_PER_SEC)
+
 struct scoutfs_quorum_message {
 	__le64 fsid;
 	__le64 version;
@@ -613,18 +713,60 @@ struct scoutfs_quorum_config {
 	} slots[SCOUTFS_QUORUM_MAX_SLOTS];
 };

-struct scoutfs_quorum_block {
-	struct scoutfs_block_header hdr;
-	__le64 term;
-	__le64 random_write_mark;
-	__le64 flags;
-	struct scoutfs_quorum_block_event {
-		__le64 rid;
-		struct scoutfs_timespec ts;
-	} write, update_term, set_leader, clear_leader, fenced;
+enum {
+	SCOUTFS_QUORUM_EVENT_BEGIN,		/* quorum service starting up */
+	SCOUTFS_QUORUM_EVENT_TERM,		/* updated persistent term */
+	SCOUTFS_QUORUM_EVENT_ELECT,		/* won election */
+	SCOUTFS_QUORUM_EVENT_FENCE,		/* server fenced others */
+	SCOUTFS_QUORUM_EVENT_STOP,		/* server stopped */
+	SCOUTFS_QUORUM_EVENT_END,		/* quorum service shutting down */
+	SCOUTFS_QUORUM_EVENT_NR,
 };

-#define SCOUTFS_QUORUM_BLOCK_LEADER (1 << 0)
+struct scoutfs_quorum_block {
+	struct scoutfs_block_header hdr;
+	struct scoutfs_quorum_block_event {
+		__le64 rid;
+		__le64 term;
+		struct scoutfs_timespec ts;
+	} events[SCOUTFS_QUORUM_EVENT_NR];
+};
+
+/*
+ * Tunable options that apply to the entire system.  They can be set in
+ * mkfs or in sysfs files which send an rpc to the server to make the
+ * change.  The super version defines the options that exist.
+ *
+ * @set_bits: bits for each 64bit starting offset after set_bits
+ * indicate which logical option is set.
+ *
+ * @data_alloc_zone_blocks: if set, the data device is logically divided
+ * into contiguous zones of this many blocks.  Data allocation will try
+ * and isolate allocated extents for each mount to their own zone.  The
+ * zone size must be larger than the data alloc high water mark and
+ * large enough such that the number of zones is kept within its static
+ * limit.
+ */
+struct scoutfs_volume_options {
+	__le64 set_bits;
+	__le64 data_alloc_zone_blocks;
+	__le64 __future_expansion[63];
+};
+
+#define scoutfs_volopt_nr(field)							\
+	((offsetof(struct scoutfs_volume_options, field) -				\
+	  (offsetof(struct scoutfs_volume_options, set_bits) +				\
+	   member_sizeof(struct scoutfs_volume_options, set_bits))) / sizeof(__le64))
+#define scoutfs_volopt_bit(field)							\
+	(1ULL << scoutfs_volopt_nr(field))
+
+#define SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR \
+	scoutfs_volopt_nr(data_alloc_zone_blocks)
+#define SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT \
+	scoutfs_volopt_bit(data_alloc_zone_blocks)
+
+#define SCOUTFS_VOLOPT_EXPANSION_BITS \
+	(~(scoutfs_volopt_bit(__future_expansion) - 1))

 #define SCOUTFS_FLAG_IS_META_BDEV 0x01

@@ -634,14 +776,10 @@ struct scoutfs_super_block {
 	__le64 version;
 	__le64 flags;
 	__u8 uuid[SCOUTFS_UUID_BYTES];
+	__le64 seq;
 	__le64 next_ino;
-	__le64 next_trans_seq;
 	__le64 total_meta_blocks;	/* both static and dynamic */
-	__le64 first_meta_blkno;	/* first dynamically allocated */
-	__le64 last_meta_blkno;
 	__le64 total_data_blocks;
-	__le64 first_data_blkno;
-	__le64 last_data_blkno;
 	struct scoutfs_quorum_config qconf;
 	struct scoutfs_alloc_root meta_alloc[2];
 	struct scoutfs_alloc_root data_alloc;
@@ -649,9 +787,11 @@ struct scoutfs_super_block {
 	struct scoutfs_alloc_list_head server_meta_freed[2];
 	struct scoutfs_btree_root fs_root;
 	struct scoutfs_btree_root logs_root;
+	struct scoutfs_btree_root log_merge;
 	struct scoutfs_btree_root trans_seqs;
 	struct scoutfs_btree_root mounted_clients;
 	struct scoutfs_btree_root srch_root;
+	struct scoutfs_volume_options volopt;
 };

 #define SCOUTFS_ROOT_INO 1
@@ -757,9 +897,9 @@ enum scoutfs_dentry_type {
 	DIV_ROUND_UP(sizeof(struct scoutfs_xattr) + name_len + val_len, \
 		     (unsigned int)SCOUTFS_XATTR_MAX_PART_SIZE)

-#define SCOUTFS_LOCK_INODE_GROUP_NR	128
+#define SCOUTFS_LOCK_INODE_GROUP_NR	1024
 #define SCOUTFS_LOCK_INODE_GROUP_MASK	(SCOUTFS_LOCK_INODE_GROUP_NR - 1)
-#define SCOUTFS_LOCK_SEQ_GROUP_MASK	((1ULL << 7) - 1)
+#define SCOUTFS_LOCK_SEQ_GROUP_MASK	((1ULL << 10) - 1)

 /*
 * messages over the wire.
@@ -840,7 +980,13 @@ enum scoutfs_net_cmd {
 	SCOUTFS_NET_CMD_LOCK_RECOVER,
 	SCOUTFS_NET_CMD_SRCH_GET_COMPACT,
 	SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT,
+	SCOUTFS_NET_CMD_GET_LOG_MERGE,
+	SCOUTFS_NET_CMD_COMMIT_LOG_MERGE,
 	SCOUTFS_NET_CMD_OPEN_INO_MAP,
+	SCOUTFS_NET_CMD_GET_VOLOPT,
+	SCOUTFS_NET_CMD_SET_VOLOPT,
+	SCOUTFS_NET_CMD_CLEAR_VOLOPT,
+	SCOUTFS_NET_CMD_RESIZE_DEVICES,
 	SCOUTFS_NET_CMD_FAREWELL,
 	SCOUTFS_NET_CMD_UNKNOWN,
 };
@@ -883,9 +1029,14 @@ struct scoutfs_net_roots {
 	struct scoutfs_btree_root srch_root;
 };

+struct scoutfs_net_resize_devices {
+	__le64 new_total_meta_blocks;
+	__le64 new_total_data_blocks;
+};
+
 struct scoutfs_net_lock {
 	struct scoutfs_key key;
-	__le64 write_version;
+	__le64 write_seq;
 	__u8 old_mode;
 	__u8 new_mode;
 	__u8 __pad[6];
@@ -909,7 +1060,6 @@ enum scoutfs_lock_trace {
 	SLT_INVALIDATE,
 	SLT_REQUEST,
 	SLT_RESPONSE,
-	SLT_NR,
 };

 /*
@@ -962,7 +1112,7 @@ enum scoutfs_corruption_sources {

 #define SC_NR_LONGS DIV_ROUND_UP(SC_NR_SOURCES, BITS_PER_LONG)

-#define SCOUTFS_OPEN_INO_MAP_SHIFT	7
+#define SCOUTFS_OPEN_INO_MAP_SHIFT	10
 #define SCOUTFS_OPEN_INO_MAP_BITS	(1 << SCOUTFS_OPEN_INO_MAP_SHIFT)
 #define SCOUTFS_OPEN_INO_MAP_MASK	(SCOUTFS_OPEN_INO_MAP_BITS - 1)
 #define SCOUTFS_OPEN_INO_MAP_LE64S	(SCOUTFS_OPEN_INO_MAP_BITS / 64)
--- a/kmod/src/inode.c
+++ b/kmod/src/inode.c
@@ -34,6 +34,7 @@
 #include "client.h"
 #include "cmp.h"
 #include "omap.h"
+#include "forest.h"

 /*
 * XXX
@@ -54,10 +55,22 @@ struct inode_allocator {
 };

 struct inode_sb_info {
+	struct super_block *sb;
+	bool stopped;
+
 	spinlock_t writeback_lock;
-	struct rb_root writeback_inodes;
+	struct list_head writeback_list;
 	struct inode_allocator dir_ino_alloc;
 	struct inode_allocator ino_alloc;
+
+	struct delayed_work orphan_scan_dwork;
+
+	/* serialize multiple inode ->evict trying to delete same ino's items */
+	spinlock_t deleting_items_lock;
+	struct list_head deleting_items_list;
+
+	struct work_struct iput_work;
+	struct llist_head iput_llist;
 };

 #define DECLARE_INODE_SB_INFO(sb, name) \
@@ -82,9 +95,9 @@ static void scoutfs_inode_ctor(void *obj)
 	atomic64_set(&si->data_waitq.changed, 0);
 	init_waitqueue_head(&si->data_waitq.waitq);
 	init_rwsem(&si->xattr_rwsem);
-	RB_CLEAR_NODE(&si->writeback_node);
+	INIT_LIST_HEAD(&si->writeback_entry);
 	scoutfs_lock_init_coverage(&si->ino_lock_cov);
-	atomic_set(&si->inv_iput_count, 0);
+	atomic_set(&si->iput_count, 0);

 	inode_init_once(&si->inode);
 }
@@ -108,47 +121,14 @@ static void scoutfs_i_callback(struct rcu_head *head)
 	kmem_cache_free(scoutfs_inode_cachep, SCOUTFS_I(inode));
 }

-static void insert_writeback_inode(struct inode_sb_info *inf,
-				   struct scoutfs_inode_info *ins)
-{
-	struct rb_root *root = &inf->writeback_inodes;
-	struct rb_node **node = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct scoutfs_inode_info *si;
-
-	while (*node) {
-		parent = *node;
-		si = container_of(*node, struct scoutfs_inode_info,
-				  writeback_node);
-
-		if (ins->ino < si->ino)
-			node = &(*node)->rb_left;
-		else if (ins->ino > si->ino)
-			node = &(*node)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&ins->writeback_node, parent, node);
-	rb_insert_color(&ins->writeback_node, root);
-}
-
-static void remove_writeback_inode(struct inode_sb_info *inf,
-			       struct scoutfs_inode_info *si)
-{
-	if (!RB_EMPTY_NODE(&si->writeback_node)) {
-		rb_erase(&si->writeback_node, &inf->writeback_inodes);
-		RB_CLEAR_NODE(&si->writeback_node);
-	}
-}
-
 void scoutfs_destroy_inode(struct inode *inode)
 {
 	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	DECLARE_INODE_SB_INFO(inode->i_sb, inf);

 	spin_lock(&inf->writeback_lock);
-	remove_writeback_inode(inf, SCOUTFS_I(inode));
+	if (!list_empty(&si->writeback_entry))
+		list_del_init(&si->writeback_entry);
 	spin_unlock(&inf->writeback_lock);

 	scoutfs_lock_del_coverage(inode->i_sb, &si->ino_lock_cov);
@@ -352,7 +332,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
 	if (!S_ISREG(inode->i_mode))
 		return 0;

-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true);
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
 	if (ret)
 		return ret;

@@ -379,7 +359,7 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)
 	LIST_HEAD(ind_locks);
 	int ret;

-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false);
 	if (ret)
 		return ret;

@@ -494,7 +474,7 @@ retry:
 		}
 	}

-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, false);
 	if (ret)
 		goto out;

@@ -682,14 +662,14 @@ struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino)
 	return ilookup5(sb, ino, scoutfs_iget_test, &ino);
 }

-struct inode *scoutfs_iget(struct super_block *sb, u64 ino)
+struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf)
 {
 	struct scoutfs_lock *lock = NULL;
 	struct scoutfs_inode_info *si;
 	struct inode *inode;
 	int ret;

-	ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, ino, &lock);
+	ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, lkf, ino, &lock);
 	if (ret)
 		return ERR_PTR(ret);

@@ -1207,7 +1187,7 @@ int scoutfs_inode_index_start(struct super_block *sb, u64 *seq)
 * Returns > 0 if the seq changed and the locks should be retried.
 */
 int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
-				      struct list_head *list, u64 seq)
+				      struct list_head *list, u64 seq, bool allocing)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct index_lock *ind_lock;
@@ -1223,7 +1203,7 @@ int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
 			goto out;
 	}

-	ret = scoutfs_hold_trans(sb);
+	ret = scoutfs_hold_trans(sb, allocing);
 	if (ret == 0 && seq != sbi->trans_seq) {
 		scoutfs_release_trans(sb);
 		ret = 1;
@@ -1237,7 +1217,7 @@ out:
 }

 int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
-				  bool set_data_seq)
+				  bool set_data_seq, bool allocing)
 {
 	struct super_block *sb = inode->i_sb;
 	int ret;
@@ -1247,7 +1227,7 @@ int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
 		ret = scoutfs_inode_index_start(sb, &seq) ?:
 		      scoutfs_inode_index_prepare(sb, list, inode,
 						  set_data_seq) ?:
-		      scoutfs_inode_index_try_lock_hold(sb, list, seq);
+		      scoutfs_inode_index_try_lock_hold(sb, list, seq, allocing);
 	} while (ret > 0);

 	return ret;
@@ -1437,41 +1417,74 @@ out:
 	return inode;
 }

-static void init_orphan_key(struct scoutfs_key *key, u64 rid, u64 ino)
+static void init_orphan_key(struct scoutfs_key *key, u64 ino)
 {
 	*key = (struct scoutfs_key) {
-		.sk_zone = SCOUTFS_RID_ZONE,
-		.sko_rid = cpu_to_le64(rid),
-		.sk_type = SCOUTFS_ORPHAN_TYPE,
+		.sk_zone = SCOUTFS_ORPHAN_ZONE,
 		.sko_ino = cpu_to_le64(ino),
+		.sk_type = SCOUTFS_ORPHAN_TYPE,
 	};
 }

-int scoutfs_orphan_dirty(struct super_block *sb, u64 ino)
+/*
+ * Create an orphan item.  The orphan items are maintained in their own
+ * zone under a write only lock while the caller has the inode protected
+ * by a write lock.
+ */
+int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_lock *lock = sbi->rid_lock;
 	struct scoutfs_key key;

-	init_orphan_key(&key, sbi->rid, ino);
+	init_orphan_key(&key, ino);

-	return scoutfs_item_dirty(sb, &key, lock);
+	return scoutfs_item_create_force(sb, &key, NULL, 0, lock);
 }

-int scoutfs_orphan_delete(struct super_block *sb, u64 ino)
+int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_lock *lock = sbi->rid_lock;
 	struct scoutfs_key key;
-	int ret;

-	init_orphan_key(&key, sbi->rid, ino);
+	init_orphan_key(&key, ino);

-	ret = scoutfs_item_delete(sb, &key, lock);
-	if (ret == -ENOENT)
-		ret = 0;
+	return scoutfs_item_delete_force(sb, &key, lock);
+}

-	return ret;
+struct deleting_ino_entry {
+	struct list_head head;
+	u64 ino;
+};
+
+static bool added_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del, u64 ino)
+{
+	struct deleting_ino_entry *tmp;
+	bool added = true;
+
+	spin_lock(&inf->deleting_items_lock);
+
+	list_for_each_entry(tmp, &inf->deleting_items_list, head) {
+		if (tmp->ino == ino) {
+			added = false;
+			break;
+		}
+	}
+
+	if (added) {
+		del->ino = ino;
+		list_add_tail(&del->head, &inf->deleting_items_list);
+	}
+
+	spin_unlock(&inf->deleting_items_lock);
+
+	return added;
+}
+
+static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del)
+{
+	if (del->ino) {
+		spin_lock(&inf->deleting_items_lock);
+		list_del_init(&del->head);
+		spin_unlock(&inf->deleting_items_lock);
+	}
 }

 /*
@@ -1482,9 +1495,21 @@ int scoutfs_orphan_delete(struct super_block *sb, u64 ino)
 * orphan item will continue triggering attempts to finish previous
 * partial deletion until all deletion is complete and the orphan item
 * is removed.
+ *
+ * Currently this can be called multiple times for multiple cached
+ * inodes for a given ino number (ilookup avoids freeing inodes to avoid
+ * cluster lock<->inode flag waiting inversions).  Some items are not
+ * safe to delete concurrently, for example concurrent data truncation
+ * could free extents multiple times.  We use a very silly list of inos
+ * being deleted.  Duplicates just return success.  If the first
+ * deletion ends up failing orphan deletion will come back around later
+ * and retry.
 */
-static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock)
+static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock,
+			      struct scoutfs_lock *orph_lock)
 {
+	DECLARE_INODE_SB_INFO(sb, inf);
+	struct deleting_ino_entry del = {{NULL, }};
 	struct scoutfs_inode sinode;
 	struct scoutfs_key key;
 	LIST_HEAD(ind_locks);
@@ -1494,6 +1519,11 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
 	u64 size;
 	int ret;

+	if (!added_deleting_ino(inf, &del, ino)) {
+		ret = 0;
+		goto out;
+	}
+
 	init_inode_key(&key, ino);

 	ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode),
@@ -1531,7 +1561,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -1553,8 +1583,9 @@ retry:
 	if (ret)
 		goto out;

-	ret = scoutfs_orphan_delete(sb, ino);
+	ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock);
 out:
+	del_deleting_ino(inf, &del);
 	if (release)
 		scoutfs_release_trans(sb);
 	scoutfs_inode_index_unlock(sb, &ind_locks);
@@ -1568,11 +1599,17 @@ out:
 * tear down.  We use locking and open inode number bitmaps to decide if
 * we should finally destroy an inode that is no longer open nor
 * reachable through directory entries.
+ *
+ * Because lookup ignores freeing inodes we can get here from multiple
+ * instances of an inode that is being deleted.  Orphan scanning in
+ * particular can race with deletion.   delete_inode_items() resolves
+ * concurrent attempts.
 */
 void scoutfs_evict_inode(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 	const u64 ino = scoutfs_ino(inode);
+	struct scoutfs_lock *orph_lock;
 	struct scoutfs_lock *lock;
 	int ret;

@@ -1584,14 +1621,16 @@ void scoutfs_evict_inode(struct inode *inode)

 	truncate_inode_pages_final(&inode->i_data);

-	ret = scoutfs_omap_should_delete(sb, inode, &lock);
+	ret = scoutfs_omap_should_delete(sb, inode, &lock, &orph_lock);
 	if (ret > 0) {
-		ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock);
+		ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock, orph_lock);
 		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
+		scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
 	}
-	if (ret < 0)
+	if (ret < 0) {
 		scoutfs_err(sb, "error %d while checking to delete inode nr %llu, it might linger.",
 			    ret, ino);
+	}

 	scoutfs_omap_dec(sb, ino);

@@ -1625,76 +1664,185 @@ int scoutfs_drop_inode(struct inode *inode)
 	       generic_drop_inode(inode);
 }

-/*
- * Find orphan items and process each one.
- *
- * Runtime of this will be bounded by the number of orphans, which could
- * theoretically be very large. If that becomes a problem we might want to push
- * this work off to a thread.
- *
- * This only scans orphans for this node.  This will need to be covered by
- * the rest of node zone cleanup.
- */
-int scoutfs_scan_orphans(struct super_block *sb)
+static void iput_worker(struct work_struct *work)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_lock *lock = sbi->rid_lock;
-	struct scoutfs_lock *inode_lock = NULL;
-	struct scoutfs_key key;
+	struct inode_sb_info *inf = container_of(work, struct inode_sb_info, iput_work);
+	struct scoutfs_inode_info *si;
+	struct scoutfs_inode_info *tmp;
+	struct llist_node *inodes;
+	bool more;
+
+	inodes = llist_del_all(&inf->iput_llist);
+
+	llist_for_each_entry_safe(si, tmp, inodes, iput_llnode) {
+		do {
+			more = atomic_dec_return(&si->iput_count) > 0;
+			iput(&si->inode);
+		} while (more);
+	}
+}
+
+/*
+ * Final iput can get into evict and perform final inode deletion which
+ * can delete a lot of items spanning multiple cluster locks and
+ * transactions.  It should be understood as a heavy high level
+ * operation, more like file writing and less like dropping a refcount.
+ *
+ * Unfortunately we also have incentives to use igrab/iput from internal
+ * contexts that have no business doing that work, like lock
+ * invalidation or dirty inode writeback during transaction commit.
+ *
+ * In those cases we can kick iput off to background work context.
+ * Nothing stops multiple puts of an inode before the work runs so we
+ * can track multiple puts in flight.
+ */
+void scoutfs_inode_queue_iput(struct inode *inode)
+{
+	DECLARE_INODE_SB_INFO(inode->i_sb, inf);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
+
+	if (atomic_inc_return(&si->iput_count) == 1)
+		llist_add(&si->iput_llnode, &inf->iput_llist);
+	smp_wmb(); /* count and list visible before work executes */
+	schedule_work(&inf->iput_work);
+}
+
+/*
+ * Queue the next orphan scan unless inf->stopped has been set.  All
+ * mounts scan concurrently, so each delay is ORPHAN_SCAN_MIN_MS plus
+ * random jitter to keep them from bunching up on the same inodes.
+ */
+static void schedule_orphan_dwork(struct inode_sb_info *inf)
+{
+#define ORPHAN_SCAN_MIN_MS (10 * MSEC_PER_SEC)
+#define ORPHAN_SCAN_JITTER_MS (40 * MSEC_PER_SEC)
+	unsigned long delay;
+
+	if (!inf->stopped) {
+		delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS +
+					 prandom_u32_max(ORPHAN_SCAN_JITTER_MS));
+		schedule_delayed_work(&inf->orphan_scan_dwork, delay);
+	}
+}
+
+/*
+ * Find and delete inodes whose only remaining reference is the
+ * persistent orphan item that was created as they were unlinked.
+ *
+ * Orphan items are created as the final directory entry referring to an
+ * inode is deleted.  They're deleted as the final cached inode is
+ * evicted and the inode items are destroyed.  They can linger if all
+ * the cached inodes pinning the inode fail to delete as they are
+ * evicted from the cache -- either through crashing or errors.
+ *
+ * This work runs in all mounts in the background looking for orphaned
+ * inodes that should be deleted.
+ *
+ * We use the forest hint call to read the persistent forest trees
+ * looking for orphan items without creating lock contention.  Orphan
+ * items exist for O_TMPFILE users and we don't want to force them to
+ * commit by trying to acquire a conflicting read lock on the orphan zone.
+ * There's no rush to reclaim deleted items, eventually they will be
+ * found in the persistent item btrees.
+ *
+ * Once we find candidate orphan items we can first check our local
+ * inode cache for inodes that are already on their way to eviction and
+ * can be skipped.  Then we ask the server for the open map containing
+ * the inode.  Only if we don't have it cached, and no one else does, do
+ * we try and read it into our cache and evict it to trigger the final
+ * inode deletion process.
+ *
+ * Orphaned items that make it that far should be very rare.  They can
+ * only exist if all the mounts that were using an inode after it had
+ * been unlinked (or created with O_TMPFILE) didn't unmount cleanly.
+ */
+static void inode_orphan_scan_worker(struct work_struct *work)
+{
+	struct inode_sb_info *inf = container_of(work, struct inode_sb_info,
+						 orphan_scan_dwork.work);
+	struct super_block *sb = inf->sb;
+	struct scoutfs_open_ino_map omap;
 	struct scoutfs_key last;
+	struct scoutfs_key next;
+	struct scoutfs_key key;
+	struct inode *inode;
+	u64 group_nr;
+	int bit_nr;
 	u64 ino;
-	int err = 0;
 	int ret;

-	trace_scoutfs_scan_orphans(sb);
+	scoutfs_inc_counter(sb, orphan_scan);

-	init_orphan_key(&key, sbi->rid, 0);
-	init_orphan_key(&last, sbi->rid, ~0ULL);
+	init_orphan_key(&last, U64_MAX);
+	omap.args.group_nr = cpu_to_le64(U64_MAX);

-	while (1) {
-		ret = scoutfs_item_next(sb, &key, &last, NULL, 0, lock);
-		if (ret == -ENOENT) /* No more orphan items */
-			break;
-		if (ret < 0)
+	for (ino = SCOUTFS_ROOT_INO + 1; ino != 0; ino++) {
+		if (inf->stopped) {
+			ret = 0;
 			goto out;
-
-		ino = le64_to_cpu(key.sko_ino);
-
-		ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &inode_lock);
-		if (ret == 0) {
-			ret = delete_inode_items(sb, le64_to_cpu(key.sko_ino), inode_lock);
-			scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
 		}
-		if (ret && ret != -ENOENT && !err)
-			err = ret;

-		if (le64_to_cpu(key.sko_ino) == U64_MAX) {
-			ret = -ENOENT;
+		/* find the next orphan item */
+		init_orphan_key(&key, ino);
+		ret = scoutfs_forest_next_hint(sb, &key, &next);
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				break;
+			goto out;
+		}
+
+		if (scoutfs_key_compare(&next, &last) > 0)
 			break;
+
+		scoutfs_inc_counter(sb, orphan_scan_item);
+		ino = le64_to_cpu(next.sko_ino);
+
+		/* locally cached inodes will already be deleted */
+		inode = scoutfs_ilookup(sb, ino);
+		if (inode) {
+			scoutfs_inc_counter(sb, orphan_scan_cached);
+			iput(inode);
+			continue;
 		}
-		le64_add_cpu(&key.sko_ino, 1);
+
+		/* get an omap that covers the orphaned ino */
+		group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT;
+		bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK;
+
+		if (le64_to_cpu(omap.args.group_nr) != group_nr) {
+			ret = scoutfs_client_open_ino_map(sb, group_nr, &omap);
+			if (ret < 0)
+				goto out;
+		}
+
+		/* don't need to evict if someone else has it open (cached) */
+		if (test_bit_le(bit_nr, omap.bits)) {
+			scoutfs_inc_counter(sb, orphan_scan_omap_set);
+			continue;
+		}
+
+		/* try to cache and evict unused inode to delete, can be racing */
+		inode = scoutfs_iget(sb, ino, 0);
+		if (IS_ERR(inode)) {
+			ret = PTR_ERR(inode);
+			if (ret == -ENOENT)
+				continue;
+			else
+				goto out;
+		}
+
+		scoutfs_inc_counter(sb, orphan_scan_read);
+		SCOUTFS_I(inode)->drop_invalidated = true;
+		iput(inode);
 	}

 	ret = 0;
+
 out:
-	return err ? err : ret;
-}
+	if (ret < 0)
+		scoutfs_inc_counter(sb, orphan_scan_error);

-int scoutfs_orphan_inode(struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_lock *lock = sbi->rid_lock;
-	struct scoutfs_key key;
-	int ret;
-
-	trace_scoutfs_orphan_inode(sb, inode);
-
-	init_orphan_key(&key, sbi->rid, scoutfs_ino(inode));
-
-	ret = scoutfs_item_create(sb, &key, NULL, 0, lock);
-
-	return ret;
+	schedule_orphan_dwork(inf);
 }

 /*
@@ -1703,30 +1851,33 @@ int scoutfs_orphan_inode(struct inode *inode)
 * ourselves in knots trying to call through the high level vfs sync
 * methods.
 *
+ * File data block allocations tend to advance through free space so we
+ * add the inode to the end of the list to roughly encourage sequential
+ * IO.
+ *
 * This is called by writers who hold the inode and transaction.  The
- * inode's presence in the rbtree is removed by destroy_inode, prevented
- * by the inode hold, and by committing the transaction, which is
- * prevented by holding the transaction.  The inode can only go from
- * empty to on the rbtree while we're here.
+ * inode is removed from the list by evict->destroy if it's unlinked
+ * during the transaction or by committing the transaction.  Pruning the
+ * icache won't try to evict the inode as long as it has dirty buffers.
 */
 void scoutfs_inode_queue_writeback(struct inode *inode)
 {
 	DECLARE_INODE_SB_INFO(inode->i_sb, inf);
 	struct scoutfs_inode_info *si = SCOUTFS_I(inode);

-	if (RB_EMPTY_NODE(&si->writeback_node)) {
+	if (list_empty(&si->writeback_entry)) {
 		spin_lock(&inf->writeback_lock);
-		if (RB_EMPTY_NODE(&si->writeback_node))
-			insert_writeback_inode(inf, si);
+		if (list_empty(&si->writeback_entry))
+			list_add_tail(&si->writeback_entry, &inf->writeback_list);
 		spin_unlock(&inf->writeback_lock);
 	}
 }

 /*
- * Walk our dirty inodes in ino order and either start dirty page
- * writeback or wait for writeback to complete.
+ * Walk our dirty inodes and either start dirty page writeback or wait
+ * for writeback to complete.
 *
- * This is called by transaction commiting so other writers are
+ * This is called by transaction committing so other writers are
 * excluded.  We're still very careful to iterate over the tree while it
 * and the inodes could be changing.
 *
@@ -1739,29 +1890,19 @@ int scoutfs_inode_walk_writeback(struct super_block *sb, bool write)
 {
 	DECLARE_INODE_SB_INFO(sb, inf);
 	struct scoutfs_inode_info *si;
-	struct rb_node *node;
+	struct scoutfs_inode_info *tmp;
 	struct inode *inode;
-	struct inode *defer_iput = NULL;
 	int ret;

 	spin_lock(&inf->writeback_lock);

-	node = rb_first(&inf->writeback_inodes);
-	while (node) {
-		si = container_of(node, struct scoutfs_inode_info,
-				  writeback_node);
-		node = rb_next(node);
+	list_for_each_entry_safe(si, tmp, &inf->writeback_list, writeback_entry) {
 		inode = igrab(&si->inode);
 		if (!inode)
 			continue;

 		spin_unlock(&inf->writeback_lock);

-		if (defer_iput) {
-			iput(defer_iput);
-			defer_iput = NULL;
-		}
-
 		if (write)
 			ret = filemap_fdatawrite(inode->i_mapping);
 		else
@@ -1769,28 +1910,28 @@ int scoutfs_inode_walk_writeback(struct super_block *sb, bool write)
 		trace_scoutfs_inode_walk_writeback(sb, scoutfs_ino(inode),
 						   write, ret);
 		if (ret) {
-			iput(inode);
+			scoutfs_inode_queue_iput(inode);
 			goto out;
 		}

 		spin_lock(&inf->writeback_lock);

-		if (WARN_ON_ONCE(RB_EMPTY_NODE(&si->writeback_node)))
-			node = rb_first(&inf->writeback_inodes);
+		/* restore tmp after reacquiring lock */
+		if (WARN_ON_ONCE(list_empty(&si->writeback_entry)))
+			tmp = list_first_entry(&inf->writeback_list, struct scoutfs_inode_info,
+					       writeback_entry);
 		else
-			node = rb_next(&si->writeback_node);
+			tmp = list_next_entry(si, writeback_entry);

 		if (!write)
-			remove_writeback_inode(inf, si);
+			list_del_init(&si->writeback_entry);

-		/* avoid iput->destroy lock deadlock */
-		defer_iput = inode;
+		scoutfs_inode_queue_iput(inode);
 	}

 	spin_unlock(&inf->writeback_lock);
 out:
-	if (defer_iput)
-		iput(defer_iput);
+
 	return ret;
 }

@@ -1803,16 +1944,44 @@ int scoutfs_inode_setup(struct super_block *sb)
 	if (!inf)
 		return -ENOMEM;

+	inf->sb = sb;
 	spin_lock_init(&inf->writeback_lock);
-	inf->writeback_inodes = RB_ROOT;
+	INIT_LIST_HEAD(&inf->writeback_list);
 	spin_lock_init(&inf->dir_ino_alloc.lock);
 	spin_lock_init(&inf->ino_alloc.lock);
+	INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker);
+	spin_lock_init(&inf->deleting_items_lock);
+	INIT_LIST_HEAD(&inf->deleting_items_list);
+	INIT_WORK(&inf->iput_work, iput_worker);
+	init_llist_head(&inf->iput_llist);

 	sbi->inode_sb_info = inf;

 	return 0;
 }

+/*
+ * Our inode subsystem is set up pretty early but orphan scanning uses
+ * many other subsystems like networking and the server.  We only kick
+ * it off once everything is ready.
+ */
+void scoutfs_inode_start(struct super_block *sb)
+{
+	DECLARE_INODE_SB_INFO(sb, inf);
+
+	schedule_orphan_dwork(inf);
+}
+
+void scoutfs_inode_stop(struct super_block *sb)
+{
+	DECLARE_INODE_SB_INFO(sb, inf);
+
+	if (inf) {
+		inf->stopped = true;
+		cancel_delayed_work_sync(&inf->orphan_scan_dwork);
+	}
+}
+
 void scoutfs_inode_destroy(struct super_block *sb)
 {
 	struct inode_sb_info *inf = SCOUTFS_SB(sb)->inode_sb_info;
--- a/kmod/src/inode.h
+++ b/kmod/src/inode.h
@@ -49,14 +49,14 @@ struct scoutfs_inode_info {
 	struct scoutfs_per_task pt_data_lock;
 	struct scoutfs_data_waitq data_waitq;
 	struct rw_semaphore xattr_rwsem;
-	struct rb_node writeback_node;
+	struct list_head writeback_entry;

 	struct scoutfs_lock_coverage ino_lock_cov;

 	/* drop if i_count hits 0, allows drop while invalidate holds coverage */
 	bool drop_invalidated;
-	struct llist_node inv_iput_llnode;
-	atomic_t inv_iput_count;
+	struct llist_node iput_llnode;
+	atomic_t iput_count;

 	struct inode inode;
 };
@@ -75,9 +75,9 @@ struct inode *scoutfs_alloc_inode(struct super_block *sb);
 void scoutfs_destroy_inode(struct inode *inode);
 int scoutfs_drop_inode(struct inode *inode);
 void scoutfs_evict_inode(struct inode *inode);
-int scoutfs_orphan_inode(struct inode *inode);
+void scoutfs_inode_queue_iput(struct inode *inode);

-struct inode *scoutfs_iget(struct super_block *sb, u64 ino);
+struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf);
 struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino);

 void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major,
@@ -89,9 +89,9 @@ int scoutfs_inode_index_prepare_ino(struct super_block *sb,
 				    struct list_head *list, u64 ino,
 				    umode_t mode);
 int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
-				      struct list_head *list, u64 seq);
+				      struct list_head *list, u64 seq, bool allocing);
 int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
-				  bool set_data_seq);
+				  bool set_data_seq, bool allocing);
 void scoutfs_inode_index_unlock(struct super_block *sb, struct list_head *list);

 int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock);
@@ -120,9 +120,8 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		    struct kstat *stat);
 int scoutfs_setattr(struct dentry *dentry, struct iattr *attr);

-int scoutfs_scan_orphans(struct super_block *sb);
-int scoutfs_orphan_dirty(struct super_block *sb, u64 ino);
-int scoutfs_orphan_delete(struct super_block *sb, u64 ino);
+int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);
+int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock);

 void scoutfs_inode_queue_writeback(struct inode *inode);
 int scoutfs_inode_walk_writeback(struct super_block *sb, bool write);
@@ -133,6 +132,8 @@ void scoutfs_inode_exit(void);
 int scoutfs_inode_init(void);

 int scoutfs_inode_setup(struct super_block *sb);
+void scoutfs_inode_start(struct super_block *sb);
+void scoutfs_inode_stop(struct super_block *sb);
 void scoutfs_inode_destroy(struct super_block *sb);

 #endif
--- a/kmod/src/ioctl.c
+++ b/kmod/src/ioctl.c
@@ -38,6 +38,7 @@
 #include "hash.h"
 #include "srch.h"
 #include "alloc.h"
+#include "server.h"
 #include "scoutfs_trace.h"

 /*
@@ -674,7 +675,7 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)

 	/* setting only so we don't see 0 data seq with nonzero data_version */
 	set_data_seq = sm.data_version != 0 ? true : false;
-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq);
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq, false);
 	if (ret)
 		goto unlock;

@@ -866,28 +867,40 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg)
 {
 	struct super_block *sb = file_inode(file)->i_sb;
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_super_block *super = &sbi->super;
+	struct scoutfs_super_block *super;
 	struct scoutfs_ioctl_statfs_more sfm;
 	int ret;

 	if (get_user(sfm.valid_bytes, (__u64 __user *)arg))
 		return -EFAULT;

+	super = kzalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
+	if (!super)
+		return -ENOMEM;
+
+	ret = scoutfs_read_super(sb, super);
+	if (ret)
+		goto out;
+
 	sfm.valid_bytes = min_t(u64, sfm.valid_bytes,
 				sizeof(struct scoutfs_ioctl_statfs_more));
 	sfm.fsid = le64_to_cpu(super->hdr.fsid);
 	sfm.rid = sbi->rid;
 	sfm.total_meta_blocks = le64_to_cpu(super->total_meta_blocks);
 	sfm.total_data_blocks = le64_to_cpu(super->total_data_blocks);
+	sfm.reserved_meta_blocks = scoutfs_server_reserved_meta_blocks(sb);

 	ret = scoutfs_client_get_last_seq(sb, &sfm.committed_seq);
 	if (ret)
-		return ret;
+		goto out;

 	if (copy_to_user((void __user *)arg, &sfm, sfm.valid_bytes))
-		return -EFAULT;
-
-	return 0;
+		ret = -EFAULT;
+	else
+		ret = 0;
+out:
+	kfree(super);
+	return ret;
 }

 struct copy_alloc_detail_args {
@@ -991,6 +1004,37 @@ out:
 	return ret;
 }

+static long scoutfs_ioc_resize_devices(struct file *file, unsigned long arg)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+	struct scoutfs_ioctl_resize_devices __user *urd = (void __user *)arg;
+	struct scoutfs_ioctl_resize_devices rd;
+	struct scoutfs_net_resize_devices nrd;
+	int ret;
+
+	if (!(file->f_mode & FMODE_READ)) {
+		ret = -EBADF;
+		goto out;
+	}
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (copy_from_user(&rd, urd, sizeof(rd))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	nrd.new_total_meta_blocks = cpu_to_le64(rd.new_total_meta_blocks);
+	nrd.new_total_data_blocks = cpu_to_le64(rd.new_total_data_blocks);
+
+	ret = scoutfs_client_resize_devices(sb, &nrd);
+out:
+	return ret;
+}
+
 long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -1020,6 +1064,8 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return scoutfs_ioc_alloc_detail(file, arg);
 	case SCOUTFS_IOC_MOVE_BLOCKS:
 		return scoutfs_ioc_move_blocks(file, arg);
+	case SCOUTFS_IOC_RESIZE_DEVICES:
+		return scoutfs_ioc_resize_devices(file, arg);
 	}

 	return -ENOTTY;
--- a/kmod/src/ioctl.h
+++ b/kmod/src/ioctl.h
@@ -371,6 +371,7 @@ struct scoutfs_ioctl_statfs_more {
 	__u64 committed_seq;
 	__u64 total_meta_blocks;
 	__u64 total_data_blocks;
+	__u64 reserved_meta_blocks;
 };

 #define SCOUTFS_IOC_STATFS_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 10, \
@@ -476,4 +477,12 @@ struct scoutfs_ioctl_move_blocks {
 #define SCOUTFS_IOC_MOVE_BLOCKS _IOR(SCOUTFS_IOCTL_MAGIC, 13, \
 				     struct scoutfs_ioctl_move_blocks)

+struct scoutfs_ioctl_resize_devices {
+	__u64 new_total_meta_blocks;
+	__u64 new_total_data_blocks;
+};
+
+#define SCOUTFS_IOC_RESIZE_DEVICES \
+	_IOR(SCOUTFS_IOCTL_MAGIC, 14, struct scoutfs_ioctl_resize_devices)
+
 #endif
--- a/kmod/src/item.c
+++ b/kmod/src/item.c
@@ -95,7 +95,7 @@ struct item_cache_info {

 	/* written by page readers, read by shrink */
 	spinlock_t active_lock;
-	struct rb_root active_root;
+	struct list_head active_list;
 };

 #define DECLARE_ITEM_CACHE_INFO(sb, name) \
@@ -127,6 +127,7 @@ struct cached_page {
 	unsigned long lru_time;
 	struct list_head dirty_list;
 	struct list_head dirty_head;
+	u64 max_liv_seq;
 	struct page *page;
 	unsigned int page_off;
 	unsigned int erased_bytes;
@@ -149,7 +150,8 @@ struct cached_item {

 static int item_val_bytes(int val_len)
 {
-	return round_up(offsetof(struct cached_item, val[val_len]), CACHED_ITEM_ALIGN);
+	return round_up(offsetof(struct cached_item, val[val_len]),
+			CACHED_ITEM_ALIGN);
 }

 /*
@@ -345,7 +347,8 @@ static struct cached_page *alloc_pg(struct super_block *sb, gfp_t gfp)
 	page = alloc_page(GFP_NOFS | gfp);
 	if (!page || !pg) {
 		kfree(pg);
-		__free_page(page);
+		if (page)
+			__free_page(page);
 		return NULL;
 	}

@@ -383,6 +386,14 @@ static void put_pg(struct super_block *sb, struct cached_page *pg)
 	}
 }

+static void update_pg_max_liv_seq(struct cached_page *pg, struct cached_item *item)
+{
+	u64 liv_seq = le64_to_cpu(item->liv.seq);
+
+	if (liv_seq > pg->max_liv_seq)
+		pg->max_liv_seq = liv_seq;
+}
+
 /*
 * Allocate space for a new item from the free offset at the end of a
 * cached page.  This isn't a blocking allocation, and it's likely that
@@ -414,14 +425,15 @@ static struct cached_item *alloc_item(struct cached_page *pg,
 	if (val_len)
 		memcpy(item->val, val, val_len);

+	update_pg_max_liv_seq(pg, item);
+
 	return item;
 }

 static void erase_item(struct cached_page *pg, struct cached_item *item)
 {
 	rbtree_erase(&item->node, &pg->item_root);
-	pg->erased_bytes += round_up(item_val_bytes(item->val_len),
-				     CACHED_ITEM_ALIGN);
+	pg->erased_bytes += item_val_bytes(item->val_len);
 }

 static void lru_add(struct super_block *sb, struct item_cache_info *cinf,
@@ -621,6 +633,8 @@ static void mark_item_dirty(struct super_block *sb,
 		list_add_tail(&item->dirty_head, &pg->dirty_list);
 		item->dirty = 1;
 	}
+
+	update_pg_max_liv_seq(pg, item);
 }

 static void clear_item_dirty(struct super_block *sb,
@@ -852,8 +866,7 @@ static void compact_page_items(struct super_block *sb,

 	for (from = first_item(&pg->item_root); from; from = next_item(from)) {
 		to = page_address(empty->page) + page_off;
-		page_off += round_up(item_val_bytes(from->val_len),
-				     CACHED_ITEM_ALIGN);
+		page_off += item_val_bytes(from->val_len);

 		/* copy the entire item, struct members and all */
 		memcpy(to, from, item_val_bytes(from->val_len));
@@ -1260,46 +1273,76 @@ static int cache_empty_page(struct super_block *sb,
 	return 0;
 }

+/*
+ * Readers operate independently from dirty items and transactions.
+ * They read a set of persistent items and insert them into the cache
+ * when there aren't already pages whose key range contains the items.
+ * This naturally prefers cached dirty items over stale read items.
+ *
+ * We have to deal with the case where dirty items are written and
+ * invalidated while a read is in flight.   The reader won't have seen
+ * the items that were dirty in their persistent roots as they started
+ * reading.  By the time they insert their read pages the previously
+ * dirty items have been reclaimed and are not in the cache.  The old
+ * stale items will be inserted in their place, effectively corrupting
+ * by having the dirty items disappear.
+ *
+ * We fix this by tracking the max seq of items in pages.  As readers
+ * start they record the current transaction seq.  Invalidation skips
+ * pages with a max seq greater than or equal to the first reader seq
+ * because the items in the page have to stick around to prevent the
+ * readers' stale items from being inserted.
+ *
+ * This naturally only affects a small set of pages with items that were
+ * written relatively recently.  If we're in memory pressure then we
+ * probably have a lot of pages and they'll naturally have items that
+ * were visible to any readers.  We don't bother with the complicated and
+ * expensive further refinement of tracking the ranges that are being
+ * read and comparing those with pages to invalidate.
+ */
 struct active_reader {
-	struct rb_node node;
-	struct scoutfs_key start;
-	struct scoutfs_key end;
+	struct list_head head;
+	u64 seq;
 };

-static struct active_reader *active_rbtree_walk(struct rb_root *root,
-						struct scoutfs_key *start,
-						struct scoutfs_key *end,
-						struct rb_node **par,
-						struct rb_node ***pnode)
+#define INIT_ACTIVE_READER(rdr) \
+	struct active_reader rdr = { .head = LIST_HEAD_INIT(rdr.head) }
+
+static void add_active_reader(struct super_block *sb, struct active_reader *active)
+{
+	DECLARE_ITEM_CACHE_INFO(sb, cinf);
+
+	BUG_ON(!list_empty(&active->head));
+
+	active->seq = scoutfs_trans_sample_seq(sb);
+
+	spin_lock(&cinf->active_lock);
+	list_add_tail(&active->head, &cinf->active_list);
+	spin_unlock(&cinf->active_lock);
+}
+
+static u64 first_active_reader_seq(struct item_cache_info *cinf)
 {
-	struct rb_node **node = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct active_reader *ret = NULL;
 	struct active_reader *active;
-	int cmp;
+	u64 first;

-	while (*node) {
-		parent = *node;
-		active = container_of(*node, struct active_reader, node);
+	/* sample the seq of the reader at the head of the list, if any */
+	spin_lock(&cinf->active_lock);
+	active = list_first_entry_or_null(&cinf->active_list, struct active_reader, head);
+	first = active ? active->seq : U64_MAX;
+	spin_unlock(&cinf->active_lock);

-		cmp = scoutfs_key_compare_ranges(start, end, &active->start,
-						 &active->end);
-		if (cmp < 0) {
-			node = &(*node)->rb_left;
-		} else if (cmp > 0) {
-			node = &(*node)->rb_right;
-		} else {
-			ret = active;
-			node = &(*node)->rb_left;
-		}
+	return first;
+}
+
+static void del_active_reader(struct item_cache_info *cinf, struct active_reader *active)
+{
+	/* only the calling task adds or deletes this active */
+	if (!list_empty(&active->head)) {
+		spin_lock(&cinf->active_lock);
+		list_del_init(&active->head);
+		spin_unlock(&cinf->active_lock);
 	}
-
-	if (par)
-		*par = parent;
-	if (pnode)
-		*pnode = node;
-
-	return ret;
 }

 /*
@@ -1308,10 +1351,10 @@ static struct active_reader *active_rbtree_walk(struct rb_root *root,
 * on our root and aren't in dirty or lru lists.
 *
 * We need to store deletion items here as we read items from all the
- * btrees so that they can override older versions of the items.  The
- * deletion items will be deleted before we insert the pages into the
- * cache.  We don't insert old versions of items into the tree here so
- * that the trees don't have to compare versions.
+ * btrees so that they can override older items.  The deletion items
+ * will be deleted before we insert the pages into the cache.  We don't
+ * insert old versions of items into the tree here so that the trees
+ * don't have to compare seqs.
 */
 static int read_page_item(struct super_block *sb, struct scoutfs_key *key,
 			  struct scoutfs_log_item_value *liv, void *val,
@@ -1331,7 +1374,7 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key,

 	pg = page_rbtree_walk(sb, root, key, key, NULL, NULL, &p_par, &p_pnode);
 	found = item_rbtree_walk(&pg->item_root, key, NULL, &par, &pnode);
-	if (found && (le64_to_cpu(found->liv.vers) >= le64_to_cpu(liv->vers)))
+	if (found && (le64_to_cpu(found->liv.seq) >= le64_to_cpu(liv->seq)))
 		return 0;

 	if (!page_has_room(pg, val_len)) {
@@ -1399,22 +1442,15 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key,
 * locks held, but without locking the cache.  The regions we read can
 * be stale with respect to the current cache, which can be read and
 * dirtied by other cluster lock holders on our node, but the cluster
- * locks protect the stable items we read.
- *
- * There's also the exciting case where a reader can populate the cache
- * with stale old persistent data which was read before another local
- * cluster lock holder was able to read, dirty, write, and then shrink
- * the cache.  In this case the cache couldn't be cleared by lock
- * invalidation because the caller is actively holding the lock.  But
- * shrinking could evict the cache within the held lock.  So we record
- * that we're an active reader in the range covered by the lock and
- * shrink will refuse to reclaim any pages that intersect with our read.
+ * locks protect the stable items we read.  Invalidation is careful not
+ * to drop pages that have items that we couldn't see because they were
+ * dirty when we started reading.
 */
 static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
 		      struct scoutfs_key *key, struct scoutfs_lock *lock)
 {
 	struct rb_root root = RB_ROOT;
-	struct active_reader active;
+	INIT_ACTIVE_READER(active);
 	struct cached_page *right = NULL;
 	struct cached_page *pg;
 	struct cached_page *rd;
@@ -1430,15 +1466,6 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
 	int pgi;
 	int ret;

-	/* stop shrink from freeing new clean data, would let us cache stale */
-	active.start = lock->start;
-	active.end = lock->end;
-	spin_lock(&cinf->active_lock);
-	active_rbtree_walk(&cinf->active_root, &active.start, &active.end,
-		           &par, &pnode);
-	rbtree_insert(&active.node, par, pnode, &cinf->active_root);
-	spin_unlock(&cinf->active_lock);
-
 	/* start with an empty page that covers the whole lock */
 	pg = alloc_pg(sb, 0);
 	if (!pg) {
@@ -1449,6 +1476,9 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf,
 	pg->end = lock->end;
 	rbtree_insert(&pg->node, NULL, &root.rb_node, &root);

+	/* set active reader seq before reading persistent roots */
+	add_active_reader(sb, &active);
+
 	ret = scoutfs_forest_read_items(sb, lock, key, &start, &end,
 				       read_page_item, &root);
 	if (ret < 0)
@@ -1526,9 +1556,7 @@ retry:

 	ret = 0;
 out:
-	spin_lock(&cinf->active_lock);
-	rbtree_erase(&active.node, &cinf->active_root);
-	spin_unlock(&cinf->active_lock);
+	del_active_reader(cinf, &active);

 	/* free any pages we left dangling on error */
 	for_each_page_safe(&root, rd, pg_tmp) {
@@ -1783,6 +1811,21 @@ out:
 	return ret;
 }

+/*
+ * An item's seq is the greater of the client transaction's seq and the
+ * lock's write_seq.  This ensures that multiple commits in one lock
+ * grant will have increasing seqs, and new locks in open commits will
+ * also increase the seqs.  It lets us limit the inputs of item merging
+ * to the last stable seq and ensure that all the items in open
+ * transactions and granted locks will have greater seqs.
+ */
+static __le64 item_seq(struct super_block *sb, struct scoutfs_lock *lock)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+
+	return cpu_to_le64(max(sbi->trans_seq, lock->write_seq));
+}
+
 /*
 * Mark the item dirty.  Dirtying while holding a transaction pins the
 * page holding the item and guarantees that the item can be deleted or
@@ -1815,8 +1858,8 @@ int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key,
 	if (!item || item->deletion) {
 		ret = -ENOENT;
 	} else {
+		item->liv.seq = item_seq(sb, lock);
 		mark_item_dirty(sb, cinf, pg, NULL, item);
-		item->liv.vers = cpu_to_le64(lock->write_version);
 		ret = 0;
 	}

@@ -1836,7 +1879,7 @@ static int item_create(struct super_block *sb, struct scoutfs_key *key,
 {
 	DECLARE_ITEM_CACHE_INFO(sb, cinf);
 	struct scoutfs_log_item_value liv = {
-		.vers = cpu_to_le64(lock->write_version),
+		.seq = item_seq(sb, lock),
 	};
 	struct cached_item *found;
 	struct cached_item *item;
@@ -1911,7 +1954,7 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key,
 {
 	DECLARE_ITEM_CACHE_INFO(sb, cinf);
 	struct scoutfs_log_item_value liv = {
-		.vers = cpu_to_le64(lock->write_version),
+		.seq = item_seq(sb, lock),
 	};
 	struct cached_item *item;
 	struct cached_item *found;
@@ -1944,9 +1987,10 @@ int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key,
 		if (val_len)
 			memcpy(found->val, val, val_len);
 		if (val_len < found->val_len)
-			pg->erased_bytes += found->val_len - val_len;
+			pg->erased_bytes += item_val_bytes(found->val_len) -
+					    item_val_bytes(val_len);
 		found->val_len = val_len;
-		found->liv.vers = liv.vers;
+		found->liv.seq = liv.seq;
 		mark_item_dirty(sb, cinf, pg, NULL, found);
 	} else {
 		item = alloc_item(pg, key, &liv, val, val_len);
@@ -1978,7 +2022,7 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key,
 {
 	DECLARE_ITEM_CACHE_INFO(sb, cinf);
 	struct scoutfs_log_item_value liv = {
-		.vers = cpu_to_le64(lock->write_version),
+		.seq = item_seq(sb, lock),
 	};
 	struct cached_item *item;
 	struct cached_page *pg;
@@ -2020,10 +2064,11 @@ static int item_delete(struct super_block *sb, struct scoutfs_key *key,
 		erase_item(pg, item);
 	} else {
 		/* must emit deletion to clobber old persistent item */
-		item->liv.vers = cpu_to_le64(lock->write_version);
+		item->liv.seq = liv.seq;
 		item->liv.flags |= SCOUTFS_LOG_ITEM_FLAG_DELETION;
 		item->deletion = 1;
-		pg->erased_bytes += item->val_len;
+		pg->erased_bytes += item_val_bytes(item->val_len) -
+				    item_val_bytes(0);
 		item->val_len = 0;
 		mark_item_dirty(sb, cinf, pg, NULL, item);
 	}
@@ -2106,7 +2151,7 @@ int scoutfs_item_write_dirty(struct super_block *sb)
 	struct page *page;
 	LIST_HEAD(pages);
 	LIST_HEAD(pos);
-	u64 max_vers = 0;
+	u64 max_seq = 0;
 	int val_len;
 	int bytes;
 	int off;
@@ -2171,7 +2216,7 @@ int scoutfs_item_write_dirty(struct super_block *sb)
 			val_len = sizeof(item->liv) + item->val_len;
 			bytes = offsetof(struct scoutfs_btree_item_list,
 					 val[val_len]);
-			max_vers = max(max_vers, le64_to_cpu(item->liv.vers));
+			max_seq = max(max_seq, le64_to_cpu(item->liv.seq));

 			if (off + bytes > PAGE_SIZE) {
 				page = second;
@@ -2201,8 +2246,8 @@ int scoutfs_item_write_dirty(struct super_block *sb)
 		read_unlock(&pg->rwlock);
 	}

-	/* store max item vers in forest's log_trees */
-	scoutfs_forest_set_max_vers(sb, max_vers);
+	/* store max item seq in forest's log_trees */
+	scoutfs_forest_set_max_seq(sb, max_seq);

 	/* write all the dirty items into log btree blocks */
 	ret = scoutfs_forest_insert_list(sb, first);
@@ -2389,9 +2434,9 @@ retry:

 /*
 * Shrink the size the item cache.  We're operating against the fast
- * path lock ordering and we skip pages if we can't acquire locks.
- * Similarly, we can run into dirty pages or pages which intersect with
- * active readers that we can't shrink and also choose to skip.
+ * path lock ordering and we skip pages if we can't acquire locks.  We
+ * can run into dirty pages or pages with items that weren't visible to
+ * the earliest active reader which must be skipped.
 */
 static int item_lru_shrink(struct shrinker *shrink,
 			   struct shrink_control *sc)
@@ -2400,26 +2445,24 @@ static int item_lru_shrink(struct shrinker *shrink,
 						    struct item_cache_info,
 						    shrinker);
 	struct super_block *sb = cinf->sb;
-	struct active_reader *active;
 	struct cached_page *tmp;
 	struct cached_page *pg;
+	u64 first_reader_seq;
 	int nr;

 	if (sc->nr_to_scan == 0)
 		goto out;
 	nr = sc->nr_to_scan;

+	/* can't invalidate pages with items that weren't visible to first reader */
+	first_reader_seq = first_active_reader_seq(cinf);
+
 	write_lock(&cinf->rwlock);
 	spin_lock(&cinf->lru_lock);

 	list_for_each_entry_safe(pg, tmp, &cinf->lru_list, lru_head) {

-		/* can't invalidate ranges being read, reader might be stale */
-		spin_lock(&cinf->active_lock);
-		active = active_rbtree_walk(&cinf->active_root, &pg->start,
-					    &pg->end, NULL, NULL);
-		spin_unlock(&cinf->active_lock);
-		if (active) {
+		if (first_reader_seq <= pg->max_liv_seq) {
 			scoutfs_inc_counter(sb, item_shrink_page_reader);
 			continue;
 		}
@@ -2488,7 +2531,7 @@ int scoutfs_item_setup(struct super_block *sb)
 	spin_lock_init(&cinf->lru_lock);
 	INIT_LIST_HEAD(&cinf->lru_list);
 	spin_lock_init(&cinf->active_lock);
-	cinf->active_root = RB_ROOT;
+	INIT_LIST_HEAD(&cinf->active_list);

 	cinf->pcpu_pages = alloc_percpu(struct item_percpu_pages);
 	if (!cinf->pcpu_pages)
@@ -2519,7 +2562,7 @@ void scoutfs_item_destroy(struct super_block *sb)
 	int cpu;

 	if (cinf) {
-		BUG_ON(!RB_EMPTY_ROOT(&cinf->active_root));
+		BUG_ON(!list_empty(&cinf->active_list));

 		unregister_hotcpu_notifier(&cinf->notifier);
 		unregister_shrinker(&cinf->shrinker);
--- a/kmod/src/key.h
+++ b/kmod/src/key.h
@@ -108,6 +108,16 @@ static inline void scoutfs_key_set_ones(struct scoutfs_key *key)
 	memset(key->__pad, 0, sizeof(key->__pad));
 }

+static inline bool scoutfs_key_is_ones(struct scoutfs_key *key)
+{
+	return key->sk_zone == U8_MAX &&
+	       key->_sk_first == cpu_to_le64(U64_MAX) &&
+	       key->sk_type == U8_MAX &&
+	       key->_sk_second == cpu_to_le64(U64_MAX) &&
+	       key->_sk_third == cpu_to_le64(U64_MAX) &&
+	       key->_sk_fourth == U8_MAX;
+}
+
 /*
 * Return a -1/0/1 comparison of keys.
 *
--- a/kmod/src/lock.c
+++ b/kmod/src/lock.c
@@ -66,6 +66,8 @@
 * relative to that lock state we resend.
 */

+#define GRACE_PERIOD_KT	ms_to_ktime(10)
+
 /*
 * allocated per-super, freed on unmount.
 */
@@ -82,13 +84,11 @@ struct lock_info {
 	struct workqueue_struct *workq;
 	struct work_struct grant_work;
 	struct list_head grant_list;
-	struct work_struct inv_work;
+	struct delayed_work inv_dwork;
 	struct list_head inv_list;
 	struct work_struct shrink_work;
 	struct list_head shrink_list;
 	atomic64_t next_refresh_gen;
-	struct work_struct inv_iput_work;
-	struct llist_head inv_iput_llist;

 	struct dentry *tseq_dentry;
 	struct scoutfs_tseq_tree tseq_tree;
@@ -124,34 +124,6 @@ static bool lock_modes_match(int granted, int requested)
 		requested == SCOUTFS_LOCK_READ);
 }

-/*
- * Final iput can get into evict and perform final inode deletion which
- * can delete a lot of items under locks and transactions.  We really
- * don't want to be doing all that in an iput during invalidation.  When
- * invalidation sees that iput might perform final deletion it puts them
- * on a list and queues this work.
- *
- * Nothing stops multiple puts for multiple invalidations of an inode
- * before the work runs so we can track multiple puts in flight.
- */
-static void lock_inv_iput_worker(struct work_struct *work)
-{
-	struct lock_info *linfo = container_of(work, struct lock_info, inv_iput_work);
-	struct scoutfs_inode_info *si;
-	struct scoutfs_inode_info *tmp;
-	struct llist_node *inodes;
-	bool more;
-
-	inodes = llist_del_all(&linfo->inv_iput_llist);
-
-	llist_for_each_entry_safe(si, tmp, inodes, inv_iput_llnode) {
-		do {
-			more = atomic_dec_return(&si->inv_iput_count) > 0;
-			iput(&si->inode);
-		} while (more);
-	}
-}
-
 /*
 * Invalidate cached data associated with an inode whose lock is going
 * away.
@@ -192,11 +164,8 @@ static void invalidate_inode(struct super_block *sb, u64 ino)
 		if (scoutfs_lock_is_covered(sb, &si->ino_lock_cov) && inode->i_nlink > 0) {
 			iput(inode);
 		} else {
-			/* defer iput to work context so we don't evict inodes from invalidation */ 
-			if (atomic_inc_return(&si->inv_iput_count) == 1)
-				llist_add(&si->inv_iput_llnode, &linfo->inv_iput_llist);
-			smp_wmb(); /* count and list visible before work executes */
-			queue_work(linfo->workq, &linfo->inv_iput_work);
+			/* defer iput to work context so we don't evict inodes from invalidation */
+			scoutfs_inode_queue_iput(inode);
 		}
 	}
 }
@@ -362,6 +331,23 @@ static bool lock_counts_match(int granted, unsigned int *counts)
 	return true;
 }

+/*
+ * Returns true if there are any mode counts that match with the desired
+ * mode.  There can be other non-matching counts as well but we're only
+ * testing for the existence of any matching counts.
+ */
+static bool lock_count_match_exists(int desired, unsigned int *counts)
+{
+	enum scoutfs_lock_mode mode;
+
+	for (mode = 0; mode < SCOUTFS_LOCK_NR_MODES; mode++) {
+		if (counts[mode] && lock_modes_match(desired, mode))
+			return true;
+	}
+
+	return false;
+}
+
 /*
 * An idle lock has nothing going on.  It can be present in the lru and
 * can be freed by the final put when it has a null mode.
@@ -578,6 +564,24 @@ static void put_lock(struct lock_info *linfo,struct scoutfs_lock *lock)
 	}
 }

+/*
+ * Locks have a grace period that extends after activity and prevents
+ * invalidation.  It's intended to let nodes do reasonable batches of
+ * work as locks ping pong between nodes that are doing conflicting
+ * work.
+ */
+static void extend_grace(struct super_block *sb, struct scoutfs_lock *lock)
+{
+	ktime_t now = ktime_get();
+
+	if (ktime_after(now, lock->grace_deadline))
+		scoutfs_inc_counter(sb, lock_grace_set);
+	else
+		scoutfs_inc_counter(sb, lock_grace_extended);
+
+	lock->grace_deadline = ktime_add(now, GRACE_PERIOD_KT);
+}
+
 static void queue_grant_work(struct lock_info *linfo)
 {
 	assert_spin_locked(&linfo->lock);
@@ -587,15 +591,19 @@ static void queue_grant_work(struct lock_info *linfo)
 }

 /*
- * The caller has made a change (set a lock mode) which can let one of the
- * invalidating locks make forward progress.
+ * We immediately queue work on the assumption that the caller might
+ * have made a change (set a lock mode) which can let one of the
+ * invalidating locks make forward progress, even if other locks are
+ * waiting for their grace period to elapse.  It's a trade-off between
+ * invalidation latency and burning cpu repeatedly finding that locks
+ * are still in their grace period.
 */
 static void queue_inv_work(struct lock_info *linfo)
 {
 	assert_spin_locked(&linfo->lock);

 	if (!list_empty(&linfo->inv_list))
-		queue_work(linfo->workq, &linfo->inv_work);
+		mod_delayed_work(linfo->workq, &linfo->inv_dwork, 0);
 }

 /*
@@ -648,6 +656,15 @@ static void bug_on_inconsistent_grant_cache(struct super_block *sb,
 * Grant responses can be reordered with incoming invalidation requests
 * from the server so we have to be careful to only set the new mode
 * once the old mode matches.
+ *
+ * We extend the grace period as we grant the lock if there is a waiting
+ * locker who can use the lock.  This stops invalidation from pulling
+ * the granted lock out from under the requester, resulting in a lot of
+ * churn with no forward progress.  Using the grace period avoids having
+ * to identify a specific waiter and give it an acquired lock.  It's
+ * also very similar to waking up the locker and having it win the race
+ * against the invalidation.  In that case they'd extend the grace
+ * period anyway as they unlock.
 */
 static void lock_grant_worker(struct work_struct *work)
 {
@@ -680,7 +697,10 @@ static void lock_grant_worker(struct work_struct *work)

 		lock->request_pending = 0;
 		lock->mode = nl->new_mode;
-		lock->write_version = le64_to_cpu(nl->write_version);
+		lock->write_seq = le64_to_cpu(nl->write_seq);
+
+		if (lock_count_match_exists(nl->new_mode, lock->waiters))
+			extend_grace(sb, lock);

 		trace_scoutfs_lock_granted(sb, lock);
 		list_del_init(&lock->grant_head);
@@ -745,6 +765,11 @@ int scoutfs_lock_grant_response(struct super_block *sb,
 * invalidate once the lock mode matches what the server told us to
 * invalidate.
 *
+ * We delay invalidation processing until a grace period has elapsed
+ * since the last unlock.  The intent is to let users do a reasonable
+ * batch of work before dropping the lock.  Continuous unlocking can
+ * continuously extend the deadline.
+ *
 * Before we start invalidating the lock we set the lock to the new
 * mode, preventing further incompatible users of the old mode from
 * using the lock while we're invalidating.
@@ -756,11 +781,15 @@ int scoutfs_lock_grant_response(struct super_block *sb,
 */
 static void lock_invalidate_worker(struct work_struct *work)
 {
-	struct lock_info *linfo = container_of(work, struct lock_info, inv_work);
+	struct lock_info *linfo = container_of(work, struct lock_info,
+					       inv_dwork.work);
 	struct super_block *sb = linfo->sb;
 	struct scoutfs_net_lock *nl;
 	struct scoutfs_lock *lock;
 	struct scoutfs_lock *tmp;
+	unsigned long delay = MAX_JIFFY_OFFSET;
+	ktime_t now = ktime_get();
+	ktime_t deadline;
 	LIST_HEAD(ready);
 	u64 net_id;
 	int ret;
@@ -780,6 +809,15 @@ static void lock_invalidate_worker(struct work_struct *work)
 		if (!lock_counts_match(nl->new_mode, lock->users))
 			continue;

+		/* skip if grace hasn't elapsed, record earliest */
+		deadline = lock->grace_deadline;
+		if (!linfo->shutdown && ktime_before(now, deadline)) {
+			delay = min(delay,
+				    nsecs_to_jiffies(ktime_to_ns(
+						ktime_sub(deadline, now))));
+			scoutfs_inc_counter(linfo->sb, lock_grace_wait);
+			continue;
+		}
 		/* set the new mode, no incompatible users during inval */
 		lock->mode = nl->new_mode;

@@ -790,7 +828,7 @@ static void lock_invalidate_worker(struct work_struct *work)
 	spin_unlock(&linfo->lock);

 	if (list_empty(&ready))
-		return;
+		goto out;

 	/* invalidate once the lock is read */
 	list_for_each_entry(lock, &ready, inv_head) {
@@ -833,6 +871,11 @@ static void lock_invalidate_worker(struct work_struct *work)
 	/* grant might have been waiting for invalidate request */
 	queue_grant_work(linfo);
 	spin_unlock(&linfo->lock);
+
+out:
+	/* queue delayed work if invalidations waiting on grace deadline */
+	if (delay != MAX_JIFFY_OFFSET)
+		queue_delayed_work(linfo->workq, &linfo->inv_dwork, delay);
 }

 /*
@@ -912,7 +955,7 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
 	for (i = 0; lock && i < SCOUTFS_NET_LOCK_MAX_RECOVER_NR; i++) {

 		nlr->locks[i].key = lock->start;
-		nlr->locks[i].write_version = cpu_to_le64(lock->write_version);
+		nlr->locks[i].write_seq = cpu_to_le64(lock->write_seq);
 		nlr->locks[i].old_mode = lock->mode;
 		nlr->locks[i].new_mode = lock->mode;

@@ -1052,8 +1095,14 @@ static int lock_key_range(struct super_block *sb, enum scoutfs_lock_mode mode, i

 		trace_scoutfs_lock_wait(sb, lock);

-		ret = wait_event_interruptible(lock->waitq,
-					       lock_wait_cond(sb, lock, mode));
+		if (flags & SCOUTFS_LKF_INTERRUPTIBLE) {
+			ret = wait_event_interruptible(lock->waitq,
+						       lock_wait_cond(sb, lock, mode));
+		} else {
+			wait_event(lock->waitq, lock_wait_cond(sb, lock, mode));
+			ret = 0;
+		}
+
 		spin_lock(&linfo->lock);
 		if (ret)
 			break;
@@ -1271,33 +1320,36 @@ int scoutfs_lock_inode_index(struct super_block *sb, enum scoutfs_lock_mode mode
 }

 /*
- * The rid lock protects a mount's private persistent items in the rid
- * zone.  It's held for the duration of the mount.  It lets the mount
- * modify the rid items at will and signals to other mounts that we're
- * still alive and our rid items shouldn't be reclaimed.
+ * Orphan items are stored in their own zone which are modified with
+ * shared write_only locks and are read inconsistently without locks by
+ * background scanning work.
 *
- * Being held for the entire mount prevents other nodes from reclaiming
- * our items, like free blocks, when it would make sense for them to be
- * able to.  Maybe we have a bunch free and they're trying to allocate
- * and are getting ENOSPC.
+ * Since we only use write_only locks we just lock the entire zone, but
+ * the api provides the inode in case we ever change the locking scheme.
 */
-int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
-		     u64 rid, struct scoutfs_lock **lock)
+int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,
+			struct scoutfs_lock **lock)
 {
 	struct scoutfs_key start;
 	struct scoutfs_key end;

 	scoutfs_key_set_zeros(&start);
-	start.sk_zone = SCOUTFS_RID_ZONE;
-	start.sko_rid = cpu_to_le64(rid);
+	start.sk_zone = SCOUTFS_ORPHAN_ZONE;
+	start.sko_ino = 0;
+	start.sk_type = SCOUTFS_ORPHAN_TYPE;

-	scoutfs_key_set_ones(&end);
-	end.sk_zone = SCOUTFS_RID_ZONE;
-	end.sko_rid = cpu_to_le64(rid);
+	scoutfs_key_set_zeros(&end);
+	end.sk_zone = SCOUTFS_ORPHAN_ZONE;
+	end.sko_ino = cpu_to_le64(U64_MAX);
+	end.sk_type = SCOUTFS_ORPHAN_TYPE;

 	return lock_key_range(sb, mode, flags, &start, &end, lock);
 }

+/*
+ * As we unlock we always extend the grace period to give the caller
+ * another pass at the lock before it's invalidated.
+ */
 void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock, enum scoutfs_lock_mode mode)
 {
 	DECLARE_LOCK_INFO(sb, linfo);
@@ -1310,6 +1362,7 @@ void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock, enum scou
 	spin_lock(&linfo->lock);

 	lock_dec_count(lock->users, mode);
+	extend_grace(sb, lock);
 	if (lock_mode_can_write(mode))
 		lock->dirty_trans_seq = scoutfs_trans_sample_seq(sb);

@@ -1549,7 +1602,7 @@ void scoutfs_lock_unmount_begin(struct super_block *sb)

 	if (linfo) {
 		linfo->unmounting = true;
-		flush_work(&linfo->inv_work);
+		flush_delayed_work(&linfo->inv_dwork);
 	}
 }

@@ -1642,6 +1695,8 @@ void scoutfs_lock_destroy(struct super_block *sb)
 	spin_unlock(&linfo->lock);

 	if (linfo->workq) {
+		/* pending grace work queues normal work */
+		flush_workqueue(linfo->workq);
 		/* now all work won't queue itself */
 		destroy_workqueue(linfo->workq);
 	}
@@ -1702,13 +1757,11 @@ int scoutfs_lock_setup(struct super_block *sb)
 	INIT_LIST_HEAD(&linfo->lru_list);
 	INIT_WORK(&linfo->grant_work, lock_grant_worker);
 	INIT_LIST_HEAD(&linfo->grant_list);
-	INIT_WORK(&linfo->inv_work, lock_invalidate_worker);
+	INIT_DELAYED_WORK(&linfo->inv_dwork, lock_invalidate_worker);
 	INIT_LIST_HEAD(&linfo->inv_list);
 	INIT_WORK(&linfo->shrink_work, lock_shrink_worker);
 	INIT_LIST_HEAD(&linfo->shrink_list);
 	atomic64_set(&linfo->next_refresh_gen, 0);
-	INIT_WORK(&linfo->inv_iput_work, lock_inv_iput_worker);
-	init_llist_head(&linfo->inv_iput_llist);
 	scoutfs_tseq_tree_init(&linfo->tseq_tree, lock_tseq_show);

 	sbi->lock_info = linfo;
--- a/kmod/src/lock.h
+++ b/kmod/src/lock.h
@@ -6,14 +6,15 @@

 #define SCOUTFS_LKF_REFRESH_INODE	0x01 /* update stale inode from item */
 #define SCOUTFS_LKF_NONBLOCK		0x02 /* only use already held locks */
-#define SCOUTFS_LKF_INVALID		(~((SCOUTFS_LKF_NONBLOCK << 1) - 1))
+#define SCOUTFS_LKF_INTERRUPTIBLE	0x04 /* pending signals return -ERESTARTSYS */
+#define SCOUTFS_LKF_INVALID		(~((SCOUTFS_LKF_INTERRUPTIBLE << 1) - 1))

 #define SCOUTFS_LOCK_NR_MODES		SCOUTFS_LOCK_INVALID

 struct scoutfs_omap_lock;

 /*
- * A few fields (start, end, refresh_gen, write_version, granted_mode)
+ * A few fields (start, end, refresh_gen, write_seq, granted_mode)
 * are referenced by code outside lock.c.
 */
 struct scoutfs_lock {
@@ -23,10 +24,11 @@ struct scoutfs_lock {
 	struct rb_node node;
 	struct rb_node range_node;
 	u64 refresh_gen;
-	u64 write_version;
+	u64 write_seq;
 	u64 dirty_trans_seq;
 	struct list_head lru_head;
 	wait_queue_head_t waitq;
+	ktime_t grace_deadline;
 	unsigned long request_pending:1,
 		      invalidate_pending:1;

@@ -84,8 +86,8 @@ int scoutfs_lock_inodes(struct super_block *sb, enum scoutfs_lock_mode mode, int
 			struct inode *d, struct scoutfs_lock **D_lock);
 int scoutfs_lock_rename(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
 			struct scoutfs_lock **lock);
-int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
-		     u64 rid, struct scoutfs_lock **lock);
+int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
+		        u64 ino, struct scoutfs_lock **lock);
 void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock,
 		    enum scoutfs_lock_mode mode);

--- a/kmod/src/lock_server.c
+++ b/kmod/src/lock_server.c
@@ -78,13 +78,9 @@ struct lock_server_info {

 	struct scoutfs_tseq_tree tseq_tree;
 	struct dentry *tseq_dentry;
-	struct scoutfs_tseq_tree stats_tseq_tree;
-	struct dentry *stats_tseq_dentry;

 	struct scoutfs_alloc *alloc;
 	struct scoutfs_block_writer *wri;
-
-	atomic64_t write_version;
 };

 #define DECLARE_LOCK_SERVER_INFO(sb, name) \
@@ -111,9 +107,6 @@ struct server_lock_node {
 	struct list_head granted;
 	struct list_head requested;
 	struct list_head invalidated;
-
-	struct scoutfs_tseq_entry stats_tseq_entry;
-	u64 stats[SLT_NR];
 };

 /*
@@ -303,8 +296,6 @@ static struct server_lock_node *alloc_server_lock(struct lock_server_info *inf,
 			snode = get_server_lock(inf, key, ins, false);
 			if (snode != ins)
 				kfree(ins);
-			else
-				scoutfs_tseq_add(&inf->stats_tseq_tree, &snode->stats_tseq_entry);
 		}
 	}

@@ -334,10 +325,8 @@ static void put_server_lock(struct lock_server_info *inf,

 	mutex_unlock(&snode->mutex);

-	if (should_free) {
-		scoutfs_tseq_del(&inf->stats_tseq_tree, &snode->stats_tseq_entry);
+	if (should_free)
 		kfree(snode);
-	}
 }

 static struct client_lock_entry *find_entry(struct server_lock_node *snode,
@@ -399,8 +388,6 @@ int scoutfs_lock_server_request(struct super_block *sb, u64 rid,
 		goto out;
 	}

-	snode->stats[SLT_REQUEST]++;
-
 	clent->snode = snode;
 	add_client_entry(snode, &snode->requested, clent);
 	scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry);
@@ -441,8 +428,6 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 rid,
 		goto out;
 	}

-	snode->stats[SLT_RESPONSE]++;
-
 	clent = find_entry(snode, &snode->invalidated, rid);
 	if (!clent) {
 		put_server_lock(inf, snode);
@@ -492,14 +477,14 @@ static int process_waiting_requests(struct super_block *sb,
 	struct client_lock_entry *req_tmp;
 	struct client_lock_entry *gr;
 	struct client_lock_entry *gr_tmp;
-	u64 wv;
+	u64 seq;
 	int ret;

 	BUG_ON(!mutex_is_locked(&snode->mutex));

 	/* processing waits for all invalidation responses or recovery */
 	if (!list_empty(&snode->invalidated) ||
-	    scoutfs_recov_next_pending(sb, SCOUTFS_RECOV_LOCKS) != 0) {
+	    scoutfs_recov_next_pending(sb, 0, SCOUTFS_RECOV_LOCKS) != 0) {
 		ret = 0;
 		goto out;
 	}
@@ -523,7 +508,6 @@ static int process_waiting_requests(struct super_block *sb,
 			trace_scoutfs_lock_message(sb, SLT_SERVER,
 						   SLT_INVALIDATE, SLT_REQUEST,
 						   gr->rid, 0, &nl);
-			snode->stats[SLT_INVALIDATE]++;

 			add_client_entry(snode, &snode->invalidated, gr);
 		}
@@ -534,6 +518,7 @@ static int process_waiting_requests(struct super_block *sb,

 		nl.key = snode->key;
 		nl.new_mode = req->mode;
+		nl.write_seq = 0;

 		/* see if there's an existing compatible grant to replace */
 		gr = find_entry(snode, &snode->granted, req->rid);
@@ -546,8 +531,9 @@ static int process_waiting_requests(struct super_block *sb,

 		if (nl.new_mode == SCOUTFS_LOCK_WRITE ||
 		    nl.new_mode == SCOUTFS_LOCK_WRITE_ONLY) {
-			wv = atomic64_inc_return(&inf->write_version);
-			nl.write_version = cpu_to_le64(wv);
+			/* doesn't commit seq update, recovered with locks */
+			seq = scoutfs_server_next_seq(sb);
+			nl.write_seq = cpu_to_le64(seq);
 		}

 		ret = scoutfs_server_lock_response(sb, req->rid,
@@ -558,7 +544,6 @@ static int process_waiting_requests(struct super_block *sb,
 		trace_scoutfs_lock_message(sb, SLT_SERVER, SLT_GRANT,
 					   SLT_RESPONSE, req->rid,
 					   req->net_id, &nl);
-		snode->stats[SLT_GRANT]++;

 		/* don't track null client locks, track all else */ 
 		if (req->mode == SCOUTFS_LOCK_NULL)
@@ -624,14 +609,6 @@ int scoutfs_lock_server_finished_recovery(struct super_block *sb)
 	return ret;
 }

-static void set_max_write_version(struct lock_server_info *inf, u64 new)
-{
-	u64 old;
-
-	while (new > (old = atomic64_read(&inf->write_version)) &&
-	       (atomic64_cmpxchg(&inf->write_version, old, new) != old));
-}
-
 /*
 * We sent a lock recover request to the client when we received its
 * greeting while in recovery.  Here we instantiate all the locks it
@@ -695,9 +672,9 @@ int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid,

 		put_server_lock(inf, snode);

-		/* make sure next write lock is greater than all recovered */
-		set_max_write_version(inf,
-				le64_to_cpu(nlr->locks[i].write_version));
+		/* make sure next core seq is greater than all lock write seq */
+		scoutfs_server_set_seq_if_greater(sb,
+				le64_to_cpu(nlr->locks[i].write_seq));
 	}

 	/* send request for next batch of keys */
@@ -809,23 +786,13 @@ static void lock_server_tseq_show(struct seq_file *m,
 		   clent->net_id);
 }

-static void stats_tseq_show(struct seq_file *m, struct scoutfs_tseq_entry *ent)
-{
-	struct server_lock_node *snode = container_of(ent, struct server_lock_node,
-						      stats_tseq_entry);
-
-	seq_printf(m, SK_FMT" req %llu inv %llu rsp %llu gr %llu\n",
-		   SK_ARG(&snode->key), snode->stats[SLT_REQUEST], snode->stats[SLT_INVALIDATE],
-		   snode->stats[SLT_RESPONSE], snode->stats[SLT_GRANT]);
-}
-
 /*
 * Setup the lock server.  This is called before networking can deliver
 * requests.
 */
 int scoutfs_lock_server_setup(struct super_block *sb,
 			      struct scoutfs_alloc *alloc,
-			      struct scoutfs_block_writer *wri, u64 max_vers)
+			      struct scoutfs_block_writer *wri)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct lock_server_info *inf;
@@ -838,10 +805,8 @@ int scoutfs_lock_server_setup(struct super_block *sb,
 	spin_lock_init(&inf->lock);
 	inf->locks_root = RB_ROOT;
 	scoutfs_tseq_tree_init(&inf->tseq_tree, lock_server_tseq_show);
-	scoutfs_tseq_tree_init(&inf->stats_tseq_tree, stats_tseq_show);
 	inf->alloc = alloc;
 	inf->wri = wri;
-	atomic64_set(&inf->write_version, max_vers); /* inc_return gives +1 */

 	inf->tseq_dentry = scoutfs_tseq_create("server_locks", sbi->debug_root,
 					       &inf->tseq_tree);
@@ -850,14 +815,6 @@ int scoutfs_lock_server_setup(struct super_block *sb,
 		return -ENOMEM;
 	}

-	inf->stats_tseq_dentry = scoutfs_tseq_create("tmp_lock_stats", sbi->debug_root,
-					       &inf->stats_tseq_tree);
-	if (!inf->stats_tseq_dentry) {
-		debugfs_remove(inf->tseq_dentry);
-		kfree(inf);
-		return -ENOMEM;
-	}
-
 	sbi->lock_server_info = inf;

 	return 0;
@@ -879,7 +836,6 @@ void scoutfs_lock_server_destroy(struct super_block *sb)

 	if (inf) {
 		debugfs_remove(inf->tseq_dentry);
-		debugfs_remove(inf->stats_tseq_dentry);

 		rbtree_postorder_for_each_entry_safe(snode, stmp,
 						     &inf->locks_root, node) {
--- a/kmod/src/lock_server.h
+++ b/kmod/src/lock_server.h
@@ -13,7 +13,7 @@ int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid);

 int scoutfs_lock_server_setup(struct super_block *sb,
 			      struct scoutfs_alloc *alloc,
-			      struct scoutfs_block_writer *wri, u64 max_vers);
+			      struct scoutfs_block_writer *wri);
 void scoutfs_lock_server_destroy(struct super_block *sb);

 #endif
--- a/kmod/src/net.c
+++ b/kmod/src/net.c
@@ -30,6 +30,7 @@
 #include "net.h"
 #include "endian_swap.h"
 #include "tseq.h"
+#include "fence.h"

 /*
 * scoutfs networking delivers requests and responses between nodes.
@@ -330,6 +331,9 @@ static int submit_send(struct super_block *sb,
 	    WARN_ON_ONCE(id == 0 && (flags & SCOUTFS_NET_FLAG_RESPONSE)))
 		return -EINVAL;

+	if (scoutfs_forcing_unmount(sb))
+		return -EIO;
+
 	msend = kmalloc(offsetof(struct message_send,
 				 nh.data[data_len]), GFP_NOFS);
 	if (!msend)
@@ -420,6 +424,16 @@ static int process_request(struct scoutfs_net_connection *conn,
 			mrecv->nh.data, le16_to_cpu(mrecv->nh.data_len));
 }

+static int call_resp_func(struct super_block *sb, struct scoutfs_net_connection *conn,
+			  scoutfs_net_response_t resp_func, void *resp_data,
+			  void *resp, unsigned int resp_len, int error)
+{
+	/* a NULL resp_func means there is nothing to call, report success */
+	if (!resp_func)
+		return 0;
+	return resp_func(sb, conn, resp, resp_len, error, resp_data);
+}
+
 /*
 * An incoming response finds the queued request and calls its response
 * function.  The response function for a given request will only be
@@ -434,7 +448,6 @@ static int process_response(struct scoutfs_net_connection *conn,
 	struct message_send *msend;
 	scoutfs_net_response_t resp_func = NULL;
 	void *resp_data;
-	int ret = 0;

 	spin_lock(&conn->lock);

@@ -449,11 +462,8 @@ static int process_response(struct scoutfs_net_connection *conn,

 	spin_unlock(&conn->lock);

-	if (resp_func)
-		ret = resp_func(sb, conn, mrecv->nh.data,
-				le16_to_cpu(mrecv->nh.data_len),
-				net_err_to_host(mrecv->nh.error), resp_data);
-	return ret;
+	return call_resp_func(sb, conn, resp_func, resp_data, mrecv->nh.data,
+			      le16_to_cpu(mrecv->nh.data_len), net_err_to_host(mrecv->nh.error));
 }

 /*
@@ -823,9 +833,15 @@ static void scoutfs_net_destroy_worker(struct work_struct *work)
 	if (conn->listening_conn && conn->notify_down)
 		conn->notify_down(sb, conn, conn->info, conn->rid);

-	/* free all messages, refactor and complete for forced unmount? */
+	/*
+	 * Usually networking is idle and we destroy pending sends, but when forcing unmount
+	 * we may have to wake up waiters by failing pending sends.
+	 */
 	list_splice_init(&conn->resend_queue, &conn->send_queue);
 	list_for_each_entry_safe(msend, tmp, &conn->send_queue, head) {
+		if (scoutfs_forcing_unmount(sb))
+			call_resp_func(sb, conn, msend->resp_func, msend->resp_data,
+				       NULL, 0, -ECONNABORTED);
 		free_msend(ninf, msend);
 	}

@@ -925,6 +941,8 @@ static int sock_opts_and_names(struct scoutfs_net_connection *conn,
 		ret = -EAFNOSUPPORT;
 	if (ret)
 		goto out;
+
+	conn->last_peername = conn->peername;
 out:
 	return ret;
 }
@@ -1205,6 +1223,7 @@ static void scoutfs_net_reconn_free_worker(struct work_struct *work)
 	unsigned long now = jiffies;
 	unsigned long deadline = 0;
 	bool requeue = false;
+	int ret;

 	trace_scoutfs_net_reconn_free_work_enter(sb, 0, 0);

@@ -1218,10 +1237,18 @@ restart:
 		     time_after_eq(now, acc->reconn_deadline))) {
 			set_conn_fl(acc, reconn_freeing);
 			spin_unlock(&conn->lock);
-			if (!test_conn_fl(conn, shutting_down))
-				scoutfs_info(sb, "client timed out "SIN_FMT" -> "SIN_FMT", can not reconnect",
-					     SIN_ARG(&acc->sockname),
-					     SIN_ARG(&acc->peername));
+			if (!test_conn_fl(conn, shutting_down)) {
+				scoutfs_info(sb, "client "SIN_FMT" reconnect timed out, fencing",
+					     SIN_ARG(&acc->last_peername));
+				ret = scoutfs_fence_start(sb, acc->rid,
+						acc->last_peername.sin_addr.s_addr,
+						SCOUTFS_FENCE_CLIENT_RECONNECT);
+				if (ret) {
+					scoutfs_err(sb, "client fence returned err %d, shutting down server",
+						    ret);
+					scoutfs_server_abort(sb);
+				}
+			}
 			destroy_conn(acc);
 			goto restart;
 		}
@@ -1292,6 +1319,7 @@ scoutfs_net_alloc_conn(struct super_block *sb,
 	init_waitqueue_head(&conn->waitq);
 	conn->sockname.sin_family = AF_INET;
 	conn->peername.sin_family = AF_INET;
+	conn->last_peername.sin_family = AF_INET;
 	INIT_LIST_HEAD(&conn->accepted_head);
 	INIT_LIST_HEAD(&conn->accepted_list);
 	conn->next_send_seq = 1;
@@ -1458,8 +1486,7 @@ int scoutfs_net_connect(struct super_block *sb,
 			struct scoutfs_net_connection *conn,
 			struct sockaddr_in *sin, unsigned long timeout_ms)
 {
-	int error = 0;
-	int ret;
+	int ret = 0;

 	spin_lock(&conn->lock);
 	conn->connect_sin = *sin;
@@ -1467,10 +1494,8 @@ int scoutfs_net_connect(struct super_block *sb,
 	spin_unlock(&conn->lock);

 	queue_work(conn->workq, &conn->connect_work);
-
-	ret = wait_event_interruptible(conn->waitq,
-				       connect_result(conn, &error));
-	return ret ?: error;
+	wait_event(conn->waitq, connect_result(conn, &ret));
+	return ret;
 }

 static void set_valid_greeting(struct scoutfs_net_connection *conn)
@@ -1606,10 +1631,10 @@ restart:
 		conn->next_send_id = reconn->next_send_id;
 		atomic64_set(&conn->recv_seq, atomic64_read(&reconn->recv_seq));

-		/* greeting response/ack will be on conn send queue */
+		/* reconn should be idle while in reconn_wait  */
 		BUG_ON(!list_empty(&reconn->send_queue));
-		BUG_ON(!list_empty(&conn->resend_queue));
-		list_splice_init(&reconn->resend_queue, &conn->resend_queue);
+		/* queued greeting response is racing, can be in send or resend queue */
+		list_splice_tail_init(&reconn->resend_queue, &conn->resend_queue);

 		/* new conn info is unused, swap, old won't call down */
 		swap(conn->info, reconn->info);
@@ -1773,11 +1798,10 @@ int scoutfs_net_sync_request(struct super_block *sb,
 	ret = scoutfs_net_submit_request(sb, conn, cmd, arg, arg_len,
 					 sync_response, &sreq, &id);

-	ret = wait_for_completion_interruptible(&sreq.comp);
-	if (ret == -ERESTARTSYS)
-		scoutfs_net_cancel_request(sb, conn, cmd, id);
-	else
+	if (ret == 0) {
+		wait_for_completion(&sreq.comp);
 		ret = sreq.error;
+	}

 	return ret;
 }
--- a/kmod/src/net.h
+++ b/kmod/src/net.h
@@ -49,6 +49,7 @@ struct scoutfs_net_connection {
 	u64 greeting_id;
 	struct sockaddr_in sockname;
 	struct sockaddr_in peername;
+	struct sockaddr_in last_peername;

 	struct list_head accepted_head;
 	struct scoutfs_net_connection *listening_conn;
@@ -99,6 +100,16 @@ static inline void scoutfs_addr_to_sin(struct sockaddr_in *sin,
 	sin->sin_port = cpu_to_be16(le16_to_cpu(addr->v4.port));
 }

+static inline void scoutfs_sin_to_addr(union scoutfs_inet_addr *addr, struct sockaddr_in *sin)
+{
+	BUG_ON(sin->sin_family != AF_INET);
+	/* clear the whole union first, then fill in only the v4 fields */
+	memset(addr, 0, sizeof(*addr));
+	addr->v4.family = cpu_to_le16(SCOUTFS_AF_IPV4);
+	addr->v4.addr = be32_to_le32(sin->sin_addr.s_addr);
+	addr->v4.port = be16_to_le16(sin->sin_port);
+}
+
 struct scoutfs_net_connection *
 scoutfs_net_alloc_conn(struct super_block *sb,
 		       scoutfs_net_notify_t notify_up,
--- a/kmod/src/omap.c
+++ b/kmod/src/omap.c
@@ -137,11 +137,10 @@ struct omap_request {
 /*
 * In each inode group cluster lock we store data to track the open ino
 * map which tracks all the inodes that the cluster lock covers.  When
- * the version shows that the map is stale we send a request to update
- * it.
+ * the seq shows that the map is stale we send a request to update it.
 */
 struct scoutfs_omap_lock_data {
-	u64 version;
+	u64 seq;
 	bool req_in_flight;
 	wait_queue_head_t waitq;
 	struct scoutfs_open_ino_map map;
@@ -485,6 +484,10 @@ static int remove_rid_from_reqs(struct omap_info *ominf, u64 rid, u64 *resp_rid,
 * response if it was the last rid waiting for a response.
 *
 * If this returns an error then the server will shut down.
+ *
+ * This can be called multiple times by different servers if there are
+ * errors reclaiming an evicted mount, so we allow asking to remove a
+ * rid that hasn't been added.
 */
 int scoutfs_omap_remove_rid(struct super_block *sb, u64 rid)
 {
@@ -495,21 +498,20 @@ int scoutfs_omap_remove_rid(struct super_block *sb, u64 rid)
 	u64 resp_id = 0;
 	int ret;

-	map = kmalloc(sizeof(struct scoutfs_open_ino_map), GFP_NOFS);
-	if (!map) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	spin_lock(&ominf->lock);
 	entry = find_rid(&ominf->rids, rid);
 	if (entry)
 		free_rid(&ominf->rids, entry);
 	spin_unlock(&ominf->lock);

-	/* the server really shouldn't be removing a rid it never added */
-	if (WARN_ON_ONCE(!entry)) {
-		ret = -ENOENT;
+	if (!entry) {
+		ret = 0;
+		goto out;
+	}
+
+	map = kmalloc(sizeof(struct scoutfs_open_ino_map), GFP_NOFS);
+	if (!map) {
+		ret = -ENOMEM;
 		goto out;
 	}

@@ -593,10 +595,6 @@ out:
 		free_req(req);
 	}

-	/* it's fine if we couldn't send to a client that left */
-	if (ret == -ENOTCONN)
-		ret = 0;
-
 	return ret;
 }

@@ -616,7 +614,7 @@ static int handle_requests(struct super_block *sb)
 	int ret;
 	int err;

-	if (scoutfs_recov_next_pending(sb, SCOUTFS_RECOV_GREETING))
+	if (scoutfs_recov_next_pending(sb, 0, SCOUTFS_RECOV_GREETING))
 		return 0;

 	ret = 0;
@@ -830,8 +828,7 @@ static bool omap_req_in_flight(struct scoutfs_lock *lock, struct scoutfs_omap_lo
 /*
 * Make sure the map covered by the cluster lock is current.  The caller
 * holds the cluster lock so once we store lock_data on the cluster lock
- * it won't be freed and the write_version in the cluster lock won't
- * change.
+ * it won't be freed and the write_seq in the cluster lock won't change.
 *
 * The omap_spinlock protects the omap_data in the cluster lock.  We
 * have to drop it if we have to block to allocate lock_data, send a
@@ -858,7 +855,7 @@ static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lo
 		}

 		if (lock->omap_data == NULL) {
-			ldata->version = lock->write_version - 1; /* ensure refresh */
+			ldata->seq = lock->write_seq - 1; /* ensure refresh */
 			init_waitqueue_head(&ldata->waitq);

 			lock->omap_data = ldata;
@@ -868,7 +865,7 @@ static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lo
 		}
 	}

-	while (ldata->version != lock->write_version) {
+	while (ldata->seq != lock->write_seq) {
 		/* only one waiter sends a request at a time */
 		if (!ldata->req_in_flight) {
 			ldata->req_in_flight = true;
@@ -888,7 +885,7 @@ static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lo
 		if (send_req) {
 			ldata->req_in_flight = false;
 			if (ret == 0)
-				ldata->version = lock->write_version;
+				ldata->seq = lock->write_seq;
 			wake_up(&ldata->waitq);
 			if (ret < 0)
 				goto out;
@@ -907,9 +904,9 @@ out:
 }

 /*
- * Return 1 and give the caller a write inode lock if it is safe to be
- * deleted.  It's safe to be deleted when it is no longer reachable and
- * nothing is referencing it.
+ * Return 1 and give the caller their locks when they should delete the
+ * inode items.  It's safe to delete the inode items when it is no
+ * longer reachable and nothing is referencing it.
 *
 * The inode is unreachable when nlink hits zero.  Cluster locks protect
 * modification and testing of nlink.  We use the ino_lock_cov covrage
@@ -924,15 +921,17 @@ out:
 * increase nlink from zero and let people get a reference to the inode.
 */
 int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
-			       struct scoutfs_lock **lock_ret)
+			       struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret)
 {
 	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
+	struct scoutfs_lock *orph_lock = NULL;
 	struct scoutfs_lock *lock = NULL;
 	const u64 ino = scoutfs_ino(inode);
 	struct scoutfs_omap_lock_data *ldata;
 	u64 group_nr;
 	int bit_nr;
 	int ret;
+	int err;

 	/* lock group and omap constants are defined independently */
 	BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR);
@@ -963,12 +962,19 @@ int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
 out:
 	trace_scoutfs_omap_should_delete(sb, ino, inode->i_nlink, ret);

+	if (ret > 0) {
+		err = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock);
+		if (err < 0)
+			ret = err;
+	}
+
 	if (ret <= 0) {
 		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
 		lock = NULL;
 	}

 	*lock_ret = lock;
+	*orph_lock_ret = orph_lock;
 	return ret;
 }

--- a/kmod/src/omap.h
+++ b/kmod/src/omap.h
@@ -4,7 +4,7 @@
 int scoutfs_omap_inc(struct super_block *sb, u64 ino);
 void scoutfs_omap_dec(struct super_block *sb, u64 ino);
 int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode,
-			       struct scoutfs_lock **lock_ret);
+			       struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret);
 void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata);
 int scoutfs_omap_client_handle_request(struct super_block *sb, u64 id,
 				       struct scoutfs_open_ino_map_args *args);
--- a/kmod/src/quorum.c
+++ b/kmod/src/quorum.c
@@ -32,6 +32,7 @@
 #include "block.h"
 #include "net.h"
 #include "sysfs.h"
+#include "fence.h"
 #include "scoutfs_trace.h"

 /*
@@ -60,10 +61,9 @@
 * running (maybe they've deadlocked, or lost network communications).
 * In addition to a configuration slot in the super block, each quorum
 * member also has a known block location that represents their slot.
- * They set a flag in their block indicating that they've been elected
- * leader, then read slots for all the other blocks looking for
- * previously active leaders to fence.  After that it can start the
- * server.
+ * The block contains an array of events which are updated during the
+ * lifetime of the quorum agent.  The elected leader sets its elected
+ * event and can then start the server.
 *
 * It's critical to raft elections that a participant's term not go
 * backwards in time so each mount also uses its quorum block to store
@@ -97,7 +97,7 @@ struct quorum_host_msg {

 struct last_msg {
 	struct quorum_host_msg msg;
-	struct timespec64 ts;
+	ktime_t ts;
 };

 enum quorum_role { FOLLOWER, CANDIDATE, LEADER };
@@ -209,7 +209,7 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
 	DECLARE_QUORUM_INFO(sb, qinf);
 	struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
-	struct timespec64 ts;
+	ktime_t now;
 	int i;

 	struct scoutfs_quorum_message qmes = {
@@ -235,7 +235,6 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,

 	qmes.crc = quorum_message_crc(&qmes);

-	ts = ktime_to_timespec64(ktime_get());

 	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
 		if (!quorum_slot_present(super, i) ||
@@ -243,12 +242,13 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
 			continue;

 		scoutfs_quorum_slot_sin(super, i, &sin);
+		now = ktime_get();
 		kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);

 		spin_lock(&qinf->show_lock);
 		qinf->last_send[i].msg.term = term;
 		qinf->last_send[i].msg.type = type;
-		qinf->last_send[i].ts = ts;
+		qinf->last_send[i].ts = now;
 		spin_unlock(&qinf->show_lock);

 		if (i == only)
@@ -308,6 +308,8 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
 	if (ret < 0)
 		return ret;

+	now = ktime_get();
+
 	if (ret != sizeof(qmes) ||
 	    qmes.crc != quorum_message_crc(&qmes) ||
 	    qmes.fsid != super->hdr.fsid ||
@@ -327,24 +329,25 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,

 	spin_lock(&qinf->show_lock);
 	qinf->last_recv[msg->from].msg = *msg;
-	qinf->last_recv[msg->from].ts = ktime_to_timespec64(ktime_get());
+	qinf->last_recv[msg->from].ts = now;
 	spin_unlock(&qinf->show_lock);

 	return 0;
 }

 /*
- * The caller can provide a mark that they're using to track their
- * written blocks.  It's updated as they write the block and we can
- * compare it with what we read to see if there have been unexpected
- * intervening writes to the block -- the caller is supposed to have
- * exclusive access to the block (or was fenced).
+ * Read and verify block fields before giving it to the caller.  We
+ * should have exclusive write access to the block.  We know that
+ * something has gone horribly wrong if we don't see our rid in the
+ * begin event after we've written it as we started up.
 */
-static int read_quorum_block(struct super_block *sb, u64 blkno,
-			     struct scoutfs_quorum_block *blk, __le64 *mark)
+static int read_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_quorum_block *blk,
+			     bool check_rid)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
+	const u64 rid = sbi->rid;
+	char msg[150];
 	__le32 crc;
 	int ret;

@@ -355,165 +358,209 @@ static int read_quorum_block(struct super_block *sb, u64 blkno,

 	ret = scoutfs_block_read_sm(sb, sbi->meta_bdev, blkno,
 				     &blk->hdr, sizeof(*blk), &crc);
+	if (ret < 0) {
+		scoutfs_err(sb, "quorum block read error %d", ret);
+		goto out;
+	}

 	/* detect invalid blocks */
-	if (ret == 0 &&
-	    ((blk->hdr.crc != crc) ||
-	     (le32_to_cpu(blk->hdr.magic) != SCOUTFS_BLOCK_MAGIC_QUORUM) ||
-	     (blk->hdr.fsid != super->hdr.fsid) ||
-	     (le64_to_cpu(blk->hdr.blkno) != blkno))) {
-		scoutfs_inc_counter(sb, quorum_read_invalid_block);
+	if (blk->hdr.crc != crc)
+		snprintf(msg, sizeof(msg), "blk crc %08x != %08x",
+			 le32_to_cpu(blk->hdr.crc), le32_to_cpu(crc));
+	else if (le32_to_cpu(blk->hdr.magic) != SCOUTFS_BLOCK_MAGIC_QUORUM) 
+		snprintf(msg, sizeof(msg), "blk magic %08x != %08x",
+			 le32_to_cpu(blk->hdr.magic), SCOUTFS_BLOCK_MAGIC_QUORUM);
+	else if (blk->hdr.fsid != super->hdr.fsid)
+		snprintf(msg, sizeof(msg), "blk fsid %016llx != %016llx",
+			 le64_to_cpu(blk->hdr.fsid), le64_to_cpu(super->hdr.fsid));
+	else if (le64_to_cpu(blk->hdr.blkno) != blkno)
+		snprintf(msg, sizeof(msg), "blk blkno %llu != %llu",
+			 le64_to_cpu(blk->hdr.blkno), blkno);
+	else if (check_rid && le64_to_cpu(blk->events[SCOUTFS_QUORUM_EVENT_BEGIN].rid) != rid)
+		snprintf(msg, sizeof(msg), "quorum block begin rid %016llx != our rid %016llx, are multiple mounts configured with this slot?",
+		le64_to_cpu(blk->events[SCOUTFS_QUORUM_EVENT_BEGIN].rid), rid);
+	else
+		msg[0] = '\0';
+
+	if (msg[0] != '\0') {
+		scoutfs_err(sb, "read invalid quorum block, %s", msg);
 		ret = -EIO;
+		goto out;
 	}

-	if (mark && *mark != 0 && blk->random_write_mark != *mark) {
-		scoutfs_err(sb, "read unexpected quorum block write mark, are multiple mounts configured with the same slot?");
-		ret = -EIO;
-	}
-
-	if (ret < 0)
-		scoutfs_err(sb, "quorum block read error %d", ret);
-
+out:
 	return ret;
 }

-static void set_quorum_block_event(struct super_block *sb,
-				   struct scoutfs_quorum_block *blk,
-				   struct scoutfs_quorum_block_event *ev)
+static void set_quorum_block_event(struct super_block *sb, struct scoutfs_quorum_block *blk,
+				   int event, u64 term)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_quorum_block_event *ev;
 	struct timespec64 ts;

+	if (WARN_ON_ONCE(event < 0 || event >= SCOUTFS_QUORUM_EVENT_NR))
+		return;
+
 	getnstimeofday64(&ts);

+	ev = &blk->events[event];
 	ev->rid = cpu_to_le64(sbi->rid);
+	ev->term = cpu_to_le64(term);
 	ev->ts.sec = cpu_to_le64(ts.tv_sec);
 	ev->ts.nsec = cpu_to_le32(ts.tv_nsec);
 }

-/*
- * Every time we write a block we update the write stamp and random
- * write mark so readers can see our write.
- */
-static int write_quorum_block(struct super_block *sb, u64 blkno,
-			      struct scoutfs_quorum_block *blk, __le64 *mark)
+static int write_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_quorum_block *blk)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	int ret;

 	if (WARN_ON_ONCE(blkno < SCOUTFS_QUORUM_BLKNO) ||
 	    WARN_ON_ONCE(blkno >= (SCOUTFS_QUORUM_BLKNO +
 				   SCOUTFS_QUORUM_BLOCKS)))
 		return -EINVAL;

-	do {
-		get_random_bytes(&blk->random_write_mark,
-				 sizeof(blk->random_write_mark));
-	} while (blk->random_write_mark == 0);
-
-	if (mark)
-		*mark = blk->random_write_mark;
-
-	set_quorum_block_event(sb, blk, &blk->write);
-
-	ret = scoutfs_block_write_sm(sb, sbi->meta_bdev, blkno,
-				      &blk->hdr, sizeof(*blk));
-	if (ret < 0)
-		scoutfs_err(sb, "quorum block write error %d", ret);
-
-	return ret;
+	return scoutfs_block_write_sm(sb, sbi->meta_bdev, blkno, &blk->hdr, sizeof(*blk));
 }

 /*
- * Read the caller's slot's current quorum block, make a change, and
- * write it back out.  If the caller provides a mark it can cause read
- * errors if we read a mark that doesn't match the last mark that the
- * caller wrote.
+ * Read the caller's slot's quorum block, make a change, and write it
+ * back out.
 */
-static int update_quorum_block(struct super_block *sb, u64 blkno,
-			       __le64 *mark, int role, u64 term)
+static int update_quorum_block(struct super_block *sb, int event, u64 term, bool check_rid)
 {
+	struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
+	u64 blkno = SCOUTFS_QUORUM_BLKNO + opts->quorum_slot_nr;
 	struct scoutfs_quorum_block blk;
-	u64 flags;
-	u64 bits;
-	u64 set;
 	int ret;

-	ret = read_quorum_block(sb, blkno, &blk, mark);
+	ret = read_quorum_block(sb, blkno, &blk, check_rid);
 	if (ret == 0) {
-		if (blk.term != cpu_to_le64(term)) {
-			blk.term = cpu_to_le64(term);
-			set_quorum_block_event(sb, &blk, &blk.update_term);
-		}
-
-		flags = le64_to_cpu(blk.flags);
-		bits = SCOUTFS_QUORUM_BLOCK_LEADER;
-		set = role == LEADER ? SCOUTFS_QUORUM_BLOCK_LEADER : 0;
-		if ((flags & bits) != set)
-			set_quorum_block_event(sb, &blk,
-					       set ? &blk.set_leader :
-					             &blk.clear_leader);
-		blk.flags = cpu_to_le64((flags & ~bits) | set);
-
-		ret = write_quorum_block(sb, blkno, &blk, mark);
+		set_quorum_block_event(sb, &blk, event, term);
+		ret = write_quorum_block(sb, blkno, &blk);
+		if (ret < 0)
+			scoutfs_err(sb, "error %d writing quorum block %llu after updating event %d term %llu",
+				    ret, blkno, event, term);
+	} else {
+		scoutfs_err(sb, "error %d reading quorum block %llu to update event %d term %llu",
+			    ret, blkno, event, term);
 	}

 	return ret;
 }

+/*
+ * The calling server has fenced previous leaders and reclaimed their
+ * resources.  We can now update our fence event with a greater term to
+ * stop future leaders from doing the same.
+ */
+int scoutfs_quorum_fence_complete(struct super_block *sb, u64 term)
+{
+	return update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_FENCE, term, true);
+}

 /*
- * The calling server has been elected and updated their block, but
- * can't yet assume that it has exclusive access to the metadata device.
- * We read all the quorum blocks looking for previously elected leaders
- * to fence so that we're the only leader running.
+ * The calling server has been elected and has started running but can't
+ * yet assume that it has exclusive access to the metadata device.  We
+ * read all the quorum blocks looking for previously elected leaders to
+ * fence so that we're the only leader running.
+ *
+ * We're relying on the invariant that there can't be two mounts running
+ * with the same slot nr at the same time.  With this constraint there
+ * can be at most two previous leaders per slot that need to be fenced:
+ * a persistent record of an old mount on the slot, and an active mount.
+ *
+ * If we start fence requests then we only wait for them to complete
+ * before returning.  The server will reclaim their resources once it is
+ * up and running and will call us to update the fence event.  If we
+ * don't start fence requests then we update the fence event
+ * immediately, the server has nothing more to do.
+ *
+ * Quorum will be sending heartbeats while we wait for fencing.  That
+ * keeps us from being fenced while we allow userspace fencing to take a
+ * reasonably long time.  We still want to timeout eventually.
 */
-static int fence_leader_blocks(struct super_block *sb)
+int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term)
 {
+#define NR_OLD 2
+	struct scoutfs_quorum_block_event old[SCOUTFS_QUORUM_MAX_SLOTS][NR_OLD] = {{{0,}}};
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
-	struct mount_options *opts = &sbi->opts;
 	struct scoutfs_quorum_block blk;
 	struct sockaddr_in sin;
-	u64 blkno;
+	const u64 rid = sbi->rid;
+	bool fence_started = false;
+	u64 fenced = 0;
+	__le64 fence_rid;
 	int ret = 0;
+	int err;
 	int i;
+	int j;

 	BUILD_BUG_ON(SCOUTFS_QUORUM_BLOCKS < SCOUTFS_QUORUM_MAX_SLOTS);

 	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
-		if (i == opts->quorum_slot_nr)
+		if (!quorum_slot_present(super, i))
 			continue;

-		blkno = SCOUTFS_QUORUM_BLKNO + i;
-		ret = read_quorum_block(sb, blkno, &blk, NULL);
+		ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk, false);
 		if (ret < 0)
 			goto out;

-		if (!(le64_to_cpu(blk.flags) & SCOUTFS_QUORUM_BLOCK_LEADER))
-			continue;
+		/* elected leader still running */
+		if (le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_ELECT].term) >
+		    le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_STOP].term))
+			old[i][0] = blk.events[SCOUTFS_QUORUM_EVENT_ELECT];

-		scoutfs_inc_counter(sb, quorum_fence_leader);
-		scoutfs_quorum_slot_sin(super, i, &sin);
+		/* persistent record of previous server before elected */
+		if ((le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_FENCE].term) >
+		     le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_STOP].term)) &&
+		    (le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_FENCE].term) <
+		     le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_ELECT].term)))
+			old[i][1] = blk.events[SCOUTFS_QUORUM_EVENT_FENCE];

-		scoutfs_err(sb, "fencing "SCSBF" at "SIN_FMT,
-			    SCSB_LEFR_ARGS(super->hdr.fsid, blk.set_leader.rid),
-			    SIN_ARG(&sin));
+		/* find greatest term that has fenced everything before it */
+		fenced = max(fenced, le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_FENCE].term));
+	}

-		blk.flags &= ~cpu_to_le64(SCOUTFS_QUORUM_BLOCK_LEADER);
-		set_quorum_block_event(sb, &blk, &blk.fenced);
+	/* now actually fence any old leaders which haven't been fenced yet */
+	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
+		for (j = 0; j < NR_OLD; j++) {
+			if (le64_to_cpu(old[i][j].term) == 0 ||		/* uninitialized */
+			    le64_to_cpu(old[i][j].term) < fenced ||	/* already fenced */
+			    le64_to_cpu(old[i][j].term) > term ||	/* newer than us */
+			    le64_to_cpu(old[i][j].rid) == rid)		/* us */
+				continue;

-		ret = write_quorum_block(sb, blkno, &blk, NULL);
-		if (ret < 0)
-			goto out;
+			scoutfs_inc_counter(sb, quorum_fence_leader);
+			scoutfs_quorum_slot_sin(super, i, &sin);
+			fence_rid = old[i][j].rid;
+
+			scoutfs_info(sb, "fencing previous leader "SCSBF" at term %llu in slot %u with address "SIN_FMT,
+				     SCSB_LEFR_ARGS(super->hdr.fsid, fence_rid),
+				     le64_to_cpu(old[i][j].term), i, SIN_ARG(&sin));
+			ret = scoutfs_fence_start(sb, le64_to_cpu(fence_rid), sin.sin_addr.s_addr,
+						  SCOUTFS_FENCE_QUORUM_BLOCK_LEADER);
+			if (ret < 0)
+				goto out;
+			fence_started = true;
+		}
 	}

 out:
-	if (ret < 0) {
-		scoutfs_err(sb, "error %d fencing active", ret);
-		scoutfs_inc_counter(sb, quorum_fence_error);
+	if (fence_started) {
+		err = scoutfs_fence_wait_fenced(sb, msecs_to_jiffies(SCOUTFS_QUORUM_FENCE_TO_MS));
+		if (ret == 0)
+			ret = err;
+	} else if (ret == 0) {
+		/* don't record completion after an error, no fencing was started */
+		err = scoutfs_quorum_fence_complete(sb, term);
+		ret = err;
 	}

+	if (ret < 0)
+		scoutfs_inc_counter(sb, quorum_fence_error);
+
 	return ret;
 }

@@ -533,23 +580,22 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 	struct sockaddr_in unused;
 	struct quorum_host_msg msg;
 	struct quorum_status qst;
-	__le64 mark;
 	u64 blkno;
 	int ret;
+	int err;

 	/* recording votes from slots as native single word bitmap */
 	BUILD_BUG_ON(SCOUTFS_QUORUM_MAX_SLOTS > BITS_PER_LONG);

 	/* get our starting term from our persistent block */
-	mark = 0;
 	blkno = SCOUTFS_QUORUM_BLKNO + opts->quorum_slot_nr;
-	ret = read_quorum_block(sb, blkno, &blk, &mark);
+	ret = read_quorum_block(sb, blkno, &blk, false);
 	if (ret < 0)
 		goto out;

 	/* start out as a follower */
 	qst.role = FOLLOWER;
-	qst.term = le64_to_cpu(blk.term);
+	qst.term = le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_TERM].term);
 	qst.vote_for = -1;
 	qst.vote_bits = 0;

@@ -559,7 +605,12 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 	else
 		qst.timeout = election_timeout();

-	while (!qinf->shutdown) {
+	/* record that we're up and running, readers check that it isn't updated */
+	ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_BEGIN, qst.term, false);
+	if (ret < 0)
+		goto out;
+
+	while (!(qinf->shutdown || scoutfs_forcing_unmount(sb))) {

 		ret = recv_msg(sb, &msg, qst.timeout);
 		if (ret < 0) {
@@ -589,11 +640,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 			send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
 					qst.term);
 			scoutfs_inc_counter(sb, quorum_send_resignation);
-
-			ret = update_quorum_block(sb, blkno, &mark,
-						  qst.role, qst.term);
-			if (ret < 0)
-				goto out;
 		}

 		spin_lock(&qinf->show_lock);
@@ -624,8 +670,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 				qst.timeout = election_timeout();

 			/* store our increased term */
-			ret = update_quorum_block(sb, blkno, &mark,
-						  qst.role, qst.term);
+			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_TERM, qst.term, true);
 			if (ret < 0)
 				goto out;
 		}
@@ -642,6 +687,11 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 					qst.term);
 			qst.timeout = election_timeout();
 			scoutfs_inc_counter(sb, quorum_send_request);
+
+			/* store our increased term */
+			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_TERM, qst.term, true);
+			if (ret < 0)
+				goto out;
 		}

 		/* candidates count votes in their term */
@@ -670,10 +720,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 					qst.term);
 			qst.timeout = heartbeat_interval();

-			/* set our leader flag and fence */
-			ret = update_quorum_block(sb, blkno, &mark,
-						  qst.role, qst.term) ?:
-			      fence_leader_blocks(sb);
+			/* record that we've been elected before starting up server */
+			ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_ELECT, qst.term, true);
 			if (ret < 0)
 				goto out;

@@ -684,9 +732,16 @@ static void scoutfs_quorum_worker(struct work_struct *work)

 			ret = scoutfs_server_start(sb, qst.term);
 			if (ret < 0) {
-				scoutfs_err(sb, "server startup failed with %d",
-					    ret);
-				goto out;
+				clear_bit(QINF_FLAG_SERVER, &qinf->flags);
+				/* record that our server stopped so that we aren't fenced */
+				err = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, qst.term,
+							  true);
+				if (err < 0) {
+					ret = err;
+					goto out;
+				}
+				ret = 0;
+				continue;
 			}
 		}

@@ -727,77 +782,75 @@ static void scoutfs_quorum_worker(struct work_struct *work)
 	/* always try to stop a running server as we stop */
 	if (test_bit(QINF_FLAG_SERVER, &qinf->flags)) {
 		scoutfs_server_stop(sb);
+		scoutfs_fence_stop(sb);
 		send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
 				qst.term);
 	}

-	/* always try to clear leader block as we stop to avoid fencing */
-	if (qst.role == LEADER) {
-		ret = update_quorum_block(sb, blkno, &mark,
-					  FOLLOWER, qst.term);
-		if (ret < 0)
-			goto out;
-	}
+	/* informational event that we're shutting down, nothing relies on it */
+	update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_END, qst.term, true);
 out:
 	if (ret < 0) {
-		scoutfs_err(sb, "quorum service saw error %d, shutting down.  Cluster will be degraded until this slot is remounted to restart the quorum service",
+		scoutfs_err(sb, "quorum service saw error %d, shutting down.  This mount is no longer participating in quorum.  It should be remounted to restore service.",
 			    ret);
 	}
 }

 /*
- * Set a flag for the quorum work's next iteration to indicate that the
- * server has shutdown and that it should step down as leader, update
- * quorum blocks, and stop sending heartbeats.
+ * The calling server has shut down and is no longer using shared
+ * resources.  Clear the bit so that we stop sending heartbeats and
+ * allow the next server to be elected.  Update the stop event so that
+ * this server won't be considered available by clients or fenced by
+ * the next leader.
 */
-void scoutfs_quorum_server_shutdown(struct super_block *sb)
+void scoutfs_quorum_server_shutdown(struct super_block *sb, u64 term)
 {
 	DECLARE_QUORUM_INFO(sb, qinf);

-	set_bit(QINF_FLAG_SERVER, &qinf->flags);
+	clear_bit(QINF_FLAG_SERVER, &qinf->flags);
+	update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP, term, true);
 }

 /*
 * Clients read quorum blocks looking for the leader with a server whose
 * address it can try and connect to.
 *
- * There can be multiple running servers if a client checks before a
- * server has had a chance to fence any old servers.  We try to use the
- * block with the most recent timestamp.  If we get it wrong the
- * connection will timeout and the client will try again, presumably
- * finding a single server block.
+ * There can be records of multiple previous elected leaders if the
+ * current server hasn't yet fenced any old servers.  We use the elected
+ * leader with the greatest elected term.  If we get it wrong the
+ * connection will timeout and the client will try again.
 */
 int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
 	struct scoutfs_quorum_block blk;
-	struct timespec64 recent = {0,};
-	struct timespec64 ts;
-	int ret;
+	u64 elect_term;
+	u64 term = 0;
+	int ret = 0;
 	int i;

 	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
-		ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk,
-					NULL);
+		if (!quorum_slot_present(super, i))
+			continue;
+
+		ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk, false);
 		if (ret < 0) {
 			scoutfs_err(sb, "error reading quorum block nr %u: %d",
 				    i, ret);
 			goto out;
 		}

-		ts.tv_sec = le64_to_cpu(blk.set_leader.ts.sec);
-		ts.tv_nsec = le32_to_cpu(blk.set_leader.ts.nsec);
-
-		if ((le64_to_cpu(blk.flags) & SCOUTFS_QUORUM_BLOCK_LEADER) &&
-		    (timespec64_to_ns(&ts) > timespec64_to_ns(&recent))) {
-			recent = ts;
+		elect_term = le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_ELECT].term);
+		if (elect_term > term &&
+		    elect_term > le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_STOP].term)) {
+			term = elect_term;
 			scoutfs_quorum_slot_sin(super, i, sin);
 			continue;
 		}
 	}

-	if (timespec64_to_ns(&recent) == 0)
+	if (term == 0)
 		ret = -ENOENT;

 out:
@@ -864,6 +917,7 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
 	struct quorum_status qst;
 	struct last_msg last;
 	struct timespec64 ts;
+	const ktime_t now = ktime_get();
 	size_t size;
 	int ret;
 	int i;
@@ -885,9 +939,9 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
 		     qst.vote_for);
 	snprintf_ret(buf, size, &ret, "vote_bits 0x%lx (count %lu)\n",
 		     qst.vote_bits, hweight_long(qst.vote_bits));
-	ts = ktime_to_timespec64(qst.timeout);
-	snprintf_ret(buf, size, &ret, "timeout %llu.%u\n",
-		     (u64)ts.tv_sec, (int)ts.tv_nsec);
+	ts = ktime_to_timespec64(ktime_sub(qst.timeout, now));
+	snprintf_ret(buf, size, &ret, "timeout_in_secs %lld.%09u\n",
+		     (s64)ts.tv_sec, (int)ts.tv_nsec);

 	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
 		spin_lock(&qinf->show_lock);
@@ -897,10 +951,11 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
 		if (last.msg.term == 0)
 			continue;

+		ts = ktime_to_timespec64(ktime_sub(now, last.ts));
 		snprintf_ret(buf, size, &ret,
-			     "last_send to %u term %llu type %u ts %llu.%u\n",
+			     "last_send to %u term %llu type %u secs_since %lld.%09u\n",
 			     i, last.msg.term, last.msg.type,
-			     (u64)last.ts.tv_sec, (int)last.ts.tv_nsec);
+			     (s64)ts.tv_sec, (int)ts.tv_nsec);
 	}

 	for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
@@ -910,10 +965,12 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,

 		if (last.msg.term == 0)
 			continue;
+
+		ts = ktime_to_timespec64(ktime_sub(now, last.ts));
 		snprintf_ret(buf, size, &ret,
-			     "last_recv from %u term %llu type %u ts %llu.%u\n",
+			     "last_recv from %u term %llu type %u secs_since %lld.%09u\n",
 			     i, last.msg.term, last.msg.type,
-			     (u64)last.ts.tv_sec, (int)last.ts.tv_nsec);
+			     (s64)ts.tv_sec, (int)ts.tv_nsec);
 	}

 	return ret;
--- a/kmod/src/quorum.h
+++ b/kmod/src/quorum.h
@@ -2,12 +2,15 @@
 #define _SCOUTFS_QUORUM_H_

 int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin);
-void scoutfs_quorum_server_shutdown(struct super_block *sb);
+void scoutfs_quorum_server_shutdown(struct super_block *sb, u64 term);

 u8 scoutfs_quorum_votes_needed(struct super_block *sb);
 void scoutfs_quorum_slot_sin(struct scoutfs_super_block *super, int i,
 			     struct sockaddr_in *sin);

+int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term);
+int scoutfs_quorum_fence_complete(struct super_block *sb, u64 term);
+
 int scoutfs_quorum_setup(struct super_block *sb);
 void scoutfs_quorum_shutdown(struct super_block *sb);
 void scoutfs_quorum_destroy(struct super_block *sb);
--- a/kmod/src/recov.c
+++ b/kmod/src/recov.c
@@ -16,9 +16,11 @@
 #include <linux/sched.h>
 #include <linux/rhashtable.h>
 #include <linux/rcupdate.h>
+#include <linux/list_sort.h>

 #include "super.h"
 #include "recov.h"
+#include "cmp.h"

 /*
 * There are a few server messages which can't be processed until they
@@ -47,18 +49,41 @@ struct recov_pending {
 	int which;
 };

-static struct recov_pending *find_pending(struct recov_info *recinf, u64 rid, int which)
+static struct recov_pending *next_pending(struct recov_info *recinf, u64 rid, int which)
 {
 	struct recov_pending *pend;

 	list_for_each_entry(pend, &recinf->pending, head) {
-		if ((rid == 0 || pend->rid == rid) && (pend->which & which))
+		if (pend->rid > rid && pend->which & which)
 			return pend;
 	}

 	return NULL;
 }

+static struct recov_pending *lookup_pending(struct recov_info *recinf, u64 rid, int which)
+{
+	struct recov_pending *pend;
+
+	pend = next_pending(recinf, rid - 1, which);
+	if (pend && pend->rid == rid)
+		return pend;
+
+	return NULL;
+}
+
+/*
+ * We keep the pending list sorted by rid so that it can be iterated in
+ * rid order.  The list should be small and shouldn't be used often.
+ */
+static int cmp_pending_rid(void *priv, struct list_head *A, struct list_head *B)
+{
+	struct recov_pending *a = list_entry(A, struct recov_pending, head);
+	struct recov_pending *b = list_entry(B, struct recov_pending, head);
+
+	return scoutfs_cmp_u64s(a->rid, b->rid);
+}
+
 /*
 * Record that we'll be waiting for a client to recover something.
 * _finished will eventually be called for every _prepare, either
@@ -80,14 +105,15 @@ int scoutfs_recov_prepare(struct super_block *sb, u64 rid, int which)

 	spin_lock(&recinf->lock);

-	pend = find_pending(recinf, rid, SCOUTFS_RECOV_ALL);
+	pend = lookup_pending(recinf, rid, SCOUTFS_RECOV_ALL);
 	if (pend) {
 		pend->which |= which;
 	} else {
 		swap(pend, alloc);
 		pend->rid = rid;
 		pend->which = which;
-		list_add(&pend->head, &recinf->pending);
+		list_add_tail(&pend->head, &recinf->pending);
+		list_sort(NULL, &recinf->pending, cmp_pending_rid);
 	}

 	spin_unlock(&recinf->lock);
@@ -159,7 +185,7 @@ int scoutfs_recov_finish(struct super_block *sb, u64 rid, int which)

 	spin_lock(&recinf->lock);

-	pend = find_pending(recinf, rid, which);
+	pend = lookup_pending(recinf, rid, which);
 	if (pend) {
 		pend->which &= ~which;
 		if (pend->which) {
@@ -190,29 +216,28 @@ bool scoutfs_recov_is_pending(struct super_block *sb, u64 rid, int which)
 	bool is_pending;

 	spin_lock(&recinf->lock);
-	is_pending = find_pending(recinf, rid, which) != NULL;
+	is_pending = lookup_pending(recinf, rid, which) != NULL;
 	spin_unlock(&recinf->lock);

 	return is_pending;
 }

 /*
- * Returns 0 if there are no rids waiting for the given state to be
- * recovered.  Returns the rid of a client still waiting if there are
- * any, in no specified order.
+ * Return the smallest rid greater than the given rid of a client waiting
+ * for the given state to be recovered.  Callers start iterating with rid
+ * 0; 0 is returned when there are no more clients waiting for recovery.
 *
 * This is inherently racey.  Callers are responsible for resolving any
 * actions taken based on pending with the recovery finishing, perhaps
 * before we return.
 */
-u64 scoutfs_recov_next_pending(struct super_block *sb, int which)
+u64 scoutfs_recov_next_pending(struct super_block *sb, u64 rid, int which)
 {
 	DECLARE_RECOV_INFO(sb, recinf);
 	struct recov_pending *pend;
-	u64 rid;

 	spin_lock(&recinf->lock);
-	pend = find_pending(recinf, 0, which);
+	pend = next_pending(recinf, rid, which);
 	rid = pend ? pend->rid : 0;
 	spin_unlock(&recinf->lock);

--- a/kmod/src/recov.h
+++ b/kmod/src/recov.h
@@ -14,7 +14,7 @@ int scoutfs_recov_begin(struct super_block *sb, void (*timeout_fn)(struct super_
 			unsigned int timeout_ms);
 int scoutfs_recov_finish(struct super_block *sb, u64 rid, int which);
 bool scoutfs_recov_is_pending(struct super_block *sb, u64 rid, int which);
-u64 scoutfs_recov_next_pending(struct super_block *sb, int which);
+u64 scoutfs_recov_next_pending(struct super_block *sb, u64 rid, int which);
 void scoutfs_recov_shutdown(struct super_block *sb);

 int scoutfs_recov_setup(struct super_block *sb);
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -424,14 +424,15 @@ TRACE_EVENT(scoutfs_trans_write_func,
 );

 DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
-	TP_PROTO(struct super_block *sb, void *journal_info, int holders),
+	TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),

-	TP_ARGS(sb, journal_info, holders),
+	TP_ARGS(sb, journal_info, holders, ret),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
 		__field(unsigned long, journal_info)
 		__field(int, holders)
+		__field(int, ret)
 	),

 	TP_fast_assign(
@@ -440,17 +441,18 @@ DECLARE_EVENT_CLASS(scoutfs_trans_hold_release_class,
 		__entry->holders = holders;
+		__entry->ret = ret;
 	),

-	TP_printk(SCSBF" journal_info 0x%0lx holders %d",
-		  SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders)
+	TP_printk(SCSBF" journal_info 0x%0lx holders %d ret %d",
+		  SCSB_TRACE_ARGS, __entry->journal_info, __entry->holders, __entry->ret)
 );

-DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_trans_acquired_hold,
-	TP_PROTO(struct super_block *sb, void *journal_info, int holders),
-	TP_ARGS(sb, journal_info, holders)
+DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_hold_trans,
+	TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
+	TP_ARGS(sb, journal_info, holders, ret)
 );
 DEFINE_EVENT(scoutfs_trans_hold_release_class, scoutfs_release_trans,
-	TP_PROTO(struct super_block *sb, void *journal_info, int holders),
-	TP_ARGS(sb, journal_info, holders)
+	TP_PROTO(struct super_block *sb, void *journal_info, int holders, int ret),
+	TP_ARGS(sb, journal_info, holders, ret)
 );

 TRACE_EVENT(scoutfs_ioc_release,
@@ -985,22 +986,6 @@ TRACE_EVENT(scoutfs_delete_inode,
 		  __entry->mode, __entry->size)
 );

-TRACE_EVENT(scoutfs_scan_orphans,
-	TP_PROTO(struct super_block *sb),
-
-	TP_ARGS(sb),
-
-	TP_STRUCT__entry(
-		__field(dev_t, dev)
-	),
-
-	TP_fast_assign(
-		__entry->dev = sb->s_dev;
-	),
-
-	TP_printk("dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
-);
-
 DECLARE_EVENT_CLASS(scoutfs_key_class,
        TP_PROTO(struct super_block *sb, struct scoutfs_key *key),
        TP_ARGS(sb, key),
@@ -1644,6 +1629,164 @@ TRACE_EVENT(scoutfs_btree_walk,
 		  __entry->level, __entry->ref_blkno, __entry->ref_seq)
 );

+TRACE_EVENT(scoutfs_btree_set_parent,
+	TP_PROTO(struct super_block *sb,
+		 struct scoutfs_btree_root *root, struct scoutfs_key *key,
+		 struct scoutfs_btree_root *par_root),
+
+	TP_ARGS(sb, root, key, par_root),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, root_blkno)
+		__field(__u64, root_seq)
+		__field(__u8, root_height)
+		sk_trace_define(key)
+		__field(__u64, par_root_blkno)
+		__field(__u64, par_root_seq)
+		__field(__u8, par_root_height)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->root_blkno = le64_to_cpu(root->ref.blkno);
+		__entry->root_seq = le64_to_cpu(root->ref.seq);
+		__entry->root_height = root->height;
+		sk_trace_assign(key, key);
+		__entry->par_root_blkno = le64_to_cpu(par_root->ref.blkno);
+		__entry->par_root_seq = le64_to_cpu(par_root->ref.seq);
+		__entry->par_root_height = par_root->height;
+	),
+
+	TP_printk(SCSBF" root blkno %llu seq %llu height %u, key "SK_FMT", par_root blkno %llu seq %llu height %u",
+		  SCSB_TRACE_ARGS, __entry->root_blkno, __entry->root_seq,
+		  __entry->root_height, sk_trace_args(key),
+		  __entry->par_root_blkno, __entry->par_root_seq,
+		  __entry->par_root_height)
+);
+
+TRACE_EVENT(scoutfs_btree_merge,
+	TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root,
+		 struct scoutfs_key *start, struct scoutfs_key *end),
+
+	TP_ARGS(sb, root, start, end),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, root_blkno)
+		__field(__u64, root_seq)
+		__field(__u8, root_height)
+		sk_trace_define(start)
+		sk_trace_define(end)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->root_blkno = le64_to_cpu(root->ref.blkno);
+		__entry->root_seq = le64_to_cpu(root->ref.seq);
+		__entry->root_height = root->height;
+		sk_trace_assign(start, start);
+		sk_trace_assign(end, end);
+	),
+
+	TP_printk(SCSBF" root blkno %llu seq %llu height %u start "SK_FMT" end "SK_FMT,
+		  SCSB_TRACE_ARGS, __entry->root_blkno, __entry->root_seq,
+		  __entry->root_height, sk_trace_args(start),
+		  sk_trace_args(end))
+);
+
+TRACE_EVENT(scoutfs_btree_merge_items,
+	TP_PROTO(struct super_block *sb,
+		 struct scoutfs_btree_root *m_root,
+		 struct scoutfs_key *m_key, int m_val_len,
+		 struct scoutfs_btree_root *f_root,
+		 struct scoutfs_key *f_key, int f_val_len,
+		 int is_del),
+
+	TP_ARGS(sb, m_root, m_key, m_val_len, f_root, f_key, f_val_len, is_del),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, m_root_blkno)
+		__field(__u64, m_root_seq)
+		__field(__u8, m_root_height)
+		sk_trace_define(m_key)
+		__field(int, m_val_len)
+		__field(__u64, f_root_blkno)
+		__field(__u64, f_root_seq)
+		__field(__u8, f_root_height)
+		sk_trace_define(f_key)
+		__field(int, f_val_len)
+		__field(int, is_del)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->m_root_blkno = m_root ?
+					le64_to_cpu(m_root->ref.blkno) : 0;
+		__entry->m_root_seq = m_root ? le64_to_cpu(m_root->ref.seq) : 0;
+		__entry->m_root_height = m_root ? m_root->height : 0;
+		sk_trace_assign(m_key, m_key);
+		__entry->m_val_len = m_val_len;
+		__entry->f_root_blkno = f_root ?
+					le64_to_cpu(f_root->ref.blkno) : 0;
+		__entry->f_root_seq = f_root ? le64_to_cpu(f_root->ref.seq) : 0;
+		__entry->f_root_height = f_root ? f_root->height : 0;
+		sk_trace_assign(f_key, f_key);
+		__entry->f_val_len = f_val_len;
+		__entry->is_del = !!is_del;
+	),
+
+	TP_printk(SCSBF" merge item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, fs item root blkno %llu seq %llu height %u key "SK_FMT" val_len %d, is_del %d",
+		  SCSB_TRACE_ARGS, __entry->m_root_blkno, __entry->m_root_seq,
+		  __entry->m_root_height, sk_trace_args(m_key),
+		  __entry->m_val_len, __entry->f_root_blkno,
+		  __entry->f_root_seq, __entry->f_root_height,
+		  sk_trace_args(f_key), __entry->f_val_len, __entry->is_del)
+);
+
+DECLARE_EVENT_CLASS(scoutfs_btree_free_blocks,
+	TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root,
+		 u64 blkno),
+
+	TP_ARGS(sb, root, blkno),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, root_blkno)
+		__field(__u64, root_seq)
+		__field(__u8, root_height)
+		__field(__u64, blkno)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->root_blkno = le64_to_cpu(root->ref.blkno);
+		__entry->root_seq = le64_to_cpu(root->ref.seq);
+		__entry->root_height = root->height;
+		__entry->blkno = blkno;
+	),
+
+	TP_printk(SCSBF" root blkno %llu seq %llu height %u, free blkno %llu",
+		  SCSB_TRACE_ARGS, __entry->root_blkno, __entry->root_seq,
+		  __entry->root_height, __entry->blkno)
+);
+DEFINE_EVENT(scoutfs_btree_free_blocks, scoutfs_btree_free_blocks_single,
+	TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root,
+		 u64 blkno),
+	TP_ARGS(sb, root, blkno)
+);
+DEFINE_EVENT(scoutfs_btree_free_blocks, scoutfs_btree_free_blocks_leaf,
+	TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root,
+		 u64 blkno),
+	TP_ARGS(sb, root, blkno)
+);
+DEFINE_EVENT(scoutfs_btree_free_blocks, scoutfs_btree_free_blocks_parent,
+	TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root,
+		 u64 blkno),
+	TP_ARGS(sb, root, blkno)
+);
+
 TRACE_EVENT(scoutfs_online_offline_blocks,
 	TP_PROTO(struct inode *inode, s64 on_delta, s64 off_delta,
 		 u64 on_now, u64 off_now),
@@ -1900,6 +2043,116 @@ TRACE_EVENT(scoutfs_trans_seq_last,
 		  SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
 );

+TRACE_EVENT(scoutfs_get_log_merge_status,
+	TP_PROTO(struct super_block *sb, u64 rid, struct scoutfs_key *next_range_key,
+		 u64 nr_requests, u64 nr_complete, u64 last_seq, u64 seq),
+
+	TP_ARGS(sb, rid, next_range_key, nr_requests, nr_complete, last_seq, seq),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, s_rid)
+		sk_trace_define(next_range_key)
+		__field(__u64, nr_requests)
+		__field(__u64, nr_complete)
+		__field(__u64, last_seq)
+		__field(__u64, seq)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->s_rid = rid;
+		sk_trace_assign(next_range_key, next_range_key);
+		__entry->nr_requests = nr_requests;
+		__entry->nr_complete = nr_complete;
+		__entry->last_seq = last_seq;
+		__entry->seq = seq;
+	),
+
+	TP_printk(SCSBF" rid %016llx next_range_key "SK_FMT" nr_requests %llu nr_complete %llu last_seq %llu seq %llu",
+		  SCSB_TRACE_ARGS, __entry->s_rid, sk_trace_args(next_range_key),
+		  __entry->nr_requests, __entry->nr_complete, __entry->last_seq, __entry->seq)
+);
+
+TRACE_EVENT(scoutfs_get_log_merge_request,
+	TP_PROTO(struct super_block *sb, u64 rid,
+		 struct scoutfs_btree_root *root, struct scoutfs_key *start,
+		 struct scoutfs_key *end, u64 last_seq, u64 seq),
+
+	TP_ARGS(sb, rid, root, start, end, last_seq, seq),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, s_rid)
+		__field(__u64, root_blkno)
+		__field(__u64, root_seq)
+		__field(__u8, root_height)
+		sk_trace_define(start)
+		sk_trace_define(end)
+		__field(__u64, last_seq)
+		__field(__u64, seq)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->s_rid = rid;
+		__entry->root_blkno = le64_to_cpu(root->ref.blkno);
+		__entry->root_seq = le64_to_cpu(root->ref.seq);
+		__entry->root_height = root->height;
+		sk_trace_assign(start, start);
+		sk_trace_assign(end, end);
+		__entry->last_seq = last_seq;
+		__entry->seq = seq;
+	),
+
+	TP_printk(SCSBF" rid %016llx root blkno %llu seq %llu height %u start "SK_FMT" end "SK_FMT" last_seq %llu seq %llu",
+		  SCSB_TRACE_ARGS, __entry->s_rid, __entry->root_blkno,
+		  __entry->root_seq, __entry->root_height,
+		  sk_trace_args(start), sk_trace_args(end), __entry->last_seq,
+		  __entry->seq)
+);
+
+TRACE_EVENT(scoutfs_get_log_merge_complete,
+	TP_PROTO(struct super_block *sb, u64 rid,
+		 struct scoutfs_btree_root *root, struct scoutfs_key *start,
+		 struct scoutfs_key *end, struct scoutfs_key *remain,
+		 u64 seq, u64 flags),
+
+	TP_ARGS(sb, rid, root, start, end, remain, seq, flags),
+
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, s_rid)
+		__field(__u64, root_blkno)
+		__field(__u64, root_seq)
+		__field(__u8, root_height)
+		sk_trace_define(start)
+		sk_trace_define(end)
+		sk_trace_define(remain)
+		__field(__u64, seq)
+		__field(__u64, flags)
+	),
+
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->s_rid = rid;
+		__entry->root_blkno = le64_to_cpu(root->ref.blkno);
+		__entry->root_seq = le64_to_cpu(root->ref.seq);
+		__entry->root_height = root->height;
+		sk_trace_assign(start, start);
+		sk_trace_assign(end, end);
+		sk_trace_assign(remain, remain);
+		__entry->seq = seq;
+		__entry->flags = flags;
+	),
+
+	TP_printk(SCSBF" rid %016llx root blkno %llu seq %llu height %u start "SK_FMT" end "SK_FMT" remain "SK_FMT" seq %llu flags 0x%llx",
+		  SCSB_TRACE_ARGS, __entry->s_rid, __entry->root_blkno,
+		  __entry->root_seq, __entry->root_height,
+		  sk_trace_args(start), sk_trace_args(end),
+		  sk_trace_args(remain), __entry->seq, __entry->flags)
+);
+
 DECLARE_EVENT_CLASS(scoutfs_forest_bloom_class,
 	TP_PROTO(struct super_block *sb, struct scoutfs_key *key,
 		 u64 rid, u64 nr, u64 blkno, u64 seq, unsigned int count),
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
--- a/kmod/src/server.h
+++ b/kmod/src/server.h
@@ -56,13 +56,15 @@ do {								\
 	__entry->name##_data_len, __entry->name##_cmd, __entry->name##_flags, \
 	__entry->name##_error

+u64 scoutfs_server_reserved_meta_blocks(struct super_block *sb);
+
 int scoutfs_server_lock_request(struct super_block *sb, u64 rid,
 				struct scoutfs_net_lock *nl);
 int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
 				 struct scoutfs_net_lock *nl);
 int scoutfs_server_lock_recover_request(struct super_block *sb, u64 rid,
 					struct scoutfs_key *key);
-int scoutfs_server_hold_commit(struct super_block *sb);
+void scoutfs_server_hold_commit(struct super_block *sb);
 int scoutfs_server_apply_commit(struct super_block *sb, int err);
 void scoutfs_server_recov_finish(struct super_block *sb, u64 rid, int which);

@@ -71,8 +73,10 @@ int scoutfs_server_send_omap_request(struct super_block *sb, u64 rid,
 int scoutfs_server_send_omap_response(struct super_block *sb, u64 rid, u64 id,
 				      struct scoutfs_open_ino_map *map, int err);

-struct sockaddr_in;
-struct scoutfs_quorum_elected_info;
+u64 scoutfs_server_seq(struct super_block *sb);
+u64 scoutfs_server_next_seq(struct super_block *sb);
+void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq);
+
 int scoutfs_server_start(struct super_block *sb, u64 term);
 void scoutfs_server_abort(struct super_block *sb);
 void scoutfs_server_stop(struct super_block *sb);
--- a/kmod/src/srch.c
+++ b/kmod/src/srch.c
@@ -989,12 +989,13 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
 			    struct scoutfs_alloc *alloc,
 			    struct scoutfs_block_writer *wri,
 			    struct scoutfs_btree_root *root,
-			    struct scoutfs_srch_file *sfl)
+			    struct scoutfs_srch_file *sfl, bool force)
 {
 	struct scoutfs_key key;
 	int ret;

-	if (le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT)
+	if (sfl->ref.blkno == 0 ||
+	    (!force && le64_to_cpu(sfl->blocks) < SCOUTFS_SRCH_LOG_BLOCK_LIMIT))
 		return 0;

 	init_srch_key(&key, SCOUTFS_SRCH_LOG_TYPE,
--- a/kmod/src/srch.h
+++ b/kmod/src/srch.h
@@ -37,7 +37,7 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
 			    struct scoutfs_alloc *alloc,
 			    struct scoutfs_block_writer *wri,
 			    struct scoutfs_btree_root *root,
-			    struct scoutfs_srch_file *sfl);
+			    struct scoutfs_srch_file *sfl, bool force);
 int scoutfs_srch_get_compact(struct super_block *sb,
 			     struct scoutfs_alloc *alloc,
 			     struct scoutfs_block_writer *wri,
--- a/kmod/src/super.c
+++ b/kmod/src/super.c
@@ -46,6 +46,8 @@
 #include "alloc.h"
 #include "recov.h"
 #include "omap.h"
+#include "volopt.h"
+#include "fence.h"
 #include "scoutfs_trace.h"

 static struct dentry *scoutfs_debugfs_root;
@@ -228,7 +230,15 @@ static void scoutfs_metadev_close(struct super_block *sb)
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);

 	if (sbi->meta_bdev) {
+		/*
+		 * Some kernels have blkdev_reread_part which calls
+		 * fsync_bdev while holding the bd_mutex which inverts
+		 * the s_umount hold in deactivate_super and blkdev_put
+		 * from kill_sb->put_super.
+		 */
+		lockdep_off();
 		blkdev_put(sbi->meta_bdev, SCOUTFS_META_BDEV_MODE);
+		lockdep_on();
 		sbi->meta_bdev = NULL;
 	}
 }
@@ -245,14 +255,14 @@ static void scoutfs_put_super(struct super_block *sb)

 	trace_scoutfs_put_super(sb);

+	scoutfs_inode_stop(sb);
+	scoutfs_forest_stop(sb);
 	scoutfs_srch_destroy(sb);

-	scoutfs_unlock(sb, sbi->rid_lock, SCOUTFS_LOCK_WRITE);
-	sbi->rid_lock = NULL;
-
 	scoutfs_lock_shutdown(sb);

 	scoutfs_shutdown_trans(sb);
+	scoutfs_volopt_destroy(sb);
 	scoutfs_client_destroy(sb);
 	scoutfs_inode_destroy(sb);
 	scoutfs_item_destroy(sb);
@@ -268,6 +278,7 @@ static void scoutfs_put_super(struct super_block *sb)

 	scoutfs_block_destroy(sb);
 	scoutfs_destroy_triggers(sb);
+	scoutfs_fence_destroy(sb);
 	scoutfs_options_destroy(sb);
 	scoutfs_sysfs_destroy_attrs(sb, &sbi->mopts_ssa);
 	debugfs_remove(sbi->debug_root);
@@ -281,6 +292,21 @@ static void scoutfs_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 }

+/*
+ * Record that we're performing a forced unmount.  As put_super drives
+ * destruction of the filesystem we won't issue more network or storage
+ * operations because we assume that they'll hang.  Pending operations
+ * can return errors when it's possible to do so.  We may be racing with
+ * pending operations which can't be canceled.
+ */
+static void scoutfs_umount_begin(struct super_block *sb)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+
+	scoutfs_warn(sb, "forcing unmount, can return errors and lose unsynced data");
+	sbi->forced_unmount = true;
+}
+
 static const struct super_operations scoutfs_super_ops = {
 	.alloc_inode = scoutfs_alloc_inode,
 	.drop_inode = scoutfs_drop_inode,
@@ -290,6 +316,7 @@ static const struct super_operations scoutfs_super_ops = {
 	.statfs = scoutfs_statfs,
 	.show_options = scoutfs_show_options,
 	.put_super = scoutfs_put_super,
+	.umount_begin = scoutfs_umount_begin,
 };

 /*
@@ -309,28 +336,16 @@ int scoutfs_write_super(struct super_block *sb,
 				      sizeof(struct scoutfs_super_block));
 }

-static bool invalid_blkno_limits(struct super_block *sb, char *which,
-				 u64 start, __le64 first, __le64 last,
-				 struct block_device *bdev, int shift)
+static bool small_bdev(struct super_block *sb, char *which, u64 blocks,
+		       struct block_device *bdev, int shift)
 {
-	u64 blkno;
+	u64 size = (u64)i_size_read(bdev->bd_inode);
+	u64 count = size >> shift;

-	if (le64_to_cpu(first) < start) {
-		scoutfs_err(sb, "super block first %s blkno %llu is within first valid blkno %llu",
-			which, le64_to_cpu(first), start);
-		return true;
-	}
+	if (blocks > count) {
+		scoutfs_err(sb, "super block records %llu %s blocks, but device %u:%u size %llu only allows %llu blocks",
+			blocks, which, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev), size, count);

-	if (le64_to_cpu(first) > le64_to_cpu(last)) {
-		scoutfs_err(sb, "super block first %s blkno %llu is greater than last %s blkno %llu",
-			which, le64_to_cpu(first), which, le64_to_cpu(last));
-		return true;
-	}
-
-	blkno = (i_size_read(bdev->bd_inode) >> shift) - 1;
-	if (le64_to_cpu(last) > blkno) {
-		scoutfs_err(sb, "super block last %s blkno %llu is beyond device size last blkno %llu",
-			which, le64_to_cpu(last), blkno);
 		return true;
 	}

@@ -390,16 +405,10 @@ static int scoutfs_read_super_from_bdev(struct super_block *sb,

 	/* XXX do we want more rigorous invalid super checking? */

-	if (invalid_blkno_limits(sb, "meta",
-			         SCOUTFS_META_DEV_START_BLKNO,
-				 super->first_meta_blkno,
-				 super->last_meta_blkno, sbi->meta_bdev,
-				 SCOUTFS_BLOCK_LG_SHIFT) ||
-	    invalid_blkno_limits(sb, "data",
-			         SCOUTFS_DATA_DEV_START_BLKNO,
-				 super->first_data_blkno,
-				 super->last_data_blkno, sb->s_bdev,
-				 SCOUTFS_BLOCK_SM_SHIFT)) {
+	if (small_bdev(sb, "metadata", le64_to_cpu(super->total_meta_blocks), sbi->meta_bdev,
+		       SCOUTFS_BLOCK_LG_SHIFT) ||
+	    small_bdev(sb, "data", le64_to_cpu(super->total_data_blocks), sb->s_bdev,
+		       SCOUTFS_BLOCK_SM_SHIFT)) {
 		ret = -EINVAL;
 	}

@@ -588,6 +597,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 	      scoutfs_sysfs_create_attrs(sb, &sbi->mopts_ssa,
 				mount_options_attrs, "mount_options") ?:
 	      scoutfs_setup_triggers(sb) ?:
+	      scoutfs_fence_setup(sb) ?:
 	      scoutfs_block_setup(sb) ?:
 	      scoutfs_forest_setup(sb) ?:
 	      scoutfs_item_setup(sb) ?:
@@ -601,16 +611,17 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 	      scoutfs_server_setup(sb) ?:
 	      scoutfs_quorum_setup(sb) ?:
 	      scoutfs_client_setup(sb) ?:
-	      scoutfs_lock_rid(sb, SCOUTFS_LOCK_WRITE, 0, sbi->rid,
-				   &sbi->rid_lock) ?:
-	      scoutfs_trans_get_log_trees(sb) ?:
+	      scoutfs_volopt_setup(sb) ?:
 	      scoutfs_srch_setup(sb);
 	if (ret)
 		goto out;

-	inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO);
+	/* this interruptible iget lets hung mount be aborted with ctl-c */
+	inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO, SCOUTFS_LKF_INTERRUPTIBLE);
 	if (IS_ERR(inode)) {
 		ret = PTR_ERR(inode);
+		if (ret == -ERESTARTSYS)
+			ret = -EINTR;
 		goto out;
 	}

@@ -620,12 +631,16 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto out;
 	}

-	ret = scoutfs_client_advance_seq(sb, &sbi->trans_seq);
+	/* send requests once iget progress shows we had a server */
+	ret = scoutfs_trans_get_log_trees(sb) ?:
+	      scoutfs_client_advance_seq(sb, &sbi->trans_seq);
 	if (ret)
 		goto out;

+	/* start up background services that use everything else */
+	scoutfs_inode_start(sb);
+	scoutfs_forest_start(sb);
 	scoutfs_trans_restart_sync_deadline(sb);
-//	scoutfs_scan_orphans(sb);
 	ret = 0;
 out:
 	/* on error, generic_shutdown_super calls put_super if s_root */
--- a/kmod/src/super.h
+++ b/kmod/src/super.h
@@ -28,13 +28,14 @@ struct forest_info;
 struct srch_info;
 struct recov_info;
 struct omap_info;
+struct volopt_info;
+struct fence_info;

 struct scoutfs_sb_info {
 	struct super_block *sb;

 	/* assigned once at the start of each mount, read-only */
 	u64 rid;
-	struct scoutfs_lock *rid_lock;

 	struct scoutfs_super_block super;

@@ -51,7 +52,9 @@ struct scoutfs_sb_info {
 	struct forest_info *forest_info;
 	struct srch_info *srch_info;
 	struct omap_info *omap_info;
+	struct volopt_info *volopt_info;
 	struct item_cache_info *item_cache_info;
+	struct fence_info *fence_info;

 	wait_queue_head_t trans_hold_wq;
 	struct task_struct *trans_task;
@@ -85,6 +88,8 @@ struct scoutfs_sb_info {

 	struct dentry *debug_root;

+	bool forced_unmount;
+
 	unsigned long corruption_messages_once[SC_NR_LONGS];
 };

@@ -105,6 +110,13 @@ static inline bool SCOUTFS_IS_META_BDEV(struct scoutfs_super_block *super_block)

 #define SCOUTFS_META_BDEV_MODE (FMODE_READ | FMODE_WRITE | FMODE_EXCL)

+static inline bool scoutfs_forcing_unmount(struct super_block *sb)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+
+	return sbi->forced_unmount;
+}
+
 /*
 * A small string embedded in messages that's used to identify a
 * specific mount.  It's the three most significant bytes of the fsid
--- a/kmod/src/sysfs.c
+++ b/kmod/src/sysfs.c
@@ -131,9 +131,10 @@ void scoutfs_sysfs_init_attrs(struct super_block *sb,
 *  If this returns success then the file will be visible and show can
 *  be called until unmount.
 */
-int scoutfs_sysfs_create_attrs(struct super_block *sb,
-			       struct scoutfs_sysfs_attrs *ssa,
-			       struct attribute **attrs, char *fmt, ...)
+int scoutfs_sysfs_create_attrs_parent(struct super_block *sb,
+				      struct kobject *parent,
+				      struct scoutfs_sysfs_attrs *ssa,
+				      struct attribute **attrs, char *fmt, ...)
 {
 	va_list args;
 	size_t name_len;
@@ -174,8 +175,8 @@ int scoutfs_sysfs_create_attrs(struct super_block *sb,
 		goto out;
 	}

-	ret = kobject_init_and_add(&ssa->kobj, &ssa->ktype,
-				   scoutfs_sysfs_sb_dir(sb), "%s", ssa->name);
+	ret = kobject_init_and_add(&ssa->kobj, &ssa->ktype, parent,
+				   "%s", ssa->name);
 out:
 	if (ret) {
 		kfree(ssa->name);
--- a/kmod/src/sysfs.h
+++ b/kmod/src/sysfs.h
@@ -10,6 +10,8 @@

 #define SCOUTFS_ATTR_RO(_name)						\
        static struct kobj_attribute scoutfs_attr_##_name = __ATTR_RO(_name)
+#define SCOUTFS_ATTR_RW(_name)						\
+        static struct kobj_attribute scoutfs_attr_##_name = __ATTR_RW(_name)

 #define SCOUTFS_ATTR_PTR(_name)						\
        &scoutfs_attr_##_name.attr
@@ -34,9 +36,14 @@ struct scoutfs_sysfs_attrs {

 void scoutfs_sysfs_init_attrs(struct super_block *sb,
 			      struct scoutfs_sysfs_attrs *ssa);
-int scoutfs_sysfs_create_attrs(struct super_block *sb,
-			       struct scoutfs_sysfs_attrs *ssa,
-			       struct attribute **attrs, char *fmt, ...);
+int scoutfs_sysfs_create_attrs_parent(struct super_block *sb,
+				      struct kobject *parent,
+				      struct scoutfs_sysfs_attrs *ssa,
+				      struct attribute **attrs, char *fmt, ...);
+#define scoutfs_sysfs_create_attrs(sb, ssa, attrs, fmt, args...)	\
+	scoutfs_sysfs_create_attrs_parent(sb, scoutfs_sysfs_sb_dir(sb),	\
+					  ssa, attrs, fmt, ##args)
+
 void scoutfs_sysfs_destroy_attrs(struct super_block *sb,
 				 struct scoutfs_sysfs_attrs *ssa);

--- a/kmod/src/trans.c
+++ b/kmod/src/trans.c
@@ -185,6 +185,11 @@ void scoutfs_trans_write_func(struct work_struct *work)

 	wait_event(sbi->trans_hold_wq, drained_holders(tri));

+	if (scoutfs_forcing_unmount(sb)) {
+		ret = -EIO;
+		goto out;
+	}
+
 	trace_scoutfs_trans_write_func(sb,
 			scoutfs_block_writer_dirty_bytes(sb, &tri->wri));

@@ -202,7 +207,7 @@ void scoutfs_trans_write_func(struct work_struct *work)
 			if (ret < 0)
 			      s = "clean advance seq";
 		}
-		goto out;
+		goto err;
 	}

 	if (sbi->trans_deadline_expired)
@@ -222,11 +227,12 @@ void scoutfs_trans_write_func(struct work_struct *work)
 	      scoutfs_item_write_done(sb) ?:
 	      (s = "advance seq", scoutfs_client_advance_seq(sb, &trans_seq)) ?:
 	      (s = "get log trees", scoutfs_trans_get_log_trees(sb));
-out:
+err:
 	if (ret < 0)
 		scoutfs_err(sb, "critical transaction commit failure: %s, %d",
 			    s, ret);

+out:
 	spin_lock(&sbi->trans_write_lock);
 	sbi->trans_write_count++;
 	sbi->trans_write_ret = ret;
@@ -285,7 +291,7 @@ static void queue_trans_work(struct scoutfs_sb_info *sbi)
 int scoutfs_trans_sync(struct super_block *sb, int wait)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct write_attempt attempt;
+	struct write_attempt attempt = { .ret = 0 };
 	int ret;


@@ -300,10 +306,8 @@ int scoutfs_trans_sync(struct super_block *sb, int wait)

 	queue_trans_work(sbi);

-	ret = wait_event_interruptible(sbi->trans_write_wq,
-				       write_attempted(sbi, &attempt));
-	if (ret == 0)
-		ret = attempt.ret;
+	wait_event(sbi->trans_write_wq, write_attempted(sbi, &attempt));
+	ret = attempt.ret;

 	return ret;
 }
@@ -430,8 +434,8 @@ static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
 		return true;
 	}

-	/* Try to refill data allocator before premature enospc */
-	if (scoutfs_data_alloc_free_bytes(sb) <= SCOUTFS_TRANS_DATA_ALLOC_LWM) {
+	/* if we're low and can't refill then alloc could empty and return enospc */
+	if (scoutfs_data_alloc_should_refill(sb, SCOUTFS_ALLOC_DATA_REFILL_THRESH)) {
 		scoutfs_inc_counter(sb, trans_commit_data_alloc_low);
 		return true;
 	}
@@ -439,38 +443,15 @@ static bool commit_before_hold(struct super_block *sb, struct trans_info *tri)
 	return false;
 }

-static bool acquired_hold(struct super_block *sb)
+/*
+ * called as a wait_event condition, needs to be careful to not change
+ * task state and is racing with waking paths that sub_return, test, and
+ * wake.
+ */
+static bool holders_no_writer(struct trans_info *tri)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	DECLARE_TRANS_INFO(sb, tri);
-	bool acquired;
-
-	/* if a caller already has a hold we acquire unconditionally */
-	if (inc_journal_info_holders()) {
-		atomic_inc(&tri->holders);
-		acquired = true;
-		goto out;
-	}
-
-	/* wait if the writer is blocking holds */
-	if (!inc_holders_unless_writer(tri)) {
-		dec_journal_info_holders();
-		acquired = false;
-		goto out;
-	}
-
-	/* wait if we're triggering another commit */
-	if (commit_before_hold(sb, tri)) {
-		release_holders(sb);
-		queue_trans_work(sbi);
-		acquired = false;
-		goto out;
-	}
-
-	trace_scoutfs_trans_acquired_hold(sb, current->journal_info, atomic_read(&tri->holders));
-	acquired = true;
-out:
-	return acquired;
+	smp_mb(); /* make sure task in wait_event queue before atomic read */
+	return !(atomic_read(&tri->holders) & TRANS_HOLDERS_WRITE_FUNC_BIT);
 }

 /*
@@ -486,15 +467,59 @@ out:
 * The writing thread marks itself as a global trans_task which
 * short-circuits all the hold machinery so it can call code that would
 * otherwise try to hold transactions while it is writing.
+ *
+ * If the caller is adding metadata items that will eventually consume
+ * free space -- not dirtying existing items or adding deletion items --
+ * then we can return enospc if our metadata allocator indicates that
+ * we're low on space.
 */
-int scoutfs_hold_trans(struct super_block *sb)
+int scoutfs_hold_trans(struct super_block *sb, bool allocing)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	DECLARE_TRANS_INFO(sb, tri);
+	u64 seq;
+	int ret;

 	if (current == sbi->trans_task)
 		return 0;

-	return wait_event_interruptible(sbi->trans_hold_wq, acquired_hold(sb));
+	for (;;) {
+		/* if a caller already has a hold we acquire unconditionally */
+		if (inc_journal_info_holders()) {
+			atomic_inc(&tri->holders);
+			ret = 0;
+			break;
+		}
+
+		/* wait until the writer work is finished */
+		if (!inc_holders_unless_writer(tri)) {
+			dec_journal_info_holders();
+			wait_event(sbi->trans_hold_wq, holders_no_writer(tri));
+			continue;
+		}
+
+		/* return enospc if server is into reserved blocks and we're allocating */
+		if (allocing && scoutfs_alloc_test_flag(sb, &tri->alloc, SCOUTFS_ALLOC_FLAG_LOW)) {
+			release_holders(sb);
+			ret = -ENOSPC;
+			break;
+		}
+
+		/* see if we need to trigger and wait for a commit before holding */
+		if (commit_before_hold(sb, tri)) {
+			seq = scoutfs_trans_sample_seq(sb);
+			release_holders(sb);
+			queue_trans_work(sbi);
+			wait_event(sbi->trans_hold_wq, scoutfs_trans_sample_seq(sb) != seq);
+			continue;
+		}
+
+		ret = 0;
+		break;
+	}
+
+	trace_scoutfs_hold_trans(sb, current->journal_info, atomic_read(&tri->holders), ret);
+	return ret;
 }

 /*
@@ -519,7 +544,7 @@ void scoutfs_release_trans(struct super_block *sb)

 	release_holders(sb);

-	trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders));
+	trace_scoutfs_release_trans(sb, current->journal_info, atomic_read(&tri->holders), 0);
 }

 /*
--- a/kmod/src/trans.h
+++ b/kmod/src/trans.h
@@ -1,18 +1,13 @@
 #ifndef _SCOUTFS_TRANS_H_
 #define _SCOUTFS_TRANS_H_

-/* the server will attempt to fill data allocs for each trans */
-#define SCOUTFS_TRANS_DATA_ALLOC_HWM	(2ULL * 1024 * 1024 * 1024)
-/* the client will force commits if data allocators get too low */
-#define SCOUTFS_TRANS_DATA_ALLOC_LWM	(256ULL * 1024 * 1024)
-
 void scoutfs_trans_write_func(struct work_struct *work);
 int scoutfs_trans_sync(struct super_block *sb, int wait);
 int scoutfs_file_fsync(struct file *file, loff_t start, loff_t end,
 		       int datasync);
 void scoutfs_trans_restart_sync_deadline(struct super_block *sb);

-int scoutfs_hold_trans(struct super_block *sb);
+int scoutfs_hold_trans(struct super_block *sb, bool allocing);
 bool scoutfs_trans_held(void);
 void scoutfs_release_trans(struct super_block *sb);
 u64 scoutfs_trans_sample_seq(struct super_block *sb);
--- a/kmod/src/volopt.c
+++ b/kmod/src/volopt.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) 2021 Versity Software, Inc.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+
+#include "super.h"
+#include "client.h"
+#include "volopt.h"
+
+/*
+ * Volume options are exposed through a sysfs directory.  Getting and
+ * setting the values sends rpcs to the server who owns the options in
+ * the super block.
+ */
+
+struct volopt_info {
+	struct super_block *sb;
+	struct scoutfs_sysfs_attrs ssa;
+};
+
+#define DECLARE_VOLOPT_INFO(sb, name) \
+	struct volopt_info *name = SCOUTFS_SB(sb)->volopt_info
+#define DECLARE_VOLOPT_INFO_KOBJ(kobj, name) \
+	DECLARE_VOLOPT_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)
+
+/*
+ * attribute arrays need to be dense but the options we export could
+ * well become sparse over time.  .show and .store are generic and we
+ * have a lookup table to map the attributes array indexes to the number
+ * and name of the option.
+ */
+static struct volopt_nr_name {
+	int nr;
+	char *name;
+} volopt_table[] = {
+	{ SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR, "data_alloc_zone_blocks" },
+};
+
+/* initialized by setup, pointer array is null terminated */
+static struct kobj_attribute volopt_attrs[ARRAY_SIZE(volopt_table)];
+static struct attribute *volopt_attr_ptrs[ARRAY_SIZE(volopt_table) + 1];
+
+static void get_opt_data(struct kobj_attribute *attr, struct scoutfs_volume_options *volopt,
+			 u64 *bit, __le64 **opt)
+{
+	size_t index = attr - &volopt_attrs[0];	/* attr's position within volopt_attrs[] */
+	int nr = volopt_table[index].nr;	/* option number recorded for that index */
+
+	*bit = 1ULL << nr;			/* mask for option nr, callers use it with set_bits */
+	*opt = &volopt->set_bits + 1 + nr;	/* the __le64 that is 1 + nr elements past set_bits */
+}
+
+static ssize_t volopt_attr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	DECLARE_VOLOPT_INFO_KOBJ(kobj, vinf);
+	struct super_block *sb = vinf->sb;
+	struct scoutfs_volume_options volopt;
+	__le64 *opt;
+	u64 bit;
+	int ret;
+
+	ret = scoutfs_client_get_volopt(sb, &volopt);
+	if (ret < 0)
+		return ret;
+
+	get_opt_data(attr, &volopt, &bit, &opt);
+
+	if (le64_to_cpu(volopt.set_bits) & bit) {
+		return snprintf(buf, PAGE_SIZE, "%llu", le64_to_cpup(opt));
+	} else {
+		buf[0] = '\0';
+		return 0;
+	}
+}
+
+static ssize_t volopt_attr_store(struct kobject *kobj, struct kobj_attribute *attr,
+				 const char *buf, size_t count)
+{
+	DECLARE_VOLOPT_INFO_KOBJ(kobj, vinf);
+	struct super_block *sb = vinf->sb;
+	struct scoutfs_volume_options volopt = {0,};
+	u8 chars[32];
+	__le64 *opt;
+	u64 bit;
+	u64 val;
+	int ret;
+
+	if (count == 0)
+		return 0;
+	if (count > sizeof(chars) - 1)
+		return -ERANGE;
+
+	get_opt_data(attr, &volopt, &bit, &opt);
+
+	if (buf[0] == '\n' || buf[0] == '\r') {
+		volopt.set_bits = cpu_to_le64(bit);
+
+		ret = scoutfs_client_clear_volopt(sb, &volopt);
+	} else {
+		memcpy(chars, buf, count);
+		chars[count] = '\0';
+		ret = kstrtoull(chars, 0, &val);
+		if (ret < 0)
+			return ret;
+
+		volopt.set_bits = cpu_to_le64(bit);
+		*opt = cpu_to_le64(val);
+
+		ret = scoutfs_client_set_volopt(sb, &volopt);
+	}
+
+	if (ret == 0)
+		ret = count;
+	return ret;
+}
+
+/*
+ * The volume option sysfs files are slim shims around RPCs so this
+ * should be called after the client is setup and before it is torn
+ * down.
+ */
+int scoutfs_volopt_setup(struct super_block *sb)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct volopt_info *vinf;
+	int ret;
+	int i;
+
+	/* persistent volume options are always a bitmap u64 then the 64 options */
+	BUILD_BUG_ON(sizeof(struct scoutfs_volume_options) != (1 + 64) * 8);
+
+	vinf = kzalloc(sizeof(struct volopt_info), GFP_KERNEL);
+	if (!vinf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	scoutfs_sysfs_init_attrs(sb, &vinf->ssa);
+	vinf->sb = sb;
+	sbi->volopt_info = vinf;
+
+	for (i = 0; i < ARRAY_SIZE(volopt_table); i++) {
+		volopt_attrs[i] = (struct kobj_attribute) {
+			.attr = { .name = volopt_table[i].name, .mode = S_IWUSR | S_IRUGO },
+			.show = volopt_attr_show,
+			.store  = volopt_attr_store,
+		};
+		volopt_attr_ptrs[i] = &volopt_attrs[i].attr;
+	}
+
+	BUILD_BUG_ON(ARRAY_SIZE(volopt_table) != ARRAY_SIZE(volopt_attr_ptrs) - 1);
+	volopt_attr_ptrs[i] = NULL;
+
+	ret = scoutfs_sysfs_create_attrs(sb, &vinf->ssa, volopt_attr_ptrs, "volume_options");
+	if (ret < 0)
+		goto out;
+
+out:
+	if (ret)
+		scoutfs_volopt_destroy(sb);
+
+	return ret;
+}
+
+void scoutfs_volopt_destroy(struct super_block *sb)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct volopt_info *vinf = sbi->volopt_info;
+
+	if (!vinf)	/* nothing was set up, or it was already torn down */
+		return;
+	scoutfs_sysfs_destroy_attrs(sb, &vinf->ssa);
+	kfree(vinf);
+	sbi->volopt_info = NULL;	/* a repeated destroy call becomes a nop */
+}
--- a/kmod/src/volopt.h
+++ b/kmod/src/volopt.h
@@ -0,0 +1,7 @@
+#ifndef _SCOUTFS_VOLOPT_H_
+#define _SCOUTFS_VOLOPT_H_
+
+int scoutfs_volopt_setup(struct super_block *sb);
+void scoutfs_volopt_destroy(struct super_block *sb);
+
+#endif
--- a/kmod/src/xattr.c
+++ b/kmod/src/xattr.c
@@ -577,7 +577,7 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, true);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -778,7 +778,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 					     &tgs) != 0)
 			memset(&tgs, 0, sizeof(tgs));

-		ret = scoutfs_hold_trans(sb);
+		ret = scoutfs_hold_trans(sb, false);
 		if (ret < 0)
 			break;
 		release = true;
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -4,3 +4,5 @@ src/dumb_setxattr
 src/handle_cat
 src/bulk_create_paths
 src/find_xattrs
+src/stage_tmpfile
+src/create_xattr_loop
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -7,7 +7,8 @@ BIN := src/createmany			\
 	src/handle_cat			\
 	src/bulk_create_paths		\
 	src/stage_tmpfile		\
-	src/find_xattrs
+	src/find_xattrs			\
+	src/create_xattr_loop

 DEPS := $(wildcard src/*.d)

--- a/tests/funcs/filter.sh
+++ b/tests/funcs/filter.sh
@@ -40,7 +40,7 @@ t_filter_dmesg()
 	# mount and unmount spew a bunch
 	re="$re|scoutfs.*client connected"
 	re="$re|scoutfs.*client disconnected"
-	re="$re|scoutfs.*server setting up"
+	re="$re|scoutfs.*server starting"
 	re="$re|scoutfs.*server ready"
 	re="$re|scoutfs.*server accepted"
 	re="$re|scoutfs.*server closing"
@@ -62,5 +62,16 @@ t_filter_dmesg()
 	# in debugging kernels we can slow things down a bit
 	re="$re|hrtimer: interrupt took .*"

+	# fencing tests force unmounts and trigger timeouts
+	re="$re|scoutfs .* forcing unmount"
+	re="$re|scoutfs .* reconnect timed out"
+	re="$re|scoutfs .* recovery timeout expired"
+	re="$re|scoutfs .* fencing previous leader"
+	re="$re|scoutfs .* reclaimed resources"
+	re="$re|scoutfs .* quorum .* error"
+	re="$re|scoutfs .* error reading quorum block"
+	re="$re|scoutfs .* error .* writing quorum block"
+	re="$re|scoutfs .* error .* while checking to delete inode"
+
 	egrep -v "($re)" 
 }
--- a/tests/funcs/fs.sh
+++ b/tests/funcs/fs.sh
@@ -17,6 +17,17 @@ t_sync_seq_index()
 	t_quiet sync
 }

+t_mount_rid()
+{
+	local nr="${1:-0}"
+	local mnt="$(eval echo \$T_M$nr)"
+	local rid
+
+	rid=$(scoutfs statfs -s rid -p "$mnt")
+
+	echo "$rid"
+}
+
 #
 # Output the "f.$fsid.r.$rid" identifier string for the given mount
 # number, 0 is used by default if none is specified. 
@@ -132,6 +143,16 @@ t_umount()
 	eval t_quiet umount \$T_M$nr
 }

+t_force_umount()
+{
+	local nr="$1"
+
+	test "$nr" -lt "$T_NR_MOUNTS" || \
+		t_fail "fs nr $nr invalid"
+
+	eval t_quiet umount -f \$T_M$nr
+}
+
 #
 # Attempt to mount all the configured mounts, assuming that they're
 # not already mounted.
@@ -277,3 +298,67 @@ t_counter_diff_changed() {
 		echo "counter $which didn't change" ||
 		echo "counter $which changed"
 }
+
+#
+# See if we can find a local mount with the caller's rid.
+#
+t_rid_is_mounted() {
+	local rid="$1"
+	local fr	# loop variable: each /sys/fs/scoutfs/* mount directory
+
+	for fr in /sys/fs/scoutfs/*; do
+		if [ "$(cat "$fr/rid" 2>/dev/null)" == "$rid" ]; then
+			return 0
+		fi
+	done
+
+	return 1
+}
+
+#
+# A given mount is being fenced if any mount has a fence request pending
+# for it which hasn't finished and been removed.
+#
+t_rid_is_fencing() {
+	local rid="$1"
+	local fr
+
+	for fr in /sys/fs/scoutfs/*; do
+		if [ -d "$fr/fence/$rid" ]; then
+			return 0
+		fi
+	done
+
+	return 1
+}
+
+#
+# Wait until the mount identified by the first rid arg is not in any
+# states specified by the remaining state description word args.
+#
+t_wait_if_rid_is() {
+	local rid="$1"
+
+	while ( [[ $* =~ mounted ]] && t_rid_is_mounted $rid ) ||
+	      ( [[ $* =~ fencing ]] && t_rid_is_fencing $rid ) ; do
+		sleep .5
+	done
+}
+
+#
+# Wait until any mount identifies itself as the elected leader.  We can
+# be waiting while tests mount and unmount so mounts may not be mounted
+# at the test's expected mount points.
+#
+t_wait_for_leader() {
+	local i
+
+	while sleep .25; do
+		for i in $(t_fs_nrs); do
+			local ldr="$(t_sysfs_path $i 2>/dev/null)/quorum/is_leader"
+			if [ "$(cat $ldr 2>/dev/null)" == "1" ]; then
+				return
+			fi
+		done
+	done
+}
--- a/tests/golden/enospc
+++ b/tests/golden/enospc
@@ -0,0 +1,8 @@
+== prepare directories and files
+== fallocate until enospc
+== remove all the files and verify free data blocks
+== make small meta fs
+== create large xattrs until we fill up metadata
+== remove files with xattrs after enospc
+== make sure we can create again
+== cleanup small meta fs
--- a/tests/golden/fence-and-reclaim
+++ b/tests/golden/fence-and-reclaim
@@ -0,0 +1,5 @@
+== make sure all mounts can see each other
+== force unmount one client, connection timeout, fence nop, mount
+== force unmount all non-server, connection timeout, fence nop, mount
+== force unmount server, quorum elects new leader, fence nop, mount
+== force unmount everything, new server fences all previous
--- a/tests/golden/lock-conflicting-batch-commit
+++ b/tests/golden/lock-conflicting-batch-commit
@@ -0,0 +1,4 @@
+== create per mount files
+== time independent modification
+== time concurrent independent modification
+== time concurrent conflicting modification
--- a/tests/golden/orphan-inodes
+++ b/tests/golden/orphan-inodes
@@ -0,0 +1,4 @@
+== test our inode existance function
+== unlinked and opened inodes still exist
+== orphan from failed evict deletion is picked up
+== orphaned inos in all mounts all deleted
--- a/tests/golden/resize-devices
+++ b/tests/golden/resize-devices
@@ -0,0 +1,27 @@
+== make initial small fs
+== 0s do nothing
+== shrinking fails
+resize_devices ioctl failed: Invalid argument (22)
+scoutfs: resize-devices failed: Invalid argument (22)
+resize_devices ioctl failed: Invalid argument (22)
+scoutfs: resize-devices failed: Invalid argument (22)
+resize_devices ioctl failed: Invalid argument (22)
+scoutfs: resize-devices failed: Invalid argument (22)
+== existing sizes do nothing
+== growing outside device fails
+resize_devices ioctl failed: Invalid argument (22)
+scoutfs: resize-devices failed: Invalid argument (22)
+resize_devices ioctl failed: Invalid argument (22)
+scoutfs: resize-devices failed: Invalid argument (22)
+resize_devices ioctl failed: Invalid argument (22)
+scoutfs: resize-devices failed: Invalid argument (22)
+== resizing meta works
+== resizing data works
+== shrinking back fails
+resize_devices ioctl failed: Invalid argument (22)
+scoutfs: resize-devices failed: Invalid argument (22)
+resize_devices ioctl failed: Invalid argument (22)
+scoutfs: resize-devices failed: Invalid argument (22)
+== resizing again does nothing
+== resizing to full works
+== cleanup extra fs
--- a/tests/run-tests.sh
+++ b/tests/run-tests.sh
@@ -18,10 +18,15 @@ die() {
 	exit 1
 }

+timestamp()
+{
+	date '+%F %T.%N'
+}
+
 # output a message with a timestamp to the run.log
 log()
 {
-	echo "[$(date '+%F %T.%N')] $*" >> "$T_RESULTS/run.log"
+	echo "[$(timestamp)] $*" >> "$T_RESULTS/run.log"
 }

 # run a logged command, exiting if it fails
@@ -66,6 +71,7 @@ $(basename $0) options:
    -X        | xfstests git repo. Used by tests/xfstests.sh.
    -x        | xfstests git branch to checkout and track.
    -y        | xfstests ./check additional args
+    -z <nr>   | set data-alloc-zone-blocks in mkfs
 EOF
 }

@@ -169,6 +175,11 @@ while true; do
 		T_XFSTESTS_ARGS="$2"
 		shift
 		;;
+	-z)
+		test -n "$2" || die "-z must have nr blocks argument"
+		T_DATA_ALLOC_ZONE_BLOCKS="-z $2"	# passed through to scoutfs mkfs
+		shift
+		;;
 	-h|-\?|--help)
 		show_help
 		exit 1
@@ -319,7 +330,8 @@ if [ -n "$T_MKFS" ]; then
 	done

 	msg "making new filesystem with $T_QUORUM quorum members"
-	cmd scoutfs mkfs -f $quo "$T_META_DEVICE" "$T_DATA_DEVICE"
+	cmd scoutfs mkfs -f $quo $T_DATA_ALLOC_ZONE_BLOCKS \
+		"$T_META_DEVICE" "$T_DATA_DEVICE"
 fi

 if [ -n "$T_INSMOD" ]; then
@@ -360,6 +372,39 @@ cmd cat /sys/kernel/debug/tracing/set_event
 cmd grep .  /sys/kernel/debug/tracing/options/trace_printk \
 	    /proc/sys/kernel/ftrace_dump_on_oops

+#
+# Build a fenced config that runs scripts out of the repository rather
+# than the default system directory
+#
+conf="$T_RESULTS/scoutfs-fencd.conf"
+cat > $conf << EOF
+SCOUTFS_FENCED_DELAY=1
+SCOUTFS_FENCED_RUN=$T_UTILS/fenced/local-force-unmount
+SCOUTFS_FENCED_RUN_ARGS=""
+EOF
+export SCOUTFS_FENCED_CONFIG_FILE="$conf"
+
+#
+# Run the agent in the background, log its output, and kill it if we
+# exit
+#
+fenced_log()
+{
+	echo "[$(timestamp)] $*" >> "$T_RESULTS/fenced.stdout.log"
+}
+fenced_pid=""
+kill_fenced()
+{
+	if test -n "$fenced_pid" -a -d "/proc/$fenced_pid" ; then
+		fenced_log "killing fenced pid $fenced_pid"
+		kill "$fenced_pid"
+	fi
+}
+trap kill_fenced EXIT
+$T_UTILS/fenced/scoutfs-fenced >> "$T_RESULTS/fenced.stdout.log" 2> "$T_RESULTS/fenced.stderr.log" &  # append: fenced_log appends to the same stdout log
+fenced_pid=$!
+fenced_log "started fenced pid $fenced_pid in the background"
+
 #
 # mount concurrently so that a quorum is present to elect the leader and
 # start a server.
--- a/tests/sequence
+++ b/tests/sequence
@@ -7,6 +7,7 @@ simple-release-extents.sh
 setattr_more.sh
 offline-extent-waiting.sh
 move-blocks.sh
+enospc.sh
 srch-basic-functionality.sh
 simple-xattr-unit.sh
 lock-refleak.sh
@@ -24,9 +25,13 @@ basic-posix-consistency.sh
 dirent-consistency.sh
 mkdir-rename-rmdir.sh
 lock-ex-race-processes.sh
+lock-conflicting-batch-commit.sh
 cross-mount-data-free.sh
 persistent-item-vers.sh
 setup-error-teardown.sh
+resize-devices.sh
+fence-and-reclaim.sh
+orphan-inodes.sh
 mount-unmount-race.sh
 createmany-parallel-mounts.sh
 archive-light-cycle.sh
--- a/tests/src/create_xattr_loop.c
+++ b/tests/src/create_xattr_loop.c
@@ -0,0 +1,113 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/xattr.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+
+static void exit_usage(void)
+{
+	printf(" -h/-?         output this usage message and exit\n"
+	       " -c <count>    number of xattrs to create\n"
+	       " -n <string>   xattr name prefix, -NR is appended\n"
+	       " -p <path>     string with path to file with xattrs\n" 
+	       " -s <size>     xattr value size\n");
+	exit(1);
+}
+
+int main(int argc, char **argv)
+{
+	char *pref = NULL;
+	char *path = NULL;
+	char *val;
+	char *name;
+	unsigned long long count = 0;
+	unsigned long long size = 0;
+	unsigned long long i;
+	int ret;
+	int c;
+
+	while ((c = getopt(argc, argv, "+c:n:p:s:h")) != -1) {
+		/* on '?' getopt leaves the offending option character in optopt */
+		switch (c) {
+			case 'c':
+				count = strtoull(optarg, NULL, 0);
+				break;
+			case 'n':
+				pref = strdup(optarg);
+				break;
+			case 'p':
+				path = strdup(optarg);
+				break;
+			case 's':
+				size = strtoull(optarg, NULL, 0);
+				break;
+			case '?':
+				printf("unknown argument: %c\n", optopt); /* fall through to usage */
+			case 'h':
+				exit_usage();
+		}
+	}
+
+	if (count == 0) {
+		printf("specify count of xattrs to create with -c\n");
+		exit(1);
+	}
+
+	if (count == ULLONG_MAX) {
+		printf("invalid -c count\n");
+		exit(1);
+	}
+
+	if (size == 0) {
+		printf("specify xattrs value size with -s\n");
+		exit(1);
+	}
+
+	if (size == ULLONG_MAX || size < 2) {
+		printf("invalid -s size\n");
+		exit(1);
+	}
+
+	if (path == NULL) {
+		printf("specify path to file with -p\n");
+		exit(1);
+	}
+
+	if (pref == NULL) {
+		printf("specify xattr name prefix string with -n\n");
+		exit(1);
+	}
+
+	ret = snprintf(NULL, 0, "%s-%llu", pref, ULLONG_MAX) + 1; /* longest name plus nul */
+	name = malloc(ret);
+	if (!name) {
+		printf("couldn't allocate xattr name buffer\n");
+		exit(1);
+	}
+
+	val = malloc(size);
+	if (!val) {
+		printf("couldn't allocate xattr value buffer\n");
+		exit(1);
+	}
+
+	memset(val, 'a', size - 1);
+	val[size - 1] = '\0';
+
+	for (i = 0; i < count; i++) {
+		sprintf(name, "%s-%llu", pref, i); /* fits: name was sized for ULLONG_MAX */
+
+		ret = setxattr(path, name, val, size, 0);
+		if (ret) {
+			printf("returned %d errno %d (%s)\n",
+					ret, errno, strerror(errno));
+			return 1;
+		}
+	}
+
+	return 0;
+}
--- a/tests/src/stage_tmpfile.c
+++ b/tests/src/stage_tmpfile.c
@@ -48,8 +48,9 @@ char buf[SZ];

 int main(int argc, char **argv)
 {
-	struct scoutfs_ioctl_release ioctl_args = {0};
+	struct scoutfs_ioctl_release rel = {0};
 	struct scoutfs_ioctl_move_blocks mb;
+	struct scoutfs_ioctl_stat_more stm;
 	struct sub_tmp_info sub_tmps[8];
 	int tot_size = 0;
 	char *dest_file;
@@ -111,12 +112,20 @@ int main(int argc, char **argv)
 		exit(1);
 	}

-	// release everything in dest file
-	ioctl_args.offset = 0;
-	ioctl_args.length = tot_size;
-	ioctl_args.data_version = 0;
+	// get current data_version after fallocate's size extensions
+	stm.valid_bytes = sizeof(struct scoutfs_ioctl_stat_more);
+	ret = ioctl(dest_fd, SCOUTFS_IOC_STAT_MORE, &stm);
+	if (ret < 0) {
+		perror("stat_more ioctl error");
+		exit(1);
+	}

-	ret = ioctl(dest_fd, SCOUTFS_IOC_RELEASE, &ioctl_args);
+	// release everything in dest file
+	rel.offset = 0;
+	rel.length = tot_size;
+	rel.data_version = stm.data_version;
+
+	ret = ioctl(dest_fd, SCOUTFS_IOC_RELEASE, &rel);
 	if (ret < 0) {
 		perror("error");
 		exit(1);
@@ -130,7 +139,7 @@ int main(int argc, char **argv)
 		mb.from_off = 0;
 		mb.len = sub_tmp->length;
 		mb.to_off = sub_tmp->offset;
-		mb.data_version = 0;
+		mb.data_version = stm.data_version;
 		mb.flags = SCOUTFS_IOC_MB_STAGE;

 		ret = ioctl(dest_fd, SCOUTFS_IOC_MOVE_BLOCKS, &mb);
--- a/tests/tests/enospc.sh
+++ b/tests/tests/enospc.sh
@@ -0,0 +1,100 @@
+#
+# test hitting enospc by filling with data or metadata and
+# then recovering by removing what we filled.
+#
+
+#    Type  Size     Total   Used      Free  Use%  
+#MetaData  64KB   1048576  32782   1015794     3  
+#    Data   4KB  16777152      0  16777152     0  
+free_blocks() {
+	local md="$1"
+	local mnt="$2"
+	scoutfs df -p "$mnt" | awk '($1 == "'$md'") { print $5; exit }'
+}
+
+t_require_commands scoutfs stat fallocate createmany
+
+echo "== prepare directories and files"
+for n in $(t_fs_nrs); do
+	eval path="\$T_D${n}/dir-$n/file-$n"
+	mkdir -p $(dirname $path)
+	touch $path
+done
+sync
+
+echo "== fallocate until enospc"
+before=$(free_blocks Data "$T_M0")
+finished=0
+while [ $finished != 1 ]; do
+	for n in $(t_fs_nrs); do
+		eval path="\$T_D${n}/dir-$n/file-$n"
+		off=$(stat -c "%s" "$path")
+
+		LC_ALL=C fallocate -o $off -l 128MiB  "$path" > $T_TMP.fallocate 2>&1
+		err="$?"
+
+		if grep -qi "no space" $T_TMP.fallocate; then
+			finished=1
+			break
+		fi
+		if [ "$err" != "0" ]; then
+			t_fail "fallocate failed with $err"
+		fi
+	done
+done
+
+echo "== remove all the files and verify free data blocks"
+for n in $(t_fs_nrs); do
+	eval dir="\$T_D${n}/dir-$n"
+	rm -rf "$dir"
+done
+sync
+after=$(free_blocks Data "$T_M0")
+# nothing else should be modifying data blocks
+test "$before" == "$after" || \
+	t_fail "$after free data blocks after rm, expected $before"
+
+# XXX this is all pretty manual, would be nice to have helpers
+echo "== make small meta fs"
+# meta device just big enough for reserves and the metadata we'll fill
+scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
+	t_fail "mkfs failed"
+SCR="/mnt/scoutfs.enospc"
+mkdir -p "$SCR"
+mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
+	"$T_EX_DATA_DEV" "$SCR"
+
+echo "== create large xattrs until we fill up metadata"
+mkdir -p "$SCR/xattrs"
+
+for f in $(seq 1 100000); do
+	file="$SCR/xattrs/file-$f"
+	touch "$file"
+
+	LC_ALL=C create_xattr_loop -c 1000 -n user.scoutfs-enospc -p "$file" -s 65535 > $T_TMP.cxl 2>&1
+	err="$?"
+
+	if grep -qi "no space" $T_TMP.cxl; then
+		echo "enospc at f $f" >> $T_TMP.cxl
+		break
+	fi
+	if [ "$err" != "0" ]; then
+		t_fail "create_xattr_loop failed with $err"
+	fi
+done
+
+echo "== remove files with xattrs after enospc"
+rm -rf "$SCR/xattrs"
+
+echo "== make sure we can create again"
+file="$SCR/file-after"
+touch $file
+setfattr -n user.scoutfs-enospc -v 1 "$file"
+sync
+rm -f "$file"
+
+echo "== cleanup small meta fs"
+umount "$SCR"
+rmdir "$SCR"
+
+t_pass
--- a/tests/tests/fence-and-reclaim.sh
+++ b/tests/tests/fence-and-reclaim.sh
@@ -0,0 +1,127 @@
+#
+# Fence nodes and reclaim their resources.
+#
+
+t_require_commands sleep touch grep sync scoutfs
+t_require_mounts 2
+
+#
+# Make sure that all mounts can read the results of a write from each
+# mount.  And make sure that the greatest of all the written seqs is
+# visible after the writes were committed by remote reads.
+#
+check_read_write()
+{
+	local expected
+	local greatest=0
+	local seq
+	local path
+	local saw
+	local w
+	local r
+
+	for w in $(t_fs_nrs); do
+		expected="$w wrote at $(date --rfc-3339=ns)"
+		eval path="\$T_D${w}/written"
+		echo "$expected" > "$path"
+
+		seq=$(scoutfs stat -s meta_seq $path)
+		if [ "$seq" -gt "$greatest" ]; then
+			greatest=$seq
+		fi
+
+		for r in $(t_fs_nrs); do
+			eval path="\$T_D${r}/written"
+			saw=$(cat "$path")
+			if [ "$saw" != "$expected" ]; then
+				echo "mount $r read '$saw' after mount $w wrote '$expected'"
+			fi
+		done
+	done
+
+	seq=$(scoutfs statfs -s committed_seq -p $T_D0)
+	if [ "$seq" -lt "$greatest" ]; then
+		echo "committed_seq $seq less than greatest $greatest"
+	fi
+}
+
+echo "== make sure all mounts can see each other"
+check_read_write
+
+echo "== force unmount one client, connection timeout, fence nop, mount"
+cl=$(t_first_client_nr)
+sv=$(t_server_nr)
+rid=$(t_mount_rid $cl)
+echo "cl $cl sv $sv rid $rid" >> "$T_TMP.log"
+sync
+t_force_umount $cl
+# wait for client reconnection to timeout
+while grep -q $rid $(t_debugfs_path $sv)/connections; do
+	sleep .5
+done
+while t_rid_is_fencing $rid; do
+	sleep .5
+done
+t_mount $cl
+check_read_write
+
+echo "== force unmount all non-server, connection timeout, fence nop, mount"
+sv=$(t_server_nr)
+pattern="nonsense"
+sync
+for cl in $(t_fs_nrs); do
+	if [ $cl == $sv ]; then
+		continue;
+	fi
+
+	rid=$(t_mount_rid $cl)
+	pattern="$pattern|$rid"
+	echo "cl $cl sv $sv rid $rid" >> "$T_TMP.log"
+
+	t_force_umount $cl
+done
+
+# wait for all client reconnections to timeout
+while egrep -q "($pattern)" $(t_debugfs_path $sv)/connections; do
+	sleep .5
+done
+# wait for all fence requests to complete
+while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
+	sleep .5
+done
+# remount all the clients
+for cl in $(t_fs_nrs); do
+	if [ $cl == $sv ]; then
+		continue;
+	fi
+	t_mount $cl
+done
+check_read_write
+
+echo "== force unmount server, quorum elects new leader, fence nop, mount"
+sv=$(t_server_nr)
+rid=$(t_mount_rid $sv)
+echo "sv $sv rid $rid" >> "$T_TMP.log"
+sync
+t_force_umount $sv
+t_wait_for_leader
+# wait until new server is done fencing unmounted leader rid
+while t_rid_is_fencing $rid; do
+	sleep .5
+done
+t_mount $sv
+check_read_write
+
+echo "== force unmount everything, new server fences all previous"
+sync
+for nr in $(t_fs_nrs); do
+	t_force_umount $nr
+done
+t_mount_all
+# wait for all fence requests to complete
+while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
+	sleep .5
+done
+check_read_write
+
+t_pass
--- a/tests/tests/lock-conflicting-batch-commit.sh
+++ b/tests/tests/lock-conflicting-batch-commit.sh
@@ -0,0 +1,59 @@
+#
+# If bulk work accidentally conflicts in the worst way we'd like to have
+# it not result in catastrophic performance.  Make sure that each
+# instance of bulk work is given the opportunity to get as much as it
+# can into the transaction under a lock before the lock is revoked
+# and the transaction is committed.
+#
+
+t_require_commands setfattr
+t_require_mounts 2
+
+NR=3000
+
+echo "== create per mount files" 
+for m in 0 1; do 
+	eval dir="\$T_D${m}/dir/$m"
+	t_quiet mkdir -p "$dir"
+	for a in $(seq 1 $NR); do touch "$dir/$a"; done
+done
+
+echo "== time independent modification"
+for m in 0 1; do 
+	eval dir="\$T_D${m}/dir/$m"
+	START=$SECONDS
+	for a in $(seq 1 $NR); do
+		setfattr -n user.test_grace -v $a "$dir/$a"
+	done
+	echo "mount $m: $((SECONDS - START))" >> $T_TMP.log
+done
+
+echo "== time concurrent independent modification"
+START=$SECONDS
+for m in 0 1; do 
+	eval dir="\$T_D${m}/dir/$m"
+	(for a in $(seq 1 $NR); do
+		setfattr -n user.test_grace -v $a "$dir/$a"; 
+	done) &
+done
+wait
+IND="$((SECONDS - START))"
+echo "ind: $IND" >> $T_TMP.log
+
+echo "== time concurrent conflicting modification"
+START=$SECONDS
+for m in 0 1; do 
+	eval dir="\$T_D${m}/dir/0"
+	(for a in $(seq 1 $NR); do
+		setfattr -n user.test_grace -v $a "$dir/$a"; 
+	done) &
+done
+wait
+CONF="$((SECONDS - START))"
+echo "conf: $CONF" >> $T_TMP.log
+
+if [ "$CONF" -gt "$((IND * 5))" ]; then
+	t_fail "conflicting $CONF secs is more than 5x independent $IND secs"
+fi
+
+t_pass
--- a/tests/tests/orphan-inodes.sh
+++ b/tests/tests/orphan-inodes.sh
@@ -0,0 +1,77 @@
+#
+# make sure we clean up orphaned inodes
+#
+
+t_require_commands sleep touch sync stat handle_cat kill rm
+t_require_mounts 2
+
+#
+# usually bash prints an annoying output message when jobs
+# are killed.  We can avoid that by redirecting stderr for
+# the bash process when it reaps the jobs that are killed.
+#
+silent_kill() {
+	exec {ERR}>&2 2>/dev/null
+	kill "$@"
+	wait "$@"
+	exec 2>&$ERR {ERR}>&-
+}
+
+#
+# We don't have a great way to test that inode items still exist.   We
+# don't prevent opening handles with nlink 0 today, so we'll use that.
+# This would have to change to some other method.
+#
+inode_exists()
+{
+	local ino="$1"
+
+	handle_cat "$T_M0" "$ino" > "$T_TMP.handle_cat.log" 2>&1
+}
+
+echo "== test our inode existance function"
+path="$T_D0/file"
+touch "$path"
+ino=$(stat -c "%i" "$path")
+inode_exists $ino || echo "$ino didn't exist"
+
+echo "== unlinked and opened inodes still exist"
+sleep 1000000 < "$path" &
+pid="$!"
+rm -f "$path"
+inode_exists $ino || echo "$ino didn't exist"
+
+echo "== orphan from failed evict deletion is picked up"
+# pending kill signal stops evict from getting locks and deleting
+silent_kill $pid
+sleep 55
+inode_exists $ino && echo "$ino still exists"
+
+echo "== orphaned inos in all mounts all deleted"
+pids=""
+inos=""
+for nr in $(t_fs_nrs); do
+	eval path="\$T_D${nr}/file-$nr"
+	touch "$path"
+	inos="$inos $(stat -c %i $path)"
+	sleep 1000000 < "$path" &
+	pids="$pids $!"
+	rm -f "$path"
+done
+sync
+silent_kill $pids
+for nr in $(t_fs_nrs); do
+	t_force_umount $nr
+done
+t_mount_all
+# wait for all fence requests to complete
+while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
+	sleep .5
+done
+# wait for orphan scans to run
+sleep 55
+for ino in $inos; do
+	inode_exists $ino && echo "$ino still exists"
+done
+
+t_pass
--- a/tests/tests/resize-devices.sh
+++ b/tests/tests/resize-devices.sh
@@ -0,0 +1,149 @@
+#
+# Some basic tests of online resizing metadata and data devices.
+#
+
+statfs_total() {
+	local single="total_$1_blocks"
+	local mnt="$2"
+
+	scoutfs statfs -s $single -p "$mnt"
+}
+
+df_free() {
+	local md="$1"
+	local mnt="$2"
+
+	scoutfs df -p "$mnt" | awk '($1 == "'$md'") { print $5; exit }'
+}
+
+same_totals() {
+	cur_meta_tot=$(statfs_total meta "$SCR")
+	cur_data_tot=$(statfs_total data "$SCR")
+
+	test "$cur_meta_tot" == "$exp_meta_tot" || \
+		t_fail "cur total_meta_blocks $cur_meta_tot != expected $exp_meta_tot"
+	test "$cur_data_tot" == "$exp_data_tot" || \
+		t_fail "cur total_data_blocks $cur_data_tot != expected $exp_data_tot"
+}
+
+#
+# make sure that the specified devices have grown by doubling.   The
+# total blocks can be tested exactly but the df reported total needs
+# some slop to account for reserved blocks and concurrent allocation.
+#
+devices_grew() {
+	cur_meta_tot=$(statfs_total meta "$SCR")
+	cur_data_tot=$(statfs_total data "$SCR")
+	cur_meta_df=$(df_free MetaData "$SCR")
+	cur_data_df=$(df_free Data "$SCR")
+
+	local grow_meta_tot=$(echo "$exp_meta_tot * 2" | bc)
+	local grow_data_tot=$(echo "$exp_data_tot * 2" | bc)
+	local grow_meta_df=$(echo "($exp_meta_df * 1.95)/1" | bc)
+	local grow_data_df=$(echo "($exp_data_df * 1.95)/1" | bc)
+
+	if [ "$1" == "meta" ]; then
+		test "$cur_meta_tot" == "$grow_meta_tot" || \
+			t_fail "cur total_meta_blocks $cur_meta_tot != grown $grow_meta_tot"
+		test "$cur_meta_df" -lt "$grow_meta_df" && \
+			t_fail "cur meta df total $cur_meta_df < grown $grow_meta_df"
+		exp_meta_tot=$cur_meta_tot
+		exp_meta_df=$cur_meta_df
+		shift
+	fi
+
+	if [ "$1" == "data" ]; then
+		test "$cur_data_tot" == "$grow_data_tot" || \
+			t_fail "cur total_data_blocks $cur_data_tot != grown $grow_data_tot"
+		test "$cur_data_df" -lt "$grow_data_df" && \
+			t_fail "cur data df total $cur_data_df < grown $grow_data_df"
+		exp_data_tot=$cur_data_tot
+		exp_data_df=$cur_data_df
+	fi
+}
+
+# first calculate small mkfs based on device size
+size_meta=$(blockdev --getsize64 "$T_EX_META_DEV")
+size_data=$(blockdev --getsize64 "$T_EX_DATA_DEV")
+quarter_meta=$(echo "$size_meta / 4" | bc)
+quarter_data=$(echo "$size_data / 4" | bc)
+
+# XXX this is all pretty manual, would be nice to have helpers
+echo "== make initial small fs"
+scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m $quarter_meta -d $quarter_data \
+	"$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
+		t_fail "mkfs failed"
+SCR="/mnt/scoutfs.enospc"
+mkdir -p "$SCR"
+mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
+	"$T_EX_DATA_DEV" "$SCR"
+
+# then calculate sizes based on blocks that mkfs used
+quarter_meta=$(echo "$(statfs_total meta "$SCR") * 64 * 1024" | bc)
+quarter_data=$(echo "$(statfs_total data "$SCR") * 4 * 1024" | bc)
+whole_meta=$(echo "$quarter_meta * 4" | bc)
+whole_data=$(echo "$quarter_data * 4" | bc)
+outsize_meta=$(echo "$whole_meta * 2" | bc)
+outsize_data=$(echo "$whole_data * 2" | bc)
+half_meta=$(echo "$whole_meta / 2" | bc)
+half_data=$(echo "$whole_data / 2" | bc)
+shrink_meta=$(echo "$quarter_meta / 2" | bc)
+shrink_data=$(echo "$quarter_data / 2" | bc)
+
+# and save expected values for checks
+exp_meta_tot=$(statfs_total meta "$SCR")
+exp_meta_df=$(df_free MetaData "$SCR")
+exp_data_tot=$(statfs_total data "$SCR")
+exp_data_df=$(df_free Data "$SCR")
+
+echo "== 0s do nothing"
+scoutfs resize-devices -p "$SCR" 
+scoutfs resize-devices -p "$SCR" -m 0
+scoutfs resize-devices -p "$SCR" -d 0
+scoutfs resize-devices -p "$SCR" -m 0 -d 0
+
+echo "== shrinking fails"
+scoutfs resize-devices -p "$SCR" -m $shrink_meta
+scoutfs resize-devices -p "$SCR" -d $shrink_data
+scoutfs resize-devices -p "$SCR" -m $shrink_meta -d $shrink_data
+same_totals
+
+echo "== existing sizes do nothing"
+scoutfs resize-devices -p "$SCR" -m $quarter_meta
+scoutfs resize-devices -p "$SCR" -d $quarter_data
+scoutfs resize-devices -p "$SCR" -m $quarter_meta -d $quarter_data
+same_totals
+
+echo "== growing outside device fails"
+scoutfs resize-devices -p "$SCR" -m $outsize_meta
+scoutfs resize-devices -p "$SCR" -d $outsize_data
+scoutfs resize-devices -p "$SCR" -m $outsize_meta -d $outsize_data
+same_totals
+
+echo "== resizing meta works"
+scoutfs resize-devices -p "$SCR" -m $half_meta
+devices_grew meta
+
+echo "== resizing data works"
+scoutfs resize-devices -p "$SCR" -d $half_data
+devices_grew data
+
+echo "== shrinking back fails"
+scoutfs resize-devices -p "$SCR" -m $quarter_meta
+scoutfs resize-devices -p "$SCR" -d $quarter_data
+same_totals
+
+echo "== resizing again does nothing"
+scoutfs resize-devices -p "$SCR" -m $half_meta
+scoutfs resize-devices -p "$SCR" -d $half_data
+same_totals
+
+echo "== resizing to full works"
+scoutfs resize-devices -p "$SCR" -m $whole_meta -d $whole_data
+devices_grew meta data
+
+echo "== cleanup extra fs"
+umount "$SCR"
+rmdir "$SCR"
+
+t_pass
--- a/utils/fenced/local-force-unmount
+++ b/utils/fenced/local-force-unmount
@@ -0,0 +1,35 @@
+#!/usr/bin/bash
+
+echo_fail() {
+	echo "$@" > /dev/stderr
+	exit 1
+}
+
+rid="$SCOUTFS_FENCED_REQ_RID"
+
+#
+# Look for a local mount with the rid to fence.  Typically we'll at
+# least find the mount with the server that requested the fence that
+# we're processing.   But it's possible that mounts are unmounted
+# before, or while, we're running.
+#
+mnts=$(findmnt -l -n -t scoutfs -o TARGET) || \
+	echo_fail "findmnt -t scoutfs failed"
+
+for mnt in $mnts; do
+	mnt_rid=$(scoutfs statfs -p "$mnt" -s rid) || \
+		echo_fail "scoutfs statfs $mnt failed"
+
+	if [ "$mnt_rid" == "$rid" ]; then
+		umount -f "$mnt" || \
+			echo_fail "umount -f $mnt failed"
+
+		exit 0
+	fi
+done
+
+#
+# If the mount doesn't exist on this host then it can't access the
+# devices by definition and can be considered fenced.
+#
+exit 0
--- a/utils/fenced/scoutfs-fenced
+++ b/utils/fenced/scoutfs-fenced
@@ -0,0 +1,94 @@
+#!/usr/bin/bash
+
+message_output()
+{
+	printf "[%s] %s\n" "$(date '+%F %T.%N')" "$@"
+}
+
+error_message()
+{
+	message_output "$@" >&2
+}
+
+error_exit()
+{
+	error_message "$@, exiting"
+	exit 1
+}
+
+log_message()
+{
+	message_output "$@"
+}
+
+# re-exec ourselves with our original arguments on hup to re-read the config
+hup_restart()
+{
+	log_message "caught SIGHUP, restarting"
+	exec "$@"
+}
+trap 'hup_restart "$0" "$@"' SIGHUP
+
+# defaults
+SCOUTFS_FENCED_CONFIG_FILE=${SCOUTFS_FENCED_CONFIG_FILE:-/etc/scoutfs/scoutfs-fenced.conf}
+SCOUTFS_FENCED_DELAY=2
+#SCOUTFS_FENCED_RUN
+#SCOUTFS_FENCED_RUN_ARGS
+
+test -n "$SCOUTFS_FENCED_CONFIG_FILE" || \
+	error_exit "SCOUTFS_FENCED_CONFIG_FILE isn't set"
+test -r "$SCOUTFS_FENCED_CONFIG_FILE" || \
+	error_exit "SCOUTFS_FENCED_CONFIG_FILE isn't readable file"
+
+log_message "reading config file $SCOUTFS_FENCED_CONFIG_FILE"
+
+. "$SCOUTFS_FENCED_CONFIG_FILE" || \
+	error_exit "error sourcing $SCOUTFS_FENCED_CONFIG_FILE as bash script"
+
+for conf in "${!SCOUTFS_FENCED_@}"; do
+	log_message "    config var $conf=${!conf}"
+done
+
+test -n "$SCOUTFS_FENCED_RUN" || \
+	error_exit "SCOUTFS_FENCED_RUN must be set"
+test -x "$SCOUTFS_FENCED_RUN" || \
+	error_exit "SCOUTFS_FENCED_RUN '$SCOUTFS_FENCED_RUN' isn't executable"
+
+#
+# main loop watching for fence request across all filesystems 
+#
+
+while sleep $SCOUTFS_FENCED_DELAY; do
+	for fence in /sys/fs/scoutfs/*/fence/*; do
+		# the unmatched glob is left as a literal path when there are no dirs
+		if [ ! -d "$fence" ]; then
+			continue
+		fi
+
+		# skip requests that have been handled
+		if [ "$(cat "$fence/fenced")" == 1 -o "$(cat "$fence/error")" == 1 ]; then
+			continue
+		fi
+
+		srv=$(basename $(dirname $(dirname $fence)))
+		rid="$(cat $fence/rid)"
+		ip="$(cat $fence/ipv4_addr)"
+		reason="$(cat $fence/reason)"
+
+		log_message "server $srv fencing rid $rid at IP $ip for $reason"
+
+		# export _REQ_ vars for the SCOUTFS_FENCED_RUN command to use
+		export SCOUTFS_FENCED_REQ_RID="$rid"
+		export SCOUTFS_FENCED_REQ_IP="$ip"
+
+		"$SCOUTFS_FENCED_RUN" $SCOUTFS_FENCED_RUN_ARGS
+		rc=$?
+		if [ "$rc" != 0 ]; then
+			log_message "server $srv fencing rid $rid saw error status $rc from $SCOUTFS_FENCED_RUN"
+			echo 1 > "$fence/error"
+			continue
+		fi
+
+		echo 1 > "$fence/fenced"
+	done
+done
--- a/utils/fenced/scoutfs-fenced.conf.example
+++ b/utils/fenced/scoutfs-fenced.conf.example
@@ -0,0 +1,3 @@
+SCOUTFS_FENCED_DELAY=1
+SCOUTFS_FENCED_RUN=/usr/libexec/scoutfs-fenced/run/local-force-unmount
+SCOUTFS_FENCED_RUN_ARGS=""
--- a/utils/fenced/scoutfs-fenced.service
+++ b/utils/fenced/scoutfs-fenced.service
@@ -0,0 +1,11 @@
+[Unit]
+Description=ScoutFS fenced
+
+[Service]
+Restart=on-failure
+RestartSec=5s
+StartLimitBurst=5
+ExecStart=/usr/libexec/scoutfs-fenced/scoutfs-fenced
+
+[Install]
+WantedBy=default.target
--- a/utils/man/scoutfs-fenced.8
+++ b/utils/man/scoutfs-fenced.8
@@ -0,0 +1,66 @@
+.TH scoutfs-fenced 8
+.SH NAME
+scoutfs-fenced \- scoutfs fence request monitoring and dispatch daemon
+.SH DESCRIPTION
+The
+.B scoutfs-fenced
+daemon runs on hosts with mounts that are configured as quorum members
+and could create fence requests.  It watches sysfs directories of
+mounted scoutfs volumes for the directories that store requests
+to fence a mount.
+
+.SH ENVIRONMENT
+scoutfs-fenced reads the
+.I SCOUTFS_FENCED_CONFIG_FILE
+environment variable for the path to the config file that contains its
+configuration.  The file must be readable and is sourced as a bash
+script and is expected to set the following configuration variables.
+
+.SH CONFIGURATION
+
+.TP
+.B SCOUTFS_FENCED_DELAY
+The number of seconds to wait between checking for fence request
+directories in the sysfs directories of all mounts on the host.
+
+.TP
+.B SCOUTFS_FENCED_RUN
+The path to the command to execute for each fence request.  The file at
+the path must be executable.
+
+.TP
+.B SCOUTFS_FENCED_RUN_ARGS
+The arguments that are unconditionally passed through to the run
+command.
+
+.SH DAEMONIZING AND LOGGING
+
+scoutfs-fenced runs in the foreground and writes to stderr and stdout.
+Disconnecting it from parents and redirecting its output are the
+responsibility of the host environment.
+
+.SH RUN COMMAND INTERFACE
+
+scoutfs-fenced sets environment variables for the run command with
+information about the mount that must be fenced:
+
+.TP
+.B SCOUTFS_FENCED_REQ_RID
+The RID of the mount to be fenced.
+.TP
+.B SCOUTFS_FENCED_REQ_IP
+The dotted quad IPv4 address of the last connection from the mount.
+
+.RE
+The return status of the run command indicates if the mount was
+fenced, or not.  If the mount was successfully fenced then the command
+should return a 0 success status.  If the run command returns a non-zero
+failure status then the request will be set as errored and the server
+will shut down.  The next server that starts will create another fence
+request for the mount.
+
+.SH SEE ALSO
+.BR scoutfs (5),
+
+.SH AUTHORS
+Zach Brown <zab@versity.com>
--- a/utils/man/scoutfs.5
+++ b/utils/man/scoutfs.5
@@ -1,6 +1,6 @@
 .TH scoutfs 5
 .SH NAME
-scoutfs \- overview and mount options for the scoutfs filesystem
+scoutfs \- high level overview of the scoutfs filesystem
 .SH DESCRIPTION
 A scoutfs filesystem is stored on two block devices.  Multiple mounts of
 the filesystem are supported between hosts that share access to the
@@ -34,7 +34,116 @@ the server for the filesystem if it is elected leader.
 The assigned number must match one of the slots defined with \-Q options
 when the filesystem was created with mkfs.  If the number assigned
 doesn't match a number created during mkfs then the mount will fail.
-.SH FURTHER READING
+.SH VOLUME OPTIONS
+Volume options are persistent options which are stored in the super
+block in the metadata device and which apply to all mounts of the volume.
+.sp
+Volume options may be initially specified as the volume is created
+as described in the mkfs command in
+.BR scoutfs (8).
+.sp
+Volume options may be changed at runtime by writing to files in sysfs
+while the volume is mounted.  Volume options are found in the
+volume_options/ directory with a file for each option.  Reading the
+file provides the current setting of the option and an empty string
+is returned if the option is not set.  To set the option, write
+the new value of the option to the file.  To clear the option, write
+a blank line with a newline to the file.  The write syscall will
+return an error if the set operation fails and a message will be written
+to the console.
+.sp
+The following volume options are supported:
+.TP
+.B data_alloc_zone_blocks=<zone size in 4KiB blocks>
+When the data_alloc_zone_blocks option is set the data device is
+logically divided into zones of equal length as specified by the value
+of the option.  The size of the zones must be greater than a minimum
+allocation pool size, large enough to result in no more than 1024 zones,
+and not more than the total number of blocks in the data device.
+.sp
+When set, the server will try to provide each mount with free data
+extents that don't share a zone with other mounts.  When a mount has free
+extents in a given zone the server will try and find more free extents
+in that zone.  When the mount is not in a zone, or its zone has no more
+free extents, the server will try and find free extents in a zone that
+no other mount currently occupies.  The result is to try and produce
+write streams where only one mount is writing into each zone.
+.SH FENCING
+.B scoutfs
+mounts coordinate exclusive access to shared resources through
+communication with the mount that was elected leader.
+A mount can malfunction and stop participating at which point it needs
+to be safely isolated ("fenced off") from shared resources before other mounts can
+have their turn at exclusive access.
+.sp
+Only the elected leader can fence mounts.  As the leader decides that a
+mount must be fenced, typically by timeouts expiring without
+communication from the mount, it creates a fence request.   Fence
+requests are visible as directories in the leader mount's sysfs
+directory.  The fence request directory is named for the RID of the
+mount being fenced.  The directory contains the following files:
+
+.RS
+.TP
+.B elapsed_secs
+Reading this file gives the number of seconds that have passed since
+this fence request was created.
+.TP
+.B error
+This file contains 0 when the fence request is created.  Userspace
+fencing agents write 1 into this file if they are unable to fence the
+mount.  The volume can not make progress until the mount is fenced so
+this will cause the server to stop and another mount will be elected
+leader.
+.TP
+.B fenced
+This file contains 0 when the fence request is created.  Userspace
+fencing agents write 1 into this file once the mount has been fenced.
+.TP
+.B ipv4_addr
+This file contains the dotted quad IPv4 peer address of the last
+connected socket from the mount.  Userspace fencing agents can use this
+to find the host that contains the mount.
+.TP
+.B reason
+This file contains a text string that indicates the reason that the
+mount is being fenced:
+
+.B client_recovery
+- During startup the server found persistent items recording the presence
+of a mount that didn't reconnect to the server in time.
+.sp
+.B client_reconnect
+- A mount disconnected from the server and didn't reconnect in time.
+.sp
+.B quorum_block_leader
+- As a leader was elected it read persistent blocks that indicated that
+a previous leader had not shut down and cleared their quorum block.
+.TP
+.B rid
+This file contains the hex string of the RID of the mount to be fenced.
+.RE
+
+The request directories enable userspace processes to gather the
+information to find the host with the mount to fence, isolate the mount
+by whatever means are appropriate (f.e. cut off network and storage
+communication, force unmount the mount, isolate storage fabric ports,
+reboot the host) and write to the
+.I fenced
+file.
+.sp
+Once the 
+.I fenced
+file is written to the server reclaims the resources
+associated with the fenced mount and resumes normal operations.
+.sp
+If the 
+.I error
+file is written to then the server cannot make forward progress and
+shuts down.  The request can similarly enter an errored state if enough
+time passes before userspace completes the request.
+ 
+.SH CORRUPTION DETECTION
 A
 .B scoutfs
 filesystem can detect corruption at runtime.  A catalog of kernel log
--- a/utils/man/scoutfs.8
+++ b/utils/man/scoutfs.8
@@ -32,10 +32,18 @@ A path within a ScoutFS filesystem.
 .PD

 .TP
-.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-f|--force]"
+.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-z|--data-alloc-zone-blocks BLOCKS] [-f|--force] [-A|--allow-small-size]"
 .sp
 Initialize a new ScoutFS filesystem on the target devices. Since ScoutFS uses
 separate block devices for its metadata and data storage, two are required.
+The internal structures and nature of metadata and data transactions
+lead to minimum viable device sizes.  
+.B mkfs
+will check both devices and fail with an error if either are under the
+minimum size.   If
+.B --allow-small-size
+is given then sizes under the minimum size will be
+allowed after printing an informational warning.
 .sp
 If
 .B --force
@@ -81,12 +89,77 @@ kibibytes, mebibytes, etc.
 .B "-d, --max-data-size SIZE"
 Same as previous, but for limiting the size of the data device.
 .TP
+.B "-A, --allow-small-size"
+Allows use of specified device sizes less than the minimum.  This can
+result in bad behaviour and is only intended for testing.
+.TP
+.B "-z, --data-alloc-zone-blocks BLOCKS"
+Set the data_alloc_zone_blocks volume option, as described in
+.BR scoutfs (5).
+.TP
 .B "-f, --force"
 Ignore presence of existing data on the data and metadata devices.
 .RE
 .PD

 .TP
+.BI "resize-devices [-p|--path PATH] [-m|--meta-size SIZE] [-d|--data-size SIZE]"
+.sp
+Resize the metadata or data devices of a mounted ScoutFS filesystem.
+.sp
+ScoutFS metadata has free extent records and fields in the super block
+that reflect the size of the devices in use.  This command sends a
+request to the server to change the size of the device that can be used
+by updating free extents and setting the super block fields.
+.sp
+The specified sizes are in bytes and are translated into block counts.
+If the specified sizes are not a multiple of the metadata or data block
+sizes then a message is output and the resized size is truncated down to
+the next whole block.  Specifying either a size of 0 or the current
+device size makes no change.    The current size of the devices can be
+seen, in units of their respective block sizes, in the total_meta_blocks
+and total_data_blocks fields returned by the scoutfs statfs command (via
+the statfs_more ioctl).
+.sp
+Shrinking is not supported.  Specifying a smaller size for either device
+will return an error and neither device will be resized.
+.sp
+Specifying a larger size will expand the initial size of the device that
+will be used.  Free space records are added for the expanded region and
+can be used once the resizing transaction is complete.
+.sp
+The resizing action is performed in a transaction on the server.  This
+command will hang until a server is elected and running and can service
+the request.  The server serializes any concurrent requests to resize.
+.sp
+The new sizes must fit within the current sizes of the mounted devices.
+Presumably this command is being performed as part of a larger
+coordinated resize of the underlying devices.  The device must be
+expanded before ScoutFS can use the larger device and ScoutFS must stop
+using a region to shrink before it could be removed from the device
+(which is not currently supported).
+.sp
+The resize will be committed by the server before the response is sent
+to the client.  The system can be using the new device size before the
+result is communicated through the client and this command completes.
+The client could crash and the server could still have performed the
+resize.
+.RS 1.0i
+.PD 0
+.TP
+.sp
+.B "-p, --path PATH"
+A path in the mounted ScoutFS filesystem which will have its devices
+resized.
+.TP
+.B "-m, --meta-size SIZE"
+.B "-d, --data-size SIZE"
+The new size of the metadata or data device to use, in bytes.  Size is given as
+an integer followed by a units digit: "K", "M", "G", "T", "P", to denote
+kibibytes, mebibytes, etc.
+.RE
+.PD
+.TP
 .BI "stat FILE [-s|--single-field FIELD-NAME]"
 .sp
 Display ScoutFS-specific metadata fields for the given file.
--- a/utils/scoutfs-utils.spec.in
+++ b/utils/scoutfs-utils.spec.in
@@ -54,12 +54,19 @@ cp man/*.8.gz $RPM_BUILD_ROOT%{_mandir}/man8/.
 install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs
 install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h
 install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h
+install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced
+install -m 755 -D fenced/local-force-unmount $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/run/local-force-unmount
+install -m 644 -D fenced/scoutfs-fenced.service $RPM_BUILD_ROOT%{_unitdir}/scoutfs-fenced.service
+install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/scoutfs/scoutfs-fenced.conf.example

 %files
 %defattr(644,root,root,755)
 %{_mandir}/man*/scoutfs*.gz
+%{_unitdir}/scoutfs-fenced.service
+%{_sysconfdir}/scoutfs
 %defattr(755,root,root,755)
 %{_sbindir}/scoutfs
+%{_libexecdir}/scoutfs-fenced

 %files -n scoutfs-devel
 %defattr(644,root,root,755)
--- a/utils/src/btree.c
+++ b/utils/src/btree.c
@@ -40,7 +40,7 @@ static void *alloc_val(struct scoutfs_btree_block *bt, int len)
 {
 	le16_add_cpu(&bt->mid_free_len, -len);
 	le16_add_cpu(&bt->total_item_bytes, len);
-	return (void *)bt + le16_to_cpu(bt->mid_free_len);
+	return (void *)&bt->items[le16_to_cpu(bt->nr_items)] + le16_to_cpu(bt->mid_free_len);
 }

 /*
--- a/utils/src/dev.c
+++ b/utils/src/dev.c
@@ -6,12 +6,13 @@
 #include <sys/ioctl.h>
 #include <linux/fs.h>
 #include <errno.h>
+#include <stdbool.h>

 #include "sparse.h"
 #include "dev.h"

 int device_size(char *path, int fd,
-		u64 min_size, u64 max_size,
+		u64 min_size, u64 max_size, bool allow_small_size,
 		char *use_type, u64 *size_ret)
 {
 	struct stat st;
@@ -63,10 +64,13 @@ int device_size(char *path, int fd,
 	if (size < min_size) {
 		fprintf(stderr,
 			BASE_SIZE_FMT" %s too small for min "
-			BASE_SIZE_FMT" %s device\n",
+			BASE_SIZE_FMT" %s device%s\n",
 			BASE_SIZE_ARGS(size), target_type,
-			BASE_SIZE_ARGS(min_size), use_type);
-		return -EINVAL;
+			BASE_SIZE_ARGS(min_size), use_type,
+			allow_small_size ? ", allowing with -A" : "");
+
+		if (!allow_small_size)
+			return -EINVAL;
 	}

 	*size_ret = size;
--- a/utils/src/dev.h
+++ b/utils/src/dev.h
@@ -1,6 +1,8 @@
 #ifndef _DEV_H_
 #define _DEV_H_

+#include <stdbool.h>
+
 #define BASE_SIZE_FMT "%.2f%s"
 #define BASE_SIZE_ARGS(sz) size_flt(sz, 1), size_str(sz, 1)

@@ -8,7 +10,7 @@
 #define SIZE_ARGS(nr, sz) (nr), size_flt(nr, sz), size_str(nr, sz)

 int device_size(char *path, int fd,
-		u64 min_size, u64 max_size,
+		u64 min_size, u64 max_size, bool allow_small_size,
 		char *use_type, u64 *size_ret);
 float size_flt(u64 nr, unsigned size);
 char *size_str(u64 nr, unsigned size);
--- a/utils/src/df.c
+++ b/utils/src/df.c
@@ -86,6 +86,11 @@ static int do_df(struct df_args *args)
 			data_free += ade[i].blocks;
 	}

+	if (meta_free >= sfm.reserved_meta_blocks)
+		meta_free -= sfm.reserved_meta_blocks;
+	else
+		meta_free = 0;
+
 	snprintf(cells[0][0], CHARS, "Type");
 	snprintf(cells[0][1], CHARS, "Size");
 	snprintf(cells[0][2], CHARS, "Total");
--- a/utils/src/mkfs.c
+++ b/utils/src/mkfs.c
@@ -57,6 +57,15 @@ static int write_block(int fd, u32 magic, __le64 fsid, u64 seq, u64 blkno,
 	return 0;
 }

+/*
+ * Return the order of the length of a free extent, which we define as
+ * floor(log_8_(len)): 0..7 = 0, 8..63 = 1, etc.
+ */
+static u64 free_extent_order(u64 len)
+{
+	return (flsll(len | 1) - 1) / 3;
+}
+
 /*
 * Write the single btree block that contains the blkno and len indexed
 * items to store the given extent, and update the root to point to it.
@@ -72,31 +81,61 @@ static int write_alloc_root(int fd, __le64 fsid,
 	root->total_len = cpu_to_le64(len);

 	memset(&key, 0, sizeof(key));
-	key.sk_zone = SCOUTFS_FREE_EXTENT_ZONE;
-	key.sk_type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
-	key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO);
+	key.sk_zone = SCOUTFS_FREE_EXTENT_BLKNO_ZONE;
 	key.skfb_end = cpu_to_le64(start + len - 1);
 	key.skfb_len = cpu_to_le64(len);
 	btree_append_item(bt, &key, NULL, 0);

 	memset(&key, 0, sizeof(key));
-	key.sk_zone = SCOUTFS_FREE_EXTENT_ZONE;
-	key.sk_type = SCOUTFS_FREE_EXTENT_LEN_TYPE;
-	key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO);
-	key.skfl_neglen = cpu_to_le64(-len);
-	key.skfl_blkno = cpu_to_le64(start);
+	key.sk_zone = SCOUTFS_FREE_EXTENT_ORDER_ZONE;
+	key.skfo_revord = cpu_to_le64(U64_MAX - free_extent_order(len));
+	key.skfo_end = cpu_to_le64(start + len - 1);
+	key.skfo_len = cpu_to_le64(len);
 	btree_append_item(bt, &key, NULL, 0);

 	return write_block(fd, SCOUTFS_BLOCK_MAGIC_BTREE, fsid, seq, blkno,
 			   SCOUTFS_BLOCK_LG_SHIFT, &bt->hdr);
 }

+#define SCOUTFS_SERVER_DATA_FILL_TARGET ((4ULL * 1024 * 1024 * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
+/* Report on stderr and return true when a non-zero zone_blocks is out of range; zero always passes. */
+static bool invalid_data_alloc_zone_blocks(u64 total_data_blocks, u64 zone_blocks)
+{
+	u64 nr;
+
+	if (zone_blocks == 0)
+		return false;
+
+	if (zone_blocks < SCOUTFS_SERVER_DATA_FILL_TARGET) {
+		fprintf(stderr, "setting data_alloc_zone_blocks to '%llu' failed, must be at least %llu mount data allocation target blocks\n",
+		        zone_blocks, SCOUTFS_SERVER_DATA_FILL_TARGET);
+		return true;
+	}
+
+	nr = total_data_blocks / SCOUTFS_DATA_ALLOC_MAX_ZONES;
+	if (zone_blocks < nr) {
+		fprintf(stderr, "setting data_alloc_zone_blocks to '%llu' failed, must be greater than %llu blocks which results in max %u zones\n",
+		        zone_blocks, nr, SCOUTFS_DATA_ALLOC_MAX_ZONES);
+		return true;
+	}
+
+	if (zone_blocks > total_data_blocks) {
+		fprintf(stderr, "setting data_alloc_zone_blocks to '%llu' failed, must be at most %llu total data device blocks\n",
+		        zone_blocks, total_data_blocks);
+		return true;
+	}
+
+	return false;
+}
+
 struct mkfs_args {
 	char *meta_device;
 	char *data_device;
 	unsigned long long max_meta_size;
 	unsigned long long max_data_size;
+	u64 data_alloc_zone_blocks;	/* -z; the volume option is only set when non-zero */
 	bool force;
+	bool allow_small_size;		/* -A; undersized devices warn instead of failing */
 	int nr_slots;
 	struct scoutfs_quorum_slot slots[SCOUTFS_QUORUM_MAX_SLOTS];
 };
@@ -177,13 +216,15 @@ static int do_mkfs(struct mkfs_args *args)
 		goto out;
 	}

-	ret = device_size(args->meta_device, meta_fd, 2ULL * (1024 * 1024 * 1024),
-			  args->max_meta_size, "meta", &meta_size);
+	/* minimum meta device size to make reserved blocks reasonably large */
+	ret = device_size(args->meta_device, meta_fd, 64ULL * (1024 * 1024 * 1024),
+			  args->max_meta_size, args->allow_small_size, "meta", &meta_size);
 	if (ret)
 		goto out;

-	ret = device_size(args->data_device, data_fd, 8ULL * (1024 * 1024 * 1024),
-			  args->max_data_size, "data", &data_size);
+	/* .. then arbitrarily the same minimum data device size */
+	ret = device_size(args->data_device, data_fd, 64ULL * (1024 * 1024 * 1024),
+			  args->max_data_size, args->allow_small_size, "data", &data_size);
 	if (ret)
 		goto out;

@@ -197,19 +238,26 @@ static int do_mkfs(struct mkfs_args *args)
 	memset(super, 0, SCOUTFS_BLOCK_SM_SIZE);
 	super->version = cpu_to_le64(SCOUTFS_INTEROP_VERSION);
 	uuid_generate(super->uuid);
-	super->next_ino = cpu_to_le64(round_up(SCOUTFS_ROOT_INO + 1, SCOUTFS_LOCK_INODE_GROUP_NR));
-	super->next_trans_seq = cpu_to_le64(1);
+	super->next_ino = cpu_to_le64(SCOUTFS_ROOT_INO + 1);
+	super->seq = cpu_to_le64(1);
 	super->total_meta_blocks = cpu_to_le64(last_meta + 1);
-	super->first_meta_blkno = cpu_to_le64(next_meta);
-	super->last_meta_blkno = cpu_to_le64(last_meta);
-	super->total_data_blocks = cpu_to_le64(last_data - first_data + 1);
-	super->first_data_blkno = cpu_to_le64(first_data);
-	super->last_data_blkno = cpu_to_le64(last_data);
+	super->total_data_blocks = cpu_to_le64(last_data + 1);

 	assert(sizeof(args->slots) ==
 		     member_sizeof(struct scoutfs_super_block, qconf.slots));
 	memcpy(super->qconf.slots, args->slots, sizeof(args->slots));

+	if (invalid_data_alloc_zone_blocks(le64_to_cpu(super->total_data_blocks),
+					   args->data_alloc_zone_blocks)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (args->data_alloc_zone_blocks) {
+		super->volopt.set_bits |= cpu_to_le64(SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT);
+		super->volopt.data_alloc_zone_blocks = cpu_to_le64(args->data_alloc_zone_blocks);
+	}
+
 	/* fs root starts with root inode and its index items */
 	blkno = next_meta++;
 	btree_init_root_single(&super->fs_root, bt, 1, blkno);
@@ -268,7 +316,7 @@ static int do_mkfs(struct mkfs_args *args)
 	blkno = next_meta++;
 	ret = write_alloc_root(meta_fd, fsid, &super->data_alloc, bt,
 			       1, blkno, first_data,
-			       le64_to_cpu(super->total_data_blocks));
+			       last_data - first_data + 1);
 	if (ret < 0)
 		goto out;

@@ -471,6 +519,20 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
 				prev_val, args->max_data_size);
 		break;
 	}
+	case 'A':
+		args->allow_small_size = true;
+		break;
+	case 'z': /* data-alloc-zone-blocks */
+	{
+		ret = parse_u64(arg, &args->data_alloc_zone_blocks);
+		if (ret)
+			return ret;
+
+		if (args->data_alloc_zone_blocks == 0)
+			argp_error(state, "must provide non-zero data-alloc-zone-blocks");
+
+		break;
+	}
 	case ARGP_KEY_ARG:
 		if (!args->meta_device)
 			args->meta_device = strdup_or_error(state, arg);
@@ -499,8 +561,10 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
 static struct argp_option options[] = {
 	{ "quorum-slot", 'Q', "NR,ADDR,PORT", 0, "Specify quorum slot addresses [Required]"},
 	{ "force", 'f', NULL, 0, "Overwrite existing data on block devices"},
+	{ "allow-small-size", 'A', NULL, 0, "Allow specified meta/data devices less than minimum, still warns"},
 	{ "max-meta-size", 'm', "SIZE", 0, "Use a size less than the base metadata device size (bytes or KMGTP units)"},
 	{ "max-data-size", 'd', "SIZE", 0, "Use a size less than the base data device size (bytes or KMGTP units)"},
+	{ "data-alloc-zone-blocks", 'z', "BLOCKS", 0, "Divide data device into block zones so each mount writes to a zone (4KB blocks)"},
 	{ NULL }
 };

--- a/utils/src/print.c
+++ b/utils/src/print.c
@@ -1,3 +1,4 @@
+#define _GNU_SOURCE /* ffsll for glibc < 2.27 */
 #include <unistd.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -157,7 +158,7 @@ static print_func_t find_printer(u8 zone, u8 type)
 	    type <= SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE)
 		return print_inode_index;

-	if (zone == SCOUTFS_RID_ZONE) {
+	if (zone == SCOUTFS_ORPHAN_ZONE) {
 		if (type == SCOUTFS_ORPHAN_TYPE)
 			return print_orphan;
 	}
@@ -209,8 +210,8 @@ static int print_logs_item(struct scoutfs_key *key, void *val,
 	/* only items in leaf blocks have values */
 	if (val) {
 		liv = val;
-		printf("    log_item_value: vers %llu flags %x\n",
-		       le64_to_cpu(liv->vers), liv->flags);
+		printf("    log_item_value: seq %llu flags %x\n",
+		       le64_to_cpu(liv->seq), liv->flags);

 		/* deletion items don't have values */
 		if (!(liv->flags & SCOUTFS_LOG_ITEM_FLAG_DELETION)) {
@@ -244,15 +245,15 @@ static int print_logs_item(struct scoutfs_key *key, void *val,
 	le64_to_cpu((p)->blkno), le64_to_cpu((p)->seq)

 #define AL_HEAD_F \
-	AL_REF_F" total_nr %llu first_nr %u"
+	AL_REF_F" total_nr %llu first_nr %u flags 0x%x"
 #define AL_HEAD_A(p)					\
 	AL_REF_A(&(p)->ref), le64_to_cpu((p)->total_nr),\
-	le32_to_cpu((p)->first_nr)
+	le32_to_cpu((p)->first_nr), le32_to_cpu((p)->flags)

 #define ALCROOT_F \
-	BTROOT_F" total_len %llu"
+	BTROOT_F" total_len %llu flags 0x%x"
 #define ALCROOT_A(ar) \
-	BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len)
+	BTROOT_A(&(ar)->root), le64_to_cpu((ar)->total_len), le32_to_cpu((ar)->flags)

 #define SRE_FMT "%016llx.%llu.%llu"
 #define SRE_A(sre)						\
@@ -272,6 +273,9 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
 				unsigned val_len, void *arg)
 {
 	struct scoutfs_log_trees *lt = val;
+	u64 zones;
+	int bit;
+	int i;

 	printf("    rid %llu nr %llu\n",
 	       le64_to_cpu(key->sklt_rid), le64_to_cpu(key->sklt_nr));
@@ -285,9 +289,12 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
 		       "      data_avail: "ALCROOT_F"\n"
 		       "      data_freed: "ALCROOT_F"\n"
 		       "      srch_file: "SRF_FMT"\n"
-		       "      max_item_vers: %llu\n"
+		       "      max_item_seq: %llu\n"
 		       "      rid: %016llx\n"
-		       "      nr: %llu\n",
+		       "      nr: %llu\n"
+		       "      flags: %llx\n"
+		       "      data_alloc_zone_blocks: %llu\n"
+		       "      data_alloc_zones: ",
 		       AL_HEAD_A(&lt->meta_avail),
 		       AL_HEAD_A(&lt->meta_freed),
 			lt->item_root.height,
@@ -298,9 +305,24 @@ static int print_log_trees_item(struct scoutfs_key *key, void *val,
 		       ALCROOT_A(&lt->data_avail),
 		       ALCROOT_A(&lt->data_freed),
 		       SRF_A(&lt->srch_file),
-		       le64_to_cpu(lt->max_item_vers),
+		       le64_to_cpu(lt->max_item_seq),
 		       le64_to_cpu(lt->rid),
-		       le64_to_cpu(lt->nr));
+		       le64_to_cpu(lt->nr),
+		       le64_to_cpu(lt->flags),
+		       le64_to_cpu(lt->data_alloc_zone_blocks));
+
+		for (i = 0; i < SCOUTFS_DATA_ALLOC_ZONE_LE64S; i++) {
+			if (lt->data_alloc_zones[i] == 0)
+				continue;
+
+			zones = le64_to_cpu(lt->data_alloc_zones[i]);
+			while (zones) {
+				bit = ffsll(zones) - 1;
+				printf("%u ", (i * 64) + bit);
+				zones ^= (1ULL << bit);
+			}
+		}
+		printf("\n");
 	}

 	return 0;
@@ -352,9 +374,79 @@ static int print_mounted_client_entry(struct scoutfs_key *key, void *val,
 				      unsigned val_len, void *arg)
 {
 	struct scoutfs_mounted_client_btree_val *mcv = val;
+	struct in_addr in;

-	printf("    rid %016llx flags 0x%x\n",
-	       le64_to_cpu(key->skmc_rid), mcv->flags);
+	memset(&in, 0, sizeof(in));
+	in.s_addr = htonl(le32_to_cpu(mcv->addr.v4.addr));
+
+	printf("    rid %016llx ipv4_addr %s flags 0x%x\n",
+	       le64_to_cpu(key->skmc_rid), inet_ntoa(in), mcv->flags);
+
+	return 0;
+}
+
+/* print_btree() callback for log_merge items; the key's zone selects which value struct is printed. */
+static int print_log_merge_item(struct scoutfs_key *key, void *val, unsigned val_len, void *arg)
+{
+	struct scoutfs_log_merge_status *stat;
+	struct scoutfs_log_merge_range *rng;
+	struct scoutfs_log_merge_request *req;
+	struct scoutfs_log_merge_complete *comp;
+	struct scoutfs_log_merge_freeing *fr;
+
+	switch (key->sk_zone) {
+	case SCOUTFS_LOG_MERGE_STATUS_ZONE:
+		stat = val;
+		printf("    status: next_range_key "SK_FMT" nr_req %llu nr_comp %llu"
+		       " last_seq %llu seq %llu\n",
+		       SK_ARG(&stat->next_range_key),
+		       le64_to_cpu(stat->nr_requests),
+		       le64_to_cpu(stat->nr_complete),
+		       le64_to_cpu(stat->last_seq),
+		       le64_to_cpu(stat->seq));
+		break;
+	case SCOUTFS_LOG_MERGE_RANGE_ZONE:
+		rng = val;
+		printf("    range: start "SK_FMT" end "SK_FMT"\n",
+		       SK_ARG(&rng->start),
+		       SK_ARG(&rng->end));
+		break;
+	case SCOUTFS_LOG_MERGE_REQUEST_ZONE:
+		req = val;
+		printf("    request: logs_root "BTROOT_F" root "BTROOT_F" start "SK_FMT
+		       " end "SK_FMT" last_seq %llu rid %016llx seq %llu flags 0x%llx\n",
+		       BTROOT_A(&req->logs_root),
+		       BTROOT_A(&req->root),
+		       SK_ARG(&req->start),
+		       SK_ARG(&req->end),
+		       le64_to_cpu(req->last_seq),
+		       le64_to_cpu(req->rid),
+		       le64_to_cpu(req->seq),
+		       le64_to_cpu(req->flags));
+		break;
+	case SCOUTFS_LOG_MERGE_COMPLETE_ZONE:
+		comp = val;
+		printf("    complete: root "BTROOT_F" start "SK_FMT" end "SK_FMT
+		       " remain "SK_FMT" rid %016llx seq %llu flags %llx\n",
+		       BTROOT_A(&comp->root),
+		       SK_ARG(&comp->start),
+		       SK_ARG(&comp->end),
+		       SK_ARG(&comp->remain),
+		       le64_to_cpu(comp->rid),
+		       le64_to_cpu(comp->seq),
+		       le64_to_cpu(comp->flags));
+		break;
+	case SCOUTFS_LOG_MERGE_FREEING_ZONE:
+		fr = val;
+		printf("    freeing: root "BTROOT_F" key "SK_FMT" seq %llu\n",
+		       BTROOT_A(&fr->root),
+		       SK_ARG(&fr->key),
+		       le64_to_cpu(fr->seq));
+		break;
+	default:
+		printf("    (unknown log merge key zone %u)\n", key->sk_zone);
+		break;
+	}
+
 	return 0;
 }
@@ -362,17 +454,17 @@ static int print_mounted_client_entry(struct scoutfs_key *key, void *val,
 static int print_alloc_item(struct scoutfs_key *key, void *val,
 			    unsigned val_len, void *arg)
 {
-	if (key->sk_type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE)
+	if (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE)
 		printf("    free extent: blkno %llu len %llu end %llu\n",
 		       le64_to_cpu(key->skfb_end) -
 		       le64_to_cpu(key->skfb_len) + 1,
 		       le64_to_cpu(key->skfb_len),
 		       le64_to_cpu(key->skfb_end));
 	else
-		printf("    free extent: blkno %llu len %llu neglen %lld\n",
-		       le64_to_cpu(key->skfl_blkno),
-		       -le64_to_cpu(key->skfl_neglen),
-		       (long long)le64_to_cpu(key->skfl_neglen));
+		/* skfo_revord holds U64_MAX - order, so subtract again to show the order */
+		printf("    free extent: blkno %llu len %llu order %llu\n",
+		       le64_to_cpu(key->skfo_end) - le64_to_cpu(key->skfo_len) + 1,
+		       le64_to_cpu(key->skfo_len), (unsigned long long)(U64_MAX - le64_to_cpu(key->skfo_revord)));

 	return 0;
 }
@@ -792,16 +884,16 @@ static char *alloc_addr_str(union scoutfs_inet_addr *ia)

 static int print_quorum_blocks(int fd, struct scoutfs_super_block *super)
 {
-	struct print_events {
-		size_t offset;
-		char *name;
-	} events[] = {
-		OFF_NAME(write), OFF_NAME(update_term), OFF_NAME(set_leader),
-		OFF_NAME(clear_leader), OFF_NAME(fenced),
+	static const char *event_names[] = {	/* indexed by event; also bounds the events printed per block */
+		[SCOUTFS_QUORUM_EVENT_BEGIN] = "begin",
+		[SCOUTFS_QUORUM_EVENT_TERM] = "term",
+		[SCOUTFS_QUORUM_EVENT_ELECT] = "elect",
+		[SCOUTFS_QUORUM_EVENT_FENCE] = "fence",
+		[SCOUTFS_QUORUM_EVENT_STOP] = "stop",
+		[SCOUTFS_QUORUM_EVENT_END] = "end",
+	};
 	struct scoutfs_quorum_block *blk = NULL;
 	struct scoutfs_quorum_block_event *ev;
-	char *log_addr = NULL;
 	u64 blkno;
 	int ret;
 	int i;
@@ -810,6 +902,7 @@ static int print_quorum_blocks(int fd, struct scoutfs_super_block *super)
 	for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
 		blkno = SCOUTFS_QUORUM_BLKNO + i;
 		free(blk);
+		blk = NULL;
 		ret = read_block(fd, blkno, SCOUTFS_BLOCK_SM_SHIFT, (void **)&blk);
 		if (ret)
 			goto out;
@@ -817,28 +910,27 @@ static int print_quorum_blocks(int fd, struct scoutfs_super_block *super)
 		printf("quorum blkno %llu (slot %llu)\n",
 		       blkno, blkno - SCOUTFS_QUORUM_BLKNO);
 		print_block_header(&blk->hdr, SCOUTFS_BLOCK_SM_SIZE);
-		printf("  term %llu random_write_mark 0x%llx flags 0x%llx\n",
-		       le64_to_cpu(blk->term),
-		       le64_to_cpu(blk->random_write_mark),
-		       le64_to_cpu(blk->flags));

-		for (e = 0; e < array_size(events); e++) {
-			ev = (void *)blk + events[e].offset;
+		for (e = 0; e < array_size(event_names); e++) {
+			ev = &blk->events[e];

-			printf("  %12s: rid %016llx ts %llu.%08u\n",
-			       events[e].name, le64_to_cpu(ev->rid),
-			       le64_to_cpu(ev->ts.sec),
-			       le32_to_cpu(ev->ts.nsec));
+			printf("  %12s: rid %016llx term %llu ts %llu.%08u\n",
+			       event_names[e], le64_to_cpu(ev->rid), le64_to_cpu(ev->term),
+			       le64_to_cpu(ev->ts.sec), le32_to_cpu(ev->ts.nsec));
 		}
 	}

 	ret = 0;
 out:
-	free(log_addr);
+	free(blk);

 	return ret;
 }

+#define BTR_FMT "blkno %llu seq %016llx height %u"
+#define BTR_ARG(rt) \
+	le64_to_cpu((rt)->ref.blkno), le64_to_cpu((rt)->ref.seq), (rt)->height
+
 static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
 {
 	char uuid_str[37];
@@ -858,9 +950,8 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
 	printf("  flags: 0x%016llx\n", le64_to_cpu(super->flags));

 	/* XXX these are all in a crazy order */
-	printf("  next_ino %llu next_trans_seq %llu\n"
-	       "  total_meta_blocks %llu first_meta_blkno %llu last_meta_blkno %llu\n"
-	       "  total_data_blocks %llu first_data_blkno %llu last_data_blkno %llu\n"
+	printf("  next_ino %llu seq %llu\n"
+	       "  total_meta_blocks %llu total_data_blocks %llu\n"
 	       "  meta_alloc[0]: "ALCROOT_F"\n"
 	       "  meta_alloc[1]: "ALCROOT_F"\n"
 	       "  data_alloc: "ALCROOT_F"\n"
@@ -868,18 +959,16 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
 	       "  server_meta_avail[1]: "AL_HEAD_F"\n"
 	       "  server_meta_freed[0]: "AL_HEAD_F"\n"
 	       "  server_meta_freed[1]: "AL_HEAD_F"\n"
-	       "  mounted_clients root: height %u blkno %llu seq %llu\n"
-	       "  srch_root root: height %u blkno %llu seq %llu\n"
-	       "  trans_seqs root: height %u blkno %llu seq %llu\n"
-	       "  fs_root btree root: height %u blkno %llu seq %llu\n",
+	       "  fs_root: "BTR_FMT"\n"
+	       "  logs_root: "BTR_FMT"\n"
+	       "  log_merge: "BTR_FMT"\n"
+	       "  trans_seqs: "BTR_FMT"\n"
+	       "  mounted_clients: "BTR_FMT"\n"
+	       "  srch_root: "BTR_FMT"\n",
 		le64_to_cpu(super->next_ino),
-		le64_to_cpu(super->next_trans_seq),
+		le64_to_cpu(super->seq),
 		le64_to_cpu(super->total_meta_blocks),
-		le64_to_cpu(super->first_meta_blkno),
-		le64_to_cpu(super->last_meta_blkno),
 		le64_to_cpu(super->total_data_blocks),
-		le64_to_cpu(super->first_data_blkno),
-		le64_to_cpu(super->last_data_blkno),
 		ALCROOT_A(&super->meta_alloc[0]),
 		ALCROOT_A(&super->meta_alloc[1]),
 		ALCROOT_A(&super->data_alloc),
@@ -887,18 +976,20 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
 		AL_HEAD_A(&super->server_meta_avail[1]),
 		AL_HEAD_A(&super->server_meta_freed[0]),
 		AL_HEAD_A(&super->server_meta_freed[1]),
-		super->mounted_clients.height,
-		le64_to_cpu(super->mounted_clients.ref.blkno),
-		le64_to_cpu(super->mounted_clients.ref.seq),
-		super->srch_root.height,
-		le64_to_cpu(super->srch_root.ref.blkno),
-		le64_to_cpu(super->srch_root.ref.seq),
-		super->trans_seqs.height,
-		le64_to_cpu(super->trans_seqs.ref.blkno),
-		le64_to_cpu(super->trans_seqs.ref.seq),
-		super->fs_root.height,
-		le64_to_cpu(super->fs_root.ref.blkno),
-		le64_to_cpu(super->fs_root.ref.seq));
+		BTR_ARG(&super->fs_root),
+		BTR_ARG(&super->logs_root),
+		BTR_ARG(&super->log_merge),
+		BTR_ARG(&super->trans_seqs),
+		BTR_ARG(&super->mounted_clients),
+		BTR_ARG(&super->srch_root));
+
+	printf("  volume options:\n"
+	       "    set_bits: %016llx\n",
+		le64_to_cpu(super->volopt.set_bits));
+	if (le64_to_cpu(super->volopt.set_bits) & SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_BIT) {
+		printf("    data_alloc_zone_blocks: %llu\n",
+			le64_to_cpu(super->volopt.data_alloc_zone_blocks));
+	}

 	printf("  quorum config version %llu\n",
 		le64_to_cpu(super->qconf.version));
@@ -945,6 +1036,11 @@ static int print_volume(int fd)
 	if (err && !ret)
 		ret = err;

+	err = print_btree(fd, super, "log_merge", &super->log_merge,
+			  print_log_merge_item, NULL);
+	if (err && !ret)
+		ret = err;
+
 	for (i = 0; i < array_size(super->server_meta_avail); i++) {
 		snprintf(str, sizeof(str), "server_meta_avail[%u]", i);
 		err = print_alloc_list_block(fd, str,
--- a/utils/src/resize_devices.c
+++ b/utils/src/resize_devices.c
@@ -0,0 +1,120 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <argp.h>
+
+#include "sparse.h"
+#include "parse.h"
+#include "util.h"
+#include "format.h"
+#include "ioctl.h"
+#include "cmd.h"
+
+struct resize_args {
+	char *path;	/* -p argument */
+	u64 meta_size;	/* -m size in bytes as parsed; shifted down to blocks for the ioctl */
+	u64 data_size;	/* -d size in bytes as parsed; shifted down to blocks for the ioctl */
+};
+
+/* Truncate the byte sizes in args to whole blocks, printing a notice when bytes are dropped, and pass the
+ * block counts to the resize ioctl on args->path.  Returns get_path()'s negative result or -errno on failure. */
+static int do_resize_devices(struct resize_args *args)
+{
+	struct scoutfs_ioctl_resize_devices rd = {0};	/* start zeroed rather than with stack contents */
+	int ret;
+	int fd;
+
+	if (args->meta_size & SCOUTFS_BLOCK_LG_MASK) {
+		printf("metadata device size %llu is not a multiple of %u metadata block size, truncating down to %llu byte size\n",
+		       args->meta_size, SCOUTFS_BLOCK_LG_SIZE, args->meta_size & ~(u64)SCOUTFS_BLOCK_LG_MASK);
+	}
+
+	if (args->data_size & SCOUTFS_BLOCK_SM_MASK) {
+		printf("data device size %llu is not a multiple of %u data block size, truncating down to %llu byte size\n",
+		       args->data_size, SCOUTFS_BLOCK_SM_SIZE, args->data_size & ~(u64)SCOUTFS_BLOCK_SM_MASK);
+	}
+
+	fd = get_path(args->path, O_RDONLY);
+	if (fd < 0)
+		return fd;
+
+	rd.new_total_meta_blocks = args->meta_size >> SCOUTFS_BLOCK_LG_SHIFT;
+	rd.new_total_data_blocks = args->data_size >> SCOUTFS_BLOCK_SM_SHIFT;
+
+	ret = ioctl(fd, SCOUTFS_IOC_RESIZE_DEVICES, &rd);
+	if (ret < 0) {
+		ret = -errno;
+		fprintf(stderr, "resize_devices ioctl failed: %s (%d)\n", strerror(errno), errno);
+	}
+
+	close(fd);
+	return ret;
+}
+
+/*
+ * argp option callback.  The -m and -d sizes are parsed by parse_human()
+ * into the resize_args that state->input points at, and a non-zero return
+ * from it is passed back to argp.  -p stores strdup_or_error()'s result
+ * as the path.  Every other key returns 0 without doing anything.
+ */
+static int parse_opt(int key, char *arg, struct argp_state *state)
+{
+	struct resize_args *args = state->input;
+	int ret = 0;
+
+	switch (key) {
+	case 'm': /* meta-size */
+		ret = parse_human(arg, &args->meta_size);
+		break;
+	case 'd': /* data-size */
+		ret = parse_human(arg, &args->data_size);
+		break;
+	case 'p':
+		args->path = strdup_or_error(state, arg);
+		break;
+	default:
+		/* nothing to do for argp's other keys */
+		break;
+	}
+
+	/* non-zero only when parse_human() returned an error */
+	return ret;
+}
+
+static struct argp_option options[] = {	/* the 'p', 'm' and 'd' keys are handled in parse_opt() */
+	{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
+	{ "meta-size", 'm', "SIZE", 0, "New metadata device size (bytes or KMGTP units)"},
+	{ "data-size", 'd', "SIZE", 0, "New data device size (bytes or KMGTP units)"},
+	{ NULL }
+};
+
+static struct argp argp = {	/* handed to argp_parse() and cmd_register_argp() below */
+	.options = options,
+	.parser = parse_opt,
+	.args_doc = "",
+	.doc = "Online resize of metadata and/or data devices",
+};
+
+/* Command handler registered as "resize-devices": parse the options, then issue the resize. */
+static int resize_devices_cmd(int argc, char **argv)
+{
+	struct resize_args resize_args = { .path = NULL };
+	int ret;
+
+	ret = argp_parse(&argp, argc, argv, 0, NULL, &resize_args);
+	if (ret == 0)
+		ret = do_resize_devices(&resize_args);
+
+	return ret;
+}
+
+static void __attribute__((constructor)) resize_devices_ctor(void) /* constructor: registers the command */
+{
+	cmd_register_argp("resize-devices", &argp, GROUP_CORE, resize_devices_cmd);
+}