Disable mount-unmount-race test

The mount-unmount-race test is occasionally hanging, disable it while we debug it and have test coverage for unrelated work. Signed-off-by: Zach Brown <zab@versity.com>
Merge pull request #14 from agrover/fix-jira-202
2026-05-03 19:35:43 +00:00 · 2021-02-01 10:07:47 -08:00 · 2021-02-01 09:46:01 -08:00 · 2021-02-01 09:24:59 -08:00 · 2021-01-29 09:30:57 -08:00 · 2021-01-26 16:07:05 -08:00
196 changed files with 25783 additions and 6760 deletions
--- a/.gitignore
+++ b/.gitignore
--- a/17
+++ b/17
@@ -0,0 +1,17 @@
+#
+# Typically development is done in each subdir, but we have a tiny
+# makefile here to make it easy to run simple targets across all the
+# subdirs.
+#
+
+SUBDIRS := kmod utils tests
+NOTTESTS := kmod utils
+
+all clean: $(SUBDIRS) FORCE
+dist: $(NOTTESTS) FORCE
+
+$(SUBDIRS): FORCE
+	$(MAKE) -C $@ $(MAKECMDGOALS)
+
+all:
+FORCE:
--- a/kmod/README.md
+++ b/kmod/README.md
@@ -6,7 +6,7 @@ from the ground up to support large archival systems.
 Its key differentiating features are:

 - Integrated consistent indexing accelerates archival maintenance operations
- - Log-structured commits allow nodes to write concurrently without contention
+ - Commit logs allow nodes to write concurrently without contention

 It meets best of breed expectations:

@@ -31,15 +31,9 @@ functionality hasn't been implemented.  It's appropriate for early
 adopters and interested developers, not for production use.

 In that vein, expect significant incompatible changes to both the format
-of network messages and persistent structures.  To avoid mistakes the
-implementation currently calculates a hash of the format and ioctl
-header files in the source tree.  The kernel module will refuse to mount
-a volume created by userspace utilities with a mismatched hash, and it
-will refuse to connect to a remote node with a mismatched hash.  This
-means having to unmount, mkfs, and remount everything across many
-functional changes.  Once the format is nailed down we'll wire up
-forward and back compat machinery and remove this temporary safety
-measure. 
+of network messages and persistent structures. Since the format hash-checking
+has now been removed in preparation for release, if there is any doubt, mkfs
+is strongly recommended.

 The current kernel module is developed against the RHEL/CentOS 7.x
 kernel to minimize the friction of developing and testing with partners'
@@ -62,17 +56,17 @@ help on the mailing list.**
 The requirements for running scoutfs on a small cluster are:

 1. One or more nodes running x86-64 CentOS/RHEL 7.4 (or 7.3)
- 2. Access to a single shared block device
+ 2. Access to two shared block devices
 3. IPv4 connectivity between the nodes

 The steps for getting scoutfs mounted and operational are:

 1. Get the kernel module running on the nodes
- 2. Make a new filesystem on the device with the userspace utilities
- 3. Mount the device on all the nodes
+ 2. Make a new filesystem on the devices with the userspace utilities
+ 3. Mount the devices on all the nodes

-In this example we run all of these commands on three nodes.  The block
-device name is the same on all the nodes.
+In this example we run all of these commands on three nodes.  The names
+of the block devices are the same on all the nodes.

 1. Get the Kernel Module and Userspace Binaries

@@ -87,14 +81,11 @@ device name is the same on all the nodes.

   ```shell
   yum install kernel-devel
-   git clone git@github.com:versity/scoutfs-kmod-dev.git
-   make -C scoutfs-kmod-dev module 
+   git clone git@github.com:versity/scoutfs.git
+   make -C scoutfs
   modprobe libcrc32c
-   insmod scoutfs-kmod-dev/src/scoutfs.ko
-
-   git clone git@github.com:versity/scoutfs-utils-dev.git
-   make -C scoutfs-utils-dev
-   alias scoutfs=$PWD/scoutfs-utils-dev/src/scoutfs
+   insmod scoutfs/kmod/src/scoutfs.ko
+   alias scoutfs=$PWD/scoutfs/utils/src/scoutfs
   ```

 2. Make a New Filesystem (**destroys contents, no questions asked**)
@@ -103,7 +94,7 @@ device name is the same on all the nodes.
   quorum for the system to function.

   ```shell
-   scoutfs mkfs -Q 2 /dev/shared_block_device
+   scoutfs mkfs -Q 2 /dev/meta_dev /dev/data_dev
   ```

 3. Mount the Filesystem
@@ -114,7 +105,7 @@ device name is the same on all the nodes.

   ```shell
   mkdir /mnt/scoutfs
-   mount -t scoutfs -o server_addr=$NODE_ADDR /dev/shared_block_device /mnt/scoutfs
+   mount -t scoutfs -o server_addr=$NODE_ADDR,metadev_path=/dev/meta_dev /dev/data_dev /mnt/scoutfs
   ```

 4. For Kicks, Observe the Metadata Change Index
--- a/kmod/Makefile
+++ b/kmod/Makefile
@@ -16,11 +16,7 @@ SCOUTFS_GIT_DESCRIBE := \
 	$(shell git describe --all --abbrev=6 --long 2>/dev/null || \
 		echo no-git)

-SCOUTFS_FORMAT_HASH := \
-	$(shell cat src/format.h src/ioctl.h | md5sum | cut -b1-16)
-
 SCOUTFS_ARGS := SCOUTFS_GIT_DESCRIBE=$(SCOUTFS_GIT_DESCRIBE) \
-		SCOUTFS_FORMAT_HASH=$(SCOUTFS_FORMAT_HASH) \
 		CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(CURDIR)/src \
 		EXTRA_CFLAGS="-Werror"

@@ -51,7 +47,7 @@ modules_install:

 dist: scoutfs-kmod.spec
 	git archive --format=tar --prefix scoutfs-kmod-$(RPM_VERSION)/ HEAD^{tree} > $(TARFILE)
-	@ tar rf $(TARFILE) --transform="s@\(.*\)@scoutfs-$(RPM_VERSION)/\1@" scoutfs-kmod.spec
+	@ tar rf $(TARFILE) --transform="s@\(.*\)@scoutfs-kmod-$(RPM_VERSION)/\1@" scoutfs-kmod.spec

 clean:
 	make $(SCOUTFS_ARGS) clean
--- a/kmod/src/Makefile
+++ b/kmod/src/Makefile
@@ -1,7 +1,6 @@
 obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o

-CFLAGS_super.o = -DSCOUTFS_GIT_DESCRIBE=\"$(SCOUTFS_GIT_DESCRIBE)\" \
-		 -DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU
+CFLAGS_super.o = -DSCOUTFS_GIT_DESCRIBE=\"$(SCOUTFS_GIT_DESCRIBE)\"

 CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include

@@ -9,6 +8,8 @@ CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include
 -include $(src)/Makefile.kernelcompat
 
 scoutfs-y +=			\
+	avl.o			\
+	alloc.o			\
 	block.o			\
 	btree.o			\
 	client.o		\
@@ -16,10 +17,12 @@ scoutfs-y +=			\
 	data.o			\
 	dir.o			\
 	export.o		\
+	ext.o			\
 	file.o			\
 	forest.o		\
 	inode.o			\
 	ioctl.o			\
+	item.o			\
 	lock.o			\
 	lock_server.o		\
 	msg.o			\
@@ -27,10 +30,11 @@ scoutfs-y +=			\
 	options.o		\
 	per_task.o		\
 	quorum.o		\
-	radix.o			\
 	scoutfs_trace.o		\
 	server.o		\
+	sort_priv.o		\
 	spbm.o			\
+	srch.o			\
 	super.o			\
 	sysfs.o			\
 	trans.o			\
@@ -50,5 +54,9 @@ $(src)/check_exported_types:
 		echo "no raw types in exported headers, preface with __";     \
 		exit 1;							      \
 	fi
+	@if egrep '\<__packed\>' $(src)/format.h $(src)/ioctl.h; then \
+		echo "no __packed allowed in exported headers";     \
+		exit 1;							      \
+	fi

 extra-y += check_exported_types
--- a/kmod/src/alloc.c
+++ b/kmod/src/alloc.c
--- a/kmod/src/alloc.h
+++ b/kmod/src/alloc.h
@@ -0,0 +1,155 @@
+#ifndef _SCOUTFS_ALLOC_H_
+#define _SCOUTFS_ALLOC_H_
+
+#include "ext.h"
+
+/*
+ * These are implementation-specific metrics, they don't need to be
+ * consistent across implementations.  They should probably be run-time
+ * knobs.
+ */
+
+/*
+ * The largest extent that we'll try to allocate with fallocate.  We're
+ * trying not to completely consume a transactions data allocation all
+ * at once.  This is only allocation granularity, repeated allocations
+ * can produce large contiguous extents.
+ */
+#define SCOUTFS_FALLOCATE_ALLOC_LIMIT \
+	(128ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
+
+/*
+ * The largest aligned region that we'll try to allocate at the end of
+ * the file as it's extended.  This is also limited to the current file
+ * size so we can only waste at most twice the total file size when
+ * files are less than this.  We try to keep this around the point of
+ * diminishing returns in streaming performance of common data devices
+ * to limit waste.
+ */
+#define SCOUTFS_DATA_EXTEND_PREALLOC_LIMIT \
+	(8ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
+
+/*
+ * Small data allocations are satisfied by cached extents stored in
+ * the run-time alloc struct to minimize item operations for small
+ * block allocations.  Large allocations come directly from btree
+ * extent items, and this defines the threshold beetwen them.
+ */
+#define SCOUTFS_ALLOC_DATA_LG_THRESH \
+	(8ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
+
+/*
+ * Fill client alloc roots to the target when they fall below the lo
+ * threshold.
+ *
+ * We're giving the client the most available meta blocks we can so that
+ * it has the freedom to build large transactions before worrying that
+ * it might run out of meta allocs during commits.
+ */
+#define SCOUTFS_SERVER_META_FILL_TARGET \
+	SCOUTFS_ALLOC_LIST_MAX_BLOCKS
+#define SCOUTFS_SERVER_META_FILL_LO \
+	(SCOUTFS_ALLOC_LIST_MAX_BLOCKS / 2)
+#define SCOUTFS_SERVER_DATA_FILL_TARGET \
+	(4ULL * 1024 * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
+#define SCOUTFS_SERVER_DATA_FILL_LO \
+	(1ULL * 1024 * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
+
+/*
+ * Each of the server meta_alloc roots will try to keep a minimum amount
+ * of free blocks.  The server will swap roots when its current avail
+ * falls below the threshold while the freed root is still above it.  It
+ * must have room for all the largest allocation attempted in a
+ * transaction on the server.
+ */
+#define SCOUTFS_SERVER_META_ALLOC_MIN \
+	(SCOUTFS_SERVER_META_FILL_TARGET * 2)
+
+/*
+ * A run-time use of a pair of persistent avail/freed roots as a
+ * metadata allocator.  It has the machinery needed to lock and avoid
+ * recursion when dirtying the list blocks that are used during the
+ * transaction.
+ */
+struct scoutfs_alloc {
+	spinlock_t lock;
+	struct mutex mutex;
+	struct scoutfs_block *dirty_avail_bl;
+	struct scoutfs_block *dirty_freed_bl;
+	struct scoutfs_alloc_list_head avail;
+	struct scoutfs_alloc_list_head freed;
+};
+
+/*
+ * A run-time data allocator.  We have a cached extent in memory that is
+ * a lot cheaper to work with than the extent items, and we have a
+ * consistent record of the total_len that can be sampled outside of the
+ * usual heavy serialization of the extent modifications.
+ */
+struct scoutfs_data_alloc {
+	struct scoutfs_alloc_root root;
+	struct scoutfs_extent cached;
+	atomic64_t total_len;
+};
+
+void scoutfs_alloc_init(struct scoutfs_alloc *alloc,
+			struct scoutfs_alloc_list_head *avail,
+			struct scoutfs_alloc_list_head *freed);
+int scoutfs_alloc_prepare_commit(struct super_block *sb,
+				 struct scoutfs_alloc *alloc,
+				 struct scoutfs_block_writer *wri);
+
+int scoutfs_alloc_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
+		       struct scoutfs_block_writer *wri, u64 *blkno);
+int scoutfs_free_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
+		      struct scoutfs_block_writer *wri, u64 blkno);
+
+void scoutfs_dalloc_init(struct scoutfs_data_alloc *dalloc,
+			 struct scoutfs_alloc_root *data_avail);
+void scoutfs_dalloc_get_root(struct scoutfs_data_alloc *dalloc,
+			     struct scoutfs_alloc_root *data_avail);
+u64 scoutfs_dalloc_total_len(struct scoutfs_data_alloc *dalloc);
+int scoutfs_dalloc_return_cached(struct super_block *sb,
+				 struct scoutfs_alloc *alloc,
+				 struct scoutfs_block_writer *wri,
+				 struct scoutfs_data_alloc *dalloc);
+int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
+		       struct scoutfs_block_writer *wri,
+		       struct scoutfs_data_alloc *dalloc, u64 count,
+		       u64 *blkno_ret, u64 *count_ret);
+int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
+		      struct scoutfs_block_writer *wri,
+		      struct scoutfs_alloc_root *root, u64 blkno, u64 count);
+
+int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
+		       struct scoutfs_block_writer *wri,
+		       struct scoutfs_alloc_root *dst,
+		       struct scoutfs_alloc_root *src, u64 total);
+
+int scoutfs_alloc_fill_list(struct super_block *sb,
+			    struct scoutfs_alloc *alloc,
+			    struct scoutfs_block_writer *wri,
+			    struct scoutfs_alloc_list_head *lhead,
+			    struct scoutfs_alloc_root *root,
+			    u64 lo, u64 target);
+int scoutfs_alloc_empty_list(struct super_block *sb,
+			     struct scoutfs_alloc *alloc,
+			     struct scoutfs_block_writer *wri,
+			     struct scoutfs_alloc_root *root,
+			     struct scoutfs_alloc_list_head *lhead);
+int scoutfs_alloc_splice_list(struct super_block *sb,
+			      struct scoutfs_alloc *alloc,
+			      struct scoutfs_block_writer *wri,
+			      struct scoutfs_alloc_list_head *dst,
+			      struct scoutfs_alloc_list_head *src);
+
+bool scoutfs_alloc_meta_low(struct super_block *sb,
+			    struct scoutfs_alloc *alloc, u32 nr);
+
+typedef int (*scoutfs_alloc_foreach_cb_t)(struct super_block *sb, void *arg,
+					  int owner, u64 id,
+					  bool meta, bool avail, u64 blocks);
+int scoutfs_alloc_foreach(struct super_block *sb,
+			  scoutfs_alloc_foreach_cb_t cb, void *arg);
+
+#endif
--- a/kmod/src/avl.c
+++ b/kmod/src/avl.c
@@ -0,0 +1,405 @@
+/*
+ * Copyright (C) 2020 Versity Software, Inc.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#include "format.h"
+#include "avl.h"
+
+/*
+ * We use a simple avl to index items in btree blocks.  The interface
+ * looks a bit like the kernel rbtree interface in that the caller
+ * manages locking and storage for the nodes.  Node references are
+ * stored as byte offsets from the root so that the implementation
+ * doesn't have to know anything about the caller's container. 
+ *
+ * We store the full height in each node, rather than just 2 bits for
+ * the balance, so that we can use the extra redundancy to verify the
+ * integrity of the tree.
+ */
+
+static struct scoutfs_avl_node *node_ptr(struct scoutfs_avl_root *root,
+					 __le16 off)
+{
+	return off ? (void *)root + le16_to_cpu(off) : NULL;
+}
+
+static __le16 node_off(struct scoutfs_avl_root *root,
+		       struct scoutfs_avl_node *node)
+{
+	return node ? cpu_to_le16((void *)node - (void *)root) : 0;
+}
+
+static __u8 node_height(struct scoutfs_avl_node *node)
+{
+	return node ? node->height : 0;
+}
+
+struct scoutfs_avl_node *
+scoutfs_avl_search(struct scoutfs_avl_root *root,
+		   scoutfs_avl_compare_t compare, void *arg, int *cmp_ret,
+		   struct scoutfs_avl_node **par,
+		   struct scoutfs_avl_node **next,
+		   struct scoutfs_avl_node **prev)
+{
+	struct scoutfs_avl_node *node = node_ptr(root, root->node);
+	int cmp;
+
+	if (cmp_ret)
+		*cmp_ret = -1;
+	if (par)
+		*par = NULL;
+	if (next)
+		*next = NULL;
+	if (prev)
+		*prev = NULL;
+
+	while (node) {
+		cmp = compare(arg, node);
+		if (par)
+			*par = node;
+		if (cmp_ret)
+			*cmp_ret = cmp;
+		if (cmp < 0) {
+			if (next)
+				*next = node;
+			node = node_ptr(root, node->left);
+		} else if (cmp > 0) {
+			if (prev)
+				*prev = node;
+			node = node_ptr(root, node->right);
+		} else {
+			return node;
+		}
+	}
+
+	return NULL;
+}
+
+struct scoutfs_avl_node *scoutfs_avl_first(struct scoutfs_avl_root *root)
+{
+	struct scoutfs_avl_node *node = node_ptr(root, root->node);
+
+	while (node && node->left)
+		node = node_ptr(root, node->left);
+
+	return node;
+}
+
+struct scoutfs_avl_node *scoutfs_avl_last(struct scoutfs_avl_root *root)
+{
+	struct scoutfs_avl_node *node = node_ptr(root, root->node);
+
+	while (node && node->right)
+		node = node_ptr(root, node->right);
+
+	return node;
+}
+
+struct scoutfs_avl_node *scoutfs_avl_next(struct scoutfs_avl_root *root,
+					  struct scoutfs_avl_node *node)
+{
+	struct scoutfs_avl_node *parent;
+
+	if (node->right) {
+		node = node_ptr(root, node->right);
+		while (node->left)
+			node = node_ptr(root, node->left);
+		return node;
+	}
+
+	while ((parent = node_ptr(root, node->parent)) &&
+	       node == node_ptr(root, parent->right))
+		node = parent;
+
+	return parent;
+}
+
+struct scoutfs_avl_node *scoutfs_avl_prev(struct scoutfs_avl_root *root,
+					  struct scoutfs_avl_node *node)
+{
+	struct scoutfs_avl_node *parent;
+
+	if (node->left) {
+		node = node_ptr(root, node->left);
+		while (node->right)
+			node = node_ptr(root, node->right);
+		return node;
+	}
+
+	while ((parent = node_ptr(root, node->parent)) &&
+	       node == node_ptr(root, parent->left))
+		node = parent;
+
+	return parent;
+}
+
+static void set_parent_left_right(struct scoutfs_avl_root *root,
+				  struct scoutfs_avl_node *parent,
+				  struct scoutfs_avl_node *old,
+				  struct scoutfs_avl_node *new)
+{
+	__le16 *off;
+
+	if (parent == NULL)
+		off = &root->node;
+	else if (parent->left == node_off(root, old))
+		off = &parent->left;
+	else
+		off = &parent->right;
+
+	*off = node_off(root, new);
+}
+
+static void set_height(struct scoutfs_avl_root *root,
+		       struct scoutfs_avl_node *node)
+{
+	struct scoutfs_avl_node *left = node_ptr(root, node->left);
+	struct scoutfs_avl_node *right = node_ptr(root, node->right);
+
+	node->height = 1 + max(node_height(left), node_height(right));
+}
+
+static int node_balance(struct scoutfs_avl_root *root,
+		        struct scoutfs_avl_node *node)
+{
+	if (node == NULL)
+		return 0;
+
+	return (int)node_height(node_ptr(root, node->right)) -
+	       (int)node_height(node_ptr(root, node->left));
+}
+
+/*
+ *     d                         b
+ *    / \    rotate right ->    / \
+ *   b   e                     a   d
+ *  / \      <- rotate left       / \
+ * a   c                         c   e
+ *
+ * The rotate functions are always called with the higher node as the
+ * earlier argument.  Links to a and e are constant.  We have to update
+ * the forward and back refs between parents and nodes for the three links
+ * along root->[db]->[bd]->c.
+ */
+static void rotate_right(struct scoutfs_avl_root *root,
+			 struct scoutfs_avl_node *d)
+{
+	struct scoutfs_avl_node *gpa = node_ptr(root, d->parent);
+	struct scoutfs_avl_node *b = node_ptr(root, d->left);
+	struct scoutfs_avl_node *c = node_ptr(root, b->right);
+
+	set_parent_left_right(root, gpa, d, b);
+	b->parent = node_off(root, gpa);
+
+	b->right = node_off(root, d);
+	d->parent = node_off(root, b);
+
+	d->left = node_off(root, c);
+	if (c)
+		c->parent = node_off(root, d);
+
+	set_height(root, d);
+	set_height(root, b);
+}
+
+static void rotate_left(struct scoutfs_avl_root *root,
+			struct scoutfs_avl_node *b)
+{
+	struct scoutfs_avl_node *gpa = node_ptr(root, b->parent);
+	struct scoutfs_avl_node *d = node_ptr(root, b->right);
+	struct scoutfs_avl_node *c = node_ptr(root, d->left);
+
+	set_parent_left_right(root, gpa, b, d);
+	d->parent = node_off(root, gpa);
+
+	d->left = node_off(root, b);
+	b->parent = node_off(root, d);
+
+	b->right = node_off(root, c);
+	if (c)
+		c->parent = node_off(root, b);
+
+	set_height(root, b);
+	set_height(root, d);
+}
+
+/*
+ * Check the balance factor for the given node and perform rotations if
+ * its two child subtrees are too far out of balance.  Return either the
+ * node again or the root of the newly balanced subtree.
+ */
+static struct scoutfs_avl_node *
+rotate_imbalance(struct scoutfs_avl_root *root, struct scoutfs_avl_node *node)
+{
+	int bal = node_balance(root, node);
+	struct scoutfs_avl_node *child;
+
+	if (bal >= -1 && bal <= 1)
+		return node;
+
+	if (bal > 0) {
+		/* turn right-left case into right-right */
+		child = node_ptr(root, node->right);
+		if (node_balance(root, child) < 0)
+			rotate_right(root, child);
+		/* rotate left to address right-right */
+		rotate_left(root, node);
+
+	} else {
+		/* or do the mirror for the left- cases */
+		child = node_ptr(root, node->left);
+		if (node_balance(root, child) > 0)
+			rotate_left(root, child);
+		rotate_right(root, node);
+	}
+
+	return node_ptr(root, node->parent);
+}
+
+void scoutfs_avl_insert(struct scoutfs_avl_root *root,
+			struct scoutfs_avl_node *parent,
+			struct scoutfs_avl_node *node, int cmp)
+{
+	node->parent = 0;
+	node->left = 0;
+	node->right = 0;
+	set_height(root, node);
+	memset(node->__pad, 0, sizeof(node->__pad));
+
+	if (parent == NULL) {
+		root->node = node_off(root, node);
+		node->parent = 0;
+		return;
+	}
+
+	if (cmp < 0)
+		parent->left = node_off(root, node);
+	else
+		parent->right = node_off(root, node);
+	node->parent = node_off(root, parent);
+
+	while (parent) {
+		set_height(root, parent);
+		parent = rotate_imbalance(root, parent);
+		parent = node_ptr(root, parent->parent);
+	}
+}
+
+static struct scoutfs_avl_node *avl_successor(struct scoutfs_avl_root *root,
+					      struct scoutfs_avl_node *node)
+{
+	node = node_ptr(root, node->right);
+	while (node->left)
+		node = node_ptr(root, node->left);
+
+	return node;
+}
+
+/*
+ * Find a node next successor and then swap the positions of the two
+ * nodes with each other in the tree.  This is only tricky because the
+ * successor can be a direct child of the node and if we weren't careful
+ * we'd be modifying each of the nodes through the pointers between
+ * them.
+ */
+static void swap_with_successor(struct scoutfs_avl_root *root,
+				struct scoutfs_avl_node *node)
+{
+	struct scoutfs_avl_node *succ = avl_successor(root, node);
+	struct scoutfs_avl_node *succ_par = node_ptr(root, succ->parent);
+	struct scoutfs_avl_node *succ_right = node_ptr(root, succ->right);
+	struct scoutfs_avl_node *parent;
+	struct scoutfs_avl_node *left;
+	struct scoutfs_avl_node *right;
+
+	/* Link old node's parent and left child with the successor */
+	succ->parent = node->parent;
+	parent = node_ptr(root, succ->parent);
+	set_parent_left_right(root, parent, node, succ);
+	succ->left = node->left;
+	left = node_ptr(root, succ->left);
+	if (left)
+		left->parent = node_off(root, succ);
+
+	/*
+	 * Link the old node's right with successor and the old
+	 * successor's parent with the node, they could have pointed to
+	 * each other.
+	 */
+	if (succ_par == node) {
+		succ->right = node_off(root, node);
+		node->parent = node_off(root, succ);
+	} else {
+		succ->right = node->right;
+		right = node_ptr(root, succ->right);
+		if (right)
+			right->parent = node_off(root, succ);
+		set_parent_left_right(root, succ_par, succ, node);
+		node->parent = node_off(root, succ_par);
+	}
+
+	/* Link the old successor's right with the node, it can't have left */
+	node->right = node_off(root, succ_right);
+	if (succ_right)
+		succ_right->parent = node_off(root, node);
+	node->left = 0;
+
+	swap(node->height, succ->height);
+}
+
+void scoutfs_avl_delete(struct scoutfs_avl_root *root,
+			struct scoutfs_avl_node *node)
+{
+	struct scoutfs_avl_node *parent;
+	struct scoutfs_avl_node *child;
+
+	if (node->left && node->right)
+		swap_with_successor(root, node);
+
+	parent = node_ptr(root, node->parent);
+	child = node_ptr(root, node->left ?: node->right);
+
+	set_parent_left_right(root, parent, node, child);
+	if (child)
+		child->parent = node->parent;
+
+	while (parent) {
+		set_height(root, parent);
+		parent = rotate_imbalance(root, parent);
+		parent = node_ptr(root, parent->parent);
+	}
+}
+
+/*
+ * Move the contents of a node to a new node location in memory.  The
+ * logical position of the node in the tree does not change.
+ */
+void scoutfs_avl_relocate(struct scoutfs_avl_root *root,
+			  struct scoutfs_avl_node *to,
+			  struct scoutfs_avl_node *from)
+{
+	struct scoutfs_avl_node *parent = node_ptr(root, from->parent);
+	struct scoutfs_avl_node *left = node_ptr(root, from->left);
+	struct scoutfs_avl_node *right = node_ptr(root, from->right);
+
+	set_parent_left_right(root, parent, from, to);
+	to->parent = from->parent;
+	to->left = from->left;
+	if (left)
+		left->parent = node_off(root, to);
+	to->right = from->right;
+	if (right)
+		right->parent = node_off(root, to);
+	to->height = from->height;
+}
--- a/kmod/src/avl.h
+++ b/kmod/src/avl.h
@@ -0,0 +1,30 @@
+#ifndef _SCOUTFS_AVL_H_
+#define _SCOUTFS_AVL_H_
+
+#include "format.h"
+
+typedef int (*scoutfs_avl_compare_t)(void *arg,
+				       struct scoutfs_avl_node *node);
+
+struct scoutfs_avl_node *
+scoutfs_avl_search(struct scoutfs_avl_root *root,
+		   scoutfs_avl_compare_t compare, void *arg, int *cmp_ret,
+		   struct scoutfs_avl_node **par,
+		   struct scoutfs_avl_node **next,
+		   struct scoutfs_avl_node **prev);
+struct scoutfs_avl_node *scoutfs_avl_first(struct scoutfs_avl_root *root);
+struct scoutfs_avl_node *scoutfs_avl_last(struct scoutfs_avl_root *root);
+struct scoutfs_avl_node *scoutfs_avl_next(struct scoutfs_avl_root *root,
+					  struct scoutfs_avl_node *node);
+struct scoutfs_avl_node *scoutfs_avl_prev(struct scoutfs_avl_root *root,
+					  struct scoutfs_avl_node *node);
+void scoutfs_avl_insert(struct scoutfs_avl_root *root,
+			  struct scoutfs_avl_node *parent,
+			  struct scoutfs_avl_node *node, int cmp);
+void scoutfs_avl_delete(struct scoutfs_avl_root *root,
+			  struct scoutfs_avl_node *node);
+void scoutfs_avl_relocate(struct scoutfs_avl_root *root,
+			    struct scoutfs_avl_node *to,
+			    struct scoutfs_avl_node *from);
+
+#endif
--- a/kmod/src/block.c
+++ b/kmod/src/block.c
@@ -19,7 +19,6 @@
 #include <linux/sched.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
-#include <linux/rbtree.h>

 #include "format.h"
 #include "super.h"
@@ -46,7 +45,7 @@
 struct block_info {
 	struct super_block *sb;
 	spinlock_t lock;
-	struct rb_root root;
+	struct radix_tree_root radix;
 	struct list_head lru_list;
 	u64 lru_nr;
 	u64 lru_move_counter;
@@ -59,22 +58,20 @@ struct block_info {
 #define DECLARE_BLOCK_INFO(sb, name) \
 	struct block_info *name = SCOUTFS_SB(sb)->block_info

-enum {
+enum block_status_bits {
 	BLOCK_BIT_UPTODATE = 0,	/* contents consistent with media */
 	BLOCK_BIT_NEW,		/* newly allocated, contents undefined */
 	BLOCK_BIT_DIRTY,	/* dirty, writer will write */
 	BLOCK_BIT_IO_BUSY,	/* bios are in flight */
 	BLOCK_BIT_ERROR,	/* saw IO error */
-	BLOCK_BIT_DELETED,	/* has been deleted from rbtree */
+	BLOCK_BIT_DELETED,	/* has been deleted from radix tree */
 	BLOCK_BIT_PAGE_ALLOC,	/* page (possibly high order) allocation */
 	BLOCK_BIT_VIRT,		/* mapped virt allocation */
 	BLOCK_BIT_CRC_VALID,	/* crc has been verified */
-	BLOCK_BIT_VISITED,	/* used by callers to track blocks */
 };

 struct block_private {
 	struct scoutfs_block bl;
-	struct rb_node node;
 	struct super_block *sb;
 	atomic_t refcount;
 	union {
@@ -108,18 +105,18 @@ do {									\
 * be refactored away.
 */

-__le32 scoutfs_block_calc_crc(struct scoutfs_block_header *hdr)
+__le32 scoutfs_block_calc_crc(struct scoutfs_block_header *hdr, u32 size)
 {
 	int off = offsetof(struct scoutfs_block_header, crc) +
 		  FIELD_SIZEOF(struct scoutfs_block_header, crc);
-	u32 calc = crc32c(~0, (char *)hdr + off, SCOUTFS_BLOCK_SIZE - off);
+	u32 calc = crc32c(~0, (char *)hdr + off, size - off);

 	return cpu_to_le32(calc);
 }

-bool scoutfs_block_valid_crc(struct scoutfs_block_header *hdr)
+bool scoutfs_block_valid_crc(struct scoutfs_block_header *hdr, u32 size)
 {
-	return hdr->crc == scoutfs_block_calc_crc(hdr);
+	return hdr->crc == scoutfs_block_calc_crc(hdr, size);
 }

 bool scoutfs_block_valid_ref(struct super_block *sb,
@@ -132,22 +129,6 @@ bool scoutfs_block_valid_ref(struct super_block *sb,
 	       hdr->blkno == blkno;
 }

-bool scoutfs_block_tas_visited(struct super_block *sb,
-			       struct scoutfs_block *bl)
-{
-	struct block_private *bp = BLOCK_PRIVATE(bl);
-
-	return test_bit(BLOCK_BIT_VISITED, &bp->bits) != 0;
-}
-
-void scoutfs_block_clear_visited(struct super_block *sb,
-				 struct scoutfs_block *bl)
-{
-	struct block_private *bp = BLOCK_PRIVATE(bl);
-
-	clear_bit(BLOCK_BIT_VISITED, &bp->bits);
-}
-
 static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
 {
 	struct block_private *bp;
@@ -157,19 +138,20 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
 	 * more careful with a partial page allocator when allocating
 	 * blocks and would make the lru per-page instead of per-block.
 	 */
-	BUILD_BUG_ON(PAGE_SIZE > SCOUTFS_BLOCK_SIZE);
+	BUILD_BUG_ON(PAGE_SIZE > SCOUTFS_BLOCK_LG_SIZE);

 	bp = kzalloc(sizeof(struct block_private), GFP_NOFS);
 	if (!bp)
 		goto out;

-	bp->page = alloc_pages(GFP_NOFS, SCOUTFS_BLOCK_PAGE_ORDER);
+	bp->page = alloc_pages(GFP_NOFS | __GFP_NOWARN,
+			       SCOUTFS_BLOCK_LG_PAGE_ORDER);
 	if (bp->page) {
 		scoutfs_inc_counter(sb, block_cache_alloc_page_order);
 		set_bit(BLOCK_BIT_PAGE_ALLOC, &bp->bits);
 		bp->bl.data = page_address(bp->page);
 	} else {
-		bp->virt = __vmalloc(SCOUTFS_BLOCK_SIZE,
+		bp->virt = __vmalloc(SCOUTFS_BLOCK_LG_SIZE,
 				     GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL);
 		if (!bp->virt) {
 			kfree(bp);
@@ -183,7 +165,6 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
 	}

 	bp->bl.blkno = blkno;
-	RB_CLEAR_NODE(&bp->node);
 	bp->sb = sb;
 	atomic_set(&bp->refcount, 1);
 	INIT_LIST_HEAD(&bp->lru_entry);
@@ -206,7 +187,7 @@ static void block_free(struct super_block *sb, struct block_private *bp)
 	TRACE_BLOCK(free, bp);

 	if (test_bit(BLOCK_BIT_PAGE_ALLOC, &bp->bits))
-		__free_pages(bp->page, SCOUTFS_BLOCK_PAGE_ORDER);
+		__free_pages(bp->page, SCOUTFS_BLOCK_LG_PAGE_ORDER);
 	else if (test_bit(BLOCK_BIT_VIRT, &bp->bits))
 		vfree(bp->virt);
 	else
@@ -253,39 +234,9 @@ static void block_put(struct super_block *sb, struct block_private *bp)
 	}
 }

-static struct block_private *walk_block_rbtree(struct rb_root *root,
-					       u64 blkno,
-					       struct block_private *ins)
-{
-	struct rb_node **node = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct block_private *bp;
-	int cmp;
-
-	while (*node) {
-		parent = *node;
-		bp = container_of(*node, struct block_private, node);
-
-		cmp = scoutfs_cmp_u64s(bp->bl.blkno, blkno);
-		if (cmp == 0)
-			return bp;
-		else if (cmp < 0)
-			node = &(*node)->rb_left;
-		else
-			node = &(*node)->rb_right;
-	}
-
-	if (ins) {
-		rb_link_node(&ins->node, parent, node);
-		rb_insert_color(&ins->node, root);
-		return ins;
-	}
-
-	return NULL;
-}
-
 /*
- * Add a new block into the cache.  The caller holds the lock.
+ * Add a new block into the cache.  The caller holds the lock and has
+ * preloaded the radix.
 */
 static void block_insert(struct super_block *sb, struct block_private *bp,
 			 u64 blkno)
@@ -294,10 +245,9 @@ static void block_insert(struct super_block *sb, struct block_private *bp,

 	assert_spin_locked(&binf->lock);
 	BUG_ON(!list_empty(&bp->lru_entry));
-	BUG_ON(!RB_EMPTY_NODE(&bp->node));

 	atomic_inc(&bp->refcount);
-	walk_block_rbtree(&binf->root, blkno, bp);
+	radix_tree_insert(&binf->radix, blkno, bp);
 	list_add_tail(&bp->lru_entry, &binf->lru_list);
 	bp->lru_moved = ++binf->lru_move_counter;
 	binf->lru_nr++;
@@ -345,10 +295,11 @@ static void block_remove(struct super_block *sb, struct block_private *bp)
 {
 	DECLARE_BLOCK_INFO(sb, binf);

+	assert_spin_locked(&binf->lock);
+
 	if (!test_and_set_bit(BLOCK_BIT_DELETED, &bp->bits)) {
 		BUG_ON(list_empty(&bp->lru_entry));
-		rb_erase(&bp->node, &binf->root);
-		RB_CLEAR_NODE(&bp->node);
+		radix_tree_delete(&binf->radix, bp->bl.blkno);
 		list_del_init(&bp->lru_entry);
 		binf->lru_nr--;
 		block_put(sb, bp);
@@ -368,18 +319,19 @@ static void block_remove_all(struct super_block *sb)
 {
 	DECLARE_BLOCK_INFO(sb, binf);
 	struct block_private *bp;
-	struct rb_node *node;

-	for (node = rb_first(&binf->root); node; ) {
-		bp = container_of(node, struct block_private, node);
-		node = rb_next(node);
+	spin_lock(&binf->lock);
+
+	while (radix_tree_gang_lookup(&binf->radix, (void **)&bp, 0, 1) == 1) {
 		wait_event(binf->waitq, !io_busy(bp));
 		block_remove(sb, bp);
 	}

+	spin_unlock(&binf->lock);
+
 	WARN_ON_ONCE(!list_empty(&binf->lru_list));
 	WARN_ON_ONCE(binf->lru_nr != 0);
-	WARN_ON_ONCE(!RB_EMPTY_ROOT(&binf->root));
+	WARN_ON_ONCE(binf->radix.rnode != NULL);
 }

 /*
@@ -434,6 +386,7 @@ static void block_bio_end_io(struct bio *bio, int err)
 static int block_submit_bio(struct super_block *sb, struct block_private *bp,
 			    int rw)
 {
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct bio *bio = NULL;
 	struct blk_plug plug;
 	struct page *page;
@@ -441,7 +394,7 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
 	sector_t sector;
 	int ret = 0;

-	sector = bp->bl.blkno << (SCOUTFS_BLOCK_SHIFT - 9);
+	sector = bp->bl.blkno << (SCOUTFS_BLOCK_LG_SHIFT - 9);

 	WARN_ON_ONCE(bp->bl.blkno == U64_MAX);
 	WARN_ON_ONCE(sector == U64_MAX || sector == 0);
@@ -453,16 +406,16 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,

 	blk_start_plug(&plug);

-	for (off = 0; off < SCOUTFS_BLOCK_SIZE; off += PAGE_SIZE) {
+	for (off = 0; off < SCOUTFS_BLOCK_LG_SIZE; off += PAGE_SIZE) {
 		if (!bio) {
-			bio = bio_alloc(GFP_NOFS, SCOUTFS_PAGES_PER_BLOCK);
+			bio = bio_alloc(GFP_NOFS, SCOUTFS_BLOCK_LG_PAGES_PER);
 			if (!bio) {
 				ret = -ENOMEM;
 				break;
 			}

 			bio->bi_sector = sector + (off >> 9);
-			bio->bi_bdev = sb->s_bdev;
+			bio->bi_bdev = sbi->meta_bdev;
 			bio->bi_end_io = block_bio_end_io;
 			bio->bi_private = bp;

@@ -497,8 +450,8 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,

 /*
 * Return a reference to a cached block in the system, allocating a new
- * block if one isn't found in the rbtree.  Its contents are undefined
- * if it's newly allocated.
+ * block if one isn't found in the radix.  Its contents are undefined if
+ * it's newly allocated.
 */
 static struct block_private *block_get(struct super_block *sb, u64 blkno)
 {
@@ -507,11 +460,11 @@ static struct block_private *block_get(struct super_block *sb, u64 blkno)
 	struct block_private *bp;
 	int ret;

-	spin_lock(&binf->lock);
-	bp = walk_block_rbtree(&binf->root, blkno, NULL);
+	rcu_read_lock();
+	bp = radix_tree_lookup(&binf->radix, blkno);
 	if (bp)
 		atomic_inc(&bp->refcount);
-	spin_unlock(&binf->lock);
+	rcu_read_unlock();

 	/* drop failed reads that interrupted waiters abandoned */
 	if (bp && (test_bit(BLOCK_BIT_ERROR, &bp->bits) &&
@@ -530,15 +483,20 @@ static struct block_private *block_get(struct super_block *sb, u64 blkno)
 			goto out;
 		}

-		/* could refactor to insert in one walk */
+		ret = radix_tree_preload(GFP_NOFS);
+		if (ret)
+			goto out;
+
+		/* could use slot instead of lookup/insert */
 		spin_lock(&binf->lock);
-		found = walk_block_rbtree(&binf->root, blkno, NULL);
+		found = radix_tree_lookup(&binf->radix, blkno);
 		if (found) {
 			atomic_inc(&found->refcount);
 		} else {
 			block_insert(sb, bp, blkno);
 		}
 		spin_unlock(&binf->lock);
+		radix_tree_preload_end();

 		if (found) {
 			block_put(sb, bp);
@@ -634,6 +592,7 @@ void scoutfs_block_invalidate(struct super_block *sb, struct scoutfs_block *bl)
 	}
 }

+/* This is only used for large metadata blocks */
 bool scoutfs_block_consistent_ref(struct super_block *sb,
 				  struct scoutfs_block *bl,
 				  __le64 seq, __le64 blkno, u32 magic)
@@ -643,7 +602,8 @@ bool scoutfs_block_consistent_ref(struct super_block *sb,
 	struct scoutfs_block_header *hdr = bl->data;

 	if (!test_bit(BLOCK_BIT_CRC_VALID, &bp->bits)) {
-		if (hdr->crc != scoutfs_block_calc_crc(hdr))
+		if (hdr->crc !=
+		    scoutfs_block_calc_crc(hdr, SCOUTFS_BLOCK_LG_SIZE))
 			return false;
 		set_bit(BLOCK_BIT_CRC_VALID, &bp->bits);
 	}
@@ -722,7 +682,7 @@ int scoutfs_block_writer_write(struct super_block *sb,
 	/* checksum everything to reduce time between io submission merging */
 	list_for_each_entry(bp, &wri->dirty_list, dirty_entry) {
 		hdr = bp->bl.data;
-		hdr->crc = scoutfs_block_calc_crc(hdr);
+		hdr->crc = scoutfs_block_calc_crc(hdr, SCOUTFS_BLOCK_LG_SIZE);
 	}

        blk_start_plug(&plug);
@@ -810,44 +770,6 @@ void scoutfs_block_writer_forget(struct super_block *sb,
 	}
 }

-/*
- * Change a cached block's location.  We're careful to only change its
- * position in the rbtree.  If we find another block existing at the new
- * location then we remove it from the cache and forget it if it was
- * dirty.
- */
-void scoutfs_block_move(struct super_block *sb,
-			struct scoutfs_block_writer *wri,
-			struct scoutfs_block *bl, u64 blkno)
-{
-	DECLARE_BLOCK_INFO(sb, binf);
-	struct block_private *bp = BLOCK_PRIVATE(bl);
-	struct block_private *existing = NULL;
-
-	spin_lock(&binf->lock);
-
-	existing = walk_block_rbtree(&binf->root, blkno, NULL);
-	if (existing) {
-		/* only nesting of binf and wri locks */
-		if (test_bit(BLOCK_BIT_DIRTY, &bp->bits)) {
-			spin_lock(&wri->lock);
-			if (test_bit(BLOCK_BIT_DIRTY, &bp->bits))
-				block_forget(sb, wri, bp);
-			spin_unlock(&wri->lock);
-		}
-		block_remove(sb, existing);
-	}
-
-	rb_erase(&bp->node, &binf->root);
-	RB_CLEAR_NODE(&bp->node);
-	bp->bl.blkno = blkno;
-	walk_block_rbtree(&binf->root, blkno, bp);
-
-	TRACE_BLOCK(move, bp);
-
-	spin_unlock(&binf->lock);
-}
-
 /*
 * The caller has ensured that no more dirtying will take place.  This
 * helps the caller avoid doing a bunch of work before calling into the
@@ -866,7 +788,7 @@ bool scoutfs_block_writer_has_dirty(struct super_block *sb,
 u64 scoutfs_block_writer_dirty_bytes(struct super_block *sb,
 				     struct scoutfs_block_writer *wri)
 {
-	return wri->nr_dirty_blocks * SCOUTFS_BLOCK_SIZE;
+	return wri->nr_dirty_blocks * SCOUTFS_BLOCK_LG_SIZE;
 }

 /*
@@ -916,12 +838,9 @@ static int block_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	spin_unlock(&binf->lock);

 out:
-	return min_t(u64, binf->lru_nr * SCOUTFS_PAGES_PER_BLOCK, INT_MAX);
+	return min_t(u64, binf->lru_nr * SCOUTFS_BLOCK_LG_PAGES_PER, INT_MAX);
 }

-#define SCOUTFS_SM_BLOCK_SHIFT	12
-#define SCOUTFS_SM_BLOCK_SIZE	(1 << SCOUTFS_SM_BLOCK_SHIFT)
-
 struct sm_block_completion {
 	struct completion comp;
 	int err;
@@ -946,7 +865,7 @@ static void sm_block_bio_end_io(struct bio *bio, int err)
 * only layer that sees the full block buffer so we pass the calculated
 * crc to the caller for them to check in their context.
 */
-static int sm_block_io(struct super_block *sb, int rw, u64 blkno,
+static int sm_block_io(struct block_device *bdev, int rw, u64 blkno,
 		       struct scoutfs_block_header *hdr, size_t len,
 		       __le32 *blk_crc)
 {
@@ -956,11 +875,9 @@ static int sm_block_io(struct super_block *sb, int rw, u64 blkno,
 	struct bio *bio;
 	int ret;

-	BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_SM_BLOCK_SIZE);
-	/* block calc crc is assuming block size, they'll be different later */
-	BUILD_BUG_ON(SCOUTFS_SM_BLOCK_SIZE != SCOUTFS_BLOCK_SIZE);
+	BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SM_SIZE);

-	if (WARN_ON_ONCE(len > SCOUTFS_SM_BLOCK_SIZE) ||
+	if (WARN_ON_ONCE(len > SCOUTFS_BLOCK_SM_SIZE) ||
 	    WARN_ON_ONCE(!(rw & WRITE) && !blk_crc))
 		return -EINVAL;

@@ -972,10 +889,11 @@ static int sm_block_io(struct super_block *sb, int rw, u64 blkno,

 	if (rw & WRITE) {
 		memcpy(pg_hdr, hdr, len);
-		if (len < SCOUTFS_SM_BLOCK_SIZE)
+		if (len < SCOUTFS_BLOCK_SM_SIZE)
 			memset((char *)pg_hdr + len, 0,
-			       SCOUTFS_SM_BLOCK_SIZE - len);
-		pg_hdr->crc = scoutfs_block_calc_crc(pg_hdr);
+			       SCOUTFS_BLOCK_SM_SIZE - len);
+		pg_hdr->crc = scoutfs_block_calc_crc(pg_hdr,
+						     SCOUTFS_BLOCK_SM_SIZE);
 	}

 	bio = bio_alloc(GFP_NOFS, 1);
@@ -984,11 +902,11 @@ static int sm_block_io(struct super_block *sb, int rw, u64 blkno,
 		goto out;
 	}

-	bio->bi_sector = blkno << (SCOUTFS_SM_BLOCK_SHIFT - 9);
-	bio->bi_bdev = sb->s_bdev;
+	bio->bi_sector = blkno << (SCOUTFS_BLOCK_SM_SHIFT - 9);
+	bio->bi_bdev = bdev;
 	bio->bi_end_io = sm_block_bio_end_io;
 	bio->bi_private = &sbc;
-	bio_add_page(bio, page, SCOUTFS_SM_BLOCK_SIZE, 0);
+	bio_add_page(bio, page, SCOUTFS_BLOCK_SM_SIZE, 0);

 	init_completion(&sbc.comp);
 	sbc.err = 0;
@@ -1000,32 +918,44 @@ static int sm_block_io(struct super_block *sb, int rw, u64 blkno,

 	if (ret == 0 && !(rw & WRITE)) {
 		memcpy(hdr, pg_hdr, len);
-		*blk_crc = scoutfs_block_calc_crc(pg_hdr);
+		*blk_crc = scoutfs_block_calc_crc(pg_hdr,
+						  SCOUTFS_BLOCK_SM_SIZE);
 	}
 out:
 	__free_page(page);
 	return ret;
 }

-int scoutfs_block_read_sm(struct super_block *sb, u64 blkno,
+int scoutfs_block_read_sm(struct super_block *sb,
+			  struct block_device *bdev, u64 blkno,
 			  struct scoutfs_block_header *hdr, size_t len,
 			  __le32 *blk_crc)
 {
-	return sm_block_io(sb, READ, blkno, hdr, len, blk_crc);
+	return sm_block_io(bdev, READ, blkno, hdr, len, blk_crc);
 }

-int scoutfs_block_write_sm(struct super_block *sb, u64 blkno,
+int scoutfs_block_write_sm(struct super_block *sb,
+			   struct block_device *bdev, u64 blkno,
 			   struct scoutfs_block_header *hdr, size_t len)
 {
-	return sm_block_io(sb, WRITE, blkno, hdr, len, NULL);
+	return sm_block_io(bdev, WRITE, blkno, hdr, len, NULL);
 }

 int scoutfs_block_setup(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct block_info *binf;
+	loff_t size;
 	int ret;

+	/* we store blknos in longs in the radix */
+	size = i_size_read(sb->s_bdev->bd_inode);
+	if ((size >> SCOUTFS_BLOCK_LG_SHIFT) >= LONG_MAX) {
+		scoutfs_err(sb, "Cant reference all blocks in %llu byte device with %u bit long radix tree indexes",
+			size, BITS_PER_LONG);
+		return -EINVAL;
+	}
+
 	binf = kzalloc(sizeof(struct block_info), GFP_KERNEL);
 	if (!binf) {
 		ret = -ENOMEM;
@@ -1034,7 +964,7 @@ int scoutfs_block_setup(struct super_block *sb)

 	binf->sb = sb;
 	spin_lock_init(&binf->lock);
-	binf->root = RB_ROOT;
+	INIT_RADIX_TREE(&binf->radix, GFP_ATOMIC); /* insertion preloads */
 	INIT_LIST_HEAD(&binf->lru_list);
 	init_waitqueue_head(&binf->waitq);
 	binf->shrinker.shrink = block_shrink;
--- a/kmod/src/block.h
+++ b/kmod/src/block.h
@@ -10,17 +10,14 @@ struct scoutfs_block_writer {
 struct scoutfs_block {
 	u64 blkno;
 	void *data;
+	void *priv;
 };

-__le32 scoutfs_block_calc_crc(struct scoutfs_block_header *hdr);
-bool scoutfs_block_valid_crc(struct scoutfs_block_header *hdr);
+__le32 scoutfs_block_calc_crc(struct scoutfs_block_header *hdr, u32 size);
+bool scoutfs_block_valid_crc(struct scoutfs_block_header *hdr, u32 size);
 bool scoutfs_block_valid_ref(struct super_block *sb,
 			     struct scoutfs_block_header *hdr,
 			     __le64 seq, __le64 blkno);
-bool scoutfs_block_tas_visited(struct super_block *sb,
-			       struct scoutfs_block *bl);
-void scoutfs_block_clear_visited(struct super_block *sb,
-				 struct scoutfs_block *bl);

 struct scoutfs_block *scoutfs_block_create(struct super_block *sb, u64 blkno);
 struct scoutfs_block *scoutfs_block_read(struct super_block *sb, u64 blkno);
@@ -44,18 +41,17 @@ void scoutfs_block_writer_forget_all(struct super_block *sb,
 void scoutfs_block_writer_forget(struct super_block *sb,
 			         struct scoutfs_block_writer *wri,
 				 struct scoutfs_block *bl);
-void scoutfs_block_move(struct super_block *sb,
-			struct scoutfs_block_writer *wri,
-			struct scoutfs_block *bl, u64 blkno);
 bool scoutfs_block_writer_has_dirty(struct super_block *sb,
 				    struct scoutfs_block_writer *wri);
 u64 scoutfs_block_writer_dirty_bytes(struct super_block *sb,
 				     struct scoutfs_block_writer *wri);

-int scoutfs_block_read_sm(struct super_block *sb, u64 blkno,
+int scoutfs_block_read_sm(struct super_block *sb,
+			  struct block_device *bdev, u64 blkno,
 			  struct scoutfs_block_header *hdr, size_t len,
 			  __le32 *blk_crc);
-int scoutfs_block_write_sm(struct super_block *sb, u64 blkno,
+int scoutfs_block_write_sm(struct super_block *sb,
+			   struct block_device *bdev, u64 blkno,
 			   struct scoutfs_block_header *hdr, size_t len);

 int scoutfs_block_setup(struct super_block *sb);
--- a/kmod/src/btree.c
+++ b/kmod/src/btree.c
--- a/kmod/src/btree.h
+++ b/kmod/src/btree.h
@@ -3,15 +3,14 @@

 #include <linux/uio.h>

-struct scoutfs_radix_allocator;
+struct scoutfs_alloc;
 struct scoutfs_block_writer;
 struct scoutfs_block;

 struct scoutfs_btree_item_ref {
 	struct super_block *sb;
 	struct scoutfs_block *bl;
-	void *key;
-	unsigned key_len;
+	struct scoutfs_key *key;
 	void *val;
 	unsigned val_len;
 };
@@ -19,50 +18,69 @@ struct scoutfs_btree_item_ref {
 #define SCOUTFS_BTREE_ITEM_REF(name) \
 	struct scoutfs_btree_item_ref name = {NULL,}

+/* caller gives an item to the callback */
+typedef int (*scoutfs_btree_item_cb)(struct super_block *sb,
+				     struct scoutfs_key *key,
+				     void *val, int val_len, void *arg);

-int scoutfs_btree_lookup(struct super_block *sb, struct scoutfs_btree_root *root,
-			 void *key, unsigned key_len,
+/* simple singly-linked list of items */
+struct scoutfs_btree_item_list {
+	struct scoutfs_btree_item_list *next;
+	struct scoutfs_key key;
+	int val_len;
+	u8 val[0];
+};
+
+int scoutfs_btree_lookup(struct super_block *sb,
+			 struct scoutfs_btree_root *root,
+			 struct scoutfs_key *key,
 			 struct scoutfs_btree_item_ref *iref);
 int scoutfs_btree_insert(struct super_block *sb,
-			 struct scoutfs_radix_allocator *alloc,
+			 struct scoutfs_alloc *alloc,
 			 struct scoutfs_block_writer *wri,
 			 struct scoutfs_btree_root *root,
-			 void *key, unsigned key_len,
+			 struct scoutfs_key *key,
 			 void *val, unsigned val_len);
 int scoutfs_btree_update(struct super_block *sb,
-			 struct scoutfs_radix_allocator *alloc,
+			 struct scoutfs_alloc *alloc,
 			 struct scoutfs_block_writer *wri,
 			 struct scoutfs_btree_root *root,
-			 void *key, unsigned key_len,
+			 struct scoutfs_key *key,
 			 void *val, unsigned val_len);
 int scoutfs_btree_force(struct super_block *sb,
-			struct scoutfs_radix_allocator *alloc,
+			struct scoutfs_alloc *alloc,
 			struct scoutfs_block_writer *wri,
 			struct scoutfs_btree_root *root,
-			void *key, unsigned key_len,
+			struct scoutfs_key *key,
 			void *val, unsigned val_len);
 int scoutfs_btree_delete(struct super_block *sb,
-			 struct scoutfs_radix_allocator *alloc,
+			 struct scoutfs_alloc *alloc,
 			 struct scoutfs_block_writer *wri,
 			 struct scoutfs_btree_root *root,
-			 void *key, unsigned key_len);
+			 struct scoutfs_key *key);
 int scoutfs_btree_next(struct super_block *sb, struct scoutfs_btree_root *root,
-		       void *key, unsigned key_len,
+		       struct scoutfs_key *key,
 		       struct scoutfs_btree_item_ref *iref);
-int scoutfs_btree_after(struct super_block *sb, struct scoutfs_btree_root *root,
-		        void *key, unsigned key_len,
-		        struct scoutfs_btree_item_ref *iref);
 int scoutfs_btree_prev(struct super_block *sb, struct scoutfs_btree_root *root,
-		       void *key, unsigned key_len,
+		       struct scoutfs_key *key,
 		       struct scoutfs_btree_item_ref *iref);
-int scoutfs_btree_before(struct super_block *sb, struct scoutfs_btree_root *root,
-		         void *key, unsigned key_len,
-		         struct scoutfs_btree_item_ref *iref);
 int scoutfs_btree_dirty(struct super_block *sb,
-			struct scoutfs_radix_allocator *alloc,
+			struct scoutfs_alloc *alloc,
 			struct scoutfs_block_writer *wri,
 			struct scoutfs_btree_root *root,
-			void *key, unsigned key_len);
+			struct scoutfs_key *key);
+
+int scoutfs_btree_read_items(struct super_block *sb,
+			     struct scoutfs_btree_root *root,
+			     struct scoutfs_key *key,
+			     struct scoutfs_key *start,
+			     struct scoutfs_key *end,
+			     scoutfs_btree_item_cb cb, void *arg);
+int scoutfs_btree_insert_list(struct super_block *sb,
+			      struct scoutfs_alloc *alloc,
+			      struct scoutfs_block_writer *wri,
+			      struct scoutfs_btree_root *root,
+			      struct scoutfs_btree_item_list *lst);

 void scoutfs_btree_put_iref(struct scoutfs_btree_item_ref *iref);

--- a/kmod/src/client.c
+++ b/kmod/src/client.c
@@ -108,19 +108,27 @@ int scoutfs_client_commit_log_trees(struct super_block *sb,
 					lt, sizeof(*lt), NULL, 0);
 }

+int scoutfs_client_get_roots(struct super_block *sb,
+			     struct scoutfs_net_roots *roots)
+{
+	struct client_info *client = SCOUTFS_SB(sb)->client_info;
+
+	return scoutfs_net_sync_request(sb, client->conn,
+					SCOUTFS_NET_CMD_GET_ROOTS,
+					NULL, 0, roots, sizeof(*roots));
+}
+
 int scoutfs_client_advance_seq(struct super_block *sb, u64 *seq)
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
-	__le64 before = cpu_to_le64p(seq);
-	__le64 after;
+	__le64 leseq;
 	int ret;

 	ret = scoutfs_net_sync_request(sb, client->conn,
 				       SCOUTFS_NET_CMD_ADVANCE_SEQ,
-				       &before, sizeof(before),
-				       &after, sizeof(after));
+				       NULL, 0, &leseq, sizeof(leseq));
 	if (ret == 0)
-		*seq = le64_to_cpu(after);
+		*seq = le64_to_cpu(leseq);

 	return ret;
 }
@@ -140,24 +148,13 @@ int scoutfs_client_get_last_seq(struct super_block *sb, u64 *seq)
 	return ret;
 }

-int scoutfs_client_statfs(struct super_block *sb,
-			  struct scoutfs_net_statfs *nstatfs)
-{
-	struct client_info *client = SCOUTFS_SB(sb)->client_info;
-
-	return scoutfs_net_sync_request(sb, client->conn,
-					SCOUTFS_NET_CMD_STATFS, NULL, 0,
-					nstatfs,
-					sizeof(struct scoutfs_net_statfs));
-}
-
 /* process an incoming grant response from the server */
 static int client_lock_response(struct super_block *sb,
 				struct scoutfs_net_connection *conn,
 				void *resp, unsigned int resp_len,
 				int error, void *data)
 {
-	if (resp_len != sizeof(struct scoutfs_net_lock))
+	if (resp_len != sizeof(struct scoutfs_net_lock_grant_response))
 		return -EINVAL;

 	/* XXX error? */
@@ -200,6 +197,28 @@ int scoutfs_client_lock_recover_response(struct super_block *sb, u64 net_id,
 				    net_id, 0, nlr, bytes);
 }

+/* Find srch files that need to be compacted. */
+int scoutfs_client_srch_get_compact(struct super_block *sb,
+				    struct scoutfs_srch_compact *sc)
+{
+	struct client_info *client = SCOUTFS_SB(sb)->client_info;
+
+	return scoutfs_net_sync_request(sb, client->conn,
+					SCOUTFS_NET_CMD_SRCH_GET_COMPACT,
+					NULL, 0, sc, sizeof(*sc));
+}
+
+/* Commit the result of a srch file compaction. */
+int scoutfs_client_srch_commit_compact(struct super_block *sb,
+				       struct scoutfs_srch_compact *res)
+{
+	struct client_info *client = SCOUTFS_SB(sb)->client_info;
+
+	return scoutfs_net_sync_request(sb, client->conn,
+					SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT,
+					res, sizeof(*res), NULL, 0);
+}
+
 /* The client is receiving a invalidation request from the server */
 static int client_lock(struct super_block *sb,
 		       struct scoutfs_net_connection *conn, u8 cmd, u64 id,
@@ -261,10 +280,10 @@ static int client_greeting(struct super_block *sb,
 		goto out;
 	}

-	if (gr->format_hash != super->format_hash) {
+	if (gr->version != super->version) {
 		scoutfs_warn(sb, "server sent format 0x%llx, client has 0x%llx",
-			     le64_to_cpu(gr->format_hash),
-			     le64_to_cpu(super->format_hash));
+			     le64_to_cpu(gr->version),
+			     le64_to_cpu(super->version));
 		ret = -EINVAL;
 		goto out;
 	}
@@ -373,7 +392,7 @@ static void scoutfs_client_connect_worker(struct work_struct *work)

 	/* send a greeting to verify endpoints of each connection */
 	greet.fsid = super->hdr.fsid;
-	greet.format_hash = super->format_hash;
+	greet.version = super->version;
 	greet.server_term = cpu_to_le64(client->server_term);
 	greet.unmount_barrier = cpu_to_le64(client->greeting_umb);
 	greet.rid = cpu_to_le64(sbi->rid);
--- a/kmod/src/client.h
+++ b/kmod/src/client.h
@@ -7,17 +7,21 @@ int scoutfs_client_get_log_trees(struct super_block *sb,
 				 struct scoutfs_log_trees *lt);
 int scoutfs_client_commit_log_trees(struct super_block *sb,
 				    struct scoutfs_log_trees *lt);
+int scoutfs_client_get_roots(struct super_block *sb,
+			     struct scoutfs_net_roots *roots);
 u64 *scoutfs_client_bulk_alloc(struct super_block *sb);
 int scoutfs_client_advance_seq(struct super_block *sb, u64 *seq);
 int scoutfs_client_get_last_seq(struct super_block *sb, u64 *seq);
-int scoutfs_client_statfs(struct super_block *sb,
-			  struct scoutfs_net_statfs *nstatfs);
 int scoutfs_client_lock_request(struct super_block *sb,
 				struct scoutfs_net_lock *nl);
 int scoutfs_client_lock_response(struct super_block *sb, u64 net_id,
 				struct scoutfs_net_lock *nl);
 int scoutfs_client_lock_recover_response(struct super_block *sb, u64 net_id,
 					 struct scoutfs_net_lock_recover *nlr);
+int scoutfs_client_srch_get_compact(struct super_block *sb,
+				    struct scoutfs_srch_compact *sc);
+int scoutfs_client_srch_commit_compact(struct super_block *sb,
+				       struct scoutfs_srch_compact *res);

 int scoutfs_client_setup(struct super_block *sb);
 void scoutfs_client_destroy(struct super_block *sb);
--- a/kmod/src/count.h
+++ b/kmod/src/count.h
@@ -1,319 +0,0 @@
-#ifndef _SCOUTFS_COUNT_H_
-#define _SCOUTFS_COUNT_H_
-
-/*
- * Our estimate of the space consumed while dirtying items is based on
- * the number of items and the size of their values.
- *
- * The estimate is still a read-only input to entering the transaction.
- * We'd like to use it as a clean rhs arg to hold_trans.  We define SIC_
- * functions which return the count struct.  This lets us have a single
- * arg and avoid bugs in initializing and passing in struct pointers
- * from callers.  The internal __count functions are used compose an
- * estimate out of the sets of items it manipulates.  We program in much
- * clearer C instead of in the preprocessor.
- *
- * Compilers are able to collapse the inlines into constants for the
- * constant estimates.
- */
-
-struct scoutfs_item_count {
-	signed items;
-	signed vals;
-};
-
-/* The caller knows exactly what they're doing. */
-static inline const struct scoutfs_item_count SIC_EXACT(signed items,
-							signed vals)
-{
-	struct scoutfs_item_count cnt = {
-		.items = items,
-		.vals = vals,
-	};
-
-	return cnt;
-}
-
-/*
- * Allocating an inode creates a new set of indexed items.
- */
-static inline void __count_alloc_inode(struct scoutfs_item_count *cnt)
-{
-	const int nr_indices = SCOUTFS_INODE_INDEX_NR;
-
-	cnt->items += 1 + nr_indices;
-	cnt->vals += sizeof(struct scoutfs_inode);
-}
-
-/*
- * Dirtying an inode dirties the inode item and can delete and create
- * the full set of indexed items.
- */
-static inline void __count_dirty_inode(struct scoutfs_item_count *cnt)
-{
-	const int nr_indices = 2 * SCOUTFS_INODE_INDEX_NR;
-
-	cnt->items += 1 + nr_indices;
-	cnt->vals += sizeof(struct scoutfs_inode);
-}
-
-static inline const struct scoutfs_item_count SIC_ALLOC_INODE(void)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_alloc_inode(&cnt);
-
-	return cnt;
-}
-
-static inline const struct scoutfs_item_count SIC_DIRTY_INODE(void)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_dirty_inode(&cnt);
-
-	return cnt;
-}
-
-/*
- * Directory entries are stored in three items.
- */
-static inline void __count_dirents(struct scoutfs_item_count *cnt,
-				   unsigned name_len)
-{
-	cnt->items += 3;
-	cnt->vals += 3 * offsetof(struct scoutfs_dirent, name[name_len]);
-}
-
-static inline void __count_sym_target(struct scoutfs_item_count *cnt,
-				      unsigned size)
-{
-	unsigned nr = DIV_ROUND_UP(size, SCOUTFS_MAX_VAL_SIZE);
-
-	cnt->items += nr;
-	cnt->vals += size;
-}
-
-static inline void __count_orphan(struct scoutfs_item_count *cnt)
-{
-
-	cnt->items += 1;
-}
-
-static inline void __count_mknod(struct scoutfs_item_count *cnt,
-				 unsigned name_len)
-{
-	__count_alloc_inode(cnt);
-	__count_dirents(cnt, name_len);
-	__count_dirty_inode(cnt);
-}
-
-static inline const struct scoutfs_item_count SIC_MKNOD(unsigned name_len)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_mknod(&cnt, name_len);
-
-	return cnt;
-}
-
-/*
- * Dropping the inode deletes all its items.  Potentially enormous numbers
- * of items (data mapping, xattrs) are deleted in their own transactions.
- */
-static inline const struct scoutfs_item_count SIC_DROP_INODE(int mode,
-							     u64 size)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	if (S_ISLNK(mode))
-		__count_sym_target(&cnt, size);
-	__count_dirty_inode(&cnt);
-	__count_orphan(&cnt);
-
-	cnt.vals = 0;
-	return cnt;
-}
-
-static inline const struct scoutfs_item_count SIC_LINK(unsigned name_len)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_dirents(&cnt, name_len);
-	__count_dirty_inode(&cnt);
-	__count_dirty_inode(&cnt);
-
-	return cnt;
-}
-
-/*
- * Unlink can add orphan items.
- */
-static inline const struct scoutfs_item_count SIC_UNLINK(unsigned name_len)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_dirents(&cnt, name_len);
-	__count_dirty_inode(&cnt);
-	__count_dirty_inode(&cnt);
-	__count_orphan(&cnt);
-
-	return cnt;
-}
-
-static inline const struct scoutfs_item_count SIC_SYMLINK(unsigned name_len,
-							  unsigned size)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_mknod(&cnt, name_len);
-	__count_sym_target(&cnt, size);
-
-	return cnt;
-}
-
-/*
- * This assumes the worst case of a rename between directories that
- * unlinks an existing target.  That'll be worse than the common case
- * by a few hundred bytes.
- */
-static inline const struct scoutfs_item_count SIC_RENAME(unsigned old_len,
-							 unsigned new_len)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	/* dirty dirs and inodes */
-	__count_dirty_inode(&cnt);
-	__count_dirty_inode(&cnt);
-	__count_dirty_inode(&cnt);
-	__count_dirty_inode(&cnt);
-
-	/* unlink old and new, link new */
-	__count_dirents(&cnt, old_len);
-	__count_dirents(&cnt, new_len);
-	__count_dirents(&cnt, new_len);
-
-	/* orphan the existing target */
-	__count_orphan(&cnt);
-
-	return cnt;
-}
-
-/*
- * Creating an xattr results in a dirty set of items with values that
- * store the xattr header, name, and value.  There's always at least one
- * item with the header and name.  Any previously existing items are
- * deleted which dirties their key but removes their value.  The two
- * sets of items are indexed by different ids so their items don't
- * overlap.  If the xattr name is indexed then we modify one xattr index
- * item.
- */
-static inline const struct scoutfs_item_count SIC_XATTR_SET(unsigned old_parts,
-							    bool creating,
-							    unsigned name_len,
-							    unsigned size,
-							    bool indexed)
-{
-	struct scoutfs_item_count cnt = {0,};
-	unsigned int new_parts;
-
-	__count_dirty_inode(&cnt);
-
-	if (old_parts)
-		cnt.items += old_parts;
-	if (indexed)
-		cnt.items++;
-
-	if (creating) {
-		new_parts = SCOUTFS_XATTR_NR_PARTS(name_len, size);
-
-		cnt.items += new_parts;
-		cnt.vals += sizeof(struct scoutfs_xattr) + name_len + size;
-	}
-
-	return cnt;
-}
-
-/*
- * write_begin can have to allocate all the blocks in the page and can
- * have to add a big allocation from the server to do so:
- *  - merge added free extents from the server
- *  - remove a free extent per block
- *  - remove an offline extent for every other block
- *  - add a file extent per block
- */
-static inline const struct scoutfs_item_count SIC_WRITE_BEGIN(void)
-{
-	struct scoutfs_item_count cnt = {0,};
-	unsigned nr_free = (1 + SCOUTFS_BLOCKS_PER_PAGE) * 3;
-	unsigned nr_file = (DIV_ROUND_UP(SCOUTFS_BLOCKS_PER_PAGE, 2) +
-			    SCOUTFS_BLOCKS_PER_PAGE) * 3;
-
-	__count_dirty_inode(&cnt);
-
-	cnt.items += nr_free + nr_file;
-	cnt.vals += nr_file;
-
-	return cnt;
-}
-
-/*
- * Truncating an extent can:
- *  - delete existing file extent,
- *  - create two surrounding file extents,
- *  - add an offline file extent,
- *  - delete two existing free extents
- *  - create a merged free extent
- */
-static inline const struct scoutfs_item_count
-SIC_TRUNC_EXTENT(struct inode *inode)
-{
-	struct scoutfs_item_count cnt = {0,};
-	unsigned int nr_file = 1 + 2 + 1;
-	unsigned int nr_free = (2 + 1) * 2;
-
-	if (inode)
-		__count_dirty_inode(&cnt);
-
-	cnt.items += nr_file + nr_free;
-	cnt.vals += nr_file;
-
-	return cnt;
-}
-
-/*
- * Fallocating an extent can, at most:
- *  - allocate from the server: delete two free and insert merged
- *  - free an allocated extent: delete one and create two split
- *  - remove an unallocated file extent: delete one and create two split
- *  - add an fallocated flie extent: delete two and inset one merged
- */
-static inline const struct scoutfs_item_count SIC_FALLOCATE_ONE(void)
-{
-	struct scoutfs_item_count cnt = {0,};
-	unsigned int nr_free = ((1 + 2) * 2) * 2;
-	unsigned int nr_file = (1 + 2) * 2;
-
-	__count_dirty_inode(&cnt);
-
-	cnt.items += nr_free + nr_file;
-	cnt.vals += nr_file;
-
-	return cnt;
-}
-
-/*
- * ioc_setattr_more can dirty the inode and add a single offline extent.
- */
-static inline const struct scoutfs_item_count SIC_SETATTR_MORE(void)
-{
-	struct scoutfs_item_count cnt = {0,};
-
-	__count_dirty_inode(&cnt);
-
-	cnt.items++;
-
-	return cnt;
-}
-
-#endif
--- a/kmod/src/counters.h
+++ b/kmod/src/counters.h
@@ -12,6 +12,15 @@
 * other places by this macro.  Don't forget to update LAST_COUNTER.
 */
 #define EXPAND_EACH_COUNTER					\
+	EXPAND_COUNTER(alloc_alloc_data)			\
+	EXPAND_COUNTER(alloc_alloc_meta)			\
+	EXPAND_COUNTER(alloc_free_data)				\
+	EXPAND_COUNTER(alloc_free_meta)				\
+	EXPAND_COUNTER(alloc_list_avail_lo)			\
+	EXPAND_COUNTER(alloc_list_freed_hi)			\
+	EXPAND_COUNTER(alloc_move)				\
+	EXPAND_COUNTER(alloc_moved_extent)			\
+	EXPAND_COUNTER(alloc_stale_cached_list_block)		\
 	EXPAND_COUNTER(block_cache_access)			\
 	EXPAND_COUNTER(block_cache_alloc_failure)		\
 	EXPAND_COUNTER(block_cache_alloc_page_order)		\
@@ -22,8 +31,23 @@
 	EXPAND_COUNTER(block_cache_invalidate)			\
 	EXPAND_COUNTER(block_cache_lru_move)			\
 	EXPAND_COUNTER(block_cache_shrink)			\
+	EXPAND_COUNTER(btree_compact_values)			\
+	EXPAND_COUNTER(btree_compact_values_enomem)		\
+	EXPAND_COUNTER(btree_delete)				\
+	EXPAND_COUNTER(btree_dirty)				\
+	EXPAND_COUNTER(btree_force)				\
+	EXPAND_COUNTER(btree_join)				\
+	EXPAND_COUNTER(btree_insert)				\
+	EXPAND_COUNTER(btree_leaf_item_hash_search)		\
+	EXPAND_COUNTER(btree_lookup)				\
+	EXPAND_COUNTER(btree_next)				\
+	EXPAND_COUNTER(btree_prev)				\
 	EXPAND_COUNTER(btree_read_error)			\
+	EXPAND_COUNTER(btree_split)				\
 	EXPAND_COUNTER(btree_stale_read)			\
+	EXPAND_COUNTER(btree_update)				\
+	EXPAND_COUNTER(btree_walk)				\
+	EXPAND_COUNTER(btree_walk_restart)			\
 	EXPAND_COUNTER(client_farewell_error)			\
 	EXPAND_COUNTER(corrupt_btree_block_level)		\
 	EXPAND_COUNTER(corrupt_btree_no_child_ref)		\
@@ -34,6 +58,8 @@
 	EXPAND_COUNTER(corrupt_symlink_inode_size)		\
 	EXPAND_COUNTER(corrupt_symlink_missing_item)		\
 	EXPAND_COUNTER(corrupt_symlink_not_null_term)		\
+	EXPAND_COUNTER(data_fallocate_enobufs_retry)		\
+	EXPAND_COUNTER(data_write_begin_enobufs_retry)		\
 	EXPAND_COUNTER(dentry_revalidate_error)			\
 	EXPAND_COUNTER(dentry_revalidate_invalid)		\
 	EXPAND_COUNTER(dentry_revalidate_locked)		\
@@ -42,25 +68,65 @@
 	EXPAND_COUNTER(dentry_revalidate_root)			\
 	EXPAND_COUNTER(dentry_revalidate_valid)			\
 	EXPAND_COUNTER(dir_backref_excessive_retries)		\
+	EXPAND_COUNTER(ext_op_insert)				\
+	EXPAND_COUNTER(ext_op_next)				\
+	EXPAND_COUNTER(ext_op_remove)				\
+	EXPAND_COUNTER(forest_bloom_fail)			\
+	EXPAND_COUNTER(forest_bloom_pass)			\
+	EXPAND_COUNTER(forest_read_items)			\
+	EXPAND_COUNTER(forest_roots_next_hint)			\
+	EXPAND_COUNTER(forest_set_bloom_bits)			\
+	EXPAND_COUNTER(item_clear_dirty)			\
+	EXPAND_COUNTER(item_create)				\
+	EXPAND_COUNTER(item_delete)				\
+	EXPAND_COUNTER(item_dirty)				\
+	EXPAND_COUNTER(item_invalidate)				\
+	EXPAND_COUNTER(item_invalidate_page)			\
+	EXPAND_COUNTER(item_lookup)				\
+	EXPAND_COUNTER(item_mark_dirty)				\
+	EXPAND_COUNTER(item_next)				\
+	EXPAND_COUNTER(item_page_accessed)			\
+	EXPAND_COUNTER(item_page_alloc)				\
+	EXPAND_COUNTER(item_page_clear_dirty)			\
+	EXPAND_COUNTER(item_page_compact)			\
+	EXPAND_COUNTER(item_page_free)				\
+	EXPAND_COUNTER(item_page_lru_add)			\
+	EXPAND_COUNTER(item_page_lru_remove)			\
+	EXPAND_COUNTER(item_page_mark_dirty)			\
+	EXPAND_COUNTER(item_page_rbtree_walk)			\
+	EXPAND_COUNTER(item_page_split)				\
+	EXPAND_COUNTER(item_pcpu_add_replaced)			\
+	EXPAND_COUNTER(item_pcpu_page_hit)			\
+	EXPAND_COUNTER(item_pcpu_page_miss)			\
+	EXPAND_COUNTER(item_pcpu_page_miss_keys)		\
+	EXPAND_COUNTER(item_read_pages_split)			\
+	EXPAND_COUNTER(item_shrink_page)			\
+	EXPAND_COUNTER(item_shrink_page_dirty)			\
+	EXPAND_COUNTER(item_shrink_page_reader)			\
+	EXPAND_COUNTER(item_shrink_page_trylock)		\
+	EXPAND_COUNTER(item_update)				\
+	EXPAND_COUNTER(item_write_dirty)			\
 	EXPAND_COUNTER(lock_alloc)				\
 	EXPAND_COUNTER(lock_free)				\
-	EXPAND_COUNTER(lock_grace_elapsed)			\
 	EXPAND_COUNTER(lock_grace_extended)			\
 	EXPAND_COUNTER(lock_grace_set)				\
 	EXPAND_COUNTER(lock_grace_wait)				\
 	EXPAND_COUNTER(lock_grant_request)			\
 	EXPAND_COUNTER(lock_grant_response)			\
-	EXPAND_COUNTER(lock_invalidate_commit)			\
+	EXPAND_COUNTER(lock_grant_work)				\
 	EXPAND_COUNTER(lock_invalidate_coverage)		\
 	EXPAND_COUNTER(lock_invalidate_inode)			\
 	EXPAND_COUNTER(lock_invalidate_request)			\
 	EXPAND_COUNTER(lock_invalidate_response)		\
+	EXPAND_COUNTER(lock_invalidate_sync)			\
+	EXPAND_COUNTER(lock_invalidate_work)			\
 	EXPAND_COUNTER(lock_lock)				\
 	EXPAND_COUNTER(lock_lock_error)				\
 	EXPAND_COUNTER(lock_nonblock_eagain)			\
 	EXPAND_COUNTER(lock_recover_request)			\
-	EXPAND_COUNTER(lock_shrink_queued)			\
-	EXPAND_COUNTER(lock_shrink_request_aborted)		\
+	EXPAND_COUNTER(lock_shrink_attempted)			\
+	EXPAND_COUNTER(lock_shrink_aborted)			\
+	EXPAND_COUNTER(lock_shrink_work)			\
 	EXPAND_COUNTER(lock_unlock)				\
 	EXPAND_COUNTER(lock_wait)				\
 	EXPAND_COUNTER(net_dropped_response)			\
@@ -85,17 +151,37 @@
 	EXPAND_COUNTER(quorum_write_block)			\
 	EXPAND_COUNTER(quorum_write_block_error)		\
 	EXPAND_COUNTER(quorum_fenced)				\
-	EXPAND_COUNTER(radix_enospc_data)			\
-	EXPAND_COUNTER(radix_enospc_paths)			\
-	EXPAND_COUNTER(radix_enospc_synth)			\
+	EXPAND_COUNTER(server_commit_hold)			\
+	EXPAND_COUNTER(server_commit_queue)			\
+	EXPAND_COUNTER(server_commit_worker)			\
+	EXPAND_COUNTER(srch_add_entry)				\
+	EXPAND_COUNTER(srch_compact_dirty_block)		\
+	EXPAND_COUNTER(srch_compact_entry)			\
+	EXPAND_COUNTER(srch_compact_flush)			\
+	EXPAND_COUNTER(srch_compact_log_page)			\
+	EXPAND_COUNTER(srch_compact_removed_entry)		\
+	EXPAND_COUNTER(srch_inconsistent_ref)			\
+	EXPAND_COUNTER(srch_rotate_log)				\
+	EXPAND_COUNTER(srch_search_log)				\
+	EXPAND_COUNTER(srch_search_log_block)			\
+	EXPAND_COUNTER(srch_search_retry_empty)			\
+	EXPAND_COUNTER(srch_search_sorted)			\
+	EXPAND_COUNTER(srch_search_sorted_block)		\
+	EXPAND_COUNTER(srch_search_stale_eio)			\
+	EXPAND_COUNTER(srch_search_stale_retry)			\
+	EXPAND_COUNTER(srch_search_xattrs)			\
+	EXPAND_COUNTER(srch_read_stale)				\
+	EXPAND_COUNTER(statfs)					\
 	EXPAND_COUNTER(trans_commit_data_alloc_low)		\
+	EXPAND_COUNTER(trans_commit_dirty_meta_full)		\
 	EXPAND_COUNTER(trans_commit_fsync)			\
-	EXPAND_COUNTER(trans_commit_full)			\
+	EXPAND_COUNTER(trans_commit_meta_alloc_low)		\
 	EXPAND_COUNTER(trans_commit_sync_fs)			\
-	EXPAND_COUNTER(trans_commit_timer)
+	EXPAND_COUNTER(trans_commit_timer)			\
+	EXPAND_COUNTER(trans_commit_written)

-#define FIRST_COUNTER	block_cache_access
-#define LAST_COUNTER	trans_commit_timer
+#define FIRST_COUNTER	alloc_alloc_data
+#define LAST_COUNTER	trans_commit_written

 #undef EXPAND_COUNTER
 #define EXPAND_COUNTER(which) struct percpu_counter which;
@@ -113,11 +199,21 @@ struct scoutfs_counters {
 	     pcpu <= &SCOUTFS_SB(sb)->counters->LAST_COUNTER;	\
 	     pcpu++)

-#define scoutfs_inc_counter(sb, which) \
-	percpu_counter_inc(&SCOUTFS_SB(sb)->counters->which)
+/*
+ * We always read with _sum, we have no use for the shared count and
+ * certainly don't want to pay the cost of a shared lock to update it.
+ * The default batch of 32 make counter increments show up significantly
+ * in profiles.
+ */
+#define SCOUTFS_PCPU_COUNTER_BATCH (1 << 30)

-#define scoutfs_add_counter(sb, which, cnt) \
-	percpu_counter_add(&SCOUTFS_SB(sb)->counters->which, cnt)
+#define scoutfs_inc_counter(sb, which)					\
+	__percpu_counter_add(&SCOUTFS_SB(sb)->counters->which, 1,	\
+			     SCOUTFS_PCPU_COUNTER_BATCH)
+
+#define scoutfs_add_counter(sb, which, cnt)				\
+	__percpu_counter_add(&SCOUTFS_SB(sb)->counters->which, cnt,	\
+			     SCOUTFS_PCPU_COUNTER_BATCH)

 void __init scoutfs_init_counters(void);
 int scoutfs_setup_counters(struct super_block *sb);
--- a/kmod/src/data.c
+++ b/kmod/src/data.c
--- a/kmod/src/data.h
+++ b/kmod/src/data.h
@@ -47,7 +47,7 @@ struct scoutfs_traced_extent {

 extern const struct address_space_operations scoutfs_file_aops;
 extern const struct file_operations scoutfs_file_fops;
-struct scoutfs_radix_allocator;
+struct scoutfs_alloc;
 struct scoutfs_block_writer;

 int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
@@ -58,6 +58,8 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len);
 int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
 				     struct scoutfs_lock *lock);
+int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
+			     u64 byte_len, struct inode *to, u64 to_off);

 int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
 			    u8 sef, u8 op, struct scoutfs_data_wait *ow,
@@ -77,11 +79,12 @@ int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock,
 			 unsigned int nr);

 void scoutfs_data_init_btrees(struct super_block *sb,
-			      struct scoutfs_radix_allocator *alloc,
+			      struct scoutfs_alloc *alloc,
 			      struct scoutfs_block_writer *wri,
 			      struct scoutfs_log_trees *lt);
 void scoutfs_data_get_btrees(struct super_block *sb,
 			     struct scoutfs_log_trees *lt);
+int scoutfs_data_prepare_commit(struct super_block *sb);
 u64 scoutfs_data_alloc_free_bytes(struct super_block *sb);

 int scoutfs_data_setup(struct super_block *sb);
--- a/kmod/src/dir.c
+++ b/kmod/src/dir.c
@@ -13,7 +13,6 @@
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
-#include <linux/crc32c.h>
 #include <linux/uio.h>
 #include <linux/xattr.h>
 #include <linux/namei.h>
@@ -28,9 +27,9 @@
 #include "super.h"
 #include "trans.h"
 #include "xattr.h"
-#include "kvec.h"
-#include "forest.h"
+#include "item.h"
 #include "lock.h"
+#include "hash.h"
 #include "counters.h"
 #include "scoutfs_trace.h"

@@ -79,7 +78,7 @@ static unsigned int mode_to_type(umode_t mode)
 #undef S_SHIFT
 }

-static unsigned int dentry_type(unsigned int type)
+static unsigned int dentry_type(enum scoutfs_dentry_type type)
 {
 	static unsigned char types[] = {
 		[SCOUTFS_DT_FIFO]	= DT_FIFO,
@@ -213,12 +212,44 @@ static struct scoutfs_dirent *alloc_dirent(unsigned int name_len)
 	return kmalloc(dirent_bytes(name_len), GFP_NOFS);
 }

+/*
+ * Test a bit number as though an array of bytes is a large len-bit
+ * big-endian value.  nr 0 is the LSB of the final byte, nr (len - 1) is
+ * the MSB of the first byte.
+ */
+static int test_be_bytes_bit(int nr, const char *bytes, int len)
+{
+	return bytes[(len - 1 - nr) >> 3] & (1 << (nr & 7));
+}
+
+/*
+ * Generate a 32bit "fingerprint" of the name by extracting 32 evenly
+ * distributed bits from the name.  The intent is to have the sort order
+ * of the fingerprints reflect the memcmp() sort order of the names
+ * while mapping large names down to small fs keys.
+ *
+ * Names that are smaller than 32bits are biased towards the high bits
+ * of the fingerprint so that most significant bits of the fingerprints
+ * consistently reflect the initial characters of the names.
+ */
+static u32 dirent_name_fingerprint(const char *name, unsigned int name_len)
+{
+	int name_bits = name_len * 8;
+	int skip = max(name_bits / 32, 1);
+	u32 fp = 0;
+	int f;
+	int n;
+
+	for (f = 31, n = name_bits - 1; f >= 0 && n >= 0; f--, n -= skip)
+		fp |= !!test_be_bytes_bit(n, name, name_bits) << f;
+
+	return fp;
+}
+
 static u64 dirent_name_hash(const char *name, unsigned int name_len)
 {
-       unsigned int half = (name_len + 1) / 2;
-
-       return crc32c(~0, name, half) |
-              ((u64)crc32c(~0, name + name_len - half, half) << 32);
+       return scoutfs_hash32(name, name_len) |
+              ((u64)dirent_name_fingerprint(name, name_len) << 32);
 }

 static u64 dirent_names_equal(const char *a_name, unsigned int a_len,
@@ -239,7 +270,6 @@ static int lookup_dirent(struct super_block *sb, u64 dir_ino, const char *name,
 	struct scoutfs_key last_key;
 	struct scoutfs_key key;
 	struct scoutfs_dirent *dent = NULL;
-	struct kvec val;
 	int ret;

 	dent = alloc_dirent(SCOUTFS_NAME_LEN);
@@ -250,10 +280,10 @@ static int lookup_dirent(struct super_block *sb, u64 dir_ino, const char *name,

 	init_dirent_key(&key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, 0);
 	init_dirent_key(&last_key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, U64_MAX);
-	kvec_init(&val, dent, dirent_bytes(SCOUTFS_NAME_LEN));

 	for (;;) {
-		ret = scoutfs_forest_next(sb, &key, &last_key, &val, lock);
+		ret = scoutfs_item_next(sb, &key, &last_key, dent,
+					dirent_bytes(SCOUTFS_NAME_LEN), lock);
 		if (ret < 0)
 			break;

@@ -433,7 +463,18 @@ out:
 	else
 		inode = scoutfs_iget(sb, ino);

-	return d_splice_alias(inode, dentry);
+	/*
+	 * We can't splice dir aliases into the dcache.  dir entries
+	 * might have changed on other nodes so our dcache could still
+	 * contain them, rather than having been moved in rename.  For
+	 * dirs, we use d_materialize_unique to remove any existing
+	 * aliases which must be stale.  Our inode numbers aren't reused
+	 * so inodes pointed to by entries can't change types.
+	 */
+	if (!IS_ERR_OR_NULL(inode) && S_ISDIR(inode->i_mode))
+		return d_materialise_unique(dentry, inode);
+	else
+		return d_splice_alias(inode, dentry);
 }

 /*
@@ -452,7 +493,6 @@ static int KC_DECLARE_READDIR(scoutfs_readdir, struct file *file,
 	struct scoutfs_key key;
 	struct scoutfs_key last_key;
 	struct scoutfs_lock *dir_lock;
-	struct kvec val;
 	int name_len;
 	u64 pos;
 	int ret;
@@ -468,7 +508,6 @@ static int KC_DECLARE_READDIR(scoutfs_readdir, struct file *file,

 	init_dirent_key(&last_key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode),
 			SCOUTFS_DIRENT_LAST_POS, 0);
-	kvec_init(&val, dent, dirent_bytes(SCOUTFS_NAME_LEN));

 	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &dir_lock);
 	if (ret)
@@ -478,7 +517,9 @@ static int KC_DECLARE_READDIR(scoutfs_readdir, struct file *file,
 		init_dirent_key(&key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode),
 				kc_readdir_pos(file, ctx), 0);

-		ret = scoutfs_forest_next(sb, &key, &last_key, &val, dir_lock);
+		ret = scoutfs_item_next(sb, &key, &last_key, dent,
+					dirent_bytes(SCOUTFS_NAME_LEN),
+					dir_lock);
 		if (ret < 0) {
 			if (ret == -ENOENT)
 				ret = 0;
@@ -535,7 +576,6 @@ static int add_entry_items(struct super_block *sb, u64 dir_ino, u64 hash,
 	struct scoutfs_dirent *dent;
 	bool del_ent = false;
 	bool del_rdir = false;
-	struct kvec val;
 	int ret;

 	dent = alloc_dirent(name_len);
@@ -554,25 +594,27 @@ static int add_entry_items(struct super_block *sb, u64 dir_ino, u64 hash,
 	init_dirent_key(&ent_key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, pos);
 	init_dirent_key(&rdir_key, SCOUTFS_READDIR_TYPE, dir_ino, pos, 0);
 	init_dirent_key(&lb_key, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, pos);
-	kvec_init(&val, dent, dirent_bytes(name_len));

-	ret = scoutfs_forest_create(sb, &ent_key, &val, dir_lock);
+	ret = scoutfs_item_create(sb, &ent_key, dent, dirent_bytes(name_len),
+				  dir_lock);
 	if (ret)
 		goto out;
 	del_ent = true;

-	ret = scoutfs_forest_create(sb, &rdir_key, &val, dir_lock);
+	ret = scoutfs_item_create(sb, &rdir_key, dent, dirent_bytes(name_len),
+				  dir_lock);
 	if (ret)
 		goto out;
 	del_rdir = true;

-	ret = scoutfs_forest_create(sb, &lb_key, &val, inode_lock);
+	ret = scoutfs_item_create(sb, &lb_key, dent, dirent_bytes(name_len),
+				  inode_lock);
 out:
 	if (ret < 0) {
 		if (del_ent)
-			scoutfs_forest_delete_dirty(sb, &ent_key);
+			scoutfs_item_delete(sb, &ent_key, dir_lock);
 		if (del_rdir)
-			scoutfs_forest_delete_dirty(sb, &rdir_key);
+			scoutfs_item_delete(sb, &rdir_key, dir_lock);
 	}

 	kfree(dent);
@@ -594,23 +636,20 @@ static int del_entry_items(struct super_block *sb, u64 dir_ino, u64 hash,
 	struct scoutfs_key rdir_key;
 	struct scoutfs_key ent_key;
 	struct scoutfs_key lb_key;
-	LIST_HEAD(dir_saved);
-	LIST_HEAD(inode_saved);
 	int ret;

 	init_dirent_key(&ent_key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, pos);
 	init_dirent_key(&rdir_key, SCOUTFS_READDIR_TYPE, dir_ino, pos, 0);
 	init_dirent_key(&lb_key, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, pos);

-	ret = scoutfs_forest_delete_save(sb, &ent_key, &dir_saved, dir_lock) ?:
-	      scoutfs_forest_delete_save(sb, &rdir_key, &dir_saved, dir_lock) ?:
-	      scoutfs_forest_delete_save(sb, &lb_key, &inode_saved, inode_lock);
-	if (ret < 0) {
-		scoutfs_forest_restore(sb, &dir_saved, dir_lock);
-		scoutfs_forest_restore(sb, &inode_saved, inode_lock);
-	} else {
-		scoutfs_forest_free_batch(sb, &dir_saved);
-		scoutfs_forest_free_batch(sb, &inode_saved);
+	ret = scoutfs_item_dirty(sb, &ent_key, dir_lock) ?:
+	      scoutfs_item_dirty(sb, &rdir_key, dir_lock) ?:
+	      scoutfs_item_dirty(sb, &lb_key, inode_lock);
+	if (ret == 0) {
+		ret = scoutfs_item_delete(sb, &ent_key, dir_lock) ?:
+		      scoutfs_item_delete(sb, &rdir_key, dir_lock) ?:
+		      scoutfs_item_delete(sb, &lb_key, inode_lock);
+		BUG_ON(ret); /* _dirty should have guaranteed success */
 	}

 	return ret;
@@ -627,7 +666,6 @@ static int del_entry_items(struct super_block *sb, u64 dir_ino, u64 hash,
 */
 static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
 				      umode_t mode, dev_t rdev,
-				      const struct scoutfs_item_count cnt,
 				      struct scoutfs_lock **dir_lock,
 				      struct scoutfs_lock **inode_lock,
 				      struct list_head *ind_locks)
@@ -642,7 +680,7 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
 	if (ret)
 		return ERR_PTR(ret);

-	ret = scoutfs_alloc_ino(dir, &ino);
+	ret = scoutfs_alloc_ino(sb, S_ISDIR(mode), &ino);
 	if (ret)
 		return ERR_PTR(ret);

@@ -666,7 +704,7 @@ retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, ind_locks, dir, true) ?:
 	      scoutfs_inode_index_prepare_ino(sb, ind_locks, ino, mode) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq, cnt);
+	      scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -713,7 +751,6 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,

 	hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
 	inode = lock_hold_create(dir, dentry, mode, rdev,
-				 SIC_MKNOD(dentry->d_name.len),
 				 &dir_lock, &inode_lock, &ind_locks);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -808,8 +845,7 @@ retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
-						SIC_LINK(dentry->d_name.len));
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -890,8 +926,7 @@ retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
-						SIC_UNLINK(dentry->d_name.len));
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -960,17 +995,16 @@ static void init_symlink_key(struct scoutfs_key *key, u64 ino, u8 nr)
 * The target name can be null for deletion when val isn't used.  Size
 * still has to be provided to determine the number of items.
 */
-enum {
+enum symlink_ops {
 	SYM_CREATE = 0,
 	SYM_LOOKUP,
 	SYM_DELETE,
 };
-static int symlink_item_ops(struct super_block *sb, int op, u64 ino,
+static int symlink_item_ops(struct super_block *sb, enum symlink_ops op, u64 ino,
 			    struct scoutfs_lock *lock, const char *target,
 			    size_t size)
 {
 	struct scoutfs_key key;
-	struct kvec val;
 	unsigned bytes;
 	unsigned nr;
 	int ret;
@@ -985,14 +1019,16 @@ static int symlink_item_ops(struct super_block *sb, int op, u64 ino,

 		init_symlink_key(&key, ino, i);
 		bytes = min_t(u64, size, SCOUTFS_MAX_VAL_SIZE);
-		kvec_init(&val, (void *)target, bytes);

 		if (op == SYM_CREATE)
-			ret = scoutfs_forest_create(sb, &key, &val, lock);
+			ret = scoutfs_item_create(sb, &key, (void *)target,
+						  bytes, lock);
 		else if (op == SYM_LOOKUP)
-			ret = scoutfs_forest_lookup_exact(sb, &key, &val, lock);
+			ret = scoutfs_item_lookup_exact(sb, &key,
+						        (void *)target, bytes,
+							lock);
 		else if (op == SYM_DELETE)
-			ret = scoutfs_forest_delete(sb, &key, lock);
+			ret = scoutfs_item_delete(sb, &key, lock);
 		if (ret)
 			break;

@@ -1125,7 +1161,6 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
 		return ret;

 	inode = lock_hold_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0,
-				 SIC_SYMLINK(dentry->d_name.len, name_len),
 				 &dir_lock, &inode_lock, &ind_locks);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -1207,7 +1242,6 @@ int scoutfs_dir_add_next_linkref(struct super_block *sb, u64 ino,
 	struct scoutfs_key last_key;
 	struct scoutfs_key key;
 	struct scoutfs_lock *lock = NULL;
-	struct kvec val;
 	int len;
 	int ret;

@@ -1223,13 +1257,13 @@ int scoutfs_dir_add_next_linkref(struct super_block *sb, u64 ino,
 	init_dirent_key(&key, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, dir_pos);
 	init_dirent_key(&last_key, SCOUTFS_LINK_BACKREF_TYPE, ino, U64_MAX,
 			U64_MAX);
-	kvec_init(&val, &ent->dent, dirent_bytes(SCOUTFS_NAME_LEN));

 	ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, ino, &lock);
 	if (ret)
 		goto out;

-	ret = scoutfs_forest_next(sb, &key, &last_key, &val, lock);
+	ret = scoutfs_item_next(sb, &key, &last_key, &ent->dent,
+				dirent_bytes(SCOUTFS_NAME_LEN), lock);
 	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
 	lock = NULL;
 	if (ret < 0)
@@ -1558,9 +1592,7 @@ retry:
 	       scoutfs_inode_index_prepare(sb, &ind_locks, new_dir, false)) ?:
 	      (new_inode == NULL ? 0 :
 	       scoutfs_inode_index_prepare(sb, &ind_locks, new_inode, false)) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
-					    SIC_RENAME(old_dentry->d_name.len,
-						       new_dentry->d_name.len));
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
--- a/kmod/src/ext.c
+++ b/kmod/src/ext.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (C) 2020 Versity Software, Inc.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+
+#include "ext.h"
+#include "counters.h"
+#include "scoutfs_trace.h"
+
+/*
+ * Extents are used to track free block regions and to map logical file
+ * regions to device blocks.   Extents can be split and merged as
+ * they're modified.  These helpers implement all the fiddly extent
+ * manipulations.  Callers provide callbacks which implement the actual
+ * storage of extents in either the item cache or btree items.
+ */
+
+static void ext_zero(struct scoutfs_extent *ext)
+{
+	memset(ext, 0, sizeof(struct scoutfs_extent));
+}
+
+static bool ext_overlap(struct scoutfs_extent *ext, u64 start, u64 len)
+{
+	u64 e_end = ext->start + ext->len - 1;
+	u64 end = start + len - 1;
+
+	return !(e_end < start || ext->start > end);
+}
+
+static bool ext_inside(u64 start, u64 len, struct scoutfs_extent *out)
+{
+	u64 in_end = start + len - 1;
+	u64 out_end = out->start + out->len - 1;
+
+	return out->start <= start && out_end >= in_end;
+}
+
+/* we only translate mappings when they exist */
+static inline u64 ext_map_add(u64 map, u64 diff)
+{
+	return map ? map + diff : 0;
+}
+
+/*
+ * Extents can merge if they're logically contiguous, both don't have
+ * mappings or have mappings which are also contiguous, and have
+ * matching flags.
+ */
+bool scoutfs_ext_can_merge(struct scoutfs_extent *left,
+			   struct scoutfs_extent *right)
+{
+	return (left->start + left->len == right->start) &&
+	       ((!left->map && !right->map) ||
+		(left->map + left->len == right->map)) &&
+	       (left->flags == right->flags);
+}
+
+/*
+ * Split an existing extent in to left and right extents by removing
+ * an interior range.  The split extents are all zeros if the range
+ * extends to their end of the extent.
+ */
+static void ext_split(struct scoutfs_extent *ext, u64 start, u64 len,
+		      struct scoutfs_extent *left,
+		      struct scoutfs_extent *right)
+{
+	if (ext->start < start) {
+		left->start = ext->start;
+		left->len = start - ext->start;
+		left->map = ext->map;
+		left->flags = ext->flags;
+	} else {
+		ext_zero(left);
+	}
+
+	if (ext->start + ext->len > start + len) {
+		right->start = start + len;
+		right->len = ext->start + ext->len - right->start;
+		right->map = ext_map_add(ext->map, right->start - ext->start);
+		right->flags = ext->flags;
+	} else {
+		ext_zero(right);
+	}
+}
+
+#define op_call(sb, ops, arg, which, args...)			\
+({								\
+	int _ret;						\
+	_ret = ops->which(sb, arg, ##args);			\
+	scoutfs_inc_counter(sb, ext_op_##which);		\
+	trace_scoutfs_ext_op_##which(sb, ##args, _ret);		\
+	_ret;							\
+})
+
+struct extent_changes {
+	struct scoutfs_extent exts[4];
+	bool ins[4];
+	u8 nr;
+};
+
+static void add_change(struct extent_changes *chg,
+		       struct scoutfs_extent *ext, bool ins)
+{
+	BUILD_BUG_ON(ARRAY_SIZE(chg->ins) != ARRAY_SIZE(chg->exts));
+
+	if (ext->len) {
+		BUG_ON(chg->nr == ARRAY_SIZE(chg->exts));
+		chg->exts[chg->nr] = *ext;
+		chg->ins[chg->nr] = !!ins;
+		chg->nr++;
+	}
+}
+
+static int apply_changes(struct super_block *sb, struct scoutfs_ext_ops *ops,
+			 void *arg, struct extent_changes *chg)
+{
+	int ret = 0;
+	int err;
+	int i;
+
+	for (i = 0; i < chg->nr; i++) {
+		if (chg->ins[i])
+			ret = op_call(sb, ops, arg, insert, chg->exts[i].start,
+				      chg->exts[i].len, chg->exts[i].map,
+				      chg->exts[i].flags);
+		else
+			ret = op_call(sb, ops, arg, remove, chg->exts[i].start,
+				      chg->exts[i].len, chg->exts[i].map,
+				      chg->exts[i].flags);
+		if (ret < 0)
+			break;
+	}
+
+	while (ret < 0 && --i >= 0) {
+		if (chg->ins[i])
+			err = op_call(sb, ops, arg, remove, chg->exts[i].start,
+				      chg->exts[i].len, chg->exts[i].map,
+				      chg->exts[i].flags);
+		else
+			err = op_call(sb, ops, arg, insert, chg->exts[i].start,
+				      chg->exts[i].len, chg->exts[i].map,
+				      chg->exts[i].flags);
+		BUG_ON(err); /* inconsistent */
+	}
+
+	return ret;
+}
+
+int scoutfs_ext_next(struct super_block *sb, struct scoutfs_ext_ops *ops,
+		     void *arg, u64 start, u64 len, struct scoutfs_extent *ext)
+{
+	int ret;
+
+	ret = op_call(sb, ops, arg, next, start, len, ext);
+	trace_scoutfs_ext_next(sb, start, len, ext, ret);
+	return ret;
+}
+
+/*
+ * Insert the given extent.  EINVAL is returned if there's already an existing
+ * overlapping extent.  This can merge with its neighbours.
+ */
+int scoutfs_ext_insert(struct super_block *sb, struct scoutfs_ext_ops *ops,
+		       void *arg, u64 start, u64 len, u64 map, u8 flags)
+{
+	struct extent_changes chg = { .nr = 0 };
+	struct scoutfs_extent found;
+	struct scoutfs_extent ins;
+	int ret;
+
+	ins.start = start;
+	ins.len = len;
+	ins.map = map;
+	ins.flags = flags;
+
+	/* find right neighbour and check for overlap */
+	ret = op_call(sb, ops, arg, next, start, 1, &found);
+	if (ret < 0 && ret != -ENOENT)
+		goto out;
+
+	/* inserting extent must not overlap */
+	if (found.len && ext_overlap(&ins, found.start, found.len)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* merge with right if we can */
+	if (found.len && scoutfs_ext_can_merge(&ins, &found)) {
+		ins.len += found.len;
+		add_change(&chg, &found, false);
+	}
+
+	/* see if we can merge with a left neighbour */
+	if (start > 0) {
+		ret = op_call(sb, ops, arg, next, start - 1,  1, &found);
+		if (ret < 0 && ret != -ENOENT)
+			goto out;
+
+		if (ret == 0 && scoutfs_ext_can_merge(&found, &ins)) {
+			ins.start = found.start;
+			ins.map = found.map;
+			ins.len += found.len;
+			add_change(&chg, &found, false);
+		}
+	}
+
+	add_change(&chg, &ins, true);
+	ret = apply_changes(sb, ops, arg, &chg);
+out:
+	trace_scoutfs_ext_insert(sb, start, len, map, flags, ret);
+	return ret;
+}
+
+/*
+ * Remove the given extent.  The extent to remove must be found entirely
+ * in an existing extent.  If the existing extent is larger then we leave
+ * behind the remaining extent.  The existing extent can be split.
+ */
+int scoutfs_ext_remove(struct super_block *sb, struct scoutfs_ext_ops *ops,
+		       void *arg, u64 start, u64 len)
+{
+	struct extent_changes chg = { .nr = 0 };
+	struct scoutfs_extent found;
+	struct scoutfs_extent left;
+	struct scoutfs_extent right;
+	int ret;
+
+	ret = op_call(sb, ops, arg, next, start, 1, &found);
+	if (ret < 0)
+		goto out;
+
+	/* removed extent must be entirely within found */
+	if (!ext_inside(start, len, &found)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ext_split(&found, start, len, &left, &right);
+
+	add_change(&chg, &found, false);
+	add_change(&chg, &left, true);
+	add_change(&chg, &right, true);
+
+	ret = apply_changes(sb, ops, arg, &chg);
+out:
+	trace_scoutfs_ext_remove(sb, start, len, 0, 0, ret);
+	return ret;
+}
+
+/*
+ * Find and remove the next extent, removing only a portion if the
+ * extent is larger than the count.  Returns ENOENT if it didn't
+ * find any extents.
+ *
+ * This does not search for merge candidates so it's safe to call with
+ * extents indexed by length.
+ */
+int scoutfs_ext_alloc(struct super_block *sb, struct scoutfs_ext_ops *ops,
+		      void *arg, u64 start, u64 len, u64 count,
+		      struct scoutfs_extent *ext)
+{
+	struct extent_changes chg = { .nr = 0 };
+	struct scoutfs_extent found;
+	struct scoutfs_extent ins;
+	int ret;
+
+	ret = op_call(sb, ops, arg, next, start, len, &found);
+	if (ret < 0)
+		goto out;
+
+	add_change(&chg, &found, false);
+
+	if (found.len > count) {
+		ins.start = found.start + count;
+		ins.len = found.len - count;
+		ins.map = ext_map_add(found.map, count);
+		ins.flags = found.flags;
+
+		add_change(&chg, &ins, true);
+	}
+
+	ret = apply_changes(sb, ops, arg, &chg);
+out:
+	if (ret == 0) {
+		ext->start = found.start;
+		ext->len = min(found.len, count);
+		ext->map = found.map;
+		ext->flags = found.flags;
+	} else {
+		ext_zero(ext);
+	}
+
+	trace_scoutfs_ext_alloc(sb, start, len, count, ext, ret);
+	return ret;
+}
+
+/*
+ * Set the map and flags for an extent region, with the magical property
+ * that extents with map and flags set to 0 are removed.
+ *
+ * If we're modifying an existing extent then the modification must be
+ * fully inside the existing extent.  The modification can leave edges
+ * of the extent which need to be inserted.  If the modification extends
+ * to the end of the existing extent then we need to check for adjacent
+ * neighbouring extents which might now be able to be merged.
+ *
+ * Inserting a new extent is like the case of modifying the entire
+ * existing extent.  We need to check neighbours of the inserted extent
+ * to see if they can be merged.
+ */
+int scoutfs_ext_set(struct super_block *sb, struct scoutfs_ext_ops *ops,
+		    void *arg, u64 start, u64 len, u64 map, u8 flags)
+{
+	struct extent_changes chg = { .nr = 0 };
+	struct scoutfs_extent found;
+	struct scoutfs_extent left;
+	struct scoutfs_extent right;
+	struct scoutfs_extent set;
+	int ret;
+
+	set.start = start;
+	set.len = len;
+	set.map = map;
+	set.flags = flags;
+
+	/* find extent to remove */
+	ret = op_call(sb, ops, arg, next, start, 1, &found);
+	if (ret < 0 && ret != -ENOENT)
+		goto out;
+
+	if (ret == 0 && ext_overlap(&found, start, len)) {
+		/* set extent must be entirely within found */
+		if (!ext_inside(start, len, &found)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		add_change(&chg, &found, false);
+		ext_split(&found, start, len, &left, &right);
+	} else {
+		ext_zero(&found);
+		ext_zero(&left);
+		ext_zero(&right);
+	}
+
+	if (left.len) {
+		/* inserting split left, won't merge */
+		add_change(&chg, &left, true);
+	} else if (start > 0) {
+		ret = op_call(sb, ops, arg, next, start - 1, 1, &left);
+		if (ret < 0 && ret != -ENOENT)
+			goto out;
+		else if (ret == 0 && scoutfs_ext_can_merge(&left, &set)) {
+			/* remove found left, merging */
+			set.start = left.start;
+			set.map = left.map;
+			set.len += left.len;
+			add_change(&chg, &left, false);
+		}
+	}
+
+	if (right.len) {
+		/* inserting split right, won't merge */
+		add_change(&chg, &right, true);
+	} else {
+		ret = op_call(sb, ops, arg, next, start + len, 1, &right);
+		if (ret < 0 && ret != -ENOENT)
+			goto out;
+		else if (ret == 0 && scoutfs_ext_can_merge(&set, &right)) {
+			/* remove found right, merging */
+			set.len += right.len;
+			add_change(&chg, &right, false);
+		}
+	}
+
+	if (set.flags || set.map)
+		add_change(&chg, &set, true);
+
+	ret = apply_changes(sb, ops, arg, &chg);
+out:
+	trace_scoutfs_ext_set(sb, start, len, map, flags, ret);
+	return ret;
+}
--- a/kmod/src/ext.h
+++ b/kmod/src/ext.h
@@ -0,0 +1,35 @@
+#ifndef _SCOUTFS_EXT_H_
+#define _SCOUTFS_EXT_H_
+
+struct scoutfs_extent {
+	u64 start;
+	u64 len;
+	u64 map;
+	u8 flags;
+};
+
+struct scoutfs_ext_ops {
+	int (*next)(struct super_block *sb, void *arg,
+		    u64 start, u64 len, struct scoutfs_extent *ext);
+	int (*insert)(struct super_block *sb, void *arg,
+		      u64 start, u64 len, u64 map, u8 flags);
+	int (*remove)(struct super_block *sb, void *arg, u64 start, u64 len,
+		      u64 map, u8 flags);
+};
+
+bool scoutfs_ext_can_merge(struct scoutfs_extent *left,
+			   struct scoutfs_extent *right);
+
+int scoutfs_ext_next(struct super_block *sb, struct scoutfs_ext_ops *ops,
+		     void *arg, u64 start, u64 len, struct scoutfs_extent *ext);
+int scoutfs_ext_insert(struct super_block *sb, struct scoutfs_ext_ops *ops,
+		       void *arg, u64 start, u64 len, u64 map, u8 flags);
+int scoutfs_ext_remove(struct super_block *sb, struct scoutfs_ext_ops *ops,
+		       void *arg, u64 start, u64 len);
+int scoutfs_ext_alloc(struct super_block *sb, struct scoutfs_ext_ops *ops,
+		      void *arg, u64 start, u64 len, u64 limit,
+		      struct scoutfs_extent *ext);
+int scoutfs_ext_set(struct super_block *sb, struct scoutfs_ext_ops *ops,
+		    void *arg, u64 start, u64 len, u64 map, u8 flags);
+
+#endif
--- a/kmod/src/forest.c
+++ b/kmod/src/forest.c
--- a/kmod/src/forest.h
+++ b/kmod/src/forest.h
@@ -1,54 +1,43 @@
 #ifndef _SCOUTFS_FOREST_H_
 #define _SCOUTFS_FOREST_H_

-struct scoutfs_radix_allocator;
+struct scoutfs_alloc;
 struct scoutfs_block_writer;
+struct scoutfs_block;
+
+#include "btree.h"
+
+/* caller gives an item to the callback */
+typedef int (*scoutfs_forest_item_cb)(struct super_block *sb,
+				      struct scoutfs_key *key,
+				      struct scoutfs_log_item_value *liv,
+				      void *val, int val_len, void *arg);

-int scoutfs_forest_lookup(struct super_block *sb, struct scoutfs_key *key,
-			  struct kvec *val, struct scoutfs_lock *lock);
-int scoutfs_forest_lookup_exact(struct super_block *sb,
-				struct scoutfs_key *key, struct kvec *val,
-				struct scoutfs_lock *lock);
-int scoutfs_forest_next(struct super_block *sb, struct scoutfs_key *key,
-			struct scoutfs_key *last, struct kvec *val,
-			struct scoutfs_lock *lock);
 int scoutfs_forest_next_hint(struct super_block *sb, struct scoutfs_key *key,
 			     struct scoutfs_key *next);
-int scoutfs_forest_prev(struct super_block *sb, struct scoutfs_key *key,
-			struct scoutfs_key *first, struct kvec *val,
-			struct scoutfs_lock *lock);
-int scoutfs_forest_create(struct super_block *sb, struct scoutfs_key *key,
-			  struct kvec *val, struct scoutfs_lock *lock);
-int scoutfs_forest_create_force(struct super_block *sb,
-				struct scoutfs_key *key, struct kvec *val,
-				struct scoutfs_lock *lock);
-int scoutfs_forest_update(struct super_block *sb, struct scoutfs_key *key,
-			  struct kvec *val, struct scoutfs_lock *lock);
-int scoutfs_forest_delete_dirty(struct super_block *sb,
-			        struct scoutfs_key *key);
-int scoutfs_forest_delete(struct super_block *sb, struct scoutfs_key *key,
-			  struct scoutfs_lock *lock);
-int scoutfs_forest_delete_force(struct super_block *sb,
-				struct scoutfs_key *key,
-				struct scoutfs_lock *lock);
-int scoutfs_forest_delete_save(struct super_block *sb,
-			       struct scoutfs_key *key,
-			       struct list_head *list,
-			       struct scoutfs_lock *lock);
-int scoutfs_forest_restore(struct super_block *sb, struct list_head *list,
-			   struct scoutfs_lock *lock);
-void scoutfs_forest_free_batch(struct super_block *sb, struct list_head *list);
+int scoutfs_forest_read_items(struct super_block *sb,
+			      struct scoutfs_lock *lock,
+			      struct scoutfs_key *key,
+			      struct scoutfs_key *start,
+			      struct scoutfs_key *end,
+			      scoutfs_forest_item_cb cb, void *arg);
+int scoutfs_forest_set_bloom_bits(struct super_block *sb,
+				  struct scoutfs_lock *lock);
+void scoutfs_forest_set_max_vers(struct super_block *sb, u64 max_vers);
+int scoutfs_forest_get_max_vers(struct super_block *sb,
+				struct scoutfs_super_block *super,
+				u64 *vers);
+int scoutfs_forest_insert_list(struct super_block *sb,
+			       struct scoutfs_btree_item_list *lst);
+int scoutfs_forest_srch_add(struct super_block *sb, u64 hash, u64 ino, u64 id);

 void scoutfs_forest_init_btrees(struct super_block *sb,
-				struct scoutfs_radix_allocator *alloc,
+				struct scoutfs_alloc *alloc,
 				struct scoutfs_block_writer *wri,
 				struct scoutfs_log_trees *lt);
 void scoutfs_forest_get_btrees(struct super_block *sb,
 			       struct scoutfs_log_trees *lt);

-void scoutfs_forest_clear_lock(struct super_block *sb,
-			       struct scoutfs_lock *lock);
-
 int scoutfs_forest_setup(struct super_block *sb);
 void scoutfs_forest_destroy(struct super_block *sb);

--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -1,6 +1,9 @@
 #ifndef _SCOUTFS_FORMAT_H_
 #define _SCOUTFS_FORMAT_H_

+#define SCOUTFS_INTEROP_VERSION		0ULL
+#define SCOUTFS_INTEROP_VERSION_STR	__stringify(0)
+
 /* statfs(2) f_type */
 #define SCOUTFS_SUPER_MAGIC	0x554f4353		/* "SCOU" */

@@ -8,27 +11,47 @@
 #define SCOUTFS_BLOCK_MAGIC_SUPER	0x103c428b
 #define SCOUTFS_BLOCK_MAGIC_BTREE	0xe597f96d
 #define SCOUTFS_BLOCK_MAGIC_BLOOM	0x31995604
-#define SCOUTFS_BLOCK_MAGIC_RADIX	0xebeb5e65
+#define SCOUTFS_BLOCK_MAGIC_SRCH_BLOCK	0x897e4a7d
+#define SCOUTFS_BLOCK_MAGIC_SRCH_PARENT	0xb23a2a05
+#define SCOUTFS_BLOCK_MAGIC_ALLOC_LIST	0x8a93ac83

 /*
- * The super block and btree blocks are fixed 4k.
+ * The super block, quorum block, and file data allocation granularity
+ * use the smaller 4KB block.
 */
-#define SCOUTFS_BLOCK_SHIFT 12
-#define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT)
-#define SCOUTFS_BLOCK_MASK (SCOUTFS_BLOCK_SIZE - 1)
-#define SCOUTFS_BLOCKS_PER_PAGE (PAGE_SIZE / SCOUTFS_BLOCK_SIZE)
-#define SCOUTFS_BLOCK_SECTOR_SHIFT (SCOUTFS_BLOCK_SHIFT - 9)
-#define SCOUTFS_BLOCK_SECTORS (1 << SCOUTFS_BLOCK_SECTOR_SHIFT)
-#define SCOUTFS_BLOCK_MAX (U64_MAX >> SCOUTFS_BLOCK_SHIFT)
+#define SCOUTFS_BLOCK_SM_SHIFT		12
+#define SCOUTFS_BLOCK_SM_SIZE		(1 << SCOUTFS_BLOCK_SM_SHIFT)
+#define SCOUTFS_BLOCK_SM_MASK		(SCOUTFS_BLOCK_SM_SIZE - 1)
+#define SCOUTFS_BLOCK_SM_PER_PAGE	(PAGE_SIZE / SCOUTFS_BLOCK_SM_SIZE)
+#define SCOUTFS_BLOCK_SM_SECTOR_SHIFT	(SCOUTFS_BLOCK_SM_SHIFT - 9)
+#define SCOUTFS_BLOCK_SM_SECTORS	(1 << SCOUTFS_BLOCK_SM_SECTOR_SHIFT)
+#define SCOUTFS_BLOCK_SM_MAX		(U64_MAX >> SCOUTFS_BLOCK_SM_SHIFT)
+#define SCOUTFS_BLOCK_SM_PAGES_PER	(SCOUTFS_BLOCK_SM_SIZE / PAGE_SIZE)
+#define SCOUTFS_BLOCK_SM_PAGE_ORDER	(SCOUTFS_BLOCK_SM_SHIFT - PAGE_SHIFT)
+
+/*
+ * The radix and btree structures, and the forest bloom block, use the
+ * larger 64KB metadata block size.
+ */
+#define SCOUTFS_BLOCK_LG_SHIFT		16
+#define SCOUTFS_BLOCK_LG_SIZE		(1 << SCOUTFS_BLOCK_LG_SHIFT)
+#define SCOUTFS_BLOCK_LG_MASK		(SCOUTFS_BLOCK_LG_SIZE - 1)
+#define SCOUTFS_BLOCK_LG_PER_PAGE	(PAGE_SIZE / SCOUTFS_BLOCK_LG_SIZE)
+#define SCOUTFS_BLOCK_LG_SECTOR_SHIFT	(SCOUTFS_BLOCK_LG_SHIFT - 9)
+#define SCOUTFS_BLOCK_LG_SECTORS	(1 << SCOUTFS_BLOCK_LG_SECTOR_SHIFT)
+#define SCOUTFS_BLOCK_LG_MAX		(U64_MAX >> SCOUTFS_BLOCK_LG_SHIFT)
+#define SCOUTFS_BLOCK_LG_PAGES_PER	(SCOUTFS_BLOCK_LG_SIZE / PAGE_SIZE)
+#define SCOUTFS_BLOCK_LG_PAGE_ORDER	(SCOUTFS_BLOCK_LG_SHIFT - PAGE_SHIFT)
+
+#define SCOUTFS_BLOCK_SM_LG_SHIFT	(SCOUTFS_BLOCK_LG_SHIFT - \
+					 SCOUTFS_BLOCK_SM_SHIFT)

-#define SCOUTFS_PAGES_PER_BLOCK (SCOUTFS_BLOCK_SIZE / PAGE_SIZE)
-#define SCOUTFS_BLOCK_PAGE_ORDER (SCOUTFS_BLOCK_SHIFT - PAGE_SHIFT)

 /*
 * The super block leaves some room before the first block for platform
 * structures like boot loaders.
 */
-#define SCOUTFS_SUPER_BLKNO ((64ULL * 1024) >> SCOUTFS_BLOCK_SHIFT)
+#define SCOUTFS_SUPER_BLKNO ((64ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)

 /*
 * A reasonably large region of aligned quorum blocks follow the super
@@ -38,8 +61,14 @@
 * mounts that have a reasonable probability of not overwriting each
 * other's random block locations.
 */
-#define SCOUTFS_QUORUM_BLKNO		((256ULL * 1024) >> SCOUTFS_BLOCK_SHIFT)
-#define SCOUTFS_QUORUM_BLOCKS		((256ULL * 1024) >> SCOUTFS_BLOCK_SHIFT)
+#define SCOUTFS_QUORUM_BLKNO	((256ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
+#define SCOUTFS_QUORUM_BLOCKS	((256ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
+
+/*
+ * Start data on the data device aligned as well.
+ */
+#define SCOUTFS_DATA_DEV_START_BLKNO ((256ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
+

 #define SCOUTFS_UNIQUE_NAME_MAX_BYTES	64 /* includes null */

@@ -49,18 +78,15 @@
 struct scoutfs_timespec {
 	__le64 sec;
 	__le32 nsec;
-} __packed;
-
-struct scoutfs_betimespec {
-	__be64 sec;
-	__be32 nsec;
-} __packed;
+	__u8 __pad[4];
+};

 /* XXX ipv6 */
 struct scoutfs_inet_addr {
 	__le32 addr;
 	__le16 port;
-} __packed;
+	__u8 __pad[2];
+};

 /*
 * This header is stored at the start of btree blocks and the super
@@ -73,7 +99,7 @@ struct scoutfs_block_header {
 	__le64 fsid;
 	__le64 seq;
 	__le64 blkno;
-} __packed;
+};

 /*
 * scoutfs identifies all file system metadata items by a small key
@@ -89,23 +115,19 @@ struct scoutfs_block_header {
 * increment them, subtract them from each other, etc.
 */
 struct scoutfs_key {
-	__u8	sk_zone;
 	__le64	_sk_first;
-	__u8	sk_type;
 	__le64	_sk_second;
 	__le64	_sk_third;
 	__u8	_sk_fourth;
-}__packed;
+	__u8	sk_zone;
+	__u8	sk_type;
+	__u8	__pad[5];
+};

 /* inode index */
 #define skii_major	_sk_second
 #define skii_ino	_sk_third

-/* xattr index */
-#define skxi_hash	_sk_first
-#define skxi_ino	_sk_second
-#define skxi_id		_sk_third
-
 /* node orphan inode */
 #define sko_rid		_sk_first
 #define sko_ino		_sk_second
@@ -128,85 +150,46 @@ struct scoutfs_key {
 #define sks_ino		_sk_first
 #define sks_nr		_sk_second

-/* packed extents */
-#define skpe_ino	_sk_first
-#define skpe_base	_sk_second
-#define skpe_part	_sk_fourth
+/* data extents */
+#define skdx_ino	_sk_first
+#define skdx_end	_sk_second
+#define skdx_len	_sk_third

-struct scoutfs_radix_block {
-	struct scoutfs_block_header hdr;
-	__le32 sm_first;
-	__le32 lg_first;
-	union {
-		struct scoutfs_radix_ref {
-			__le64 blkno;
-			__le64 seq;
-			__le64 sm_total;
-			__le64 lg_total;
-		} __packed refs[0];
-		__le64 bits[0];
-	} __packed;
-} __packed;
+/* log trees */
+#define sklt_rid	_sk_first
+#define sklt_nr		_sk_second

-struct scoutfs_radix_root {
+/* lock clients */
+#define sklc_rid	_sk_first
+
+/* seqs */
+#define skts_trans_seq	_sk_first
+#define skts_rid	_sk_second
+
+/* mounted clients */
+#define skmc_rid	_sk_first
+
+/* free extents by blkno */
+#define skfb_end	_sk_second
+#define skfb_len	_sk_third
+/* free extents by len */
+#define skfl_neglen	_sk_second
+#define skfl_blkno	_sk_third
+
+struct scoutfs_avl_root {
+	__le16 node;
+};
+
+struct scoutfs_avl_node {
+	__le16 parent;
+	__le16 left;
+	__le16 right;
 	__u8 height;
-	__le64 next_find_bit;
-	struct scoutfs_radix_ref ref;
-} __packed;
+	__u8 __pad[1];
+};

-#define SCOUTFS_RADIX_REFS \
-	((SCOUTFS_BLOCK_SIZE - offsetof(struct scoutfs_radix_block, refs[0])) /\
-		sizeof(struct scoutfs_radix_ref))
-
-/* 8 meg regions with 4k data blocks */
-#define SCOUTFS_RADIX_LG_SHIFT	11
-#define SCOUTFS_RADIX_LG_BITS	(1 << SCOUTFS_RADIX_LG_SHIFT)
-#define SCOUTFS_RADIX_LG_MASK	(SCOUTFS_RADIX_LG_BITS - 1)
-
-/* round block bits down to a multiple of large ranges */
-#define SCOUTFS_RADIX_BITS					\
-	(((SCOUTFS_BLOCK_SIZE -					\
-	   offsetof(struct scoutfs_radix_block, bits[0])) * 8) &	\
-	 ~(__u64)SCOUTFS_RADIX_LG_MASK)
-#define SCOUTFS_RADIX_BITS_BYTES (SCOUTFS_RADIX_BITS / 8)
-
-/*
- * The btree still uses memcmp() to compare keys.  We should fix that
- * before too long.
- */
-struct scoutfs_key_be {
-	__u8	sk_zone;
-	__be64	_sk_first;
-	__u8	sk_type;
-	__be64	_sk_second;
-	__be64	_sk_third;
-	__u8	_sk_fourth;
-}__packed;
-
-/* chose reasonable max key lens that have room for some u64s */
-#define SCOUTFS_BTREE_MAX_KEY_LEN 40
 /* when we split we want to have multiple items on each side */
-#define SCOUTFS_BTREE_MAX_VAL_LEN (SCOUTFS_BLOCK_SIZE / 8)
-
-/*
- * The min number of free bytes we must leave in a parent as we descend
- * to modify.  This leaves enough free bytes to insert a possibly maximal
- * sized key as a seperator for a child block.  Fewer bytes then this
- * and split/merge might try to insert a max child item in the parent
- * that wouldn't fit.
- */
-#define SCOUTFS_BTREE_PARENT_MIN_FREE_BYTES				\
-	(sizeof(struct scoutfs_btree_item_header) +			\
-	 sizeof(struct scoutfs_btree_item) + SCOUTFS_BTREE_MAX_KEY_LEN +\
-	 sizeof(struct scoutfs_btree_ref))
-
-/*
- * When debugging we can tune the splitting and merging thresholds to
- * create much larger trees by having blocks with many fewer items.  We
- * implement this by pretending the blocks are tiny.  They're still
- * large enough for a handful of items.
- */
-#define SCOUTFS_BTREE_TINY_BLOCK_SIZE	512
+#define SCOUTFS_BTREE_MAX_VAL_LEN 896

 /*
 * A 4EB test image measured a worst case height of 17.  This is plenty
@@ -217,7 +200,7 @@ struct scoutfs_key_be {
 struct scoutfs_btree_ref {
 	__le64 blkno;
 	__le64 seq;
-} __packed;
+};

 /*
 * A height of X means that the first block read will have level X-1 and
@@ -226,91 +209,230 @@ struct scoutfs_btree_ref {
 struct scoutfs_btree_root {
 	struct scoutfs_btree_ref ref;
 	__u8 height;
-} __packed;
-
-struct scoutfs_btree_item_header {
-	__le32 off;
-} __packed;
+	__u8 __pad[7];
+};

 struct scoutfs_btree_item {
-	__le16 key_len;
+	struct scoutfs_avl_node node;
+	struct scoutfs_key key;
+	__le16 val_off;
 	__le16 val_len;
-	__u8 data[0];
-} __packed;
+	__u8 __pad[4];
+};

 struct scoutfs_btree_block {
 	struct scoutfs_block_header hdr;
-	__le32 free_end;
-	__le32 nr_items;
+	struct scoutfs_avl_root item_root;
+	__le16 nr_items;
+	__le16 total_item_bytes;
+	__le16 mid_free_len;
 	__u8 level;
-	struct scoutfs_btree_item_header item_hdrs[0];
-} __packed;
+	__u8 __pad[7];
+	struct scoutfs_btree_item items[0];
+	/* leaf blocks have a fixed size item offset hash table at the end */
+};
+
+#define SCOUTFS_BTREE_VALUE_ALIGN 8

 /*
- * The lock server keeps a persistent record of connected clients so that
- * server failover knows who to wait for before resuming operations.
+ * Try to aim for a 75% load in a leaf full of items with no value.
+ * We'll almost never see this because most items have values and most
+ * blocks aren't full.
 */
-struct scoutfs_lock_client_btree_key {
-	__be64 rid;
-} __packed;
+#define SCOUTFS_BTREE_LEAF_ITEM_HASH_NR_UNALIGNED			  \
+	((SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_btree_block)) /	  \
+	 (sizeof(struct scoutfs_btree_item) + (sizeof(__le16))) * 100 / 75)
+#define SCOUTFS_BTREE_LEAF_ITEM_HASH_NR					  \
+	(round_up(SCOUTFS_BTREE_LEAF_ITEM_HASH_NR_UNALIGNED,		  \
+		  SCOUTFS_BTREE_VALUE_ALIGN))
+#define SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES \
+	(SCOUTFS_BTREE_LEAF_ITEM_HASH_NR * sizeof(__le16))
+
+struct scoutfs_alloc_list_ref {
+	__le64 blkno;
+	__le64 seq;
+};

 /*
- * The server tracks transaction sequence numbers that clients have
- * open.  This limits results that can be returned from the seq indices.
+ * first_nr tracks the nr of the first block in the list and is used for
+ * allocation sizing. total_nr is the sum of the nr of all the blocks in
+ * the list and is used for calculating total free block counts.
 */
-struct scoutfs_trans_seq_btree_key {
-	__be64 trans_seq;
-	__be64 rid;
-} __packed;
+struct scoutfs_alloc_list_head {
+	struct scoutfs_alloc_list_ref ref;
+	__le64 total_nr;
+	__le32 first_nr;
+	__u8 __pad[4];
+};

 /*
- * The server keeps a persistent record of mounted clients.
+ * While the main allocator uses extent items in btree blocks, metadata
+ * allocations for a single transaction are recorded in arrays in
+ * blocks.  This limits the number of allocations and frees needed to
+ * cow and modify the structure.  The blocks can be stored in a list
+ * which lets us create a persistent log of pending frees that are
+ * generated as we cow btree blocks to insert freed extents.
+ *
+ * The array floats in the block so that both adding and removing blknos
+ * only modifies an index.
 */
-struct scoutfs_mounted_client_btree_key {
-	__be64 rid;
-} __packed;
+struct scoutfs_alloc_list_block {
+	struct scoutfs_block_header hdr;
+	struct scoutfs_alloc_list_ref next;
+	__le32 start;
+	__le32 nr;
+	__le64 blknos[0]; /* naturally aligned for sorting */
+};
+
+#define SCOUTFS_ALLOC_LIST_MAX_BLOCKS					      \
+	((SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_alloc_list_block)) /  \
+	 (member_sizeof(struct scoutfs_alloc_list_block, blknos[0])))
+
+/*
+ * These can safely be initialized to all-zeros.
+ */
+struct scoutfs_alloc_root {
+	__le64 total_len;
+	struct scoutfs_btree_root root;
+};
+
+/* types of allocators, exposed to alloc_detail ioctl */
+#define SCOUTFS_ALLOC_OWNER_NONE	0
+#define SCOUTFS_ALLOC_OWNER_SERVER	1
+#define SCOUTFS_ALLOC_OWNER_MOUNT	2
+#define SCOUTFS_ALLOC_OWNER_SRCH	3

 struct scoutfs_mounted_client_btree_val {
 	__u8 flags;
-} __packed;
+};

 #define SCOUTFS_MOUNTED_CLIENT_VOTER	(1 << 0)

+/*
+ * srch files are a contiguous run of blocks with compressed entries
+ * described by a dense parent radix.  The files can be stored in
+ * log_tree items when the files contain unsorted entries written by
+ * mounts during their transactions.  Sorted files of increasing size
+ * are kept in a btree off the super for searching and further
+ * compacting.
+ */
+struct scoutfs_srch_entry {
+	__le64 hash;
+	__le64 ino;
+	__le64 id;
+};
+
+#define SCOUTFS_SRCH_ENTRY_MAX_BYTES	(2 + (sizeof(__u64) * 3))
+
+struct scoutfs_srch_ref {
+	__le64 blkno;
+	__le64 seq;
+};
+
+struct scoutfs_srch_file {
+	struct scoutfs_srch_entry first;
+	struct scoutfs_srch_entry last;
+	struct scoutfs_srch_ref ref;
+	__le64 blocks;
+	__le64 entries;
+	__u8 height;
+	__u8 __pad[7];
+};
+
+struct scoutfs_srch_parent {
+	struct scoutfs_block_header hdr;
+	struct scoutfs_srch_ref refs[0];
+};
+
+#define SCOUTFS_SRCH_PARENT_REFS				\
+	((SCOUTFS_BLOCK_LG_SIZE -				\
+	  offsetof(struct scoutfs_srch_parent, refs)) /		\
+	 sizeof(struct scoutfs_srch_ref))
+
+struct scoutfs_srch_block {
+	struct scoutfs_block_header hdr;
+	struct scoutfs_srch_entry first;
+	struct scoutfs_srch_entry last;
+	struct scoutfs_srch_entry tail;
+	__le32 entry_nr;
+	__le32 entry_bytes;
+	__u8 entries[0];
+};
+
+/*
+ * Decoding loads final small deltas with full __u64 loads.  Rather than
+ * check the size before each load we stop coding entries past the point
+ * where a full size entry could overflow the block.  A final entry can
+ * start at this byte count and consume the rest of the block, though
+ * its unlikely.
+ */
+#define SCOUTFS_SRCH_BLOCK_SAFE_BYTES					\
+	(SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_srch_block) -	\
+	 SCOUTFS_SRCH_ENTRY_MAX_BYTES)
+
+#define SCOUTFS_SRCH_LOG_BLOCK_LIMIT	(1024 * 1024 / SCOUTFS_BLOCK_LG_SIZE)
+#define SCOUTFS_SRCH_COMPACT_ORDER	2
+#define SCOUTFS_SRCH_COMPACT_NR		(1 << SCOUTFS_SRCH_COMPACT_ORDER)
+
+/*
+ * A persistent record of a srch file compaction operation in progress.
+ *
+ * When compacting log files blk and pos aren't used.  When compacting
+ * sorted files blk is the logical block number and pos is the byte
+ * offset of the next entry.  When deleting files pos is the height of
+ * the level that we're deleting, and blk is the logical block offset of
+ * the next parent ref array index to descend through.
+ */
+struct scoutfs_srch_compact {
+	struct scoutfs_alloc_list_head meta_avail;
+	struct scoutfs_alloc_list_head meta_freed;
+	__le64 id;
+	__u8 nr;
+	__u8 flags;
+	__u8 __pad[6];
+	struct scoutfs_srch_file out;
+	struct scoutfs_srch_compact_input {
+		struct scoutfs_srch_file sfl;
+		__le64 blk;
+		__le64 pos;
+	} in[SCOUTFS_SRCH_COMPACT_NR];
+};
+
+/* server -> client: combine input log file entries into output file */
+#define SCOUTFS_SRCH_COMPACT_FLAG_LOG		(1 << 0)
+/* server -> client: combine input sorted file entries into output file */
+#define SCOUTFS_SRCH_COMPACT_FLAG_SORTED	(1 << 1)
+/* server -> client: delete input files */
+#define SCOUTFS_SRCH_COMPACT_FLAG_DELETE	(1 << 2)
+/* client -> server: compaction phase (LOG,SORTED,DELETE) done */
+#define SCOUTFS_SRCH_COMPACT_FLAG_DONE		(1 << 4)
+/* client -> server: compaction failed */
+#define SCOUTFS_SRCH_COMPACT_FLAG_ERROR		(1 << 5)
+
 /*
 * XXX I imagine we should rename these now that they've evolved to track
 * all the btrees that clients use during a transaction.  It's not just
 * about item logs, it's about clients making changes to trees.
 */
 struct scoutfs_log_trees {
-	struct scoutfs_radix_root meta_avail;
-	struct scoutfs_radix_root meta_freed;
+	struct scoutfs_alloc_list_head meta_avail;
+	struct scoutfs_alloc_list_head meta_freed;
 	struct scoutfs_btree_root item_root;
 	struct scoutfs_btree_ref bloom_ref;
-	struct scoutfs_radix_root data_avail;
-	struct scoutfs_radix_root data_freed;
+	struct scoutfs_alloc_root data_avail;
+	struct scoutfs_alloc_root data_freed;
+	struct scoutfs_srch_file srch_file;
+	__le64 max_item_vers;
 	__le64 rid;
 	__le64 nr;
-} __packed;
-
-struct scoutfs_log_trees_key {
-	__be64 rid;
-	__be64 nr;
-} __packed;
-
-struct scoutfs_log_trees_val {
-	struct scoutfs_radix_root meta_avail;
-	struct scoutfs_radix_root meta_freed;
-	struct scoutfs_btree_root item_root;
-	struct scoutfs_btree_ref bloom_ref;
-	struct scoutfs_radix_root data_avail;
-	struct scoutfs_radix_root data_freed;
-} __packed;
+};

 struct scoutfs_log_item_value {
 	__le64 vers;
 	__u8 flags;
+	__u8 __pad[7];
 	__u8 data[0];
-} __packed;
+};

 /*
 * FS items are limited by the max btree value length with the log item
@@ -325,7 +447,7 @@ struct scoutfs_bloom_block {
 	struct scoutfs_block_header hdr;
 	__le64 total_set;
 	__le64 bits[0];
-} __packed;
+};

 /*
 * Item log trees are accompanied by a block of bits that make up a
@@ -334,30 +456,33 @@ struct scoutfs_bloom_block {
 * before the bloom filters fill up and start returning excessive false
 * positives.
 */
-#define SCOUTFS_FOREST_BLOOM_NRS		7
+#define SCOUTFS_FOREST_BLOOM_NRS		3
 #define SCOUTFS_FOREST_BLOOM_BITS \
-	(((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_bloom_block)) /	\
-	 member_sizeof(struct scoutfs_bloom_block, bits[0])) *		\
-	 member_sizeof(struct scoutfs_bloom_block, bits[0]) * 8)	\
+	(((SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_bloom_block)) /  \
+	 member_sizeof(struct scoutfs_bloom_block, bits[0])) *		  \
+	 member_sizeof(struct scoutfs_bloom_block, bits[0]) * 8)
+#define SCOUTFS_FOREST_BLOOM_FUNC_BITS		(SCOUTFS_BLOCK_LG_SHIFT + 3)

 /*
 * Keys are first sorted by major key zones.
 */
 #define SCOUTFS_INODE_INDEX_ZONE		1
-#define SCOUTFS_XATTR_INDEX_ZONE		2
-#define SCOUTFS_RID_ZONE			3
-#define SCOUTFS_FS_ZONE				4
-#define SCOUTFS_LOCK_ZONE			5
-#define SCOUTFS_MAX_ZONE			8 /* power of 2 is efficient */
+#define SCOUTFS_RID_ZONE			2
+#define SCOUTFS_FS_ZONE				3
+#define SCOUTFS_LOCK_ZONE			4
+/* Items only stored in server btrees */
+#define SCOUTFS_LOG_TREES_ZONE			6
+#define SCOUTFS_LOCK_CLIENTS_ZONE		7
+#define SCOUTFS_TRANS_SEQ_ZONE			8
+#define SCOUTFS_MOUNTED_CLIENT_ZONE		9
+#define SCOUTFS_SRCH_ZONE			10
+#define SCOUTFS_FREE_EXTENT_ZONE		11

 /* inode index zone */
 #define SCOUTFS_INODE_INDEX_META_SEQ_TYPE	1
 #define SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE	2
 #define SCOUTFS_INODE_INDEX_NR			3 /* don't forget to update */

-/* xattr index zone */
-#define SCOUTFS_XATTR_INDEX_NAME_TYPE		1
-
 /* rid zone (also used in server alloc btree) */
 #define SCOUTFS_ORPHAN_TYPE			1

@@ -368,44 +493,27 @@ struct scoutfs_bloom_block {
 #define SCOUTFS_READDIR_TYPE			4
 #define SCOUTFS_LINK_BACKREF_TYPE		5
 #define SCOUTFS_SYMLINK_TYPE			6
-#define SCOUTFS_PACKED_EXTENT_TYPE		7
+#define SCOUTFS_DATA_EXTENT_TYPE		7

 /* lock zone, only ever found in lock ranges, never in persistent items */
 #define SCOUTFS_RENAME_TYPE			1

-#define SCOUTFS_MAX_TYPE			8 /* power of 2 is efficient */
+/* srch zone, only in server btrees */
+#define SCOUTFS_SRCH_LOG_TYPE		1
+#define SCOUTFS_SRCH_BLOCKS_TYPE	2
+#define SCOUTFS_SRCH_PENDING_TYPE	3
+#define SCOUTFS_SRCH_BUSY_TYPE		4

+/* free extents in allocator btrees in client and server, by blkno or len */
+#define SCOUTFS_FREE_EXTENT_BLKNO_TYPE	1
+#define SCOUTFS_FREE_EXTENT_LEN_TYPE	2

-/*
- * The extents that map blocks in a fixed-size logical region of a file
- * are packed and stored in item values.  The packed extents are
- * contiguous so the starting logical block is implicit from the length
- * of previous extents.  Sparse regions are represented by 0 flags and
- * blkno.  The blkno of a packed extent is encoded as the zigzag (lsb is
- * sign bit) difference from the last blkno of the previous extent.
- * This guarantees that non-sparse extents must have a blkno delta of at
- * least -1/1.  High zero byte aren't stored.
- */
-struct scoutfs_packed_extent {
-	__le16 count;
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	__u8 diff_bytes:4,
-	     flags:3,
-	     final:1;
-#elif defined(__BIG_ENDIAN_BITFIELD)
-	__u8 final:1,
-	     flags:3,
-	     diff_bytes:4;
-#else
-#error "no {BIG,LITTLE}_ENDIAN_BITFIELD defined?"
-#endif
-	__u8 le_blkno_diff[0];
-} __packed;
-
-#define SCOUTFS_PACKEXT_BLOCKS		(8 * 1024 * 1024 / SCOUTFS_BLOCK_SIZE)
-#define SCOUTFS_PACKEXT_BASE_SHIFT	(ilog2(SCOUTFS_PACKEXT_BLOCKS))
-#define SCOUTFS_PACKEXT_BASE_MASK	(~((__u64)SCOUTFS_PACKEXT_BLOCKS - 1))
-#define SCOUTFS_PACKEXT_MAX_BYTES	SCOUTFS_MAX_VAL_SIZE
+/* file data extents have start and len in key */
+struct scoutfs_data_extent_val {
+	__le64 blkno;
+	__u8 flags;
+	__u8 __pad[7];
+};

 #define SEF_OFFLINE	(1 << 0)
 #define SEF_UNWRITTEN	(1 << 1)
@@ -417,10 +525,11 @@ struct scoutfs_packed_extent {
 * part item and overflow into the values of the rest of the part items.
 */
 struct scoutfs_xattr {
-	__u8 name_len;
 	__le16 val_len;
+	__u8 name_len;
+	__u8 __pad[5];
 	__u8 name[0];
-} __packed;
+};


 /* XXX does this exist upstream somewhere? */
@@ -460,47 +569,51 @@ struct scoutfs_quorum_block {
 	__le64 vote_for_rid;
 	__le32 crc;
 	__u8 log_nr;
+	__u8 __pad[3];
 	struct scoutfs_quorum_log {
 		__le64 term;
 		__le64 rid;
 		struct scoutfs_inet_addr addr;
-	} __packed log[0];
-} __packed;
+	} log[0];
+};

-#define SCOUTFS_QUORUM_LOG_MAX						\
-	((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_quorum_block)) /	\
+#define SCOUTFS_QUORUM_LOG_MAX						  \
+	((SCOUTFS_BLOCK_SM_SIZE - sizeof(struct scoutfs_quorum_block)) /  \
 		sizeof(struct scoutfs_quorum_log))

+#define SCOUTFS_FLAG_IS_META_BDEV 0x01
+
 struct scoutfs_super_block {
 	struct scoutfs_block_header hdr;
 	__le64 id;
-	__le64 format_hash;
+	__le64 version;
+	__le64 flags;
 	__u8 uuid[SCOUTFS_UUID_BYTES];
 	__le64 next_ino;
 	__le64 next_trans_seq;
 	__le64 total_meta_blocks;	/* both static and dynamic */
 	__le64 first_meta_blkno;	/* first dynamically allocated */
 	__le64 last_meta_blkno;
-	__le64 free_meta_blocks;
 	__le64 total_data_blocks;
 	__le64 first_data_blkno;
 	__le64 last_data_blkno;
-	__le64 free_data_blocks;
 	__le64 quorum_fenced_term;
 	__le64 quorum_server_term;
 	__le64 unmount_barrier;
 	__u8 quorum_count;
+	__u8 __pad[7];
 	struct scoutfs_inet_addr server_addr;
-	struct scoutfs_radix_root core_meta_avail;
-	struct scoutfs_radix_root core_meta_freed;
-	struct scoutfs_radix_root core_data_avail;
-	struct scoutfs_radix_root core_data_freed;
+	struct scoutfs_alloc_root meta_alloc[2];
+	struct scoutfs_alloc_root data_alloc;
+	struct scoutfs_alloc_list_head server_meta_avail[2];
+	struct scoutfs_alloc_list_head server_meta_freed[2];
 	struct scoutfs_btree_root fs_root;
 	struct scoutfs_btree_root logs_root;
 	struct scoutfs_btree_root lock_clients;
 	struct scoutfs_btree_root trans_seqs;
 	struct scoutfs_btree_root mounted_clients;
-} __packed;
+	struct scoutfs_btree_root srch_root;
+};

 #define SCOUTFS_ROOT_INO 1

@@ -549,7 +662,7 @@ struct scoutfs_inode {
 	struct scoutfs_timespec atime;
 	struct scoutfs_timespec ctime;
 	struct scoutfs_timespec mtime;
-} __packed;
+};

 #define SCOUTFS_INO_FLAG_TRUNCATE 0x1

@@ -571,8 +684,9 @@ struct scoutfs_dirent {
 	__le64 hash;
 	__le64 pos;
 	__u8 type;
+	__u8 __pad[7];
 	__u8 name[0];
-} __packed;
+};

 #define SCOUTFS_NAME_LEN 255

@@ -584,7 +698,7 @@ struct scoutfs_dirent {
 /* getdents returns next pos with an entry, no entry at (f_pos)~0 */
 #define SCOUTFS_DIRENT_LAST_POS (U64_MAX - 1)

-enum {
+enum scoutfs_dentry_type {
 	SCOUTFS_DT_FIFO = 0,
 	SCOUTFS_DT_CHR,
 	SCOUTFS_DT_DIR,
@@ -635,12 +749,12 @@ enum {
 */
 struct scoutfs_net_greeting {
 	__le64 fsid;
-	__le64 format_hash;
+	__le64 version;
 	__le64 server_term;
 	__le64 unmount_barrier;
 	__le64 rid;
 	__le64 flags;
-} __packed;
+};

 #define SCOUTFS_NET_GREETING_FLAG_FAREWELL	(1 << 0)
 #define SCOUTFS_NET_GREETING_FLAG_VOTER		(1 << 1)
@@ -675,22 +789,25 @@ struct scoutfs_net_header {
 	__u8 cmd;
 	__u8 flags;
 	__u8 error;
+	__u8 __pad[3];
 	__u8 data[0];
-} __packed;
+};

 #define SCOUTFS_NET_FLAG_RESPONSE	(1 << 0)
 #define SCOUTFS_NET_FLAGS_UNKNOWN	(U8_MAX << 1)

-enum {
+enum scoutfs_net_cmd {
 	SCOUTFS_NET_CMD_GREETING = 0,
 	SCOUTFS_NET_CMD_ALLOC_INODES,
 	SCOUTFS_NET_CMD_GET_LOG_TREES,
 	SCOUTFS_NET_CMD_COMMIT_LOG_TREES,
+	SCOUTFS_NET_CMD_GET_ROOTS,
 	SCOUTFS_NET_CMD_ADVANCE_SEQ,
 	SCOUTFS_NET_CMD_GET_LAST_SEQ,
-	SCOUTFS_NET_CMD_STATFS,
 	SCOUTFS_NET_CMD_LOCK,
 	SCOUTFS_NET_CMD_LOCK_RECOVER,
+	SCOUTFS_NET_CMD_SRCH_GET_COMPACT,
+	SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT,
 	SCOUTFS_NET_CMD_FAREWELL,
 	SCOUTFS_NET_CMD_UNKNOWN,
 };
@@ -709,7 +826,7 @@ enum {

 #undef EXPAND_NET_ERRNO
 #define EXPAND_NET_ERRNO(which) SCOUTFS_NET_ERR_##which,
-enum {
+enum scoutfs_net_errors {
 	SCOUTFS_NET_ERR_NONE = 0,
 	EXPAND_EACH_NET_ERRNO
 	SCOUTFS_NET_ERR_UNKNOWN,
@@ -725,33 +842,39 @@ enum {
 struct scoutfs_net_inode_alloc {
 	__le64 ino;
 	__le64 nr;
-} __packed;
+};

-struct scoutfs_net_statfs {
-	__le64 total_blocks;		/* total blocks in device */
-	__le64 next_ino;		/* next unused inode number */
-	__le64 bfree;			/* free blocks */
-	__u8 uuid[SCOUTFS_UUID_BYTES];	/* logical volume uuid */
-} __packed;
+struct scoutfs_net_roots {
+	struct scoutfs_btree_root fs_root;
+	struct scoutfs_btree_root logs_root;
+	struct scoutfs_btree_root srch_root;
+};

 struct scoutfs_net_lock {
 	struct scoutfs_key key;
 	__le64 write_version;
 	__u8 old_mode;
 	__u8 new_mode;
-} __packed;
+	__u8 __pad[6];
+};
+
+struct scoutfs_net_lock_grant_response {
+	struct scoutfs_net_lock nl;
+	struct scoutfs_net_roots roots;
+};

 struct scoutfs_net_lock_recover {
 	__le16 nr;
+	__u8 __pad[6];
 	struct scoutfs_net_lock locks[0];
-} __packed;
+};

 #define SCOUTFS_NET_LOCK_MAX_RECOVER_NR					       \
 	((SCOUTFS_NET_MAX_DATA_LEN - sizeof(struct scoutfs_net_lock_recover)) /\
 	 sizeof(struct scoutfs_net_lock))

 /* some enums for tracing */
-enum {
+enum scoutfs_lock_trace {
 	SLT_CLIENT,
 	SLT_SERVER,
 	SLT_GRANT,
@@ -772,7 +895,7 @@ enum {
 *
 * The null mode provides no access and is used to destroy locks.
 */
-enum {
+enum scoutfs_lock_mode {
 	SCOUTFS_LOCK_NULL = 0,
 	SCOUTFS_LOCK_READ,
 	SCOUTFS_LOCK_WRITE,
@@ -787,7 +910,7 @@ enum {
 struct scoutfs_fid {
 	__le64 ino;
 	__le64 parent_ino;
-} __packed;
+};

 #define FILEID_SCOUTFS			0x81
 #define FILEID_SCOUTFS_WITH_PARENT	0x82
@@ -795,7 +918,7 @@ struct scoutfs_fid {
 /*
 * Identifiers for sources of corruption that can generate messages.
 */
-enum {
+enum scoutfs_corruption_sources {
 	SC_DIRENT_NAME_LEN = 0,
 	SC_DIRENT_BACKREF_NAME_LEN,
 	SC_DIRENT_READDIR_NAME_LEN,
--- a/kmod/src/hash.h
+++ b/kmod/src/hash.h
@@ -1,15 +1,49 @@
 #ifndef _SCOUTFS_HASH_H_
 #define _SCOUTFS_HASH_H_

-#include <linux/crc32c.h>
+/*
+ * We're using FNV1a for now.  It's fine.  Ish.
+ *
+ * The longer term plan is xxh3 but it looks like it'll take just a bit
+ * more time to be declared stable and then it needs to be ported to the
+ * kernel.
+ *
+ *  - https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+ *  - https://github.com/Cyan4973/xxHash/releases/tag/v0.7.4
+ */
+
+static inline u32 fnv1a32(const void *data, unsigned int len)
+{
+	u32 hash = 0x811c9dc5;
+
+	while (len--) {
+		hash ^= *(u8 *)(data++);
+		hash *= 0x01000193;
+	}
+
+	return hash;
+}
+
+static inline u64 fnv1a64(const void *data, unsigned int len)
+{
+	u64 hash = 0xcbf29ce484222325ULL;
+
+	while (len--) {
+		hash ^= *(u8 *)(data++);
+		hash *= 0x100000001b3ULL;
+	}
+
+	return hash;
+}
+
+static inline u32 scoutfs_hash32(const void *data, unsigned int len)
+{
+	return fnv1a32(data, len);
+}

-/* XXX replace with xxhash */
 static inline u64 scoutfs_hash64(const void *data, unsigned int len)
 {
-       unsigned int half = (len + 1) / 2;
-
-       return crc32c(~0, data, half) |
-              ((u64)crc32c(~0, data + len - half, half) << 32);
+	return fnv1a64(data, len);
 }

 #endif
--- a/kmod/src/inode.c
+++ b/kmod/src/inode.c
@@ -30,8 +30,7 @@
 #include "xattr.h"
 #include "trans.h"
 #include "msg.h"
-#include "kvec.h"
-#include "forest.h"
+#include "item.h"
 #include "client.h"
 #include "cmp.h"

@@ -47,9 +46,17 @@
 *  - describe data locking size problems
 */

+struct inode_allocator {
+	spinlock_t lock;
+	u64 ino;
+	u64 nr;
+};
+
 struct inode_sb_info {
 	spinlock_t writeback_lock;
 	struct rb_root writeback_inodes;
+	struct inode_allocator dir_ino_alloc;
+	struct inode_allocator ino_alloc;
 };

 #define DECLARE_INODE_SB_INFO(sb, name) \
@@ -64,30 +71,30 @@ static struct kmem_cache *scoutfs_inode_cachep;
 */
 static void scoutfs_inode_ctor(void *obj)
 {
-	struct scoutfs_inode_info *ci = obj;
+	struct scoutfs_inode_info *si = obj;

-	mutex_init(&ci->item_mutex);
-	seqcount_init(&ci->seqcount);
-	ci->staging = false;
-	scoutfs_per_task_init(&ci->pt_data_lock);
-	atomic64_set(&ci->data_waitq.changed, 0);
-	init_waitqueue_head(&ci->data_waitq.waitq);
-	init_rwsem(&ci->xattr_rwsem);
-	RB_CLEAR_NODE(&ci->writeback_node);
-	spin_lock_init(&ci->ino_alloc.lock);
+	init_rwsem(&si->extent_sem);
+	mutex_init(&si->item_mutex);
+	seqcount_init(&si->seqcount);
+	si->staging = false;
+	scoutfs_per_task_init(&si->pt_data_lock);
+	atomic64_set(&si->data_waitq.changed, 0);
+	init_waitqueue_head(&si->data_waitq.waitq);
+	init_rwsem(&si->xattr_rwsem);
+	RB_CLEAR_NODE(&si->writeback_node);

-	inode_init_once(&ci->inode);
+	inode_init_once(&si->inode);
 }

 struct inode *scoutfs_alloc_inode(struct super_block *sb)
 {
-	struct scoutfs_inode_info *ci;
+	struct scoutfs_inode_info *si;

-	ci = kmem_cache_alloc(scoutfs_inode_cachep, GFP_NOFS);
-	if (!ci)
+	si = kmem_cache_alloc(scoutfs_inode_cachep, GFP_NOFS);
+	if (!si)
 		return NULL;

-	return &ci->inode;
+	return &si->inode;
 }

 static void scoutfs_i_callback(struct rcu_head *head)
@@ -215,7 +222,7 @@ static void set_item_info(struct scoutfs_inode_info *si,

 static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
 {
-	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);

 	i_size_write(inode, le64_to_cpu(cinode->size));
 	set_nlink(inode, le32_to_cpu(cinode->nlink));
@@ -230,23 +237,23 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
 	inode->i_ctime.tv_sec = le64_to_cpu(cinode->ctime.sec);
 	inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec);

-	ci->meta_seq = le64_to_cpu(cinode->meta_seq);
-	ci->data_seq = le64_to_cpu(cinode->data_seq);
-	ci->data_version = le64_to_cpu(cinode->data_version);
-	ci->online_blocks = le64_to_cpu(cinode->online_blocks);
-	ci->offline_blocks = le64_to_cpu(cinode->offline_blocks);
-	ci->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos);
-	ci->next_xattr_id = le64_to_cpu(cinode->next_xattr_id);
-	ci->flags = le32_to_cpu(cinode->flags);
+	si->meta_seq = le64_to_cpu(cinode->meta_seq);
+	si->data_seq = le64_to_cpu(cinode->data_seq);
+	si->data_version = le64_to_cpu(cinode->data_version);
+	si->online_blocks = le64_to_cpu(cinode->online_blocks);
+	si->offline_blocks = le64_to_cpu(cinode->offline_blocks);
+	si->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos);
+	si->next_xattr_id = le64_to_cpu(cinode->next_xattr_id);
+	si->flags = le32_to_cpu(cinode->flags);

 	/*
 	 * i_blocks is initialized from online and offline and is then
 	 * maintained as blocks come and go.
 	 */
-	inode->i_blocks = (ci->online_blocks + ci->offline_blocks)
-				<< SCOUTFS_BLOCK_SECTOR_SHIFT;
+	inode->i_blocks = (si->online_blocks + si->offline_blocks)
+				<< SCOUTFS_BLOCK_SM_SECTOR_SHIFT;

-	set_item_info(ci, cinode);
+	set_item_info(si, cinode);
 }

 static void init_inode_key(struct scoutfs_key *key, u64 ino)
@@ -276,7 +283,6 @@ int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock,
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_key key;
 	struct scoutfs_inode sinode;
-	struct kvec val;
 	const u64 refresh_gen = lock->refresh_gen;
 	int ret;

@@ -292,11 +298,11 @@ int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock,
 		return 0;

 	init_inode_key(&key, scoutfs_ino(inode));
-	kvec_init(&val, &sinode, sizeof(sinode));

 	mutex_lock(&si->item_mutex);
 	if (atomic64_read(&si->last_refreshed) < refresh_gen) {
-		ret = scoutfs_forest_lookup_exact(sb, &key, &val, lock);
+		ret = scoutfs_item_lookup_exact(sb, &key, &sinode,
+						sizeof(sinode), lock);
 		if (ret == 0) {
 			load_inode(inode, &sinode);
 			atomic64_set(&si->last_refreshed, refresh_gen);
@@ -329,7 +335,7 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
 			  u64 new_size, bool truncate)
 {
-	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	LIST_HEAD(ind_locks);
 	int ret;
@@ -337,8 +343,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
 	if (!S_ISREG(inode->i_mode))
 		return 0;

-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true,
-					    SIC_DIRTY_INODE());
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true);
 	if (ret)
 		return ret;

@@ -348,7 +353,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
 	truncate_setsize(inode, new_size);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	if (truncate)
-		ci->flags |= SCOUTFS_INO_FLAG_TRUNCATE;
+		si->flags |= SCOUTFS_INO_FLAG_TRUNCATE;
 	scoutfs_inode_set_data_seq(inode);
 	scoutfs_update_inode_item(inode, lock, &ind_locks);

@@ -360,17 +365,16 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,

 static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)
 {
-	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	LIST_HEAD(ind_locks);
 	int ret;

-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false,
-					    SIC_DIRTY_INODE());
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
 	if (ret)
 		return ret;

-	ci->flags &= ~SCOUTFS_INO_FLAG_TRUNCATE;
+	si->flags &= ~SCOUTFS_INO_FLAG_TRUNCATE;
 	scoutfs_update_inode_item(inode, lock, &ind_locks);

 	scoutfs_release_trans(sb);
@@ -381,16 +385,17 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)

 int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock)
 {
-	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	u64 start;
 	int ret, err;

-	trace_scoutfs_complete_truncate(inode, ci->flags);
+	trace_scoutfs_complete_truncate(inode, si->flags);

-	if (!(ci->flags & SCOUTFS_INO_FLAG_TRUNCATE))
+	if (!(si->flags & SCOUTFS_INO_FLAG_TRUNCATE))
 		return 0;

-	start = (i_size_read(inode) + SCOUTFS_BLOCK_SIZE - 1) >> SCOUTFS_BLOCK_SHIFT;
+	start = (i_size_read(inode) + SCOUTFS_BLOCK_SM_SIZE - 1) >>
+		SCOUTFS_BLOCK_SM_SHIFT;
 	ret = scoutfs_data_truncate_items(inode->i_sb, inode,
 					  scoutfs_ino(inode), start, ~0ULL,
 					  false, lock);
@@ -480,8 +485,7 @@ retry:
 		}
 	}

-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false,
-					    SIC_DIRTY_INODE());
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
 	if (ret)
 		goto out;

@@ -573,7 +577,7 @@ void scoutfs_inode_add_onoff(struct inode *inode, s64 on, s64 off)
 		si->online_blocks += on;
 		si->offline_blocks += off;
 		/* XXX not sure if this is right */
-		inode->i_blocks += (on + off) * SCOUTFS_BLOCK_SECTORS;
+		inode->i_blocks += (on + off) * SCOUTFS_BLOCK_SM_SECTORS;

 		trace_scoutfs_online_offline_blocks(inode, on, off,
 						    si->online_blocks,
@@ -637,19 +641,19 @@ void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off)

 static int scoutfs_iget_test(struct inode *inode, void *arg)
 {
-	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	u64 *ino = arg;

-	return ci->ino == *ino;
+	return si->ino == *ino;
 }

 static int scoutfs_iget_set(struct inode *inode, void *arg)
 {
-	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	u64 *ino = arg;

 	inode->i_ino = *ino;
-	ci->ino = *ino;
+	si->ino = *ino;

 	return 0;
 }
@@ -681,8 +685,6 @@ struct inode *scoutfs_iget(struct super_block *sb, u64 ino)
 		/* XXX ensure refresh, instead clear in drop_inode? */
 		si = SCOUTFS_I(inode);
 		atomic64_set(&si->last_refreshed, 0);
-		si->ino_alloc.ino = 0;
-		si->ino_alloc.nr = 0;

 		ret = scoutfs_inode_refresh(inode, lock, 0);
 		if (ret) {
@@ -701,7 +703,7 @@ out:

 static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
 {
-	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	u64 online_blocks;
 	u64 offline_blocks;

@@ -715,19 +717,22 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
 	cinode->rdev = cpu_to_le32(inode->i_rdev);
 	cinode->atime.sec = cpu_to_le64(inode->i_atime.tv_sec);
 	cinode->atime.nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+	memset(cinode->atime.__pad, 0, sizeof(cinode->atime.__pad));
 	cinode->ctime.sec = cpu_to_le64(inode->i_ctime.tv_sec);
 	cinode->ctime.nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	memset(cinode->ctime.__pad, 0, sizeof(cinode->ctime.__pad));
 	cinode->mtime.sec = cpu_to_le64(inode->i_mtime.tv_sec);
 	cinode->mtime.nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	memset(cinode->mtime.__pad, 0, sizeof(cinode->mtime.__pad));

 	cinode->meta_seq = cpu_to_le64(scoutfs_inode_meta_seq(inode));
 	cinode->data_seq = cpu_to_le64(scoutfs_inode_data_seq(inode));
 	cinode->data_version = cpu_to_le64(scoutfs_inode_data_version(inode));
 	cinode->online_blocks = cpu_to_le64(online_blocks);
 	cinode->offline_blocks = cpu_to_le64(offline_blocks);
-	cinode->next_readdir_pos = cpu_to_le64(ci->next_readdir_pos);
-	cinode->next_xattr_id = cpu_to_le64(ci->next_xattr_id);
-	cinode->flags = cpu_to_le32(ci->flags);
+	cinode->next_readdir_pos = cpu_to_le64(si->next_readdir_pos);
+	cinode->next_xattr_id = cpu_to_le64(si->next_xattr_id);
+	cinode->flags = cpu_to_le32(si->flags);
 }

 /*
@@ -753,15 +758,13 @@ int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock)
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_inode sinode;
 	struct scoutfs_key key;
-	struct kvec val;
 	int ret;

 	store_inode(&sinode, inode);
-	kvec_init(&val, &sinode, sizeof(sinode));

 	init_inode_key(&key, scoutfs_ino(inode));

-	ret = scoutfs_forest_update(sb, &key, &val, lock);
+	ret = scoutfs_item_update(sb, &key, &sinode, sizeof(sinode), lock);
 	if (!ret)
 		trace_scoutfs_dirty_inode(inode);
 	return ret;
@@ -893,7 +896,7 @@ static int update_index_items(struct super_block *sb,
 	scoutfs_inode_init_index_key(&ins, type, major, minor, ino);

 	ins_lock = find_index_lock(lock_list, type, major, minor, ino);
-	ret = scoutfs_forest_create_force(sb, &ins, NULL, ins_lock);
+	ret = scoutfs_item_create_force(sb, &ins, NULL, 0, ins_lock);
 	if (ret || !will_del_index(si, type, major, minor))
 		return ret;

@@ -905,9 +908,9 @@ static int update_index_items(struct super_block *sb,

 	del_lock = find_index_lock(lock_list, type, si->item_majors[type],
 				   si->item_minors[type], ino);
-	ret = scoutfs_forest_delete_force(sb, &del, del_lock);
+	ret = scoutfs_item_delete_force(sb, &del, del_lock);
 	if (ret) {
-		err = scoutfs_forest_delete(sb, &ins, ins_lock);
+		err = scoutfs_item_delete(sb, &ins, ins_lock);
 		BUG_ON(err);
 	}

@@ -966,7 +969,6 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock,
 	const u64 ino = scoutfs_ino(inode);
 	struct scoutfs_key key;
 	struct scoutfs_inode sinode;
-	struct kvec val;
 	int ret;
 	int err;

@@ -982,9 +984,8 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock,
 	BUG_ON(ret);

 	init_inode_key(&key, ino);
-	kvec_init(&val, &sinode, sizeof(sinode));

-	err = scoutfs_forest_update(sb, &key, &val, lock);
+	err = scoutfs_item_update(sb, &key, &sinode, sizeof(sinode), lock);
 	if (err) {
 		scoutfs_err(sb, "inode %llu update err %d", ino, err);
 		BUG_ON(err);
@@ -1185,8 +1186,7 @@ int scoutfs_inode_index_start(struct super_block *sb, u64 *seq)
 * Returns > 0 if the seq changed and the locks should be retried.
 */
 int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
-				      struct list_head *list, u64 seq,
-				      const struct scoutfs_item_count cnt)
+				      struct list_head *list, u64 seq)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct index_lock *ind_lock;
@@ -1202,7 +1202,7 @@ int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
 			goto out;
 	}

-	ret = scoutfs_hold_trans(sb, cnt);
+	ret = scoutfs_hold_trans(sb);
 	if (ret == 0 && seq != sbi->trans_seq) {
 		scoutfs_release_trans(sb);
 		ret = 1;
@@ -1216,8 +1216,7 @@ out:
 }

 int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
-				  bool set_data_seq,
-				  const struct scoutfs_item_count cnt)
+				  bool set_data_seq)
 {
 	struct super_block *sb = inode->i_sb;
 	int ret;
@@ -1227,7 +1226,7 @@ int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
 		ret = scoutfs_inode_index_start(sb, &seq) ?:
 		      scoutfs_inode_index_prepare(sb, list, inode,
 						  set_data_seq) ?:
-		      scoutfs_inode_index_try_lock_hold(sb, list, seq, cnt);
+		      scoutfs_inode_index_try_lock_hold(sb, list, seq);
 	} while (ret > 0);

 	return ret;
@@ -1259,7 +1258,7 @@ static int remove_index(struct super_block *sb, u64 ino, u8 type, u64 major,
 	scoutfs_inode_init_index_key(&key, type, major, minor, ino);

 	lock = find_index_lock(ind_locks, type, major, minor, ino);
-	ret = scoutfs_forest_delete_force(sb, &key, lock);
+	ret = scoutfs_item_delete_force(sb, &key, lock);
 	if (ret == -ENOENT)
 		ret = 0;
 	return ret;
@@ -1321,14 +1320,16 @@ u64 scoutfs_last_ino(struct super_block *sb)
 * minimize that loss while still being large enough for typical
 * directory file counts.
 */
-int scoutfs_alloc_ino(struct inode *parent, u64 *ino_ret)
+int scoutfs_alloc_ino(struct super_block *sb, bool is_dir, u64 *ino_ret)
 {
-	struct scoutfs_inode_allocator *ia = &SCOUTFS_I(parent)->ino_alloc;
-	struct super_block *sb = parent->i_sb;
+	DECLARE_INODE_SB_INFO(sb, inf);
+	struct inode_allocator *ia;
 	u64 ino;
 	u64 nr;
 	int ret;

+	ia = is_dir ? &inf->dir_ino_alloc : &inf->ino_alloc;
+
 	spin_lock(&ia->lock);

 	if (ia->nr == 0) {
@@ -1363,29 +1364,26 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
 				umode_t mode, dev_t rdev, u64 ino,
 				struct scoutfs_lock *lock)
 {
-	struct scoutfs_inode_info *ci;
+	struct scoutfs_inode_info *si;
 	struct scoutfs_key key;
 	struct scoutfs_inode sinode;
 	struct inode *inode;
-	struct kvec val;
 	int ret;

 	inode = new_inode(sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);

-	ci = SCOUTFS_I(inode);
-	ci->ino = ino;
-	ci->data_version = 0;
-	ci->online_blocks = 0;
-	ci->offline_blocks = 0;
-	ci->next_readdir_pos = SCOUTFS_DIRENT_FIRST_POS;
-	ci->next_xattr_id = 0;
-	ci->have_item = false;
-	atomic64_set(&ci->last_refreshed, lock->refresh_gen);
-	ci->flags = 0;
-	ci->ino_alloc.ino = 0;
-	ci->ino_alloc.nr = 0;
+	si = SCOUTFS_I(inode);
+	si->ino = ino;
+	si->data_version = 0;
+	si->online_blocks = 0;
+	si->offline_blocks = 0;
+	si->next_readdir_pos = SCOUTFS_DIRENT_FIRST_POS;
+	si->next_xattr_id = 0;
+	si->have_item = false;
+	atomic64_set(&si->last_refreshed, lock->refresh_gen);
+	si->flags = 0;

 	scoutfs_inode_set_meta_seq(inode);
 	scoutfs_inode_set_data_seq(inode);
@@ -1399,9 +1397,8 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,

 	store_inode(&sinode, inode);
 	init_inode_key(&key, scoutfs_ino(inode));
-	kvec_init(&val, &sinode, sizeof(sinode));

-	ret = scoutfs_forest_create(sb, &key, &val, lock);
+	ret = scoutfs_item_create(sb, &key, &sinode, sizeof(sinode), lock);
 	if (ret) {
 		iput(inode);
 		return ERR_PTR(ret);
@@ -1429,7 +1426,7 @@ static int remove_orphan_item(struct super_block *sb, u64 ino)

 	init_orphan_key(&key, sbi->rid, ino);

-	ret = scoutfs_forest_delete(sb, &key, lock);
+	ret = scoutfs_item_delete(sb, &key, lock);
 	if (ret == -ENOENT)
 		ret = 0;

@@ -1451,7 +1448,6 @@ static int delete_inode_items(struct super_block *sb, u64 ino)
 	struct scoutfs_key key;
 	LIST_HEAD(ind_locks);
 	bool release = false;
-	struct kvec val;
 	umode_t mode;
 	u64 ind_seq;
 	u64 size;
@@ -1462,9 +1458,9 @@ static int delete_inode_items(struct super_block *sb, u64 ino)
 		return ret;

 	init_inode_key(&key, ino);
-	kvec_init(&val, &sinode, sizeof(sinode));

-	ret = scoutfs_forest_lookup_exact(sb, &key, &val, lock);
+	ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode),
+					lock);
 	if (ret < 0) {
 		if (ret == -ENOENT)
 			ret = 0;
@@ -1498,8 +1494,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino)
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
-						SIC_DROP_INODE(mode, size));
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -1517,7 +1512,7 @@ retry:
 			goto out;
 	}

-	ret = scoutfs_forest_delete(sb, &key, lock);
+	ret = scoutfs_item_delete(sb, &key, lock);
 	if (ret)
 		goto out;

@@ -1586,7 +1581,7 @@ int scoutfs_scan_orphans(struct super_block *sb)
 	init_orphan_key(&last, sbi->rid, ~0ULL);

 	while (1) {
-		ret = scoutfs_forest_next(sb, &key, &last, NULL, lock);
+		ret = scoutfs_item_next(sb, &key, &last, NULL, 0, lock);
 		if (ret == -ENOENT) /* No more orphan items */
 			break;
 		if (ret < 0)
@@ -1620,7 +1615,7 @@ int scoutfs_orphan_inode(struct inode *inode)

 	init_orphan_key(&key, sbi->rid, scoutfs_ino(inode));

-	ret = scoutfs_forest_create(sb, &key, NULL, lock);
+	ret = scoutfs_item_create(sb, &key, NULL, 0, lock);

 	return ret;
 }
@@ -1724,6 +1719,8 @@ int scoutfs_inode_setup(struct super_block *sb)

 	spin_lock_init(&inf->writeback_lock);
 	inf->writeback_inodes = RB_ROOT;
+	spin_lock_init(&inf->dir_ino_alloc.lock);
+	spin_lock_init(&inf->ino_alloc.lock);

 	sbi->inode_sb_info = inf;

--- a/kmod/src/inode.h
+++ b/kmod/src/inode.h
@@ -4,18 +4,11 @@
 #include "key.h"
 #include "lock.h"
 #include "per_task.h"
-#include "count.h"
 #include "format.h"
 #include "data.h"

 struct scoutfs_lock;

-struct scoutfs_inode_allocator {
-	spinlock_t lock;
-	u64 ino;
-	u64 nr;
-};
-
 struct scoutfs_inode_info {
 	/* read or initialized for each inode instance */
 	u64 ino;
@@ -28,6 +21,14 @@ struct scoutfs_inode_info {
 	u64 offline_blocks;
 	u32 flags;

+	/*
+	 * Protects per-inode extent items, most particularly readers
+	 * who want to serialize writers without holding i_mutex. (only
+	 * used in data.c, it's the only place that understands file
+	 * extent items)
+	 */
+	struct rw_semaphore extent_sem;
+
 	/*
 	 * The in-memory item info caches the current index item values
 	 * so that we can decide to update them with comparisons instead
@@ -42,9 +43,6 @@ struct scoutfs_inode_info {
 	/* updated at on each new lock acquisition */
 	atomic64_t last_refreshed;

-	/* reset for every new inode instance */
-	struct scoutfs_inode_allocator ino_alloc;
-
 	/* initialized once for slab object */
 	seqcount_t seqcount;
 	bool staging;			/* holder of i_mutex is staging */
@@ -84,18 +82,16 @@ int scoutfs_inode_index_prepare_ino(struct super_block *sb,
 				    struct list_head *list, u64 ino,
 				    umode_t mode);
 int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
-				      struct list_head *list, u64 seq,
-				      const struct scoutfs_item_count cnt);
+				      struct list_head *list, u64 seq);
 int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
-				  bool set_data_seq,
-				  const struct scoutfs_item_count cnt);
+				  bool set_data_seq);
 void scoutfs_inode_index_unlock(struct super_block *sb, struct list_head *list);

 int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock);
 void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock,
 			       struct list_head *ind_locks);

-int scoutfs_alloc_ino(struct inode *parent, u64 *ino);
+int scoutfs_alloc_ino(struct super_block *sb, bool is_dir, u64 *ino_ret);
 struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
 				umode_t mode, dev_t rdev, u64 ino,
 				struct scoutfs_lock *lock);
--- a/kmod/src/ioctl.c
+++ b/kmod/src/ioctl.c
@@ -12,6 +12,7 @@
 */
 #include <linux/kernel.h>
 #include <linux/fs.h>
+#include <linux/file.h>
 #include <linux/uaccess.h>
 #include <linux/compiler.h>
 #include <linux/uio.h>
@@ -27,6 +28,7 @@
 #include "ioctl.h"
 #include "super.h"
 #include "inode.h"
+#include "item.h"
 #include "forest.h"
 #include "data.h"
 #include "client.h"
@@ -34,6 +36,8 @@
 #include "trans.h"
 #include "xattr.h"
 #include "hash.h"
+#include "srch.h"
+#include "alloc.h"
 #include "scoutfs_trace.h"

 /*
@@ -109,7 +113,7 @@ static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg)

 	for (nr = 0; nr < walk.nr_entries; ) {

-		ret = scoutfs_forest_next(sb, &key, &last_key, NULL, lock);
+		ret = scoutfs_item_next(sb, &key, &last_key, NULL, 0, lock);
 		if (ret < 0 && ret != -ENOENT)
 			break;

@@ -271,8 +275,8 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_ioctl_release args;
 	struct scoutfs_lock *lock = NULL;
-	loff_t start;
-	loff_t end_inc;
+	u64 sblock;
+	u64 eblock;
 	u64 online;
 	u64 offline;
 	u64 isize;
@@ -283,9 +287,11 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)

 	trace_scoutfs_ioc_release(sb, scoutfs_ino(inode), &args);

-	if (args.count == 0)
+	if (args.length == 0)
 		return 0;
-	if ((args.block + args.count) < args.block)
+	if (((args.offset + args.length) < args.offset) ||
+	    (args.offset & SCOUTFS_BLOCK_SM_MASK) ||
+	    (args.length & SCOUTFS_BLOCK_SM_MASK))
 		return -EINVAL;


@@ -318,23 +324,24 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
 	inode_dio_wait(inode);

 	/* drop all clean and dirty cached blocks in the range */
-	start = args.block << SCOUTFS_BLOCK_SHIFT;
-	end_inc = ((args.block + args.count) << SCOUTFS_BLOCK_SHIFT) - 1;
-	truncate_inode_pages_range(&inode->i_data, start, end_inc);
+	truncate_inode_pages_range(&inode->i_data, args.offset,
+				   args.offset + args.length - 1);

+	sblock = args.offset >> SCOUTFS_BLOCK_SM_SHIFT;
+	eblock = (args.offset + args.length - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
 	ret = scoutfs_data_truncate_items(sb, inode, scoutfs_ino(inode),
-					  args.block,
-					  args.block + args.count - 1, true,
+					  sblock,
+					  eblock, true,
 					  lock);
 	if (ret == 0) {
 		scoutfs_inode_get_onoff(inode, &online, &offline);
 		isize = i_size_read(inode);
 		if (online == 0 && isize) {
-			start = (isize + SCOUTFS_BLOCK_SIZE - 1)
-					>> SCOUTFS_BLOCK_SHIFT;
+			sblock = (isize + SCOUTFS_BLOCK_SM_SIZE - 1)
+					>> SCOUTFS_BLOCK_SM_SHIFT;
 			ret = scoutfs_data_truncate_items(sb, inode,
 							  scoutfs_ino(inode),
-							  start, U64_MAX,
+							  sblock, U64_MAX,
 							  false, lock);
 		}
 	}
@@ -371,8 +378,8 @@ static long scoutfs_ioc_data_wait_err(struct file *file, unsigned long arg)

 	trace_scoutfs_ioc_data_wait_err(sb, &args);

-	sblock = args.offset >> SCOUTFS_BLOCK_SHIFT;
-	eblock = (args.offset + args.count - 1) >> SCOUTFS_BLOCK_SHIFT;
+	sblock = args.offset >> SCOUTFS_BLOCK_SM_SHIFT;
+	eblock = (args.offset + args.count - 1) >> SCOUTFS_BLOCK_SM_SHIFT;

 	if (sblock > eblock)
 		return -EINVAL;
@@ -456,23 +463,24 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg)

 	trace_scoutfs_ioc_stage(sb, scoutfs_ino(inode), &args);

-	end_size = args.offset + args.count;
+	end_size = args.offset + args.length;

 	/* verify arg constraints that aren't dependent on file */
-	if (args.count < 0 || (end_size < args.offset) ||
-	    args.offset & SCOUTFS_BLOCK_MASK)
+	if (args.length < 0 || (end_size < args.offset) ||
+	    args.offset & SCOUTFS_BLOCK_SM_MASK) {
 		return -EINVAL;
+	}

-	if (args.count == 0)
+	if (args.length == 0)
 		return 0;

 	/* the iocb is really only used for the file pointer :P */
 	init_sync_kiocb(&kiocb, file);
 	kiocb.ki_pos = args.offset;
-	kiocb.ki_left = args.count;
-	kiocb.ki_nbytes = args.count;
+	kiocb.ki_left = args.length;
+	kiocb.ki_nbytes = args.length;
 	iov.iov_base = (void __user *)(unsigned long)args.buf_ptr;
-	iov.iov_len = args.count;
+	iov.iov_len = args.length;

 	ret = mnt_want_write_file(file);
 	if (ret)
@@ -494,7 +502,7 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg)
 	    (file->f_flags & (O_APPEND | O_DIRECT | O_DSYNC)) ||
 	    IS_SYNC(file->f_mapping->host) ||
 	    (end_size > isize) ||
-	    ((end_size & SCOUTFS_BLOCK_MASK) && (end_size != isize))) {
+	    ((end_size & SCOUTFS_BLOCK_SM_MASK) && (end_size != isize))) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -511,11 +519,11 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg)
 	written = 0;
 	do {
 		ret = generic_file_buffered_write(&kiocb, &iov, 1, pos, &pos,
-						  args.count, written);
+						  args.length, written);
 		BUG_ON(ret == -EIOCBQUEUED);
 		if (ret > 0)
 			written += ret;
-	} while (ret > 0 && written < args.count);
+	} while (ret > 0 && written < args.length);

 	si->staging = false;
 	current->backing_dev_info = NULL;
@@ -666,8 +674,7 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)

 	/* setting only so we don't see 0 data seq with nonzero data_version */
 	set_data_seq = sm.data_version != 0 ? true : false;
-	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq,
-					    SIC_SETATTR_MORE());
+	ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq);
 	if (ret)
 		goto unlock;

@@ -759,18 +766,20 @@ out:
 * but we don't check that the callers xattr name contains the tag and
 * search for it regardless.
 */
-static long scoutfs_ioc_find_xattrs(struct file *file, unsigned long arg)
+static long scoutfs_ioc_search_xattrs(struct file *file, unsigned long arg)
 {
 	struct super_block *sb = file_inode(file)->i_sb;
-	struct scoutfs_ioctl_find_xattrs __user *ufx = (void __user *)arg;
-	struct scoutfs_ioctl_find_xattrs fx;
-	struct scoutfs_lock *lock = NULL;
-	struct scoutfs_key last;
-	struct scoutfs_key key;
+	struct scoutfs_ioctl_search_xattrs __user *usx = (void __user *)arg;
+	struct scoutfs_ioctl_search_xattrs sx;
+	struct scoutfs_xattr_prefix_tags tgs;
+	struct scoutfs_srch_rb_root sroot;
+	struct scoutfs_srch_rb_node *snode;
+	u64 __user *uinos;
+	struct rb_node *node;
 	char *name = NULL;
-	int total = 0;
-	u64 hash;
-	u64 ino;
+	bool done = false;
+	u64 prev_ino = 0;
+	u64 total = 0;
 	int ret;

 	if (!(file->f_mode & FMODE_READ)) {
@@ -783,67 +792,73 @@ static long scoutfs_ioc_find_xattrs(struct file *file, unsigned long arg)
 		goto out;
 	}

-	if (copy_from_user(&fx, ufx, sizeof(fx))) {
+	if (copy_from_user(&sx, usx, sizeof(sx))) {
 		ret = -EFAULT;
 		goto out;
 	}
+	uinos = (u64 __user *)sx.inodes_ptr;

-	if (fx.name_bytes > SCOUTFS_XATTR_MAX_NAME_LEN) {
+	if (sx.name_bytes > SCOUTFS_XATTR_MAX_NAME_LEN) {
 		ret = -EINVAL;
 		goto out;
 	}

-	name = kmalloc(fx.name_bytes, GFP_KERNEL);
+	if (sx.nr_inodes == 0 || sx.last_ino < sx.next_ino) {
+		ret = 0;
+		goto out;
+	}
+
+	name = kmalloc(sx.name_bytes, GFP_KERNEL);
 	if (!name) {
 		ret = -ENOMEM;
 		goto out;
 	}

-	if (copy_from_user(name, (void __user *)fx.name_ptr, fx.name_bytes)) {
+	if (copy_from_user(name, (void __user *)sx.name_ptr, sx.name_bytes)) {
 		ret = -EFAULT;
 		goto out;
 	}

-	hash = scoutfs_hash64(name, fx.name_bytes);
-	scoutfs_xattr_index_key(&key, hash, fx.next_ino, 0);
-	scoutfs_xattr_index_key(&last, hash, U64_MAX, U64_MAX);
-	ino = 0;
+	if (scoutfs_xattr_parse_tags(name, sx.name_bytes, &tgs) < 0 ||
+	    !tgs.srch) {
+		ret = -EINVAL;
+		goto out;
+	}

-	ret = scoutfs_lock_xattr_index(sb, SCOUTFS_LOCK_READ, 0, hash, &lock);
+	ret = scoutfs_srch_search_xattrs(sb, &sroot,
+					 scoutfs_hash64(name, sx.name_bytes),
+					 sx.next_ino, sx.last_ino, &done);
 	if (ret < 0)
 		goto out;

-	while (fx.nr_inodes) {
+	prev_ino = 0;
+	scoutfs_srch_foreach_rb_node(snode, node, &sroot) {
+		if (prev_ino == snode->ino)
+			continue;

-		ret = scoutfs_forest_next(sb, &key, &last, NULL, lock);
-		if (ret < 0) {
-			if (ret == -ENOENT)
-				ret = 0;
+		if (put_user(snode->ino, uinos + total)) {
+			ret = -EFAULT;
 			break;
 		}
+		prev_ino = snode->ino;

-		/* xattrs hashes can collide and add multiple entries */
-		if (le64_to_cpu(key.skxi_ino) != ino) {
-			ino = le64_to_cpu(key.skxi_ino);
-			if (put_user(ino, (u64 __user *)fx.inodes_ptr)) {
-				ret = -EFAULT;
-				break;
-			}
-
-			fx.inodes_ptr += sizeof(u64);
-			fx.nr_inodes--;
-			total++;
-			ret = 0;
-		}
-
-		scoutfs_key_inc(&key);
+		if (++total == sx.nr_inodes)
+			break;
 	}

-	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
+	sx.output_flags = 0;
+	if (done && total == sroot.nr)
+		sx.output_flags |= SCOUTFS_SEARCH_XATTRS_OFLAG_END;
+
+	if (put_user(sx.output_flags, &usx->output_flags))
+		ret = -EFAULT;
+	else
+		ret = 0;
+
+	scoutfs_srch_destroy_rb_root(&sroot);

 out:
 	kfree(name);
-
 	return ret ?: total;
 }

@@ -853,6 +868,7 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg)
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
 	struct scoutfs_ioctl_statfs_more sfm;
+	int ret;

 	if (get_user(sfm.valid_bytes, (__u64 __user *)arg))
 		return -EFAULT;
@@ -861,6 +877,12 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg)
 				sizeof(struct scoutfs_ioctl_statfs_more));
 	sfm.fsid = le64_to_cpu(super->hdr.fsid);
 	sfm.rid = sbi->rid;
+	sfm.total_meta_blocks = le64_to_cpu(super->total_meta_blocks);
+	sfm.total_data_blocks = le64_to_cpu(super->total_data_blocks);
+
+	ret = scoutfs_client_get_last_seq(sb, &sfm.committed_seq);
+	if (ret)
+		return ret;

 	if (copy_to_user((void __user *)arg, &sfm, sfm.valid_bytes))
 		return -EFAULT;
@@ -868,6 +890,101 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg)
 	return 0;
 }

+struct copy_alloc_detail_args {
+	struct scoutfs_ioctl_alloc_detail_entry __user *uade;
+	u64 nr;
+	u64 copied;
+};
+
+static int copy_alloc_detail_to_user(struct super_block *sb, void *arg,
+				     int owner, u64 id, bool meta, bool avail,
+				     u64 blocks)
+{
+	struct copy_alloc_detail_args *args = arg;
+	struct scoutfs_ioctl_alloc_detail_entry ade;
+
+	if (args->copied == args->nr)
+		return -EOVERFLOW;
+
+	ade.blocks = blocks;
+	ade.id = id;
+	ade.meta = !!meta;
+	ade.avail = !!avail;
+
+	if (copy_to_user(&args->uade[args->copied], &ade, sizeof(ade)))
+		return -EFAULT;
+
+	args->copied++;
+	return 0;
+}
+
+static long scoutfs_ioc_alloc_detail(struct file *file, unsigned long arg)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+	struct scoutfs_ioctl_alloc_detail __user *uad = (void __user *)arg;
+	struct scoutfs_ioctl_alloc_detail ad;
+	struct copy_alloc_detail_args args;
+
+	if (copy_from_user(&ad, uad, sizeof(ad)))
+		return -EFAULT;
+
+	args.uade = (struct scoutfs_ioctl_alloc_detail_entry __user *)
+			(uintptr_t)ad.entries_ptr;
+	args.nr = ad.entries_nr;
+	args.copied = 0;
+
+	return scoutfs_alloc_foreach(sb, copy_alloc_detail_to_user, &args) ?:
+	       args.copied;
+}
+
+static long scoutfs_ioc_move_blocks(struct file *file, unsigned long arg)
+{
+	struct inode *to = file_inode(file);
+	struct super_block *sb = to->i_sb;
+	struct scoutfs_ioctl_move_blocks __user *umb = (void __user *)arg;
+	struct scoutfs_ioctl_move_blocks mb;
+	struct file *from_file;
+	struct inode *from;
+	int ret;
+
+	if (copy_from_user(&mb, umb, sizeof(mb)))
+		return -EFAULT;
+
+	if (mb.len == 0)
+		return 0;
+
+	if (mb.from_off + mb.len < mb.from_off ||
+	    mb.to_off + mb.len < mb.to_off)
+		return -EOVERFLOW;
+
+	from_file = fget(mb.from_fd);
+	if (!from_file)
+		return -EBADF;
+	from = file_inode(from_file);
+
+	if (from == to) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (from->i_sb != sb) {
+		ret = -EXDEV;
+		goto out;
+	}
+
+	ret = mnt_want_write_file(file);
+	if (ret < 0)
+		goto out;
+
+	ret = scoutfs_data_move_blocks(from, mb.from_off, mb.len,
+				       to, mb.to_off);
+	mnt_drop_write_file(file);
+out:
+	fput(from_file);
+
+	return ret;
+}
+
 long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -887,12 +1004,16 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return scoutfs_ioc_setattr_more(file, arg);
 	case SCOUTFS_IOC_LISTXATTR_HIDDEN:
 		return scoutfs_ioc_listxattr_hidden(file, arg);
-	case SCOUTFS_IOC_FIND_XATTRS:
-		return scoutfs_ioc_find_xattrs(file, arg);
+	case SCOUTFS_IOC_SEARCH_XATTRS:
+		return scoutfs_ioc_search_xattrs(file, arg);
 	case SCOUTFS_IOC_STATFS_MORE:
 		return scoutfs_ioc_statfs_more(file, arg);
 	case SCOUTFS_IOC_DATA_WAIT_ERR:
 		return scoutfs_ioc_data_wait_err(file, arg);
+	case SCOUTFS_IOC_ALLOC_DETAIL:
+		return scoutfs_ioc_alloc_detail(file, arg);
+	case SCOUTFS_IOC_MOVE_BLOCKS:
+		return scoutfs_ioc_move_blocks(file, arg);
 	}

 	return -ENOTTY;
--- a/kmod/src/ioctl.h
+++ b/kmod/src/ioctl.h
@@ -78,7 +78,7 @@ struct scoutfs_ioctl_walk_inodes {
 	__u8 _pad[11]; /* padded to align walk_inodes_entry total size */
 };

-enum {
+enum scoutfs_ino_walk_seq_type {
 	SCOUTFS_IOC_WALK_INODES_META_SEQ = 0,
 	SCOUTFS_IOC_WALK_INODES_DATA_SEQ,
 	SCOUTFS_IOC_WALK_INODES_UNKNOWN,
@@ -176,8 +176,8 @@ struct scoutfs_ioctl_ino_path_result {
 * an offline record is left behind to trigger demand staging if the
 * file is read.
 *
- * The starting block offset and number of blocks to release are in
- * units 4KB blocks.
+ * The starting file offset and number of bytes to release must be in
+ * multiples of 4KB.
 *
 * The specified range can extend past i_size and can straddle sparse
 * regions or blocks that are already offline.  The only change it makes
@@ -193,8 +193,8 @@ struct scoutfs_ioctl_ino_path_result {
 * presentation of the data in the file.
 */
 struct scoutfs_ioctl_release {
-	__u64 block;
-	__u64 count;
+	__u64 offset;
+	__u64 length;
 	__u64 data_version;
 };

@@ -205,7 +205,7 @@ struct scoutfs_ioctl_stage {
 	__u64 data_version;
 	__u64 buf_ptr;
 	__u64 offset;
-	__s32 count;
+	__s32 length;
 	__u32 _pad;
 };

@@ -296,34 +296,57 @@ struct scoutfs_ioctl_listxattr_hidden {

 /*
 * Return the inode numbers of inodes which might contain the given
- * named xattr.  The inode may not have a set xattr with that name, the
- * caller must check the returned inodes to see if they match.
+ * xattr.  The inode may not have a set xattr with that name, the caller
+ * must check the returned inodes to see if they match.
 *
 * @next_ino: The next inode number that could be returned.  Initialized
 * to 0 when first searching and set to one past the last inode number
 * returned to continue searching.
- * @name_ptr: The address of the name of the xattr to search for.  It does
- * not need to be null terminated.
- * @inodes_ptr: The address of the array of uint64_t inode numbers in which
- * to store inode numbers that may contain the xattr.  EFAULT may be returned
- * if this address is not naturally aligned.
- * @name_bytes: The number of non-null bytes found in the name at name_ptr.
+ * @last_ino: The last inode number that could be returned.  U64_MAX to
+ * find all inodes.
+ * @name_ptr: The address of the name of the xattr to search for.  It is
+ * not null terminated.
+ * @inodes_ptr: The address of the array of uint64_t inode numbers in
+ * which to store inode numbers that may contain the xattr.  EFAULT may
+ * be returned if this address is not naturally aligned.
+ * @output_flags: Set as success is returned.  If an error is returned
+ * then this field is undefined and should not be read.
 * @nr_inodes: The number of elements in the array found at inodes_ptr.
+ * @name_bytes: The number of non-null bytes found in the name at
+ * name_ptr.
 *
 * This requires the CAP_SYS_ADMIN capability and will return -EPERM if
 * it's not granted.
+ *
+ * The number of inode numbers stored in the inodes_ptr array is
+ * returned.  If nr_inodes is 0 or last_ino is less than next_ino then 0
+ * will be immediately returned.
+ *
+ * Partial progress can be returned if an error is hit or if nr_inodes
+ * was larger than the internal limit on the number of inodes returned
+ * in a search pass.  The _END output flag is set if all the results
+ * including last_ino were searched in this pass.
+ *
+ * It's valuable to provide a large inodes array so that all the results
+ * can be found in one search pass and _END can be set.  There are
+ * significant constant costs for performing each search pass.
 */
-struct scoutfs_ioctl_find_xattrs {
+struct scoutfs_ioctl_search_xattrs {
 	__u64 next_ino;
+	__u64 last_ino;
 	__u64 name_ptr;
 	__u64 inodes_ptr;
+	__u64 output_flags;
+	__u64 nr_inodes;
 	__u16 name_bytes;
-	__u16 nr_inodes;
-	__u8 _pad[4];
+	__u8 _pad[6];
 };

-#define SCOUTFS_IOC_FIND_XATTRS _IOR(SCOUTFS_IOCTL_MAGIC, 9, \
-				     struct scoutfs_ioctl_find_xattrs)
+/* set in output_flags if returned inodes reached last_ino */
+#define SCOUTFS_SEARCH_XATTRS_OFLAG_END (1ULL << 0)
+
+#define SCOUTFS_IOC_SEARCH_XATTRS _IOR(SCOUTFS_IOCTL_MAGIC, 9, \
+				     struct scoutfs_ioctl_search_xattrs)

 /*
 * Give the user information about the filesystem.
@@ -335,13 +358,20 @@ struct scoutfs_ioctl_find_xattrs {
 * field is set if all of its bytes are within the valid_bytes that the
 * kernel set on return.
 *
+ * @committed_seq: All seqs up to and including this seq have been
+ * committed.  Can be compared with meta_seq and data_seq from inodes in
+ * stat_more to discover if changes have been committed to disk.
+ *
 * New fields are only added to the end of the struct.
 */
 struct scoutfs_ioctl_statfs_more {
 	__u64 valid_bytes;
 	__u64 fsid;
 	__u64 rid;
-} __packed;
+	__u64 committed_seq;
+	__u64 total_meta_blocks;
+	__u64 total_data_blocks;
+};

 #define SCOUTFS_IOC_STATFS_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 10, \
 				     struct scoutfs_ioctl_statfs_more)
@@ -364,4 +394,74 @@ struct scoutfs_ioctl_data_wait_err {
 #define SCOUTFS_IOC_DATA_WAIT_ERR _IOR(SCOUTFS_IOCTL_MAGIC, 11, \
 				       struct scoutfs_ioctl_data_wait_err)

+
+struct scoutfs_ioctl_alloc_detail {
+	__u64 entries_ptr;
+	__u64 entries_nr;
+};
+
+struct scoutfs_ioctl_alloc_detail_entry {
+	__u64 id;
+	__u64 blocks;
+	__u8 type;
+	__u8 meta:1,
+	     avail:1;
+	__u8 __bit_pad:6;
+	__u8 __pad[6];
+};
+
+#define SCOUTFS_IOC_ALLOC_DETAIL _IOR(SCOUTFS_IOCTL_MAGIC, 12, \
+				      struct scoutfs_ioctl_alloc_detail)
+
+/*
+ * Move extents from one regular file to another at a different offset,
+ * on the same file system.
+ *
+ * from_fd specifies the source file and the ioctl is called on the
+ * destination file.  Both files must have write access.  from_off
+ * specifies the byte offset in the source, to_off is the byte offset in
+ * the destination, and len is the number of bytes in the region to
+ * move.   All of the offsets and lengths must be in multiples of 4KB,
+ * except in the case where the from_off + len ends at the i_size of the
+ * source file.
+ *
+ * This interface only moves extents which are block granular, it does
+ * not perform RMW of sub-block byte extents and it does not overwrite
+ * existing extents in the destination.  It will split extents in the
+ * source.
+ *
+ * Only extents within i_size on the source are moved.  The destination
+ * i_size will be updated if extents are moved beyond its current
+ * i_size.  The i_size update will maintain final partial blocks in the
+ * source.
+ *
+ * It will return an error if either of the files have offline extents.
+ * It will return 0 when all of the extents in the source region have
+ * been moved to the destination.  Moving extents updates the ctime,
+ * mtime, meta_seq, data_seq, and data_version fields of both the source
+ * and destination inodes.  If an error is returned then partial
+ * progress may have been made and inode fields may have been updated.
+ *
+ * Errors specific to this interface include:
+ *
+ * EINVAL: from_off, len, or to_off aren't a multiple of 4KB; the source
+ *	   and destination files are the same inode; either the source or
+ *	   destination is not a regular file; the destination file has
+ *	   an existing overlapping extent.
+ * EOVERFLOW: either from_off + len or to_off + len exceeded 64bits.
+ * EBADF: from_fd isn't a valid open file descriptor.
+ * EXDEV: the source and destination files are in different filesystems.
+ * EISDIR: either the source or destination is a directory.
+ * ENODATA: either the source or destination file have offline extents.
+ */
+struct scoutfs_ioctl_move_blocks {
+	__u64 from_fd;
+	__u64 from_off;
+	__u64 len;
+	__u64 to_off;
+};
+
+#define SCOUTFS_IOC_MOVE_BLOCKS _IOR(SCOUTFS_IOCTL_MAGIC, 13, \
+				     struct scoutfs_ioctl_move_blocks)
+
 #endif
--- a/kmod/src/item.c
+++ b/kmod/src/item.c
--- a/kmod/src/item.h
+++ b/kmod/src/item.h
@@ -0,0 +1,39 @@
+#ifndef _SCOUTFS_ITEM_H_
+#define _SCOUTFS_ITEM_H_
+
+int scoutfs_item_lookup(struct super_block *sb, struct scoutfs_key *key,
+			void *val, int val_len, struct scoutfs_lock *lock);
+int scoutfs_item_lookup_exact(struct super_block *sb, struct scoutfs_key *key,
+			      void *val, int val_len,
+			      struct scoutfs_lock *lock);
+int scoutfs_item_next(struct super_block *sb, struct scoutfs_key *key,
+		      struct scoutfs_key *last, void *val, int val_len,
+		      struct scoutfs_lock *lock);
+int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key,
+		       struct scoutfs_lock *lock);
+int scoutfs_item_create(struct super_block *sb, struct scoutfs_key *key,
+			void *val, int val_len, struct scoutfs_lock *lock);
+int scoutfs_item_create_force(struct super_block *sb, struct scoutfs_key *key,
+			      void *val, int val_len,
+			      struct scoutfs_lock *lock);
+int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key,
+			void *val, int val_len, struct scoutfs_lock *lock);
+int scoutfs_item_delete(struct super_block *sb, struct scoutfs_key *key,
+			  struct scoutfs_lock *lock);
+int scoutfs_item_delete_force(struct super_block *sb,
+				struct scoutfs_key *key,
+				struct scoutfs_lock *lock);
+
+u64 scoutfs_item_dirty_pages(struct super_block *sb);
+int scoutfs_item_write_dirty(struct super_block *sb);
+int scoutfs_item_write_done(struct super_block *sb);
+bool scoutfs_item_range_cached(struct super_block *sb,
+			       struct scoutfs_key *start,
+			       struct scoutfs_key *end, bool *dirty);
+void scoutfs_item_invalidate(struct super_block *sb, struct scoutfs_key *start,
+			     struct scoutfs_key *end);
+
+int scoutfs_item_setup(struct super_block *sb);
+void scoutfs_item_destroy(struct super_block *sb);
+
+#endif
--- a/kmod/src/key.h
+++ b/kmod/src/key.h
@@ -78,6 +78,14 @@ static inline void scoutfs_key_set_zeros(struct scoutfs_key *key)
 	key->_sk_second = 0;
 	key->_sk_third = 0;
 	key->_sk_fourth = 0;
+	memset(key->__pad, 0, sizeof(key->__pad));
+}
+
+static inline bool scoutfs_key_is_zeros(struct scoutfs_key *key)
+{
+	return key->sk_zone == 0 && key->_sk_first == 0 && key->sk_type == 0 &&
+	       key->_sk_second == 0 && key->_sk_third == 0 &&
+	       key->_sk_fourth == 0;
 }

 static inline void scoutfs_key_copy_or_zeros(struct scoutfs_key *dst,
@@ -97,6 +105,7 @@ static inline void scoutfs_key_set_ones(struct scoutfs_key *key)
 	key->_sk_second = cpu_to_le64(U64_MAX);
 	key->_sk_third = cpu_to_le64(U64_MAX);
 	key->_sk_fourth = U8_MAX;
+	memset(key->__pad, 0, sizeof(key->__pad));
 }

 /*
@@ -179,29 +188,19 @@ static inline void scoutfs_key_dec(struct scoutfs_key *key)
 	key->sk_zone--;
 }

-static inline void scoutfs_key_to_be(struct scoutfs_key_be *be,
-				     struct scoutfs_key *key)
-{
-	BUILD_BUG_ON(sizeof(struct scoutfs_key_be) !=
-		     sizeof(struct scoutfs_key));
+/*
+ * Some key types are used by multiple subsystems and shouldn't have
+ * duplicate private key init functions.
+ */

-	be->sk_zone = key->sk_zone;
-	be->_sk_first = le64_to_be64(key->_sk_first);
-	be->sk_type = key->sk_type;
-	be->_sk_second = le64_to_be64(key->_sk_second);
-	be->_sk_third = le64_to_be64(key->_sk_third);
-	be->_sk_fourth = key->_sk_fourth;
-}
-
-static inline void scoutfs_key_from_be(struct scoutfs_key *key,
-				       struct scoutfs_key_be *be)
+static inline void scoutfs_key_init_log_trees(struct scoutfs_key *key,
+					      u64 rid, u64 nr)
 {
-	key->sk_zone = be->sk_zone;
-	key->_sk_first = be64_to_le64(be->_sk_first);
-	key->sk_type = be->sk_type;
-	key->_sk_second = be64_to_le64(be->_sk_second);
-	key->_sk_third = be64_to_le64(be->_sk_third);
-	key->_sk_fourth = be->_sk_fourth;
+	*key = (struct scoutfs_key) {
+		.sk_zone = SCOUTFS_LOG_TREES_ZONE,
+		.sklt_rid = cpu_to_le64(rid),
+		.sklt_nr = cpu_to_le64(nr),
+	};
 }

 #endif
--- a/kmod/src/kvec.h
+++ b/kmod/src/kvec.h
@@ -1,12 +0,0 @@
-#ifndef _SCOUTFS_KVEC_H_
-#define _SCOUTFS_KVEC_H_
-
-#include <linux/uio.h>
-
-static inline void kvec_init(struct kvec *kv, void *base, size_t len)
-{
-	kv->iov_base = base;
-	kv->iov_len = len;
-}
-
-#endif
--- a/kmod/src/lock.c
+++ b/kmod/src/lock.c
@@ -21,7 +21,6 @@

 #include "super.h"
 #include "lock.h"
-#include "forest.h"
 #include "scoutfs_trace.h"
 #include "msg.h"
 #include "cmp.h"
@@ -34,6 +33,7 @@
 #include "client.h"
 #include "data.h"
 #include "xattr.h"
+#include "item.h"

 /*
 * scoutfs uses a lock service to manage item cache consistency between
@@ -65,7 +65,7 @@
 * relative to that lock state we resend.
 */

-#define GRACE_PERIOD_KT	ms_to_ktime(2)
+#define GRACE_PERIOD_KT	ms_to_ktime(10)

 /*
 * allocated per-super, freed on unmount.
@@ -80,6 +80,12 @@ struct lock_info {
 	struct list_head lru_list;
 	unsigned long long lru_nr;
 	struct workqueue_struct *workq;
+	struct work_struct grant_work;
+	struct list_head grant_list;
+	struct delayed_work inv_dwork;
+	struct list_head inv_list;
+	struct work_struct shrink_work;
+	struct list_head shrink_list;
 	atomic64_t next_refresh_gen;
 	struct dentry *tseq_dentry;
 	struct scoutfs_tseq_tree tseq_tree;
@@ -88,19 +94,17 @@ struct lock_info {
 #define DECLARE_LOCK_INFO(sb, name) \
 	struct lock_info *name = SCOUTFS_SB(sb)->lock_info

-static void scoutfs_lock_shrink_worker(struct work_struct *work);
-
-static bool lock_mode_invalid(int mode)
+static bool lock_mode_invalid(enum scoutfs_lock_mode mode)
 {
 	return (unsigned)mode >= SCOUTFS_LOCK_INVALID;
 }

-static bool lock_mode_can_read(int mode)
+static bool lock_mode_can_read(enum scoutfs_lock_mode mode)
 {
 	return mode == SCOUTFS_LOCK_READ || mode == SCOUTFS_LOCK_WRITE;
 }

-static bool lock_mode_can_write(int mode)
+static bool lock_mode_can_write(enum scoutfs_lock_mode mode)
 {
 	return mode == SCOUTFS_LOCK_WRITE || mode == SCOUTFS_LOCK_WRITE_ONLY;
 }
@@ -143,7 +147,7 @@ static void invalidate_inode(struct super_block *sb, u64 ino)
 * leave cached items behind in the case of invalidating to a read lock.
 */
 static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,
-			   int prev, int mode)
+			   enum scoutfs_lock_mode prev, enum scoutfs_lock_mode mode)
 {
 	struct scoutfs_lock_coverage *cov;
 	struct scoutfs_lock_coverage *tmp;
@@ -156,15 +160,13 @@ static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,
 	BUG_ON(!(prev == SCOUTFS_LOCK_WRITE && mode == SCOUTFS_LOCK_READ) &&
 	         mode != SCOUTFS_LOCK_NULL);

-	/* any transition from a mode allowed to dirty items has to write */
-	if (lock_mode_can_write(prev) && scoutfs_trans_has_dirty(sb)) {
+	/* sync when a write lock could have dirtied the current transaction */
+	if (lock_mode_can_write(prev) &&
+	    (lock->dirty_trans_seq == scoutfs_trans_sample_seq(sb))) {
+		scoutfs_inc_counter(sb, lock_invalidate_sync);
 		ret = scoutfs_trans_sync(sb, 1);
 		if (ret < 0)
 			return ret;
-		if (ret > 0) {
-			scoutfs_add_counter(sb, lock_invalidate_commit, ret);
-			ret = 0;
-		}
 	}

 	/* have to invalidate if we're not in the only usable case */
@@ -193,6 +195,8 @@ retry:
 				ino++;
 			}
 		}
+
+		scoutfs_item_invalidate(sb, &lock->start, &lock->end);
 	}

 	return ret;
@@ -220,9 +224,11 @@ static void lock_free(struct lock_info *linfo, struct scoutfs_lock *lock)
 	BUG_ON(!RB_EMPTY_NODE(&lock->node));
 	BUG_ON(!RB_EMPTY_NODE(&lock->range_node));
 	BUG_ON(!list_empty(&lock->lru_head));
+	BUG_ON(!list_empty(&lock->grant_head));
+	BUG_ON(!list_empty(&lock->inv_head));
+	BUG_ON(!list_empty(&lock->shrink_head));
 	BUG_ON(!list_empty(&lock->cov_list));

-	scoutfs_forest_clear_lock(sb, lock);
 	kfree(lock);
 }

@@ -245,7 +251,9 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb,
 	RB_CLEAR_NODE(&lock->node);
 	RB_CLEAR_NODE(&lock->range_node);
 	INIT_LIST_HEAD(&lock->lru_head);
-
+	INIT_LIST_HEAD(&lock->grant_head);
+	INIT_LIST_HEAD(&lock->inv_head);
+	INIT_LIST_HEAD(&lock->shrink_head);
 	spin_lock_init(&lock->cov_list_lock);
 	INIT_LIST_HEAD(&lock->cov_list);

@@ -253,21 +261,22 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb,
 	lock->end = *end;
 	lock->sb = sb;
 	init_waitqueue_head(&lock->waitq);
-	INIT_WORK(&lock->shrink_work, scoutfs_lock_shrink_worker);
 	lock->mode = SCOUTFS_LOCK_NULL;

+	atomic64_set(&lock->forest_bloom_nr, 0);
+
 	trace_scoutfs_lock_alloc(sb, lock);

 	return lock;
 }

-static void lock_inc_count(unsigned int *counts, int mode)
+static void lock_inc_count(unsigned int *counts, enum scoutfs_lock_mode mode)
 {
 	BUG_ON(mode < 0 || mode >= SCOUTFS_LOCK_NR_MODES);
 	counts[mode]++;
 }

-static void lock_dec_count(unsigned int *counts, int mode)
+static void lock_dec_count(unsigned int *counts, enum scoutfs_lock_mode mode)
 {
 	BUG_ON(mode < 0 || mode >= SCOUTFS_LOCK_NR_MODES);
 	counts[mode]--;
@@ -279,7 +288,7 @@ static void lock_dec_count(unsigned int *counts, int mode)
 */
 static bool lock_counts_match(int granted, unsigned int *counts)
 {
-	int mode;
+	enum scoutfs_lock_mode mode;

 	for (mode = 0; mode < SCOUTFS_LOCK_NR_MODES; mode++) {
 		if (counts[mode] && !lock_modes_match(granted, mode))
@@ -296,7 +305,7 @@ static bool lock_counts_match(int granted, unsigned int *counts)
 */
 static bool lock_count_match_exists(int desired, unsigned int *counts)
 {
-	int mode;
+	enum scoutfs_lock_mode mode;

 	for (mode = 0; mode < SCOUTFS_LOCK_NR_MODES; mode++) {
 		if (counts[mode] && lock_modes_match(desired, mode))
@@ -312,7 +321,7 @@ static bool lock_count_match_exists(int desired, unsigned int *counts)
 */
 static bool lock_idle(struct scoutfs_lock *lock)
 {
-	int mode;
+	enum scoutfs_lock_mode mode;

 	if (lock->request_pending || lock->invalidate_pending)
 		return false;
@@ -540,11 +549,80 @@ static void extend_grace(struct super_block *sb, struct scoutfs_lock *lock)
 	lock->grace_deadline = ktime_add(now, GRACE_PERIOD_KT);
 }

+static void queue_grant_work(struct lock_info *linfo)
+{
+	assert_spin_locked(&linfo->lock);
+
+	if (!list_empty(&linfo->grant_list) && !linfo->shutdown)
+		queue_work(linfo->workq, &linfo->grant_work);
+}
+
 /*
- * The client is receiving a lock response message from the server.
- * This can be reordered with incoming invlidation requests from the
- * server so we have to be careful to only set the new mode once the old
- * mode matches.
+ * We immediately queue work on the assumption that the caller might
+ * have made a change (set a lock mode) which can let one of the
+ * invalidating locks make forward progress, even if other locks are
+ * waiting for their grace period to elapse.  It's a trade-off between
+ * invalidation latency and burning cpu repeatedly finding that locks
+ * are still in their grace period.
+ */
+static void queue_inv_work(struct lock_info *linfo)
+{
+	assert_spin_locked(&linfo->lock);
+
+	if (!list_empty(&linfo->inv_list) && !linfo->shutdown)
+		mod_delayed_work(linfo->workq, &linfo->inv_dwork, 0);
+}
+
+/*
+ * The given lock is processing a received a grant response.  Trigger a
+ * bug if the cache is inconsistent.
+ *
+ * We only have two modes that can create dirty items.  We can't have
+ * dirty items when transitioning from write_only to write because the
+ * writer can't trust the cached items in the cache for reading.  And we
+ * don't currently transition directly from write to write_only, we
+ * first go through null.  So if we have dirty items as we're granted a
+ * mode it's always incorrect.
+ *
+ * And we can't have cached items that we're going to use for reading if
+ * the previous mode didn't allow reading.
+ *
+ * Inconsistencies have come from all sorts of bugs: invalidation missed
+ * items, the cache was populated outside of locking coverage, lock
+ * holders performed the wrong item operations under their lock,
+ * overlapping locks, out of order granting or invalidating, etc.
+ */
+static void bug_on_inconsistent_grant_cache(struct super_block *sb,
+					    struct scoutfs_lock *lock,
+					    int old_mode, int new_mode)
+{
+	bool cached;
+	bool dirty;
+
+	cached = scoutfs_item_range_cached(sb, &lock->start, &lock->end,
+					   &dirty);
+	if (dirty ||
+	    (cached && (!lock_mode_can_read(old_mode) ||
+			!lock_mode_can_read(new_mode)))) {
+		scoutfs_err(sb, "granted lock item cache inconsistency, cached %u dirty %u old_mode %d new_mode %d: start "SK_FMT" end "SK_FMT" refresh_gen %llu mode %u waiters: rd %u wr %u wo %u users: rd %u wr %u wo %u",
+			   cached, dirty, old_mode, new_mode, SK_ARG(&lock->start),
+			   SK_ARG(&lock->end), lock->refresh_gen, lock->mode,
+			   lock->waiters[SCOUTFS_LOCK_READ],
+			   lock->waiters[SCOUTFS_LOCK_WRITE],
+			   lock->waiters[SCOUTFS_LOCK_WRITE_ONLY],
+			   lock->users[SCOUTFS_LOCK_READ],
+			   lock->users[SCOUTFS_LOCK_WRITE],
+			   lock->users[SCOUTFS_LOCK_WRITE_ONLY]);
+		BUG();
+	}
+}
+
+/*
+ * Each lock has received a grant response message from the server.
+ *
+ * Grant responses can be reordered with incoming invalidation requests
+ * from the server so we have to be careful to only set the new mode
+ * once the old mode matches.
 *
 * We extend the grace period as we grant the lock if there is a waiting
 * locker who can use the lock.  This stops invalidation from pulling
@@ -555,9 +633,65 @@ static void extend_grace(struct super_block *sb, struct scoutfs_lock *lock)
 * against the invalidation.  In that case they'd extend the grace
 * period anyway as they unlock.
 */
-int scoutfs_lock_grant_response(struct super_block *sb,
-				struct scoutfs_net_lock *nl)
+static void lock_grant_worker(struct work_struct *work)
 {
+	struct lock_info *linfo = container_of(work, struct lock_info,
+					       grant_work);
+	struct super_block *sb = linfo->sb;
+	struct scoutfs_net_lock_grant_response *gr;
+	struct scoutfs_net_lock *nl;
+	struct scoutfs_lock *lock;
+	struct scoutfs_lock *tmp;
+
+	scoutfs_inc_counter(sb, lock_grant_work);
+
+	spin_lock(&linfo->lock);
+
+	list_for_each_entry_safe(lock, tmp, &linfo->grant_list, grant_head) {
+		gr = &lock->grant_resp;
+		nl = &lock->grant_resp.nl;
+
+		/* wait for reordered invalidation to finish */
+		if (lock->mode != nl->old_mode)
+			continue;
+
+		bug_on_inconsistent_grant_cache(sb, lock, nl->old_mode,
+						nl->new_mode);
+
+		if (!lock_mode_can_read(nl->old_mode) &&
+		    lock_mode_can_read(nl->new_mode)) {
+			lock->refresh_gen =
+				atomic64_inc_return(&linfo->next_refresh_gen);
+		}
+
+		lock->request_pending = 0;
+		lock->mode = nl->new_mode;
+		lock->write_version = le64_to_cpu(nl->write_version);
+		lock->roots = gr->roots;
+
+		if (lock_count_match_exists(nl->new_mode, lock->waiters))
+			extend_grace(sb, lock);
+
+		trace_scoutfs_lock_granted(sb, lock);
+		list_del_init(&lock->grant_head);
+		wake_up(&lock->waitq);
+		put_lock(linfo, lock);
+	}
+
+	/* invalidations might be waiting for our reordered grant */
+	queue_inv_work(linfo);
+	spin_unlock(&linfo->lock);
+}
+
+/*
+ * The client is receiving a grant response message from the server.  We
+ * find the lock, record the response, and add it to the list for grant
+ * work to process.
+ */
+int scoutfs_lock_grant_response(struct super_block *sb,
+				struct scoutfs_net_lock_grant_response *gr)
+{
+	struct scoutfs_net_lock *nl = &gr->nl;
 	DECLARE_LOCK_INFO(sb, linfo);
 	struct scoutfs_lock *lock;

@@ -568,34 +702,12 @@ int scoutfs_lock_grant_response(struct super_block *sb,
 	/* lock must already be busy with request_pending */
 	lock = lock_lookup(sb, &nl->key, NULL);
 	BUG_ON(!lock);
+	trace_scoutfs_lock_grant_response(sb, lock);
 	BUG_ON(!lock->request_pending);

-	trace_scoutfs_lock_grant_response(sb, lock);
-
-	/* resolve unlikely work reordering with invalidation request */
-	while (lock->mode != nl->old_mode) {
-		spin_unlock(&linfo->lock);
-		/* implicit read barrier from waitq locks */
-		wait_event(lock->waitq, lock->mode == nl->old_mode);
-		spin_lock(&linfo->lock);
-	}
-
-	if (!lock_mode_can_read(nl->old_mode) &&
-	    lock_mode_can_read(nl->new_mode)) {
-		lock->refresh_gen =
-			atomic64_inc_return(&linfo->next_refresh_gen);
-	}
-
-	lock->request_pending = 0;
-	lock->mode = nl->new_mode;
-	lock->write_version = le64_to_cpu(nl->write_version);
-
-	if (lock_count_match_exists(nl->new_mode, lock->waiters))
-		extend_grace(sb, lock);
-
-	trace_scoutfs_lock_granted(sb, lock);
-	wake_up(&lock->waitq);
-	put_lock(linfo, lock);
+	lock->grant_resp = *gr;
+	list_add_tail(&lock->grant_head, &linfo->grant_list);
+	queue_grant_work(linfo);

 	spin_unlock(&linfo->lock);

@@ -603,34 +715,9 @@ int scoutfs_lock_grant_response(struct super_block *sb,
 }

 /*
- * Invalidation waits until the old mode indicates that we've resolved
- * unlikely races with reordered grant responses from the server and
- * until the new mode satisfies active users.
- *
- * Once it's safe to proceed we set the lock mode here under the lock to
- * prevent additional users of the old mode while we're invalidating.
- */
-static bool lock_invalidate_safe(struct lock_info *linfo,
-				 struct scoutfs_lock *lock,
-				 int old_mode, int new_mode)
-{
-	bool safe;
-
-	spin_lock(&linfo->lock);
-	safe = (lock->mode == old_mode) &&
-	       lock_counts_match(new_mode, lock->users);
-	if (safe)
-		lock->mode = new_mode;
-	spin_unlock(&linfo->lock);
-
-	return safe;
-}
-
-/*
- * The client is receiving a lock invalidation request from the server
+ * Each lock has received a lock invalidation request from the server
 * which specifies a new mode for the lock.  The server will only send
- * one invalidation request at a time.  This is executing in a blocking
- * net receive work context.
+ * one invalidation request at a time for each lock.
 *
 * This is an unsolicited request from the server so it can arrive at
 * any time after we make the server aware of the lock by initially
@@ -647,70 +734,134 @@ static bool lock_invalidate_safe(struct lock_info *linfo,
 * invalidate once the lock mode matches what the server told us to
 * invalidate.
 *
- * We delay invalidation processing until a grace period has elapsed since
- * the last unlock.  The intent is to let users do a reasonable batch of
- * work before dropping the lock.  Continuous unlocking can continuously
- * extend the deadline.
+ * We delay invalidation processing until a grace period has elapsed
+ * since the last unlock.  The intent is to let users do a reasonable
+ * batch of work before dropping the lock.  Continuous unlocking can
+ * continuously extend the deadline.
+ *
+ * Before we start invalidating the lock we set the lock to the new
+ * mode, preventing further incompatible users of the old mode from
+ * using the lock while we're invalidating.
+ *
+ * This does a lot of serialized inode invalidation in one context and
+ * performs a lot of repeated calls to sync.  It would be nice to get
+ * some concurrent inode invalidation and to more carefully only call
+ * sync when needed.
+ */
+static void lock_invalidate_worker(struct work_struct *work)
+{
+	struct lock_info *linfo = container_of(work, struct lock_info,
+					       inv_dwork.work);
+	struct super_block *sb = linfo->sb;
+	struct scoutfs_net_lock *nl;
+	struct scoutfs_lock *lock;
+	struct scoutfs_lock *tmp;
+	unsigned long delay = MAX_JIFFY_OFFSET;
+	ktime_t now = ktime_get();
+	ktime_t deadline;
+	LIST_HEAD(ready);
+	u64 net_id;
+	int ret;
+
+	scoutfs_inc_counter(sb, lock_invalidate_work);
+
+	spin_lock(&linfo->lock);
+
+	list_for_each_entry_safe(lock, tmp, &linfo->inv_list, inv_head) {
+		nl = &lock->inv_nl;
+
+		/* wait for reordered grant to finish */
+		if (lock->mode != nl->old_mode)
+			continue;
+
+		/* wait until incompatible holders unlock */
+		if (!lock_counts_match(nl->new_mode, lock->users))
+			continue;
+
+		/* skip if grace hasn't elapsed, record earliest */
+		deadline = lock->grace_deadline;
+		if (!linfo->shutdown && ktime_before(now, deadline)) {
+			delay = min(delay,
+				    nsecs_to_jiffies(ktime_to_ns(
+						ktime_sub(deadline, now))));
+			scoutfs_inc_counter(linfo->sb, lock_grace_wait);
+			continue;
+		}
+		/* set the new mode, no incompatible users during inval */
+		lock->mode = nl->new_mode;
+
+		/* move everyone that's ready to our private list */
+		list_move_tail(&lock->inv_head, &ready);
+	}
+
+	spin_unlock(&linfo->lock);
+
+	if (list_empty(&ready))
+		goto out;
+
+	/* invalidate once the lock is read */
+	list_for_each_entry(lock, &ready, inv_head) {
+		nl = &lock->inv_nl;
+		net_id = lock->inv_net_id;
+
+		ret = lock_invalidate(sb, lock, nl->old_mode, nl->new_mode);
+		BUG_ON(ret);
+
+		/* respond with the key and modes from the request */
+		ret = scoutfs_client_lock_response(sb, net_id, nl);
+		BUG_ON(ret);
+
+		scoutfs_inc_counter(sb, lock_invalidate_response);
+	}
+
+	/* and finish all the invalidated locks */
+	spin_lock(&linfo->lock);
+
+	list_for_each_entry_safe(lock, tmp, &ready, inv_head) {
+		list_del_init(&lock->inv_head);
+
+		lock->invalidate_pending = 0;
+		trace_scoutfs_lock_invalidated(sb, lock);
+		wake_up(&lock->waitq);
+		put_lock(linfo, lock);
+	}
+
+	/* grant might have been waiting for invalidate request */
+	queue_grant_work(linfo);
+	spin_unlock(&linfo->lock);
+
+out:
+	/* queue delayed work if invalidations waiting on grace deadline */
+	if (delay != MAX_JIFFY_OFFSET)
+		queue_delayed_work(linfo->workq, &linfo->inv_dwork, delay);
+}
+
+/*
+ * Record an incoming invalidate request from the server and add its lock
+ * to the list for processing.
+ *
+ * This is trusting the server and will crash if it's sent bad requests :/
 */
 int scoutfs_lock_invalidate_request(struct super_block *sb, u64 net_id,
 				    struct scoutfs_net_lock *nl)
 {
 	DECLARE_LOCK_INFO(sb, linfo);
 	struct scoutfs_lock *lock;
-	ktime_t deadline;
-	bool grace_waited = false;
-	int ret;

 	scoutfs_inc_counter(sb, lock_invalidate_request);

 	spin_lock(&linfo->lock);
 	lock = get_lock(sb, &nl->key);
-	if (lock) {
-		BUG_ON(lock->invalidate_pending); /* XXX trusting server :/ */
-		lock->invalidate_pending = 1;
-		deadline = lock->grace_deadline;
-		trace_scoutfs_lock_invalidate_request(sb, lock);
-	}
-	spin_unlock(&linfo->lock);
-
 	BUG_ON(!lock);
-
-	/* wait for a grace period after the most recent unlock */
-	while (ktime_before(ktime_get(), deadline)) {
-		grace_waited = true;
-		scoutfs_inc_counter(linfo->sb, lock_grace_wait);
-		set_current_state(TASK_UNINTERRUPTIBLE);
-                schedule_hrtimeout(&deadline, HRTIMER_MODE_ABS);
-
-		spin_lock(&linfo->lock);
-		deadline = lock->grace_deadline;
-		spin_unlock(&linfo->lock);
+	if (lock) {
+		BUG_ON(lock->invalidate_pending);
+		lock->invalidate_pending = 1;
+		lock->inv_nl = *nl;
+		lock->inv_net_id = net_id;
+		list_add_tail(&lock->inv_head, &linfo->inv_list);
+		trace_scoutfs_lock_invalidate_request(sb, lock);
+		queue_inv_work(linfo);
 	}
-
-	if (grace_waited)
-		scoutfs_inc_counter(linfo->sb, lock_grace_elapsed);
-
-	/* sets the lock mode to prevent use of old mode during invalidate */
-	wait_event(lock->waitq, lock_invalidate_safe(linfo, lock, nl->old_mode,
-						     nl->new_mode));
-
-	ret = lock_invalidate(sb, lock, nl->old_mode, nl->new_mode);
-	BUG_ON(ret);
-
-	/* respond with the key and modes from the request */
-	ret = scoutfs_client_lock_response(sb, net_id, nl);
-	BUG_ON(ret);
-
-	scoutfs_inc_counter(sb, lock_invalidate_response);
-
-	spin_lock(&linfo->lock);
-
-	lock->invalidate_pending = 0;
-
-	trace_scoutfs_lock_invalidated(sb, lock);
-	wake_up(&lock->waitq);
-	put_lock(linfo, lock);
-
 	spin_unlock(&linfo->lock);

 	return 0;
@@ -749,6 +900,7 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
 	for (i = 0; lock && i < SCOUTFS_NET_LOCK_MAX_RECOVER_NR; i++) {

 		nlr->locks[i].key = lock->start;
+		nlr->locks[i].write_version = cpu_to_le64(lock->write_version);
 		nlr->locks[i].old_mode = lock->mode;
 		nlr->locks[i].new_mode = lock->mode;

@@ -769,7 +921,7 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
 }

 static bool lock_wait_cond(struct super_block *sb, struct scoutfs_lock *lock,
-			   int mode)
+			   enum scoutfs_lock_mode mode)
 {
 	DECLARE_LOCK_INFO(sb, linfo);
 	bool wake;
@@ -803,7 +955,7 @@ static bool lock_flags_invalid(int flags)
 * won't process our request until it receives our invalidation
 * response.
 */
-static int lock_key_range(struct super_block *sb, int mode, int flags,
+static int lock_key_range(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
 			  struct scoutfs_key *start, struct scoutfs_key *end,
 			  struct scoutfs_lock **ret_lock)
 {
@@ -911,7 +1063,7 @@ out_unlock:
 	return ret;
 }

-int scoutfs_lock_ino(struct super_block *sb, int mode, int flags, u64 ino,
+int scoutfs_lock_ino(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,
 		     struct scoutfs_lock **ret_lock)
 {
 	struct scoutfs_key start;
@@ -936,7 +1088,7 @@ int scoutfs_lock_ino(struct super_block *sb, int mode, int flags, u64 ino,
 * is incremented as new locks are acquired and then indicates that an
 * old inode with a smaller refresh_gen needs to be refreshed.
 */
-int scoutfs_lock_inode(struct super_block *sb, int mode, int flags,
+int scoutfs_lock_inode(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
 		       struct inode *inode, struct scoutfs_lock **lock)
 {
 	int ret;
@@ -999,7 +1151,7 @@ static void swap_arg(void *A, void *B, int size)
 *
 * (pretty great collision with d_lock() here)
 */
-int scoutfs_lock_inodes(struct super_block *sb, int mode, int flags,
+int scoutfs_lock_inodes(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
 			struct inode *a, struct scoutfs_lock **a_lock,
 			struct inode *b, struct scoutfs_lock **b_lock,
 			struct inode *c, struct scoutfs_lock **c_lock,
@@ -1047,7 +1199,7 @@ int scoutfs_lock_inodes(struct super_block *sb, int mode, int flags,
 /*
 * The rename lock is magical because it's global.
 */
-int scoutfs_lock_rename(struct super_block *sb, int mode, int flags,
+int scoutfs_lock_rename(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
 			struct scoutfs_lock **lock)
 {
 	struct scoutfs_key key = {
@@ -1094,7 +1246,7 @@ void scoutfs_lock_get_index_item_range(u8 type, u64 major, u64 ino,
 * Lock the given index item.  We use the index masks to calculate the
 * start and end key values that are covered by the lock.
 */
-int scoutfs_lock_inode_index(struct super_block *sb, int mode,
+int scoutfs_lock_inode_index(struct super_block *sb, enum scoutfs_lock_mode mode,
 			     u8 type, u64 major, u64 ino,
 			     struct scoutfs_lock **ret_lock)
 {
@@ -1106,24 +1258,6 @@ int scoutfs_lock_inode_index(struct super_block *sb, int mode,
 	return lock_key_range(sb, mode, 0, &start, &end, ret_lock);
 }

-/*
- * Today we lock a hash value entirely. If we went to finer grained ino
- * locking as well we'd need to check the manifest to find the next
- * possible ino to lock so that we didn't try to iterate over all of
- * them.
- */
-int scoutfs_lock_xattr_index(struct super_block *sb, int mode, int flags,
-			     u64 hash, struct scoutfs_lock **ret_lock)
-{
-	struct scoutfs_key start;
-	struct scoutfs_key end;
-
-	scoutfs_xattr_index_key(&start, hash, 0, 0);
-	scoutfs_xattr_index_key(&end, hash, U64_MAX, U64_MAX);
-
-	return lock_key_range(sb, mode, flags, &start, &end, ret_lock);
-}
-
 /*
 * The rid lock protects a mount's private persistent items in the rid
 * zone.  It's held for the duration of the mount.  It lets the mount
@@ -1135,7 +1269,7 @@ int scoutfs_lock_xattr_index(struct super_block *sb, int mode, int flags,
 * able to.  Maybe we have a bunch free and they're trying to allocate
 * and are getting ENOSPC.
 */
-int scoutfs_lock_rid(struct super_block *sb, int mode, int flags,
+int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
 		     u64 rid, struct scoutfs_lock **lock)
 {
 	struct scoutfs_key start;
@@ -1156,7 +1290,7 @@ int scoutfs_lock_rid(struct super_block *sb, int mode, int flags,
 * As we unlock we always extend the grace period to give the caller
 * another pass at the lock before its invalidated.
 */
-void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock, int mode)
+void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock, enum scoutfs_lock_mode mode)
 {
 	DECLARE_LOCK_INFO(sb, linfo);

@@ -1169,9 +1303,12 @@ void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock, int mode)

 	lock_dec_count(lock->users, mode);
 	extend_grace(sb, lock);
+	if (lock_mode_can_write(mode))
+		lock->dirty_trans_seq = scoutfs_trans_sample_seq(sb);

 	trace_scoutfs_lock_unlock(sb, lock);
 	wake_up(&lock->waitq);
+	queue_inv_work(linfo);
 	put_lock(linfo, lock);

 	spin_unlock(&linfo->lock);
@@ -1246,7 +1383,7 @@ void scoutfs_lock_del_coverage(struct super_block *sb,
 * the mode and keys from changing.
 */
 bool scoutfs_lock_protected(struct scoutfs_lock *lock, struct scoutfs_key *key,
-			    int mode)
+			    enum scoutfs_lock_mode mode)
 {
 	signed char lock_mode = ACCESS_ONCE(lock->mode);

@@ -1256,38 +1393,50 @@ bool scoutfs_lock_protected(struct scoutfs_lock *lock, struct scoutfs_key *key,
 }

 /*
- * The shrink callback got the lock, marked it request_pending, and
- * handed it off to us.  We kick off a null request and the lock will
- * be freed by the response once all users drain.  If this races with
+ * The shrink callback got the lock, marked it request_pending, and put
+ * it on the shrink list.  We send a null request and the lock will be
+ * freed by the response once all users drain.  If this races with
 * invalidation then the server will only send the grant response once
 * the invalidation is finished.
 */
-static void scoutfs_lock_shrink_worker(struct work_struct *work)
+static void lock_shrink_worker(struct work_struct *work)
 {
-	struct scoutfs_lock *lock = container_of(work, struct scoutfs_lock,
-						 shrink_work);
-	struct super_block *sb = lock->sb;
-	DECLARE_LOCK_INFO(sb, linfo);
+	struct lock_info *linfo = container_of(work, struct lock_info,
+					       shrink_work);
+	struct super_block *sb = linfo->sb;
 	struct scoutfs_net_lock nl;
+	struct scoutfs_lock *lock;
+	struct scoutfs_lock *tmp;
+	LIST_HEAD(list);
 	int ret;

-	/* unlocked lock access, but should be stable since we queued */
-	nl.key = lock->start;
-	nl.old_mode = lock->mode;
-	nl.new_mode = SCOUTFS_LOCK_NULL;
+	scoutfs_inc_counter(sb, lock_shrink_work);

-	ret = scoutfs_client_lock_request(sb, &nl);
-	if (ret) {
-		/* oh well, not freeing */
-		scoutfs_inc_counter(sb, lock_shrink_request_aborted);
+	spin_lock(&linfo->lock);
+	list_splice_init(&linfo->shrink_list, &list);
+	spin_unlock(&linfo->lock);

-		spin_lock(&linfo->lock);
+	list_for_each_entry_safe(lock, tmp, &list, shrink_head) {
+		list_del_init(&lock->shrink_head);

-		lock->request_pending = 0;
-		wake_up(&lock->waitq);
-		put_lock(linfo, lock);
+		/* unlocked lock access, but should be stable since we queued */
+		nl.key = lock->start;
+		nl.old_mode = lock->mode;
+		nl.new_mode = SCOUTFS_LOCK_NULL;

-		spin_unlock(&linfo->lock);
+		ret = scoutfs_client_lock_request(sb, &nl);
+		if (ret) {
+			/* oh well, not freeing */
+			scoutfs_inc_counter(sb, lock_shrink_aborted);
+
+			spin_lock(&linfo->lock);
+
+			lock->request_pending = 0;
+			wake_up(&lock->waitq);
+			put_lock(linfo, lock);
+
+			spin_unlock(&linfo->lock);
+		}
 	}
 }

@@ -1312,6 +1461,7 @@ static int scoutfs_lock_shrink(struct shrinker *shrink,
 	struct scoutfs_lock *lock;
 	struct scoutfs_lock *tmp;
 	unsigned long nr;
+	bool added = false;
 	int ret;

 	nr = sc->nr_to_scan;
@@ -1325,15 +1475,17 @@ restart:

 		BUG_ON(!lock_idle(lock));
 		BUG_ON(lock->mode == SCOUTFS_LOCK_NULL);
+		BUG_ON(!list_empty(&lock->shrink_head));

-		if (nr-- == 0)
+		if (linfo->shutdown || nr-- == 0)
 			break;

 		__lock_del_lru(linfo, lock);
 		lock->request_pending = 1;
-		queue_work(linfo->workq, &lock->shrink_work);
+		list_add_tail(&lock->shrink_head, &linfo->shrink_list);
+		added = true;

-		scoutfs_inc_counter(sb, lock_shrink_queued);
+		scoutfs_inc_counter(sb, lock_shrink_attempted);
 		trace_scoutfs_lock_shrink(sb, lock);

 		/* could have bazillions of idle locks */
@@ -1343,6 +1495,9 @@ restart:

 	spin_unlock(&linfo->lock);

+	if (added)
+		queue_work(linfo->workq, &linfo->shrink_work);
+
 out:
 	ret = min_t(unsigned long, linfo->lru_nr, INT_MAX);
 	trace_scoutfs_lock_shrink_exit(sb, sc->nr_to_scan, ret);
@@ -1377,10 +1532,15 @@ static void lock_tseq_show(struct seq_file *m, struct scoutfs_tseq_entry *ent)
 }

 /*
- * We're going to be destroying the locks soon.  We shouldn't have any
- * normal task holders that would have prevented unmount.  We can have
- * internal threads blocked in locks.  We force all currently blocked
- * and future lock calls to return -ESHUTDOWN.
+ * The caller is going to be calling _destroy soon and, critically, is
+ * about to shutdown networking before calling us so that we don't get
+ * any callbacks while we're destroying.  We have to ensure that we
+ * won't call networking after this returns.
+ *
+ * Internal fs threads can be using locking, and locking can have async
+ * work pending.  We use ->shutdown to force callers to return
+ * -ESHUTDOWN and to prevent the future queueing of work that could call
+ * networking.  Locks whose work is stopped will be torn down by _destroy.
 */
 void scoutfs_lock_shutdown(struct super_block *sb)
 {
@@ -1402,6 +1562,10 @@ void scoutfs_lock_shutdown(struct super_block *sb)
 	}

 	spin_unlock(&linfo->lock);
+
+	flush_work(&linfo->grant_work);
+	flush_delayed_work(&linfo->inv_dwork);
+	flush_work(&linfo->shrink_work);
 }

 /*
@@ -1422,7 +1586,7 @@ void scoutfs_lock_destroy(struct super_block *sb)
 	DECLARE_LOCK_INFO(sb, linfo);
 	struct scoutfs_lock *lock;
 	struct rb_node *node;
-	int mode;
+	enum scoutfs_lock_mode mode;

 	if (!linfo)
 		return;
@@ -1474,6 +1638,12 @@ void scoutfs_lock_destroy(struct super_block *sb)
 		lock->request_pending = 0;
 		if (!list_empty(&lock->lru_head))
 			__lock_del_lru(linfo, lock);
+		if (!list_empty(&lock->grant_head))
+			list_del_init(&lock->grant_head);
+		if (!list_empty(&lock->inv_head))
+			list_del_init(&lock->inv_head);
+		if (!list_empty(&lock->shrink_head))
+			list_del_init(&lock->shrink_head);
 		lock_remove(linfo, lock);
 		lock_free(linfo, lock);
 	}
@@ -1501,6 +1671,12 @@ int scoutfs_lock_setup(struct super_block *sb)
 	linfo->shrinker.seeks = DEFAULT_SEEKS;
 	register_shrinker(&linfo->shrinker);
 	INIT_LIST_HEAD(&linfo->lru_list);
+	INIT_WORK(&linfo->grant_work, lock_grant_worker);
+	INIT_LIST_HEAD(&linfo->grant_list);
+	INIT_DELAYED_WORK(&linfo->inv_dwork, lock_invalidate_worker);
+	INIT_LIST_HEAD(&linfo->inv_list);
+	INIT_WORK(&linfo->shrink_work, lock_shrink_worker);
+	INIT_LIST_HEAD(&linfo->shrink_list);
 	atomic64_set(&linfo->next_refresh_gen, 0);
 	scoutfs_tseq_tree_init(&linfo->tseq_tree, lock_tseq_show);

--- a/kmod/src/lock.h
+++ b/kmod/src/lock.h
@@ -22,24 +22,32 @@ struct scoutfs_lock {
 	struct rb_node range_node;
 	u64 refresh_gen;
 	u64 write_version;
+	u64 dirty_trans_seq;
+	struct scoutfs_net_roots roots;
 	struct list_head lru_head;
 	wait_queue_head_t waitq;
-	struct work_struct shrink_work;
 	ktime_t grace_deadline;
 	unsigned long request_pending:1,
 		      invalidate_pending:1;

+	struct list_head grant_head;
+	struct scoutfs_net_lock_grant_response grant_resp;
+	struct list_head inv_head;
+	struct scoutfs_net_lock inv_nl;
+	u64 inv_net_id;
+	struct list_head shrink_head;
+
 	spinlock_t cov_list_lock;
 	struct list_head cov_list;

-	int mode;
+	enum scoutfs_lock_mode mode;
 	unsigned int waiters[SCOUTFS_LOCK_NR_MODES];
 	unsigned int users[SCOUTFS_LOCK_NR_MODES];

 	struct scoutfs_tseq_entry tseq_entry;

-	/* the forest btree code stores data per lock */
-	struct forest_lock_private *forest_private;
+	/* the forest tracks which log tree last saw bloom bit updates */
+	atomic64_t forest_bloom_nr;
 };

 struct scoutfs_lock_coverage {
@@ -49,35 +57,33 @@ struct scoutfs_lock_coverage {
 };

 int scoutfs_lock_grant_response(struct super_block *sb,
-				struct scoutfs_net_lock *nl);
+				struct scoutfs_net_lock_grant_response *gr);
 int scoutfs_lock_invalidate_request(struct super_block *sb, u64 net_id,
 				    struct scoutfs_net_lock *nl);
 int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
 				 struct scoutfs_key *key);

-int scoutfs_lock_inode(struct super_block *sb, int mode, int flags,
+int scoutfs_lock_inode(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
 		       struct inode *inode, struct scoutfs_lock **ret_lock);
-int scoutfs_lock_ino(struct super_block *sb, int mode, int flags, u64 ino,
+int scoutfs_lock_ino(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,
 		     struct scoutfs_lock **ret_lock);
 void scoutfs_lock_get_index_item_range(u8 type, u64 major, u64 ino,
 				       struct scoutfs_key *start,
 				       struct scoutfs_key *end);
-int scoutfs_lock_inode_index(struct super_block *sb, int mode,
+int scoutfs_lock_inode_index(struct super_block *sb, enum scoutfs_lock_mode mode,
 			     u8 type, u64 major, u64 ino,
 			     struct scoutfs_lock **ret_lock);
-int scoutfs_lock_xattr_index(struct super_block *sb, int mode, int flags,
-			     u64 hash, struct scoutfs_lock **ret_lock);
-int scoutfs_lock_inodes(struct super_block *sb, int mode, int flags,
+int scoutfs_lock_inodes(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
 			struct inode *a, struct scoutfs_lock **a_lock,
 			struct inode *b, struct scoutfs_lock **b_lock,
 			struct inode *c, struct scoutfs_lock **c_lock,
 			struct inode *d, struct scoutfs_lock **D_lock);
-int scoutfs_lock_rename(struct super_block *sb, int mode, int flags,
+int scoutfs_lock_rename(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
 			struct scoutfs_lock **lock);
-int scoutfs_lock_rid(struct super_block *sb, int mode, int flags,
+int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
 		     u64 rid, struct scoutfs_lock **lock);
 void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock,
-		    int level);
+		    enum scoutfs_lock_mode mode);

 void scoutfs_lock_init_coverage(struct scoutfs_lock_coverage *cov);
 void scoutfs_lock_add_coverage(struct super_block *sb,
@@ -88,7 +94,7 @@ bool scoutfs_lock_is_covered(struct super_block *sb,
 void scoutfs_lock_del_coverage(struct super_block *sb,
 			       struct scoutfs_lock_coverage *cov);
 bool scoutfs_lock_protected(struct scoutfs_lock *lock, struct scoutfs_key *key,
-			    int mode);
+			    enum scoutfs_lock_mode mode);

 void scoutfs_free_unused_locks(struct super_block *sb, unsigned long nr);

--- a/kmod/src/lock_server.c
+++ b/kmod/src/lock_server.c
@@ -20,7 +20,6 @@
 #include "tseq.h"
 #include "spbm.h"
 #include "block.h"
-#include "radix.h"
 #include "btree.h"
 #include "msg.h"
 #include "scoutfs_trace.h"
@@ -87,8 +86,10 @@ struct lock_server_info {
 	struct scoutfs_tseq_tree tseq_tree;
 	struct dentry *tseq_dentry;

-	struct scoutfs_radix_allocator *alloc;
+	struct scoutfs_alloc *alloc;
 	struct scoutfs_block_writer *wri;
+
+	atomic64_t write_version;
 };

 #define DECLARE_LOCK_SERVER_INFO(sb, name) \
@@ -117,12 +118,6 @@ struct server_lock_node {
 	struct list_head invalidated;
 };

-enum {
-	CLE_GRANTED,
-	CLE_REQUESTED,
-	CLE_INVALIDATED,
-};
-
 /*
 * Interactions with the client are tracked with these little mode
 * wrappers.
@@ -489,12 +484,12 @@ static int process_waiting_requests(struct super_block *sb,
 				    struct server_lock_node *snode)
 {
 	DECLARE_LOCK_SERVER_INFO(sb, inf);
+	struct scoutfs_net_lock_grant_response gres;
 	struct scoutfs_net_lock nl;
 	struct client_lock_entry *req;
 	struct client_lock_entry *req_tmp;
 	struct client_lock_entry *gr;
 	struct client_lock_entry *gr_tmp;
-	static atomic64_t write_version = ATOMIC64_INIT(0);
 	u64 wv;
 	int ret;

@@ -548,12 +543,15 @@ static int process_waiting_requests(struct super_block *sb,

 		if (nl.new_mode == SCOUTFS_LOCK_WRITE ||
 		    nl.new_mode == SCOUTFS_LOCK_WRITE_ONLY) {
-			wv = atomic64_inc_return(&write_version);
+			wv = atomic64_inc_return(&inf->write_version);
 			nl.write_version = cpu_to_le64(wv);
 		}

+		gres.nl = nl;
+		scoutfs_server_get_roots(sb, &gres.roots);
+
 		ret = scoutfs_server_lock_response(sb, req->rid,
-						   req->net_id, &nl);
+						   req->net_id, &gres);
 		if (ret)
 			goto out;

@@ -575,6 +573,14 @@ out:
 	return ret;
 }

+static void init_lock_clients_key(struct scoutfs_key *key, u64 rid)
+{
+	*key = (struct scoutfs_key) {
+		.sk_zone = SCOUTFS_LOCK_CLIENTS_ZONE,
+		.sklc_rid = cpu_to_le64(rid),
+	};
+}
+
 /*
 * The server received a greeting from a client for the first time.  If
 * the client had already talked to the server then we must find an
@@ -589,23 +595,22 @@ int scoutfs_lock_server_greeting(struct super_block *sb, u64 rid,
 {
 	DECLARE_LOCK_SERVER_INFO(sb, inf);
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
-	struct scoutfs_lock_client_btree_key cbk;
 	SCOUTFS_BTREE_ITEM_REF(iref);
 	struct scoutfs_key key;
 	int ret;

-	cbk.rid = cpu_to_be64(rid);
+	init_lock_clients_key(&key, rid);

 	mutex_lock(&inf->mutex);
 	if (should_exist) {
-		ret = scoutfs_btree_lookup(sb, &super->lock_clients,
-					   &cbk, sizeof(cbk), &iref);
+		ret = scoutfs_btree_lookup(sb, &super->lock_clients, &key,
+					   &iref);
 		if (ret == 0)
 			scoutfs_btree_put_iref(&iref);
 	} else {
 		ret = scoutfs_btree_insert(sb, inf->alloc, inf->wri,
 					   &super->lock_clients,
-					   &cbk, sizeof(cbk), NULL, 0);
+					   &key, NULL, 0);
 	}
 	mutex_unlock(&inf->mutex);

@@ -664,6 +669,14 @@ static int finished_recovery(struct super_block *sb, u64 rid, bool cancel)
 	return ret;
 }

+static void set_max_write_version(struct lock_server_info *inf, u64 new)
+{
+	u64 old;
+
+	while (new > (old = atomic64_read(&inf->write_version)) &&
+	       (atomic64_cmpxchg(&inf->write_version, old, new) != old));
+}
+
 /*
 * We sent a lock recover request to the client when we received its
 * greeting while in recovery.  Here we instantiate all the locks it
@@ -727,6 +740,10 @@ int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid,
 		scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry);

 		put_server_lock(inf, snode);
+
+		/* make sure next write lock is greater than all recovered */
+		set_max_write_version(inf,
+				le64_to_cpu(nlr->locks[i].write_version));
 	}

 	/* send request for next batch of keys */
@@ -738,15 +755,12 @@ out:
 	return ret;
 }

-static int get_rid_and_put_ref(struct scoutfs_btree_item_ref *iref,
-			       u64 *rid)
+static int get_rid_and_put_ref(struct scoutfs_btree_item_ref *iref, u64 *rid)
 {
-	struct scoutfs_lock_client_btree_key *cbk;
 	int ret;

-	if (iref->key_len == sizeof(*cbk) && iref->val_len == 0) {
-		cbk = iref->key;
-		*rid = be64_to_cpu(cbk->rid);
+	if (iref->val_len == 0) {
+		*rid = le64_to_cpu(iref->key->sklc_rid);
 		ret = 0;
 	} else {
 		ret = -EIO;
@@ -767,8 +781,8 @@ static void scoutfs_lock_server_recovery_timeout(struct work_struct *work)
 						    recovery_dwork.work);
 	struct super_block *sb = inf->sb;
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
-	struct scoutfs_lock_client_btree_key cbk;
 	SCOUTFS_BTREE_ITEM_REF(iref);
+	struct scoutfs_key key;
 	bool timed_out;
 	u64 rid;
 	int ret;
@@ -779,9 +793,8 @@ static void scoutfs_lock_server_recovery_timeout(struct work_struct *work)

 	/* we enter recovery if there are any client records */
 	for (rid = 0; ; rid++) {
-		cbk.rid = cpu_to_be64(rid);
-		ret = scoutfs_btree_next(sb, &super->lock_clients,
-					 &cbk, sizeof(cbk), &iref);
+		init_lock_clients_key(&key, rid);
+		ret = scoutfs_btree_next(sb, &super->lock_clients, &key, &iref);
 		if (ret == -ENOENT) {
 			ret = 0;
 			break;
@@ -806,10 +819,9 @@ static void scoutfs_lock_server_recovery_timeout(struct work_struct *work)
 		scoutfs_err(sb, "client rid %016llx lock recovery timed out",
 			    rid);

-		cbk.rid = cpu_to_be64(rid);
+		init_lock_clients_key(&key, rid);
 		ret = scoutfs_btree_delete(sb, inf->alloc, inf->wri,
-					   &super->lock_clients,
-					   &cbk, sizeof(cbk));
+					   &super->lock_clients, &key);
 		if (ret)
 			break;
 	}
@@ -838,7 +850,6 @@ int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid)
 {
 	DECLARE_LOCK_SERVER_INFO(sb, inf);
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
-	struct scoutfs_lock_client_btree_key cli;
 	struct client_lock_entry *clent;
 	struct client_lock_entry *tmp;
 	struct server_lock_node *snode;
@@ -847,10 +858,10 @@ int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid)
 	bool freed;
 	int ret = 0;

-	cli.rid = cpu_to_be64(rid);
 	mutex_lock(&inf->mutex);
+	init_lock_clients_key(&key, rid);
 	ret = scoutfs_btree_delete(sb, inf->alloc, inf->wri,
-				   &super->lock_clients, &cli, sizeof(cli));
+				   &super->lock_clients, &key);
 	mutex_unlock(&inf->mutex);
 	if (ret == -ENOENT) {
 		ret = 0;
@@ -951,14 +962,14 @@ static void lock_server_tseq_show(struct seq_file *m,
 * we time them out.
 */
 int scoutfs_lock_server_setup(struct super_block *sb,
-			      struct scoutfs_radix_allocator *alloc,
-			      struct scoutfs_block_writer *wri)
+			      struct scoutfs_alloc *alloc,
+			      struct scoutfs_block_writer *wri, u64 max_vers)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
 	struct lock_server_info *inf;
 	SCOUTFS_BTREE_ITEM_REF(iref);
-	struct scoutfs_lock_client_btree_key cbk;
+	struct scoutfs_key key;
 	unsigned int nr;
 	u64 rid;
 	int ret;
@@ -977,6 +988,7 @@ int scoutfs_lock_server_setup(struct super_block *sb,
 	scoutfs_tseq_tree_init(&inf->tseq_tree, lock_server_tseq_show);
 	inf->alloc = alloc;
 	inf->wri = wri;
+	atomic64_set(&inf->write_version, max_vers); /* inc_return gives +1 */

 	inf->tseq_dentry = scoutfs_tseq_create("server_locks", sbi->debug_root,
 					       &inf->tseq_tree);
@@ -990,9 +1002,8 @@ int scoutfs_lock_server_setup(struct super_block *sb,
 	/* we enter recovery if there are any client records */
 	nr = 0;
 	for (rid = 0; ; rid++) {
-		cbk.rid = cpu_to_be64(rid);
-		ret = scoutfs_btree_next(sb, &super->lock_clients,
-					 &cbk, sizeof(cbk), &iref);
+		init_lock_clients_key(&key, rid);
+		ret = scoutfs_btree_next(sb, &super->lock_clients, &key, &iref);
 		if (ret == -ENOENT)
 			break;
 		if (ret == 0)
--- a/kmod/src/lock_server.h
+++ b/kmod/src/lock_server.h
@@ -12,8 +12,8 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 rid,
 int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid);

 int scoutfs_lock_server_setup(struct super_block *sb,
-			      struct scoutfs_radix_allocator *alloc,
-			      struct scoutfs_block_writer *wri);
+			      struct scoutfs_alloc *alloc,
+			      struct scoutfs_block_writer *wri, u64 max_vers);
 void scoutfs_lock_server_destroy(struct super_block *sb);

 #endif
--- a/kmod/src/net.c
+++ b/kmod/src/net.c
@@ -100,7 +100,7 @@ do {								\
 } while (0)

 /* listening and their accepting sockets have a fixed locking order */
-enum {
+enum spin_lock_subtype {
 	CONN_LOCK_LISTENER,
 	CONN_LOCK_ACCEPTED,
 };
@@ -369,6 +369,7 @@ static int submit_send(struct super_block *sb,
 	msend->nh.cmd = cmd;
 	msend->nh.flags = flags;
 	msend->nh.error = net_err;
+	memset(msend->nh.__pad, 0, sizeof(msend->nh.__pad));
 	msend->nh.data_len = cpu_to_le16(data_len);
 	if (data_len)
 		memcpy(msend->nh.data, data, data_len);
--- a/kmod/src/net.h
+++ b/kmod/src/net.h
@@ -76,7 +76,7 @@ struct scoutfs_net_connection {
 	void *info;
 };

-enum {
+enum conn_flags {
 	CONN_FL_valid_greeting = (1UL << 0), /* other commands can proceed */
 	CONN_FL_established =	 (1UL << 1), /* added sends queue send work */
 	CONN_FL_shutting_down =	 (1UL << 2), /* shutdown work was queued */
@@ -102,6 +102,7 @@ static inline void scoutfs_addr_from_sin(struct scoutfs_inet_addr *addr,
 {
 	addr->addr = be32_to_le32(sin->sin_addr.s_addr);
 	addr->port = be16_to_le16(sin->sin_port);
+	memset(addr->__pad, 0, sizeof(addr->__pad));
 }

 struct scoutfs_net_connection *
--- a/kmod/src/options.c
+++ b/kmod/src/options.c
@@ -16,6 +16,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/debugfs.h>
+#include <linux/namei.h>

 #include <linux/parser.h>
 #include <linux/inet.h>
@@ -28,24 +29,16 @@

 static const match_table_t tokens = {
 	{Opt_server_addr, "server_addr=%s"},
+	{Opt_metadev_path, "metadev_path=%s"},
 	{Opt_err, NULL}
 };

 struct options_sb_info {
 	struct dentry *debugfs_dir;
-	u32 btree_force_tiny_blocks;
 };

 u32 scoutfs_option_u32(struct super_block *sb, int token)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct options_sb_info *osi = sbi->options;
-
-	switch(token) {
-		case Opt_btree_force_tiny_blocks:
-			return osi->btree_force_tiny_blocks;
-	}
-
 	WARN_ON_ONCE(1);
 	return 0;
 }
@@ -90,6 +83,52 @@ static int parse_ipv4(struct super_block *sb, char *str,
 	return 0;
 }

+static int parse_bdev_path(struct super_block *sb, substring_t *substr,
+			      char **bdev_path_ret)
+{
+	char *bdev_path;
+	struct inode *bdev_inode;
+	struct path path;
+	bool got_path = false;
+	int ret;
+
+	bdev_path = match_strdup(substr);
+	if (!bdev_path) {
+		scoutfs_err(sb, "bdev string dup failed");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = kern_path(bdev_path, LOOKUP_FOLLOW, &path);
+	if (ret) {
+		scoutfs_err(sb, "path %s not found for bdev: error %d",
+			    bdev_path, ret);
+		goto out;
+	}
+	got_path = true;
+
+	bdev_inode = d_inode(path.dentry);
+	if (!S_ISBLK(bdev_inode->i_mode)) {
+		scoutfs_err(sb, "path %s for bdev is not a block device",
+			    bdev_path);
+		ret = -ENOTBLK;
+		goto out;
+	}
+
+out:
+	if (got_path) {
+		path_put(&path);
+	}
+
+	if (ret < 0) {
+		kfree(bdev_path);
+	} else {
+		*bdev_path_ret = bdev_path;
+	}
+
+	return ret;
+}
+
 int scoutfs_parse_options(struct super_block *sb, char *options,
 			  struct mount_options *parsed)
 {
@@ -115,6 +154,13 @@ int scoutfs_parse_options(struct super_block *sb, char *options,
 			if (ret < 0)
 				return ret;
 			break;
+		case Opt_metadev_path:
+
+			ret = parse_bdev_path(sb, &args[0],
+						 &parsed->metadev_path);
+			if (ret < 0)
+				return ret;
+			break;
 		default:
 			scoutfs_err(sb, "Unknown or malformed option, \"%s\"",
 				    p);
@@ -122,6 +168,11 @@ int scoutfs_parse_options(struct super_block *sb, char *options,
 		}
 	}

+	if (!parsed->metadev_path) {
+		scoutfs_err(sb, "Required mount option \"metadev_path\" not found");
+		return -EINVAL;
+	}
+
 	return 0;
 }

@@ -143,13 +194,6 @@ int scoutfs_options_setup(struct super_block *sb)
 		goto out;
 	}

-	if (!debugfs_create_bool("btree_force_tiny_blocks", 0644,
-				 osi->debugfs_dir,
-				 &osi->btree_force_tiny_blocks)) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	ret = 0;
 out:
 	if (ret)
--- a/kmod/src/options.h
+++ b/kmod/src/options.h
@@ -5,18 +5,15 @@
 #include <linux/in.h>
 #include "format.h"

-enum {
-	/*
-	 * For debugging we can quickly create huge trees by limiting
-	 * the number of items in each block as though the blocks were tiny.
-	 */
-	Opt_btree_force_tiny_blocks,
+enum scoutfs_mount_options {
 	Opt_server_addr,
+	Opt_metadev_path,
 	Opt_err,
 };

 struct mount_options {
 	struct sockaddr_in server_addr;
+	char *metadev_path;
 };

 int scoutfs_parse_options(struct super_block *sb, char *options,
--- a/kmod/src/quorum.c
+++ b/kmod/src/quorum.c
@@ -112,12 +112,13 @@ static ktime_t random_to(u32 lo, u32 hi)
 /*
 * The caller is about to read all the quorum blocks.  We invalidate any
 * cached blocks and issue one large contiguous read to repopulate the
- * cache.  The caller then uses normal sb_bread to read each block.  I'm
+ * cache.  The caller then uses normal __bread to read each block.  I'm
 * not a huge fan of the plug but I couldn't get the individual
 * readahead requests merged without it.
 */
 static void readahead_quorum_blocks(struct super_block *sb)
 {
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct buffer_head *bh;
 	struct blk_plug plug;
 	int i;
@@ -125,7 +126,8 @@ static void readahead_quorum_blocks(struct super_block *sb)
 	blk_start_plug(&plug);

 	for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
-		bh = sb_getblk(sb, SCOUTFS_QUORUM_BLKNO + i);
+		bh = __getblk(sbi->meta_bdev, SCOUTFS_QUORUM_BLKNO + i,
+			     SCOUTFS_BLOCK_SM_SIZE);
 		if (!bh)
 			continue;

@@ -144,7 +146,7 @@ struct quorum_block_head {
 	struct list_head head;
 	union {
 		struct scoutfs_quorum_block blk;
-		u8 bytes[SCOUTFS_BLOCK_SIZE];
+		u8 bytes[SCOUTFS_BLOCK_SM_SIZE];
 	};
 };

@@ -184,13 +186,13 @@ static size_t quorum_block_bytes(struct scoutfs_quorum_block *blk)
 static bool invalid_quorum_block(struct buffer_head *bh,
 				 struct scoutfs_quorum_block *blk)
 {
-	return bh->b_size != SCOUTFS_BLOCK_SIZE ||
-	       sizeof(struct scoutfs_quorum_block) > SCOUTFS_BLOCK_SIZE ||
+	return bh->b_size != SCOUTFS_BLOCK_SM_SIZE ||
+	       sizeof(struct scoutfs_quorum_block) > SCOUTFS_BLOCK_SM_SIZE ||
 	       quorum_block_crc(blk) != blk->crc ||
 	       le64_to_cpu(blk->blkno) != bh->b_blocknr ||
 	       blk->term == 0 ||
 	       blk->log_nr > SCOUTFS_QUORUM_LOG_MAX ||
-	       quorum_block_bytes(blk) > SCOUTFS_BLOCK_SIZE;
+	       quorum_block_bytes(blk) > SCOUTFS_BLOCK_SM_SIZE;
 }

 /* true if a is stale and should be ignored */
@@ -215,6 +217,7 @@ static bool stale_quorum_block(struct scoutfs_quorum_block *a,
 static int read_quorum_blocks(struct super_block *sb, struct list_head *blocks)
 {
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_quorum_block *blk;
 	struct quorum_block_head *qbh;
 	struct quorum_block_head *tmp;
@@ -227,7 +230,8 @@ static int read_quorum_blocks(struct super_block *sb, struct list_head *blocks)

 	for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
 		brelse(bh);
-		bh = sb_bread(sb, SCOUTFS_QUORUM_BLKNO + i);
+		bh = __bread(sbi->meta_bdev, SCOUTFS_QUORUM_BLKNO + i,
+			     SCOUTFS_BLOCK_SM_SIZE);
 		if (!bh) {
 			scoutfs_inc_counter(sb, quorum_read_block_error);
 			ret = -EIO;
@@ -291,23 +295,25 @@ static int write_quorum_block(struct super_block *sb,
 			      struct scoutfs_quorum_block *our_blk)
 {
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_quorum_block *blk;
 	struct buffer_head *bh = NULL;
 	size_t size;
 	int ret;

-	BUILD_BUG_ON(sizeof(struct scoutfs_quorum_block) > SCOUTFS_BLOCK_SIZE);
+	BUILD_BUG_ON(sizeof(struct scoutfs_quorum_block) >
+		     SCOUTFS_BLOCK_SM_SIZE);

-	bh = sb_getblk(sb, SCOUTFS_QUORUM_BLKNO +
-			   prandom_u32_max(SCOUTFS_QUORUM_BLOCKS));
+	bh = __getblk(sbi->meta_bdev, SCOUTFS_QUORUM_BLKNO +
+		      prandom_u32_max(SCOUTFS_QUORUM_BLOCKS),
+		      SCOUTFS_BLOCK_SM_SIZE);
 	if (bh == NULL) {
 		ret = -EIO;
 		goto out;
 	}

 	size = quorum_block_bytes(our_blk);
-	if (WARN_ON_ONCE(size > SCOUTFS_BLOCK_SIZE ||
-			 size > bh->b_size)) {
+	if (WARN_ON_ONCE(size > SCOUTFS_BLOCK_SM_SIZE || size > bh->b_size)) {
 		ret = -EIO;
 		goto out;
 	}
@@ -530,7 +536,7 @@ int scoutfs_quorum_election(struct super_block *sb, ktime_t timeout_abs,
 	trace_scoutfs_quorum_election(sb, prev_term);

 	super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
-	our_blk = kmalloc(SCOUTFS_BLOCK_SIZE, GFP_NOFS);
+	our_blk = kmalloc(SCOUTFS_BLOCK_SM_SIZE, GFP_NOFS);
 	if (!super || !our_blk) {
 		ret = -ENOMEM;
 		goto out;
@@ -548,7 +554,7 @@ int scoutfs_quorum_election(struct super_block *sb, ktime_t timeout_abs,
 			    SCOUTFS_QUORUM_TERM_HI_MS);

 	for (;;) {
-		memset(our_blk, 0, SCOUTFS_BLOCK_SIZE);
+		memset(our_blk, 0, SCOUTFS_BLOCK_SM_SIZE);

 		scoutfs_inc_counter(sb, quorum_cycle);

--- a/kmod/src/radix.c
+++ b/kmod/src/radix.c
--- a/kmod/src/radix.h
+++ b/kmod/src/radix.h
@@ -1,45 +0,0 @@
-#ifndef _SCOUTFS_RADIX_H_
-#define _SCOUTFS_RADIX_H_
-
-#include "per_task.h"
-
-struct scoutfs_block_writer;
-
-struct scoutfs_radix_allocator {
-	struct mutex mutex;
-	struct scoutfs_radix_root avail;
-	struct scoutfs_radix_root freed;
-};
-
-int scoutfs_radix_alloc(struct super_block *sb,
-			struct scoutfs_radix_allocator *alloc,
-			struct scoutfs_block_writer *wri, u64 *blkno);
-int scoutfs_radix_alloc_data(struct super_block *sb,
-			     struct scoutfs_radix_allocator *alloc,
-			     struct scoutfs_block_writer *wri,
-			     struct scoutfs_radix_root *root,
-			     int count, u64 *blkno_ret, int *count_ret);
-int scoutfs_radix_free(struct super_block *sb,
-		       struct scoutfs_radix_allocator *alloc,
-		       struct scoutfs_block_writer *wri, u64 blkno);
-int scoutfs_radix_free_data(struct super_block *sb,
-			    struct scoutfs_radix_allocator *alloc,
-			    struct scoutfs_block_writer *wri,
-			    struct scoutfs_radix_root *root,
-			    u64 blkno, int count);
-int scoutfs_radix_merge(struct super_block *sb,
-			struct scoutfs_radix_allocator *alloc,
-			struct scoutfs_block_writer *wri,
-			struct scoutfs_radix_root *dst,
-			struct scoutfs_radix_root *src,
-			struct scoutfs_radix_root *inp, bool meta, u64 count);
-void scoutfs_radix_init_alloc(struct scoutfs_radix_allocator *alloc,
-			      struct scoutfs_radix_root *avail,
-			      struct scoutfs_radix_root *freed);
-void scoutfs_radix_root_init(struct super_block *sb,
-			     struct scoutfs_radix_root *root, bool meta);
-u64 scoutfs_radix_root_free_bytes(struct super_block *sb,
-				  struct scoutfs_radix_root *root);
-u64 scoutfs_radix_bit_leaf_nr(u64 bit);
-
-#endif
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
--- a/kmod/src/server.h
+++ b/kmod/src/server.h
@@ -58,10 +58,12 @@ do {								\

 int scoutfs_server_lock_request(struct super_block *sb, u64 rid,
 				struct scoutfs_net_lock *nl);
-int scoutfs_server_lock_response(struct super_block *sb, u64 rid,
-				 u64 id, struct scoutfs_net_lock *nl);
+int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
+				 struct scoutfs_net_lock_grant_response *gr);
 int scoutfs_server_lock_recover_request(struct super_block *sb, u64 rid,
 					struct scoutfs_key *key);
+void scoutfs_server_get_roots(struct super_block *sb,
+			      struct scoutfs_net_roots *roots);
 int scoutfs_server_hold_commit(struct super_block *sb);
 int scoutfs_server_apply_commit(struct super_block *sb, int err);

--- a/kmod/src/sort_priv.c
+++ b/kmod/src/sort_priv.c
@@ -0,0 +1,71 @@
+/*
+ * A copy of sort() from upstream with a priv argument that's passed
+ * to comparison, like list_sort().
+ */
+
+/* ------------------------ */
+
+/*
+ * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
+ *
+ * Jan 23 2005  Matt Mackall <mpm@selenic.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sort.h>
+#include <linux/slab.h>
+#include "sort_priv.h"
+
+/**
+ * sort - sort an array of elements
+ * @priv: caller's pointer to pass to comparison and swap functions
+ * @base: pointer to data to sort
+ * @num: number of elements
+ * @size: size of each element
+ * @cmp_func: pointer to comparison function
+ * @swap_func: pointer to swap function or NULL
+ *
+ * This function does a heapsort on the given array. You may provide a
+ * swap_func function optimized to your element type.
+ *
+ * Sorting time is O(n log n) both on average and worst-case. While
+ * qsort is about 20% faster on average, it suffers from exploitable
+ * O(n*n) worst-case behavior and extra memory requirements that make
+ * it less suitable for kernel use.
+ */
+
+void sort_priv(void *priv, void *base, size_t num, size_t size,
+	       int (*cmp_func)(void *priv, const void *, const void *),
+	       void (*swap_func)(void *priv, void *, void *, int size))
+{
+	/* pre-scale counters for performance */
+	int i = (num/2 - 1) * size, n = num * size, c, r;
+
+	/* heapify */
+	for ( ; i >= 0; i -= size) {
+		for (r = i; r * 2 + size < n; r  = c) {
+			c = r * 2 + size;
+			if (c < n - size &&
+			    cmp_func(priv, base + c, base + c + size) < 0)
+				c += size;
+			if (cmp_func(priv, base + r, base + c) >= 0)
+				break;
+			swap_func(priv, base + r, base + c, size);
+		}
+	}
+
+	/* sort */
+	for (i = n - size; i > 0; i -= size) {
+		swap_func(priv, base, base + i, size);
+		for (r = 0; r * 2 + size < i; r = c) {
+			c = r * 2 + size;
+			if (c < i - size &&
+			    cmp_func(priv, base + c, base + c + size) < 0)
+				c += size;
+			if (cmp_func(priv, base + r, base + c) >= 0)
+				break;
+			swap_func(priv, base + r, base + c, size);
+		}
+	}
+}
--- a/kmod/src/sort_priv.h
+++ b/kmod/src/sort_priv.h
@@ -0,0 +1,8 @@
+#ifndef _SCOUTFS_SORT_PRIV_H_
+#define _SCOUTFS_SORT_PRIV_H_
+
+void sort_priv(void *priv, void *base, size_t num, size_t size,
+	       int (*cmp_func)(void *priv, const void *, const void *),
+	       void (*swap_func)(void *priv, void *, void *, int size));
+
+#endif
--- a/kmod/src/spbm.c
+++ b/kmod/src/spbm.c
@@ -47,9 +47,9 @@ bool scoutfs_spbm_empty(struct scoutfs_spbm *spbm)
 	return RB_EMPTY_ROOT(&spbm->root);
 }

-enum {
+enum spbm_flags {
 	/* if a node isn't found then return an allocated new node */
-	SPBM_FIND_ALLOC = 0x1,
+	SPBM_FIND_ALLOC = (1 << 0),
 };
 static struct spbm_node *find_node(struct scoutfs_spbm *spbm, u64 index,
 				   int flags)
--- a/kmod/src/srch.c
+++ b/kmod/src/srch.c
--- a/kmod/src/srch.h
+++ b/kmod/src/srch.h
@@ -0,0 +1,68 @@
+#ifndef _SCOUTFS_SRCH_H_
+#define _SCOUTFS_SRCH_H_
+
+struct scoutfs_block;
+
+struct scoutfs_srch_rb_root {
+	struct rb_root root;
+	struct rb_node *last;
+	unsigned long nr;
+};
+
+struct scoutfs_srch_rb_node {
+	struct rb_node node;
+	u64 ino;
+	u64 id;
+};
+
+#define scoutfs_srch_foreach_rb_node(snode, node, sroot)		\
+	for (node = rb_first(&(sroot)->root);				\
+	     node && (snode = container_of(node, struct scoutfs_srch_rb_node, \
+					   node), 1);			\
+	     node = rb_next(node))
+
+int scoutfs_srch_add(struct super_block *sb,
+		     struct scoutfs_alloc *alloc,
+		     struct scoutfs_block_writer *wri,
+		     struct scoutfs_srch_file *sfl,
+		     struct scoutfs_block **bl_ret,
+		     u64 hash, u64 ino, u64 id);
+
+void scoutfs_srch_destroy_rb_root(struct scoutfs_srch_rb_root *sroot);
+int scoutfs_srch_search_xattrs(struct super_block *sb,
+			       struct scoutfs_srch_rb_root *sroot,
+			       u64 hash, u64 ino, u64 last_ino, bool *done);
+
+int scoutfs_srch_rotate_log(struct super_block *sb,
+			    struct scoutfs_alloc *alloc,
+			    struct scoutfs_block_writer *wri,
+			    struct scoutfs_btree_root *root,
+			    struct scoutfs_srch_file *sfl);
+int scoutfs_srch_get_compact(struct super_block *sb,
+			     struct scoutfs_alloc *alloc,
+			     struct scoutfs_block_writer *wri,
+			     struct scoutfs_btree_root *root,
+			     u64 rid, struct scoutfs_srch_compact *sc);
+int scoutfs_srch_update_compact(struct super_block *sb,
+				struct scoutfs_alloc *alloc,
+				struct scoutfs_block_writer *wri,
+				struct scoutfs_btree_root *root, u64 rid,
+				struct scoutfs_srch_compact *sc);
+int scoutfs_srch_commit_compact(struct super_block *sb,
+				struct scoutfs_alloc *alloc,
+				struct scoutfs_block_writer *wri,
+				struct scoutfs_btree_root *root, u64 rid,
+				struct scoutfs_srch_compact *res,
+				struct scoutfs_alloc_list_head *av,
+				struct scoutfs_alloc_list_head *fr);
+int scoutfs_srch_cancel_compact(struct super_block *sb,
+				struct scoutfs_alloc *alloc,
+				struct scoutfs_block_writer *wri,
+				struct scoutfs_btree_root *root, u64 rid,
+				struct scoutfs_alloc_list_head *av,
+				struct scoutfs_alloc_list_head *fr);
+
+void scoutfs_srch_destroy(struct super_block *sb);
+int scoutfs_srch_setup(struct super_block *sb);
+
+#endif
--- a/kmod/src/super.c
+++ b/kmod/src/super.c
@@ -41,6 +41,9 @@
 #include "sysfs.h"
 #include "quorum.h"
 #include "forest.h"
+#include "srch.h"
+#include "item.h"
+#include "alloc.h"
 #include "scoutfs_trace.h"

 static struct dentry *scoutfs_debugfs_root;
@@ -76,11 +79,30 @@ retry:
 	return cpu_to_le64(ret);
 }

+struct statfs_free_blocks {
+	u64 meta;
+	u64 data;
+};
+
+static int count_free_blocks(struct super_block *sb, void *arg, int owner,
+			     u64 id, bool meta, bool avail, u64 blocks)
+{
+	struct statfs_free_blocks *sfb = arg;
+
+	if (meta)
+		sfb->meta += blocks;
+	else
+		sfb->data += blocks;
+
+	return 0;
+}
+
 /*
- * Ask the server for the current statfs fields.  The message is very
- * cheap so we're not worrying about spinning in statfs flooding the
- * server with requests.  We can add a cache and stale results if that
- * becomes a problem.
+ * Build the free block counts by having alloc read all the persistent
+ * blocks which contain allocators and calling us for each of them.
+ * Only the super block reads aren't cached so repeatedly calling statfs
+ * is like repeated O_DIRECT IO.  We can add a cache and stale results
+ * if that IO becomes a problem.
 *
 * We fake the number of free inodes value by assuming that we can fill
 * free blocks with a certain number of inodes.  We then the number of
@@ -93,30 +115,50 @@ retry:
 static int scoutfs_statfs(struct dentry *dentry, struct kstatfs *kst)
 {
 	struct super_block *sb = dentry->d_inode->i_sb;
-	struct scoutfs_net_statfs nstatfs;
+	struct scoutfs_super_block *super = NULL;
+	struct statfs_free_blocks sfb = {0,};
 	__le32 uuid[4];
 	int ret;

-	ret = scoutfs_client_statfs(sb, &nstatfs);
-	if (ret)
-		return ret;
+	scoutfs_inc_counter(sb, statfs);

-	kst->f_bfree = le64_to_cpu(nstatfs.bfree);
+	super = kzalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
+	if (!super) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = scoutfs_read_super(sb, super);
+	if (ret)
+		goto out;
+
+	ret = scoutfs_alloc_foreach(sb, count_free_blocks, &sfb);
+	if (ret < 0)
+		goto out;
+
+	kst->f_bfree = (sfb.meta << SCOUTFS_BLOCK_SM_LG_SHIFT) + sfb.data;
 	kst->f_type = SCOUTFS_SUPER_MAGIC;
-	kst->f_bsize = SCOUTFS_BLOCK_SIZE;
-	kst->f_blocks = le64_to_cpu(nstatfs.total_blocks);
+	kst->f_bsize = SCOUTFS_BLOCK_SM_SIZE;
+	kst->f_blocks = (le64_to_cpu(super->total_meta_blocks) <<
+			 SCOUTFS_BLOCK_SM_LG_SHIFT) +
+			le64_to_cpu(super->total_data_blocks);
 	kst->f_bavail = kst->f_bfree;

-	kst->f_ffree = kst->f_bfree * 16;
-	kst->f_files = kst->f_ffree + le64_to_cpu(nstatfs.next_ino);
+	/* arbitrarily assume ~1K / empty file */
+	kst->f_ffree = sfb.meta * (SCOUTFS_BLOCK_LG_SIZE / 1024);
+	kst->f_files = kst->f_ffree + le64_to_cpu(super->next_ino);

-	BUILD_BUG_ON(sizeof(uuid) != sizeof(nstatfs.uuid));
-	memcpy(uuid, &nstatfs, sizeof(uuid));
+	BUILD_BUG_ON(sizeof(uuid) != sizeof(super->uuid));
+	memcpy(uuid, super->uuid, sizeof(uuid));
 	kst->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[1]);
 	kst->f_fsid.val[1] = le32_to_cpu(uuid[2]) ^ le32_to_cpu(uuid[3]);
 	kst->f_namelen = SCOUTFS_NAME_LEN;
-	kst->f_frsize = SCOUTFS_BLOCK_SIZE;
+	kst->f_frsize = SCOUTFS_BLOCK_SM_SIZE;
+
 	/* the vfs fills f_flags */
+	ret = 0;
+out:
+	kfree(super);

 	/*
 	 * We don't take cluster locks in statfs which makes it a very
@@ -126,7 +168,7 @@ static int scoutfs_statfs(struct dentry *dentry, struct kstatfs *kst)
 	if (scoutfs_trigger(sb, STATFS_LOCK_PURGE))
 		scoutfs_free_unused_locks(sb, -1UL);

-	return 0;
+	return ret;
 }

 static int scoutfs_show_options(struct seq_file *seq, struct dentry *root)
@@ -135,10 +177,21 @@ static int scoutfs_show_options(struct seq_file *seq, struct dentry *root)
 	struct mount_options *opts = &SCOUTFS_SB(sb)->opts;

 	seq_printf(seq, ",server_addr="SIN_FMT, SIN_ARG(&opts->server_addr));
+	seq_printf(seq, ",metadev_path=%s", opts->metadev_path);

 	return 0;
 }

+static ssize_t metadev_path_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *buf)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
+
+	return snprintf(buf, PAGE_SIZE, "%s", opts->metadev_path);
+}
+SCOUTFS_ATTR_RO(metadev_path);
+
 static ssize_t server_addr_show(struct kobject *kobj,
 			      struct kobj_attribute *attr, char *buf)
 {
@@ -151,6 +204,7 @@ static ssize_t server_addr_show(struct kobject *kobj,
 SCOUTFS_ATTR_RO(server_addr);

 static struct attribute *mount_options_attrs[] = {
+	SCOUTFS_ATTR_PTR(metadev_path),
 	SCOUTFS_ATTR_PTR(server_addr),
 	NULL,
 };
@@ -163,6 +217,20 @@ static int scoutfs_sync_fs(struct super_block *sb, int wait)
 	return scoutfs_trans_sync(sb, wait);
 }

+/*
+ * Data dev is closed by generic code, but we have to explicitly close the meta
+ * dev.
+ */
+static void scoutfs_metadev_close(struct super_block *sb)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+
+	if (sbi->meta_bdev) {
+		blkdev_put(sbi->meta_bdev, SCOUTFS_META_BDEV_MODE);
+		sbi->meta_bdev = NULL;
+	}
+}
+
 /*
 * This destroys all the state that's built up in the sb info during
 * mount.  It's called by us on errors during mount if we haven't set
@@ -178,6 +246,7 @@ static void scoutfs_put_super(struct super_block *sb)
 	sbi->shutdown = true;

 	scoutfs_data_destroy(sb);
+	scoutfs_srch_destroy(sb);

 	scoutfs_unlock(sb, sbi->rid_lock, SCOUTFS_LOCK_WRITE);
 	sbi->rid_lock = NULL;
@@ -185,6 +254,7 @@ static void scoutfs_put_super(struct super_block *sb)
 	scoutfs_shutdown_trans(sb);
 	scoutfs_client_destroy(sb);
 	scoutfs_inode_destroy(sb);
+	scoutfs_item_destroy(sb);
 	scoutfs_forest_destroy(sb);

 	/* the server locks the listen address and compacts */
@@ -203,6 +273,9 @@ static void scoutfs_put_super(struct super_block *sb)
 	debugfs_remove(sbi->debug_root);
 	scoutfs_destroy_counters(sb);
 	scoutfs_destroy_sysfs(sb);
+	scoutfs_metadev_close(sb);
+
+	kfree(sbi->opts.metadev_path);
 	kfree(sbi);

 	sb->s_fs_info = NULL;
@@ -227,30 +300,33 @@ static const struct super_operations scoutfs_super_ops = {
 int scoutfs_write_super(struct super_block *sb,
 			struct scoutfs_super_block *super)
 {
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+
 	le64_add_cpu(&super->hdr.seq, 1);

-	return scoutfs_block_write_sm(sb, SCOUTFS_SUPER_BLKNO, &super->hdr,
+	return scoutfs_block_write_sm(sb, sbi->meta_bdev, SCOUTFS_SUPER_BLKNO,
+				      &super->hdr,
 				      sizeof(struct scoutfs_super_block));
 }

 /*
- * Read the super block.  If it's valid store it in the caller's super
- * struct.
+ * Read super, specifying bdev.
 */
-int scoutfs_read_super(struct super_block *sb,
-		       struct scoutfs_super_block *super_res)
+static int scoutfs_read_super_from_bdev(struct super_block *sb,
+					struct block_device *bdev,
+					struct scoutfs_super_block *super_res)
 {
 	struct scoutfs_super_block *super;
 	__le32 calc;
+	u64 blkno;
 	int ret;

 	super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
 	if (!super)
 		return -ENOMEM;

-	ret = scoutfs_block_read_sm(sb, SCOUTFS_SUPER_BLKNO, &super->hdr,
-				    sizeof(struct scoutfs_super_block),
-				    &calc);
+	ret = scoutfs_block_read_sm(sb, bdev, SCOUTFS_SUPER_BLKNO, &super->hdr,
+				    sizeof(struct scoutfs_super_block), &calc);
 	if (ret < 0)
 		goto out;

@@ -276,10 +352,10 @@ int scoutfs_read_super(struct super_block *sb,
 	}


-	if (super->format_hash != cpu_to_le64(SCOUTFS_FORMAT_HASH)) {
-		scoutfs_err(sb, "super block has invalid format hash 0x%llx, expected 0x%llx",
-			    le64_to_cpu(super->format_hash),
-			    SCOUTFS_FORMAT_HASH);
+	if (super->version != cpu_to_le64(SCOUTFS_INTEROP_VERSION)) {
+		scoutfs_err(sb, "super block has invalid version %llu, expected %llu",
+			    le64_to_cpu(super->version),
+			    SCOUTFS_INTEROP_VERSION);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -294,13 +370,61 @@ int scoutfs_read_super(struct super_block *sb,
 		goto out;
 	}

-	*super_res = *super;
-	ret = 0;
+	blkno = (SCOUTFS_QUORUM_BLKNO + SCOUTFS_QUORUM_BLOCKS) >>
+		SCOUTFS_BLOCK_SM_LG_SHIFT;
+	if (le64_to_cpu(super->first_meta_blkno) < blkno) {
+		scoutfs_err(sb, "super block first meta blkno %llu is within quorum blocks",
+			le64_to_cpu(super->first_meta_blkno));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (le64_to_cpu(super->first_meta_blkno) >
+	    le64_to_cpu(super->last_meta_blkno)) {
+		scoutfs_err(sb, "super block first meta blkno %llu is greater than last meta blkno %llu",
+			le64_to_cpu(super->first_meta_blkno),
+			le64_to_cpu(super->last_meta_blkno));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (le64_to_cpu(super->first_data_blkno) >
+	    le64_to_cpu(super->last_data_blkno)) {
+		scoutfs_err(sb, "super block first data blkno %llu is greater than last data blkno %llu",
+			le64_to_cpu(super->first_data_blkno),
+			le64_to_cpu(super->last_data_blkno));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	blkno = (i_size_read(sb->s_bdev->bd_inode) >>
+		 SCOUTFS_BLOCK_SM_SHIFT) - 1;
+	if (le64_to_cpu(super->last_data_blkno) > blkno) {
+		scoutfs_err(sb, "super block last data blkno %llu is outsite device size last blkno %llu",
+			le64_to_cpu(super->last_data_blkno), blkno);
+		ret = -EINVAL;
+		goto out;
+	}
+
 out:
+	if (ret == 0)
+		*super_res = *super;
 	kfree(super);
+
 	return ret;
 }

+/*
+ * Read the super block from meta dev.
+ */
+int scoutfs_read_super(struct super_block *sb,
+		       struct scoutfs_super_block *super_res)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+
+	return scoutfs_read_super_from_bdev(sb, sbi->meta_bdev, super_res);
+}
+
 /*
 * This needs to be setup after reading the super because it uses the
 * fsid found in the super block.
@@ -337,10 +461,66 @@ static int assign_random_id(struct scoutfs_sb_info *sbi)
 	return 0;
 }

+/*
+ * Ensure superblock copies in metadata and data block devices are valid, and
+ * fill in in-memory superblock if so.
+ */
+static int scoutfs_read_supers(struct super_block *sb)
+{
+	struct scoutfs_super_block *meta_super = NULL;
+	struct scoutfs_super_block *data_super = NULL;
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	int ret = 0;
+
+	meta_super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
+	data_super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
+	if (!meta_super || !data_super) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = scoutfs_read_super_from_bdev(sb, sbi->meta_bdev, meta_super);
+	if (ret < 0) {
+		scoutfs_err(sb, "could not get meta_super: error %d", ret);
+		goto out;
+	}
+
+	ret = scoutfs_read_super_from_bdev(sb, sb->s_bdev, data_super);
+	if (ret < 0) {
+		scoutfs_err(sb, "could not get data_super: error %d", ret);
+		goto out;
+	}
+
+	if (!SCOUTFS_IS_META_BDEV(meta_super)) {
+		scoutfs_err(sb, "meta_super META flag not set");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (SCOUTFS_IS_META_BDEV(data_super)) {
+		scoutfs_err(sb, "data_super META flag set");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (memcmp(meta_super->uuid, data_super->uuid, SCOUTFS_UUID_BYTES)) {
+		scoutfs_err(sb, "superblock UUID mismatch");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	sbi->super = *meta_super;
+out:
+	kfree(meta_super);
+	kfree(data_super);
+	return ret;
+}
+
 static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct scoutfs_sb_info *sbi;
 	struct mount_options opts;
+	struct block_device *meta_bdev;
 	struct inode *inode;
 	int ret;

@@ -379,14 +559,31 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)

 	sbi->opts = opts;

-	ret = sb_set_blocksize(sb, SCOUTFS_BLOCK_SIZE);
-	if (ret != SCOUTFS_BLOCK_SIZE) {
+	ret = sb_set_blocksize(sb, SCOUTFS_BLOCK_SM_SIZE);
+	if (ret != SCOUTFS_BLOCK_SM_SIZE) {
 		scoutfs_err(sb, "failed to set blocksize, returned %d", ret);
 		ret = -EIO;
 		goto out;
 	}

-	ret = scoutfs_read_super(sb, &SCOUTFS_SB(sb)->super) ?:
+	meta_bdev =
+		blkdev_get_by_path(sbi->opts.metadev_path,
+				   SCOUTFS_META_BDEV_MODE, sb);
+	if (IS_ERR(meta_bdev)) {
+		scoutfs_err(sb, "could not open metadev: error %ld",
+			    PTR_ERR(meta_bdev));
+		ret = PTR_ERR(meta_bdev);
+		goto out;
+	}
+	sbi->meta_bdev = meta_bdev;
+	ret = set_blocksize(sbi->meta_bdev, SCOUTFS_BLOCK_SM_SIZE);
+	if (ret != 0) {
+		scoutfs_err(sb, "failed to set metadev blocksize, returned %d",
+			    ret);
+		goto out;
+	}
+
+	ret = scoutfs_read_supers(sb) ?:
 	      scoutfs_debugfs_setup(sb) ?:
 	      scoutfs_setup_sysfs(sb) ?:
 	      scoutfs_setup_counters(sb) ?:
@@ -396,6 +593,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 	      scoutfs_setup_triggers(sb) ?:
 	      scoutfs_block_setup(sb) ?:
 	      scoutfs_forest_setup(sb) ?:
+	      scoutfs_item_setup(sb) ?:
 	      scoutfs_inode_setup(sb) ?:
 	      scoutfs_data_setup(sb) ?:
 	      scoutfs_setup_trans(sb) ?:
@@ -406,7 +604,8 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 	      scoutfs_client_setup(sb) ?:
 	      scoutfs_lock_rid(sb, SCOUTFS_LOCK_WRITE, 0, sbi->rid,
 				   &sbi->rid_lock) ?:
-	      scoutfs_trans_get_log_trees(sb);
+	      scoutfs_trans_get_log_trees(sb) ?:
+	      scoutfs_srch_setup(sb);
 	if (ret)
 		goto out;

@@ -483,6 +682,10 @@ static int __init scoutfs_module_init(void)
 		".section	.note.git_describe,\"a\"\n"
 		".string	\""SCOUTFS_GIT_DESCRIBE"\\n\"\n"
 		".previous\n");
+	__asm__ __volatile__ (
+		".section	.note.scoutfs_interop_version,\"a\"\n"
+		".string	\""SCOUTFS_INTEROP_VERSION_STR"\\n\"\n"
+		".previous\n");

 	scoutfs_init_counters();

@@ -515,3 +718,4 @@ module_exit(scoutfs_module_exit)
 MODULE_AUTHOR("Zach Brown <zab@versity.com>");
 MODULE_LICENSE("GPL");
 MODULE_INFO(git_describe, SCOUTFS_GIT_DESCRIBE);
+MODULE_INFO(scoutfs_interop_version, SCOUTFS_INTEROP_VERSION_STR);
--- a/kmod/src/super.h
+++ b/kmod/src/super.h
@@ -25,6 +25,7 @@ struct options_sb_info;
 struct net_info;
 struct block_info;
 struct forest_info;
+struct srch_info;

 struct scoutfs_sb_info {
 	struct super_block *sb;
@@ -35,6 +36,8 @@ struct scoutfs_sb_info {

 	struct scoutfs_super_block super;

+	struct block_device *meta_bdev;
+
 	spinlock_t next_ino_lock;

 	struct data_info *data_info;
@@ -44,6 +47,8 @@ struct scoutfs_sb_info {
 	struct quorum_info *quorum_info;
 	struct block_info *block_info;
 	struct forest_info *forest_info;
+	struct srch_info *srch_info;
+	struct item_cache_info *item_cache_info;

 	wait_queue_head_t trans_hold_wq;
 	struct task_struct *trans_task;
@@ -91,6 +96,13 @@ static inline bool SCOUTFS_HAS_SBI(struct super_block *sb)
 	return (sb != NULL) && (SCOUTFS_SB(sb) != NULL);
 }

+static inline bool SCOUTFS_IS_META_BDEV(struct scoutfs_super_block *super_block)
+{
+	return !!(le64_to_cpu(super_block->flags) & SCOUTFS_FLAG_IS_META_BDEV);
+}
+
+#define SCOUTFS_META_BDEV_MODE (FMODE_READ | FMODE_WRITE | FMODE_EXCL)
+
 /*
 * A small string embedded in messages that's used to identify a
 * specific mount.  It's the three most significant bytes of the fsid
--- a/kmod/src/trans.c
+++ b/kmod/src/trans.c
@@ -25,8 +25,10 @@
 #include "counters.h"
 #include "client.h"
 #include "inode.h"
-#include "radix.h"
+#include "alloc.h"
 #include "block.h"
+#include "msg.h"
+#include "item.h"
 #include "scoutfs_trace.h"

 /*
@@ -58,13 +60,11 @@
 */
 struct trans_info {
 	spinlock_t lock;
-	unsigned reserved_items;
-	unsigned reserved_vals;
 	unsigned holders;
 	bool writing;

 	struct scoutfs_log_trees lt;
-	struct scoutfs_radix_allocator alloc;
+	struct scoutfs_alloc alloc;
 	struct scoutfs_block_writer wri;
 };

@@ -110,8 +110,7 @@ int scoutfs_trans_get_log_trees(struct super_block *sb)
 	ret = scoutfs_client_get_log_trees(sb, &lt);
 	if (ret == 0) {
 		tri->lt = lt;
-		scoutfs_radix_init_alloc(&tri->alloc, &lt.meta_avail,
-					 &lt.meta_freed);
+		scoutfs_alloc_init(&tri->alloc, &lt.meta_avail, &lt.meta_freed);
 		scoutfs_block_writer_init(sb, &tri->wri);

 		scoutfs_forest_init_btrees(sb, &tri->alloc, &tri->wri, &lt);
@@ -126,6 +125,7 @@ bool scoutfs_trans_has_dirty(struct super_block *sb)

 	return scoutfs_block_writer_has_dirty(sb, &tri->wri);
 }
+
 /*
 * This work func is responsible for writing out all the dirty blocks
 * that make up the current dirty transaction.  It prevents writers from
@@ -156,6 +156,8 @@ void scoutfs_trans_write_func(struct work_struct *work)
 						   trans_write_work.work);
 	struct super_block *sb = sbi->sb;
 	DECLARE_TRANS_INFO(sb, tri);
+	u64 trans_seq = sbi->trans_seq;
+	char *s = NULL;
 	int ret = 0;

 	sbi->trans_task = current;
@@ -165,37 +167,49 @@ void scoutfs_trans_write_func(struct work_struct *work)
 	trace_scoutfs_trans_write_func(sb,
 			scoutfs_block_writer_dirty_bytes(sb, &tri->wri));

-	if (scoutfs_block_writer_has_dirty(sb, &tri->wri)) {
-		if (sbi->trans_deadline_expired)
-			scoutfs_inc_counter(sb, trans_commit_timer);
-
-		ret = scoutfs_inode_walk_writeback(sb, true) ?:
-		      scoutfs_block_writer_write(sb, &tri->wri) ?:
-		      scoutfs_inode_walk_writeback(sb, false) ?:
-		      commit_btrees(sb) ?:
-		      scoutfs_client_advance_seq(sb, &sbi->trans_seq) ?:
-		      scoutfs_trans_get_log_trees(sb);
-		if (ret)
-			goto out;
-
-	} else if (sbi->trans_deadline_expired) {
-		/*
-		 * If we're not writing data then we only advance the
-		 * seq at the sync deadline interval.  This keeps idle
-		 * mounts from pinning a seq and stopping readers of the
-		 * seq indices but doesn't send a message for every sync
-		 * syscall.
-		 */
-		ret = scoutfs_client_advance_seq(sb, &sbi->trans_seq);
+	if (!scoutfs_block_writer_has_dirty(sb, &tri->wri) &&
+	    !scoutfs_item_dirty_pages(sb)) {
+		if (sbi->trans_deadline_expired) {
+			/*
+			 * If we're not writing data then we only advance the
+			 * seq at the sync deadline interval.  This keeps idle
+			 * mounts from pinning a seq and stopping readers of the
+			 * seq indices but doesn't send a message for every sync
+			 * syscall.
+			 */
+			ret = scoutfs_client_advance_seq(sb, &trans_seq);
+			if (ret < 0)
+			      s = "clean advance seq";
+		}
+		goto out;
 	}

-out:
+	if (sbi->trans_deadline_expired)
+		scoutfs_inc_counter(sb, trans_commit_timer);
+
+	scoutfs_inc_counter(sb, trans_commit_written);
+
 	/* XXX this all needs serious work for dealing with errors */
-	WARN_ON_ONCE(ret);
+	ret = (s = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?:
+	      (s = "item dirty", scoutfs_item_write_dirty(sb))  ?:
+	      (s = "data prepare", scoutfs_data_prepare_commit(sb))  ?:
+	      (s = "alloc prepare", scoutfs_alloc_prepare_commit(sb,
+						&tri->alloc, &tri->wri))  ?:
+	      (s = "meta write", scoutfs_block_writer_write(sb, &tri->wri))  ?:
+	      (s = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?:
+	      (s = "commit log trees", commit_btrees(sb)) ?:
+	      scoutfs_item_write_done(sb) ?:
+	      (s = "advance seq", scoutfs_client_advance_seq(sb, &trans_seq)) ?:
+	      (s = "get log trees", scoutfs_trans_get_log_trees(sb));
+out:
+	if (ret < 0)
+		scoutfs_err(sb, "critical transaction commit failure: %s, %d",
+			    s, ret);

 	spin_lock(&sbi->trans_write_lock);
 	sbi->trans_write_count++;
 	sbi->trans_write_ret = ret;
+	sbi->trans_seq = trans_seq;
 	spin_unlock(&sbi->trans_write_lock);
 	wake_up(&sbi->trans_write_wq);

@@ -302,12 +316,11 @@ void scoutfs_trans_restart_sync_deadline(struct super_block *sb)
 * Including nested holds avoids having to deal with writing out partial
 * transactions while a caller still holds the transaction.
 */
+
 #define SCOUTFS_RESERVATION_MAGIC 0xd57cd13b
 struct scoutfs_reservation {
 	unsigned magic;
 	unsigned holders;
-	struct scoutfs_item_count reserved;
-	struct scoutfs_item_count actual;
 };

 /*
@@ -324,22 +337,16 @@ struct scoutfs_reservation {
 * delaying or prematurely forcing commits.
 */
 static bool acquired_hold(struct super_block *sb,
-			  struct scoutfs_reservation *rsv,
-			  const struct scoutfs_item_count *cnt)
+			  struct scoutfs_reservation *rsv)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	DECLARE_TRANS_INFO(sb, tri);
 	bool acquired = false;
-	unsigned items;
-	unsigned vals;

 	spin_lock(&tri->lock);

-	trace_scoutfs_trans_acquired_hold(sb, cnt, rsv, rsv->holders,
-					  &rsv->reserved, &rsv->actual,
-					  tri->holders, tri->writing,
-					  tri->reserved_items,
-					  tri->reserved_vals);
+	trace_scoutfs_trans_acquired_hold(sb, rsv, rsv->holders,
+					  tri->holders, tri->writing);

 	/* use a caller's existing reservation */
 	if (rsv->holders)
@@ -349,14 +356,31 @@ static bool acquired_hold(struct super_block *sb,
 	if (tri->writing)
 		goto out;

-	/* see if we can reserve space for our item count */
-	items = tri->reserved_items + cnt->items;
-	vals = tri->reserved_vals + cnt->vals;
+	/*
+	 * In theory each dirty item page could be straddling two full
+	 * blocks, requiring 4 allocations for each item cache page.
+	 * That's much too conservative, typically many dirty item cache
+	 * pages that are near each other all land in one block.  This
+	 * rough estimate is still so far beyond what typically happens
+	 * that it accounts for having to dirty parent blocks and
+	 * whatever dirtying is done during the transaction hold.
+	 */
+	if (scoutfs_alloc_meta_low(sb, &tri->alloc,
+				   scoutfs_item_dirty_pages(sb) * 2)) {
+		scoutfs_inc_counter(sb, trans_commit_dirty_meta_full);
+		queue_trans_work(sbi);
+		goto out;
+	}

-	/* XXX arbitrarily limit to 8 meg transactions */
-	if (scoutfs_block_writer_dirty_bytes(sb, &tri->wri) >=
-			(8 * 1024 * 1024)) {
-		scoutfs_inc_counter(sb, trans_commit_full);
+	/*
+	 * Extent modifications can use meta allocators without creating
+	 * dirty items so we have to check the meta alloc specifically.
+	 * The size of the client's avail and freed roots are bound so
+	 * we're unlikely to need very many block allocations per
+	 * transaction hold.  XXX This should be more precisely tuned.
+	 */
+	if (scoutfs_alloc_meta_low(sb, &tri->alloc, 16)) {
+		scoutfs_inc_counter(sb, trans_commit_meta_alloc_low);
 		queue_trans_work(sbi);
 		goto out;
 	}
@@ -368,12 +392,6 @@ static bool acquired_hold(struct super_block *sb,
 		goto out;
 	}

-	tri->reserved_items = items;
-	tri->reserved_vals = vals;
-
-	rsv->reserved.items = cnt->items;
-	rsv->reserved.vals = cnt->vals;
-
 hold:
 	rsv->holders++;
 	tri->holders++;
@@ -386,20 +404,12 @@ out:
 	return acquired;
 }

-int scoutfs_hold_trans(struct super_block *sb,
-		       const struct scoutfs_item_count cnt)
+int scoutfs_hold_trans(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_reservation *rsv;
 	int ret;

-	/*
-	 * Caller shouldn't provide garbage counts, nor counts that
-	 * can't fit in segments by themselves.
-	 */
-	if (WARN_ON_ONCE(cnt.items <= 0 || cnt.vals < 0))
-		return -EINVAL;
-
 	if (current == sbi->trans_task)
 		return 0;

@@ -416,7 +426,7 @@ int scoutfs_hold_trans(struct super_block *sb,
 	BUG_ON(rsv->magic != SCOUTFS_RESERVATION_MAGIC);

 	ret = wait_event_interruptible(sbi->trans_hold_wq,
-				       acquired_hold(sb, rsv, &cnt));
+				       acquired_hold(sb, rsv));
 	if (ret && rsv->holders == 0) {
 		current->journal_info = NULL;
 		kfree(rsv);
@@ -436,38 +446,6 @@ bool scoutfs_trans_held(void)
 	return rsv && rsv->magic == SCOUTFS_RESERVATION_MAGIC;
 }

-/*
- * Record a transaction holder's individual contribution to the dirty
- * items in the current transaction.  We're making sure that the
- * reservation matches the possible item manipulations while they hold
- * the reservation.
- *
- * It is possible and legitimate for an individual contribution to be
- * negative if they delete dirty items.  The item cache makes sure that
- * the total dirty item count doesn't fall below zero.
- */
-void scoutfs_trans_track_item(struct super_block *sb, signed items,
-			      signed vals)
-{
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct scoutfs_reservation *rsv = current->journal_info;
-
-	if (current == sbi->trans_task)
-		return;
-
-	BUG_ON(!rsv || rsv->magic != SCOUTFS_RESERVATION_MAGIC);
-
-	rsv->actual.items += items;
-	rsv->actual.vals += vals;
-
-	trace_scoutfs_trans_track_item(sb, items, vals, rsv->actual.items,
-				       rsv->actual.vals, rsv->reserved.items,
-				       rsv->reserved.vals);
-
-	WARN_ON_ONCE(rsv->actual.items > rsv->reserved.items);
-	WARN_ON_ONCE(rsv->actual.vals > rsv->reserved.vals);
-}
-
 /*
 * As we drop the last hold in the reservation we try and wake other
 * hold attempts that were waiting for space.  As we drop the last trans
@@ -489,16 +467,12 @@ void scoutfs_release_trans(struct super_block *sb)

 	spin_lock(&tri->lock);

-	trace_scoutfs_release_trans(sb, rsv, rsv->holders, &rsv->reserved,
-				    &rsv->actual, tri->holders, tri->writing,
-				    tri->reserved_items, tri->reserved_vals);
+	trace_scoutfs_release_trans(sb, rsv, rsv->holders, tri->holders, tri->writing);

 	BUG_ON(rsv->holders <= 0);
 	BUG_ON(tri->holders <= 0);

 	if (--rsv->holders == 0) {
-		tri->reserved_items -= rsv->reserved.items;
-		tri->reserved_vals -= rsv->reserved.vals;
 		current->journal_info = NULL;
 		kfree(rsv);
 		wake = true;
@@ -513,6 +487,23 @@ void scoutfs_release_trans(struct super_block *sb)
 		wake_up(&sbi->trans_hold_wq);
 }

+/*
+ * Return the current transaction sequence.  Whether this is racing with
+ * the transaction write thread is entirely dependent on the caller's
+ * context.
+ */
+u64 scoutfs_trans_sample_seq(struct super_block *sb)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	u64 ret;
+
+	spin_lock(&sbi->trans_write_lock);
+	ret = sbi->trans_seq;
+	spin_unlock(&sbi->trans_write_lock);
+
+	return ret;
+}
+
 int scoutfs_setup_trans(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
--- a/kmod/src/trans.h
+++ b/kmod/src/trans.h
@@ -6,20 +6,16 @@
 /* the client will force commits if data allocators get too low */
 #define SCOUTFS_TRANS_DATA_ALLOC_LWM	(256ULL * 1024 * 1024)

-#include "count.h"
-
 void scoutfs_trans_write_func(struct work_struct *work);
 int scoutfs_trans_sync(struct super_block *sb, int wait);
 int scoutfs_file_fsync(struct file *file, loff_t start, loff_t end,
 		       int datasync);
 void scoutfs_trans_restart_sync_deadline(struct super_block *sb);

-int scoutfs_hold_trans(struct super_block *sb,
-		       const struct scoutfs_item_count cnt);
+int scoutfs_hold_trans(struct super_block *sb);
 bool scoutfs_trans_held(void);
 void scoutfs_release_trans(struct super_block *sb);
-void scoutfs_trans_track_item(struct super_block *sb, signed items,
-			      signed vals);
+u64 scoutfs_trans_sample_seq(struct super_block *sb);

 int scoutfs_trans_get_log_trees(struct super_block *sb);
 bool scoutfs_trans_has_dirty(struct super_block *sb);
--- a/kmod/src/triggers.h
+++ b/kmod/src/triggers.h
@@ -1,7 +1,7 @@
 #ifndef _SCOUTFS_TRIGGERS_H_
 #define _SCOUTFS_TRIGGERS_H_

-enum {
+enum scoutfs_trigger {
 	SCOUTFS_TRIGGER_BTREE_STALE_READ,
 	SCOUTFS_TRIGGER_BTREE_ADVANCE_RING_HALF,
 	SCOUTFS_TRIGGER_HARD_STALE_ERROR,
--- a/kmod/src/util.h
+++ b/kmod/src/util.h
@@ -0,0 +1,20 @@
+#ifndef _SCOUTFS_UTIL_H_
+#define _SCOUTFS_UTIL_H_
+
+/*
+ * Little utility helpers that probably belong upstream.
+ */
+
+static inline void down_write_two(struct rw_semaphore *a,
+				  struct rw_semaphore *b)
+{
+	BUG_ON(a == b);
+
+	if (a > b)
+		swap(a, b);
+
+	down_write(a);
+	down_write_nested(b, SINGLE_DEPTH_NESTING);
+}
+
+#endif
--- a/kmod/src/xattr.c
+++ b/kmod/src/xattr.c
@@ -20,7 +20,7 @@
 #include "inode.h"
 #include "key.h"
 #include "super.h"
-#include "kvec.h"
+#include "item.h"
 #include "forest.h"
 #include "trans.h"
 #include "xattr.h"
@@ -94,21 +94,17 @@ static int unknown_prefix(const char *name)
 	       strncmp(name, SCOUTFS_XATTR_PREFIX, SCOUTFS_XATTR_PREFIX_LEN);
 }

-struct prefix_tags {
-	unsigned long hide:1,
-		      indx:1;
-};

 #define HIDE_TAG	"hide."
-#define INDX_TAG	"indx."
+#define SRCH_TAG	"srch."
 #define TAG_LEN		(sizeof(HIDE_TAG) - 1)

-static int parse_tags(const char *name, unsigned int name_len,
-		      struct prefix_tags *tgs)
+int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len,
+			     struct scoutfs_xattr_prefix_tags *tgs)
 {
 	bool found;

-	memset(tgs, 0, sizeof(struct prefix_tags));
+	memset(tgs, 0, sizeof(struct scoutfs_xattr_prefix_tags));

 	if ((name_len < (SCOUTFS_XATTR_PREFIX_LEN + TAG_LEN + 1)) ||
 	    strncmp(name, SCOUTFS_XATTR_PREFIX, SCOUTFS_XATTR_PREFIX_LEN))
@@ -120,8 +116,8 @@ static int parse_tags(const char *name, unsigned int name_len,
 		if (!strncmp(name, HIDE_TAG, TAG_LEN)) {
 			if (++tgs->hide == 0)
 				return -EINVAL;
-		} else if (!strncmp(name, INDX_TAG, TAG_LEN)) {
-			if (++tgs->indx == 0)
+		} else if (!strncmp(name, SRCH_TAG, TAG_LEN)) {
+			if (++tgs->srch == 0)
 				return -EINVAL;
 		} else {
 			/* only reason to use scoutfs. is tags */
@@ -136,17 +132,6 @@ static int parse_tags(const char *name, unsigned int name_len,
 	return 0;
 }

-void scoutfs_xattr_index_key(struct scoutfs_key *key,
-			     u64 hash, u64 ino, u64 id)
-{
-	scoutfs_key_set_zeros(key);
-	key->sk_zone = SCOUTFS_XATTR_INDEX_ZONE;
-	key->skxi_hash = cpu_to_le64(hash);
-	key->sk_type = SCOUTFS_XATTR_INDEX_NAME_TYPE;
-	key->skxi_ino = cpu_to_le64(ino);
-	key->skxi_id = cpu_to_le64(id);
-}
-
 /*
 * Find the next xattr and copy the key, xattr header, and as much of
 * the name and value into the callers buffer as we can.  Returns the
@@ -171,7 +156,6 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
 {
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_key last;
-	struct kvec val;
 	u8 last_part;
 	int total;
 	u8 part;
@@ -194,8 +178,9 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,

 	for (;;) {
 		key->skx_part = part;
-		kvec_init(&val, (void *)xat + total, bytes - total);
-		ret = scoutfs_forest_next(sb, key, &last, &val, lock);
+		ret = scoutfs_item_next(sb, key, &last,
+					(void *)xat + total, bytes - total,
+					lock);
 		if (ret < 0) {
 			/* XXX corruption, ran out of parts */
 			if (ret == -ENOENT && part > 0)
@@ -271,7 +256,6 @@ static int create_xattr_items(struct inode *inode, u64 id,
 	struct scoutfs_key key;
 	unsigned int part_bytes;
 	unsigned int total;
-	struct kvec val;
 	int ret;

 	init_xattr_key(&key, scoutfs_ino(inode),
@@ -282,12 +266,13 @@ static int create_xattr_items(struct inode *inode, u64 id,
 	while (total < bytes) {
 		part_bytes = min_t(unsigned int, bytes - total,
 				   SCOUTFS_XATTR_MAX_PART_SIZE);
-		kvec_init(&val, (void *)xat + total, part_bytes);

-		ret = scoutfs_forest_create(sb, &key, &val, lock);
+		ret = scoutfs_item_create(sb, &key,
+					  (void *)xat + total, part_bytes,
+					  lock);
 		if (ret) {
 			while (key.skx_part-- > 0)
-				scoutfs_forest_delete_dirty(sb, &key);
+				scoutfs_item_delete(sb, &key, lock);
 			break;
 		}

@@ -299,24 +284,114 @@ static int create_xattr_items(struct inode *inode, u64 id,
 }

 /*
- * Delete and save the items that make up the given xattr.  If this
- * returns an error then the deleted and saved items are left on the
- * list for the caller to restore.
+ * Delete the items that make up the given xattr.  If this returns an
+ * error then no items have been deleted.
 */
 static int delete_xattr_items(struct inode *inode, u32 name_hash, u64 id,
-			      u8 nr_parts, struct list_head *list,
-			      struct scoutfs_lock *lock)
+			      u8 nr_parts, struct scoutfs_lock *lock)
 {
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_key key;
-	int ret;
+	int ret = 0;
+	int i;

 	init_xattr_key(&key, scoutfs_ino(inode), name_hash, id);

-	do {
-		ret = scoutfs_forest_delete_save(sb, &key, list, lock);
-	} while (ret == 0 && ++key.skx_part < nr_parts);
+	/* dirty additional existing old items */
+	for (i = 1; i < nr_parts; i++) {
+		key.skx_part = i;
+		ret = scoutfs_item_dirty(sb, &key, lock);
+		if (ret)
+			goto out;
+	}

+	for (i = 0; i < nr_parts; i++) {
+		key.skx_part = i;
+		ret = scoutfs_item_delete(sb, &key, lock);
+		if (ret)
+			break;
+	}
+out:
+	return ret;
+}
+
+/*
+ * The caller needs to overwrite existing old xattr items with new
+ * items.  We carefully stage the changes so that we can always unwind
+ * to the original items if we return an error.  Both items have at
+ * least one part.  Either the old or new can have more parts.  We dirty
+ * and create first because we can always unwind those.  We delete last
+ * after dirtying so that it can't fail and we don't have to restore the
+ * deleted items.
+ */
+static int change_xattr_items(struct inode *inode, u64 id,
+			      struct scoutfs_xattr *new_xat,
+			      unsigned int new_bytes, u8 new_parts,
+			      u8 old_parts, struct scoutfs_lock *lock)
+{
+	struct super_block *sb = inode->i_sb;
+	struct scoutfs_key key;
+	int last_created = -1;
+	int bytes;
+	int off;
+	int i;
+	int ret;
+
+	init_xattr_key(&key, scoutfs_ino(inode),
+		       xattr_name_hash(new_xat->name, new_xat->name_len), id);
+
+	/* dirty existing old items */
+	for (i = 0; i < old_parts; i++) {
+		key.skx_part = i;
+		ret = scoutfs_item_dirty(sb, &key, lock);
+		if (ret)
+			goto out;
+	}
+
+	/* create any new items past the old */
+	for (i = old_parts; i < new_parts; i++) {
+		off = i * SCOUTFS_XATTR_MAX_PART_SIZE;
+		bytes = min_t(unsigned int, new_bytes - off,
+			      SCOUTFS_XATTR_MAX_PART_SIZE);
+
+		key.skx_part = i;
+		ret = scoutfs_item_create(sb, &key, (void *)new_xat + off,
+					  bytes, lock);
+		if (ret)
+			goto out;
+
+		last_created = i;
+	}
+
+	/* update dirtied overlapping existing items, last partial first */
+	for (i = old_parts - 1; i >= 0; i--) {
+		off = i * SCOUTFS_XATTR_MAX_PART_SIZE;
+		bytes = min_t(unsigned int, new_bytes - off,
+			      SCOUTFS_XATTR_MAX_PART_SIZE);
+
+		key.skx_part = i;
+		ret = scoutfs_item_update(sb, &key, (void *)new_xat + off,
+					  bytes, lock);
+		/* only last partial can fail, then we unwind created */
+		if (ret < 0)
+			goto out;
+	}
+
+	/* delete any dirtied old items past new */
+	for (i = new_parts; i < old_parts; i++) {
+		key.skx_part = i;
+		scoutfs_item_delete(sb, &key, lock);
+	}
+
+	ret = 0;
+out:
+	if (ret < 0) {
+		/* delete any newly created items */
+		for (i = old_parts; i <= last_created; i++) {
+			key.skx_part = i;
+			scoutfs_item_delete(sb, &key, lock);
+		}
+	}
 	return ret;
 }

@@ -346,7 +421,7 @@ ssize_t scoutfs_getxattr(struct dentry *dentry, const char *name, void *buffer,

 	/* only need enough for caller's name and value sizes */
 	bytes = sizeof(struct scoutfs_xattr) + name_len + size;
-	xat = kmalloc(bytes, GFP_NOFS);
+	xat = __vmalloc(bytes, GFP_NOFS, PAGE_KERNEL);
 	if (!xat)
 		return -ENOMEM;

@@ -389,7 +464,7 @@ ssize_t scoutfs_getxattr(struct dentry *dentry, const char *name, void *buffer,
 	ret = le16_to_cpu(xat->val_len);
 	memcpy(buffer, &xat->name[xat->name_len], ret);
 out:
-	kfree(xat);
+	vfree(xat);
 	return ret;
 }

@@ -411,20 +486,17 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
 	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	const u64 ino = scoutfs_ino(inode);
+	struct scoutfs_xattr_prefix_tags tgs;
 	struct scoutfs_xattr *xat = NULL;
-	struct scoutfs_lock *indx_lock = NULL;
 	struct scoutfs_lock *lck = NULL;
 	size_t name_len = strlen(name);
-	struct scoutfs_key indx_key;
 	struct scoutfs_key key;
-	struct prefix_tags tgs;
-	bool undo_indx = false;
+	bool undo_srch = false;
 	LIST_HEAD(ind_locks);
-	LIST_HEAD(saved);
 	u8 found_parts;
 	unsigned int bytes;
 	u64 ind_seq;
-	u64 hash;
+	u64 hash = 0;
 	u64 id = 0;
 	int ret;
 	int err;
@@ -444,14 +516,14 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
 	if (unknown_prefix(name))
 		return -EOPNOTSUPP;

-	if (parse_tags(name, name_len, &tgs) != 0)
+	if (scoutfs_xattr_parse_tags(name, name_len, &tgs) != 0)
 		return -EINVAL;

-	if ((tgs.hide || tgs.indx) && !capable(CAP_SYS_ADMIN))
+	if ((tgs.hide || tgs.srch) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;

 	bytes = sizeof(struct scoutfs_xattr) + name_len + size;
-	xat = kmalloc(bytes, GFP_NOFS);
+	xat = __vmalloc(bytes, GFP_NOFS, PAGE_KERNEL);
 	if (!xat) {
 		ret = -ENOMEM;
 		goto out;
@@ -491,29 +563,21 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,

 	/* prepare our xattr */
 	if (value) {
-		id = si->next_xattr_id++;
+		if (found_parts)
+			id = le64_to_cpu(key.skx_id);
+		else
+			id = si->next_xattr_id++;
 		xat->name_len = name_len;
 		xat->val_len = cpu_to_le16(size);
+		memset(xat->__pad, 0, sizeof(xat->__pad));
 		memcpy(xat->name, name, name_len);
 		memcpy(&xat->name[xat->name_len], value, size);
 	}

-	if (tgs.indx && !(found_parts && value)) {
-		hash = scoutfs_hash64(name, name_len);
-		ret = scoutfs_lock_xattr_index(sb, SCOUTFS_LOCK_WRITE_ONLY, 0,
-					       hash, &indx_lock);
-		if (ret < 0)
-			goto unlock;
-	}
-
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
-	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
-						SIC_XATTR_SET(found_parts,
-							      value != NULL,
-							      name_len, size,
-							      tgs.indx));
+	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -523,34 +587,27 @@ retry:
 	if (ret < 0)
 		goto release;

-	if (tgs.indx && !(found_parts && value)) {
+	if (tgs.srch && !(found_parts && value)) {
 		if (found_parts)
 			id = le64_to_cpu(key.skx_id);
 		hash = scoutfs_hash64(name, name_len);
-		scoutfs_xattr_index_key(&indx_key, hash, ino, id);
-		if (value)
-			ret = scoutfs_forest_create_force(sb, &indx_key, NULL,
-							  indx_lock);
-		else
-			ret = scoutfs_forest_delete_force(sb, &indx_key,
-							  indx_lock);
+		ret = scoutfs_forest_srch_add(sb, hash, ino, id);
 		if (ret < 0)
 			goto release;
-		undo_indx = true;
+		undo_srch = true;
 	}

-	ret = 0;
-	if (found_parts)
+	if (found_parts && value)
+		ret = change_xattr_items(inode, id, xat, bytes,
+					 xattr_nr_parts(xat), found_parts, lck);
+	else if (found_parts)
 		ret = delete_xattr_items(inode, le64_to_cpu(key.skx_name_hash),
 					 le64_to_cpu(key.skx_id), found_parts,
-					 &saved, lck);
-	if (value && ret == 0)
+					 lck);
+	else
 		ret = create_xattr_items(inode, id, xat, bytes, lck);
-	if (ret < 0) {
-		scoutfs_forest_restore(sb, &saved, lck);
+	if (ret < 0)
 		goto release;
-	}
-	scoutfs_forest_free_batch(sb, &saved);

 	/* XXX do these want i_mutex or anything? */
 	inode_inc_iversion(inode);
@@ -559,13 +616,8 @@ retry:
 	ret = 0;

 release:
-	if (ret < 0 && undo_indx) {
-		if (value)
-			err = scoutfs_forest_delete_force(sb, &indx_key,
-							  indx_lock);
-		else
-			err = scoutfs_forest_create_force(sb, &indx_key, NULL,
-							  indx_lock);
+	if (ret < 0 && undo_srch) {
+		err = scoutfs_forest_srch_add(sb, hash, ino, id);
 		BUG_ON(err);
 	}

@@ -573,10 +625,9 @@ release:
 	scoutfs_inode_index_unlock(sb, &ind_locks);
 unlock:
 	up_write(&si->xattr_rwsem);
-	scoutfs_unlock(sb, indx_lock, SCOUTFS_LOCK_WRITE_ONLY);
 	scoutfs_unlock(sb, lck, SCOUTFS_LOCK_WRITE);
 out:
-	kfree(xat);
+	vfree(xat);

 	return ret;
 }
@@ -601,10 +652,10 @@ ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
 {
 	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
+	struct scoutfs_xattr_prefix_tags tgs;
 	struct scoutfs_xattr *xat = NULL;
 	struct scoutfs_lock *lck = NULL;
 	struct scoutfs_key key;
-	struct prefix_tags tgs;
 	unsigned int bytes;
 	ssize_t total = 0;
 	u32 name_hash = 0;
@@ -640,8 +691,8 @@ ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
 			break;
 		}

-		is_hidden = parse_tags(xat->name, xat->name_len, &tgs) == 0 &&
-			    tgs.hide;
+		is_hidden = scoutfs_xattr_parse_tags(xat->name, xat->name_len,
+						     &tgs) == 0 && tgs.hide;

 		if (show_hidden == is_hidden) {
 			if (size) {
@@ -693,15 +744,12 @@ ssize_t scoutfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 		       struct scoutfs_lock *lock)
 {
-	struct scoutfs_lock *indx_lock = NULL;
+	struct scoutfs_xattr_prefix_tags tgs;
 	struct scoutfs_xattr *xat = NULL;
-	struct scoutfs_key indx_key;
 	struct scoutfs_key last;
 	struct scoutfs_key key;
-	struct prefix_tags tgs;
 	bool release = false;
 	unsigned int bytes;
-	struct kvec val;
 	u64 hash;
 	int ret;

@@ -717,8 +765,8 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 	init_xattr_key(&last, ino, U32_MAX, U64_MAX);

 	for (;;) {
-		kvec_init(&val, (void *)xat, bytes);
-		ret = scoutfs_forest_next(sb, &key, &last, &val, lock);
+		ret = scoutfs_item_next(sb, &key, &last, (void *)xat, bytes,
+					lock);
 		if (ret < 0) {
 			if (ret == -ENOENT)
 				ret = 0;
@@ -726,32 +774,23 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 		}

 		if (key.skx_part != 0 ||
-		    parse_tags(xat->name, xat->name_len, &tgs) != 0)
+		    scoutfs_xattr_parse_tags(xat->name, xat->name_len,
+					     &tgs) != 0)
 			memset(&tgs, 0, sizeof(tgs));

-		if (tgs.indx) {
-			hash = scoutfs_hash64(xat->name, xat->name_len);
-			scoutfs_xattr_index_key(&indx_key, hash, ino,
-						le64_to_cpu(key.skx_id));
-			ret = scoutfs_lock_xattr_index(sb,
-						      SCOUTFS_LOCK_WRITE_ONLY,
-						      0, hash, &indx_lock);
-			if (ret < 0)
-				break;
-		}
-
-		ret = scoutfs_hold_trans(sb, SIC_EXACT(2, 0));
+		ret = scoutfs_hold_trans(sb);
 		if (ret < 0)
 			break;
 		release = true;

-		ret = scoutfs_forest_delete(sb, &key, lock);
+		ret = scoutfs_item_delete(sb, &key, lock);
 		if (ret < 0)
 			break;

-		if (tgs.indx) {
-		       ret = scoutfs_forest_delete_force(sb, &indx_key,
-							 indx_lock);
+		if (tgs.srch) {
+			hash = scoutfs_hash64(xat->name, xat->name_len);
+			ret = scoutfs_forest_srch_add(sb, hash, ino,
+						      le64_to_cpu(key.skx_id));
 		       if (ret < 0)
 			       break;
 		}
@@ -759,15 +798,11 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 		scoutfs_release_trans(sb);
 		release = false;

-		scoutfs_unlock(sb, indx_lock, SCOUTFS_LOCK_WRITE_ONLY);
-		indx_lock = NULL;
-
 		/* don't need to inc, next won't see deleted item */
 	}

 	if (release)
 		scoutfs_release_trans(sb);
-	scoutfs_unlock(sb, indx_lock, SCOUTFS_LOCK_WRITE_ONLY);
 	kfree(xat);
 out:
 	return ret;
--- a/kmod/src/xattr.h
+++ b/kmod/src/xattr.h
@@ -14,7 +14,12 @@ ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
 int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 		       struct scoutfs_lock *lock);

-void scoutfs_xattr_index_key(struct scoutfs_key *key,
-			     u64 hash, u64 ino, u64 id);
+struct scoutfs_xattr_prefix_tags {
+	unsigned long hide:1,
+		      srch:1;
+};
+
+int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len,
+			     struct scoutfs_xattr_prefix_tags *tgs);

 #endif
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -0,0 +1,6 @@
+src/*.d
+src/createmany
+src/dumb_setxattr
+src/handle_cat
+src/bulk_create_paths
+src/find_xattrs
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -0,0 +1,49 @@
+CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -fno-strict-aliasing 
+SHELL := /usr/bin/bash
+
+# each binary command is built from a single .c file
+BIN := src/createmany			\
+	src/dumb_setxattr		\
+	src/handle_cat			\
+	src/bulk_create_paths		\
+	src/find_xattrs
+
+DEPS := $(wildcard src/*.d)
+
+all: $(BIN)
+
+ifneq ($(DEPS),)
+-include $(DEPS)
+endif
+
+$(BIN): %: %.c Makefile
+	gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@
+
+.PHONY: clean
+clean:
+	@rm -f $(BIN) $(DEPS)
+
+#
+# Make sure we only have all three items needed for each test: entry in
+# sequence, test script in tests/, and output in golden/.
+#
+.PHONY: check-test-files
+check-test-files:
+	@for t in $$(grep -v "^#" sequence); do			\
+		test -e "tests/$$t" ||				\
+			echo "no test for list entry: $$t";	\
+		t=$${t%%.sh};					\
+		test -e "golden/$$t" ||				\
+			echo "no output for list entry: $$t";	\
+	done;							\
+	for t in golden/*; do					\
+		t=$$(basename "$$t");				\
+		grep -q "^$$t.sh$$" sequence ||			\
+			echo "output not in list: $$t";		\
+	done;							\
+	for t in tests/*; do					\
+		t=$$(basename "$$t");				\
+		test "$$t" == "list" && continue;		\
+		grep -q "^$$t$$" sequence ||			\
+			echo "test not in list: $$t";		\
+	done
--- a/tests/README.md
+++ b/tests/README.md
@@ -0,0 +1,123 @@
+
+This test suite exercises multi-node scoutfs by using multiple mounts on
+one host to simulate multiple nodes across a network.
+
+It also contains a light test wrapper that executes xfstests on one of
+the test mounts.
+
+## Invoking Tests
+
+The basic test invocation has to specify the devices for the fs the
+number of mounts to test, whether to create a new fs and insert the
+built module, and where to put the results.
+
+    # bash ./run-tests.sh                       \
+        -M /dev/vda                             \
+        -D /dev/vdb                             \
+        -i                                      \
+        -m                                      \
+        -n 3                                    \
+        -q 2                                    \
+        -r ./results
+
+All options can be seen by running with -h.
+
+This script is built to test multi-node systems on one host by using
+different mounts of the same devices.  The script creates a fake block
+device in front of each fs block device for each mount that will be
+tested.  Currently it will create free loop devices and will mount on
+/mnt/test.[0-9].
+
+All tests will be run by default.  Particular tests can be included or
+excluded by providing test name regular expressions with the -I and -E
+options.  The definitive list of tests and the order in which they'll be
+run is found in the sequence file.
+
+## xfstests
+
+The last test that is run checks out, builds, and runs xfstests.  It
+needs -X and -x options for the xfstests git repo and branch.  It also
+needs spare devices on which to make scratch scoutfs volumes.  The test
+verifies that the expected set of xfstests tests ran and passed.
+
+        -f /dev/vdc                             \
+        -e /dev/vdd                             \
+        -X $HOME/git/scoutfs-xfstests           \
+        -x scoutfs                              \
+
+An xfstests repo that knows about scoutfs is only required to sprinkle
+the scoutfs cases throughout the xfstests harness.
+
+## Individual Test Invocation
+
+Each test is run in a new bash invocation.  A set of directories in the
+test volume and in the results path are created for the test.  Each
+test's working directory isn't managed.
+
+Test output, temp files, and dmesg snapshots are all put in a tmp/ dir
+in the results/ dir.  Per-test dirs are only destroyed before each test
+invocation.
+
+The harness will check for unexpected output in dmesg after each
+individual test.
+
+Each test that fails will have its results appened to the fail.log file
+in the results/ directory.  The details of the failure can be examined
+in the directories for each test in results/output/ and results/tmp/. 
+
+## Writing tests
+
+Tests have access to a set of t\_ prefixed bash functions that are found
+in files in funcs/.
+
+Tests complete by calling t\_ functions which indicate the result of the
+test and can return a message.  If the tests passes then its output is
+compared with known good output.  If the output doesn't match then the
+test fails.  The t\_ completion functions return specific status codes so
+that returning without calling one can be detected.
+
+The golden output has to be consistent across test platforms so there
+are a number of filter functions which strip out local details from
+command output.  t\_filter\_fs is by far the most used which canonicalizes
+fs mount paths and block device details.
+
+Tests can be relatively loose about checking errors.  If commands
+produce output in failure cases then the test will fail without having
+to specifically test for errors on every command execution.  Care should
+be taken to make sure that blowing through a bunch of commands with no
+error checking doesn't produce catastrophic results.  Usually tests are
+simple and it's fine.
+
+A bare sync will sync all the mounted filesystems and ensure that
+no mounts have dirty data.  sync -f can be used to sync just a specific
+filesystem, though it doesn't exist on all platforms.
+
+The harness doesn't currently ensure that all mounts are restored after
+each test invocation.  It probably should.  Currently it's the
+responsibility of the test to restore any mounts it alters and there are
+t\_ functions to mount all configured mount points.
+
+## Environment Variables
+
+Tests have a number of exported environment variables that are commonly
+used during the test.
+
+| Variable         | Description          | Origin          | Example           |
+| ---------------- | -------------------  | --------------- | ----------------- |
+| T\_MB[0-9]       | per-mount meta bdev  | created per run | /dev/loop0        |
+| T\_DB[0-9]       | per-mount data bdev  | created per run | /dev/loop1        |
+| T\_D[0-9]        | per-mount test dir   | made for test   | /mnt/test.[0-9]/t |
+| T\_META\_DEVICE  | main FS meta bdev    | -M              | /dev/vda          |
+| T\_DATA\_DEVICE  | main FS data bdev    | -D              | /dev/vdb          |
+| T\_EX\_META\_DEV | scratch meta bdev    | -f              | /dev/vdd          |
+| T\_EX\_DATA\_DEV | scratch meta bdev    | -e              | /dev/vdc          |
+| T\_M[0-9]        | mount paths          | mounted per run | /mnt/test.[0-9]/  |
+| T\_NR\_MOUNTS    | number of mounts     | -n              | 3                 |
+| T\_O[0-9]        | mount options        | created per run | -o server\_addr=  |
+| T\_QUORUM        | quorum count         | -q              | 2                 |
+| T\_TMP           | per-test tmp prefix  | made for test   | results/tmp/t/tmp |
+| T\_TMPDIR        | per-test tmp dir dir | made for test   | results/tmp/t     |
+
+There are also a number of variables that are set in response to options
+and are exported but their use is rare so they aren't included here.
+
--- a/tests/funcs/exec.sh
+++ b/tests/funcs/exec.sh
@@ -0,0 +1,58 @@
+
+t_status_msg()
+{
+	echo "$*" > "$T_TMPDIR/status.msg"
+}
+
+export T_PASS_STATUS=100
+export T_SKIP_STATUS=101
+export T_FAIL_STATUS=102
+export T_FIRST_STATUS="$T_PASS_STATUS"
+export T_LAST_STATUS="$T_FAIL_STATUS"
+
+t_pass()
+{
+	exit $T_PASS_STATUS
+}
+
+t_skip()
+{
+	t_status_msg "$@"
+	exit $T_SKIP_STATUS
+}
+
+t_fail()
+{
+	t_status_msg "$@"
+	exit $T_FAIL_STATUS
+}
+
+#
+# Quietly run a command during a test.  If it succeeds then we have a
+# log of its execution but its output isn't included in the test's
+# compared output.  If it fails then the test fails.
+#
+t_quiet()
+{
+	echo "# $*" >> "$T_TMPDIR/quiet.log"
+	"$@" > "$T_TMPDIR/quiet.log" 2>&1 || \
+		t_fail "quiet command failed"
+}
+
+#
+# redirect test output back to the output of the invoking script intead
+# of the compared output.
+#
+t_restore_output()
+{
+	exec >&6 2>&1
+}
+
+#
+# redirect a command's output back to the compared output after the
+# test has restored its output
+#
+t_compare_output()
+{
+	"$@" >&7 2>&1
+}
--- a/tests/funcs/filter.sh
+++ b/tests/funcs/filter.sh
@@ -0,0 +1,66 @@
+
+# filter out device ids and mount paths
+t_filter_fs()
+{
+	sed -e 's@mnt/test\.[0-9]*@mnt/test@g' \
+	    -e 's@Device: [a-fA-F0-7]*h/[0-9]*d@Device: 0h/0d@g'
+}
+
+#
+# Filter out expected messages.  Putting messages here implies that
+# tests aren't relying on messages to discover failures.. they're
+# directly testing the result of whatever it is that's generating the
+# message.
+#
+t_filter_dmesg()
+{
+	local re
+
+	# the kernel can just be noisy
+	re=" used greatest stack depth: "
+
+	# mkfs/mount checks partition tables
+	re="$re|unknown partition table"
+
+	# dm swizzling
+	re="$re|device doesn't appear to be in the dev hash table"
+	re="$re|device-mapper:.*uevent:.*version"
+	re="$re|device-mapper:.*ioctl:.*initialised"
+
+	# some tests try invalid devices
+	re="$re|scoutfs .* error reading super block"
+	re="$re| EXT4-fs (.*): get root inode failed"
+	re="$re| EXT4-fs (.*): mount failed"
+	re="$re| EXT4-fs (.*): no journal found"
+	re="$re| EXT4-fs (.*): VFS: Can't find ext4 filesystem"
+
+	# dropping caches is fine
+	re="$re| drop_caches: "
+
+	# mount and unmount spew a bunch
+	re="$re|scoutfs.*client connected"
+	re="$re|scoutfs.*client disconnected"
+	re="$re|scoutfs.*server setting up"
+	re="$re|scoutfs.*server ready"
+	re="$re|scoutfs.*server accepted"
+	re="$re|scoutfs.*server closing"
+	re="$re|scoutfs.*server shutting down"
+	re="$re|scoutfs.*server stopped"
+
+	# xfstests records test execution in desg
+	re="$re| run fstests "
+
+	# tests that drop unmount io triggers fencing
+	re="$re|scoutfs .* error: fencing "
+	re="$re|scoutfs .*: waiting for .* lock clients"
+	re="$re|scoutfs .*: all lock clients recovered"
+	re="$re|scoutfs .* error: client rid.*lock recovery timed out"
+
+	# some tests mount w/o options
+	re="$re|scoutfs .* error: Required mount option \"metadev_path\" not found"
+
+	# in debugging kernels we can slow things down a bit
+	re="$re|hrtimer: interrupt took .*"
+
+	egrep -v "($re)" 
+}
--- a/tests/funcs/fs.sh
+++ b/tests/funcs/fs.sh
@@ -0,0 +1,231 @@
+
+#
+# Make all previously dirty items in memory in all mounts synced and
+# visible in the inode seq indexes.  We have to force a sync on every
+# node by dirtying data as that's the only way to guarantee advancing
+# the sequence number on each node which limits index visibility.  Some
+# distros don't have sync -f so we dirty our mounts then sync
+# everything.
+#
+t_sync_seq_index()
+{
+	local m
+	
+	for m in $T_MS; do
+		t_quiet touch $m
+	done
+	t_quiet sync
+}
+
+#
+# Output the "f.$fsid.r.$rid" identifier string for the given mount
+# number, 0 is used by default if none is specified. 
+#
+t_ident()
+{
+	local nr="${1:-0}"
+	local mnt="$(eval echo \$T_M$nr)"
+	local fsid
+	local rid
+
+	fsid=$(scoutfs statfs -s fsid -p "$mnt")
+	rid=$(scoutfs statfs -s rid -p "$mnt")
+
+	echo "f.${fsid:0:6}.r.${rid:0:6}"
+}
+
+#
+# Output the mount's sysfs path, defaulting to mount 0 if none is
+# specified.
+#
+t_sysfs_path()
+{
+	local nr="$1"
+
+	echo "/sys/fs/scoutfs/$(t_ident $nr)"
+}
+
+#
+# Output the mount's debugfs path, defaulting to mount 0 if none is
+# specified.
+#
+t_debugfs_path()
+{
+	local nr="$1"
+
+	echo "/sys/kernel/debug/scoutfs/$(t_ident $nr)"
+}
+
+#
+# output all the configured test nrs for iteration
+#
+t_fs_nrs()
+{
+	seq 0 $((T_NR_MOUNTS - 1))
+}
+
+#
+# Output the mount nr of the current server.  This takes no steps to
+# ensure that the server doesn't shut down and have some other mount
+# take over.  
+#
+t_server_nr()
+{
+	for i in $(t_fs_nrs); do
+		if [ "$(cat $(t_sysfs_path $i)/quorum/is_leader)" == "1" ]; then
+			echo $i
+			return
+		fi
+	done
+
+	t_fail "t_server_nr didn't find a server"
+}
+
+#
+# Output the mount nr of the first client that we find.  There can be
+# no clients if there's only one mount who has to be the server.  This
+# takes no steps to ensure that the client doesn't become a server at
+# any point.
+#
+t_first_client_nr()
+{
+	for i in $(t_fs_nrs); do
+		if [ "$(cat $(t_sysfs_path $i)/quorum/is_leader)" == "0" ]; then
+			echo $i
+			return
+		fi
+	done
+
+	t_fail "t_first_client_nr didn't find any clients"
+}
+
+t_mount()
+{
+	local nr="$1"
+
+	test "$nr" -lt "$T_NR_MOUNTS" || \
+		t_fail "fs nr $nr invalid"
+
+	eval t_quiet mount -t scoutfs \$T_O$nr \$T_DB$nr \$T_M$nr
+}
+
+t_umount()
+{
+	local nr="$1"
+
+	test "$nr" -lt "$T_NR_MOUNTS" || \
+		t_fail "fs nr $nr invalid"
+
+	eval t_quiet umount \$T_DB$i
+}
+
+#
+# Attempt to mount all the configured mounts, assuming that they're
+# not already mounted.
+#
+t_mount_all()
+{
+	local pids=""
+	local p
+
+	for i in $(t_fs_nrs); do
+		t_mount $i &
+		p="$!"
+		pids="$pids $!"
+	done
+	for p in $pids; do
+		t_quiet wait $p
+	done
+}
+
+#
+# Attempt to unmount all the configured mounts, assuming that they're
+# all mounted.
+#
+t_umount_all()
+{
+	local pids=""
+	local p
+
+	for i in $(t_fs_nrs); do
+		t_umount $i &
+		p="$!"
+		pids="$pids $!"
+	done
+	for p in $pids; do
+		t_quiet wait $p
+	done
+}
+
+t_remount_all()
+{
+	t_quiet t_umount_all || t_fail "umounting all failed"
+	t_quiet t_mount_all || t_fail "mounting all failed"
+}
+
+t_reinsert_remount_all()
+{
+	t_quiet t_umount_all || t_fail "umounting all failed"
+
+	t_quiet rmmod scoutfs || \
+		t_fail "rmmod scoutfs failed"
+	t_quiet insmod "$T_KMOD/src/scoutfs.ko" ||
+		t_fail "insmod scoutfs failed"
+
+	t_quiet t_mount_all || t_fail "mounting all failed"
+}
+
+t_trigger_path() {
+	local nr="$1"
+
+	echo "/sys/kernel/debug/scoutfs/$(t_ident $nr)/trigger"
+}
+
+t_trigger_get() {
+	local which="$1"
+	local nr="$2"
+
+	cat "$(t_trigger_path "$nr")/$which"
+}
+
+t_trigger_show() {
+	local which="$1"
+	local string="$2"
+	local nr="$3"
+
+	echo "trigger $which $string: $(t_trigger_get $which $nr)"
+}
+
+t_trigger_arm() {
+	local which="$1"
+	local nr="$2"
+	local path=$(t_trigger_path "$nr")
+
+	echo 1 > "$path/$which"
+	t_trigger_show $which armed $nr
+}
+
+#
+# output the value of the given counter for the given mount, defaulting
+# to mount 0 if a mount isn't specified.
+#
+t_counter() {
+	local which="$1"
+	local nr="$2"
+
+	cat "$(t_sysfs_path $nr)/counters/$which"
+}
+
+#
+# output the value of the given counter for the given mount, defaulting
+# to mount 0 if a mount isn't specified.
+#
+t_counter_diff() {
+	local which="$1"
+	local old="$2"
+	local nr="$3"
+	local new
+
+	new="$(t_counter $which $nr)"
+	echo "counter $which diff $((new - old))"
+}
--- a/tests/funcs/require.sh
+++ b/tests/funcs/require.sh
@@ -0,0 +1,40 @@
+
+#
+# Make sure that all the base command arguments are found in the path.
+# This isn't strictly necessary as the test will naturally fail if the
+# command isn't found, but it's nice to fail fast and clearly
+# communicate why.
+#
+t_require_commands() {
+	local c
+
+	for c in "$@"; do
+		which "$c" >/dev/null 2>&1 || \
+			t_fail "command $c not found in path"
+	done
+}
+
+#
+# make sure that we have at least this many mounts
+#
+t_require_mounts() {
+	local req="$1"
+
+	test "$T_NR_MOUNTS" -ge "$req" || \
+		t_skip "$req mounts required, only have $T_NR_MOUNTS"
+}
+
+#
+# Require that the meta device be at least the size string argument, as
+# parsed by numfmt using single char base 2 suffixes (iec).. 64G, etc.
+#
+t_require_meta_size() {
+	local dev="$T_META_DEVICE"
+	local req_iec="$1"
+	local req_bytes=$(numfmt --from=iec --to=none $req_iec)
+	local dev_bytes=$(blockdev --getsize64 $dev)
+	local dev_iec=$(numfmt --from=auto --to=iec $dev_bytes)
+
+	test "$dev_bytes" -ge "$req_bytes" || \
+		t_skip "$dev must be at least $req_iec, is $dev_iec"
+}
--- a/tests/golden/archive-light-cycle
+++ b/tests/golden/archive-light-cycle
@@ -0,0 +1,36 @@
+== calculate number of files
+== create per mount dirs
+== generate phase scripts
+== round 1: create
+== round 1: online
+== round 1: verify
+== round 1: release
+== round 1: offline
+== round 1: stage
+== round 1: online
+== round 1: verify
+== round 1: release
+== round 1: offline
+== round 1: unlink
+== round 2: create
+== round 2: online
+== round 2: verify
+== round 2: release
+== round 2: offline
+== round 2: stage
+== round 2: online
+== round 2: verify
+== round 2: release
+== round 2: offline
+== round 2: unlink
+== round 3: create
+== round 3: online
+== round 3: verify
+== round 3: release
+== round 3: offline
+== round 3: stage
+== round 3: online
+== round 3: verify
+== round 3: release
+== round 3: offline
+== round 3: unlink
--- a/tests/golden/basic-block-counts
+++ b/tests/golden/basic-block-counts
@@ -0,0 +1,53 @@
+== single block write
+online: 1
+offline: 0
+st_blocks: 8
+== single block overwrite
+online: 1
+offline: 0
+st_blocks: 8
+== append
+online: 2
+offline: 0
+st_blocks: 16
+== release
+online: 0
+offline: 2
+st_blocks: 16
+== duplicate release
+online: 0
+offline: 2
+st_blocks: 16
+== duplicate release past i_size
+online: 0
+offline: 2
+st_blocks: 16
+== stage
+online: 2
+offline: 0
+st_blocks: 16
+== duplicate stage
+online: 2
+offline: 0
+st_blocks: 16
+== larger file
+online: 256
+offline: 0
+st_blocks: 2048
+== partial truncate
+online: 128
+offline: 0
+st_blocks: 1024
+== single sparse block
+online: 1
+offline: 0
+st_blocks: 8
+== empty file
+online: 0
+offline: 0
+st_blocks: 0
+== non-regular file
+online: 0
+offline: 0
+st_blocks: 0
+== cleanup
--- a/tests/golden/basic-posix-consistency
+++ b/tests/golden/basic-posix-consistency
@@ -0,0 +1,55 @@
+== root inode updates flow back and forth
+== stat of created file matches
+== written file contents match
+== overwritten file contents match
+== appended file contents match
+== fiemap matches after racey appends
+== unlinked file isn't found
+== symlink targets match
+/mnt/test/test/basic-posix-consistency/file.targ
+/mnt/test/test/basic-posix-consistency/file.targ
+/mnt/test/test/basic-posix-consistency/file.targ2
+/mnt/test/test/basic-posix-consistency/file.targ2
+== new xattrs are visible
+# file: /mnt/test/test/basic-posix-consistency/file
+user.xat="1"
+
+# file: /mnt/test/test/basic-posix-consistency/file
+user.xat="1"
+
+== modified xattrs are updated
+# file: /mnt/test/test/basic-posix-consistency/file
+user.xat="2"
+
+# file: /mnt/test/test/basic-posix-consistency/file
+user.xat="2"
+
+== deleted xattrs
+/mnt/test/test/basic-posix-consistency/file: user.xat: No such attribute
+/mnt/test/test/basic-posix-consistency/file: user.xat: No such attribute
+== readdir after modification
+one
+two
+three
+four
+one
+two
+three
+four
+two
+four
+two
+four
+== can delete empty dir
+== some easy rename cases
+--- file between dirs
+--- file within dir
+--- dir within dir
+--- overwrite file
+--- can't overwrite non-empty dir
+mv: cannot move ‘/mnt/test/test/basic-posix-consistency/dir/c/clobber’ to ‘/mnt/test/test/basic-posix-consistency/dir/a/dir’: Directory not empty
+--- can overwrite empty dir
+== path resoluion
+== inode indexes match after syncing existing
+== inode indexes match after copying and syncing
+== inode indexes match after removing and syncing
--- a/tests/golden/createmany-large-names
+++ b/tests/golden/createmany-large-names
--- a/tests/golden/createmany-parallel
+++ b/tests/golden/createmany-parallel
@@ -0,0 +1,4 @@
+Run createmany in /mnt/test/test/createmany-parallel/0
+Run createmany in /mnt/test/test/createmany-parallel/1
+Run createmany in /mnt/test/test/createmany-parallel/2
+Run createmany in /mnt/test/test/createmany-parallel/3
--- a/tests/golden/createmany-parallel-mounts
+++ b/tests/golden/createmany-parallel-mounts
@@ -0,0 +1,3 @@
+== measure initial createmany
+== measure initial createmany
+== measure two concurrent createmany runs
--- a/tests/golden/createmany-rename-large-dir
+++ b/tests/golden/createmany-rename-large-dir
@@ -0,0 +1,2 @@
+== create large directory with 1220608 files
+== randomly renaming 5000 files
--- a/tests/golden/cross-mount-data-free
+++ b/tests/golden/cross-mount-data-free
@@ -0,0 +1,2 @@
+== repeated cross-mount alloc+free, totalling 2x free
+== remove empty test file
--- a/tests/golden/dirent-consistency
+++ b/tests/golden/dirent-consistency
@@ -0,0 +1,10 @@
+== create per node dirs
+== touch files on each node
+== recreate the files
+== turn the files into directories
+== rename parent dirs
+== rename parent dirs back
+== create some hard links
+== recreate one of the hard links
+== delete the remaining hard link
+== race to blow everything away
--- a/tests/golden/export-get-name-parent
+++ b/tests/golden/export-get-name-parent
--- a/tests/golden/inode-items-updated
+++ b/tests/golden/inode-items-updated
@@ -0,0 +1,4 @@
+== create files and sync
+== modify files
+== mount and unmount
+== verify files
--- a/tests/golden/lock-conflicting-batch-commit
+++ b/tests/golden/lock-conflicting-batch-commit
@@ -0,0 +1,4 @@
+== create per mount files
+== time independent modification
+== time concurrent independent modification
+== time concurrent conflicting modification
--- a/tests/golden/lock-ex-race-processes
+++ b/tests/golden/lock-ex-race-processes
@@ -0,0 +1,2 @@
+=== setup files ===
+=== ping-pong xattr ops ===
--- a/tests/golden/lock-pr-cw-conflict
+++ b/tests/golden/lock-pr-cw-conflict
@@ -0,0 +1 @@
+== race writing and index walking
--- a/tests/golden/lock-refleak
+++ b/tests/golden/lock-refleak
@@ -0,0 +1,3 @@
+== make test dir
+== do enough stuff to make lock leaks visible
+== make sure nothing has leaked
--- a/tests/golden/lock-revoke-getcwd
+++ b/tests/golden/lock-revoke-getcwd
@@ -0,0 +1,2 @@
+=== getcwd after lock revocation
+trigger statfs_lock_purge armed: 1
--- a/tests/golden/lock-shrink-consistency
+++ b/tests/golden/lock-shrink-consistency
@@ -0,0 +1,15 @@
+=== setup test file ===
+# file: /mnt/test/test/lock-shrink-consistency/dir/file
+user.test="aaa"
+
+=== commit dirty trans and revoke lock ===
+trigger statfs_lock_purge armed: 1
+trigger statfs_lock_purge after it fired: 0
+=== change xattr on other mount ===
+# file: /mnt/test/test/lock-shrink-consistency/dir/file
+user.test="bbb"
+
+=== verify new xattr under new lock on first mount ===
+# file: /mnt/test/test/lock-shrink-consistency/dir/file
+user.test="bbb"
+
--- a/tests/golden/mount-unmount-race
+++ b/tests/golden/mount-unmount-race
@@ -0,0 +1,3 @@
+== create per mount files
+== 30s of racing random mount/umount
+== mounting any unmounted
--- a/tests/golden/move-blocks
+++ b/tests/golden/move-blocks
@@ -0,0 +1,33 @@
+== build test files
+== wrapped offsets should fail
+ioctl failed on '/mnt/test/test/move-blocks/to': Value too large for defined data type (75)
+scoutfs: move-blocks failed: Value too large for defined data type (75)
+ioctl failed on '/mnt/test/test/move-blocks/to': Value too large for defined data type (75)
+scoutfs: move-blocks failed: Value too large for defined data type (75)
+== specifying same file fails
+ioctl failed on '/mnt/test/test/move-blocks/hardlink': Invalid argument (22)
+scoutfs: move-blocks failed: Invalid argument (22)
+== specifying files in other file systems fails
+ioctl failed on '/mnt/test/test/move-blocks/to': Invalid cross-device link (18)
+scoutfs: move-blocks failed: Invalid cross-device link (18)
+== offsets must be multiples of 4KB
+ioctl failed on '/mnt/test/test/move-blocks/to': Invalid argument (22)
+scoutfs: move-blocks failed: Invalid argument (22)
+ioctl failed on '/mnt/test/test/move-blocks/to': Invalid argument (22)
+scoutfs: move-blocks failed: Invalid argument (22)
+ioctl failed on '/mnt/test/test/move-blocks/to': Invalid argument (22)
+scoutfs: move-blocks failed: Invalid argument (22)
+== can't move onto existing extent
+ioctl failed on '/mnt/test/test/move-blocks/to': Invalid argument (22)
+scoutfs: move-blocks failed: Invalid argument (22)
+== can't move between files with offline extents
+ioctl failed on '/mnt/test/test/move-blocks/to': No data available (61)
+scoutfs: move-blocks failed: No data available (61)
+ioctl failed on '/mnt/test/test/move-blocks/to': No data available (61)
+scoutfs: move-blocks failed: No data available (61)
+== basic moves work
+== moving final partial block sets partial i_size
+123
+== moving updates inode fields
+== moving blocks backwards works
+== combine many files into one
--- a/tests/golden/offline-extent-waiting
+++ b/tests/golden/offline-extent-waiting
@@ -0,0 +1,56 @@
+== create files
+== waiter shows up in ioctl
+offline waiting should be empty:
+0
+offline waiting should now have one known entry:
+== multiple waiters on same block listed once
+offline waiting still has one known entry:
+== different blocks show up
+offline waiting now has two known entries:
+== staging wakes everyone
+offline waiting should be empty again:
+0
+== interruption does no harm
+offline waiting should now have one known entry:
+offline waiting should be empty again:
+0
+== EIO injection for waiting readers works
+offline waiting should now have two known entries:
+2
+data_wait_err found 2 waiters.
+offline waiting should now have 0 known entries:
+0
+dd: error reading ‘/mnt/test/test/offline-extent-waiting/dir/file’: Input/output error
+0+0 records in
+0+0 records out
+dd: error reading ‘/mnt/test/test/offline-extent-waiting/dir/file’: Input/output error
+0+0 records in
+0+0 records out
+offline waiting should be empty again:
+0
+== readahead while offline does no harm
+== waiting on interesting blocks works
+offline waiting is empty at block 0
+0
+offline waiting is empty at block 1
+0
+offline waiting is empty at block 128
+0
+offline waiting is empty at block 129
+0
+offline waiting is empty at block 254
+0
+offline waiting is empty at block 255
+0
+== contents match when staging blocks forward
+== contents match when staging blocks backwards
+== truncate to same size doesn't wait
+offline wating should be empty:
+0
+== truncating does wait
+truncate should be waiting for first block:
+trunate should no longer be waiting:
+0
+== writing waits
+should be waiting for write
+== cleanup
--- a/tests/golden/persistent-item-vers
+++ b/tests/golden/persistent-item-vers
@@ -0,0 +1,4 @@
+== advance lock version by creating unrelated files
+== create before file version
+== verify before version, touch after version
+== verify after version
--- a/tests/golden/setattr_more
+++ b/tests/golden/setattr_more
@@ -0,0 +1,31 @@
+== 0 data_version arg fails
+setattr: data version must not be 0
+Try `setattr --help' or `setattr --usage' for more information.
+== args must specify size and offline
+setattr: must provide size if using --offline option
+Try `setattr --help' or `setattr --usage' for more information.
+== only works on regular files
+failed to open '/mnt/test/test/setattr_more/dir': Is a directory (21)
+scoutfs: setattr failed: Is a directory (21)
+setattr_more ioctl failed on '/mnt/test/test/setattr_more/char': Inappropriate ioctl for device (25)
+scoutfs: setattr failed: Inappropriate ioctl for device (25)
+== non-zero file size fails
+setattr_more ioctl failed on '/mnt/test/test/setattr_more/file': Invalid argument (22)
+scoutfs: setattr failed: Invalid argument (22)
+== non-zero file data_version fails
+setattr_more ioctl failed on '/mnt/test/test/setattr_more/file': Invalid argument (22)
+scoutfs: setattr failed: Invalid argument (22)
+== large size is set
+578437695752307201
+== large data_version is set
+578437695752307201
+== large ctime is set
+1972-02-19 00:06:25.999999999 +0000
+== large offline extents are created
+Filesystem type is: 554f4353
+File size of /mnt/test/test/setattr_more/file is 40988672 (10007 blocks of 4096 bytes)
+ ext:     logical_offset:        physical_offset: length:   expected: flags:
+   0:        0..   10006:          0..     10006:  10007:             unknown,eof
+/mnt/test/test/setattr_more/file: 1 extent found
+== correct offline extent length
+976563
--- a/tests/golden/setup-error-teardown
+++ b/tests/golden/setup-error-teardown
@@ -0,0 +1 @@
+== interrupt waiting mount
--- a/tests/golden/simple-inode-index
+++ b/tests/golden/simple-inode-index
@@ -0,0 +1,9 @@
+== dirs shouldn't appear in data_seq queries
+== two created files are present and come after each other
+found first
+found second
+== unlinked entries must not be present
+== dirty inodes can not be present
+== changing metadata must increase meta seq
+== changing contents must increase data seq
+== make sure dirtying doesn't livelock walk
--- a/tests/golden/simple-release-extents
+++ b/tests/golden/simple-release-extents
@@ -0,0 +1,146 @@
+== simple whole file multi-block releasing
+== release last block that straddles i_size
+== release entire file past i_size
+== releasing offline extents is fine
+== 0 count is fine
+== release past i_size is fine
+== wrapped blocks fails
+release ioctl failed: Invalid argument (22)
+scoutfs: release failed: Invalid argument (22)
+== releasing non-file fails
+ioctl failed: Inappropriate ioctl for device (25)
+release: must provide file version --data-version
+Try `release --help' or `release --usage' for more information.
+== releasing a non-scoutfs file fails
+ioctl failed: Inappropriate ioctl for device (25)
+release: must provide file version --data-version
+Try `release --help' or `release --usage' for more information.
+== releasing bad version fails
+release: must provide file version --data-version
+Try `release --help' or `release --usage' for more information.
+== verify small release merging
+0 0 0:  (0 0 1)  (1 101 4)
+0 0 1:  (0 0 2)  (2 102 3)
+0 0 2:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 2)
+0 0 3:  (0 0 1)  (1 101 2)  (3 0 1)  (4 104 1)
+0 0 4:  (0 0 1)  (1 101 3)  (4 0 1)
+0 1 0:  (0 0 2)  (2 102 3)
+0 1 1:  (0 0 2)  (2 102 3)
+0 1 2:  (0 0 3)  (3 103 2)
+0 1 3:  (0 0 2)  (2 102 1)  (3 0 1)  (4 104 1)
+0 1 4:  (0 0 2)  (2 102 2)  (4 0 1)
+0 2 0:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 2)
+0 2 1:  (0 0 3)  (3 103 2)
+0 2 2:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 2)
+0 2 3:  (0 0 1)  (1 101 1)  (2 0 2)  (4 104 1)
+0 2 4:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 1)  (4 0 1)
+0 3 0:  (0 0 1)  (1 101 2)  (3 0 1)  (4 104 1)
+0 3 1:  (0 0 2)  (2 102 1)  (3 0 1)  (4 104 1)
+0 3 2:  (0 0 1)  (1 101 1)  (2 0 2)  (4 104 1)
+0 3 3:  (0 0 1)  (1 101 2)  (3 0 1)  (4 104 1)
+0 3 4:  (0 0 1)  (1 101 2)  (3 0 2)
+0 4 0:  (0 0 1)  (1 101 3)  (4 0 1)
+0 4 1:  (0 0 2)  (2 102 2)  (4 0 1)
+0 4 2:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 1)  (4 0 1)
+0 4 3:  (0 0 1)  (1 101 2)  (3 0 2)
+0 4 4:  (0 0 1)  (1 101 3)  (4 0 1)
+1 0 0:  (0 0 2)  (2 102 3)
+1 0 1:  (0 0 2)  (2 102 3)
+1 0 2:  (0 0 3)  (3 103 2)
+1 0 3:  (0 0 2)  (2 102 1)  (3 0 1)  (4 104 1)
+1 0 4:  (0 0 2)  (2 102 2)  (4 0 1)
+1 1 0:  (0 0 2)  (2 102 3)
+1 1 1:  (0 100 1)  (1 0 1)  (2 102 3)
+1 1 2:  (0 100 1)  (1 0 2)  (3 103 2)
+1 1 3:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 1)  (4 104 1)
+1 1 4:  (0 100 1)  (1 0 1)  (2 102 2)  (4 0 1)
+1 2 0:  (0 0 3)  (3 103 2)
+1 2 1:  (0 100 1)  (1 0 2)  (3 103 2)
+1 2 2:  (0 100 1)  (1 0 2)  (3 103 2)
+1 2 3:  (0 100 1)  (1 0 3)  (4 104 1)
+1 2 4:  (0 100 1)  (1 0 2)  (3 103 1)  (4 0 1)
+1 3 0:  (0 0 2)  (2 102 1)  (3 0 1)  (4 104 1)
+1 3 1:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 1)  (4 104 1)
+1 3 2:  (0 100 1)  (1 0 3)  (4 104 1)
+1 3 3:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 1)  (4 104 1)
+1 3 4:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 2)
+1 4 0:  (0 0 2)  (2 102 2)  (4 0 1)
+1 4 1:  (0 100 1)  (1 0 1)  (2 102 2)  (4 0 1)
+1 4 2:  (0 100 1)  (1 0 2)  (3 103 1)  (4 0 1)
+1 4 3:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 2)
+1 4 4:  (0 100 1)  (1 0 1)  (2 102 2)  (4 0 1)
+2 0 0:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 2)
+2 0 1:  (0 0 3)  (3 103 2)
+2 0 2:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 2)
+2 0 3:  (0 0 1)  (1 101 1)  (2 0 2)  (4 104 1)
+2 0 4:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 1)  (4 0 1)
+2 1 0:  (0 0 3)  (3 103 2)
+2 1 1:  (0 100 1)  (1 0 2)  (3 103 2)
+2 1 2:  (0 100 1)  (1 0 2)  (3 103 2)
+2 1 3:  (0 100 1)  (1 0 3)  (4 104 1)
+2 1 4:  (0 100 1)  (1 0 2)  (3 103 1)  (4 0 1)
+2 2 0:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 2)
+2 2 1:  (0 100 1)  (1 0 2)  (3 103 2)
+2 2 2:  (0 100 2)  (2 0 1)  (3 103 2)
+2 2 3:  (0 100 2)  (2 0 2)  (4 104 1)
+2 2 4:  (0 100 2)  (2 0 1)  (3 103 1)  (4 0 1)
+2 3 0:  (0 0 1)  (1 101 1)  (2 0 2)  (4 104 1)
+2 3 1:  (0 100 1)  (1 0 3)  (4 104 1)
+2 3 2:  (0 100 2)  (2 0 2)  (4 104 1)
+2 3 3:  (0 100 2)  (2 0 2)  (4 104 1)
+2 3 4:  (0 100 2)  (2 0 3)
+2 4 0:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 1)  (4 0 1)
+2 4 1:  (0 100 1)  (1 0 2)  (3 103 1)  (4 0 1)
+2 4 2:  (0 100 2)  (2 0 1)  (3 103 1)  (4 0 1)
+2 4 3:  (0 100 2)  (2 0 3)
+2 4 4:  (0 100 2)  (2 0 1)  (3 103 1)  (4 0 1)
+3 0 0:  (0 0 1)  (1 101 2)  (3 0 1)  (4 104 1)
+3 0 1:  (0 0 2)  (2 102 1)  (3 0 1)  (4 104 1)
+3 0 2:  (0 0 1)  (1 101 1)  (2 0 2)  (4 104 1)
+3 0 3:  (0 0 1)  (1 101 2)  (3 0 1)  (4 104 1)
+3 0 4:  (0 0 1)  (1 101 2)  (3 0 2)
+3 1 0:  (0 0 2)  (2 102 1)  (3 0 1)  (4 104 1)
+3 1 1:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 1)  (4 104 1)
+3 1 2:  (0 100 1)  (1 0 3)  (4 104 1)
+3 1 3:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 1)  (4 104 1)
+3 1 4:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 2)
+3 2 0:  (0 0 1)  (1 101 1)  (2 0 2)  (4 104 1)
+3 2 1:  (0 100 1)  (1 0 3)  (4 104 1)
+3 2 2:  (0 100 2)  (2 0 2)  (4 104 1)
+3 2 3:  (0 100 2)  (2 0 2)  (4 104 1)
+3 2 4:  (0 100 2)  (2 0 3)
+3 3 0:  (0 0 1)  (1 101 2)  (3 0 1)  (4 104 1)
+3 3 1:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 1)  (4 104 1)
+3 3 2:  (0 100 2)  (2 0 2)  (4 104 1)
+3 3 3:  (0 100 3)  (3 0 1)  (4 104 1)
+3 3 4:  (0 100 3)  (3 0 2)
+3 4 0:  (0 0 1)  (1 101 2)  (3 0 2)
+3 4 1:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 2)
+3 4 2:  (0 100 2)  (2 0 3)
+3 4 3:  (0 100 3)  (3 0 2)
+3 4 4:  (0 100 3)  (3 0 2)
+4 0 0:  (0 0 1)  (1 101 3)  (4 0 1)
+4 0 1:  (0 0 2)  (2 102 2)  (4 0 1)
+4 0 2:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 1)  (4 0 1)
+4 0 3:  (0 0 1)  (1 101 2)  (3 0 2)
+4 0 4:  (0 0 1)  (1 101 3)  (4 0 1)
+4 1 0:  (0 0 2)  (2 102 2)  (4 0 1)
+4 1 1:  (0 100 1)  (1 0 1)  (2 102 2)  (4 0 1)
+4 1 2:  (0 100 1)  (1 0 2)  (3 103 1)  (4 0 1)
+4 1 3:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 2)
+4 1 4:  (0 100 1)  (1 0 1)  (2 102 2)  (4 0 1)
+4 2 0:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 1)  (4 0 1)
+4 2 1:  (0 100 1)  (1 0 2)  (3 103 1)  (4 0 1)
+4 2 2:  (0 100 2)  (2 0 1)  (3 103 1)  (4 0 1)
+4 2 3:  (0 100 2)  (2 0 3)
+4 2 4:  (0 100 2)  (2 0 1)  (3 103 1)  (4 0 1)
+4 3 0:  (0 0 1)  (1 101 2)  (3 0 2)
+4 3 1:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 2)
+4 3 2:  (0 100 2)  (2 0 3)
+4 3 3:  (0 100 3)  (3 0 2)
+4 3 4:  (0 100 3)  (3 0 2)
+4 4 0:  (0 0 1)  (1 101 3)  (4 0 1)
+4 4 1:  (0 100 1)  (1 0 1)  (2 102 2)  (4 0 1)
+4 4 2:  (0 100 2)  (2 0 1)  (3 103 1)  (4 0 1)
+4 4 3:  (0 100 3)  (3 0 2)
+4 4 4:  (0 100 4)  (4 0 1)
--- a/tests/golden/simple-staging
+++ b/tests/golden/simple-staging
@@ -0,0 +1,23 @@
+== create/release/stage single block file
+== create/release/stage larger file
+== multiple release,drop_cache,stage cycles
+== release+stage shouldn't change stat, data seq or vers
+== stage does change meta_seq
+== can't use stage to extend online file
+stage: must provide file version with --data-version
+Try `stage --help' or `stage --usage' for more information.
+== wrapped region fails
+stage returned -1, not 4096: error Invalid argument (22)
+scoutfs: stage failed: Input/output error (5)
+== non-block aligned offset fails
+stage returned -1, not 4095: error Invalid argument (22)
+scoutfs: stage failed: Input/output error (5)
+== non-block aligned len within block fails
+stage returned -1, not 1024: error Invalid argument (22)
+scoutfs: stage failed: Input/output error (5)
+== partial final block that writes to i_size does work
+== zero length stage doesn't bring blocks online
+== stage of non-regular file fails
+ioctl failed: Inappropriate ioctl for device (25)
+stage: must provide file version with --data-version
+Try `stage --help' or `stage --usage' for more information.
--- a/tests/golden/simple-xattr-unit
+++ b/tests/golden/simple-xattr-unit
@@ -0,0 +1,18 @@
+=== XATTR_ flag combinations
+dumb_setxattr -p /mnt/test/test/simple-xattr-unit/file -n user.test -v val -c -r
+returned -1 errno 22 (Invalid argument)
+dumb_setxattr -p /mnt/test/test/simple-xattr-unit/file -n user.test -v val -r
+returned -1 errno 61 (No data available)
+dumb_setxattr -p /mnt/test/test/simple-xattr-unit/file -n user.test -v val -c
+returned 0
+dumb_setxattr -p /mnt/test/test/simple-xattr-unit/file -n user.test -v val -c
+returned -1 errno 17 (File exists)
+dumb_setxattr -p /mnt/test/test/simple-xattr-unit/file -n user.test -v val -r
+returned 0
+=== bad lengths
+setfattr: /mnt/test/test/simple-xattr-unit/file: Operation not supported
+setfattr: /mnt/test/test/simple-xattr-unit/file: Numerical result out of range
+setfattr: /mnt/test/test/simple-xattr-unit/file: Numerical result out of range
+setfattr: /mnt/test/test/simple-xattr-unit/file: Argument list too long
+=== good length boundaries
+=== 500 random lengths
--- a/tests/golden/srch-basic-functionality
+++ b/tests/golden/srch-basic-functionality
@@ -0,0 +1,13 @@
+== create new xattrs
+== update existing xattr
+== remove an xattr
+== remove xattr with files
+== create entries in current log
+== delete small fraction
+== remove files
+== create entries that exceed one log
+== delete fractions in phases
+== remove files
+== create entries for exceed search entry limit
+== delete half
+== entirely remove third batch
--- a/tests/golden/stage-multi-part
+++ b/tests/golden/stage-multi-part
--- a/tests/golden/stage-release-race-alloc
+++ b/tests/golden/stage-release-race-alloc
@@ -0,0 +1,2 @@
+== create initial files
+== race stage and release
--- a/tests/golden/stale-btree-read
+++ b/tests/golden/stale-btree-read
@@ -0,0 +1,11 @@
+== create file for xattr ping pong
+# file: /mnt/test/test/stale-btree-read/file
+user.xat="initial"
+
+== retry btree block read
+trigger btree_stale_read armed: 1
+# file: /mnt/test/test/stale-btree-read/file
+user.xat="btree"
+
+trigger btree_stale_read after: 0
+counter btree_stale_read diff 1
--- a/tests/golden/xfstests
+++ b/tests/golden/xfstests
@@ -0,0 +1,281 @@
+Ran:
+generic/001
+generic/002
+generic/005
+generic/006
+generic/007
+generic/011
+generic/013
+generic/014
+generic/020
+generic/028
+generic/032
+generic/034
+generic/035
+generic/037
+generic/039
+generic/040
+generic/041
+generic/053
+generic/056
+generic/057
+generic/062
+generic/065
+generic/066
+generic/067
+generic/069
+generic/070
+generic/071
+generic/073
+generic/076
+generic/084
+generic/086
+generic/087
+generic/088
+generic/090
+generic/092
+generic/098
+generic/101
+generic/104
+generic/106
+generic/107
+generic/117
+generic/124
+generic/129
+generic/131
+generic/169
+generic/184
+generic/221
+generic/228
+generic/236
+generic/245
+generic/249
+generic/257
+generic/258
+generic/286
+generic/294
+generic/306
+generic/307
+generic/308
+generic/309
+generic/313
+generic/315
+generic/322
+generic/335
+generic/336
+generic/337
+generic/341
+generic/342
+generic/343
+generic/348
+generic/360
+generic/376
+generic/377
+Not
+run:
+generic/004
+generic/008
+generic/009
+generic/012
+generic/015
+generic/016
+generic/018
+generic/021
+generic/022
+generic/026
+generic/031
+generic/033
+generic/050
+generic/052
+generic/058
+generic/059
+generic/060
+generic/061
+generic/063
+generic/064
+generic/079
+generic/081
+generic/082
+generic/091
+generic/094
+generic/096
+generic/110
+generic/111
+generic/113
+generic/114
+generic/115
+generic/116
+generic/118
+generic/119
+generic/121
+generic/122
+generic/123
+generic/128
+generic/130
+generic/134
+generic/135
+generic/136
+generic/138
+generic/139
+generic/140
+generic/142
+generic/143
+generic/144
+generic/145
+generic/146
+generic/147
+generic/148
+generic/149
+generic/150
+generic/151
+generic/152
+generic/153
+generic/154
+generic/155
+generic/156
+generic/157
+generic/158
+generic/159
+generic/160
+generic/161
+generic/162
+generic/163
+generic/171
+generic/172
+generic/173
+generic/174
+generic/177
+generic/178
+generic/179
+generic/180
+generic/181
+generic/182
+generic/183
+generic/185
+generic/188
+generic/189
+generic/190
+generic/191
+generic/193
+generic/194
+generic/195
+generic/196
+generic/197
+generic/198
+generic/199
+generic/200
+generic/201
+generic/202
+generic/203
+generic/205
+generic/206
+generic/207
+generic/210
+generic/211
+generic/212
+generic/214
+generic/216
+generic/217
+generic/218
+generic/219
+generic/220
+generic/222
+generic/223
+generic/225
+generic/227
+generic/229
+generic/230
+generic/235
+generic/238
+generic/240
+generic/244
+generic/250
+generic/252
+generic/253
+generic/254
+generic/255
+generic/256
+generic/259
+generic/260
+generic/261
+generic/262
+generic/263
+generic/264
+generic/265
+generic/266
+generic/267
+generic/268
+generic/271
+generic/272
+generic/276
+generic/277
+generic/278
+generic/279
+generic/281
+generic/282
+generic/283
+generic/284
+generic/287
+generic/288
+generic/289
+generic/290
+generic/291
+generic/292
+generic/293
+generic/295
+generic/296
+generic/301
+generic/302
+generic/303
+generic/304
+generic/305
+generic/312
+generic/314
+generic/316
+generic/317
+generic/318
+generic/324
+generic/326
+generic/327
+generic/328
+generic/329
+generic/330
+generic/331
+generic/332
+generic/353
+generic/355
+generic/356
+generic/357
+generic/358
+generic/359
+generic/361
+generic/362
+generic/363
+generic/364
+generic/365
+generic/366
+generic/367
+generic/368
+generic/369
+generic/370
+generic/371
+generic/372
+generic/373
+generic/374
+generic/378
+generic/379
+generic/380
+generic/381
+generic/382
+generic/383
+generic/384
+generic/385
+generic/386
+shared/001
+shared/002
+shared/003
+shared/004
+shared/032
+shared/051
+shared/289
+Passed all 72 tests
--- a/Show More
+++ b/Show More