scoutfs: mmap: add support for writable shared mmap()ings

Add support for writable MAP_SHARED mmap()ings. Avoid issues with late writepage()s building transactions by doing the block_write_begin() work in scoutfs_data_page_mkwrite(). Ensure the page is marked dirty and prepared for write, then let the VM complete the write when the page is flushed or invalidated. Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
mmap: add support for read only mmap()
2026-06-09 21:22:36 +00:00 · 2021-03-31 10:43:47 -07:00 · 2021-03-31 10:42:37 -07:00
196 changed files with 6371 additions and 25525 deletions
@@ -1,17 +0,0 @@
-#
-# Typically development is done in each subdir, but we have a tiny
-# makefile here to make it easy to run simple targets across all the
-# subdirs.
-#
-
-SUBDIRS := kmod utils tests
-NOTTESTS := kmod utils
-
-all clean: $(SUBDIRS) FORCE
-dist: $(NOTTESTS) FORCE
-
-$(SUBDIRS): FORCE
-	$(MAKE) -C $@ $(MAKECMDGOALS)
-
-all:
-FORCE:
@@ -51,7 +51,7 @@ modules_install:

 dist: scoutfs-kmod.spec
 	git archive --format=tar --prefix scoutfs-kmod-$(RPM_VERSION)/ HEAD^{tree} > $(TARFILE)
-	@ tar rf $(TARFILE) --transform="s@\(.*\)@scoutfs-kmod-$(RPM_VERSION)/\1@" scoutfs-kmod.spec
+	@ tar rf $(TARFILE) --transform="s@\(.*\)@scoutfs-$(RPM_VERSION)/\1@" scoutfs-kmod.spec

 clean:
 	make $(SCOUTFS_ARGS) clean
@@ -6,7 +6,7 @@ from the ground up to support large archival systems.
 Its key differentiating features are:

 - Integrated consistent indexing accelerates archival maintenance operations
- - Commit logs allow nodes to write concurrently without contention
+ - Log-structured commits allow nodes to write concurrently without contention

 It meets best of breed expectations:

@@ -62,17 +62,17 @@ help on the mailing list.**
 The requirements for running scoutfs on a small cluster are:

 1. One or more nodes running x86-64 CentOS/RHEL 7.4 (or 7.3)
- 2. Access to two shared block devices
+ 2. Access to a single shared block device
 3. IPv4 connectivity between the nodes

 The steps for getting scoutfs mounted and operational are:

 1. Get the kernel module running on the nodes
- 2. Make a new filesystem on the devices with the userspace utilities
- 3. Mount the devices on all the nodes
+ 2. Make a new filesystem on the device with the userspace utilities
+ 3. Mount the device on all the nodes

-In this example we run all of these commands on three nodes.  The names
-of the block devices are the same on all the nodes.
+In this example we run all of these commands on three nodes.  The block
+device name is the same on all the nodes.

 1. Get the Kernel Module and Userspace Binaries

@@ -87,11 +87,14 @@ of the block devices are the same on all the nodes.

   ```shell
   yum install kernel-devel
-   git clone git@github.com:versity/scoutfs.git
-   make -C scoutfs
+   git clone git@github.com:versity/scoutfs-kmod-dev.git
+   make -C scoutfs-kmod-dev module
   modprobe libcrc32c
-   insmod scoutfs/kmod/src/scoutfs.ko
-   alias scoutfs=$PWD/scoutfs/utils/src/scoutfs
+   insmod scoutfs-kmod-dev/src/scoutfs.ko
+
+   git clone git@github.com:versity/scoutfs-utils-dev.git
+   make -C scoutfs-utils-dev
+   alias scoutfs=$PWD/scoutfs-utils-dev/src/scoutfs
   ```

 2. Make a New Filesystem (**destroys contents, no questions asked**)
@@ -100,7 +103,7 @@ of the block devices are the same on all the nodes.
   quorum for the system to function.

   ```shell
-   scoutfs mkfs -Q 2 /dev/meta_dev /dev/data_dev
+   scoutfs mkfs -Q 2 /dev/shared_block_device
   ```

 3. Mount the Filesystem
@@ -111,7 +114,7 @@ of the block devices are the same on all the nodes.

   ```shell
   mkdir /mnt/scoutfs
-   mount -t scoutfs -o server_addr=$NODE_ADDR,metadev_path=/dev/meta_dev /dev/data_dev /mnt/scoutfs
+   mount -t scoutfs -o server_addr=$NODE_ADDR /dev/shared_block_device /mnt/scoutfs
   ```

 4. For Kicks, Observe the Metadata Change Index
@@ -9,8 +9,6 @@ CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include
 -include $(src)/Makefile.kernelcompat
 
 scoutfs-y +=			\
-	avl.o			\
-	alloc.o			\
 	block.o			\
 	btree.o			\
 	client.o		\
@@ -18,12 +16,10 @@ scoutfs-y +=			\
 	data.o			\
 	dir.o			\
 	export.o		\
-	ext.o			\
 	file.o			\
 	forest.o		\
 	inode.o			\
 	ioctl.o			\
-	item.o			\
 	lock.o			\
 	lock_server.o		\
 	msg.o			\
@@ -31,11 +27,10 @@ scoutfs-y +=			\
 	options.o		\
 	per_task.o		\
 	quorum.o		\
+	radix.o			\
 	scoutfs_trace.o		\
 	server.o		\
-	sort_priv.o		\
 	spbm.o			\
-	srch.o			\
 	super.o			\
 	sysfs.o			\
 	trans.o			\
@@ -55,9 +50,5 @@ $(src)/check_exported_types:
 		echo "no raw types in exported headers, preface with __";     \
 		exit 1;							      \
 	fi
-	@if egrep '\<__packed\>' $(src)/format.h $(src)/ioctl.h; then \
-		echo "no __packed allowed in exported headers";     \
-		exit 1;							      \
-	fi

 extra-y += check_exported_types
@@ -1,155 +0,0 @@
-#ifndef _SCOUTFS_ALLOC_H_
-#define _SCOUTFS_ALLOC_H_
-
-#include "ext.h"
-
-/*
- * These are implementation-specific metrics, they don't need to be
- * consistent across implementations.  They should probably be run-time
- * knobs.
- */
-
-/*
- * The largest extent that we'll try to allocate with fallocate.  We're
- * trying not to completely consume a transaction's data allocation all
- * at once.  This is only allocation granularity, repeated allocations
- * can produce large contiguous extents.
- */
-#define SCOUTFS_FALLOCATE_ALLOC_LIMIT \
-	(128ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
-
-/*
- * The largest aligned region that we'll try to allocate at the end of
- * the file as it's extended.  This is also limited to the current file
- * size so we can only waste at most twice the total file size when
- * files are less than this.  We try to keep this around the point of
- * diminishing returns in streaming performance of common data devices
- * to limit waste.
- */
-#define SCOUTFS_DATA_EXTEND_PREALLOC_LIMIT \
-	(8ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
-
-/*
- * Small data allocations are satisfied by cached extents stored in
- * the run-time alloc struct to minimize item operations for small
- * block allocations.  Large allocations come directly from btree
- * extent items, and this defines the threshold between them.
- */
-#define SCOUTFS_ALLOC_DATA_LG_THRESH \
-	(8ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
-
-/*
- * Fill client alloc roots to the target when they fall below the lo
- * threshold.
- *
- * We're giving the client the most available meta blocks we can so that
- * it has the freedom to build large transactions before worrying that
- * it might run out of meta allocs during commits.
- */
-#define SCOUTFS_SERVER_META_FILL_TARGET \
-	SCOUTFS_ALLOC_LIST_MAX_BLOCKS
-#define SCOUTFS_SERVER_META_FILL_LO \
-	(SCOUTFS_ALLOC_LIST_MAX_BLOCKS / 2)
-#define SCOUTFS_SERVER_DATA_FILL_TARGET \
-	(4ULL * 1024 * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
-#define SCOUTFS_SERVER_DATA_FILL_LO \
-	(1ULL * 1024 * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
-
-/*
- * Each of the server meta_alloc roots will try to keep a minimum amount
- * of free blocks.  The server will swap roots when its current avail
- * falls below the threshold while the freed root is still above it.  It
- * must have room for all the largest allocation attempted in a
- * transaction on the server.
- */
-#define SCOUTFS_SERVER_META_ALLOC_MIN \
-	(SCOUTFS_SERVER_META_FILL_TARGET * 2)
-
-/*
- * A run-time use of a pair of persistent avail/freed roots as a
- * metadata allocator.  It has the machinery needed to lock and avoid
- * recursion when dirtying the list blocks that are used during the
- * transaction.
- */
-struct scoutfs_alloc {
-	spinlock_t lock;
-	struct mutex mutex;
-	struct scoutfs_block *dirty_avail_bl;
-	struct scoutfs_block *dirty_freed_bl;
-	struct scoutfs_alloc_list_head avail;
-	struct scoutfs_alloc_list_head freed;
-};
-
-/*
- * A run-time data allocator.  We have a cached extent in memory that is
- * a lot cheaper to work with than the extent items, and we have a
- * consistent record of the total_len that can be sampled outside of the
- * usual heavy serialization of the extent modifications.
- */
-struct scoutfs_data_alloc {
-	struct scoutfs_alloc_root root;
-	struct scoutfs_extent cached;
-	atomic64_t total_len;
-};
-
-void scoutfs_alloc_init(struct scoutfs_alloc *alloc,
-			struct scoutfs_alloc_list_head *avail,
-			struct scoutfs_alloc_list_head *freed);
-int scoutfs_alloc_prepare_commit(struct super_block *sb,
-				 struct scoutfs_alloc *alloc,
-				 struct scoutfs_block_writer *wri);
-
-int scoutfs_alloc_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
-		       struct scoutfs_block_writer *wri, u64 *blkno);
-int scoutfs_free_meta(struct super_block *sb, struct scoutfs_alloc *alloc,
-		      struct scoutfs_block_writer *wri, u64 blkno);
-
-void scoutfs_dalloc_init(struct scoutfs_data_alloc *dalloc,
-			 struct scoutfs_alloc_root *data_avail);
-void scoutfs_dalloc_get_root(struct scoutfs_data_alloc *dalloc,
-			     struct scoutfs_alloc_root *data_avail);
-u64 scoutfs_dalloc_total_len(struct scoutfs_data_alloc *dalloc);
-int scoutfs_dalloc_return_cached(struct super_block *sb,
-				 struct scoutfs_alloc *alloc,
-				 struct scoutfs_block_writer *wri,
-				 struct scoutfs_data_alloc *dalloc);
-int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
-		       struct scoutfs_block_writer *wri,
-		       struct scoutfs_data_alloc *dalloc, u64 count,
-		       u64 *blkno_ret, u64 *count_ret);
-int scoutfs_free_data(struct super_block *sb, struct scoutfs_alloc *alloc,
-		      struct scoutfs_block_writer *wri,
-		      struct scoutfs_alloc_root *root, u64 blkno, u64 count);
-
-int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
-		       struct scoutfs_block_writer *wri,
-		       struct scoutfs_alloc_root *dst,
-		       struct scoutfs_alloc_root *src, u64 total);
-
-int scoutfs_alloc_fill_list(struct super_block *sb,
-			    struct scoutfs_alloc *alloc,
-			    struct scoutfs_block_writer *wri,
-			    struct scoutfs_alloc_list_head *lhead,
-			    struct scoutfs_alloc_root *root,
-			    u64 lo, u64 target);
-int scoutfs_alloc_empty_list(struct super_block *sb,
-			     struct scoutfs_alloc *alloc,
-			     struct scoutfs_block_writer *wri,
-			     struct scoutfs_alloc_root *root,
-			     struct scoutfs_alloc_list_head *lhead);
-int scoutfs_alloc_splice_list(struct super_block *sb,
-			      struct scoutfs_alloc *alloc,
-			      struct scoutfs_block_writer *wri,
-			      struct scoutfs_alloc_list_head *dst,
-			      struct scoutfs_alloc_list_head *src);
-
-bool scoutfs_alloc_meta_low(struct super_block *sb,
-			    struct scoutfs_alloc *alloc, u32 nr);
-
-typedef int (*scoutfs_alloc_foreach_cb_t)(struct super_block *sb, void *arg,
-					  int owner, u64 id,
-					  bool meta, bool avail, u64 blocks);
-int scoutfs_alloc_foreach(struct super_block *sb,
-			  scoutfs_alloc_foreach_cb_t cb, void *arg);
-
-#endif
@@ -1,405 +0,0 @@
-/*
- * Copyright (C) 2020 Versity Software, Inc.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-#include "format.h"
-#include "avl.h"
-
-/*
- * We use a simple avl to index items in btree blocks.  The interface
- * looks a bit like the kernel rbtree interface in that the caller
- * manages locking and storage for the nodes.  Node references are
- * stored as byte offsets from the root so that the implementation
- * doesn't have to know anything about the caller's container. 
- *
- * We store the full height in each node, rather than just 2 bits for
- * the balance, so that we can use the extra redundancy to verify the
- * integrity of the tree.
- */
-
-static struct scoutfs_avl_node *node_ptr(struct scoutfs_avl_root *root,
-					 __le16 off)
-{
-	return off ? (void *)root + le16_to_cpu(off) : NULL;
-}
-
-static __le16 node_off(struct scoutfs_avl_root *root,
-		       struct scoutfs_avl_node *node)
-{
-	return node ? cpu_to_le16((void *)node - (void *)root) : 0;
-}
-
-static __u8 node_height(struct scoutfs_avl_node *node)
-{
-	return node ? node->height : 0;
-}
-
-struct scoutfs_avl_node *
-scoutfs_avl_search(struct scoutfs_avl_root *root,
-		   scoutfs_avl_compare_t compare, void *arg, int *cmp_ret,
-		   struct scoutfs_avl_node **par,
-		   struct scoutfs_avl_node **next,
-		   struct scoutfs_avl_node **prev)
-{
-	struct scoutfs_avl_node *node = node_ptr(root, root->node);
-	int cmp;
-
-	if (cmp_ret)
-		*cmp_ret = -1;
-	if (par)
-		*par = NULL;
-	if (next)
-		*next = NULL;
-	if (prev)
-		*prev = NULL;
-
-	while (node) {
-		cmp = compare(arg, node);
-		if (par)
-			*par = node;
-		if (cmp_ret)
-			*cmp_ret = cmp;
-		if (cmp < 0) {
-			if (next)
-				*next = node;
-			node = node_ptr(root, node->left);
-		} else if (cmp > 0) {
-			if (prev)
-				*prev = node;
-			node = node_ptr(root, node->right);
-		} else {
-			return node;
-		}
-	}
-
-	return NULL;
-}
-
-struct scoutfs_avl_node *scoutfs_avl_first(struct scoutfs_avl_root *root)
-{
-	struct scoutfs_avl_node *node = node_ptr(root, root->node);
-
-	while (node && node->left)
-		node = node_ptr(root, node->left);
-
-	return node;
-}
-
-struct scoutfs_avl_node *scoutfs_avl_last(struct scoutfs_avl_root *root)
-{
-	struct scoutfs_avl_node *node = node_ptr(root, root->node);
-
-	while (node && node->right)
-		node = node_ptr(root, node->right);
-
-	return node;
-}
-
-struct scoutfs_avl_node *scoutfs_avl_next(struct scoutfs_avl_root *root,
-					  struct scoutfs_avl_node *node)
-{
-	struct scoutfs_avl_node *parent;
-
-	if (node->right) {
-		node = node_ptr(root, node->right);
-		while (node->left)
-			node = node_ptr(root, node->left);
-		return node;
-	}
-
-	while ((parent = node_ptr(root, node->parent)) &&
-	       node == node_ptr(root, parent->right))
-		node = parent;
-
-	return parent;
-}
-
-struct scoutfs_avl_node *scoutfs_avl_prev(struct scoutfs_avl_root *root,
-					  struct scoutfs_avl_node *node)
-{
-	struct scoutfs_avl_node *parent;
-
-	if (node->left) {
-		node = node_ptr(root, node->left);
-		while (node->right)
-			node = node_ptr(root, node->right);
-		return node;
-	}
-
-	while ((parent = node_ptr(root, node->parent)) &&
-	       node == node_ptr(root, parent->left))
-		node = parent;
-
-	return parent;
-}
-
-static void set_parent_left_right(struct scoutfs_avl_root *root,
-				  struct scoutfs_avl_node *parent,
-				  struct scoutfs_avl_node *old,
-				  struct scoutfs_avl_node *new)
-{
-	__le16 *off;
-
-	if (parent == NULL)
-		off = &root->node;
-	else if (parent->left == node_off(root, old))
-		off = &parent->left;
-	else
-		off = &parent->right;
-
-	*off = node_off(root, new);
-}
-
-static void set_height(struct scoutfs_avl_root *root,
-		       struct scoutfs_avl_node *node)
-{
-	struct scoutfs_avl_node *left = node_ptr(root, node->left);
-	struct scoutfs_avl_node *right = node_ptr(root, node->right);
-
-	node->height = 1 + max(node_height(left), node_height(right));
-}
-
-static int node_balance(struct scoutfs_avl_root *root,
-		        struct scoutfs_avl_node *node)
-{
-	if (node == NULL)
-		return 0;
-
-	return (int)node_height(node_ptr(root, node->right)) -
-	       (int)node_height(node_ptr(root, node->left));
-}
-
-/*
- *     d                         b
- *    / \    rotate right ->    / \
- *   b   e                     a   d
- *  / \      <- rotate left       / \
- * a   c                         c   e
- *
- * The rotate functions are always called with the higher node as the
- * earlier argument.  Links to a and e are constant.  We have to update
- * the forward and back refs between parents and nodes for the three links
- * along root->[db]->[bd]->c.
- */
-static void rotate_right(struct scoutfs_avl_root *root,
-			 struct scoutfs_avl_node *d)
-{
-	struct scoutfs_avl_node *gpa = node_ptr(root, d->parent);
-	struct scoutfs_avl_node *b = node_ptr(root, d->left);
-	struct scoutfs_avl_node *c = node_ptr(root, b->right);
-
-	set_parent_left_right(root, gpa, d, b);
-	b->parent = node_off(root, gpa);
-
-	b->right = node_off(root, d);
-	d->parent = node_off(root, b);
-
-	d->left = node_off(root, c);
-	if (c)
-		c->parent = node_off(root, d);
-
-	set_height(root, d);
-	set_height(root, b);
-}
-
-static void rotate_left(struct scoutfs_avl_root *root,
-			struct scoutfs_avl_node *b)
-{
-	struct scoutfs_avl_node *gpa = node_ptr(root, b->parent);
-	struct scoutfs_avl_node *d = node_ptr(root, b->right);
-	struct scoutfs_avl_node *c = node_ptr(root, d->left);
-
-	set_parent_left_right(root, gpa, b, d);
-	d->parent = node_off(root, gpa);
-
-	d->left = node_off(root, b);
-	b->parent = node_off(root, d);
-
-	b->right = node_off(root, c);
-	if (c)
-		c->parent = node_off(root, b);
-
-	set_height(root, b);
-	set_height(root, d);
-}
-
-/*
- * Check the balance factor for the given node and perform rotations if
- * its two child subtrees are too far out of balance.  Return either the
- * node again or the root of the newly balanced subtree.
- */
-static struct scoutfs_avl_node *
-rotate_imbalance(struct scoutfs_avl_root *root, struct scoutfs_avl_node *node)
-{
-	int bal = node_balance(root, node);
-	struct scoutfs_avl_node *child;
-
-	if (bal >= -1 && bal <= 1)
-		return node;
-
-	if (bal > 0) {
-		/* turn right-left case into right-right */
-		child = node_ptr(root, node->right);
-		if (node_balance(root, child) < 0)
-			rotate_right(root, child);
-		/* rotate left to address right-right */
-		rotate_left(root, node);
-
-	} else {
-		/* or do the mirror for the left- cases */
-		child = node_ptr(root, node->left);
-		if (node_balance(root, child) > 0)
-			rotate_left(root, child);
-		rotate_right(root, node);
-	}
-
-	return node_ptr(root, node->parent);
-}
-
-void scoutfs_avl_insert(struct scoutfs_avl_root *root,
-			struct scoutfs_avl_node *parent,
-			struct scoutfs_avl_node *node, int cmp)
-{
-	node->parent = 0;
-	node->left = 0;
-	node->right = 0;
-	set_height(root, node);
-	memset(node->__pad, 0, sizeof(node->__pad));
-
-	if (parent == NULL) {
-		root->node = node_off(root, node);
-		node->parent = 0;
-		return;
-	}
-
-	if (cmp < 0)
-		parent->left = node_off(root, node);
-	else
-		parent->right = node_off(root, node);
-	node->parent = node_off(root, parent);
-
-	while (parent) {
-		set_height(root, parent);
-		parent = rotate_imbalance(root, parent);
-		parent = node_ptr(root, parent->parent);
-	}
-}
-
-static struct scoutfs_avl_node *avl_successor(struct scoutfs_avl_root *root,
-					      struct scoutfs_avl_node *node)
-{
-	node = node_ptr(root, node->right);
-	while (node->left)
-		node = node_ptr(root, node->left);
-
-	return node;
-}
-
-/*
- * Find a node's successor and then swap the positions of the two
- * nodes with each other in the tree.  This is only tricky because the
- * successor can be a direct child of the node and if we weren't careful
- * we'd be modifying each of the nodes through the pointers between
- * them.
- */
-static void swap_with_successor(struct scoutfs_avl_root *root,
-				struct scoutfs_avl_node *node)
-{
-	struct scoutfs_avl_node *succ = avl_successor(root, node);
-	struct scoutfs_avl_node *succ_par = node_ptr(root, succ->parent);
-	struct scoutfs_avl_node *succ_right = node_ptr(root, succ->right);
-	struct scoutfs_avl_node *parent;
-	struct scoutfs_avl_node *left;
-	struct scoutfs_avl_node *right;
-
-	/* Link old node's parent and left child with the successor */
-	succ->parent = node->parent;
-	parent = node_ptr(root, succ->parent);
-	set_parent_left_right(root, parent, node, succ);
-	succ->left = node->left;
-	left = node_ptr(root, succ->left);
-	if (left)
-		left->parent = node_off(root, succ);
-
-	/*
-	 * Link the old node's right with successor and the old
-	 * successor's parent with the node, they could have pointed to
-	 * each other.
-	 */
-	if (succ_par == node) {
-		succ->right = node_off(root, node);
-		node->parent = node_off(root, succ);
-	} else {
-		succ->right = node->right;
-		right = node_ptr(root, succ->right);
-		if (right)
-			right->parent = node_off(root, succ);
-		set_parent_left_right(root, succ_par, succ, node);
-		node->parent = node_off(root, succ_par);
-	}
-
-	/* Link the old successor's right with the node, it can't have left */
-	node->right = node_off(root, succ_right);
-	if (succ_right)
-		succ_right->parent = node_off(root, node);
-	node->left = 0;
-
-	swap(node->height, succ->height);
-}
-
-void scoutfs_avl_delete(struct scoutfs_avl_root *root,
-			struct scoutfs_avl_node *node)
-{
-	struct scoutfs_avl_node *parent;
-	struct scoutfs_avl_node *child;
-
-	if (node->left && node->right)
-		swap_with_successor(root, node);
-
-	parent = node_ptr(root, node->parent);
-	child = node_ptr(root, node->left ?: node->right);
-
-	set_parent_left_right(root, parent, node, child);
-	if (child)
-		child->parent = node->parent;
-
-	while (parent) {
-		set_height(root, parent);
-		parent = rotate_imbalance(root, parent);
-		parent = node_ptr(root, parent->parent);
-	}
-}
-
-/*
- * Move the contents of a node to a new node location in memory.  The
- * logical position of the node in the tree does not change.
- */
-void scoutfs_avl_relocate(struct scoutfs_avl_root *root,
-			  struct scoutfs_avl_node *to,
-			  struct scoutfs_avl_node *from)
-{
-	struct scoutfs_avl_node *parent = node_ptr(root, from->parent);
-	struct scoutfs_avl_node *left = node_ptr(root, from->left);
-	struct scoutfs_avl_node *right = node_ptr(root, from->right);
-
-	set_parent_left_right(root, parent, from, to);
-	to->parent = from->parent;
-	to->left = from->left;
-	if (left)
-		left->parent = node_off(root, to);
-	to->right = from->right;
-	if (right)
-		right->parent = node_off(root, to);
-	to->height = from->height;
-}
@@ -1,30 +0,0 @@
-#ifndef _SCOUTFS_AVL_H_
-#define _SCOUTFS_AVL_H_
-
-#include "format.h"
-
-typedef int (*scoutfs_avl_compare_t)(void *arg,
-				       struct scoutfs_avl_node *node);
-
-struct scoutfs_avl_node *
-scoutfs_avl_search(struct scoutfs_avl_root *root,
-		   scoutfs_avl_compare_t compare, void *arg, int *cmp_ret,
-		   struct scoutfs_avl_node **par,
-		   struct scoutfs_avl_node **next,
-		   struct scoutfs_avl_node **prev);
-struct scoutfs_avl_node *scoutfs_avl_first(struct scoutfs_avl_root *root);
-struct scoutfs_avl_node *scoutfs_avl_last(struct scoutfs_avl_root *root);
-struct scoutfs_avl_node *scoutfs_avl_next(struct scoutfs_avl_root *root,
-					  struct scoutfs_avl_node *node);
-struct scoutfs_avl_node *scoutfs_avl_prev(struct scoutfs_avl_root *root,
-					  struct scoutfs_avl_node *node);
-void scoutfs_avl_insert(struct scoutfs_avl_root *root,
-			  struct scoutfs_avl_node *parent,
-			  struct scoutfs_avl_node *node, int cmp);
-void scoutfs_avl_delete(struct scoutfs_avl_root *root,
-			  struct scoutfs_avl_node *node);
-void scoutfs_avl_relocate(struct scoutfs_avl_root *root,
-			    struct scoutfs_avl_node *to,
-			    struct scoutfs_avl_node *from);
-
-#endif
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/rbtree.h>

 #include "format.h"
 #include "super.h"
@@ -45,7 +46,7 @@
 struct block_info {
 	struct super_block *sb;
 	spinlock_t lock;
-	struct radix_tree_root radix;
+	struct rb_root root;
 	struct list_head lru_list;
 	u64 lru_nr;
 	u64 lru_move_counter;
@@ -58,20 +59,22 @@ struct block_info {
 #define DECLARE_BLOCK_INFO(sb, name) \
 	struct block_info *name = SCOUTFS_SB(sb)->block_info

-enum block_status_bits {
+enum {
 	BLOCK_BIT_UPTODATE = 0,	/* contents consistent with media */
 	BLOCK_BIT_NEW,		/* newly allocated, contents undefined */
 	BLOCK_BIT_DIRTY,	/* dirty, writer will write */
 	BLOCK_BIT_IO_BUSY,	/* bios are in flight */
 	BLOCK_BIT_ERROR,	/* saw IO error */
-	BLOCK_BIT_DELETED,	/* has been deleted from radix tree */
+	BLOCK_BIT_DELETED,	/* has been deleted from rbtree */
 	BLOCK_BIT_PAGE_ALLOC,	/* page (possibly high order) allocation */
 	BLOCK_BIT_VIRT,		/* mapped virt allocation */
 	BLOCK_BIT_CRC_VALID,	/* crc has been verified */
+	BLOCK_BIT_VISITED,	/* used by callers to track blocks */
 };

 struct block_private {
 	struct scoutfs_block bl;
+	struct rb_node node;
 	struct super_block *sb;
 	atomic_t refcount;
 	union {
@@ -105,18 +108,18 @@ do {									\
 * be refactored away.
 */

-__le32 scoutfs_block_calc_crc(struct scoutfs_block_header *hdr, u32 size)
+__le32 scoutfs_block_calc_crc(struct scoutfs_block_header *hdr)
 {
 	int off = offsetof(struct scoutfs_block_header, crc) +
 		  FIELD_SIZEOF(struct scoutfs_block_header, crc);
-	u32 calc = crc32c(~0, (char *)hdr + off, size - off);
+	u32 calc = crc32c(~0, (char *)hdr + off, SCOUTFS_BLOCK_SIZE - off);

 	return cpu_to_le32(calc);
 }

-bool scoutfs_block_valid_crc(struct scoutfs_block_header *hdr, u32 size)
+bool scoutfs_block_valid_crc(struct scoutfs_block_header *hdr)
 {
-	return hdr->crc == scoutfs_block_calc_crc(hdr, size);
+	return hdr->crc == scoutfs_block_calc_crc(hdr);
 }

 bool scoutfs_block_valid_ref(struct super_block *sb,
@@ -129,6 +132,22 @@ bool scoutfs_block_valid_ref(struct super_block *sb,
 	       hdr->blkno == blkno;
 }

+/*
+ * Atomically test and set the block's visited bit.  Returns true if
+ * the bit was already set when we were called and false if this call
+ * is the one that set it.
+ *
+ * This must set the bit as well as test it.  The companion
+ * scoutfs_block_clear_visited() only clears the bit, so a plain
+ * test_bit() here would leave the bit permanently clear and every
+ * block would always be reported as not yet visited.
+ */
+bool scoutfs_block_tas_visited(struct super_block *sb,
+			       struct scoutfs_block *bl)
+{
+	struct block_private *bp = BLOCK_PRIVATE(bl);
+
+	return test_and_set_bit(BLOCK_BIT_VISITED, &bp->bits) != 0;
+}
+
+/*
+ * Clear the block's visited bit so that the next
+ * scoutfs_block_tas_visited() call on this block reports it as not
+ * yet visited.
+ */
+void scoutfs_block_clear_visited(struct super_block *sb,
+				 struct scoutfs_block *bl)
+{
+	struct block_private *bp = BLOCK_PRIVATE(bl);
+
+	clear_bit(BLOCK_BIT_VISITED, &bp->bits);
+}
+
 static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
 {
 	struct block_private *bp;
@@ -138,20 +157,19 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
 	 * more careful with a partial page allocator when allocating
 	 * blocks and would make the lru per-page instead of per-block.
 	 */
-	BUILD_BUG_ON(PAGE_SIZE > SCOUTFS_BLOCK_LG_SIZE);
+	BUILD_BUG_ON(PAGE_SIZE > SCOUTFS_BLOCK_SIZE);

 	bp = kzalloc(sizeof(struct block_private), GFP_NOFS);
 	if (!bp)
 		goto out;

-	bp->page = alloc_pages(GFP_NOFS | __GFP_NOWARN,
-			       SCOUTFS_BLOCK_LG_PAGE_ORDER);
+	bp->page = alloc_pages(GFP_NOFS, SCOUTFS_BLOCK_PAGE_ORDER);
 	if (bp->page) {
 		scoutfs_inc_counter(sb, block_cache_alloc_page_order);
 		set_bit(BLOCK_BIT_PAGE_ALLOC, &bp->bits);
 		bp->bl.data = page_address(bp->page);
 	} else {
-		bp->virt = __vmalloc(SCOUTFS_BLOCK_LG_SIZE,
+		bp->virt = __vmalloc(SCOUTFS_BLOCK_SIZE,
 				     GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL);
 		if (!bp->virt) {
 			kfree(bp);
@@ -165,6 +183,7 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
 	}

 	bp->bl.blkno = blkno;
+	RB_CLEAR_NODE(&bp->node);
 	bp->sb = sb;
 	atomic_set(&bp->refcount, 1);
 	INIT_LIST_HEAD(&bp->lru_entry);
@@ -187,7 +206,7 @@ static void block_free(struct super_block *sb, struct block_private *bp)
 	TRACE_BLOCK(free, bp);

 	if (test_bit(BLOCK_BIT_PAGE_ALLOC, &bp->bits))
-		__free_pages(bp->page, SCOUTFS_BLOCK_LG_PAGE_ORDER);
+		__free_pages(bp->page, SCOUTFS_BLOCK_PAGE_ORDER);
 	else if (test_bit(BLOCK_BIT_VIRT, &bp->bits))
 		vfree(bp->virt);
 	else
@@ -234,9 +253,39 @@ static void block_put(struct super_block *sb, struct block_private *bp)
 	}
 }

+/*
+ * Walk the rbtree of cached blocks looking for the block at blkno.
+ *
+ * If a block with a matching blkno is found it is returned, even when
+ * ins is given; ins is not inserted in that case.  If no match is
+ * found and ins is non-NULL then ins is linked into the tree at the
+ * position where the walk ended and ins is returned.  Otherwise NULL
+ * is returned.
+ *
+ * No locking or reference counting is done here, that's left to the
+ * callers.
+ */
+static struct block_private *walk_block_rbtree(struct rb_root *root,
+					       u64 blkno,
+					       struct block_private *ins)
+{
+	struct rb_node **node = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct block_private *bp;
+	int cmp;
+
+	while (*node) {
+		/* remember the last node visited, it's the insertion parent */
+		parent = *node;
+		bp = container_of(*node, struct block_private, node);
+
+		/*
+		 * The direction chosen for each cmp sign only has to be
+		 * consistent: lookups and inserts both use this walk so
+		 * they always agree on where a blkno lives.
+		 */
+		cmp = scoutfs_cmp_u64s(bp->bl.blkno, blkno);
+		if (cmp == 0)
+			return bp;
+		else if (cmp < 0)
+			node = &(*node)->rb_left;
+		else
+			node = &(*node)->rb_right;
+	}
+
+	/* node now points at the empty child slot where blkno belongs */
+	if (ins) {
+		rb_link_node(&ins->node, parent, node);
+		rb_insert_color(&ins->node, root);
+		return ins;
+	}
+
+	return NULL;
+}
+
 /*
- * Add a new block into the cache.  The caller holds the lock and has
- * preloaded the radix.
+ * Add a new block into the cache.  The caller holds the lock.
 */
 static void block_insert(struct super_block *sb, struct block_private *bp,
 			 u64 blkno)
@@ -245,9 +294,10 @@ static void block_insert(struct super_block *sb, struct block_private *bp,

 	assert_spin_locked(&binf->lock);
 	BUG_ON(!list_empty(&bp->lru_entry));
+	BUG_ON(!RB_EMPTY_NODE(&bp->node));

 	atomic_inc(&bp->refcount);
-	radix_tree_insert(&binf->radix, blkno, bp);
+	walk_block_rbtree(&binf->root, blkno, bp);
 	list_add_tail(&bp->lru_entry, &binf->lru_list);
 	bp->lru_moved = ++binf->lru_move_counter;
 	binf->lru_nr++;
@@ -295,11 +345,10 @@ static void block_remove(struct super_block *sb, struct block_private *bp)
 {
 	DECLARE_BLOCK_INFO(sb, binf);

-	assert_spin_locked(&binf->lock);
-
 	if (!test_and_set_bit(BLOCK_BIT_DELETED, &bp->bits)) {
 		BUG_ON(list_empty(&bp->lru_entry));
-		radix_tree_delete(&binf->radix, bp->bl.blkno);
+		rb_erase(&bp->node, &binf->root);
+		RB_CLEAR_NODE(&bp->node);
 		list_del_init(&bp->lru_entry);
 		binf->lru_nr--;
 		block_put(sb, bp);
@@ -319,19 +368,18 @@ static void block_remove_all(struct super_block *sb)
 {
 	DECLARE_BLOCK_INFO(sb, binf);
 	struct block_private *bp;
+	struct rb_node *node;

-	spin_lock(&binf->lock);
-
-	while (radix_tree_gang_lookup(&binf->radix, (void **)&bp, 0, 1) == 1) {
+	for (node = rb_first(&binf->root); node; ) {
+		bp = container_of(node, struct block_private, node);
+		node = rb_next(node);
 		wait_event(binf->waitq, !io_busy(bp));
 		block_remove(sb, bp);
 	}

-	spin_unlock(&binf->lock);
-
 	WARN_ON_ONCE(!list_empty(&binf->lru_list));
 	WARN_ON_ONCE(binf->lru_nr != 0);
-	WARN_ON_ONCE(binf->radix.rnode != NULL);
+	WARN_ON_ONCE(!RB_EMPTY_ROOT(&binf->root));
 }

 /*
@@ -386,7 +434,6 @@ static void block_bio_end_io(struct bio *bio, int err)
 static int block_submit_bio(struct super_block *sb, struct block_private *bp,
 			    int rw)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct bio *bio = NULL;
 	struct blk_plug plug;
 	struct page *page;
@@ -394,7 +441,7 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
 	sector_t sector;
 	int ret = 0;

-	sector = bp->bl.blkno << (SCOUTFS_BLOCK_LG_SHIFT - 9);
+	sector = bp->bl.blkno << (SCOUTFS_BLOCK_SHIFT - 9);

 	WARN_ON_ONCE(bp->bl.blkno == U64_MAX);
 	WARN_ON_ONCE(sector == U64_MAX || sector == 0);
@@ -406,16 +453,16 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,

 	blk_start_plug(&plug);

-	for (off = 0; off < SCOUTFS_BLOCK_LG_SIZE; off += PAGE_SIZE) {
+	for (off = 0; off < SCOUTFS_BLOCK_SIZE; off += PAGE_SIZE) {
 		if (!bio) {
-			bio = bio_alloc(GFP_NOFS, SCOUTFS_BLOCK_LG_PAGES_PER);
+			bio = bio_alloc(GFP_NOFS, SCOUTFS_PAGES_PER_BLOCK);
 			if (!bio) {
 				ret = -ENOMEM;
 				break;
 			}

 			bio->bi_sector = sector + (off >> 9);
-			bio->bi_bdev = sbi->meta_bdev;
+			bio->bi_bdev = sb->s_bdev;
 			bio->bi_end_io = block_bio_end_io;
 			bio->bi_private = bp;

@@ -450,8 +497,8 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,

 /*
 * Return a reference to a cached block in the system, allocating a new
- * block if one isn't found in the radix.  Its contents are undefined if
- * it's newly allocated.
+ * block if one isn't found in the rbtree.  Its contents are undefined
+ * if it's newly allocated.
 */
 static struct block_private *block_get(struct super_block *sb, u64 blkno)
 {
@@ -460,11 +507,11 @@ static struct block_private *block_get(struct super_block *sb, u64 blkno)
 	struct block_private *bp;
 	int ret;

-	rcu_read_lock();
-	bp = radix_tree_lookup(&binf->radix, blkno);
+	spin_lock(&binf->lock);
+	bp = walk_block_rbtree(&binf->root, blkno, NULL);
 	if (bp)
 		atomic_inc(&bp->refcount);
-	rcu_read_unlock();
+	spin_unlock(&binf->lock);

 	/* drop failed reads that interrupted waiters abandoned */
 	if (bp && (test_bit(BLOCK_BIT_ERROR, &bp->bits) &&
@@ -483,20 +530,15 @@ static struct block_private *block_get(struct super_block *sb, u64 blkno)
 			goto out;
 		}

-		ret = radix_tree_preload(GFP_NOFS);
-		if (ret)
-			goto out;
-
-		/* could use slot instead of lookup/insert */
+		/* could refactor to insert in one walk */
 		spin_lock(&binf->lock);
-		found = radix_tree_lookup(&binf->radix, blkno);
+		found = walk_block_rbtree(&binf->root, blkno, NULL);
 		if (found) {
 			atomic_inc(&found->refcount);
 		} else {
 			block_insert(sb, bp, blkno);
 		}
 		spin_unlock(&binf->lock);
-		radix_tree_preload_end();

 		if (found) {
 			block_put(sb, bp);
@@ -592,7 +634,6 @@ void scoutfs_block_invalidate(struct super_block *sb, struct scoutfs_block *bl)
 	}
 }

-/* This is only used for large metadata blocks */
 bool scoutfs_block_consistent_ref(struct super_block *sb,
 				  struct scoutfs_block *bl,
 				  __le64 seq, __le64 blkno, u32 magic)
@@ -602,8 +643,7 @@ bool scoutfs_block_consistent_ref(struct super_block *sb,
 	struct scoutfs_block_header *hdr = bl->data;

 	if (!test_bit(BLOCK_BIT_CRC_VALID, &bp->bits)) {
-		if (hdr->crc !=
-		    scoutfs_block_calc_crc(hdr, SCOUTFS_BLOCK_LG_SIZE))
+		if (hdr->crc != scoutfs_block_calc_crc(hdr))
 			return false;
 		set_bit(BLOCK_BIT_CRC_VALID, &bp->bits);
 	}
@@ -682,7 +722,7 @@ int scoutfs_block_writer_write(struct super_block *sb,
 	/* checksum everything to reduce time between io submission merging */
 	list_for_each_entry(bp, &wri->dirty_list, dirty_entry) {
 		hdr = bp->bl.data;
-		hdr->crc = scoutfs_block_calc_crc(hdr, SCOUTFS_BLOCK_LG_SIZE);
+		hdr->crc = scoutfs_block_calc_crc(hdr);
 	}

        blk_start_plug(&plug);
@@ -770,6 +810,44 @@ void scoutfs_block_writer_forget(struct super_block *sb,
 	}
 }

+/*
+ * Change a cached block's location.  We're careful to only change its
+ * position in the rbtree.  If another block is already cached at the
+ * new location then that block, not the one being moved, is forgotten
+ * if it was dirty and removed from the cache.
+ */
+void scoutfs_block_move(struct super_block *sb,
+			struct scoutfs_block_writer *wri,
+			struct scoutfs_block *bl, u64 blkno)
+{
+	DECLARE_BLOCK_INFO(sb, binf);
+	struct block_private *bp = BLOCK_PRIVATE(bl);
+	struct block_private *existing = NULL;
+
+	spin_lock(&binf->lock);
+
+	existing = walk_block_rbtree(&binf->root, blkno, NULL);
+	if (existing) {
+		/* only nesting of binf and wri locks, recheck under wri */
+		if (test_bit(BLOCK_BIT_DIRTY, &existing->bits)) {
+			spin_lock(&wri->lock);
+			if (test_bit(BLOCK_BIT_DIRTY, &existing->bits))
+				block_forget(sb, wri, existing);
+			spin_unlock(&wri->lock);
+		}
+		block_remove(sb, existing);
+	}
+
+	rb_erase(&bp->node, &binf->root);
+	RB_CLEAR_NODE(&bp->node);
+	bp->bl.blkno = blkno;
+	walk_block_rbtree(&binf->root, blkno, bp);
+
+	TRACE_BLOCK(move, bp);
+
+	spin_unlock(&binf->lock);
+}
+
 /*
 * The caller has ensured that no more dirtying will take place.  This
 * helps the caller avoid doing a bunch of work before calling into the
@@ -788,7 +866,7 @@ bool scoutfs_block_writer_has_dirty(struct super_block *sb,
 u64 scoutfs_block_writer_dirty_bytes(struct super_block *sb,
 				     struct scoutfs_block_writer *wri)
 {
-	return wri->nr_dirty_blocks * SCOUTFS_BLOCK_LG_SIZE;
+	return wri->nr_dirty_blocks * SCOUTFS_BLOCK_SIZE;
 }

 /*
@@ -838,9 +916,12 @@ static int block_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	spin_unlock(&binf->lock);

 out:
-	return min_t(u64, binf->lru_nr * SCOUTFS_BLOCK_LG_PAGES_PER, INT_MAX);
+	return min_t(u64, binf->lru_nr * SCOUTFS_PAGES_PER_BLOCK, INT_MAX);
 }

+#define SCOUTFS_SM_BLOCK_SHIFT	12
+#define SCOUTFS_SM_BLOCK_SIZE	(1 << SCOUTFS_SM_BLOCK_SHIFT)
+
 struct sm_block_completion {
 	struct completion comp;
 	int err;
@@ -865,7 +946,7 @@ static void sm_block_bio_end_io(struct bio *bio, int err)
 * only layer that sees the full block buffer so we pass the calculated
 * crc to the caller for them to check in their context.
 */
-static int sm_block_io(struct block_device *bdev, int rw, u64 blkno,
+static int sm_block_io(struct super_block *sb, int rw, u64 blkno,
 		       struct scoutfs_block_header *hdr, size_t len,
 		       __le32 *blk_crc)
 {
@@ -875,9 +956,11 @@ static int sm_block_io(struct block_device *bdev, int rw, u64 blkno,
 	struct bio *bio;
 	int ret;

-	BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SM_SIZE);
+	BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_SM_BLOCK_SIZE);
+	/* block calc crc is assuming block size, they'll be different later */
+	BUILD_BUG_ON(SCOUTFS_SM_BLOCK_SIZE != SCOUTFS_BLOCK_SIZE);

-	if (WARN_ON_ONCE(len > SCOUTFS_BLOCK_SM_SIZE) ||
+	if (WARN_ON_ONCE(len > SCOUTFS_SM_BLOCK_SIZE) ||
 	    WARN_ON_ONCE(!(rw & WRITE) && !blk_crc))
 		return -EINVAL;

@@ -889,11 +972,10 @@ static int sm_block_io(struct block_device *bdev, int rw, u64 blkno,

 	if (rw & WRITE) {
 		memcpy(pg_hdr, hdr, len);
-		if (len < SCOUTFS_BLOCK_SM_SIZE)
+		if (len < SCOUTFS_SM_BLOCK_SIZE)
 			memset((char *)pg_hdr + len, 0,
-			       SCOUTFS_BLOCK_SM_SIZE - len);
-		pg_hdr->crc = scoutfs_block_calc_crc(pg_hdr,
-						     SCOUTFS_BLOCK_SM_SIZE);
+			       SCOUTFS_SM_BLOCK_SIZE - len);
+		pg_hdr->crc = scoutfs_block_calc_crc(pg_hdr);
 	}

 	bio = bio_alloc(GFP_NOFS, 1);
@@ -902,11 +984,11 @@ static int sm_block_io(struct block_device *bdev, int rw, u64 blkno,
 		goto out;
 	}

-	bio->bi_sector = blkno << (SCOUTFS_BLOCK_SM_SHIFT - 9);
-	bio->bi_bdev = bdev;
+	bio->bi_sector = blkno << (SCOUTFS_SM_BLOCK_SHIFT - 9);
+	bio->bi_bdev = sb->s_bdev;
 	bio->bi_end_io = sm_block_bio_end_io;
 	bio->bi_private = &sbc;
-	bio_add_page(bio, page, SCOUTFS_BLOCK_SM_SIZE, 0);
+	bio_add_page(bio, page, SCOUTFS_SM_BLOCK_SIZE, 0);

 	init_completion(&sbc.comp);
 	sbc.err = 0;
@@ -918,44 +1000,32 @@ static int sm_block_io(struct block_device *bdev, int rw, u64 blkno,

 	if (ret == 0 && !(rw & WRITE)) {
 		memcpy(hdr, pg_hdr, len);
-		*blk_crc = scoutfs_block_calc_crc(pg_hdr,
-						  SCOUTFS_BLOCK_SM_SIZE);
+		*blk_crc = scoutfs_block_calc_crc(pg_hdr);
 	}
 out:
 	__free_page(page);
 	return ret;
 }

-int scoutfs_block_read_sm(struct super_block *sb,
-			  struct block_device *bdev, u64 blkno,
+int scoutfs_block_read_sm(struct super_block *sb, u64 blkno,
 			  struct scoutfs_block_header *hdr, size_t len,
 			  __le32 *blk_crc)
 {
-	return sm_block_io(bdev, READ, blkno, hdr, len, blk_crc);
+	return sm_block_io(sb, READ, blkno, hdr, len, blk_crc);
 }

-int scoutfs_block_write_sm(struct super_block *sb,
-			   struct block_device *bdev, u64 blkno,
+int scoutfs_block_write_sm(struct super_block *sb, u64 blkno,
 			   struct scoutfs_block_header *hdr, size_t len)
 {
-	return sm_block_io(bdev, WRITE, blkno, hdr, len, NULL);
+	return sm_block_io(sb, WRITE, blkno, hdr, len, NULL);
 }

 int scoutfs_block_setup(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct block_info *binf;
-	loff_t size;
 	int ret;

-	/* we store blknos in longs in the radix */
-	size = i_size_read(sb->s_bdev->bd_inode);
-	if ((size >> SCOUTFS_BLOCK_LG_SHIFT) >= LONG_MAX) {
-		scoutfs_err(sb, "Cant reference all blocks in %llu byte device with %u bit long radix tree indexes",
-			size, BITS_PER_LONG);
-		return -EINVAL;
-	}
-
 	binf = kzalloc(sizeof(struct block_info), GFP_KERNEL);
 	if (!binf) {
 		ret = -ENOMEM;
@@ -964,7 +1034,7 @@ int scoutfs_block_setup(struct super_block *sb)

 	binf->sb = sb;
 	spin_lock_init(&binf->lock);
-	INIT_RADIX_TREE(&binf->radix, GFP_ATOMIC); /* insertion preloads */
+	binf->root = RB_ROOT;
 	INIT_LIST_HEAD(&binf->lru_list);
 	init_waitqueue_head(&binf->waitq);
 	binf->shrinker.shrink = block_shrink;
@@ -10,14 +10,17 @@ struct scoutfs_block_writer {
 struct scoutfs_block {
 	u64 blkno;
 	void *data;
-	void *priv;
 };

-__le32 scoutfs_block_calc_crc(struct scoutfs_block_header *hdr, u32 size);
-bool scoutfs_block_valid_crc(struct scoutfs_block_header *hdr, u32 size);
+__le32 scoutfs_block_calc_crc(struct scoutfs_block_header *hdr);
+bool scoutfs_block_valid_crc(struct scoutfs_block_header *hdr);
 bool scoutfs_block_valid_ref(struct super_block *sb,
 			     struct scoutfs_block_header *hdr,
 			     __le64 seq, __le64 blkno);
+bool scoutfs_block_tas_visited(struct super_block *sb,
+			       struct scoutfs_block *bl);
+void scoutfs_block_clear_visited(struct super_block *sb,
+				 struct scoutfs_block *bl);

 struct scoutfs_block *scoutfs_block_create(struct super_block *sb, u64 blkno);
 struct scoutfs_block *scoutfs_block_read(struct super_block *sb, u64 blkno);
@@ -41,17 +44,18 @@ void scoutfs_block_writer_forget_all(struct super_block *sb,
 void scoutfs_block_writer_forget(struct super_block *sb,
 			         struct scoutfs_block_writer *wri,
 				 struct scoutfs_block *bl);
+void scoutfs_block_move(struct super_block *sb,
+			struct scoutfs_block_writer *wri,
+			struct scoutfs_block *bl, u64 blkno);
 bool scoutfs_block_writer_has_dirty(struct super_block *sb,
 				    struct scoutfs_block_writer *wri);
 u64 scoutfs_block_writer_dirty_bytes(struct super_block *sb,
 				     struct scoutfs_block_writer *wri);

-int scoutfs_block_read_sm(struct super_block *sb,
-			  struct block_device *bdev, u64 blkno,
+int scoutfs_block_read_sm(struct super_block *sb, u64 blkno,
 			  struct scoutfs_block_header *hdr, size_t len,
 			  __le32 *blk_crc);
-int scoutfs_block_write_sm(struct super_block *sb,
-			   struct block_device *bdev, u64 blkno,
+int scoutfs_block_write_sm(struct super_block *sb, u64 blkno,
 			   struct scoutfs_block_header *hdr, size_t len);

 int scoutfs_block_setup(struct super_block *sb);
@@ -3,14 +3,15 @@

 #include <linux/uio.h>

-struct scoutfs_alloc;
+struct scoutfs_radix_allocator;
 struct scoutfs_block_writer;
 struct scoutfs_block;

 struct scoutfs_btree_item_ref {
 	struct super_block *sb;
 	struct scoutfs_block *bl;
-	struct scoutfs_key *key;
+	void *key;
+	unsigned key_len;
 	void *val;
 	unsigned val_len;
 };
@@ -18,69 +19,50 @@ struct scoutfs_btree_item_ref {
 #define SCOUTFS_BTREE_ITEM_REF(name) \
 	struct scoutfs_btree_item_ref name = {NULL,}

-/* caller gives an item to the callback */
-typedef int (*scoutfs_btree_item_cb)(struct super_block *sb,
-				     struct scoutfs_key *key,
-				     void *val, int val_len, void *arg);

-/* simple singly-linked list of items */
-struct scoutfs_btree_item_list {
-	struct scoutfs_btree_item_list *next;
-	struct scoutfs_key key;
-	int val_len;
-	u8 val[0];
-};
-
-int scoutfs_btree_lookup(struct super_block *sb,
-			 struct scoutfs_btree_root *root,
-			 struct scoutfs_key *key,
+int scoutfs_btree_lookup(struct super_block *sb, struct scoutfs_btree_root *root,
+			 void *key, unsigned key_len,
 			 struct scoutfs_btree_item_ref *iref);
 int scoutfs_btree_insert(struct super_block *sb,
-			 struct scoutfs_alloc *alloc,
+			 struct scoutfs_radix_allocator *alloc,
 			 struct scoutfs_block_writer *wri,
 			 struct scoutfs_btree_root *root,
-			 struct scoutfs_key *key,
+			 void *key, unsigned key_len,
 			 void *val, unsigned val_len);
 int scoutfs_btree_update(struct super_block *sb,
-			 struct scoutfs_alloc *alloc,
+			 struct scoutfs_radix_allocator *alloc,
 			 struct scoutfs_block_writer *wri,
 			 struct scoutfs_btree_root *root,
-			 struct scoutfs_key *key,
+			 void *key, unsigned key_len,
 			 void *val, unsigned val_len);
 int scoutfs_btree_force(struct super_block *sb,
-			struct scoutfs_alloc *alloc,
+			struct scoutfs_radix_allocator *alloc,
 			struct scoutfs_block_writer *wri,
 			struct scoutfs_btree_root *root,
-			struct scoutfs_key *key,
+			void *key, unsigned key_len,
 			void *val, unsigned val_len);
 int scoutfs_btree_delete(struct super_block *sb,
-			 struct scoutfs_alloc *alloc,
+			 struct scoutfs_radix_allocator *alloc,
 			 struct scoutfs_block_writer *wri,
 			 struct scoutfs_btree_root *root,
-			 struct scoutfs_key *key);
+			 void *key, unsigned key_len);
 int scoutfs_btree_next(struct super_block *sb, struct scoutfs_btree_root *root,
-		       struct scoutfs_key *key,
+		       void *key, unsigned key_len,
 		       struct scoutfs_btree_item_ref *iref);
+int scoutfs_btree_after(struct super_block *sb, struct scoutfs_btree_root *root,
+		        void *key, unsigned key_len,
+		        struct scoutfs_btree_item_ref *iref);
 int scoutfs_btree_prev(struct super_block *sb, struct scoutfs_btree_root *root,
-		       struct scoutfs_key *key,
+		       void *key, unsigned key_len,
 		       struct scoutfs_btree_item_ref *iref);
+int scoutfs_btree_before(struct super_block *sb, struct scoutfs_btree_root *root,
+		         void *key, unsigned key_len,
+		         struct scoutfs_btree_item_ref *iref);
 int scoutfs_btree_dirty(struct super_block *sb,
-			struct scoutfs_alloc *alloc,
+			struct scoutfs_radix_allocator *alloc,
 			struct scoutfs_block_writer *wri,
 			struct scoutfs_btree_root *root,
-			struct scoutfs_key *key);
-
-int scoutfs_btree_read_items(struct super_block *sb,
-			     struct scoutfs_btree_root *root,
-			     struct scoutfs_key *key,
-			     struct scoutfs_key *start,
-			     struct scoutfs_key *end,
-			     scoutfs_btree_item_cb cb, void *arg);
-int scoutfs_btree_insert_list(struct super_block *sb,
-			      struct scoutfs_alloc *alloc,
-			      struct scoutfs_block_writer *wri,
-			      struct scoutfs_btree_root *root,
-			      struct scoutfs_btree_item_list *lst);
+			void *key, unsigned key_len);

 void scoutfs_btree_put_iref(struct scoutfs_btree_item_ref *iref);

@@ -108,16 +108,6 @@ int scoutfs_client_commit_log_trees(struct super_block *sb,
 					lt, sizeof(*lt), NULL, 0);
 }

-int scoutfs_client_get_roots(struct super_block *sb,
-			     struct scoutfs_net_roots *roots)
-{
-	struct client_info *client = SCOUTFS_SB(sb)->client_info;
-
-	return scoutfs_net_sync_request(sb, client->conn,
-					SCOUTFS_NET_CMD_GET_ROOTS,
-					NULL, 0, roots, sizeof(*roots));
-}
-
 int scoutfs_client_advance_seq(struct super_block *sb, u64 *seq)
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
@@ -150,13 +140,24 @@ int scoutfs_client_get_last_seq(struct super_block *sb, u64 *seq)
 	return ret;
 }

+int scoutfs_client_statfs(struct super_block *sb,
+			  struct scoutfs_net_statfs *nstatfs)
+{
+	struct client_info *client = SCOUTFS_SB(sb)->client_info;
+
+	return scoutfs_net_sync_request(sb, client->conn,
+					SCOUTFS_NET_CMD_STATFS, NULL, 0,
+					nstatfs,
+					sizeof(struct scoutfs_net_statfs));
+}
+
 /* process an incoming grant response from the server */
 static int client_lock_response(struct super_block *sb,
 				struct scoutfs_net_connection *conn,
 				void *resp, unsigned int resp_len,
 				int error, void *data)
 {
-	if (resp_len != sizeof(struct scoutfs_net_lock_grant_response))
+	if (resp_len != sizeof(struct scoutfs_net_lock))
 		return -EINVAL;

 	/* XXX error? */
@@ -199,28 +200,6 @@ int scoutfs_client_lock_recover_response(struct super_block *sb, u64 net_id,
 				    net_id, 0, nlr, bytes);
 }

-/* Find srch files that need to be compacted. */
-int scoutfs_client_srch_get_compact(struct super_block *sb,
-				    struct scoutfs_srch_compact *sc)
-{
-	struct client_info *client = SCOUTFS_SB(sb)->client_info;
-
-	return scoutfs_net_sync_request(sb, client->conn,
-					SCOUTFS_NET_CMD_SRCH_GET_COMPACT,
-					NULL, 0, sc, sizeof(*sc));
-}
-
-/* Commit the result of a srch file compaction. */
-int scoutfs_client_srch_commit_compact(struct super_block *sb,
-				       struct scoutfs_srch_compact *res)
-{
-	struct client_info *client = SCOUTFS_SB(sb)->client_info;
-
-	return scoutfs_net_sync_request(sb, client->conn,
-					SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT,
-					res, sizeof(*res), NULL, 0);
-}
-
 /* The client is receiving a invalidation request from the server */
 static int client_lock(struct super_block *sb,
 		       struct scoutfs_net_connection *conn, u8 cmd, u64 id,
@@ -7,21 +7,17 @@ int scoutfs_client_get_log_trees(struct super_block *sb,
 				 struct scoutfs_log_trees *lt);
 int scoutfs_client_commit_log_trees(struct super_block *sb,
 				    struct scoutfs_log_trees *lt);
-int scoutfs_client_get_roots(struct super_block *sb,
-			     struct scoutfs_net_roots *roots);
 u64 *scoutfs_client_bulk_alloc(struct super_block *sb);
 int scoutfs_client_advance_seq(struct super_block *sb, u64 *seq);
 int scoutfs_client_get_last_seq(struct super_block *sb, u64 *seq);
+int scoutfs_client_statfs(struct super_block *sb,
+			  struct scoutfs_net_statfs *nstatfs);
 int scoutfs_client_lock_request(struct super_block *sb,
 				struct scoutfs_net_lock *nl);
 int scoutfs_client_lock_response(struct super_block *sb, u64 net_id,
 				struct scoutfs_net_lock *nl);
 int scoutfs_client_lock_recover_response(struct super_block *sb, u64 net_id,
 					 struct scoutfs_net_lock_recover *nlr);
-int scoutfs_client_srch_get_compact(struct super_block *sb,
-				    struct scoutfs_srch_compact *sc);
-int scoutfs_client_srch_commit_compact(struct super_block *sb,
-				       struct scoutfs_srch_compact *res);

 int scoutfs_client_setup(struct super_block *sb);
 void scoutfs_client_destroy(struct super_block *sb);
@@ -205,12 +205,14 @@ static inline const struct scoutfs_item_count SIC_RENAME(unsigned old_len,
 * item with the header and name.  Any previously existing items are
 * deleted which dirties their key but removes their value.  The two
 * sets of items are indexed by different ids so their items don't
- * overlap.
+ * overlap.  If the xattr name is indexed then we modify one xattr index
+ * item.
 */
 static inline const struct scoutfs_item_count SIC_XATTR_SET(unsigned old_parts,
 							    bool creating,
 							    unsigned name_len,
-							    unsigned size)
+							    unsigned size,
+							    bool indexed)
 {
 	struct scoutfs_item_count cnt = {0,};
 	unsigned int new_parts;
@@ -219,6 +221,8 @@ static inline const struct scoutfs_item_count SIC_XATTR_SET(unsigned old_parts,

 	if (old_parts)
 		cnt.items += old_parts;
+	if (indexed)
+		cnt.items++;

 	if (creating) {
 		new_parts = SCOUTFS_XATTR_NR_PARTS(name_len, size);
@@ -241,9 +245,9 @@ static inline const struct scoutfs_item_count SIC_XATTR_SET(unsigned old_parts,
 static inline const struct scoutfs_item_count SIC_WRITE_BEGIN(void)
 {
 	struct scoutfs_item_count cnt = {0,};
-	unsigned nr_free = (1 + SCOUTFS_BLOCK_SM_PER_PAGE) * 3;
-	unsigned nr_file = (DIV_ROUND_UP(SCOUTFS_BLOCK_SM_PER_PAGE, 2) +
-			    SCOUTFS_BLOCK_SM_PER_PAGE) * 3;
+	unsigned nr_free = (1 + SCOUTFS_BLOCKS_PER_PAGE) * 3;
+	unsigned nr_file = (DIV_ROUND_UP(SCOUTFS_BLOCKS_PER_PAGE, 2) +
+			    SCOUTFS_BLOCKS_PER_PAGE) * 3;

 	__count_dirty_inode(&cnt);

@@ -12,15 +12,6 @@
 * other places by this macro.  Don't forget to update LAST_COUNTER.
 */
 #define EXPAND_EACH_COUNTER					\
-	EXPAND_COUNTER(alloc_alloc_data)			\
-	EXPAND_COUNTER(alloc_alloc_meta)			\
-	EXPAND_COUNTER(alloc_free_data)				\
-	EXPAND_COUNTER(alloc_free_meta)				\
-	EXPAND_COUNTER(alloc_list_avail_lo)			\
-	EXPAND_COUNTER(alloc_list_freed_hi)			\
-	EXPAND_COUNTER(alloc_move)				\
-	EXPAND_COUNTER(alloc_moved_extent)			\
-	EXPAND_COUNTER(alloc_stale_cached_list_block)		\
 	EXPAND_COUNTER(block_cache_access)			\
 	EXPAND_COUNTER(block_cache_alloc_failure)		\
 	EXPAND_COUNTER(block_cache_alloc_page_order)		\
@@ -31,23 +22,8 @@
 	EXPAND_COUNTER(block_cache_invalidate)			\
 	EXPAND_COUNTER(block_cache_lru_move)			\
 	EXPAND_COUNTER(block_cache_shrink)			\
-	EXPAND_COUNTER(btree_compact_values)			\
-	EXPAND_COUNTER(btree_compact_values_enomem)		\
-	EXPAND_COUNTER(btree_delete)				\
-	EXPAND_COUNTER(btree_dirty)				\
-	EXPAND_COUNTER(btree_force)				\
-	EXPAND_COUNTER(btree_join)				\
-	EXPAND_COUNTER(btree_insert)				\
-	EXPAND_COUNTER(btree_leaf_item_hash_search)		\
-	EXPAND_COUNTER(btree_lookup)				\
-	EXPAND_COUNTER(btree_next)				\
-	EXPAND_COUNTER(btree_prev)				\
 	EXPAND_COUNTER(btree_read_error)			\
-	EXPAND_COUNTER(btree_split)				\
 	EXPAND_COUNTER(btree_stale_read)			\
-	EXPAND_COUNTER(btree_update)				\
-	EXPAND_COUNTER(btree_walk)				\
-	EXPAND_COUNTER(btree_walk_restart)			\
 	EXPAND_COUNTER(client_farewell_error)			\
 	EXPAND_COUNTER(corrupt_btree_block_level)		\
 	EXPAND_COUNTER(corrupt_btree_no_child_ref)		\
@@ -66,65 +42,25 @@
 	EXPAND_COUNTER(dentry_revalidate_root)			\
 	EXPAND_COUNTER(dentry_revalidate_valid)			\
 	EXPAND_COUNTER(dir_backref_excessive_retries)		\
-	EXPAND_COUNTER(ext_op_insert)				\
-	EXPAND_COUNTER(ext_op_next)				\
-	EXPAND_COUNTER(ext_op_remove)				\
-	EXPAND_COUNTER(forest_bloom_fail)			\
-	EXPAND_COUNTER(forest_bloom_pass)			\
-	EXPAND_COUNTER(forest_read_items)			\
-	EXPAND_COUNTER(forest_roots_next_hint)			\
-	EXPAND_COUNTER(forest_set_bloom_bits)			\
-	EXPAND_COUNTER(item_clear_dirty)			\
-	EXPAND_COUNTER(item_create)				\
-	EXPAND_COUNTER(item_delete)				\
-	EXPAND_COUNTER(item_dirty)				\
-	EXPAND_COUNTER(item_invalidate)				\
-	EXPAND_COUNTER(item_invalidate_page)			\
-	EXPAND_COUNTER(item_lookup)				\
-	EXPAND_COUNTER(item_mark_dirty)				\
-	EXPAND_COUNTER(item_next)				\
-	EXPAND_COUNTER(item_page_accessed)			\
-	EXPAND_COUNTER(item_page_alloc)				\
-	EXPAND_COUNTER(item_page_clear_dirty)			\
-	EXPAND_COUNTER(item_page_compact)			\
-	EXPAND_COUNTER(item_page_free)				\
-	EXPAND_COUNTER(item_page_lru_add)			\
-	EXPAND_COUNTER(item_page_lru_remove)			\
-	EXPAND_COUNTER(item_page_mark_dirty)			\
-	EXPAND_COUNTER(item_page_rbtree_walk)			\
-	EXPAND_COUNTER(item_page_split)				\
-	EXPAND_COUNTER(item_pcpu_add_replaced)			\
-	EXPAND_COUNTER(item_pcpu_page_hit)			\
-	EXPAND_COUNTER(item_pcpu_page_miss)			\
-	EXPAND_COUNTER(item_pcpu_page_miss_keys)		\
-	EXPAND_COUNTER(item_read_pages_split)			\
-	EXPAND_COUNTER(item_shrink_page)			\
-	EXPAND_COUNTER(item_shrink_page_dirty)			\
-	EXPAND_COUNTER(item_shrink_page_reader)			\
-	EXPAND_COUNTER(item_shrink_page_trylock)		\
-	EXPAND_COUNTER(item_update)				\
-	EXPAND_COUNTER(item_write_dirty)			\
 	EXPAND_COUNTER(lock_alloc)				\
 	EXPAND_COUNTER(lock_free)				\
+	EXPAND_COUNTER(lock_grace_elapsed)			\
 	EXPAND_COUNTER(lock_grace_extended)			\
 	EXPAND_COUNTER(lock_grace_set)				\
 	EXPAND_COUNTER(lock_grace_wait)				\
 	EXPAND_COUNTER(lock_grant_request)			\
 	EXPAND_COUNTER(lock_grant_response)			\
-	EXPAND_COUNTER(lock_grant_work)				\
+	EXPAND_COUNTER(lock_invalidate_commit)			\
 	EXPAND_COUNTER(lock_invalidate_coverage)		\
 	EXPAND_COUNTER(lock_invalidate_inode)			\
 	EXPAND_COUNTER(lock_invalidate_request)			\
 	EXPAND_COUNTER(lock_invalidate_response)		\
-	EXPAND_COUNTER(lock_invalidate_sync)			\
-	EXPAND_COUNTER(lock_invalidate_work)			\
 	EXPAND_COUNTER(lock_lock)				\
 	EXPAND_COUNTER(lock_lock_error)				\
 	EXPAND_COUNTER(lock_nonblock_eagain)			\
 	EXPAND_COUNTER(lock_recover_request)			\
-	EXPAND_COUNTER(lock_shrink_attempted)			\
-	EXPAND_COUNTER(lock_shrink_aborted)			\
-	EXPAND_COUNTER(lock_shrink_work)			\
+	EXPAND_COUNTER(lock_shrink_queued)			\
+	EXPAND_COUNTER(lock_shrink_request_aborted)		\
 	EXPAND_COUNTER(lock_unlock)				\
 	EXPAND_COUNTER(lock_wait)				\
 	EXPAND_COUNTER(net_dropped_response)			\
@@ -149,37 +85,17 @@
 	EXPAND_COUNTER(quorum_write_block)			\
 	EXPAND_COUNTER(quorum_write_block_error)		\
 	EXPAND_COUNTER(quorum_fenced)				\
-	EXPAND_COUNTER(server_commit_hold)			\
-	EXPAND_COUNTER(server_commit_queue)			\
-	EXPAND_COUNTER(server_commit_worker)			\
-	EXPAND_COUNTER(srch_add_entry)				\
-	EXPAND_COUNTER(srch_compact_dirty_block)		\
-	EXPAND_COUNTER(srch_compact_entry)			\
-	EXPAND_COUNTER(srch_compact_flush)			\
-	EXPAND_COUNTER(srch_compact_log_page)			\
-	EXPAND_COUNTER(srch_compact_removed_entry)		\
-	EXPAND_COUNTER(srch_inconsistent_ref)			\
-	EXPAND_COUNTER(srch_rotate_log)				\
-	EXPAND_COUNTER(srch_search_log)				\
-	EXPAND_COUNTER(srch_search_log_block)			\
-	EXPAND_COUNTER(srch_search_retry_empty)			\
-	EXPAND_COUNTER(srch_search_sorted)			\
-	EXPAND_COUNTER(srch_search_sorted_block)		\
-	EXPAND_COUNTER(srch_search_stale_eio)			\
-	EXPAND_COUNTER(srch_search_stale_retry)			\
-	EXPAND_COUNTER(srch_search_xattrs)			\
-	EXPAND_COUNTER(srch_read_stale)				\
-	EXPAND_COUNTER(statfs)					\
+	EXPAND_COUNTER(radix_enospc_data)			\
+	EXPAND_COUNTER(radix_enospc_paths)			\
+	EXPAND_COUNTER(radix_enospc_synth)			\
 	EXPAND_COUNTER(trans_commit_data_alloc_low)		\
-	EXPAND_COUNTER(trans_commit_dirty_meta_full)		\
 	EXPAND_COUNTER(trans_commit_fsync)			\
-	EXPAND_COUNTER(trans_commit_meta_alloc_low)		\
+	EXPAND_COUNTER(trans_commit_full)			\
 	EXPAND_COUNTER(trans_commit_sync_fs)			\
-	EXPAND_COUNTER(trans_commit_timer)			\
-	EXPAND_COUNTER(trans_commit_written)
+	EXPAND_COUNTER(trans_commit_timer)

-#define FIRST_COUNTER	alloc_alloc_data
-#define LAST_COUNTER	trans_commit_written
+#define FIRST_COUNTER	block_cache_access
+#define LAST_COUNTER	trans_commit_timer

 #undef EXPAND_COUNTER
 #define EXPAND_COUNTER(which) struct percpu_counter which;
@@ -197,21 +113,11 @@ struct scoutfs_counters {
 	     pcpu <= &SCOUTFS_SB(sb)->counters->LAST_COUNTER;	\
 	     pcpu++)

-/*
- * We always read with _sum, we have no use for the shared count and
- * certainly don't want to pay the cost of a shared lock to update it.
- * The default batch of 32 make counter increments show up significantly
- * in profiles.
- */
-#define SCOUTFS_PCPU_COUNTER_BATCH (1 << 30)
+#define scoutfs_inc_counter(sb, which) \
+	percpu_counter_inc(&SCOUTFS_SB(sb)->counters->which)

-#define scoutfs_inc_counter(sb, which)					\
-	__percpu_counter_add(&SCOUTFS_SB(sb)->counters->which, 1,	\
-			     SCOUTFS_PCPU_COUNTER_BATCH)
-
-#define scoutfs_add_counter(sb, which, cnt)				\
-	__percpu_counter_add(&SCOUTFS_SB(sb)->counters->which, cnt,	\
-			     SCOUTFS_PCPU_COUNTER_BATCH)
+#define scoutfs_add_counter(sb, which, cnt) \
+	percpu_counter_add(&SCOUTFS_SB(sb)->counters->which, cnt)

 void __init scoutfs_init_counters(void);
 int scoutfs_setup_counters(struct super_block *sb);
@@ -47,7 +47,7 @@ struct scoutfs_traced_extent {

 extern const struct address_space_operations scoutfs_file_aops;
 extern const struct file_operations scoutfs_file_fops;
-struct scoutfs_alloc;
+struct scoutfs_radix_allocator;
 struct scoutfs_block_writer;

 int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
@@ -58,8 +58,6 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len);
 int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
 				     struct scoutfs_lock *lock);
-int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
-			     u64 byte_len, struct inode *to, u64 to_off);

 int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
 			    u8 sef, u8 op, struct scoutfs_data_wait *ow,
@@ -79,12 +77,11 @@ int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock,
 			 unsigned int nr);

 void scoutfs_data_init_btrees(struct super_block *sb,
-			      struct scoutfs_alloc *alloc,
+			      struct scoutfs_radix_allocator *alloc,
 			      struct scoutfs_block_writer *wri,
 			      struct scoutfs_log_trees *lt);
 void scoutfs_data_get_btrees(struct super_block *sb,
 			     struct scoutfs_log_trees *lt);
-int scoutfs_data_prepare_commit(struct super_block *sb);
 u64 scoutfs_data_alloc_free_bytes(struct super_block *sb);

 int scoutfs_data_setup(struct super_block *sb);
@@ -13,6 +13,7 @@
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/crc32c.h>
 #include <linux/uio.h>
 #include <linux/xattr.h>
 #include <linux/namei.h>
@@ -27,9 +28,9 @@
 #include "super.h"
 #include "trans.h"
 #include "xattr.h"
-#include "item.h"
+#include "kvec.h"
+#include "forest.h"
 #include "lock.h"
-#include "hash.h"
 #include "counters.h"
 #include "scoutfs_trace.h"

@@ -78,7 +79,7 @@ static unsigned int mode_to_type(umode_t mode)
 #undef S_SHIFT
 }

-static unsigned int dentry_type(enum scoutfs_dentry_type type)
+static unsigned int dentry_type(unsigned int type)
 {
 	static unsigned char types[] = {
 		[SCOUTFS_DT_FIFO]	= DT_FIFO,
@@ -212,44 +213,12 @@ static struct scoutfs_dirent *alloc_dirent(unsigned int name_len)
 	return kmalloc(dirent_bytes(name_len), GFP_NOFS);
 }

-/*
- * Test a bit number as though an array of bytes is a large len-bit
- * big-endian value.  nr 0 is the LSB of the final byte, nr (len - 1) is
- * the MSB of the first byte.
- */
-static int test_be_bytes_bit(int nr, const char *bytes, int len)
-{
-	return bytes[(len - 1 - nr) >> 3] & (1 << (nr & 7));
-}
-
-/*
- * Generate a 32bit "fingerprint" of the name by extracting 32 evenly
- * distributed bits from the name.  The intent is to have the sort order
- * of the fingerprints reflect the memcmp() sort order of the names
- * while mapping large names down to small fs keys.
- *
- * Names that are smaller than 32bits are biased towards the high bits
- * of the fingerprint so that most significant bits of the fingerprints
- * consistently reflect the initial characters of the names.
- */
-static u32 dirent_name_fingerprint(const char *name, unsigned int name_len)
-{
-	int name_bits = name_len * 8;
-	int skip = max(name_bits / 32, 1);
-	u32 fp = 0;
-	int f;
-	int n;
-
-	for (f = 31, n = name_bits - 1; f >= 0 && n >= 0; f--, n -= skip)
-		fp |= !!test_be_bytes_bit(n, name, name_bits) << f;
-
-	return fp;
-}
-
 static u64 dirent_name_hash(const char *name, unsigned int name_len)
 {
-       return scoutfs_hash32(name, name_len) |
-              ((u64)dirent_name_fingerprint(name, name_len) << 32);
+       unsigned int half = (name_len + 1) / 2;
+
+       return crc32c(~0, name, half) |
+              ((u64)crc32c(~0, name + name_len - half, half) << 32);
 }

 static u64 dirent_names_equal(const char *a_name, unsigned int a_len,
@@ -270,6 +239,7 @@ static int lookup_dirent(struct super_block *sb, u64 dir_ino, const char *name,
 	struct scoutfs_key last_key;
 	struct scoutfs_key key;
 	struct scoutfs_dirent *dent = NULL;
+	struct kvec val;
 	int ret;

 	dent = alloc_dirent(SCOUTFS_NAME_LEN);
@@ -280,10 +250,10 @@ static int lookup_dirent(struct super_block *sb, u64 dir_ino, const char *name,

 	init_dirent_key(&key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, 0);
 	init_dirent_key(&last_key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, U64_MAX);
+	kvec_init(&val, dent, dirent_bytes(SCOUTFS_NAME_LEN));

 	for (;;) {
-		ret = scoutfs_item_next(sb, &key, &last_key, dent,
-					dirent_bytes(SCOUTFS_NAME_LEN), lock);
+		ret = scoutfs_forest_next(sb, &key, &last_key, &val, lock);
 		if (ret < 0)
 			break;

@@ -482,6 +452,7 @@ static int KC_DECLARE_READDIR(scoutfs_readdir, struct file *file,
 	struct scoutfs_key key;
 	struct scoutfs_key last_key;
 	struct scoutfs_lock *dir_lock;
+	struct kvec val;
 	int name_len;
 	u64 pos;
 	int ret;
@@ -497,6 +468,7 @@ static int KC_DECLARE_READDIR(scoutfs_readdir, struct file *file,

 	init_dirent_key(&last_key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode),
 			SCOUTFS_DIRENT_LAST_POS, 0);
+	kvec_init(&val, dent, dirent_bytes(SCOUTFS_NAME_LEN));

 	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &dir_lock);
 	if (ret)
@@ -506,9 +478,7 @@ static int KC_DECLARE_READDIR(scoutfs_readdir, struct file *file,
 		init_dirent_key(&key, SCOUTFS_READDIR_TYPE, scoutfs_ino(inode),
 				kc_readdir_pos(file, ctx), 0);

-		ret = scoutfs_item_next(sb, &key, &last_key, dent,
-					dirent_bytes(SCOUTFS_NAME_LEN),
-					dir_lock);
+		ret = scoutfs_forest_next(sb, &key, &last_key, &val, dir_lock);
 		if (ret < 0) {
 			if (ret == -ENOENT)
 				ret = 0;
@@ -565,6 +535,7 @@ static int add_entry_items(struct super_block *sb, u64 dir_ino, u64 hash,
 	struct scoutfs_dirent *dent;
 	bool del_ent = false;
 	bool del_rdir = false;
+	struct kvec val;
 	int ret;

 	dent = alloc_dirent(name_len);
@@ -583,27 +554,25 @@ static int add_entry_items(struct super_block *sb, u64 dir_ino, u64 hash,
 	init_dirent_key(&ent_key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, pos);
 	init_dirent_key(&rdir_key, SCOUTFS_READDIR_TYPE, dir_ino, pos, 0);
 	init_dirent_key(&lb_key, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, pos);
+	kvec_init(&val, dent, dirent_bytes(name_len));

-	ret = scoutfs_item_create(sb, &ent_key, dent, dirent_bytes(name_len),
-				  dir_lock);
+	ret = scoutfs_forest_create(sb, &ent_key, &val, dir_lock);
 	if (ret)
 		goto out;
 	del_ent = true;

-	ret = scoutfs_item_create(sb, &rdir_key, dent, dirent_bytes(name_len),
-				  dir_lock);
+	ret = scoutfs_forest_create(sb, &rdir_key, &val, dir_lock);
 	if (ret)
 		goto out;
 	del_rdir = true;

-	ret = scoutfs_item_create(sb, &lb_key, dent, dirent_bytes(name_len),
-				  inode_lock);
+	ret = scoutfs_forest_create(sb, &lb_key, &val, inode_lock);
 out:
 	if (ret < 0) {
 		if (del_ent)
-			scoutfs_item_delete(sb, &ent_key, dir_lock);
+			scoutfs_forest_delete_dirty(sb, &ent_key);
 		if (del_rdir)
-			scoutfs_item_delete(sb, &rdir_key, dir_lock);
+			scoutfs_forest_delete_dirty(sb, &rdir_key);
 	}

 	kfree(dent);
@@ -625,20 +594,23 @@ static int del_entry_items(struct super_block *sb, u64 dir_ino, u64 hash,
 	struct scoutfs_key rdir_key;
 	struct scoutfs_key ent_key;
 	struct scoutfs_key lb_key;
+	LIST_HEAD(dir_saved);
+	LIST_HEAD(inode_saved);
 	int ret;

 	init_dirent_key(&ent_key, SCOUTFS_DIRENT_TYPE, dir_ino, hash, pos);
 	init_dirent_key(&rdir_key, SCOUTFS_READDIR_TYPE, dir_ino, pos, 0);
 	init_dirent_key(&lb_key, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, pos);

-	ret = scoutfs_item_dirty(sb, &ent_key, dir_lock) ?:
-	      scoutfs_item_dirty(sb, &rdir_key, dir_lock) ?:
-	      scoutfs_item_dirty(sb, &lb_key, inode_lock);
-	if (ret == 0) {
-		ret = scoutfs_item_delete(sb, &ent_key, dir_lock) ?:
-		      scoutfs_item_delete(sb, &rdir_key, dir_lock) ?:
-		      scoutfs_item_delete(sb, &lb_key, inode_lock);
-		BUG_ON(ret); /* _dirty should have guaranteed success */
+	ret = scoutfs_forest_delete_save(sb, &ent_key, &dir_saved, dir_lock) ?:
+	      scoutfs_forest_delete_save(sb, &rdir_key, &dir_saved, dir_lock) ?:
+	      scoutfs_forest_delete_save(sb, &lb_key, &inode_saved, inode_lock);
+	if (ret < 0) {
+		scoutfs_forest_restore(sb, &dir_saved, dir_lock);
+		scoutfs_forest_restore(sb, &inode_saved, inode_lock);
+	} else {
+		scoutfs_forest_free_batch(sb, &dir_saved);
+		scoutfs_forest_free_batch(sb, &inode_saved);
 	}

 	return ret;
@@ -670,7 +642,7 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
 	if (ret)
 		return ERR_PTR(ret);

-	ret = scoutfs_alloc_ino(sb, S_ISDIR(mode), &ino);
+	ret = scoutfs_alloc_ino(dir, &ino);
 	if (ret)
 		return ERR_PTR(ret);

@@ -988,16 +960,17 @@ static void init_symlink_key(struct scoutfs_key *key, u64 ino, u8 nr)
 * The target name can be null for deletion when val isn't used.  Size
 * still has to be provided to determine the number of items.
 */
-enum symlink_ops {
+enum {
 	SYM_CREATE = 0,
 	SYM_LOOKUP,
 	SYM_DELETE,
 };
-static int symlink_item_ops(struct super_block *sb, enum symlink_ops op, u64 ino,
+static int symlink_item_ops(struct super_block *sb, int op, u64 ino,
 			    struct scoutfs_lock *lock, const char *target,
 			    size_t size)
 {
 	struct scoutfs_key key;
+	struct kvec val;
 	unsigned bytes;
 	unsigned nr;
 	int ret;
@@ -1012,16 +985,14 @@ static int symlink_item_ops(struct super_block *sb, enum symlink_ops op, u64 ino

 		init_symlink_key(&key, ino, i);
 		bytes = min_t(u64, size, SCOUTFS_MAX_VAL_SIZE);
+		kvec_init(&val, (void *)target, bytes);

 		if (op == SYM_CREATE)
-			ret = scoutfs_item_create(sb, &key, (void *)target,
-						  bytes, lock);
+			ret = scoutfs_forest_create(sb, &key, &val, lock);
 		else if (op == SYM_LOOKUP)
-			ret = scoutfs_item_lookup_exact(sb, &key,
-						        (void *)target, bytes,
-							lock);
+			ret = scoutfs_forest_lookup_exact(sb, &key, &val, lock);
 		else if (op == SYM_DELETE)
-			ret = scoutfs_item_delete(sb, &key, lock);
+			ret = scoutfs_forest_delete(sb, &key, lock);
 		if (ret)
 			break;

@@ -1236,6 +1207,7 @@ int scoutfs_dir_add_next_linkref(struct super_block *sb, u64 ino,
 	struct scoutfs_key last_key;
 	struct scoutfs_key key;
 	struct scoutfs_lock *lock = NULL;
+	struct kvec val;
 	int len;
 	int ret;

@@ -1251,13 +1223,13 @@ int scoutfs_dir_add_next_linkref(struct super_block *sb, u64 ino,
 	init_dirent_key(&key, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, dir_pos);
 	init_dirent_key(&last_key, SCOUTFS_LINK_BACKREF_TYPE, ino, U64_MAX,
 			U64_MAX);
+	kvec_init(&val, &ent->dent, dirent_bytes(SCOUTFS_NAME_LEN));

 	ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, ino, &lock);
 	if (ret)
 		goto out;

-	ret = scoutfs_item_next(sb, &key, &last_key, &ent->dent,
-				dirent_bytes(SCOUTFS_NAME_LEN), lock);
+	ret = scoutfs_forest_next(sb, &key, &last_key, &val, lock);
 	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
 	lock = NULL;
 	if (ret < 0)
@@ -1,394 +0,0 @@
-/*
- * Copyright (C) 2020 Versity Software, Inc.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-#include <linux/kernel.h>
-#include <linux/fs.h>
-
-#include "ext.h"
-#include "counters.h"
-#include "scoutfs_trace.h"
-
-/*
- * Extents are used to track free block regions and to map logical file
- * regions to device blocks.   Extents can be split and merged as
- * they're modified.  These helpers implement all the fiddly extent
- * manipulations.  Callers provide callbacks which implement the actual
- * storage of extents in either the item cache or btree items.
- */
-
-static void ext_zero(struct scoutfs_extent *ext)
-{
-	memset(ext, 0, sizeof(struct scoutfs_extent));
-}
-
-static bool ext_overlap(struct scoutfs_extent *ext, u64 start, u64 len)
-{
-	u64 e_end = ext->start + ext->len - 1;
-	u64 end = start + len - 1;
-
-	return !(e_end < start || ext->start > end);
-}
-
-static bool ext_inside(u64 start, u64 len, struct scoutfs_extent *out)
-{
-	u64 in_end = start + len - 1;
-	u64 out_end = out->start + out->len - 1;
-
-	return out->start <= start && out_end >= in_end;
-}
-
-/* we only translate mappings when they exist */
-static inline u64 ext_map_add(u64 map, u64 diff)
-{
-	return map ? map + diff : 0;
-}
-
-/*
- * Extents can merge if they're logically contiguous, both don't have
- * mappings or have mappings which are also contiguous, and have
- * matching flags.
- */
-bool scoutfs_ext_can_merge(struct scoutfs_extent *left,
-			   struct scoutfs_extent *right)
-{
-	return (left->start + left->len == right->start) &&
-	       ((!left->map && !right->map) ||
-		(left->map + left->len == right->map)) &&
-	       (left->flags == right->flags);
-}
-
-/*
- * Split an existing extent in to left and right extents by removing
- * an interior range.  The split extents are all zeros if the range
- * extends to their end of the extent.
- */
-static void ext_split(struct scoutfs_extent *ext, u64 start, u64 len,
-		      struct scoutfs_extent *left,
-		      struct scoutfs_extent *right)
-{
-	if (ext->start < start) {
-		left->start = ext->start;
-		left->len = start - ext->start;
-		left->map = ext->map;
-		left->flags = ext->flags;
-	} else {
-		ext_zero(left);
-	}
-
-	if (ext->start + ext->len > start + len) {
-		right->start = start + len;
-		right->len = ext->start + ext->len - right->start;
-		right->map = ext_map_add(ext->map, right->start - ext->start);
-		right->flags = ext->flags;
-	} else {
-		ext_zero(right);
-	}
-}
-
-#define op_call(sb, ops, arg, which, args...)			\
-({								\
-	int _ret;						\
-	_ret = ops->which(sb, arg, ##args);			\
-	scoutfs_inc_counter(sb, ext_op_##which);		\
-	trace_scoutfs_ext_op_##which(sb, ##args, _ret);		\
-	_ret;							\
-})
-
-struct extent_changes {
-	struct scoutfs_extent exts[4];
-	bool ins[4];
-	u8 nr;
-};
-
-static void add_change(struct extent_changes *chg,
-		       struct scoutfs_extent *ext, bool ins)
-{
-	BUILD_BUG_ON(ARRAY_SIZE(chg->ins) != ARRAY_SIZE(chg->exts));
-
-	if (ext->len) {
-		BUG_ON(chg->nr == ARRAY_SIZE(chg->exts));
-		chg->exts[chg->nr] = *ext;
-		chg->ins[chg->nr] = !!ins;
-		chg->nr++;
-	}
-}
-
-static int apply_changes(struct super_block *sb, struct scoutfs_ext_ops *ops,
-			 void *arg, struct extent_changes *chg)
-{
-	int ret = 0;
-	int err;
-	int i;
-
-	for (i = 0; i < chg->nr; i++) {
-		if (chg->ins[i])
-			ret = op_call(sb, ops, arg, insert, chg->exts[i].start,
-				      chg->exts[i].len, chg->exts[i].map,
-				      chg->exts[i].flags);
-		else
-			ret = op_call(sb, ops, arg, remove, chg->exts[i].start,
-				      chg->exts[i].len, chg->exts[i].map,
-				      chg->exts[i].flags);
-		if (ret < 0)
-			break;
-	}
-
-	while (ret < 0 && --i >= 0) {
-		if (chg->ins[i])
-			err = op_call(sb, ops, arg, remove, chg->exts[i].start,
-				      chg->exts[i].len, chg->exts[i].map,
-				      chg->exts[i].flags);
-		else
-			err = op_call(sb, ops, arg, insert, chg->exts[i].start,
-				      chg->exts[i].len, chg->exts[i].map,
-				      chg->exts[i].flags);
-		BUG_ON(err); /* inconsistent */
-	}
-
-	return ret;
-}
-
-int scoutfs_ext_next(struct super_block *sb, struct scoutfs_ext_ops *ops,
-		     void *arg, u64 start, u64 len, struct scoutfs_extent *ext)
-{
-	int ret;
-
-	ret = op_call(sb, ops, arg, next, start, len, ext);
-	trace_scoutfs_ext_next(sb, start, len, ext, ret);
-	return ret;
-}
-
-/*
- * Insert the given extent.  EINVAL is returned if there's already an existing
- * overlapping extent.  This can merge with its neighbours.
- */
-int scoutfs_ext_insert(struct super_block *sb, struct scoutfs_ext_ops *ops,
-		       void *arg, u64 start, u64 len, u64 map, u8 flags)
-{
-	struct extent_changes chg = { .nr = 0 };
-	struct scoutfs_extent found;
-	struct scoutfs_extent ins;
-	int ret;
-
-	ins.start = start;
-	ins.len = len;
-	ins.map = map;
-	ins.flags = flags;
-
-	/* find right neighbour and check for overlap */
-	ret = op_call(sb, ops, arg, next, start, 1, &found);
-	if (ret < 0 && ret != -ENOENT)
-		goto out;
-
-	/* inserting extent must not overlap */
-	if (found.len && ext_overlap(&ins, found.start, found.len)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/* merge with right if we can */
-	if (found.len && scoutfs_ext_can_merge(&ins, &found)) {
-		ins.len += found.len;
-		add_change(&chg, &found, false);
-	}
-
-	/* see if we can merge with a left neighbour */
-	if (start > 0) {
-		ret = op_call(sb, ops, arg, next, start - 1,  1, &found);
-		if (ret < 0 && ret != -ENOENT)
-			goto out;
-
-		if (ret == 0 && scoutfs_ext_can_merge(&found, &ins)) {
-			ins.start = found.start;
-			ins.map = found.map;
-			ins.len += found.len;
-			add_change(&chg, &found, false);
-		}
-	}
-
-	add_change(&chg, &ins, true);
-	ret = apply_changes(sb, ops, arg, &chg);
-out:
-	trace_scoutfs_ext_insert(sb, start, len, map, flags, ret);
-	return ret;
-}
-
-/*
- * Remove the given extent.  The extent to remove must be found entirely
- * in an existing extent.  If the existing extent is larger then we leave
- * behind the remaining extent.  The existing extent can be split.
- */
-int scoutfs_ext_remove(struct super_block *sb, struct scoutfs_ext_ops *ops,
-		       void *arg, u64 start, u64 len)
-{
-	struct extent_changes chg = { .nr = 0 };
-	struct scoutfs_extent found;
-	struct scoutfs_extent left;
-	struct scoutfs_extent right;
-	int ret;
-
-	ret = op_call(sb, ops, arg, next, start, 1, &found);
-	if (ret < 0)
-		goto out;
-
-	/* removed extent must be entirely within found */
-	if (!ext_inside(start, len, &found)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	ext_split(&found, start, len, &left, &right);
-
-	add_change(&chg, &found, false);
-	add_change(&chg, &left, true);
-	add_change(&chg, &right, true);
-
-	ret = apply_changes(sb, ops, arg, &chg);
-out:
-	trace_scoutfs_ext_remove(sb, start, len, 0, 0, ret);
-	return ret;
-}
-
-/*
- * Find and remove the next extent, removing only a portion if the
- * extent is larger than the count.  Returns ENOENT if it didn't
- * find any extents.
- *
- * This does not search for merge candidates so it's safe to call with
- * extents indexed by length.
- */
-int scoutfs_ext_alloc(struct super_block *sb, struct scoutfs_ext_ops *ops,
-		      void *arg, u64 start, u64 len, u64 count,
-		      struct scoutfs_extent *ext)
-{
-	struct extent_changes chg = { .nr = 0 };
-	struct scoutfs_extent found;
-	struct scoutfs_extent ins;
-	int ret;
-
-	ret = op_call(sb, ops, arg, next, start, len, &found);
-	if (ret < 0)
-		goto out;
-
-	add_change(&chg, &found, false);
-
-	if (found.len > count) {
-		ins.start = found.start + count;
-		ins.len = found.len - count;
-		ins.map = ext_map_add(found.map, count);
-		ins.flags = found.flags;
-
-		add_change(&chg, &ins, true);
-	}
-
-	ret = apply_changes(sb, ops, arg, &chg);
-out:
-	if (ret == 0) {
-		ext->start = found.start;
-		ext->len = min(found.len, count);
-		ext->map = found.map;
-		ext->flags = found.flags;
-	} else {
-		ext_zero(ext);
-	}
-
-	trace_scoutfs_ext_alloc(sb, start, len, count, ext, ret);
-	return ret;
-}
-
-/*
- * Set the map and flags for an extent region, with the magical property
- * that extents with map and flags set to 0 are removed.
- *
- * If we're modifying an existing extent then the modification must be
- * fully inside the existing extent.  The modification can leave edges
- * of the extent which need to be inserted.  If the modification extends
- * to the end of the existing extent then we need to check for adjacent
- * neighbouring extents which might now be able to be merged.
- *
- * Inserting a new extent is like the case of modifying the entire
- * existing extent.  We need to check neighbours of the inserted extent
- * to see if they can be merged.
- */
-int scoutfs_ext_set(struct super_block *sb, struct scoutfs_ext_ops *ops,
-		    void *arg, u64 start, u64 len, u64 map, u8 flags)
-{
-	struct extent_changes chg = { .nr = 0 };
-	struct scoutfs_extent found;
-	struct scoutfs_extent left;
-	struct scoutfs_extent right;
-	struct scoutfs_extent set;
-	int ret;
-
-	set.start = start;
-	set.len = len;
-	set.map = map;
-	set.flags = flags;
-
-	/* find extent to remove */
-	ret = op_call(sb, ops, arg, next, start, 1, &found);
-	if (ret < 0 && ret != -ENOENT)
-		goto out;
-
-	if (ret == 0 && ext_overlap(&found, start, len)) {
-		/* set extent must be entirely within found */
-		if (!ext_inside(start, len, &found)) {
-			ret = -EINVAL;
-			goto out;
-		}
-
-		add_change(&chg, &found, false);
-		ext_split(&found, start, len, &left, &right);
-	} else {
-		ext_zero(&found);
-		ext_zero(&left);
-		ext_zero(&right);
-	}
-
-	if (left.len) {
-		/* inserting split left, won't merge */
-		add_change(&chg, &left, true);
-	} else if (start > 0) {
-		ret = op_call(sb, ops, arg, next, start - 1, 1, &left);
-		if (ret < 0 && ret != -ENOENT)
-			goto out;
-		else if (ret == 0 && scoutfs_ext_can_merge(&left, &set)) {
-			/* remove found left, merging */
-			set.start = left.start;
-			set.map = left.map;
-			set.len += left.len;
-			add_change(&chg, &left, false);
-		}
-	}
-
-	if (right.len) {
-		/* inserting split right, won't merge */
-		add_change(&chg, &right, true);
-	} else {
-		ret = op_call(sb, ops, arg, next, start + len, 1, &right);
-		if (ret < 0 && ret != -ENOENT)
-			goto out;
-		else if (ret == 0 && scoutfs_ext_can_merge(&set, &right)) {
-			/* remove found right, merging */
-			set.len += right.len;
-			add_change(&chg, &right, false);
-		}
-	}
-
-	if (set.flags || set.map)
-		add_change(&chg, &set, true);
-
-	ret = apply_changes(sb, ops, arg, &chg);
-out:
-	trace_scoutfs_ext_set(sb, start, len, map, flags, ret);
-	return ret;
-}
@@ -1,35 +0,0 @@
-#ifndef _SCOUTFS_EXT_H_
-#define _SCOUTFS_EXT_H_
-
-struct scoutfs_extent {
-	u64 start;
-	u64 len;
-	u64 map;
-	u8 flags;
-};
-
-struct scoutfs_ext_ops {
-	int (*next)(struct super_block *sb, void *arg,
-		    u64 start, u64 len, struct scoutfs_extent *ext);
-	int (*insert)(struct super_block *sb, void *arg,
-		      u64 start, u64 len, u64 map, u8 flags);
-	int (*remove)(struct super_block *sb, void *arg, u64 start, u64 len,
-		      u64 map, u8 flags);
-};
-
-bool scoutfs_ext_can_merge(struct scoutfs_extent *left,
-			   struct scoutfs_extent *right);
-
-int scoutfs_ext_next(struct super_block *sb, struct scoutfs_ext_ops *ops,
-		     void *arg, u64 start, u64 len, struct scoutfs_extent *ext);
-int scoutfs_ext_insert(struct super_block *sb, struct scoutfs_ext_ops *ops,
-		       void *arg, u64 start, u64 len, u64 map, u8 flags);
-int scoutfs_ext_remove(struct super_block *sb, struct scoutfs_ext_ops *ops,
-		       void *arg, u64 start, u64 len);
-int scoutfs_ext_alloc(struct super_block *sb, struct scoutfs_ext_ops *ops,
-		      void *arg, u64 start, u64 len, u64 limit,
-		      struct scoutfs_extent *ext);
-int scoutfs_ext_set(struct super_block *sb, struct scoutfs_ext_ops *ops,
-		    void *arg, u64 start, u64 len, u64 map, u8 flags);
-
-#endif
@@ -50,7 +50,9 @@ retry:
 	if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
 		/* protect checked extents from stage/release */
 		mutex_lock(&inode->i_mutex);
+		mutex_lock(&si->s_i_mutex);
 		atomic_inc(&inode->i_dio_count);
+		mutex_unlock(&si->s_i_mutex);
 		mutex_unlock(&inode->i_mutex);

 		ret = scoutfs_data_wait_check_iov(inode, iov, nr_segs, pos,
@@ -1,43 +1,54 @@
 #ifndef _SCOUTFS_FOREST_H_
 #define _SCOUTFS_FOREST_H_

-struct scoutfs_alloc;
+struct scoutfs_radix_allocator;
 struct scoutfs_block_writer;
-struct scoutfs_block;
-
-#include "btree.h"
-
-/* caller gives an item to the callback */
-typedef int (*scoutfs_forest_item_cb)(struct super_block *sb,
-				      struct scoutfs_key *key,
-				      struct scoutfs_log_item_value *liv,
-				      void *val, int val_len, void *arg);

+int scoutfs_forest_lookup(struct super_block *sb, struct scoutfs_key *key,
+			  struct kvec *val, struct scoutfs_lock *lock);
+int scoutfs_forest_lookup_exact(struct super_block *sb,
+				struct scoutfs_key *key, struct kvec *val,
+				struct scoutfs_lock *lock);
+int scoutfs_forest_next(struct super_block *sb, struct scoutfs_key *key,
+			struct scoutfs_key *last, struct kvec *val,
+			struct scoutfs_lock *lock);
 int scoutfs_forest_next_hint(struct super_block *sb, struct scoutfs_key *key,
 			     struct scoutfs_key *next);
-int scoutfs_forest_read_items(struct super_block *sb,
-			      struct scoutfs_lock *lock,
-			      struct scoutfs_key *key,
-			      struct scoutfs_key *start,
-			      struct scoutfs_key *end,
-			      scoutfs_forest_item_cb cb, void *arg);
-int scoutfs_forest_set_bloom_bits(struct super_block *sb,
-				  struct scoutfs_lock *lock);
-void scoutfs_forest_set_max_vers(struct super_block *sb, u64 max_vers);
-int scoutfs_forest_get_max_vers(struct super_block *sb,
-				struct scoutfs_super_block *super,
-				u64 *vers);
-int scoutfs_forest_insert_list(struct super_block *sb,
-			       struct scoutfs_btree_item_list *lst);
-int scoutfs_forest_srch_add(struct super_block *sb, u64 hash, u64 ino, u64 id);
+int scoutfs_forest_prev(struct super_block *sb, struct scoutfs_key *key,
+			struct scoutfs_key *first, struct kvec *val,
+			struct scoutfs_lock *lock);
+int scoutfs_forest_create(struct super_block *sb, struct scoutfs_key *key,
+			  struct kvec *val, struct scoutfs_lock *lock);
+int scoutfs_forest_create_force(struct super_block *sb,
+				struct scoutfs_key *key, struct kvec *val,
+				struct scoutfs_lock *lock);
+int scoutfs_forest_update(struct super_block *sb, struct scoutfs_key *key,
+			  struct kvec *val, struct scoutfs_lock *lock);
+int scoutfs_forest_delete_dirty(struct super_block *sb,
+			        struct scoutfs_key *key);
+int scoutfs_forest_delete(struct super_block *sb, struct scoutfs_key *key,
+			  struct scoutfs_lock *lock);
+int scoutfs_forest_delete_force(struct super_block *sb,
+				struct scoutfs_key *key,
+				struct scoutfs_lock *lock);
+int scoutfs_forest_delete_save(struct super_block *sb,
+			       struct scoutfs_key *key,
+			       struct list_head *list,
+			       struct scoutfs_lock *lock);
+int scoutfs_forest_restore(struct super_block *sb, struct list_head *list,
+			   struct scoutfs_lock *lock);
+void scoutfs_forest_free_batch(struct super_block *sb, struct list_head *list);

 void scoutfs_forest_init_btrees(struct super_block *sb,
-				struct scoutfs_alloc *alloc,
+				struct scoutfs_radix_allocator *alloc,
 				struct scoutfs_block_writer *wri,
 				struct scoutfs_log_trees *lt);
 void scoutfs_forest_get_btrees(struct super_block *sb,
 			       struct scoutfs_log_trees *lt);

+void scoutfs_forest_clear_lock(struct super_block *sb,
+			       struct scoutfs_lock *lock);
+
 int scoutfs_forest_setup(struct super_block *sb);
 void scoutfs_forest_destroy(struct super_block *sb);

@@ -8,47 +8,27 @@
 #define SCOUTFS_BLOCK_MAGIC_SUPER	0x103c428b
 #define SCOUTFS_BLOCK_MAGIC_BTREE	0xe597f96d
 #define SCOUTFS_BLOCK_MAGIC_BLOOM	0x31995604
-#define SCOUTFS_BLOCK_MAGIC_SRCH_BLOCK	0x897e4a7d
-#define SCOUTFS_BLOCK_MAGIC_SRCH_PARENT	0xb23a2a05
-#define SCOUTFS_BLOCK_MAGIC_ALLOC_LIST	0x8a93ac83
+#define SCOUTFS_BLOCK_MAGIC_RADIX	0xebeb5e65

 /*
- * The super block, quorum block, and file data allocation granularity
- * use the smaller 4KB block.
+ * The super block and btree blocks are fixed 4k.
 */
-#define SCOUTFS_BLOCK_SM_SHIFT		12
-#define SCOUTFS_BLOCK_SM_SIZE		(1 << SCOUTFS_BLOCK_SM_SHIFT)
-#define SCOUTFS_BLOCK_SM_MASK		(SCOUTFS_BLOCK_SM_SIZE - 1)
-#define SCOUTFS_BLOCK_SM_PER_PAGE	(PAGE_SIZE / SCOUTFS_BLOCK_SM_SIZE)
-#define SCOUTFS_BLOCK_SM_SECTOR_SHIFT	(SCOUTFS_BLOCK_SM_SHIFT - 9)
-#define SCOUTFS_BLOCK_SM_SECTORS	(1 << SCOUTFS_BLOCK_SM_SECTOR_SHIFT)
-#define SCOUTFS_BLOCK_SM_MAX		(U64_MAX >> SCOUTFS_BLOCK_SM_SHIFT)
-#define SCOUTFS_BLOCK_SM_PAGES_PER	(SCOUTFS_BLOCK_SM_SIZE / PAGE_SIZE)
-#define SCOUTFS_BLOCK_SM_PAGE_ORDER	(SCOUTFS_BLOCK_SM_SHIFT - PAGE_SHIFT)
-
-/*
- * The radix and btree structures, and the forest bloom block, use the
- * larger 64KB metadata block size.
- */
-#define SCOUTFS_BLOCK_LG_SHIFT		16
-#define SCOUTFS_BLOCK_LG_SIZE		(1 << SCOUTFS_BLOCK_LG_SHIFT)
-#define SCOUTFS_BLOCK_LG_MASK		(SCOUTFS_BLOCK_LG_SIZE - 1)
-#define SCOUTFS_BLOCK_LG_PER_PAGE	(PAGE_SIZE / SCOUTFS_BLOCK_LG_SIZE)
-#define SCOUTFS_BLOCK_LG_SECTOR_SHIFT	(SCOUTFS_BLOCK_LG_SHIFT - 9)
-#define SCOUTFS_BLOCK_LG_SECTORS	(1 << SCOUTFS_BLOCK_LG_SECTOR_SHIFT)
-#define SCOUTFS_BLOCK_LG_MAX		(U64_MAX >> SCOUTFS_BLOCK_LG_SHIFT)
-#define SCOUTFS_BLOCK_LG_PAGES_PER	(SCOUTFS_BLOCK_LG_SIZE / PAGE_SIZE)
-#define SCOUTFS_BLOCK_LG_PAGE_ORDER	(SCOUTFS_BLOCK_LG_SHIFT - PAGE_SHIFT)
-
-#define SCOUTFS_BLOCK_SM_LG_SHIFT	(SCOUTFS_BLOCK_LG_SHIFT - \
-					 SCOUTFS_BLOCK_SM_SHIFT)
+#define SCOUTFS_BLOCK_SHIFT 12
+#define SCOUTFS_BLOCK_SIZE (1 << SCOUTFS_BLOCK_SHIFT)
+#define SCOUTFS_BLOCK_MASK (SCOUTFS_BLOCK_SIZE - 1)
+#define SCOUTFS_BLOCKS_PER_PAGE (PAGE_SIZE / SCOUTFS_BLOCK_SIZE)
+#define SCOUTFS_BLOCK_SECTOR_SHIFT (SCOUTFS_BLOCK_SHIFT - 9)
+#define SCOUTFS_BLOCK_SECTORS (1 << SCOUTFS_BLOCK_SECTOR_SHIFT)
+#define SCOUTFS_BLOCK_MAX (U64_MAX >> SCOUTFS_BLOCK_SHIFT)

+#define SCOUTFS_PAGES_PER_BLOCK (SCOUTFS_BLOCK_SIZE / PAGE_SIZE)
+#define SCOUTFS_BLOCK_PAGE_ORDER (SCOUTFS_BLOCK_SHIFT - PAGE_SHIFT)

 /*
 * The super block leaves some room before the first block for platform
 * structures like boot loaders.
 */
-#define SCOUTFS_SUPER_BLKNO ((64ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
+#define SCOUTFS_SUPER_BLKNO ((64ULL * 1024) >> SCOUTFS_BLOCK_SHIFT)

 /*
 * A reasonably large region of aligned quorum blocks follow the super
@@ -58,14 +38,8 @@
 * mounts that have a reasonable probability of not overwriting each
 * other's random block locations.
 */
-#define SCOUTFS_QUORUM_BLKNO	((256ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
-#define SCOUTFS_QUORUM_BLOCKS	((256ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
-
-/*
- * Start data on the data device aligned as well.
- */
-#define SCOUTFS_DATA_DEV_START_BLKNO ((256ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
-
+#define SCOUTFS_QUORUM_BLKNO		((256ULL * 1024) >> SCOUTFS_BLOCK_SHIFT)
+#define SCOUTFS_QUORUM_BLOCKS		((256ULL * 1024) >> SCOUTFS_BLOCK_SHIFT)

 #define SCOUTFS_UNIQUE_NAME_MAX_BYTES	64 /* includes null */

@@ -75,15 +49,18 @@
 struct scoutfs_timespec {
 	__le64 sec;
 	__le32 nsec;
-	__u8 __pad[4];
-};
+} __packed;
+
+struct scoutfs_betimespec {
+	__be64 sec;
+	__be32 nsec;
+} __packed;

 /* XXX ipv6 */
 struct scoutfs_inet_addr {
 	__le32 addr;
 	__le16 port;
-	__u8 __pad[2];
-};
+} __packed;

 /*
 * This header is stored at the start of btree blocks and the super
@@ -96,7 +73,7 @@ struct scoutfs_block_header {
 	__le64 fsid;
 	__le64 seq;
 	__le64 blkno;
-};
+} __packed;

 /*
 * scoutfs identifies all file system metadata items by a small key
@@ -112,19 +89,23 @@ struct scoutfs_block_header {
 * increment them, subtract them from each other, etc.
 */
 struct scoutfs_key {
+	__u8	sk_zone;
 	__le64	_sk_first;
+	__u8	sk_type;
 	__le64	_sk_second;
 	__le64	_sk_third;
 	__u8	_sk_fourth;
-	__u8	sk_zone;
-	__u8	sk_type;
-	__u8	__pad[5];
-};
+}__packed;

 /* inode index */
 #define skii_major	_sk_second
 #define skii_ino	_sk_third

+/* xattr index */
+#define skxi_hash	_sk_first
+#define skxi_ino	_sk_second
+#define skxi_id		_sk_third
+
 /* node orphan inode */
 #define sko_rid		_sk_first
 #define sko_ino		_sk_second
@@ -147,59 +128,85 @@ struct scoutfs_key {
 #define sks_ino		_sk_first
 #define sks_nr		_sk_second

-/* data extents */
-#define skdx_ino	_sk_first
-#define skdx_end	_sk_second
-#define skdx_len	_sk_third
-
-/* log trees */
-#define sklt_rid	_sk_first
-#define sklt_nr		_sk_second
-
-/* lock clients */
-#define sklc_rid	_sk_first
-
-/* seqs */
-#define skts_trans_seq	_sk_first
-#define skts_rid	_sk_second
-
-/* mounted clients */
-#define skmc_rid	_sk_first
-
-/* free extents by blkno */
-#define skfb_end	_sk_second
-#define skfb_len	_sk_third
-/* free extents by len */
-#define skfl_neglen	_sk_second
-#define skfl_blkno	_sk_third
+/* packed extents */
+#define skpe_ino	_sk_first
+#define skpe_base	_sk_second
+#define skpe_part	_sk_fourth

 struct scoutfs_radix_block {
 	struct scoutfs_block_header hdr;
+	__le32 sm_first;
+	__le32 lg_first;
 	union {
 		struct scoutfs_radix_ref {
 			__le64 blkno;
 			__le64 seq;
 			__le64 sm_total;
 			__le64 lg_total;
-		} refs[0];
+		} __packed refs[0];
 		__le64 bits[0];
-	};
-};
+	} __packed;
+} __packed;

-struct scoutfs_avl_root {
-	__le16 node;
-};
-
-struct scoutfs_avl_node {
-	__le16 parent;
-	__le16 left;
-	__le16 right;
+struct scoutfs_radix_root {
 	__u8 height;
-	__u8 __pad[1];
-};
+	__le64 next_find_bit;
+	struct scoutfs_radix_ref ref;
+} __packed;

+#define SCOUTFS_RADIX_REFS \
+	((SCOUTFS_BLOCK_SIZE - offsetof(struct scoutfs_radix_block, refs[0])) /\
+		sizeof(struct scoutfs_radix_ref))
+
+/* 8 meg regions with 4k data blocks */
+#define SCOUTFS_RADIX_LG_SHIFT	11
+#define SCOUTFS_RADIX_LG_BITS	(1 << SCOUTFS_RADIX_LG_SHIFT)
+#define SCOUTFS_RADIX_LG_MASK	(SCOUTFS_RADIX_LG_BITS - 1)
+
+/* round block bits down to a multiple of large ranges */
+#define SCOUTFS_RADIX_BITS					\
+	(((SCOUTFS_BLOCK_SIZE -					\
+	   offsetof(struct scoutfs_radix_block, bits[0])) * 8) &	\
+	 ~(__u64)SCOUTFS_RADIX_LG_MASK)
+#define SCOUTFS_RADIX_BITS_BYTES (SCOUTFS_RADIX_BITS / 8)
+
+/*
+ * The btree still uses memcmp() to compare keys.  We should fix that
+ * before too long.
+ */
+struct scoutfs_key_be {
+	__u8	sk_zone;
+	__be64	_sk_first;
+	__u8	sk_type;
+	__be64	_sk_second;
+	__be64	_sk_third;
+	__u8	_sk_fourth;
+}__packed;
+
+/* chose reasonable max key lens that have room for some u64s */
+#define SCOUTFS_BTREE_MAX_KEY_LEN 40
 /* when we split we want to have multiple items on each side */
-#define SCOUTFS_BTREE_MAX_VAL_LEN 896
+#define SCOUTFS_BTREE_MAX_VAL_LEN (SCOUTFS_BLOCK_SIZE / 8)
+
+/*
+ * The min number of free bytes we must leave in a parent as we descend
+ * to modify.  This leaves enough free bytes to insert a possibly maximal
+ * sized key as a separator for a child block.  Fewer bytes than this
+ * and split/merge might try to insert a max child item in the parent
+ * that wouldn't fit.
+ */
+#define SCOUTFS_BTREE_PARENT_MIN_FREE_BYTES				\
+	(sizeof(struct scoutfs_btree_item_header) +			\
+	 sizeof(struct scoutfs_btree_item) + SCOUTFS_BTREE_MAX_KEY_LEN +\
+	 sizeof(struct scoutfs_btree_ref))
+
+/*
+ * When debugging we can tune the splitting and merging thresholds to
+ * create much larger trees by having blocks with many fewer items.  We
+ * implement this by pretending the blocks are tiny.  They're still
+ * large enough for a handful of items.
+ */
+#define SCOUTFS_BTREE_TINY_BLOCK_SIZE	512

 /*
 * A 4EB test image measured a worst case height of 17.  This is plenty
@@ -210,7 +217,7 @@ struct scoutfs_avl_node {
 struct scoutfs_btree_ref {
 	__le64 blkno;
 	__le64 seq;
-};
+} __packed;

 /*
 * A height of X means that the first block read will have level X-1 and
@@ -219,230 +226,91 @@ struct scoutfs_btree_ref {
 struct scoutfs_btree_root {
 	struct scoutfs_btree_ref ref;
 	__u8 height;
-	__u8 __pad[7];
-};
+} __packed;
+
+struct scoutfs_btree_item_header {
+	__le32 off;
+} __packed;

 struct scoutfs_btree_item {
-	struct scoutfs_avl_node node;
-	struct scoutfs_key key;
-	__le16 val_off;
+	__le16 key_len;
 	__le16 val_len;
-	__u8 __pad[4];
-};
+	__u8 data[0];
+} __packed;

 struct scoutfs_btree_block {
 	struct scoutfs_block_header hdr;
-	struct scoutfs_avl_root item_root;
-	__le16 nr_items;
-	__le16 total_item_bytes;
-	__le16 mid_free_len;
+	__le32 free_end;
+	__le32 nr_items;
 	__u8 level;
-	__u8 __pad[7];
-	struct scoutfs_btree_item items[0];
-	/* leaf blocks have a fixed size item offset hash table at the end */
-};
-
-#define SCOUTFS_BTREE_VALUE_ALIGN 8
+	struct scoutfs_btree_item_header item_hdrs[0];
+} __packed;

 /*
- * Try to aim for a 75% load in a leaf full of items with no value.
- * We'll almost never see this because most items have values and most
- * blocks aren't full.
+ * The lock server keeps a persistent record of connected clients so that
+ * server failover knows who to wait for before resuming operations.
 */
-#define SCOUTFS_BTREE_LEAF_ITEM_HASH_NR_UNALIGNED			  \
-	((SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_btree_block)) /	  \
-	 (sizeof(struct scoutfs_btree_item) + (sizeof(__le16))) * 100 / 75)
-#define SCOUTFS_BTREE_LEAF_ITEM_HASH_NR					  \
-	(round_up(SCOUTFS_BTREE_LEAF_ITEM_HASH_NR_UNALIGNED,		  \
-		  SCOUTFS_BTREE_VALUE_ALIGN))
-#define SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES \
-	(SCOUTFS_BTREE_LEAF_ITEM_HASH_NR * sizeof(__le16))
-
-struct scoutfs_alloc_list_ref {
-	__le64 blkno;
-	__le64 seq;
-};
+struct scoutfs_lock_client_btree_key {
+	__be64 rid;
+} __packed;

 /*
- * first_nr tracks the nr of the first block in the list and is used for
- * allocation sizing. total_nr is the sum of the nr of all the blocks in
- * the list and is used for calculating total free block counts.
+ * The server tracks transaction sequence numbers that clients have
+ * open.  This limits results that can be returned from the seq indices.
 */
-struct scoutfs_alloc_list_head {
-	struct scoutfs_alloc_list_ref ref;
-	__le64 total_nr;
-	__le32 first_nr;
-	__u8 __pad[4];
-};
+struct scoutfs_trans_seq_btree_key {
+	__be64 trans_seq;
+	__be64 rid;
+} __packed;

 /*
- * While the main allocator uses extent items in btree blocks, metadata
- * allocations for a single transaction are recorded in arrays in
- * blocks.  This limits the number of allocations and frees needed to
- * cow and modify the structure.  The blocks can be stored in a list
- * which lets us create a persistent log of pending frees that are
- * generated as we cow btree blocks to insert freed extents.
- *
- * The array floats in the block so that both adding and removing blknos
- * only modifies an index.
+ * The server keeps a persistent record of mounted clients.
 */
-struct scoutfs_alloc_list_block {
-	struct scoutfs_block_header hdr;
-	struct scoutfs_alloc_list_ref next;
-	__le32 start;
-	__le32 nr;
-	__le64 blknos[0]; /* naturally aligned for sorting */
-};
-
-#define SCOUTFS_ALLOC_LIST_MAX_BLOCKS					      \
-	((SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_alloc_list_block)) /  \
-	 (member_sizeof(struct scoutfs_alloc_list_block, blknos[0])))
-
-/*
- * These can safely be initialized to all-zeros.
- */
-struct scoutfs_alloc_root {
-	__le64 total_len;
-	struct scoutfs_btree_root root;
-};
-
-/* types of allocators, exposed to alloc_detail ioctl */
-#define SCOUTFS_ALLOC_OWNER_NONE	0
-#define SCOUTFS_ALLOC_OWNER_SERVER	1
-#define SCOUTFS_ALLOC_OWNER_MOUNT	2
-#define SCOUTFS_ALLOC_OWNER_SRCH	3
+struct scoutfs_mounted_client_btree_key {
+	__be64 rid;
+} __packed;

 struct scoutfs_mounted_client_btree_val {
 	__u8 flags;
-};
+} __packed;

 #define SCOUTFS_MOUNTED_CLIENT_VOTER	(1 << 0)

-/*
- * srch files are a contiguous run of blocks with compressed entries
- * described by a dense parent radix.  The files can be stored in
- * log_tree items when the files contain unsorted entries written by
- * mounts during their transactions.  Sorted files of increasing size
- * are kept in a btree off the super for searching and further
- * compacting.
- */
-struct scoutfs_srch_entry {
-	__le64 hash;
-	__le64 ino;
-	__le64 id;
-};
-
-#define SCOUTFS_SRCH_ENTRY_MAX_BYTES	(2 + (sizeof(__u64) * 3))
-
-struct scoutfs_srch_ref {
-	__le64 blkno;
-	__le64 seq;
-};
-
-struct scoutfs_srch_file {
-	struct scoutfs_srch_entry first;
-	struct scoutfs_srch_entry last;
-	struct scoutfs_srch_ref ref;
-	__le64 blocks;
-	__le64 entries;
-	__u8 height;
-	__u8 __pad[7];
-};
-
-struct scoutfs_srch_parent {
-	struct scoutfs_block_header hdr;
-	struct scoutfs_srch_ref refs[0];
-};
-
-#define SCOUTFS_SRCH_PARENT_REFS				\
-	((SCOUTFS_BLOCK_LG_SIZE -				\
-	  offsetof(struct scoutfs_srch_parent, refs)) /		\
-	 sizeof(struct scoutfs_srch_ref))
-
-struct scoutfs_srch_block {
-	struct scoutfs_block_header hdr;
-	struct scoutfs_srch_entry first;
-	struct scoutfs_srch_entry last;
-	struct scoutfs_srch_entry tail;
-	__le32 entry_nr;
-	__le32 entry_bytes;
-	__u8 entries[0];
-};
-
-/*
- * Decoding loads final small deltas with full __u64 loads.  Rather than
- * check the size before each load we stop coding entries past the point
- * where a full size entry could overflow the block.  A final entry can
- * start at this byte count and consume the rest of the block, though
- * its unlikely.
- */
-#define SCOUTFS_SRCH_BLOCK_SAFE_BYTES					\
-	(SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_srch_block) -	\
-	 SCOUTFS_SRCH_ENTRY_MAX_BYTES)
-
-#define SCOUTFS_SRCH_LOG_BLOCK_LIMIT	(1024 * 1024 / SCOUTFS_BLOCK_LG_SIZE)
-#define SCOUTFS_SRCH_COMPACT_ORDER	2
-#define SCOUTFS_SRCH_COMPACT_NR		(1 << SCOUTFS_SRCH_COMPACT_ORDER)
-
-/*
- * A persistent record of a srch file compaction operation in progress.
- *
- * When compacting log files blk and pos aren't used.  When compacting
- * sorted files blk is the logical block number and pos is the byte
- * offset of the next entry.  When deleting files pos is the height of
- * the level that we're deleting, and blk is the logical block offset of
- * the next parent ref array index to descend through.
- */
-struct scoutfs_srch_compact {
-	struct scoutfs_alloc_list_head meta_avail;
-	struct scoutfs_alloc_list_head meta_freed;
-	__le64 id;
-	__u8 nr;
-	__u8 flags;
-	__u8 __pad[6];
-	struct scoutfs_srch_file out;
-	struct scoutfs_srch_compact_input {
-		struct scoutfs_srch_file sfl;
-		__le64 blk;
-		__le64 pos;
-	} in[SCOUTFS_SRCH_COMPACT_NR];
-};
-
-/* server -> client: combine input log file entries into output file */
-#define SCOUTFS_SRCH_COMPACT_FLAG_LOG		(1 << 0)
-/* server -> client: combine input sorted file entries into output file */
-#define SCOUTFS_SRCH_COMPACT_FLAG_SORTED	(1 << 1)
-/* server -> client: delete input files */
-#define SCOUTFS_SRCH_COMPACT_FLAG_DELETE	(1 << 2)
-/* client -> server: compaction phase (LOG,SORTED,DELETE) done */
-#define SCOUTFS_SRCH_COMPACT_FLAG_DONE		(1 << 4)
-/* client -> server: compaction failed */
-#define SCOUTFS_SRCH_COMPACT_FLAG_ERROR		(1 << 5)
-
 /*
 * XXX I imagine we should rename these now that they've evolved to track
 * all the btrees that clients use during a transaction.  It's not just
 * about item logs, it's about clients making changes to trees.
 */
 struct scoutfs_log_trees {
-	struct scoutfs_alloc_list_head meta_avail;
-	struct scoutfs_alloc_list_head meta_freed;
+	struct scoutfs_radix_root meta_avail;
+	struct scoutfs_radix_root meta_freed;
 	struct scoutfs_btree_root item_root;
 	struct scoutfs_btree_ref bloom_ref;
-	struct scoutfs_alloc_root data_avail;
-	struct scoutfs_alloc_root data_freed;
-	struct scoutfs_srch_file srch_file;
-	__le64 max_item_vers;
+	struct scoutfs_radix_root data_avail;
+	struct scoutfs_radix_root data_freed;
 	__le64 rid;
 	__le64 nr;
-};
+} __packed;
+
+struct scoutfs_log_trees_key {
+	__be64 rid;
+	__be64 nr;
+} __packed;
+
+struct scoutfs_log_trees_val {
+	struct scoutfs_radix_root meta_avail;
+	struct scoutfs_radix_root meta_freed;
+	struct scoutfs_btree_root item_root;
+	struct scoutfs_btree_ref bloom_ref;
+	struct scoutfs_radix_root data_avail;
+	struct scoutfs_radix_root data_freed;
+} __packed;

 struct scoutfs_log_item_value {
 	__le64 vers;
 	__u8 flags;
-	__u8 __pad[7];
 	__u8 data[0];
-};
+} __packed;

 /*
 * FS items are limited by the max btree value length with the log item
@@ -457,7 +325,7 @@ struct scoutfs_bloom_block {
 	struct scoutfs_block_header hdr;
 	__le64 total_set;
 	__le64 bits[0];
-};
+} __packed;

 /*
 * Item log trees are accompanied by a block of bits that make up a
@@ -466,33 +334,30 @@ struct scoutfs_bloom_block {
 * before the bloom filters fill up and start returning excessive false
 * positives.
 */
-#define SCOUTFS_FOREST_BLOOM_NRS		3
+#define SCOUTFS_FOREST_BLOOM_NRS		7
 #define SCOUTFS_FOREST_BLOOM_BITS \
-	(((SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_bloom_block)) /  \
-	 member_sizeof(struct scoutfs_bloom_block, bits[0])) *		  \
-	 member_sizeof(struct scoutfs_bloom_block, bits[0]) * 8)
-#define SCOUTFS_FOREST_BLOOM_FUNC_BITS		(SCOUTFS_BLOCK_LG_SHIFT + 3)
+	(((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_bloom_block)) /	\
+	 member_sizeof(struct scoutfs_bloom_block, bits[0])) *		\
+	 member_sizeof(struct scoutfs_bloom_block, bits[0]) * 8)

 /*
 * Keys are first sorted by major key zones.
 */
 #define SCOUTFS_INODE_INDEX_ZONE		1
-#define SCOUTFS_RID_ZONE			2
-#define SCOUTFS_FS_ZONE				3
-#define SCOUTFS_LOCK_ZONE			4
-/* Items only stored in server btrees */
-#define SCOUTFS_LOG_TREES_ZONE			6
-#define SCOUTFS_LOCK_CLIENTS_ZONE		7
-#define SCOUTFS_TRANS_SEQ_ZONE			8
-#define SCOUTFS_MOUNTED_CLIENT_ZONE		9
-#define SCOUTFS_SRCH_ZONE			10
-#define SCOUTFS_FREE_EXTENT_ZONE		11
+#define SCOUTFS_XATTR_INDEX_ZONE		2
+#define SCOUTFS_RID_ZONE			3
+#define SCOUTFS_FS_ZONE				4
+#define SCOUTFS_LOCK_ZONE			5
+#define SCOUTFS_MAX_ZONE			8 /* power of 2 is efficient */

 /* inode index zone */
 #define SCOUTFS_INODE_INDEX_META_SEQ_TYPE	1
 #define SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE	2
 #define SCOUTFS_INODE_INDEX_NR			3 /* don't forget to update */

+/* xattr index zone */
+#define SCOUTFS_XATTR_INDEX_NAME_TYPE		1
+
 /* rid zone (also used in server alloc btree) */
 #define SCOUTFS_ORPHAN_TYPE			1

@@ -503,27 +368,44 @@ struct scoutfs_bloom_block {
 #define SCOUTFS_READDIR_TYPE			4
 #define SCOUTFS_LINK_BACKREF_TYPE		5
 #define SCOUTFS_SYMLINK_TYPE			6
-#define SCOUTFS_DATA_EXTENT_TYPE		7
+#define SCOUTFS_PACKED_EXTENT_TYPE		7

 /* lock zone, only ever found in lock ranges, never in persistent items */
 #define SCOUTFS_RENAME_TYPE			1

-/* srch zone, only in server btrees */
-#define SCOUTFS_SRCH_LOG_TYPE		1
-#define SCOUTFS_SRCH_BLOCKS_TYPE	2
-#define SCOUTFS_SRCH_PENDING_TYPE	3
-#define SCOUTFS_SRCH_BUSY_TYPE		4
+#define SCOUTFS_MAX_TYPE			8 /* power of 2 is efficient */

-/* free extents in allocator btrees in client and server, by blkno or len */
-#define SCOUTFS_FREE_EXTENT_BLKNO_TYPE	1
-#define SCOUTFS_FREE_EXTENT_LEN_TYPE	2

-/* file data extents have start and len in key */
-struct scoutfs_data_extent_val {
-	__le64 blkno;
-	__u8 flags;
-	__u8 __pad[7];
-};
+/*
+ * The extents that map blocks in a fixed-size logical region of a file
+ * are packed and stored in item values.  The packed extents are
+ * contiguous so the starting logical block is implicit from the length
+ * of previous extents.  Sparse regions are represented by 0 flags and
+ * blkno.  The blkno of a packed extent is encoded as the zigzag (lsb is
+ * sign bit) difference from the last blkno of the previous extent.
+ * This guarantees that non-sparse extents must have a blkno delta of at
+ * least -1/1.  High zero bytes aren't stored.
+ */
+struct scoutfs_packed_extent {
+	__le16 count;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8 diff_bytes:4,
+	     flags:3,
+	     final:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u8 final:1,
+	     flags:3,
+	     diff_bytes:4;
+#else
+#error "no {BIG,LITTLE}_ENDIAN_BITFIELD defined?"
+#endif
+	__u8 le_blkno_diff[0];
+} __packed;
+
+#define SCOUTFS_PACKEXT_BLOCKS		(8 * 1024 * 1024 / SCOUTFS_BLOCK_SIZE)
+#define SCOUTFS_PACKEXT_BASE_SHIFT	(ilog2(SCOUTFS_PACKEXT_BLOCKS))
+#define SCOUTFS_PACKEXT_BASE_MASK	(~((__u64)SCOUTFS_PACKEXT_BLOCKS - 1))
+#define SCOUTFS_PACKEXT_MAX_BYTES	SCOUTFS_MAX_VAL_SIZE

 #define SEF_OFFLINE	(1 << 0)
 #define SEF_UNWRITTEN	(1 << 1)
@@ -535,11 +417,10 @@ struct scoutfs_data_extent_val {
 * part item and overflow into the values of the rest of the part items.
 */
 struct scoutfs_xattr {
-	__le16 val_len;
 	__u8 name_len;
-	__u8 __pad[5];
+	__le16 val_len;
 	__u8 name[0];
-};
+} __packed;


 /* XXX does this exist upstream somewhere? */
@@ -579,51 +460,47 @@ struct scoutfs_quorum_block {
 	__le64 vote_for_rid;
 	__le32 crc;
 	__u8 log_nr;
-	__u8 __pad[3];
 	struct scoutfs_quorum_log {
 		__le64 term;
 		__le64 rid;
 		struct scoutfs_inet_addr addr;
-	} log[0];
-};
+	} __packed log[0];
+} __packed;

-#define SCOUTFS_QUORUM_LOG_MAX						  \
-	((SCOUTFS_BLOCK_SM_SIZE - sizeof(struct scoutfs_quorum_block)) /  \
+#define SCOUTFS_QUORUM_LOG_MAX						\
+	((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_quorum_block)) /	\
 		sizeof(struct scoutfs_quorum_log))

-#define SCOUTFS_FLAG_IS_META_BDEV 0x01
-
 struct scoutfs_super_block {
 	struct scoutfs_block_header hdr;
 	__le64 id;
 	__le64 format_hash;
-	__le64 flags;
 	__u8 uuid[SCOUTFS_UUID_BYTES];
 	__le64 next_ino;
 	__le64 next_trans_seq;
 	__le64 total_meta_blocks;	/* both static and dynamic */
 	__le64 first_meta_blkno;	/* first dynamically allocated */
 	__le64 last_meta_blkno;
+	__le64 free_meta_blocks;
 	__le64 total_data_blocks;
 	__le64 first_data_blkno;
 	__le64 last_data_blkno;
+	__le64 free_data_blocks;
 	__le64 quorum_fenced_term;
 	__le64 quorum_server_term;
 	__le64 unmount_barrier;
 	__u8 quorum_count;
-	__u8 __pad[7];
 	struct scoutfs_inet_addr server_addr;
-	struct scoutfs_alloc_root meta_alloc[2];
-	struct scoutfs_alloc_root data_alloc;
-	struct scoutfs_alloc_list_head server_meta_avail[2];
-	struct scoutfs_alloc_list_head server_meta_freed[2];
+	struct scoutfs_radix_root core_meta_avail;
+	struct scoutfs_radix_root core_meta_freed;
+	struct scoutfs_radix_root core_data_avail;
+	struct scoutfs_radix_root core_data_freed;
 	struct scoutfs_btree_root fs_root;
 	struct scoutfs_btree_root logs_root;
 	struct scoutfs_btree_root lock_clients;
 	struct scoutfs_btree_root trans_seqs;
 	struct scoutfs_btree_root mounted_clients;
-	struct scoutfs_btree_root srch_root;
-};
+} __packed;

 #define SCOUTFS_ROOT_INO 1

@@ -672,7 +549,7 @@ struct scoutfs_inode {
 	struct scoutfs_timespec atime;
 	struct scoutfs_timespec ctime;
 	struct scoutfs_timespec mtime;
-};
+} __packed;

 #define SCOUTFS_INO_FLAG_TRUNCATE 0x1

@@ -694,9 +571,8 @@ struct scoutfs_dirent {
 	__le64 hash;
 	__le64 pos;
 	__u8 type;
-	__u8 __pad[7];
 	__u8 name[0];
-};
+} __packed;

 #define SCOUTFS_NAME_LEN 255

@@ -708,7 +584,7 @@ struct scoutfs_dirent {
 /* getdents returns next pos with an entry, no entry at (f_pos)~0 */
 #define SCOUTFS_DIRENT_LAST_POS (U64_MAX - 1)

-enum scoutfs_dentry_type {
+enum {
 	SCOUTFS_DT_FIFO = 0,
 	SCOUTFS_DT_CHR,
 	SCOUTFS_DT_DIR,
@@ -764,7 +640,7 @@ struct scoutfs_net_greeting {
 	__le64 unmount_barrier;
 	__le64 rid;
 	__le64 flags;
-};
+} __packed;

 #define SCOUTFS_NET_GREETING_FLAG_FAREWELL	(1 << 0)
 #define SCOUTFS_NET_GREETING_FLAG_VOTER		(1 << 1)
@@ -799,25 +675,22 @@ struct scoutfs_net_header {
 	__u8 cmd;
 	__u8 flags;
 	__u8 error;
-	__u8 __pad[3];
 	__u8 data[0];
-};
+} __packed;

 #define SCOUTFS_NET_FLAG_RESPONSE	(1 << 0)
 #define SCOUTFS_NET_FLAGS_UNKNOWN	(U8_MAX << 1)

-enum scoutfs_net_cmd {
+enum {
 	SCOUTFS_NET_CMD_GREETING = 0,
 	SCOUTFS_NET_CMD_ALLOC_INODES,
 	SCOUTFS_NET_CMD_GET_LOG_TREES,
 	SCOUTFS_NET_CMD_COMMIT_LOG_TREES,
-	SCOUTFS_NET_CMD_GET_ROOTS,
 	SCOUTFS_NET_CMD_ADVANCE_SEQ,
 	SCOUTFS_NET_CMD_GET_LAST_SEQ,
+	SCOUTFS_NET_CMD_STATFS,
 	SCOUTFS_NET_CMD_LOCK,
 	SCOUTFS_NET_CMD_LOCK_RECOVER,
-	SCOUTFS_NET_CMD_SRCH_GET_COMPACT,
-	SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT,
 	SCOUTFS_NET_CMD_FAREWELL,
 	SCOUTFS_NET_CMD_UNKNOWN,
 };
@@ -836,7 +709,7 @@ enum scoutfs_net_cmd {

 #undef EXPAND_NET_ERRNO
 #define EXPAND_NET_ERRNO(which) SCOUTFS_NET_ERR_##which,
-enum scoutfs_net_errors {
+enum {
 	SCOUTFS_NET_ERR_NONE = 0,
 	EXPAND_EACH_NET_ERRNO
 	SCOUTFS_NET_ERR_UNKNOWN,
@@ -852,39 +725,33 @@ enum scoutfs_net_errors {
 struct scoutfs_net_inode_alloc {
 	__le64 ino;
 	__le64 nr;
-};
+} __packed;

-struct scoutfs_net_roots {
-	struct scoutfs_btree_root fs_root;
-	struct scoutfs_btree_root logs_root;
-	struct scoutfs_btree_root srch_root;
-};
+struct scoutfs_net_statfs {
+	__le64 total_blocks;		/* total blocks in device */
+	__le64 next_ino;		/* next unused inode number */
+	__le64 bfree;			/* free blocks */
+	__u8 uuid[SCOUTFS_UUID_BYTES];	/* logical volume uuid */
+} __packed;

 struct scoutfs_net_lock {
 	struct scoutfs_key key;
 	__le64 write_version;
 	__u8 old_mode;
 	__u8 new_mode;
-	__u8 __pad[6];
-};
-
-struct scoutfs_net_lock_grant_response {
-	struct scoutfs_net_lock nl;
-	struct scoutfs_net_roots roots;
-};
+} __packed;

 struct scoutfs_net_lock_recover {
 	__le16 nr;
-	__u8 __pad[6];
 	struct scoutfs_net_lock locks[0];
-};
+} __packed;

 #define SCOUTFS_NET_LOCK_MAX_RECOVER_NR					       \
 	((SCOUTFS_NET_MAX_DATA_LEN - sizeof(struct scoutfs_net_lock_recover)) /\
 	 sizeof(struct scoutfs_net_lock))

 /* some enums for tracing */
-enum scoutfs_lock_trace {
+enum {
 	SLT_CLIENT,
 	SLT_SERVER,
 	SLT_GRANT,
@@ -905,7 +772,7 @@ enum scoutfs_lock_trace {
 *
 * The null mode provides no access and is used to destroy locks.
 */
-enum scoutfs_lock_mode {
+enum {
 	SCOUTFS_LOCK_NULL = 0,
 	SCOUTFS_LOCK_READ,
 	SCOUTFS_LOCK_WRITE,
@@ -920,7 +787,7 @@ enum scoutfs_lock_mode {
 struct scoutfs_fid {
 	__le64 ino;
 	__le64 parent_ino;
-};
+} __packed;

 #define FILEID_SCOUTFS			0x81
 #define FILEID_SCOUTFS_WITH_PARENT	0x82
@@ -928,7 +795,7 @@ struct scoutfs_fid {
 /*
 * Identifiers for sources of corruption that can generate messages.
 */
-enum scoutfs_corruption_sources {
+enum {
 	SC_DIRENT_NAME_LEN = 0,
 	SC_DIRENT_BACKREF_NAME_LEN,
 	SC_DIRENT_READDIR_NAME_LEN,
@@ -1,49 +1,15 @@
 #ifndef _SCOUTFS_HASH_H_
 #define _SCOUTFS_HASH_H_

-/*
- * We're using FNV1a for now.  It's fine.  Ish.
- *
- * The longer term plan is xxh3 but it looks like it'll take just a bit
- * more time to be declared stable and then it needs to be ported to the
- * kernel.
- *
- *  - https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
- *  - https://github.com/Cyan4973/xxHash/releases/tag/v0.7.4
- */
-
-static inline u32 fnv1a32(const void *data, unsigned int len)
-{
-	u32 hash = 0x811c9dc5;
-
-	while (len--) {
-		hash ^= *(u8 *)(data++);
-		hash *= 0x01000193;
-	}
-
-	return hash;
-}
-
-static inline u64 fnv1a64(const void *data, unsigned int len)
-{
-	u64 hash = 0xcbf29ce484222325ULL;
-
-	while (len--) {
-		hash ^= *(u8 *)(data++);
-		hash *= 0x100000001b3ULL;
-	}
-
-	return hash;
-}
-
-static inline u32 scoutfs_hash32(const void *data, unsigned int len)
-{
-	return fnv1a32(data, len);
-}
+#include <linux/crc32c.h>

+/* XXX replace with xxhash; the two halves share a byte when len is odd */
 static inline u64 scoutfs_hash64(const void *data, unsigned int len)
 {
-	return fnv1a64(data, len);
+	unsigned int half = (len + 1) / 2;
+
+	return crc32c(~0, data, half) |
+	       ((u64)crc32c(~0, data + len - half, half) << 32);
 }

 #endif
@@ -30,7 +30,8 @@
 #include "xattr.h"
 #include "trans.h"
 #include "msg.h"
-#include "item.h"
+#include "kvec.h"
+#include "forest.h"
 #include "client.h"
 #include "cmp.h"

@@ -46,17 +47,9 @@
 *  - describe data locking size problems
 */

-struct inode_allocator {
-	spinlock_t lock;
-	u64 ino;
-	u64 nr;
-};
-
 struct inode_sb_info {
 	spinlock_t writeback_lock;
 	struct rb_root writeback_inodes;
-	struct inode_allocator dir_ino_alloc;
-	struct inode_allocator ino_alloc;
 };

 #define DECLARE_INODE_SB_INFO(sb, name) \
@@ -71,30 +64,31 @@ static struct kmem_cache *scoutfs_inode_cachep;
 */
 static void scoutfs_inode_ctor(void *obj)
 {
-	struct scoutfs_inode_info *si = obj;
+	struct scoutfs_inode_info *ci = obj;

-	init_rwsem(&si->extent_sem);
-	mutex_init(&si->item_mutex);
-	seqcount_init(&si->seqcount);
-	si->staging = false;
-	scoutfs_per_task_init(&si->pt_data_lock);
-	atomic64_set(&si->data_waitq.changed, 0);
-	init_waitqueue_head(&si->data_waitq.waitq);
-	init_rwsem(&si->xattr_rwsem);
-	RB_CLEAR_NODE(&si->writeback_node);
+	mutex_init(&ci->s_i_mutex);
+	mutex_init(&ci->item_mutex);
+	seqcount_init(&ci->seqcount);
+	ci->staging = false;
+	scoutfs_per_task_init(&ci->pt_data_lock);
+	atomic64_set(&ci->data_waitq.changed, 0);
+	init_waitqueue_head(&ci->data_waitq.waitq);
+	init_rwsem(&ci->xattr_rwsem);
+	RB_CLEAR_NODE(&ci->writeback_node);
+	spin_lock_init(&ci->ino_alloc.lock);

-	inode_init_once(&si->inode);
+	inode_init_once(&ci->inode);
 }

 struct inode *scoutfs_alloc_inode(struct super_block *sb)
 {
-	struct scoutfs_inode_info *si;
+	struct scoutfs_inode_info *ci;

-	si = kmem_cache_alloc(scoutfs_inode_cachep, GFP_NOFS);
-	if (!si)
+	ci = kmem_cache_alloc(scoutfs_inode_cachep, GFP_NOFS);
+	if (!ci)
 		return NULL;

-	return &si->inode;
+	return &ci->inode;
 }

 static void scoutfs_i_callback(struct rcu_head *head)
@@ -222,7 +216,7 @@ static void set_item_info(struct scoutfs_inode_info *si,

 static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
 {
-	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);

 	i_size_write(inode, le64_to_cpu(cinode->size));
 	set_nlink(inode, le32_to_cpu(cinode->nlink));
@@ -237,23 +231,23 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
 	inode->i_ctime.tv_sec = le64_to_cpu(cinode->ctime.sec);
 	inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec);

-	si->meta_seq = le64_to_cpu(cinode->meta_seq);
-	si->data_seq = le64_to_cpu(cinode->data_seq);
-	si->data_version = le64_to_cpu(cinode->data_version);
-	si->online_blocks = le64_to_cpu(cinode->online_blocks);
-	si->offline_blocks = le64_to_cpu(cinode->offline_blocks);
-	si->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos);
-	si->next_xattr_id = le64_to_cpu(cinode->next_xattr_id);
-	si->flags = le32_to_cpu(cinode->flags);
+	ci->meta_seq = le64_to_cpu(cinode->meta_seq);
+	ci->data_seq = le64_to_cpu(cinode->data_seq);
+	ci->data_version = le64_to_cpu(cinode->data_version);
+	ci->online_blocks = le64_to_cpu(cinode->online_blocks);
+	ci->offline_blocks = le64_to_cpu(cinode->offline_blocks);
+	ci->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos);
+	ci->next_xattr_id = le64_to_cpu(cinode->next_xattr_id);
+	ci->flags = le32_to_cpu(cinode->flags);

 	/*
 	 * i_blocks is initialized from online and offline and is then
 	 * maintained as blocks come and go.
 	 */
-	inode->i_blocks = (si->online_blocks + si->offline_blocks)
-				<< SCOUTFS_BLOCK_SM_SECTOR_SHIFT;
+	inode->i_blocks = (ci->online_blocks + ci->offline_blocks)
+				<< SCOUTFS_BLOCK_SECTOR_SHIFT;

-	set_item_info(si, cinode);
+	set_item_info(ci, cinode);
 }

 static void init_inode_key(struct scoutfs_key *key, u64 ino)
@@ -283,6 +277,7 @@ int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock,
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_key key;
 	struct scoutfs_inode sinode;
+	struct kvec val;
 	const u64 refresh_gen = lock->refresh_gen;
 	int ret;

@@ -298,11 +293,11 @@ int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock,
 		return 0;

 	init_inode_key(&key, scoutfs_ino(inode));
+	kvec_init(&val, &sinode, sizeof(sinode));

 	mutex_lock(&si->item_mutex);
 	if (atomic64_read(&si->last_refreshed) < refresh_gen) {
-		ret = scoutfs_item_lookup_exact(sb, &key, &sinode,
-						sizeof(sinode), lock);
+		ret = scoutfs_forest_lookup_exact(sb, &key, &val, lock);
 		if (ret == 0) {
 			load_inode(inode, &sinode);
 			atomic64_set(&si->last_refreshed, refresh_gen);
@@ -335,7 +330,7 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
 			  u64 new_size, bool truncate)
 {
-	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	LIST_HEAD(ind_locks);
 	int ret;
@@ -354,7 +349,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
 	truncate_setsize(inode, new_size);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	if (truncate)
-		si->flags |= SCOUTFS_INO_FLAG_TRUNCATE;
+		ci->flags |= SCOUTFS_INO_FLAG_TRUNCATE;
 	scoutfs_inode_set_data_seq(inode);
 	scoutfs_update_inode_item(inode, lock, &ind_locks);

@@ -366,7 +361,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,

 static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)
 {
-	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	LIST_HEAD(ind_locks);
 	int ret;
@@ -376,7 +371,7 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)
 	if (ret)
 		return ret;

-	si->flags &= ~SCOUTFS_INO_FLAG_TRUNCATE;
+	ci->flags &= ~SCOUTFS_INO_FLAG_TRUNCATE;
 	scoutfs_update_inode_item(inode, lock, &ind_locks);

 	scoutfs_release_trans(sb);
@@ -387,17 +382,16 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)

 int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock)
 {
-	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
 	u64 start;
 	int ret, err;

-	trace_scoutfs_complete_truncate(inode, si->flags);
+	trace_scoutfs_complete_truncate(inode, ci->flags);

-	if (!(si->flags & SCOUTFS_INO_FLAG_TRUNCATE))
+	if (!(ci->flags & SCOUTFS_INO_FLAG_TRUNCATE))
 		return 0;

-	start = (i_size_read(inode) + SCOUTFS_BLOCK_SM_SIZE - 1) >>
-		SCOUTFS_BLOCK_SM_SHIFT;
+	start = (i_size_read(inode) + SCOUTFS_BLOCK_SIZE - 1) >> SCOUTFS_BLOCK_SHIFT;
 	ret = scoutfs_data_truncate_items(inode->i_sb, inode,
 					  scoutfs_ino(inode), start, ~0ULL,
 					  false, lock);
@@ -419,6 +413,7 @@ int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock)
 int scoutfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_lock *lock = NULL;
 	DECLARE_DATA_WAIT(dw);
@@ -429,6 +424,7 @@ int scoutfs_setattr(struct dentry *dentry, struct iattr *attr)

 	trace_scoutfs_setattr(dentry, attr);

+	mutex_lock(&si->s_i_mutex);
 retry:
 	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
 				 SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
@@ -463,9 +459,11 @@ retry:
 				scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);

 				/* XXX callee locks instead? */
+				mutex_unlock(&si->s_i_mutex);
 				mutex_unlock(&inode->i_mutex);
 				ret = scoutfs_data_wait(inode, &dw);
 				mutex_lock(&inode->i_mutex);
+				mutex_lock(&si->s_i_mutex);

 				if (ret == 0)
 					goto retry;
@@ -499,6 +497,7 @@ retry:
 	scoutfs_inode_index_unlock(sb, &ind_locks);
 out:
 	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
+	mutex_unlock(&si->s_i_mutex);
 	return ret;
 }

@@ -580,7 +579,7 @@ void scoutfs_inode_add_onoff(struct inode *inode, s64 on, s64 off)
 		si->online_blocks += on;
 		si->offline_blocks += off;
 		/* XXX not sure if this is right */
-		inode->i_blocks += (on + off) * SCOUTFS_BLOCK_SM_SECTORS;
+		inode->i_blocks += (on + off) * SCOUTFS_BLOCK_SECTORS;

 		trace_scoutfs_online_offline_blocks(inode, on, off,
 						    si->online_blocks,
@@ -644,19 +643,19 @@ void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off)

 static int scoutfs_iget_test(struct inode *inode, void *arg)
 {
-	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
 	u64 *ino = arg;

-	return si->ino == *ino;
+	return ci->ino == *ino;
 }

 static int scoutfs_iget_set(struct inode *inode, void *arg)
 {
-	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
 	u64 *ino = arg;

 	inode->i_ino = *ino;
-	si->ino = *ino;
+	ci->ino = *ino;

 	return 0;
 }
@@ -688,6 +687,8 @@ struct inode *scoutfs_iget(struct super_block *sb, u64 ino)
 		/* XXX ensure refresh, instead clear in drop_inode? */
 		si = SCOUTFS_I(inode);
 		atomic64_set(&si->last_refreshed, 0);
+		si->ino_alloc.ino = 0;
+		si->ino_alloc.nr = 0;

 		ret = scoutfs_inode_refresh(inode, lock, 0);
 		if (ret) {
@@ -706,7 +707,7 @@ out:

 static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
 {
-	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
+	struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
 	u64 online_blocks;
 	u64 offline_blocks;

@@ -720,22 +721,19 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
 	cinode->rdev = cpu_to_le32(inode->i_rdev);
 	cinode->atime.sec = cpu_to_le64(inode->i_atime.tv_sec);
 	cinode->atime.nsec = cpu_to_le32(inode->i_atime.tv_nsec);
-	memset(cinode->atime.__pad, 0, sizeof(cinode->atime.__pad));
 	cinode->ctime.sec = cpu_to_le64(inode->i_ctime.tv_sec);
 	cinode->ctime.nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
-	memset(cinode->ctime.__pad, 0, sizeof(cinode->ctime.__pad));
 	cinode->mtime.sec = cpu_to_le64(inode->i_mtime.tv_sec);
 	cinode->mtime.nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-	memset(cinode->mtime.__pad, 0, sizeof(cinode->mtime.__pad));

 	cinode->meta_seq = cpu_to_le64(scoutfs_inode_meta_seq(inode));
 	cinode->data_seq = cpu_to_le64(scoutfs_inode_data_seq(inode));
 	cinode->data_version = cpu_to_le64(scoutfs_inode_data_version(inode));
 	cinode->online_blocks = cpu_to_le64(online_blocks);
 	cinode->offline_blocks = cpu_to_le64(offline_blocks);
-	cinode->next_readdir_pos = cpu_to_le64(si->next_readdir_pos);
-	cinode->next_xattr_id = cpu_to_le64(si->next_xattr_id);
-	cinode->flags = cpu_to_le32(si->flags);
+	cinode->next_readdir_pos = cpu_to_le64(ci->next_readdir_pos);
+	cinode->next_xattr_id = cpu_to_le64(ci->next_xattr_id);
+	cinode->flags = cpu_to_le32(ci->flags);
 }

 /*
@@ -761,13 +759,15 @@ int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock)
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_inode sinode;
 	struct scoutfs_key key;
+	struct kvec val;
 	int ret;

 	store_inode(&sinode, inode);
+	kvec_init(&val, &sinode, sizeof(sinode));

 	init_inode_key(&key, scoutfs_ino(inode));

-	ret = scoutfs_item_update(sb, &key, &sinode, sizeof(sinode), lock);
+	ret = scoutfs_forest_update(sb, &key, &val, lock);
 	if (!ret)
 		trace_scoutfs_dirty_inode(inode);
 	return ret;
@@ -899,7 +899,7 @@ static int update_index_items(struct super_block *sb,
 	scoutfs_inode_init_index_key(&ins, type, major, minor, ino);

 	ins_lock = find_index_lock(lock_list, type, major, minor, ino);
-	ret = scoutfs_item_create_force(sb, &ins, NULL, 0, ins_lock);
+	ret = scoutfs_forest_create_force(sb, &ins, NULL, ins_lock);
 	if (ret || !will_del_index(si, type, major, minor))
 		return ret;

@@ -911,9 +911,9 @@ static int update_index_items(struct super_block *sb,

 	del_lock = find_index_lock(lock_list, type, si->item_majors[type],
 				   si->item_minors[type], ino);
-	ret = scoutfs_item_delete_force(sb, &del, del_lock);
+	ret = scoutfs_forest_delete_force(sb, &del, del_lock);
 	if (ret) {
-		err = scoutfs_item_delete(sb, &ins, ins_lock);
+		err = scoutfs_forest_delete(sb, &ins, ins_lock);
 		BUG_ON(err);
 	}

@@ -972,6 +972,7 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock,
 	const u64 ino = scoutfs_ino(inode);
 	struct scoutfs_key key;
 	struct scoutfs_inode sinode;
+	struct kvec val;
 	int ret;
 	int err;

@@ -987,8 +988,9 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock,
 	BUG_ON(ret);

 	init_inode_key(&key, ino);
+	kvec_init(&val, &sinode, sizeof(sinode));

-	err = scoutfs_item_update(sb, &key, &sinode, sizeof(sinode), lock);
+	err = scoutfs_forest_update(sb, &key, &val, lock);
 	if (err) {
 		scoutfs_err(sb, "inode %llu update err %d", ino, err);
 		BUG_ON(err);
@@ -1263,7 +1265,7 @@ static int remove_index(struct super_block *sb, u64 ino, u8 type, u64 major,
 	scoutfs_inode_init_index_key(&key, type, major, minor, ino);

 	lock = find_index_lock(ind_locks, type, major, minor, ino);
-	ret = scoutfs_item_delete_force(sb, &key, lock);
+	ret = scoutfs_forest_delete_force(sb, &key, lock);
 	if (ret == -ENOENT)
 		ret = 0;
 	return ret;
@@ -1325,16 +1327,14 @@ u64 scoutfs_last_ino(struct super_block *sb)
 * minimize that loss while still being large enough for typical
 * directory file counts.
 */
-int scoutfs_alloc_ino(struct super_block *sb, bool is_dir, u64 *ino_ret)
+int scoutfs_alloc_ino(struct inode *parent, u64 *ino_ret)
 {
-	DECLARE_INODE_SB_INFO(sb, inf);
-	struct inode_allocator *ia;
+	struct scoutfs_inode_allocator *ia = &SCOUTFS_I(parent)->ino_alloc;
+	struct super_block *sb = parent->i_sb;
 	u64 ino;
 	u64 nr;
 	int ret;

-	ia = is_dir ? &inf->dir_ino_alloc : &inf->ino_alloc;
-
 	spin_lock(&ia->lock);

 	if (ia->nr == 0) {
@@ -1369,26 +1369,29 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
 				umode_t mode, dev_t rdev, u64 ino,
 				struct scoutfs_lock *lock)
 {
-	struct scoutfs_inode_info *si;
+	struct scoutfs_inode_info *ci;
 	struct scoutfs_key key;
 	struct scoutfs_inode sinode;
 	struct inode *inode;
+	struct kvec val;
 	int ret;

 	inode = new_inode(sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);

-	si = SCOUTFS_I(inode);
-	si->ino = ino;
-	si->data_version = 0;
-	si->online_blocks = 0;
-	si->offline_blocks = 0;
-	si->next_readdir_pos = SCOUTFS_DIRENT_FIRST_POS;
-	si->next_xattr_id = 0;
-	si->have_item = false;
-	atomic64_set(&si->last_refreshed, lock->refresh_gen);
-	si->flags = 0;
+	ci = SCOUTFS_I(inode);
+	ci->ino = ino;
+	ci->data_version = 0;
+	ci->online_blocks = 0;
+	ci->offline_blocks = 0;
+	ci->next_readdir_pos = SCOUTFS_DIRENT_FIRST_POS;
+	ci->next_xattr_id = 0;
+	ci->have_item = false;
+	atomic64_set(&ci->last_refreshed, lock->refresh_gen);
+	ci->flags = 0;
+	ci->ino_alloc.ino = 0;
+	ci->ino_alloc.nr = 0;

 	scoutfs_inode_set_meta_seq(inode);
 	scoutfs_inode_set_data_seq(inode);
@@ -1402,8 +1405,9 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,

 	store_inode(&sinode, inode);
 	init_inode_key(&key, scoutfs_ino(inode));
+	kvec_init(&val, &sinode, sizeof(sinode));

-	ret = scoutfs_item_create(sb, &key, &sinode, sizeof(sinode), lock);
+	ret = scoutfs_forest_create(sb, &key, &val, lock);
 	if (ret) {
 		iput(inode);
 		return ERR_PTR(ret);
@@ -1431,7 +1435,7 @@ static int remove_orphan_item(struct super_block *sb, u64 ino)

 	init_orphan_key(&key, sbi->rid, ino);

-	ret = scoutfs_item_delete(sb, &key, lock);
+	ret = scoutfs_forest_delete(sb, &key, lock);
 	if (ret == -ENOENT)
 		ret = 0;

@@ -1453,6 +1457,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino)
 	struct scoutfs_key key;
 	LIST_HEAD(ind_locks);
 	bool release = false;
+	struct kvec val;
 	umode_t mode;
 	u64 ind_seq;
 	u64 size;
@@ -1463,9 +1468,9 @@ static int delete_inode_items(struct super_block *sb, u64 ino)
 		return ret;

 	init_inode_key(&key, ino);
+	kvec_init(&val, &sinode, sizeof(sinode));

-	ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode),
-					lock);
+	ret = scoutfs_forest_lookup_exact(sb, &key, &val, lock);
 	if (ret < 0) {
 		if (ret == -ENOENT)
 			ret = 0;
@@ -1518,7 +1523,7 @@ retry:
 			goto out;
 	}

-	ret = scoutfs_item_delete(sb, &key, lock);
+	ret = scoutfs_forest_delete(sb, &key, lock);
 	if (ret)
 		goto out;

@@ -1587,7 +1592,7 @@ int scoutfs_scan_orphans(struct super_block *sb)
 	init_orphan_key(&last, sbi->rid, ~0ULL);

 	while (1) {
-		ret = scoutfs_item_next(sb, &key, &last, NULL, 0, lock);
+		ret = scoutfs_forest_next(sb, &key, &last, NULL, lock);
 		if (ret == -ENOENT) /* No more orphan items */
 			break;
 		if (ret < 0)
@@ -1621,7 +1626,7 @@ int scoutfs_orphan_inode(struct inode *inode)

 	init_orphan_key(&key, sbi->rid, scoutfs_ino(inode));

-	ret = scoutfs_item_create(sb, &key, NULL, 0, lock);
+	ret = scoutfs_forest_create(sb, &key, NULL, lock);

 	return ret;
 }
@@ -1725,8 +1730,6 @@ int scoutfs_inode_setup(struct super_block *sb)

 	spin_lock_init(&inf->writeback_lock);
 	inf->writeback_inodes = RB_ROOT;
-	spin_lock_init(&inf->dir_ino_alloc.lock);
-	spin_lock_init(&inf->ino_alloc.lock);

 	sbi->inode_sb_info = inf;

@@ -10,6 +10,12 @@

 struct scoutfs_lock;

+struct scoutfs_inode_allocator {
+	spinlock_t lock;
+	u64 ino;
+	u64 nr;
+};
+
 struct scoutfs_inode_info {
 	/* read or initialized for each inode instance */
 	u64 ino;
@@ -22,13 +28,11 @@ struct scoutfs_inode_info {
 	u64 offline_blocks;
 	u32 flags;

-	/*
-	 * Protects per-inode extent items, most particularly readers
-	 * who want to serialize writers without holding i_mutex. (only
-	 * used in data.c, it's the only place that understands file
-	 * extent items)
+	/* We can't use inode->i_mutex to protect i_dio_count due to lock
+	 * ordering in the kernel between i_mutex and mmap_sem.  Use this
+	 * as an inner lock.
 	 */
-	struct rw_semaphore extent_sem;
+	struct mutex s_i_mutex;

 	/*
 	 * The in-memory item info caches the current index item values
@@ -44,6 +48,9 @@ struct scoutfs_inode_info {
 	/* updated at on each new lock acquisition */
 	atomic64_t last_refreshed;

+	/* reset for every new inode instance */
+	struct scoutfs_inode_allocator ino_alloc;
+
 	/* initialized once for slab object */
 	seqcount_t seqcount;
 	bool staging;			/* holder of i_mutex is staging */
@@ -94,7 +101,7 @@ int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock);
 void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock,
 			       struct list_head *ind_locks);

-int scoutfs_alloc_ino(struct super_block *sb, bool is_dir, u64 *ino_ret);
+int scoutfs_alloc_ino(struct inode *parent, u64 *ino);
 struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
 				umode_t mode, dev_t rdev, u64 ino,
 				struct scoutfs_lock *lock);
@@ -12,7 +12,6 @@
 */
 #include <linux/kernel.h>
 #include <linux/fs.h>
-#include <linux/file.h>
 #include <linux/uaccess.h>
 #include <linux/compiler.h>
 #include <linux/uio.h>
@@ -28,7 +27,6 @@
 #include "ioctl.h"
 #include "super.h"
 #include "inode.h"
-#include "item.h"
 #include "forest.h"
 #include "data.h"
 #include "client.h"
@@ -36,8 +34,6 @@
 #include "trans.h"
 #include "xattr.h"
 #include "hash.h"
-#include "srch.h"
-#include "alloc.h"
 #include "scoutfs_trace.h"

 /*
@@ -113,7 +109,7 @@ static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg)

 	for (nr = 0; nr < walk.nr_entries; ) {

-		ret = scoutfs_item_next(sb, &key, &last_key, NULL, 0, lock);
+		ret = scoutfs_forest_next(sb, &key, &last_key, NULL, lock);
 		if (ret < 0 && ret != -ENOENT)
 			break;

@@ -272,11 +268,12 @@ out:
 static long scoutfs_ioc_release(struct file *file, unsigned long arg)
 {
 	struct inode *inode = file_inode(file);
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_ioctl_release args;
 	struct scoutfs_lock *lock = NULL;
-	u64 sblock;
-	u64 eblock;
+	loff_t start;
+	loff_t end_inc;
 	u64 online;
 	u64 offline;
 	u64 isize;
@@ -287,11 +284,9 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)

 	trace_scoutfs_ioc_release(sb, scoutfs_ino(inode), &args);

-	if (args.length == 0)
+	if (args.count == 0)
 		return 0;
-	if (((args.offset + args.length) < args.offset) ||
-	    (args.offset & SCOUTFS_BLOCK_SM_MASK) ||
-	    (args.length & SCOUTFS_BLOCK_SM_MASK))
+	if ((args.block + args.count) < args.block)
 		return -EINVAL;


@@ -300,6 +295,7 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
 		return ret;

 	mutex_lock(&inode->i_mutex);
+	mutex_lock(&si->s_i_mutex);

 	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
 				 SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
@@ -324,30 +320,30 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
 	inode_dio_wait(inode);

 	/* drop all clean and dirty cached blocks in the range */
-	truncate_inode_pages_range(&inode->i_data, args.offset,
-				   args.offset + args.length - 1);
+	start = args.block << SCOUTFS_BLOCK_SHIFT;
+	end_inc = ((args.block + args.count) << SCOUTFS_BLOCK_SHIFT) - 1;
+	truncate_inode_pages_range(&inode->i_data, start, end_inc);

-	sblock = args.offset >> SCOUTFS_BLOCK_SM_SHIFT;
-	eblock = (args.offset + args.length - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
 	ret = scoutfs_data_truncate_items(sb, inode, scoutfs_ino(inode),
-					  sblock,
-					  eblock, true,
+					  args.block,
+					  args.block + args.count - 1, true,
 					  lock);
 	if (ret == 0) {
 		scoutfs_inode_get_onoff(inode, &online, &offline);
 		isize = i_size_read(inode);
 		if (online == 0 && isize) {
-			sblock = (isize + SCOUTFS_BLOCK_SM_SIZE - 1)
-					>> SCOUTFS_BLOCK_SM_SHIFT;
+			start = (isize + SCOUTFS_BLOCK_SIZE - 1)
+					>> SCOUTFS_BLOCK_SHIFT;
 			ret = scoutfs_data_truncate_items(sb, inode,
 							  scoutfs_ino(inode),
-							  sblock, U64_MAX,
+							  start, U64_MAX,
 							  false, lock);
 		}
 	}

 out:
 	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
+	mutex_unlock(&si->s_i_mutex);
 	mutex_unlock(&inode->i_mutex);
 	mnt_drop_write_file(file);

@@ -360,6 +356,7 @@ static long scoutfs_ioc_data_wait_err(struct file *file, unsigned long arg)
 	struct super_block *sb = file_inode(file)->i_sb;
 	struct scoutfs_ioctl_data_wait_err args;
 	struct scoutfs_lock *lock = NULL;
+	struct scoutfs_inode_info *si;
 	struct inode *inode = NULL;
 	u64 sblock;
 	u64 eblock;
@@ -378,8 +375,8 @@ static long scoutfs_ioc_data_wait_err(struct file *file, unsigned long arg)

 	trace_scoutfs_ioc_data_wait_err(sb, &args);

-	sblock = args.offset >> SCOUTFS_BLOCK_SM_SHIFT;
-	eblock = (args.offset + args.count - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
+	sblock = args.offset >> SCOUTFS_BLOCK_SHIFT;
+	eblock = (args.offset + args.count - 1) >> SCOUTFS_BLOCK_SHIFT;

 	if (sblock > eblock)
 		return -EINVAL;
@@ -390,7 +387,9 @@ static long scoutfs_ioc_data_wait_err(struct file *file, unsigned long arg)
 		goto out;
 	}

+	si = SCOUTFS_I(inode);
 	mutex_lock(&inode->i_mutex);
+	mutex_lock(&si->s_i_mutex);

 	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
 				 SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
@@ -408,6 +407,7 @@ static long scoutfs_ioc_data_wait_err(struct file *file, unsigned long arg)

 	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
 unlock:
+	mutex_unlock(&si->s_i_mutex);
 	mutex_unlock(&inode->i_mutex);
 	iput(inode);
 out:
@@ -463,24 +463,23 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg)

 	trace_scoutfs_ioc_stage(sb, scoutfs_ino(inode), &args);

-	end_size = args.offset + args.length;
+	end_size = args.offset + args.count;

 	/* verify arg constraints that aren't dependent on file */
-	if (args.length < 0 || (end_size < args.offset) ||
-	    args.offset & SCOUTFS_BLOCK_SM_MASK) {
+	if (args.count < 0 || (end_size < args.offset) ||
+	    args.offset & SCOUTFS_BLOCK_MASK)
 		return -EINVAL;
-	}

-	if (args.length == 0)
+	if (args.count == 0)
 		return 0;

 	/* the iocb is really only used for the file pointer :P */
 	init_sync_kiocb(&kiocb, file);
 	kiocb.ki_pos = args.offset;
-	kiocb.ki_left = args.length;
-	kiocb.ki_nbytes = args.length;
+	kiocb.ki_left = args.count;
+	kiocb.ki_nbytes = args.count;
 	iov.iov_base = (void __user *)(unsigned long)args.buf_ptr;
-	iov.iov_len = args.length;
+	iov.iov_len = args.count;

 	ret = mnt_want_write_file(file);
 	if (ret)
@@ -502,7 +501,7 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg)
 	    (file->f_flags & (O_APPEND | O_DIRECT | O_DSYNC)) ||
 	    IS_SYNC(file->f_mapping->host) ||
 	    (end_size > isize) ||
-	    ((end_size & SCOUTFS_BLOCK_SM_MASK) && (end_size != isize))) {
+	    ((end_size & SCOUTFS_BLOCK_MASK) && (end_size != isize))) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -519,11 +518,11 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg)
 	written = 0;
 	do {
 		ret = generic_file_buffered_write(&kiocb, &iov, 1, pos, &pos,
-						  args.length, written);
+						  args.count, written);
 		BUG_ON(ret == -EIOCBQUEUED);
 		if (ret > 0)
 			written += ret;
-	} while (ret > 0 && written < args.length);
+	} while (ret > 0 && written < args.count);

 	si->staging = false;
 	current->backing_dev_info = NULL;
@@ -616,6 +615,7 @@ static long scoutfs_ioc_data_waiting(struct file *file, unsigned long arg)
 static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
 {
 	struct inode *inode = file->f_inode;
+	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_ioctl_setattr_more __user *usm = (void __user *)arg;
 	struct scoutfs_ioctl_setattr_more sm;
@@ -651,6 +651,7 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
 		goto out;

 	mutex_lock(&inode->i_mutex);
+	mutex_lock(&si->s_i_mutex);

 	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE,
 				 SCOUTFS_LKF_REFRESH_INODE, inode, &lock);
@@ -693,6 +694,7 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
 unlock:
 	scoutfs_inode_index_unlock(sb, &ind_locks);
 	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
+	mutex_unlock(&si->s_i_mutex);
 	mutex_unlock(&inode->i_mutex);
 	mnt_drop_write_file(file);
 out:
@@ -767,20 +769,18 @@ out:
 * but we don't check that the callers xattr name contains the tag and
 * search for it regardless.
 */
-static long scoutfs_ioc_search_xattrs(struct file *file, unsigned long arg)
+static long scoutfs_ioc_find_xattrs(struct file *file, unsigned long arg)
 {
 	struct super_block *sb = file_inode(file)->i_sb;
-	struct scoutfs_ioctl_search_xattrs __user *usx = (void __user *)arg;
-	struct scoutfs_ioctl_search_xattrs sx;
-	struct scoutfs_xattr_prefix_tags tgs;
-	struct scoutfs_srch_rb_root sroot;
-	struct scoutfs_srch_rb_node *snode;
-	u64 __user *uinos;
-	struct rb_node *node;
+	struct scoutfs_ioctl_find_xattrs __user *ufx = (void __user *)arg;
+	struct scoutfs_ioctl_find_xattrs fx;
+	struct scoutfs_lock *lock = NULL;
+	struct scoutfs_key last;
+	struct scoutfs_key key;
 	char *name = NULL;
-	bool done = false;
-	u64 prev_ino = 0;
-	u64 total = 0;
+	int total = 0;
+	u64 hash;
+	u64 ino;
 	int ret;

 	if (!(file->f_mode & FMODE_READ)) {
@@ -793,73 +793,67 @@ static long scoutfs_ioc_search_xattrs(struct file *file, unsigned long arg)
 		goto out;
 	}

-	if (copy_from_user(&sx, usx, sizeof(sx))) {
+	if (copy_from_user(&fx, ufx, sizeof(fx))) {
 		ret = -EFAULT;
 		goto out;
 	}
-	uinos = (u64 __user *)sx.inodes_ptr;

-	if (sx.name_bytes > SCOUTFS_XATTR_MAX_NAME_LEN) {
+	if (fx.name_bytes > SCOUTFS_XATTR_MAX_NAME_LEN) {
 		ret = -EINVAL;
 		goto out;
 	}

-	if (sx.nr_inodes == 0 || sx.last_ino < sx.next_ino) {
-		ret = 0;
-		goto out;
-	}
-
-	name = kmalloc(sx.name_bytes, GFP_KERNEL);
+	name = kmalloc(fx.name_bytes, GFP_KERNEL);
 	if (!name) {
 		ret = -ENOMEM;
 		goto out;
 	}

-	if (copy_from_user(name, (void __user *)sx.name_ptr, sx.name_bytes)) {
+	if (copy_from_user(name, (void __user *)fx.name_ptr, fx.name_bytes)) {
 		ret = -EFAULT;
 		goto out;
 	}

-	if (scoutfs_xattr_parse_tags(name, sx.name_bytes, &tgs) < 0 ||
-	    !tgs.srch) {
-		ret = -EINVAL;
-		goto out;
-	}
+	hash = scoutfs_hash64(name, fx.name_bytes);
+	scoutfs_xattr_index_key(&key, hash, fx.next_ino, 0);
+	scoutfs_xattr_index_key(&last, hash, U64_MAX, U64_MAX);
+	ino = 0;

-	ret = scoutfs_srch_search_xattrs(sb, &sroot,
-					 scoutfs_hash64(name, sx.name_bytes),
-					 sx.next_ino, sx.last_ino, &done);
+	ret = scoutfs_lock_xattr_index(sb, SCOUTFS_LOCK_READ, 0, hash, &lock);
 	if (ret < 0)
 		goto out;

-	prev_ino = 0;
-	scoutfs_srch_foreach_rb_node(snode, node, &sroot) {
-		if (prev_ino == snode->ino)
-			continue;
+	while (fx.nr_inodes) {

-		if (put_user(snode->ino, uinos + total)) {
-			ret = -EFAULT;
+		ret = scoutfs_forest_next(sb, &key, &last, NULL, lock);
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				ret = 0;
 			break;
 		}
-		prev_ino = snode->ino;

-		if (++total == sx.nr_inodes)
-			break;
+		/* xattrs hashes can collide and add multiple entries */
+		if (le64_to_cpu(key.skxi_ino) != ino) {
+			ino = le64_to_cpu(key.skxi_ino);
+			if (put_user(ino, (u64 __user *)fx.inodes_ptr)) {
+				ret = -EFAULT;
+				break;
+			}
+
+			fx.inodes_ptr += sizeof(u64);
+			fx.nr_inodes--;
+			total++;
+			ret = 0;
+		}
+
+		scoutfs_key_inc(&key);
 	}

-	sx.output_flags = 0;
-	if (done && total == sroot.nr)
-		sx.output_flags |= SCOUTFS_SEARCH_XATTRS_OFLAG_END;
-
-	if (put_user(sx.output_flags, &usx->output_flags))
-		ret = -EFAULT;
-	else
-		ret = 0;
-
-	scoutfs_srch_destroy_rb_root(&sroot);
+	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);

 out:
 	kfree(name);
+
 	return ret ?: total;
 }

@@ -869,7 +863,6 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg)
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
 	struct scoutfs_ioctl_statfs_more sfm;
-	int ret;

 	if (get_user(sfm.valid_bytes, (__u64 __user *)arg))
 		return -EFAULT;
@@ -878,12 +871,6 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg)
 				sizeof(struct scoutfs_ioctl_statfs_more));
 	sfm.fsid = le64_to_cpu(super->hdr.fsid);
 	sfm.rid = sbi->rid;
-	sfm.total_meta_blocks = le64_to_cpu(super->total_meta_blocks);
-	sfm.total_data_blocks = le64_to_cpu(super->total_data_blocks);
-
-	ret = scoutfs_client_get_last_seq(sb, &sfm.committed_seq);
-	if (ret)
-		return ret;

 	if (copy_to_user((void __user *)arg, &sfm, sfm.valid_bytes))
 		return -EFAULT;
@@ -891,101 +878,6 @@ static long scoutfs_ioc_statfs_more(struct file *file, unsigned long arg)
 	return 0;
 }

-struct copy_alloc_detail_args {
-	struct scoutfs_ioctl_alloc_detail_entry __user *uade;
-	u64 nr;
-	u64 copied;
-};
-
-static int copy_alloc_detail_to_user(struct super_block *sb, void *arg,
-				     int owner, u64 id, bool meta, bool avail,
-				     u64 blocks)
-{
-	struct copy_alloc_detail_args *args = arg;
-	struct scoutfs_ioctl_alloc_detail_entry ade;
-
-	if (args->copied == args->nr)
-		return -EOVERFLOW;
-
-	ade.blocks = blocks;
-	ade.id = id;
-	ade.meta = !!meta;
-	ade.avail = !!avail;
-
-	if (copy_to_user(&args->uade[args->copied], &ade, sizeof(ade)))
-		return -EFAULT;
-
-	args->copied++;
-	return 0;
-}
-
-static long scoutfs_ioc_alloc_detail(struct file *file, unsigned long arg)
-{
-	struct super_block *sb = file_inode(file)->i_sb;
-	struct scoutfs_ioctl_alloc_detail __user *uad = (void __user *)arg;
-	struct scoutfs_ioctl_alloc_detail ad;
-	struct copy_alloc_detail_args args;
-
-	if (copy_from_user(&ad, uad, sizeof(ad)))
-		return -EFAULT;
-
-	args.uade = (struct scoutfs_ioctl_alloc_detail_entry __user *)
-			(uintptr_t)ad.entries_ptr;
-	args.nr = ad.entries_nr;
-	args.copied = 0;
-
-	return scoutfs_alloc_foreach(sb, copy_alloc_detail_to_user, &args) ?:
-	       args.copied;
-}
-
-static long scoutfs_ioc_move_blocks(struct file *file, unsigned long arg)
-{
-	struct inode *to = file_inode(file);
-	struct super_block *sb = to->i_sb;
-	struct scoutfs_ioctl_move_blocks __user *umb = (void __user *)arg;
-	struct scoutfs_ioctl_move_blocks mb;
-	struct file *from_file;
-	struct inode *from;
-	int ret;
-
-	if (copy_from_user(&mb, umb, sizeof(mb)))
-		return -EFAULT;
-
-	if (mb.len == 0)
-		return 0;
-
-	if (mb.from_off + mb.len < mb.from_off ||
-	    mb.to_off + mb.len < mb.to_off)
-		return -EOVERFLOW;
-
-	from_file = fget(mb.from_fd);
-	if (!from_file)
-		return -EBADF;
-	from = file_inode(from_file);
-
-	if (from == to) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (from->i_sb != sb) {
-		ret = -EXDEV;
-		goto out;
-	}
-
-	ret = mnt_want_write_file(file);
-	if (ret < 0)
-		goto out;
-
-	ret = scoutfs_data_move_blocks(from, mb.from_off, mb.len,
-				       to, mb.to_off);
-	mnt_drop_write_file(file);
-out:
-	fput(from_file);
-
-	return ret;
-}
-
 long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -1005,16 +897,12 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return scoutfs_ioc_setattr_more(file, arg);
 	case SCOUTFS_IOC_LISTXATTR_HIDDEN:
 		return scoutfs_ioc_listxattr_hidden(file, arg);
-	case SCOUTFS_IOC_SEARCH_XATTRS:
-		return scoutfs_ioc_search_xattrs(file, arg);
+	case SCOUTFS_IOC_FIND_XATTRS:
+		return scoutfs_ioc_find_xattrs(file, arg);
 	case SCOUTFS_IOC_STATFS_MORE:
 		return scoutfs_ioc_statfs_more(file, arg);
 	case SCOUTFS_IOC_DATA_WAIT_ERR:
 		return scoutfs_ioc_data_wait_err(file, arg);
-	case SCOUTFS_IOC_ALLOC_DETAIL:
-		return scoutfs_ioc_alloc_detail(file, arg);
-	case SCOUTFS_IOC_MOVE_BLOCKS:
-		return scoutfs_ioc_move_blocks(file, arg);
 	}

 	return -ENOTTY;
@@ -78,7 +78,7 @@ struct scoutfs_ioctl_walk_inodes {
 	__u8 _pad[11]; /* padded to align walk_inodes_entry total size */
 };

-enum scoutfs_ino_walk_seq_type {
+enum {
 	SCOUTFS_IOC_WALK_INODES_META_SEQ = 0,
 	SCOUTFS_IOC_WALK_INODES_DATA_SEQ,
 	SCOUTFS_IOC_WALK_INODES_UNKNOWN,
@@ -176,8 +176,8 @@ struct scoutfs_ioctl_ino_path_result {
 * an offline record is left behind to trigger demand staging if the
 * file is read.
 *
- * The starting file offset and number of bytes to release must be in
- * multiples of 4KB.
+ * The starting block offset and number of blocks to release are in
+ * units of 4KB blocks.
 *
 * The specified range can extend past i_size and can straddle sparse
 * regions or blocks that are already offline.  The only change it makes
@@ -193,8 +193,8 @@ struct scoutfs_ioctl_ino_path_result {
 * presentation of the data in the file.
 */
 struct scoutfs_ioctl_release {
-	__u64 offset;
-	__u64 length;
+	__u64 block;
+	__u64 count;
 	__u64 data_version;
 };

@@ -205,7 +205,7 @@ struct scoutfs_ioctl_stage {
 	__u64 data_version;
 	__u64 buf_ptr;
 	__u64 offset;
-	__s32 length;
+	__s32 count;
 	__u32 _pad;
 };

@@ -296,57 +296,34 @@ struct scoutfs_ioctl_listxattr_hidden {

 /*
 * Return the inode numbers of inodes which might contain the given
- * xattr.  The inode may not have a set xattr with that name, the caller
- * must check the returned inodes to see if they match.
+ * named xattr.  An inode may not actually have an xattr set with that
+ * name, so the caller must check the returned inodes to see if they match.
 *
 * @next_ino: The next inode number that could be returned.  Initialized
 * to 0 when first searching and set to one past the last inode number
 * returned to continue searching.
- * @last_ino: The last inode number that could be returned.  U64_MAX to
- * find all inodes.
- * @name_ptr: The address of the name of the xattr to search for.  It is
- * not null terminated.
- * @inodes_ptr: The address of the array of uint64_t inode numbers in
- * which to store inode numbers that may contain the xattr.  EFAULT may
- * be returned if this address is not naturally aligned.
- * @output_flags: Set as success is returned.  If an error is returned
- * then this field is undefined and should not be read.
+ * @name_ptr: The address of the name of the xattr to search for.  It does
+ * not need to be null terminated.
+ * @inodes_ptr: The address of the array of uint64_t inode numbers in which
+ * to store inode numbers that may contain the xattr.  EFAULT may be returned
+ * if this address is not naturally aligned.
+ * @name_bytes: The number of non-null bytes found in the name at name_ptr.
 * @nr_inodes: The number of elements in the array found at inodes_ptr.
- * @name_bytes: The number of non-null bytes found in the name at
- * name_ptr.
 *
 * This requires the CAP_SYS_ADMIN capability and will return -EPERM if
 * it's not granted.
- *
- * The number of inode numbers stored in the inodes_ptr array is
- * returned.  If nr_inodes is 0 or last_ino is less than next_ino then 0
- * will be immediately returned.
- *
- * Partial progress can be returned if an error is hit or if nr_inodes
- * was larger than the internal limit on the number of inodes returned
- * in a search pass.  The _END output flag is set if all the results
- * including last_ino were searched in this pass.
- *
- * It's valuable to provide a large inodes array so that all the results
- * can be found in one search pass and _END can be set.  There are
- * significant constant costs for performing each search pass.
 */
-struct scoutfs_ioctl_search_xattrs {
+struct scoutfs_ioctl_find_xattrs {
 	__u64 next_ino;
-	__u64 last_ino;
 	__u64 name_ptr;
 	__u64 inodes_ptr;
-	__u64 output_flags;
-	__u64 nr_inodes;
 	__u16 name_bytes;
-	__u8 _pad[6];
+	__u16 nr_inodes;
+	__u8 _pad[4];
 };

-/* set in output_flags if returned inodes reached last_ino */
-#define SCOUTFS_SEARCH_XATTRS_OFLAG_END (1ULL << 0)
-
-#define SCOUTFS_IOC_SEARCH_XATTRS _IOR(SCOUTFS_IOCTL_MAGIC, 9, \
-				     struct scoutfs_ioctl_search_xattrs)
+#define SCOUTFS_IOC_FIND_XATTRS _IOR(SCOUTFS_IOCTL_MAGIC, 9, \
+				     struct scoutfs_ioctl_find_xattrs)

 /*
 * Give the user information about the filesystem.
@@ -358,20 +335,13 @@ struct scoutfs_ioctl_search_xattrs {
 * field is set if all of its bytes are within the valid_bytes that the
 * kernel set on return.
 *
- * @committed_seq: All seqs up to and including this seq have been
- * committed.  Can be compared with meta_seq and data_seq from inodes in
- * stat_more to discover if changes have been committed to disk.
- *
 * New fields are only added to the end of the struct.
 */
 struct scoutfs_ioctl_statfs_more {
 	__u64 valid_bytes;
 	__u64 fsid;
 	__u64 rid;
-	__u64 committed_seq;
-	__u64 total_meta_blocks;
-	__u64 total_data_blocks;
-};
+} __packed;

 #define SCOUTFS_IOC_STATFS_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 10, \
 				     struct scoutfs_ioctl_statfs_more)
@@ -394,74 +364,4 @@ struct scoutfs_ioctl_data_wait_err {
 #define SCOUTFS_IOC_DATA_WAIT_ERR _IOR(SCOUTFS_IOCTL_MAGIC, 11, \
 				       struct scoutfs_ioctl_data_wait_err)

-
-struct scoutfs_ioctl_alloc_detail {
-	__u64 entries_ptr;
-	__u64 entries_nr;
-};
-
-struct scoutfs_ioctl_alloc_detail_entry {
-	__u64 id;
-	__u64 blocks;
-	__u8 type;
-	__u8 meta:1,
-	     avail:1;
-	__u8 __bit_pad:6;
-	__u8 __pad[6];
-};
-
-#define SCOUTFS_IOC_ALLOC_DETAIL _IOR(SCOUTFS_IOCTL_MAGIC, 12, \
-				      struct scoutfs_ioctl_alloc_detail)
-
-/*
- * Move extents from one regular file to another at a different offset,
- * on the same file system.
- *
- * from_fd specifies the source file and the ioctl is called on the
- * destination file.  Both files must have write access.  from_off
- * specifies the byte offset in the source, to_off is the byte offset in
- * the destination, and len is the number of bytes in the region to
- * move.   All of the offsets and lengths must be in multiples of 4KB,
- * except in the case where the from_off + len ends at the i_size of the
- * source file.
- *
- * This interface only moves extents which are block granular, it does
- * not perform RMW of sub-block byte extents and it does not overwrite
- * existing extents in the destination.  It will split extents in the
- * source.
- *
- * Only extents within i_size on the source are moved.  The destination
- * i_size will be updated if extents are moved beyond its current
- * i_size.  The i_size update will maintain final partial blocks in the
- * source.
- *
- * It will return an error if either of the files have offline extents.
- * It will return 0 when all of the extents in the source region have
- * been moved to the destination.  Moving extents updates the ctime,
- * mtime, meta_seq, data_seq, and data_version fields of both the source
- * and destination inodes.  If an error is returned then partial
- * progress may have been made and inode fields may have been updated.
- *
- * Errors specific to this interface include:
- *
- * EINVAL: from_off, len, or to_off aren't a multiple of 4KB; the source
- *	   and destination files are the same inode; either the source or
- *	   destination is not a regular file; the destination file has
- *	   an existing overlapping extent.
- * EOVERFLOW: either from_off + len or to_off + len exceeded 64bits.
- * EBADF: from_fd isn't a valid open file descriptor.
- * EXDEV: the source and destination files are in different filesystems.
- * EISDIR: either the source or destination is a directory.
- * ENODATA: either the source or destination file have offline extents.
- */
-struct scoutfs_ioctl_move_blocks {
-	__u64 from_fd;
-	__u64 from_off;
-	__u64 len;
-	__u64 to_off;
-};
-
-#define SCOUTFS_IOC_MOVE_BLOCKS _IOR(SCOUTFS_IOCTL_MAGIC, 13, \
-				     struct scoutfs_ioctl_move_blocks)
-
 #endif
@@ -1,39 +0,0 @@
-#ifndef _SCOUTFS_ITEM_H_
-#define _SCOUTFS_ITEM_H_
-
-int scoutfs_item_lookup(struct super_block *sb, struct scoutfs_key *key,
-			void *val, int val_len, struct scoutfs_lock *lock);
-int scoutfs_item_lookup_exact(struct super_block *sb, struct scoutfs_key *key,
-			      void *val, int val_len,
-			      struct scoutfs_lock *lock);
-int scoutfs_item_next(struct super_block *sb, struct scoutfs_key *key,
-		      struct scoutfs_key *last, void *val, int val_len,
-		      struct scoutfs_lock *lock);
-int scoutfs_item_dirty(struct super_block *sb, struct scoutfs_key *key,
-		       struct scoutfs_lock *lock);
-int scoutfs_item_create(struct super_block *sb, struct scoutfs_key *key,
-			void *val, int val_len, struct scoutfs_lock *lock);
-int scoutfs_item_create_force(struct super_block *sb, struct scoutfs_key *key,
-			      void *val, int val_len,
-			      struct scoutfs_lock *lock);
-int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key,
-			void *val, int val_len, struct scoutfs_lock *lock);
-int scoutfs_item_delete(struct super_block *sb, struct scoutfs_key *key,
-			  struct scoutfs_lock *lock);
-int scoutfs_item_delete_force(struct super_block *sb,
-				struct scoutfs_key *key,
-				struct scoutfs_lock *lock);
-
-u64 scoutfs_item_dirty_pages(struct super_block *sb);
-int scoutfs_item_write_dirty(struct super_block *sb);
-int scoutfs_item_write_done(struct super_block *sb);
-bool scoutfs_item_range_cached(struct super_block *sb,
-			       struct scoutfs_key *start,
-			       struct scoutfs_key *end, bool *dirty);
-void scoutfs_item_invalidate(struct super_block *sb, struct scoutfs_key *start,
-			     struct scoutfs_key *end);
-
-int scoutfs_item_setup(struct super_block *sb);
-void scoutfs_item_destroy(struct super_block *sb);
-
-#endif
@@ -78,14 +78,6 @@ static inline void scoutfs_key_set_zeros(struct scoutfs_key *key)
 	key->_sk_second = 0;
 	key->_sk_third = 0;
 	key->_sk_fourth = 0;
-	memset(key->__pad, 0, sizeof(key->__pad));
-}
-
-static inline bool scoutfs_key_is_zeros(struct scoutfs_key *key)
-{
-	return key->sk_zone == 0 && key->_sk_first == 0 && key->sk_type == 0 &&
-	       key->_sk_second == 0 && key->_sk_third == 0 &&
-	       key->_sk_fourth == 0;
 }

 static inline void scoutfs_key_copy_or_zeros(struct scoutfs_key *dst,
@@ -105,7 +97,6 @@ static inline void scoutfs_key_set_ones(struct scoutfs_key *key)
 	key->_sk_second = cpu_to_le64(U64_MAX);
 	key->_sk_third = cpu_to_le64(U64_MAX);
 	key->_sk_fourth = U8_MAX;
-	memset(key->__pad, 0, sizeof(key->__pad));
 }

 /*
@@ -188,19 +179,29 @@ static inline void scoutfs_key_dec(struct scoutfs_key *key)
 	key->sk_zone--;
 }

-/*
- * Some key types are used by multiple subsystems and shouldn't have
- * duplicate private key init functions.
- */
-
-static inline void scoutfs_key_init_log_trees(struct scoutfs_key *key,
-					      u64 rid, u64 nr)
+static inline void scoutfs_key_to_be(struct scoutfs_key_be *be,
+				     struct scoutfs_key *key)
 {
-	*key = (struct scoutfs_key) {
-		.sk_zone = SCOUTFS_LOG_TREES_ZONE,
-		.sklt_rid = cpu_to_le64(rid),
-		.sklt_nr = cpu_to_le64(nr),
-	};
+	BUILD_BUG_ON(sizeof(struct scoutfs_key_be) !=
+		     sizeof(struct scoutfs_key));
+
+	be->sk_zone = key->sk_zone;
+	be->_sk_first = le64_to_be64(key->_sk_first);
+	be->sk_type = key->sk_type;
+	be->_sk_second = le64_to_be64(key->_sk_second);
+	be->_sk_third = le64_to_be64(key->_sk_third);
+	be->_sk_fourth = key->_sk_fourth;
+}
+
+static inline void scoutfs_key_from_be(struct scoutfs_key *key,
+				       struct scoutfs_key_be *be)
+{
+	key->sk_zone = be->sk_zone;
+	key->_sk_first = be64_to_le64(be->_sk_first);
+	key->sk_type = be->sk_type;
+	key->_sk_second = be64_to_le64(be->_sk_second);
+	key->_sk_third = be64_to_le64(be->_sk_third);
+	key->_sk_fourth = be->_sk_fourth;
 }

 #endif
@@ -0,0 +1,15 @@
+#ifndef _SCOUTFS_KVEC_H_
+#define _SCOUTFS_KVEC_H_
+
+#include <linux/uio.h>
+
+/*
+ * Point a kvec at the caller's buffer: afterwards the kvec describes
+ * the len bytes that start at base.
+ */
+static inline void kvec_init(struct kvec *kv, void *base, size_t len)
+{
+	*kv = (struct kvec) { .iov_base = base, .iov_len = len };
+}
+
+#endif
@@ -21,6 +21,7 @@

 #include "super.h"
 #include "lock.h"
+#include "forest.h"
 #include "scoutfs_trace.h"
 #include "msg.h"
 #include "cmp.h"
@@ -33,7 +34,6 @@
 #include "client.h"
 #include "data.h"
 #include "xattr.h"
-#include "item.h"

 /*
 * scoutfs uses a lock service to manage item cache consistency between
@@ -80,12 +80,6 @@ struct lock_info {
 	struct list_head lru_list;
 	unsigned long long lru_nr;
 	struct workqueue_struct *workq;
-	struct work_struct grant_work;
-	struct list_head grant_list;
-	struct delayed_work inv_dwork;
-	struct list_head inv_list;
-	struct work_struct shrink_work;
-	struct list_head shrink_list;
 	atomic64_t next_refresh_gen;
 	struct dentry *tseq_dentry;
 	struct scoutfs_tseq_tree tseq_tree;
@@ -94,17 +88,19 @@ struct lock_info {
 #define DECLARE_LOCK_INFO(sb, name) \
 	struct lock_info *name = SCOUTFS_SB(sb)->lock_info

-static bool lock_mode_invalid(enum scoutfs_lock_mode mode)
+static void scoutfs_lock_shrink_worker(struct work_struct *work);
+
+static bool lock_mode_invalid(int mode)
 {
 	return (unsigned)mode >= SCOUTFS_LOCK_INVALID;
 }

-static bool lock_mode_can_read(enum scoutfs_lock_mode mode)
+static bool lock_mode_can_read(int mode)
 {
 	return mode == SCOUTFS_LOCK_READ || mode == SCOUTFS_LOCK_WRITE;
 }

-static bool lock_mode_can_write(enum scoutfs_lock_mode mode)
+static bool lock_mode_can_write(int mode)
 {
 	return mode == SCOUTFS_LOCK_WRITE || mode == SCOUTFS_LOCK_WRITE_ONLY;
 }
@@ -147,7 +143,7 @@ static void invalidate_inode(struct super_block *sb, u64 ino)
 * leave cached items behind in the case of invalidating to a read lock.
 */
 static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,
-			   enum scoutfs_lock_mode prev, enum scoutfs_lock_mode mode)
+			   int prev, int mode)
 {
 	struct scoutfs_lock_coverage *cov;
 	struct scoutfs_lock_coverage *tmp;
@@ -160,13 +156,15 @@ static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,
 	BUG_ON(!(prev == SCOUTFS_LOCK_WRITE && mode == SCOUTFS_LOCK_READ) &&
 	         mode != SCOUTFS_LOCK_NULL);

-	/* sync when a write lock could have dirtied the current transaction */
-	if (lock_mode_can_write(prev) &&
-	    (lock->dirty_trans_seq == scoutfs_trans_sample_seq(sb))) {
-		scoutfs_inc_counter(sb, lock_invalidate_sync);
+	/* any transition from a mode allowed to dirty items has to write */
+	if (lock_mode_can_write(prev) && scoutfs_trans_has_dirty(sb)) {
 		ret = scoutfs_trans_sync(sb, 1);
 		if (ret < 0)
 			return ret;
+		if (ret > 0) {
+			scoutfs_add_counter(sb, lock_invalidate_commit, ret);
+			ret = 0;
+		}
 	}

 	/* have to invalidate if we're not in the only usable case */
@@ -195,8 +193,6 @@ retry:
 				ino++;
 			}
 		}
-
-		scoutfs_item_invalidate(sb, &lock->start, &lock->end);
 	}

 	return ret;
@@ -224,11 +220,9 @@ static void lock_free(struct lock_info *linfo, struct scoutfs_lock *lock)
 	BUG_ON(!RB_EMPTY_NODE(&lock->node));
 	BUG_ON(!RB_EMPTY_NODE(&lock->range_node));
 	BUG_ON(!list_empty(&lock->lru_head));
-	BUG_ON(!list_empty(&lock->grant_head));
-	BUG_ON(!list_empty(&lock->inv_head));
-	BUG_ON(!list_empty(&lock->shrink_head));
 	BUG_ON(!list_empty(&lock->cov_list));

+	scoutfs_forest_clear_lock(sb, lock);
 	kfree(lock);
 }

@@ -251,9 +245,7 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb,
 	RB_CLEAR_NODE(&lock->node);
 	RB_CLEAR_NODE(&lock->range_node);
 	INIT_LIST_HEAD(&lock->lru_head);
-	INIT_LIST_HEAD(&lock->grant_head);
-	INIT_LIST_HEAD(&lock->inv_head);
-	INIT_LIST_HEAD(&lock->shrink_head);
+
 	spin_lock_init(&lock->cov_list_lock);
 	INIT_LIST_HEAD(&lock->cov_list);

@@ -261,22 +253,21 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb,
 	lock->end = *end;
 	lock->sb = sb;
 	init_waitqueue_head(&lock->waitq);
+	INIT_WORK(&lock->shrink_work, scoutfs_lock_shrink_worker);
 	lock->mode = SCOUTFS_LOCK_NULL;

-	atomic64_set(&lock->forest_bloom_nr, 0);
-
 	trace_scoutfs_lock_alloc(sb, lock);

 	return lock;
 }

-static void lock_inc_count(unsigned int *counts, enum scoutfs_lock_mode mode)
+static void lock_inc_count(unsigned int *counts, int mode)
 {
 	BUG_ON(mode < 0 || mode >= SCOUTFS_LOCK_NR_MODES);
 	counts[mode]++;
 }

-static void lock_dec_count(unsigned int *counts, enum scoutfs_lock_mode mode)
+static void lock_dec_count(unsigned int *counts, int mode)
 {
 	BUG_ON(mode < 0 || mode >= SCOUTFS_LOCK_NR_MODES);
 	counts[mode]--;
@@ -288,7 +279,7 @@ static void lock_dec_count(unsigned int *counts, enum scoutfs_lock_mode mode)
 */
 static bool lock_counts_match(int granted, unsigned int *counts)
 {
-	enum scoutfs_lock_mode mode;
+	int mode;

 	for (mode = 0; mode < SCOUTFS_LOCK_NR_MODES; mode++) {
 		if (counts[mode] && !lock_modes_match(granted, mode))
@@ -305,7 +296,7 @@ static bool lock_counts_match(int granted, unsigned int *counts)
 */
 static bool lock_count_match_exists(int desired, unsigned int *counts)
 {
-	enum scoutfs_lock_mode mode;
+	int mode;

 	for (mode = 0; mode < SCOUTFS_LOCK_NR_MODES; mode++) {
 		if (counts[mode] && lock_modes_match(desired, mode))
@@ -321,7 +312,7 @@ static bool lock_count_match_exists(int desired, unsigned int *counts)
 */
 static bool lock_idle(struct scoutfs_lock *lock)
 {
-	enum scoutfs_lock_mode mode;
+	int mode;

 	if (lock->request_pending || lock->invalidate_pending)
 		return false;
@@ -549,80 +540,11 @@ static void extend_grace(struct super_block *sb, struct scoutfs_lock *lock)
 	lock->grace_deadline = ktime_add(now, GRACE_PERIOD_KT);
 }

-static void queue_grant_work(struct lock_info *linfo)
-{
-	assert_spin_locked(&linfo->lock);
-
-	if (!list_empty(&linfo->grant_list) && !linfo->shutdown)
-		queue_work(linfo->workq, &linfo->grant_work);
-}
-
 /*
- * We immediately queue work on the assumption that the caller might
- * have made a change (set a lock mode) which can let one of the
- * invalidating locks make forward progress, even if other locks are
- * waiting for their grace period to elapse.  It's a trade-off between
- * invalidation latency and burning cpu repeatedly finding that locks
- * are still in their grace period.
- */
-static void queue_inv_work(struct lock_info *linfo)
-{
-	assert_spin_locked(&linfo->lock);
-
-	if (!list_empty(&linfo->inv_list) && !linfo->shutdown)
-		mod_delayed_work(linfo->workq, &linfo->inv_dwork, 0);
-}
-
-/*
- * The given lock is processing a received a grant response.  Trigger a
- * bug if the cache is inconsistent.
- *
- * We only have two modes that can create dirty items.  We can't have
- * dirty items when transitioning from write_only to write because the
- * writer can't trust the cached items in the cache for reading.  And we
- * don't currently transition directly from write to write_only, we
- * first go through null.  So if we have dirty items as we're granted a
- * mode it's always incorrect.
- *
- * And we can't have cached items that we're going to use for reading if
- * the previous mode didn't allow reading.
- *
- * Inconsistencies have come from all sorts of bugs: invalidation missed
- * items, the cache was populated outside of locking coverage, lock
- * holders performed the wrong item operations under their lock,
- * overlapping locks, out of order granting or invalidating, etc.
- */
-static void bug_on_inconsistent_grant_cache(struct super_block *sb,
-					    struct scoutfs_lock *lock,
-					    int old_mode, int new_mode)
-{
-	bool cached;
-	bool dirty;
-
-	cached = scoutfs_item_range_cached(sb, &lock->start, &lock->end,
-					   &dirty);
-	if (dirty ||
-	    (cached && (!lock_mode_can_read(old_mode) ||
-			!lock_mode_can_read(new_mode)))) {
-		scoutfs_err(sb, "granted lock item cache inconsistency, cached %u dirty %u old_mode %d new_mode %d: start "SK_FMT" end "SK_FMT" refresh_gen %llu mode %u waiters: rd %u wr %u wo %u users: rd %u wr %u wo %u",
-			   cached, dirty, old_mode, new_mode, SK_ARG(&lock->start),
-			   SK_ARG(&lock->end), lock->refresh_gen, lock->mode,
-			   lock->waiters[SCOUTFS_LOCK_READ],
-			   lock->waiters[SCOUTFS_LOCK_WRITE],
-			   lock->waiters[SCOUTFS_LOCK_WRITE_ONLY],
-			   lock->users[SCOUTFS_LOCK_READ],
-			   lock->users[SCOUTFS_LOCK_WRITE],
-			   lock->users[SCOUTFS_LOCK_WRITE_ONLY]);
-		BUG();
-	}
-}
-
-/*
- * Each lock has received a grant response message from the server.
- *
- * Grant responses can be reordered with incoming invalidation requests
- * from the server so we have to be careful to only set the new mode
- * once the old mode matches.
+ * The client is receiving a lock response message from the server.
+ * This can be reordered with incoming invalidation requests from the
+ * server so we have to be careful to only set the new mode once the old
+ * mode matches.
 *
 * We extend the grace period as we grant the lock if there is a waiting
 * locker who can use the lock.  This stops invalidation from pulling
@@ -633,65 +555,9 @@ static void bug_on_inconsistent_grant_cache(struct super_block *sb,
 * against the invalidation.  In that case they'd extend the grace
 * period anyway as they unlock.
 */
-static void lock_grant_worker(struct work_struct *work)
-{
-	struct lock_info *linfo = container_of(work, struct lock_info,
-					       grant_work);
-	struct super_block *sb = linfo->sb;
-	struct scoutfs_net_lock_grant_response *gr;
-	struct scoutfs_net_lock *nl;
-	struct scoutfs_lock *lock;
-	struct scoutfs_lock *tmp;
-
-	scoutfs_inc_counter(sb, lock_grant_work);
-
-	spin_lock(&linfo->lock);
-
-	list_for_each_entry_safe(lock, tmp, &linfo->grant_list, grant_head) {
-		gr = &lock->grant_resp;
-		nl = &lock->grant_resp.nl;
-
-		/* wait for reordered invalidation to finish */
-		if (lock->mode != nl->old_mode)
-			continue;
-
-		bug_on_inconsistent_grant_cache(sb, lock, nl->old_mode,
-						nl->new_mode);
-
-		if (!lock_mode_can_read(nl->old_mode) &&
-		    lock_mode_can_read(nl->new_mode)) {
-			lock->refresh_gen =
-				atomic64_inc_return(&linfo->next_refresh_gen);
-		}
-
-		lock->request_pending = 0;
-		lock->mode = nl->new_mode;
-		lock->write_version = le64_to_cpu(nl->write_version);
-		lock->roots = gr->roots;
-
-		if (lock_count_match_exists(nl->new_mode, lock->waiters))
-			extend_grace(sb, lock);
-
-		trace_scoutfs_lock_granted(sb, lock);
-		list_del_init(&lock->grant_head);
-		wake_up(&lock->waitq);
-		put_lock(linfo, lock);
-	}
-
-	/* invalidations might be waiting for our reordered grant */
-	queue_inv_work(linfo);
-	spin_unlock(&linfo->lock);
-}
-
-/*
- * The client is receiving a grant response message from the server.  We
- * find the lock, record the response, and add it to the list for grant
- * work to process.
- */
 int scoutfs_lock_grant_response(struct super_block *sb,
-				struct scoutfs_net_lock_grant_response *gr)
+				struct scoutfs_net_lock *nl)
 {
-	struct scoutfs_net_lock *nl = &gr->nl;
 	DECLARE_LOCK_INFO(sb, linfo);
 	struct scoutfs_lock *lock;

@@ -702,12 +568,34 @@ int scoutfs_lock_grant_response(struct super_block *sb,
 	/* lock must already be busy with request_pending */
 	lock = lock_lookup(sb, &nl->key, NULL);
 	BUG_ON(!lock);
-	trace_scoutfs_lock_grant_response(sb, lock);
 	BUG_ON(!lock->request_pending);

-	lock->grant_resp = *gr;
-	list_add_tail(&lock->grant_head, &linfo->grant_list);
-	queue_grant_work(linfo);
+	trace_scoutfs_lock_grant_response(sb, lock);
+
+	/* resolve unlikely work reordering with invalidation request */
+	while (lock->mode != nl->old_mode) {
+		spin_unlock(&linfo->lock);
+		/* implicit read barrier from waitq locks */
+		wait_event(lock->waitq, lock->mode == nl->old_mode);
+		spin_lock(&linfo->lock);
+	}
+
+	if (!lock_mode_can_read(nl->old_mode) &&
+	    lock_mode_can_read(nl->new_mode)) {
+		lock->refresh_gen =
+			atomic64_inc_return(&linfo->next_refresh_gen);
+	}
+
+	lock->request_pending = 0;
+	lock->mode = nl->new_mode;
+	lock->write_version = le64_to_cpu(nl->write_version);
+
+	if (lock_count_match_exists(nl->new_mode, lock->waiters))
+		extend_grace(sb, lock);
+
+	trace_scoutfs_lock_granted(sb, lock);
+	wake_up(&lock->waitq);
+	put_lock(linfo, lock);

 	spin_unlock(&linfo->lock);

@@ -715,9 +603,34 @@ int scoutfs_lock_grant_response(struct super_block *sb,
 }

 /*
- * Each lock has received a lock invalidation request from the server
+ * Invalidation waits until the old mode indicates that we've resolved
+ * unlikely races with reordered grant responses from the server and
+ * until the new mode satisfies active users.
+ *
+ * Once it's safe to proceed we set the lock mode here under the lock to
+ * prevent additional users of the old mode while we're invalidating.
+ */
+static bool lock_invalidate_safe(struct lock_info *linfo,
+				 struct scoutfs_lock *lock,
+				 int old_mode, int new_mode)
+{
+	bool can_proceed = false;
+
+	spin_lock(&linfo->lock);
+	if (lock->mode == old_mode &&
+	    lock_counts_match(new_mode, lock->users)) {
+		lock->mode = new_mode;	/* no new users of the old mode */
+		can_proceed = true;
+	}
+	spin_unlock(&linfo->lock);
+	return can_proceed;
+}
+
+/*
+ * The client is receiving a lock invalidation request from the server
 * which specifies a new mode for the lock.  The server will only send
- * one invalidation request at a time for each lock.
+ * one invalidation request at a time.  This is executing in a blocking
+ * net receive work context.
 *
 * This is an unsolicited request from the server so it can arrive at
 * any time after we make the server aware of the lock by initially
@@ -734,137 +647,72 @@ int scoutfs_lock_grant_response(struct super_block *sb,
 * invalidate once the lock mode matches what the server told us to
 * invalidate.
 *
- * We delay invalidation processing until a grace period has elapsed
- * since the last unlock.  The intent is to let users do a reasonable
- * batch of work before dropping the lock.  Continuous unlocking can
- * continuously extend the deadline.
- *
- * Before we start invalidating the lock we set the lock to the new
- * mode, preventing further incompatible users of the old mode from
- * using the lock while we're invalidating.
- *
- * This does a lot of serialized inode invalidation in one context and
- * performs a lot of repeated calls to sync.  It would be nice to get
- * some concurrent inode invalidation and to more carefully only call
- * sync when needed.
- */
-static void lock_invalidate_worker(struct work_struct *work)
-{
-	struct lock_info *linfo = container_of(work, struct lock_info,
-					       inv_dwork.work);
-	struct super_block *sb = linfo->sb;
-	struct scoutfs_net_lock *nl;
-	struct scoutfs_lock *lock;
-	struct scoutfs_lock *tmp;
-	unsigned long delay = MAX_JIFFY_OFFSET;
-	ktime_t now = ktime_get();
-	ktime_t deadline;
-	LIST_HEAD(ready);
-	u64 net_id;
-	int ret;
-
-	scoutfs_inc_counter(sb, lock_invalidate_work);
-
-	spin_lock(&linfo->lock);
-
-	list_for_each_entry_safe(lock, tmp, &linfo->inv_list, inv_head) {
-		nl = &lock->inv_nl;
-
-		/* skip if grace hasn't elapsed, record earliest */
-		deadline = lock->grace_deadline;
-		if (ktime_before(now, deadline)) {
-			delay = min(delay,
-				    nsecs_to_jiffies(ktime_to_ns(
-						ktime_sub(deadline, now))));
-			scoutfs_inc_counter(linfo->sb, lock_grace_wait);
-			continue;
-		}
-
-		/* wait for reordered grant to finish */
-		if (lock->mode != nl->old_mode)
-			continue;
-
-		/* wait until incompatible holders unlock */
-		if (!lock_counts_match(nl->new_mode, lock->users))
-			continue;
-
-		/* set the new mode, no incompatible users during inval */
-		lock->mode = nl->new_mode;
-
-		/* move everyone that's ready to our private list */
-		list_move_tail(&lock->inv_head, &ready);
-	}
-
-	spin_unlock(&linfo->lock);
-
-	if (list_empty(&ready))
-		goto out;
-
-	/* invalidate once the lock is read */
-	list_for_each_entry(lock, &ready, inv_head) {
-		nl = &lock->inv_nl;
-		net_id = lock->inv_net_id;
-
-		ret = lock_invalidate(sb, lock, nl->old_mode, nl->new_mode);
-		BUG_ON(ret);
-
-		/* respond with the key and modes from the request */
-		ret = scoutfs_client_lock_response(sb, net_id, nl);
-		BUG_ON(ret);
-
-		scoutfs_inc_counter(sb, lock_invalidate_response);
-	}
-
-	/* and finish all the invalidated locks */
-	spin_lock(&linfo->lock);
-
-	list_for_each_entry_safe(lock, tmp, &ready, inv_head) {
-		list_del_init(&lock->inv_head);
-
-		lock->invalidate_pending = 0;
-		trace_scoutfs_lock_invalidated(sb, lock);
-		wake_up(&lock->waitq);
-		put_lock(linfo, lock);
-	}
-
-	/* grant might have been waiting for invalidate request */
-	queue_grant_work(linfo);
-	spin_unlock(&linfo->lock);
-
-out:
-	/* queue delayed work if invalidations waiting on grace deadline */
-	if (delay != MAX_JIFFY_OFFSET)
-		queue_delayed_work(linfo->workq, &linfo->inv_dwork, delay);
-}
-
-/*
- * Record an incoming invalidate request from the server and add its lock
- * to the list for processing.
- *
- * This is trusting the server and will crash if it's sent bad requests :/
+ * We delay invalidation processing until a grace period has elapsed since
+ * the last unlock.  The intent is to let users do a reasonable batch of
+ * work before dropping the lock.  Continuous unlocking can continuously
+ * extend the deadline.
 */
 int scoutfs_lock_invalidate_request(struct super_block *sb, u64 net_id,
 				    struct scoutfs_net_lock *nl)
 {
 	DECLARE_LOCK_INFO(sb, linfo);
 	struct scoutfs_lock *lock;
+	ktime_t deadline;
+	bool grace_waited = false;
+	int ret;

 	scoutfs_inc_counter(sb, lock_invalidate_request);

 	spin_lock(&linfo->lock);
 	lock = get_lock(sb, &nl->key);
-	BUG_ON(!lock);
 	if (lock) {
-		BUG_ON(lock->invalidate_pending);
+		BUG_ON(lock->invalidate_pending); /* XXX trusting server :/ */
 		lock->invalidate_pending = 1;
-		lock->inv_nl = *nl;
-		lock->inv_net_id = net_id;
-		list_add_tail(&lock->inv_head, &linfo->inv_list);
+		deadline = lock->grace_deadline;
 		trace_scoutfs_lock_invalidate_request(sb, lock);
-		queue_inv_work(linfo);
 	}
 	spin_unlock(&linfo->lock);

+	BUG_ON(!lock);
+
+	/* sleep to the grace deadline, re-reading it in case it moved */
+	while (ktime_before(ktime_get(), deadline)) {
+		grace_waited = true;
+		scoutfs_inc_counter(linfo->sb, lock_grace_wait);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_hrtimeout(&deadline, HRTIMER_MODE_ABS);
+
+		spin_lock(&linfo->lock);
+		deadline = lock->grace_deadline;
+		spin_unlock(&linfo->lock);
+	}
+
+	if (grace_waited)
+		scoutfs_inc_counter(linfo->sb, lock_grace_elapsed);
+
+	/* sets the lock mode to prevent use of old mode during invalidate */
+	wait_event(lock->waitq, lock_invalidate_safe(linfo, lock, nl->old_mode,
+						     nl->new_mode));
+
+	ret = lock_invalidate(sb, lock, nl->old_mode, nl->new_mode);
+	BUG_ON(ret);
+
+	/* respond with the key and modes from the request */
+	ret = scoutfs_client_lock_response(sb, net_id, nl);
+	BUG_ON(ret);
+
+	scoutfs_inc_counter(sb, lock_invalidate_response);
+
+	spin_lock(&linfo->lock);
+
+	lock->invalidate_pending = 0;
+
+	trace_scoutfs_lock_invalidated(sb, lock);
+	wake_up(&lock->waitq);
+	put_lock(linfo, lock);
+
+	spin_unlock(&linfo->lock);
+
 	return 0;
 }

@@ -901,7 +749,6 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
 	for (i = 0; lock && i < SCOUTFS_NET_LOCK_MAX_RECOVER_NR; i++) {

 		nlr->locks[i].key = lock->start;
-		nlr->locks[i].write_version = cpu_to_le64(lock->write_version);
 		nlr->locks[i].old_mode = lock->mode;
 		nlr->locks[i].new_mode = lock->mode;

@@ -922,7 +769,7 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
 }

 static bool lock_wait_cond(struct super_block *sb, struct scoutfs_lock *lock,
-			   enum scoutfs_lock_mode mode)
+			   int mode)
 {
 	DECLARE_LOCK_INFO(sb, linfo);
 	bool wake;
@@ -956,7 +803,7 @@ static bool lock_flags_invalid(int flags)
 * won't process our request until it receives our invalidation
 * response.
 */
-static int lock_key_range(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
+static int lock_key_range(struct super_block *sb, int mode, int flags,
 			  struct scoutfs_key *start, struct scoutfs_key *end,
 			  struct scoutfs_lock **ret_lock)
 {
@@ -1064,7 +911,7 @@ out_unlock:
 	return ret;
 }

-int scoutfs_lock_ino(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,
+int scoutfs_lock_ino(struct super_block *sb, int mode, int flags, u64 ino,
 		     struct scoutfs_lock **ret_lock)
 {
 	struct scoutfs_key start;
@@ -1089,7 +936,7 @@ int scoutfs_lock_ino(struct super_block *sb, enum scoutfs_lock_mode mode, int fl
 * is incremented as new locks are acquired and then indicates that an
 * old inode with a smaller refresh_gen needs to be refreshed.
 */
-int scoutfs_lock_inode(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
+int scoutfs_lock_inode(struct super_block *sb, int mode, int flags,
 		       struct inode *inode, struct scoutfs_lock **lock)
 {
 	int ret;
@@ -1152,7 +999,7 @@ static void swap_arg(void *A, void *B, int size)
 *
 * (pretty great collision with d_lock() here)
 */
-int scoutfs_lock_inodes(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
+int scoutfs_lock_inodes(struct super_block *sb, int mode, int flags,
 			struct inode *a, struct scoutfs_lock **a_lock,
 			struct inode *b, struct scoutfs_lock **b_lock,
 			struct inode *c, struct scoutfs_lock **c_lock,
@@ -1200,7 +1047,7 @@ int scoutfs_lock_inodes(struct super_block *sb, enum scoutfs_lock_mode mode, int
 /*
 * The rename lock is magical because it's global.
 */
-int scoutfs_lock_rename(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
+int scoutfs_lock_rename(struct super_block *sb, int mode, int flags,
 			struct scoutfs_lock **lock)
 {
 	struct scoutfs_key key = {
@@ -1247,7 +1094,7 @@ void scoutfs_lock_get_index_item_range(u8 type, u64 major, u64 ino,
 * Lock the given index item.  We use the index masks to calculate the
 * start and end key values that are covered by the lock.
 */
-int scoutfs_lock_inode_index(struct super_block *sb, enum scoutfs_lock_mode mode,
+int scoutfs_lock_inode_index(struct super_block *sb, int mode,
 			     u8 type, u64 major, u64 ino,
 			     struct scoutfs_lock **ret_lock)
 {
@@ -1259,6 +1106,24 @@ int scoutfs_lock_inode_index(struct super_block *sb, enum scoutfs_lock_mode mode
 	return lock_key_range(sb, mode, 0, &start, &end, ret_lock);
 }

+/*
+ * Lock every xattr index key that carries the given hash: the range
+ * runs from (hash, 0, 0) through (hash, U64_MAX, U64_MAX).  Were ino
+ * locking made finer grained we'd have to consult the manifest for the
+ * next possible ino rather than trying to iterate over all of them.
+ */
+int scoutfs_lock_xattr_index(struct super_block *sb, int mode, int flags,
+			     u64 hash, struct scoutfs_lock **ret_lock)
+{
+	struct scoutfs_key first;
+	struct scoutfs_key last;
+
+	scoutfs_xattr_index_key(&first, hash, 0, 0);
+	scoutfs_xattr_index_key(&last, hash, U64_MAX, U64_MAX);
+
+	return lock_key_range(sb, mode, flags, &first, &last, ret_lock);
+}
+
 /*
 * The rid lock protects a mount's private persistent items in the rid
 * zone.  It's held for the duration of the mount.  It lets the mount
@@ -1270,7 +1135,7 @@ int scoutfs_lock_inode_index(struct super_block *sb, enum scoutfs_lock_mode mode
 * able to.  Maybe we have a bunch free and they're trying to allocate
 * and are getting ENOSPC.
 */
-int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
+int scoutfs_lock_rid(struct super_block *sb, int mode, int flags,
 		     u64 rid, struct scoutfs_lock **lock)
 {
 	struct scoutfs_key start;
@@ -1291,7 +1156,7 @@ int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int fl
 * As we unlock we always extend the grace period to give the caller
 * another pass at the lock before its invalidated.
 */
-void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock, enum scoutfs_lock_mode mode)
+void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock, int mode)
 {
 	DECLARE_LOCK_INFO(sb, linfo);

@@ -1304,12 +1169,9 @@ void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock, enum scou

 	lock_dec_count(lock->users, mode);
 	extend_grace(sb, lock);
-	if (lock_mode_can_write(mode))
-		lock->dirty_trans_seq = scoutfs_trans_sample_seq(sb);

 	trace_scoutfs_lock_unlock(sb, lock);
 	wake_up(&lock->waitq);
-	queue_inv_work(linfo);
 	put_lock(linfo, lock);

 	spin_unlock(&linfo->lock);
@@ -1384,7 +1246,7 @@ void scoutfs_lock_del_coverage(struct super_block *sb,
 * the mode and keys from changing.
 */
 bool scoutfs_lock_protected(struct scoutfs_lock *lock, struct scoutfs_key *key,
-			    enum scoutfs_lock_mode mode)
+			    int mode)
 {
 	signed char lock_mode = ACCESS_ONCE(lock->mode);

@@ -1394,50 +1256,38 @@ bool scoutfs_lock_protected(struct scoutfs_lock *lock, struct scoutfs_key *key,
 }

 /*
- * The shrink callback got the lock, marked it request_pending, and put
- * it on the shrink list.  We send a null request and the lock will be
- * freed by the response once all users drain.  If this races with
+ * The shrink callback got the lock, marked it request_pending, and
+ * handed it off to us.  We kick off a null request and the lock will
+ * be freed by the response once all users drain.  If this races with
 * invalidation then the server will only send the grant response once
 * the invalidation is finished.
 */
-static void lock_shrink_worker(struct work_struct *work)
+static void scoutfs_lock_shrink_worker(struct work_struct *work)
 {
-	struct lock_info *linfo = container_of(work, struct lock_info,
-					       shrink_work);
-	struct super_block *sb = linfo->sb;
+	struct scoutfs_lock *lock = container_of(work, struct scoutfs_lock,
+						 shrink_work);
+	struct super_block *sb = lock->sb;
+	DECLARE_LOCK_INFO(sb, linfo);
 	struct scoutfs_net_lock nl;
-	struct scoutfs_lock *lock;
-	struct scoutfs_lock *tmp;
-	LIST_HEAD(list);
 	int ret;

-	scoutfs_inc_counter(sb, lock_shrink_work);
+	/* unlocked lock access, but should be stable since we queued */
+	nl.key = lock->start;
+	nl.old_mode = lock->mode;
+	nl.new_mode = SCOUTFS_LOCK_NULL;

-	spin_lock(&linfo->lock);
-	list_splice_init(&linfo->shrink_list, &list);
-	spin_unlock(&linfo->lock);
+	ret = scoutfs_client_lock_request(sb, &nl);
+	if (ret) {
+		/* oh well, not freeing */
+		scoutfs_inc_counter(sb, lock_shrink_request_aborted);

-	list_for_each_entry_safe(lock, tmp, &list, shrink_head) {
-		list_del_init(&lock->shrink_head);
+		spin_lock(&linfo->lock);

-		/* unlocked lock access, but should be stable since we queued */
-		nl.key = lock->start;
-		nl.old_mode = lock->mode;
-		nl.new_mode = SCOUTFS_LOCK_NULL;
+		lock->request_pending = 0;
+		wake_up(&lock->waitq);
+		put_lock(linfo, lock);

-		ret = scoutfs_client_lock_request(sb, &nl);
-		if (ret) {
-			/* oh well, not freeing */
-			scoutfs_inc_counter(sb, lock_shrink_aborted);
-
-			spin_lock(&linfo->lock);
-
-			lock->request_pending = 0;
-			wake_up(&lock->waitq);
-			put_lock(linfo, lock);
-
-			spin_unlock(&linfo->lock);
-		}
+		spin_unlock(&linfo->lock);
 	}
 }

@@ -1462,7 +1312,6 @@ static int scoutfs_lock_shrink(struct shrinker *shrink,
 	struct scoutfs_lock *lock;
 	struct scoutfs_lock *tmp;
 	unsigned long nr;
-	bool added = false;
 	int ret;

 	nr = sc->nr_to_scan;
@@ -1476,17 +1325,15 @@ restart:

 		BUG_ON(!lock_idle(lock));
 		BUG_ON(lock->mode == SCOUTFS_LOCK_NULL);
-		BUG_ON(!list_empty(&lock->shrink_head));

-		if (linfo->shutdown || nr-- == 0)
+		if (nr-- == 0)
 			break;

 		__lock_del_lru(linfo, lock);
 		lock->request_pending = 1;
-		list_add_tail(&lock->shrink_head, &linfo->shrink_list);
-		added = true;
+		queue_work(linfo->workq, &lock->shrink_work);

-		scoutfs_inc_counter(sb, lock_shrink_attempted);
+		scoutfs_inc_counter(sb, lock_shrink_queued);
 		trace_scoutfs_lock_shrink(sb, lock);

 		/* could have bazillions of idle locks */
@@ -1496,9 +1343,6 @@ restart:

 	spin_unlock(&linfo->lock);

-	if (added)
-		queue_work(linfo->workq, &linfo->shrink_work);
-
 out:
 	ret = min_t(unsigned long, linfo->lru_nr, INT_MAX);
 	trace_scoutfs_lock_shrink_exit(sb, sc->nr_to_scan, ret);
@@ -1533,15 +1377,10 @@ static void lock_tseq_show(struct seq_file *m, struct scoutfs_tseq_entry *ent)
 }

 /*
- * The caller is going to be calling _destroy soon and, critically, is
- * about to shutdown networking before calling us so that we don't get
- * any callbacks while we're destroying.  We have to ensure that we
- * won't call networking after this returns.
- *
- * Internal fs threads can be using locking, and locking can have async
- * work pending.  We use ->shutdown to force callers to return
- * -ESHUTDOWN and to prevent the future queueing of work that could call
- * networking.  Locks whose work is stopped will be torn down by _destroy.
+ * We're going to be destroying the locks soon.  We shouldn't have any
+ * normal task holders that would have prevented unmount.  We can have
+ * internal threads blocked in locks.  We force all currently blocked
+ * and future lock calls to return -ESHUTDOWN.
 */
 void scoutfs_lock_shutdown(struct super_block *sb)
 {
@@ -1563,10 +1402,6 @@ void scoutfs_lock_shutdown(struct super_block *sb)
 	}

 	spin_unlock(&linfo->lock);
-
-	flush_work(&linfo->grant_work);
-	flush_delayed_work(&linfo->inv_dwork);
-	flush_work(&linfo->shrink_work);
 }

 /*
@@ -1587,7 +1422,7 @@ void scoutfs_lock_destroy(struct super_block *sb)
 	DECLARE_LOCK_INFO(sb, linfo);
 	struct scoutfs_lock *lock;
 	struct rb_node *node;
-	enum scoutfs_lock_mode mode;
+	int mode;

 	if (!linfo)
 		return;
@@ -1639,12 +1474,6 @@ void scoutfs_lock_destroy(struct super_block *sb)
 		lock->request_pending = 0;
 		if (!list_empty(&lock->lru_head))
 			__lock_del_lru(linfo, lock);
-		if (!list_empty(&lock->grant_head))
-			list_del_init(&lock->grant_head);
-		if (!list_empty(&lock->inv_head))
-			list_del_init(&lock->inv_head);
-		if (!list_empty(&lock->shrink_head))
-			list_del_init(&lock->shrink_head);
 		lock_remove(linfo, lock);
 		lock_free(linfo, lock);
 	}
@@ -1672,12 +1501,6 @@ int scoutfs_lock_setup(struct super_block *sb)
 	linfo->shrinker.seeks = DEFAULT_SEEKS;
 	register_shrinker(&linfo->shrinker);
 	INIT_LIST_HEAD(&linfo->lru_list);
-	INIT_WORK(&linfo->grant_work, lock_grant_worker);
-	INIT_LIST_HEAD(&linfo->grant_list);
-	INIT_DELAYED_WORK(&linfo->inv_dwork, lock_invalidate_worker);
-	INIT_LIST_HEAD(&linfo->inv_list);
-	INIT_WORK(&linfo->shrink_work, lock_shrink_worker);
-	INIT_LIST_HEAD(&linfo->shrink_list);
 	atomic64_set(&linfo->next_refresh_gen, 0);
 	scoutfs_tseq_tree_init(&linfo->tseq_tree, lock_tseq_show);

@@ -22,32 +22,24 @@ struct scoutfs_lock {
 	struct rb_node range_node;
 	u64 refresh_gen;
 	u64 write_version;
-	u64 dirty_trans_seq;
-	struct scoutfs_net_roots roots;
 	struct list_head lru_head;
 	wait_queue_head_t waitq;
+	struct work_struct shrink_work;
 	ktime_t grace_deadline;
 	unsigned long request_pending:1,
 		      invalidate_pending:1;

-	struct list_head grant_head;
-	struct scoutfs_net_lock_grant_response grant_resp;
-	struct list_head inv_head;
-	struct scoutfs_net_lock inv_nl;
-	u64 inv_net_id;
-	struct list_head shrink_head;
-
 	spinlock_t cov_list_lock;
 	struct list_head cov_list;

-	enum scoutfs_lock_mode mode;
+	int mode;
 	unsigned int waiters[SCOUTFS_LOCK_NR_MODES];
 	unsigned int users[SCOUTFS_LOCK_NR_MODES];

 	struct scoutfs_tseq_entry tseq_entry;

-	/* the forest tracks which log tree last saw bloom bit updates */
-	atomic64_t forest_bloom_nr;
+	/* the forest btree code stores data per lock */
+	struct forest_lock_private *forest_private;
 };

 struct scoutfs_lock_coverage {
@@ -57,33 +49,35 @@ struct scoutfs_lock_coverage {
 };

 int scoutfs_lock_grant_response(struct super_block *sb,
-				struct scoutfs_net_lock_grant_response *gr);
+				struct scoutfs_net_lock *nl);
 int scoutfs_lock_invalidate_request(struct super_block *sb, u64 net_id,
 				    struct scoutfs_net_lock *nl);
 int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
 				 struct scoutfs_key *key);

-int scoutfs_lock_inode(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
+int scoutfs_lock_inode(struct super_block *sb, int mode, int flags,
 		       struct inode *inode, struct scoutfs_lock **ret_lock);
-int scoutfs_lock_ino(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,
+int scoutfs_lock_ino(struct super_block *sb, int mode, int flags, u64 ino,
 		     struct scoutfs_lock **ret_lock);
 void scoutfs_lock_get_index_item_range(u8 type, u64 major, u64 ino,
 				       struct scoutfs_key *start,
 				       struct scoutfs_key *end);
-int scoutfs_lock_inode_index(struct super_block *sb, enum scoutfs_lock_mode mode,
+int scoutfs_lock_inode_index(struct super_block *sb, int mode,
 			     u8 type, u64 major, u64 ino,
 			     struct scoutfs_lock **ret_lock);
-int scoutfs_lock_inodes(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
+int scoutfs_lock_xattr_index(struct super_block *sb, int mode, int flags,
+			     u64 hash, struct scoutfs_lock **ret_lock);
+int scoutfs_lock_inodes(struct super_block *sb, int mode, int flags,
 			struct inode *a, struct scoutfs_lock **a_lock,
 			struct inode *b, struct scoutfs_lock **b_lock,
 			struct inode *c, struct scoutfs_lock **c_lock,
 			struct inode *d, struct scoutfs_lock **D_lock);
-int scoutfs_lock_rename(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
+int scoutfs_lock_rename(struct super_block *sb, int mode, int flags,
 			struct scoutfs_lock **lock);
-int scoutfs_lock_rid(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
+int scoutfs_lock_rid(struct super_block *sb, int mode, int flags,
 		     u64 rid, struct scoutfs_lock **lock);
 void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock,
-		    enum scoutfs_lock_mode mode);
+		    int level);

 void scoutfs_lock_init_coverage(struct scoutfs_lock_coverage *cov);
 void scoutfs_lock_add_coverage(struct super_block *sb,
@@ -94,7 +88,7 @@ bool scoutfs_lock_is_covered(struct super_block *sb,
 void scoutfs_lock_del_coverage(struct super_block *sb,
 			       struct scoutfs_lock_coverage *cov);
 bool scoutfs_lock_protected(struct scoutfs_lock *lock, struct scoutfs_key *key,
-			    enum scoutfs_lock_mode mode);
+			    int mode);

 void scoutfs_free_unused_locks(struct super_block *sb, unsigned long nr);

@@ -20,6 +20,7 @@
 #include "tseq.h"
 #include "spbm.h"
 #include "block.h"
+#include "radix.h"
 #include "btree.h"
 #include "msg.h"
 #include "scoutfs_trace.h"
@@ -86,10 +87,8 @@ struct lock_server_info {
 	struct scoutfs_tseq_tree tseq_tree;
 	struct dentry *tseq_dentry;

-	struct scoutfs_alloc *alloc;
+	struct scoutfs_radix_allocator *alloc;
 	struct scoutfs_block_writer *wri;
-
-	atomic64_t write_version;
 };

 #define DECLARE_LOCK_SERVER_INFO(sb, name) \
@@ -118,6 +117,12 @@ struct server_lock_node {
 	struct list_head invalidated;
 };

+enum {
+	CLE_GRANTED,
+	CLE_REQUESTED,
+	CLE_INVALIDATED,
+};
+
 /*
 * Interactions with the client are tracked with these little mode
 * wrappers.
@@ -484,12 +489,12 @@ static int process_waiting_requests(struct super_block *sb,
 				    struct server_lock_node *snode)
 {
 	DECLARE_LOCK_SERVER_INFO(sb, inf);
-	struct scoutfs_net_lock_grant_response gres;
 	struct scoutfs_net_lock nl;
 	struct client_lock_entry *req;
 	struct client_lock_entry *req_tmp;
 	struct client_lock_entry *gr;
 	struct client_lock_entry *gr_tmp;
+	static atomic64_t write_version = ATOMIC64_INIT(0);
 	u64 wv;
 	int ret;

@@ -543,15 +548,12 @@ static int process_waiting_requests(struct super_block *sb,

 		if (nl.new_mode == SCOUTFS_LOCK_WRITE ||
 		    nl.new_mode == SCOUTFS_LOCK_WRITE_ONLY) {
-			wv = atomic64_inc_return(&inf->write_version);
+			wv = atomic64_inc_return(&write_version);
 			nl.write_version = cpu_to_le64(wv);
 		}

-		gres.nl = nl;
-		scoutfs_server_get_roots(sb, &gres.roots);
-
 		ret = scoutfs_server_lock_response(sb, req->rid,
-						   req->net_id, &gres);
+						   req->net_id, &nl);
 		if (ret)
 			goto out;

@@ -573,14 +575,6 @@ out:
 	return ret;
 }

-static void init_lock_clients_key(struct scoutfs_key *key, u64 rid)
-{
-	*key = (struct scoutfs_key) {
-		.sk_zone = SCOUTFS_LOCK_CLIENTS_ZONE,
-		.sklc_rid = cpu_to_le64(rid),
-	};
-}
-
 /*
 * The server received a greeting from a client for the first time.  If
 * the client had already talked to the server then we must find an
@@ -595,22 +589,23 @@ int scoutfs_lock_server_greeting(struct super_block *sb, u64 rid,
 {
 	DECLARE_LOCK_SERVER_INFO(sb, inf);
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
+	struct scoutfs_lock_client_btree_key cbk;
 	SCOUTFS_BTREE_ITEM_REF(iref);
 	struct scoutfs_key key;
 	int ret;

-	init_lock_clients_key(&key, rid);
+	cbk.rid = cpu_to_be64(rid);

 	mutex_lock(&inf->mutex);
 	if (should_exist) {
-		ret = scoutfs_btree_lookup(sb, &super->lock_clients, &key,
-					   &iref);
+		ret = scoutfs_btree_lookup(sb, &super->lock_clients,
+					   &cbk, sizeof(cbk), &iref);
 		if (ret == 0)
 			scoutfs_btree_put_iref(&iref);
 	} else {
 		ret = scoutfs_btree_insert(sb, inf->alloc, inf->wri,
 					   &super->lock_clients,
-					   &key, NULL, 0);
+					   &cbk, sizeof(cbk), NULL, 0);
 	}
 	mutex_unlock(&inf->mutex);

@@ -669,14 +664,6 @@ static int finished_recovery(struct super_block *sb, u64 rid, bool cancel)
 	return ret;
 }

-static void set_max_write_version(struct lock_server_info *inf, u64 new)
-{
-	u64 old;
-
-	while (new > (old = atomic64_read(&inf->write_version)) &&
-	       (atomic64_cmpxchg(&inf->write_version, old, new) != old));
-}
-
 /*
 * We sent a lock recover request to the client when we received its
 * greeting while in recovery.  Here we instantiate all the locks it
@@ -740,10 +727,6 @@ int scoutfs_lock_server_recover_response(struct super_block *sb, u64 rid,
 		scoutfs_tseq_add(&inf->tseq_tree, &clent->tseq_entry);

 		put_server_lock(inf, snode);
-
-		/* make sure next write lock is greater than all recovered */
-		set_max_write_version(inf,
-				le64_to_cpu(nlr->locks[i].write_version));
 	}

 	/* send request for next batch of keys */
@@ -755,12 +738,15 @@ out:
 	return ret;
 }

-static int get_rid_and_put_ref(struct scoutfs_btree_item_ref *iref, u64 *rid)
+static int get_rid_and_put_ref(struct scoutfs_btree_item_ref *iref,
+			       u64 *rid)
 {
+	struct scoutfs_lock_client_btree_key *cbk;
 	int ret;

-	if (iref->val_len == 0) {
-		*rid = le64_to_cpu(iref->key->sklc_rid);
+	if (iref->key_len == sizeof(*cbk) && iref->val_len == 0) {
+		cbk = iref->key;
+		*rid = be64_to_cpu(cbk->rid);
 		ret = 0;
 	} else {
 		ret = -EIO;
@@ -781,8 +767,8 @@ static void scoutfs_lock_server_recovery_timeout(struct work_struct *work)
 						    recovery_dwork.work);
 	struct super_block *sb = inf->sb;
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
+	struct scoutfs_lock_client_btree_key cbk;
 	SCOUTFS_BTREE_ITEM_REF(iref);
-	struct scoutfs_key key;
 	bool timed_out;
 	u64 rid;
 	int ret;
@@ -793,8 +779,9 @@ static void scoutfs_lock_server_recovery_timeout(struct work_struct *work)

 	/* we enter recovery if there are any client records */
 	for (rid = 0; ; rid++) {
-		init_lock_clients_key(&key, rid);
-		ret = scoutfs_btree_next(sb, &super->lock_clients, &key, &iref);
+		cbk.rid = cpu_to_be64(rid);
+		ret = scoutfs_btree_next(sb, &super->lock_clients,
+					 &cbk, sizeof(cbk), &iref);
 		if (ret == -ENOENT) {
 			ret = 0;
 			break;
@@ -819,9 +806,10 @@ static void scoutfs_lock_server_recovery_timeout(struct work_struct *work)
 		scoutfs_err(sb, "client rid %016llx lock recovery timed out",
 			    rid);

-		init_lock_clients_key(&key, rid);
+		cbk.rid = cpu_to_be64(rid);
 		ret = scoutfs_btree_delete(sb, inf->alloc, inf->wri,
-					   &super->lock_clients, &key);
+					   &super->lock_clients,
+					   &cbk, sizeof(cbk));
 		if (ret)
 			break;
 	}
@@ -850,6 +838,7 @@ int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid)
 {
 	DECLARE_LOCK_SERVER_INFO(sb, inf);
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
+	struct scoutfs_lock_client_btree_key cli;
 	struct client_lock_entry *clent;
 	struct client_lock_entry *tmp;
 	struct server_lock_node *snode;
@@ -858,10 +847,10 @@ int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid)
 	bool freed;
 	int ret = 0;

+	cli.rid = cpu_to_be64(rid);
 	mutex_lock(&inf->mutex);
-	init_lock_clients_key(&key, rid);
 	ret = scoutfs_btree_delete(sb, inf->alloc, inf->wri,
-				   &super->lock_clients, &key);
+				   &super->lock_clients, &cli, sizeof(cli));
 	mutex_unlock(&inf->mutex);
 	if (ret == -ENOENT) {
 		ret = 0;
@@ -962,14 +951,14 @@ static void lock_server_tseq_show(struct seq_file *m,
 * we time them out.
 */
 int scoutfs_lock_server_setup(struct super_block *sb,
-			      struct scoutfs_alloc *alloc,
-			      struct scoutfs_block_writer *wri, u64 max_vers)
+			      struct scoutfs_radix_allocator *alloc,
+			      struct scoutfs_block_writer *wri)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
 	struct lock_server_info *inf;
 	SCOUTFS_BTREE_ITEM_REF(iref);
-	struct scoutfs_key key;
+	struct scoutfs_lock_client_btree_key cbk;
 	unsigned int nr;
 	u64 rid;
 	int ret;
@@ -988,7 +977,6 @@ int scoutfs_lock_server_setup(struct super_block *sb,
 	scoutfs_tseq_tree_init(&inf->tseq_tree, lock_server_tseq_show);
 	inf->alloc = alloc;
 	inf->wri = wri;
-	atomic64_set(&inf->write_version, max_vers); /* inc_return gives +1 */

 	inf->tseq_dentry = scoutfs_tseq_create("server_locks", sbi->debug_root,
 					       &inf->tseq_tree);
@@ -1002,8 +990,9 @@ int scoutfs_lock_server_setup(struct super_block *sb,
 	/* we enter recovery if there are any client records */
 	nr = 0;
 	for (rid = 0; ; rid++) {
-		init_lock_clients_key(&key, rid);
-		ret = scoutfs_btree_next(sb, &super->lock_clients, &key, &iref);
+		cbk.rid = cpu_to_be64(rid);
+		ret = scoutfs_btree_next(sb, &super->lock_clients,
+					 &cbk, sizeof(cbk), &iref);
 		if (ret == -ENOENT)
 			break;
 		if (ret == 0)
@@ -12,8 +12,8 @@ int scoutfs_lock_server_response(struct super_block *sb, u64 rid,
 int scoutfs_lock_server_farewell(struct super_block *sb, u64 rid);

 int scoutfs_lock_server_setup(struct super_block *sb,
-			      struct scoutfs_alloc *alloc,
-			      struct scoutfs_block_writer *wri, u64 max_vers);
+			      struct scoutfs_radix_allocator *alloc,
+			      struct scoutfs_block_writer *wri);
 void scoutfs_lock_server_destroy(struct super_block *sb);

 #endif
@@ -100,7 +100,7 @@ do {								\
 } while (0)

 /* listening and their accepting sockets have a fixed locking order */
-enum spin_lock_subtype {
+enum {
 	CONN_LOCK_LISTENER,
 	CONN_LOCK_ACCEPTED,
 };
@@ -369,7 +369,6 @@ static int submit_send(struct super_block *sb,
 	msend->nh.cmd = cmd;
 	msend->nh.flags = flags;
 	msend->nh.error = net_err;
-	memset(msend->nh.__pad, 0, sizeof(msend->nh.__pad));
 	msend->nh.data_len = cpu_to_le16(data_len);
 	if (data_len)
 		memcpy(msend->nh.data, data, data_len);
@@ -76,7 +76,7 @@ struct scoutfs_net_connection {
 	void *info;
 };

-enum conn_flags {
+enum {
 	CONN_FL_valid_greeting = (1UL << 0), /* other commands can proceed */
 	CONN_FL_established =	 (1UL << 1), /* added sends queue send work */
 	CONN_FL_shutting_down =	 (1UL << 2), /* shutdown work was queued */
@@ -102,7 +102,6 @@ static inline void scoutfs_addr_from_sin(struct scoutfs_inet_addr *addr,
 {
 	addr->addr = be32_to_le32(sin->sin_addr.s_addr);
 	addr->port = be16_to_le16(sin->sin_port);
-	memset(addr->__pad, 0, sizeof(addr->__pad));
 }

 struct scoutfs_net_connection *
@@ -16,7 +16,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/debugfs.h>
-#include <linux/namei.h>

 #include <linux/parser.h>
 #include <linux/inet.h>
@@ -29,16 +28,24 @@

 static const match_table_t tokens = {
 	{Opt_server_addr, "server_addr=%s"},
-	{Opt_metadev_path, "metadev_path=%s"},
 	{Opt_err, NULL}
 };

 struct options_sb_info {
 	struct dentry *debugfs_dir;
+	u32 btree_force_tiny_blocks;	/* toggled via debugfs bool */
 };

 u32 scoutfs_option_u32(struct super_block *sb, int token)
 {
+	struct options_sb_info *osi = SCOUTFS_SB(sb)->options;
+
+	/*
+	 * Only the tiny blocks knob has a u32 value, others warn below.
+	 */
+	if (token == Opt_btree_force_tiny_blocks)
+		return osi->btree_force_tiny_blocks;
+
 	WARN_ON_ONCE(1);
 	return 0;
 }
@@ -83,52 +90,6 @@ static int parse_ipv4(struct super_block *sb, char *str,
 	return 0;
 }

-static int parse_bdev_path(struct super_block *sb, substring_t *substr,
-			      char **bdev_path_ret)
-{
-	char *bdev_path;
-	struct inode *bdev_inode;
-	struct path path;
-	bool got_path = false;
-	int ret;
-
-	bdev_path = match_strdup(substr);
-	if (!bdev_path) {
-		scoutfs_err(sb, "bdev string dup failed");
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = kern_path(bdev_path, LOOKUP_FOLLOW, &path);
-	if (ret) {
-		scoutfs_err(sb, "path %s not found for bdev: error %d",
-			    bdev_path, ret);
-		goto out;
-	}
-	got_path = true;
-
-	bdev_inode = d_inode(path.dentry);
-	if (!S_ISBLK(bdev_inode->i_mode)) {
-		scoutfs_err(sb, "path %s for bdev is not a block device",
-			    bdev_path);
-		ret = -ENOTBLK;
-		goto out;
-	}
-
-out:
-	if (got_path) {
-		path_put(&path);
-	}
-
-	if (ret < 0) {
-		kfree(bdev_path);
-	} else {
-		*bdev_path_ret = bdev_path;
-	}
-
-	return ret;
-}
-
 int scoutfs_parse_options(struct super_block *sb, char *options,
 			  struct mount_options *parsed)
 {
@@ -154,13 +115,6 @@ int scoutfs_parse_options(struct super_block *sb, char *options,
 			if (ret < 0)
 				return ret;
 			break;
-		case Opt_metadev_path:
-
-			ret = parse_bdev_path(sb, &args[0],
-						 &parsed->metadev_path);
-			if (ret < 0)
-				return ret;
-			break;
 		default:
 			scoutfs_err(sb, "Unknown or malformed option, \"%s\"",
 				    p);
@@ -168,11 +122,6 @@ int scoutfs_parse_options(struct super_block *sb, char *options,
 		}
 	}

-	if (!parsed->metadev_path) {
-		scoutfs_err(sb, "Required mount option \"metadev_path\" not found");
-		return -EINVAL;
-	}
-
 	return 0;
 }

@@ -194,6 +143,13 @@ int scoutfs_options_setup(struct super_block *sb)
 		goto out;
 	}

+	if (!debugfs_create_bool("btree_force_tiny_blocks", 0644,
+				 osi->debugfs_dir,
+				 &osi->btree_force_tiny_blocks)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	ret = 0;
 out:
 	if (ret)
@@ -5,15 +5,18 @@
 #include <linux/in.h>
 #include "format.h"

-enum scoutfs_mount_options {
+enum {
+	/*
+	 * For debugging we can quickly create huge trees by limiting
+	 * the number of items in each block as though the blocks were tiny.
+	 */
+	Opt_btree_force_tiny_blocks,
 	Opt_server_addr,
-	Opt_metadev_path,
 	Opt_err,
 };

 struct mount_options {
 	struct sockaddr_in server_addr;
-	char *metadev_path;
 };

 int scoutfs_parse_options(struct super_block *sb, char *options,
@@ -112,13 +112,12 @@ static ktime_t random_to(u32 lo, u32 hi)
 /*
 * The caller is about to read all the quorum blocks.  We invalidate any
 * cached blocks and issue one large contiguous read to repopulate the
- * cache.  The caller then uses normal __bread to read each block.  I'm
+ * cache.  The caller then uses normal sb_bread to read each block.  I'm
 * not a huge fan of the plug but I couldn't get the individual
 * readahead requests merged without it.
 */
 static void readahead_quorum_blocks(struct super_block *sb)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct buffer_head *bh;
 	struct blk_plug plug;
 	int i;
@@ -126,8 +125,7 @@ static void readahead_quorum_blocks(struct super_block *sb)
 	blk_start_plug(&plug);

 	for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
-		bh = __getblk(sbi->meta_bdev, SCOUTFS_QUORUM_BLKNO + i,
-			     SCOUTFS_BLOCK_SM_SIZE);
+		bh = sb_getblk(sb, SCOUTFS_QUORUM_BLKNO + i);
 		if (!bh)
 			continue;

@@ -146,7 +144,7 @@ struct quorum_block_head {
 	struct list_head head;
 	union {
 		struct scoutfs_quorum_block blk;
-		u8 bytes[SCOUTFS_BLOCK_SM_SIZE];
+		u8 bytes[SCOUTFS_BLOCK_SIZE];
 	};
 };

@@ -186,13 +184,13 @@ static size_t quorum_block_bytes(struct scoutfs_quorum_block *blk)
 static bool invalid_quorum_block(struct buffer_head *bh,
 				 struct scoutfs_quorum_block *blk)
 {
-	return bh->b_size != SCOUTFS_BLOCK_SM_SIZE ||
-	       sizeof(struct scoutfs_quorum_block) > SCOUTFS_BLOCK_SM_SIZE ||
+	return bh->b_size != SCOUTFS_BLOCK_SIZE ||
+	       sizeof(struct scoutfs_quorum_block) > SCOUTFS_BLOCK_SIZE ||
 	       quorum_block_crc(blk) != blk->crc ||
 	       le64_to_cpu(blk->blkno) != bh->b_blocknr ||
 	       blk->term == 0 ||
 	       blk->log_nr > SCOUTFS_QUORUM_LOG_MAX ||
-	       quorum_block_bytes(blk) > SCOUTFS_BLOCK_SM_SIZE;
+	       quorum_block_bytes(blk) > SCOUTFS_BLOCK_SIZE;
 }

 /* true if a is stale and should be ignored */
@@ -217,7 +215,6 @@ static bool stale_quorum_block(struct scoutfs_quorum_block *a,
 static int read_quorum_blocks(struct super_block *sb, struct list_head *blocks)
 {
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_quorum_block *blk;
 	struct quorum_block_head *qbh;
 	struct quorum_block_head *tmp;
@@ -230,8 +227,7 @@ static int read_quorum_blocks(struct super_block *sb, struct list_head *blocks)

 	for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
 		brelse(bh);
-		bh = __bread(sbi->meta_bdev, SCOUTFS_QUORUM_BLKNO + i,
-			     SCOUTFS_BLOCK_SM_SIZE);
+		bh = sb_bread(sb, SCOUTFS_QUORUM_BLKNO + i);
 		if (!bh) {
 			scoutfs_inc_counter(sb, quorum_read_block_error);
 			ret = -EIO;
@@ -295,25 +291,23 @@ static int write_quorum_block(struct super_block *sb,
 			      struct scoutfs_quorum_block *our_blk)
 {
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_quorum_block *blk;
 	struct buffer_head *bh = NULL;
 	size_t size;
 	int ret;

-	BUILD_BUG_ON(sizeof(struct scoutfs_quorum_block) >
-		     SCOUTFS_BLOCK_SM_SIZE);
+	BUILD_BUG_ON(sizeof(struct scoutfs_quorum_block) > SCOUTFS_BLOCK_SIZE);

-	bh = __getblk(sbi->meta_bdev, SCOUTFS_QUORUM_BLKNO +
-		      prandom_u32_max(SCOUTFS_QUORUM_BLOCKS),
-		      SCOUTFS_BLOCK_SM_SIZE);
+	bh = sb_getblk(sb, SCOUTFS_QUORUM_BLKNO +
+			   prandom_u32_max(SCOUTFS_QUORUM_BLOCKS));
 	if (bh == NULL) {
 		ret = -EIO;
 		goto out;
 	}

 	size = quorum_block_bytes(our_blk);
-	if (WARN_ON_ONCE(size > SCOUTFS_BLOCK_SM_SIZE || size > bh->b_size)) {
+	if (WARN_ON_ONCE(size > SCOUTFS_BLOCK_SIZE ||
+			 size > bh->b_size)) {
 		ret = -EIO;
 		goto out;
 	}
@@ -536,7 +530,7 @@ int scoutfs_quorum_election(struct super_block *sb, ktime_t timeout_abs,
 	trace_scoutfs_quorum_election(sb, prev_term);

 	super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
-	our_blk = kmalloc(SCOUTFS_BLOCK_SM_SIZE, GFP_NOFS);
+	our_blk = kmalloc(SCOUTFS_BLOCK_SIZE, GFP_NOFS);
 	if (!super || !our_blk) {
 		ret = -ENOMEM;
 		goto out;
@@ -554,7 +548,7 @@ int scoutfs_quorum_election(struct super_block *sb, ktime_t timeout_abs,
 			    SCOUTFS_QUORUM_TERM_HI_MS);

 	for (;;) {
-		memset(our_blk, 0, SCOUTFS_BLOCK_SM_SIZE);
+		memset(our_blk, 0, SCOUTFS_BLOCK_SIZE);

 		scoutfs_inc_counter(sb, quorum_cycle);

@@ -0,0 +1,62 @@
+#ifndef _SCOUTFS_RADIX_H_
+#define _SCOUTFS_RADIX_H_
+
+/*
+ * Declarations for the radix block allocator.
+ *
+ * struct scoutfs_radix_root is embedded by value below so its full
+ * definition has to be visible before this header is included.
+ * NOTE(review): presumably that comes from format.h -- confirm.
+ */
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+
+#include "per_task.h"
+
+struct super_block;
+struct scoutfs_block_writer;
+
+/*
+ * An allocator is a mutex plus the avail and freed radix roots.
+ * NOTE(review): the mutex presumably serializes changes to both roots,
+ * confirm against radix.c.
+ */
+struct scoutfs_radix_allocator {
+	struct mutex mutex;
+	struct scoutfs_radix_root avail;
+	struct scoutfs_radix_root freed;
+};
+
+int scoutfs_radix_alloc(struct super_block *sb,
+			struct scoutfs_radix_allocator *alloc,
+			struct scoutfs_block_writer *wri, u64 *blkno);
+int scoutfs_radix_alloc_data(struct super_block *sb,
+			     struct scoutfs_radix_allocator *alloc,
+			     struct scoutfs_block_writer *wri,
+			     struct scoutfs_radix_root *root,
+			     int count, u64 *blkno_ret, int *count_ret);
+int scoutfs_radix_free(struct super_block *sb,
+		       struct scoutfs_radix_allocator *alloc,
+		       struct scoutfs_block_writer *wri, u64 blkno);
+int scoutfs_radix_free_data(struct super_block *sb,
+			    struct scoutfs_radix_allocator *alloc,
+			    struct scoutfs_block_writer *wri,
+			    struct scoutfs_radix_root *root,
+			    u64 blkno, int count);
+int scoutfs_radix_merge(struct super_block *sb,
+			struct scoutfs_radix_allocator *alloc,
+			struct scoutfs_block_writer *wri,
+			struct scoutfs_radix_root *dst,
+			struct scoutfs_radix_root *src,
+			struct scoutfs_radix_root *inp, bool meta, u64 count);
+void scoutfs_radix_init_alloc(struct scoutfs_radix_allocator *alloc,
+			      struct scoutfs_radix_root *avail,
+			      struct scoutfs_radix_root *freed);
+void scoutfs_radix_root_init(struct super_block *sb,
+			     struct scoutfs_radix_root *root, bool meta);
+u64 scoutfs_radix_root_free_bytes(struct super_block *sb,
+				  struct scoutfs_radix_root *root);
+u64 scoutfs_radix_bit_leaf_nr(u64 bit);
+
+#endif
@@ -37,26 +37,25 @@
 #include "server.h"
 #include "net.h"
 #include "data.h"
-#include "ext.h"

 struct lock_info;

 #define STE_FMT "[%llu %llu %llu 0x%x]"
-#define STE_ARGS(te) (te)->start, (te)->len, (te)->map, (te)->flags
+#define STE_ARGS(te) (te)->iblock, (te)->count, (te)->blkno, (te)->flags
 #define STE_FIELDS(pref)			\
-	__field(__u64, pref##_start)		\
-	__field(__u64, pref##_len)		\
-	__field(__u64, pref##_map)		\
+	__field(__u64, pref##_iblock)		\
+	__field(__u64, pref##_count)		\
+	__field(__u64, pref##_blkno)		\
 	__field(__u8, pref##_flags)
 #define STE_ASSIGN(pref, te)			\
-	__entry->pref##_start = (te)->start;	\
-	__entry->pref##_len = (te)->len;	\
-	__entry->pref##_map = (te)->map;	\
+	__entry->pref##_iblock = (te)->iblock;	\
+	__entry->pref##_count = (te)->count;	\
+	__entry->pref##_blkno = (te)->blkno;	\
 	__entry->pref##_flags = (te)->flags;
 #define STE_ENTRY_ARGS(pref)			\
-	__entry->pref##_start,			\
-	__entry->pref##_len,			\
-	__entry->pref##_map,			\
+	__entry->pref##_iblock,			\
+	__entry->pref##_count,			\
+	__entry->pref##_blkno,			\
 	__entry->pref##_flags

 #define DECLARE_TRACED_EXTENT(name) \
@@ -169,70 +168,36 @@ TRACE_EVENT(scoutfs_data_fallocate,
 		__entry->len, __entry->ret)
 );

-TRACE_EVENT(scoutfs_data_move_blocks,
-	TP_PROTO(struct super_block *sb, u64 from_ino, u64 from_start, u64 len,
-		 u64 map, u8 flags, u64 to_ino, u64 to_start),
-
-	TP_ARGS(sb, from_ino, from_start, len, map, flags, to_ino, to_start),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, from_ino)
-		__field(__u64, from_start)
-		__field(__u64, len)
-		__field(__u64, map)
-		__field(__u8, flags)
-		__field(__u64, to_ino)
-		__field(__u64, to_start)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->from_ino = from_ino;
-		__entry->from_start = from_start;
-		__entry->len = len;
-		__entry->map = map;
-		__entry->flags = flags;
-		__entry->to_ino = to_ino;
-		__entry->to_start = to_start;
-	),
-
-	TP_printk(SCSBF" from_ino %llu from_start %llu len %llu map %llu flags 0x%x to_ino %llu to_start %llu\n",
-		SCSB_TRACE_ARGS, __entry->from_ino, __entry->from_start,
-		__entry->len, __entry->map, __entry->flags, __entry->to_ino,
-		__entry->to_start)
-);
-
 TRACE_EVENT(scoutfs_data_fiemap,
-	TP_PROTO(struct super_block *sb, __u64 start, __u64 len, int ret),
+	TP_PROTO(struct super_block *sb, __u64 off, int i, __u64 blkno),


-	TP_ARGS(sb, start, len, ret),
+	TP_ARGS(sb, off, i, blkno),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
-		__field(__u64, start)
-		__field(__u64, len)
-		__field(int, ret)
+		__field(__u64, off)
+		__field(int, i)		/* signed, printed with %d */
+		__field(__u64, blkno)
 	),

 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
-		__entry->start = start;
-		__entry->len = len;
-		__entry->ret = ret;
+		__entry->off = off;
+		__entry->i = i;
+		__entry->blkno = blkno;
 	),

-	TP_printk(SCSBF" start %llu len %llu ret %d", SCSB_TRACE_ARGS,
-		  __entry->start, __entry->len, __entry->ret)
+	TP_printk(SCSBF" blk_off %llu i %d blkno %llu", SCSB_TRACE_ARGS,
+		  __entry->off, __entry->i, __entry->blkno)
 );

 TRACE_EVENT(scoutfs_get_block,
 	TP_PROTO(struct super_block *sb, __u64 ino, __u64 iblock,
-		 int create, struct scoutfs_extent *ext,
+		 int create, struct scoutfs_traced_extent *te,
 		 int ret, __u64 blkno, size_t size),

-	TP_ARGS(sb, ino, iblock, create, ext, ret, blkno, size),
+	TP_ARGS(sb, ino, iblock, create, te, ret, blkno, size),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
@@ -250,7 +215,7 @@ TRACE_EVENT(scoutfs_get_block,
 		__entry->ino = ino;
 		__entry->iblock = iblock;
 		__entry->create = create;
-		STE_ASSIGN(ext, ext)
+		STE_ASSIGN(ext, te)
 		__entry->ret = ret;
 		__entry->blkno = blkno;
 		__entry->size = size;
@@ -262,35 +227,11 @@ TRACE_EVENT(scoutfs_get_block,
 		  __entry->blkno, __entry->size)
 );

-TRACE_EVENT(scoutfs_data_alloc_block_enter,
-	TP_PROTO(struct super_block *sb, __u64 ino, __u64 iblock,
-		 struct scoutfs_extent *ext),
+TRACE_EVENT(scoutfs_data_file_extent_class,
+	TP_PROTO(struct super_block *sb, __u64 ino,
+		 struct scoutfs_traced_extent *te),

-	TP_ARGS(sb, ino, iblock, ext),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, ino)
-		__field(__u64, iblock)
-		STE_FIELDS(ext)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->ino = ino;
-		__entry->iblock = iblock;
-		STE_ASSIGN(ext, ext)
-	),
-
-	TP_printk(SCSBF" ino %llu iblock %llu ext "STE_FMT,
-		  SCSB_TRACE_ARGS, __entry->ino, __entry->iblock,
-		  STE_ENTRY_ARGS(ext))
-);
-
-DECLARE_EVENT_CLASS(scoutfs_data_file_extent_class,
-	TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext),
-
-	TP_ARGS(sb, ino, ext),
+	TP_ARGS(sb, ino, te),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
@@ -301,35 +242,36 @@ DECLARE_EVENT_CLASS(scoutfs_data_file_extent_class,
 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
 		__entry->ino = ino;
-		STE_ASSIGN(ext, ext)
+		STE_ASSIGN(ext, te)
 	),

 	TP_printk(SCSBF" ino %llu ext "STE_FMT,
 		  SCSB_TRACE_ARGS, __entry->ino, STE_ENTRY_ARGS(ext))
 );
-DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_alloc,
-	TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext),
-	TP_ARGS(sb, ino, ext)
+DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_alloc_block,
+	TP_PROTO(struct super_block *sb, __u64 ino,
+		 struct scoutfs_traced_extent *te),
+	TP_ARGS(sb, ino, te)
 );
-DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_prealloc,
-	TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext),
-	TP_ARGS(sb, ino, ext)
+DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_convert_unwritten,
+	TP_PROTO(struct super_block *sb, __u64 ino,
+		 struct scoutfs_traced_extent *te),
+	TP_ARGS(sb, ino, te)
 );
-DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_get_block_found,
-	TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext),
-	TP_ARGS(sb, ino, ext)
-);
-DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_get_block_mapped,
-	TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext),
-	TP_ARGS(sb, ino, ext)
+DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_prealloc_unwritten,
+	TP_PROTO(struct super_block *sb, __u64 ino,
+		 struct scoutfs_traced_extent *te),
+	TP_ARGS(sb, ino, te)
 );
 DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_extent_truncated,
-	TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext),
-	TP_ARGS(sb, ino, ext)
+	TP_PROTO(struct super_block *sb, __u64 ino,
+		 struct scoutfs_traced_extent *te),
+	TP_ARGS(sb, ino, te)
 );
 DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_fiemap_extent,
-	TP_PROTO(struct super_block *sb, __u64 ino, struct scoutfs_extent *ext),
-	TP_ARGS(sb, ino, ext)
+	TP_PROTO(struct super_block *sb, __u64 ino,
+		 struct scoutfs_traced_extent *te),
+	TP_ARGS(sb, ino, te)
 );

 TRACE_EVENT(scoutfs_data_truncate_items,
@@ -357,9 +299,9 @@ TRACE_EVENT(scoutfs_data_truncate_items,

 TRACE_EVENT(scoutfs_data_wait_check,
 	TP_PROTO(struct super_block *sb, __u64 ino, __u64 pos, __u64 len,
-		 __u8 sef, __u8 op, struct scoutfs_extent *ext, int ret),
+		 __u8 sef, __u8 op, struct scoutfs_traced_extent *te, int ret),

-	TP_ARGS(sb, ino, pos, len, sef, op, ext, ret),
+	TP_ARGS(sb, ino, pos, len, sef, op, te, ret),

 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
@@ -379,7 +321,7 @@ TRACE_EVENT(scoutfs_data_wait_check,
 		__entry->len = len;
 		__entry->sef = sef;
 		__entry->op = op;
-		STE_ASSIGN(ext, ext)
+		STE_ASSIGN(ext, te)
 		__entry->ret = ret;
 	),

@@ -564,22 +506,22 @@ TRACE_EVENT(scoutfs_ioc_release,
 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
 		__field(__u64, ino)
-		__field(__u64, offset)
-		__field(__u64, length)
+		__field(__u64, block)
+		__field(__u64, count)
 		__field(__u64, vers)
 	),

 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
 		__entry->ino = ino;
-		__entry->offset = args->offset;
-		__entry->length = args->length;
+		__entry->block = args->block;
+		__entry->count = args->count;
 		__entry->vers = args->data_version;
 	),

-	TP_printk(SCSBF" ino %llu offset %llu length %llu vers %llu",
-		  SCSB_TRACE_ARGS, __entry->ino, __entry->offset,
-		  __entry->length, __entry->vers)
+	TP_printk(SCSBF" ino %llu block %llu count %llu vers %llu",
+		  SCSB_TRACE_ARGS, __entry->ino, __entry->block,
+		  __entry->count, __entry->vers)
 );

 DEFINE_EVENT(scoutfs_ino_ret_class, scoutfs_ioc_release_ret,
@@ -598,7 +540,7 @@ TRACE_EVENT(scoutfs_ioc_stage,
 		__field(__u64, ino)
 		__field(__u64, vers)
 		__field(__u64, offset)
-		__field(__s32, length)
+		__field(__s32, count)
 	),

 	TP_fast_assign(
@@ -606,12 +548,12 @@ TRACE_EVENT(scoutfs_ioc_stage,
 		__entry->ino = ino;
 		__entry->vers = args->data_version;
 		__entry->offset = args->offset;
-		__entry->length = args->length;
+		__entry->count = args->count;
 	),

-	TP_printk(SCSBF" ino %llu vers %llu offset %llu length %d",
+	TP_printk(SCSBF" ino %llu vers %llu offset %llu count %d",
 		  SCSB_TRACE_ARGS, __entry->ino, __entry->vers,
-		  __entry->offset, __entry->length)
+		  __entry->offset, __entry->count)
 );

 TRACE_EVENT(scoutfs_ioc_data_wait_err,
@@ -1733,43 +1675,6 @@ TRACE_EVENT(scoutfs_btree_dirty_block,
 		  __entry->bt_blkno, __entry->bt_seq)
 );

-TRACE_EVENT(scoutfs_btree_walk,
-	TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *root,
-		 struct scoutfs_key *key, int flags, int level,
-		 struct scoutfs_btree_ref *ref),
-
-	TP_ARGS(sb, root, key, flags, level, ref),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, root_blkno)
-		__field(__u64, root_seq)
-		__field(__u8, root_height)
-		sk_trace_define(key)
-		__field(int, flags)
-		__field(int, level)
-		__field(__u64, ref_blkno)
-		__field(__u64, ref_seq)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->root_blkno = le64_to_cpu(root->ref.blkno);
-		__entry->root_seq = le64_to_cpu(root->ref.seq);
-		__entry->root_height = root->height;
-		sk_trace_assign(key, key);
-		__entry->flags = flags;
-		__entry->level = level;
-		__entry->ref_blkno = le64_to_cpu(ref->blkno);
-		__entry->ref_seq = le64_to_cpu(ref->seq);
-	),
-
-	TP_printk(SCSBF" root blkno %llu seq %llu height %u key "SK_FMT" flags 0x%x level %d ref blkno %llu seq %llu",
-		  SCSB_TRACE_ARGS, __entry->root_blkno, __entry->root_seq,
-		  __entry->root_height, sk_trace_args(key), __entry->flags,
-		  __entry->level, __entry->ref_blkno, __entry->ref_seq)
-);
-
 TRACE_EVENT(scoutfs_online_offline_blocks,
 	TP_PROTO(struct inode *inode, s64 on_delta, s64 off_delta,
 		 u64 on_now, u64 off_now),
@@ -2139,12 +2044,12 @@ TRACE_EVENT(scoutfs_forest_prepare_commit,
 		  __entry->bloom_blkno, __entry->bloom_seq)
 );

-TRACE_EVENT(scoutfs_forest_using_roots,
-	TP_PROTO(struct super_block *sb, struct scoutfs_btree_root *fs_root,
-		 struct scoutfs_btree_root *logs_root),
-	TP_ARGS(sb, fs_root, logs_root),
+TRACE_EVENT(scoutfs_forest_read_super,
+	TP_PROTO(struct super_block *sb, struct scoutfs_super_block *super),
+	TP_ARGS(sb, super),
 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
+		__field(__u64, hdr_seq)
 		__field(__u64, fs_blkno)
 		__field(__u64, fs_seq)
 		__field(__u64, logs_blkno)
@@ -2152,21 +2057,24 @@ TRACE_EVENT(scoutfs_forest_using_roots,
 	),
 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
-		__entry->fs_blkno = le64_to_cpu(fs_root->ref.blkno);
-		__entry->fs_seq = le64_to_cpu(fs_root->ref.seq);
-		__entry->logs_blkno = le64_to_cpu(logs_root->ref.blkno);
-		__entry->logs_seq = le64_to_cpu(logs_root->ref.seq);
+		__entry->hdr_seq = le64_to_cpu(super->hdr.seq);
+		__entry->fs_blkno = le64_to_cpu(super->fs_root.ref.blkno);
+		__entry->fs_seq = le64_to_cpu(super->fs_root.ref.seq);
+		__entry->logs_blkno = le64_to_cpu(super->logs_root.ref.blkno);
+		__entry->logs_seq = le64_to_cpu(super->logs_root.ref.seq);
 	),
-	TP_printk(SCSBF" fs blkno %llu seq %llu logs blkno %llu seq %llu",
-		  SCSB_TRACE_ARGS, __entry->fs_blkno, __entry->fs_seq,
-		  __entry->logs_blkno, __entry->logs_seq)
+	TP_printk(SCSBF" hdr seq %llu fs blkno %llu seq %llu logs blkno %llu seq %llu",
+		  SCSB_TRACE_ARGS, __entry->hdr_seq, __entry->fs_blkno,
+		  __entry->fs_seq, __entry->logs_blkno, __entry->logs_seq)
 );

-TRACE_EVENT(scoutfs_forest_init_our_log,
-	TP_PROTO(struct super_block *sb, u64 rid, u64 nr, u64 blkno, u64 seq),
-	TP_ARGS(sb, rid, nr, blkno, seq),
+TRACE_EVENT(scoutfs_forest_add_root,
+	TP_PROTO(struct super_block *sb, struct scoutfs_key *key, u64 rid,
+		 u64 nr, u64 blkno, u64 seq),
+	TP_ARGS(sb, key, rid, nr, blkno, seq),
 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
+		sk_trace_define(key)
 		__field(__u64, b_rid)
 		__field(__u64, nr)
 		__field(__u64, blkno)
@@ -2174,14 +2082,71 @@ TRACE_EVENT(scoutfs_forest_init_our_log,
 	),
 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
+		sk_trace_assign(key, key);
 		__entry->b_rid = rid;
 		__entry->nr = nr;
 		__entry->blkno = blkno;
 		__entry->seq = seq;
 	),
-	TP_printk(SCSBF" rid %016llx nr %llu blkno %llu seq %llx",
+	TP_printk(SCSBF" key "SK_FMT" rid %016llx nr %llu blkno %llu seq %llx",
+		  SCSB_TRACE_ARGS, sk_trace_args(key),
+		  __entry->b_rid, __entry->nr, __entry->blkno, __entry->seq)
+);
+
+TRACE_EVENT(scoutfs_forest_iter_search,
+	TP_PROTO(struct super_block *sb, u64 rid, u64 nr, u64 vers,
+		 u8 flags, struct scoutfs_key *key),
+	TP_ARGS(sb, rid, nr, vers, flags, key),
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, b_rid)
+		__field(__u64, nr)
+		__field(__u64, vers)
+		__field(__u8, flags)
+		sk_trace_define(key)
+	),
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->b_rid = rid;
+		__entry->nr = nr;
+		__entry->vers = vers;
+		__entry->flags = flags;
+		sk_trace_assign(key, key);
+	),
+	TP_printk(SCSBF" rid %016llx nr %llu vers %llu flags %x key "SK_FMT,
 		  SCSB_TRACE_ARGS, __entry->b_rid, __entry->nr,
-		  __entry->blkno, __entry->seq)
+		  __entry->vers, __entry->flags, sk_trace_args(key))
+);
+
+TRACE_EVENT(scoutfs_forest_iter_ret,
+	TP_PROTO(struct super_block *sb, struct scoutfs_key *key,
+		 struct scoutfs_key *end, bool forward, int ret,
+		 u64 found_vers, int found_ret, struct scoutfs_key *found),
+	TP_ARGS(sb, key, end, forward, ret, found_vers, found_ret, found),
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		sk_trace_define(key)
+		sk_trace_define(end)
+		__field(char, forward)
+		__field(int, ret)
+		__field(__u64, found_vers)
+		__field(int, found_ret)
+		sk_trace_define(found)
+	),
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		sk_trace_assign(key, key);
+		sk_trace_assign(end, end);
+		__entry->forward = !!forward;
+		__entry->ret = ret;
+		__entry->found_vers = found_vers;
+		__entry->found_ret = found_ret;
+		sk_trace_assign(found, found);
+	),
+	TP_printk(SCSBF" key "SK_FMT" end "SK_FMT" fwd %u ret %d fv %llu fc %d f "SK_FMT,
+		  SCSB_TRACE_ARGS, sk_trace_args(key), sk_trace_args(end),
+		  __entry->forward, __entry->ret, __entry->found_vers,
+		  __entry->found_ret, sk_trace_args(found))
 );

 DECLARE_EVENT_CLASS(scoutfs_block_class,
@@ -2241,6 +2206,11 @@ DEFINE_EVENT(scoutfs_block_class, scoutfs_block_invalidate,
 		 int refcount, int io_count, unsigned long bits, u64 lru_moved),
 	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, lru_moved)
 );
+DEFINE_EVENT(scoutfs_block_class, scoutfs_block_move,
+	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
+		 int refcount, int io_count, unsigned long bits, u64 lru_moved),
+	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, lru_moved)
+);
 DEFINE_EVENT(scoutfs_block_class, scoutfs_block_mark_dirty,
 	TP_PROTO(struct super_block *sb, void *bp, u64 blkno,
 		 int refcount, int io_count, unsigned long bits, u64 lru_moved),
@@ -2257,243 +2227,171 @@ DEFINE_EVENT(scoutfs_block_class, scoutfs_block_shrink,
 	TP_ARGS(sb, bp, blkno, refcount, io_count, bits, lru_moved)
 );

-DECLARE_EVENT_CLASS(scoutfs_ext_next_class,
-	TP_PROTO(struct super_block *sb, u64 start, u64 len,
-		 struct scoutfs_extent *ext, int ret),
-
-	TP_ARGS(sb, start, len, ext, ret),
-
+TRACE_EVENT(scoutfs_radix_dirty,
+	TP_PROTO(struct super_block *sb, struct scoutfs_radix_root *root,
+		 u64 orig_blkno, u64 dirty_blkno, u64 par_blkno),
+	TP_ARGS(sb, root, orig_blkno, dirty_blkno, par_blkno),
 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
-		__field(__u64, start)
-		__field(__u64, len)
-		STE_FIELDS(ext)
-		__field(int, ret)
+		__field(__u64, root_blkno)
+		__field(__u64, orig_blkno)
+		__field(__u64, dirty_blkno)
+		__field(__u64, par_blkno)
 	),
-
 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
-		__entry->start = start;
-		__entry->len = len;
-		STE_ASSIGN(ext, ext)
-		__entry->ret = ret;
+		__entry->root_blkno = le64_to_cpu(root->ref.blkno);
+		__entry->orig_blkno = orig_blkno;
+		__entry->dirty_blkno = dirty_blkno;
+		__entry->par_blkno = par_blkno;
 	),
-
-	TP_printk(SCSBF" start %llu len %llu ext "STE_FMT" ret %d",
-		  SCSB_TRACE_ARGS, __entry->start, __entry->len,
-		  STE_ENTRY_ARGS(ext), __entry->ret)
+	TP_printk(SCSBF" root_blkno %llu orig_blkno %llu dirty_blkno %llu par_blkno %llu",
+		  SCSB_TRACE_ARGS, __entry->root_blkno, __entry->orig_blkno,
+		  __entry->dirty_blkno, __entry->par_blkno)
 );

-DEFINE_EVENT(scoutfs_ext_next_class, scoutfs_ext_op_next,
-	TP_PROTO(struct super_block *sb, u64 start, u64 len,
-		 struct scoutfs_extent *ext, int ret),
-	TP_ARGS(sb, start, len, ext, ret)
-);
-DEFINE_EVENT(scoutfs_ext_next_class, scoutfs_ext_next,
-	TP_PROTO(struct super_block *sb, u64 start, u64 len,
-		 struct scoutfs_extent *ext, int ret),
-	TP_ARGS(sb, start, len, ext, ret)
-);
-
-DECLARE_EVENT_CLASS(scoutfs_ext_typical_class,
-	TP_PROTO(struct super_block *sb, u64 start, u64 len, u64 map, u8 flags,
-		 int ret),
-
-	TP_ARGS(sb, start, len, map, flags, ret),
-
+TRACE_EVENT(scoutfs_radix_walk,
+	TP_PROTO(struct super_block *sb, struct scoutfs_radix_root *root,
+		 int grl, int level, u64 blkno, int ind, u64 bit, u64 next),
+	TP_ARGS(sb, root, grl, level, blkno, ind, bit, next),
 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
-		__field(__u64, start)
-		__field(__u64, len)
-		__field(__u64, map)
-		__field(__u8, flags)
-		__field(int, ret)
+		__field(__u64, root_blkno)
+		__field(unsigned int, grl)
+		__field(__u64, blkno)
+		__field(int, level)
+		__field(int, ind)
+		__field(__u64, bit)
+		__field(__u64, next)
 	),
-
 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
-		__entry->start = start;
-		__entry->len = len;
-		__entry->map = map;
-		__entry->flags = flags;
-		__entry->ret = ret;
+		__entry->root_blkno = le64_to_cpu(root->ref.blkno);
+		__entry->grl = grl;
+		__entry->blkno = blkno;
+		__entry->level = level;
+		__entry->ind = ind;
+		__entry->bit = bit;
+		__entry->next = next;
 	),
-
-	TP_printk(SCSBF" start %llu len %llu map %llu flags %u ret %d",
-		  SCSB_TRACE_ARGS, __entry->start, __entry->len, __entry->map,
-		  __entry->flags, __entry->ret)
+	TP_printk(SCSBF" root_blkno %llu grl 0x%x blkno %llu level %d ind %d bit %llu next %llu",
+		  SCSB_TRACE_ARGS, __entry->root_blkno, __entry->grl,
+		  __entry->blkno, __entry->level, __entry->ind, __entry->bit,
+		  __entry->next)
 );

-DEFINE_EVENT(scoutfs_ext_typical_class, scoutfs_ext_op_insert,
-	TP_PROTO(struct super_block *sb, u64 start, u64 len, u64 map, u8 flags,
-		 int ret),
-	TP_ARGS(sb, start, len, map, flags, ret)
-);
-DEFINE_EVENT(scoutfs_ext_typical_class, scoutfs_ext_insert,
-	TP_PROTO(struct super_block *sb, u64 start, u64 len, u64 map, u8 flags,
-		 int ret),
-	TP_ARGS(sb, start, len, map, flags, ret)
-);
-DEFINE_EVENT(scoutfs_ext_typical_class, scoutfs_ext_op_remove,
-	TP_PROTO(struct super_block *sb, u64 start, u64 len, u64 map, u8 flags,
-		 int ret),
-	TP_ARGS(sb, start, len, map, flags, ret)
-);
-DEFINE_EVENT(scoutfs_ext_typical_class, scoutfs_ext_remove,
-	TP_PROTO(struct super_block *sb, u64 start, u64 len, u64 map, u8 flags,
-		 int ret),
-	TP_ARGS(sb, start, len, map, flags, ret)
-);
-DEFINE_EVENT(scoutfs_ext_typical_class, scoutfs_ext_set,
-	TP_PROTO(struct super_block *sb, u64 start, u64 len, u64 map, u8 flags,
-		 int ret),
-	TP_ARGS(sb, start, len, map, flags, ret)
-);
-
-TRACE_EVENT(scoutfs_ext_alloc,
-	TP_PROTO(struct super_block *sb, u64 start, u64 len, u64 count,
-		 struct scoutfs_extent *ext, int ret),
-
-	TP_ARGS(sb, start, len, count, ext, ret),
-
+TRACE_EVENT(scoutfs_radix_fixup_refs,
+	TP_PROTO(struct super_block *sb, struct scoutfs_radix_root *root,
+		 u32 sm_first, u64 sm_total, u16 lg_first, u64 lg_total,
+		 u64 blkno, int level),
+	TP_ARGS(sb, root, sm_first, sm_total, lg_first, lg_total, blkno, level),
 	TP_STRUCT__entry(
 		SCSB_TRACE_FIELDS
-		__field(__u64, start)
-		__field(__u64, len)
+		__field(__u64, root_blkno)
+		__field(__u32, sm_first)
+		__field(__u64, sm_total)
+		__field(__u16, lg_first)
+		__field(__u64, lg_total)
+		__field(__u64, blkno)
+		__field(int, level)
+	),
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->root_blkno = le64_to_cpu(root->ref.blkno);
+		__entry->sm_first = sm_first;
+		__entry->sm_total = sm_total;
+		__entry->lg_first = lg_first;
+		__entry->lg_total = lg_total;
+		__entry->blkno = blkno;
+		__entry->level = level;
+	),
+	TP_printk(SCSBF" root_blkno %llu sm_first %u sm_total %llu lg_first %u lg_total %llu blkno %llu level %d",
+		  SCSB_TRACE_ARGS, __entry->root_blkno, __entry->sm_first,
+		  __entry->sm_total, __entry->lg_first, __entry->lg_total,
+		  __entry->blkno, __entry->level)
+);
+
+DECLARE_EVENT_CLASS(scoutfs_radix_bitop,
+	TP_PROTO(struct super_block *sb, struct scoutfs_radix_root *root,
+		 u64 blkno, u64 bit, int ind, int nbits),
+	TP_ARGS(sb, root, blkno, bit, ind, nbits),
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, root_blkno)
+		__field(__u64, blkno)
+		__field(__u64, bit)
+		__field(int, ind)
+		__field(int, nbits)
+	),
+	TP_fast_assign(
+		SCSB_TRACE_ASSIGN(sb);
+		__entry->root_blkno = le64_to_cpu(root->ref.blkno);
+		__entry->blkno = blkno;
+		__entry->bit = bit;
+		__entry->ind = ind;
+		__entry->nbits = nbits;
+	),
+	TP_printk(SCSBF" root_blkno %llu blkno %llu bit %llu ind %d nbits %d",
+		  SCSB_TRACE_ARGS, __entry->root_blkno, __entry->blkno,
+		  __entry->bit, __entry->ind, __entry->nbits)
+);
+DEFINE_EVENT(scoutfs_radix_bitop, scoutfs_radix_clear,
+	TP_PROTO(struct super_block *sb, struct scoutfs_radix_root *root,
+		 u64 blkno, u64 bit, int ind, int nbits),
+	TP_ARGS(sb, root, blkno, bit, ind, nbits)
+);
+DEFINE_EVENT(scoutfs_radix_bitop, scoutfs_radix_set,
+	TP_PROTO(struct super_block *sb, struct scoutfs_radix_root *root,
+		 u64 blkno, u64 bit, int ind, int nbits),
+	TP_ARGS(sb, root, blkno, bit, ind, nbits)
+);
+
+TRACE_EVENT(scoutfs_radix_merge,
+	TP_PROTO(struct super_block *sb,
+		 struct scoutfs_radix_root *inp, u64 inp_blkno,
+		 struct scoutfs_radix_root *src, u64 src_blkno,
+		 struct scoutfs_radix_root *dst, u64 dst_blkno,
+		 u64 count, u64 leaf_bit, int ind, int sm_delta,
+		 int src_lg_delta, int dst_lg_delta),
+	TP_ARGS(sb, inp, inp_blkno, src, src_blkno, dst, dst_blkno, count,
+		leaf_bit, ind, sm_delta, src_lg_delta, dst_lg_delta),
+	TP_STRUCT__entry(
+		SCSB_TRACE_FIELDS
+		__field(__u64, inp_root_blkno)
+		__field(__u64, inp_blkno)
+		__field(__u64, src_root_blkno)
+		__field(__u64, src_blkno)
+		__field(__u64, dst_root_blkno)
+		__field(__u64, dst_blkno)
 		__field(__u64, count)
-		STE_FIELDS(ext)
-		__field(int, ret)
+		__field(__u64, leaf_bit)
+		__field(int, ind)
+		__field(int, sm_delta)
+		__field(int, src_lg_delta)
+		__field(int, dst_lg_delta)
 	),
-
 	TP_fast_assign(
 		SCSB_TRACE_ASSIGN(sb);
-		__entry->start = start;
-		__entry->len = len;
+		__entry->inp_root_blkno = le64_to_cpu(inp->ref.blkno);
+		__entry->inp_blkno = inp_blkno;
+		__entry->src_root_blkno = le64_to_cpu(src->ref.blkno);
+		__entry->src_blkno = src_blkno;
+		__entry->dst_root_blkno = le64_to_cpu(dst->ref.blkno);
+		__entry->dst_blkno = dst_blkno;
 		__entry->count = count;
-		STE_ASSIGN(ext, ext)
-		__entry->ret = ret;
+		__entry->leaf_bit = leaf_bit;
+		__entry->ind = ind;
+		__entry->sm_delta = sm_delta;
+		__entry->src_lg_delta = src_lg_delta;
+		__entry->dst_lg_delta = dst_lg_delta;
 	),
-
-	TP_printk(SCSBF" start %llu len %llu count %llu ext "STE_FMT" ret %d",
-		  SCSB_TRACE_ARGS, __entry->start, __entry->len, __entry->count,
-		  STE_ENTRY_ARGS(ext), __entry->ret)
-);
-
-TRACE_EVENT(scoutfs_alloc_alloc_meta,
-	TP_PROTO(struct super_block *sb, u64 blkno, int ret),
-
-	TP_ARGS(sb, blkno, ret),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, blkno)
-		__field(int, ret)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->blkno = blkno;
-		__entry->ret = ret;
-	),
-
-	TP_printk(SCSBF" blkno %llu ret %d",
-		  SCSB_TRACE_ARGS, __entry->blkno, __entry->ret)
-);
-
-TRACE_EVENT(scoutfs_alloc_free_meta,
-	TP_PROTO(struct super_block *sb, u64 blkno, int ret),
-
-	TP_ARGS(sb, blkno, ret),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, blkno)
-		__field(int, ret)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->blkno = blkno;
-		__entry->ret = ret;
-	),
-
-	TP_printk(SCSBF" blkno %llu ret %d",
-		  SCSB_TRACE_ARGS, __entry->blkno, __entry->ret)
-);
-
-TRACE_EVENT(scoutfs_alloc_alloc_data,
-	TP_PROTO(struct super_block *sb, u64 req, u64 blkno, u64 count,
-		 int ret),
-
-	TP_ARGS(sb, req, blkno, count, ret),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, req)
-		__field(__u64, blkno)
-		__field(__u64, count)
-		__field(int, ret)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->req = req;
-		__entry->blkno = blkno;
-		__entry->count = count;
-		__entry->ret = ret;
-	),
-
-	TP_printk(SCSBF" req %llu blkno %llu count %llu ret %d",
-		  SCSB_TRACE_ARGS, __entry->req, __entry->blkno,
-		  __entry->count, __entry->ret)
-);
-
-TRACE_EVENT(scoutfs_alloc_free_data,
-	TP_PROTO(struct super_block *sb, u64 blkno, u64 count, int ret),
-
-	TP_ARGS(sb, blkno, count, ret),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, blkno)
-		__field(__u64, count)
-		__field(int, ret)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->blkno = blkno;
-		__entry->count = count;
-		__entry->ret = ret;
-	),
-
-	TP_printk(SCSBF" blkno %llu count %llu ret %d",
-		  SCSB_TRACE_ARGS, __entry->blkno, __entry->count,
-		  __entry->ret)
-);
-
-TRACE_EVENT(scoutfs_alloc_move,
-	TP_PROTO(struct super_block *sb, u64 total, u64 moved, int ret),
-
-	TP_ARGS(sb, total, moved, ret),
-
-	TP_STRUCT__entry(
-		SCSB_TRACE_FIELDS
-		__field(__u64, total)
-		__field(__u64, moved)
-		__field(int, ret)
-	),
-
-	TP_fast_assign(
-		SCSB_TRACE_ASSIGN(sb);
-		__entry->total = total;
-		__entry->moved = moved;
-		__entry->ret = ret;
-	),
-
-	TP_printk(SCSBF" total %llu moved %llu ret %d",
-		  SCSB_TRACE_ARGS, __entry->total, __entry->moved,
-		  __entry->ret)
+	TP_printk(SCSBF" irb %llu ib %llu srb %llu sb %llu drb %llu db %llu cnt %llu lb %llu ind %d smd %d sld %d dld %d",
+		  SCSB_TRACE_ARGS, __entry->inp_root_blkno, __entry->inp_blkno,
+		  __entry->src_root_blkno, __entry->src_blkno,
+		  __entry->dst_root_blkno, __entry->dst_blkno,
+		  __entry->count, __entry->leaf_bit, __entry->ind,
+		  __entry->sm_delta, __entry->src_lg_delta,
+		  __entry->dst_lg_delta)
 );

 #endif /* _TRACE_SCOUTFS_H */
@@ -58,12 +58,10 @@ do {								\

 int scoutfs_server_lock_request(struct super_block *sb, u64 rid,
 				struct scoutfs_net_lock *nl);
-int scoutfs_server_lock_response(struct super_block *sb, u64 rid, u64 id,
-				 struct scoutfs_net_lock_grant_response *gr);
+int scoutfs_server_lock_response(struct super_block *sb, u64 rid,
+				 u64 id, struct scoutfs_net_lock *nl);
 int scoutfs_server_lock_recover_request(struct super_block *sb, u64 rid,
 					struct scoutfs_key *key);
-void scoutfs_server_get_roots(struct super_block *sb,
-			      struct scoutfs_net_roots *roots);
 int scoutfs_server_hold_commit(struct super_block *sb);
 int scoutfs_server_apply_commit(struct super_block *sb, int err);

@@ -1,71 +0,0 @@
-/*
- * A copy of sort() from upstream with a priv argument that's passed
- * to comparison, like list_sort().
- */
-
-/* ------------------------ */
-
-/*
- * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
- *
- * Jan 23 2005  Matt Mackall <mpm@selenic.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sort.h>
-#include <linux/slab.h>
-#include "sort_priv.h"
-
-/**
- * sort - sort an array of elements
- * @priv: caller's pointer to pass to comparison and swap functions
- * @base: pointer to data to sort
- * @num: number of elements
- * @size: size of each element
- * @cmp_func: pointer to comparison function
- * @swap_func: pointer to swap function or NULL
- *
- * This function does a heapsort on the given array. You may provide a
- * swap_func function optimized to your element type.
- *
- * Sorting time is O(n log n) both on average and worst-case. While
- * qsort is about 20% faster on average, it suffers from exploitable
- * O(n*n) worst-case behavior and extra memory requirements that make
- * it less suitable for kernel use.
- */
-
-void sort_priv(void *priv, void *base, size_t num, size_t size,
-	       int (*cmp_func)(void *priv, const void *, const void *),
-	       void (*swap_func)(void *priv, void *, void *, int size))
-{
-	/* pre-scale counters for performance */
-	int i = (num/2 - 1) * size, n = num * size, c, r;
-
-	/* heapify */
-	for ( ; i >= 0; i -= size) {
-		for (r = i; r * 2 + size < n; r  = c) {
-			c = r * 2 + size;
-			if (c < n - size &&
-			    cmp_func(priv, base + c, base + c + size) < 0)
-				c += size;
-			if (cmp_func(priv, base + r, base + c) >= 0)
-				break;
-			swap_func(priv, base + r, base + c, size);
-		}
-	}
-
-	/* sort */
-	for (i = n - size; i > 0; i -= size) {
-		swap_func(priv, base, base + i, size);
-		for (r = 0; r * 2 + size < i; r = c) {
-			c = r * 2 + size;
-			if (c < i - size &&
-			    cmp_func(priv, base + c, base + c + size) < 0)
-				c += size;
-			if (cmp_func(priv, base + r, base + c) >= 0)
-				break;
-			swap_func(priv, base + r, base + c, size);
-		}
-	}
-}
@@ -1,8 +0,0 @@
-#ifndef _SCOUTFS_SORT_PRIV_H_
-#define _SCOUTFS_SORT_PRIV_H_
-
-void sort_priv(void *priv, void *base, size_t num, size_t size,
-	       int (*cmp_func)(void *priv, const void *, const void *),
-	       void (*swap_func)(void *priv, void *, void *, int size));
-
-#endif
@@ -47,9 +47,9 @@ bool scoutfs_spbm_empty(struct scoutfs_spbm *spbm)
 	return RB_EMPTY_ROOT(&spbm->root);
 }

-enum spbm_flags {
+enum {
 	/* if a node isn't found then return an allocated new node */
-	SPBM_FIND_ALLOC = (1 << 0),
+	SPBM_FIND_ALLOC = 0x1,
 };
 static struct spbm_node *find_node(struct scoutfs_spbm *spbm, u64 index,
 				   int flags)
@@ -1,68 +0,0 @@
-#ifndef _SCOUTFS_SRCH_H_
-#define _SCOUTFS_SRCH_H_
-
-struct scoutfs_block;
-
-struct scoutfs_srch_rb_root {
-	struct rb_root root;
-	struct rb_node *last;
-	unsigned long nr;
-};
-
-struct scoutfs_srch_rb_node {
-	struct rb_node node;
-	u64 ino;
-	u64 id;
-};
-
-#define scoutfs_srch_foreach_rb_node(snode, node, sroot)		\
-	for (node = rb_first(&(sroot)->root);				\
-	     node && (snode = container_of(node, struct scoutfs_srch_rb_node, \
-					   node), 1);			\
-	     node = rb_next(node))
-
-int scoutfs_srch_add(struct super_block *sb,
-		     struct scoutfs_alloc *alloc,
-		     struct scoutfs_block_writer *wri,
-		     struct scoutfs_srch_file *sfl,
-		     struct scoutfs_block **bl_ret,
-		     u64 hash, u64 ino, u64 id);
-
-void scoutfs_srch_destroy_rb_root(struct scoutfs_srch_rb_root *sroot);
-int scoutfs_srch_search_xattrs(struct super_block *sb,
-			       struct scoutfs_srch_rb_root *sroot,
-			       u64 hash, u64 ino, u64 last_ino, bool *done);
-
-int scoutfs_srch_rotate_log(struct super_block *sb,
-			    struct scoutfs_alloc *alloc,
-			    struct scoutfs_block_writer *wri,
-			    struct scoutfs_btree_root *root,
-			    struct scoutfs_srch_file *sfl);
-int scoutfs_srch_get_compact(struct super_block *sb,
-			     struct scoutfs_alloc *alloc,
-			     struct scoutfs_block_writer *wri,
-			     struct scoutfs_btree_root *root,
-			     u64 rid, struct scoutfs_srch_compact *sc);
-int scoutfs_srch_update_compact(struct super_block *sb,
-				struct scoutfs_alloc *alloc,
-				struct scoutfs_block_writer *wri,
-				struct scoutfs_btree_root *root, u64 rid,
-				struct scoutfs_srch_compact *sc);
-int scoutfs_srch_commit_compact(struct super_block *sb,
-				struct scoutfs_alloc *alloc,
-				struct scoutfs_block_writer *wri,
-				struct scoutfs_btree_root *root, u64 rid,
-				struct scoutfs_srch_compact *res,
-				struct scoutfs_alloc_list_head *av,
-				struct scoutfs_alloc_list_head *fr);
-int scoutfs_srch_cancel_compact(struct super_block *sb,
-				struct scoutfs_alloc *alloc,
-				struct scoutfs_block_writer *wri,
-				struct scoutfs_btree_root *root, u64 rid,
-				struct scoutfs_alloc_list_head *av,
-				struct scoutfs_alloc_list_head *fr);
-
-void scoutfs_srch_destroy(struct super_block *sb);
-int scoutfs_srch_setup(struct super_block *sb);
-
-#endif
@@ -41,9 +41,6 @@
 #include "sysfs.h"
 #include "quorum.h"
 #include "forest.h"
-#include "srch.h"
-#include "item.h"
-#include "alloc.h"
 #include "scoutfs_trace.h"

 static struct dentry *scoutfs_debugfs_root;
@@ -79,30 +76,11 @@ retry:
 	return cpu_to_le64(ret);
 }

-struct statfs_free_blocks {
-	u64 meta;
-	u64 data;
-};
-
-static int count_free_blocks(struct super_block *sb, void *arg, int owner,
-			     u64 id, bool meta, bool avail, u64 blocks)
-{
-	struct statfs_free_blocks *sfb = arg;
-
-	if (meta)
-		sfb->meta += blocks;
-	else
-		sfb->data += blocks;
-
-	return 0;
-}
-
 /*
- * Build the free block counts by having alloc read all the persistent
- * blocks which contain allocators and calling us for each of them.
- * Only the super block reads aren't cached so repeatedly calling statfs
- * is like repeated O_DIRECT IO.  We can add a cache and stale results
- * if that IO becomes a problem.
+ * Ask the server for the current statfs fields.  The message is very
+ * cheap so we're not worrying about spinning in statfs flooding the
+ * server with requests.  We can add a cache and stale results if that
+ * becomes a problem.
 *
 * We fake the number of free inodes value by assuming that we can fill
 * free blocks with a certain number of inodes.  We then the number of
@@ -115,50 +93,30 @@ static int count_free_blocks(struct super_block *sb, void *arg, int owner,
 static int scoutfs_statfs(struct dentry *dentry, struct kstatfs *kst)
 {
 	struct super_block *sb = dentry->d_inode->i_sb;
-	struct scoutfs_super_block *super = NULL;
-	struct statfs_free_blocks sfb = {0,};
+	struct scoutfs_net_statfs nstatfs;
 	__le32 uuid[4];
 	int ret;

-	scoutfs_inc_counter(sb, statfs);
-
-	super = kzalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
-	if (!super) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = scoutfs_read_super(sb, super);
+	ret = scoutfs_client_statfs(sb, &nstatfs);
 	if (ret)
-		goto out;
+		return ret;

-	ret = scoutfs_alloc_foreach(sb, count_free_blocks, &sfb);
-	if (ret < 0)
-		goto out;
-
-	kst->f_bfree = (sfb.meta << SCOUTFS_BLOCK_SM_LG_SHIFT) + sfb.data;
+	kst->f_bfree = le64_to_cpu(nstatfs.bfree);
 	kst->f_type = SCOUTFS_SUPER_MAGIC;
-	kst->f_bsize = SCOUTFS_BLOCK_SM_SIZE;
-	kst->f_blocks = (le64_to_cpu(super->total_meta_blocks) <<
-			 SCOUTFS_BLOCK_SM_LG_SHIFT) +
-			le64_to_cpu(super->total_data_blocks);
+	kst->f_bsize = SCOUTFS_BLOCK_SIZE;
+	kst->f_blocks = le64_to_cpu(nstatfs.total_blocks);
 	kst->f_bavail = kst->f_bfree;

-	/* arbitrarily assume ~1K / empty file */
-	kst->f_ffree = sfb.meta * (SCOUTFS_BLOCK_LG_SIZE / 1024);
-	kst->f_files = kst->f_ffree + le64_to_cpu(super->next_ino);
+	kst->f_ffree = kst->f_bfree * 16;
+	kst->f_files = kst->f_ffree + le64_to_cpu(nstatfs.next_ino);

-	BUILD_BUG_ON(sizeof(uuid) != sizeof(super->uuid));
-	memcpy(uuid, super->uuid, sizeof(uuid));
+	BUILD_BUG_ON(sizeof(uuid) != sizeof(nstatfs.uuid));
+	memcpy(uuid, &nstatfs.uuid, sizeof(uuid));
 	kst->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[1]);
 	kst->f_fsid.val[1] = le32_to_cpu(uuid[2]) ^ le32_to_cpu(uuid[3]);
 	kst->f_namelen = SCOUTFS_NAME_LEN;
-	kst->f_frsize = SCOUTFS_BLOCK_SM_SIZE;
-
+	kst->f_frsize = SCOUTFS_BLOCK_SIZE;
 	/* the vfs fills f_flags */
-	ret = 0;
-out:
-	kfree(super);

 	/*
 	 * We don't take cluster locks in statfs which makes it a very
@@ -168,7 +126,7 @@ out:
 	if (scoutfs_trigger(sb, STATFS_LOCK_PURGE))
 		scoutfs_free_unused_locks(sb, -1UL);

-	return ret;
+	return 0;
 }

 static int scoutfs_show_options(struct seq_file *seq, struct dentry *root)
@@ -177,21 +135,10 @@ static int scoutfs_show_options(struct seq_file *seq, struct dentry *root)
 	struct mount_options *opts = &SCOUTFS_SB(sb)->opts;

 	seq_printf(seq, ",server_addr="SIN_FMT, SIN_ARG(&opts->server_addr));
-	seq_printf(seq, ",metadev_path=%s", opts->metadev_path);

 	return 0;
 }

-static ssize_t metadev_path_show(struct kobject *kobj,
-				 struct kobj_attribute *attr, char *buf)
-{
-	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
-	struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
-
-	return snprintf(buf, PAGE_SIZE, "%s", opts->metadev_path);
-}
-SCOUTFS_ATTR_RO(metadev_path);
-
 static ssize_t server_addr_show(struct kobject *kobj,
 			      struct kobj_attribute *attr, char *buf)
 {
@@ -204,7 +151,6 @@ static ssize_t server_addr_show(struct kobject *kobj,
 SCOUTFS_ATTR_RO(server_addr);

 static struct attribute *mount_options_attrs[] = {
-	SCOUTFS_ATTR_PTR(metadev_path),
 	SCOUTFS_ATTR_PTR(server_addr),
 	NULL,
 };
@@ -217,20 +163,6 @@ static int scoutfs_sync_fs(struct super_block *sb, int wait)
 	return scoutfs_trans_sync(sb, wait);
 }

-/*
- * Data dev is closed by generic code, but we have to explicitly close the meta
- * dev.
- */
-static void scoutfs_metadev_close(struct super_block *sb)
-{
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-
-	if (sbi->meta_bdev) {
-		blkdev_put(sbi->meta_bdev, SCOUTFS_META_BDEV_MODE);
-		sbi->meta_bdev = NULL;
-	}
-}
-
 /*
 * This destroys all the state that's built up in the sb info during
 * mount.  It's called by us on errors during mount if we haven't set
@@ -246,7 +178,6 @@ static void scoutfs_put_super(struct super_block *sb)
 	sbi->shutdown = true;

 	scoutfs_data_destroy(sb);
-	scoutfs_srch_destroy(sb);

 	scoutfs_unlock(sb, sbi->rid_lock, SCOUTFS_LOCK_WRITE);
 	sbi->rid_lock = NULL;
@@ -254,7 +185,6 @@ static void scoutfs_put_super(struct super_block *sb)
 	scoutfs_shutdown_trans(sb);
 	scoutfs_client_destroy(sb);
 	scoutfs_inode_destroy(sb);
-	scoutfs_item_destroy(sb);
 	scoutfs_forest_destroy(sb);

 	/* the server locks the listen address and compacts */
@@ -273,9 +203,6 @@ static void scoutfs_put_super(struct super_block *sb)
 	debugfs_remove(sbi->debug_root);
 	scoutfs_destroy_counters(sb);
 	scoutfs_destroy_sysfs(sb);
-	scoutfs_metadev_close(sb);
-
-	kfree(sbi->opts.metadev_path);
 	kfree(sbi);

 	sb->s_fs_info = NULL;
@@ -300,33 +227,30 @@ static const struct super_operations scoutfs_super_ops = {
 int scoutfs_write_super(struct super_block *sb,
 			struct scoutfs_super_block *super)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-
 	le64_add_cpu(&super->hdr.seq, 1);

-	return scoutfs_block_write_sm(sb, sbi->meta_bdev, SCOUTFS_SUPER_BLKNO,
-				      &super->hdr,
+	return scoutfs_block_write_sm(sb, SCOUTFS_SUPER_BLKNO, &super->hdr,
 				      sizeof(struct scoutfs_super_block));
 }

 /*
- * Read super, specifying bdev.
+ * Read the super block.  If it's valid store it in the caller's super
+ * struct.
 */
-static int scoutfs_read_super_from_bdev(struct super_block *sb,
-					struct block_device *bdev,
-					struct scoutfs_super_block *super_res)
+int scoutfs_read_super(struct super_block *sb,
+		       struct scoutfs_super_block *super_res)
 {
 	struct scoutfs_super_block *super;
 	__le32 calc;
-	u64 blkno;
 	int ret;

 	super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
 	if (!super)
 		return -ENOMEM;

-	ret = scoutfs_block_read_sm(sb, bdev, SCOUTFS_SUPER_BLKNO, &super->hdr,
-				    sizeof(struct scoutfs_super_block), &calc);
+	ret = scoutfs_block_read_sm(sb, SCOUTFS_SUPER_BLKNO, &super->hdr,
+				    sizeof(struct scoutfs_super_block),
+				    &calc);
 	if (ret < 0)
 		goto out;

@@ -370,61 +294,13 @@ static int scoutfs_read_super_from_bdev(struct super_block *sb,
 		goto out;
 	}

-	blkno = (SCOUTFS_QUORUM_BLKNO + SCOUTFS_QUORUM_BLOCKS) >>
-		SCOUTFS_BLOCK_SM_LG_SHIFT;
-	if (le64_to_cpu(super->first_meta_blkno) < blkno) {
-		scoutfs_err(sb, "super block first meta blkno %llu is within quorum blocks",
-			le64_to_cpu(super->first_meta_blkno));
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (le64_to_cpu(super->first_meta_blkno) >
-	    le64_to_cpu(super->last_meta_blkno)) {
-		scoutfs_err(sb, "super block first meta blkno %llu is greater than last meta blkno %llu",
-			le64_to_cpu(super->first_meta_blkno),
-			le64_to_cpu(super->last_meta_blkno));
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (le64_to_cpu(super->first_data_blkno) >
-	    le64_to_cpu(super->last_data_blkno)) {
-		scoutfs_err(sb, "super block first data blkno %llu is greater than last data blkno %llu",
-			le64_to_cpu(super->first_data_blkno),
-			le64_to_cpu(super->last_data_blkno));
-		ret = -EINVAL;
-		goto out;
-	}
-
-	blkno = (i_size_read(sb->s_bdev->bd_inode) >>
-		 SCOUTFS_BLOCK_SM_SHIFT) - 1;
-	if (le64_to_cpu(super->last_data_blkno) > blkno) {
-		scoutfs_err(sb, "super block last data blkno %llu is outsite device size last blkno %llu",
-			le64_to_cpu(super->last_data_blkno), blkno);
-		ret = -EINVAL;
-		goto out;
-	}
-
+	*super_res = *super;
+	ret = 0;
 out:
-	if (ret == 0)
-		*super_res = *super;
 	kfree(super);
-
 	return ret;
 }

-/*
- * Read the super block from meta dev.
- */
-int scoutfs_read_super(struct super_block *sb,
-		       struct scoutfs_super_block *super_res)
-{
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-
-	return scoutfs_read_super_from_bdev(sb, sbi->meta_bdev, super_res);
-}
-
 /*
 * This needs to be setup after reading the super because it uses the
 * fsid found in the super block.
@@ -461,66 +337,10 @@ static int assign_random_id(struct scoutfs_sb_info *sbi)
 	return 0;
 }

-/*
- * Ensure superblock copies in metadata and data block devices are valid, and
- * fill in in-memory superblock if so.
- */
-static int scoutfs_read_supers(struct super_block *sb)
-{
-	struct scoutfs_super_block *meta_super = NULL;
-	struct scoutfs_super_block *data_super = NULL;
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	int ret = 0;
-
-	meta_super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
-	data_super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
-	if (!meta_super || !data_super) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = scoutfs_read_super_from_bdev(sb, sbi->meta_bdev, meta_super);
-	if (ret < 0) {
-		scoutfs_err(sb, "could not get meta_super: error %d", ret);
-		goto out;
-	}
-
-	ret = scoutfs_read_super_from_bdev(sb, sb->s_bdev, data_super);
-	if (ret < 0) {
-		scoutfs_err(sb, "could not get data_super: error %d", ret);
-		goto out;
-	}
-
-	if (!SCOUTFS_IS_META_BDEV(meta_super)) {
-		scoutfs_err(sb, "meta_super META flag not set");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (SCOUTFS_IS_META_BDEV(data_super)) {
-		scoutfs_err(sb, "data_super META flag set");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (memcmp(meta_super->uuid, data_super->uuid, SCOUTFS_UUID_BYTES)) {
-		scoutfs_err(sb, "superblock UUID mismatch");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	sbi->super = *meta_super;
-out:
-	kfree(meta_super);
-	kfree(data_super);
-	return ret;
-}
-
 static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct scoutfs_sb_info *sbi;
 	struct mount_options opts;
-	struct block_device *meta_bdev;
 	struct inode *inode;
 	int ret;

@@ -559,31 +379,14 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)

 	sbi->opts = opts;

-	ret = sb_set_blocksize(sb, SCOUTFS_BLOCK_SM_SIZE);
-	if (ret != SCOUTFS_BLOCK_SM_SIZE) {
+	ret = sb_set_blocksize(sb, SCOUTFS_BLOCK_SIZE);
+	if (ret != SCOUTFS_BLOCK_SIZE) {
 		scoutfs_err(sb, "failed to set blocksize, returned %d", ret);
 		ret = -EIO;
 		goto out;
 	}

-	meta_bdev =
-		blkdev_get_by_path(sbi->opts.metadev_path,
-				   SCOUTFS_META_BDEV_MODE, sb);
-	if (IS_ERR(meta_bdev)) {
-		scoutfs_err(sb, "could not open metadev: error %ld",
-			    PTR_ERR(meta_bdev));
-		ret = PTR_ERR(meta_bdev);
-		goto out;
-	}
-	sbi->meta_bdev = meta_bdev;
-	ret = set_blocksize(sbi->meta_bdev, SCOUTFS_BLOCK_SM_SIZE);
-	if (ret != 0) {
-		scoutfs_err(sb, "failed to set metadev blocksize, returned %d",
-			    ret);
-		goto out;
-	}
-
-	ret = scoutfs_read_supers(sb) ?:
+	ret = scoutfs_read_super(sb, &SCOUTFS_SB(sb)->super) ?:
 	      scoutfs_debugfs_setup(sb) ?:
 	      scoutfs_setup_sysfs(sb) ?:
 	      scoutfs_setup_counters(sb) ?:
@@ -593,7 +396,6 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 	      scoutfs_setup_triggers(sb) ?:
 	      scoutfs_block_setup(sb) ?:
 	      scoutfs_forest_setup(sb) ?:
-	      scoutfs_item_setup(sb) ?:
 	      scoutfs_inode_setup(sb) ?:
 	      scoutfs_data_setup(sb) ?:
 	      scoutfs_setup_trans(sb) ?:
@@ -604,8 +406,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 	      scoutfs_client_setup(sb) ?:
 	      scoutfs_lock_rid(sb, SCOUTFS_LOCK_WRITE, 0, sbi->rid,
 				   &sbi->rid_lock) ?:
-	      scoutfs_trans_get_log_trees(sb) ?:
-	      scoutfs_srch_setup(sb);
+	      scoutfs_trans_get_log_trees(sb);
 	if (ret)
 		goto out;

@@ -25,7 +25,6 @@ struct options_sb_info;
 struct net_info;
 struct block_info;
 struct forest_info;
-struct srch_info;

 struct scoutfs_sb_info {
 	struct super_block *sb;
@@ -36,8 +35,6 @@ struct scoutfs_sb_info {

 	struct scoutfs_super_block super;

-	struct block_device *meta_bdev;
-
 	spinlock_t next_ino_lock;

 	struct data_info *data_info;
@@ -47,8 +44,6 @@ struct scoutfs_sb_info {
 	struct quorum_info *quorum_info;
 	struct block_info *block_info;
 	struct forest_info *forest_info;
-	struct srch_info *srch_info;
-	struct item_cache_info *item_cache_info;

 	wait_queue_head_t trans_hold_wq;
 	struct task_struct *trans_task;
@@ -96,13 +91,6 @@ static inline bool SCOUTFS_HAS_SBI(struct super_block *sb)
 	return (sb != NULL) && (SCOUTFS_SB(sb) != NULL);
 }

-static inline bool SCOUTFS_IS_META_BDEV(struct scoutfs_super_block *super_block)
-{
-	return !!(le64_to_cpu(super_block->flags) & SCOUTFS_FLAG_IS_META_BDEV);
-}
-
-#define SCOUTFS_META_BDEV_MODE (FMODE_READ | FMODE_WRITE | FMODE_EXCL)
-
 /*
 * A small string embedded in messages that's used to identify a
 * specific mount.  It's the three most significant bytes of the fsid
@@ -25,10 +25,8 @@
 #include "counters.h"
 #include "client.h"
 #include "inode.h"
-#include "alloc.h"
+#include "radix.h"
 #include "block.h"
-#include "msg.h"
-#include "item.h"
 #include "scoutfs_trace.h"

 /*
@@ -66,7 +64,7 @@ struct trans_info {
 	bool writing;

 	struct scoutfs_log_trees lt;
-	struct scoutfs_alloc alloc;
+	struct scoutfs_radix_allocator alloc;
 	struct scoutfs_block_writer wri;
 };

@@ -112,7 +110,8 @@ int scoutfs_trans_get_log_trees(struct super_block *sb)
 	ret = scoutfs_client_get_log_trees(sb, &lt);
 	if (ret == 0) {
 		tri->lt = lt;
-		scoutfs_alloc_init(&tri->alloc, &lt.meta_avail, &lt.meta_freed);
+		scoutfs_radix_init_alloc(&tri->alloc, &lt.meta_avail,
+					 &lt.meta_freed);
 		scoutfs_block_writer_init(sb, &tri->wri);

 		scoutfs_forest_init_btrees(sb, &tri->alloc, &tri->wri, &lt);
@@ -127,7 +126,6 @@ bool scoutfs_trans_has_dirty(struct super_block *sb)

 	return scoutfs_block_writer_has_dirty(sb, &tri->wri);
 }
-
 /*
 * This work func is responsible for writing out all the dirty blocks
 * that make up the current dirty transaction.  It prevents writers from
@@ -158,8 +156,6 @@ void scoutfs_trans_write_func(struct work_struct *work)
 						   trans_write_work.work);
 	struct super_block *sb = sbi->sb;
 	DECLARE_TRANS_INFO(sb, tri);
-	u64 trans_seq = sbi->trans_seq;
-	char *s = NULL;
 	int ret = 0;

 	sbi->trans_task = current;
@@ -169,49 +165,37 @@ void scoutfs_trans_write_func(struct work_struct *work)
 	trace_scoutfs_trans_write_func(sb,
 			scoutfs_block_writer_dirty_bytes(sb, &tri->wri));

-	if (!scoutfs_block_writer_has_dirty(sb, &tri->wri) &&
-	    !scoutfs_item_dirty_pages(sb)) {
-		if (sbi->trans_deadline_expired) {
-			/*
-			 * If we're not writing data then we only advance the
-			 * seq at the sync deadline interval.  This keeps idle
-			 * mounts from pinning a seq and stopping readers of the
-			 * seq indices but doesn't send a message for every sync
-			 * syscall.
-			 */
-			ret = scoutfs_client_advance_seq(sb, &trans_seq);
-			if (ret < 0)
-			      s = "clean advance seq";
-		}
-		goto out;
+	if (scoutfs_block_writer_has_dirty(sb, &tri->wri)) {
+		if (sbi->trans_deadline_expired)
+			scoutfs_inc_counter(sb, trans_commit_timer);
+
+		ret = scoutfs_inode_walk_writeback(sb, true) ?:
+		      scoutfs_block_writer_write(sb, &tri->wri) ?:
+		      scoutfs_inode_walk_writeback(sb, false) ?:
+		      commit_btrees(sb) ?:
+		      scoutfs_client_advance_seq(sb, &sbi->trans_seq) ?:
+		      scoutfs_trans_get_log_trees(sb);
+		if (ret)
+			goto out;
+
+	} else if (sbi->trans_deadline_expired) {
+		/*
+		 * If we're not writing data then we only advance the
+		 * seq at the sync deadline interval.  This keeps idle
+		 * mounts from pinning a seq and stopping readers of the
+		 * seq indices but doesn't send a message for every sync
+		 * syscall.
+		 */
+		ret = scoutfs_client_advance_seq(sb, &sbi->trans_seq);
 	}

-	if (sbi->trans_deadline_expired)
-		scoutfs_inc_counter(sb, trans_commit_timer);
-
-	scoutfs_inc_counter(sb, trans_commit_written);
-
-	/* XXX this all needs serious work for dealing with errors */
-	ret = (s = "data submit", scoutfs_inode_walk_writeback(sb, true)) ?:
-	      (s = "item dirty", scoutfs_item_write_dirty(sb))  ?:
-	      (s = "data prepare", scoutfs_data_prepare_commit(sb))  ?:
-	      (s = "alloc prepare", scoutfs_alloc_prepare_commit(sb,
-						&tri->alloc, &tri->wri))  ?:
-	      (s = "meta write", scoutfs_block_writer_write(sb, &tri->wri))  ?:
-	      (s = "data wait", scoutfs_inode_walk_writeback(sb, false)) ?:
-	      (s = "commit log trees", commit_btrees(sb)) ?:
-	      scoutfs_item_write_done(sb) ?:
-	      (s = "advance seq", scoutfs_client_advance_seq(sb, &trans_seq)) ?:
-	      (s = "get log trees", scoutfs_trans_get_log_trees(sb));
 out:
-	if (ret < 0)
-		scoutfs_err(sb, "critical transaction commit failure: %s, %d",
-			    s, ret);
+	/* XXX this all needs serious work for dealing with errors */
+	WARN_ON_ONCE(ret);

 	spin_lock(&sbi->trans_write_lock);
 	sbi->trans_write_count++;
 	sbi->trans_write_ret = ret;
-	sbi->trans_seq = trans_seq;
 	spin_unlock(&sbi->trans_write_lock);
 	wake_up(&sbi->trans_write_wq);

@@ -369,31 +353,10 @@ static bool acquired_hold(struct super_block *sb,
 	items = tri->reserved_items + cnt->items;
 	vals = tri->reserved_vals + cnt->vals;

-	/*
-	 * In theory each dirty item page could be straddling two full
-	 * blocks, requiring 4 allocations for each item cache page.
-	 * That's much too conservative, typically many dirty item cache
-	 * pages that are near each other all land in one block.  This
-	 * rough estimate is still so far beyond what typically happens
-	 * that it accounts for having to dirty parent blocks and
-	 * whatever dirtying is done during the transaction hold.
-	 */
-	if (scoutfs_alloc_meta_low(sb, &tri->alloc,
-				   scoutfs_item_dirty_pages(sb) * 2)) {
-		scoutfs_inc_counter(sb, trans_commit_dirty_meta_full);
-		queue_trans_work(sbi);
-		goto out;
-	}
-
-	/*
-	 * Extent modifications can use meta allocators without creating
-	 * dirty items so we have to check the meta alloc specifically.
-	 * The size of the client's avail and freed roots are bound so
-	 * we're unlikely to need very many block allocations per
-	 * transaction hold.  XXX This should be more precisely tuned.
-	 */
-	if (scoutfs_alloc_meta_low(sb, &tri->alloc, 16)) {
-		scoutfs_inc_counter(sb, trans_commit_meta_alloc_low);
+	/* XXX arbitrarily limit to 8 meg transactions */
+	if (scoutfs_block_writer_dirty_bytes(sb, &tri->wri) >=
+			(8 * 1024 * 1024)) {
+		scoutfs_inc_counter(sb, trans_commit_full);
 		queue_trans_work(sbi);
 		goto out;
 	}
@@ -550,23 +513,6 @@ void scoutfs_release_trans(struct super_block *sb)
 		wake_up(&sbi->trans_hold_wq);
 }

-/*
- * Return the current transaction sequence.  Whether this is racing with
- * the transaction write thread is entirely dependent on the caller's
- * context.
- */
-u64 scoutfs_trans_sample_seq(struct super_block *sb)
-{
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	u64 ret;
-
-	spin_lock(&sbi->trans_write_lock);
-	ret = sbi->trans_seq;
-	spin_unlock(&sbi->trans_write_lock);
-
-	return ret;
-}
-
 int scoutfs_setup_trans(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
@@ -18,7 +18,6 @@ int scoutfs_hold_trans(struct super_block *sb,
 		       const struct scoutfs_item_count cnt);
 bool scoutfs_trans_held(void);
 void scoutfs_release_trans(struct super_block *sb);
-u64 scoutfs_trans_sample_seq(struct super_block *sb);
 void scoutfs_trans_track_item(struct super_block *sb, signed items,
 			      signed vals);

@@ -1,7 +1,7 @@
 #ifndef _SCOUTFS_TRIGGERS_H_
 #define _SCOUTFS_TRIGGERS_H_

-enum scoutfs_trigger {
+enum {
 	SCOUTFS_TRIGGER_BTREE_STALE_READ,
 	SCOUTFS_TRIGGER_BTREE_ADVANCE_RING_HALF,
 	SCOUTFS_TRIGGER_HARD_STALE_ERROR,
@@ -1,20 +0,0 @@
-#ifndef _SCOUTFS_UTIL_H_
-#define _SCOUTFS_UTIL_H_
-
-/*
- * Little utility helpers that probably belong upstream.
- */
-
-static inline void down_write_two(struct rw_semaphore *a,
-				  struct rw_semaphore *b)
-{
-	BUG_ON(a == b);
-
-	if (a > b)
-		swap(a, b);
-
-	down_write(a);
-	down_write_nested(b, SINGLE_DEPTH_NESTING);
-}
-
-#endif
@@ -20,7 +20,7 @@
 #include "inode.h"
 #include "key.h"
 #include "super.h"
-#include "item.h"
+#include "kvec.h"
 #include "forest.h"
 #include "trans.h"
 #include "xattr.h"
@@ -94,17 +94,21 @@ static int unknown_prefix(const char *name)
 	       strncmp(name, SCOUTFS_XATTR_PREFIX, SCOUTFS_XATTR_PREFIX_LEN);
 }

+struct prefix_tags {
+	unsigned long hide:1,
+		      indx:1;
+};

 #define HIDE_TAG	"hide."
-#define SRCH_TAG	"srch."
+#define INDX_TAG	"indx."
 #define TAG_LEN		(sizeof(HIDE_TAG) - 1)

-int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len,
-			     struct scoutfs_xattr_prefix_tags *tgs)
+static int parse_tags(const char *name, unsigned int name_len,
+		      struct prefix_tags *tgs)
 {
 	bool found;

-	memset(tgs, 0, sizeof(struct scoutfs_xattr_prefix_tags));
+	memset(tgs, 0, sizeof(struct prefix_tags));

 	if ((name_len < (SCOUTFS_XATTR_PREFIX_LEN + TAG_LEN + 1)) ||
 	    strncmp(name, SCOUTFS_XATTR_PREFIX, SCOUTFS_XATTR_PREFIX_LEN))
@@ -116,8 +120,8 @@ int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len,
 		if (!strncmp(name, HIDE_TAG, TAG_LEN)) {
 			if (++tgs->hide == 0)
 				return -EINVAL;
-		} else if (!strncmp(name, SRCH_TAG, TAG_LEN)) {
-			if (++tgs->srch == 0)
+		} else if (!strncmp(name, INDX_TAG, TAG_LEN)) {
+			if (++tgs->indx == 0)
 				return -EINVAL;
 		} else {
 			/* only reason to use scoutfs. is tags */
@@ -132,6 +136,17 @@ int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len,
 	return 0;
 }

+void scoutfs_xattr_index_key(struct scoutfs_key *key,
+			     u64 hash, u64 ino, u64 id)
+{
+	scoutfs_key_set_zeros(key);
+	key->sk_zone = SCOUTFS_XATTR_INDEX_ZONE;
+	key->skxi_hash = cpu_to_le64(hash);
+	key->sk_type = SCOUTFS_XATTR_INDEX_NAME_TYPE;
+	key->skxi_ino = cpu_to_le64(ino);
+	key->skxi_id = cpu_to_le64(id);
+}
+
 /*
 * Find the next xattr and copy the key, xattr header, and as much of
 * the name and value into the callers buffer as we can.  Returns the
@@ -156,6 +171,7 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
 {
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_key last;
+	struct kvec val;
 	u8 last_part;
 	int total;
 	u8 part;
@@ -178,9 +194,8 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,

 	for (;;) {
 		key->skx_part = part;
-		ret = scoutfs_item_next(sb, key, &last,
-					(void *)xat + total, bytes - total,
-					lock);
+		kvec_init(&val, (void *)xat + total, bytes - total);
+		ret = scoutfs_forest_next(sb, key, &last, &val, lock);
 		if (ret < 0) {
 			/* XXX corruption, ran out of parts */
 			if (ret == -ENOENT && part > 0)
@@ -256,6 +271,7 @@ static int create_xattr_items(struct inode *inode, u64 id,
 	struct scoutfs_key key;
 	unsigned int part_bytes;
 	unsigned int total;
+	struct kvec val;
 	int ret;

 	init_xattr_key(&key, scoutfs_ino(inode),
@@ -266,13 +282,12 @@ static int create_xattr_items(struct inode *inode, u64 id,
 	while (total < bytes) {
 		part_bytes = min_t(unsigned int, bytes - total,
 				   SCOUTFS_XATTR_MAX_PART_SIZE);
+		kvec_init(&val, (void *)xat + total, part_bytes);

-		ret = scoutfs_item_create(sb, &key,
-					  (void *)xat + total, part_bytes,
-					  lock);
+		ret = scoutfs_forest_create(sb, &key, &val, lock);
 		if (ret) {
 			while (key.skx_part-- > 0)
-				scoutfs_item_delete(sb, &key, lock);
+				scoutfs_forest_delete_dirty(sb, &key);
 			break;
 		}

@@ -284,114 +299,24 @@ static int create_xattr_items(struct inode *inode, u64 id,
 }

 /*
- * Delete the items that make up the given xattr.  If this returns an
- * error then no items have been deleted.
+ * Delete and save the items that make up the given xattr.  If this
+ * returns an error then the deleted and saved items are left on the
+ * list for the caller to restore.
 */
 static int delete_xattr_items(struct inode *inode, u32 name_hash, u64 id,
-			      u8 nr_parts, struct scoutfs_lock *lock)
+			      u8 nr_parts, struct list_head *list,
+			      struct scoutfs_lock *lock)
 {
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_key key;
-	int ret = 0;
-	int i;
+	int ret;

 	init_xattr_key(&key, scoutfs_ino(inode), name_hash, id);

-	/* dirty additional existing old items */
-	for (i = 1; i < nr_parts; i++) {
-		key.skx_part = i;
-		ret = scoutfs_item_dirty(sb, &key, lock);
-		if (ret)
-			goto out;
-	}
+	do {
+		ret = scoutfs_forest_delete_save(sb, &key, list, lock);
+	} while (ret == 0 && ++key.skx_part < nr_parts);

-	for (i = 0; i < nr_parts; i++) {
-		key.skx_part = i;
-		ret = scoutfs_item_delete(sb, &key, lock);
-		if (ret)
-			break;
-	}
-out:
-	return ret;
-}
-
-/*
- * The caller needs to overwrite existing old xattr items with new
- * items.  We carefully stage the changes so that we can always unwind
- * to the original items if we return an error.  Both items have at
- * least one part.  Either the old or new can have more parts.  We dirty
- * and create first because we can always unwind those.  We delete last
- * after dirtying so that it can't fail and we don't have to restore the
- * deleted items.
- */
-static int change_xattr_items(struct inode *inode, u64 id,
-			      struct scoutfs_xattr *new_xat,
-			      unsigned int new_bytes, u8 new_parts,
-			      u8 old_parts, struct scoutfs_lock *lock)
-{
-	struct super_block *sb = inode->i_sb;
-	struct scoutfs_key key;
-	int last_created = -1;
-	int bytes;
-	int off;
-	int i;
-	int ret;
-
-	init_xattr_key(&key, scoutfs_ino(inode),
-		       xattr_name_hash(new_xat->name, new_xat->name_len), id);
-
-	/* dirty existing old items */
-	for (i = 0; i < old_parts; i++) {
-		key.skx_part = i;
-		ret = scoutfs_item_dirty(sb, &key, lock);
-		if (ret)
-			goto out;
-	}
-
-	/* create any new items past the old */
-	for (i = old_parts; i < new_parts; i++) {
-		off = i * SCOUTFS_XATTR_MAX_PART_SIZE;
-		bytes = min_t(unsigned int, new_bytes - off,
-			      SCOUTFS_XATTR_MAX_PART_SIZE);
-
-		key.skx_part = i;
-		ret = scoutfs_item_create(sb, &key, (void *)new_xat + off,
-					  bytes, lock);
-		if (ret)
-			goto out;
-
-		last_created = i;
-	}
-
-	/* update dirtied overlapping existing items, last partial first */
-	for (i = old_parts - 1; i >= 0; i--) {
-		off = i * SCOUTFS_XATTR_MAX_PART_SIZE;
-		bytes = min_t(unsigned int, new_bytes - off,
-			      SCOUTFS_XATTR_MAX_PART_SIZE);
-
-		key.skx_part = i;
-		ret = scoutfs_item_update(sb, &key, (void *)new_xat + off,
-					  bytes, lock);
-		/* only last partial can fail, then we unwind created */
-		if (ret < 0)
-			goto out;
-	}
-
-	/* delete any dirtied old items past new */
-	for (i = new_parts; i < old_parts; i++) {
-		key.skx_part = i;
-		scoutfs_item_delete(sb, &key, lock);
-	}
-
-	ret = 0;
-out:
-	if (ret < 0) {
-		/* delete any newly created items */
-		for (i = old_parts; i <= last_created; i++) {
-			key.skx_part = i;
-			scoutfs_item_delete(sb, &key, lock);
-		}
-	}
 	return ret;
 }

@@ -421,7 +346,7 @@ ssize_t scoutfs_getxattr(struct dentry *dentry, const char *name, void *buffer,

 	/* only need enough for caller's name and value sizes */
 	bytes = sizeof(struct scoutfs_xattr) + name_len + size;
-	xat = __vmalloc(bytes, GFP_NOFS, PAGE_KERNEL);
+	xat = kmalloc(bytes, GFP_NOFS);
 	if (!xat)
 		return -ENOMEM;

@@ -464,7 +389,7 @@ ssize_t scoutfs_getxattr(struct dentry *dentry, const char *name, void *buffer,
 	ret = le16_to_cpu(xat->val_len);
 	memcpy(buffer, &xat->name[xat->name_len], ret);
 out:
-	vfree(xat);
+	kfree(xat);
 	return ret;
 }

@@ -486,17 +411,20 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
 	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	const u64 ino = scoutfs_ino(inode);
-	struct scoutfs_xattr_prefix_tags tgs;
 	struct scoutfs_xattr *xat = NULL;
+	struct scoutfs_lock *indx_lock = NULL;
 	struct scoutfs_lock *lck = NULL;
 	size_t name_len = strlen(name);
+	struct scoutfs_key indx_key;
 	struct scoutfs_key key;
-	bool undo_srch = false;
+	struct prefix_tags tgs;
+	bool undo_indx = false;
 	LIST_HEAD(ind_locks);
+	LIST_HEAD(saved);
 	u8 found_parts;
 	unsigned int bytes;
 	u64 ind_seq;
-	u64 hash = 0;
+	u64 hash;
 	u64 id = 0;
 	int ret;
 	int err;
@@ -516,14 +444,14 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
 	if (unknown_prefix(name))
 		return -EOPNOTSUPP;

-	if (scoutfs_xattr_parse_tags(name, name_len, &tgs) != 0)
+	if (parse_tags(name, name_len, &tgs) != 0)
 		return -EINVAL;

-	if ((tgs.hide || tgs.srch) && !capable(CAP_SYS_ADMIN))
+	if ((tgs.hide || tgs.indx) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;

 	bytes = sizeof(struct scoutfs_xattr) + name_len + size;
-	xat = __vmalloc(bytes, GFP_NOFS, PAGE_KERNEL);
+	xat = kmalloc(bytes, GFP_NOFS);
 	if (!xat) {
 		ret = -ENOMEM;
 		goto out;
@@ -563,24 +491,29 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,

 	/* prepare our xattr */
 	if (value) {
-		if (found_parts)
-			id = le64_to_cpu(key.skx_id);
-		else
-			id = si->next_xattr_id++;
+		id = si->next_xattr_id++;
 		xat->name_len = name_len;
 		xat->val_len = cpu_to_le16(size);
-		memset(xat->__pad, 0, sizeof(xat->__pad));
 		memcpy(xat->name, name, name_len);
 		memcpy(&xat->name[xat->name_len], value, size);
 	}

+	if (tgs.indx && !(found_parts && value)) {
+		hash = scoutfs_hash64(name, name_len);
+		ret = scoutfs_lock_xattr_index(sb, SCOUTFS_LOCK_WRITE_ONLY, 0,
+					       hash, &indx_lock);
+		if (ret < 0)
+			goto unlock;
+	}
+
 retry:
 	ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
 	      scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
 	      scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
 						SIC_XATTR_SET(found_parts,
 							      value != NULL,
-							      name_len, size));
+							      name_len, size,
+							      tgs.indx));
 	if (ret > 0)
 		goto retry;
 	if (ret)
@@ -590,27 +523,34 @@ retry:
 	if (ret < 0)
 		goto release;

-	if (tgs.srch && !(found_parts && value)) {
+	if (tgs.indx && !(found_parts && value)) {
 		if (found_parts)
 			id = le64_to_cpu(key.skx_id);
 		hash = scoutfs_hash64(name, name_len);
-		ret = scoutfs_forest_srch_add(sb, hash, ino, id);
+		scoutfs_xattr_index_key(&indx_key, hash, ino, id);
+		if (value)
+			ret = scoutfs_forest_create_force(sb, &indx_key, NULL,
+							  indx_lock);
+		else
+			ret = scoutfs_forest_delete_force(sb, &indx_key,
+							  indx_lock);
 		if (ret < 0)
 			goto release;
-		undo_srch = true;
+		undo_indx = true;
 	}

-	if (found_parts && value)
-		ret = change_xattr_items(inode, id, xat, bytes,
-					 xattr_nr_parts(xat), found_parts, lck);
-	else if (found_parts)
+	ret = 0;
+	if (found_parts)
 		ret = delete_xattr_items(inode, le64_to_cpu(key.skx_name_hash),
 					 le64_to_cpu(key.skx_id), found_parts,
-					 lck);
-	else
+					 &saved, lck);
+	if (value && ret == 0)
 		ret = create_xattr_items(inode, id, xat, bytes, lck);
-	if (ret < 0)
+	if (ret < 0) {
+		scoutfs_forest_restore(sb, &saved, lck);
 		goto release;
+	}
+	scoutfs_forest_free_batch(sb, &saved);

 	/* XXX do these want i_mutex or anything? */
 	inode_inc_iversion(inode);
@@ -619,8 +559,13 @@ retry:
 	ret = 0;

 release:
-	if (ret < 0 && undo_srch) {
-		err = scoutfs_forest_srch_add(sb, hash, ino, id);
+	if (ret < 0 && undo_indx) {
+		if (value)
+			err = scoutfs_forest_delete_force(sb, &indx_key,
+							  indx_lock);
+		else
+			err = scoutfs_forest_create_force(sb, &indx_key, NULL,
+							  indx_lock);
 		BUG_ON(err);
 	}

@@ -628,9 +573,10 @@ release:
 	scoutfs_inode_index_unlock(sb, &ind_locks);
 unlock:
 	up_write(&si->xattr_rwsem);
+	scoutfs_unlock(sb, indx_lock, SCOUTFS_LOCK_WRITE_ONLY);
 	scoutfs_unlock(sb, lck, SCOUTFS_LOCK_WRITE);
 out:
-	vfree(xat);
+	kfree(xat);

 	return ret;
 }
@@ -655,10 +601,10 @@ ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
 {
 	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	struct super_block *sb = inode->i_sb;
-	struct scoutfs_xattr_prefix_tags tgs;
 	struct scoutfs_xattr *xat = NULL;
 	struct scoutfs_lock *lck = NULL;
 	struct scoutfs_key key;
+	struct prefix_tags tgs;
 	unsigned int bytes;
 	ssize_t total = 0;
 	u32 name_hash = 0;
@@ -694,8 +640,8 @@ ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
 			break;
 		}

-		is_hidden = scoutfs_xattr_parse_tags(xat->name, xat->name_len,
-						     &tgs) == 0 && tgs.hide;
+		is_hidden = parse_tags(xat->name, xat->name_len, &tgs) == 0 &&
+			    tgs.hide;

 		if (show_hidden == is_hidden) {
 			if (size) {
@@ -747,12 +693,15 @@ ssize_t scoutfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 		       struct scoutfs_lock *lock)
 {
-	struct scoutfs_xattr_prefix_tags tgs;
+	struct scoutfs_lock *indx_lock = NULL;
 	struct scoutfs_xattr *xat = NULL;
+	struct scoutfs_key indx_key;
 	struct scoutfs_key last;
 	struct scoutfs_key key;
+	struct prefix_tags tgs;
 	bool release = false;
 	unsigned int bytes;
+	struct kvec val;
 	u64 hash;
 	int ret;

@@ -768,8 +717,8 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 	init_xattr_key(&last, ino, U32_MAX, U64_MAX);

 	for (;;) {
-		ret = scoutfs_item_next(sb, &key, &last, (void *)xat, bytes,
-					lock);
+		kvec_init(&val, (void *)xat, bytes);
+		ret = scoutfs_forest_next(sb, &key, &last, &val, lock);
 		if (ret < 0) {
 			if (ret == -ENOENT)
 				ret = 0;
@@ -777,23 +726,32 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 		}

 		if (key.skx_part != 0 ||
-		    scoutfs_xattr_parse_tags(xat->name, xat->name_len,
-					     &tgs) != 0)
+		    parse_tags(xat->name, xat->name_len, &tgs) != 0)
 			memset(&tgs, 0, sizeof(tgs));

+		if (tgs.indx) {
+			hash = scoutfs_hash64(xat->name, xat->name_len);
+			scoutfs_xattr_index_key(&indx_key, hash, ino,
+						le64_to_cpu(key.skx_id));
+			ret = scoutfs_lock_xattr_index(sb,
+						      SCOUTFS_LOCK_WRITE_ONLY,
+						      0, hash, &indx_lock);
+			if (ret < 0)
+				break;
+		}
+
 		ret = scoutfs_hold_trans(sb, SIC_EXACT(2, 0));
 		if (ret < 0)
 			break;
 		release = true;

-		ret = scoutfs_item_delete(sb, &key, lock);
+		ret = scoutfs_forest_delete(sb, &key, lock);
 		if (ret < 0)
 			break;

-		if (tgs.srch) {
-			hash = scoutfs_hash64(xat->name, xat->name_len);
-			ret = scoutfs_forest_srch_add(sb, hash, ino,
-						      le64_to_cpu(key.skx_id));
+		if (tgs.indx) {
+		       ret = scoutfs_forest_delete_force(sb, &indx_key,
+							 indx_lock);
 		       if (ret < 0)
 			       break;
 		}
@@ -801,11 +759,15 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 		scoutfs_release_trans(sb);
 		release = false;

+		scoutfs_unlock(sb, indx_lock, SCOUTFS_LOCK_WRITE_ONLY);
+		indx_lock = NULL;
+
 		/* don't need to inc, next won't see deleted item */
 	}

 	if (release)
 		scoutfs_release_trans(sb);
+	scoutfs_unlock(sb, indx_lock, SCOUTFS_LOCK_WRITE_ONLY);
 	kfree(xat);
 out:
 	return ret;
@@ -14,12 +14,7 @@ ssize_t scoutfs_list_xattrs(struct inode *inode, char *buffer,
 int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
 		       struct scoutfs_lock *lock);

-struct scoutfs_xattr_prefix_tags {
-	unsigned long hide:1,
-		      srch:1;
-};
-
-int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len,
-			     struct scoutfs_xattr_prefix_tags *tgs);
+void scoutfs_xattr_index_key(struct scoutfs_key *key,
+			     u64 hash, u64 ino, u64 id);

 #endif
@@ -1,6 +0,0 @@
-src/*.d
-src/createmany
-src/dumb_setxattr
-src/handle_cat
-src/bulk_create_paths
-src/find_xattrs
@@ -1,49 +0,0 @@
-CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -fno-strict-aliasing 
-SHELL := /usr/bin/bash
-
-# each binary command is built from a single .c file
-BIN := src/createmany			\
-	src/dumb_setxattr		\
-	src/handle_cat			\
-	src/bulk_create_paths		\
-	src/find_xattrs
-
-DEPS := $(wildcard src/*.d)
-
-all: $(BIN)
-
-ifneq ($(DEPS),)
-include $(DEPS)
-endif
-
-$(BIN): %: %.c Makefile
-	gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@
-
-.PHONY: clean
-clean:
-	@rm -f $(BIN) $(DEPS)
-
-#
-# Make sure every test has exactly the three items it needs: an entry in
-# sequence, a test script in tests/, and output in golden/.
-#
-.PHONY: check-test-files
-check-test-files:
-	@for t in $$(grep -v "^#" sequence); do			\
-		test -e "tests/$$t" ||				\
-			echo "no test for list entry: $$t";	\
-		t=$${t%%.sh};					\
-		test -e "golden/$$t" ||				\
-			echo "no output for list entry: $$t";	\
-	done;							\
-	for t in golden/*; do					\
-		t=$$(basename "$$t");				\
-		grep -q "^$$t.sh$$" sequence ||			\
-			echo "output not in list: $$t";		\
-	done;							\
-	for t in tests/*; do					\
-		t=$$(basename "$$t");				\
-		test "$$t" == "list" && continue;		\
-		grep -q "^$$t$$" sequence ||			\
-			echo "test not in list: $$t";		\
-	done
@@ -1,123 +0,0 @@
-
-This test suite exercises multi-node scoutfs by using multiple mounts on
-one host to simulate multiple nodes across a network.
-
-It also contains a light test wrapper that executes xfstests on one of
-the test mounts.
-
-## Invoking Tests
-
-The basic test invocation has to specify the devices for the fs, the
-number of mounts to test, whether to create a new fs and insert the
-built module, and where to put the results.
-
-    # bash ./run-tests.sh                       \
-        -M /dev/vda                             \
-        -D /dev/vdb                             \
-        -i                                      \
-        -m                                      \
-        -n 3                                    \
-        -q 2                                    \
-        -r ./results
-
-All options can be seen by running with -h.
-
-This script is built to test multi-node systems on one host by using
-different mounts of the same devices.  The script creates a fake block
-device in front of each fs block device for each mount that will be
-tested.  Currently it will create free loop devices and will mount on
-/mnt/test.[0-9].
-
-All tests will be run by default.  Particular tests can be included or
-excluded by providing test name regular expressions with the -I and -E
-options.  The definitive list of tests and the order in which they'll be
-run is found in the sequence file.
-
-## xfstests
-
-The last test that is run checks out, builds, and runs xfstests.  It
-needs -X and -x options for the xfstests git repo and branch.  It also
-needs spare devices on which to make scratch scoutfs volumes.  The test
-verifies that the expected set of xfstests tests ran and passed.
-
-        -f /dev/vdc                             \
-        -e /dev/vdd                             \
-        -X $HOME/git/scoutfs-xfstests           \
-        -x scoutfs                              \
-
-An xfstests repo that knows about scoutfs is only required to sprinkle
-the scoutfs cases throughout the xfstests harness.
-
-## Individual Test Invocation
-
-Each test is run in a new bash invocation.  A set of directories in the
-test volume and in the results path are created for the test.  Each
-test's working directory isn't managed.
-
-Test output, temp files, and dmesg snapshots are all put in a tmp/ dir
-in the results/ dir.  Per-test dirs are only destroyed before each test
-invocation.
-
-The harness will check for unexpected output in dmesg after each
-individual test.
-
-Each test that fails will have its results appended to the fail.log file
-in the results/ directory.  The details of the failure can be examined
-in the directories for each test in results/output/ and results/tmp/. 
-
-## Writing tests
-
-Tests have access to a set of t\_ prefixed bash functions that are found
-in files in funcs/.
-
-Tests complete by calling t\_ functions which indicate the result of the
-test and can return a message.  If the test passes then its output is
-compared with known good output.  If the output doesn't match then the
-test fails.  The t\_ completion functions return specific status codes so
-that returning without calling one can be detected.
-
-The golden output has to be consistent across test platforms so there
-are a number of filter functions which strip out local details from
-command output.  t\_filter\_fs is by far the most used which canonicalizes
-fs mount paths and block device details.
-
-Tests can be relatively loose about checking errors.  If commands
-produce output in failure cases then the test will fail without having
-to specifically test for errors on every command execution.  Care should
-be taken to make sure that blowing through a bunch of commands with no
-error checking doesn't produce catastrophic results.  Usually tests are
-simple and it's fine.
-
-A bare sync will sync all the mounted filesystems and ensure that
-no mounts have dirty data.  sync -f can be used to sync just a specific
-filesystem, though it doesn't exist on all platforms.
-
-The harness doesn't currently ensure that all mounts are restored after
-each test invocation.  It probably should.  Currently it's the
-responsibility of the test to restore any mounts it alters and there are
-t\_ functions to mount all configured mount points.
-
-## Environment Variables
-
-Tests have a number of exported environment variables that are commonly
-used during the test.
-
-| Variable         | Description          | Origin          | Example           |
-| ---------------- | -------------------  | --------------- | ----------------- |
-| T\_MB[0-9]       | per-mount meta bdev  | created per run | /dev/loop0        |
-| T\_DB[0-9]       | per-mount data bdev  | created per run | /dev/loop1        |
-| T\_D[0-9]        | per-mount test dir   | made for test   | /mnt/test.[0-9]/t |
-| T\_META\_DEVICE  | main FS meta bdev    | -M              | /dev/vda          |
-| T\_DATA\_DEVICE  | main FS data bdev    | -D              | /dev/vdb          |
-| T\_EX\_META\_DEV | scratch meta bdev    | -f              | /dev/vdd          |
-| T\_EX\_DATA\_DEV | scratch data bdev    | -e              | /dev/vdc          |
-| T\_M[0-9]        | mount paths          | mounted per run | /mnt/test.[0-9]/  |
-| T\_NR\_MOUNTS    | number of mounts     | -n              | 3                 |
-| T\_O[0-9]        | mount options        | created per run | -o server\_addr=  |
-| T\_QUORUM        | quorum count         | -q              | 2                 |
-| T\_TMP           | per-test tmp prefix  | made for test   | results/tmp/t/tmp |
-| T\_TMPDIR        | per-test tmp dir     | made for test   | results/tmp/t     |
-
-There are also a number of variables that are set in response to options
-and are exported but their use is rare so they aren't included here.
-
@@ -1,58 +0,0 @@
-
-t_status_msg()
-{
-	echo "$*" > "$T_TMPDIR/status.msg"
-}
-
-export T_PASS_STATUS=100
-export T_SKIP_STATUS=101
-export T_FAIL_STATUS=102
-export T_FIRST_STATUS="$T_PASS_STATUS"
-export T_LAST_STATUS="$T_FAIL_STATUS"
-
-t_pass()
-{
-	exit $T_PASS_STATUS
-}
-
-t_skip()
-{
-	t_status_msg "$@"
-	exit $T_SKIP_STATUS
-}
-
-t_fail()
-{
-	t_status_msg "$@"
-	exit $T_FAIL_STATUS
-}
-
-#
-# Quietly run a command during a test.  The command line and all of the
-# command's output (stdout and stderr) are appended to quiet.log in the
-# test's tmp dir so that they aren't included in the test's compared
-# output.  If the command fails then the test fails.
-#
-t_quiet()
-{
-	echo "# $*" >> "$T_TMPDIR/quiet.log"
-	# append rather than truncate so that the command line logged above
-	# and the output of earlier quiet commands are preserved
-	"$@" >> "$T_TMPDIR/quiet.log" 2>&1 || \
-		t_fail "quiet command failed"
-}
-
-#
-# Point stdout and stderr back at fd 6, the output of the invoking
-# script, instead of the compared output.
-#
-t_restore_output()
-{
-	exec 1>&6 2>&1
-}
-
-#
-# redirect a command's output back to the compared output after the
-# test has restored its output
-#
-t_compare_output()
-{
-	"$@" >&7 2>&1
-}
@@ -1,63 +0,0 @@
-
-# filter out device ids and mount paths
-t_filter_fs()
-{
-	sed -e 's@mnt/test\.[0-9]*@mnt/test@g' \
-	    -e 's@Device: [a-fA-F0-7]*h/[0-9]*d@Device: 0h/0d@g'
-}
-
-#
-# Filter out expected messages.  Putting messages here implies that
-# tests aren't relying on messages to discover failures.. they're
-# directly testing the result of whatever it is that's generating the
-# message.
-#
-t_filter_dmesg()
-{
-	local re
-
-	# the kernel can just be noisy
-	re=" used greatest stack depth: "
-
-	# mkfs/mount checks partition tables
-	re="$re|unknown partition table"
-
-	# dm swizzling
-	re="$re|device doesn't appear to be in the dev hash table"
-	re="$re|device-mapper:.*uevent:.*version"
-	re="$re|device-mapper:.*ioctl:.*initialised"
-
-	# some tests try invalid devices
-	re="$re|scoutfs .* error reading super block"
-	re="$re| EXT4-fs (.*): get root inode failed"
-	re="$re| EXT4-fs (.*): mount failed"
-	re="$re| EXT4-fs (.*): no journal found"
-	re="$re| EXT4-fs (.*): VFS: Can't find ext4 filesystem"
-
-	# dropping caches is fine
-	re="$re| drop_caches: "
-
-	# mount and unmount spew a bunch
-	re="$re|scoutfs.*client connected"
-	re="$re|scoutfs.*client disconnected"
-	re="$re|scoutfs.*server setting up"
-	re="$re|scoutfs.*server ready"
-	re="$re|scoutfs.*server accepted"
-	re="$re|scoutfs.*server closing"
-	re="$re|scoutfs.*server shutting down"
-	re="$re|scoutfs.*server stopped"
-
-	# xfstests records test execution in desg
-	re="$re| run fstests "
-
-	# tests that drop unmount io triggers fencing
-	re="$re|scoutfs .* error: fencing "
-	re="$re|scoutfs .*: waiting for .* lock clients"
-	re="$re|scoutfs .*: all lock clients recovered"
-	re="$re|scoutfs .* error: client rid.*lock recovery timed out"
-
-	# some tests mount w/o options
-	re="$re|scoutfs .* error: Required mount option \"metadev_path\" not found"
-
-	egrep -v "($re)" 
-}
@@ -1,231 +0,0 @@
-
-#
-# Make all previously dirty items in memory in all mounts synced and
-# visible in the inode seq indexes.  A sync has to be forced on every
-# node by dirtying data, as that's the only way to guarantee advancing
-# the sequence number on each node which limits index visibility.  Some
-# distros don't have sync -f, so every mount in T_MS is touched and
-# then everything is synced.
-#
-t_sync_seq_index()
-{
-	local mnt
-
-	for mnt in $T_MS; do
-		t_quiet touch $mnt
-	done
-
-	t_quiet sync
-}
-
-#
-# Output the "f.$fsid.r.$rid" identifier string for the given mount
-# number, 0 is used by default if none is specified.  Only the first
-# six characters of the fsid and rid reported by scoutfs statfs are
-# used.
-#
-t_ident()
-{
-	local nr="${1:-0}"
-	local mnt="$(eval echo \$T_M$nr)"
-	local fsid rid
-
-	fsid=$(scoutfs statfs -s fsid -p "$mnt")
-	rid=$(scoutfs statfs -s rid -p "$mnt")
-
-	printf 'f.%s.r.%s\n' "${fsid:0:6}" "${rid:0:6}"
-}
-
-#
-# Output the mount's sysfs path, defaulting to mount 0 if none is
-# specified.
-#
-t_sysfs_path()
-{
-	local nr="$1"
-
-	echo "/sys/fs/scoutfs/$(t_ident $nr)"
-}
-
-#
-# Output the mount's debugfs path, defaulting to mount 0 if none is
-# specified.
-#
-t_debugfs_path()
-{
-	local nr="$1"
-
-	echo "/sys/kernel/debug/scoutfs/$(t_ident $nr)"
-}
-
-#
-# output every configured mount nr, 0 through T_NR_MOUNTS - 1, one per
-# line for iteration
-#
-t_fs_nrs()
-{
-	local last=$((T_NR_MOUNTS - 1))
-
-	seq 0 $last
-}
-
-#
-# Output the mount nr of the current server.  This takes no steps to
-# ensure that the server doesn't shut down and have some other mount
-# take over.  
-#
-t_server_nr()
-{
-	for i in $(t_fs_nrs); do
-		if [ "$(cat $(t_sysfs_path $i)/quorum/is_leader)" == "1" ]; then
-			echo $i
-			return
-		fi
-	done
-
-	t_fail "t_server_nr didn't find a server"
-}
-
-#
-# Output the mount nr of the first client that we find, which is the
-# first mount whose quorum/is_leader sysfs file reads "0".  There can
-# be no clients if there's only one mount who has to be the server, in
-# which case the test fails.  This takes no steps to ensure that the
-# client doesn't become a server at any point.
-#
-t_first_client_nr()
-{
-	# keep the loop variable from clobbering a caller's $i
-	local i
-
-	for i in $(t_fs_nrs); do
-		if [ "$(cat $(t_sysfs_path $i)/quorum/is_leader)" == "0" ]; then
-			echo $i
-			return
-		fi
-	done
-
-	t_fail "t_first_client_nr didn't find any clients"
-}
-
-t_mount()
-{
-	local nr="$1"
-
-	test "$nr" -lt "$T_NR_MOUNTS" || \
-		t_fail "fs nr $nr invalid"
-
-	eval t_quiet mount -t scoutfs \$T_O$nr \$T_DB$nr \$T_M$nr
-}
-
-t_umount()
-{
-	local nr="$1"
-
-	test "$nr" -lt "$T_NR_MOUNTS" || \
-		t_fail "fs nr $nr invalid"
-
-	eval t_quiet umount \$T_DB$i
-}
-
-#
-# Attempt to mount all the configured mounts, assuming that they're
-# not already mounted.
-#
-t_mount_all()
-{
-	local pids=""
-	local p
-
-	for i in $(t_fs_nrs); do
-		t_mount $i &
-		p="$!"
-		pids="$pids $!"
-	done
-	for p in $pids; do
-		t_quiet wait $p
-	done
-}
-
-#
-# Attempt to unmount all the configured mounts, assuming that they're
-# all mounted.
-#
-t_umount_all()
-{
-	local pids=""
-	local p
-
-	for i in $(t_fs_nrs); do
-		t_umount $i &
-		p="$!"
-		pids="$pids $!"
-	done
-	for p in $pids; do
-		t_quiet wait $p
-	done
-}
-
-#
-# Quietly unmount and then mount all the configured mounts, failing
-# the test if either step fails.
-#
-t_remount_all()
-{
-	if ! t_quiet t_umount_all; then
-		t_fail "umounting all failed"
-	fi
-
-	if ! t_quiet t_mount_all; then
-		t_fail "mounting all failed"
-	fi
-}
-
-#
-# Quietly unmount all the configured mounts, remove the scoutfs module
-# and insert it again from $T_KMOD/src/scoutfs.ko, then mount all the
-# mounts again.  The test fails if any step fails.
-#
-t_reinsert_remount_all()
-{
-	if ! t_quiet t_umount_all; then
-		t_fail "umounting all failed"
-	fi
-
-	if ! t_quiet rmmod scoutfs; then
-		t_fail "rmmod scoutfs failed"
-	fi
-	if ! t_quiet insmod "$T_KMOD/src/scoutfs.ko"; then
-		t_fail "insmod scoutfs failed"
-	fi
-
-	if ! t_quiet t_mount_all; then
-		t_fail "mounting all failed"
-	fi
-}
-
-#
-# output the debugfs trigger directory path of the given mount nr
-#
-t_trigger_path() {
-	local nr="$1"
-
-	printf '/sys/kernel/debug/scoutfs/%s/trigger\n' "$(t_ident $nr)"
-}
-
-#
-# output the contents of the named trigger file of the given mount nr
-#
-t_trigger_get() {
-	local which="$1"
-	local nr="$2"
-	local dir="$(t_trigger_path "$nr")"
-
-	cat "$dir/$which"
-}
-
-#
-# output a "trigger $which $string: $value" line with the current
-# value of the named trigger of the given mount nr
-#
-t_trigger_show() {
-	local which="$1"
-	local string="$2"
-	local nr="$3"
-
-	printf 'trigger %s %s: %s\n' "$which" "$string" \
-		"$(t_trigger_get $which $nr)"
-}
-
-#
-# write 1 to the named trigger file of the given mount nr and then
-# show the trigger's value as "armed"
-#
-t_trigger_arm() {
-	local which="$1"
-	local nr="$2"
-	local dir=$(t_trigger_path "$nr")
-	local file="$dir/$which"
-
-	echo 1 > "$file"
-	t_trigger_show $which armed $nr
-}
-
-#
-# Output the value of the named counter for the given mount; mount 0
-# is used when no mount is specified.
-#
-t_counter() {
-	local which="$1"
-	local nr="$2"
-	local dir="$(t_sysfs_path $nr)/counters"
-
-	cat "$dir/$which"
-}
-
-#
-# Output a "counter $which diff $n" line where n is the current value
-# of the named counter for the given mount minus the old value that
-# was passed in.  Mount 0 is used when no mount is specified.
-#
-t_counter_diff() {
-	local which="$1"
-	local old="$2"
-	local nr="$3"
-	local new
-
-	new="$(t_counter $which $nr)"
-	printf 'counter %s diff %s\n' "$which" "$((new - old))"
-}
@@ -1,25 +0,0 @@
-
-#
-# Make sure that all the base command arguments are found in the path.
-# This isn't strictly necessary as the test will naturally fail if the
-# command isn't found, but it's nice to fail fast and clearly
-# communicate why.
-#
-t_require_commands() {
-	local c
-
-	for c in "$@"; do
-		which "$c" >/dev/null 2>&1 || \
-			t_fail "command $c not found in path"
-	done
-}
-
-#
-# make sure that we have at least this many mounts
-#
-t_require_mounts() {
-	local req="$1"
-
-	test "$T_NR_MOUNTS" -ge "$req" || \
-		t_skip "$req mounts required, only have $T_NR_MOUNTS"
-}
@@ -1,36 +0,0 @@
-== calculate number of files
-== create per mount dirs
-== generate phase scripts
-== round 1: create
-== round 1: online
-== round 1: verify
-== round 1: release
-== round 1: offline
-== round 1: stage
-== round 1: online
-== round 1: verify
-== round 1: release
-== round 1: offline
-== round 1: unlink
-== round 2: create
-== round 2: online
-== round 2: verify
-== round 2: release
-== round 2: offline
-== round 2: stage
-== round 2: online
-== round 2: verify
-== round 2: release
-== round 2: offline
-== round 2: unlink
-== round 3: create
-== round 3: online
-== round 3: verify
-== round 3: release
-== round 3: offline
-== round 3: stage
-== round 3: online
-== round 3: verify
-== round 3: release
-== round 3: offline
-== round 3: unlink
@@ -1,53 +0,0 @@
-== single block write
-online: 1
-offline: 0
-st_blocks: 8
-== single block overwrite
-online: 1
-offline: 0
-st_blocks: 8
-== append
-online: 2
-offline: 0
-st_blocks: 16
-== release
-online: 0
-offline: 2
-st_blocks: 16
-== duplicate release
-online: 0
-offline: 2
-st_blocks: 16
-== duplicate release past i_size
-online: 0
-offline: 2
-st_blocks: 16
-== stage
-online: 2
-offline: 0
-st_blocks: 16
-== duplicate stage
-online: 2
-offline: 0
-st_blocks: 16
-== larger file
-online: 256
-offline: 0
-st_blocks: 2048
-== partial truncate
-online: 128
-offline: 0
-st_blocks: 1024
-== single sparse block
-online: 1
-offline: 0
-st_blocks: 8
-== empty file
-online: 0
-offline: 0
-st_blocks: 0
-== non-regular file
-online: 0
-offline: 0
-st_blocks: 0
-== cleanup
@@ -1,55 +0,0 @@
-== root inode updates flow back and forth
-== stat of created file matches
-== written file contents match
-== overwritten file contents match
-== appended file contents match
-== fiemap matches after racey appends
-== unlinked file isn't found
-== symlink targets match
-/mnt/test/test/basic-posix-consistency/file.targ
-/mnt/test/test/basic-posix-consistency/file.targ
-/mnt/test/test/basic-posix-consistency/file.targ2
-/mnt/test/test/basic-posix-consistency/file.targ2
-== new xattrs are visible
-# file: /mnt/test/test/basic-posix-consistency/file
-user.xat="1"
-
-# file: /mnt/test/test/basic-posix-consistency/file
-user.xat="1"
-
-== modified xattrs are updated
-# file: /mnt/test/test/basic-posix-consistency/file
-user.xat="2"
-
-# file: /mnt/test/test/basic-posix-consistency/file
-user.xat="2"
-
-== deleted xattrs
-/mnt/test/test/basic-posix-consistency/file: user.xat: No such attribute
-/mnt/test/test/basic-posix-consistency/file: user.xat: No such attribute
-== readdir after modification
-one
-two
-three
-four
-one
-two
-three
-four
-two
-four
-two
-four
-== can delete empty dir
-== some easy rename cases
--- file between dirs
--- file within dir
--- dir within dir
--- overwrite file
--- can't overwrite non-empty dir
-mv: cannot move ‘/mnt/test/test/basic-posix-consistency/dir/c/clobber’ to ‘/mnt/test/test/basic-posix-consistency/dir/a/dir’: Directory not empty
--- can overwrite empty dir
-== path resoluion
-== inode indexes match after syncing existing
-== inode indexes match after copying and syncing
-== inode indexes match after removing and syncing
@@ -1,4 +0,0 @@
-Run createmany in /mnt/test/test/createmany-parallel/0
-Run createmany in /mnt/test/test/createmany-parallel/1
-Run createmany in /mnt/test/test/createmany-parallel/2
-Run createmany in /mnt/test/test/createmany-parallel/3
@@ -1,3 +0,0 @@
-== measure initial createmany
-== measure initial createmany
-== measure two concurrent createmany runs
@@ -1,2 +0,0 @@
-== create large directory with 1220608 files
-== randomly renaming 5000 files
@@ -1,2 +0,0 @@
-== repeated cross-mount alloc+free, totalling 2x free
-== remove empty test file
@@ -1,10 +0,0 @@
-== create per node dirs
-== touch files on each node
-== recreate the files
-== turn the files into directories
-== rename parent dirs
-== rename parent dirs back
-== create some hard links
-== recreate one of the hard links
-== delete the remaining hard link
-== race to blow everything away
@@ -1,4 +0,0 @@
-== create files and sync
-== modify files
-== mount and unmount
-== verify files
@@ -1,4 +0,0 @@
-== create per mount files
-== time independent modification
-== time concurrent independent modification
-== time concurrent conflicting modification
@@ -1,2 +0,0 @@
-=== setup files ===
-=== ping-pong xattr ops ===
@@ -1 +0,0 @@
-== race writing and index walking
@@ -1,3 +0,0 @@
-== make test dir
-== do enough stuff to make lock leaks visible
-== make sure nothing has leaked
@@ -1,2 +0,0 @@
-=== getcwd after lock revocation
-trigger statfs_lock_purge armed: 1
@@ -1,15 +0,0 @@
-=== setup test file ===
-# file: /mnt/test/test/lock-shrink-consistency/dir/file
-user.test="aaa"
-
-=== commit dirty trans and revoke lock ===
-trigger statfs_lock_purge armed: 1
-trigger statfs_lock_purge after it fired: 0
-=== change xattr on other mount ===
-# file: /mnt/test/test/lock-shrink-consistency/dir/file
-user.test="bbb"
-
-=== verify new xattr under new lock on first mount ===
-# file: /mnt/test/test/lock-shrink-consistency/dir/file
-user.test="bbb"
-
@@ -1,33 +0,0 @@
-== build test files
-== wrapped offsets should fail
-ioctl failed on '/mnt/test/test/move-blocks/to': Value too large for defined data type (75)
-scoutfs: move-blocks failed: Value too large for defined data type (75)
-ioctl failed on '/mnt/test/test/move-blocks/to': Value too large for defined data type (75)
-scoutfs: move-blocks failed: Value too large for defined data type (75)
-== specifying same file fails
-ioctl failed on '/mnt/test/test/move-blocks/hardlink': Invalid argument (22)
-scoutfs: move-blocks failed: Invalid argument (22)
-== specifying files in other file systems fails
-ioctl failed on '/mnt/test/test/move-blocks/to': Invalid cross-device link (18)
-scoutfs: move-blocks failed: Invalid cross-device link (18)
-== offsets must be multiples of 4KB
-ioctl failed on '/mnt/test/test/move-blocks/to': Invalid argument (22)
-scoutfs: move-blocks failed: Invalid argument (22)
-ioctl failed on '/mnt/test/test/move-blocks/to': Invalid argument (22)
-scoutfs: move-blocks failed: Invalid argument (22)
-ioctl failed on '/mnt/test/test/move-blocks/to': Invalid argument (22)
-scoutfs: move-blocks failed: Invalid argument (22)
-== can't move onto existing extent
-ioctl failed on '/mnt/test/test/move-blocks/to': Invalid argument (22)
-scoutfs: move-blocks failed: Invalid argument (22)
-== can't move between files with offline extents
-ioctl failed on '/mnt/test/test/move-blocks/to': No data available (61)
-scoutfs: move-blocks failed: No data available (61)
-ioctl failed on '/mnt/test/test/move-blocks/to': No data available (61)
-scoutfs: move-blocks failed: No data available (61)
-== basic moves work
-== moving final partial block sets partial i_size
-123
-== moving updates inode fields
-== moving blocks backwards works
-== combine many files into one
@@ -1,56 +0,0 @@
-== create files
-== waiter shows up in ioctl
-offline waiting should be empty:
-0
-offline waiting should now have one known entry:
-== multiple waiters on same block listed once
-offline waiting still has one known entry:
-== different blocks show up
-offline waiting now has two known entries:
-== staging wakes everyone
-offline waiting should be empty again:
-0
-== interruption does no harm
-offline waiting should now have one known entry:
-offline waiting should be empty again:
-0
-== EIO injection for waiting readers works
-offline waiting should now have two known entries:
-2
-data_wait_err found 2 waiters.
-offline waiting should now have 0 known entries:
-0
-dd: error reading ‘/mnt/test/test/offline-extent-waiting/dir/file’: Input/output error
-0+0 records in
-0+0 records out
-dd: error reading ‘/mnt/test/test/offline-extent-waiting/dir/file’: Input/output error
-0+0 records in
-0+0 records out
-offline waiting should be empty again:
-0
-== readahead while offline does no harm
-== waiting on interesting blocks works
-offline waiting is empty at block 0
-0
-offline waiting is empty at block 1
-0
-offline waiting is empty at block 128
-0
-offline waiting is empty at block 129
-0
-offline waiting is empty at block 254
-0
-offline waiting is empty at block 255
-0
-== contents match when staging blocks forward
-== contents match when staging blocks backwards
-== truncate to same size doesn't wait
-offline wating should be empty:
-0
-== truncating does wait
-truncate should be waiting for first block:
-trunate should no longer be waiting:
-0
-== writing waits
-should be waiting for write
-== cleanup
@@ -1,4 +0,0 @@
-== advance lock version by creating unrelated files
-== create before file version
-== verify before version, touch after version
-== verify after version
@@ -1,31 +0,0 @@
-== 0 data_version arg fails
-setattr: data version must not be 0
-Try `setattr --help' or `setattr --usage' for more information.
-== args must specify size and offline
-setattr: must provide size if using --offline option
-Try `setattr --help' or `setattr --usage' for more information.
-== only works on regular files
-failed to open '/mnt/test/test/setattr_more/dir': Is a directory (21)
-scoutfs: setattr failed: Is a directory (21)
-setattr_more ioctl failed on '/mnt/test/test/setattr_more/char': Inappropriate ioctl for device (25)
-scoutfs: setattr failed: Inappropriate ioctl for device (25)
-== non-zero file size fails
-setattr_more ioctl failed on '/mnt/test/test/setattr_more/file': Invalid argument (22)
-scoutfs: setattr failed: Invalid argument (22)
-== non-zero file data_version fails
-setattr_more ioctl failed on '/mnt/test/test/setattr_more/file': Invalid argument (22)
-scoutfs: setattr failed: Invalid argument (22)
-== large size is set
-578437695752307201
-== large data_version is set
-578437695752307201
-== large ctime is set
-1972-02-19 00:06:25.999999999 +0000
-== large offline extents are created
-Filesystem type is: 554f4353
-File size of /mnt/test/test/setattr_more/file is 40988672 (10007 blocks of 4096 bytes)
- ext:     logical_offset:        physical_offset: length:   expected: flags:
-   0:        0..   10006:          0..     10006:  10007:             unknown,eof
-/mnt/test/test/setattr_more/file: 1 extent found
-== correct offline extent length
-976563
@@ -1 +0,0 @@
-== interrupt waiting mount
@@ -1,9 +0,0 @@
-== dirs shouldn't appear in data_seq queries
-== two created files are present and come after each other
-found first
-found second
-== unlinked entries must not be present
-== dirty inodes can not be present
-== changing metadata must increase meta seq
-== changing contents must increase data seq
-== make sure dirtying doesn't livelock walk
@@ -1,146 +0,0 @@
-== simple whole file multi-block releasing
-== release last block that straddles i_size
-== release entire file past i_size
-== releasing offline extents is fine
-== 0 count is fine
-== release past i_size is fine
-== wrapped blocks fails
-release ioctl failed: Invalid argument (22)
-scoutfs: release failed: Invalid argument (22)
-== releasing non-file fails
-ioctl failed: Inappropriate ioctl for device (25)
-release: must provide file version --data-version
-Try `release --help' or `release --usage' for more information.
-== releasing a non-scoutfs file fails
-ioctl failed: Inappropriate ioctl for device (25)
-release: must provide file version --data-version
-Try `release --help' or `release --usage' for more information.
-== releasing bad version fails
-release: must provide file version --data-version
-Try `release --help' or `release --usage' for more information.
-== verify small release merging
-0 0 0:  (0 0 1)  (1 101 4)
-0 0 1:  (0 0 2)  (2 102 3)
-0 0 2:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 2)
-0 0 3:  (0 0 1)  (1 101 2)  (3 0 1)  (4 104 1)
-0 0 4:  (0 0 1)  (1 101 3)  (4 0 1)
-0 1 0:  (0 0 2)  (2 102 3)
-0 1 1:  (0 0 2)  (2 102 3)
-0 1 2:  (0 0 3)  (3 103 2)
-0 1 3:  (0 0 2)  (2 102 1)  (3 0 1)  (4 104 1)
-0 1 4:  (0 0 2)  (2 102 2)  (4 0 1)
-0 2 0:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 2)
-0 2 1:  (0 0 3)  (3 103 2)
-0 2 2:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 2)
-0 2 3:  (0 0 1)  (1 101 1)  (2 0 2)  (4 104 1)
-0 2 4:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 1)  (4 0 1)
-0 3 0:  (0 0 1)  (1 101 2)  (3 0 1)  (4 104 1)
-0 3 1:  (0 0 2)  (2 102 1)  (3 0 1)  (4 104 1)
-0 3 2:  (0 0 1)  (1 101 1)  (2 0 2)  (4 104 1)
-0 3 3:  (0 0 1)  (1 101 2)  (3 0 1)  (4 104 1)
-0 3 4:  (0 0 1)  (1 101 2)  (3 0 2)
-0 4 0:  (0 0 1)  (1 101 3)  (4 0 1)
-0 4 1:  (0 0 2)  (2 102 2)  (4 0 1)
-0 4 2:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 1)  (4 0 1)
-0 4 3:  (0 0 1)  (1 101 2)  (3 0 2)
-0 4 4:  (0 0 1)  (1 101 3)  (4 0 1)
-1 0 0:  (0 0 2)  (2 102 3)
-1 0 1:  (0 0 2)  (2 102 3)
-1 0 2:  (0 0 3)  (3 103 2)
-1 0 3:  (0 0 2)  (2 102 1)  (3 0 1)  (4 104 1)
-1 0 4:  (0 0 2)  (2 102 2)  (4 0 1)
-1 1 0:  (0 0 2)  (2 102 3)
-1 1 1:  (0 100 1)  (1 0 1)  (2 102 3)
-1 1 2:  (0 100 1)  (1 0 2)  (3 103 2)
-1 1 3:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 1)  (4 104 1)
-1 1 4:  (0 100 1)  (1 0 1)  (2 102 2)  (4 0 1)
-1 2 0:  (0 0 3)  (3 103 2)
-1 2 1:  (0 100 1)  (1 0 2)  (3 103 2)
-1 2 2:  (0 100 1)  (1 0 2)  (3 103 2)
-1 2 3:  (0 100 1)  (1 0 3)  (4 104 1)
-1 2 4:  (0 100 1)  (1 0 2)  (3 103 1)  (4 0 1)
-1 3 0:  (0 0 2)  (2 102 1)  (3 0 1)  (4 104 1)
-1 3 1:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 1)  (4 104 1)
-1 3 2:  (0 100 1)  (1 0 3)  (4 104 1)
-1 3 3:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 1)  (4 104 1)
-1 3 4:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 2)
-1 4 0:  (0 0 2)  (2 102 2)  (4 0 1)
-1 4 1:  (0 100 1)  (1 0 1)  (2 102 2)  (4 0 1)
-1 4 2:  (0 100 1)  (1 0 2)  (3 103 1)  (4 0 1)
-1 4 3:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 2)
-1 4 4:  (0 100 1)  (1 0 1)  (2 102 2)  (4 0 1)
-2 0 0:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 2)
-2 0 1:  (0 0 3)  (3 103 2)
-2 0 2:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 2)
-2 0 3:  (0 0 1)  (1 101 1)  (2 0 2)  (4 104 1)
-2 0 4:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 1)  (4 0 1)
-2 1 0:  (0 0 3)  (3 103 2)
-2 1 1:  (0 100 1)  (1 0 2)  (3 103 2)
-2 1 2:  (0 100 1)  (1 0 2)  (3 103 2)
-2 1 3:  (0 100 1)  (1 0 3)  (4 104 1)
-2 1 4:  (0 100 1)  (1 0 2)  (3 103 1)  (4 0 1)
-2 2 0:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 2)
-2 2 1:  (0 100 1)  (1 0 2)  (3 103 2)
-2 2 2:  (0 100 2)  (2 0 1)  (3 103 2)
-2 2 3:  (0 100 2)  (2 0 2)  (4 104 1)
-2 2 4:  (0 100 2)  (2 0 1)  (3 103 1)  (4 0 1)
-2 3 0:  (0 0 1)  (1 101 1)  (2 0 2)  (4 104 1)
-2 3 1:  (0 100 1)  (1 0 3)  (4 104 1)
-2 3 2:  (0 100 2)  (2 0 2)  (4 104 1)
-2 3 3:  (0 100 2)  (2 0 2)  (4 104 1)
-2 3 4:  (0 100 2)  (2 0 3)
-2 4 0:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 1)  (4 0 1)
-2 4 1:  (0 100 1)  (1 0 2)  (3 103 1)  (4 0 1)
-2 4 2:  (0 100 2)  (2 0 1)  (3 103 1)  (4 0 1)
-2 4 3:  (0 100 2)  (2 0 3)
-2 4 4:  (0 100 2)  (2 0 1)  (3 103 1)  (4 0 1)
-3 0 0:  (0 0 1)  (1 101 2)  (3 0 1)  (4 104 1)
-3 0 1:  (0 0 2)  (2 102 1)  (3 0 1)  (4 104 1)
-3 0 2:  (0 0 1)  (1 101 1)  (2 0 2)  (4 104 1)
-3 0 3:  (0 0 1)  (1 101 2)  (3 0 1)  (4 104 1)
-3 0 4:  (0 0 1)  (1 101 2)  (3 0 2)
-3 1 0:  (0 0 2)  (2 102 1)  (3 0 1)  (4 104 1)
-3 1 1:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 1)  (4 104 1)
-3 1 2:  (0 100 1)  (1 0 3)  (4 104 1)
-3 1 3:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 1)  (4 104 1)
-3 1 4:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 2)
-3 2 0:  (0 0 1)  (1 101 1)  (2 0 2)  (4 104 1)
-3 2 1:  (0 100 1)  (1 0 3)  (4 104 1)
-3 2 2:  (0 100 2)  (2 0 2)  (4 104 1)
-3 2 3:  (0 100 2)  (2 0 2)  (4 104 1)
-3 2 4:  (0 100 2)  (2 0 3)
-3 3 0:  (0 0 1)  (1 101 2)  (3 0 1)  (4 104 1)
-3 3 1:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 1)  (4 104 1)
-3 3 2:  (0 100 2)  (2 0 2)  (4 104 1)
-3 3 3:  (0 100 3)  (3 0 1)  (4 104 1)
-3 3 4:  (0 100 3)  (3 0 2)
-3 4 0:  (0 0 1)  (1 101 2)  (3 0 2)
-3 4 1:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 2)
-3 4 2:  (0 100 2)  (2 0 3)
-3 4 3:  (0 100 3)  (3 0 2)
-3 4 4:  (0 100 3)  (3 0 2)
-4 0 0:  (0 0 1)  (1 101 3)  (4 0 1)
-4 0 1:  (0 0 2)  (2 102 2)  (4 0 1)
-4 0 2:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 1)  (4 0 1)
-4 0 3:  (0 0 1)  (1 101 2)  (3 0 2)
-4 0 4:  (0 0 1)  (1 101 3)  (4 0 1)
-4 1 0:  (0 0 2)  (2 102 2)  (4 0 1)
-4 1 1:  (0 100 1)  (1 0 1)  (2 102 2)  (4 0 1)
-4 1 2:  (0 100 1)  (1 0 2)  (3 103 1)  (4 0 1)
-4 1 3:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 2)
-4 1 4:  (0 100 1)  (1 0 1)  (2 102 2)  (4 0 1)
-4 2 0:  (0 0 1)  (1 101 1)  (2 0 1)  (3 103 1)  (4 0 1)
-4 2 1:  (0 100 1)  (1 0 2)  (3 103 1)  (4 0 1)
-4 2 2:  (0 100 2)  (2 0 1)  (3 103 1)  (4 0 1)
-4 2 3:  (0 100 2)  (2 0 3)
-4 2 4:  (0 100 2)  (2 0 1)  (3 103 1)  (4 0 1)
-4 3 0:  (0 0 1)  (1 101 2)  (3 0 2)
-4 3 1:  (0 100 1)  (1 0 1)  (2 102 1)  (3 0 2)
-4 3 2:  (0 100 2)  (2 0 3)
-4 3 3:  (0 100 3)  (3 0 2)
-4 3 4:  (0 100 3)  (3 0 2)
-4 4 0:  (0 0 1)  (1 101 3)  (4 0 1)
-4 4 1:  (0 100 1)  (1 0 1)  (2 102 2)  (4 0 1)
-4 4 2:  (0 100 2)  (2 0 1)  (3 103 1)  (4 0 1)
-4 4 3:  (0 100 3)  (3 0 2)
-4 4 4:  (0 100 4)  (4 0 1)
@@ -1,23 +0,0 @@
-== create/release/stage single block file
-== create/release/stage larger file
-== multiple release,drop_cache,stage cycles
-== release+stage shouldn't change stat, data seq or vers
-== stage does change meta_seq
-== can't use stage to extend online file
-stage: must provide file version with --data-version
-Try `stage --help' or `stage --usage' for more information.
-== wrapped region fails
-stage returned -1, not 4096: error Invalid argument (22)
-scoutfs: stage failed: Input/output error (5)
-== non-block aligned offset fails
-stage returned -1, not 4095: error Invalid argument (22)
-scoutfs: stage failed: Input/output error (5)
-== non-block aligned len within block fails
-stage returned -1, not 1024: error Invalid argument (22)
-scoutfs: stage failed: Input/output error (5)
-== partial final block that writes to i_size does work
-== zero length stage doesn't bring blocks online
-== stage of non-regular file fails
-ioctl failed: Inappropriate ioctl for device (25)
-stage: must provide file version with --data-version
-Try `stage --help' or `stage --usage' for more information.
@@ -1,18 +0,0 @@
-=== XATTR_ flag combinations
-dumb_setxattr -p /mnt/test/test/simple-xattr-unit/file -n user.test -v val -c -r
-returned -1 errno 22 (Invalid argument)
-dumb_setxattr -p /mnt/test/test/simple-xattr-unit/file -n user.test -v val -r
-returned -1 errno 61 (No data available)
-dumb_setxattr -p /mnt/test/test/simple-xattr-unit/file -n user.test -v val -c
-returned 0
-dumb_setxattr -p /mnt/test/test/simple-xattr-unit/file -n user.test -v val -c
-returned -1 errno 17 (File exists)
-dumb_setxattr -p /mnt/test/test/simple-xattr-unit/file -n user.test -v val -r
-returned 0
-=== bad lengths
-setfattr: /mnt/test/test/simple-xattr-unit/file: Operation not supported
-setfattr: /mnt/test/test/simple-xattr-unit/file: Numerical result out of range
-setfattr: /mnt/test/test/simple-xattr-unit/file: Numerical result out of range
-setfattr: /mnt/test/test/simple-xattr-unit/file: Argument list too long
-=== good length boundaries
-=== 500 random lengths
@@ -1,13 +0,0 @@
-== create new xattrs
-== update existing xattr
-== remove an xattr
-== remove xattr with files
-== create entries in current log
-== delete small fraction
-== remove files
-== create entries that exceed one log
-== delete fractions in phases
-== remove files
-== create entries for exceed search entry limit
-== delete half
-== entirely remove third batch
@@ -1,2 +0,0 @@
-== create initial files
-== race stage and release
@@ -1,11 +0,0 @@
-== create file for xattr ping pong
-# file: /mnt/test/test/stale-btree-read/file
-user.xat="initial"
-
-== retry btree block read
-trigger btree_stale_read armed: 1
-# file: /mnt/test/test/stale-btree-read/file
-user.xat="btree"
-
-trigger btree_stale_read after: 0
-counter btree_stale_read diff 1
@@ -1,281 +0,0 @@
-Ran:
-generic/001
-generic/002
-generic/005
-generic/006
-generic/007
-generic/011
-generic/013
-generic/014
-generic/020
-generic/028
-generic/032
-generic/034
-generic/035
-generic/037
-generic/039
-generic/040
-generic/041
-generic/053
-generic/056
-generic/057
-generic/062
-generic/065
-generic/066
-generic/067
-generic/069
-generic/070
-generic/071
-generic/073
-generic/076
-generic/084
-generic/086
-generic/087
-generic/088
-generic/090
-generic/092
-generic/098
-generic/101
-generic/104
-generic/106
-generic/107
-generic/117
-generic/124
-generic/129
-generic/131
-generic/169
-generic/184
-generic/221
-generic/228
-generic/236
-generic/245
-generic/249
-generic/257
-generic/258
-generic/286
-generic/294
-generic/306
-generic/307
-generic/308
-generic/309
-generic/313
-generic/315
-generic/322
-generic/335
-generic/336
-generic/337
-generic/341
-generic/342
-generic/343
-generic/348
-generic/360
-generic/376
-generic/377
-Not
-run:
-generic/004
-generic/008
-generic/009
-generic/012
-generic/015
-generic/016
-generic/018
-generic/021
-generic/022
-generic/026
-generic/031
-generic/033
-generic/050
-generic/052
-generic/058
-generic/059
-generic/060
-generic/061
-generic/063
-generic/064
-generic/079
-generic/081
-generic/082
-generic/091
-generic/094
-generic/096
-generic/110
-generic/111
-generic/113
-generic/114
-generic/115
-generic/116
-generic/118
-generic/119
-generic/121
-generic/122
-generic/123
-generic/128
-generic/130
-generic/134
-generic/135
-generic/136
-generic/138
-generic/139
-generic/140
-generic/142
-generic/143
-generic/144
-generic/145
-generic/146
-generic/147
-generic/148
-generic/149
-generic/150
-generic/151
-generic/152
-generic/153
-generic/154
-generic/155
-generic/156
-generic/157
-generic/158
-generic/159
-generic/160
-generic/161
-generic/162
-generic/163
-generic/171
-generic/172
-generic/173
-generic/174
-generic/177
-generic/178
-generic/179
-generic/180
-generic/181
-generic/182
-generic/183
-generic/185
-generic/188
-generic/189
-generic/190
-generic/191
-generic/193
-generic/194
-generic/195
-generic/196
-generic/197
-generic/198
-generic/199
-generic/200
-generic/201
-generic/202
-generic/203
-generic/205
-generic/206
-generic/207
-generic/210
-generic/211
-generic/212
-generic/214
-generic/216
-generic/217
-generic/218
-generic/219
-generic/220
-generic/222
-generic/223
-generic/225
-generic/227
-generic/229
-generic/230
-generic/235
-generic/238
-generic/240
-generic/244
-generic/250
-generic/252
-generic/253
-generic/254
-generic/255
-generic/256
-generic/259
-generic/260
-generic/261
-generic/262
-generic/263
-generic/264
-generic/265
-generic/266
-generic/267
-generic/268
-generic/271
-generic/272
-generic/276
-generic/277
-generic/278
-generic/279
-generic/281
-generic/282
-generic/283
-generic/284
-generic/287
-generic/288
-generic/289
-generic/290
-generic/291
-generic/292
-generic/293
-generic/295
-generic/296
-generic/301
-generic/302
-generic/303
-generic/304
-generic/305
-generic/312
-generic/314
-generic/316
-generic/317
-generic/318
-generic/324
-generic/326
-generic/327
-generic/328
-generic/329
-generic/330
-generic/331
-generic/332
-generic/353
-generic/355
-generic/356
-generic/357
-generic/358
-generic/359
-generic/361
-generic/362
-generic/363
-generic/364
-generic/365
-generic/366
-generic/367
-generic/368
-generic/369
-generic/370
-generic/371
-generic/372
-generic/373
-generic/374
-generic/378
-generic/379
-generic/380
-generic/381
-generic/382
-generic/383
-generic/384
-generic/385
-generic/386
-shared/001
-shared/002
-shared/003
-shared/004
-shared/032
-shared/051
-shared/289
-Passed all 72 tests
--- a/Show More
+++ b/Show More