From efd97633553bc1af9c356397b067bb24f743dd86 Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@versity.com>
Date: Wed, 29 Apr 2020 14:50:11 -0700
Subject: [PATCH] scoutfs: use efficient btree block structures

This btree implementation was first built for the relatively light duty
of indexing segments in the LSM item implementation.  We're now using it
as the core metadata index.  It's already using a lot of cpu to do its
job with small blocks and it only gets more expensive as the block size
increases.  These changes reduce the CPU use of working with the btree
block structures.

We use a balanced binary tree to index items by key in the block.  This
gives us rare tree balancing cost on insertion and deletion instead of
the memmove overhead of maintaining a dense array of item offsets sorted
by key.  The keys are stored in the item struct which are stored in an
array at the front of the block so searching for an item uses contiguous
cachelines.

We add a trailing owner offset to values so that we can iterate through
them.  This is used to track space freed up by values instead of paying
the memmove cost of keeping all the values at the end of the block.  We
occasionally reclaim the fragmented value free space instead of
splitting the block.

Direct item lookups use a small hash table at the end of the block
which maps offsets to items.  It uses linear probing and is guaranteed
to have a light load factor so lookups are very likely to only need
a single cache lookup.

We adjust the watermark for triggering a join from half of a block down
to a quarter.  This results in less utilized blocks on average.  But it
creates distance between the join and split thresholds so we get less
cpu use from constantly joining and splitting if item populations happen
to hover around the previously shared threshold.

While shifting the implementation we choose not to add support for some
features that no longer make sense.  There are no longer callers of
_before and _after, and having synthetic tests to use small btree blocks
no longer makes ense when we can easily create very tall trees.  Both
those btree interfaces and the tiny btree block support will be removed.

Signed-off-by: Zach Brown <zab@versity.com>
---
 kmod/src/btree.c  | 1076 ++++++++++++++++++++++++++++-----------------
 kmod/src/format.h |   42 +-
 2 files changed, 693 insertions(+), 425 deletions(-)

diff --git a/kmod/src/btree.c b/kmod/src/btree.c
index 0aa2edd6..a62535f5 100644
--- a/kmod/src/btree.c
+++ b/kmod/src/btree.c
@@ -28,48 +28,51 @@
 #include "msg.h"
 #include "block.h"
 #include "radix.h"
+#include "avl.h"
 
 #include "scoutfs_trace.h"
 
 /*
  * scoutfs uses a cow btree to index fs metadata.
  *
- * Using a cow btree lets nodes determine the validity of cached blocks
- * based on a single root ref (blkno, seq) that is communicated through
- * locking and messaging.  As long as their cached blocks aren't
- * overwritten in the ring they can continue to use those cached blocks
- * as the newer cowed blocks continue to reference them.
- *
  * Today callers provide all the locking.  They serialize readers and
  * writers and writers and committing all the dirty blocks.
  *
- * Btree items are stored in each block as a small header with the key
- * followed by the value.  New items are allocated from the back of the
- * block towards the front. 
+ * Block reference have sufficient metadata to discover corrupt
+ * references.  If a reader encounters a bad block it backs off which
+ * gives the caller the opportunity to resample the root in case it was
+ * reading through a stale btree that has been overwritten.  This lets
+ * mounts read trees that are modified by other mounts without exclusive
+ * locking.
  *
- * A dense array of item headers after the btree block header stores the
- * offsets of the items and is kept sorted by the item's keys.  The
- * array is small enough that keeping it sorted with memmove() involves
- * a few cache lines at most.
+ * Btree items are stored as a dense array of structs at the front of
+ * each block.  New items are allocated at the end of the array.
+ * Deleted items are swapped with the last item to maintain the dense
+ * array.  The items are indexed by a balanced binary tree with parent
+ * pointers so the relocated item can have references to it updated.
  *
- * Parent blocks in the btree have the same format as leaf blocks.
- * There's one key for every child reference instead of having separator
- * keys between child references.  The key in a child reference contains
- * the largest key that may be found in the child subtree.  The right
- * spine of the tree has maximal keys so that they don't have to be
- * updated if we insert an item with a key greater than everything in
- * the tree.
- */
-
-/*
- * XXX:
- *  - counters and tracing
- *  - could issue read-ahead around reads up to dirty blkno
- *  - have barrier as we cross to prevent refreshing clobbering stale reads
- *  - audit/comment that dirty blknos can wrap around ring
- *  - figure out some max transaction size so ring won't wrap in one
- *  - update the world of comments
- *  - validate structures on read?
+ * Values are allocated from the end of the block towards the front,
+ * consuming the end of free space in the center of the block.  Deleted
+ * values can be merged with this free space, but more likely they'll
+ * create fragmented free space amongst other existing values.  All
+ * values are stored with an offset at the end which contains either the
+ * offset of their item or the offset of the start of their free space.
+ * This lets an infrequent compaction process move items towards the
+ * back of the block to reclaim free space.
+ *
+ * Exact item searches are only performed on leaf blocks.  Leaf blocks
+ * have a hash table at the end of the block which is used to find items
+ * with a specific key.  It uses linear probing and maintains a low load
+ * factor so any given search will most likely only need a single
+ * cacheline.
+ *
+ * Parent block reference items are stored as items with a block
+ * reference as a value.  There's an item with a key for every child
+ * reference instead of having separator keys between child references.
+ * The key in a child reference contains the largest key that may be
+ * found in the child subtree.  The right spine of the tree has maximal
+ * keys so that they don't have to be updated if we insert an item with
+ * a key greater than everything in the tree.
  */
 
 /* btree walking has a bunch of behavioural bit flags */
@@ -81,88 +84,75 @@ enum {
 	 BTW_DIRTY	= (1 <<  4), /* cow stable blocks */
 	 BTW_ALLOC	= (1 <<  5), /* allocate a new block for 0 ref */
 	 BTW_INSERT	= (1 <<  6), /* walking to insert, try splitting */
-	 BTW_DELETE	= (1 <<  7), /* walking to delete, try merging */
+	 BTW_DELETE	= (1 <<  7), /* walking to delete, try joining */
 };
 
-/* number of contiguous bytes used by the item and it's value */
-static inline unsigned int len_bytes(unsigned val_len)
+/* total length of the value payload */
+static inline unsigned int val_bytes(unsigned val_len)
 {
-	return sizeof(struct scoutfs_btree_item) + val_len;
+	return val_len + (val_len ? SCOUTFS_BTREE_VAL_OWNER_BYTES : 0);
 }
 
-/* number of contiguous bytes used an existing item */
+/* number of bytes in a block used by an item with the given value length */
+static inline unsigned int item_len_bytes(unsigned val_len)
+{
+	return sizeof(struct scoutfs_btree_item) + val_bytes(val_len);
+}
+
+/* number of bytes used by an existing item */
 static inline unsigned int item_bytes(struct scoutfs_btree_item *item)
 {
-	return len_bytes(le16_to_cpu(item->val_len));
-}
-
-/* total block bytes used by an item: header, item, key, value */
-static inline unsigned int all_len_bytes(unsigned val_len)
-{
-	return sizeof(struct scoutfs_btree_item_header) + len_bytes(val_len);
+	return item_len_bytes(le16_to_cpu(item->val_len));
 }
 
 /*
- * The minimum number of bytes we allow in a block.  During descent to
- * modify if we see a block with fewer used bytes then we'll try to
- * merge items from neighbours.  If the neighbour also has less than the
- * min bytes then the two blocks are merged.
- *
- * This is carefully calculated so that if two blocks are merged the
- * resulting block will have at least parent min free bytes free so
- * that it's not immediately split again.
- *
- * new_used = min_used + min_used - hdr
- * new_used <= (bs - parent_min_free)
- *
- * min_used + min_used - hdr <= (bs - parent_min_free)
- * 2 * min_used <= (bs - parent_min_free - hdr)
- * min_used <= (bs - parent_min_free - hdr) / 2
+ * Join blocks when they both are 1/4 full.  This puts some distance
+ * between the join threshold and the full threshold for splitting.
+ * Blocks that just split or joined need to undergo a reasonable amount
+ * of item modification before they'll split or join again.
  */
-static inline int min_used_bytes(int block_size)
+static unsigned int join_low_watermark(void)
 {
-	return (block_size - sizeof(struct scoutfs_btree_block) -
-		SCOUTFS_BTREE_PARENT_MIN_FREE_BYTES) / 2;
+	return (SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) / 4;
 }
 
-/* total block bytes used by an existing item */
-static inline unsigned int all_item_bytes(struct scoutfs_btree_item *item)
+/*
+ * return the integer percentages of total space the block could have
+ * consumed by items that is currently consumed.
+ */
+static unsigned int item_full_pct(struct scoutfs_btree_block *bt)
 {
-	return all_len_bytes(le16_to_cpu(item->val_len));
+	return (int)le16_to_cpu(bt->total_item_bytes) * 100 /
+		(SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block));
 }
 
-/* number of free bytes between last item header and first item */
-static inline unsigned int free_bytes(struct scoutfs_btree_block *bt)
+static inline __le16 ptr_off(struct scoutfs_btree_block *bt, void *ptr)
 {
-	unsigned int nr = le32_to_cpu(bt->nr_items);
-
-	return le32_to_cpu(bt->free_end) -
-	       offsetof(struct scoutfs_btree_block, item_hdrs[nr]);
+	return cpu_to_le16(ptr - (void *)bt);
 }
 
-/* all bytes used by item offsets, headers, and values */
-static inline unsigned int used_total(struct scoutfs_btree_block *bt)
+static inline void *off_ptr(struct scoutfs_btree_block *bt, u16 off)
 {
-	return SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block) -
-	       free_bytes(bt);
+	return (void *)bt + off;
 }
 
 static inline struct scoutfs_btree_item *
-off_item(struct scoutfs_btree_block *bt, __le32 off)
+off_item(struct scoutfs_btree_block *bt, __le16 off)
 {
-	return (void *)bt + le32_to_cpu(off);
+	return (void *)bt + le16_to_cpu(off);
 }
 
-static inline struct scoutfs_btree_item *
-pos_item(struct scoutfs_btree_block *bt, unsigned int pos)
+static struct scoutfs_btree_item *last_item(struct scoutfs_btree_block *bt)
 {
-	return off_item(bt, bt->item_hdrs[pos].off);
+	BUG_ON(bt->nr_items == 0);
+
+	return &bt->items[le16_to_cpu(bt->nr_items) - 1];
 }
 
-static inline struct scoutfs_btree_item *
-last_item(struct scoutfs_btree_block *bt)
+/* offset of the start of the free range in the middle of the block */
+static inline unsigned int mid_free_off(struct scoutfs_btree_block *bt)
 {
-	return pos_item(bt, le32_to_cpu(bt->nr_items) - 1);
+	return le16_to_cpu(ptr_off(bt, &bt->items[le16_to_cpu(bt->nr_items)]));
 }
 
 static inline struct scoutfs_key *item_key(struct scoutfs_btree_item *item)
@@ -170,9 +160,10 @@ static inline struct scoutfs_key *item_key(struct scoutfs_btree_item *item)
 	return &item->key;
 }
 
-static inline void *item_val(struct scoutfs_btree_item *item)
+static inline void *item_val(struct scoutfs_btree_block *bt,
+			     struct scoutfs_btree_item *item)
 {
-	return item->val;
+	return off_ptr(bt, le16_to_cpu(item->val_off));
 }
 
 static inline unsigned item_val_len(struct scoutfs_btree_item *item)
@@ -180,173 +171,482 @@ static inline unsigned item_val_len(struct scoutfs_btree_item *item)
 	return le16_to_cpu(item->val_len);
 }
 
-/*
- * Returns the sorted item position that an item with the given key
- * should occupy.
- *
- * It sets *cmp to the final comparison of the given key and the
- * position's item key.  This can only be -1 or 0 because we bias
- * towards returning the pos that a key should occupy.
- *
- * If the given key is greater then all items' keys then the number of
- * items can be returned.
- */
-static int find_pos(struct scoutfs_btree_block *bt, struct scoutfs_key *key,
-		    int *cmp)
+static struct scoutfs_btree_item *node_item(struct scoutfs_avl_node *node)
 {
+	if (node == NULL)
+		return NULL;
+	return container_of(node, struct scoutfs_btree_item, node);
+}
+
+static struct scoutfs_btree_item *prev_item(struct scoutfs_btree_block *bt,
+					    struct scoutfs_btree_item *item)
+{
+	if (item == NULL)
+		return NULL;
+	return node_item(scoutfs_avl_prev(&bt->item_root, &item->node));
+}
+
+static struct scoutfs_btree_item *next_item(struct scoutfs_btree_block *bt,
+					    struct scoutfs_btree_item *item)
+{
+	if (item == NULL)
+		return NULL;
+	return node_item(scoutfs_avl_next(&bt->item_root, &item->node));
+}
+
+static int cmp_key_item(void *arg, struct scoutfs_avl_node *node)
+{
+	struct scoutfs_key *key = arg;
+	struct scoutfs_btree_item *item = node_item(node);
+
+	return scoutfs_key_compare(key, item_key(item));
+}
+
+/*
+ * We have a small fixed-size linearly probed hash table at the end of
+ * leaf blocks which is used for direct item lookups (as opposed to
+ * iterators).  The hash table only stores non-zero offsets to the
+ * items.  If an item is moved then its offset is updated.  The hash
+ * table is sized to allow a max load of 75%, but most items are larger
+ * and most blocks aren't full.
+ */
+static int leaf_item_hash_ind(struct scoutfs_key *key)
+{
+	return crc32c(~0, key, sizeof(struct scoutfs_key)) %
+	       SCOUTFS_BTREE_LEAF_ITEM_HASH_NR;
+}
+
+static __le16 *leaf_item_hash_buckets(struct scoutfs_btree_block *bt)
+{
+	return (void *)bt + SCOUTFS_BLOCK_SIZE -
+		SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES;
+}
+
+static inline int leaf_item_hash_next_bucket(int i)
+{
+	if (++i >= SCOUTFS_BTREE_LEAF_ITEM_HASH_NR)
+		i = 0;
+	return i;
+}
+
+#define foreach_leaf_item_hash_bucket(i, nr, key)			       \
+	for (i = leaf_item_hash_ind(key), nr = SCOUTFS_BTREE_LEAF_ITEM_HASH_NR;\
+	     nr-- > 0;							       \
+	     i = leaf_item_hash_next_bucket(i))
+
+static struct scoutfs_btree_item *
+leaf_item_hash_search(struct scoutfs_btree_block *bt, struct scoutfs_key *key)
+{
+	__le16 *buckets = leaf_item_hash_buckets(bt);
 	struct scoutfs_btree_item *item;
-	unsigned int start = 0;
-	unsigned int end = le32_to_cpu(bt->nr_items);
-	unsigned int pos = 0;
+	__le16 off;
+	int nr;
+	int i;
 
-	*cmp = -1;
+	if (WARN_ON_ONCE(bt->level > 0))
+		return NULL;
 
-	while (start < end) {
-		pos = start + (end - start) / 2;
+	foreach_leaf_item_hash_bucket(i, nr, key) {
+		off = buckets[i];
+		if (off == 0)
+			return NULL;
 
-		item = pos_item(bt, pos);
-		*cmp = scoutfs_key_compare(key, item_key(item));
-		if (*cmp < 0) {
-			end = pos;
-		} else if (*cmp > 0) {
-			start = ++pos;
-			*cmp = -1;
-		} else {
+		item = off_item(bt, off);
+		if (scoutfs_key_compare(key, item_key(item)) == 0)
+			return item;
+	}
+
+	return NULL;
+}
+
+static void leaf_item_hash_insert(struct scoutfs_btree_block *bt,
+				  struct scoutfs_key *key, __le16 off)
+{
+	__le16 *buckets = leaf_item_hash_buckets(bt);
+	int nr;
+	int i;
+
+	if (bt->level > 0)
+		return;
+
+	foreach_leaf_item_hash_bucket(i, nr, key) {
+		if (buckets[i] == 0) {
+			buckets[i] = off;
+			return;
+		}
+	}
+
+	/* table should have been been enough for all items */
+	BUG();
+}
+
+/*
+ * Deletion clears the offset in a bucket.  That could create a
+ * discontinuity that would stop a search from seeing colliding
+ * insertions that were pushed into further buckets.  Each time we zero
+ * a bucket we rehash all the populated buckets following it.  There
+ * won't be many in our light load tables and this works reliably as the
+ * contiguous population wraps past the end of table.  Comparing hashed
+ * bucket positions to find candidates to relocate after the wrap is
+ * tricky.  
+ */
+static void leaf_item_hash_delete(struct scoutfs_btree_block *bt,
+				  struct scoutfs_key *key, __le16 del_off)
+{
+	__le16 *buckets = leaf_item_hash_buckets(bt);
+	__le16 off;
+	int nr;
+	int i;
+
+	if (bt->level > 0)
+		return;
+
+	foreach_leaf_item_hash_bucket(i, nr, key) {
+		off = buckets[i];
+		/* we must find the item we're trying to delete */
+		BUG_ON(off == 0);
+
+		if (off == del_off) {
+			buckets[i] = 0;
 			break;
 		}
 	}
 
-	return pos;
+	while ((i = leaf_item_hash_next_bucket(i)), buckets[i] != 0) {
+		off = buckets[i];
+		buckets[i] = 0;
+		leaf_item_hash_insert(bt, item_key(off_item(bt, off)), off);
+	}
 }
 
-/* move a number of contigous elements from the src index to the dst index */
-#define memmove_arr(arr, dst, src, nr) \
-	memmove(&(arr)[dst], &(arr)[src], (nr) * sizeof(*(arr)))
+static void leaf_item_hash_change(struct scoutfs_btree_block *bt,
+				  struct scoutfs_key *key, __le16 to,
+				  __le16 from)
+{
+	__le16 *buckets = leaf_item_hash_buckets(bt);
+	__le16 off;
+	int nr;
+	int i;
+
+	if (bt->level > 0)
+		return;
+
+	foreach_leaf_item_hash_bucket(i, nr, key) {
+		off = buckets[i];
+		/* we must find the item we're trying to change */
+		BUG_ON(off == 0);
+
+		if (off == from) {
+			buckets[i] = to;
+			return;
+		}
+	}
+}
+
+/*
+ * Given an offset to the start of a value, return info describing the
+ * previous value in the block.  Each value ends with an owner offset
+ * which points to either the value's item if it's in use or to the
+ * start of the value if it's been freed.  Either the item is returned
+ * or the length of the previous value is set.
+ */
+static struct scoutfs_btree_item *
+get_prev_val_owner(struct scoutfs_btree_block *bt, unsigned int off,
+		   unsigned int *prev_val_bytes)
+{
+	__le16 *owner = off_ptr(bt, off - sizeof(*owner));
+	unsigned int own = get_unaligned_le16(owner);
+
+	if (own >= mid_free_off(bt)) {
+		*prev_val_bytes = off - own;
+		return NULL;
+	} else {
+		*prev_val_bytes = 0;
+		return off_ptr(bt, own);
+	}
+}
+
+/*
+ * Set the owner offset at the end of a full value, the given length includes
+ * the offset.
+ */
+static void set_val_owner(struct scoutfs_btree_block *bt, unsigned int val_off,
+			  unsigned int vb, __le16 item_off)
+{
+	__le16 *owner = off_ptr(bt, val_off + vb - sizeof(*owner));
+
+	put_unaligned_le16(le16_to_cpu(item_off) ?: val_off, owner);
+}
+
+/*
+ * As values are freed they can leave fragmented free space amongst
+ * other values.  This is called when we can't insert because there
+ * isn't enough free space but we know that there's sufficient free
+ * space amongst the values for the new insertion.
+ *
+ * But we only want to do this when there is enough free space to
+ * justify the cost of the compaction.  We don't want to bother
+ * compacting if the block is almost full and we just be split in a few
+ * more operations.  The split heuristic requires a generous amount of
+ * fragmented free space that will avoid a split.
+ */
+static void compact_values(struct scoutfs_btree_block *bt)
+{
+	struct scoutfs_btree_item *item;
+	unsigned int free_off;
+	unsigned int free_len;
+	unsigned int to_off;
+	unsigned int end;
+	unsigned int vb;
+	void *from;
+	void *to;
+
+	if (bt->last_free_off == 0)
+		return;
+
+	free_off = le16_to_cpu(bt->last_free_off);
+	free_len = le16_to_cpu(bt->last_free_len);
+	end = mid_free_off(bt) + le16_to_cpu(bt->mid_free_len);
+
+	while (free_off > end) {
+		item = get_prev_val_owner(bt, free_off, &vb);
+		if (item == NULL) {
+			free_off -= vb;
+			free_len += vb;
+			continue;
+		}
+
+		from = off_ptr(bt, le16_to_cpu(item->val_off));
+		vb = val_bytes(le16_to_cpu(item->val_len));
+		to_off = free_off + free_len - vb;
+		to = off_ptr(bt, to_off);
+		if (to >= from + vb)
+			memcpy(to, from, vb);
+		else
+			memmove(to, from, vb);
+
+		free_off = le16_to_cpu(item->val_off);
+		item->val_off = cpu_to_le16(to_off);
+	}
+
+	le16_add_cpu(&bt->mid_free_len, free_len);
+	bt->last_free_off = 0;
+	bt->last_free_len = 0;
+}
+
+/*
+ * Insert an item's value into the block.  The caller has made sure
+ * there's free space.  We store the value at the end of free space in
+ * the block and point its final offset at its owning item, and copy the
+ * value into place.
+ */
+static __le16 insert_value(struct scoutfs_btree_block *bt, __le16 item_off,
+			   void *val, unsigned val_len)
+{
+	unsigned int val_off;
+	unsigned int vb;
+
+	if (val_len == 0)
+		return 0;
+
+	BUG_ON(le16_to_cpu(bt->mid_free_len) < val_bytes(val_len));
+
+	vb = val_bytes(val_len);
+	val_off = mid_free_off(bt) + le16_to_cpu(bt->mid_free_len) - vb;
+	le16_add_cpu(&bt->mid_free_len, -vb);
+
+	memcpy(off_ptr(bt, val_off), val, val_len);
+	set_val_owner(bt, val_off, vb, item_off);
+
+	return cpu_to_le16(val_off);
+}
+
+/*
+ * Delete an item's value from the block.  The caller has updated the
+ * item.  We leave behind a free region whose owner offset indicates
+ * that the value isn't in use.  It might merge with the central free
+ * region or the final freed value, and might become the final freed
+ * value.
+ */
+static void delete_value(struct scoutfs_btree_block *bt,
+			 unsigned int val_off, unsigned int val_len)
+{
+	unsigned int free_off;
+	unsigned int free_len;
+	bool is_last;
+
+	if (val_len == 0)
+		return;
+
+	free_off = val_off;
+	free_len = val_bytes(val_len);
+	is_last = false;
+
+	/* see if we can merge with mid free region */
+	if (mid_free_off(bt) + le16_to_cpu(bt->mid_free_len) == free_off) {
+		le16_add_cpu(&bt->mid_free_len, free_len);
+		return;
+	}
+
+	if (free_off + free_len == le16_to_cpu(bt->last_free_off)) {
+		/* merge with front of last free */
+		free_len += le16_to_cpu(bt->last_free_len);
+		is_last = true;
+
+	} else if ((le16_to_cpu(bt->last_free_off) +
+		    le16_to_cpu(bt->last_free_len)) == free_off) {
+		/* merge with end of last free */
+		free_off = le16_to_cpu(bt->last_free_off);
+		free_len += le16_to_cpu(bt->last_free_len);
+		is_last = true;
+
+	} else if (free_off > le16_to_cpu(bt->last_free_off)) {
+		/* become new last */
+		is_last = true;
+	}
+
+	set_val_owner(bt, free_off, free_len, 0);
+	if (is_last) {
+		bt->last_free_off = cpu_to_le16(free_off);
+		bt->last_free_len = cpu_to_le16(free_len);
+	}
+}
 
 /*
  * Insert a new item into the block.  The caller has made sure that
- * there's space for the item and its metadata.
+ * there is sufficient free space in block for the new item.  We might
+ * have to compact the values to the end of the block to reclaim
+ * fragmented free space between values.
+ *
+ * This only consumes free space.  It's safe to use references to block
+ * structures after this call.
  */
-static void create_item(struct scoutfs_btree_block *bt, unsigned int pos,
-			struct scoutfs_key *key, void *val, unsigned val_len)
+static void create_item(struct scoutfs_btree_block *bt,
+			struct scoutfs_key *key, void *val, unsigned val_len,
+			struct scoutfs_avl_node *parent, int cmp)
 {
-	unsigned int nr = le32_to_cpu(bt->nr_items);
 	struct scoutfs_btree_item *item;
-	unsigned all_bytes;
 
-	all_bytes = all_len_bytes(val_len);
-	BUG_ON(free_bytes(bt) < all_bytes);
+	BUG_ON(le16_to_cpu(bt->mid_free_len) < item_len_bytes(val_len));
 
-	if (pos < nr)
-		memmove_arr(bt->item_hdrs, pos + 1, pos, nr - pos);
+	le16_add_cpu(&bt->mid_free_len,
+		     -(u16)sizeof(struct scoutfs_btree_item));
+	le16_add_cpu(&bt->nr_items, 1);
+	item = last_item(bt);
 
-	le32_add_cpu(&bt->free_end, -len_bytes(val_len));
-	bt->item_hdrs[pos].off = bt->free_end;
-	nr++;
-	bt->nr_items = cpu_to_le32(nr);
+	item->key = *key;
 
-	BUG_ON(le32_to_cpu(bt->free_end) <
-	       offsetof(struct scoutfs_btree_block, item_hdrs[nr]));
+	scoutfs_avl_insert(&bt->item_root, parent, &item->node, cmp);
+	leaf_item_hash_insert(bt, item_key(item), ptr_off(bt, item));
 
-	item = pos_item(bt, pos);
-	*item_key(item) = *key;
+	item->val_off = insert_value(bt, ptr_off(bt, item), val, val_len);
 	item->val_len = cpu_to_le16(val_len);
 
-	if (val_len)
-		memcpy(item_val(item), val, val_len);
+	le16_add_cpu(&bt->total_item_bytes, item_bytes(item));
 }
 
 /*
  * Delete an item from a btree block.
  *
- * This moves all the headers after the item (in sort order) towards the
- * start of the header array.  It moves all the items before the removed
- * item towards the end of the block.  The items that have to be moved
- * can be anywhere in the sort order.  We first move the item region
- * and then walk the headers looking for offsets that need to be updated.
- *
- * The item motion means that callers can not hold item references
- * across item deletion.
+ * As we delete the item we can relocate an unrelated item to maintain
+ * the dense array of items.  The caller can use another single item
+ * after this call if they give us the opportunity to let them know if
+ * we move it.
  */
-static void delete_item(struct scoutfs_btree_block *bt, unsigned int pos)
+static void delete_item(struct scoutfs_btree_block *bt,
+			struct scoutfs_btree_item *item,
+			struct scoutfs_btree_item **use_after)
 {
-	unsigned int nr = le32_to_cpu(bt->nr_items);
-	unsigned int updated;
-	unsigned int total;
-	unsigned int first;
-	unsigned int bytes;
-	unsigned int last;
-	unsigned int off;
-	int i;
+	struct scoutfs_btree_item *last;
+	unsigned int val_off;
+	unsigned int val_len;
 
-	/* calculate region of items to move */
-	first = le32_to_cpu(bt->free_end);
-	last = le32_to_cpu(bt->item_hdrs[pos].off);
-	total = last - first;
-	bytes = item_bytes(pos_item(bt, pos));
+	/* save some values before we delete the item */
+	val_off = le16_to_cpu(item->val_off);
+	val_len = le16_to_cpu(item->val_len);
+	last = last_item(bt);
 
-	/* move items before deleted to the back of the block */
-	if (total > 0) {
-		/* update headers before memove overwrites deleted item */
-		for (i = 0, updated = 0; i < nr && updated < total; i++) {
-			off = le32_to_cpu(bt->item_hdrs[i].off);
-			if (off >= first && off < last) {
-				updated += item_bytes(pos_item(bt, i));
-				le32_add_cpu(&bt->item_hdrs[i].off, bytes);
-			}
-		}
-		BUG_ON(updated != total);
-
-		memmove(off_item(bt, cpu_to_le32(first + bytes)),
-			off_item(bt, cpu_to_le32(first)), total);
+	/* delete the item */
+	scoutfs_avl_delete(&bt->item_root, &item->node);
+	leaf_item_hash_delete(bt, item_key(item), ptr_off(bt, item));
+	le16_add_cpu(&bt->nr_items, -1);
+	le16_add_cpu(&bt->mid_free_len, sizeof(struct scoutfs_btree_item));
+	le16_add_cpu(&bt->total_item_bytes, -item_bytes(item));
 
+	/* move the final item into the deleted space */
+	if (last != item) {
+		item->key = last->key;
+		item->val_off = last->val_off;
+		item->val_len = last->val_len;
+		if (last->val_len)
+			set_val_owner(bt, le16_to_cpu(last->val_off),
+				      val_bytes(le16_to_cpu(last->val_len)),
+				      ptr_off(bt, item));
+		leaf_item_hash_change(bt, &last->key, ptr_off(bt, item),
+				      ptr_off(bt, last));
+		scoutfs_avl_relocate(&bt->item_root, &item->node,&last->node);
+		if (use_after && *use_after == last)
+			*use_after = item;
 	}
 
-	/* wipe deleted bytes to avoid leaking data */
-	memset(off_item(bt, cpu_to_le32(first)), 0, bytes);
-
-	if (pos < (nr - 1))
-		memmove_arr(bt->item_hdrs, pos, pos + 1, nr - 1 - pos);
-
-	le32_add_cpu(&bt->free_end, bytes);
-	le32_add_cpu(&bt->nr_items, -1);
+	delete_value(bt, val_off, val_len);
 }
 
 /*
  * Move items from a source block to a destination block.  The caller
- * tells us if we're moving from the tail of the source block right to
- * the head of the destination block, or vice versa.  We stop moving
- * once we've moved enough bytes of items.
+ * has made sure there's sufficient free space in the destination block,
+ * though item creation may need to compact values.  The caller tells us
+ * if we're moving from the tail of the source block right to the head
+ * of the destination block, or vice versa.  We're always adding the
+ * first or last item to the avl, so the parent is always the previous
+ * first or last node.
  */
 static void move_items(struct scoutfs_btree_block *dst,
 		       struct scoutfs_btree_block *src, bool move_right,
 		       int to_move)
 {
+	struct scoutfs_avl_node *par;
+	struct scoutfs_avl_node *node;
 	struct scoutfs_btree_item *from;
-	unsigned int t;
-	unsigned int f;
+	struct scoutfs_btree_item *next;
+	int cmp;
 
 	if (move_right) {
-		f = le32_to_cpu(src->nr_items) - 1;
-		t = 0;
+		node = scoutfs_avl_last(&src->item_root);
+		par = scoutfs_avl_first(&dst->item_root);
+		cmp = -1;
 	} else {
-		f = 0;
-		t = le32_to_cpu(dst->nr_items);
+		node = scoutfs_avl_first(&src->item_root);
+		par = scoutfs_avl_last(&dst->item_root);
+		cmp = 1;
 	}
+	from = node_item(node);
 
-	while (f < le32_to_cpu(src->nr_items) && to_move > 0) {
-		from = pos_item(src, f);
+	while (to_move > 0 && from != NULL) {
+		to_move -= item_bytes(from);
 
-		create_item(dst, t, item_key(from), item_val(from),
-			    item_val_len(from));
-
-		to_move -= all_item_bytes(from);
-
-		delete_item(src, f);
 		if (move_right)
-			f--;
+			next = prev_item(src, from);
 		else
-			t++;
+			next = next_item(src, from);
+
+		create_item(dst, item_key(from), item_val(src, from),
+			    item_val_len(from), par, cmp);
+
+		if (move_right) {
+			if (par)
+				par = scoutfs_avl_prev(&dst->item_root, par);
+			else
+				par = scoutfs_avl_first(&dst->item_root);
+		} else {
+			if (par)
+				par = scoutfs_avl_next(&dst->item_root, par);
+			else
+				par = scoutfs_avl_last(&dst->item_root);
+		}
+
+		delete_item(src, from, &next);
+		from = next;
 	}
 }
 
@@ -468,7 +768,6 @@ retry:
 		/* returning a newly allocated block */
 		memset(new, 0, SCOUTFS_BLOCK_SIZE);
 		new->hdr.fsid = super->hdr.fsid;
-		new->free_end = cpu_to_le32(SCOUTFS_BLOCK_SIZE);
 	}
 	bl = new_bl;
 	bt = new;
@@ -497,35 +796,52 @@ out:
  * specifies the key in the item that describes the items in the child.
  */
 static void create_parent_item(struct scoutfs_btree_block *parent,
-			       unsigned pos, struct scoutfs_btree_block *child,
+			       struct scoutfs_btree_block *child,
 			       struct scoutfs_key *key)
 {
+	struct scoutfs_avl_node *par;
+	int cmp;
 	struct scoutfs_btree_ref ref = {
 		.blkno = child->hdr.blkno,
 		.seq = child->hdr.seq,
 	};
 
-	create_item(parent, pos, key, &ref, sizeof(ref));
+	scoutfs_avl_search(&parent->item_root, cmp_key_item, key, &cmp, &par,
+			   NULL, NULL);
+	create_item(parent, key, &ref, sizeof(ref), par, cmp);
 }
 
 /*
- * Update the parent item that refers to a child by deleting and
- * recreating it.  Descent should have ensured that there was always
- * room for a maximal key in parents.
+ * Update an existing parent item reference to a child who may be new or
+ * may have had its last item changed.
  */
 static void update_parent_item(struct scoutfs_btree_block *parent,
-			       unsigned pos, struct scoutfs_btree_block *child)
+			       struct scoutfs_btree_item *par_item,
+			       struct scoutfs_btree_block *child)
 {
-	struct scoutfs_btree_item *item = last_item(child);
+	struct scoutfs_btree_ref *ref = item_val(parent, par_item);
 
-	delete_item(parent, pos);
-	create_parent_item(parent, pos, child, item_key(item));
+	par_item->key = *item_key(last_item(child));
+	ref->blkno = child->hdr.blkno;
+	ref->seq = child->hdr.seq;
+}
+
+static void init_btree_block(struct scoutfs_btree_block *bt, int level)
+{
+	int free;
+
+	free = SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block);
+	if (level == 0)
+		free -= SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES;
+
+	bt->level = level;
+	bt->mid_free_len = cpu_to_le16(free);
 }
 
 /*
  * See if we need to split this block while descending for insertion so
  * that we have enough space to insert.  Parent blocks need enough space
- * for a new item and child ref if a child block splits.  Leaf blocks
+ * to insert a new parent item if a child block splits.  Leaf blocks
  * need enough space to insert the new item with its value.
  *
  * We split to the left so that the greatest key in the existing block
@@ -538,35 +854,36 @@ static int try_split(struct super_block *sb,
 		     struct scoutfs_block_writer *wri,
 		     struct scoutfs_btree_root *root,
 		     struct scoutfs_key *key, unsigned val_len,
-		     struct scoutfs_btree_block *parent, unsigned pos,
+		     struct scoutfs_btree_block *parent,
 		     struct scoutfs_btree_block *right)
 {
 	struct scoutfs_block *left_bl = NULL;
 	struct scoutfs_block *par_bl = NULL;
 	struct scoutfs_btree_block *left;
-	struct scoutfs_btree_item *item;
 	struct scoutfs_key max_key;
-	unsigned int all_bytes;
 	int ret;
 	int err;
 
-	if (scoutfs_option_bool(sb, Opt_btree_force_tiny_blocks))
-		all_bytes = SCOUTFS_BLOCK_SIZE - SCOUTFS_BTREE_TINY_BLOCK_SIZE;
-	else if (right->level)
-		all_bytes = SCOUTFS_BTREE_PARENT_MIN_FREE_BYTES;
-	else
-		all_bytes = all_len_bytes(val_len);
+	/* parents need to leave room for child references */
+	if (right->level)
+		val_len = sizeof(struct scoutfs_btree_ref);
 
-	if (free_bytes(right) >= all_bytes)
+	/* don't need to split if there's enough space for the item */
+	if (le16_to_cpu(right->mid_free_len) >= item_len_bytes(val_len))
 		return 0;
 
+	if (item_full_pct(right) < 80) {
+		compact_values(right);
+		return 0;
+	}
+
 	/* alloc split neighbour first to avoid unwinding tree growth */
 	ret = get_ref_block(sb, alloc, wri, BTW_ALLOC, NULL, &left_bl);
 	if (ret)
 		return ret;
 	left = left_bl->data;
 
-	left->level = right->level;
+	init_btree_block(left, right->level);
 
 	if (!parent) {
 		ret = get_ref_block(sb, alloc, wri, BTW_ALLOC, NULL, &par_bl);
@@ -579,21 +896,19 @@ static int try_split(struct super_block *sb,
 		}
 		parent = par_bl->data;
 
-		parent->level = root->height;
+		init_btree_block(parent, root->height);
 		root->height++;
 		root->ref.blkno = parent->hdr.blkno;
 		root->ref.seq = parent->hdr.seq;
 
 		scoutfs_key_set_ones(&max_key);
-
-		pos = 0;
-		create_parent_item(parent, pos, right, &max_key);
+		create_parent_item(parent, right, &max_key);
 	}
 
-	move_items(left, right, false, used_total(right) / 2);
+	move_items(left, right, false,
+		   le16_to_cpu(right->total_item_bytes) / 2);
 
-	item = last_item(left);
-	create_parent_item(parent, pos, left, item_key(item));
+	create_parent_item(parent, left, item_key(last_item(left)));
 
 	scoutfs_block_put(sb, left_bl);
 	scoutfs_block_put(sb, par_bl);
@@ -603,78 +918,73 @@ static int try_split(struct super_block *sb,
 
 /*
  * This is called during descent for deletion when we have a parent and
- * might need to merge items from a sibling block if this block has too
- * much free space.  Eventually we'll be able to fit all of the
+ * might need to join this block with a sibling block if this block has
+ * too much free space.  Eventually we'll be able to fit all of the
  * sibling's items in our free space which lets us delete the sibling
  * block.
- *
- * XXX this could more cleverly chose a merge candidate sibling
  */
-static int try_merge(struct super_block *sb,
-		     struct scoutfs_radix_allocator *alloc,
-		     struct scoutfs_block_writer *wri,
-		     struct scoutfs_btree_root *root,
-		     struct scoutfs_btree_block *parent, unsigned pos,
-		     struct scoutfs_btree_block *bt)
+static int try_join(struct super_block *sb,
+		    struct scoutfs_radix_allocator *alloc,
+		    struct scoutfs_block_writer *wri,
+		    struct scoutfs_btree_root *root,
+		    struct scoutfs_btree_block *parent,
+		    struct scoutfs_btree_item *par_item,
+		    struct scoutfs_btree_block *bt)
 {
+	struct scoutfs_btree_item *sib_par_item;
 	struct scoutfs_btree_block *sib;
 	struct scoutfs_block *sib_bl;
 	struct scoutfs_btree_ref *ref;
-	unsigned int min_used;
-	unsigned int sib_pos;
+	unsigned int sib_tot;
 	bool move_right;
 	int to_move;
 	int ret;
 
-	BUILD_BUG_ON(min_used_bytes(SCOUTFS_BTREE_TINY_BLOCK_SIZE) < 0);
-
-	if (scoutfs_option_bool(sb, Opt_btree_force_tiny_blocks))
-		min_used = min_used_bytes(SCOUTFS_BTREE_TINY_BLOCK_SIZE);
-	else
-		min_used = min_used_bytes(SCOUTFS_BLOCK_SIZE);
-
-	if (used_total(bt) >= min_used)
+	if (le16_to_cpu(bt->total_item_bytes) >= join_low_watermark())
 		return 0;
 
 	/* move items right into our block if we have a left sibling */
-	if (pos) {
-		sib_pos = pos - 1;
+	sib_par_item = prev_item(parent, par_item);
+	if (sib_par_item) {
 		move_right = true;
 	} else {
-		sib_pos = pos + 1;
+		sib_par_item = next_item(parent, par_item);
 		move_right = false;
 	}
 
-	ref = item_val(pos_item(parent, sib_pos));
+	ref = item_val(parent, sib_par_item);
 	ret = get_ref_block(sb, alloc, wri, BTW_DIRTY, ref, &sib_bl);
 	if (ret)
 		return ret;
 	sib = sib_bl->data;
 
-	if (used_total(sib) < min_used)
-		to_move = used_total(sib);
+	sib_tot = le16_to_cpu(bt->total_item_bytes);
+	if (sib_tot < join_low_watermark())
+		to_move = sib_tot;
 	else
-		to_move = min_used - used_total(bt);
+		to_move = sib_tot - join_low_watermark();
 
+	if (le16_to_cpu(bt->mid_free_len) < to_move)
+		compact_values(bt);
 	move_items(bt, sib, move_right, to_move);
 
 	/* update our parent's item */
 	if (!move_right)
-		update_parent_item(parent, pos, bt);
+		update_parent_item(parent, par_item, bt);
 
 	/* update or delete sibling's parent item */
-	if (le32_to_cpu(sib->nr_items) == 0) {
-		delete_item(parent, sib_pos);
+	if (le16_to_cpu(sib->nr_items) == 0) {
+		delete_item(parent, sib_par_item, NULL);
 		ret = scoutfs_radix_free(sb, alloc, wri,
 					 le64_to_cpu(sib->hdr.blkno));
 		BUG_ON(ret); /* could have dirtied alloc to avoid error */
 
 	} else if (move_right) {
-		update_parent_item(parent, sib_pos, sib);
+		update_parent_item(parent, sib_par_item, sib);
 	}
 
 	/* and finally shrink the tree if our parent is the root with 1 */
-	if (le32_to_cpu(parent->nr_items) == 1) {
+	if (le16_to_cpu(parent->nr_items) == 1) {
 		root->height--;
 		root->ref.blkno = bt->hdr.blkno;
 		root->ref.seq = bt->hdr.seq;
@@ -688,77 +998,6 @@ static int try_merge(struct super_block *sb,
 	return 1;
 }
 
-/*
- * A quick and dirty verification of the btree block.  We could add a
- * lot more checks and make it only verified on read or after
- * significant events like splitting and merging.
- */
-static int verify_btree_block(struct scoutfs_btree_block *bt, int level)
-{
-	struct scoutfs_btree_item *item;
-	struct scoutfs_btree_item *prev = NULL;
-	unsigned int bytes = 0;
-	unsigned int after_off = sizeof(struct scoutfs_btree_block);
-	unsigned int first_off;
-	unsigned int off;
-	unsigned int nr;
-	unsigned int i = 0;
-	int bad = 1;
-
-	nr = le32_to_cpu(bt->nr_items);
-	if (nr == 0)
-		goto out;
-
-	after_off = offsetof(struct scoutfs_btree_block, item_hdrs[nr]);
-	first_off = SCOUTFS_BLOCK_SIZE;
-
-	if (after_off > SCOUTFS_BLOCK_SIZE) {
-		nr = 0;
-		goto out;
-	}
-
-	for (i = 0; i < nr; i++) {
-		off = le32_to_cpu(bt->item_hdrs[i].off);
-		if (off >= SCOUTFS_BLOCK_SIZE || off < after_off)
-			goto out;
-
-		first_off = min(first_off, off);
-
-		item = pos_item(bt, i);
-		bytes += item_bytes(item);
-
-		if (i > 0 && scoutfs_key_compare(item_key(item),
-						 item_key(prev)) <= 0)
-			goto out;
-
-		prev = item;
-	}
-
-	if (first_off < le32_to_cpu(bt->free_end))
-		goto out;
-
-	if ((le32_to_cpu(bt->free_end) + bytes) != SCOUTFS_BLOCK_SIZE)
-		goto out;
-
-	bad = 0;
-out:
-	if (bad) {
-		printk("bt %p blkno %llu level %d end %u nr %u (after %u bytes %u)\n",
-			bt, le64_to_cpu(bt->hdr.blkno), level,
-			le32_to_cpu(bt->free_end), le32_to_cpu(bt->nr_items),
-			after_off, bytes);
-		for (i = 0; i < nr; i++) {
-			item = pos_item(bt, i);
-			printk("  [%u] off %u val_len %u\n",
-			       i, le32_to_cpu(bt->item_hdrs[i].off),
-			       item_val_len(item));
-		}
-		BUG_ON(bad);
-	}
-
-	return 0;
-}
-
 /*
  * Return the leaf block that should contain the given key.  The caller
  * is responsible for searching the leaf block and performing their
@@ -788,12 +1027,14 @@ static int btree_walk(struct super_block *sb,
 	struct scoutfs_block *bl = NULL;
 	struct scoutfs_btree_block *parent = NULL;
 	struct scoutfs_btree_block *bt;
+	struct scoutfs_btree_item *par_item;
 	struct scoutfs_btree_item *item;
+	struct scoutfs_btree_item *prev;
+	struct scoutfs_avl_node *next_node;
+	struct scoutfs_avl_node *node;
 	struct scoutfs_btree_ref *ref;
 	unsigned int level;
-	unsigned int pos;
 	unsigned int nr;
-	int cmp;
 	int ret;
 
 	if (WARN_ON_ONCE((flags & (BTW_NEXT|BTW_PREV)) && iter_key == NULL) ||
@@ -804,11 +1045,11 @@ restart:
 	scoutfs_block_put(sb, par_bl);
 	par_bl = NULL;
 	parent = NULL;
+	par_item = NULL;
 	scoutfs_block_put(sb, bl);
 	bl = NULL;
 	bt = NULL;
 	level = root->height;
-	pos = 0;
 	ret = 0;
 
 	if (!root->height) {
@@ -819,7 +1060,7 @@ restart:
 					    &root->ref, &bl);
 			if (ret == 0) {
 				bt = bl->data;
-				bt->level = 0;
+				init_btree_block(bt, 0);
 				root->height = 1;
 			}
 		}
@@ -834,11 +1075,6 @@ restart:
 			break;
 		bt = bl->data;
 
-		/* XXX it'd be nice to make this tunable */
-		ret = 0 && verify_btree_block(bt, level);
-		if (ret)
-			break;
-
 		/* XXX more aggressive block verification, before ref updates? */
 		if (bt->level != level) {
 			scoutfs_corruption(sb, SC_BTREE_BLOCK_LEVEL,
@@ -855,19 +1091,19 @@ restart:
 		}
 
 		/*
-		 * Splitting and merging can add or remove parents or
-		 * change the pos we take through parents to reach the
+		 * Splitting and joining can add or remove parents or
+		 * change the parent item we use to reach the child
 		 * block with the search key.  In the rare case that we
-		 * split or merge we simply restart the walk rather than
-		 * try and special case modifying the path to reflect
-		 * the tree changes.
+		 * split or join we simply restart the walk instead of
+		 * update our state to reflect the tree changes.
 		 */
 		ret = 0;
 		if (flags & (BTW_INSERT | BTW_DELETE))
 			ret = try_split(sb, alloc, wri, root, key, val_len,
-					parent, pos, bt);
+					parent, bt);
 		if (ret == 0 && (flags & BTW_DELETE) && parent)
-			ret = try_merge(sb, alloc, wri, root, parent, pos, bt);
+			ret = try_join(sb, alloc, wri, root, parent, par_item,
+				       bt);
 		if (ret > 0)
 			goto restart;
 		else if (ret < 0)
@@ -877,33 +1113,33 @@ restart:
 		if (level == 0)
 			break;
 
-		nr = le32_to_cpu(bt->nr_items);
-
+		nr = le16_to_cpu(bt->nr_items);
 		/* Find the next child block for the search key. */
-		pos = find_pos(bt, key, &cmp);
-		if (pos >= nr) {
+		node = scoutfs_avl_search(&bt->item_root, cmp_key_item, key,
+					  NULL, NULL, &next_node, NULL);
+		item = node_item(node ?: next_node);
+		if (item == NULL) {
 			scoutfs_corruption(sb, SC_BTREE_NO_CHILD_REF,
 					   corrupt_btree_block_level,
-					   "root_height %u root_blkno %llu root_seq %llu blkno %llu seq %llu level %u nr %u pos %u cmp %d",
+					   "root_height %u root_blkno %llu root_seq %llu blkno %llu seq %llu level %u nr %u",
 					   root->height,
 					   le64_to_cpu(root->ref.blkno),
 					   le64_to_cpu(root->ref.seq),
 					   le64_to_cpu(bt->hdr.blkno),
 					   le64_to_cpu(bt->hdr.seq), bt->level,
-					   nr, pos, cmp);
+					   nr);
 			ret = -EIO;
 			break;
 		}
 
 		/* give the caller the next key to iterate towards */
-		if (iter_key && (flags & BTW_NEXT) && (pos < (nr - 1))) {
-			item = pos_item(bt, pos);
+		if (iter_key && (flags & BTW_NEXT) && next_item(bt, item)) {
 			*iter_key = *item_key(item);
 			scoutfs_key_inc(iter_key);
 
-		} else if (iter_key && (flags & BTW_PREV) && (pos > 0)) {
-			item = pos_item(bt, pos - 1);
-			*iter_key = *item_key(item);
+		} else if (iter_key && (flags & BTW_PREV) &&
+			   (prev = prev_item(bt, item))) {
+			*iter_key = *item_key(prev);
 		}
 
 		scoutfs_block_put(sb, par_bl);
@@ -912,7 +1148,8 @@ restart:
 		bl = NULL;
 		bt = NULL;
 
-		ref = item_val(pos_item(parent, pos));
+		par_item = item;
+		ref = item_val(parent, par_item);
 	}
 
 out:
@@ -935,10 +1172,12 @@ static void init_item_ref(struct scoutfs_btree_item_ref *iref,
 			  struct scoutfs_block *bl,
 			  struct scoutfs_btree_item *item)
 {
+	struct scoutfs_btree_block *bt = bl->data;
+
 	iref->sb = sb;
 	iref->bl = bl;
 	iref->key = item_key(item);
-	iref->val = item_val(item);
+	iref->val = item_val(bt, item);
 	iref->val_len = le16_to_cpu(item->val_len);
 }
 
@@ -963,8 +1202,6 @@ int scoutfs_btree_lookup(struct super_block *sb,
 	struct scoutfs_btree_item *item;
 	struct scoutfs_btree_block *bt;
 	struct scoutfs_block *bl;
-	unsigned int pos;
-	int cmp;
 	int ret;
 
 	if (WARN_ON_ONCE(iref->key))
@@ -973,16 +1210,15 @@ int scoutfs_btree_lookup(struct super_block *sb,
 	ret = btree_walk(sb, NULL, NULL, root, 0, key, 0, &bl, NULL);
 	if (ret == 0) {
 		bt = bl->data;
-		pos = find_pos(bt, key, &cmp);
-		if (cmp == 0) {
-			item = pos_item(bt, pos);
+
+		item = leaf_item_hash_search(bt, key);
+		if (item) {
 			init_item_ref(iref, sb, bl, item);
 			ret = 0;
 		} else {
 			scoutfs_block_put(sb, bl);
 			ret = -ENOENT;
 		}
-
 	}
 
 	return ret;
@@ -1009,9 +1245,11 @@ int scoutfs_btree_insert(struct super_block *sb,
 			 struct scoutfs_key *key,
 			 void *val, unsigned val_len)
 {
+	struct scoutfs_btree_item *item;
 	struct scoutfs_btree_block *bt;
+	struct scoutfs_avl_node *node;
+	struct scoutfs_avl_node *par;
 	struct scoutfs_block *bl;
-	int pos;
 	int cmp;
 	int ret;
 
@@ -1022,12 +1260,19 @@ int scoutfs_btree_insert(struct super_block *sb,
 			 val_len, &bl, NULL);
 	if (ret == 0) {
 		bt = bl->data;
-		pos = find_pos(bt, key, &cmp);
-		if (cmp) {
-			create_item(bt, pos, key, val, val_len);
-			ret = 0;
-		} else {
+
+		item = leaf_item_hash_search(bt, key);
+		if (item) {
 			ret = -EEXIST;
+		} else {
+			node = scoutfs_avl_search(&bt->item_root, cmp_key_item,
+						  key, &cmp, &par, NULL, NULL);
+			if (node) {
+				ret = -EEXIST;
+			} else {
+				create_item(bt, key, val, val_len, par, cmp);
+				ret = 0;
+			}
 		}
 
 		scoutfs_block_put(sb, bl);
@@ -1036,6 +1281,18 @@ int scoutfs_btree_insert(struct super_block *sb,
 	return ret;
 }
 
+static void update_item_value(struct scoutfs_btree_block *bt,
+			      struct scoutfs_btree_item *item,
+			      void *val, unsigned val_len)
+{
+	le16_add_cpu(&bt->total_item_bytes, val_bytes(val_len) -
+		     val_bytes(le16_to_cpu(item->val_len)));
+	delete_value(bt, le16_to_cpu(item->val_off),
+		     le16_to_cpu(item->val_len));
+	item->val_off = insert_value(bt, ptr_off(bt, item), val, val_len);
+	item->val_len = cpu_to_le16(val_len);
+}
+
 /*
  * Update a btree item.  -ENOENT is returned if the item didn't exist.
  *
@@ -1053,10 +1310,9 @@ int scoutfs_btree_update(struct super_block *sb,
 			 struct scoutfs_key *key,
 			 void *val, unsigned val_len)
 {
+	struct scoutfs_btree_item *item;
 	struct scoutfs_btree_block *bt;
 	struct scoutfs_block *bl;
-	int pos;
-	int cmp;
 	int ret;
 
 	if (invalid_item(val_len))
@@ -1066,10 +1322,10 @@ int scoutfs_btree_update(struct super_block *sb,
 			 val_len, &bl, NULL);
 	if (ret == 0) {
 		bt = bl->data;
-		pos = find_pos(bt, key, &cmp);
-		if (cmp == 0) {
-			delete_item(bt, pos);
-			create_item(bt, pos, key, val, val_len);
+
+		item = leaf_item_hash_search(bt, key);
+		if (item) {
+			update_item_value(bt, item, val, val_len);
 			ret = 0;
 		} else {
 			ret = -ENOENT;
@@ -1092,9 +1348,10 @@ int scoutfs_btree_force(struct super_block *sb,
 			struct scoutfs_key *key,
 			void *val, unsigned val_len)
 {
+	struct scoutfs_btree_item *item;
+	struct scoutfs_avl_node *par;
 	struct scoutfs_btree_block *bt;
 	struct scoutfs_block *bl;
-	int pos;
 	int cmp;
 	int ret;
 
@@ -1105,10 +1362,17 @@ int scoutfs_btree_force(struct super_block *sb,
 			 val_len, &bl, NULL);
 	if (ret == 0) {
 		bt = bl->data;
-		pos = find_pos(bt, key, &cmp);
-		if (cmp == 0)
-			delete_item(bt, pos);
-		create_item(bt, pos, key, val, val_len);
+
+		item = leaf_item_hash_search(bt, key);
+		if (item) {
+			update_item_value(bt, item, val, val_len);
+		} else {
+			scoutfs_avl_search(&bt->item_root, cmp_key_item, key,
+					   &cmp, &par, NULL, NULL);
+			create_item(bt, key, val, val_len, par, cmp);
+		}
+		ret = 0;
+
 		scoutfs_block_put(sb, bl);
 	}
 
@@ -1125,19 +1389,19 @@ int scoutfs_btree_delete(struct super_block *sb,
 			 struct scoutfs_btree_root *root,
 			 struct scoutfs_key *key)
 {
+	struct scoutfs_btree_item *item;
 	struct scoutfs_btree_block *bt;
 	struct scoutfs_block *bl;
-	int pos;
-	int cmp;
 	int ret;
 
 	ret = btree_walk(sb, alloc, wri, root, BTW_DELETE | BTW_DIRTY, key,
 			 0, &bl, NULL);
 	if (ret == 0) {
 		bt = bl->data;
-		pos = find_pos(bt, key, &cmp);
-		if (cmp == 0) {
-			if (le32_to_cpu(bt->nr_items) == 1) {
+
+		item = leaf_item_hash_search(bt, key);
+		if (item) {
+			if (le16_to_cpu(bt->nr_items) == 1) {
 				/* remove final empty block */
 				ret = scoutfs_radix_free(sb, alloc, wri,
 							 bl->blkno);
@@ -1147,7 +1411,7 @@ int scoutfs_btree_delete(struct super_block *sb,
 					root->ref.seq = 0;
 				}
 			} else {
-				delete_item(bt, pos);
+				delete_item(bt, item, NULL);
 				ret = 0;
 			}
 		} else {
@@ -1162,8 +1426,8 @@ int scoutfs_btree_delete(struct super_block *sb,
 
 /*
  * Iterate from a key value to the next item in the direction of
- * iteration.  Callers set flags to tell which way to iterate and
- * whether the search key is inclusive, or not.
+ * iteration.  Callers set flags to tell which way to iterate.  The
+ * first key is always inclusive.
  *
  * Walking can land in a leaf that doesn't contain any items in the
  * direction of the iteration.  Walking gives us the next key to walk
@@ -1176,13 +1440,14 @@ static int btree_iter(struct super_block *sb,struct scoutfs_btree_root *root,
 		      int flags, struct scoutfs_key *key,
 		      struct scoutfs_btree_item_ref *iref)
 {
+	struct scoutfs_avl_node *node;
+	struct scoutfs_avl_node *next;
+	struct scoutfs_avl_node *prev;
 	struct scoutfs_btree_item *item;
 	struct scoutfs_btree_block *bt;
-	struct scoutfs_block *bl;
 	struct scoutfs_key iter_key;
 	struct scoutfs_key walk_key;
-	int pos;
-	int cmp;
+	struct scoutfs_block *bl;
 	int ret;
 
 	if (WARN_ON_ONCE(flags & BTW_DIRTY) ||
@@ -1199,19 +1464,15 @@ static int btree_iter(struct super_block *sb,struct scoutfs_btree_root *root,
 			break;
 		bt = bl->data;
 
-		pos = find_pos(bt, key, &cmp);
+		node = scoutfs_avl_search(&bt->item_root, cmp_key_item, key,
+					  NULL, NULL, &next, &prev);
 
-		/* point pos towards iteration, find_pos already for _NEXT */
-		if ((flags & BTW_AFTER) && cmp == 0)
-			pos++;
-		else if ((flags & BTW_PREV) && cmp < 0)
-			pos--;
-		else if ((flags & BTW_BEFORE) && cmp == 0)
-			pos--;
-
-		/* found the next item in this leaf */
-		if (pos >= 0 && pos < le32_to_cpu(bt->nr_items)) {
-			item = pos_item(bt, pos);
+		if (node == NULL && (flags & BTW_NEXT))
+			node = next;
+		else if (node == NULL && (flags & BTW_PREV))
+			node = prev;
+		item = node_item(node);
+		if (item) {
 			init_item_ref(iref, sb, bl, item);
 			ret = 0;
 			break;
@@ -1274,16 +1535,17 @@ int scoutfs_btree_dirty(struct super_block *sb,
 			struct scoutfs_btree_root *root,
 			struct scoutfs_key *key)
 {
+	struct scoutfs_btree_item *item;
 	struct scoutfs_btree_block *bt;
 	struct scoutfs_block *bl;
-	int cmp;
 	int ret;
 
 	ret = btree_walk(sb, alloc, wri, root, BTW_DIRTY, key, 0, &bl, NULL);
 	if (ret == 0) {
 		bt = bl->data;
-		find_pos(bt, key, &cmp);
-		if (cmp == 0)
+
+		item = leaf_item_hash_search(bt, key);
+		if (item)
 			ret = 0;
 		else
 			ret = -ENOENT;
diff --git a/kmod/src/format.h b/kmod/src/format.h
index 413774da..e8b4ec86 100644
--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -196,17 +196,10 @@ struct scoutfs_avl_node {
 } __packed;
 
 /* when we split we want to have multiple items on each side */
-#define SCOUTFS_BTREE_MAX_VAL_LEN (SCOUTFS_BLOCK_SIZE / 8)
+#define SCOUTFS_BTREE_MAX_VAL_LEN 512
 
-/*
- * The min number of free bytes we must leave in a parent as we descend
- * to modify.  This guarantees enough free bytes in a parent to insert a
- * new child reference item as a child block splits.
- */
-#define SCOUTFS_BTREE_PARENT_MIN_FREE_BYTES				\
-	(sizeof(struct scoutfs_btree_item_header) +			\
-	 sizeof(struct scoutfs_btree_item) +				\
-	 sizeof(struct scoutfs_btree_ref))
+/* each value ends with an offset which lets compaction iterate over values */
+#define SCOUTFS_BTREE_VAL_OWNER_BYTES	sizeof(__le16)
 
 /*
  * When debugging we can tune the splitting and merging thresholds to
@@ -236,24 +229,37 @@ struct scoutfs_btree_root {
 	__u8 height;
 } __packed;
 
-struct scoutfs_btree_item_header {
-	__le32 off;
-} __packed;
-
 struct scoutfs_btree_item {
+	struct scoutfs_avl_node node;
 	struct scoutfs_key key;
+	__le16 val_off;
 	__le16 val_len;
-	__u8 val[0];
 } __packed;
 
 struct scoutfs_btree_block {
 	struct scoutfs_block_header hdr;
-	__le32 free_end;
-	__le32 nr_items;
+	struct scoutfs_avl_root item_root;
+	__le16 nr_items;
+	__le16 total_item_bytes;
+	__le16 mid_free_len;
+	__le16 last_free_off;
+	__le16 last_free_len;
 	__u8 level;
-	struct scoutfs_btree_item_header item_hdrs[0];
+	struct scoutfs_btree_item items[0];
+	/* leaf blocks have a fixed size item offset hash table at the end */
 } __packed;
 
+/*
+ * Try to aim for a 75% load in a leaf full of items with no value.
+ * We'll almost never see this because most items have values and most
+ * blocks aren't full.
+ */
+#define SCOUTFS_BTREE_LEAF_ITEM_HASH_NR					  \
+	((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) /	  \
+	 (sizeof(struct scoutfs_btree_item) + (sizeof(__le16))) * 100 / 75)
+#define SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES \
+	(SCOUTFS_BTREE_LEAF_ITEM_HASH_NR * sizeof(__le16))
+
 struct scoutfs_mounted_client_btree_val {
 	__u8 flags;
 } __packed;