From 3eaabe81de90518af57e01ae57fd5bdba7021a42 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 26 Jun 2017 13:56:21 -0700 Subject: [PATCH] scoutfs: add btree stored in persistent ring Add a cow btree whose blocks are stored in a persistently allocated ring. This will let us incrementally index very large data sets efficiently. This is an adaptation of the previous btree code which now uses the ring, stores variable length keys, and augments the items with bits that ored up through parents. Signed-off-by: Zach Brown --- kmod/src/Makefile | 7 +- kmod/src/btree.c | 1878 ++++++++++++++++++++++++++++++++++++++++++ kmod/src/btree.h | 51 ++ kmod/src/format.h | 85 ++ kmod/src/sort_priv.c | 71 ++ kmod/src/sort_priv.h | 8 + kmod/src/super.c | 3 + kmod/src/super.h | 2 + 8 files changed, 2102 insertions(+), 3 deletions(-) create mode 100644 kmod/src/btree.c create mode 100644 kmod/src/btree.h create mode 100644 kmod/src/sort_priv.c create mode 100644 kmod/src/sort_priv.h diff --git a/kmod/src/Makefile b/kmod/src/Makefile index 8370da64..8eefa971 100644 --- a/kmod/src/Makefile +++ b/kmod/src/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include -scoutfs-y += alloc.o bio.o compact.o counters.o data.o dir.o kvec.o inode.o \ - ioctl.o item.o key.o lock.o manifest.o msg.o net.o options.o \ - ring.o seg.o scoutfs_trace.o super.o trans.o xattr.o +scoutfs-y += alloc.o bio.o btree.o compact.o counters.o data.o dir.o kvec.o \ + inode.o ioctl.o item.o key.o lock.o manifest.o msg.o net.o \ + options.o ring.o seg.o scoutfs_trace.o sort_priv.o super.o trans.o \ + xattr.o diff --git a/kmod/src/btree.c b/kmod/src/btree.c new file mode 100644 index 00000000..e30f5af0 --- /dev/null +++ b/kmod/src/btree.c @@ -0,0 +1,1878 @@ +/* + * Copyright (C) 2017 Versity Software, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "format.h" +#include "key.h" +#include "btree.h" +#include "sort_priv.h" + +#include "scoutfs_trace.h" + +/* + * scoutfs uses a cow btree in a ring of preallocated blocks to index + * the manifest (and allocator, but mostly the manifest). + * + * Using a cow btree lets nodes determine the validity of cached blocks + * based on a single root ref (blkno, seq) that is communicated through + * locking and messaging. As long as their cached blocks aren't + * overwritten in the ring they can continue to use those cached blocks + * as the newer cowed blocks continue to reference them. + * + * New blocks written to the btree are allocated from the tail of the + * preallocated ring. This avoids a fine grained persistent record of + * free btree blocks. It also gathers all dirty btree blocks into one + * contiguous write. + * + * To ensure that newly written blocks don't overwrite previously valid + * existing blocks in the ring we take two preventative measures. First + * we ensure that there are 4x the number of preallocated blocks that + * would be needed to store the btrees. 
Then, second, for every set of
+ * blocks written to the current half of the ring we ensure that at
+ * least half of the written blocks are cow copies of valid blocks that
+ * were stored in the old half of the ring.  This ensures that the
+ * current half of the ring will contain all the valid referenced btree
+ * blocks by the time it fills up and wraps around to start overwriting
+ * the old half of the ring.
+ *
+ * To find the blocks in the old half of the ring we augment the btree
+ * items to store bits that are or-ed in parent items up to the root.
+ * Parent items have bits set for the half of the ring that their child
+ * block is stored in.
+ *
+ * Blocks are of a fixed size and are set to 4k to avoid multi-page
+ * blocks.  This means they can be smaller than the page size, so we may
+ * need to pin dirty blocks and invalidate and re-read stable blocks
+ * that could fall in the same page.  We use buffer heads to track
+ * sub-page block state for us.  We abuse knowledge of the page cache
+ * and buffer heads to cast between pointers to the blocks and the
+ * buffer heads that contain reference counts of the block contents.
+ *
+ * We store modified blocks in a list on b_private instead of marking
+ * the blocks dirty.  We don't want them written out (and possibly
+ * reclaimed and re-read) before we have a chance to update their
+ * checksums.  We hold an elevated bh count to keep the buffers from
+ * being removed from the pages while we have them in the list.
+ *
+ * Today callers provide all the locking.  They serialize readers with
+ * writers, and they serialize writers with committing all the dirty
+ * blocks.
+ *
+ * Btree items are stored in each block as a small header with the key
+ * followed by the value.  New items are allocated from the back of the
+ * block towards the front.  Deleted items can be reclaimed by packing
+ * items towards the back of the block by walking them in reverse offset
+ * order.
+ *
+ * A dense array of item headers after the btree block header stores the
+ * offsets and bits of the items and is kept sorted by the items' keys.
+ * The array is small enough that keeping it sorted with memmove()
+ * involves a few cache lines at most.
+ *
+ * Parent blocks in the btree have the same format as leaf blocks.
+ * There's one key for every child reference instead of having separator
+ * keys between child references.  The key in a child reference contains
+ * the largest key that may be found in the child subtree.  The right
+ * spine of the tree has maximal keys so that they don't have to be
+ * updated if we insert an item with a key greater than everything in
+ * the tree.
+ */
+
+/*
+ * XXX:
+ *  - counters and tracing
+ *  - could issue read-ahead around reads up to dirty blkno
+ *  - have barrier as we cross to prevent refreshing clobbering stale reads
+ *  - audit split and merge for bit updating
+ *  - audit/comment that dirty blknos can wrap around ring
+ *  - figure out some max transaction size so ring won't wrap in one
+ *  - update the world of comments
+ *  - validate structures on read?
+ */
+
+/*
+ * There's one physical ring that stores the blocks for all btrees.  We
+ * track the state of the ring and all its dirty blocks in this one
+ * btree_info per mount/super.
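+ *
+ * Roughly: cur_dirtied and old_dirtied count how many blocks this
+ * commit has dirtied from the current and old halves of the ring, the
+ * first/last dirty bh pointers link the commit's dirty blocks into a
+ * list for writeout, and first_dirty_seq marks the seq at which blocks
+ * become part of the current commit.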
+ */ +struct btree_info { + struct mutex mutex; + + unsigned long cur_dirtied; + unsigned long old_dirtied; + struct buffer_head *first_dirty_bh; + struct buffer_head *last_dirty_bh; + u64 first_dirty_blkno; + u64 first_dirty_seq; +}; + +#define DECLARE_BTREE_INFO(sb, name) \ + struct btree_info *name = SCOUTFS_SB(sb)->btree_info + +/* btree walking has a bunch of behavioural bit flags */ +enum { + BTW_NEXT = (1 << 0), /* return >= key */ + BTW_AFTER = (1 << 1), /* return > key */ + BTW_PREV = (1 << 2), /* return <= key */ + BTW_BEFORE = (1 << 3), /* return < key */ + BTW_DIRTY = (1 << 4), /* cow stable blocks */ + BTW_BIT = (1 << 5), /* search for the first set bit, not key */ + BTW_DIRTY_OLD = (1 << 6), /* dirty old leaf blocks to balance ring */ + BTW_ALLOC = (1 << 7), /* allocate a new block for 0 ref */ + BTW_INSERT = (1 << 8), /* walking to insert, try splitting */ + BTW_DELETE = (1 << 9), /* walking to delete, try merging */ +}; + +/* + * This greatest key value is stored down the right spine of the tree + * and has to be sorted by memcmp() greater than all possible keys in + * all btrees. We give it room for a decent number of big-endian + * primary sort values. + */ +static char max_key[SCOUTFS_BTREE_GREATEST_KEY_LEN] = { + [0 ... (SCOUTFS_BTREE_GREATEST_KEY_LEN - 1)] = 0xff, +}; + +/* number of contiguous bytes used by the item header, key, and value */ +static inline unsigned len_bytes(unsigned key_len, unsigned val_len) +{ + return sizeof(struct scoutfs_btree_item) + key_len + val_len; +} + +/* number of contiguous bytes used an existing item */ +static inline unsigned int item_bytes(struct scoutfs_btree_item *item) +{ + return len_bytes(le16_to_cpu(item->key_len), le16_to_cpu(item->val_len)); +} + +/* total block bytes used by an item: header, item, key, value */ +static inline unsigned int all_len_bytes(unsigned key_len, unsigned val_len) +{ + return sizeof(struct scoutfs_btree_item_header) + + len_bytes(key_len, val_len); +} + +/* total block bytes used by an existing item */ +static inline unsigned int all_item_bytes(struct scoutfs_btree_item *item) +{ + return all_len_bytes(le16_to_cpu(item->key_len), + le16_to_cpu(item->val_len)); +} + +/* number of contig free bytes between last item header and first item */ +static inline unsigned int contig_free(struct scoutfs_btree_block *bt) +{ + unsigned int nr = le16_to_cpu(bt->nr_items); + + return le16_to_cpu(bt->free_end) - + offsetof(struct scoutfs_btree_block, item_hdrs[nr]); +} + +/* number of contig bytes free after reclaiming free amongst items */ +static inline unsigned int reclaimable_free(struct scoutfs_btree_block *bt) +{ + return contig_free(bt) + le16_to_cpu(bt->free_reclaim); +} + +/* all bytes used by item offsets, headers, and values */ +static inline unsigned int used_total(struct scoutfs_btree_block *bt) +{ + return SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block) - + reclaimable_free(bt); +} + +static inline struct scoutfs_btree_item * +off_item(struct scoutfs_btree_block *bt, __le16 off) +{ + return (void *)bt + le16_to_cpu(off); +} + +static inline struct scoutfs_btree_item * +pos_item(struct scoutfs_btree_block *bt, unsigned int pos) +{ + return off_item(bt, bt->item_hdrs[pos].off); +} + +static inline struct scoutfs_btree_item * +last_item(struct scoutfs_btree_block *bt) +{ + return pos_item(bt, le16_to_cpu(bt->nr_items) - 1); +} + +static inline void *item_key(struct scoutfs_btree_item *item) +{ + return item->data; +} + +static inline unsigned item_key_len(struct scoutfs_btree_item *item) +{ + 
return le16_to_cpu(item->key_len); +} + +static inline void *item_val(struct scoutfs_btree_item *item) +{ + return item_key(item) + le16_to_cpu(item->key_len); +} + +static inline unsigned item_val_len(struct scoutfs_btree_item *item) +{ + return le16_to_cpu(item->val_len); +} + +static inline int cmp_keys(void *a, unsigned a_len, void *b, unsigned b_len) +{ + return memcmp(a, b, min(a_len, b_len)) ?: + a_len < b_len ? -1 : a_len > b_len ? 1 : 0; +} + +/* + * Returns the sorted item position that an item with the given key + * should occupy. + * + * It sets *cmp to the final comparison of the given key and the + * position's item key. This can only be -1 or 0 because we bias + * towards returning the pos that a key should occupy. + * + * If the given key is greater then all items' keys then the number of + * items can be returned. + */ +static int find_pos(struct scoutfs_btree_block *bt, void *key, unsigned key_len, + int *cmp) +{ + struct scoutfs_btree_item *item; + unsigned int start = 0; + unsigned int end = le16_to_cpu(bt->nr_items); + unsigned int pos = 0; + + *cmp = -1; + + while (start < end) { + pos = start + (end - start) / 2; + + item = pos_item(bt, pos); + *cmp = cmp_keys(key, key_len, item_key(item), item_key_len(item)); + if (*cmp < 0) { + end = pos; + } else if (*cmp > 0) { + start = ++pos; + *cmp = -1; + } else { + break; + } + } + + return pos; +} + +static inline u8 pos_bits(struct scoutfs_btree_block *bt, unsigned int pos) +{ + return bt->item_hdrs[pos].bits; +} + +static inline bool pos_bit_set(struct scoutfs_btree_block *bt, unsigned int pos, + u8 bit) +{ + return bt->item_hdrs[pos].bits & bit; +} + +static inline u16 bit_count(struct scoutfs_btree_block *bt, u8 bit) +{ + int ind; + + BUG_ON(hweight8(bit) != 1); + + ind = ffs(bit) - 1; + return le16_to_cpu(bt->bit_counts[ind]); +} + +/* find the first item pos with the given bit set */ +static int find_pos_bit(struct scoutfs_btree_block *bt, int pos, u8 bit) +{ + unsigned int nr = le16_to_cpu(bt->nr_items); + + while (pos < nr && !pos_bit_set(bt, pos, bit)) + pos++; + + return pos; +} + +/* + * Record the path we took through parent blocks. Used to set the bits + * in parent reference items that lead to bits in leaves. + */ +struct btree_path { + unsigned nr; + struct scoutfs_btree_block *bt[SCOUTFS_BTREE_MAX_HEIGHT]; + u16 pos[SCOUTFS_BTREE_MAX_HEIGHT]; +}; + +#define DECLARE_BTREE_PATH(name) \ + struct btree_path name = {0, } + +/* + * Add a block to the path for later traversal for updating bits. Only dirty + * blocks are put in the path and they have an extra ref to keep them pinned + * until we write them out. + */ +static void path_push(struct btree_path *path, + struct scoutfs_btree_block *bt, unsigned pos) +{ + if (path) { + BUG_ON(path->nr >= SCOUTFS_BTREE_MAX_HEIGHT); + + path->bt[path->nr] = bt; + path->pos[path->nr++] = pos; + } +} + +static struct scoutfs_btree_block *path_pop(struct btree_path *path, unsigned *pos) +{ + if (!path || path->nr == 0) + return NULL; + + *pos = path->pos[--path->nr]; + return path->bt[path->nr]; +} + +static u8 half_bit(struct scoutfs_btree_ring *bring, u64 blkno) +{ + u64 half_blkno = le64_to_cpu(bring->first_blkno) + + (le64_to_cpu(bring->nr_blocks) / 2); + + return blkno < half_blkno ? 
SCOUTFS_BTREE_BIT_HALF1 :
+			SCOUTFS_BTREE_BIT_HALF2;
+}
+
+static u8 other_half_bit(struct scoutfs_btree_ring *bring, u64 blkno)
+{
+	return half_bit(bring, blkno) ^ (SCOUTFS_BTREE_BIT_HALF1 |
+					 SCOUTFS_BTREE_BIT_HALF2);
+}
+
+static u8 bits_from_counts(struct scoutfs_btree_block *bt)
+{
+	u8 bits = 0;
+	int i;
+
+	for (i = 0; i < SCOUTFS_BTREE_BITS; i++) {
+		if (bt->bit_counts[i])
+			bits |= 1 << i;
+	}
+
+	return bits;
+}
+
+/*
+ * Iterate through 0-based bit numbers set in 'bits' from least to
+ * greatest.  It modifies 'bits' as it goes!
+ */
+#define for_each_bit(i, bits) \
+	for (i = ffs(bits) - 1; i >= 0; bits &= ~(1 << i), i = ffs(bits) - 1)
+
+/*
+ * Store the new bits and update the counts to match the difference from
+ * the previously set bits.  Callers use this to keep item bits in sync
+ * with the counts of bits in the block headers.
+ */
+static void store_pos_bits(struct scoutfs_btree_block *bt, int pos, u8 bits)
+{
+	u8 diff = bits ^ pos_bits(bt, pos);
+	int b;
+
+	if (!diff)
+		return;
+
+	for_each_bit(b, diff) {
+		if (bits & (1 << b))
+			le16_add_cpu(&bt->bit_counts[b], 1);
+		else
+			le16_add_cpu(&bt->bit_counts[b], -1);
+	}
+
+	bt->item_hdrs[pos].bits = bits;
+}
+
+/*
+ * The caller has descended through parents to a final block.  Each
+ * block may have had item bits modified and counts updated but they
+ * didn't keep parent item bits in sync with modifications to all the
+ * children.  Our job is to ascend back through parents and set their
+ * bits to the union of all the bits down through the path to the final
+ * block.
+ */
+static void path_repair_reset(struct btree_path *path)
+{
+	struct scoutfs_btree_block *parent;
+	struct scoutfs_btree_block *bt;
+	u8 bits;
+	int pos;
+
+	bt = path_pop(path, &pos);
+
+	while ((parent = path_pop(path, &pos))) {
+		bits = bits_from_counts(bt);
+		store_pos_bits(parent, pos, bits);
+		bt = parent;
+	}
+}
+
+static int cmp_hdr_item_key(void *priv, const void *a_ptr, const void *b_ptr)
+{
+	struct scoutfs_btree_block *bt = priv;
+	const struct scoutfs_btree_item_header *a_hdr = a_ptr;
+	const struct scoutfs_btree_item_header *b_hdr = b_ptr;
+	struct scoutfs_btree_item *a_item = off_item(bt, a_hdr->off);
+	struct scoutfs_btree_item *b_item = off_item(bt, b_hdr->off);
+
+	return cmp_keys(item_key(a_item), item_key_len(a_item),
+			item_key(b_item), item_key_len(b_item));
+}
+
+static int cmp_hdr_off(void *priv, const void *a_ptr, const void *b_ptr)
+{
+	const struct scoutfs_btree_item_header *a_hdr = a_ptr;
+	const struct scoutfs_btree_item_header *b_hdr = b_ptr;
+
+	return (int)le16_to_cpu(a_hdr->off) - (int)le16_to_cpu(b_hdr->off);
+}
+
+static void swap_hdr(void *priv, void *a_ptr, void *b_ptr, int size)
+{
+	struct scoutfs_btree_item_header *a_hdr = a_ptr;
+	struct scoutfs_btree_item_header *b_hdr = b_ptr;
+
+	swap(*a_hdr, *b_hdr);
+}
+
+/*
+ * As items are deleted they create fragmented free space.  Even if we
+ * indexed free space in the block it could still get sufficiently
+ * fragmented to force a split on insertion even though the two
+ * resulting blocks would have less than the minimum space consumed by
+ * items.
+ *
+ * We don't bother implementing free space indexing and addressing that
+ * corner case.  Instead we track the number of bytes that could be
+ * reclaimed if we compacted the item space after the free_end offset.
+ * If this additional free space would satisfy an insertion then we
+ * compact the items instead of splitting the block.
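+ *
+ * For example (sizes illustrative): if an insertion needs 80 bytes but
+ * contig_free() is only 40 while free_reclaim holds another 200, then
+ * reclaimable_free() is 240 and we compact rather than split.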
+ * + * We move the free space to the center of the block by walking + * backwards through the items in offset order and packing them towards + * the end of the block. + * + * We don't have specific metadata to either walk the items in offset + * order or to update the item offsets as we move items. We sort the + * item offset array to achieve both ends. First we sort it by offset + * so we can walk in reverse order. As we move items we update their + * offset and then sort by keys once we're done. + */ +static void compact_items(struct scoutfs_btree_block *bt) +{ + unsigned int nr = le16_to_cpu(bt->nr_items); + struct scoutfs_btree_item *from; + struct scoutfs_btree_item *to; + unsigned int bytes; + __le16 end; + int i; + + sort_priv(bt, bt->item_hdrs, nr, sizeof(bt->item_hdrs[0]), + cmp_hdr_off, swap_hdr); + + end = cpu_to_le16(SCOUTFS_BLOCK_SIZE); + + for (i = nr - 1; i >= 0; i--) { + from = pos_item(bt, i); + + bytes = item_bytes(from); + le16_add_cpu(&end, -bytes); + to = off_item(bt, end); + bt->item_hdrs[i].off = end; + + if (from != to) + memmove(to, from, bytes); + } + + bt->free_end = end; + bt->free_reclaim = 0; + + sort_priv(bt, bt->item_hdrs, nr, sizeof(bt->item_hdrs[0]), + cmp_hdr_item_key, swap_hdr); +} + +/* move a number of contigous elements from the src index to the dst index */ +#define memmove_arr(arr, dst, src, nr) \ + memmove(&(arr)[dst], &(arr)[src], (nr) * sizeof(*(arr))) + +/* + * Insert a new item into the block. The caller has made sure that + * there's space for the item and its metadata but we might have to + * compact the block to make that space contiguous. + * + * The possibility of compaction means that callers *can not* hold item, + * key, or value pointers across item creation. An easy way to verify + * this is to audit pos_item() callers. + */ +static void create_item(struct scoutfs_btree_block *bt, unsigned int pos, u8 bits, + void *key, unsigned key_len, void *val, unsigned val_len) +{ + unsigned nr = le16_to_cpu(bt->nr_items); + struct scoutfs_btree_item *item; + unsigned all_bytes; + + all_bytes = all_len_bytes(key_len, val_len); + if (contig_free(bt) < all_bytes) { + BUG_ON(reclaimable_free(bt) < all_bytes); + compact_items(bt); + } + + if (pos < nr) + memmove_arr(bt->item_hdrs, pos + 1, pos, nr - pos); + + le16_add_cpu(&bt->free_end, -len_bytes(key_len, val_len)); + bt->item_hdrs[pos].off = bt->free_end; + nr++; + bt->nr_items = cpu_to_le16(nr); + + BUG_ON(le16_to_cpu(bt->free_end) < + offsetof(struct scoutfs_btree_block, item_hdrs[nr])); + + bt->item_hdrs[pos].bits = 0; + store_pos_bits(bt, pos, bits); + + item = pos_item(bt, pos); + item->key_len = cpu_to_le16(key_len); + item->val_len = cpu_to_le16(val_len); + + memcpy(item_key(item), key, key_len); + if (val_len) + memcpy(item_val(item), val, val_len); +} + +/* + * Delete an item from a btree block. We record the amount of space it + * frees to later decide if we can satisfy an insertion by compaction + * instead of splitting. + */ +static void delete_item(struct scoutfs_btree_block *bt, unsigned int pos) +{ + struct scoutfs_btree_item *item = pos_item(bt, pos); + unsigned int nr = le16_to_cpu(bt->nr_items); + + store_pos_bits(bt, pos, 0); + + if (pos < (nr - 1)) + memmove_arr(bt->item_hdrs, pos, pos + 1, nr - 1 - pos); + + le16_add_cpu(&bt->free_reclaim, item_bytes(item)); + nr--; + bt->nr_items = cpu_to_le16(nr); + + /* wipe deleted items to avoid leaking data */ + memset(item, 0, item_bytes(item)); +} + +/* + * Move items from a source block to a destination block. 
The caller + * tells us if we're moving from the tail of the source block right to + * the head of the destination block, or vice versa. We stop moving + * once we've moved enough bytes of items. + */ +static void move_items(struct scoutfs_btree_block *dst, + struct scoutfs_btree_block *src, bool move_right, + int to_move) +{ + struct scoutfs_btree_item *from; + unsigned int t; + unsigned int f; + + if (move_right) { + f = le16_to_cpu(src->nr_items) - 1; + t = 0; + } else { + f = 0; + t = le16_to_cpu(dst->nr_items); + } + + while (f < le16_to_cpu(src->nr_items) && to_move > 0) { + from = pos_item(src, f); + + create_item(dst, t, pos_bits(src, f), item_key(from), + item_key_len(from), item_val(from), + item_val_len(from)); + + to_move -= all_item_bytes(from); + + delete_item(src, f); + if (move_right) + f--; + else + t++; + } +} + +/* + * This is only used after we've elevated bh reference counts. Until we + * drop the counts the bhs won't be removed from the page. This lets us + * use pointers to the block contents in the api and not have to litter + * it with redundant containers. + */ +static struct buffer_head *virt_to_bh(void *kaddr) +{ + struct buffer_head *bh; + struct page *page; + long off; + + page = virt_to_page((unsigned long)kaddr); + BUG_ON(!page_has_buffers(page)); + bh = page_buffers(page); + BUG_ON((unsigned long)bh->b_data != + ((unsigned long)kaddr & PAGE_CACHE_MASK)); + + off = (unsigned long)kaddr & ~PAGE_CACHE_MASK; + while (off >= SCOUTFS_BLOCK_SIZE) { + bh = bh->b_this_page; + off -= SCOUTFS_BLOCK_SIZE; + } + + return bh; +} + +static void put_btree_block(void *ptr) +{ + if (!IS_ERR_OR_NULL(ptr)) + put_bh(virt_to_bh(ptr)); +} + +enum { + BH_ScoutfsChecked = BH_PrivateStart, + BH_ScoutfsValidCrc, +}; + +BUFFER_FNS(ScoutfsChecked, scoutfs_checked) /* has had crc checked */ +BUFFER_FNS(ScoutfsValidCrc, scoutfs_valid_crc) /* crc matched */ + + +/* + * Make sure that we've found a valid block and that it's the block that + * we're looking for. + */ +static bool valid_referenced_block(struct scoutfs_super_block *super, + struct scoutfs_btree_ref *ref, + struct scoutfs_btree_block *bt, + struct buffer_head *bh) +{ + __le32 existing; + u32 calc; + + if (!buffer_scoutfs_checked(bh)) { + lock_buffer(bh); + if (!buffer_scoutfs_checked(bh)) { + existing = bt->crc; + bt->crc = 0; + calc = crc32c(~0, bt, SCOUTFS_BLOCK_SIZE); + bt->crc = existing; + + set_buffer_scoutfs_checked(bh); + if (calc == le32_to_cpu(existing)) + set_buffer_scoutfs_valid_crc(bh); + else + clear_buffer_scoutfs_valid_crc(bh); + } + unlock_buffer(bh); + } + + return buffer_scoutfs_valid_crc(bh) && super->hdr.fsid == bt->fsid && + ref->blkno == bt->blkno && ref->seq == bt->seq; +} + +/* + * This is used to lookup cached blocks, read blocks, cow blocks for + * dirtying, and allocate new blocks. + * + * Btree blocks don't have rigid cache consistency. We can be following + * a new root to read refs into previously stale cached blocks. If we + * see that the block metadata doesn't match we first assume that we + * just have a stale block and try and re-read it. If it still doesn't + * match we assume that we're an reader racing with a writer overwriting + * old blocks in the ring. We return an error that tells the caller to + * deal with this error: either find a new root or return a hard error + * if the block is really corrupt. + * + * This only sets the caller's reference. It doesn't know if the + * caller's ref is in a parent item and would need to update bits and + * counts based on the blkno. 
It's up to the callers to take care of + * that. + * + * btree callers serialize concurrent writers in a btree but not between + * btrees. We have to lock around the shared btree_info. Callers do + * lock between all btree writers and writing dirty blocks. We don't + * have to lock around the bti fields that are only changed by commits. + */ +static int get_ref_block(struct super_block *sb, int flags, + struct scoutfs_btree_ref *ref, + struct scoutfs_btree_block **bt_ret) +{ + DECLARE_BTREE_INFO(sb, bti); + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct scoutfs_btree_ring *bring = &super->bring; + struct scoutfs_btree_block *bt = NULL; + struct scoutfs_btree_block *new; + struct buffer_head *bh; + int retries = 1; + u64 blkno; + u64 seq; + int ret; + +retry: + /* always get the current block, either to return or cow from */ + if (ref && ref->blkno) { + bh = sb_bread(sb, le64_to_cpu(ref->blkno)); + if (!bh) { + ret = -EIO; + goto out; + } + bt = (void *)bh->b_data; + + if (!valid_referenced_block(super, ref, bt, bh)) { + if (retries-- > 0) { + lock_buffer(bh); + clear_buffer_uptodate(bh); + unlock_buffer(bh); + put_bh(bh); + bt = NULL; + goto retry; + } + /* XXX let us know when we eventually hit this */ + ret = WARN_ON_ONCE(-ESTALE); + goto out; + } + + /* done if not dirtying or already dirty */ + if (!(flags & BTW_DIRTY) || + (le64_to_cpu(bt->seq) >= bti->first_dirty_seq)) { + ret = 0; + goto out; + } + + } else if (!(flags & BTW_ALLOC)) { + ret = -ENOENT; + goto out; + } + + mutex_lock(&bti->mutex); + + blkno = le64_to_cpu(bring->first_blkno) + le64_to_cpu(bring->next_block); + seq = le64_to_cpu(bring->next_seq); + + bh = sb_getblk(sb, blkno); + if (!bh) { + ret = -ENOMEM; + mutex_unlock(&bti->mutex); + goto out; + } + new = (void *)bh->b_data; + + set_buffer_uptodate(bh); + set_buffer_scoutfs_checked(bh); + set_buffer_scoutfs_valid_crc(bh); + + /* + * Track our contiguous dirty blocks by holding a ref and putting + * them in a list. We don't want them marked dirty or else they + * can be written out before we're ready. + */ + get_bh(bh); + bh->b_private = NULL; + if (bti->last_dirty_bh) + bti->last_dirty_bh->b_private = bh; + bti->last_dirty_bh = bh; + if (!bti->first_dirty_bh) + bti->first_dirty_bh = bh; + + /* wrap next block and increase next seq */ + if (le64_to_cpu(bring->next_block) == le64_to_cpu(bring->nr_blocks)) + bring->next_block = 0; + else + le64_add_cpu(&bring->next_block, 1); + + le64_add_cpu(&bring->next_seq, 1); + + if (half_bit(bring, blkno) == half_bit(bring, bti->first_dirty_blkno)) + bti->cur_dirtied++; + else + bti->old_dirtied++; + + mutex_unlock(&bti->mutex); + + if (bt) { + /* returning a cow of an existing block */ + memcpy(new, bt, SCOUTFS_BLOCK_SIZE); + put_btree_block(bt); + bt = new; + } else { + /* returning a newly allocated block */ + bt = new; + new = NULL; + memset(bt, 0, SCOUTFS_BLOCK_SIZE); + bt->fsid = super->hdr.fsid; + bt->free_end = cpu_to_le16(SCOUTFS_BLOCK_SIZE); + } + + bt->blkno = cpu_to_le64(blkno); + bt->seq = cpu_to_le64(seq); + if (ref) { + ref->blkno = bt->blkno; + ref->seq = bt->seq; + } + ret = 0; + +out: + if (ret) { + put_btree_block(bt); + bt = NULL; + } + + *bt_ret = bt; + return ret; +} + +/* + * Get the block referenced by the given parent item. The parent item + * and its bits are updated. 
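+ *
+ * For example (bits illustrative): if the child's bit_counts include
+ * only HALF1 but the child block itself now sits in the second half of
+ * the ring, the parent item's bits are stored as HALF1 | HALF2.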
+ */ +static int get_parent_ref_block(struct super_block *sb, int flags, + struct scoutfs_btree_block *parent, unsigned pos, + struct scoutfs_btree_block **bt_ret) +{ + struct scoutfs_btree_ring *bring = &SCOUTFS_SB(sb)->super.bring; + struct scoutfs_btree_item *item; + struct scoutfs_btree_ref *ref; + u8 bits; + int ret; + + /* ref can only be updated, no insertion or compaction */ + item = pos_item(parent, pos); + ref = item_val(item); + + ret = get_ref_block(sb, flags, ref, bt_ret); + if (ret == 0) { + bits = bits_from_counts(*bt_ret) | + half_bit(bring, le64_to_cpu(ref->blkno)); + store_pos_bits(parent, pos, bits); + } + + return ret; +} + +/* + * Create a new item in the parent which references the child. The caller + * specifies the key in the item that describes the items in the child. + */ +static void create_parent_item(struct scoutfs_btree_ring *bring, + struct scoutfs_btree_block *parent, + unsigned pos, struct scoutfs_btree_block *child, + void *key, unsigned key_len) +{ + struct scoutfs_btree_ref ref = { + .blkno = child->blkno, + .seq = child->seq, + }; + u8 bits = bits_from_counts(child) | + half_bit(bring, le64_to_cpu(ref.blkno)); + + create_item(parent, pos, bits, key, key_len, &ref, sizeof(ref)); +} + +/* + * Update the parent item that refers to a child by deleting and + * recreating it. Descent should have ensured that there was always + * room for a maximal key in parents. + */ +static void update_parent_item(struct scoutfs_btree_ring *bring, + struct scoutfs_btree_block *parent, + unsigned pos, struct scoutfs_btree_block *child) +{ + struct scoutfs_btree_item *item = last_item(child); + + delete_item(parent, pos); + create_parent_item(bring, parent, pos, child, + item_key(item), item_key_len(item)); +} + +/* the parent item key and value are fine, but child items have changed */ +static void update_parent_bits(struct scoutfs_btree_ring *bring, + struct scoutfs_btree_block *parent, + unsigned pos, struct scoutfs_btree_block *child) +{ + u8 bits = bits_from_counts(child) | + half_bit(bring, le64_to_cpu(child->blkno)); + + store_pos_bits(parent, pos, bits); +} + +/* + * See if we need to split this block while descending for insertion so + * that we have enough space to insert. Parent blocks need enough space + * for a new item and child ref if a child block splits. Leaf blocks + * need enough space to insert the new item with its value. + * + * We split to the left so that the greatest key in the existing block + * doesn't change so we don't have to update the key in its parent item. + * We still have to update its bits. + * + * Returns -errno, 0 if nothing done, or 1 if we split. 
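+ *
+ * A sketch with illustrative keys: splitting a full right block
+ * [a b c d e f] moves its front half into a new left sibling [a b c],
+ * whose parent item gets key c, while the right block keeps [d e f]
+ * and its parent key f is unchanged.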
+ */ +static int try_split(struct super_block *sb, struct scoutfs_btree_root *root, + void *key, unsigned key_len, unsigned val_len, + struct scoutfs_btree_block *parent, unsigned pos, + struct scoutfs_btree_block *right) +{ + struct scoutfs_btree_ring *bring = &SCOUTFS_SB(sb)->super.bring; + struct scoutfs_btree_block *left = NULL; + struct scoutfs_btree_item *item; + unsigned int all_bytes; + bool put_parent = false; + int ret; + + if (right->level) + all_bytes = all_len_bytes(SCOUTFS_BTREE_MAX_KEY_LEN, + sizeof(struct scoutfs_btree_ref)); + else + all_bytes = all_len_bytes(key_len, val_len); + + if (reclaimable_free(right) >= all_bytes) + return 0; + + /* alloc split neighbour first to avoid unwinding tree growth */ + ret = get_ref_block(sb, BTW_ALLOC, NULL, &left); + if (ret) + return ret; + left->level = right->level; + + if (!parent) { + ret = get_ref_block(sb, BTW_ALLOC, NULL, &parent); + if (ret) { + put_btree_block(left); + return ret; + } + put_parent = true; + + parent->level = root->height; + root->height++; + root->ref.blkno = parent->blkno; + root->ref.seq = parent->seq; + + pos = 0; + create_parent_item(bring, parent, pos, right, + &max_key, sizeof(max_key)); + } + + move_items(left, right, false, used_total(right) / 2); + update_parent_bits(bring, parent, pos, right); + + item = last_item(left); + create_parent_item(bring, parent, pos, left, + item_key(item), item_key_len(item)); + + put_btree_block(left); + if (put_parent) + put_btree_block(parent); + + return 1; +} + +/* + * This is called during descent for deletion when we have a parent and + * might need to merge items from a sibling block if this block has too + * much free space. Eventually we'll be able to fit all of the + * sibling's items in our free space which lets us delete the sibling + * block. 
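+ *
+ * For example (sizes illustrative): with a roughly 2k FREE_LIMIT, a
+ * block with 3k of reclaimable free space tries to merge.  If its
+ * sibling's items total only 800 bytes they all move over and the
+ * empty sibling is deleted; otherwise we move just enough bytes to
+ * drop our free space back to the limit.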
+ * + * XXX this could more cleverly chose a merge candidate sibling + */ +static int try_merge(struct super_block *sb, struct scoutfs_btree_root *root, + struct scoutfs_btree_block *parent, unsigned pos, + struct scoutfs_btree_block *bt) +{ + struct scoutfs_btree_ring *bring = &SCOUTFS_SB(sb)->super.bring; + struct scoutfs_btree_block *sib; + unsigned int sib_pos; + bool move_right; + int to_move; + int ret; + + if (reclaimable_free(bt) <= SCOUTFS_BTREE_FREE_LIMIT) + return 0; + + /* move items right into our block if we have a left sibling */ + if (pos) { + sib_pos = pos - 1; + move_right = true; + } else { + sib_pos = pos + 1; + move_right = false; + } + + ret = get_parent_ref_block(sb, BTW_DIRTY, parent, sib_pos, &sib); + if (ret) + return ret; + + if (used_total(sib) <= reclaimable_free(bt)) + to_move = used_total(sib); + else + to_move = reclaimable_free(bt) - SCOUTFS_BTREE_FREE_LIMIT; + + move_items(bt, sib, move_right, to_move); + + /* update our parent's item */ + if (!move_right) + update_parent_item(bring, parent, pos, bt); + else + update_parent_bits(bring, parent, pos, bt); + + /* update or delete sibling's parent item */ + if (le16_to_cpu(sib->nr_items) == 0) + delete_item(parent, sib_pos); + else if (move_right) + update_parent_item(bring, parent, sib_pos, sib); + else + update_parent_bits(bring, parent, sib_pos, sib); + + /* and finally shrink the tree if our parent is the root with 1 */ + if (le16_to_cpu(parent->nr_items) == 1) { + root->height--; + root->ref.blkno = bt->blkno; + root->ref.seq = bt->seq; + } + + put_btree_block(sib); + + return 1; +} + +/* + * This is called before writing dirty blocks to ensure that each batch + * of dirty blocks migrates half as many blocks from the old half of the + * ring as it dirties from the current half. This ensures that by the + * time we fill the current half of the ring it will no longer reference + * the old half. + * + * We've walked to the parent of the leaf level which might have dirtied + * more blocks. Our job is to dirty as many leaves as we need to bring + * the old count back up to equal the current count. The caller will + * keep trying to walk down different paths of each of the btrees. + */ +static int try_dirty_old(struct super_block *sb, struct scoutfs_btree_block *bt, + u8 old_bit) +{ + DECLARE_BTREE_INFO(sb, bti); + struct scoutfs_btree_block *dirtied; + struct scoutfs_btree_item *item; + struct scoutfs_btree_ref *ref; + struct blk_plug plug; + int ret = 0; + int pos = 0; + int nr; + int i; + + if (bti->old_dirtied >= bti->cur_dirtied) + return 0; + + /* called when first parent level is highest level, can have nothing */ + nr = min_t(int, bti->cur_dirtied - bti->old_dirtied, + bit_count(bt, old_bit)); + if (nr == 0) + return -ENOENT; + + blk_start_plug(&plug); + + /* read 'em all */ + for (i = 0, pos = 0; i < nr; i++, pos++) { + pos = find_pos_bit(bt, pos, old_bit); + if (pos >= le16_to_cpu(bt->nr_items)) { + /* XXX bits in headers didn't match count */ + ret = -EIO; + blk_finish_plug(&plug); + goto out; + } + + item = pos_item(bt, pos); + ref = item_val(item); + + sb_breadahead(sb, le64_to_cpu(ref->blkno)); + } + + blk_finish_plug(&plug); + + /* then actually try and dirty the blocks */ + for (i = 0, pos = 0; i < nr; i++, pos++) { + pos = find_pos_bit(bt, pos, old_bit); + + ret = get_parent_ref_block(sb, BTW_DIRTY, bt, pos, &dirtied); + if (ret) + break; + put_btree_block(dirtied); + } + +out: + return ret; +} + +/* + * A quick and dirty verification of the btree block. 
We could add a + * lot more checks and make it only verified on read or after + * significant events like splitting and merging. + */ +static int verify_btree_block(struct scoutfs_btree_block *bt, int level) +{ + struct scoutfs_btree_item *item; + struct scoutfs_btree_item *prev; + unsigned int bytes = 0; + unsigned int after_off = sizeof(struct scoutfs_btree_block); + unsigned int first_off; + unsigned int off; + unsigned int nr; + unsigned int i = 0; + int bad = 1; + + nr = le16_to_cpu(bt->nr_items); + if (nr == 0) + goto out; + + after_off = offsetof(struct scoutfs_btree_block, item_hdrs[nr]); + first_off = SCOUTFS_BLOCK_SIZE; + + if (after_off > SCOUTFS_BLOCK_SIZE) { + nr = 0; + goto out; + } + + for (i = 0; i < nr; i++) { + off = le16_to_cpu(bt->item_hdrs[i].off); + if (off >= SCOUTFS_BLOCK_SIZE || off < after_off) + goto out; + + first_off = min(first_off, off); + + item = pos_item(bt, i); + bytes += item_bytes(item); + + if (i > 0 && cmp_keys(item_key(item), item_key_len(item), + item_key(prev), item_key_len(prev)) <= 0) + goto out; + + prev = item; + } + + if (first_off < le16_to_cpu(bt->free_end)) + goto out; + + if ((le16_to_cpu(bt->free_end) + bytes + + le16_to_cpu(bt->free_reclaim)) != SCOUTFS_BLOCK_SIZE) + goto out; + + bad = 0; +out: + if (bad) { + printk("bt %p blkno %llu level %d end %u reclaim %u nr %u (after %u bytes %u)\n", + bt, le64_to_cpu(bt->blkno), level, + le16_to_cpu(bt->free_end), + le16_to_cpu(bt->free_reclaim), le16_to_cpu(bt->nr_items), + after_off, bytes); + for (i = 0; i < nr; i++) { + item = pos_item(bt, i); + printk(" [%u] off %u key_len %u val_len %u\n", + i, le16_to_cpu(bt->item_hdrs[i].off), + item_key_len(item), item_val_len(item)); + } + BUG_ON(bad); + } + + return 0; +} + +/* XXX bleh, this should probably share code with the key_buf equivalent */ +static void inc_key(u8 *bytes, unsigned *len) +{ + int i; + + if (*len < SCOUTFS_BTREE_MAX_KEY_LEN) { + memset(bytes + *len, 0, SCOUTFS_BTREE_MAX_KEY_LEN - *len); + *len = SCOUTFS_BTREE_MAX_KEY_LEN; + } + + for (i = *len - 1; i >= 0; i--) { + if (++bytes[i] != 0) + break; + } +} + +/* + * Return the leaf block that should contain the given key. The caller + * is responsible for searching the leaf block and performing their + * operation. + * + * Iteration starting from a key can end up in a leaf that doesn't + * contain the next item in the direction iteration. As we descend we + * give the caller the nearest key in the direction of iteration that + * will land in a different leaf. + * + * The caller provides the path to record the parent blocks and items + * used to reach the leaf. We let them repair the path once they've + * potentially updated bits in the leaf. They must always repair the + * path because we can modify parent bits during descent before + * returning an error. 
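+ *
+ * A typical dirtying caller looks roughly like this (a sketch, not a
+ * real caller):
+ *
+ *	DECLARE_BTREE_PATH(path);
+ *
+ *	ret = btree_walk(sb, root, &path, BTW_DIRTY, key, key_len, 0, 0,
+ *			 &bt, NULL, NULL);
+ *	if (ret == 0) {
+ *		... modify items in bt ...
+ *		put_btree_block(bt);
+ *	}
+ *	path_repair_reset(&path);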
+ */ +static int btree_walk(struct super_block *sb, struct scoutfs_btree_root *root, + struct btree_path *path, int flags, + void *key, unsigned key_len, unsigned int val_len, u8 bit, + struct scoutfs_btree_block **bt_ret, + void *iter_key, unsigned *iter_len) +{ + struct scoutfs_btree_block *parent = NULL; + struct scoutfs_btree_block *bt = NULL; + struct scoutfs_btree_item *item; + unsigned level; + unsigned pos; + unsigned nr; + int cmp; + int ret; + + if (WARN_ON_ONCE((flags & BTW_DIRTY) && path == NULL) || + WARN_ON_ONCE((flags & (BTW_NEXT|BTW_PREV)) && iter_key == NULL)) + return -EINVAL; + +restart: + path_repair_reset(path); + put_btree_block(parent); + parent = NULL; + put_btree_block(bt); + bt = NULL; + level = root->height; + if (iter_len) + *iter_len = 0; + pos = 0; + ret = 0; + + if (!root->height) { + if (!(flags & BTW_INSERT)) { + ret = -ENOENT; + } else { + ret = get_ref_block(sb, BTW_ALLOC, &root->ref, &bt); + if (ret == 0) { + bt->level = 0; + root->height = 1; + } + } + goto out; + } + + while(level-- > 0) { + if (parent) + ret = get_parent_ref_block(sb, flags, parent, pos, &bt); + else + ret = get_ref_block(sb, flags, &root->ref, &bt); + if (ret) + break; + + /* push the parent once we could have updated its bits */ + if (parent) + path_push(path, parent, pos); + + /* XXX it'd be nice to make this tunable */ + ret = 0 && verify_btree_block(bt, level); + if (ret) + break; + + /* XXX more aggressive block verification, before ref updates? */ + if (bt->level != level) { + ret = -EIO; + break; + } + + /* + * Splitting and merging can add or remove parents or + * change the pos we take through parents to reach the + * block with the search key|bit. In the rare case that + * we split or merge we simply restart the walk rather + * than try and special case modifying the path to + * reflect the tree changes. + */ + if (flags & BTW_INSERT) + ret = try_split(sb, root, key, key_len, val_len, + parent, pos, bt); + else if ((flags & BTW_DELETE) && parent) + ret = try_merge(sb, root, parent, pos, bt); + else + ret = 0; + if (ret > 0) + goto restart; + else if (ret < 0) + break; + + /* dirtying old stops at the last parent level */ + if ((flags & BTW_DIRTY_OLD) && (level < 2)) { + if (level == 1) { + path_push(path, bt, 0); + ret = try_dirty_old(sb, bt, bit); + } else { + ret = -ENOENT; + } + break; + } + + /* done at the leaf */ + if (level == 0) { + path_push(path, bt, 0); + break; + } + + nr = le16_to_cpu(bt->nr_items); + + /* + * Find the next child block for the search key or bit. + * Key searches should always find a child, bit searches + * can find that the bit isn't set in the first block. 
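+		 *
+		 * A key search that runs off the end of a parent means
+		 * that the maximal key on the right spine was lost, so
+		 * we treat it as corruption (-EIO) rather than -ENOENT.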
+ */ + if (flags & BTW_BIT) { + pos = find_pos_bit(bt, 0, bit); + if (pos >= nr) + ret = -ENOENT; + } else { + pos = find_pos(bt, key, key_len, &cmp); + if (pos >= nr) + ret = -EIO; + } + if (ret) + break; + + /* give the caller the next key to iterate towards */ + if (iter_key && (flags & BTW_NEXT) && (pos < (nr - 1))) { + item = pos_item(bt, pos); + *iter_len = item_key_len(item); + memcpy(iter_key, item_key(item), *iter_len); + inc_key(iter_key, iter_len); + + } else if (iter_key && (flags & BTW_PREV) && (pos > 0)) { + item = pos_item(bt, pos - 1); + *iter_len = item_key_len(item); + memcpy(iter_key, item_key(item), *iter_len); + } + + put_btree_block(parent); + parent = bt; + bt = NULL; + } + +out: + put_btree_block(parent); + if (ret) { + put_btree_block(bt); + bt = NULL; + } + + if (bt_ret) + *bt_ret = bt; + else + put_btree_block(bt); + + return ret; +} + +static void init_item_ref(struct scoutfs_btree_item_ref *iref, + struct scoutfs_btree_item *item) +{ + iref->key = item_key(item); + iref->key_len = le16_to_cpu(item->key_len); + iref->val = item_val(item); + iref->val_len = le16_to_cpu(item->val_len); +} + +void scoutfs_btree_put_iref(struct scoutfs_btree_item_ref *iref) +{ + if (!IS_ERR_OR_NULL(iref) && !IS_ERR_OR_NULL(iref->key)) { + put_btree_block(iref->key); + memset(iref, 0, sizeof(struct scoutfs_btree_item_ref)); + } +} + +/* + * Find the item with the given key and point to it from the caller's + * item ref. They're given a reference to the block that they'll drop + * when they're done. + */ +int scoutfs_btree_lookup(struct super_block *sb, struct scoutfs_btree_root *root, + void *key, unsigned key_len, + struct scoutfs_btree_item_ref *iref) +{ + struct scoutfs_btree_item *item; + struct scoutfs_btree_block *bt; + unsigned int pos; + int cmp; + int ret; + + if (WARN_ON_ONCE(iref->key)) + return -EINVAL; + + ret = btree_walk(sb, root, NULL, 0, key, key_len, 0, 0, &bt, NULL, NULL); + if (ret == 0) { + pos = find_pos(bt, key, key_len, &cmp); + if (cmp == 0) { + item = pos_item(bt, pos); + init_item_ref(iref, item); + ret = 0; + } else { + put_btree_block(bt); + ret = -ENOENT; + } + + } + + return ret; +} + +static bool invalid_item(void *key, unsigned key_len, unsigned val_len) +{ + return WARN_ON_ONCE(key_len == 0) || + WARN_ON_ONCE(key_len > SCOUTFS_BTREE_MAX_KEY_LEN) || + WARN_ON_ONCE(val_len > SCOUTFS_BTREE_MAX_VAL_LEN) || + WARN_ON_ONCE(key_len > SCOUTFS_BTREE_GREATEST_KEY_LEN && + cmp_keys(key, key_len, max_key, sizeof(max_key)) > 0); +} + +/* + * Insert a new item in the tree. + * + * 0 is returned on success. -EEXIST is returned if the key is already + * present in the tree. + * + * If no value pointer is given then the item is created with a zero + * length value. + */ +int scoutfs_btree_insert(struct super_block *sb, struct scoutfs_btree_root *root, + void *key, unsigned key_len, + void *val, unsigned val_len) +{ + struct scoutfs_btree_block *bt; + DECLARE_BTREE_PATH(path); + int pos; + int cmp; + int ret; + + if (invalid_item(key, key_len, val_len)) + return -EINVAL; + + ret = btree_walk(sb, root, &path, BTW_DIRTY | BTW_INSERT, key, key_len, + val_len, 0, &bt, NULL, NULL); + if (ret == 0) { + pos = find_pos(bt, key, key_len, &cmp); + if (cmp) { + create_item(bt, pos, 0, key, key_len, val, val_len); + ret = 0; + } else { + ret = -EEXIST; + } + + put_btree_block(bt); + } + + path_repair_reset(&path); + return ret; +} + +/* + * Update a btree item. 
The new key and value must match the lengths
+ * of the existing item's key and value (though it would be easy
+ * enough for us to change that if a caller cared).
+ */
+int scoutfs_btree_update(struct super_block *sb, struct scoutfs_btree_root *root,
+			 void *key, unsigned key_len,
+			 void *val, unsigned val_len)
+{
+	struct scoutfs_btree_item *item;
+	struct scoutfs_btree_block *bt;
+	DECLARE_BTREE_PATH(path);
+	int pos;
+	int cmp;
+	int ret;
+
+	if (invalid_item(key, key_len, val_len))
+		return -EINVAL;
+
+	ret = btree_walk(sb, root, &path, BTW_DIRTY, key, key_len, 0, 0, &bt,
+			 NULL, NULL);
+	if (ret == 0) {
+		pos = find_pos(bt, key, key_len, &cmp);
+		if (cmp == 0) {
+			item = pos_item(bt, pos);
+			if (item_key_len(item) != key_len ||
+			    item_val_len(item) != val_len) {
+				ret = -EINVAL;
+			} else {
+				memcpy(item_key(item), key, key_len);
+				memcpy(item_val(item), val, val_len);
+				ret = 0;
+			}
+		} else {
+			ret = -ENOENT;
+		}
+
+		put_btree_block(bt);
+	}
+
+	path_repair_reset(&path);
+	return ret;
+}
+
+/*
+ * Delete an item from the tree.  -ENOENT is returned if the key isn't
+ * found.
+ */
+int scoutfs_btree_delete(struct super_block *sb, struct scoutfs_btree_root *root,
+			 void *key, unsigned key_len)
+{
+	struct scoutfs_btree_block *bt;
+	DECLARE_BTREE_PATH(path);
+	int pos;
+	int cmp;
+	int ret;
+
+	ret = btree_walk(sb, root, &path, BTW_DELETE | BTW_DIRTY, key, key_len,
+			 0, 0, &bt, NULL, NULL);
+	if (ret == 0) {
+		pos = find_pos(bt, key, key_len, &cmp);
+		if (cmp == 0) {
+			delete_item(bt, pos);
+			ret = 0;
+
+			/* delete the final block in the tree */
+			if (bt->nr_items == 0) {
+				root->height = 0;
+				root->ref.blkno = 0;
+				root->ref.seq = 0;
+			}
+		} else {
+			ret = -ENOENT;
+		}
+
+		put_btree_block(bt);
+	}
+
+	path_repair_reset(&path);
+	return ret;
+}
+
+/*
+ * Iterate from a key value to the next item in the direction of
+ * iteration.  Callers set flags to tell which way to iterate and
+ * whether the search key is inclusive, or not.
+ *
+ * Walking can land in a leaf that doesn't contain any items in the
+ * direction of the iteration.  Walking gives us the next key to walk
+ * towards in this case.  We keep trying until we run out of blocks or
+ * find the next item.  This method is aggressively permissive because
+ * it lets the tree shape change between each walk and allows empty
+ * blocks.
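+ *
+ * A sketch of walking all items with the public wrappers (the buffer
+ * and starting key are illustrative):
+ *
+ *	SCOUTFS_BTREE_ITEM_REF(iref);
+ *	u8 last[SCOUTFS_BTREE_MAX_KEY_LEN];
+ *	unsigned last_len;
+ *
+ *	ret = scoutfs_btree_next(sb, root, key, key_len, &iref);
+ *	while (ret == 0) {
+ *		... use iref.key and iref.val ...
+ *		last_len = iref.key_len;
+ *		memcpy(last, iref.key, last_len);
+ *		scoutfs_btree_put_iref(&iref);
+ *		ret = scoutfs_btree_after(sb, root, last, last_len, &iref);
+ *	}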
+ */
+static int btree_iter(struct super_block *sb, struct scoutfs_btree_root *root,
+		      int flags, void *key, unsigned key_len,
+		      struct scoutfs_btree_item_ref *iref)
+{
+	struct scoutfs_btree_item *item;
+	struct scoutfs_btree_block *bt;
+	unsigned iter_len;
+	unsigned walk_len;
+	void *iter_key;
+	void *walk_key;
+	int pos;
+	int cmp;
+	int ret;
+
+	if (WARN_ON_ONCE(flags & BTW_DIRTY) ||
+	    WARN_ON_ONCE(iref->key))
+		return -EINVAL;
+
+	walk_key = kmalloc(SCOUTFS_BTREE_MAX_KEY_LEN, GFP_NOFS);
+	iter_key = kmalloc(SCOUTFS_BTREE_MAX_KEY_LEN, GFP_NOFS);
+	if (!walk_key || !iter_key) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	memcpy(walk_key, key, key_len);
+	walk_len = key_len;
+
+	for (;;) {
+		ret = btree_walk(sb, root, NULL, flags, walk_key, walk_len,
+				 0, 0, &bt, iter_key, &iter_len);
+		if (ret < 0)
+			break;
+
+		pos = find_pos(bt, key, key_len, &cmp);
+
+		/* point pos towards iteration, find_pos already for _NEXT */
+		if ((flags & BTW_AFTER) && cmp == 0)
+			pos++;
+		else if ((flags & BTW_PREV) && cmp < 0)
+			pos--;
+		else if ((flags & BTW_BEFORE) && cmp == 0)
+			pos--;
+
+		/* found the next item in this leaf */
+		if (pos >= 0 && pos < le16_to_cpu(bt->nr_items)) {
+			item = pos_item(bt, pos);
+			init_item_ref(iref, item);
+			ret = 0;
+			break;
+		}
+
+		put_btree_block(bt);
+
+		/* nothing in this leaf, walk gave us a key */
+		if (iter_len > 0) {
+			memcpy(walk_key, iter_key, iter_len);
+			walk_len = iter_len;
+			continue;
+		}
+
+		ret = -ENOENT;
+		break;
+	}
+
+out:
+	kfree(walk_key);
+	kfree(iter_key);
+
+	return ret;
+}
+
+int scoutfs_btree_next(struct super_block *sb, struct scoutfs_btree_root *root,
+		       void *key, unsigned key_len,
+		       struct scoutfs_btree_item_ref *iref)
+{
+	return btree_iter(sb, root, BTW_NEXT, key, key_len, iref);
+}
+
+int scoutfs_btree_after(struct super_block *sb, struct scoutfs_btree_root *root,
+			void *key, unsigned key_len,
+			struct scoutfs_btree_item_ref *iref)
+{
+	return btree_iter(sb, root, BTW_NEXT | BTW_AFTER, key, key_len, iref);
+}
+
+int scoutfs_btree_prev(struct super_block *sb, struct scoutfs_btree_root *root,
+		       void *key, unsigned key_len,
+		       struct scoutfs_btree_item_ref *iref)
+{
+	return btree_iter(sb, root, BTW_PREV, key, key_len, iref);
+}
+
+int scoutfs_btree_before(struct super_block *sb, struct scoutfs_btree_root *root,
+			 void *key, unsigned key_len,
+			 struct scoutfs_btree_item_ref *iref)
+{
+	return btree_iter(sb, root, BTW_PREV | BTW_BEFORE, key, key_len, iref);
+}
+
+/*
+ * Ensure that the blocks that lead to the item with the given key are
+ * dirty.  The caller can hold a transaction to pin the dirty blocks and
+ * guarantee that later updates of the item will succeed.
+ *
+ * <0 is returned on error, including -ENOENT if the key isn't present.
+ */
+int scoutfs_btree_dirty(struct super_block *sb, struct scoutfs_btree_root *root,
+			void *key, unsigned key_len)
+{
+	struct scoutfs_btree_block *bt;
+	DECLARE_BTREE_PATH(path);
+	int cmp;
+	int ret;
+
+	ret = btree_walk(sb, root, &path, BTW_DIRTY, key, key_len, 0, 0, &bt,
+			 NULL, NULL);
+	if (ret == 0) {
+		find_pos(bt, key, key_len, &cmp);
+		if (cmp == 0)
+			ret = 0;
+		else
+			ret = -ENOENT;
+		put_btree_block(bt);
+	}
+
+	path_repair_reset(&path);
+	return ret;
+}
+
+/*
+ * This initializes all our tracking info based on the super.  Called
+ * before dirtying anything after having read the super or finished
+ * writing dirty blocks.
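+ *
+ * first_dirty_seq also tells get_ref_block() which blocks belong to
+ * the current commit: a block whose seq is >= first_dirty_seq was
+ * allocated during this commit and can be modified in place rather
+ * than cowed again.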
+ */ +static int btree_prepare_write(struct super_block *sb) +{ + struct scoutfs_btree_ring *bring = &SCOUTFS_SB(sb)->super.bring; + DECLARE_BTREE_INFO(sb, bti); + + bti->cur_dirtied = 0; + bti->old_dirtied = 0; + bti->first_dirty_bh = NULL; + bti->last_dirty_bh = NULL; + bti->first_dirty_blkno = le64_to_cpu(bring->first_blkno) + + le64_to_cpu(bring->next_block); + bti->first_dirty_seq = le64_to_cpu(bring->next_seq); + + return 0; +} + +/* + * The caller is serializing btree item dirtying and dirty block writing. + */ +bool scoutfs_btree_has_dirty(struct super_block *sb) +{ + DECLARE_BTREE_INFO(sb, bti); + + return bti->first_dirty_bh != NULL; +} + +/* dirty block allocation built this list */ +#define for_each_dirty_bh(bti, bh, tmp) \ + for (bh = bti->first_dirty_bh; bh && (tmp = bh->b_private, 1); bh = tmp) + +/* + * Write the dirty region of blocks to the ring. The caller still has + * to write the super after we're done. That could fail and we could + * be asked to write the blocks all over again. + * + * We're the only writer. + */ +int scoutfs_btree_write_dirty(struct super_block *sb) +{ + DECLARE_BTREE_INFO(sb, bti); + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_super_block *super = &sbi->super; + struct scoutfs_btree_ring *bring = &super->bring; + struct scoutfs_btree_root *roots[] = { + /* XXX super roots go here */ + NULL, + }; + struct scoutfs_btree_root *root; + struct scoutfs_btree_block *bt; + DECLARE_BTREE_PATH(path); + struct buffer_head *tmp; + struct buffer_head *bh; + struct blk_plug plug; + unsigned next_root; + u8 bit; + int ret; + + if (bti->first_dirty_bh == NULL) + return 0; + + /* cow old dirty blocks to balance ring */ + bit = other_half_bit(bring, bti->first_dirty_blkno); + next_root = 0; + root = roots[next_root]; + while (root && bti->old_dirtied < bti->cur_dirtied) { + ret = btree_walk(sb, root, &path, + BTW_DIRTY | BTW_BIT | BTW_DIRTY_OLD, + NULL, 0, 0, bit, NULL, NULL, NULL); + path_repair_reset(&path); + if (ret == -ENOENT) { + root = roots[next_root++]; + continue; + } + if (ret < 0) + goto out; + } + + /* checksum everything to reduce time between io submission merging */ + for_each_dirty_bh(bti, bh, tmp) { + bt = (void *)bh->b_data; + bt->crc = 0; + bt->crc = cpu_to_le32(crc32c(~0, bt, SCOUTFS_BLOCK_SIZE)); + } + + blk_start_plug(&plug); + + for_each_dirty_bh(bti, bh, tmp) { + lock_buffer(bh); + set_buffer_dirty(bh); + set_buffer_mapped(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + /* XXX should be more careful with flags */ + submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); + } + + blk_finish_plug(&plug); + + ret = 0; + for_each_dirty_bh(bti, bh, tmp) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + ret = -EIO; + } +out: + return ret; +} + +/* + * The dirty blocks and their super reference have been successfully written. + * Remove them from the dirty list and drop their references and prepare + * for the next write. 
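+ *
+ * The expected commit sequence is roughly:
+ *
+ *	scoutfs_btree_write_dirty(sb);
+ *	... write the super that references the new btree roots ...
+ *	scoutfs_btree_write_complete(sb);
+ *
+ * and a failed super write can retry scoutfs_btree_write_dirty().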
+ */ +void scoutfs_btree_write_complete(struct super_block *sb) +{ + DECLARE_BTREE_INFO(sb, bti); + struct buffer_head *bh; + struct buffer_head *tmp; + + for_each_dirty_bh(bti, bh, tmp) { + bh->b_private = NULL; + put_bh(bh); + } + + btree_prepare_write(sb); +} + +int scoutfs_btree_setup(struct super_block *sb) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct btree_info *bti; + + bti = kzalloc(sizeof(struct btree_info), GFP_KERNEL); + if (!bti) + return -ENOMEM; + + mutex_init(&bti->mutex); + + sbi->btree_info = bti; + + btree_prepare_write(sb); + + return 0; +} + +void scoutfs_btree_destroy(struct super_block *sb) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + + kfree(sbi->btree_info); + sbi->btree_info = NULL; +} diff --git a/kmod/src/btree.h b/kmod/src/btree.h new file mode 100644 index 00000000..860923a2 --- /dev/null +++ b/kmod/src/btree.h @@ -0,0 +1,51 @@ +#ifndef _SCOUTFS_BTREE_H_ +#define _SCOUTFS_BTREE_H_ + +#include + +struct scoutfs_btree_item_ref { + void *key; + unsigned key_len; + void *val; + unsigned val_len; +}; + +#define SCOUTFS_BTREE_ITEM_REF(name) \ + struct scoutfs_btree_item_ref name = {NULL,} + +int scoutfs_btree_lookup(struct super_block *sb, struct scoutfs_btree_root *root, + void *key, unsigned key_len, + struct scoutfs_btree_item_ref *iref); +int scoutfs_btree_insert(struct super_block *sb, struct scoutfs_btree_root *root, + void *key, unsigned key_len, + void *val, unsigned val_len); +int scoutfs_btree_update(struct super_block *sb, struct scoutfs_btree_root *root, + void *key, unsigned key_len, + void *val, unsigned val_len); +int scoutfs_btree_delete(struct super_block *sb, struct scoutfs_btree_root *root, + void *key, unsigned key_len); +int scoutfs_btree_next(struct super_block *sb, struct scoutfs_btree_root *root, + void *key, unsigned key_len, + struct scoutfs_btree_item_ref *iref); +int scoutfs_btree_after(struct super_block *sb, struct scoutfs_btree_root *root, + void *key, unsigned key_len, + struct scoutfs_btree_item_ref *iref); +int scoutfs_btree_prev(struct super_block *sb, struct scoutfs_btree_root *root, + void *key, unsigned key_len, + struct scoutfs_btree_item_ref *iref); +int scoutfs_btree_before(struct super_block *sb, struct scoutfs_btree_root *root, + void *key, unsigned key_len, + struct scoutfs_btree_item_ref *iref); +int scoutfs_btree_dirty(struct super_block *sb, struct scoutfs_btree_root *root, + void *key, unsigned key_len); + +void scoutfs_btree_put_iref(struct scoutfs_btree_item_ref *iref); + +bool scoutfs_btree_has_dirty(struct super_block *sb); +int scoutfs_btree_write_dirty(struct super_block *sb); +void scoutfs_btree_write_complete(struct super_block *sb); + +int scoutfs_btree_setup(struct super_block *sb); +void scoutfs_btree_destroy(struct super_block *sb); + +#endif diff --git a/kmod/src/format.h b/kmod/src/format.h index 6ef16073..8a7a2df5 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -76,6 +76,90 @@ struct scoutfs_ring_descriptor { __le64 nr_blocks; } __packed; +/* + * Assert that we'll be able to represent all possible keys with 8 64bit + * primary sort values. + */ +#define SCOUTFS_BTREE_GREATEST_KEY_LEN 32 +/* level >0 segments can have a full key and some metadata */ +#define SCOUTFS_BTREE_MAX_KEY_LEN 320 +/* level 0 segments can have two full keys in the value :/ */ +#define SCOUTFS_BTREE_MAX_VAL_LEN 768 + +/* + * A 4EB test image measured a worst case height of 17. This is plenty + * generous. 
+ */ +#define SCOUTFS_BTREE_MAX_HEIGHT 20 + +/* btree blocks (beyond the first) need to be at least half full */ +#define SCOUTFS_BTREE_FREE_LIMIT \ + ((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) / 2) + +#define SCOUTFS_BTREE_BITS 8 + +/* + * Btree items can have bits associated with them. Their parent items + * reflect all the bits that their child block contain. Thus searches + * can find items with bits set. + * + * @SCOUTFS_BTREE_BIT_HALF1: Tracks blocks found in the first half of + * the ring. It's used to migrate blocks from the old half of the ring + * into the current half as blocks are dirtied. It's not found in leaf + * items but is calculated based on the block number of referenced + * blocks. _HALF2 is identical but for the second half of the ring. + */ +enum { + SCOUTFS_BTREE_BIT_HALF1 = (1 << 0), + SCOUTFS_BTREE_BIT_HALF2 = (1 << 1), +}; + +struct scoutfs_btree_ref { + __le64 blkno; + __le64 seq; +} __packed; + +/* + * A height of X means that the first block read will have level X-1 and + * the leaves will have level 0. + */ +struct scoutfs_btree_root { + struct scoutfs_btree_ref ref; + __u8 height; +} __packed; + +struct scoutfs_btree_item_header { + __le16 off; + __u8 bits; +} __packed; + +struct scoutfs_btree_item { + __le16 key_len; + __le16 val_len; + __u8 data[0]; +} __packed; + +struct scoutfs_btree_block { + __le64 fsid; + __le64 blkno; + __le64 seq; + __le32 crc; + __le32 _pad; + __le16 free_end; + __le16 free_reclaim; + __le16 nr_items; + __le16 bit_counts[SCOUTFS_BTREE_BITS]; + __u8 level; + struct scoutfs_btree_item_header item_hdrs[0]; +} __packed; + +struct scoutfs_btree_ring { + __le64 first_blkno; + __le64 nr_blocks; + __le64 next_block; + __le64 next_seq; +} __packed; + /* * This is absurdly huge. If there was only ever 1 item per segment and * 2^64 items the tree could get this deep. @@ -313,6 +397,7 @@ struct scoutfs_super_block { __le64 ring_blocks; __le64 ring_tail_block; __le64 ring_gen; + struct scoutfs_btree_ring bring; __le64 next_seg_seq; struct scoutfs_ring_descriptor alloc_ring; struct scoutfs_manifest manifest; diff --git a/kmod/src/sort_priv.c b/kmod/src/sort_priv.c new file mode 100644 index 00000000..2acc0802 --- /dev/null +++ b/kmod/src/sort_priv.c @@ -0,0 +1,71 @@ +/* + * A copy of sort() from upstream with a priv argument that's passed + * to comparison, like list_sort(). + */ + +/* ------------------------ */ + +/* + * A fast, small, non-recursive O(nlog n) sort for the Linux kernel + * + * Jan 23 2005 Matt Mackall + */ + +#include +#include +#include +#include +#include "sort_priv.h" + +/** + * sort - sort an array of elements + * @priv: caller's pointer to pass to comparison and swap functions + * @base: pointer to data to sort + * @num: number of elements + * @size: size of each element + * @cmp_func: pointer to comparison function + * @swap_func: pointer to swap function or NULL + * + * This function does a heapsort on the given array. You may provide a + * swap_func function optimized to your element type. + * + * Sorting time is O(n log n) both on average and worst-case. While + * qsort is about 20% faster on average, it suffers from exploitable + * O(n*n) worst-case behavior and extra memory requirements that make + * it less suitable for kernel use. 
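+ *
+ * btree.c uses this to sort a block's item headers with the block as
+ * the priv argument, e.g. (from compact_items()):
+ *
+ *	sort_priv(bt, bt->item_hdrs, nr, sizeof(bt->item_hdrs[0]),
+ *		  cmp_hdr_off, swap_hdr);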
+ */ + +void sort_priv(void *priv, void *base, size_t num, size_t size, + int (*cmp_func)(void *priv, const void *, const void *), + void (*swap_func)(void *priv, void *, void *, int size)) +{ + /* pre-scale counters for performance */ + int i = (num/2 - 1) * size, n = num * size, c, r; + + /* heapify */ + for ( ; i >= 0; i -= size) { + for (r = i; r * 2 + size < n; r = c) { + c = r * 2 + size; + if (c < n - size && + cmp_func(priv, base + c, base + c + size) < 0) + c += size; + if (cmp_func(priv, base + r, base + c) >= 0) + break; + swap_func(priv, base + r, base + c, size); + } + } + + /* sort */ + for (i = n - size; i > 0; i -= size) { + swap_func(priv, base, base + i, size); + for (r = 0; r * 2 + size < i; r = c) { + c = r * 2 + size; + if (c < i - size && + cmp_func(priv, base + c, base + c + size) < 0) + c += size; + if (cmp_func(priv, base + r, base + c) >= 0) + break; + swap_func(priv, base + r, base + c, size); + } + } +} diff --git a/kmod/src/sort_priv.h b/kmod/src/sort_priv.h new file mode 100644 index 00000000..c5fde547 --- /dev/null +++ b/kmod/src/sort_priv.h @@ -0,0 +1,8 @@ +#ifndef _SCOUTFS_SORT_PRIV_H_ +#define _SCOUTFS_SORT_PRIV_H_ + +void sort_priv(void *priv, void *base, size_t num, size_t size, + int (*cmp_func)(void *priv, const void *, const void *), + void (*swap_func)(void *priv, void *, void *, int size)); + +#endif diff --git a/kmod/src/super.c b/kmod/src/super.c index 4c0ba6a3..d7321786 100644 --- a/kmod/src/super.c +++ b/kmod/src/super.c @@ -203,6 +203,9 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &scoutfs_super_ops; + /* btree blocks use long lived bh->b_data refs */ + mapping_set_gfp_mask(sb->s_bdev->bd_inode->i_mapping, GFP_NOFS); + sbi = kzalloc(sizeof(struct scoutfs_sb_info), GFP_KERNEL); sb->s_fs_info = sbi; sbi->sb = sb; diff --git a/kmod/src/super.h b/kmod/src/super.h index fc278b69..95285080 100644 --- a/kmod/src/super.h +++ b/kmod/src/super.h @@ -17,6 +17,7 @@ struct trans_info; struct lock_info; struct net_info; struct inode_sb_info; +struct btree_info; struct scoutfs_sb_info { struct super_block *sb; @@ -34,6 +35,7 @@ struct scoutfs_sb_info { struct compact_info *compact_info; struct data_info *data_info; struct inode_sb_info *inode_sb_info; + struct btree_info *btree_info; wait_queue_head_t trans_hold_wq; struct task_struct *trans_task;