From 97cb75bd88ba0de3f0aacfb8d82ab3b0441db758 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 10 Feb 2017 09:58:37 -0800 Subject: [PATCH] Remove dead btree, block, and buddy code Remove all the unused dead code from the previous btree block design. Signed-off-by: Zach Brown --- kmod/src/Makefile | 6 +- kmod/src/block.c | 786 ------------------- kmod/src/block.h | 38 - kmod/src/btree.c | 1582 -------------------------------------- kmod/src/btree.h | 77 -- kmod/src/buddy.c | 1063 ------------------------- kmod/src/buddy.h | 20 - kmod/src/counters.h | 2 - kmod/src/crc.c | 23 - kmod/src/crc.h | 6 - kmod/src/dir.c | 2 - kmod/src/format.h | 163 ---- kmod/src/inode.c | 7 - kmod/src/inode.h | 2 - kmod/src/ioctl.c | 1 - kmod/src/key.h | 123 --- kmod/src/kvec.c | 2 - kmod/src/name.c | 35 - kmod/src/name.h | 8 - kmod/src/scoutfs_trace.c | 1 - kmod/src/scoutfs_trace.h | 165 ---- kmod/src/super.c | 21 - kmod/src/super.h | 28 - kmod/src/trans.c | 2 - kmod/src/xattr.c | 1 - 25 files changed, 3 insertions(+), 4161 deletions(-) delete mode 100644 kmod/src/block.c delete mode 100644 kmod/src/block.h delete mode 100644 kmod/src/btree.c delete mode 100644 kmod/src/btree.h delete mode 100644 kmod/src/buddy.c delete mode 100644 kmod/src/buddy.h delete mode 100644 kmod/src/crc.c delete mode 100644 kmod/src/crc.h delete mode 100644 kmod/src/name.c delete mode 100644 kmod/src/name.h diff --git a/kmod/src/Makefile b/kmod/src/Makefile index 3e7c9b35..e31924ed 100644 --- a/kmod/src/Makefile +++ b/kmod/src/Makefile @@ -2,6 +2,6 @@ obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include -scoutfs-y += alloc.o bio.o block.o btree.o buddy.o compact.o counters.o crc.o \ - data.o dir.o kvec.o inode.o ioctl.o item.o key.o manifest.o \ - msg.o name.o seg.o scoutfs_trace.o super.o trans.o treap.o xattr.o +scoutfs-y += alloc.o bio.o compact.o counters.o data.o dir.o kvec.o inode.o \ + ioctl.o item.o key.o manifest.o msg.o seg.o scoutfs_trace.o \ + super.o trans.o treap.o xattr.o diff --git a/kmod/src/block.c b/kmod/src/block.c deleted file mode 100644 index 3ecf6a7b..00000000 --- a/kmod/src/block.c +++ /dev/null @@ -1,786 +0,0 @@ -/* - * Copyright (C) 2016 Versity Software, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#include -#include -#include - -#include "super.h" -#include "format.h" -#include "block.h" -#include "crc.h" -#include "counters.h" -#include "buddy.h" - -/* - * scoutfs maintains a cache of metadata blocks in a radix tree. This - * gives us blocks bigger than page size and avoids fixing the location - * of a logical cached block in one possible position in a larger block - * device page cache page. - * - * This does the work to cow dirty blocks, track dirty blocks, generate - * checksums as they're written, only write them in transactions, verify - * checksums on read, and invalidate and retry reads of stale cached - * blocks. (That last bit only has a hint of an implementation.) - * - * XXX - * - tear down dirty blocks left by write errors on unmount - * - multiple smaller page allocs - * - vmalloc? vm_map_ram? 
- * - blocks allocated from per-cpu pages when page size > block size - * - cmwq crc calcs if that makes sense - * - slab of block structs - * - don't verify checksums in end_io context? - * - fall back to multiple single bios per block io if bio alloc fails? - * - fail mount if total_blocks is greater than long radix blkno - */ - -struct scoutfs_block { - struct rw_semaphore rwsem; - atomic_t refcount; - struct list_head lru_entry; - u64 blkno; - - unsigned long bits; - - struct super_block *sb; - struct page *page; - void *data; -}; - -#define DIRTY_RADIX_TAG 0 - -enum { - BLOCK_BIT_UPTODATE = 0, - BLOCK_BIT_ERROR, - BLOCK_BIT_CLASS_SET, -}; - -static struct scoutfs_block *alloc_block(struct super_block *sb, u64 blkno) -{ - struct scoutfs_block *bl; - struct page *page; - - /* we'd need to be just a bit more careful */ - BUILD_BUG_ON(PAGE_SIZE > SCOUTFS_BLOCK_SIZE); - - bl = kzalloc(sizeof(struct scoutfs_block), GFP_NOFS); - if (bl) { - /* change _from_contents if allocs not aligned */ - page = alloc_pages(GFP_NOFS, SCOUTFS_BLOCK_PAGE_ORDER); - WARN_ON_ONCE(!page); - if (page) { - init_rwsem(&bl->rwsem); - atomic_set(&bl->refcount, 1); - INIT_LIST_HEAD(&bl->lru_entry); - bl->blkno = blkno; - bl->sb = sb; - bl->page = page; - bl->data = page_address(page); - trace_printk("allocated bl %p\n", bl); - } else { - kfree(bl); - bl = NULL; - } - } - - return bl; -} - -void scoutfs_block_put(struct scoutfs_block *bl) -{ - if (!IS_ERR_OR_NULL(bl) && atomic_dec_and_test(&bl->refcount)) { - trace_printk("freeing bl %p\n", bl); - WARN_ON_ONCE(!list_empty(&bl->lru_entry)); - __free_pages(bl->page, SCOUTFS_BLOCK_PAGE_ORDER); - kfree(bl); - scoutfs_inc_counter(bl->sb, block_mem_free); - } -} - -static void lru_add(struct scoutfs_sb_info *sbi, struct scoutfs_block *bl) -{ - if (list_empty(&bl->lru_entry)) { - list_add_tail(&bl->lru_entry, &sbi->block_lru_list); - sbi->block_lru_nr++; - } -} - -static void lru_del(struct scoutfs_sb_info *sbi, struct scoutfs_block *bl) -{ - if (!list_empty(&bl->lru_entry)) { - list_del_init(&bl->lru_entry); - sbi->block_lru_nr--; - } -} - -/* - * The caller is referencing a block but doesn't know if its in the LRU - * or not. If it is move it to the tail so it's last to be dropped by - * the shrinker. 
- */ -static void lru_move(struct scoutfs_sb_info *sbi, struct scoutfs_block *bl) -{ - if (!list_empty(&bl->lru_entry)) - list_move_tail(&bl->lru_entry, &sbi->block_lru_list); -} - -static void radix_insert(struct scoutfs_sb_info *sbi, struct scoutfs_block *bl, - bool dirty) -{ - radix_tree_insert(&sbi->block_radix, bl->blkno, bl); - if (dirty) - radix_tree_tag_set(&sbi->block_radix, bl->blkno, - DIRTY_RADIX_TAG); - else - lru_add(sbi, bl); - atomic_inc(&bl->refcount); -} - -/* deleting the blkno from the radix also clears the dirty tag if it was set */ -static void radix_delete(struct scoutfs_sb_info *sbi, struct scoutfs_block *bl) -{ - lru_del(sbi, bl); - radix_tree_delete(&sbi->block_radix, bl->blkno); - scoutfs_block_put(bl); -} - -static int verify_block_header(struct super_block *sb, struct scoutfs_block *bl) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - struct scoutfs_block_header *hdr = bl->data; - u32 crc = scoutfs_crc_block(hdr); - int ret = -EIO; - - if (le32_to_cpu(hdr->crc) != crc) { - printk("blkno %llu hdr crc %x != calculated %x\n", bl->blkno, - le32_to_cpu(hdr->crc), crc); - } else if (super->hdr.fsid && hdr->fsid != super->hdr.fsid) { - printk("blkno %llu fsid %llx != super fsid %llx\n", bl->blkno, - le64_to_cpu(hdr->fsid), le64_to_cpu(super->hdr.fsid)); - } else if (le64_to_cpu(hdr->blkno) != bl->blkno) { - printk("blkno %llu invalid hdr blkno %llx\n", bl->blkno, - le64_to_cpu(hdr->blkno)); - } else { - ret = 0; - } - - return ret; -} - -static void block_read_end_io(struct bio *bio, int err) -{ - struct scoutfs_block *bl = bio->bi_private; - struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb); - - if (!err && !verify_block_header(bl->sb, bl)) - set_bit(BLOCK_BIT_UPTODATE, &bl->bits); - else - set_bit(BLOCK_BIT_ERROR, &bl->bits); - - /* - * uncontended spin_lock in wake_up and unconditional smp_mb to - * make waitqueue_active safe are about the same cost, so we - * prefer the obviously safe choice. - */ - wake_up(&sbi->block_wq); - - scoutfs_block_put(bl); - bio_put(bio); -} - -/* - * Once a transaction block is persistent it's fine to drop the dirty - * tag. It's been checksummed so it can be read in again. It's seq - * will be in the current transaction so it'll simply be dirtied and - * checksummed and written out again. 
- */ -static void block_write_end_io(struct bio *bio, int err) -{ - struct scoutfs_block *bl = bio->bi_private; - struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb); - unsigned long flags; - - if (!err) { - spin_lock_irqsave(&sbi->block_lock, flags); - radix_tree_tag_clear(&sbi->block_radix, - bl->blkno, DIRTY_RADIX_TAG); - lru_add(sbi, bl); - spin_unlock_irqrestore(&sbi->block_lock, flags); - } - - /* not too worried about racing ints */ - if (err && !sbi->block_write_err) - sbi->block_write_err = err; - - if (atomic_dec_and_test(&sbi->block_writes)) - wake_up(&sbi->block_wq); - - scoutfs_block_put(bl); - bio_put(bio); - -} - -static int block_submit_bio(struct scoutfs_block *bl, int rw) -{ - struct super_block *sb = bl->sb; - struct bio *bio; - int ret; - - bio = bio_alloc(GFP_NOFS, SCOUTFS_PAGES_PER_BLOCK); - if (WARN_ON_ONCE(!bio)) - return -ENOMEM; - - bio->bi_sector = bl->blkno << (SCOUTFS_BLOCK_SHIFT - 9); - bio->bi_bdev = sb->s_bdev; - if (rw & WRITE) { - bio->bi_end_io = block_write_end_io; - } else - bio->bi_end_io = block_read_end_io; - bio->bi_private = bl; - - ret = bio_add_page(bio, bl->page, SCOUTFS_BLOCK_SIZE, 0); - if (WARN_ON_ONCE(ret != SCOUTFS_BLOCK_SIZE)) { - bio_put(bio); - return -ENOMEM; - } - - atomic_inc(&bl->refcount); - submit_bio(rw, bio); - - return 0; -} - -/* - * Read an existing block from the device and verify its metadata header. - */ -struct scoutfs_block *scoutfs_block_read(struct super_block *sb, u64 blkno) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_block *found; - struct scoutfs_block *bl; - unsigned long flags; - int ret; - - /* find an existing block, dropping if it's errored */ - spin_lock_irqsave(&sbi->block_lock, flags); - - bl = radix_tree_lookup(&sbi->block_radix, blkno); - if (bl) { - if (test_bit(BLOCK_BIT_ERROR, &bl->bits)) { - radix_delete(sbi, bl); - bl = NULL; - } else { - lru_move(sbi, bl); - atomic_inc(&bl->refcount); - } - } - spin_unlock_irqrestore(&sbi->block_lock, flags); - if (bl) - goto wait; - - /* allocate a new block and try to insert it */ - bl = alloc_block(sb, blkno); - if (!bl) { - ret = -EIO; - goto out; - } - - ret = radix_tree_preload(GFP_NOFS); - if (ret) - goto out; - - spin_lock_irqsave(&sbi->block_lock, flags); - - found = radix_tree_lookup(&sbi->block_radix, blkno); - if (found) { - scoutfs_block_put(bl); - bl = found; - lru_move(sbi, bl); - atomic_inc(&bl->refcount); - } else { - radix_insert(sbi, bl, false); - } - - spin_unlock_irqrestore(&sbi->block_lock, flags); - radix_tree_preload_end(); - - if (!found) { - ret = block_submit_bio(bl, READ_SYNC | REQ_META); - if (ret) - goto out; - } - -wait: - ret = wait_event_interruptible(sbi->block_wq, - test_bit(BLOCK_BIT_UPTODATE, &bl->bits) || - test_bit(BLOCK_BIT_ERROR, &bl->bits)); - if (ret == 0 && test_bit(BLOCK_BIT_ERROR, &bl->bits)) - ret = -EIO; -out: - if (ret) { - scoutfs_block_put(bl); - bl = ERR_PTR(ret); - } - - return bl; -} - -/* - * Read an existing block from the device described by the caller's - * reference. - * - * If the reference sequence numbers don't match then we could be racing - * with another writer. We back off and try again. If it happens too - * many times the caller assumes that we've hit persistent corruption - * and returns an error. - * - * XXX: - * - actually implement this - * - reads that span transactions? - * - writers creating a new dirty block? 
- */ -struct scoutfs_block *scoutfs_block_read_ref(struct super_block *sb, - struct scoutfs_block_ref *ref) -{ - struct scoutfs_block_header *hdr; - struct scoutfs_block *bl; - - bl = scoutfs_block_read(sb, le64_to_cpu(ref->blkno)); - if (!IS_ERR(bl)) { - hdr = scoutfs_block_data(bl); - if (WARN_ON_ONCE(hdr->seq != ref->seq)) { - scoutfs_block_put(bl); - bl = ERR_PTR(-EAGAIN); - } - } - - return bl; -} - -/* - * The caller knows that it's not racing with writers. - */ -int scoutfs_block_has_dirty(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - - return radix_tree_tagged(&sbi->block_radix, DIRTY_RADIX_TAG); -} - -/* - * Submit writes for all the blocks in the radix with their dirty tag - * set. The transaction machinery ensures that the dirty blocks form a - * consistent image and excludes future dirtying while IO is in flight. - * - * Presence in the dirty tree holds a reference. Blocks are only - * removed from the tree which drops the ref when IO completes. - * - * Blocks that see write errors remain in the dirty tree and will try to - * be written again in the next transaction commit. - * - * Reads can traverse the blocks while they're in flight. - */ -int scoutfs_block_write_dirty(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_block *blocks[16]; - struct scoutfs_block *bl; - struct blk_plug plug; - unsigned long flags; - u64 blkno; - int ret; - int nr; - int i; - - atomic_set(&sbi->block_writes, 1); - sbi->block_write_err = 0; - blkno = 0; - ret = 0; - - blk_start_plug(&plug); - - do { - /* get refs to a bunch of dirty blocks */ - spin_lock_irqsave(&sbi->block_lock, flags); - nr = radix_tree_gang_lookup_tag(&sbi->block_radix, - (void **)blocks, blkno, - ARRAY_SIZE(blocks), - DIRTY_RADIX_TAG); - if (nr > 0) - blkno = blocks[nr - 1]->blkno + 1; - for (i = 0; i < nr; i++) - atomic_inc(&blocks[i]->refcount); - spin_unlock_irqrestore(&sbi->block_lock, flags); - - /* submit them in order, being careful to put all on err */ - for (i = 0; i < nr; i++) { - bl = blocks[i]; - - if (ret == 0) { - scoutfs_block_set_crc(bl); - atomic_inc(&sbi->block_writes); - ret = block_submit_bio(bl, WRITE); - if (ret) - atomic_dec(&sbi->block_writes); - } - scoutfs_block_put(bl); - } - } while (nr && !ret); - - blk_finish_plug(&plug); - - /* wait for all io to drain */ - atomic_dec(&sbi->block_writes); - wait_event(sbi->block_wq, atomic_read(&sbi->block_writes) == 0); - - return ret ?: sbi->block_write_err; -} - -/* - * XXX This is a gross hack for writing the super. It doesn't have - * per-block write completion indication. It knows that it's the only - * thing that will be writing. - */ -int scoutfs_block_write_sync(struct scoutfs_block *bl) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb); - int ret; - - BUG_ON(atomic_read(&sbi->block_writes) != 0); - - atomic_inc(&sbi->block_writes); - ret = block_submit_bio(bl, WRITE); - if (ret) - atomic_dec(&sbi->block_writes); - else - wait_event(sbi->block_wq, atomic_read(&sbi->block_writes) == 0); - - return ret ?: sbi->block_write_err; -} - -/* - * Give the caller a dirty block that they can safely modify. If the - * reference refers to a stable clean block then we allocate a new block - * and update the reference. - * - * Blocks are dirtied and modified within a transaction that has a given - * sequence number which we use to determine if the block is currently - * dirty or not. - * - * For now we're using the dirty super block in the sb_info to track the - * dirty seq. 
That'll be different when we have multiple btrees. - * - * Callers are responsible for serializing modification to the reference - * which is probably embedded in some other dirty persistent structure. - */ -struct scoutfs_block *scoutfs_block_dirty_ref(struct super_block *sb, - struct scoutfs_block_ref *ref) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_block_header *hdr; - struct scoutfs_block *copy_bl = NULL; - struct scoutfs_block *bl; - u64 blkno = 0; - int ret; - int err; - - bl = scoutfs_block_read(sb, le64_to_cpu(ref->blkno)); - if (IS_ERR(bl) || ref->seq == sbi->super.hdr.seq) - return bl; - - ret = scoutfs_buddy_alloc_same(sb, &blkno, le64_to_cpu(ref->blkno)); - if (ret < 0) - goto out; - - copy_bl = scoutfs_block_dirty(sb, blkno); - if (IS_ERR(copy_bl)) { - ret = PTR_ERR(copy_bl); - goto out; - } - - hdr = scoutfs_block_data(bl); - ret = scoutfs_buddy_free(sb, hdr->seq, le64_to_cpu(hdr->blkno), 0); - if (ret) - goto out; - - memcpy(scoutfs_block_data(copy_bl), scoutfs_block_data(bl), - SCOUTFS_BLOCK_SIZE); - - hdr = scoutfs_block_data(copy_bl); - hdr->blkno = cpu_to_le64(blkno); - hdr->seq = sbi->super.hdr.seq; - ref->blkno = hdr->blkno; - ref->seq = hdr->seq; - - ret = 0; -out: - scoutfs_block_put(bl); - if (ret) { - if (!IS_ERR_OR_NULL(copy_bl)) { - err = scoutfs_buddy_free(sb, sbi->super.hdr.seq, - blkno, 0); - WARN_ON_ONCE(err); /* freeing dirty must work */ - } - scoutfs_block_put(copy_bl); - copy_bl = ERR_PTR(ret); - } - - return copy_bl; -} - -/* - * Return a dirty metadata block with an updated block header to match - * the current dirty seq. Callers are responsible for serializing - * access to the block and for zeroing unwritten block contents. - * - * Always allocating a new block and replacing any old cached block - * serves a very specific purpose. We can have an unlocked reader - * traversing stable structures actively using a clean block while a - * writer gets that same blkno from the allocator and starts modifying - * it. By always allocating a new block we let the reader continue - * safely using their old immutable block while the writer works on the - * newly allocated block. The old stable block will be freed once the - * reader drops their reference. - */ -struct scoutfs_block *scoutfs_block_dirty(struct super_block *sb, u64 blkno) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_block_header *hdr; - struct scoutfs_block *found; - struct scoutfs_block *bl; - unsigned long flags; - int ret; - - /* allocate a new block and try to insert it */ - bl = alloc_block(sb, blkno); - if (!bl) { - ret = -EIO; - goto out; - } - - set_bit(BLOCK_BIT_UPTODATE, &bl->bits); - - ret = radix_tree_preload(GFP_NOFS); - if (ret) - goto out; - - hdr = bl->data; - *hdr = sbi->super.hdr; - hdr->blkno = cpu_to_le64(blkno); - hdr->seq = sbi->super.hdr.seq; - - spin_lock_irqsave(&sbi->block_lock, flags); - found = radix_tree_lookup(&sbi->block_radix, blkno); - if (found) - radix_delete(sbi, found); - radix_insert(sbi, bl, true); - spin_unlock_irqrestore(&sbi->block_lock, flags); - - radix_tree_preload_end(); - ret = 0; -out: - if (ret) { - scoutfs_block_put(bl); - bl = ERR_PTR(ret); - } - - return bl; -} - -/* - * Allocate a new dirty writable block. The caller must be in a - * transaction so that we can assign the dirty seq. 
- */ -struct scoutfs_block *scoutfs_block_dirty_alloc(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->stable_super; - struct scoutfs_block *bl; - u64 blkno; - int ret; - int err; - - ret = scoutfs_buddy_alloc(sb, &blkno, 0); - if (ret < 0) - return ERR_PTR(ret); - - bl = scoutfs_block_dirty(sb, blkno); - if (IS_ERR(bl)) { - err = scoutfs_buddy_free(sb, super->hdr.seq, blkno, 0); - WARN_ON_ONCE(err); /* freeing dirty must work */ - } - return bl; -} - -/* - * Forget the given block by removing it from the radix and clearing its - * dirty tag. It will not be found by future lookups and will not be - * written out. The caller can still use it until it drops its - * reference. - */ -void scoutfs_block_forget(struct scoutfs_block *bl) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb); - struct scoutfs_block *found; - unsigned long flags; - u64 blkno = bl->blkno; - - spin_lock_irqsave(&sbi->block_lock, flags); - found = radix_tree_lookup(&sbi->block_radix, blkno); - if (found == bl) - radix_delete(sbi, bl); - spin_unlock_irqrestore(&sbi->block_lock, flags); -} - -/* - * We maintain an LRU of blocks so that the shrinker can free the oldest - * under memory pressure. We can't reclaim dirty blocks so only clean - * blocks are kept in the LRU. Blocks are only in the LRU while their - * presence in the radix holds a reference. We don't care if a reader - * has an active ref on a clean block that gets reclaimed. All we're - * doing is removing from the radix. The caller can still work with the - * block and it will be freed once they drop their ref. - * - * If this is called with nr_to_scan == 0 then it only returns the nr. - * We avoid acquiring the lock in that case. - * - * Lookup code only moves blocks around in the LRU while they're in the - * radix. Once we remove the block from the radix we're able to use the - * lru_entry to drop all the blocks outside the lock. - * - * XXX: - * - are sc->nr_to_scan and our return meant to be in units of pages? - * - should we sync a transaction here? - */ -int scoutfs_block_shrink(struct shrinker *shrink, struct shrink_control *sc) -{ - struct scoutfs_sb_info *sbi = container_of(shrink, - struct scoutfs_sb_info, - block_shrinker); - struct scoutfs_block *tmp; - struct scoutfs_block *bl; - unsigned long flags; - unsigned long nr; - LIST_HEAD(list); - - nr = sc->nr_to_scan; - if (!nr) - goto out; - - spin_lock_irqsave(&sbi->block_lock, flags); - - list_for_each_entry_safe(bl, tmp, &sbi->block_lru_list, lru_entry) { - if (nr-- == 0) - break; - atomic_inc(&bl->refcount); - radix_delete(sbi, bl); - list_add(&bl->lru_entry, &list); - } - - spin_unlock_irqrestore(&sbi->block_lock, flags); - - list_for_each_entry_safe(bl, tmp, &list, lru_entry) { - list_del_init(&bl->lru_entry); - scoutfs_block_put(bl); - } - -out: - return min_t(unsigned long, sbi->block_lru_nr, INT_MAX); -} - -void scoutfs_block_set_crc(struct scoutfs_block *bl) -{ - struct scoutfs_block_header *hdr = scoutfs_block_data(bl); - - hdr->crc = cpu_to_le32(scoutfs_crc_block(hdr)); -} - -/* - * Zero the block from the given byte to the end of the block. - */ -void scoutfs_block_zero(struct scoutfs_block *bl, size_t off) -{ - if (WARN_ON_ONCE(off > SCOUTFS_BLOCK_SIZE)) - return; - - if (off < SCOUTFS_BLOCK_SIZE) - memset(scoutfs_block_data(bl) + off, 0, - SCOUTFS_BLOCK_SIZE - off); -} - -/* - * Zero the block from the given byte to the end of the block. 
- */ -void scoutfs_block_zero_from(struct scoutfs_block *bl, void *ptr) -{ - return scoutfs_block_zero(bl, (char *)ptr - - (char *)scoutfs_block_data(bl)); -} - -void scoutfs_block_set_lock_class(struct scoutfs_block *bl, - struct lock_class_key *class) -{ - if (!test_bit(BLOCK_BIT_CLASS_SET, &bl->bits)) { - lockdep_set_class(&bl->rwsem, class); - set_bit(BLOCK_BIT_CLASS_SET, &bl->bits); - } -} - -void scoutfs_block_lock(struct scoutfs_block *bl, bool write, int subclass) -{ - if (write) - down_write_nested(&bl->rwsem, subclass); - else - down_read_nested(&bl->rwsem, subclass); -} - -void scoutfs_block_unlock(struct scoutfs_block *bl, bool write) -{ - if (write) - up_write(&bl->rwsem); - else - up_read(&bl->rwsem); -} - -void *scoutfs_block_data(struct scoutfs_block *bl) -{ - return bl->data; -} - -void *scoutfs_block_data_from_contents(const void *ptr) -{ - unsigned long addr = (unsigned long)ptr; - - return (void *)(addr & ~((unsigned long)SCOUTFS_BLOCK_MASK)); -} - -void scoutfs_block_destroy(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_block *blocks[16]; - struct scoutfs_block *bl; - unsigned long blkno = 0; - int nr; - int i; - - do { - nr = radix_tree_gang_lookup(&sbi->block_radix, (void **)blocks, - blkno, ARRAY_SIZE(blocks)); - for (i = 0; i < nr; i++) { - bl = blocks[i]; - blkno = bl->blkno + 1; - radix_delete(sbi, bl); - } - } while (nr); -} diff --git a/kmod/src/block.h b/kmod/src/block.h deleted file mode 100644 index 0eb86837..00000000 --- a/kmod/src/block.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef _SCOUTFS_BLOCK_H_ -#define _SCOUTFS_BLOCK_H_ - -struct scoutfs_block; - -#include - -struct scoutfs_block *scoutfs_block_read(struct super_block *sb, u64 blkno); -struct scoutfs_block *scoutfs_block_read_ref(struct super_block *sb, - struct scoutfs_block_ref *ref); - -struct scoutfs_block *scoutfs_block_dirty(struct super_block *sb, u64 blkno); -struct scoutfs_block *scoutfs_block_dirty_alloc(struct super_block *sb); -struct scoutfs_block *scoutfs_block_dirty_ref(struct super_block *sb, - struct scoutfs_block_ref *ref); - -int scoutfs_block_has_dirty(struct super_block *sb); -int scoutfs_block_write_dirty(struct super_block *sb); -int scoutfs_block_write_sync(struct scoutfs_block *bl); - -void scoutfs_block_set_crc(struct scoutfs_block *bl); -void scoutfs_block_zero(struct scoutfs_block *bl, size_t off); -void scoutfs_block_zero_from(struct scoutfs_block *bl, void *ptr); - -void scoutfs_block_set_lock_class(struct scoutfs_block *bl, - struct lock_class_key *class); -void scoutfs_block_lock(struct scoutfs_block *bl, bool write, int subclass); -void scoutfs_block_unlock(struct scoutfs_block *bl, bool write); - -void *scoutfs_block_data(struct scoutfs_block *bl); -void *scoutfs_block_data_from_contents(const void *ptr); -void scoutfs_block_forget(struct scoutfs_block *bl); -void scoutfs_block_put(struct scoutfs_block *bl); - -int scoutfs_block_shrink(struct shrinker *shrink, struct shrink_control *sc); -void scoutfs_block_destroy(struct super_block *sb); - -#endif diff --git a/kmod/src/btree.c b/kmod/src/btree.c deleted file mode 100644 index a1410134..00000000 --- a/kmod/src/btree.c +++ /dev/null @@ -1,1582 +0,0 @@ -/* - * Copyright (C) 2016 Zach Brown. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#include -#include -#include -#include - -#include "super.h" -#include "format.h" -#include "block.h" -#include "key.h" -#include "btree.h" - -#include "scoutfs_trace.h" - -/* - * scoutfs stores file system metadata in btrees whose items have fixed - * sized keys and variable length values. - * - * Items are stored as a small header with the key followed by the - * value. New items are allocated from the back of the block towards - * the front. Deleted items can be reclaimed by packing items towards - * the back of the block by walking them in reverse offset order. - * - * A dense array of item offsets after the btree block header header - * maintains the sorted order of the items by their keys. The array is - * small enough that the memmoves to keep it dense involves a few cache - * lines at most. - * - * Parent blocks in the btree have the same format as leaf blocks. - * There's one key for every child reference instead of having separator - * keys between child references. The key in a child reference contains - * the largest key that may be found in the child subtree. The right - * spine of the tree has maximal keys so that they don't have to be - * updated if we insert an item with a key greater than everything in - * the tree. - * - * btree blocks, block references, and items all have sequence numbers - * that are set to the current dirty btree sequence number when they're - * modified. This lets us efficiently search a range of keys for items - * that are newer than a given sequence number. - * - * Operations are performed in one pass down the tree. This lets us - * cascade locks from the root down to the leaves and avoids having to - * maintain a record of the path down the tree. Splits and merges are - * performed as we descend. - * - * XXX - * - do we want a level in the btree header? seems like we would? - * - validate structures on read? - * - internal bl/pos/cmp interface is clumsy.. 
- */ - -/* number of contiguous bytes used by the item header and val of given len */ -static inline unsigned int val_bytes(unsigned int val_len) -{ - return sizeof(struct scoutfs_btree_item) + val_len; -} - -/* number of contiguous bytes used by the item header its current value */ -static inline unsigned int item_bytes(struct scoutfs_btree_item *item) -{ - return val_bytes(le16_to_cpu(item->val_len)); -} - -/* total bytes consumed by an item with given val len: offset, header, value */ -static inline unsigned int all_val_bytes(unsigned int val_len) -{ - return sizeof(((struct scoutfs_btree_block *)NULL)->item_offs[0]) + - val_bytes(val_len); -} - -/* total bytes consumed by an item with its current value */ -static inline unsigned int all_item_bytes(struct scoutfs_btree_item *item) -{ - return all_val_bytes(le16_to_cpu(item->val_len)); -} - -/* number of contig free bytes between item offset and first item */ -static inline unsigned int contig_free(struct scoutfs_btree_block *bt) -{ - unsigned int nr = le16_to_cpu(bt->nr_items); - - return le16_to_cpu(bt->free_end) - - offsetof(struct scoutfs_btree_block, item_offs[nr]); -} - -/* number of contig bytes free after reclaiming free amongst items */ -static inline unsigned int reclaimable_free(struct scoutfs_btree_block *bt) -{ - return contig_free(bt) + le16_to_cpu(bt->free_reclaim); -} - -/* all bytes used by item offsets, headers, and values */ -static inline unsigned int used_total(struct scoutfs_btree_block *bt) -{ - return SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block) - - reclaimable_free(bt); -} - -static inline struct scoutfs_btree_item * -off_item(struct scoutfs_btree_block *bt, __le16 off) -{ - return (void *)bt + le16_to_cpu(off); -} - -static inline struct scoutfs_btree_item * -pos_item(struct scoutfs_btree_block *bt, unsigned int pos) -{ - return off_item(bt, bt->item_offs[pos]); -} - -static inline struct scoutfs_key *greatest_key(struct scoutfs_btree_block *bt) -{ - unsigned int nr = le16_to_cpu(bt->nr_items); - - return &pos_item(bt, nr - 1)->key; -} - -/* - * Copy as much of the item as fits in the value vector. The min of the - * value vec length and the item length is returned, including possibly - * 0. - */ -static int copy_to_val(struct scoutfs_btree_val *val, - struct scoutfs_btree_item *item) -{ - size_t val_len = le16_to_cpu(item->val_len); - char *val_ptr = item->val; - struct kvec *kv; - size_t bytes; - size_t off; - int i; - - /* - * Corruption check, right now we just return -EIO if the - * caller wants this. In the future we can grow this to do - * different things (go readonly, ignore, return error) based - * on the severity of the problem. - */ - /* XXX corruption */ - if (val->check_size_eq && val_len != scoutfs_btree_val_length(val)) - return -EIO; - if (val->check_size_lte && val_len > scoutfs_btree_val_length(val)) - return -EOVERFLOW; - - for (i = 0, off = 0; val_len > 0 && i < ARRAY_SIZE(val->vec); i++) { - kv = &val->vec[i]; - - if (WARN_ON_ONCE(kv->iov_len && !kv->iov_base)) - return -EINVAL; - - bytes = min(val_len, kv->iov_len); - if (bytes) - memcpy(kv->iov_base, val_ptr + off, bytes); - - val_len -= bytes; - off += bytes; - } - - return off; -} - -/* - * Copy the caller's value vector into the item in the tree block. This - * is only called when the item should exactly match the value vector. - * - * -EINVAL is returned if the lengths don't match. 
- */ -static int copy_to_item(struct scoutfs_btree_item *item, - struct scoutfs_btree_val *val) -{ - size_t val_len = le16_to_cpu(item->val_len); - char *val_ptr = item->val; - struct kvec *kv; - size_t bytes; - int i; - - if (val_len != scoutfs_btree_val_length(val)) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(val->vec); i++) { - kv = &val->vec[i]; - - if (WARN_ON_ONCE(kv->iov_len && !kv->iov_base)) - return -EINVAL; - - bytes = min(val_len, kv->iov_len); - if (bytes) - memcpy(val_ptr, kv->iov_base, bytes); - - val_len -= bytes; - val_ptr += bytes; - } - - return 0; -} - -/* - * Returns the sorted item position that an item with the given key - * should occupy. - * - * It sets *cmp to the final comparison of the given key and the - * position's item key. - * - * If the given key is greater then all items' keys then the number of - * items can be returned. Callers need to be careful to test for this - * invalid index. - */ -static int find_pos(struct scoutfs_btree_block *bt, struct scoutfs_key *key, - int *cmp) -{ - unsigned int start = 0; - unsigned int end = le16_to_cpu(bt->nr_items); - unsigned int pos = 0; - - *cmp = -1; - - while (start < end) { - pos = start + (end - start) / 2; - - *cmp = scoutfs_key_cmp(key, &pos_item(bt, pos)->key); - if (*cmp < 0) { - end = pos; - } else if (*cmp > 0) { - start = ++pos; - *cmp = -1; - } else { - break; - } - } - - return pos; -} - -/* move a number of contigous elements from the src index to the dst index */ -#define memmove_arr(arr, dst, src, nr) \ - memmove(&(arr)[dst], &(arr)[src], (nr) * sizeof(*(arr))) - -/* - * Allocate and insert a new item into the block. The caller has made - * sure that there's room for everything. The caller is responsible for - * initializing the value. - */ -static struct scoutfs_btree_item *create_item(struct scoutfs_btree_block *bt, - unsigned int pos, - struct scoutfs_key *key, - unsigned int val_len) -{ - unsigned int nr = le16_to_cpu(bt->nr_items); - struct scoutfs_btree_item *item; - - if (pos < nr) - memmove_arr(bt->item_offs, pos + 1, pos, nr - pos); - - le16_add_cpu(&bt->free_end, -val_bytes(val_len)); - bt->item_offs[pos] = bt->free_end; - nr++; - bt->nr_items = cpu_to_le16(nr); - - BUG_ON(le16_to_cpu(bt->free_end) < - offsetof(struct scoutfs_btree_block, item_offs[nr])); - - item = pos_item(bt, pos); - item->key = *key; - item->seq = bt->hdr.seq; - item->val_len = cpu_to_le16(val_len); - - trace_printk("pos %u off %u\n", pos, le16_to_cpu(bt->item_offs[pos])); - - return item; -} - -/* - * Delete an item from a btree block. We record the amount of space it - * frees to later decide if we can satisfy an insertion by compaction - * instead of splitting. - */ -static void delete_item(struct scoutfs_btree_block *bt, unsigned int pos) -{ - struct scoutfs_btree_item *item = pos_item(bt, pos); - unsigned int nr = le16_to_cpu(bt->nr_items); - - trace_printk("pos %u off %u\n", pos, le16_to_cpu(bt->item_offs[pos])); - - if (pos < (nr - 1)) - memmove_arr(bt->item_offs, pos, pos + 1, nr - 1 - pos); - - le16_add_cpu(&bt->free_reclaim, item_bytes(item)); - nr--; - bt->nr_items = cpu_to_le16(nr); - - /* wipe deleted items to avoid leaking data */ - memset(item, 0, item_bytes(item)); -} - -/* - * Move items from a source block to a destination block. The caller - * tells us if we're moving from the tail of the source block right to - * the head of the destination block, or vice versa. We stop moving - * once we've moved enough bytes of items. 
- */ -static void move_items(struct scoutfs_btree_block *dst, - struct scoutfs_btree_block *src, bool move_right, - int to_move) -{ - struct scoutfs_btree_item *from; - struct scoutfs_btree_item *to; - unsigned int t; - unsigned int f; - - if (move_right) { - f = le16_to_cpu(src->nr_items) - 1; - t = 0; - } else { - f = 0; - t = le16_to_cpu(dst->nr_items); - } - - while (f < le16_to_cpu(src->nr_items) && to_move > 0) { - from = pos_item(src, f); - - to = create_item(dst, t, &from->key, - le16_to_cpu(from->val_len)); - - memcpy(to, from, item_bytes(from)); - to_move -= all_item_bytes(from); - - delete_item(src, f); - if (move_right) - f--; - else - t++; - } -} - -static int sort_key_cmp(const void *A, const void *B) -{ - struct scoutfs_btree_block *bt = scoutfs_block_data_from_contents(A); - const __le16 * __packed a = A; - const __le16 * __packed b = B; - - return scoutfs_key_cmp(&off_item(bt, *a)->key, &off_item(bt, *b)->key); -} - -static int sort_off_cmp(const void *A, const void *B) -{ - const __le16 * __packed a = A; - const __le16 * __packed b = B; - - return (int)le16_to_cpu(*a) - (int)le16_to_cpu(*b); -} - -static void sort_off_swap(void *A, void *B, int size) -{ - __le16 * __packed a = A; - __le16 * __packed b = B; - - swap(*a, *b); -} - -/* - * As items are deleted they create fragmented free space. Even if we - * indexed free space in the block it could still get sufficiently - * fragmented to force a split on insertion even though the two - * resulting blocks would have less than the minimum space consumed by - * items. - * - * We don't bother implementing free space indexing and addressing that - * corner case. Instead we track the number of bytes that could be - * reclaimed if we compacted the item space after the free_end offset. - * block. If this additional free space would satisfy an insertion then - * we compact the items instead of splitting the block. - * - * We move the free space to the center of the block by walking - * backwards through the items in offset order, moving items into free - * space between items towards the end of the block. - * - * We don't have specific metadata to either walk the items in offset - * order or to update the item offsets as we move items. We sort the - * item offset array to achieve both ends. First we sort it by offset - * so we can walk in reverse order. As we move items we update their - * position and then sort by keys once we're done. - * - * Compaction is only attempted during descent as we find a block that - * needs more or less free space. The caller has the parent locked for - * writing and there are no references to the items at this point so - * it's safe to scramble the block contents. - */ -static void compact_items(struct scoutfs_btree_block *bt) -{ - unsigned int nr = le16_to_cpu(bt->nr_items); - struct scoutfs_btree_item *from; - struct scoutfs_btree_item *to; - unsigned int bytes; - __le16 end; - int i; - - trace_printk("free_reclaim %u\n", le16_to_cpu(bt->free_reclaim)); - - sort(bt->item_offs, nr, sizeof(bt->item_offs[0]), - sort_off_cmp, sort_off_swap); - - end = cpu_to_le16(SCOUTFS_BLOCK_SIZE); - - for (i = nr - 1; i >= 0; i--) { - from = pos_item(bt, i); - - bytes = item_bytes(from); - le16_add_cpu(&end, -bytes); - to = off_item(bt, end); - bt->item_offs[i] = end; - - if (from != to) - memmove(to, from, bytes); - } - - bt->free_end = end; - bt->free_reclaim = 0; - - sort(bt->item_offs, nr, sizeof(bt->item_offs[0]), - sort_key_cmp, sort_off_swap); -} - - -/* - * Let's talk about btree locking. 
- * - * The main metadata btree has lots of callers who want concurrency. - * They have their own locks that protect multi item consistency -- say - * an inode's i_mutex protecting the items related to a given inode. - * But it's our responsibility to lock the btree itself. - * - * Our btree operations are implemented with a single walk down the - * tree. This gives us the opportunity to cascade block locks down the - * tree. We first lock the root. Then we lock the first block and - * unlock the root. Then lock the next block and unlock the first - * block. And so on down the tree. After contention on the root and - * first block we have lots of concurrency down paths of the tree to the - * leaves. - * - * Merging during descent has to lock the sibling block that it's - * pulling items from. It has to acquire these nested locks in - * consistent tree order. - * - * The cow tree updates let us skip block locking entirely for stable - * blocks because they're read only. All the blocks in the stable - * super tree are stable so we don't have to lock that tree at all. - * We let the block layer use the header's seq to avoid locking - * stable blocks. - * - * lockdep has to not be freaked out by all of this. The cascading - * block locks really make it angry without annotation so we add classes - * for each level and use nested subclasses for the locking of siblings - * during merge. - */ - -static void set_block_lock_class(struct scoutfs_block *bl, int level) -{ -#ifdef CONFIG_LOCKDEP - static struct lock_class_key tree_depth_classes[SCOUTFS_BTREE_MAX_DEPTH]; - - scoutfs_block_set_lock_class(bl, &tree_depth_classes[level]); -#endif -} - -static void lock_tree_block(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_block *bl, bool write, int subclass) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - - if (root == &sbi->super.btree_root) { - if (bl) { - scoutfs_block_lock(bl, write, subclass); - } else { - if (write) - down_write(&sbi->btree_rwsem); - else - down_read(&sbi->btree_rwsem); - } - } -} - -static void unlock_tree_block(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_block *bl, bool write) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - - if (root == &sbi->super.btree_root) { - if (bl) { - scoutfs_block_unlock(bl, write); - } else { - if (write) - up_write(&sbi->btree_rwsem); - else - up_read(&sbi->btree_rwsem); - } - } -} - -/* - * Allocate and initialize a new tree block. The caller adds references - * to it. - */ -static struct scoutfs_block *alloc_tree_block(struct super_block *sb, int level) -{ - struct scoutfs_btree_block *bt; - struct scoutfs_block *bl; - - bl = scoutfs_block_dirty_alloc(sb); - if (!IS_ERR(bl)) { - bt = scoutfs_block_data(bl); - - bt->free_end = cpu_to_le16(SCOUTFS_BLOCK_SIZE); - bt->free_reclaim = 0; - bt->nr_items = 0; - - set_block_lock_class(bl, level); - } - - return bl; -} - -/* the caller has ensured that the free must succeed */ -static void free_tree_block(struct super_block *sb, struct scoutfs_block *bl) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - struct scoutfs_btree_block *bt = scoutfs_block_data(bl); - int err; - - BUG_ON(bt->hdr.seq != super->hdr.seq); - - scoutfs_block_forget(bl); - err = scoutfs_buddy_free(sb, bt->hdr.seq, - le64_to_cpu(bt->hdr.blkno), 0); - BUG_ON(err); -} - -/* - * Allocate a new tree block and point the root at it. The caller - * is responsible for the items in the new root block. 
- */ -static struct scoutfs_block *grow_tree(struct super_block *sb, - struct scoutfs_btree_root *root) -{ - struct scoutfs_block_header *hdr; - struct scoutfs_block *bl; - - bl = alloc_tree_block(sb, root->height); - if (!IS_ERR(bl)) { - hdr = scoutfs_block_data(bl); - - root->height++; - root->ref.blkno = hdr->blkno; - root->ref.seq = hdr->seq; - - set_block_lock_class(bl, root->height - 1); - } - - return bl; -} - -static struct scoutfs_block *get_block_ref(struct super_block *sb, int level, - struct scoutfs_block_ref *ref, - bool dirty) -{ - struct scoutfs_block *bl; - - if (dirty) - bl = scoutfs_block_dirty_ref(sb, ref); - else - bl = scoutfs_block_read_ref(sb, ref); - - if (!IS_ERR(bl)) - set_block_lock_class(bl, level); - - return bl; -} - -/* - * Create a new item in the parent which references the child. The caller - * specifies the key in the item that describes the items in the child. - */ -static void create_parent_item(struct scoutfs_btree_block *parent, - unsigned int pos, - struct scoutfs_btree_block *child, - struct scoutfs_key *key) -{ - struct scoutfs_btree_item *item; - struct scoutfs_block_ref ref = { - .blkno = child->hdr.blkno, - .seq = child->hdr.seq, - }; - - item = create_item(parent, pos, key, sizeof(ref)); - memcpy(&item->val, &ref, sizeof(ref)); -} - -/* - * See if we need to split this block while descending for insertion so - * that we have enough space to insert. - * - * Parent blocks need enough space for a new item and child ref if a - * child block splits. Leaf blocks need enough space to insert the new - * item with its value. - * - * We split to the left so that the greatest key in the existing block - * doesn't change so we don't have to update the key in its parent item. - * - * If the search key falls in the new split block then we return it to - * the caller to walk through. - * - * The caller has the parent (or root) and our block locked. We don't - * have to lock the blocks we allocate while we have the references to - * them locked. We only need to lock the new sibling if we return it - * instead of our given block for the caller to continue descent. 
- */ -static struct scoutfs_block *try_split(struct super_block *sb, - struct scoutfs_btree_root *root, - int level, struct scoutfs_key *key, - unsigned int val_len, - struct scoutfs_btree_block *parent, - unsigned int parent_pos, - struct scoutfs_block *right_bl) -{ - struct scoutfs_btree_block *right = scoutfs_block_data(right_bl); - struct scoutfs_btree_block *left; - struct scoutfs_block *left_bl; - struct scoutfs_block *par_bl = NULL; - struct scoutfs_key maximal; - unsigned int all_bytes; - - if (level) - val_len = sizeof(struct scoutfs_block_ref); - all_bytes = all_val_bytes(val_len); - - if (contig_free(right) >= all_bytes) - return right_bl; - - if (reclaimable_free(right) >= all_bytes) { - compact_items(right); - return right_bl; - } - - /* alloc split neighbour first to avoid unwinding tree growth */ - left_bl = alloc_tree_block(sb, level); - if (IS_ERR(left_bl)) { - unlock_tree_block(sb, root, right_bl, true); - scoutfs_block_put(right_bl); - return left_bl; - } - left = scoutfs_block_data(left_bl); - - if (!parent) { - par_bl = grow_tree(sb, root); - if (IS_ERR(par_bl)) { - free_tree_block(sb, left_bl); - scoutfs_block_put(left_bl); - unlock_tree_block(sb, root, right_bl, true); - scoutfs_block_put(right_bl); - return par_bl; - } - - parent = scoutfs_block_data(par_bl); - parent_pos = 0; - - scoutfs_set_max_key(&maximal); - create_parent_item(parent, parent_pos, right, &maximal); - } - - move_items(left, right, false, used_total(right) / 2); - create_parent_item(parent, parent_pos, left, greatest_key(left)); - parent_pos++; /* not that anything uses it again :P */ - - if (scoutfs_key_cmp(key, greatest_key(left)) <= 0) { - /* insertion will go to the new left block */ - unlock_tree_block(sb, root, right_bl, true); - lock_tree_block(sb, root, left_bl, true, 0); - swap(right_bl, left_bl); - } else { - /* insertion will still go through us, might need to compact */ - if (contig_free(right) < all_bytes) - compact_items(right); - } - - scoutfs_block_put(par_bl); - scoutfs_block_put(left_bl); - - return right_bl; -} - -/* - * This is called during descent for deletion when we have a parent and - * might need to merge items from a sibling block if this block has too - * much free space. Eventually we'll be able to fit all of the - * sibling's items in our free space which lets us delete the sibling - * block. - * - * The error handling here is a little weird. We're returning an - * ERR_PTR buffer to match splitting so that the walk can handle errors - * from both easily. We have to unlock and release our buffer to return - * an error. - * - * The caller locks the parent and our given block. We need to - * lock sibling blocks in consistent tree order. Our common case - * has us pulling from our left sibling so we prefer to lock blocks - * from right to left. Splitting doesn't hold both sibling locks. - * - * We free sibling or parent btree block blknos if we drain them of items. - * They're dirtied either by descent or before we start migrating items - * so freeing their blkno must succeed. 
- * - * XXX this could more cleverly chose a merge candidate sibling - */ -static struct scoutfs_block *try_merge(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_block *par_bl, - int level, unsigned int pos, - struct scoutfs_block *bl) -{ - struct scoutfs_btree_block *parent = scoutfs_block_data(par_bl); - struct scoutfs_btree_block *bt = scoutfs_block_data(bl); - struct scoutfs_btree_item *sib_item; - struct scoutfs_btree_block *sib_bt; - struct scoutfs_block *sib_bl; - unsigned int sib_pos; - bool move_right; - int to_move; - - if (reclaimable_free(bt) <= SCOUTFS_BTREE_FREE_LIMIT) - return bl; - - /* move items right into our block if we have a left sibling */ - if (pos) { - sib_pos = pos - 1; - move_right = true; - } else { - sib_pos = pos + 1; - move_right = false; - } - sib_item = pos_item(parent, sib_pos); - - sib_bl = get_block_ref(sb, level, (void *)sib_item->val, true); - if (IS_ERR(sib_bl)) { - /* XXX do we need to unlock this? don't think so */ - scoutfs_block_put(bl); - return sib_bl; - } - sib_bt = scoutfs_block_data(sib_bl); - - if (!move_right) { - unlock_tree_block(sb, root, bl, true); - lock_tree_block(sb, root, sib_bl, true, 0); - lock_tree_block(sb, root, bl, true, 1); - - if (reclaimable_free(bt) <= SCOUTFS_BTREE_FREE_LIMIT) { - unlock_tree_block(sb, root, sib_bl, true); - scoutfs_block_put(sib_bl); - return bl; - } - } else { - lock_tree_block(sb, root, sib_bl, true, 1); - } - - if (used_total(sib_bt) <= reclaimable_free(bt)) - to_move = used_total(sib_bt); - else - to_move = reclaimable_free(bt) - SCOUTFS_BTREE_FREE_LIMIT; - - /* - * Make sure there's room to move a max size item if it's the - * next in line when we only have one byte left to try and move. - * - * XXX This is getting awfully fiddly. Should we be refactoring - * item insertion/deletion to do this for us? - */ - if (contig_free(bt) < (to_move + (SCOUTFS_MAX_ITEM_LEN - 1))) - compact_items(bt); - - trace_printk("sib_pos %d move_right %u to_move %u\n", - sib_pos, move_right, to_move); - - move_items(bt, sib_bt, move_right, to_move); - - /* update our parent's ref if we changed our greatest key */ - if (!move_right) - pos_item(parent, pos)->key = *greatest_key(bt); - - /* delete an empty sib or update if we changed its greatest key */ - if (le16_to_cpu(sib_bt->nr_items) == 0) { - delete_item(parent, sib_pos); - free_tree_block(sb, sib_bl); - } else if (move_right) { - sib_item->key = *greatest_key(sib_bt); - } - - /* and finally shrink the tree if our parent is the root with 1 */ - if (le16_to_cpu(parent->nr_items) == 1) { - root->height--; - root->ref.blkno = bt->hdr.blkno; - root->ref.seq = bt->hdr.seq; - free_tree_block(sb, par_bl); - /* caller just unlocks and drops parent */ - } - - unlock_tree_block(sb, root, sib_bl, true); - scoutfs_block_put(sib_bl); - - return bl; -} - -enum { - WALK_INSERT = 1, - WALK_DELETE, - WALK_NEXT_SEQ, - WALK_DIRTY, -}; - -static u64 item_block_ref_seq(struct scoutfs_btree_item *item) -{ - struct scoutfs_block_ref *ref = (void *)item->val; - - return le64_to_cpu(ref->seq); -} - -/* - * Return true if we should skip this item while iterating by sequence - * number. If it's a parent then we test the block ref's seq, if it's a - * leaf item then we check the item's seq. 
- */ -static bool skip_pos_seq(struct scoutfs_btree_block *bt, unsigned int pos, - int level, u64 seq, int op) -{ - struct scoutfs_btree_item *item; - - if (op != WALK_NEXT_SEQ || pos >= le16_to_cpu(bt->nr_items)) - return false; - - item = pos_item(bt, pos); - - return ((level > 0 && item_block_ref_seq(item) < seq) || - (level == 0 && le64_to_cpu(item->seq) < seq)); -} - -/* - * Return the next sorted item position, possibly skipping those with - * sequence numbers less than the desired sequence number. - */ -static unsigned int next_pos_seq(struct scoutfs_btree_block *bt, - unsigned int pos, int level, u64 seq, int op) -{ - do { - pos++; - } while (skip_pos_seq(bt, pos, level, seq, op)); - - return pos; -} - -/* - * Return the first item after the given key, possibly skipping those - * with sequence numbers less than the desired sequence number. - */ -static unsigned int find_pos_after_seq(struct scoutfs_btree_block *bt, - struct scoutfs_key *key, int level, - u64 seq, int op) -{ - unsigned int pos; - int cmp; - - pos = find_pos(bt, key, &cmp); - if (skip_pos_seq(bt, pos, level, seq, op)) - pos = next_pos_seq(bt, pos, level, seq, op); - - return pos; -} - -/* - * Verify that the btree block isn't corrupt. This is way too expensive - * to do for each block access though that's very helpful for debugging - * btree block corruption. - * - * It should be done the first time we read blocks and it doing it for - * every block access should be hidden behind runtime options. - * - * XXX - * - make sure items don't overlap - * - make sure offs point to live items - * - do things with level - * - see if item keys make sense - */ -static int verify_btree_block(struct scoutfs_btree_block *bt, int level, - struct scoutfs_key *small, - struct scoutfs_key *large) -{ - struct scoutfs_btree_item *item; - struct scoutfs_key *prev; - unsigned int bytes = 0; - unsigned int after_offs = sizeof(struct scoutfs_btree_block); - unsigned int first_off; - unsigned int off; - unsigned int nr; - unsigned int i = 0; - int bad = 1; - - nr = le16_to_cpu(bt->nr_items); - if (nr == 0) - goto out; - - if (nr > SCOUTFS_BTREE_MAX_ITEMS) { - nr = SCOUTFS_BTREE_MAX_ITEMS; - goto out; - } - - after_offs = offsetof(struct scoutfs_btree_block, item_offs[nr]); - first_off = SCOUTFS_BLOCK_SIZE; - - for (i = 0; i < nr; i++) { - - off = le16_to_cpu(bt->item_offs[i]); - if (off >= SCOUTFS_BLOCK_SIZE || off < after_offs) - goto out; - - first_off = min(first_off, off); - - item = pos_item(bt, i); - bytes += item_bytes(item); - - if ((i == 0 && scoutfs_key_cmp(&item->key, small) < 0) || - (i > 0 && scoutfs_key_cmp(&item->key, prev) <= 0) || - (i == (nr - 1) && scoutfs_key_cmp(&item->key, large) > 0)) - goto out; - - prev = &item->key; - } - - if (first_off < le16_to_cpu(bt->free_end)) - goto out; - - if ((le16_to_cpu(bt->free_end) + bytes + - le16_to_cpu(bt->free_reclaim)) != SCOUTFS_BLOCK_SIZE) - goto out; - - bad = 0; -out: - if (bad) { - printk("bt %p blkno %llu level %d small "CKF" large "CKF" end %u reclaim %u nr %u (max %lu after %u bytes %u)\n", - bt, le64_to_cpu(bt->hdr.blkno), level, - CKA(small), CKA(large), le16_to_cpu(bt->free_end), - le16_to_cpu(bt->free_reclaim), bt->nr_items, - SCOUTFS_BTREE_MAX_ITEMS, after_offs, bytes); - for (i = 0; i < nr; i++) { - item = pos_item(bt, i); - off = le16_to_cpu(bt->item_offs[i]); - printk(" [%u] off %u key "CKF" len %u\n", - i, off, CKA(&item->key), - le16_to_cpu(item->val_len)); - } - BUG_ON(bad); - } - - return 0; -} - -/* - * Return the leaf block that should contain the given key. 
The caller - * is responsible for searching the leaf block and performing their - * operation. The block is returned locked for either reading or - * writing depending on the operation. - * - * As we descend through parent items we set prev_key or next_key to the - * last key in the previous sibling's block or to the first key in the - * next sibling's block, respectively. This is used by iteration to - * keep searching sibling blocks if their search key falls at the end of - * a leaf in their search direction. - */ -static struct scoutfs_block *btree_walk(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key, - struct scoutfs_key *prev_key, - struct scoutfs_key *next_key, - unsigned int val_len, u64 seq, int op) -{ - struct scoutfs_btree_block *parent = NULL; - struct scoutfs_block *par_bl = NULL; - struct scoutfs_block *bl = NULL; - struct scoutfs_btree_item *item = NULL; - struct scoutfs_block_ref *ref; - struct scoutfs_key small; - struct scoutfs_key large; - unsigned int level; - unsigned int pos = 0; - const bool dirty = op == WALK_INSERT || op == WALK_DELETE || - op == WALK_DIRTY; - int ret; - - /* no sibling blocks if we don't have parent blocks */ - if (next_key) - scoutfs_set_max_key(next_key); - if (prev_key) - scoutfs_key_set_zero(prev_key); - - lock_tree_block(sb, root, NULL, dirty, 0); - - ref = &root->ref; - level = root->height; - - if (!root->height) { - if (op == WALK_INSERT) { - bl = ERR_PTR(-ENOENT); - } else { - bl = grow_tree(sb, root); - if (!IS_ERR(bl)) { - lock_tree_block(sb, root, bl, dirty, 0); - unlock_tree_block(sb, root, NULL, dirty); - } - } - goto out; - } - - - /* skip the whole tree if the root ref's seq is old */ - if (op == WALK_NEXT_SEQ && le64_to_cpu(ref->seq) < seq) { - bl = ERR_PTR(-ENOENT); - goto out; - } - - scoutfs_set_key(&small, 0, 0, 0); - scoutfs_set_key(&large, ~0ULL, ~0, ~0ULL); - - while (level--) { - /* XXX hmm, need to think about retry */ - bl = get_block_ref(sb, level, ref, dirty); - if (IS_ERR(bl)) - break; - - /* XXX enable this */ - ret = 0 && verify_btree_block(scoutfs_block_data(bl), level, - &small, &large); - if (ret) { - scoutfs_block_put(bl); - bl = ERR_PTR(ret); - break; - } - - lock_tree_block(sb, root, bl, dirty, 0); - - if (op == WALK_INSERT) - bl = try_split(sb, root, level, key, val_len, parent, - pos, bl); - if ((op == WALK_DELETE) && parent) - bl = try_merge(sb, root, par_bl, level, pos, bl); - if (IS_ERR(bl)) - break; - - unlock_tree_block(sb, root, par_bl, dirty); - - if (!level) - break; - - scoutfs_block_put(par_bl); - par_bl = bl; - parent = scoutfs_block_data(par_bl); - - /* - * Find the parent item that references the next child - * block to search. If we're skipping items with old - * seqs then we might not have any child items to - * search. - */ - pos = find_pos_after_seq(parent, key, level, seq, op); - if (pos >= le16_to_cpu(parent->nr_items)) { - /* current block dropped as parent below */ - if (op == WALK_NEXT_SEQ) - bl = ERR_PTR(-ENOENT); - else - bl = ERR_PTR(-EIO); - break; - } - - /* XXX verify sane length */ - item = pos_item(parent, pos); - ref = (void *)item->val; - - /* - * Update the keys that iterators should continue - * searching from. Keep in mind that iteration is read - * only so the parent item won't be changed splitting or - * merging. 
- */ - if (next_key) { - *next_key = item->key; - scoutfs_inc_key(next_key); - } - - if (pos) { - small = pos_item(parent, pos - 1)->key; - if (prev_key) - *prev_key = small; - } - large = item->key; - } - -out: - if (IS_ERR(bl)) - unlock_tree_block(sb, root, par_bl, dirty); - scoutfs_block_put(par_bl); - - return bl; -} - -/* - * Copy the given value identified by the given key into the caller's - * buffer. The number of bytes copied is returned, -ENOENT if the key - * wasn't found, or -errno on errors. - */ -int scoutfs_btree_lookup(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key, - struct scoutfs_btree_val *val) -{ - struct scoutfs_btree_item *item; - struct scoutfs_btree_block *bt; - struct scoutfs_block *bl; - unsigned int pos; - int cmp; - int ret; - - trace_scoutfs_btree_lookup(sb, key, scoutfs_btree_val_length(val)); - - bl = btree_walk(sb, root, key, NULL, NULL, 0, 0, 0); - if (IS_ERR(bl)) - return PTR_ERR(bl); - bt = scoutfs_block_data(bl); - - pos = find_pos(bt, key, &cmp); - if (cmp == 0) { - item = pos_item(bt, pos); - ret = copy_to_val(val, item); - } else { - ret = -ENOENT; - } - - unlock_tree_block(sb, root, bl, false); - scoutfs_block_put(bl); - - trace_printk("key "CKF" ret %d\n", CKA(key), ret); - - return ret; -} - -/* - * Insert a new item in the tree. - * - * 0 is returned on success. -EEXIST is returned if the key is already - * present in the tree. - * - * If no value pointer is given then the item is created with a zero - * length value. - */ -int scoutfs_btree_insert(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key, - struct scoutfs_btree_val *val) -{ - struct scoutfs_btree_item *item; - struct scoutfs_btree_block *bt; - struct scoutfs_block *bl; - unsigned int val_len; - int pos; - int cmp; - int ret; - - if (val) - val_len = scoutfs_btree_val_length(val); - else - val_len = 0; - - trace_scoutfs_btree_insert(sb, key, val_len); - - if (WARN_ON_ONCE(val_len > SCOUTFS_MAX_ITEM_LEN)) - return -EINVAL; - - bl = btree_walk(sb, root, key, NULL, NULL, val_len, 0, WALK_INSERT); - if (IS_ERR(bl)) - return PTR_ERR(bl); - bt = scoutfs_block_data(bl); - - pos = find_pos(bt, key, &cmp); - if (cmp) { - item = create_item(bt, pos, key, val_len); - if (val) - ret = copy_to_item(item, val); - else - ret = 0; - } else { - ret = -EEXIST; - } - - unlock_tree_block(sb, root, bl, true); - scoutfs_block_put(bl); - - return ret; -} - -/* - * Delete an item from the tree. -ENOENT is returned if the key isn't - * found. - */ -int scoutfs_btree_delete(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key) -{ - struct scoutfs_btree_block *bt; - struct scoutfs_block *bl; - int pos; - int cmp; - int ret; - - trace_scoutfs_btree_delete(sb, key, 0); - - bl = btree_walk(sb, root, key, NULL, NULL, 0, 0, WALK_DELETE); - if (IS_ERR(bl)) { - ret = PTR_ERR(bl); - goto out; - } - bt = scoutfs_block_data(bl); - - pos = find_pos(bt, key, &cmp); - if (cmp == 0) { - delete_item(bt, pos); - ret = 0; - - /* XXX this locking is broken.. hold root rwsem? */ - - /* delete the final block in the tree */ - if (bt->nr_items == 0) { - root->height = 0; - root->ref.blkno = 0; - root->ref.seq = 0; - - free_tree_block(sb, bl); - } - } else { - ret = -ENOENT; - } - - unlock_tree_block(sb, root, bl, true); - scoutfs_block_put(bl); - -out: - trace_printk("key "CKF" ret %d\n", CKA(key), ret); - return ret; -} - -/* - * Find the next key in the tree starting from 'first', and ending at - * 'last'. 
'found', 'found_seq', and 'val' are set to the discovered - * item if they're provided. - * - * The caller can limit results to items with a sequence number greater - * than or equal to their sequence number. - * - * The only tricky bit is that they key we're searching for might not - * exist in the tree. We can get to the leaf and find that there are no - * greater items in the leaf. We have to search again from the keys - * greater than the parent item's keys which the walk gives us. We also - * star the search over from this next key if walking while filtering - * based on seqs terminates early. - * - * Returns the bytes copied into the value (0 if not provided), -ENOENT - * if there is no item past first until last, or -errno on errors. - * - * It's a common pattern to use the same key for first and found so we're - * careful to copy first before we modify found. - */ -static int btree_next(struct super_block *sb, struct scoutfs_btree_root *root, - struct scoutfs_key *first, struct scoutfs_key *last, - u64 seq, int op, struct scoutfs_key *found, - u64 *found_seq, struct scoutfs_btree_val *val) -{ - struct scoutfs_btree_item *item; - struct scoutfs_btree_block *bt; - struct scoutfs_key start = *first; - struct scoutfs_key key = *first; - struct scoutfs_key next_key; - struct scoutfs_block *bl; - int pos; - int ret; - - trace_printk("finding next first "CKF" last "CKF"\n", - CKA(&start), CKA(last)); - - /* find the leaf that contains the next item after the key */ - ret = -ENOENT; - while (scoutfs_key_cmp(&key, last) <= 0) { - - bl = btree_walk(sb, root, &key, NULL, &next_key, 0, seq, op); - - /* next seq walks can terminate in parents with old seqs */ - if (op == WALK_NEXT_SEQ && bl == ERR_PTR(-ENOENT)) { - key = next_key; - continue; - } - - if (IS_ERR(bl)) { - ret = PTR_ERR(bl); - break; - } - bt = scoutfs_block_data(bl); - - /* keep trying leaves until next_key passes last */ - pos = find_pos_after_seq(bt, &key, 0, seq, op); - if (pos >= le16_to_cpu(bt->nr_items)) { - key = next_key; - unlock_tree_block(sb, root, bl, false); - scoutfs_block_put(bl); - continue; - } - - item = pos_item(bt, pos); - if (scoutfs_key_cmp(&item->key, last) <= 0) { - *found = item->key; - if (found_seq) - *found_seq = le64_to_cpu(item->seq); - if (val) - ret = copy_to_val(val, item); - else - ret = 0; - } else { - ret = -ENOENT; - } - - unlock_tree_block(sb, root, bl, false); - scoutfs_block_put(bl); - break; - } - - trace_printk("next first "CKF" last "CKF" found "CKF" ret %d\n", - CKA(&start), CKA(last), CKA(found), ret); - return ret; -} - -int scoutfs_btree_next(struct super_block *sb, struct scoutfs_btree_root *root, - struct scoutfs_key *first, struct scoutfs_key *last, - struct scoutfs_key *found, - struct scoutfs_btree_val *val) -{ - trace_scoutfs_btree_next(sb, first, last); - - return btree_next(sb, root, first, last, 0, 0, found, NULL, val); -} - -int scoutfs_btree_since(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *first, struct scoutfs_key *last, - u64 seq, struct scoutfs_key *found, u64 *found_seq, - struct scoutfs_btree_val *val) -{ - trace_scoutfs_btree_since(sb, first, last); - - return btree_next(sb, root, first, last, seq, WALK_NEXT_SEQ, - found, found_seq, val); -} - -/* - * Find the greatest key that is >= first and <= last, starting at last. - * For each search cursor key we descend to the leaf and find its - * position in the items. 
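For the reverse direction, a minimal hypothetical caller could count the items in a range by walking backwards with scoutfs_btree_prev(); the helper name is made up, the key helpers come from the removed key.h, and passing NULL for found_seq and val simply skips those copies:

static int example_count_reverse(struct super_block *sb,
                                 struct scoutfs_btree_root *root,
                                 struct scoutfs_key *first,
                                 struct scoutfs_key *last, u64 *nr)
{
        struct scoutfs_key key = *last;
        struct scoutfs_key found;
        int ret;

        *nr = 0;

        for (;;) {
                ret = scoutfs_btree_prev(sb, root, first, &key, &found,
                                         NULL, NULL);
                if (ret == -ENOENT)
                        return 0;       /* nothing left at or after first */
                if (ret < 0)
                        return ret;

                (*nr)++;

                if (scoutfs_key_cmp(&found, first) == 0)
                        return 0;

                /* continue the search before the item we just counted */
                key = found;
                scoutfs_dec_key(&key);
        }
}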
The item binary search returns the position - * that the key would be inserted into, so if we didn't find the key - * specifically we go to the previous position. The btree walk gives us - * the previous key to search from if we fall off the front of the - * block. - * - * This doesn't support filtering the tree traversal by seqs. - */ -int scoutfs_btree_prev(struct super_block *sb, struct scoutfs_btree_root *root, - struct scoutfs_key *first, struct scoutfs_key *last, - struct scoutfs_key *found, u64 *found_seq, - struct scoutfs_btree_val *val) -{ - struct scoutfs_btree_item *item; - struct scoutfs_btree_block *bt; - struct scoutfs_key key = *last; - struct scoutfs_key prev_key; - struct scoutfs_block *bl; - int pos; - int cmp; - int ret; - - trace_scoutfs_btree_prev(sb, first, last); - - /* find the leaf that contains the next item after the key */ - ret = -ENOENT; - while (scoutfs_key_cmp(&key, first) >= 0) { - - bl = btree_walk(sb, root, &key, &prev_key, NULL, 0, 0, 0); - if (IS_ERR(bl)) { - ret = PTR_ERR(bl); - break; - } - bt = scoutfs_block_data(bl); - - pos = find_pos(bt, &key, &cmp); - - /* walk to the prev leaf if we hit the front of this leaf */ - if (pos == 0 && cmp != 0) { - unlock_tree_block(sb, root, bl, false); - scoutfs_block_put(bl); - if (scoutfs_key_is_zero(&key)) - break; - key = prev_key; - continue; - } - - /* we want the item before a non-matching position */ - if (pos && cmp) - pos--; - - /* return the item if it's still within our first bound */ - item = pos_item(bt, pos); - if (cmp == 0 || scoutfs_key_cmp(&item->key, first) >= 0) { - *found = item->key; - if (found_seq) - *found_seq = le64_to_cpu(item->seq); - if (val) - ret = copy_to_val(val, item); - else - ret = 0; - } - - unlock_tree_block(sb, root, bl, false); - scoutfs_block_put(bl); - break; - } - - return ret; -} - -/* - * Ensure that the blocks that lead to the item with the given key are - * dirty. caller can hold a transaction to pin the dirty blocks and - * guarantee that later updates of the item will succeed. - * - * <0 is returned on error, including -ENOENT if the key isn't present. - */ -int scoutfs_btree_dirty(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key) -{ - struct scoutfs_btree_block *bt; - struct scoutfs_block *bl; - int cmp; - int ret; - - trace_scoutfs_btree_dirty(sb, key, 0); - - bl = btree_walk(sb, root, key, NULL, NULL, 0, 0, WALK_DIRTY); - if (IS_ERR(bl)) - return PTR_ERR(bl); - bt = scoutfs_block_data(bl); - - find_pos(bt, key, &cmp); - if (cmp == 0) { - ret = 0; - } else { - ret = -ENOENT; - } - - unlock_tree_block(sb, root, bl, true); - scoutfs_block_put(bl); - - trace_printk("key "CKF" ret %d\n", CKA(key), ret); - - return ret; -} - -/* - * This is guaranteed not to fail if the caller has already dirtied the - * block that contains the item in the current transaction. - * - * 0 is returned on success. -EINVAL is returned if the caller's value - * length doesn't match the existing item's value length. - */ -int scoutfs_btree_update(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key, - struct scoutfs_btree_val *val) -{ - struct scoutfs_btree_item *item; - struct scoutfs_btree_block *bt; - struct scoutfs_block *bl; - int pos; - int cmp; - int ret; - - trace_scoutfs_btree_update(sb, key, - val ? 
scoutfs_btree_val_length(val) : 0); - - bl = btree_walk(sb, root, key, NULL, NULL, 0, 0, WALK_DIRTY); - if (IS_ERR(bl)) - return PTR_ERR(bl); - bt = scoutfs_block_data(bl); - - pos = find_pos(bt, key, &cmp); - if (cmp == 0) { - item = pos_item(bt, pos); - ret = copy_to_item(item, val); - if (ret == 0) - item->seq = bt->hdr.seq; - } else { - ret = -ENOENT; - } - - unlock_tree_block(sb, root, bl, true); - scoutfs_block_put(bl); - - return ret; -} - -/* - * Set hole to a missing key in the caller's range. - * - * 0 is returned if we find a missing key, -ENOSPC is returned if all - * the keys in the range are present in the tree, and -errno is returned - * if we saw an error. - * - * We try to find the first key in the range. If the next key is past - * the first key then we return the key before the found key. This will - * tend to let us find the hole with one btree search. - * - * We keep searching as long as we keep finding the first key and will - * return -ENOSPC if we fall off the end of the range doing so. - */ -int scoutfs_btree_hole(struct super_block *sb, struct scoutfs_btree_root *root, - struct scoutfs_key *first, - struct scoutfs_key *last, struct scoutfs_key *hole) -{ - struct scoutfs_key key = *first; - struct scoutfs_key found; - int ret; - - trace_scoutfs_btree_hole(sb, first, last); - - if (WARN_ON_ONCE(scoutfs_key_cmp(first, last) > 0)) { - scoutfs_key_set_zero(hole); - return -EINVAL; - } - - /* search as long as we keep finding our first key */ - do { - ret = scoutfs_btree_next(sb, root, &key, last, &found, NULL); - } while (ret == 0 && - scoutfs_key_cmp(&found, &key) == 0 && - (scoutfs_inc_key(&key), ret = -ENOSPC, - scoutfs_key_cmp(&key, last) <= 0)); - - if (ret == 0) { - *hole = found; - scoutfs_dec_key(hole); - } else if (ret == -ENOENT) { - *hole = *last; - ret = 0; - } - - trace_printk("first "CKF" last "CKF" hole "CKF" ret %d\n", - CKA(first), CKA(last), CKA(hole), ret); - - return ret; -} diff --git a/kmod/src/btree.h b/kmod/src/btree.h deleted file mode 100644 index dec2310c..00000000 --- a/kmod/src/btree.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef _SCOUTFS_BTREE_H_ -#define _SCOUTFS_BTREE_H_ - -#include - -struct scoutfs_btree_val { - struct kvec vec[3]; - unsigned int check_size_eq:1; - unsigned int check_size_lte:1; -}; - -static inline void __scoutfs_btree_init_val(struct scoutfs_btree_val *val, - void *ptr0, unsigned int len0, - void *ptr1, unsigned int len1, - void *ptr2, unsigned int len2) -{ - *val = (struct scoutfs_btree_val) { - { { ptr0, len0 }, { ptr1, len1 }, { ptr2, len2 } } - }; -} - -#define _scoutfs_btree_init_val(v, p0, l0, p1, l1, p2, l2, ...) \ - __scoutfs_btree_init_val(v, p0, l0, p1, l1, p2, l2) - -/* - * Provide a nice variadic initialization function without having to - * iterate over the callers arg types. We play some macro games to pad - * out the callers ptr/len pairs to the full possible number. This will - * produce confusing errors if an odd number of arguments is given and - * the padded ptr/length types aren't compatible with the fixed - * arguments in the static inline. - */ -#define scoutfs_btree_init_val(val, ...) 
\ - _scoutfs_btree_init_val(val, __VA_ARGS__, NULL, 0, NULL, 0, NULL, 0) - -static inline int scoutfs_btree_val_length(struct scoutfs_btree_val *val) -{ - - return iov_length((struct iovec *)val->vec, ARRAY_SIZE(val->vec)); -} - -int scoutfs_btree_lookup(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key, - struct scoutfs_btree_val *val); -int scoutfs_btree_insert(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key, - struct scoutfs_btree_val *val); -int scoutfs_btree_delete(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key); -int scoutfs_btree_next(struct super_block *sb, struct scoutfs_btree_root *root, - struct scoutfs_key *first, struct scoutfs_key *last, - struct scoutfs_key *found, - struct scoutfs_btree_val *val); -int scoutfs_btree_prev(struct super_block *sb, struct scoutfs_btree_root *root, - struct scoutfs_key *first, struct scoutfs_key *last, - struct scoutfs_key *found, u64 *found_seq, - struct scoutfs_btree_val *val); -int scoutfs_btree_dirty(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key); -int scoutfs_btree_update(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *key, - struct scoutfs_btree_val *val); -int scoutfs_btree_hole(struct super_block *sb, struct scoutfs_btree_root *root, - struct scoutfs_key *first, - struct scoutfs_key *last, struct scoutfs_key *hole); -int scoutfs_btree_since(struct super_block *sb, - struct scoutfs_btree_root *root, - struct scoutfs_key *first, struct scoutfs_key *last, - u64 seq, struct scoutfs_key *found, u64 *found_seq, - struct scoutfs_btree_val *val); - -#endif diff --git a/kmod/src/buddy.c b/kmod/src/buddy.c deleted file mode 100644 index 9f4a16fd..00000000 --- a/kmod/src/buddy.c +++ /dev/null @@ -1,1063 +0,0 @@ -/* - * Copyright (C) 2016 Versity Software, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#include -#include -#include - -#include "super.h" -#include "format.h" -#include "block.h" -#include "buddy.h" -#include "scoutfs_trace.h" - -/* - * scoutfs uses buddy bitmaps in an augmented radix to index free space. - * - * At the heart of the allocator are the buddy bitmaps in the radix - * leaves. For a given region of blocks there are bitmaps for each - * power of two order of blocks that can be allocated. N bits record - * whether each order 0 size block region is allocated or freed, then - * N/2 bits describe order 1 regions that span pairs of order 0 blocks, - * and so on. This ends up using two bits in the bitmaps for each - * device block that's managed. - * - * An order bit is set when it is free. All of its lower order bits - * will be clear. To allocate we clear a bit. A partial allocation - * clears the higher order bit and each buddy for each lower order until - * the allocated order. Freeing sets an order bit. Then if it's buddy - * order is also set we clear both and set their higher order bit. This - * proceeds to the highest order. - * - * Each buddy block records the first set bit in each order bitmap. 
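The per-order layout this implies can be sketched in userspace; N and ORDERS below are stand-ins for SCOUTFS_BUDDY_ORDER0_BITS and SCOUTFS_BUDDY_ORDERS rather than the real format constants, and example_order_off() mirrors the order_off() helper further down in this file:

#include <stdio.h>

#define N       1024    /* stands in for SCOUTFS_BUDDY_ORDER0_BITS */
#define ORDERS  11      /* stands in for SCOUTFS_BUDDY_ORDERS */

/* mirrors order_off(): order 0 starts at bit 0, each higher order
 * starts where the previous, half-sized bitmap ends */
static int example_order_off(int order)
{
        if (order == 0)
                return 0;
        return (2 * N) - (N / (1 << (order - 1)));
}

int main(void)
{
        int total = 0;
        int order;

        for (order = 0; order < ORDERS; order++) {
                int nbits = N >> order;

                printf("order %2d: bits [%4d, %4d)\n", order,
                       example_order_off(order),
                       example_order_off(order) + nbits);
                total += nbits;
        }
        /* 2 * N - 1 bits in all: the two bits per managed block noted above */
        printf("total %d\n", total);
        return 0;
}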
As - * bits are set they update these first set records if they're before - * the previous value. As bits are cleared we find the next set if it - * was the first. - * - * These buddy bitmap blocks that each fully describe a region of blocks - * are assembled into a radix tree. Each reference to a leaf block in - * parent blocks have a bitmap of the orders that are free in its leaf - * block. The parent blocks then also record the first slot that has - * each order bit set in its child references. This indexing holds all - * the way to the root. This lets us quickly determine an order that - * will satisfy an allocation and descend to the leaf that contains the - * first free region of that order. - * - * These buddy blocks themselves are located in preallocated space. Each - * logical position in the tree occupies two blocks on the device. In - * each transaction we use the currently referenced block to cow into - * its partner. Since the block positions are calculated the block - * references only need a bit to specify which of the pair is being - * referenced. The number of blocks needed is precisely calculated by - * taking the number of leaf blocks needed to track the device blocks - * and dividing by the radix fanout until we have a single root block. - * - * Each aligned block allocation order is stored in a path down the - * radix to a leaf that's a function of the block offset. This lets us - * ensure that we can allocate or free a given allocation order by - * dirtying those blocks. If we've allocated an order in a transaction - * it can always be freed (or re-allocated) while the transaction holds - * the dirty buddy blocks. - * - * We use that property to ensure that frees of stable data don't - * satisfy allocation until the next transaction. When we free stable - * data we dirty the path to its position in the radix and record the - * free in an rbtree. We can then apply these frees as we commit the - * transaction. If the transaction fails we can undo the frees and let - * the file system carry on. We'll try to reapply the frees before the - * next transaction commits. The allocator never introduces - * unrecoverable errors. - * - * The radix isn't fully populated when it's created. mkfs only - * initializes the two paths down the tree that have partially - * initialized parent slots and leaf bitmaps. The path down the left - * spine has the initial file system blocks allocated. The path down - * the right spine can have partial parent slots and bits set in the - * leaf when device sizes aren't multiples of the leaf block bit count - * and radix fanout. The kernel then only has to initialize the rest of - * the buddy blocks blocks which have fully populated parent slots and - * leaf bitmaps. - * - * XXX - * - resize is going to be a thing. figure out that thing. 
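The "divide by the fanout until we have a single root block" sizing rule can be made concrete with a small userspace sketch; ORDER0_BITS and SLOTS here are illustrative stand-ins, not the real format constants:

#include <stdint.h>
#include <stdio.h>

#define ORDER0_BITS     16000ULL        /* stand-in */
#define SLOTS           500ULL          /* stand-in */

static uint64_t example_buddy_blocks(uint64_t device_blocks)
{
        uint64_t blocks = (device_blocks + ORDER0_BITS - 1) / ORDER0_BITS;
        uint64_t total = 0;

        for (;;) {
                total += blocks * 2;    /* each logical block is a cow pair */
                if (blocks == 1)
                        break;
                blocks = (blocks + SLOTS - 1) / SLOTS;
        }
        return total;
}

int main(void)
{
        uint64_t blocks = 1ULL << 28;   /* e.g. 2^28 managed blocks */

        printf("%llu buddy blocks\n",
               (unsigned long long)example_buddy_blocks(blocks));
        return 0;
}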
- */ - -struct buddy_info { - struct mutex mutex; - - atomic_t alloc_count; - struct rb_root pending_frees; - - /* max height given total blocks */ - u8 max_height; - /* the device blkno of the first block of a given level */ - u64 level_blkno[SCOUTFS_BUDDY_MAX_HEIGHT]; - /* blk divisor to find slot index at each level */ - u64 level_div[SCOUTFS_BUDDY_MAX_HEIGHT]; - - struct buddy_stack { - struct scoutfs_block *bl[SCOUTFS_BUDDY_MAX_HEIGHT]; - u16 sl[SCOUTFS_BUDDY_MAX_HEIGHT]; - int nr; - } stack; -}; - -/* the first device blkno covered by the buddy allocator */ -static u64 first_blkno(struct scoutfs_super_block *super) -{ - return SCOUTFS_BUDDY_BLKNO + le64_to_cpu(super->buddy_blocks); -} - -/* the last device blkno covered by the buddy allocator */ -static u64 last_blkno(struct scoutfs_super_block *super) -{ - return le64_to_cpu(super->total_blocks) - 1; -} - -/* the last relative blkno covered by the buddy allocator */ -static u64 last_blk(struct scoutfs_super_block *super) -{ - return last_blkno(super) - first_blkno(super); -} - -/* true when the device blkno is covered by the allocator */ -static bool device_blkno(struct scoutfs_super_block *super, u64 blkno) -{ - return blkno >= first_blkno(super) && blkno <= last_blkno(super); -} - -/* true when the device blkno is used for buddy blocks */ -static bool buddy_blkno(struct scoutfs_super_block *super, u64 blkno) -{ - return blkno < first_blkno(super); -} - -/* the order 0 bit offset in a buddy block of a given relative blk */ -static int buddy_bit(u64 blk) -{ - return do_div(blk, SCOUTFS_BUDDY_ORDER0_BITS); -} - -/* true if the rel blk could be the start of an allocation of the order */ -static bool valid_order(u64 blk, int order) -{ - return (buddy_bit(blk) & ((1 << order) - 1)) == 0; -} - -/* the block bit offset of the first bit of the given order's bitmap */ -static int order_off(int order) -{ - if (order == 0) - return 0; - - return (2 * SCOUTFS_BUDDY_ORDER0_BITS) - - (SCOUTFS_BUDDY_ORDER0_BITS / (1 << (order - 1))); -} - -/* the bit offset in the block bitmap of an order's bit */ -static int order_nr(int order, int nr) -{ - return order_off(order) + nr; -} - -static void stack_push(struct buddy_stack *sta, struct scoutfs_block *bl, - u16 sl) -{ - sta->bl[sta->nr] = bl; - sta->sl[sta->nr++] = sl; -} - -/* sl isn't returned because callers peek the leaf where sl is meaningless */ -static struct scoutfs_block *stack_peek(struct buddy_stack *sta) -{ - if (sta->nr) - return sta->bl[sta->nr - 1]; - - return NULL; -} - -static struct scoutfs_block *stack_pop(struct buddy_stack *sta, u16 *sl) -{ - if (sta->nr) { - *sl = sta->sl[--sta->nr]; - return sta->bl[sta->nr]; - } - - return NULL; -} - -/* update first_set if the caller set an earlier nr for the given order */ -static void set_order_nr(struct scoutfs_buddy_block *bud, int order, u16 nr) -{ - u16 first = le16_to_cpu(bud->first_set[order]); - - trace_printk("set level %u order %d nr %u first %u\n", - bud->level, order, nr, first); - - if (nr <= first) - bud->first_set[order] = cpu_to_le16(nr); -} - -/* find the next first set if the caller just cleared the current first_set */ -static void clear_order_nr(struct scoutfs_buddy_block *bud, int order, u16 nr) -{ - u16 first = le16_to_cpu(bud->first_set[order]); - int size; - int i; - - trace_printk("cleared level %u order %d nr %u first %u\n", - bud->level, order, nr, first); - - if (nr != first) - return; - - if (bud->level) { - for (i = nr + 1; i < SCOUTFS_BUDDY_SLOTS; i++) { - if (le16_to_cpu(bud->slots[i].free_orders) & - (1 << 
order)) - break; - } - if (i == SCOUTFS_BUDDY_SLOTS) - i = U16_MAX; - - } else { - size = order_off(order + 1); - i = find_next_bit_le(bud->bits, size, - order_nr(order, first) + 1); - if (i >= size) - i = U16_MAX; - else - i -= order_off(order); - } - - bud->first_set[order] = cpu_to_le16(i); - -} - -#define for_each_changed_bit(nr, bit, old, new, tmp) \ - for (tmp = old ^ new; \ - tmp && (nr = ffs(tmp) - 1, bit = 1 << nr, 1); \ - tmp ^= bit) - -/* - * Set a slot's free_orders value and update first_set for each order - * that it changes. Returns true of the slot's free_orders was changed. - */ -static bool set_slot_free_orders(struct scoutfs_buddy_block *bud, u16 sl, - u16 free_orders) -{ - u16 old = le16_to_cpu(bud->slots[sl].free_orders); - int order; - int tmp; - int bit; - - if (old == free_orders) - return false; - - for_each_changed_bit(order, bit, old, free_orders, tmp) { - if (old & bit) - clear_order_nr(bud, order, sl); - else - set_order_nr(bud, order, sl); - } - - bud->slots[sl].free_orders = cpu_to_le16(free_orders); - return true; -} - -/* - * The block at the top of the stack has changed its bits or slots and - * updated its first set. We propagate those changes up through - * free_orders in parents slots and their first_set up through the tree - * to free_orders in the root. We can stop when a block's first_set - * values don't change free_orders in their parent's slot. - */ -static void stack_cleanup(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - struct buddy_stack *sta = &binf->stack; - struct scoutfs_buddy_root *root = &sbi->super.buddy_root; - struct scoutfs_buddy_block *bud; - struct scoutfs_block *bl; - u16 free_orders = 0; - bool parent; - u16 sl; - int i; - - parent = false; - while ((bl = stack_pop(sta, &sl))) { - - bud = scoutfs_block_data(bl); - if (parent && !set_slot_free_orders(bud, sl, free_orders)) { - scoutfs_block_put(bl); - break; - } - - free_orders = 0; - for (i = 0; i < ARRAY_SIZE(bud->first_set); i++) { - if (bud->first_set[i] != cpu_to_le16(U16_MAX)) - free_orders |= 1 << i; - } - - scoutfs_block_put(bl); - parent = true; - } - - /* set root if we got that far */ - if (bl == NULL) - root->slot.free_orders = cpu_to_le16(free_orders); - - /* put any remaining blocks */ - while ((bl = stack_pop(sta, &sl))) - scoutfs_block_put(bl); - -} - -static int test_buddy_bit(struct scoutfs_buddy_block *bud, int order, int nr) -{ - return !!test_bit_le(order_nr(order, nr), bud->bits); -} - -static void set_buddy_bit(struct scoutfs_buddy_block *bud, int order, int nr) -{ - if (!test_and_set_bit_le(order_nr(order, nr), bud->bits)) - set_order_nr(bud, order, nr); -} - -static void clear_buddy_bit(struct scoutfs_buddy_block *bud, int order, int nr) -{ - if (test_and_clear_bit_le(order_nr(order, nr), bud->bits)) - clear_order_nr(bud, order, nr); -} - -/* - * mkfs always writes the paths down the sides of the radix that have - * partially populated blocks. We only have to initialize full blocks - * in the middle of the tree. 
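The for_each_changed_bit() macro defined a little earlier is easiest to see with a toy userspace run; the two free_orders masks below are arbitrary examples, and the macro is copied here only so the sketch builds on its own:

#include <stdio.h>
#include <strings.h>    /* ffs() */

#define for_each_changed_bit(nr, bit, old, new, tmp)            \
        for (tmp = old ^ new;                                   \
             tmp && (nr = ffs(tmp) - 1, bit = 1 << nr, 1);      \
             tmp ^= bit)

int main(void)
{
        unsigned int old = 0x0056;      /* orders 1, 2, 4, 6 free */
        unsigned int new = 0x0062;      /* orders 1, 5, 6 free */
        unsigned int tmp, bit;
        int nr;

        /* visits each order that flipped between free and allocated */
        for_each_changed_bit(nr, bit, old, new, tmp)
                printf("order %d %s\n", nr,
                       (new & bit) ? "became free" : "was allocated");
        return 0;
}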
- */ -static void init_buddy_block(struct buddy_info *binf, - struct scoutfs_super_block *super, - struct scoutfs_block *bl, int level) -{ - struct scoutfs_buddy_block *bud = scoutfs_block_data(bl); - u16 count; - int nr; - int i; - - scoutfs_block_zero(bl, sizeof(bud->hdr)); - - for (i = 0; i < ARRAY_SIZE(bud->first_set); i++) - bud->first_set[i] = cpu_to_le16(U16_MAX); - - bud->level = level; - - if (level) { - for (i = 0; i < SCOUTFS_BUDDY_SLOTS; i++) - set_slot_free_orders(bud, i, SCOUTFS_BUDDY_ORDER0_BITS); - } else { - /* ensure that there aren't multiple highest orders */ - BUILD_BUG_ON((SCOUTFS_BUDDY_ORDER0_BITS / - (1 << (SCOUTFS_BUDDY_ORDERS - 1))) > 1); - - count = SCOUTFS_BUDDY_ORDER0_BITS; - nr = 0; - for (i = SCOUTFS_BUDDY_ORDERS - 1; i >= 0; i--) { - if (count & (1 << i)) { - set_buddy_bit(bud, i, nr); - nr = (nr + 1) << 1; - } else { - nr <<= 1; - } - } - } -} - -/* - * Give the caller the block referenced by the given slot. They've - * calculated the blkno of the pair of blocks while walking the tree. - * The slot describes which of the pair its referencing. The caller is - * always going to modify the block so we always try and cow it. We - * construct a fake ref so we can re-use the block ref cow code. When - * we initialize the first use of a block we use the first of the pair. - */ -static struct scoutfs_block *get_buddy_block(struct super_block *sb, - struct scoutfs_buddy_slot *slot, - u64 blkno, int level) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - struct buddy_info *binf = sbi->buddy_info; - struct scoutfs_buddy_block *bud; - struct scoutfs_block_ref ref; - struct scoutfs_block *bl; - - trace_printk("getting block level %d blkno %llu slot seq %llu off %u\n", - level, blkno, le64_to_cpu(slot->seq), slot->blkno_off); - - /* init a new block for an unused slot */ - if (slot->seq == 0) { - bl = scoutfs_block_dirty(sb, blkno); - if (!IS_ERR(bl)) - init_buddy_block(binf, super, bl, level); - } else { - /* construct block ref from tree walk blkno and slot ref */ - ref.blkno = cpu_to_le64(blkno + slot->blkno_off); - ref.seq = slot->seq; - bl = scoutfs_block_dirty_ref(sb, &ref); - } - - if (!IS_ERR(bl)) { - bud = scoutfs_block_data(bl); - - /* rebuild slot ref to blkno */ - if (slot->seq != bud->hdr.seq) { - slot->blkno_off = le64_to_cpu(bud->hdr.blkno) - blkno; - /* alloc_same only xors low bit */ - BUG_ON(slot->blkno_off > 1); - slot->seq = bud->hdr.seq; - } - } - - return bl; -} - -/* - * Walk the buddy block radix to the leaf that contains either the given - * relative blk or the first free given order. The radix is of a fixed - * depth and we initialize new blocks as we descend through - * uninitialized refs. - * - * If order is -1 then we search for the blk. - * - * As we descend we calculate the base blk offset of the path we're - * taking down the tree. This is used to find the blkno of the next - * block relative to the blkno of the given level. It's then used by - * the caller to calculate the total blk offset by adding the bit they - * find in the block. - * - * The path through the tree is recorded in the stack in the buddy info. - * The caller is responsible for cleaning up the stack and must do so - * even if we return an error. 
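The blkno and slot arithmetic performed during this descent can be sketched in userspace for a three-level tree; the divisor and fanout values are stand-ins for the real format constants, and level_div is built the way scoutfs_buddy_setup() builds it later in this file:

#include <stdint.h>
#include <stdio.h>

#define ORDER0_BITS     16000ULL        /* stand-in */
#define SLOTS           500ULL          /* stand-in */

int main(void)
{
        uint64_t level_div[3] = { 0, ORDER0_BITS, ORDER0_BITS * SLOTS };
        uint64_t blk = 123456789ULL;    /* blk relative to first_blkno() */
        uint64_t base = 0;
        int level;

        for (level = 2; level > 0; level--) {
                uint64_t sl = blk / level_div[level];

                blk %= level_div[level];
                base = base * SLOTS + sl;

                /* the child pair lives at level_blkno[level - 1] + base * 2 */
                printf("level %d slot %llu pair index %llu\n", level,
                       (unsigned long long)sl,
                       (unsigned long long)(base * 2));
        }

        /* at the leaf, base * ORDER0_BITS + blk is the original blk */
        printf("leaf bit %llu\n", (unsigned long long)blk);
        return 0;
}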
- */ -static int buddy_walk(struct super_block *sb, u64 blk, int order, u64 *base) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - struct buddy_info *binf = sbi->buddy_info; - struct buddy_stack *sta = &binf->stack; - struct scoutfs_buddy_root *root = &sbi->super.buddy_root; - struct scoutfs_buddy_block *bud; - struct scoutfs_buddy_slot *slot; - struct scoutfs_block *bl; - u64 blkno; - int level; - int ret = 0; - int sl = 0; - - /* XXX corruption? */ - if (blk > last_blk(super) || root->height == 0 || - root->height > SCOUTFS_BUDDY_MAX_HEIGHT) - return -EIO; - - slot = &root->slot; - level = root->height; - blkno = SCOUTFS_BUDDY_BLKNO; - *base = 0; - - while (level--) { - /* XXX do base and level make sense here? */ - bl = get_buddy_block(sb, slot, blkno, level); - if (IS_ERR(bl)) { - ret = PTR_ERR(bl); - break; - } - - trace_printk("before blk %llu order %d level %d blkno %llu base %llu sl %d\n", - blk, order, level, blkno, *base, sl); - - bud = scoutfs_block_data(bl); - - if (level) { - if (order >= 0) { - /* find first slot with order free */ - sl = le16_to_cpu(bud->first_set[order]); - /* XXX corruption */ - if (sl == U16_MAX) { - scoutfs_block_put(bl); - ret = -EIO; - break; - } - } else { - /* find slot based on blk */ - sl = div64_u64_rem(blk, binf->level_div[level], - &blk); - } - - /* shouldn't be sl * 2, right? */ - *base = (*base * SCOUTFS_BUDDY_SLOTS) + sl; - /* this is the only place we * 2 */ - blkno = binf->level_blkno[level - 1] + (*base * 2); - slot = &bud->slots[sl]; - } else { - *base *= SCOUTFS_BUDDY_ORDER0_BITS; - /* sl in stack is 0 for final leaf block */ - sl = 0; - } - - trace_printk("after blk %llu order %d level %d blkno %llu base %llu sl %d\n", - blk, order, level, blkno, *base, sl); - - - stack_push(sta, bl, sl); - } - - trace_printk("walking ret %d\n", ret); - - return ret; -} - -/* - * Find the order to search for to allocate a requested order. We try - * to use the smallest greater or equal order and then the largest - * smaller order. - */ -static int find_free_order(struct scoutfs_buddy_root *root, int order) -{ - u16 free = le16_to_cpu(root->slot.free_orders); - u16 smaller_mask = (1 << order) - 1; - u16 larger = free & ~smaller_mask; - u16 smaller = free & smaller_mask; - - if (larger) - return ffs(larger) - 1; - if (smaller) - return fls(smaller) - 1; - - return -ENOSPC; -} - -/* - * Walk to the leaf that contains the found order and allocate a region - * of the given order, returning the relative blk to the caller. 
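The buddy split that this allocation performs when the first free region is larger than requested can be modelled with a toy bitmap; the sizes and starting state below are invented purely for illustration:

#include <stdio.h>

#define ORDERS  5
#define BITS    16      /* order-0 bits in the toy leaf */

static unsigned char bits[ORDERS][BITS];        /* 1 == free */

int main(void)
{
        int order = 0;          /* caller asked for a single block */
        int found = 3;          /* first free region is order 3: blocks 0-7 */
        int blk = 0;            /* relative blk of that region */
        int nr;
        int i;

        bits[found][blk >> found] = 1;          /* starting state */

        bits[found][blk >> found] = 0;          /* clear_buddy_bit(): take it */

        nr = blk >> order;
        for (i = order; i < found; i++) {       /* free the buddies between */
                bits[i][nr ^ 1] = 1;            /* set_buddy_bit() */
                nr >>= 1;
        }

        /* block 0 is allocated; order 0 bit 1, order 1 bit 1 and order 2
         * bit 1 (blocks 1, 2-3 and 4-7) are left free */
        for (i = 0; i < ORDERS; i++) {
                printf("order %d:", i);
                for (nr = 0; nr < (BITS >> i); nr++)
                        printf(" %d", bits[i][nr]);
                printf("\n");
        }
        return 0;
}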
- */ -static int buddy_alloc(struct super_block *sb, u64 *blk, int order, int found) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - struct buddy_stack *sta = &binf->stack; - struct scoutfs_buddy_block *bud; - struct scoutfs_block *bl; - u64 base; - int ret; - int nr; - int i; - - trace_printk("alloc order %d found %d\n", order, found); - - if (WARN_ON_ONCE(found >= 0 && order > found)) - return -EINVAL; - - ret = buddy_walk(sb, *blk, found, &base); - if (ret) - goto out; - - bl = stack_peek(sta); - bud = scoutfs_block_data(bl); - - if (found >= 0) { - nr = le16_to_cpu(bud->first_set[found]); - /* XXX corruption */ - if (nr == U16_MAX) { - ret = -EIO; - goto out; - } - - /* give caller the found blk for the order */ - *blk = base + (nr << found); - } else { - nr = buddy_bit(*blk) >> found; - } - - /* always allocate the higher or equal found order */ - clear_buddy_bit(bud, found, nr); - - /* and maybe free our buddies between smaller order and larger found */ - nr = buddy_bit(*blk) >> order; - for (i = order; i < found; i++) { - set_buddy_bit(bud, i, nr ^ 1); - nr >>= 1; - } - - ret = 0; -out: - trace_printk("alloc order %d found %d blk %llu ret %d\n", - order, found, *blk, ret); - stack_cleanup(sb); - return ret; -} - -/* - * Free a given order by setting its order bit. If the order's buddy - * isn't set then it isn't free and we can't merge so we set our order - * and are done. If the buddy is free then we can clear it and ascend - * up to try and set the next higher order. That performs the same - * buddy merging test. Eventually we make it to the highest order which - * doesn't have a buddy so we can always set it. - * - * As we're freeing orders in the final buddy bitmap that only partially - * covers the end of the device we might try to test buddies which are - * past the end of the device. The test will still fall within the leaf - * block bitmap and those bits past the device will never be set so we - * will fail the merge and correctly set the orders free. - */ -static int buddy_free(struct super_block *sb, u64 blk, int order) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - struct buddy_stack *sta = &binf->stack; - struct scoutfs_buddy_block *bud; - struct scoutfs_block *bl; - u64 unused; - int ret; - int nr; - int i; - - ret = buddy_walk(sb, blk, -1, &unused); - if (ret) - goto out; - - bl = stack_peek(sta); - bud = scoutfs_block_data(bl); - - nr = buddy_bit(blk) >> order; - for (i = order; i < SCOUTFS_BUDDY_ORDERS - 2; i++) { - - if (!test_buddy_bit(bud, i, nr ^ 1)) - break; - - clear_buddy_bit(bud, i, nr ^ 1); - nr >>= 1; - } - - set_buddy_bit(bud, i, nr); - - ret = 0; -out: - stack_cleanup(sb); - return ret; -} - -/* - * Try to allocate an extent with the size number of blocks. blkno is - * set to the start of the extent and the order of the block count is - * returned. 
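A hypothetical caller that needs a run of blocks that isn't a power of two could combine this with scoutfs_buddy_free_extent(), defined further down, to hand back the unused tail in the same transaction; example_alloc_extent() is not code from this tree, only a sketch of the calling convention:

static int example_alloc_extent(struct super_block *sb, u64 count,
                                u64 *blkno_ret, u64 *count_ret)
{
        int order = count > 1 ? fls64(count - 1) : 0;
        u64 blkno;
        u64 got;
        int ret;

        ret = scoutfs_buddy_alloc(sb, &blkno, order);
        if (ret < 0)
                return ret;

        /* ret is the order that was actually allocated */
        got = 1ULL << ret;
        if (got > count) {
                /* give the unused tail back within this transaction */
                scoutfs_buddy_free_extent(sb, blkno + count, got - count);
                got = count;
        }

        *blkno_ret = blkno;
        *count_ret = got;
        return 0;
}

Because the allocator may return a smaller order than asked for, such a caller also has to be prepared for got to come back smaller than count and loop for the remainder.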
- */ -int scoutfs_buddy_alloc(struct super_block *sb, u64 *blkno, int order) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - struct buddy_info *binf = sbi->buddy_info; - int found; - u64 blk; - int ret; - - trace_printk("order %d\n", order); - - mutex_lock(&binf->mutex); - - found = find_free_order(&super->buddy_root, order); - if (found < 0) { - ret = found; - goto out; - } - - if (found < order) - order = found; - - blk = 0; - ret = buddy_alloc(sb, &blk, order, found); - if (ret) - goto out; - - *blkno = first_blkno(super) + blk; - le64_add_cpu(&super->free_blocks, -(1ULL << order)); - atomic_add((1ULL << order), &binf->alloc_count); - ret = order; - -out: - trace_printk("blkno %llu order %d ret %d\n", *blkno, order, ret); - mutex_unlock(&binf->mutex); - return ret; -} - -/* - * We use the block _ref() routines to dirty existing blocks to reuse - * all the block verification and cow machinery. During cow this is - * called to allocate a new blkno to cow an existing buddy block. We - * use the existing blkno to see if we have to return the other mirrored - * buddy blkno or do a real allocation for every other kind of block - * being cowed. - */ -int scoutfs_buddy_alloc_same(struct super_block *sb, u64 *blkno, u64 existing) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - - if (buddy_blkno(super, existing)) { - *blkno = existing ^ 1; - trace_printk("existing %llu ret blkno %llu\n", - existing, *blkno); - return 0; - } - - return scoutfs_buddy_alloc(sb, blkno, 0); -} - -struct extent_node { - struct rb_node node; - u64 start; - u64 len; -}; - -static int add_enode_extent(struct rb_root *root, u64 start, u64 len) -{ - struct rb_node **node = &root->rb_node; - struct rb_node *parent = NULL; - struct extent_node *left = NULL; - struct extent_node *right = NULL; - struct extent_node *enode; - - trace_printk("adding enode [%llu,%llu]\n", start, len); - - while (*node && !(left && right)) { - parent = *node; - enode = container_of(*node, struct extent_node, node); - - if (start < enode->start) { - if (!right && start + len == enode->start) - right = enode; - node = &(*node)->rb_left; - } else { - if (!left && enode->start + enode->len == start) - left = enode; - node = &(*node)->rb_right; - } - } - - if (right) { - right->start = start; - right->len += len; - trace_printk("right now [%llu, %llu]\n", - right->start, right->len); - } - - if (left) { - if (right) { - left->len += right->len; - rb_erase(&right->node, root); - kfree(right); - } else { - left->len += len; - } - trace_printk("left now [%llu, %llu]\n", left->start, left->len); - } - - if (left || right) - return 0; - - enode = kmalloc(sizeof(struct extent_node), GFP_NOFS); - if (!enode) - return -ENOMEM; - - enode->start = start; - enode->len = len; - - trace_printk("inserted new [%llu, %llu]\n", enode->start, enode->len); - - rb_link_node(&enode->node, parent, node); - rb_insert_color(&enode->node, root); - - return 0; -} - -static void destroy_pending_frees(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - struct extent_node *enode; - struct rb_node *node; - - for (node = rb_first(&binf->pending_frees); node;) { - enode = rb_entry(node, struct extent_node, node); - node = rb_next(node); - - rb_erase(&enode->node, &binf->pending_frees); - kfree(enode); - } -} - -/* XXX this should be generic */ -#define min3_t(t, a, b, c) min3((t)(a), (t)(b), (t)(c)) - -/* 
- * Allocate or free all the orders that make up a given arbitrary block - * extent. Today this is used by callers who know that the blocks for - * the extent have already been pinned so we BUG on error. - */ -static void apply_extent(struct super_block *sb, bool alloc, u64 blk, u64 len) -{ - unsigned int blk_order; - unsigned int blk_bit; - unsigned int size; - int order; - int ret; - - trace_printk("applying extent blk %llu len %llu\n", blk, len); - - while (len) { - /* buddy bit might be 0, len always has a bit set */ - blk_bit = buddy_bit(blk); - blk_order = blk_bit ? ffs(blk_bit) - 1 : 0; - order = min3_t(int, blk_order, fls64(len) - 1, - SCOUTFS_BUDDY_ORDERS - 1); - size = 1 << order; - - trace_printk("applying blk %llu order %d\n", blk, order); - - if (alloc) - ret = buddy_alloc(sb, &blk, order, -1); - else - ret = buddy_free(sb, blk, order); - BUG_ON(ret); - - blk += size; - len -= size; - } -} - -/* - * The pending rbtree has recorded frees of stable data that we had to - * wait until transaction commit to record. Once these are tracked in - * the allocator we can't use the allocator until the commit succeeds. - * This is called by transaction commit to get these pending frees into - * the current commit. If it fails they pull them back out. - */ -int scoutfs_buddy_apply_pending(struct super_block *sb, bool alloc) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - struct extent_node *enode; - struct rb_node *node; - - for (node = rb_first(&binf->pending_frees); node;) { - enode = rb_entry(node, struct extent_node, node); - node = rb_next(node); - - apply_extent(sb, alloc, enode->start, enode->len); - } - - return 0; -} - -/* - * Free a given allocated extent. The seq tells us which transaction - * first allocated the extent. If it was allocated in this transaction - * then we can return it to the free buddy and that must succeed. - * - * If it was allocated in a previous transaction then we dirty the - * blocks it will take to free it then record it in an rbtree. The - * rbtree entries are replayed into the dirty blocks as the transaction - * commits. - * - * Buddy block numbers are preallocated and calculated from the radix - * tree structure so we can ignore the block layer's calls to free buddy - * blocks during cow. - */ -int scoutfs_buddy_free(struct super_block *sb, __le64 seq, u64 blkno, int order) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - struct buddy_info *binf = sbi->buddy_info; - u64 unused; - u64 blk; - int ret; - - trace_printk("seq %llu blkno %llu order %d rsv %u\n", - le64_to_cpu(seq), blkno, order, buddy_blkno(super, blkno)); - - /* no specific free tracking for buddy blocks */ - if (buddy_blkno(super, blkno)) - return 0; - - /* XXX corruption? */ - if (!device_blkno(super, blkno)) - return -EINVAL; - - blk = blkno - first_blkno(super); - - if (!valid_order(blk, order)) - return -EINVAL; - - mutex_lock(&binf->mutex); - - if (seq == super->hdr.seq) { - ret = buddy_free(sb, blk, order); - /* - * If this order was allocated in this transaction then its - * blocks should be pinned and we should always be able - * to free it. 
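The greedy decomposition performed by apply_extent() above is easy to reproduce in userspace; this sketch uses the relative blk directly for the alignment test, where the real code uses the blk's offset within its leaf bitmap, and MAX_ORDER stands in for SCOUTFS_BUDDY_ORDERS - 1:

#include <stdint.h>
#include <stdio.h>

#define MAX_ORDER 8     /* stands in for SCOUTFS_BUDDY_ORDERS - 1 */

int main(void)
{
        uint64_t blk = 3;       /* start of the extent, relative blk */
        uint64_t len = 13;      /* extent length in blocks */

        while (len) {
                /* largest order aligned at blk, 0 if blk is 0 */
                int align = blk ? __builtin_ffsll(blk) - 1 : 0;
                /* largest order that still fits in len */
                int fit = 63 - __builtin_clzll(len);
                int order = align < fit ? align : fit;

                if (order > MAX_ORDER)
                        order = MAX_ORDER;

                printf("blk %llu order %d (%llu blocks)\n",
                       (unsigned long long)blk, order,
                       (unsigned long long)(1ULL << order));

                blk += 1ULL << order;
                len -= 1ULL << order;
        }
        return 0;
}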
- */ - BUG_ON(ret); - } else { - ret = buddy_walk(sb, blk, -1, &unused) ?: - add_enode_extent(&binf->pending_frees, blk, 1 << order); - if (ret == 0) - trace_printk("added blk %llu order %d\n", blk, order); - stack_cleanup(sb); - } - - if (ret == 0) - le64_add_cpu(&super->free_blocks, 1ULL << order); - - mutex_unlock(&binf->mutex); - - return ret; -} - -/* - * This is current only used to return partial extents from larger - * allocations in this transaction. - */ -void scoutfs_buddy_free_extent(struct super_block *sb, u64 blkno, u64 count) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - struct scoutfs_super_block *super = &sbi->stable_super; - u64 blk; - - BUG_ON(!device_blkno(super, blkno)); - - blk = blkno - first_blkno(super); - - mutex_lock(&binf->mutex); - - apply_extent(sb, false, blkno - first_blkno(super), count); - le64_add_cpu(&super->free_blocks, count); - - mutex_unlock(&binf->mutex); -} - -/* - * Return the number of block allocations since the last time the - * counter was reset. This count doesn't include dirty buddy blocks. - */ -unsigned int scoutfs_buddy_alloc_count(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - - return atomic_read(&binf->alloc_count); -} - -u64 scoutfs_buddy_bfree(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - struct scoutfs_super_block *super = &sbi->super; - u64 ret; - - mutex_lock(&binf->mutex); - ret = le64_to_cpu(super->free_blocks); - mutex_unlock(&binf->mutex); - - return ret; -} - -void scoutfs_buddy_committed(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - - atomic_set(&binf->alloc_count, 0); - destroy_pending_frees(sb); -} - -int scoutfs_buddy_setup(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct scoutfs_super_block *super = &sbi->super; - struct buddy_info *binf = sbi->buddy_info; - u64 level_blocks[SCOUTFS_BUDDY_MAX_HEIGHT]; - u64 blocks; - int i; - - /* first bit offsets in blocks are __le16 */ - BUILD_BUG_ON(SCOUTFS_BUDDY_ORDER0_BITS >= U16_MAX); - - /* bits need to be naturally aligned to long for _le bitops */ - BUILD_BUG_ON(offsetof(struct scoutfs_buddy_block, bits) & - (sizeof(long) - 1)); - - binf = kzalloc(sizeof(struct buddy_info), GFP_KERNEL); - if (!binf) - return -ENOMEM; - sbi->buddy_info = binf; - - mutex_init(&binf->mutex); - atomic_set(&binf->alloc_count, 0); - binf->pending_frees = RB_ROOT; - - /* calculate blocks at each level */ - blocks = DIV_ROUND_UP_ULL(last_blk(super) + 1, - SCOUTFS_BUDDY_ORDER0_BITS); - for (i = 0; i < SCOUTFS_BUDDY_MAX_HEIGHT; i++) { - level_blocks[i] = (blocks * 2); - if (blocks == 1) { - binf->max_height = i + 1; - break; - } - blocks = DIV_ROUND_UP_ULL(blocks, SCOUTFS_BUDDY_SLOTS); - } - - /* calculate device blkno of first block in each level */ - binf->level_blkno[binf->max_height - 1] = SCOUTFS_BUDDY_BLKNO; - for (i = (binf->max_height - 2); i >= 0; i--) { - binf->level_blkno[i] = binf->level_blkno[i + 1] + - level_blocks[i + 1]; - } - - /* calculate blk divisor to find slot at a given level */ - binf->level_div[1] = SCOUTFS_BUDDY_ORDER0_BITS; - for (i = 2; i < binf->max_height; i++) { - binf->level_div[i] = binf->level_div[i - 1] * - SCOUTFS_BUDDY_SLOTS; - } - - for (i = 0; i < binf->max_height; i++) - trace_printk("level %d div %llu blkno %llu blocks %llu\n", - i, binf->level_div[i], 
binf->level_blkno[i], - level_blocks[i]); - - return 0; -} - -void scoutfs_buddy_destroy(struct super_block *sb) -{ - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct buddy_info *binf = sbi->buddy_info; - - if (binf) - WARN_ON_ONCE(!RB_EMPTY_ROOT(&binf->pending_frees)); - kfree(binf); -} - diff --git a/kmod/src/buddy.h b/kmod/src/buddy.h deleted file mode 100644 index 24c0ed0c..00000000 --- a/kmod/src/buddy.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _SCOUTFS_BUDDY_H_ -#define _SCOUTFS_BUDDY_H_ - -int scoutfs_buddy_alloc(struct super_block *sb, u64 *blkno, int order); -int scoutfs_buddy_alloc_same(struct super_block *sb, u64 *blkno, u64 existing); -int scoutfs_buddy_free(struct super_block *sb, __le64 seq, u64 blkno, - int order); -void scoutfs_buddy_free_extent(struct super_block *sb, u64 blkno, u64 count); - -int scoutfs_buddy_was_free(struct super_block *sb, u64 blkno, int order); -u64 scoutfs_buddy_bfree(struct super_block *sb); - -unsigned int scoutfs_buddy_alloc_count(struct super_block *sb); -int scoutfs_buddy_apply_pending(struct super_block *sb, bool alloc); -void scoutfs_buddy_committed(struct super_block *sb); - -int scoutfs_buddy_setup(struct super_block *sb); -void scoutfs_buddy_destroy(struct super_block *sb); - -#endif diff --git a/kmod/src/counters.h b/kmod/src/counters.h index 137ebbae..c9d081a6 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -14,8 +14,6 @@ #define EXPAND_EACH_COUNTER \ EXPAND_COUNTER(alloc_alloc) \ EXPAND_COUNTER(alloc_free) \ - EXPAND_COUNTER(block_mem_alloc) \ - EXPAND_COUNTER(block_mem_free) \ EXPAND_COUNTER(seg_lru_shrink) \ EXPAND_COUNTER(trans_level0_seg_write) \ EXPAND_COUNTER(manifest_compact_migrate) \ diff --git a/kmod/src/crc.c b/kmod/src/crc.c deleted file mode 100644 index cde9a1ae..00000000 --- a/kmod/src/crc.c +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (C) 2015 Versity Software, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- */ -#include -#include - -#include "format.h" -#include "crc.h" - -u32 scoutfs_crc_block(struct scoutfs_block_header *hdr) -{ - return crc32c(~0, (char *)hdr + sizeof(hdr->crc), - SCOUTFS_BLOCK_SIZE - sizeof(hdr->crc)); -} diff --git a/kmod/src/crc.h b/kmod/src/crc.h deleted file mode 100644 index 7f1fbf56..00000000 --- a/kmod/src/crc.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _SCOUTFS_CRC_H_ -#define _SCOUTFS_CRC_H_ - -u32 scoutfs_crc_block(struct scoutfs_block_header *hdr); - -#endif diff --git a/kmod/src/dir.c b/kmod/src/dir.c index 0d5f0bb2..79b75dcc 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -23,9 +23,7 @@ #include "inode.h" #include "key.h" #include "super.h" -#include "btree.h" #include "trans.h" -#include "name.h" #include "xattr.h" #include "kvec.h" #include "item.h" diff --git a/kmod/src/format.h b/kmod/src/format.h index a3784bcb..b877d4d8 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -35,9 +35,6 @@ */ #define SCOUTFS_SUPER_BLKNO ((64 * 1024) >> SCOUTFS_BLOCK_SHIFT) #define SCOUTFS_SUPER_NR 2 -#define SCOUTFS_BUDDY_BLKNO (SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR) - -#define SCOUTFS_MAX_TRANS_BLOCKS (128 * 1024 * 1024 / SCOUTFS_BLOCK_SIZE) /* * This header is found at the start of every block so that we can @@ -161,70 +158,6 @@ struct scoutfs_segment_block { /* packed vals */ } __packed; -/* - * Block references include the sequence number so that we can detect - * readers racing with writers and so that we can tell that we don't - * need to follow a reference when traversing based on seqs. - */ -struct scoutfs_block_ref { - __le64 blkno; - __le64 seq; -} __packed; - -/* - * If the block was full of bits the largest possible order would be - * the block size shift + 3 (BITS_PER_BYTE). But the header uses - * up some space and then the buddy bits mean two bits per block. - * Then +1 for this being the number, not the greatest order. - */ -#define SCOUTFS_BUDDY_ORDERS (SCOUTFS_BLOCK_SHIFT + 3 - 2 + 1) - -struct scoutfs_buddy_block { - struct scoutfs_block_header hdr; - __le16 first_set[SCOUTFS_BUDDY_ORDERS]; - __u8 level; - __u8 __pad[3]; /* naturally align bits */ - union { - struct scoutfs_buddy_slot { - __le64 seq; - __le16 free_orders; - /* XXX seems like we could hide a bit somewhere */ - __u8 blkno_off; - } __packed slots[0]; - __le64 bits[0]; - } __packed; -} __packed; - -/* - * Each buddy leaf block references order 0 blocks with half of its - * bitmap. The other half of the bits are used for the higher order - * bits. - */ -#define SCOUTFS_BUDDY_ORDER0_BITS \ - (((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_buddy_block)) * 8) / 2) - -#define SCOUTFS_BUDDY_SLOTS \ - ((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_buddy_block)) / \ - sizeof(struct scoutfs_buddy_slot)) - -struct scoutfs_buddy_root { - struct scoutfs_buddy_slot slot; - __u8 height; -} __packed; - -/* ((SCOUTFS_BUDDY_SLOTS^5) * SCOUTFS_BUDDY_ORDER0_BITS) > 2^52 */ -#define SCOUTFS_BUDDY_MAX_HEIGHT 6 - -/* - * We should be able to make the offset smaller if neither dirents nor - * data items use the full 64 bits. - */ -struct scoutfs_key { - __le64 inode; - u8 type; - __le64 offset; -} __packed; - /* * Currently we sort keys by the numeric value of the types, but that * isn't necessary. We could have an arbitrary sort order. 
So we don't @@ -241,8 +174,6 @@ struct scoutfs_key { #define SCOUTFS_DATA_KEY 11 #define SCOUTFS_MAX_UNUSED_KEY 255 -#define SCOUTFS_MAX_ITEM_LEN 512 - /* value is struct scoutfs_inode */ struct scoutfs_inode_key { __u8 type; @@ -307,66 +238,9 @@ struct scoutfs_symlink_key { __be64 ino; } __packed; -struct scoutfs_btree_root { - u8 height; - struct scoutfs_block_ref ref; -} __packed; - -/* - * @free_end: records the byte offset of the first byte after the free - * space in the block between the header and the first item. New items - * are allocated by subtracting the space they need. - * - * @free_reclaim: records the number of bytes of free space amongst the - * items after free_end. If a block is compacted then this much new - * free space would be reclaimed. - */ -struct scoutfs_btree_block { - struct scoutfs_block_header hdr; - __le16 free_end; - __le16 free_reclaim; - __le16 nr_items; - __le16 item_offs[0]; -} __packed; - -/* - * The item sequence number is set to the dirty block's sequence number - * when the item is modified. It is not changed by splits or merges. - */ -struct scoutfs_btree_item { - struct scoutfs_key key; - __le64 seq; - __le16 val_len; - char val[0]; -} __packed; - -/* Blocks are no more than half free. */ -#define SCOUTFS_BTREE_FREE_LIMIT \ - ((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) / 2) - /* XXX does this exist upstream somewhere? */ #define member_sizeof(TYPE, MEMBER) (sizeof(((TYPE *)0)->MEMBER)) -#define SCOUTFS_BTREE_MAX_ITEMS \ - ((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) / \ - (member_sizeof(struct scoutfs_btree_block, item_offs[0]) + \ - sizeof(struct scoutfs_btree_item))) - -/* - * We can calculate the max tree depth by calculating how many leaf - * blocks the tree could reference. The block device can only reference - * 2^64 bytes. The tallest parent tree has half full parent blocks. - * - * So we have the relation: - * - * ceil(max_items / 2) ^ (max_depth - 1) >= 2^64 / block_size - * - * and solve for depth: - * - * max_depth = log(ceil(max_items / 2), 2^64 / block_size) + 1 - */ -#define SCOUTFS_BTREE_MAX_DEPTH 10 - #define SCOUTFS_UUID_BYTES 16 /* @@ -382,16 +256,11 @@ struct scoutfs_super_block { __le64 alloc_uninit; __le64 total_segs; __le64 free_segs; - __le64 total_blocks; - __le64 free_blocks; __le64 ring_blkno; __le64 ring_blocks; __le64 ring_tail_block; __le64 ring_gen; __le64 next_seg_seq; - __le64 buddy_blocks; - struct scoutfs_buddy_root buddy_root; - struct scoutfs_btree_root btree_root; struct scoutfs_treap_root alloc_treap_root; struct scoutfs_manifest manifest; } __packed; @@ -418,7 +287,6 @@ struct scoutfs_timespec { struct scoutfs_inode { __le64 size; __le64 blocks; - __le64 link_counter; __le64 data_version; __le64 next_readdir_pos; __le32 nlink; @@ -426,7 +294,6 @@ struct scoutfs_inode { __le32 gid; __le32 mode; __le32 rdev; - __le32 salt; struct scoutfs_timespec atime; struct scoutfs_timespec ctime; struct scoutfs_timespec mtime; @@ -449,20 +316,6 @@ struct scoutfs_dirent { __u8 name[0]; } __packed; -/* - * Dirent items are stored at keys with the offset set to the hash of - * the name. Creation can find that hash values collide and will - * attempt to linearly probe this many following hash values looking for - * an unused value. - * - * In small directories this doesn't really matter because hash values - * will so very rarely collide. At around 50k items we start to see our - * first collisions. 
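A hedged sketch of the probing this constant implies: creation tries SCOUTFS_DIRENT_COLL_NR consecutive hash values before giving up with -ENOSPC. scoutfs_name_hash() is the helper removed from name.h later in this patch, while item_exists() and the function name itself are stand-ins for whatever dirent item lookup the directory code actually performs:

/* hypothetical stand-in for a dirent item lookup at dir_ino/hash */
static bool item_exists(struct super_block *sb, u64 dir_ino, u64 hash);

static int example_find_dirent_hash(struct super_block *sb, u64 dir_ino,
                                    const char *name, unsigned int len,
                                    u64 *hash_ret)
{
        u64 hash = scoutfs_name_hash(name, len);
        int i;

        for (i = 0; i < SCOUTFS_DIRENT_COLL_NR; i++) {
                if (!item_exists(sb, dir_ino, hash + i)) {
                        *hash_ret = hash + i;
                        return 0;
                }
        }

        /* every probed hash value is already taken */
        return -ENOSPC;
}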
16 slots is still pretty quick to scan in the - * btree and it gets us up into the hundreds of millions of entries - * before enospc is returned as we run out of hash values. - */ -#define SCOUTFS_DIRENT_COLL_NR 16 - #define SCOUTFS_NAME_LEN 255 /* S32_MAX avoids the (int) sign bit and might avoid sloppy bugs */ @@ -475,14 +328,6 @@ struct scoutfs_dirent { #define SCOUTFS_XATTR_MAX_PARTS \ DIV_ROUND_UP(SCOUTFS_XATTR_MAX_SIZE, SCOUTFS_XATTR_PART_SIZE) -/* - * We only use 31 bits for readdir positions so that we don't confuse - * old signed 32bit f_pos applications or those on the other side of - * network protocols that have limited readir positions. - */ - -#define SCOUTFS_DIRENT_OFF_BITS 31 -#define SCOUTFS_DIRENT_OFF_MASK ((1U << SCOUTFS_DIRENT_OFF_BITS) - 1) /* entries begin after . and .. */ #define SCOUTFS_DIRENT_FIRST_POS 2 /* getdents returns next pos with an entry, no entry at (f_pos)~0 */ @@ -499,14 +344,6 @@ enum { SCOUTFS_DT_WHT, }; -struct scoutfs_extent { - __le64 blkno; - __le64 len; - __u8 flags; -} __packed; - -#define SCOUTFS_EXTENT_FLAG_OFFLINE (1 << 0) - /* ino_path can search for backref items with a null term */ #define SCOUTFS_MAX_KEY_SIZE \ offsetof(struct scoutfs_link_backref_key, name[SCOUTFS_NAME_LEN + 1]) diff --git a/kmod/src/inode.c b/kmod/src/inode.c index ad1caa79..e34441de 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -22,7 +22,6 @@ #include "super.h" #include "key.h" #include "inode.h" -#include "btree.h" #include "dir.h" #include "data.h" #include "scoutfs_trace.h" @@ -126,8 +125,6 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode) inode->i_ctime.tv_sec = le64_to_cpu(cinode->ctime.sec); inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec); - ci->salt = le32_to_cpu(cinode->salt); - atomic64_set(&ci->link_counter, le64_to_cpu(cinode->link_counter)); ci->data_version = le64_to_cpu(cinode->data_version); ci->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos); } @@ -247,8 +244,6 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode *inode) cinode->mtime.sec = cpu_to_le64(inode->i_mtime.tv_sec); cinode->mtime.nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - cinode->salt = cpu_to_le32(ci->salt); - cinode->link_counter = cpu_to_le64(atomic64_read(&ci->link_counter)); cinode->data_version = cpu_to_le64(ci->data_version); cinode->next_readdir_pos = cpu_to_le64(ci->next_readdir_pos); } @@ -415,8 +410,6 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, ci->data_version = 0; ci->next_readdir_pos = SCOUTFS_DIRENT_FIRST_POS; ci->staging = false; - get_random_bytes(&ci->salt, sizeof(ci->salt)); - atomic64_set(&ci->link_counter, 0); inode->i_ino = ino; /* XXX overflow */ inode_init_owner(inode, dir, mode); diff --git a/kmod/src/inode.h b/kmod/src/inode.h index f3badfb4..6dcb03d8 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -5,7 +5,6 @@ struct scoutfs_inode_info { u64 ino; - u32 salt; seqcount_t seqcount; u64 data_version; @@ -14,7 +13,6 @@ struct scoutfs_inode_info { /* holder of i_mutex is staging */ bool staging; - atomic64_t link_counter; struct rw_semaphore xattr_rwsem; struct inode inode; diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index 255e167f..82375047 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -24,7 +24,6 @@ #include "format.h" #include "key.h" #include "dir.h" -#include "name.h" #include "ioctl.h" #include "super.h" #include "inode.h" diff --git a/kmod/src/key.h b/kmod/src/key.h index 7d3b2230..3c108555 100644 --- a/kmod/src/key.h +++ 
b/kmod/src/key.h @@ -126,127 +126,4 @@ static inline void scoutfs_key_set_max(struct scoutfs_key_buf *key) scoutfs_key_memset(key, 0xff, sizeof(struct scoutfs_inode_key)); } -/* - * What follows are the key functions for the small fixed size btree - * keys. It will all be removed once the callers are converted from - * the btree to the item cache. - */ - -#define CKF "%llu.%u.%llu" -#define CKA(key) \ - le64_to_cpu((key)->inode), (key)->type, le64_to_cpu((key)->offset) - -static inline u64 scoutfs_key_inode(struct scoutfs_key *key) -{ - return le64_to_cpu(key->inode); -} - -static inline u64 scoutfs_key_offset(struct scoutfs_key *key) -{ - return le64_to_cpu(key->offset); -} - -static inline int le64_cmp(__le64 a, __le64 b) -{ - return le64_to_cpu(a) < le64_to_cpu(b) ? -1 : - le64_to_cpu(a) > le64_to_cpu(b) ? 1 : 0; -} - -/* - * Items are sorted by type and then by inode to reflect the relative - * frequency of use. Inodes and xattrs are hot, then dirents, then file - * data extents. We want each use class to be hot and dense, we don't - * want a scan of the inodes to have to skip over each inode's extent - * items. - */ -static inline int scoutfs_key_cmp(struct scoutfs_key *a, struct scoutfs_key *b) -{ - return ((short)a->type - (short)b->type) ?: - le64_cmp(a->inode, b->inode) ?: - le64_cmp(a->offset, b->offset); -} - -/* - * return -ve if the first range is completely before the second, +ve for - * completely after, and 0 if they intersect. - */ -static inline int scoutfs_cmp_key_ranges(struct scoutfs_key *a_first, - struct scoutfs_key *a_last, - struct scoutfs_key *b_first, - struct scoutfs_key *b_last) -{ - if (scoutfs_key_cmp(a_last, b_first) < 0) - return -1; - if (scoutfs_key_cmp(a_first, b_last) > 0) - return 1; - return 0; -} - -static inline int scoutfs_cmp_key_range(struct scoutfs_key *key, - struct scoutfs_key *first, - struct scoutfs_key *last) -{ - return scoutfs_cmp_key_ranges(key, key, first, last); -} - -static inline void scoutfs_set_key(struct scoutfs_key *key, u64 inode, u8 type, - u64 offset) -{ - key->inode = cpu_to_le64(inode); - key->type = type; - key->offset = cpu_to_le64(offset); -} - -static inline void scoutfs_set_max_key(struct scoutfs_key *key) -{ - scoutfs_set_key(key, ~0ULL, ~0, ~0ULL); -} - -/* - * This saturates at (~0,~0,~0) instead of wrapping. This will never be - * an issue for real item keys but parent item keys along the right - * spine of the tree have maximal key values that could wrap if - * incremented. - */ -static inline void scoutfs_inc_key(struct scoutfs_key *key) -{ - if (key->inode == cpu_to_le64(~0ULL) && - key->type == (u8)~0 && - key->offset == cpu_to_le64(~0ULL)) - return; - - le64_add_cpu(&key->offset, 1); - if (!key->offset) { - if (++key->type == 0) - le64_add_cpu(&key->inode, 1); - } -} - -static inline void scoutfs_dec_key(struct scoutfs_key *key) -{ - le64_add_cpu(&key->offset, -1ULL); - if (key->offset == cpu_to_le64(~0ULL)) { - if (key->type-- == 0) - le64_add_cpu(&key->inode, -1ULL); - } -} - -static inline struct scoutfs_key *scoutfs_max_key(struct scoutfs_key *a, - struct scoutfs_key *b) -{ - return scoutfs_key_cmp(a, b) > 0 ? 
a : b; -} - -static inline bool scoutfs_key_is_zero(struct scoutfs_key *key) -{ - return key->inode == 0 && key->type == 0 && key->offset == 0; -} - -static inline void scoutfs_key_set_zero(struct scoutfs_key *key) -{ - key->inode = 0; - key->type = 0; - key->offset = 0; -} - #endif diff --git a/kmod/src/kvec.c b/kmod/src/kvec.c index 5e49c6a3..422a4fc5 100644 --- a/kmod/src/kvec.c +++ b/kmod/src/kvec.c @@ -25,10 +25,8 @@ #include "dir.h" #include "xattr.h" #include "msg.h" -#include "block.h" #include "counters.h" #include "trans.h" -#include "buddy.h" #include "kvec.h" #include "scoutfs_trace.h" diff --git a/kmod/src/name.c b/kmod/src/name.c deleted file mode 100644 index e14f52bd..00000000 --- a/kmod/src/name.c +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (C) 2016 Versity Software, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#include -#include -#include - -#include "name.h" - -/* - * XXX This crc nonsense is a quick hack. We'll want something a - * lot stronger like siphash. - */ -u64 scoutfs_name_hash(const char *name, unsigned int len) -{ - unsigned int half = (len + 1) / 2; - - return crc32c(~0, name, half) | - ((u64)crc32c(~0, name + len - half, half) << 32); -} - -int scoutfs_names_equal(const char *name_a, int len_a, - const char *name_b, int len_b) -{ - return (len_a == len_b) && !memcmp(name_a, name_b, len_a); -} diff --git a/kmod/src/name.h b/kmod/src/name.h deleted file mode 100644 index 020ecb0f..00000000 --- a/kmod/src/name.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _SCOUTFS_NAME_H_ -#define _SCOUTFS_NAME_H_ - -u64 scoutfs_name_hash(const char *data, unsigned int len); -int scoutfs_names_equal(const char *name_a, int len_a, - const char *name_b, int len_b); - -#endif diff --git a/kmod/src/scoutfs_trace.c b/kmod/src/scoutfs_trace.c index 038eb228..6c775b9f 100644 --- a/kmod/src/scoutfs_trace.c +++ b/kmod/src/scoutfs_trace.c @@ -23,7 +23,6 @@ #include "inode.h" #include "dir.h" #include "msg.h" -#include "block.h" #define CREATE_TRACE_POINTS #include "scoutfs_trace.h" diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 4a1d7a7f..669b99f4 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -180,171 +180,6 @@ TRACE_EVENT(scoutfs_scan_orphans, TP_printk("dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) ); -TRACE_EVENT(scoutfs_buddy_alloc, - TP_PROTO(u64 blkno, int order, int region, int ret), - - TP_ARGS(blkno, order, region, ret), - - TP_STRUCT__entry( - __field(u64, blkno) - __field(int, order) - __field(int, region) - __field(int, ret) - ), - - TP_fast_assign( - __entry->blkno = blkno; - __entry->order = order; - __entry->region = region; - __entry->ret = ret; - ), - - TP_printk("blkno %llu order %d region %d ret %d", - __entry->blkno, __entry->order, __entry->region, __entry->ret) -); - - -TRACE_EVENT(scoutfs_buddy_free, - TP_PROTO(u64 blkno, int order, int region, int ret), - - TP_ARGS(blkno, order, region, ret), - - TP_STRUCT__entry( - __field(u64, blkno) - __field(int, order) - __field(int, region) - __field(int, ret) - ), - - TP_fast_assign( - __entry->blkno = blkno; - __entry->order = order; - 
__entry->region = region; - __entry->ret = ret; - ), - - TP_printk("blkno %llu order %d region %d ret %d", - __entry->blkno, __entry->order, __entry->region, __entry->ret) -); - -DECLARE_EVENT_CLASS(scoutfs_btree_op, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, int len), - - TP_ARGS(sb, key, len), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( u64, key_ino ) - __field( u64, key_off ) - __field( u8, key_type ) - __field( int, val_len ) - ), - - TP_fast_assign( - __entry->dev = sb->s_dev; - __entry->key_ino = le64_to_cpu(key->inode); - __entry->key_off = le64_to_cpu(key->offset); - __entry->key_type = key->type; - __entry->val_len = len; - ), - - TP_printk("dev %d,%d key "TRACE_KEYF" size %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->key_ino, show_key_type(__entry->key_type), - __entry->key_off, __entry->val_len) -); - -DEFINE_EVENT(scoutfs_btree_op, scoutfs_btree_lookup, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, int len), - - TP_ARGS(sb, key, len) -); - -DEFINE_EVENT(scoutfs_btree_op, scoutfs_btree_insert, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, int len), - - TP_ARGS(sb, key, len) -); - -DEFINE_EVENT(scoutfs_btree_op, scoutfs_btree_delete, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, int len), - - TP_ARGS(sb, key, len) -); - -DEFINE_EVENT(scoutfs_btree_op, scoutfs_btree_dirty, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, int len), - - TP_ARGS(sb, key, len) -); - -DEFINE_EVENT(scoutfs_btree_op, scoutfs_btree_update, - TP_PROTO(struct super_block *sb, struct scoutfs_key *key, int len), - - TP_ARGS(sb, key, len) -); - -DECLARE_EVENT_CLASS(scoutfs_btree_ranged_op, - TP_PROTO(struct super_block *sb, struct scoutfs_key *first, - struct scoutfs_key *last), - - TP_ARGS(sb, first, last), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( u64, first_ino ) - __field( u64, first_off ) - __field( u8, first_type ) - __field( u64, last_ino ) - __field( u64, last_off ) - __field( u8, last_type ) - ), - - TP_fast_assign( - __entry->dev = sb->s_dev; - __entry->first_ino = le64_to_cpu(first->inode); - __entry->first_off = le64_to_cpu(first->offset); - __entry->first_type = first->type; - __entry->last_ino = le64_to_cpu(last->inode); - __entry->last_off = le64_to_cpu(last->offset); - __entry->last_type = last->type; - ), - - TP_printk("dev %d,%d first key "TRACE_KEYF" last key "TRACE_KEYF, - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->first_ino, - show_key_type(__entry->first_type), __entry->first_off, - __entry->last_ino, show_key_type(__entry->last_type), - __entry->last_off) -); - -DEFINE_EVENT(scoutfs_btree_ranged_op, scoutfs_btree_hole, - TP_PROTO(struct super_block *sb, struct scoutfs_key *first, - struct scoutfs_key *last), - - TP_ARGS(sb, first, last) -); - -DEFINE_EVENT(scoutfs_btree_ranged_op, scoutfs_btree_next, - TP_PROTO(struct super_block *sb, struct scoutfs_key *first, - struct scoutfs_key *last), - - TP_ARGS(sb, first, last) -); - -DEFINE_EVENT(scoutfs_btree_ranged_op, scoutfs_btree_prev, - TP_PROTO(struct super_block *sb, struct scoutfs_key *first, - struct scoutfs_key *last), - - TP_ARGS(sb, first, last) -); - -DEFINE_EVENT(scoutfs_btree_ranged_op, scoutfs_btree_since, - TP_PROTO(struct super_block *sb, struct scoutfs_key *first, - struct scoutfs_key *last), - - TP_ARGS(sb, first, last) -); - TRACE_EVENT(scoutfs_manifest_add, TP_PROTO(struct super_block *sb, struct kvec *first, struct kvec *last, u64 segno, u64 seq, u8 level), diff --git a/kmod/src/super.c b/kmod/src/super.c 
index 57dcbd46..158052c1 100644 --- a/kmod/src/super.c +++ b/kmod/src/super.c @@ -25,10 +25,8 @@ #include "dir.h" #include "xattr.h" #include "msg.h" -#include "block.h" #include "counters.h" #include "trans.h" -#include "buddy.h" #include "item.h" #include "manifest.h" #include "seg.h" @@ -96,8 +94,6 @@ void scoutfs_advance_dirty_super(struct super_block *sb) struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_super_block *super = &sbi->super; - sbi->stable_super = sbi->super; - le64_add_cpu(&super->hdr.blkno, 1); if (le64_to_cpu(super->hdr.blkno) == (SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR)) @@ -182,8 +178,6 @@ static int read_supers(struct super_block *sb) scoutfs_info(sb, "using super %u with seq %llu", found, le64_to_cpu(sbi->super.hdr.seq)); - sbi->stable_super = sbi->super; - return 0; } @@ -204,23 +198,12 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; spin_lock_init(&sbi->next_ino_lock); - spin_lock_init(&sbi->block_lock); - /* radix only inserted with NOFS _preload */ - INIT_RADIX_TREE(&sbi->block_radix, GFP_ATOMIC); - init_waitqueue_head(&sbi->block_wq); - atomic_set(&sbi->block_writes, 0); - INIT_LIST_HEAD(&sbi->block_lru_list); - init_rwsem(&sbi->btree_rwsem); atomic_set(&sbi->trans_holds, 0); init_waitqueue_head(&sbi->trans_hold_wq); spin_lock_init(&sbi->trans_write_lock); INIT_WORK(&sbi->trans_write_work, scoutfs_trans_write_func); init_waitqueue_head(&sbi->trans_write_wq); - sbi->block_shrinker.shrink = scoutfs_block_shrink; - sbi->block_shrinker.seeks = DEFAULT_SEEKS; - register_shrinker(&sbi->block_shrinker); - /* XXX can have multiple mounts of a device, need mount id */ sbi->kset = kset_create_and_add(sb->s_id, NULL, &scoutfs_kset->kobj); if (!sbi->kset) @@ -269,16 +252,12 @@ static void scoutfs_kill_sb(struct super_block *sb) if (sbi) { scoutfs_compact_destroy(sb); scoutfs_shutdown_trans(sb); - scoutfs_buddy_destroy(sb); - if (sbi->block_shrinker.shrink == scoutfs_block_shrink) - unregister_shrinker(&sbi->block_shrinker); scoutfs_data_destroy(sb); scoutfs_item_destroy(sb); scoutfs_alloc_destroy(sb); scoutfs_manifest_destroy(sb); scoutfs_treap_destroy(sb); scoutfs_seg_destroy(sb); - scoutfs_block_destroy(sb); scoutfs_destroy_counters(sb); if (sbi->kset) kset_unregister(sbi->kset); diff --git a/kmod/src/super.h b/kmod/src/super.h index 82eb6bba..e791e76d 100644 --- a/kmod/src/super.h +++ b/kmod/src/super.h @@ -5,10 +5,8 @@ #include #include "format.h" -#include "buddy.h" struct scoutfs_counters; -struct buddy_info; struct item_cache; struct manifest; struct segment_cache; @@ -20,20 +18,9 @@ struct scoutfs_sb_info { struct super_block *sb; struct scoutfs_super_block super; - struct scoutfs_super_block stable_super; spinlock_t next_ino_lock; - spinlock_t block_lock; - struct radix_tree_root block_radix; - wait_queue_head_t block_wq; - atomic_t block_writes; - int block_write_err; - /* block cache lru */ - struct shrinker block_shrinker; - struct list_head block_lru_list; - unsigned long block_lru_nr; - struct manifest *manifest; struct item_cache *item_cache; struct segment_cache *segment_cache; @@ -42,10 +29,6 @@ struct scoutfs_sb_info { struct compact_info *compact_info; struct data_info *data_info; - struct buddy_info *buddy_info; - - struct rw_semaphore btree_rwsem; - atomic_t trans_holds; wait_queue_head_t trans_hold_wq; struct task_struct *trans_task; @@ -68,17 +51,6 @@ static inline struct scoutfs_sb_info *SCOUTFS_SB(struct super_block *sb) return sb->s_fs_info; } -/* The root of the metadata btree */ -static 
inline struct scoutfs_btree_root *SCOUTFS_META(struct super_block *sb) -{ - return &SCOUTFS_SB(sb)->super.btree_root; -} - -static inline struct scoutfs_btree_root *SCOUTFS_STABLE_META(struct super_block *sb) -{ - return &SCOUTFS_SB(sb)->stable_super.btree_root; -} - void scoutfs_advance_dirty_super(struct super_block *sb); int scoutfs_write_dirty_super(struct super_block *sb); diff --git a/kmod/src/trans.c b/kmod/src/trans.c index 487514f6..d596bf68 100644 --- a/kmod/src/trans.c +++ b/kmod/src/trans.c @@ -18,9 +18,7 @@ #include #include "super.h" -#include "block.h" #include "trans.h" -#include "buddy.h" #include "data.h" #include "bio.h" #include "item.h" diff --git a/kmod/src/xattr.c b/kmod/src/xattr.c index 52f1acd7..afe1cc14 100644 --- a/kmod/src/xattr.c +++ b/kmod/src/xattr.c @@ -22,7 +22,6 @@ #include "kvec.h" #include "item.h" #include "trans.h" -#include "name.h" #include "xattr.h" /*
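For reference, the readdir-position comment removed from format.h above capped positions at 31 bits so that old signed 32-bit f_pos consumers and limited network protocols keep working. Below is a minimal userspace sketch of a position counter that respects that cap; the allocator, its names, and the choice to reserve the final position as an end-of-directory cursor are illustrative assumptions, not scoutfs's actual dirent placement.

/*
 * Illustrative only: hand out readdir positions that fit in 31 bits,
 * starting after . and .., per the comment removed from format.h.
 * Reserving the last position is an assumption based on the retained
 * "no entry at (f_pos)~0" comment.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define DIRENT_OFF_BITS 31
#define DIRENT_OFF_MASK ((1U << DIRENT_OFF_BITS) - 1)
#define DIRENT_FIRST_POS 2

/* next_pos plays the role of the inode's next_readdir_pos counter */
static bool alloc_readdir_pos(uint64_t *next_pos, uint32_t *pos_ret)
{
	/* keep the final 31-bit position free as the "no more entries" cursor */
	if (*next_pos >= DIRENT_OFF_MASK)
		return false;

	*pos_ret = (uint32_t)*next_pos;
	(*next_pos)++;
	return true;
}

int main(void)
{
	uint64_t next_pos = DIRENT_FIRST_POS;
	uint32_t pos;

	if (alloc_readdir_pos(&next_pos, &pos))
		printf("first entry at f_pos %u, last usable %u\n",
		       pos, DIRENT_OFF_MASK - 1);
	return 0;
}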
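The fixed-size key helpers removed from key.h above sort by (type, inode, offset) so each item class stays dense, and scoutfs_inc_key() saturates at the maximal key rather than wrapping. A standalone sketch of that compare/increment behaviour follows; the struct and function names are hypothetical and host-endian fields stand in for the on-disk __le64 values.

/*
 * Sketch of the removed fixed-size key helpers: compare by
 * (type, inode, offset) and increment with saturation at the
 * maximal key so right-spine parent keys never wrap to zero.
 */
#include <stdint.h>
#include <stdio.h>

struct key {
	uint64_t inode;
	uint8_t type;
	uint64_t offset;
};

static int u64_cmp(uint64_t a, uint64_t b)
{
	return a < b ? -1 : a > b ? 1 : 0;
}

/* type is most significant so a scan of one class skips no others */
static int key_cmp(const struct key *a, const struct key *b)
{
	int cmp = (int)a->type - (int)b->type;

	if (cmp)
		return cmp;
	cmp = u64_cmp(a->inode, b->inode);
	return cmp ? cmp : u64_cmp(a->offset, b->offset);
}

/* saturate at (~0, ~0, ~0) instead of wrapping back to zero */
static void inc_key(struct key *key)
{
	if (key->inode == UINT64_MAX && key->type == UINT8_MAX &&
	    key->offset == UINT64_MAX)
		return;

	if (++key->offset == 0 && ++key->type == 0)
		key->inode++;
}

int main(void)
{
	struct key a = { .inode = 1, .type = 2, .offset = UINT64_MAX };
	struct key b = a;

	inc_key(&b);
	printf("cmp %d\n", key_cmp(&a, &b)); /* -1: b advanced to type 3 */
	return 0;
}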
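The name hash removed with name.c builds a 64-bit value from crc32c over the first and last halves of the name (the halves overlap by one byte for odd lengths); the in-tree comment already flagged it as a quick hack to be replaced by something stronger like siphash. Here is a userspace sketch of the same construction, with a minimal bitwise CRC-32C standing in for the kernel's crc32c(), so the values are not guaranteed to match what the kernel helper produced. Covering the name as two halves keeps every byte in the hash while bounding each crc pass to half the name.

/*
 * Userspace sketch of the removed name hash: two crc32c passes over
 * the overlapping halves of the name, packed into one 64-bit value.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* minimal bitwise CRC-32C (Castagnoli), reflected polynomial */
static uint32_t crc32c_sw(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;
	size_t i;
	int bit;

	for (i = 0; i < len; i++) {
		crc ^= p[i];
		for (bit = 0; bit < 8; bit++)
			crc = (crc >> 1) ^ (0x82f63b78U & -(crc & 1U));
	}
	return crc;
}

static uint64_t name_hash(const char *name, unsigned int len)
{
	unsigned int half = (len + 1) / 2;

	return (uint64_t)crc32c_sw(~0U, name, half) |
	       ((uint64_t)crc32c_sw(~0U, name + len - half, half) << 32);
}

static int names_equal(const char *a, int len_a, const char *b, int len_b)
{
	return len_a == len_b && !memcmp(a, b, len_a);
}

int main(void)
{
	const char *name = "example";
	unsigned int len = (unsigned int)strlen(name);

	printf("hash %llx equal %d\n",
	       (unsigned long long)name_hash(name, len),
	       names_equal(name, len, name, len));
	return 0;
}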
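The scoutfs_advance_dirty_super() hunk above only shows the blkno increment and the comparison against SCOUTFS_SUPER_BLKNO + SCOUTFS_SUPER_NR; the sketch below assumes the counter wraps back to the first super slot (an inference from the truncated context) and uses placeholder constants rather than the on-disk values.

/*
 * Sketch of rotating dirty-super writes through a small ring of
 * block locations.  SUPER_BLKNO and SUPER_NR are placeholders and
 * the wrap-around is an assumption, not taken from the patch.
 */
#include <stdint.h>
#include <stdio.h>

#define SUPER_BLKNO 1ULL	/* placeholder first super location */
#define SUPER_NR 2		/* placeholder number of super slots */

static uint64_t advance_super_blkno(uint64_t blkno)
{
	if (++blkno == SUPER_BLKNO + SUPER_NR)
		blkno = SUPER_BLKNO;
	return blkno;
}

int main(void)
{
	uint64_t blkno = SUPER_BLKNO;
	int i;

	/* each commit writes the next slot in the ring */
	for (i = 0; i < 4; i++) {
		blkno = advance_super_blkno(blkno);
		printf("write super to blkno %llu\n",
		       (unsigned long long)blkno);
	}
	return 0;
}

Rotating through more than one super location means the previously written super stays intact until the next commit completes, which is the usual motivation for this kind of ring.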