Go back to having our own scoutfs_block cache
We used to have 16k blocks in our own radix_tree cache. When we introduced the simple file block mapping code it preferred to have block size == page size. That let us remove a bunch of code and reuse all the kernel's buffer head code.

But it turns out that the buffer heads are just a bit too inflexible. We'd like to have blocks larger than page size, obviously, but there are also real functional differences.

Resolving the problem of unlocked readers and allocating writers working with the same blkno is the most powerful example of this. It's trivial to fix by always inserting newly allocated blocks into the cache. But solving it with buffer heads requires expensive and risky locking around the buffer head cache, which can only support a single physical instance of a given blkno because there can be multiple blocks per page.

So this restores the simple block cache that was removed back in commit 'c8e76e2 scoutfs: use buffer heads'. There's still work to do to get this fully functional, but it's worth it.

Signed-off-by: Zach Brown <zab@versity.com>
Reviewed-by: Mark Fasheh <mfasheh@versity.com>
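The cache-replacement idea in that last paragraph is worth spelling out. Below is a toy userspace model of it (an editor's sketch, not code from this commit; every name in it is invented): a cache of refcounted blocks where dirtying always installs a freshly allocated block, so a reader still holding a reference keeps a stable copy of the old contents until it drops that reference.

/* toy model: build with `cc -std=c11 toy.c` */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_BLKNOS  16
#define BLOCK_SIZE 64

struct block {
        atomic_int refcount;        /* the cache holds one ref, users hold more */
        unsigned long blkno;
        char data[BLOCK_SIZE];
};

static struct block *cache[NR_BLKNOS];  /* stand-in for the radix tree */

static struct block *block_get(struct block *bl)
{
        atomic_fetch_add(&bl->refcount, 1);
        return bl;
}

static void block_put(struct block *bl)
{
        if (bl && atomic_fetch_sub(&bl->refcount, 1) == 1)
                free(bl);
}

/* readers just take a ref on whatever is cached */
static struct block *block_read(unsigned long blkno)
{
        return cache[blkno] ? block_get(cache[blkno]) : NULL;
}

/* writers install a fresh block, dropping the cache's ref on the old one */
static struct block *block_dirty(unsigned long blkno)
{
        struct block *bl = calloc(1, sizeof(*bl));

        atomic_init(&bl->refcount, 1);          /* the cache's ref */
        bl->blkno = blkno;
        block_put(cache[blkno]);                /* old block lives on via reader refs */
        cache[blkno] = bl;
        return block_get(bl);                   /* the caller's ref */
}

int main(void)
{
        struct block *writer;
        struct block *reader;

        writer = block_dirty(3);
        strcpy(writer->data, "v1");
        block_put(writer);

        reader = block_read(3);                 /* reader pins "v1" */
        writer = block_dirty(3);                /* writer gets a brand new block */
        strcpy(writer->data, "v2");

        printf("reader sees %s, writer sees %s\n", reader->data, writer->data);
        block_put(reader);                      /* last ref on "v1": freed here */
        block_put(writer);
        return 0;
}

The same shape shows up in the real patch below: scoutfs_block_dirty() always calls alloc_block() and swaps the new block into the radix tree, putting the cache's reference on any block it displaces.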
kmod/src/block.c (566 lines changed)
@@ -11,7 +11,6 @@
  * General Public License for more details.
  */
 #include <linux/kernel.h>
-#include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 
@@ -23,54 +22,105 @@
 #include "buddy.h"
 
 /*
- * scoutfs has a fixed 4k small block size for metadata blocks. This
- * lets us consistently use buffer heads without worrying about having a
- * block size greater than the page size.
+ * scoutfs maintains a cache of metadata blocks in a radix tree. This
+ * gives us blocks bigger than page size and avoids fixing the location
+ * of a logical cached block in one possible position in a larger block
+ * device page cache page.
  *
- * This block interface does the work to cow dirty blocks, track dirty
- * blocks, generate checksums as they're written, only write them in
- * transactions, verify checksums on read, and invalidate and retry
- * reads of stale cached blocks. (That last bit only has a hint of an
- * implementation.)
+ * This does the work to cow dirty blocks, track dirty blocks, generate
+ * checksums as they're written, only write them in transactions, verify
+ * checksums on read, and invalidate and retry reads of stale cached
+ * blocks. (That last bit only has a hint of an implementation.)
  *
  * XXX
 * - tear down dirty blocks left by write errors on unmount
+ * - should invalidate dirty blocks if freed
+ * - multiple smaller page allocs
+ * - vmalloc? vm_map_ram?
+ * - blocks allocated from per-cpu pages when page size > block size
+ * - cmwq crc calcs if that makes sense
+ * - slab of block structs
+ * - don't verify checksums in end_io context?
+ * - fall back to multiple single bios per block io if bio alloc fails?
+ * - fail mount if total_blocks is greater than long radix blkno
 */
 
-struct scoutfs_block;
-
-struct block_bh_private {
-	struct super_block *sb;
-	struct buffer_head *bh;
-	struct rb_node node;
+struct scoutfs_block {
 	struct rw_semaphore rwsem;
-	bool rwsem_class;
+	atomic_t refcount;
+	u64 blkno;
+
+	unsigned long bits;
+
+	struct super_block *sb;
+	struct page *page;
+	void *data;
 };
 
+#define DIRTY_RADIX_TAG 0
+
 enum {
-	BH_ScoutfsVerified = BH_PrivateStart,
+	BLOCK_BIT_UPTODATE = 0,
+	BLOCK_BIT_ERROR,
+	BLOCK_BIT_CLASS_SET,
 };
-BUFFER_FNS(ScoutfsVerified, scoutfs_verified)
 
-static int verify_block_header(struct scoutfs_sb_info *sbi,
-			       struct buffer_head *bh)
+static struct scoutfs_block *alloc_block(struct super_block *sb, u64 blkno)
 {
+	struct scoutfs_block *bl;
+	struct page *page;
+
+	/* we'd need to be just a bit more careful */
+	BUILD_BUG_ON(PAGE_SIZE > SCOUTFS_BLOCK_SIZE);
+
+	bl = kzalloc(sizeof(struct scoutfs_block), GFP_NOFS);
+	if (bl) {
+		/* change _from_contents if allocs not aligned */
+		page = alloc_pages(GFP_NOFS, SCOUTFS_BLOCK_PAGE_ORDER);
+		WARN_ON_ONCE(!page);
+		if (page) {
+			init_rwsem(&bl->rwsem);
+			atomic_set(&bl->refcount, 1);
+			bl->blkno = blkno;
+			bl->sb = sb;
+			bl->page = page;
+			bl->data = page_address(page);
+			trace_printk("allocated bl %p\n", bl);
+		} else {
+			kfree(bl);
+			bl = NULL;
+		}
+	}
+
+	return bl;
+}
+
+void scoutfs_block_put(struct scoutfs_block *bl)
+{
+	if (!IS_ERR_OR_NULL(bl) && atomic_dec_and_test(&bl->refcount)) {
+		trace_printk("freeing bl %p\n", bl);
+		scoutfs_inc_counter(bl->sb, block_mem_free);
+		__free_pages(bl->page, SCOUTFS_BLOCK_PAGE_ORDER);
+		kfree(bl);
+	}
+}
+
+static int verify_block_header(struct super_block *sb, struct scoutfs_block *bl)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
-	struct scoutfs_block_header *hdr = (void *)bh->b_data;
+	struct scoutfs_block_header *hdr = bl->data;
 	u32 crc = scoutfs_crc_block(hdr);
 	int ret = -EIO;
 
 	if (le32_to_cpu(hdr->crc) != crc) {
-		printk("blkno %llu hdr crc %x != calculated %x\n",
-		       (u64)bh->b_blocknr, le32_to_cpu(hdr->crc), crc);
+		printk("blkno %llu hdr crc %x != calculated %x\n", bl->blkno,
+		       le32_to_cpu(hdr->crc), crc);
 	} else if (super->hdr.fsid && hdr->fsid != super->hdr.fsid) {
-		printk("blkno %llu fsid %llx != super fsid %llx\n",
-		       (u64)bh->b_blocknr, le64_to_cpu(hdr->fsid),
-		       le64_to_cpu(super->hdr.fsid));
-	} else if (le64_to_cpu(hdr->blkno) != bh->b_blocknr) {
-		printk("blkno %llu invalid hdr blkno %llx\n",
-		       (u64)bh->b_blocknr, le64_to_cpu(hdr->blkno));
+		printk("blkno %llu fsid %llx != super fsid %llx\n", bl->blkno,
+		       le64_to_cpu(hdr->fsid), le64_to_cpu(super->hdr.fsid));
+	} else if (le64_to_cpu(hdr->blkno) != bl->blkno) {
+		printk("blkno %llu invalid hdr blkno %llx\n", bl->blkno,
+		       le64_to_cpu(hdr->blkno));
 	} else {
 		ret = 0;
 	}
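A note on the constants that alloc_block() and the BUILD_BUG_ON above lean on: the design assumes a block is one or more whole pages, so the page allocation order and the bio sector math both fall out of the block shift. A hedged sketch of the arithmetic, using the 16k block size mentioned in the commit message (illustrative names only; the real SCOUTFS_* values live in the scoutfs format headers):

/* assumes 16k blocks on 4k pages, per the commit message */
#define EX_PAGE_SHIFT           12                              /* 4k pages */
#define EX_BLOCK_SHIFT          14                              /* 16k blocks */
#define EX_BLOCK_SIZE           (1UL << EX_BLOCK_SHIFT)
#define EX_BLOCK_PAGE_ORDER     (EX_BLOCK_SHIFT - EX_PAGE_SHIFT) /* order-2 alloc_pages() */
#define EX_PAGES_PER_BLOCK      (1 << EX_BLOCK_PAGE_ORDER)       /* 4 pages per bio */
#define EX_SECTORS_PER_BLOCK    (1 << (EX_BLOCK_SHIFT - 9))      /* 32 512-byte sectors */

That last shift is exactly what block_submit_bio() in the next hunk does with bi_sector = blkno << (SCOUTFS_BLOCK_SHIFT - 9).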
@@ -78,143 +128,161 @@ static int verify_block_header(struct scoutfs_sb_info *sbi,
 	return ret;
 }
 
-static struct buffer_head *bh_from_bhp_node(struct rb_node *node)
+static void block_read_end_io(struct bio *bio, int err)
 {
-	struct block_bh_private *bhp;
+	struct scoutfs_block *bl = bio->bi_private;
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb);
 
-	bhp = container_of(node, struct block_bh_private, node);
-	return bhp->bh;
-}
+	if (!err && !verify_block_header(bl->sb, bl))
+		set_bit(BLOCK_BIT_UPTODATE, &bl->bits);
+	else
+		set_bit(BLOCK_BIT_ERROR, &bl->bits);
 
-static struct scoutfs_sb_info *sbi_from_bh(struct buffer_head *bh)
-{
-	struct block_bh_private *bhp = bh->b_private;
+	/*
+	 * uncontended spin_lock in wake_up and unconditional smp_mb to
+	 * make waitqueue_active safe are about the same cost, so we
+	 * prefer the obviously safe choice.
+	 */
+	wake_up(&sbi->block_wq);
 
-	return SCOUTFS_SB(bhp->sb);
-}
-
-static void insert_bhp_rb(struct rb_root *root, struct buffer_head *ins)
-{
-	struct rb_node **node = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct block_bh_private *bhp;
-	struct buffer_head *bh;
-
-	while (*node) {
-		parent = *node;
-		bh = bh_from_bhp_node(*node);
-
-		if (ins->b_blocknr < bh->b_blocknr)
-			node = &(*node)->rb_left;
-		else
-			node = &(*node)->rb_right;
-	}
-
-	bhp = ins->b_private;
-	rb_link_node(&bhp->node, parent, node);
-	rb_insert_color(&bhp->node, root);
+	scoutfs_block_put(bl);
+	bio_put(bio);
 }
 
 /*
- * Track a dirty block by allocating private data and inserting it into
- * the dirty rbtree in the super block.
- *
- * Callers are in transactions that prevent metadata writeback so blocks
- * won't be written and cleaned while we're trying to dirty them. We
- * serialize racing to add dirty tracking to the same block in case the
- * caller didn't.
- *
- * Presence in the dirty tree holds a bh ref.
+ * Once a transaction block is persistent it's fine to drop the dirty
+ * tag. It's been checksummed so it can be read in again. Its seq
+ * will be in the current transaction so it'll simply be dirtied and
+ * checksummed and written out again.
 */
-static int insert_bhp(struct super_block *sb, struct buffer_head *bh)
+static void block_write_end_io(struct bio *bio, int err)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct block_bh_private *bhp;
+	struct scoutfs_block *bl = bio->bi_private;
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb);
 	unsigned long flags;
-	int ret = 0;
 
-	if (bh->b_private)
-		return 0;
-
-	lock_buffer(bh);
-	if (bh->b_private)
-		goto out;
-
-	bhp = kmalloc(sizeof(*bhp), GFP_NOFS);
-	if (!bhp) {
-		ret = -ENOMEM;
-		goto out;
+	if (!err) {
+		spin_lock_irqsave(&sbi->block_lock, flags);
+		radix_tree_tag_clear(&sbi->block_radix,
+				     bl->blkno, DIRTY_RADIX_TAG);
+		spin_unlock_irqrestore(&sbi->block_lock, flags);
 	}
 
-	bhp->sb = sb;
-	bhp->bh = bh;
-	get_bh(bh);
-	bh->b_private = bhp;
-	/* lockdep class can be set by callers that use the lock */
-	init_rwsem(&bhp->rwsem);
-	bhp->rwsem_class = false;
+	/* not too worried about racing ints */
+	if (err && !sbi->block_write_err)
+		sbi->block_write_err = err;
 
-	spin_lock_irqsave(&sbi->block_lock, flags);
-	insert_bhp_rb(&sbi->block_dirty_tree, bh);
-	spin_unlock_irqrestore(&sbi->block_lock, flags);
+	if (atomic_dec_and_test(&sbi->block_writes))
+		wake_up(&sbi->block_wq);
 
-	trace_printk("blkno %llu bh %p\n", (u64)bh->b_blocknr, bh);
-out:
-	unlock_buffer(bh);
-	return ret;
+	scoutfs_block_put(bl);
+	bio_put(bio);
 }
 
-static void erase_bhp(struct buffer_head *bh)
+static int block_submit_bio(struct scoutfs_block *bl, int rw)
 {
-	struct block_bh_private *bhp = bh->b_private;
-	struct scoutfs_sb_info *sbi = sbi_from_bh(bh);
-	unsigned long flags;
+	struct super_block *sb = bl->sb;
+	struct bio *bio;
+	int ret;
 
-	spin_lock_irqsave(&sbi->block_lock, flags);
-	rb_erase(&bhp->node, &sbi->block_dirty_tree);
-	spin_unlock_irqrestore(&sbi->block_lock, flags);
+	bio = bio_alloc(GFP_NOFS, SCOUTFS_PAGES_PER_BLOCK);
+	if (WARN_ON_ONCE(!bio))
+		return -ENOMEM;
 
-	put_bh(bh);
-	kfree(bhp);
-	bh->b_private = NULL;
+	bio->bi_sector = bl->blkno << (SCOUTFS_BLOCK_SHIFT - 9);
+	bio->bi_bdev = sb->s_bdev;
+	if (rw & WRITE)
+		bio->bi_end_io = block_write_end_io;
+	else
+		bio->bi_end_io = block_read_end_io;
+	bio->bi_private = bl;
 
-	trace_printk("blkno %llu bh %p\n", (u64)bh->b_blocknr, bh);
+	ret = bio_add_page(bio, bl->page, SCOUTFS_BLOCK_SIZE, 0);
+	if (WARN_ON_ONCE(ret != SCOUTFS_BLOCK_SIZE)) {
+		bio_put(bio);
+		return -ENOMEM;
+	}
+
+	atomic_inc(&bl->refcount);
+	submit_bio(rw, bio);
+
+	return 0;
 }
 
 /*
  * Read an existing block from the device and verify its metadata header.
- * The buffer head is returned unlocked and uptodate.
 */
 struct scoutfs_block *scoutfs_block_read(struct super_block *sb, u64 blkno)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct buffer_head *bh;
+	struct scoutfs_block *found;
+	struct scoutfs_block *bl;
+	unsigned long flags;
 	int ret;
 
-	bh = sb_bread(sb, blkno);
-	if (!bh) {
-		bh = ERR_PTR(-EIO);
-		goto out;
-	}
+	/* find an existing block, dropping if it's errored */
+	spin_lock_irqsave(&sbi->block_lock, flags);
 
-	if (!buffer_scoutfs_verified(bh)) {
-		lock_buffer(bh);
-		if (!buffer_scoutfs_verified(bh)) {
-			ret = verify_block_header(sbi, bh);
-			if (!ret)
-				set_buffer_scoutfs_verified(bh);
-		} else {
-			ret = 0;
-		}
-		unlock_buffer(bh);
-		if (ret < 0) {
-			scoutfs_block_put((void *)bh);
-			bh = ERR_PTR(ret);
-		}
-	}
+	bl = radix_tree_lookup(&sbi->block_radix, blkno);
+	if (bl) {
+		if (test_bit(BLOCK_BIT_ERROR, &bl->bits)) {
+			radix_tree_delete(&sbi->block_radix, bl->blkno);
+			scoutfs_block_put(bl);
+			bl = NULL;
+		} else {
+			atomic_inc(&bl->refcount);
+		}
+	}
+	spin_unlock_irqrestore(&sbi->block_lock, flags);
+	if (bl)
+		goto wait;
+
+	/* allocate a new block and try to insert it */
+	bl = alloc_block(sb, blkno);
+	if (!bl) {
+		ret = -EIO;
+		goto out;
+	}
+
+	ret = radix_tree_preload(GFP_NOFS);
+	if (ret)
+		goto out;
+
+	spin_lock_irqsave(&sbi->block_lock, flags);
+
+	found = radix_tree_lookup(&sbi->block_radix, blkno);
+	if (found) {
+		scoutfs_block_put(bl);
+		bl = found;
+		atomic_inc(&bl->refcount);
+	} else {
+		radix_tree_insert(&sbi->block_radix, blkno, bl);
+		atomic_inc(&bl->refcount);
+	}
+
+	spin_unlock_irqrestore(&sbi->block_lock, flags);
+	radix_tree_preload_end();
+
+	if (!found) {
+		ret = block_submit_bio(bl, READ_SYNC | REQ_META);
+		if (ret)
+			goto out;
+	}
+
+wait:
+	ret = wait_event_interruptible(sbi->block_wq,
+			test_bit(BLOCK_BIT_UPTODATE, &bl->bits) ||
+			test_bit(BLOCK_BIT_ERROR, &bl->bits));
+	if (ret == 0 && test_bit(BLOCK_BIT_ERROR, &bl->bits))
+		ret = -EIO;
 out:
-	return (void *)bh;
+	if (ret) {
+		scoutfs_block_put(bl);
+		bl = ERR_PTR(ret);
+	}
+
+	return bl;
 }
 
 /*
@@ -226,7 +294,8 @@ out:
  * many times the caller assumes that we've hit persistent corruption
  * and returns an error.
  *
- * XXX how does this race with
+ * XXX:
+ *  - actually implement this
  *  - reads that span transactions?
  *  - writers creating a new dirty block?
 */
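The read path two hunks up follows the classic optimistic insert pattern for the old radix tree API: preload node memory outside the spinlock, then look up again under the lock and either reuse a racing inserter's block or install your own. A condensed sketch of just that dance, as an editor's illustration using the names from the hunk (this helper is not a function in the patch):

static int cache_insert_or_reuse(struct radix_tree_root *root,
				 spinlock_t *lock, struct scoutfs_block **blp)
{
	struct scoutfs_block *bl = *blp;
	struct scoutfs_block *found;
	unsigned long flags;
	int ret;

	/* preallocate radix nodes before taking the spinlock */
	ret = radix_tree_preload(GFP_NOFS);
	if (ret)
		return ret;

	spin_lock_irqsave(lock, flags);
	found = radix_tree_lookup(root, bl->blkno);
	if (found) {
		/* lost the race: drop ours and use the winner's block */
		atomic_inc(&found->refcount);
		scoutfs_block_put(bl);
		*blp = found;
	} else {
		radix_tree_insert(root, bl->blkno, bl);
		atomic_inc(&bl->refcount);	/* the cache's reference */
	}
	spin_unlock_irqrestore(lock, flags);
	radix_tree_preload_end();

	return 0;
}

Preloading is what lets the insert itself run under a spinlock with GFP_ATOMIC semantics, which is why fill_super initializes the tree that way in the super.c hunk further down.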
@@ -240,7 +309,6 @@ struct scoutfs_block *scoutfs_block_read_ref(struct super_block *sb,
 	if (!IS_ERR(bl)) {
 		hdr = scoutfs_block_data(bl);
 		if (WARN_ON_ONCE(hdr->seq != ref->seq)) {
-			clear_buffer_uptodate(bl);
 			scoutfs_block_put(bl);
 			bl = ERR_PTR(-EAGAIN);
 		}
@@ -250,35 +318,19 @@
 }
 
 /*
- * We stop tracking dirty metadata blocks when their IO succeeds. This
- * happens in the context of transaction commit which excludes other
- * metadata dirtying paths.
+ * The caller knows that it's not racing with writers.
 */
-static void block_write_end_io(struct buffer_head *bh, int uptodate)
+int scoutfs_block_has_dirty(struct super_block *sb)
 {
-	struct scoutfs_sb_info *sbi = sbi_from_bh(bh);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 
-	trace_printk("bh %p uptdate %d\n", bh, uptodate);
-
-	/* XXX */
-	unlock_buffer(bh);
-
-	if (uptodate) {
-		erase_bhp(bh);
-	} else {
-		/* don't care if this is racey? */
-		if (!sbi->block_write_err)
-			sbi->block_write_err = -EIO;
-	}
-
-	if (atomic_dec_and_test(&sbi->block_writes))
-		wake_up(&sbi->block_wq);
+	return radix_tree_tagged(&sbi->block_radix, DIRTY_RADIX_TAG);
 }
 
 /*
- * Submit writes for all the buffer heads in the dirty block tree. The
- * write transaction machinery ensures that the dirty blocks form a
- * consistent image and excludes future dirtying while we're working.
+ * Submit writes for all the blocks in the radix with their dirty tag
+ * set. The transaction machinery ensures that the dirty blocks form a
+ * consistent image and excludes future dirtying while IO is in flight.
  *
 * Presence in the dirty tree holds a reference. Blocks are only
 * removed from the tree which drops the ref when IO completes.
@@ -291,38 +343,49 @@ static void block_write_end_io(struct buffer_head *bh, int uptodate)
 int scoutfs_block_write_dirty(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
-	struct buffer_head *bh;
-	struct rb_node *node;
+	struct scoutfs_block *blocks[16];
+	struct scoutfs_block *bl;
 	struct blk_plug plug;
 	unsigned long flags;
+	u64 blkno;
 	int ret;
+	int nr;
+	int i;
 
 	atomic_set(&sbi->block_writes, 1);
 	sbi->block_write_err = 0;
+	blkno = 0;
+	ret = 0;
 
 	blk_start_plug(&plug);
 
-	spin_lock_irqsave(&sbi->block_lock, flags);
-	node = rb_first(&sbi->block_dirty_tree);
-	while(node) {
-		bh = bh_from_bhp_node(node);
-		node = rb_next(node);
+	do {
+		/* get refs to a bunch of dirty blocks */
+		spin_lock_irqsave(&sbi->block_lock, flags);
+		nr = radix_tree_gang_lookup_tag(&sbi->block_radix,
+						(void **)blocks, blkno,
+						ARRAY_SIZE(blocks),
+						DIRTY_RADIX_TAG);
+		if (nr > 0)
+			blkno = blocks[nr - 1]->blkno + 1;
+		for (i = 0; i < nr; i++)
+			atomic_inc(&blocks[i]->refcount);
+		spin_unlock_irqrestore(&sbi->block_lock, flags);
 
-		atomic_inc(&sbi->block_writes);
-		scoutfs_block_set_crc((void *)bh);
+		/* submit them in order, being careful to put all on err */
+		for (i = 0; i < nr; i++) {
+			bl = blocks[i];
 
-		lock_buffer(bh);
-
-		bh->b_end_io = block_write_end_io;
-		ret = submit_bh(WRITE, bh); /* doesn't actually fail? */
-
-		spin_lock_irqsave(&sbi->block_lock, flags);
-		if (ret)
-			break;
-	}
-	spin_unlock_irqrestore(&sbi->block_lock, flags);
+			if (ret == 0) {
+				scoutfs_block_set_crc(bl);
+				atomic_inc(&sbi->block_writes);
+				ret = block_submit_bio(bl, WRITE);
+				if (ret)
+					atomic_dec(&sbi->block_writes);
+			}
+			scoutfs_block_put(bl);
+		}
+	} while (nr && !ret);
 
 	blk_finish_plug(&plug);
 
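One subtlety in the rewritten writeback loop above is the resume cursor: each gang lookup starts just past the last block of the previous batch, so the loop walks every tagged block in blkno order without holding the lock across submissions. The skeleton, pulled out of the hunk as an illustration only:

	unsigned long next = 0;
	struct scoutfs_block *batch[16];
	int nr;

	do {
		/* grab up to 16 dirty-tagged blocks starting at 'next' */
		nr = radix_tree_gang_lookup_tag(&sbi->block_radix,
						(void **)batch, next,
						ARRAY_SIZE(batch),
						DIRTY_RADIX_TAG);
		if (nr > 0)
			next = batch[nr - 1]->blkno + 1; /* resume past batch */
		/* ... take refs under block_lock, drop it, submit batch ... */
	} while (nr);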
@@ -330,18 +393,29 @@ int scoutfs_block_write_dirty(struct super_block *sb)
 	atomic_dec(&sbi->block_writes);
 	wait_event(sbi->block_wq, atomic_read(&sbi->block_writes) == 0);
 
-	trace_printk("ret %d\n", ret);
-	return ret;
+	return ret ?: sbi->block_write_err;
 }
 
 /*
- * The caller knows that it's not racing with writers.
+ * XXX This is a gross hack for writing the super. It doesn't have
+ * per-block write completion indication. It knows that it's the only
+ * thing that will be writing.
 */
-int scoutfs_block_has_dirty(struct super_block *sb)
+int scoutfs_block_write_sync(struct scoutfs_block *bl)
 {
-	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb);
+	int ret;
 
-	return !RB_EMPTY_ROOT(&sbi->block_dirty_tree);
+	BUG_ON(atomic_read(&sbi->block_writes) != 0);
+
+	atomic_inc(&sbi->block_writes);
+	ret = block_submit_bio(bl, WRITE);
+	if (ret)
+		atomic_dec(&sbi->block_writes);
+	else
+		wait_event(sbi->block_wq, atomic_read(&sbi->block_writes) == 0);
+
+	return ret ?: sbi->block_write_err;
 }
 
 /*
@@ -418,37 +492,64 @@ out:
  * Return a dirty metadata block with an updated block header to match
  * the current dirty seq. Callers are responsible for serializing
  * access to the block and for zeroing unwritten block contents.
+ *
+ * Always allocating a new block and replacing any old cached block
+ * serves a very specific purpose. We can have an unlocked reader
+ * traversing stable structures actively using a clean block while a
+ * writer gets that same blkno from the allocator and starts modifying
+ * it. By always allocating a new block we let the reader continue
+ * safely using their old immutable block while the writer works on the
+ * newly allocated block. The old stable block will be freed once the
+ * reader drops their reference.
 */
 struct scoutfs_block *scoutfs_block_dirty(struct super_block *sb, u64 blkno)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_block_header *hdr;
-	struct buffer_head *bh;
+	struct scoutfs_block *found;
+	struct scoutfs_block *bl;
+	unsigned long flags;
 	int ret;
 
-	bh = sb_getblk(sb, blkno);
-	if (!bh) {
-		bh = ERR_PTR(-ENOMEM);
+	/* allocate a new block and try to insert it */
+	bl = alloc_block(sb, blkno);
+	if (!bl) {
+		ret = -EIO;
 		goto out;
 	}
 
-	ret = insert_bhp(sb, bh);
-	if (ret < 0) {
-		scoutfs_block_put((void *)bh);
-		bh = ERR_PTR(ret);
-		goto out;
-	}
+	set_bit(BLOCK_BIT_UPTODATE, &bl->bits);
 
-	hdr = scoutfs_block_data((void *)bh);
+	ret = radix_tree_preload(GFP_NOFS);
+	if (ret)
+		goto out;
+
+	hdr = bl->data;
 	*hdr = sbi->super.hdr;
 	hdr->blkno = cpu_to_le64(blkno);
 	hdr->seq = sbi->super.hdr.seq;
 
-	set_buffer_uptodate(bh);
-	set_buffer_scoutfs_verified(bh);
+	spin_lock_irqsave(&sbi->block_lock, flags);
+	found = radix_tree_lookup(&sbi->block_radix, blkno);
+	if (found) {
+		radix_tree_delete(&sbi->block_radix, blkno);
+		scoutfs_block_put(found);
+	}
+
+	radix_tree_insert(&sbi->block_radix, blkno, bl);
+	radix_tree_tag_set(&sbi->block_radix, blkno, DIRTY_RADIX_TAG);
+	atomic_inc(&bl->refcount);
+	spin_unlock_irqrestore(&sbi->block_lock, flags);
+
+	radix_tree_preload_end();
+	ret = 0;
 out:
-	return (void *)bh;
+	if (ret) {
+		scoutfs_block_put(bl);
+		bl = ERR_PTR(ret);
+	}
+
+	return bl;
 }
 
 /*
@@ -476,29 +577,6 @@ struct scoutfs_block *scoutfs_block_dirty_alloc(struct super_block *sb)
 	return bl;
 }
 
-/*
- * Make sure that we don't have a dirty block at the given blkno. If we
- * do we remove it from our tree of dirty blocks and clear the buffer
- * dirty bit.
- *
- * XXX for now callers have only needed to forget blknos, maybe they'll
- * have the bh some day.
- */
-void scoutfs_block_forget(struct super_block *sb, u64 blkno)
-{
-	struct block_bh_private *bhp;
-	struct buffer_head *bh;
-
-	bh = sb_find_get_block(sb, blkno);
-	if (bh) {
-		bhp = bh->b_private;
-		if (bhp) {
-			erase_bhp(bh);
-			bforget(bh);
-		}
-	}
-}
-
 void scoutfs_block_set_crc(struct scoutfs_block *bl)
 {
 	struct scoutfs_block_header *hdr = scoutfs_block_data(bl);
@@ -531,46 +609,41 @@ void scoutfs_block_zero_from(struct scoutfs_block *bl, void *ptr)
 void scoutfs_block_set_lock_class(struct scoutfs_block *bl,
 				  struct lock_class_key *class)
 {
-	struct buffer_head *bh = (void *)bl;
-	struct block_bh_private *bhp = bh->b_private;
-
-	if (bhp && !bhp->rwsem_class) {
-		lockdep_set_class(&bhp->rwsem, class);
-		bhp->rwsem_class = true;
+	if (!test_bit(BLOCK_BIT_CLASS_SET, &bl->bits)) {
+		lockdep_set_class(&bl->rwsem, class);
+		set_bit(BLOCK_BIT_CLASS_SET, &bl->bits);
 	}
 }
 
 void scoutfs_block_lock(struct scoutfs_block *bl, bool write, int subclass)
 {
-	struct buffer_head *bh = (void *)bl;
-	struct block_bh_private *bhp = bh->b_private;
+	struct scoutfs_block_header *hdr = scoutfs_block_data(bl);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb);
 
-	if (bhp) {
+	if (hdr->seq == sbi->super.hdr.seq) {
 		if (write)
-			down_write_nested(&bhp->rwsem, subclass);
+			down_write_nested(&bl->rwsem, subclass);
 		else
-			down_read_nested(&bhp->rwsem, subclass);
+			down_read_nested(&bl->rwsem, subclass);
 	}
 }
 
 void scoutfs_block_unlock(struct scoutfs_block *bl, bool write)
 {
-	struct buffer_head *bh = (void *)bl;
-	struct block_bh_private *bhp = bh->b_private;
+	struct scoutfs_block_header *hdr = scoutfs_block_data(bl);
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(bl->sb);
 
-	if (bhp) {
+	if (hdr->seq == sbi->super.hdr.seq) {
 		if (write)
-			up_write(&bhp->rwsem);
+			up_write(&bl->rwsem);
 		else
-			up_read(&bhp->rwsem);
+			up_read(&bl->rwsem);
 	}
 }
 
 void *scoutfs_block_data(struct scoutfs_block *bl)
 {
-	struct buffer_head *bh = (void *)bl;
-
-	return (void *)bh->b_data;
+	return bl->data;
 }
 
 void *scoutfs_block_data_from_contents(const void *ptr)
@@ -580,10 +653,23 @@ void *scoutfs_block_data_from_contents(const void *ptr)
 	return (void *)(addr & ~((unsigned long)SCOUTFS_BLOCK_MASK));
 }
 
-void scoutfs_block_put(struct scoutfs_block *bl)
+void scoutfs_block_destroy(struct super_block *sb)
 {
-	struct buffer_head *bh = (void *)bl;
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_block *blocks[16];
+	struct scoutfs_block *bl;
+	unsigned long blkno = 0;
+	int nr;
+	int i;
 
-	if (!IS_ERR_OR_NULL(bh))
-		brelse(bh);
+	do {
+		nr = radix_tree_gang_lookup(&sbi->block_radix, (void **)blocks,
+					    blkno, ARRAY_SIZE(blocks));
+		for (i = 0; i < nr; i++) {
+			bl = blocks[i];
+			radix_tree_delete(&sbi->block_radix, bl->blkno);
+			blkno = bl->blkno + 1;
+			scoutfs_block_put(bl);
+		}
+	} while (nr);
 }
 
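Putting the block.c pieces together, a hypothetical caller inside a transaction would use the new API roughly like this (editor's sketch; scoutfs_block_dirty(), scoutfs_block_data(), scoutfs_block_zero() and scoutfs_block_put() come from the hunks above, while the surrounding function and blkno are invented):

static int example_modify_block(struct super_block *sb, u64 blkno)
{
	struct scoutfs_block *bl;
	struct scoutfs_block_header *hdr;

	/* always returns a freshly allocated block inserted in the cache */
	bl = scoutfs_block_dirty(sb, blkno);
	if (IS_ERR(bl))
		return PTR_ERR(bl);

	hdr = scoutfs_block_data(bl);
	/* callers zero unwritten contents past what they initialize */
	scoutfs_block_zero(bl, sizeof(*hdr));

	/* ... modify block contents under the caller's serialization ... */

	scoutfs_block_put(bl);
	return 0;
}

The cache's reference and the dirty radix tag keep the block around for the next scoutfs_block_write_dirty() pass; the final put only drops the caller's own reference.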
kmod/src/block.h

@@ -16,6 +16,7 @@ struct scoutfs_block *scoutfs_block_dirty_ref(struct super_block *sb,
 
 int scoutfs_block_has_dirty(struct super_block *sb);
 int scoutfs_block_write_dirty(struct super_block *sb);
+int scoutfs_block_write_sync(struct scoutfs_block *bl);
 
 void scoutfs_block_set_crc(struct scoutfs_block *bl);
 void scoutfs_block_zero(struct scoutfs_block *bl, size_t off);
@@ -26,10 +27,10 @@ void scoutfs_block_set_lock_class(struct scoutfs_block *bl,
 void scoutfs_block_lock(struct scoutfs_block *bl, bool write, int subclass);
 void scoutfs_block_unlock(struct scoutfs_block *bl, bool write);
 
-void scoutfs_block_forget(struct super_block *sb, u64 blkno);
-
 void *scoutfs_block_data(struct scoutfs_block *bl);
 void *scoutfs_block_data_from_contents(const void *ptr);
 void scoutfs_block_put(struct scoutfs_block *bl);
 
+void scoutfs_block_destroy(struct super_block *sb);
+
 #endif
kmod/src/super.c

@@ -15,7 +15,6 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/magic.h>
-#include <linux/buffer_head.h>
 #include <linux/random.h>
 #include <linux/statfs.h>
 
@@ -109,23 +108,19 @@ int scoutfs_write_dirty_super(struct super_block *sb)
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super;
 	struct scoutfs_block *bl;
-	struct buffer_head *bh;
 	int ret;
 
-	/* XXX hack is immediately repaired in the coming patches */
-	bh = sb_getblk(sb, le64_to_cpu(sbi->super.hdr.blkno));
-	if (!bh)
-		return -ENOMEM;
-	bl = (void *)bh;
+	/* XXX prealloc? */
+	bl = scoutfs_block_dirty(sb, le64_to_cpu(sbi->super.hdr.blkno));
+	if (WARN_ON_ONCE(IS_ERR(bl)))
+		return PTR_ERR(bl);
 	super = scoutfs_block_data(bl);
 
-	*super = sbi->super;
-	scoutfs_block_zero(bl, sizeof(struct scoutfs_super_block));
+	memcpy(super, &sbi->super, sizeof(*super));
+	scoutfs_block_zero(bl, sizeof(*super));
 	scoutfs_block_set_crc(bl);
 
-	mark_buffer_dirty(bh);
-	ret = sync_dirty_buffer(bh);
-
+	ret = scoutfs_block_write_sync(bl);
 	scoutfs_block_put(bl);
 	return ret;
 }
@@ -193,7 +188,8 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	spin_lock_init(&sbi->next_ino_lock);
 	spin_lock_init(&sbi->block_lock);
-	sbi->block_dirty_tree = RB_ROOT;
+	/* radix only inserted with NOFS _preload */
+	INIT_RADIX_TREE(&sbi->block_radix, GFP_ATOMIC);
 	init_waitqueue_head(&sbi->block_wq);
 	atomic_set(&sbi->block_writes, 0);
 	init_rwsem(&sbi->btree_rwsem);
@@ -204,11 +200,6 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
 	init_waitqueue_head(&sbi->trans_write_wq);
 	spin_lock_init(&sbi->file_alloc_lock);
 
-	if (!sb_set_blocksize(sb, SCOUTFS_BLOCK_SIZE)) {
-		printk(KERN_ERR "couldn't set blocksize\n");
-		return -EINVAL;
-	}
-
 	/* XXX can have multiple mounts of a device, need mount id */
 	sbi->kset = kset_create_and_add(sb->s_id, NULL, &scoutfs_kset->kobj);
 	if (!sbi->kset)
@@ -250,12 +241,10 @@ static void scoutfs_kill_sb(struct super_block *sb)
 	if (sbi) {
 		scoutfs_shutdown_trans(sb);
 		scoutfs_buddy_destroy(sb);
+		scoutfs_block_destroy(sb);
 		scoutfs_destroy_counters(sb);
 		if (sbi->kset)
 			kset_unregister(sbi->kset);
-
-		/* XXX write errors can leave dirty blocks */
-		WARN_ON_ONCE(!RB_EMPTY_ROOT(&sbi->block_dirty_tree));
 		kfree(sbi);
 	}
 }
kmod/src/super.h

@@ -19,7 +19,7 @@ struct scoutfs_sb_info {
 	spinlock_t next_ino_lock;
 
 	spinlock_t block_lock;
-	struct rb_root block_dirty_tree;
+	struct radix_tree_root block_radix;
 	wait_queue_head_t block_wq;
 	atomic_t block_writes;
 	int block_write_err;