From 6afeb978028e1b6efcac93d6af945ee201ba04e7 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Mon, 1 May 2017 13:57:59 -0700
Subject: [PATCH] scoutfs: reference file data with extent items

Our first attempt at storing file data put it in items.  This was easy
to implement but won't be acceptable in the long term.  The cost of the
power of LSM indexing is compaction overhead.  That's acceptable for
fine grained metadata but is totally unacceptable for bulk file data.

This switches to storing file data in separate block allocations which
are referenced by extent items.  The bulk of the change is the
mechanics of working with extents.  We have high level callers which
add or remove logical extents and then underlying mechanisms that
insert, merge, or split the items that the extents are stored in.

We have three types of extent items.  The primary type maps logical
file regions to physical block extents.  The next two store free
extents per-node so that clients don't create lock and LSM contention
as they try to allocate extents.  To fill those per-node free extents
we add messages that communicate free extents in the form of lists of
segment allocations from the server.

We don't do any fancy multi-block allocation yet.  We only allocate
blocks in get_blocks as writes find unmapped blocks.  We do use some
per-task cursors to cache block allocation positions so that these
single block allocations are very likely to merge into larger extents
as tasks stream writes.

This is just the first chunk of the extent work that's coming.  A later
patch adds offline flags and fixes up the change nonsense that seemed
like a good idea here.

The final moving part is that we initiate writeback on all newly
allocated extents before we commit the metadata that references the new
blocks.  We do this with our own dirty inode tracking because the high
level vfs methods are unusably slow in some upstream kernels (they walk
all inodes, not just dirty inodes).

Signed-off-by: Zach Brown
---
 kmod/src/data.c   | 1278 ++++++++++++++++++++++++++++++---------
 kmod/src/data.h   |    1 -
 kmod/src/format.h |   34 +-
 kmod/src/inode.c  |  158 +++++-
 kmod/src/inode.h  |    4 +
 kmod/src/net.c    |  164 ++++++
 kmod/src/net.h    |    1 +
 kmod/src/super.c  |    6 +
 kmod/src/super.h  |    6 +-
 kmod/src/trans.c  |    8 +-
 10 files changed, 1224 insertions(+), 436 deletions(-)

diff --git a/kmod/src/data.c b/kmod/src/data.c
index 61547c2e..8533a3c5 100644
--- a/kmod/src/data.c
+++ b/kmod/src/data.c
@@ -14,8 +14,12 @@
 #include
 #include
 #include
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include

 #include "format.h"
 #include "super.h"
@@ -27,354 +31,844 @@
 #include "scoutfs_trace.h"
 #include "item.h"
 #include "ioctl.h"
+#include "net.h"

 /*
- * scoutfs stores data in items that can be up to the small 4K block
- * size.  The page cache address space callbacks work with the item
- * cache.  Each OS page can be stored in multiple of our smaller fixed
- * size items.  The code doesn't understand OS pages that are smaller
- * than our block size.
+ * scoutfs uses extent records to reference file data.
  *
- * readpage does a blocking read of the item and then copies its
- * contents into the page.  Since the segments are huge we sort of get
- * limited read-ahead by reading in segments at a time.
+ * The extent items map logical file regions to device blocks at 4K
+ * block granularity.  File data isn't overwritten so that overwriting
+ * doesn't generate extent item locking and modification.
  *
- * Writing is quite a bit more fiddly.  We want to pack small files.
- * The item cache and transactions want to accurately track the size of
- * dirty items to fill the next segment.  And we would like to minimize
- * cpu copying as much as we can.
+ * Nodes have their own free extent items stored at their node id to
+ * avoid lock contention during allocation and freeing.  These pools are
+ * filled and drained with RPCs to the server, which allocates blocks in
+ * segment-sized regions.
  *
- * This simplest first pass creates dirty items as pages are dirtied
- * whose values reference the page contents.  They're freed after
- * they're written to the segment so that we don't have to worry about
- * items that reference clean pages.  Invalidatepage forgets any items
- * if a dirty page is truncated away.
+ * Block allocation maintains a fixed number of allocation cursors that
+ * remember the position of tasks within free regions.  This is very
+ * simple and maintains decent extents for simple streaming writes.  It
+ * eventually won't be good enough and we'll spend complexity on
+ * delalloc but we want to put that off as long as possible.
  *
- * Writeback is built around all the dirty items being written by a
- * commit.  This can happen naturally in the backgroud.  Or writepage
- * can initiate it to start by kicking the commit thread.  In either
- * case our dirty pages are "in writeback" by being put on a list that
- * is walked by the end of the commit.  Because writes and page dirtying
- * are serialized with the commit we know that there can be no dirty
- * pages after the commit and we can mark writeback complete on all the
- * pages that started writeback before the commit finished.  motivate
- * having items in the item cache while there are dirty pages.
+ * There are no unwritten extents.  As we dirty file data pages, possibly
+ * allocating extents for the first time, we track their inodes.  Before
+ * we commit dirty metadata we write out all tracked inodes.  This
+ * ensures that data is persistent before the metadata that references
+ * it is usable.
  *
- * Data is copied from the dirty page contents into the segment pages
- * for writing.  This lets us easily pack small files without worrying
- * about DMA alignment and avoids the stable page problem of the page
- * being modified after the cpu calculates the checksum but before the
- * DMA reads to the device.
+ * Weirdly, the extents are indexed by the *final* logical block and
+ * blkno of the extent.  This lets us search for neighbouring previous
+ * extents with a _next() call and avoids having to implement item
+ * reading that iterates backwards through the manifest and segments.
+ *
+ * There are two items that track free extents, one indexed by the block
+ * location of the free extent and one indexed by the size of the free
+ * region.  This means that one allocation can update a great number of
+ * items throughout the tree as file extents and both kinds of free
+ * extents split and merge.  The code goes to great lengths to stage
+ * these updates so that it can always unwind and return errors without
+ * leaving the items inconsistent.
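To make the "indexed by the final block" point above concrete: because a file extent is keyed by its last logical block, a next-key search starting from any block inside the extent lands on the item that covers it.  A minimal userspace sketch of that lookup follows; it is an illustration only, not part of the patch, and uses uint64_t stand-ins for the kernel key types (no endian swapping, no item calls).

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* simplified stand-in for the file extent fields */
struct ext {
	uint64_t blk_off;
	uint64_t blkno;
	uint64_t blocks;
};

/* items sorted by their key, last_blk_off = blk_off + blocks - 1 */
static const struct ext extents[] = {
	{ .blk_off = 0,   .blkno = 1000, .blocks = 8 },  /* last_blk_off 7 */
	{ .blk_off = 100, .blkno = 2000, .blocks = 16 }, /* last_blk_off 115 */
};

/* "next" search: first item whose last_blk_off is >= iblock */
static const struct ext *next_item(uint64_t iblock)
{
	for (size_t i = 0; i < sizeof(extents) / sizeof(extents[0]); i++) {
		if (extents[i].blk_off + extents[i].blocks - 1 >= iblock)
			return &extents[i];
	}
	return NULL;
}

int main(void)
{
	uint64_t iblock = 104;
	const struct ext *e = next_item(iblock);

	/* block 104 is inside the second extent, offset 4 into it */
	if (e && iblock >= e->blk_off)
		printf("block %llu maps to blkno %llu\n",
		       (unsigned long long)iblock,
		       (unsigned long long)(e->blkno + iblock - e->blk_off));
	else
		printf("block %llu is a hole\n", (unsigned long long)iblock);
	return 0;
}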
* * XXX * - truncate * - mmap * - better io error propagation - * - async readpages for more concurrent readahead * - forced unmount with dirty data * - direct IO - * - probably stitch page vecs into block struct page fragments for bios - * - maybe cut segment boundaries on aligned data offsets - * - maybe decouple metadata and data segment writes */ struct data_info { - struct llist_head writeback_pages; + struct rw_semaphore alloc_rwsem; + u64 next_large_blkno; + struct rhashtable cursors; + struct list_head cursor_lru; }; #define DECLARE_DATA_INFO(sb, name) \ struct data_info *name = SCOUTFS_SB(sb)->data_info -/* - * trace_printk() doesn't support %c? - * - * 1 - 1ocked - * a - uptodAte - * d - Dirty - * b - writeBack - * e - Error - */ -#define page_hexflag(page, name, val, shift) \ - (Page##name(page) ? (val << (shift * 4)) : 0) - -#define page_hexflags(page) \ - (page_hexflag(page, Locked, 0x1, 4) | \ - page_hexflag(page, Uptodate, 0xa, 3) | \ - page_hexflag(page, Dirty, 0xd, 2) | \ - page_hexflag(page, Writeback, 0xb, 1) | \ - page_hexflag(page, Error, 0xe, 0)) - -#define PGF "page %p [index %lu flags %x]" -#define PGA(page) \ - (page), (page)->index, page_hexflags(page) \ - -#define BHF "bh %p [blocknr %llu size %zu state %lx]" -#define BHA(bh) \ - (bh), (u64)(bh)->b_blocknr, (bh)->b_size, (bh)->b_state \ - -static void init_data_key(struct scoutfs_key_buf *key, - struct scoutfs_data_key *dkey, u64 ino, u64 block) -{ - dkey->type = SCOUTFS_DATA_KEY; - dkey->ino = cpu_to_be64(ino); - dkey->block = cpu_to_be64(block); - - scoutfs_key_init(key, dkey, sizeof(struct scoutfs_data_key)); -} +/* more than enough for a few tasks per core on moderate hardware */ +#define NR_CURSORS 4096 /* - * Delete the data block items in the given region. + * This is the size of extents that are tracked by a cursor and so end + * up being the largest file item extent length given concurrent + * streaming writes. * - * This is the low level extent item truncate code. Callers manage - * higher order truncation and orphan cleanup. - * - * XXX - * - restore support for releasing data. - * - for final unlink this would be better as a range deletion - * - probably don't want to read items to find them for removal + * XXX We probably want this to be a bit larger to further reduce the + * amount of item churn involved in truncating tremendous files. 
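To put rough numbers behind the item churn concern above: a streaming writer leaves behind roughly one file extent item per cursor-sized extent, and truncate later has that many items to remove.  The back-of-the-envelope sketch below is an illustration only; the sizes are assumed values (treating a segment as 256 4KB blocks) rather than constants taken from the format.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* assumed illustrative numbers: a 4GB file of 4KB blocks */
	uint64_t file_blocks = 1ULL << 20;
	/* candidate cursor extent sizes in blocks (256 ~= one 1MB segment) */
	uint64_t sizes[] = { 256, 1024, 4096 };
	int i;

	for (i = 0; i < 3; i++) {
		uint64_t items = (file_blocks + sizes[i] - 1) / sizes[i];

		printf("cursor extent %llu blocks -> ~%llu file extent items\n",
		       (unsigned long long)sizes[i],
		       (unsigned long long)items);
	}
	return 0;
}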
*/ -int scoutfs_data_truncate_items(struct super_block *sb, u64 ino, u64 iblock, - u64 len, bool offline) -{ - struct scoutfs_data_key last_dkey; - struct scoutfs_data_key dkey; - struct scoutfs_key_buf last; - struct scoutfs_key_buf key; - int ret; +#define LARGE_EXTENT_BLOCKS SCOUTFS_SEGMENT_BLOCKS - trace_printk("iblock %llu len %llu offline %u\n", - iblock, len, offline); +struct cursor_id { + struct task_struct *task; + pid_t pid; +} __packed; /* rhashtable_lookup() always memcmp()s, avoid padding */ - if (WARN_ON_ONCE(iblock + len <= iblock) || - WARN_ON_ONCE(offline)) - return -EINVAL; - - init_data_key(&key, &dkey, ino, iblock); - init_data_key(&last, &last_dkey, ino, iblock + len - 1); - - for (;;) { - ret = scoutfs_item_next(sb, &key, &last, NULL); - if (ret < 0) { - if (ret == -ENOENT) - ret = 0; - break; - } - - /* XXX would set offline bit items here */ - - ret = scoutfs_item_delete(sb, &key); - if (ret) - break; - } - - return ret; -} - -static inline struct page *page_from_llist_node(struct llist_node *node) -{ - BUILD_BUG_ON(member_sizeof(struct page, private) != - sizeof(struct llist_node)); - - return container_of((void *)node, struct page, private); -} - -static inline struct llist_node *llist_node_from_page(struct page *page) -{ - return (void *)&page->private; -} - -static inline void page_llist_add(struct page *page, struct llist_head *head) -{ - llist_add(llist_node_from_page(page), head); -} +struct task_cursor { + u64 blkno; + u64 blocks; + struct rhash_head hash_head; + struct list_head list_head; + struct cursor_id id; +}; /* - * The transaction has committed so there are no more dirty items. End - * writeback on all the dirty pages that started writeback before the - * commit finished. The commit doesn't start until all holders which - * could dirty are released so there couldn't have been new dirty pages - * and writeback entries while the commit was in flight. + * Both file extent and free extent keys are converted into this native + * form for manipulation. The free extents set blk_off to blkno. 
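Since both on-disk key forms store the *last* block and the block count, converting between the key form and this native form is just the arithmetic below.  The round-trip sketch is an illustration only, using plain C types instead of the __be64 key fields, so byte swapping is left out.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* native form used in memory */
struct nat_ext {
	uint64_t blk_off;
	uint64_t blkno;
	uint64_t blocks;
};

/* simplified key form: indexed by the *last* block of the extent */
struct key_ext {
	uint64_t last_blk_off;
	uint64_t last_blkno;
	uint64_t blocks;
};

static struct key_ext to_key(struct nat_ext e)
{
	return (struct key_ext) {
		.last_blk_off = e.blk_off + e.blocks - 1,
		.last_blkno = e.blkno + e.blocks - 1,
		.blocks = e.blocks,
	};
}

static struct nat_ext from_key(struct key_ext k)
{
	return (struct nat_ext) {
		.blk_off = k.last_blk_off - k.blocks + 1,
		.blkno = k.last_blkno - k.blocks + 1,
		.blocks = k.blocks,
	};
}

int main(void)
{
	struct nat_ext e = { .blk_off = 100, .blkno = 5000, .blocks = 16 };
	struct nat_ext r = from_key(to_key(e));

	assert(r.blk_off == e.blk_off && r.blkno == e.blkno &&
	       r.blocks == e.blocks);
	printf("round trip ok\n");
	return 0;
}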
*/ -void scoutfs_data_end_writeback(struct super_block *sb, int err) +struct native_extent { + u64 blk_off; + u64 blkno; + u64 blocks; +}; + +/* These are stored in a (type==0) terminated array on caller's stacks */ +struct extent_change { + struct native_extent ext; + u64 arg; + unsigned ins:1, + type; +}; + +/* insert file extent + remove both blkno and blocks extents + 0 term */ +#define MAX_CHANGES (3 + 3 + 3 + 1) + +/* XXX avoiding dynamic on-stack array initializers :/ */ +union extent_key_union { + struct scoutfs_file_extent_key file; + struct scoutfs_free_extent_blkno_key blkno; + struct scoutfs_free_extent_blocks_key blocks; +} __packed; +#define MAX_KEY_BYTES sizeof(union extent_key_union) + +static void init_file_extent_key(struct scoutfs_key_buf *key, void *key_bytes, + struct native_extent *ext, u64 arg) { - DECLARE_DATA_INFO(sb, datinf); - struct llist_node *node; - struct page *page; + struct scoutfs_file_extent_key *fkey = key_bytes; - /* XXX haven't thought about errors here */ - BUG_ON(err); + fkey->type = SCOUTFS_FILE_EXTENT_KEY; + fkey->ino = cpu_to_be64(arg); + fkey->last_blk_off = cpu_to_be64(ext->blk_off + ext->blocks - 1); + fkey->last_blkno = cpu_to_be64(ext->blkno + ext->blocks - 1); + fkey->blocks = cpu_to_be64(ext->blocks); - node = llist_del_all(&datinf->writeback_pages); - - while (node) { - page = page_from_llist_node(node); - node = llist_next(node); - - trace_printk("ending writeback "PGF"\n", PGA(page)); - scoutfs_inc_counter(sb, data_end_writeback_page); - - - set_page_private(page, 0); - end_page_writeback(page); - page_cache_release(page); - } + scoutfs_key_init(key, fkey, sizeof(struct scoutfs_file_extent_key)); } -#define for_each_page_block(page, start, loff, block, key, dkey, val) \ - for (start = 0; \ - start < PAGE_CACHE_SIZE && \ - (loff = ((loff_t)page->index << PAGE_CACHE_SHIFT) + start, \ - block = loff >> SCOUTFS_BLOCK_SHIFT, \ - init_data_key(&key, &dkey, \ - scoutfs_ino(page->mapping->host), block), \ - scoutfs_kvec_init(val, page_address(page) + start, \ - SCOUTFS_BLOCK_SIZE), \ - 1); \ - start += SCOUTFS_BLOCK_SIZE) +#define INIT_FREE_EXTENT_KEY(which_type, key, key_bytes, ext, arg, type) \ +do { \ + struct which_type *fkey = key_bytes; \ + \ + fkey->type = type; \ + fkey->node_id = cpu_to_be64(arg); \ + fkey->last_blkno = cpu_to_be64(ext->blkno + ext->blocks - 1); \ + fkey->blocks = cpu_to_be64(ext->blocks); \ + \ + scoutfs_key_init(key, fkey, sizeof(struct which_type)); \ +} while (0) -/* - * Copy the contents of each item that makes up the page into their - * regions of the page, zeroing any page contents not covered by items. - * - * This is the simplest loop that looks up every possible block. We - * could instead have a readpages() that iterates over present items and - * puts them in the pages in the batch. 
- */ -static int scoutfs_readpage(struct file *file, struct page *page) +static void init_extent_key(struct scoutfs_key_buf *key, void *key_bytes, + struct native_extent *ext, u64 arg, u8 type) { - struct inode *inode = page->mapping->host; - struct super_block *sb = inode->i_sb; - loff_t size = i_size_read(inode); - struct scoutfs_data_key dkey; - struct scoutfs_key_buf key; - SCOUTFS_DECLARE_KVEC(val); - unsigned start; - loff_t loff; - u64 block; - int ret = 0; - - - trace_printk(PGF"\n", PGA(page)); - scoutfs_inc_counter(sb, data_readpage); - - for_each_page_block(page, start, loff, block, key, dkey, val) { - /* the rest of the page is zero when block is past i_size */ - if (loff >= size) - break; - - /* copy the block item contents into the page */ - ret = scoutfs_item_lookup(sb, &key, val); - if (ret < 0) { - if (ret == -ENOENT) - ret = 0; - else - break; - } - - /* - * XXX do we need to clamp the item length by i_size? - * truncate should purge the item cache and create - * truncation range items that'd merge away old data - * items, and invalidatepage should shrink any ephemeral - * vecs. Seems like the item length should be accurate? - */ - - /* zero the tail of the block */ - if (ret < SCOUTFS_BLOCK_SIZE) - zero_user(page, start, SCOUTFS_BLOCK_SIZE - ret); - } - - /* zero any remaining tail blocks */ - if (start < PAGE_CACHE_SIZE) - zero_user(page, start, PAGE_CACHE_SIZE - start); - - if (ret == 0) - SetPageUptodate(page); + if (type == SCOUTFS_FILE_EXTENT_KEY) + init_file_extent_key(key, key_bytes, ext, arg); + else if(type == SCOUTFS_FREE_EXTENT_BLKNO_KEY) + INIT_FREE_EXTENT_KEY(scoutfs_free_extent_blkno_key, + key, key_bytes, ext, arg, type); else - SetPageError(page); + INIT_FREE_EXTENT_KEY(scoutfs_free_extent_blocks_key, + key, key_bytes, ext, arg, type); +} - trace_printk("ret %d\n", ret); - unlock_page(page); - return ret; +/* XXX could have some sanity checks */ +static void load_file_extent(struct native_extent *ext, + struct scoutfs_key_buf *key) +{ + struct scoutfs_file_extent_key *fkey = key->data; + + ext->blocks = be64_to_cpu(fkey->blocks); + ext->blk_off = be64_to_cpu(fkey->last_blk_off) - ext->blocks + 1; + ext->blkno = be64_to_cpu(fkey->last_blkno) - ext->blocks + 1; +} + +#define LOAD_FREE_EXTENT(which_type, ext, key) \ +do { \ + struct which_type *fkey = key->data; \ + \ + ext->blkno = be64_to_cpu(fkey->last_blkno) - \ + be64_to_cpu(fkey->blocks) + 1; \ + ext->blk_off = ext->blkno; \ + ext->blocks = be64_to_cpu(fkey->blocks); \ +} while (0) + +static void load_extent(struct native_extent *ext, struct scoutfs_key_buf *key) +{ + struct scoutfs_free_extent_blocks_key *fkey = key->data; + + BUILD_BUG_ON(offsetof(struct scoutfs_file_extent_key, type) != + offsetof(struct scoutfs_free_extent_blkno_key, type) || + offsetof(struct scoutfs_file_extent_key, type) != + offsetof(struct scoutfs_free_extent_blocks_key, type)); + + if (fkey->type == SCOUTFS_FILE_EXTENT_KEY) + load_file_extent(ext, key); + else if (fkey->type == SCOUTFS_FREE_EXTENT_BLKNO_KEY) + LOAD_FREE_EXTENT(scoutfs_free_extent_blkno_key, ext, key); + else + LOAD_FREE_EXTENT(scoutfs_free_extent_blocks_key, ext, key); } /* - * Start writeback on a dirty page. We always try to kick off a commit. - * Repeated calls harmlessly bounce off the thread work's pending bit. - * (we could probably test that the writeback pgaes list is empty before - * trying to kick off a commit.) - * - * We add ourselves to a list of pages that the commit will end - * writeback on once its done. 
If there's no dirty data the commit - * thread will end writeback after not doing anything. + * Merge two extents if they're adjacent. First we arrange them to + * only test their adjoining endpoints, then are careful to not reference + * fields after we've modified them. */ -static int scoutfs_writepage(struct page *page, struct writeback_control *wbc) +static int merge_extents(struct native_extent *mod, + struct native_extent *ext) { - struct inode *inode = page->mapping->host; - struct super_block *sb = inode->i_sb; - DECLARE_DATA_INFO(sb, datinf); + struct native_extent *left; + struct native_extent *right; - trace_printk(PGF"\n", PGA(page)); - scoutfs_inc_counter(sb, data_writepage); + if (mod->blk_off < ext->blk_off) { + left = mod; + right = ext; + } else { + left = ext; + right = mod; + } - BUG_ON(PageWriteback(page)); - BUG_ON(page->private != 0); - - ClearPagePrivate(page); /* invalidatepage not needed */ - set_page_writeback(page); - page_cache_get(page); - page_llist_add(page, &datinf->writeback_pages); - unlock_page(page); - scoutfs_sync_fs(sb, 0); + if (left->blk_off + left->blocks == right->blk_off && + left->blkno + left->blocks == right->blkno) { + mod->blk_off = left->blk_off; + mod->blkno = left->blkno; + mod->blocks = left->blocks + right->blocks; + return 1; + } return 0; } /* - * Truncate is invalidating part of the contents of a page. - * - * We can't return errors here so our job is not to create dirty items - * that end up executing the truncate. That's the job of higher level - * callers. Our job is to make sure that we update references to the - * page from existing ephemeral items if they already exist. + * The caller has ensured that the inner extent is entirely within + * the outer extent. Fill out the left and right regions of outter + * that don't overlap with inner. */ -static void scoutfs_invalidatepage(struct page *page, unsigned long offset) +static void trim_extents(struct native_extent *left, + struct native_extent *right, + struct native_extent *outer, + struct native_extent *inner) { - struct inode *inode = page->mapping->host; - struct super_block *sb = inode->i_sb; - struct scoutfs_data_key dkey; - struct scoutfs_key_buf key; - SCOUTFS_DECLARE_KVEC(val); - unsigned start; - loff_t loff; - u64 block; + left->blk_off = outer->blk_off; + left->blkno = outer->blkno; + left->blocks = inner->blk_off - outer->blk_off; - trace_printk(PGF"\n", PGA(page)); - scoutfs_inc_counter(sb, data_invalidatepage); + right->blk_off = inner->blk_off + inner->blocks; + right->blkno = inner->blkno + inner->blocks; + right->blocks = (outer->blk_off + outer->blocks) - right->blk_off; +} - for_each_page_block(page, start, loff, block, key, dkey, val) { - if (offset) { - /* XXX maybe integrate offset into foreach */ - /* XXX ugh, kvecs are still clumsy :) */ - if (start + SCOUTFS_BLOCK_SIZE > offset) - val[0].iov_len = offset - start; - scoutfs_item_update_ephemeral(sb, &key, val); - } else { - scoutfs_item_forget(sb, &key); - } - } +/* return true if inner is fully contained by outer */ +static bool extents_within(struct native_extent *outer, + struct native_extent *inner) +{ + u64 outer_end = outer->blk_off + outer->blocks - 1; + u64 inner_end = inner->blk_off + inner->blocks - 1; + + return outer->blk_off <= inner_end && outer_end >= inner_end; } /* - * Start modifying a page cache page. - * - * We hold the transaction for write_end's inode updates before - * acquiring the page lock. 
- * - * We give the writer the current page contents in the relatively rare - * case of writing a partial page inside i_size. write_end will zero - * any region around the write if the page isn't uptodate. + * Add a new entry to the array of changes. The _BLOCKS extent items + * exactly match the _BLKNO items but with different field order for + * searching by size. We keep them in sync by always adding a _BLOCKS + * change for every _BLKNO change. */ +static struct extent_change *append_change(struct extent_change *chg, + bool ins, struct native_extent *ext, + u64 arg, u8 type) +{ + trace_printk("appending ins %d blk_off %llu blkno %llu blocks %llu arg %llu type %u\n", + ins, ext->blk_off, ext->blkno, ext->blocks, + arg, type); + + chg->ext = *ext; + chg->arg = arg; + chg->ins = ins; + chg->type = type; + + if (type == SCOUTFS_FREE_EXTENT_BLKNO_KEY) { + chg++; + *chg = *(chg - 1); + chg->type = SCOUTFS_FREE_EXTENT_BLOCKS_KEY; + } + + return chg + 1; +} + +/* + * Find an adjacent extent in the direction of the delta. If we can + * merge with it then we modify the incoming cur extent. nei is set to + * the neighbour we found. > 0 is returned if we merged, 0 if not, and + * < 0 on error. + */ +static int try_merge(struct super_block *sb, struct native_extent *cur, + s64 delta, struct native_extent *nei, u64 arg, u8 type) +{ + u8 last_bytes[MAX_KEY_BYTES]; + u8 key_bytes[MAX_KEY_BYTES]; + struct scoutfs_key_buf last; + struct scoutfs_key_buf key; + struct native_extent ext; + int ret; + + /* short circuit prev search for common first block alloc */ + if (cur->blk_off == 0 && delta < 0) + return 0; + + trace_printk("nei %lld from blk_off %llu blkno %llu blocks %llu\n", + delta, cur->blk_off, cur->blkno, cur->blocks); + + memset(&ext, ~0, sizeof(ext)); + init_extent_key(&last, last_bytes, &ext, arg, type); + + ext.blk_off = cur->blk_off + delta; + ext.blkno = cur->blkno + delta; + ext.blocks = 1; + init_extent_key(&key, key_bytes, &ext, arg, type); + + ret = scoutfs_item_next_same(sb, &key, &last, NULL); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } + + load_extent(nei, &key); + trace_printk("found nei blk_off %llu blkno %llu blocks %llu\n", + nei->blk_off, nei->blkno, nei->blocks); + + ret = merge_extents(cur, nei); +out: + return ret; +} + +/* + * Build the changes needed to insert the given extent. The semantics + * of the extents and callers means that we should not find existing extents + * that overlap the insertion. + */ +static int record_insert_changes(struct super_block *sb, + struct extent_change *chg, + struct native_extent *caller_ins, + u64 arg, u8 type) +{ + struct native_extent ins = *caller_ins; + struct native_extent ext; + int ret; + + trace_printk("inserting arg %llu type %u blk_off %llu blkno %llu blocks %llu\n", + arg, type, ins.blk_off, ins.blkno, ins.blocks); + + /* find the end */ + while (chg->type) + chg++; + + /* find previous that might be adjacent */ + ret = try_merge(sb, &ins, -1, &ext, arg, type); + if (ret < 0) + goto out; + else if (ret > 0) + chg = append_change(chg, false, &ext, arg, type); + + /* find next that might be adjacent */ + ret = try_merge(sb, &ins, 1, &ext, arg, type); + if (ret < 0) + goto out; + else if (ret > 0) + chg = append_change(chg, false, &ext, arg, type); + + /* and insert the new extent, possibly including merged neighbours */ + chg = append_change(chg, true, &ins, arg, type); + ret = 0; +out: + return ret; +} + +/* + * Record the changes needed to remove a portion of an existing extent. 
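The merge performed on insertion via try_merge() is worth a tiny worked example.  The sketch below is an illustration only (uint64_t stand-ins, no item calls): it merges a new allocation with a logically and physically adjacent neighbour; in the real change list the neighbour's item is deleted and the combined extent is inserted.

#include <stdint.h>
#include <stdio.h>

struct ext {
	uint64_t blk_off;
	uint64_t blkno;
	uint64_t blocks;
};

/* merge b into a if the two extents are logically and physically adjacent */
static int merge(struct ext *a, const struct ext *b)
{
	const struct ext *l = (a->blk_off < b->blk_off) ? a : b;
	const struct ext *r = (l == a) ? b : a;

	if (l->blk_off + l->blocks != r->blk_off ||
	    l->blkno + l->blocks != r->blkno)
		return 0;

	a->blocks = l->blocks + r->blocks;
	a->blk_off = l->blk_off;
	a->blkno = l->blkno;
	return 1;
}

int main(void)
{
	struct ext prev = { .blk_off = 4, .blkno = 104, .blocks = 4 };
	struct ext ins  = { .blk_off = 8, .blkno = 108, .blocks = 4 };

	if (merge(&ins, &prev))
		printf("delete {4,104,4}, insert merged {%llu,%llu,%llu}\n",
		       (unsigned long long)ins.blk_off,
		       (unsigned long long)ins.blkno,
		       (unsigned long long)ins.blocks);
	return 0;
}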
+ */ +static int record_remove_changes(struct super_block *sb, + struct extent_change *chg, + struct native_extent *rem, u64 arg, + u8 type) +{ + u8 last_bytes[MAX_KEY_BYTES]; + u8 key_bytes[MAX_KEY_BYTES]; + struct scoutfs_key_buf last; + struct scoutfs_key_buf key; + struct native_extent left; + struct native_extent right; + struct native_extent outer; + int ret; + + trace_printk("removing arg %llu type %u blk_off %llu blkno %llu blocks %llu\n", + arg, type, rem->blk_off, rem->blkno, rem->blocks); + + /* find the end */ + while (chg->type) + chg++; + + memset(&outer, ~0, sizeof(outer)); + init_extent_key(&last, last_bytes, &outer, arg, type); + + /* find outer existing extent that contains removal extent */ + init_extent_key(&key, key_bytes, rem, arg, type); + ret = scoutfs_item_next_same(sb, &key, &last, NULL); + if (ret) + goto out; + + load_extent(&outer, &key); + + trace_printk("found outer blk_off %llu blkno %llu blocks %llu\n", + outer.blk_off, outer.blkno, outer.blocks); + + if (!extents_within(&outer, rem)) { + ret = -EIO; + goto out; + } + + trim_extents(&left, &right, &outer, rem); + + chg = append_change(chg, false, &outer, arg, type); + + if (left.blocks) { + trace_printk("left trim blk_off %llu blkno %llu blocks %llu\n", + left.blk_off, left.blkno, left.blocks); + chg = append_change(chg, true, &left, arg, type); + } + + if (right.blocks) { + trace_printk("right trim blk_off %llu blkno %llu blocks %llu\n", + right.blk_off, right.blkno, right.blocks); + chg = append_change(chg, true, &right, arg, type); + } + + ret = 0; +out: + if (ret) + trace_printk("ret %d\n", ret); + return ret; +} + +/* + * Any given allocation or free of a file data extent can involve both + * insertion and deletion of both file extent and free extent items. To + * make these atomic we record all the insertions and deletions that are + * performed. We first dirty the deletions, then insert, then delete. + * This lets us always safely unwind on failure. + */ +static int apply_changes(struct super_block *sb, struct extent_change *changes) +{ + u8 key_bytes[MAX_KEY_BYTES]; + struct scoutfs_key_buf key; + struct extent_change *chg; + int ret; + int err; + + for (chg = changes; chg->type; chg++) { + if (chg->ins) + continue; + + init_extent_key(&key, key_bytes, &chg->ext, chg->arg, + chg->type); + ret = scoutfs_item_dirty(sb, &key); + if (ret) + goto out; + } + + for (chg = changes; chg->type; chg++) { + if (!chg->ins) + continue; + + init_extent_key(&key, key_bytes, &chg->ext, chg->arg, + chg->type); + ret = scoutfs_item_create(sb, &key, NULL); + if (ret) { + while ((--chg) >= changes) { + if (!chg->ins) + continue; + init_extent_key(&key, key_bytes, &chg->ext, + chg->arg, chg->type); + err = scoutfs_item_delete(sb, &key); + BUG_ON(err); + } + goto out; + } + } + + for (chg = changes; chg->type; chg++) { + if (chg->ins) + continue; + + init_extent_key(&key, key_bytes, &chg->ext, chg->arg, + chg->type); + ret = scoutfs_item_delete(sb, &key); + BUG_ON(ret); + } + +out: + return ret; +} + +int scoutfs_data_truncate_items(struct super_block *sb, u64 ino, u64 iblock, + u64 len, bool offline) +{ + BUG(); /* NYI */ +} + +/* + * These cheesy cursors are only meant to encourage nice IO patterns for + * concurrent tasks either streaming large file writes or creating lots + * of small files. It will do very poorly in many other situations. To + * do better we'd need to go further down the road to delalloc and take + * more surrounding context into account. 
+ */ +static struct task_cursor *get_cursor(struct data_info *datinf) +{ + struct task_cursor *curs; + struct cursor_id id = { + .task = current, + .pid = current->pid, + }; + + curs = rhashtable_lookup(&datinf->cursors, &id); + if (!curs) { + curs = list_last_entry(&datinf->cursor_lru, + struct task_cursor, list_head); + trace_printk("resetting curs %p was task %p pid %u\n", + curs, curs->id.task, curs->id.pid); + rhashtable_remove(&datinf->cursors, &curs->hash_head, GFP_NOFS); + curs->id = id; + rhashtable_insert(&datinf->cursors, &curs->hash_head, GFP_NOFS); + curs->blkno = 0; + curs->blocks = 0; + } + + list_move(&curs->list_head, &datinf->cursor_lru); + + return curs; +} + +static int bulk_alloc(struct super_block *sb) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct extent_change changes[MAX_CHANGES]; + struct native_extent ext; + u64 *segnos = NULL; + int ret; + int i; + + segnos = scoutfs_net_bulk_alloc(sb); + if (IS_ERR(segnos)) { + ret = PTR_ERR(segnos); + goto out; + } + + for (i = 0; segnos[i]; i++) { + memset(changes, 0, sizeof(changes)); + + /* merge or set this one */ + if (i > 0 && (segnos[i] == segnos[i - 1] + 1)) { + ext.blocks += SCOUTFS_SEGMENT_BLOCKS; + trace_printk("merged segno [%u] %llu blocks %llu\n", + i, segnos[i], ext.blocks); + } else { + ext.blkno = segnos[i] << SCOUTFS_SEGMENT_BLOCK_SHIFT; + ext.blocks = SCOUTFS_SEGMENT_BLOCKS; + trace_printk("set extent segno [%u] %llu blkno %llu\n", + i, segnos[i], ext.blkno); + } + + /* don't write if we merge with the next one */ + if ((segnos[i] + 1) == segnos[i + 1]) + continue; + + trace_printk("inserting extent [%u] blkno %llu blocks %llu\n", + i, ext.blkno, ext.blocks); + + ext.blk_off = ext.blkno; + ret = record_insert_changes(sb, changes, &ext, sbi->node_id, + SCOUTFS_FREE_EXTENT_BLKNO_KEY) ?: + apply_changes(sb, changes); + /* XXX error here leaks segnos */ + if (ret) + break; + } + +out: + if (!IS_ERR_OR_NULL(segnos)) + kfree(segnos); + + return ret; +} + +/* + * Allocate a single block for the logical block offset in the file. + * + * We try to merge single block allocations into large extents by using + * per-task cursors. Each cursor tracks a block region that should be + * searched for free extents. If we don't have a cursor, or we find + * free space outside of our cursor, then we look for the next large + * free extent. 
+ */ +static int allocate_block(struct inode *inode, sector_t iblock, u64 *blkno) +{ + struct super_block *sb = inode->i_sb; + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + DECLARE_DATA_INFO(sb, datinf); + struct extent_change changes[MAX_CHANGES] = {{{0,}}}; + u8 last_bytes[MAX_KEY_BYTES]; + u8 key_bytes[MAX_KEY_BYTES]; + struct scoutfs_key_buf last; + struct scoutfs_key_buf key; + struct native_extent last_ext; + struct native_extent found; + struct native_extent ext; + struct task_cursor *curs; + bool alloced = false; + u8 type; + int ret; + + memset(&last_ext, ~0, sizeof(last_ext)); + + down_write(&datinf->alloc_rwsem); + + curs = get_cursor(datinf); + + /* start from the cursor or look for the next large extent */ +reset_cursor: + if (curs->blocks) { + ext.blkno = curs->blkno; + ext.blocks = 0; + type = SCOUTFS_FREE_EXTENT_BLKNO_KEY; + } else { + ext.blkno = datinf->next_large_blkno; + ext.blocks = LARGE_EXTENT_BLOCKS; + type = SCOUTFS_FREE_EXTENT_BLOCKS_KEY; + } + +retry: + trace_printk("searching %llu,%llu curs %p task %p pid %u %llu,%llu\n", + ext.blkno, ext.blocks, curs, curs->id.task, curs->id.pid, + curs->blkno, curs->blocks); + + ext.blk_off = ext.blkno; + init_extent_key(&key, key_bytes, &ext, sbi->node_id, type); + init_extent_key(&last, last_bytes, &last_ext, sbi->node_id, type); + + ret = scoutfs_item_next_same(sb, &key, &last, NULL); + if (ret < 0) { + if (ret == -ENOENT) { + /* if the cursor's empty fall back to next large */ + if (ext.blkno && ext.blocks == 0) { + curs->blkno = 0; + curs->blocks = 0; + goto reset_cursor; + } + + /* wrap the search for large extents */ + if (ext.blkno > LARGE_EXTENT_BLOCKS && ext.blocks) { + datinf->next_large_blkno = LARGE_EXTENT_BLOCKS; + ext.blkno = datinf->next_large_blkno; + goto retry; + } + + /* ask the server for more extents */ + if (ext.blocks && !alloced) { + ret = bulk_alloc(sb); + if (ret < 0) + goto out; + alloced = true; + goto retry; + } + + /* finally look for any free block at all */ + if (ext.blocks) { + ext.blkno = 0; + ext.blocks = 0; + type = SCOUTFS_FREE_EXTENT_BLKNO_KEY; + goto retry; + } + + /* after all that return -ENOSPC */ + ret = -ENOSPC; + } + goto out; + } + + load_extent(&found, &key); + trace_printk("found %llu,%llu\n", found.blkno, found.blocks); + + /* look for a new large extent if found is outside cursor */ + if (curs->blocks && + (found.blkno + found.blocks <= curs->blkno || + found.blkno >= curs->blkno + curs->blocks)) { + curs->blkno = 0; + curs->blocks = 0; + goto reset_cursor; + } + + /* + * Set the cursor if: + * - we didn't already have one + * - it's large enough for a large extent with alignment padding + * - the sufficiently large free region is past next large + */ + if (!curs->blocks && + found.blocks >= (2 * LARGE_EXTENT_BLOCKS) && + (found.blkno + found.blocks - (2 * LARGE_EXTENT_BLOCKS) >= + datinf->next_large_blkno)) { + + curs->blkno = ALIGN(max(found.blkno, datinf->next_large_blkno), + LARGE_EXTENT_BLOCKS); + curs->blocks = LARGE_EXTENT_BLOCKS; + found.blkno = curs->blkno; + found.blocks = curs->blocks; + + datinf->next_large_blkno = curs->blkno + LARGE_EXTENT_BLOCKS; + } + + trace_printk("using %llu,%llu curs %llu,%llu\n", + found.blkno, found.blocks, curs->blkno, curs->blocks); + + *blkno = found.blkno; + ext.blk_off = iblock; + ext.blkno = found.blkno; + ext.blocks = 1; + ret = record_insert_changes(sb, changes, &ext, scoutfs_ino(inode), + SCOUTFS_FILE_EXTENT_KEY); + if (ret < 0) + goto out; + + ext.blk_off = ext.blkno; + ret = record_remove_changes(sb, changes, &ext, 
sbi->node_id, + SCOUTFS_FREE_EXTENT_BLKNO_KEY) ?: + apply_changes(sb, changes); + + /* advance cursor if we're using it */ + if (ret == 0 && curs->blocks) { + if (--curs->blocks == 0) + curs->blkno = 0; + else + curs->blkno++; + } + +out: + up_write(&datinf->alloc_rwsem); + return ret; +} + +static int scoutfs_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + struct super_block *sb = inode->i_sb; + DECLARE_DATA_INFO(sb, datinf); + u8 last_bytes[MAX_KEY_BYTES]; + u8 key_bytes[MAX_KEY_BYTES]; + struct scoutfs_key_buf last; + struct scoutfs_key_buf key; + struct native_extent ext; + u64 blocks; + u64 blkno; + u64 off; + int ret; + + bh->b_blocknr = 0; + bh->b_size = 0; + blocks = 0; + + ext.blk_off = iblock; + ext.blocks = 1; + ext.blkno = 0; + init_extent_key(&key, key_bytes, &ext, scoutfs_ino(inode), + SCOUTFS_FILE_EXTENT_KEY); + + ext.blk_off = ~0ULL; + ext.blkno = ~0ULL; + ext.blocks = ~0ULL; + init_extent_key(&last, last_bytes, &ext, scoutfs_ino(inode), + SCOUTFS_FILE_EXTENT_KEY); + + /* + * XXX think about how far this next can go, given locking and + * item consistency. + */ + down_read(&datinf->alloc_rwsem); + ret = scoutfs_item_next_same(sb, &key, &last, NULL); + up_read(&datinf->alloc_rwsem); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + else + goto out; + } else { + load_extent(&ext, &key); + trace_printk("found blk_off %llu blkno %llu blocks %llu\n", + ext.blk_off, ext.blkno, ext.blocks); + if (iblock >= ext.blk_off && + iblock < (ext.blk_off + ext.blocks)) { + off = iblock - ext.blk_off; + blkno = ext.blkno + off; + blocks = ext.blocks - off; + } + } + + if (blocks == 0 && create) { + ret = allocate_block(inode, iblock, &blkno); + if (ret) + goto out; + + blocks = 1; + } + + if (blocks) { + map_bh(bh, inode->i_sb, blkno); + bh->b_size = min_t(u64, SIZE_MAX, + blocks << SCOUTFS_BLOCK_SHIFT); + } + +out: + trace_printk("ino %llu iblock %llu create %d ret %d bnr %llu size %zu\n", + scoutfs_ino(inode), (u64)iblock, create, ret, + (u64)bh->b_blocknr, bh->b_size); + + return ret; +} + +static int scoutfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, scoutfs_get_block); +} + +static int scoutfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block); +} + +static int scoutfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, scoutfs_get_block, wbc); +} + +static int scoutfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, scoutfs_get_block); +} + static int scoutfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -382,158 +876,56 @@ static int scoutfs_write_begin(struct file *file, { struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; - pgoff_t index = pos >> PAGE_SHIFT; - loff_t size = i_size_read(inode); - struct page *page; int ret; - trace_printk("ino %llu pos %llu len %u flags %x\n", - scoutfs_ino(inode), (u64)pos, len, flags); - scoutfs_inc_counter(sb, data_write_begin); + trace_printk("ino %llu pos %llu len %u\n", + scoutfs_ino(inode), (u64)pos, len); ret = scoutfs_hold_trans(sb); if (ret) - return ret; + goto out; /* can't re-enter fs, have trans */ flags |= AOP_FLAG_NOFS; + /* generic write_end updates i_size and calls dirty_inode */ ret = scoutfs_dirty_inode_item(inode); - if (ret) 
- goto out; - -retry: - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { - ret = -ENOMEM; - goto out; - } - - trace_printk(PGF"\n", PGA(page)); - - if (!PageUptodate(page) && (pos < size && len < PAGE_CACHE_SIZE)) { - ClearPageError(page); - ret = scoutfs_readpage(file, page); - if (!ret) { - wait_on_page_locked(page); - if (!PageUptodate(page)) - ret = -EIO; - } - page_cache_release(page); - if (ret) - goto out; - - /* let grab_ lock and check for truncated pages */ - goto retry; - } - - *pagep = page; - ret = 0; -out: + if (ret == 0) + ret = block_write_begin(mapping, pos, len, flags, pagep, + scoutfs_get_block); if (ret) scoutfs_release_trans(sb); - - trace_printk("ret %d\n", ret); +out: return ret; } -/* - * Finish modification of a page cache page. - * - * write_begin has held the transaction and dirtied the inode. We - * create items for each dirty block whose value references the page - * contents that will be written. - * - * We Modify the dirty item and its dependent metadata items while - * holding the transaction so that we never get missing data. - * - * XXX - * - detect no change with copied == 0? - * - only iterate over written blocks, not the whole page? - * - make sure page granular locking and concurrent extending writes works - * - error handling needs work, truncate partial writes on failure? - */ static int scoutfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - struct inode *inode = page->mapping->host; + struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; - struct scoutfs_data_key dkey; - struct scoutfs_key_buf key; - SCOUTFS_DECLARE_KVEC(val); - loff_t old_size = i_size_read(inode); - bool update_inode = false; - loff_t new_size; - unsigned start; - loff_t loff; - u64 block; int ret; - trace_printk("ino %llu "PGF" pos %llu len %u copied %d\n", - scoutfs_ino(inode), PGA(page), (u64)pos, len, copied); - scoutfs_inc_counter(sb, data_write_end); + trace_printk("ino %llu pgind %lu pos %llu len %u copied %d\n", + scoutfs_ino(inode), page->index, (u64)pos, len, copied); - /* zero any unwritten portions of a new page around the write */ - if (!PageUptodate(page)) { - if (copied != PAGE_CACHE_SIZE) { - start = pos & ~PAGE_CACHE_MASK; - zero_user_segments(page, 0, start, - start + copied, PAGE_CACHE_SIZE); - } - SetPageUptodate(page); - } - - new_size = pos + copied; - - for_each_page_block(page, start, loff, block, key, dkey, val) { - - /* only put data inside i_size in items */ - /* XXX ugh, kvecs are still clumsy :) */ - if (loff + SCOUTFS_BLOCK_SIZE > new_size) - val[0].iov_len = new_size - loff; - - ret = scoutfs_item_create_ephemeral(sb, &key, val); - if (ret) - goto out; - } - - /* update i_size if we extended */ - if (new_size > inode->i_size) { - i_size_write(inode, new_size); - update_inode = true; - } - - if (old_size < pos) - pagecache_isize_extended(inode, old_size, pos); - - if (copied) { + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret > 0) { scoutfs_inode_inc_data_version(inode); - update_inode = true; - } - - if (update_inode) + /* XXX kind of a big hammer, inode life cycle needs work */ scoutfs_update_inode_item(inode); - - flush_dcache_page(page); - set_page_dirty(page); - SetPagePrivate(page); /* call invalidatepage */ - - ret = copied; -out: - unlock_page(page); + scoutfs_inode_queue_writeback(inode); + } scoutfs_release_trans(sb); - - /* XXX error handling needs work */ - 
WARN_ON_ONCE(ret < 0); return ret; } const struct address_space_operations scoutfs_file_aops = { .readpage = scoutfs_readpage, + .readpages = scoutfs_readpages, .writepage = scoutfs_writepage, - .set_page_dirty = __set_page_dirty_nobuffers, - .invalidatepage = scoutfs_invalidatepage, + .writepages = scoutfs_writepages, .write_begin = scoutfs_write_begin, .write_end = scoutfs_write_end, }; @@ -545,23 +937,75 @@ const struct file_operations scoutfs_file_fops = { .aio_write = generic_file_aio_write, .unlocked_ioctl = scoutfs_ioctl, .fsync = scoutfs_file_fsync, - .llseek = generic_file_llseek, }; +static int derpy_global_mutex_is_held(void) +{ + return 1; +} + +static struct rhashtable_params cursor_hash_params = { + .key_len = member_sizeof(struct task_cursor, id), + .key_offset = offsetof(struct task_cursor, id), + .head_offset = offsetof(struct task_cursor, hash_head), + .hashfn = arch_fast_hash, + .grow_decision = rht_grow_above_75, + .shrink_decision = rht_shrink_below_30, + + .mutex_is_held = derpy_global_mutex_is_held, +}; + +static void destroy_cursors(struct data_info *datinf) +{ + struct task_cursor *curs; + struct task_cursor *pos; + + list_for_each_entry_safe(curs, pos, &datinf->cursor_lru, list_head) { + list_del_init(&curs->list_head); + kfree(curs); + } + rhashtable_destroy(&datinf->cursors); +} + int scoutfs_data_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct data_info *datinf; - - /* page block iteration doesn't understand multiple pages per block */ - BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SIZE); + struct task_cursor *curs; + int ret; + int i; datinf = kzalloc(sizeof(struct data_info), GFP_KERNEL); if (!datinf) return -ENOMEM; - sbi->data_info = datinf; - init_llist_head(&datinf->writeback_pages); + init_rwsem(&datinf->alloc_rwsem); + INIT_LIST_HEAD(&datinf->cursor_lru); + /* always search for large aligned extents */ + datinf->next_large_blkno = LARGE_EXTENT_BLOCKS; + + ret = rhashtable_init(&datinf->cursors, &cursor_hash_params); + if (ret) { + kfree(datinf); + return -ENOMEM; + } + + /* just allocate all of these up front */ + for (i = 0; i < NR_CURSORS; i++) { + curs = kzalloc(sizeof(struct task_cursor), GFP_KERNEL); + if (!curs) { + destroy_cursors(datinf); + kfree(datinf); + return -ENOMEM; + } + + curs->id.pid = i; + rhashtable_insert(&datinf->cursors, &curs->hash_head, + GFP_KERNEL); + list_add(&curs->list_head, &datinf->cursor_lru); + } + + sbi->data_info = datinf; return 0; } @@ -572,7 +1016,7 @@ void scoutfs_data_destroy(struct super_block *sb) struct data_info *datinf = sbi->data_info; if (datinf) { - WARN_ON_ONCE(!llist_empty(&datinf->writeback_pages)); + destroy_cursors(datinf); kfree(datinf); } } diff --git a/kmod/src/data.h b/kmod/src/data.h index 189b2cba..1319d100 100644 --- a/kmod/src/data.h +++ b/kmod/src/data.h @@ -6,7 +6,6 @@ extern const struct file_operations scoutfs_file_fops; int scoutfs_data_truncate_items(struct super_block *sb, u64 ino, u64 iblock, u64 len, bool offline); -void scoutfs_data_end_writeback(struct super_block *sb, int err); int scoutfs_data_setup(struct super_block *sb); void scoutfs_data_destroy(struct super_block *sb); diff --git a/kmod/src/format.h b/kmod/src/format.h index 5d3c184b..a58d12b6 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -156,9 +156,10 @@ struct scoutfs_segment_block { #define SCOUTFS_READDIR_KEY 6 #define SCOUTFS_LINK_BACKREF_KEY 7 #define SCOUTFS_SYMLINK_KEY 8 -#define SCOUTFS_EXTENT_KEY 9 +#define SCOUTFS_FILE_EXTENT_KEY 9 #define SCOUTFS_ORPHAN_KEY 10 -#define 
SCOUTFS_DATA_KEY 11 +#define SCOUTFS_FREE_EXTENT_BLKNO_KEY 11 +#define SCOUTFS_FREE_EXTENT_BLOCKS_KEY 12 /* not found in the fs */ #define SCOUTFS_MAX_UNUSED_KEY 253 #define SCOUTFS_NET_ADDR_KEY 254 @@ -198,11 +199,28 @@ struct scoutfs_orphan_key { __be64 ino; } __packed; -/* value is data payload bytes */ -struct scoutfs_data_key { +/* no value */ +struct scoutfs_file_extent_key { __u8 type; __be64 ino; - __be64 block; + __be64 last_blk_off; + __be64 last_blkno; + __be64 blocks; +} __packed; + +/* no value */ +struct scoutfs_free_extent_blkno_key { + __u8 type; + __be64 node_id; + __be64 last_blkno; + __be64 blocks; +} __packed; + +struct scoutfs_free_extent_blocks_key { + __u8 type; + __be64 node_id; + __be64 blocks; + __be64 last_blkno; } __packed; /* value is each item's part of the full xattr value for the off/len */ @@ -384,6 +402,11 @@ struct scoutfs_net_manifest_entries { struct scoutfs_manifest_entry ments[0]; } __packed; +struct scoutfs_net_segnos { + __le16 nr; + __le64 segnos[0]; +} __packed; + enum { /* sends and receives a struct scoutfs_timeval */ SCOUTFS_NET_TRADE_TIME = 0, @@ -391,6 +414,7 @@ enum { SCOUTFS_NET_MANIFEST_RANGE_ENTRIES, SCOUTFS_NET_ALLOC_SEGNO, SCOUTFS_NET_RECORD_SEGMENT, + SCOUTFS_NET_BULK_ALLOC, SCOUTFS_NET_UNKNOWN, }; diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 7f61f552..71a98d33 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -47,6 +47,16 @@ struct free_ino_pool { bool in_flight; }; +struct inode_sb_info { + struct free_ino_pool pool; + + spinlock_t writeback_lock; + struct rb_root writeback_inodes; +}; + +#define DECLARE_INODE_SB_INFO(sb, name) \ + struct inode_sb_info *name = SCOUTFS_SB(sb)->inode_sb_info + static struct kmem_cache *scoutfs_inode_cachep; /* @@ -61,6 +71,7 @@ static void scoutfs_inode_ctor(void *obj) seqcount_init(&ci->seqcount); ci->staging = false; init_rwsem(&ci->xattr_rwsem); + RB_CLEAR_NODE(&ci->writeback_node); inode_init_once(&ci->inode); } @@ -84,8 +95,48 @@ static void scoutfs_i_callback(struct rcu_head *head) kmem_cache_free(scoutfs_inode_cachep, SCOUTFS_I(inode)); } +static void insert_writeback_inode(struct inode_sb_info *inf, + struct scoutfs_inode_info *ins) +{ + struct rb_root *root = &inf->writeback_inodes; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + struct scoutfs_inode_info *si; + + while (*node) { + parent = *node; + si = container_of(*node, struct scoutfs_inode_info, + writeback_node); + + if (ins->ino < si->ino) + node = &(*node)->rb_left; + else if (ins->ino > si->ino) + node = &(*node)->rb_right; + else + BUG(); + } + + rb_link_node(&ins->writeback_node, parent, node); + rb_insert_color(&ins->writeback_node, root); +} + +static void remove_writeback_inode(struct inode_sb_info *inf, + struct scoutfs_inode_info *si) +{ + if (!RB_EMPTY_NODE(&si->writeback_node)) { + rb_erase(&si->writeback_node, &inf->writeback_inodes); + RB_CLEAR_NODE(&si->writeback_node); + } +} + void scoutfs_destroy_inode(struct inode *inode) { + DECLARE_INODE_SB_INFO(inode->i_sb, inf); + + spin_lock(&inf->writeback_lock); + remove_writeback_inode(inf, SCOUTFS_I(inode)); + spin_unlock(&inf->writeback_lock); + call_rcu(&inode->i_rcu, scoutfs_i_callback); } @@ -393,7 +444,7 @@ u64 scoutfs_last_ino(struct super_block *sb) */ void scoutfs_inode_fill_pool(struct super_block *sb, u64 ino, u64 nr) { - struct free_ino_pool *pool = SCOUTFS_SB(sb)->free_ino_pool; + struct free_ino_pool *pool = &SCOUTFS_SB(sb)->inode_sb_info->pool; trace_printk("filling ino %llu nr %llu\n", ino, nr); @@ -427,7 +478,7 @@ 
static bool pool_in_flight(struct free_ino_pool *pool) */ static int alloc_ino(struct super_block *sb, u64 *ino) { - struct free_ino_pool *pool = SCOUTFS_SB(sb)->free_ino_pool; + struct free_ino_pool *pool = &SCOUTFS_SB(sb)->inode_sb_info->pool; bool request; int ret; @@ -733,28 +784,121 @@ int scoutfs_orphan_inode(struct inode *inode) return ret; } +/* + * Track an inode that could have dirty pages. Used to kick off writeback + * on all dirty pages during transaction commit without tying ourselves in + * knots trying to call through the high level vfs sync methods. + */ +void scoutfs_inode_queue_writeback(struct inode *inode) +{ + DECLARE_INODE_SB_INFO(inode->i_sb, inf); + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + + spin_lock(&inf->writeback_lock); + if (RB_EMPTY_NODE(&si->writeback_node)) + insert_writeback_inode(inf, si); + spin_unlock(&inf->writeback_lock); +} + +/* + * Walk our dirty inodes in ino order and either start dirty page + * writeback or wait for writeback to complete. + * + * This is called by transaction commiting so other writers are + * excluded. We're still very careful to iterate over the tree while it + * and the inodes could be changing. + * + * Because writes are excluded we know that there's no remaining dirty + * pages once waiting returns successfully. + * + * XXX not sure what to do about retrying io errors. + */ +int scoutfs_inode_walk_writeback(struct super_block *sb, bool write) +{ + DECLARE_INODE_SB_INFO(sb, inf); + struct scoutfs_inode_info *si; + struct rb_node *node; + struct inode *inode; + struct inode *defer_iput = NULL; + int ret; + + spin_lock(&inf->writeback_lock); + + node = rb_first(&inf->writeback_inodes); + while (node) { + si = container_of(node, struct scoutfs_inode_info, + writeback_node); + node = rb_next(node); + inode = igrab(&si->inode); + if (!inode) + continue; + + spin_unlock(&inf->writeback_lock); + + if (defer_iput) { + iput(defer_iput); + defer_iput = NULL; + } + + if (write) + ret = filemap_fdatawrite(inode->i_mapping); + else + ret = filemap_fdatawait(inode->i_mapping); + trace_printk("ino %llu write %d ret %d\n", + scoutfs_ino(inode), write, ret); + if (ret) { + iput(inode); + goto out; + } + + spin_lock(&inf->writeback_lock); + + if (WARN_ON_ONCE(RB_EMPTY_NODE(&si->writeback_node))) + node = rb_first(&inf->writeback_inodes); + else + node = rb_next(&si->writeback_node); + + if (!write) + remove_writeback_inode(inf, si); + + /* avoid iput->destroy lock deadlock */ + defer_iput = inode; + } + + spin_unlock(&inf->writeback_lock); +out: + if (defer_iput) + iput(defer_iput); + return ret; +} + int scoutfs_inode_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct free_ino_pool *pool; + struct inode_sb_info *inf; - pool = kzalloc(sizeof(struct free_ino_pool), GFP_KERNEL); - if (!pool) + inf = kzalloc(sizeof(struct inode_sb_info), GFP_KERNEL); + if (!inf) return -ENOMEM; + pool = &inf->pool; init_waitqueue_head(&pool->waitq); spin_lock_init(&pool->lock); - sbi->free_ino_pool = pool; + spin_lock_init(&inf->writeback_lock); + inf->writeback_inodes = RB_ROOT; + + sbi->inode_sb_info = inf; return 0; } void scoutfs_inode_destroy(struct super_block *sb) { - struct free_ino_pool *pool = SCOUTFS_SB(sb)->free_ino_pool; + struct inode_sb_info *inf = SCOUTFS_SB(sb)->inode_sb_info; - kfree(pool); + kfree(inf); } void scoutfs_inode_exit(void) diff --git a/kmod/src/inode.h b/kmod/src/inode.h index da24e9af..59da8f7a 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -13,6 +13,7 @@ struct 
scoutfs_inode_info { seqcount_t seqcount; bool staging; /* holder of i_mutex is staging */ struct rw_semaphore xattr_rwsem; + struct rb_node writeback_node; struct inode inode; }; @@ -48,6 +49,9 @@ u64 scoutfs_inode_get_data_version(struct inode *inode); int scoutfs_scan_orphans(struct super_block *sb); +void scoutfs_inode_queue_writeback(struct inode *inode); +int scoutfs_inode_walk_writeback(struct super_block *sb, bool write); + u64 scoutfs_last_ino(struct super_block *sb); void scoutfs_inode_exit(void); diff --git a/kmod/src/net.c b/kmod/src/net.c index 11e5fe30..33d2da3c 100644 --- a/kmod/src/net.c +++ b/kmod/src/net.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "format.h" #include "net.h" @@ -363,6 +364,61 @@ static struct send_buf *alloc_sbuf(unsigned data_len) return sbuf; } +/* XXX I dunno, totally made up */ +#define BULK_COUNT 32 + +static struct send_buf *process_bulk_alloc(struct super_block *sb,void *req, + int req_len) +{ + DECLARE_NET_INFO(sb, nti); + struct scoutfs_net_segnos *ns; + struct commit_waiter cw; + struct send_buf *sbuf; + u64 segno; + int ret; + int i; + + if (req_len != 0) + return ERR_PTR(-EINVAL); + + sbuf = alloc_sbuf(offsetof(struct scoutfs_net_segnos, + segnos[BULK_COUNT])); + if (!sbuf) + return ERR_PTR(-ENOMEM); + + ns = (void *)sbuf->nh->data; + ns->nr = cpu_to_le16(BULK_COUNT); + + down_read(&nti->ring_commit_rwsem); + + for (i = 0; i < BULK_COUNT; i++) { + ret = scoutfs_alloc_segno(sb, &segno); + if (ret) { + while (i-- > 0) + scoutfs_alloc_free(sb, + le64_to_cpu(ns->segnos[i])); + break; + } + + ns->segnos[i] = cpu_to_le64(segno); + } + + + if (ret == 0) + queue_commit_work(nti, &cw); + up_read(&nti->ring_commit_rwsem); + + if (ret == 0) + ret = wait_for_commit(&cw); + + if (ret) + sbuf->nh->status = SCOUTFS_NET_STATUS_ERROR; + else + sbuf->nh->status = SCOUTFS_NET_STATUS_SUCCESS; + + return sbuf; +} + static struct send_buf *process_record_segment(struct super_block *sb, void *req, int req_len) { @@ -616,6 +672,7 @@ static proc_func_t type_proc_func(u8 type) process_manifest_range_entries, [SCOUTFS_NET_ALLOC_SEGNO] = process_alloc_segno, [SCOUTFS_NET_RECORD_SEGMENT] = process_record_segment, + [SCOUTFS_NET_BULK_ALLOC] = process_bulk_alloc, }; return type < SCOUTFS_NET_UNKNOWN ? funcs[type] : NULL; @@ -1100,6 +1157,113 @@ static int add_send_buf(struct super_block *sb, int type, void *data, return 0; } +struct bulk_alloc_args { + struct completion comp; + u64 *segnos; + int ret; +}; + +static int sort_cmp_u64s(const void *A, const void *B) +{ + const u64 *a = A; + const u64 *b = B; + + return *a < *b ? -1 : *a > *b ? 1 : 0; +} + +static void sort_swap_u64s(void *A, void *B, int size) +{ + u64 *a = A; + u64 *b = B; + + swap(*a, *b); +} + +static int bulk_alloc_reply(struct super_block *sb, void *reply, int ret, + void *arg) +{ + struct bulk_alloc_args *args = arg; + struct scoutfs_net_segnos *ns = reply; + u16 nr; + int i; + + if (ret < sizeof(struct scoutfs_net_segnos) || + ret != offsetof(struct scoutfs_net_segnos, + segnos[le16_to_cpu(ns->nr)])) { + ret = -EINVAL; + goto out; + } + + nr = le16_to_cpu(ns->nr); + + args->segnos = kmalloc((nr + 1) * sizeof(args->segnos[0]), GFP_NOFS); + if (args->segnos == NULL) { + ret = -ENOMEM; /* XXX hmm. 
*/ + goto out; + } + + for (i = 0; i < nr; i++) { + args->segnos[i] = le64_to_cpu(ns->segnos[i]); + + /* make sure they're all non-zero */ + if (args->segnos[i] == 0) { + ret = -EINVAL; + goto out; + } + } + + sort(args->segnos, nr, sizeof(args->segnos[0]), + sort_cmp_u64s, sort_swap_u64s); + + /* make sure they're all unique */ + for (i = 1; i < nr; i++) { + if (args->segnos[i] == args->segnos[i - 1]) { + ret = -EINVAL; + goto out; + } + } + + args->segnos[nr] = 0; + ret = 0; +out: + if (ret && args->segnos) { + kfree(args->segnos); + args->segnos = NULL; + } + args->ret = ret; + complete(&args->comp); + return args->ret; +} + +/* + * Returns a 0-terminated allocated array of segnos, the caller is + * responsible for freeing it. + */ +u64 *scoutfs_net_bulk_alloc(struct super_block *sb) +{ + struct bulk_alloc_args args; + int ret; + + args.segnos = NULL; + init_completion(&args.comp); + + ret = add_send_buf(sb, SCOUTFS_NET_BULK_ALLOC, NULL, 0, + bulk_alloc_reply, &args); + if (ret == 0) { + wait_for_completion(&args.comp); + ret = args.ret; + if (ret == 0 && (args.segnos == NULL || args.segnos[0] == 0)) + ret = -ENOSPC; + } + + if (ret) { + kfree(args.segnos); + args.segnos = ERR_PTR(ret); + } + + return args.segnos; +} + /* * Eventually we're going to have messages that control compaction. * Each client mount would have long-lived work that sends requests diff --git a/kmod/src/net.h b/kmod/src/net.h index d48fc2b7..125bf327 100644 --- a/kmod/src/net.h +++ b/kmod/src/net.h @@ -13,6 +13,7 @@ int scoutfs_net_manifest_range_entries(struct super_block *sb, int scoutfs_net_alloc_segno(struct super_block *sb, u64 *segno); int scoutfs_net_record_segment(struct super_block *sb, struct scoutfs_segment *seg, u8 level); +u64 *scoutfs_net_bulk_alloc(struct super_block *sb); int scoutfs_net_get_compaction(struct super_block *sb, void *curs); int scoutfs_net_finish_compaction(struct super_block *sb, void *curs, diff --git a/kmod/src/super.c b/kmod/src/super.c index 48fb27d2..0fe2ed52 100644 --- a/kmod/src/super.c +++ b/kmod/src/super.c @@ -204,6 +204,12 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; + /* + * XXX this is random today for initial testing, but we'll want + * it to be assigned by the server. + */ + get_random_bytes_arch(&sbi->node_id, sizeof(sbi->node_id)); + spin_lock_init(&sbi->next_ino_lock); atomic_set(&sbi->trans_holds, 0); init_waitqueue_head(&sbi->trans_hold_wq); diff --git a/kmod/src/super.h b/kmod/src/super.h index 184c92b8..5b6d5903 100644 --- a/kmod/src/super.h +++ b/kmod/src/super.h @@ -14,11 +14,13 @@ struct compact_info; struct data_info; struct lock_info; struct net_info; -struct free_ino_pool; +struct inode_sb_info; struct scoutfs_sb_info { struct super_block *sb; + u64 node_id; + struct scoutfs_super_block super; spinlock_t next_ino_lock; @@ -29,7 +31,7 @@ struct scoutfs_sb_info { struct seg_alloc *seg_alloc; struct compact_info *compact_info; struct data_info *data_info; - struct free_ino_pool *free_ino_pool; + struct inode_sb_info *inode_sb_info; atomic_t trans_holds; wait_queue_head_t trans_hold_wq; diff --git a/kmod/src/trans.c b/kmod/src/trans.c index e6247bc0..11941c7e 100644 --- a/kmod/src/trans.c +++ b/kmod/src/trans.c @@ -26,6 +26,7 @@ #include "seg.h" #include "counters.h" #include "net.h" +#include "inode.h" #include "scoutfs_trace.h" /* @@ -97,10 +98,12 @@ void scoutfs_trans_write_func(struct work_struct *work) * about leaking segnos nor duplicate manifest entries * on crashes between us and the server. 
*/ - ret = scoutfs_net_alloc_segno(sb, &segno) ?: + ret = scoutfs_inode_walk_writeback(sb, true) ?: + scoutfs_net_alloc_segno(sb, &segno) ?: scoutfs_seg_alloc(sb, segno, &seg) ?: scoutfs_item_dirty_seg(sb, seg) ?: scoutfs_seg_submit_write(sb, seg, &comp) ?: + scoutfs_inode_walk_writeback(sb, false) ?: scoutfs_bio_wait_comp(sb, &comp) ?: scoutfs_net_record_segment(sb, seg, 0); if (ret) @@ -112,9 +115,6 @@ out: /* XXX this all needs serious work for dealing with errors */ WARN_ON_ONCE(ret); - /* must be done before waking waiting trans holders who might dirty */ - scoutfs_data_end_writeback(sb, ret); - spin_lock(&sbi->trans_write_lock); sbi->trans_write_count++; sbi->trans_write_ret = ret;
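A note on the error chaining used in the hunk above: the GNU C "?:" extension evaluates to its left operand when that operand is non-zero, so a chain of calls joined with "?:" runs them in order and stops at the first non-zero (error) return.  That is how data writeback is started before the metadata segment write and then waited on before the segment is recorded.  The toy demonstration below is an illustration only (it needs gcc or clang for the extension) and uses made-up step names.

#include <stdio.h>

static int step(const char *name, int err)
{
	printf("%s -> %d\n", name, err);
	return err;
}

int main(void)
{
	/*
	 * "a ?: b" is a when a is non-zero (b is not evaluated), else b,
	 * so the chain below stops at the first step that returns an error.
	 */
	int ret = step("write data", 0) ?:
		  step("write metadata", 0) ?:
		  step("record segment", -5) ?:
		  step("never runs", 0);

	printf("ret %d\n", ret);
	return 0;
}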