diff --git a/kmod/src/data.c b/kmod/src/data.c index 61547c2e..8533a3c5 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -14,8 +14,12 @@ #include #include #include -#include -#include +#include +#include +#include +#include +#include +#include #include "format.h" #include "super.h" @@ -27,354 +31,844 @@ #include "scoutfs_trace.h" #include "item.h" #include "ioctl.h" +#include "net.h" /* - * scoutfs stores data in items that can be up to the small 4K block - * size. The page cache address space callbacks work with the item - * cache. Each OS page can be stored in multiple of our smaller fixed - * size items. The code doesn't understand OS pages that are smaller - * than our block size. + * scoutfs uses extent records to reference file data. * - * readpage does a blocking read of the item and then copies its - * contents into the page. Since the segments are huge we sort of get - * limited read-ahead by reading in segments at a time. + * The extent items map logical file regions to device blocks at 4K + * block granularity. File data isn't overwritten in place, so overwriting + * doesn't generate extent item locking and modification. * - * Writing is quite a bit more fiddly. We want to pack small files. - * The item cache and transactions want to accurately track the size of - * dirty items to fill the next segment. And we would like to minimize - * cpu copying as much as we can. + * Nodes have their own free extent items stored at their node id to + * avoid lock contention during allocation and freeing. These pools are + * filled and drained with RPCs to the server, which allocates blocks in + * segment-sized regions. * - * This simplest first pass creates dirty items as pages are dirtied - * whose values reference the page contents. They're freed after - * they're written to the segment so that we don't have to worry about - * items that reference clean pages. Invalidatepage forgets any items - * if a dirty page is truncated away. + * Block allocation maintains a fixed number of allocation cursors that + * remember the position of tasks within free regions. This is very + * simple and maintains decent extents for simple streaming writes. It + * eventually won't be good enough and we'll spend complexity on + * delalloc but we want to put that off as long as possible. * - * Writeback is built around all the dirty items being written by a - * commit. This can happen naturally in the backgroud. Or writepage - * can initiate it to start by kicking the commit thread. In either - * case our dirty pages are "in writeback" by being put on a list that - * is walked by the end of the commit. Because writes and page dirtying - * are serialized with the commit we know that there can be no dirty - * pages after the commit and we can mark writeback complete on all the - * pages that started writeback before the commit finished. motivate - * having items in the item cache while there are dirty pages. + * There are no unwritten extents. As we dirty file data pages, possibly + * allocating extents for the first time, we track their inodes. Before + * we commit dirty metadata we write out all tracked inodes. This + * ensures that data is persistent before the metadata that references + * it is usable. * - * Data is copied from the dirty page contents into the segment pages - * for writing.
This lets us easily pack small files without worrying - * about DMA alignment and avoids the stable page problem of the page - * being modified after the cpu calculates the checksum but before the - * DMA reads to the device. + * Weirdly, the extents are indexed by the *final* logical block and + * blkno of the extent. This lets us search for neighbouring previous + * extents with a _next() call and avoids having to implement item + * reading that iterates backwards through the manifest and segments. + * + * There are two items that track free extents, one indexed by the block + * location of the free extent and one indexed by the size of the free + * region. This means that one allocation can update a great number of + * items throughout the tree as file and both kinds of free extents + * split and merge. The code goes to great lengths to stage these + * updates so that it can always unwind and return errors without + * leaving the items inconsistent. * * XXX * - truncate * - mmap * - better io error propagation - * - async readpages for more concurrent readahead * - forced unmount with dirty data * - direct IO - * - probably stitch page vecs into block struct page fragments for bios - * - maybe cut segment boundaries on aligned data offsets - * - maybe decouple metadata and data segment writes */ struct data_info { - struct llist_head writeback_pages; + struct rw_semaphore alloc_rwsem; + u64 next_large_blkno; + struct rhashtable cursors; + struct list_head cursor_lru; }; #define DECLARE_DATA_INFO(sb, name) \ struct data_info *name = SCOUTFS_SB(sb)->data_info -/* - * trace_printk() doesn't support %c? - * - * 1 - 1ocked - * a - uptodAte - * d - Dirty - * b - writeBack - * e - Error - */ -#define page_hexflag(page, name, val, shift) \ - (Page##name(page) ? (val << (shift * 4)) : 0) - -#define page_hexflags(page) \ - (page_hexflag(page, Locked, 0x1, 4) | \ - page_hexflag(page, Uptodate, 0xa, 3) | \ - page_hexflag(page, Dirty, 0xd, 2) | \ - page_hexflag(page, Writeback, 0xb, 1) | \ - page_hexflag(page, Error, 0xe, 0)) - -#define PGF "page %p [index %lu flags %x]" -#define PGA(page) \ - (page), (page)->index, page_hexflags(page) \ - -#define BHF "bh %p [blocknr %llu size %zu state %lx]" -#define BHA(bh) \ - (bh), (u64)(bh)->b_blocknr, (bh)->b_size, (bh)->b_state \ - -static void init_data_key(struct scoutfs_key_buf *key, - struct scoutfs_data_key *dkey, u64 ino, u64 block) -{ - dkey->type = SCOUTFS_DATA_KEY; - dkey->ino = cpu_to_be64(ino); - dkey->block = cpu_to_be64(block); - - scoutfs_key_init(key, dkey, sizeof(struct scoutfs_data_key)); -} +/* more than enough for a few tasks per core on moderate hardware */ +#define NR_CURSORS 4096 /* - * Delete the data block items in the given region. + * This is the size of extents that are tracked by a cursor and so end + * up being the largest file item extent length given concurrent + * streaming writes. * - * This is the low level extent item truncate code. Callers manage - * higher order truncation and orphan cleanup. - * - * XXX - * - restore support for releasing data. - * - for final unlink this would be better as a range deletion - * - probably don't want to read items to find them for removal + * XXX We probably want this to be a bit larger to further reduce the + * amount of item churn involved in truncating tremendous files. 
*/ -int scoutfs_data_truncate_items(struct super_block *sb, u64 ino, u64 iblock, - u64 len, bool offline) -{ - struct scoutfs_data_key last_dkey; - struct scoutfs_data_key dkey; - struct scoutfs_key_buf last; - struct scoutfs_key_buf key; - int ret; +#define LARGE_EXTENT_BLOCKS SCOUTFS_SEGMENT_BLOCKS - trace_printk("iblock %llu len %llu offline %u\n", - iblock, len, offline); +struct cursor_id { + struct task_struct *task; + pid_t pid; +} __packed; /* rhashtable_lookup() always memcmp()s, avoid padding */ - if (WARN_ON_ONCE(iblock + len <= iblock) || - WARN_ON_ONCE(offline)) - return -EINVAL; - - init_data_key(&key, &dkey, ino, iblock); - init_data_key(&last, &last_dkey, ino, iblock + len - 1); - - for (;;) { - ret = scoutfs_item_next(sb, &key, &last, NULL); - if (ret < 0) { - if (ret == -ENOENT) - ret = 0; - break; - } - - /* XXX would set offline bit items here */ - - ret = scoutfs_item_delete(sb, &key); - if (ret) - break; - } - - return ret; -} - -static inline struct page *page_from_llist_node(struct llist_node *node) -{ - BUILD_BUG_ON(member_sizeof(struct page, private) != - sizeof(struct llist_node)); - - return container_of((void *)node, struct page, private); -} - -static inline struct llist_node *llist_node_from_page(struct page *page) -{ - return (void *)&page->private; -} - -static inline void page_llist_add(struct page *page, struct llist_head *head) -{ - llist_add(llist_node_from_page(page), head); -} +struct task_cursor { + u64 blkno; + u64 blocks; + struct rhash_head hash_head; + struct list_head list_head; + struct cursor_id id; +}; /* - * The transaction has committed so there are no more dirty items. End - * writeback on all the dirty pages that started writeback before the - * commit finished. The commit doesn't start until all holders which - * could dirty are released so there couldn't have been new dirty pages - * and writeback entries while the commit was in flight. + * Both file extent and free extent keys are converted into this native + * form for manipulation. The free extents set blk_off to blkno. 
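+ * + * All fields are in units of 4K blocks. Since the keys store the final + * blk_off and blkno of an extent, loading an extent works backwards from + * those endpoints using the block count.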
*/ -void scoutfs_data_end_writeback(struct super_block *sb, int err) +struct native_extent { + u64 blk_off; + u64 blkno; + u64 blocks; +}; + +/* These are stored in a (type==0) terminated array on caller's stacks */ +struct extent_change { + struct native_extent ext; + u64 arg; + unsigned ins:1, + type; +}; + +/* insert file extent + remove both blkno and blocks extents + 0 term */ +#define MAX_CHANGES (3 + 3 + 3 + 1) + +/* XXX avoiding dynamic on-stack array initializers :/ */ +union extent_key_union { + struct scoutfs_file_extent_key file; + struct scoutfs_free_extent_blkno_key blkno; + struct scoutfs_free_extent_blocks_key blocks; +} __packed; +#define MAX_KEY_BYTES sizeof(union extent_key_union) + +static void init_file_extent_key(struct scoutfs_key_buf *key, void *key_bytes, + struct native_extent *ext, u64 arg) { - DECLARE_DATA_INFO(sb, datinf); - struct llist_node *node; - struct page *page; + struct scoutfs_file_extent_key *fkey = key_bytes; - /* XXX haven't thought about errors here */ - BUG_ON(err); + fkey->type = SCOUTFS_FILE_EXTENT_KEY; + fkey->ino = cpu_to_be64(arg); + fkey->last_blk_off = cpu_to_be64(ext->blk_off + ext->blocks - 1); + fkey->last_blkno = cpu_to_be64(ext->blkno + ext->blocks - 1); + fkey->blocks = cpu_to_be64(ext->blocks); - node = llist_del_all(&datinf->writeback_pages); - - while (node) { - page = page_from_llist_node(node); - node = llist_next(node); - - trace_printk("ending writeback "PGF"\n", PGA(page)); - scoutfs_inc_counter(sb, data_end_writeback_page); - - - set_page_private(page, 0); - end_page_writeback(page); - page_cache_release(page); - } + scoutfs_key_init(key, fkey, sizeof(struct scoutfs_file_extent_key)); } -#define for_each_page_block(page, start, loff, block, key, dkey, val) \ - for (start = 0; \ - start < PAGE_CACHE_SIZE && \ - (loff = ((loff_t)page->index << PAGE_CACHE_SHIFT) + start, \ - block = loff >> SCOUTFS_BLOCK_SHIFT, \ - init_data_key(&key, &dkey, \ - scoutfs_ino(page->mapping->host), block), \ - scoutfs_kvec_init(val, page_address(page) + start, \ - SCOUTFS_BLOCK_SIZE), \ - 1); \ - start += SCOUTFS_BLOCK_SIZE) +#define INIT_FREE_EXTENT_KEY(which_type, key, key_bytes, ext, arg, type) \ +do { \ + struct which_type *fkey = key_bytes; \ + \ + fkey->type = type; \ + fkey->node_id = cpu_to_be64(arg); \ + fkey->last_blkno = cpu_to_be64(ext->blkno + ext->blocks - 1); \ + fkey->blocks = cpu_to_be64(ext->blocks); \ + \ + scoutfs_key_init(key, fkey, sizeof(struct which_type)); \ +} while (0) -/* - * Copy the contents of each item that makes up the page into their - * regions of the page, zeroing any page contents not covered by items. - * - * This is the simplest loop that looks up every possible block. We - * could instead have a readpages() that iterates over present items and - * puts them in the pages in the batch. 
- */ -static int scoutfs_readpage(struct file *file, struct page *page) +static void init_extent_key(struct scoutfs_key_buf *key, void *key_bytes, + struct native_extent *ext, u64 arg, u8 type) { - struct inode *inode = page->mapping->host; - struct super_block *sb = inode->i_sb; - loff_t size = i_size_read(inode); - struct scoutfs_data_key dkey; - struct scoutfs_key_buf key; - SCOUTFS_DECLARE_KVEC(val); - unsigned start; - loff_t loff; - u64 block; - int ret = 0; - - - trace_printk(PGF"\n", PGA(page)); - scoutfs_inc_counter(sb, data_readpage); - - for_each_page_block(page, start, loff, block, key, dkey, val) { - /* the rest of the page is zero when block is past i_size */ - if (loff >= size) - break; - - /* copy the block item contents into the page */ - ret = scoutfs_item_lookup(sb, &key, val); - if (ret < 0) { - if (ret == -ENOENT) - ret = 0; - else - break; - } - - /* - * XXX do we need to clamp the item length by i_size? - * truncate should purge the item cache and create - * truncation range items that'd merge away old data - * items, and invalidatepage should shrink any ephemeral - * vecs. Seems like the item length should be accurate? - */ - - /* zero the tail of the block */ - if (ret < SCOUTFS_BLOCK_SIZE) - zero_user(page, start, SCOUTFS_BLOCK_SIZE - ret); - } - - /* zero any remaining tail blocks */ - if (start < PAGE_CACHE_SIZE) - zero_user(page, start, PAGE_CACHE_SIZE - start); - - if (ret == 0) - SetPageUptodate(page); + if (type == SCOUTFS_FILE_EXTENT_KEY) + init_file_extent_key(key, key_bytes, ext, arg); + else if(type == SCOUTFS_FREE_EXTENT_BLKNO_KEY) + INIT_FREE_EXTENT_KEY(scoutfs_free_extent_blkno_key, + key, key_bytes, ext, arg, type); else - SetPageError(page); + INIT_FREE_EXTENT_KEY(scoutfs_free_extent_blocks_key, + key, key_bytes, ext, arg, type); +} - trace_printk("ret %d\n", ret); - unlock_page(page); - return ret; +/* XXX could have some sanity checks */ +static void load_file_extent(struct native_extent *ext, + struct scoutfs_key_buf *key) +{ + struct scoutfs_file_extent_key *fkey = key->data; + + ext->blocks = be64_to_cpu(fkey->blocks); + ext->blk_off = be64_to_cpu(fkey->last_blk_off) - ext->blocks + 1; + ext->blkno = be64_to_cpu(fkey->last_blkno) - ext->blocks + 1; +} + +#define LOAD_FREE_EXTENT(which_type, ext, key) \ +do { \ + struct which_type *fkey = key->data; \ + \ + ext->blkno = be64_to_cpu(fkey->last_blkno) - \ + be64_to_cpu(fkey->blocks) + 1; \ + ext->blk_off = ext->blkno; \ + ext->blocks = be64_to_cpu(fkey->blocks); \ +} while (0) + +static void load_extent(struct native_extent *ext, struct scoutfs_key_buf *key) +{ + struct scoutfs_free_extent_blocks_key *fkey = key->data; + + BUILD_BUG_ON(offsetof(struct scoutfs_file_extent_key, type) != + offsetof(struct scoutfs_free_extent_blkno_key, type) || + offsetof(struct scoutfs_file_extent_key, type) != + offsetof(struct scoutfs_free_extent_blocks_key, type)); + + if (fkey->type == SCOUTFS_FILE_EXTENT_KEY) + load_file_extent(ext, key); + else if (fkey->type == SCOUTFS_FREE_EXTENT_BLKNO_KEY) + LOAD_FREE_EXTENT(scoutfs_free_extent_blkno_key, ext, key); + else + LOAD_FREE_EXTENT(scoutfs_free_extent_blocks_key, ext, key); } /* - * Start writeback on a dirty page. We always try to kick off a commit. - * Repeated calls harmlessly bounce off the thread work's pending bit. - * (we could probably test that the writeback pgaes list is empty before - * trying to kick off a commit.) - * - * We add ourselves to a list of pages that the commit will end - * writeback on once its done. 
If there's no dirty data the commit - * thread will end writeback after not doing anything. + * Merge two extents if they're adjacent. First we arrange them to + * only test their adjoining endpoints, then we are careful not to reference + * fields after we've modified them. */ -static int scoutfs_writepage(struct page *page, struct writeback_control *wbc) +static int merge_extents(struct native_extent *mod, + struct native_extent *ext) { - struct inode *inode = page->mapping->host; - struct super_block *sb = inode->i_sb; - DECLARE_DATA_INFO(sb, datinf); + struct native_extent *left; + struct native_extent *right; - trace_printk(PGF"\n", PGA(page)); - scoutfs_inc_counter(sb, data_writepage); + if (mod->blk_off < ext->blk_off) { + left = mod; + right = ext; + } else { + left = ext; + right = mod; + } - BUG_ON(PageWriteback(page)); - BUG_ON(page->private != 0); - - ClearPagePrivate(page); /* invalidatepage not needed */ - set_page_writeback(page); - page_cache_get(page); - page_llist_add(page, &datinf->writeback_pages); - unlock_page(page); - scoutfs_sync_fs(sb, 0); + if (left->blk_off + left->blocks == right->blk_off && + left->blkno + left->blocks == right->blkno) { + mod->blk_off = left->blk_off; + mod->blkno = left->blkno; + mod->blocks = left->blocks + right->blocks; + return 1; + } return 0; } /* - * Truncate is invalidating part of the contents of a page. - * - * We can't return errors here so our job is not to create dirty items - * that end up executing the truncate. That's the job of higher level - * callers. Our job is to make sure that we update references to the - * page from existing ephemeral items if they already exist. + * The caller has ensured that the inner extent is entirely within + * the outer extent. Fill out the left and right regions of outer + * that don't overlap with inner. */ -static void scoutfs_invalidatepage(struct page *page, unsigned long offset) +static void trim_extents(struct native_extent *left, + struct native_extent *right, + struct native_extent *outer, + struct native_extent *inner) { - struct inode *inode = page->mapping->host; - struct super_block *sb = inode->i_sb; - struct scoutfs_data_key dkey; - struct scoutfs_key_buf key; - SCOUTFS_DECLARE_KVEC(val); - unsigned start; - loff_t loff; - u64 block; + left->blk_off = outer->blk_off; + left->blkno = outer->blkno; + left->blocks = inner->blk_off - outer->blk_off; - trace_printk(PGF"\n", PGA(page)); - scoutfs_inc_counter(sb, data_invalidatepage); + right->blk_off = inner->blk_off + inner->blocks; + right->blkno = inner->blkno + inner->blocks; + right->blocks = (outer->blk_off + outer->blocks) - right->blk_off; +} - for_each_page_block(page, start, loff, block, key, dkey, val) { - if (offset) { - /* XXX maybe integrate offset into foreach */ - /* XXX ugh, kvecs are still clumsy :) */ - if (start + SCOUTFS_BLOCK_SIZE > offset) - val[0].iov_len = offset - start; - scoutfs_item_update_ephemeral(sb, &key, val); - } else { - scoutfs_item_forget(sb, &key); - } - } +/* return true if inner is fully contained by outer */ +static bool extents_within(struct native_extent *outer, + struct native_extent *inner) +{ + u64 outer_end = outer->blk_off + outer->blocks - 1; + u64 inner_end = inner->blk_off + inner->blocks - 1; + + return outer->blk_off <= inner->blk_off && outer_end >= inner_end; } /* - * Start modifying a page cache page. - * - * We hold the transaction for write_end's inode updates before - * acquiring the page lock.
- * - * We give the writer the current page contents in the relatively rare - * case of writing a partial page inside i_size. write_end will zero - * any region around the write if the page isn't uptodate. + * Add a new entry to the array of changes. The _BLOCKS extent items + * exactly match the _BLKNO items but with different field order for + * searching by size. We keep them in sync by always adding a _BLOCKS + * change for every _BLKNO change. */ +static struct extent_change *append_change(struct extent_change *chg, + bool ins, struct native_extent *ext, + u64 arg, u8 type) +{ + trace_printk("appending ins %d blk_off %llu blkno %llu blocks %llu arg %llu type %u\n", + ins, ext->blk_off, ext->blkno, ext->blocks, + arg, type); + + chg->ext = *ext; + chg->arg = arg; + chg->ins = ins; + chg->type = type; + + if (type == SCOUTFS_FREE_EXTENT_BLKNO_KEY) { + chg++; + *chg = *(chg - 1); + chg->type = SCOUTFS_FREE_EXTENT_BLOCKS_KEY; + } + + return chg + 1; +} + +/* + * Find an adjacent extent in the direction of the delta. If we can + * merge with it then we modify the incoming cur extent. nei is set to + * the neighbour we found. > 0 is returned if we merged, 0 if not, and + * < 0 on error. + */ +static int try_merge(struct super_block *sb, struct native_extent *cur, + s64 delta, struct native_extent *nei, u64 arg, u8 type) +{ + u8 last_bytes[MAX_KEY_BYTES]; + u8 key_bytes[MAX_KEY_BYTES]; + struct scoutfs_key_buf last; + struct scoutfs_key_buf key; + struct native_extent ext; + int ret; + + /* short circuit prev search for common first block alloc */ + if (cur->blk_off == 0 && delta < 0) + return 0; + + trace_printk("nei %lld from blk_off %llu blkno %llu blocks %llu\n", + delta, cur->blk_off, cur->blkno, cur->blocks); + + memset(&ext, ~0, sizeof(ext)); + init_extent_key(&last, last_bytes, &ext, arg, type); + + ext.blk_off = cur->blk_off + delta; + ext.blkno = cur->blkno + delta; + ext.blocks = 1; + init_extent_key(&key, key_bytes, &ext, arg, type); + + ret = scoutfs_item_next_same(sb, &key, &last, NULL); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } + + load_extent(nei, &key); + trace_printk("found nei blk_off %llu blkno %llu blocks %llu\n", + nei->blk_off, nei->blkno, nei->blocks); + + ret = merge_extents(cur, nei); +out: + return ret; +} + +/* + * Build the changes needed to insert the given extent. The semantics + * of the extents and callers means that we should not find existing extents + * that overlap the insertion. + */ +static int record_insert_changes(struct super_block *sb, + struct extent_change *chg, + struct native_extent *caller_ins, + u64 arg, u8 type) +{ + struct native_extent ins = *caller_ins; + struct native_extent ext; + int ret; + + trace_printk("inserting arg %llu type %u blk_off %llu blkno %llu blocks %llu\n", + arg, type, ins.blk_off, ins.blkno, ins.blocks); + + /* find the end */ + while (chg->type) + chg++; + + /* find previous that might be adjacent */ + ret = try_merge(sb, &ins, -1, &ext, arg, type); + if (ret < 0) + goto out; + else if (ret > 0) + chg = append_change(chg, false, &ext, arg, type); + + /* find next that might be adjacent */ + ret = try_merge(sb, &ins, 1, &ext, arg, type); + if (ret < 0) + goto out; + else if (ret > 0) + chg = append_change(chg, false, &ext, arg, type); + + /* and insert the new extent, possibly including merged neighbours */ + chg = append_change(chg, true, &ins, arg, type); + ret = 0; +out: + return ret; +} + +/* + * Record the changes needed to remove a portion of an existing extent. 
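+ * + * We find the single existing extent that entirely contains the removal, + * record its deletion, and then record insertion of the left and right + * remainders that fall outside the removed region.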
+ */ +static int record_remove_changes(struct super_block *sb, + struct extent_change *chg, + struct native_extent *rem, u64 arg, + u8 type) +{ + u8 last_bytes[MAX_KEY_BYTES]; + u8 key_bytes[MAX_KEY_BYTES]; + struct scoutfs_key_buf last; + struct scoutfs_key_buf key; + struct native_extent left; + struct native_extent right; + struct native_extent outer; + int ret; + + trace_printk("removing arg %llu type %u blk_off %llu blkno %llu blocks %llu\n", + arg, type, rem->blk_off, rem->blkno, rem->blocks); + + /* find the end */ + while (chg->type) + chg++; + + memset(&outer, ~0, sizeof(outer)); + init_extent_key(&last, last_bytes, &outer, arg, type); + + /* find outer existing extent that contains removal extent */ + init_extent_key(&key, key_bytes, rem, arg, type); + ret = scoutfs_item_next_same(sb, &key, &last, NULL); + if (ret) + goto out; + + load_extent(&outer, &key); + + trace_printk("found outer blk_off %llu blkno %llu blocks %llu\n", + outer.blk_off, outer.blkno, outer.blocks); + + if (!extents_within(&outer, rem)) { + ret = -EIO; + goto out; + } + + trim_extents(&left, &right, &outer, rem); + + chg = append_change(chg, false, &outer, arg, type); + + if (left.blocks) { + trace_printk("left trim blk_off %llu blkno %llu blocks %llu\n", + left.blk_off, left.blkno, left.blocks); + chg = append_change(chg, true, &left, arg, type); + } + + if (right.blocks) { + trace_printk("right trim blk_off %llu blkno %llu blocks %llu\n", + right.blk_off, right.blkno, right.blocks); + chg = append_change(chg, true, &right, arg, type); + } + + ret = 0; +out: + if (ret) + trace_printk("ret %d\n", ret); + return ret; +} + +/* + * Any given allocation or free of a file data extent can involve both + * insertion and deletion of both file extent and free extent items. To + * make these atomic we record all the insertions and deletions that are + * performed. We first dirty the deletions, then insert, then delete. + * This lets us always safely unwind on failure. + */ +static int apply_changes(struct super_block *sb, struct extent_change *changes) +{ + u8 key_bytes[MAX_KEY_BYTES]; + struct scoutfs_key_buf key; + struct extent_change *chg; + int ret; + int err; + + for (chg = changes; chg->type; chg++) { + if (chg->ins) + continue; + + init_extent_key(&key, key_bytes, &chg->ext, chg->arg, + chg->type); + ret = scoutfs_item_dirty(sb, &key); + if (ret) + goto out; + } + + for (chg = changes; chg->type; chg++) { + if (!chg->ins) + continue; + + init_extent_key(&key, key_bytes, &chg->ext, chg->arg, + chg->type); + ret = scoutfs_item_create(sb, &key, NULL); + if (ret) { + while ((--chg) >= changes) { + if (!chg->ins) + continue; + init_extent_key(&key, key_bytes, &chg->ext, + chg->arg, chg->type); + err = scoutfs_item_delete(sb, &key); + BUG_ON(err); + } + goto out; + } + } + + for (chg = changes; chg->type; chg++) { + if (chg->ins) + continue; + + init_extent_key(&key, key_bytes, &chg->ext, chg->arg, + chg->type); + ret = scoutfs_item_delete(sb, &key); + BUG_ON(ret); + } + +out: + return ret; +} + +int scoutfs_data_truncate_items(struct super_block *sb, u64 ino, u64 iblock, + u64 len, bool offline) +{ + BUG(); /* NYI */ +} + +/* + * These cheesy cursors are only meant to encourage nice IO patterns for + * concurrent tasks either streaming large file writes or creating lots + * of small files. It will do very poorly in many other situations. To + * do better we'd need to go further down the road to delalloc and take + * more surrounding context into account. 
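+ * + * Cursors live in a hash table keyed by the calling task and pid and are + * recycled from the tail of an LRU when a task shows up without one.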
+ */ +static struct task_cursor *get_cursor(struct data_info *datinf) +{ + struct task_cursor *curs; + struct cursor_id id = { + .task = current, + .pid = current->pid, + }; + + curs = rhashtable_lookup(&datinf->cursors, &id); + if (!curs) { + curs = list_last_entry(&datinf->cursor_lru, + struct task_cursor, list_head); + trace_printk("resetting curs %p was task %p pid %u\n", + curs, curs->id.task, curs->id.pid); + rhashtable_remove(&datinf->cursors, &curs->hash_head, GFP_NOFS); + curs->id = id; + rhashtable_insert(&datinf->cursors, &curs->hash_head, GFP_NOFS); + curs->blkno = 0; + curs->blocks = 0; + } + + list_move(&curs->list_head, &datinf->cursor_lru); + + return curs; +} + +static int bulk_alloc(struct super_block *sb) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct extent_change changes[MAX_CHANGES]; + struct native_extent ext; + u64 *segnos = NULL; + int ret; + int i; + + segnos = scoutfs_net_bulk_alloc(sb); + if (IS_ERR(segnos)) { + ret = PTR_ERR(segnos); + goto out; + } + + for (i = 0; segnos[i]; i++) { + memset(changes, 0, sizeof(changes)); + + /* merge or set this one */ + if (i > 0 && (segnos[i] == segnos[i - 1] + 1)) { + ext.blocks += SCOUTFS_SEGMENT_BLOCKS; + trace_printk("merged segno [%u] %llu blocks %llu\n", + i, segnos[i], ext.blocks); + } else { + ext.blkno = segnos[i] << SCOUTFS_SEGMENT_BLOCK_SHIFT; + ext.blocks = SCOUTFS_SEGMENT_BLOCKS; + trace_printk("set extent segno [%u] %llu blkno %llu\n", + i, segnos[i], ext.blkno); + } + + /* don't write if we merge with the next one */ + if ((segnos[i] + 1) == segnos[i + 1]) + continue; + + trace_printk("inserting extent [%u] blkno %llu blocks %llu\n", + i, ext.blkno, ext.blocks); + + ext.blk_off = ext.blkno; + ret = record_insert_changes(sb, changes, &ext, sbi->node_id, + SCOUTFS_FREE_EXTENT_BLKNO_KEY) ?: + apply_changes(sb, changes); + /* XXX error here leaks segnos */ + if (ret) + break; + } + +out: + if (!IS_ERR_OR_NULL(segnos)) + kfree(segnos); + + return ret; +} + +/* + * Allocate a single block for the logical block offset in the file. + * + * We try to merge single block allocations into large extents by using + * per-task cursors. Each cursor tracks a block region that should be + * searched for free extents. If we don't have a cursor, or we find + * free space outside of our cursor, then we look for the next large + * free extent. 
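+ * + * If the search comes up empty we fall back in stages: drop the cursor + * and search for the next large free extent, wrap that large search once, + * ask the server for more segments with a bulk allocation, and finally + * accept any free block at all before returning -ENOSPC.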
+ */ +static int allocate_block(struct inode *inode, sector_t iblock, u64 *blkno) +{ + struct super_block *sb = inode->i_sb; + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + DECLARE_DATA_INFO(sb, datinf); + struct extent_change changes[MAX_CHANGES] = {{{0,}}}; + u8 last_bytes[MAX_KEY_BYTES]; + u8 key_bytes[MAX_KEY_BYTES]; + struct scoutfs_key_buf last; + struct scoutfs_key_buf key; + struct native_extent last_ext; + struct native_extent found; + struct native_extent ext; + struct task_cursor *curs; + bool alloced = false; + u8 type; + int ret; + + memset(&last_ext, ~0, sizeof(last_ext)); + + down_write(&datinf->alloc_rwsem); + + curs = get_cursor(datinf); + + /* start from the cursor or look for the next large extent */ +reset_cursor: + if (curs->blocks) { + ext.blkno = curs->blkno; + ext.blocks = 0; + type = SCOUTFS_FREE_EXTENT_BLKNO_KEY; + } else { + ext.blkno = datinf->next_large_blkno; + ext.blocks = LARGE_EXTENT_BLOCKS; + type = SCOUTFS_FREE_EXTENT_BLOCKS_KEY; + } + +retry: + trace_printk("searching %llu,%llu curs %p task %p pid %u %llu,%llu\n", + ext.blkno, ext.blocks, curs, curs->id.task, curs->id.pid, + curs->blkno, curs->blocks); + + ext.blk_off = ext.blkno; + init_extent_key(&key, key_bytes, &ext, sbi->node_id, type); + init_extent_key(&last, last_bytes, &last_ext, sbi->node_id, type); + + ret = scoutfs_item_next_same(sb, &key, &last, NULL); + if (ret < 0) { + if (ret == -ENOENT) { + /* if the cursor's empty fall back to next large */ + if (ext.blkno && ext.blocks == 0) { + curs->blkno = 0; + curs->blocks = 0; + goto reset_cursor; + } + + /* wrap the search for large extents */ + if (ext.blkno > LARGE_EXTENT_BLOCKS && ext.blocks) { + datinf->next_large_blkno = LARGE_EXTENT_BLOCKS; + ext.blkno = datinf->next_large_blkno; + goto retry; + } + + /* ask the server for more extents */ + if (ext.blocks && !alloced) { + ret = bulk_alloc(sb); + if (ret < 0) + goto out; + alloced = true; + goto retry; + } + + /* finally look for any free block at all */ + if (ext.blocks) { + ext.blkno = 0; + ext.blocks = 0; + type = SCOUTFS_FREE_EXTENT_BLKNO_KEY; + goto retry; + } + + /* after all that return -ENOSPC */ + ret = -ENOSPC; + } + goto out; + } + + load_extent(&found, &key); + trace_printk("found %llu,%llu\n", found.blkno, found.blocks); + + /* look for a new large extent if found is outside cursor */ + if (curs->blocks && + (found.blkno + found.blocks <= curs->blkno || + found.blkno >= curs->blkno + curs->blocks)) { + curs->blkno = 0; + curs->blocks = 0; + goto reset_cursor; + } + + /* + * Set the cursor if: + * - we didn't already have one + * - it's large enough for a large extent with alignment padding + * - the sufficiently large free region is past next large + */ + if (!curs->blocks && + found.blocks >= (2 * LARGE_EXTENT_BLOCKS) && + (found.blkno + found.blocks - (2 * LARGE_EXTENT_BLOCKS) >= + datinf->next_large_blkno)) { + + curs->blkno = ALIGN(max(found.blkno, datinf->next_large_blkno), + LARGE_EXTENT_BLOCKS); + curs->blocks = LARGE_EXTENT_BLOCKS; + found.blkno = curs->blkno; + found.blocks = curs->blocks; + + datinf->next_large_blkno = curs->blkno + LARGE_EXTENT_BLOCKS; + } + + trace_printk("using %llu,%llu curs %llu,%llu\n", + found.blkno, found.blocks, curs->blkno, curs->blocks); + + *blkno = found.blkno; + ext.blk_off = iblock; + ext.blkno = found.blkno; + ext.blocks = 1; + ret = record_insert_changes(sb, changes, &ext, scoutfs_ino(inode), + SCOUTFS_FILE_EXTENT_KEY); + if (ret < 0) + goto out; + + ext.blk_off = ext.blkno; + ret = record_remove_changes(sb, changes, &ext, 
sbi->node_id, + SCOUTFS_FREE_EXTENT_BLKNO_KEY) ?: + apply_changes(sb, changes); + + /* advance cursor if we're using it */ + if (ret == 0 && curs->blocks) { + if (--curs->blocks == 0) + curs->blkno = 0; + else + curs->blkno++; + } + +out: + up_write(&datinf->alloc_rwsem); + return ret; +} + +static int scoutfs_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + struct super_block *sb = inode->i_sb; + DECLARE_DATA_INFO(sb, datinf); + u8 last_bytes[MAX_KEY_BYTES]; + u8 key_bytes[MAX_KEY_BYTES]; + struct scoutfs_key_buf last; + struct scoutfs_key_buf key; + struct native_extent ext; + u64 blocks; + u64 blkno; + u64 off; + int ret; + + bh->b_blocknr = 0; + bh->b_size = 0; + blocks = 0; + + ext.blk_off = iblock; + ext.blocks = 1; + ext.blkno = 0; + init_extent_key(&key, key_bytes, &ext, scoutfs_ino(inode), + SCOUTFS_FILE_EXTENT_KEY); + + ext.blk_off = ~0ULL; + ext.blkno = ~0ULL; + ext.blocks = ~0ULL; + init_extent_key(&last, last_bytes, &ext, scoutfs_ino(inode), + SCOUTFS_FILE_EXTENT_KEY); + + /* + * XXX think about how far this next can go, given locking and + * item consistency. + */ + down_read(&datinf->alloc_rwsem); + ret = scoutfs_item_next_same(sb, &key, &last, NULL); + up_read(&datinf->alloc_rwsem); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + else + goto out; + } else { + load_extent(&ext, &key); + trace_printk("found blk_off %llu blkno %llu blocks %llu\n", + ext.blk_off, ext.blkno, ext.blocks); + if (iblock >= ext.blk_off && + iblock < (ext.blk_off + ext.blocks)) { + off = iblock - ext.blk_off; + blkno = ext.blkno + off; + blocks = ext.blocks - off; + } + } + + if (blocks == 0 && create) { + ret = allocate_block(inode, iblock, &blkno); + if (ret) + goto out; + + blocks = 1; + } + + if (blocks) { + map_bh(bh, inode->i_sb, blkno); + bh->b_size = min_t(u64, SIZE_MAX, + blocks << SCOUTFS_BLOCK_SHIFT); + } + +out: + trace_printk("ino %llu iblock %llu create %d ret %d bnr %llu size %zu\n", + scoutfs_ino(inode), (u64)iblock, create, ret, + (u64)bh->b_blocknr, bh->b_size); + + return ret; +} + +static int scoutfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, scoutfs_get_block); +} + +static int scoutfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block); +} + +static int scoutfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, scoutfs_get_block, wbc); +} + +static int scoutfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, scoutfs_get_block); +} + static int scoutfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -382,158 +876,56 @@ static int scoutfs_write_begin(struct file *file, { struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; - pgoff_t index = pos >> PAGE_SHIFT; - loff_t size = i_size_read(inode); - struct page *page; int ret; - trace_printk("ino %llu pos %llu len %u flags %x\n", - scoutfs_ino(inode), (u64)pos, len, flags); - scoutfs_inc_counter(sb, data_write_begin); + trace_printk("ino %llu pos %llu len %u\n", + scoutfs_ino(inode), (u64)pos, len); ret = scoutfs_hold_trans(sb); if (ret) - return ret; + goto out; /* can't re-enter fs, have trans */ flags |= AOP_FLAG_NOFS; + /* generic write_end updates i_size and calls dirty_inode */ ret = scoutfs_dirty_inode_item(inode); - if (ret) 
- goto out; - -retry: - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { - ret = -ENOMEM; - goto out; - } - - trace_printk(PGF"\n", PGA(page)); - - if (!PageUptodate(page) && (pos < size && len < PAGE_CACHE_SIZE)) { - ClearPageError(page); - ret = scoutfs_readpage(file, page); - if (!ret) { - wait_on_page_locked(page); - if (!PageUptodate(page)) - ret = -EIO; - } - page_cache_release(page); - if (ret) - goto out; - - /* let grab_ lock and check for truncated pages */ - goto retry; - } - - *pagep = page; - ret = 0; -out: + if (ret == 0) + ret = block_write_begin(mapping, pos, len, flags, pagep, + scoutfs_get_block); if (ret) scoutfs_release_trans(sb); - - trace_printk("ret %d\n", ret); +out: return ret; } -/* - * Finish modification of a page cache page. - * - * write_begin has held the transaction and dirtied the inode. We - * create items for each dirty block whose value references the page - * contents that will be written. - * - * We Modify the dirty item and its dependent metadata items while - * holding the transaction so that we never get missing data. - * - * XXX - * - detect no change with copied == 0? - * - only iterate over written blocks, not the whole page? - * - make sure page granular locking and concurrent extending writes works - * - error handling needs work, truncate partial writes on failure? - */ static int scoutfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - struct inode *inode = page->mapping->host; + struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; - struct scoutfs_data_key dkey; - struct scoutfs_key_buf key; - SCOUTFS_DECLARE_KVEC(val); - loff_t old_size = i_size_read(inode); - bool update_inode = false; - loff_t new_size; - unsigned start; - loff_t loff; - u64 block; int ret; - trace_printk("ino %llu "PGF" pos %llu len %u copied %d\n", - scoutfs_ino(inode), PGA(page), (u64)pos, len, copied); - scoutfs_inc_counter(sb, data_write_end); + trace_printk("ino %llu pgind %lu pos %llu len %u copied %d\n", + scoutfs_ino(inode), page->index, (u64)pos, len, copied); - /* zero any unwritten portions of a new page around the write */ - if (!PageUptodate(page)) { - if (copied != PAGE_CACHE_SIZE) { - start = pos & ~PAGE_CACHE_MASK; - zero_user_segments(page, 0, start, - start + copied, PAGE_CACHE_SIZE); - } - SetPageUptodate(page); - } - - new_size = pos + copied; - - for_each_page_block(page, start, loff, block, key, dkey, val) { - - /* only put data inside i_size in items */ - /* XXX ugh, kvecs are still clumsy :) */ - if (loff + SCOUTFS_BLOCK_SIZE > new_size) - val[0].iov_len = new_size - loff; - - ret = scoutfs_item_create_ephemeral(sb, &key, val); - if (ret) - goto out; - } - - /* update i_size if we extended */ - if (new_size > inode->i_size) { - i_size_write(inode, new_size); - update_inode = true; - } - - if (old_size < pos) - pagecache_isize_extended(inode, old_size, pos); - - if (copied) { + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret > 0) { scoutfs_inode_inc_data_version(inode); - update_inode = true; - } - - if (update_inode) + /* XXX kind of a big hammer, inode life cycle needs work */ scoutfs_update_inode_item(inode); - - flush_dcache_page(page); - set_page_dirty(page); - SetPagePrivate(page); /* call invalidatepage */ - - ret = copied; -out: - unlock_page(page); + scoutfs_inode_queue_writeback(inode); + } scoutfs_release_trans(sb); - - /* XXX error handling needs work */ - 
WARN_ON_ONCE(ret < 0); return ret; } const struct address_space_operations scoutfs_file_aops = { .readpage = scoutfs_readpage, + .readpages = scoutfs_readpages, .writepage = scoutfs_writepage, - .set_page_dirty = __set_page_dirty_nobuffers, - .invalidatepage = scoutfs_invalidatepage, + .writepages = scoutfs_writepages, .write_begin = scoutfs_write_begin, .write_end = scoutfs_write_end, }; @@ -545,23 +937,75 @@ const struct file_operations scoutfs_file_fops = { .aio_write = generic_file_aio_write, .unlocked_ioctl = scoutfs_ioctl, .fsync = scoutfs_file_fsync, - .llseek = generic_file_llseek, }; +static int derpy_global_mutex_is_held(void) +{ + return 1; +} + +static struct rhashtable_params cursor_hash_params = { + .key_len = member_sizeof(struct task_cursor, id), + .key_offset = offsetof(struct task_cursor, id), + .head_offset = offsetof(struct task_cursor, hash_head), + .hashfn = arch_fast_hash, + .grow_decision = rht_grow_above_75, + .shrink_decision = rht_shrink_below_30, + + .mutex_is_held = derpy_global_mutex_is_held, +}; + +static void destroy_cursors(struct data_info *datinf) +{ + struct task_cursor *curs; + struct task_cursor *pos; + + list_for_each_entry_safe(curs, pos, &datinf->cursor_lru, list_head) { + list_del_init(&curs->list_head); + kfree(curs); + } + rhashtable_destroy(&datinf->cursors); +} + int scoutfs_data_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct data_info *datinf; - - /* page block iteration doesn't understand multiple pages per block */ - BUILD_BUG_ON(PAGE_SIZE < SCOUTFS_BLOCK_SIZE); + struct task_cursor *curs; + int ret; + int i; datinf = kzalloc(sizeof(struct data_info), GFP_KERNEL); if (!datinf) return -ENOMEM; - sbi->data_info = datinf; - init_llist_head(&datinf->writeback_pages); + init_rwsem(&datinf->alloc_rwsem); + INIT_LIST_HEAD(&datinf->cursor_lru); + /* always search for large aligned extents */ + datinf->next_large_blkno = LARGE_EXTENT_BLOCKS; + + ret = rhashtable_init(&datinf->cursors, &cursor_hash_params); + if (ret) { + kfree(datinf); + return -ENOMEM; + } + + /* just allocate all of these up front */ + for (i = 0; i < NR_CURSORS; i++) { + curs = kzalloc(sizeof(struct task_cursor), GFP_KERNEL); + if (!curs) { + destroy_cursors(datinf); + kfree(datinf); + return -ENOMEM; + } + + curs->id.pid = i; + rhashtable_insert(&datinf->cursors, &curs->hash_head, + GFP_KERNEL); + list_add(&curs->list_head, &datinf->cursor_lru); + } + + sbi->data_info = datinf; return 0; } @@ -572,7 +1016,7 @@ void scoutfs_data_destroy(struct super_block *sb) struct data_info *datinf = sbi->data_info; if (datinf) { - WARN_ON_ONCE(!llist_empty(&datinf->writeback_pages)); + destroy_cursors(datinf); kfree(datinf); } } diff --git a/kmod/src/data.h b/kmod/src/data.h index 189b2cba..1319d100 100644 --- a/kmod/src/data.h +++ b/kmod/src/data.h @@ -6,7 +6,6 @@ extern const struct file_operations scoutfs_file_fops; int scoutfs_data_truncate_items(struct super_block *sb, u64 ino, u64 iblock, u64 len, bool offline); -void scoutfs_data_end_writeback(struct super_block *sb, int err); int scoutfs_data_setup(struct super_block *sb); void scoutfs_data_destroy(struct super_block *sb); diff --git a/kmod/src/format.h b/kmod/src/format.h index 5d3c184b..a58d12b6 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -156,9 +156,10 @@ struct scoutfs_segment_block { #define SCOUTFS_READDIR_KEY 6 #define SCOUTFS_LINK_BACKREF_KEY 7 #define SCOUTFS_SYMLINK_KEY 8 -#define SCOUTFS_EXTENT_KEY 9 +#define SCOUTFS_FILE_EXTENT_KEY 9 #define SCOUTFS_ORPHAN_KEY 10 -#define 
SCOUTFS_DATA_KEY 11 +#define SCOUTFS_FREE_EXTENT_BLKNO_KEY 11 +#define SCOUTFS_FREE_EXTENT_BLOCKS_KEY 12 /* not found in the fs */ #define SCOUTFS_MAX_UNUSED_KEY 253 #define SCOUTFS_NET_ADDR_KEY 254 @@ -198,11 +199,28 @@ struct scoutfs_orphan_key { __be64 ino; } __packed; -/* value is data payload bytes */ -struct scoutfs_data_key { +/* no value */ +struct scoutfs_file_extent_key { __u8 type; __be64 ino; - __be64 block; + __be64 last_blk_off; + __be64 last_blkno; + __be64 blocks; +} __packed; + +/* no value */ +struct scoutfs_free_extent_blkno_key { + __u8 type; + __be64 node_id; + __be64 last_blkno; + __be64 blocks; +} __packed; + +struct scoutfs_free_extent_blocks_key { + __u8 type; + __be64 node_id; + __be64 blocks; + __be64 last_blkno; } __packed; /* value is each item's part of the full xattr value for the off/len */ @@ -384,6 +402,11 @@ struct scoutfs_net_manifest_entries { struct scoutfs_manifest_entry ments[0]; } __packed; +struct scoutfs_net_segnos { + __le16 nr; + __le64 segnos[0]; +} __packed; + enum { /* sends and receives a struct scoutfs_timeval */ SCOUTFS_NET_TRADE_TIME = 0, @@ -391,6 +414,7 @@ enum { SCOUTFS_NET_MANIFEST_RANGE_ENTRIES, SCOUTFS_NET_ALLOC_SEGNO, SCOUTFS_NET_RECORD_SEGMENT, + SCOUTFS_NET_BULK_ALLOC, SCOUTFS_NET_UNKNOWN, }; diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 7f61f552..71a98d33 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -47,6 +47,16 @@ struct free_ino_pool { bool in_flight; }; +struct inode_sb_info { + struct free_ino_pool pool; + + spinlock_t writeback_lock; + struct rb_root writeback_inodes; +}; + +#define DECLARE_INODE_SB_INFO(sb, name) \ + struct inode_sb_info *name = SCOUTFS_SB(sb)->inode_sb_info + static struct kmem_cache *scoutfs_inode_cachep; /* @@ -61,6 +71,7 @@ static void scoutfs_inode_ctor(void *obj) seqcount_init(&ci->seqcount); ci->staging = false; init_rwsem(&ci->xattr_rwsem); + RB_CLEAR_NODE(&ci->writeback_node); inode_init_once(&ci->inode); } @@ -84,8 +95,48 @@ static void scoutfs_i_callback(struct rcu_head *head) kmem_cache_free(scoutfs_inode_cachep, SCOUTFS_I(inode)); } +static void insert_writeback_inode(struct inode_sb_info *inf, + struct scoutfs_inode_info *ins) +{ + struct rb_root *root = &inf->writeback_inodes; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + struct scoutfs_inode_info *si; + + while (*node) { + parent = *node; + si = container_of(*node, struct scoutfs_inode_info, + writeback_node); + + if (ins->ino < si->ino) + node = &(*node)->rb_left; + else if (ins->ino > si->ino) + node = &(*node)->rb_right; + else + BUG(); + } + + rb_link_node(&ins->writeback_node, parent, node); + rb_insert_color(&ins->writeback_node, root); +} + +static void remove_writeback_inode(struct inode_sb_info *inf, + struct scoutfs_inode_info *si) +{ + if (!RB_EMPTY_NODE(&si->writeback_node)) { + rb_erase(&si->writeback_node, &inf->writeback_inodes); + RB_CLEAR_NODE(&si->writeback_node); + } +} + void scoutfs_destroy_inode(struct inode *inode) { + DECLARE_INODE_SB_INFO(inode->i_sb, inf); + + spin_lock(&inf->writeback_lock); + remove_writeback_inode(inf, SCOUTFS_I(inode)); + spin_unlock(&inf->writeback_lock); + call_rcu(&inode->i_rcu, scoutfs_i_callback); } @@ -393,7 +444,7 @@ u64 scoutfs_last_ino(struct super_block *sb) */ void scoutfs_inode_fill_pool(struct super_block *sb, u64 ino, u64 nr) { - struct free_ino_pool *pool = SCOUTFS_SB(sb)->free_ino_pool; + struct free_ino_pool *pool = &SCOUTFS_SB(sb)->inode_sb_info->pool; trace_printk("filling ino %llu nr %llu\n", ino, nr); @@ -427,7 +478,7 @@ 
static bool pool_in_flight(struct free_ino_pool *pool) */ static int alloc_ino(struct super_block *sb, u64 *ino) { - struct free_ino_pool *pool = SCOUTFS_SB(sb)->free_ino_pool; + struct free_ino_pool *pool = &SCOUTFS_SB(sb)->inode_sb_info->pool; bool request; int ret; @@ -733,28 +784,121 @@ int scoutfs_orphan_inode(struct inode *inode) return ret; } +/* + * Track an inode that could have dirty pages. Used to kick off writeback + * on all dirty pages during transaction commit without tying ourselves in + * knots trying to call through the high level vfs sync methods. + */ +void scoutfs_inode_queue_writeback(struct inode *inode) +{ + DECLARE_INODE_SB_INFO(inode->i_sb, inf); + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + + spin_lock(&inf->writeback_lock); + if (RB_EMPTY_NODE(&si->writeback_node)) + insert_writeback_inode(inf, si); + spin_unlock(&inf->writeback_lock); +} + +/* + * Walk our dirty inodes in ino order and either start dirty page + * writeback or wait for writeback to complete. + * + * This is called by transaction commiting so other writers are + * excluded. We're still very careful to iterate over the tree while it + * and the inodes could be changing. + * + * Because writes are excluded we know that there's no remaining dirty + * pages once waiting returns successfully. + * + * XXX not sure what to do about retrying io errors. + */ +int scoutfs_inode_walk_writeback(struct super_block *sb, bool write) +{ + DECLARE_INODE_SB_INFO(sb, inf); + struct scoutfs_inode_info *si; + struct rb_node *node; + struct inode *inode; + struct inode *defer_iput = NULL; + int ret; + + spin_lock(&inf->writeback_lock); + + node = rb_first(&inf->writeback_inodes); + while (node) { + si = container_of(node, struct scoutfs_inode_info, + writeback_node); + node = rb_next(node); + inode = igrab(&si->inode); + if (!inode) + continue; + + spin_unlock(&inf->writeback_lock); + + if (defer_iput) { + iput(defer_iput); + defer_iput = NULL; + } + + if (write) + ret = filemap_fdatawrite(inode->i_mapping); + else + ret = filemap_fdatawait(inode->i_mapping); + trace_printk("ino %llu write %d ret %d\n", + scoutfs_ino(inode), write, ret); + if (ret) { + iput(inode); + goto out; + } + + spin_lock(&inf->writeback_lock); + + if (WARN_ON_ONCE(RB_EMPTY_NODE(&si->writeback_node))) + node = rb_first(&inf->writeback_inodes); + else + node = rb_next(&si->writeback_node); + + if (!write) + remove_writeback_inode(inf, si); + + /* avoid iput->destroy lock deadlock */ + defer_iput = inode; + } + + spin_unlock(&inf->writeback_lock); +out: + if (defer_iput) + iput(defer_iput); + return ret; +} + int scoutfs_inode_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct free_ino_pool *pool; + struct inode_sb_info *inf; - pool = kzalloc(sizeof(struct free_ino_pool), GFP_KERNEL); - if (!pool) + inf = kzalloc(sizeof(struct inode_sb_info), GFP_KERNEL); + if (!inf) return -ENOMEM; + pool = &inf->pool; init_waitqueue_head(&pool->waitq); spin_lock_init(&pool->lock); - sbi->free_ino_pool = pool; + spin_lock_init(&inf->writeback_lock); + inf->writeback_inodes = RB_ROOT; + + sbi->inode_sb_info = inf; return 0; } void scoutfs_inode_destroy(struct super_block *sb) { - struct free_ino_pool *pool = SCOUTFS_SB(sb)->free_ino_pool; + struct inode_sb_info *inf = SCOUTFS_SB(sb)->inode_sb_info; - kfree(pool); + kfree(inf); } void scoutfs_inode_exit(void) diff --git a/kmod/src/inode.h b/kmod/src/inode.h index da24e9af..59da8f7a 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -13,6 +13,7 @@ struct 
scoutfs_inode_info { seqcount_t seqcount; bool staging; /* holder of i_mutex is staging */ struct rw_semaphore xattr_rwsem; + struct rb_node writeback_node; struct inode inode; }; @@ -48,6 +49,9 @@ u64 scoutfs_inode_get_data_version(struct inode *inode); int scoutfs_scan_orphans(struct super_block *sb); +void scoutfs_inode_queue_writeback(struct inode *inode); +int scoutfs_inode_walk_writeback(struct super_block *sb, bool write); + u64 scoutfs_last_ino(struct super_block *sb); void scoutfs_inode_exit(void); diff --git a/kmod/src/net.c b/kmod/src/net.c index 11e5fe30..33d2da3c 100644 --- a/kmod/src/net.c +++ b/kmod/src/net.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "format.h" #include "net.h" @@ -363,6 +364,61 @@ static struct send_buf *alloc_sbuf(unsigned data_len) return sbuf; } +/* XXX I dunno, totally made up */ +#define BULK_COUNT 32 + +static struct send_buf *process_bulk_alloc(struct super_block *sb,void *req, + int req_len) +{ + DECLARE_NET_INFO(sb, nti); + struct scoutfs_net_segnos *ns; + struct commit_waiter cw; + struct send_buf *sbuf; + u64 segno; + int ret; + int i; + + if (req_len != 0) + return ERR_PTR(-EINVAL); + + sbuf = alloc_sbuf(offsetof(struct scoutfs_net_segnos, + segnos[BULK_COUNT])); + if (!sbuf) + return ERR_PTR(-ENOMEM); + + ns = (void *)sbuf->nh->data; + ns->nr = cpu_to_le16(BULK_COUNT); + + down_read(&nti->ring_commit_rwsem); + + for (i = 0; i < BULK_COUNT; i++) { + ret = scoutfs_alloc_segno(sb, &segno); + if (ret) { + while (i-- > 0) + scoutfs_alloc_free(sb, + le64_to_cpu(ns->segnos[i])); + break; + } + + ns->segnos[i] = cpu_to_le64(segno); + } + + + if (ret == 0) + queue_commit_work(nti, &cw); + up_read(&nti->ring_commit_rwsem); + + if (ret == 0) + ret = wait_for_commit(&cw); + + if (ret) + sbuf->nh->status = SCOUTFS_NET_STATUS_ERROR; + else + sbuf->nh->status = SCOUTFS_NET_STATUS_SUCCESS; + + return sbuf; +} + static struct send_buf *process_record_segment(struct super_block *sb, void *req, int req_len) { @@ -616,6 +672,7 @@ static proc_func_t type_proc_func(u8 type) process_manifest_range_entries, [SCOUTFS_NET_ALLOC_SEGNO] = process_alloc_segno, [SCOUTFS_NET_RECORD_SEGMENT] = process_record_segment, + [SCOUTFS_NET_BULK_ALLOC] = process_bulk_alloc, }; return type < SCOUTFS_NET_UNKNOWN ? funcs[type] : NULL; @@ -1100,6 +1157,113 @@ static int add_send_buf(struct super_block *sb, int type, void *data, return 0; } +struct bulk_alloc_args { + struct completion comp; + u64 *segnos; + int ret; +}; + +static int sort_cmp_u64s(const void *A, const void *B) +{ + const u64 *a = A; + const u64 *b = B; + + return *a < *b ? -1 : *a > *b ? 1 : 0; +} + +static void sort_swap_u64s(void *A, void *B, int size) +{ + u64 *a = A; + u64 *b = B; + + swap(*a, *b); +} + +static int bulk_alloc_reply(struct super_block *sb, void *reply, int ret, + void *arg) +{ + struct bulk_alloc_args *args = arg; + struct scoutfs_net_segnos *ns = reply; + u16 nr; + int i; + + if (ret < sizeof(struct scoutfs_net_segnos) || + ret != offsetof(struct scoutfs_net_segnos, + segnos[le16_to_cpu(ns->nr)])) { + ret = -EINVAL; + goto out; + } + + nr = le16_to_cpu(ns->nr); + + args->segnos = kmalloc((nr + 1) * sizeof(args->segnos[0]), GFP_NOFS); + if (args->segnos == NULL) { + ret = -ENOMEM; /* XXX hmm. 
*/ + goto out; + } + + for (i = 0; i < nr; i++) { + args->segnos[i] = le64_to_cpu(ns->segnos[i]); + + /* make sure they're all non-zero */ + if (args->segnos[i] == 0) { + ret = -EINVAL; + goto out; + } + } + + sort(args->segnos, nr, sizeof(args->segnos[0]), + sort_cmp_u64s, sort_swap_u64s); + + /* make sure they're all unique */ + for (i = 1; i < nr; i++) { + if (args->segnos[i] == args->segnos[i - 1]) { + ret = -EINVAL; + goto out; + } + } + + args->segnos[nr] = 0; + ret = 0; +out: + if (ret && args->segnos) { + kfree(args->segnos); + args->segnos = NULL; + } + args->ret = ret; + complete(&args->comp); + return args->ret; +} + +/* + * Returns a 0-terminated allocated array of segnos, the caller is + * responsible for freeing it. + */ +u64 *scoutfs_net_bulk_alloc(struct super_block *sb) +{ + struct bulk_alloc_args args; + int ret; + + args.segnos = NULL; + init_completion(&args.comp); + + ret = add_send_buf(sb, SCOUTFS_NET_BULK_ALLOC, NULL, 0, + bulk_alloc_reply, &args); + if (ret == 0) { + wait_for_completion(&args.comp); + ret = args.ret; + if (ret == 0 && (args.segnos == NULL || args.segnos[0] == 0)) + ret = -ENOSPC; + } + + if (ret) { + kfree(args.segnos); + args.segnos = ERR_PTR(ret); + } + + return args.segnos; +} + /* * Eventually we're going to have messages that control compaction. * Each client mount would have long-lived work that sends requests diff --git a/kmod/src/net.h b/kmod/src/net.h index d48fc2b7..125bf327 100644 --- a/kmod/src/net.h +++ b/kmod/src/net.h @@ -13,6 +13,7 @@ int scoutfs_net_manifest_range_entries(struct super_block *sb, int scoutfs_net_alloc_segno(struct super_block *sb, u64 *segno); int scoutfs_net_record_segment(struct super_block *sb, struct scoutfs_segment *seg, u8 level); +u64 *scoutfs_net_bulk_alloc(struct super_block *sb); int scoutfs_net_get_compaction(struct super_block *sb, void *curs); int scoutfs_net_finish_compaction(struct super_block *sb, void *curs, diff --git a/kmod/src/super.c b/kmod/src/super.c index 48fb27d2..0fe2ed52 100644 --- a/kmod/src/super.c +++ b/kmod/src/super.c @@ -204,6 +204,12 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; + /* + * XXX this is random today for initial testing, but we'll want + * it to be assigned by the server. + */ + get_random_bytes_arch(&sbi->node_id, sizeof(sbi->node_id)); + spin_lock_init(&sbi->next_ino_lock); atomic_set(&sbi->trans_holds, 0); init_waitqueue_head(&sbi->trans_hold_wq); diff --git a/kmod/src/super.h b/kmod/src/super.h index 184c92b8..5b6d5903 100644 --- a/kmod/src/super.h +++ b/kmod/src/super.h @@ -14,11 +14,13 @@ struct compact_info; struct data_info; struct lock_info; struct net_info; -struct free_ino_pool; +struct inode_sb_info; struct scoutfs_sb_info { struct super_block *sb; + u64 node_id; + struct scoutfs_super_block super; spinlock_t next_ino_lock; @@ -29,7 +31,7 @@ struct scoutfs_sb_info { struct seg_alloc *seg_alloc; struct compact_info *compact_info; struct data_info *data_info; - struct free_ino_pool *free_ino_pool; + struct inode_sb_info *inode_sb_info; atomic_t trans_holds; wait_queue_head_t trans_hold_wq; diff --git a/kmod/src/trans.c b/kmod/src/trans.c index e6247bc0..11941c7e 100644 --- a/kmod/src/trans.c +++ b/kmod/src/trans.c @@ -26,6 +26,7 @@ #include "seg.h" #include "counters.h" #include "net.h" +#include "inode.h" #include "scoutfs_trace.h" /* @@ -97,10 +98,12 @@ void scoutfs_trans_write_func(struct work_struct *work) * about leaking segnos nor duplicate manifest entries * on crashes between us and the server. 
*/ - ret = scoutfs_net_alloc_segno(sb, &segno) ?: + ret = scoutfs_inode_walk_writeback(sb, true) ?: + scoutfs_net_alloc_segno(sb, &segno) ?: scoutfs_seg_alloc(sb, segno, &seg) ?: scoutfs_item_dirty_seg(sb, seg) ?: scoutfs_seg_submit_write(sb, seg, &comp) ?: + scoutfs_inode_walk_writeback(sb, false) ?: scoutfs_bio_wait_comp(sb, &comp) ?: scoutfs_net_record_segment(sb, seg, 0); if (ret) @@ -112,9 +115,6 @@ out: /* XXX this all needs serious work for dealing with errors */ WARN_ON_ONCE(ret); - /* must be done before waking waiting trans holders who might dirty */ - scoutfs_data_end_writeback(sb, ret); - spin_lock(&sbi->trans_write_lock); sbi->trans_write_count++; sbi->trans_write_ret = ret;