scoutfs: use extents in the server allocator

Have the server use the extent core to maintain free extent items in the
allocation btree instead of the bitmap items.

We add a client request to allocate an extent of a given length.  The
existing segment alloc and free now work with a segment's worth of
blocks.

The server now maintains free block counters in the super block instead
of free segment counters.  We also maintain an allocation cursor so that
allocations tend to cycle through the device.  It's stored in the super
so that it is maintained across server instances.

To keep this commit from getting too noisy it doesn't remove the
now-unused dead code.  That will be removed in a future commit.

Signed-off-by: Zach Brown <zab@versity.com>
Author: Zach Brown
Date: 2018-04-10 13:01:31 -07:00
Committed by: Zach Brown
Parent: 19f7e0284b
Commit: c01a715852
12 changed files with 545 additions and 65 deletions

View File

@@ -556,6 +556,25 @@ int scoutfs_client_alloc_inodes(struct super_block *sb, u64 count,
return ret;
}
int scoutfs_client_alloc_extent(struct super_block *sb, u64 len, u64 *start)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
__le64 lelen = cpu_to_le64(len);
__le64 lestart;
int ret;
ret = client_request(client, SCOUTFS_NET_ALLOC_EXTENT,
&lelen, sizeof(lelen), &lestart, sizeof(lestart));
if (ret == 0) {
if (lestart == 0)
ret = -ENOSPC;
else
*start = le64_to_cpu(lestart);
}
return ret;
}
int scoutfs_client_alloc_segno(struct super_block *sb, u64 *segno)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;

View File

@@ -3,6 +3,7 @@
int scoutfs_client_alloc_inodes(struct super_block *sb, u64 count,
u64 *ino, u64 *nr);
int scoutfs_client_alloc_extent(struct super_block *sb, u64 len, u64 *start);
int scoutfs_client_alloc_segno(struct super_block *sb, u64 *segno);
int scoutfs_client_record_segment(struct super_block *sb,
struct scoutfs_segment *seg, u8 level);

View File

@@ -494,7 +494,7 @@ void scoutfs_compact_add_segno(struct super_block *sb, void *data, u64 segno)
/*
* Commit the result of a compaction based on the state of the cursor.
* The net caller stops the manifest from being written while we're
* The server caller stops the manifest from being written while we're
* making changes. We lock the manifest to atomically make our changes.
*
The error handling is sketchy here because calling the manifest from
@@ -513,7 +513,7 @@ int scoutfs_compact_commit(struct super_block *sb, void *c, void *r)
/* free unused segnos that were allocated for the compaction */
for (i = 0; i < curs->nr_segnos; i++) {
if (curs->segnos[i]) {
ret = scoutfs_alloc_free(sb, curs->segnos[i]);
ret = scoutfs_server_free_segno(sb, curs->segnos[i]);
BUG_ON(ret);
}
}
@@ -523,7 +523,7 @@ int scoutfs_compact_commit(struct super_block *sb, void *c, void *r)
/* delete input segments, probably freeing their segnos */
list_for_each_entry(cseg, &curs->csegs, entry) {
if (!cseg->part_of_move) {
ret = scoutfs_alloc_free(sb, cseg->segno);
ret = scoutfs_server_free_segno(sb, cseg->segno);
BUG_ON(ret);
}

View File

@@ -211,8 +211,7 @@ static inline const struct scoutfs_item_count SIC_XATTR_SET(unsigned old_parts,
static inline const struct scoutfs_item_count SIC_WRITE_BEGIN(void)
{
struct scoutfs_item_count cnt = {0,};
unsigned nr_free = (SCOUTFS_BULK_ALLOC_COUNT +
SCOUTFS_BLOCKS_PER_PAGE) * 3;
unsigned nr_free = (1 + SCOUTFS_BLOCKS_PER_PAGE) * 3;
unsigned nr_file = (DIV_ROUND_UP(SCOUTFS_BLOCKS_PER_PAGE, 2) +
SCOUTFS_BLOCKS_PER_PAGE) * 3;

View File

@@ -101,6 +101,13 @@
EXPAND_COUNTER(seg_free) \
EXPAND_COUNTER(seg_shrink) \
EXPAND_COUNTER(seg_stale_read) \
EXPAND_COUNTER(server_alloc_segno) \
EXPAND_COUNTER(server_extent_alloc) \
EXPAND_COUNTER(server_extent_alloc_error) \
EXPAND_COUNTER(server_free_extent) \
EXPAND_COUNTER(server_free_pending_extent) \
EXPAND_COUNTER(server_free_pending_error) \
EXPAND_COUNTER(server_free_segno) \
EXPAND_COUNTER(trans_commit_fsync) \
EXPAND_COUNTER(trans_commit_full) \
EXPAND_COUNTER(trans_commit_item_flush) \

View File

@@ -456,38 +456,24 @@ static struct task_cursor *get_cursor(struct data_info *datinf)
return curs;
}
static int bulk_alloc(struct super_block *sb)
static int get_server_extent(struct super_block *sb, u64 len)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_extent ext;
u64 *segnos = NULL;
int ret = 0;
int i;
u64 start;
int ret;
segnos = scoutfs_client_bulk_alloc(sb);
if (IS_ERR(segnos)) {
ret = PTR_ERR(segnos);
ret = scoutfs_client_alloc_extent(sb, len, &start);
if (ret)
goto out;
}
for (i = 0; segnos[i]; i++) {
scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
sbi->node_id,
segnos[i] << SCOUTFS_SEGMENT_BLOCK_SHIFT,
SCOUTFS_SEGMENT_BLOCKS, 0, 0);
trace_scoutfs_data_bulk_alloc(sb, &ext);
ret = scoutfs_extent_add(sb, data_extent_io, &ext,
sbi->node_id_lock);
if (ret)
break;
}
scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
sbi->node_id, start, len, 0, 0);
trace_scoutfs_data_get_server_extent(sb, &ext);
ret = scoutfs_extent_add(sb, data_extent_io, &ext, sbi->node_id_lock);
/* XXX don't free extent on error, crash recovery with server */
out:
if (!IS_ERR_OR_NULL(segnos))
kfree(segnos);
/* XXX don't orphan segnos on error, crash recovery with server */
return ret;
}
@@ -500,8 +486,10 @@ out:
* that track large extents. Each new allocating task will get a new
* extent.
*/
/* XXX initially tied to segment size, should be a lot larger */
#define LARGE_EXTENT_BLOCKS SCOUTFS_SEGMENT_BLOCKS
#define CURSOR_BLOCKS (1 * 1024 * 1024 / BLOCK_SIZE)
#define CURSOR_BLOCKS_MASK (CURSOR_BLOCKS - 1)
#define CURSOR_BLOCKS_SEARCH (CURSOR_BLOCKS + CURSOR_BLOCKS - 1)
#define CURSOR_BLOCKS_ALLOC (CURSOR_BLOCKS * 64)
static int find_alloc_block(struct super_block *sb, struct inode *inode,
u64 iblock, bool was_offline,
struct scoutfs_lock *lock)
@@ -543,16 +531,26 @@ static int find_alloc_block(struct super_block *sb, struct inode *inode,
}
/* try to find a new large extent, possibly asking for more */
while (curs->blkno == 0) {
if (curs->blkno == 0) {
scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLOCKS_TYPE,
sbi->node_id, 0, 2 * LARGE_EXTENT_BLOCKS,
sbi->node_id, 0, CURSOR_BLOCKS_SEARCH,
0, 0);
ret = scoutfs_extent_next(sb, data_extent_io, &ext,
sbi->node_id_lock);
if (ret && ret != -ENOENT)
if (ret == -ENOENT) {
/* try to get allocation from the server if we're out */
ret = get_server_extent(sb, CURSOR_BLOCKS_ALLOC);
if (ret == 0)
ret = scoutfs_extent_next(sb, data_extent_io,
&ext,
sbi->node_id_lock);
}
if (ret) {
/* XXX should try to look for smaller free extents :/ */
if (ret == -ENOENT)
ret = -ENOSPC;
goto out;
/* XXX should try to look for smaller free extents :/ */
}
/*
* set our cursor to the aligned start of a large extent
@@ -561,19 +559,10 @@ static int find_alloc_block(struct super_block *sb, struct inode *inode,
* constantly setting cursors to the start of a large
free extent that keeps having its start allocated.
*/
if (ret == 0) {
trace_scoutfs_data_alloc_block_free(sb, &ext);
curs->blkno = ALIGN(ext.start, LARGE_EXTENT_BLOCKS);
break;
}
/* try to get allocation from the server if we're out */
ret = bulk_alloc(sb);
if (ret < 0)
goto out;
trace_scoutfs_data_alloc_block_free(sb, &ext);
curs->blkno = ALIGN(ext.start, CURSOR_BLOCKS);
}
/* remove the free block we're using */
scoutfs_extent_init(&fr, SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
sbi->node_id, curs->blkno, 1, 0, 0);
@@ -603,9 +592,9 @@ static int find_alloc_block(struct super_block *sb, struct inode *inode,
scoutfs_inode_add_onoff(inode, 1, was_offline ? -1ULL : 0);
/* set cursor to next block, clearing if we finish a large extent */
BUILD_BUG_ON(!is_power_of_2(LARGE_EXTENT_BLOCKS));
BUILD_BUG_ON(!is_power_of_2(CURSOR_BLOCKS));
curs->blkno++;
if ((curs->blkno & (LARGE_EXTENT_BLOCKS - 1)) == 0)
if ((curs->blkno & CURSOR_BLOCKS_MASK) == 0)
curs->blkno = 0;
ret = 0;
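
A quick arithmetic sketch of the cursor constants above (illustration only,
not part of the diff): assuming 4KB blocks, CURSOR_BLOCKS is 256, and
searching for a free extent of CURSOR_BLOCKS_SEARCH (2 * CURSOR_BLOCKS - 1)
blocks guarantees that a fully free, CURSOR_BLOCKS-aligned window fits inside
it, which is what the ALIGN() of the cursor relies on.

#include <assert.h>

/* assumed values: 1MB cursor windows of 4KB blocks */
#define CURSOR_BLOCKS		256ULL
#define CURSOR_BLOCKS_SEARCH	(CURSOR_BLOCKS + CURSOR_BLOCKS - 1)
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long long start;

	/* for any extent start, rounding up to the next CURSOR_BLOCKS
	 * boundary leaves a whole aligned window inside an extent of
	 * CURSOR_BLOCKS_SEARCH blocks */
	for (start = 0; start < 4 * CURSOR_BLOCKS; start++) {
		unsigned long long end = start + CURSOR_BLOCKS_SEARCH;
		unsigned long long win = ALIGN_UP(start, CURSOR_BLOCKS);

		assert(win + CURSOR_BLOCKS <= end);
	}
	return 0;
}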

View File

@@ -236,6 +236,19 @@ struct scoutfs_manifest_btree_val {
struct scoutfs_key last_key;
} __packed;
/*
* Free extents are stored in the server in an allocation btree. The
type differentiates whether start or length is stored in the major
value and is the primary sort key.  'start' is set to the final block
in the extent so that overlapping queries can be done with next
instead of prev.
*/
struct scoutfs_extent_btree_key {
__u8 type;
__be64 major;
__be64 minor;
} __packed;
#define SCOUTFS_ALLOC_REGION_SHIFT 8
#define SCOUTFS_ALLOC_REGION_BITS (1 << SCOUTFS_ALLOC_REGION_SHIFT)
#define SCOUTFS_ALLOC_REGION_MASK (SCOUTFS_ALLOC_REGION_BITS - 1)
@@ -303,7 +316,7 @@ struct scoutfs_segment_block {
#define SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE 2
#define SCOUTFS_INODE_INDEX_NR 3 /* don't forget to update */
/* node zone */
/* node zone (also used in server alloc btree) */
#define SCOUTFS_FREE_EXTENT_BLKNO_TYPE 1
#define SCOUTFS_FREE_EXTENT_BLOCKS_TYPE 2
@@ -370,6 +383,9 @@ struct scoutfs_super_block {
__le64 alloc_uninit;
__le64 total_segs;
__le64 free_segs;
__le64 total_blocks;
__le64 free_blocks;
__le64 alloc_cursor;
struct scoutfs_btree_ring bring;
__le64 next_seg_seq;
struct scoutfs_btree_root alloc_root;
@@ -564,9 +580,9 @@ struct scoutfs_net_segnos {
} __packed;
struct scoutfs_net_statfs {
__le64 total_segs; /* total segments in device */
__le64 total_blocks; /* total blocks in device */
__le64 next_ino; /* next unused inode number */
__le64 bfree; /* total free small blocks */
__le64 bfree; /* free blocks */
__u8 uuid[SCOUTFS_UUID_BYTES]; /* logical volume uuid */
} __packed;
@@ -582,6 +598,7 @@ struct scoutfs_net_statfs {
enum {
SCOUTFS_NET_ALLOC_INODES = 0,
SCOUTFS_NET_ALLOC_EXTENT,
SCOUTFS_NET_ALLOC_SEGNO,
SCOUTFS_NET_RECORD_SEGMENT,
SCOUTFS_NET_BULK_ALLOC,
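
A worked sketch of the key encoding described in the comment above
(illustration only, not part of the diff; the struct and names below are
hypothetical stand-ins, and the real keys are packed big-endian as in
scoutfs_extent_btree_key): each free extent is indexed twice, once by
position for overlap queries with _next and once by length for smallest-fit
allocation.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* hypothetical stand-ins for the types and key struct in format.h */
#define FREE_EXTENT_BLKNO_TYPE	1	/* major = last block, minor = length */
#define FREE_EXTENT_BLOCKS_TYPE	2	/* major = length, minor = last block */

struct example_key {
	uint8_t type;
	uint64_t major;
	uint64_t minor;
};

int main(void)
{
	/* a free extent of 256 blocks starting at block 4096 */
	uint64_t start = 4096, len = 256;
	uint64_t last = start + len - 1;

	/* indexed by position: sorted on the final block so that a
	 * _next query from any block inside the extent finds this key */
	struct example_key by_blkno = { FREE_EXTENT_BLKNO_TYPE, last, len };

	/* indexed by length: a _next query from a wanted length finds
	 * the smallest free extent that can satisfy it */
	struct example_key by_blocks = { FREE_EXTENT_BLOCKS_TYPE, len, last };

	printf("blkno key:  type %u major %" PRIu64 " minor %" PRIu64 "\n",
	       by_blkno.type, by_blkno.major, by_blkno.minor);
	printf("blocks key: type %u major %" PRIu64 " minor %" PRIu64 "\n",
	       by_blocks.type, by_blocks.major, by_blocks.minor);
	return 0;
}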

View File

@@ -2117,7 +2117,7 @@ DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_truncate_offline,
TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
TP_ARGS(sb, ext)
);
DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_bulk_alloc,
DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_get_server_extent,
TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
TP_ARGS(sb, ext)
);
@@ -2145,6 +2145,30 @@ DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_fiemap_extent,
TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
TP_ARGS(sb, ext)
);
DEFINE_EVENT(scoutfs_extent_class, scoutfs_server_alloc_extent_next,
TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
TP_ARGS(sb, ext)
);
DEFINE_EVENT(scoutfs_extent_class, scoutfs_server_alloc_extent_allocated,
TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
TP_ARGS(sb, ext)
);
DEFINE_EVENT(scoutfs_extent_class, scoutfs_server_alloc_segno_next,
TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
TP_ARGS(sb, ext)
);
DEFINE_EVENT(scoutfs_extent_class, scoutfs_server_alloc_segno_allocated,
TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
TP_ARGS(sb, ext)
);
DEFINE_EVENT(scoutfs_extent_class, scoutfs_server_free_pending_extent,
TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
TP_ARGS(sb, ext)
);
DEFINE_EVENT(scoutfs_extent_class, scoutfs_server_extent_io,
TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
TP_ARGS(sb, ext)
);
TRACE_EVENT(scoutfs_online_offline_blocks,
TP_PROTO(struct inode *inode, s64 on_delta, s64 off_delta,
@@ -2173,6 +2197,33 @@ TRACE_EVENT(scoutfs_online_offline_blocks,
__entry->on_now, __entry->off_now)
);
DECLARE_EVENT_CLASS(scoutfs_segno_class,
TP_PROTO(struct super_block *sb, u64 segno),
TP_ARGS(sb, segno),
TP_STRUCT__entry(
__field(__u64, fsid)
__field(__s64, segno)
),
TP_fast_assign(
__entry->fsid = FSID_ARG(sb);
__entry->segno = segno;
),
TP_printk("fsid "FSID_FMT" segno %llu",
__entry->fsid, __entry->segno)
);
DEFINE_EVENT(scoutfs_segno_class, scoutfs_alloc_segno,
TP_PROTO(struct super_block *sb, u64 segno),
TP_ARGS(sb, segno)
);
DEFINE_EVENT(scoutfs_segno_class, scoutfs_free_segno,
TP_PROTO(struct super_block *sb, u64 segno),
TP_ARGS(sb, segno)
);
#endif /* _TRACE_SCOUTFS_H */
/* This part must be outside protection */

View File

@@ -28,6 +28,7 @@
#include "counters.h"
#include "triggers.h"
#include "msg.h"
#include "server.h"
#include "scoutfs_trace.h"
/*
@@ -298,7 +299,7 @@ out:
*/
int scoutfs_seg_free_segno(struct super_block *sb, struct scoutfs_segment *seg)
{
return scoutfs_alloc_free(sb, seg->segno);
return scoutfs_server_free_segno(sb, seg->segno);
}
/*

View File

@@ -20,6 +20,7 @@
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/sort.h>
#include <linux/log2.h>
#include "format.h"
#include "counters.h"
@@ -67,6 +68,10 @@ struct server_info {
/* server tracks seq use */
spinlock_t seq_lock;
struct list_head pending_seqs;
/* server tracks pending frees to be applied during commit */
struct rw_semaphore alloc_rwsem;
struct list_head pending_frees;
};
struct server_request {
@@ -93,6 +98,350 @@ struct commit_waiter {
int ret;
};
static void init_extent_btree_key(struct scoutfs_extent_btree_key *ebk,
u8 type, u64 major, u64 minor)
{
ebk->type = type;
ebk->major = cpu_to_be64(major);
ebk->minor = cpu_to_be64(minor);
}
static int init_extent_from_btree_key(struct scoutfs_extent *ext, u8 type,
struct scoutfs_extent_btree_key *ebk,
unsigned int key_bytes)
{
u64 start;
u64 len;
/* btree _next doesn't have last key limit */
if (ebk->type != type)
return -ENOENT;
if (key_bytes != sizeof(struct scoutfs_extent_btree_key) ||
(ebk->type != SCOUTFS_FREE_EXTENT_BLKNO_TYPE &&
ebk->type != SCOUTFS_FREE_EXTENT_BLOCKS_TYPE))
return -EIO; /* XXX corruption, bad key */
start = be64_to_cpu(ebk->major);
len = be64_to_cpu(ebk->minor);
if (ebk->type == SCOUTFS_FREE_EXTENT_BLOCKS_TYPE)
swap(start, len);
start -= len - 1;
return scoutfs_extent_init(ext, ebk->type, 0, start, len, 0, 0);
}
/*
* This is called by the extent core on behalf of the server who holds
* the appropriate locks to protect the many btree items that can be
* accessed on behalf of one extent operation.
*/
static int server_extent_io(struct super_block *sb, int op,
struct scoutfs_extent *ext, void *data)
{
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_extent_btree_key ebk;
SCOUTFS_BTREE_ITEM_REF(iref);
bool mirror = false;
u8 mirror_type;
u8 mirror_op = 0;
int ret;
int err;
trace_scoutfs_server_extent_io(sb, ext);
if (WARN_ON_ONCE(ext->type != SCOUTFS_FREE_EXTENT_BLKNO_TYPE &&
ext->type != SCOUTFS_FREE_EXTENT_BLOCKS_TYPE))
return -EINVAL;
if (ext->type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE &&
(op == SEI_INSERT || op == SEI_DELETE)) {
mirror = true;
mirror_type = SCOUTFS_FREE_EXTENT_BLOCKS_TYPE;
mirror_op = op == SEI_INSERT ? SEI_DELETE : SEI_INSERT;
}
init_extent_btree_key(&ebk, ext->type, ext->start + ext->len - 1,
ext->len);
if (ext->type == SCOUTFS_FREE_EXTENT_BLOCKS_TYPE)
swap(ebk.major, ebk.minor);
if (op == SEI_NEXT) {
ret = scoutfs_btree_next(sb, &super->alloc_root,
&ebk, sizeof(ebk), &iref);
if (ret == 0) {
ret = init_extent_from_btree_key(ext, ext->type,
iref.key,
iref.key_len);
scoutfs_btree_put_iref(&iref);
}
} else if (op == SEI_INSERT) {
ret = scoutfs_btree_insert(sb, &super->alloc_root,
&ebk, sizeof(ebk), NULL, 0);
} else if (op == SEI_DELETE) {
ret = scoutfs_btree_delete(sb, &super->alloc_root,
&ebk, sizeof(ebk));
} else {
ret = WARN_ON_ONCE(-EINVAL);
}
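/* mirror blkno item changes into the by-length index; undo the first op if the mirrored op fails */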
if (ret == 0 && mirror) {
swap(ext->type, mirror_type);
ret = server_extent_io(sb, op, ext, data);
swap(ext->type, mirror_type);
if (ret) {
err = server_extent_io(sb, mirror_op, ext, data);
BUG_ON(err);
}
}
return ret;
}
/*
* Allocate an extent of the given length in the first smallest free
* extent that contains it. We allocate in multiples of segment blocks
* and expose that to callers today.
*
* This doesn't have the cursor that segment allocation does. It's
* possible that a recently freed segment can merge to form a larger
* free extent that can be very quickly allocated to a node. The hope is
that this doesn't happen very often.
*/
static int alloc_extent(struct super_block *sb, u64 len, u64 *start)
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_extent ext;
int ret;
*start = 0;
down_write(&server->alloc_rwsem);
if (len & (SCOUTFS_SEGMENT_BLOCKS - 1)) {
ret = -EINVAL;
goto out;
}
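/* search the by-length index for the smallest free extent that can hold len blocks */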
scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLOCKS_TYPE, 0,
0, len, 0, 0);
ret = scoutfs_extent_next(sb, server_extent_io, &ext, NULL);
if (ret) {
if (ret == -ENOENT)
ret = -ENOSPC;
goto out;
}
trace_scoutfs_server_alloc_extent_next(sb, &ext);
ext.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
ext.len = len;
ret = scoutfs_extent_remove(sb, server_extent_io, &ext, NULL);
if (ret)
goto out;
trace_scoutfs_server_alloc_extent_allocated(sb, &ext);
le64_add_cpu(&super->free_blocks, -ext.len);
*start = ext.start;
ret = 0;
out:
up_write(&server->alloc_rwsem);
if (ret)
scoutfs_inc_counter(sb, server_extent_alloc_error);
else
scoutfs_inc_counter(sb, server_extent_alloc);
return ret;
}
struct pending_free_extent {
struct list_head head;
u64 start;
u64 len;
};
/*
* Now that the transaction's done we can apply all the pending frees.
* The list entries are totally unsorted so this is the first time that
* we can discover corruption from duplicated frees, etc. This can also
* fail on normal transient io or memory errors.
*
* We can't unwind if this fails. The caller can freak out or keep
* trying forever.
*/
static int apply_pending_frees(struct super_block *sb)
{
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct server_info *server = SCOUTFS_SB(sb)->server_info;
struct pending_free_extent *pfe;
struct pending_free_extent *tmp;
struct scoutfs_extent ext;
int ret = 0;
down_write(&server->alloc_rwsem);
list_for_each_entry_safe(pfe, tmp, &server->pending_frees, head) {
scoutfs_inc_counter(sb, server_free_pending_extent);
scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLKNO_TYPE, 0,
pfe->start, pfe->len, 0, 0);
trace_scoutfs_server_free_pending_extent(sb, &ext);
ret = scoutfs_extent_add(sb, server_extent_io, &ext, NULL);
if (ret) {
scoutfs_inc_counter(sb, server_free_pending_error);
break;
}
le64_add_cpu(&super->free_blocks, pfe->len);
list_del_init(&pfe->head);
kfree(pfe);
}
up_write(&server->alloc_rwsem);
return ret;
}
/*
If there are still pending frees to destroy, it means the server didn't
shut down cleanly; that's not well supported today, so we want it to
holler if this happens.  In the future we'd like to cleanly support a
forced shutdown that has been told it's OK to throw away dirty
*/
static int destroy_pending_frees(struct super_block *sb)
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
struct pending_free_extent *pfe;
struct pending_free_extent *tmp;
WARN_ON_ONCE(!list_empty(&server->pending_frees));
down_write(&server->alloc_rwsem);
list_for_each_entry_safe(pfe, tmp, &server->pending_frees, head) {
list_del_init(&pfe->head);
kfree(pfe);
}
up_write(&server->alloc_rwsem);
return 0;
}
/*
* We can't satisfy allocations with freed extents until the removed
* references to the freed extents have been committed. We add freed
* extents to a list that is only applied to the persistent indexes as
the transaction is being committed, when the current transaction won't
* try to allocate any more extents. If we didn't do this then we could
* write to referenced data as part of the commit that frees it. If the
* commit was interrupted the stable data could have been overwritten.
*/
static int free_extent(struct super_block *sb, u64 start, u64 len)
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
struct pending_free_extent *pfe;
int ret;
scoutfs_inc_counter(sb, server_free_extent);
down_write(&server->alloc_rwsem);
pfe = kmalloc(sizeof(struct pending_free_extent), GFP_NOFS);
if (!pfe) {
ret = -ENOMEM;
} else {
pfe->start = start;
pfe->len = len;
list_add_tail(&pfe->head, &server->pending_frees);
ret = 0;
}
up_write(&server->alloc_rwsem);
return ret;
}
/*
* This is called by the compaction code which is running in the server.
* The server caller has held all the locks, etc.
*/
int scoutfs_server_free_segno(struct super_block *sb, u64 segno)
{
scoutfs_inc_counter(sb, server_free_segno);
trace_scoutfs_free_segno(sb, segno);
return free_extent(sb, segno << SCOUTFS_SEGMENT_BLOCK_SHIFT,
SCOUTFS_SEGMENT_BLOCKS);
}
/*
* Allocate a segment on behalf of compaction or a node wanting to write
* a level 0 segment. It has to be aligned to the segment size because
* we address segments with aligned segment numbers instead of block
* offsets.
*
* We can use a simple cursor sweep of the index by start because all
* server extents are multiples of the segment size. Sweeping through
* the volume tries to spread out new segment writes and make it more
* rare to write to a recently freed segment which can cause a client to
* have to re-read the manifest.
*/
static int alloc_segno(struct super_block *sb, u64 *segno)
{
struct server_info *server = SCOUTFS_SB(sb)->server_info;
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_extent ext;
u64 curs;
int ret;
down_write(&server->alloc_rwsem);
curs = ALIGN(le64_to_cpu(super->alloc_cursor), SCOUTFS_SEGMENT_BLOCKS);
*segno = 0;
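/* sweep from the cursor; if nothing is found and the cursor was nonzero, wrap to zero and retry once */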
do {
scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLKNO_TYPE, 0,
curs, 1, 0, 0);
ret = scoutfs_extent_next(sb, server_extent_io, &ext, NULL);
} while (ret == -ENOENT && curs && (curs = 0, 1));
if (ret) {
if (ret == -ENOENT)
ret = -ENOSPC;
goto out;
}
trace_scoutfs_server_alloc_segno_next(sb, &ext);
/* use cursor if within extent, otherwise start of next extent */
if (ext.start < curs)
ext.start = curs;
ext.len = SCOUTFS_SEGMENT_BLOCKS;
ret = scoutfs_extent_remove(sb, server_extent_io, &ext, NULL);
if (ret)
goto out;
super->alloc_cursor = cpu_to_le64(ext.start + ext.len);
*segno = ext.start >> SCOUTFS_SEGMENT_BLOCK_SHIFT;
trace_scoutfs_server_alloc_segno_allocated(sb, &ext);
trace_scoutfs_alloc_segno(sb, *segno);
scoutfs_inc_counter(sb, server_alloc_segno);
out:
up_write(&server->alloc_rwsem);
return ret;
}
/*
* Trigger a server shutdown by shutting down the listening socket. The
* server thread will break out of accept and exit.
@@ -188,9 +537,9 @@ static void scoutfs_server_commit_func(struct work_struct *work)
goto out;
}
ret = scoutfs_alloc_apply_pending(sb);
ret = apply_pending_frees(sb);
if (ret) {
scoutfs_err(sb, "server error freeing segments: %d", ret);
scoutfs_err(sb, "server error freeing extents: %d", ret);
goto out;
}
@@ -336,6 +685,50 @@ out:
return send_reply(conn, id, type, ret, &ial, sizeof(ial));
}
/*
* Give the client an extent allocation of len blocks. We leave the
* details to the extent allocator.
*/
static int process_alloc_extent(struct server_connection *conn,
u64 id, u8 type, void *data, unsigned data_len)
{
struct server_info *server = conn->server;
struct super_block *sb = server->sb;
struct commit_waiter cw;
__le64 lestart;
__le64 lelen;
u64 start;
int ret;
if (data_len != sizeof(lelen)) {
ret = -EINVAL;
goto out;
}
memcpy(&lelen, data, data_len);
down_read(&server->commit_rwsem);
ret = alloc_extent(sb, le64_to_cpu(lelen), &start);
if (ret == -ENOSPC) {
start = 0;
ret = 0;
}
if (ret == 0) {
lestart = cpu_to_le64(start);
queue_commit_work(server, &cw);
}
up_read(&server->commit_rwsem);
if (ret == 0)
ret = wait_for_commit(server, &cw, id, type);
out:
return send_reply(conn, id, type, ret, &lestart, sizeof(lestart));
}
/*
* We still special case segno allocation because it's aligned and we'd
* like to keep that detail in the server.
*/
static int process_alloc_segno(struct server_connection *conn,
u64 id, u8 type, void *data, unsigned data_len)
{
@@ -352,7 +745,7 @@ static int process_alloc_segno(struct server_connection *conn,
}
down_read(&server->commit_rwsem);
ret = scoutfs_alloc_segno(sb, &segno);
ret = alloc_segno(sb, &segno);
if (ret == 0) {
lesegno = cpu_to_le64(segno);
queue_commit_work(server, &cw);
@@ -607,14 +1000,15 @@ static int process_statfs(struct server_connection *conn, u64 id, u8 type,
if (data_len == 0) {
/* uuid and total_segs are constant, so far */
memcpy(nstatfs.uuid, super->uuid, sizeof(nstatfs.uuid));
nstatfs.total_segs = super->total_segs;
spin_lock(&sbi->next_ino_lock);
nstatfs.next_ino = super->next_ino;
spin_unlock(&sbi->next_ino_lock);
/* alloc locks the bfree calculation */
nstatfs.bfree = cpu_to_le64(scoutfs_alloc_bfree(sb));
down_read(&server->alloc_rwsem);
nstatfs.total_blocks = super->total_blocks;
nstatfs.bfree = super->free_blocks;
up_read(&server->alloc_rwsem);
ret = 0;
} else {
ret = -EINVAL;
@@ -657,7 +1051,7 @@ int scoutfs_client_get_compaction(struct super_block *sb, void *curs)
/* allow for expansion slop from sticky and alignment */
for (i = 0; i < nr + SCOUTFS_COMPACTION_SLOP; i++) {
ret = scoutfs_alloc_segno(sb, &segno);
ret = alloc_segno(sb, &segno);
if (ret < 0)
break;
scoutfs_compact_add_segno(sb, curs, segno);
@@ -728,6 +1122,7 @@ static void scoutfs_server_process_func(struct work_struct *work)
struct server_connection *conn = req->conn;
static process_func_t process_funcs[] = {
[SCOUTFS_NET_ALLOC_INODES] = process_alloc_inodes,
[SCOUTFS_NET_ALLOC_EXTENT] = process_alloc_extent,
[SCOUTFS_NET_ALLOC_SEGNO] = process_alloc_segno,
[SCOUTFS_NET_RECORD_SEGMENT] = process_record_segment,
[SCOUTFS_NET_BULK_ALLOC] = process_bulk_alloc,
@@ -994,7 +1389,6 @@ static void scoutfs_server_func(struct work_struct *work)
/* finally start up the server subsystems before accepting */
ret = scoutfs_btree_setup(sb) ?:
scoutfs_manifest_setup(sb) ?:
scoutfs_alloc_setup(sb) ?:
scoutfs_compact_setup(sb);
if (ret)
goto shutdown;
@@ -1067,7 +1461,7 @@ shutdown:
/* shut down all the server subsystems */
scoutfs_compact_destroy(sb);
scoutfs_alloc_destroy(sb);
destroy_pending_frees(sb);
scoutfs_manifest_destroy(sb);
scoutfs_btree_destroy(sb);
@@ -1108,6 +1502,8 @@ int scoutfs_server_setup(struct super_block *sb)
seqcount_init(&server->stable_seqcount);
spin_lock_init(&server->seq_lock);
INIT_LIST_HEAD(&server->pending_seqs);
init_rwsem(&server->alloc_rwsem);
INIT_LIST_HEAD(&server->pending_frees);
server->wq = alloc_workqueue("scoutfs_server", WQ_NON_REENTRANT, 0);
if (!server->wq) {

View File

@@ -9,6 +9,7 @@ void scoutfs_init_ment_from_net(struct scoutfs_manifest_entry *ment,
int scoutfs_client_get_compaction(struct super_block *sb, void *curs);
int scoutfs_client_finish_compaction(struct super_block *sb, void *curs,
void *list);
int scoutfs_server_free_segno(struct super_block *sb, u64 segno);
int scoutfs_server_setup(struct super_block *sb);
void scoutfs_server_destroy(struct super_block *sb);

View File

@@ -74,8 +74,7 @@ static int scoutfs_statfs(struct dentry *dentry, struct kstatfs *kst)
kst->f_bfree = le64_to_cpu(nstatfs.bfree);
kst->f_type = SCOUTFS_SUPER_MAGIC;
kst->f_bsize = SCOUTFS_BLOCK_SIZE;
kst->f_blocks = le64_to_cpu(nstatfs.total_segs) *
SCOUTFS_SEGMENT_BLOCKS;
kst->f_blocks = le64_to_cpu(nstatfs.total_blocks);
kst->f_bavail = kst->f_bfree;
kst->f_ffree = kst->f_bfree * 16;