scoutfs: reference file data with extent items

Our first attempt at storing file data put it in items.  This was easy
to implement but won't be acceptable in the long term.  LSM indexing's
power comes at the cost of compaction overhead.  That's acceptable for
fine-grained metadata but totally unacceptable for bulk file data.

This switches to storing file data in separate block allocations which
are referenced by extent items.

The bulk of the change is the mechanics of working with extents.  We
have high-level callers which add or remove logical extents and then
underlying mechanisms that insert, merge, or split the items that the
extents are stored in.
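
As a sketch of the merge test the item layer needs (the struct and
helper here are hypothetical, not the patch's actual interfaces): a
neighbouring extent can only be absorbed when it continues the current
one in both the logical and physical spaces.

struct extent {
	u64 blk_off;	/* logical starting block */
	u64 blkno;	/* physical starting block */
	u64 blocks;	/* length in blocks */
};

/* true when "right" directly continues "left" logically and physically */
static bool extents_mergeable(struct extent *left, struct extent *right)
{
	return left->blk_off + left->blocks == right->blk_off &&
	       left->blkno + left->blocks == right->blkno;
}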

We have three types of extent items.  The primary type maps logical file
regions to physical block extents.  The next two store free extents
per-node so that clients don't create lock and LSM contention as they
try to allocate extents.
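
The big-endian key fields mean items sort by memcmp() of their packed
keys in numeric field order, so the same free extent is indexed twice:
the _BLKNO_ key sorts by position for merging with neighbours, and the
_BLOCKS_ key sorts by size so allocation can seek to the first free
extent with enough blocks.  A sketch of filling both keys for one free
extent, using the structs from format.h below (fill_free_keys() itself
is hypothetical):

static void fill_free_keys(u64 node_id, u64 last_blkno, u64 blocks,
			   struct scoutfs_free_extent_blkno_key *pos,
			   struct scoutfs_free_extent_blocks_key *len)
{
	pos->type = SCOUTFS_FREE_EXTENT_BLKNO_KEY;
	pos->node_id = cpu_to_be64(node_id);
	pos->last_blkno = cpu_to_be64(last_blkno);
	pos->blocks = cpu_to_be64(blocks);

	len->type = SCOUTFS_FREE_EXTENT_BLOCKS_KEY;
	len->node_id = cpu_to_be64(node_id);
	len->blocks = cpu_to_be64(blocks);	/* size sorts first */
	len->last_blkno = cpu_to_be64(last_blkno);
}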

To fill those per-node free extents we add messages that communicate free
extents in the form of lists of segment allocations from the server.
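
On the client, each returned segno becomes free extent items; a minimal
sketch of consuming the reply, assuming a hypothetical
record_free_extent() helper (scoutfs_net_bulk_alloc() below really does
return an allocated 0-terminated array or an ERR_PTR):

static int refill_node_free_extents(struct super_block *sb)
{
	u64 *segnos;
	int ret = 0;
	int i;

	segnos = scoutfs_net_bulk_alloc(sb);
	if (IS_ERR(segnos))
		return PTR_ERR(segnos);

	/* the array is 0-terminated and freed by the caller */
	for (i = 0; segnos[i] && ret == 0; i++)
		ret = record_free_extent(sb, segnos[i]);

	kfree(segnos);
	return ret;
}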

We don't do any fancy multi-block allocation yet.  We only allocate
blocks in get_blocks as writes find unmapped blocks.  We do use some
per-task cursors to cache block allocation positions so that these
single-block allocations are very likely to merge into larger extents
as tasks stream writes.
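
The cursor idea in miniature (both the structure and the allocation
helper below are hypothetical stand-ins, not the patch's actual code):
if a task's next allocation starts where its last one ended,
consecutive get_blocks calls from a streaming writer return adjacent
blocks that merge into a single extent item.

struct task_cursor {
	struct task_struct *task;
	u64 blkno;		/* next physical block to try */
};

static int cursor_alloc_block(struct super_block *sb,
			      struct task_cursor *curs, u64 *blkno)
{
	int ret;

	/* alloc_block_near() stands in for the free extent search */
	ret = alloc_block_near(sb, curs->blkno, blkno);
	if (ret == 0)
		curs->blkno = *blkno + 1;
	return ret;
}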

This is just the first chunk of the extent work that's coming.  A later
patch adds offline flags and fixes up the change nonsense that seemed
like a good idea here.

The final moving part is that we initiate writeback on all newly
allocated extents before we commit the metadata that references the new
blocks.  We do this with our own dirty inode tracking because the
high-level vfs methods are unusably slow in some upstream kernels (they
walk all inodes, not just dirty inodes).
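
The resulting ordering, condensed from the trans.c hunk below (the
segment write is collapsed into a hypothetical write_dirty_segment()
and error handling is elided):

ret = scoutfs_inode_walk_writeback(sb, true) ?:   /* start data IO */
      write_dirty_segment(sb, &seg) ?:            /* metadata segment */
      scoutfs_inode_walk_writeback(sb, false) ?:  /* wait on data */
      scoutfs_net_record_segment(sb, seg, 0);     /* publish */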

Signed-off-by: Zach Brown <zab@versity.com>
Author: Zach Brown
Date:   2017-05-01 13:57:59 -07:00
Parent: 6719733ddc
Commit: 6afeb97802

10 changed files with 1224 additions and 436 deletions

(File diff suppressed because it is too large.)

data.h

@@ -6,7 +6,6 @@ extern const struct file_operations scoutfs_file_fops;
int scoutfs_data_truncate_items(struct super_block *sb, u64 ino, u64 iblock,
u64 len, bool offline);
void scoutfs_data_end_writeback(struct super_block *sb, int err);
int scoutfs_data_setup(struct super_block *sb);
void scoutfs_data_destroy(struct super_block *sb);

format.h

@@ -156,9 +156,10 @@ struct scoutfs_segment_block {
#define SCOUTFS_READDIR_KEY 6
#define SCOUTFS_LINK_BACKREF_KEY 7
#define SCOUTFS_SYMLINK_KEY 8
#define SCOUTFS_EXTENT_KEY 9
#define SCOUTFS_FILE_EXTENT_KEY 9
#define SCOUTFS_ORPHAN_KEY 10
#define SCOUTFS_DATA_KEY 11
#define SCOUTFS_FREE_EXTENT_BLKNO_KEY 11
#define SCOUTFS_FREE_EXTENT_BLOCKS_KEY 12
/* not found in the fs */
#define SCOUTFS_MAX_UNUSED_KEY 253
#define SCOUTFS_NET_ADDR_KEY 254
@@ -198,11 +199,28 @@ struct scoutfs_orphan_key {
__be64 ino;
} __packed;
/* value is data payload bytes */
struct scoutfs_data_key {
/* no value */
struct scoutfs_file_extent_key {
__u8 type;
__be64 ino;
__be64 block;
__be64 last_blk_off;
__be64 last_blkno;
__be64 blocks;
} __packed;
/* no value */
struct scoutfs_free_extent_blkno_key {
__u8 type;
__be64 node_id;
__be64 last_blkno;
__be64 blocks;
} __packed;
struct scoutfs_free_extent_blocks_key {
__u8 type;
__be64 node_id;
__be64 blocks;
__be64 last_blkno;
} __packed;
/* value is each item's part of the full xattr value for the off/len */
@@ -384,6 +402,11 @@ struct scoutfs_net_manifest_entries {
struct scoutfs_manifest_entry ments[0];
} __packed;
struct scoutfs_net_segnos {
__le16 nr;
__le64 segnos[0];
} __packed;
enum {
/* sends and receives a struct scoutfs_timeval */
SCOUTFS_NET_TRADE_TIME = 0,
@@ -391,6 +414,7 @@ enum {
SCOUTFS_NET_MANIFEST_RANGE_ENTRIES,
SCOUTFS_NET_ALLOC_SEGNO,
SCOUTFS_NET_RECORD_SEGMENT,
SCOUTFS_NET_BULK_ALLOC,
SCOUTFS_NET_UNKNOWN,
};

inode.c

@@ -47,6 +47,16 @@ struct free_ino_pool {
bool in_flight;
};
struct inode_sb_info {
struct free_ino_pool pool;
spinlock_t writeback_lock;
struct rb_root writeback_inodes;
};
#define DECLARE_INODE_SB_INFO(sb, name) \
struct inode_sb_info *name = SCOUTFS_SB(sb)->inode_sb_info
static struct kmem_cache *scoutfs_inode_cachep;
/*
@@ -61,6 +71,7 @@ static void scoutfs_inode_ctor(void *obj)
seqcount_init(&ci->seqcount);
ci->staging = false;
init_rwsem(&ci->xattr_rwsem);
RB_CLEAR_NODE(&ci->writeback_node);
inode_init_once(&ci->inode);
}
@@ -84,8 +95,48 @@ static void scoutfs_i_callback(struct rcu_head *head)
kmem_cache_free(scoutfs_inode_cachep, SCOUTFS_I(inode));
}
static void insert_writeback_inode(struct inode_sb_info *inf,
struct scoutfs_inode_info *ins)
{
struct rb_root *root = &inf->writeback_inodes;
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct scoutfs_inode_info *si;
while (*node) {
parent = *node;
si = container_of(*node, struct scoutfs_inode_info,
writeback_node);
if (ins->ino < si->ino)
node = &(*node)->rb_left;
else if (ins->ino > si->ino)
node = &(*node)->rb_right;
else
BUG();
}
rb_link_node(&ins->writeback_node, parent, node);
rb_insert_color(&ins->writeback_node, root);
}
static void remove_writeback_inode(struct inode_sb_info *inf,
struct scoutfs_inode_info *si)
{
if (!RB_EMPTY_NODE(&si->writeback_node)) {
rb_erase(&si->writeback_node, &inf->writeback_inodes);
RB_CLEAR_NODE(&si->writeback_node);
}
}
void scoutfs_destroy_inode(struct inode *inode)
{
DECLARE_INODE_SB_INFO(inode->i_sb, inf);
spin_lock(&inf->writeback_lock);
remove_writeback_inode(inf, SCOUTFS_I(inode));
spin_unlock(&inf->writeback_lock);
call_rcu(&inode->i_rcu, scoutfs_i_callback);
}
@@ -393,7 +444,7 @@ u64 scoutfs_last_ino(struct super_block *sb)
*/
void scoutfs_inode_fill_pool(struct super_block *sb, u64 ino, u64 nr)
{
struct free_ino_pool *pool = SCOUTFS_SB(sb)->free_ino_pool;
struct free_ino_pool *pool = &SCOUTFS_SB(sb)->inode_sb_info->pool;
trace_printk("filling ino %llu nr %llu\n", ino, nr);
@@ -427,7 +478,7 @@ static bool pool_in_flight(struct free_ino_pool *pool)
*/
static int alloc_ino(struct super_block *sb, u64 *ino)
{
struct free_ino_pool *pool = SCOUTFS_SB(sb)->free_ino_pool;
struct free_ino_pool *pool = &SCOUTFS_SB(sb)->inode_sb_info->pool;
bool request;
int ret;
@@ -733,28 +784,121 @@ int scoutfs_orphan_inode(struct inode *inode)
return ret;
}
/*
* Track an inode that could have dirty pages. Used to kick off writeback
* on all dirty pages during transaction commit without tying ourselves in
* knots trying to call through the high level vfs sync methods.
*/
void scoutfs_inode_queue_writeback(struct inode *inode)
{
DECLARE_INODE_SB_INFO(inode->i_sb, inf);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
spin_lock(&inf->writeback_lock);
if (RB_EMPTY_NODE(&si->writeback_node))
insert_writeback_inode(inf, si);
spin_unlock(&inf->writeback_lock);
}
/*
* Walk our dirty inodes in ino order and either start dirty page
* writeback or wait for writeback to complete.
*
* This is called by transaction committing so other writers are
* excluded. We're still very careful to iterate over the tree while it
* and the inodes could be changing.
*
* Because writes are excluded we know there are no remaining dirty
* pages once waiting returns successfully.
*
* XXX not sure what to do about retrying io errors.
*/
int scoutfs_inode_walk_writeback(struct super_block *sb, bool write)
{
DECLARE_INODE_SB_INFO(sb, inf);
struct scoutfs_inode_info *si;
struct rb_node *node;
struct inode *inode;
struct inode *defer_iput = NULL;
int ret;
spin_lock(&inf->writeback_lock);
node = rb_first(&inf->writeback_inodes);
while (node) {
si = container_of(node, struct scoutfs_inode_info,
writeback_node);
node = rb_next(node);
inode = igrab(&si->inode);
if (!inode)
continue;
spin_unlock(&inf->writeback_lock);
if (defer_iput) {
iput(defer_iput);
defer_iput = NULL;
}
if (write)
ret = filemap_fdatawrite(inode->i_mapping);
else
ret = filemap_fdatawait(inode->i_mapping);
trace_printk("ino %llu write %d ret %d\n",
scoutfs_ino(inode), write, ret);
if (ret) {
iput(inode);
goto out;
}
spin_lock(&inf->writeback_lock);
if (WARN_ON_ONCE(RB_EMPTY_NODE(&si->writeback_node)))
node = rb_first(&inf->writeback_inodes);
else
node = rb_next(&si->writeback_node);
if (!write)
remove_writeback_inode(inf, si);
/* avoid iput->destroy lock deadlock */
defer_iput = inode;
}
spin_unlock(&inf->writeback_lock);
out:
if (defer_iput)
iput(defer_iput);
return ret;
}
int scoutfs_inode_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct free_ino_pool *pool;
struct inode_sb_info *inf;
pool = kzalloc(sizeof(struct free_ino_pool), GFP_KERNEL);
if (!pool)
inf = kzalloc(sizeof(struct inode_sb_info), GFP_KERNEL);
if (!inf)
return -ENOMEM;
pool = &inf->pool;
init_waitqueue_head(&pool->waitq);
spin_lock_init(&pool->lock);
sbi->free_ino_pool = pool;
spin_lock_init(&inf->writeback_lock);
inf->writeback_inodes = RB_ROOT;
sbi->inode_sb_info = inf;
return 0;
}
void scoutfs_inode_destroy(struct super_block *sb)
{
struct free_ino_pool *pool = SCOUTFS_SB(sb)->free_ino_pool;
struct inode_sb_info *inf = SCOUTFS_SB(sb)->inode_sb_info;
kfree(pool);
kfree(inf);
}
void scoutfs_inode_exit(void)

inode.h

@@ -13,6 +13,7 @@ struct scoutfs_inode_info {
seqcount_t seqcount;
bool staging; /* holder of i_mutex is staging */
struct rw_semaphore xattr_rwsem;
struct rb_node writeback_node;
struct inode inode;
};
@@ -48,6 +49,9 @@ u64 scoutfs_inode_get_data_version(struct inode *inode);
int scoutfs_scan_orphans(struct super_block *sb);
void scoutfs_inode_queue_writeback(struct inode *inode);
int scoutfs_inode_walk_writeback(struct super_block *sb, bool write);
u64 scoutfs_last_ino(struct super_block *sb);
void scoutfs_inode_exit(void);

net.c

@@ -18,6 +18,7 @@
#include <linux/in.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/sort.h>
#include "format.h"
#include "net.h"
@@ -363,6 +364,61 @@ static struct send_buf *alloc_sbuf(unsigned data_len)
return sbuf;
}
/* XXX I dunno, totally made up */
#define BULK_COUNT 32
static struct send_buf *process_bulk_alloc(struct super_block *sb, void *req,
int req_len)
{
DECLARE_NET_INFO(sb, nti);
struct scoutfs_net_segnos *ns;
struct commit_waiter cw;
struct send_buf *sbuf;
u64 segno;
int ret;
int i;
if (req_len != 0)
return ERR_PTR(-EINVAL);
sbuf = alloc_sbuf(offsetof(struct scoutfs_net_segnos,
segnos[BULK_COUNT]));
if (!sbuf)
return ERR_PTR(-ENOMEM);
ns = (void *)sbuf->nh->data;
ns->nr = cpu_to_le16(BULK_COUNT);
down_read(&nti->ring_commit_rwsem);
for (i = 0; i < BULK_COUNT; i++) {
ret = scoutfs_alloc_segno(sb, &segno);
if (ret) {
while (i-- > 0)
scoutfs_alloc_free(sb,
le64_to_cpu(ns->segnos[i]));
break;
}
ns->segnos[i] = cpu_to_le64(segno);
}
if (ret == 0)
queue_commit_work(nti, &cw);
up_read(&nti->ring_commit_rwsem);
if (ret == 0)
ret = wait_for_commit(&cw);
if (ret)
sbuf->nh->status = SCOUTFS_NET_STATUS_ERROR;
else
sbuf->nh->status = SCOUTFS_NET_STATUS_SUCCESS;
return sbuf;
}
static struct send_buf *process_record_segment(struct super_block *sb,
void *req, int req_len)
{
@@ -616,6 +672,7 @@ static proc_func_t type_proc_func(u8 type)
process_manifest_range_entries,
[SCOUTFS_NET_ALLOC_SEGNO] = process_alloc_segno,
[SCOUTFS_NET_RECORD_SEGMENT] = process_record_segment,
[SCOUTFS_NET_BULK_ALLOC] = process_bulk_alloc,
};
return type < SCOUTFS_NET_UNKNOWN ? funcs[type] : NULL;
@@ -1100,6 +1157,113 @@ static int add_send_buf(struct super_block *sb, int type, void *data,
return 0;
}
struct bulk_alloc_args {
struct completion comp;
u64 *segnos;
int ret;
};
static int sort_cmp_u64s(const void *A, const void *B)
{
const u64 *a = A;
const u64 *b = B;
return *a < *b ? -1 : *a > *b ? 1 : 0;
}
static void sort_swap_u64s(void *A, void *B, int size)
{
u64 *a = A;
u64 *b = B;
swap(*a, *b);
}
static int bulk_alloc_reply(struct super_block *sb, void *reply, int ret,
void *arg)
{
struct bulk_alloc_args *args = arg;
struct scoutfs_net_segnos *ns = reply;
u16 nr;
int i;
if (ret < sizeof(struct scoutfs_net_segnos) ||
ret != offsetof(struct scoutfs_net_segnos,
segnos[le16_to_cpu(ns->nr)])) {
ret = -EINVAL;
goto out;
}
nr = le16_to_cpu(ns->nr);
args->segnos = kmalloc((nr + 1) * sizeof(args->segnos[0]), GFP_NOFS);
if (args->segnos == NULL) {
ret = -ENOMEM; /* XXX hmm. */
goto out;
}
for (i = 0; i < nr; i++) {
args->segnos[i] = le64_to_cpu(ns->segnos[i]);
/* make sure they're all non-zero */
if (args->segnos[i] == 0) {
ret = -EINVAL;
goto out;
}
}
sort(args->segnos, nr, sizeof(args->segnos[0]),
sort_cmp_u64s, sort_swap_u64s);
/* make sure they're all unique */
for (i = 1; i < nr; i++) {
if (args->segnos[i] == args->segnos[i - 1]) {
ret = -EINVAL;
goto out;
}
}
args->segnos[nr] = 0;
ret = 0;
out:
if (ret && args->segnos) {
kfree(args->segnos);
args->segnos = NULL;
}
args->ret = ret;
complete(&args->comp);
return args->ret;
}
/*
* Returns a 0-terminated allocated array of segnos, the caller is
* responsible for freeing it.
*/
u64 *scoutfs_net_bulk_alloc(struct super_block *sb)
{
struct bulk_alloc_args args;
int ret;
args.segnos = NULL;
init_completion(&args.comp);
ret = add_send_buf(sb, SCOUTFS_NET_BULK_ALLOC, NULL, 0,
bulk_alloc_reply, &args);
if (ret == 0) {
wait_for_completion(&args.comp);
ret = args.ret;
if (ret == 0 && (args.segnos == NULL || args.segnos[0] == 0))
ret = -ENOSPC;
}
if (ret) {
kfree(args.segnos);
args.segnos = ERR_PTR(ret);
}
return args.segnos;
}
/*
* Eventually we're going to have messages that control compaction.
* Each client mount would have long-lived work that sends requests

net.h

@@ -13,6 +13,7 @@ int scoutfs_net_manifest_range_entries(struct super_block *sb,
int scoutfs_net_alloc_segno(struct super_block *sb, u64 *segno);
int scoutfs_net_record_segment(struct super_block *sb,
struct scoutfs_segment *seg, u8 level);
u64 *scoutfs_net_bulk_alloc(struct super_block *sb);
int scoutfs_net_get_compaction(struct super_block *sb, void *curs);
int scoutfs_net_finish_compaction(struct super_block *sb, void *curs,

super.c

@@ -204,6 +204,12 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
if (!sbi)
return -ENOMEM;
/*
* XXX this is random today for initial testing, but we'll want
* it to be assigned by the server.
*/
get_random_bytes_arch(&sbi->node_id, sizeof(sbi->node_id));
spin_lock_init(&sbi->next_ino_lock);
atomic_set(&sbi->trans_holds, 0);
init_waitqueue_head(&sbi->trans_hold_wq);

super.h

@@ -14,11 +14,13 @@ struct compact_info;
struct data_info;
struct lock_info;
struct net_info;
struct free_ino_pool;
struct inode_sb_info;
struct scoutfs_sb_info {
struct super_block *sb;
u64 node_id;
struct scoutfs_super_block super;
spinlock_t next_ino_lock;
@@ -29,7 +31,7 @@ struct scoutfs_sb_info {
struct seg_alloc *seg_alloc;
struct compact_info *compact_info;
struct data_info *data_info;
struct free_ino_pool *free_ino_pool;
struct inode_sb_info *inode_sb_info;
atomic_t trans_holds;
wait_queue_head_t trans_hold_wq;

trans.c

@@ -26,6 +26,7 @@
#include "seg.h"
#include "counters.h"
#include "net.h"
#include "inode.h"
#include "scoutfs_trace.h"
/*
@@ -97,10 +98,12 @@ void scoutfs_trans_write_func(struct work_struct *work)
* about leaking segnos nor duplicate manifest entries
* on crashes between us and the server.
*/
ret = scoutfs_net_alloc_segno(sb, &segno) ?:
ret = scoutfs_inode_walk_writeback(sb, true) ?:
scoutfs_net_alloc_segno(sb, &segno) ?:
scoutfs_seg_alloc(sb, segno, &seg) ?:
scoutfs_item_dirty_seg(sb, seg) ?:
scoutfs_seg_submit_write(sb, seg, &comp) ?:
scoutfs_inode_walk_writeback(sb, false) ?:
scoutfs_bio_wait_comp(sb, &comp) ?:
scoutfs_net_record_segment(sb, seg, 0);
if (ret)
@@ -112,9 +115,6 @@ out:
/* XXX this all needs serious work for dealing with errors */
WARN_ON_ONCE(ret);
/* must be done before waking waiting trans holders who might dirty */
scoutfs_data_end_writeback(sb, ret);
spin_lock(&sbi->trans_write_lock);
sbi->trans_write_count++;
sbi->trans_write_ret = ret;