From 467801de73dd1847c5e7ca8739b7e97aca88500a Mon Sep 17 00:00:00 2001
From: Mark Fasheh
Date: Thu, 27 Oct 2016 14:03:41 -0500
Subject: [PATCH] scoutfs: use extents for file data

We're very basic here at this stage and simply put a single-block
extent item where we would have previously had a multi-block bmap
item. Multi-block extents will come in future patches.

Signed-off-by: Mark Fasheh
Signed-off-by: Zach Brown
---
 kmod/src/filerw.c        | 165 +++++++++++++++------------------
 kmod/src/filerw.h        |   2 +-
 kmod/src/format.h        |  24 ++----
 kmod/src/inode.c         |   2 +-
 kmod/src/ioctl.c         |   2 +-
 kmod/src/scoutfs_trace.h |   2 +-
 6 files changed, 74 insertions(+), 123 deletions(-)
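The shape of the change: the old code kept one fixed-size bmap item per
aligned group of eight blocks, keyed by the group (the removed
set_bmap_key() used iblock >> SCOUTFS_BLOCK_MAP_SHIFT, with
iblock & SCOUTFS_BLOCK_MAP_MASK picking the slot), while the new code
keys one extent item per mapped block directly by its logical block
number. A small userspace sketch of the two keying schemes; the
constants mirror format.h below, but everything else here is an
illustrative stand-in, not the kernel module's real code:

  /* Userspace sketch of the key offsets used before and after this
   * patch; illustrative only. */
  #include <stdint.h>
  #include <stdio.h>

  #define SCOUTFS_BLOCK_MAP_SHIFT 3
  #define SCOUTFS_BLOCK_MAP_MASK  ((1 << SCOUTFS_BLOCK_MAP_SHIFT) - 1)

  int main(void)
  {
          uint64_t iblock = 13;

          /* old: one bmap item covers an aligned group of 8 blocks */
          printf("bmap item offset %llu, slot %llu\n",
                 (unsigned long long)(iblock >> SCOUTFS_BLOCK_MAP_SHIFT),
                 (unsigned long long)(iblock & SCOUTFS_BLOCK_MAP_MASK));

          /* new: one extent item per block, keyed by the block itself */
          printf("extent item offset %llu, len 1\n",
                 (unsigned long long)iblock);
          return 0;
  }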
diff --git a/kmod/src/filerw.c b/kmod/src/filerw.c
index 091f2333..841b5a01 100644
--- a/kmod/src/filerw.c
+++ b/kmod/src/filerw.c
@@ -27,8 +27,8 @@
 #include "ioctl.h"
 
 /*
- * scoutfs uses simple fixed size block mapping items to map aligned
- * groups of logical file data blocks to physical block locations.
+ * scoutfs uses an extent item to map logical file data blocks to
+ * physical block locations.
  *
  * The small block size is set to the smallest supported page size.
  * This means that our file IO code never has to worry about the
@@ -40,7 +40,7 @@
  * is, and have a 1:1 relationship between block writes and block
  * mapping item entries.
  *
- * Dirty blocks are only written to free space. The first time a block
+ * Dirty extents are only written to free space. The first time a block
  * hits write_page in a transaction it gets a newly allocated block. We
  * get decent contiguous allocations by having per-task preallocation
  * streams. These are trimmed back as the transaction is committed. We
@@ -64,7 +64,7 @@
  *  - need to wire up dirty inode?
  *  - enforce writing to free blknos
  *  - per-task allocation regions
- *  - tear down dirty blocks left by write errors on unmount
+ *  - tear down dirty extents left by write errors on unmount
  *  - should invalidate dirty blocks if freed
  *  - data block checksumming (stable pages)
  *  - mmap creating dirty unmapped pages at writepage
@@ -177,96 +177,72 @@ static void return_file_block(struct super_block *sb, u64 blkno)
 	spin_unlock(&sbi->file_alloc_lock);
 }
 
-static bool bmap_has_blocks(struct scoutfs_block_map *bmap)
-{
-	int i;
-
-	for (i = 0; i < SCOUTFS_BLOCK_MAP_COUNT; i++) {
-		if (bmap->blkno[i])
-			return true;
-	}
-
-	return false;
-}
-
 /*
- * Free mapped blocks whose entire contents are past the new specified
- * size. The caller holds a transaction. If we truncate all the blocks
- * in a mapping item then we remove the item.
+ * Free mapped extents whose entire contents are past the new
+ * specified size. The caller holds a transaction.
  *
- * This is the low level block allocation and bmap item manipulation.
+ * This is the low level extent item truncate code.
  * Callers manage higher order truncation and orphan cleanup.
  *
- * XXX what to do about leaving items past i_size?
- * XXX probably should be a range
  */
-int scoutfs_truncate_block_items(struct super_block *sb, u64 ino, u64 size)
+int scoutfs_truncate_extent_items(struct super_block *sb, u64 ino, u64 size)
 {
 	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
-	struct scoutfs_block_map bmap;
+	struct scoutfs_extent extent;
 	struct scoutfs_btree_val val;
-	struct scoutfs_key last;
 	struct scoutfs_key key;
-	bool modified;
+	struct scoutfs_key first;
 	u64 iblock;
-	u64 blkno;
+	u64 len;
+	u64 loff;
+	u64 seq;
 	int ret;
-	int i;
 
 	iblock = DIV_ROUND_UP(size, SCOUTFS_BLOCK_SIZE);
-	i = iblock & SCOUTFS_BLOCK_MAP_MASK;
-	scoutfs_set_key(&key, ino, SCOUTFS_BMAP_KEY,
-			iblock & ~(u64)SCOUTFS_BLOCK_MAP_MASK);
-	scoutfs_set_key(&last, ino, SCOUTFS_BMAP_KEY, ~0ULL);
+	scoutfs_set_key(&first, ino, SCOUTFS_EXTENT_KEY, 0);
+	scoutfs_set_key(&key, ino, SCOUTFS_EXTENT_KEY, ~0ULL);
 
-	trace_printk("iblock %llu i %d\n", iblock, i);
+	trace_printk("iblock %llu\n", iblock);
 
-	scoutfs_btree_init_val(&val, &bmap, sizeof(bmap));
+	scoutfs_btree_init_val(&val, &extent, sizeof(extent));
 	val.check_size_eq = 1;
 
 	for (;;) {
-		ret = scoutfs_btree_next(sb, meta, &key, &last, &key, &val);
+		ret = scoutfs_btree_prev(sb, meta, &first, &key, &key, &seq,
+					 &val);
 		if (ret < 0) {
 			if (ret == -ENOENT)
 				ret = 0;
 			break;
 		}
 
-		/* XXX check bmap sanity */
+		loff = le64_to_cpu(key.offset);
+		len = le64_to_cpu(extent.len);
 
-		/* make sure we can update bmap after freeing */
+		if (WARN_ON_ONCE(len != 1)) {
+			ret = -EIO;
+			break;
+		}
+
+		if ((loff + len) <= iblock)
+			break;
+
+		/* make sure we can delete the extent after freeing */
 		ret = scoutfs_btree_dirty(sb, meta, &key);
 		if (ret)
 			break;
 
-		modified = false;
-		for (; i < SCOUTFS_BLOCK_MAP_COUNT; i++) {
-			blkno = le64_to_cpu(bmap.blkno[i]);
-			if (blkno == 0)
-				continue;
-
-			ret = scoutfs_buddy_free(sb, bmap.seq[i], blkno, 0);
-			if (ret)
-				break;
-
-			bmap.blkno[i] = 0;
-			bmap.seq[i] = 0;
-			modified = true;
-		}
-		i = 0;
-
-		/* dirtying should have prevented these from failing */
-		if (!bmap_has_blocks(&bmap))
-			scoutfs_btree_delete(sb, meta, &key);
-		else if (modified)
-			scoutfs_btree_update(sb, meta, &key, &val);
-
+		ret = scoutfs_buddy_free(sb, cpu_to_le64(seq),
+					 le64_to_cpu(extent.blkno), 0);
 		if (ret)
 			break;
 
+		scoutfs_btree_delete(sb, meta, &key);
+
 		/* XXX sync transaction if it's enormous */
-		scoutfs_inc_key(&key);
+		scoutfs_dec_key(&key);
 	}
 
 	return ret;
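scoutfs_truncate_extent_items() above walks the inode's extent key
space backwards with scoutfs_btree_prev(), from offset ~0 down toward
the new size, freeing and deleting whole extents until it meets one
that ends at or inside the new block count; DIV_ROUND_UP() is what
keeps a partially populated final block alive. A minimal userspace
model of that reverse walk, with a sorted array standing in for btree
items; the names and values here are illustrative assumptions:

  /* Userspace model of the reverse truncate walk: single-block
   * extents sorted by logical offset, walked from the top down. */
  #include <stdint.h>
  #include <stdio.h>

  #define BLOCK_SIZE 4096

  struct ext { uint64_t loff, len; };

  int main(void)
  {
          struct ext items[] = { {0, 1}, {1, 1}, {2, 1}, {7, 1} };
          int n = 4;
          uint64_t size = 5000;   /* new i_size */
          uint64_t iblock = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;

          /* walk backwards, as scoutfs_btree_prev() does */
          while (n-- > 0) {
                  struct ext *e = &items[n];

                  if (e->loff + e->len <= iblock)
                          break;  /* rest of the file is within size */
                  printf("free and delete extent at %llu\n",
                         (unsigned long long)e->loff);
          }
          return 0;
  }

With size 5000 this frees the extents at blocks 7 and 2 but keeps
blocks 0 and 1, since byte 4999 still lives in block 1.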
@@ -291,44 +267,27 @@ void scoutfs_filerw_free_alloc(struct super_block *sb)
 	sbi->file_alloc_count = 0;
 }
 
-static void set_bmap_key(struct scoutfs_key *key, struct inode *inode,
-			 u64 iblock)
-{
-	scoutfs_set_key(key, scoutfs_ino(inode), SCOUTFS_BMAP_KEY,
-			iblock >> SCOUTFS_BLOCK_MAP_SHIFT);
-}
-
 /*
  * Return the number of contiguously mapped blocks starting from the
- * given logical block in the inode. We only return the number
- * contained in one block map item. We walk through more items if it
- * makes a difference.
+ * given logical block in the inode.
  */
 static int contig_mapped_blocks(struct inode *inode, u64 iblock, u64 *blkno)
 {
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
 	struct scoutfs_btree_val val;
-	struct scoutfs_block_map bmap;
+	struct scoutfs_extent extent;
 	struct scoutfs_key key;
 	int ret;
-	int i;
 
 	*blkno = 0;
-
-	set_bmap_key(&key, inode, iblock);
-	scoutfs_btree_init_val(&val, &bmap, sizeof(bmap));
+	scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_EXTENT_KEY, iblock);
+	scoutfs_btree_init_val(&val, &extent, sizeof(extent));
 
 	ret = scoutfs_btree_lookup(sb, meta, &key, &val);
-	if (ret == sizeof(bmap)) {
-		i = iblock & SCOUTFS_BLOCK_MAP_MASK;
-		*blkno = le64_to_cpu(bmap.blkno[i]);
-
-		ret = 0;
-		while (i < SCOUTFS_BLOCK_MAP_COUNT && bmap.blkno[i]) {
-			ret++;
-			i++;
-		}
+	if (ret == sizeof(extent)) {
+		*blkno = le64_to_cpu(extent.blkno);
+		ret = min_t(u64, le64_to_cpu(extent.len), INT_MAX);
 	} else if (ret >= 0) {
 		/* XXX corruption */
 		ret = -EIO;
@@ -360,44 +319,47 @@ static int map_writable_block(struct inode *inode, u64 iblock, u64 *blkno_ret)
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->stable_super;
 	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
-	struct scoutfs_block_map bmap;
+	struct scoutfs_extent extent;
 	struct scoutfs_btree_val val;
+	struct scoutfs_key first;
 	struct scoutfs_key key;
 	bool inserted = false;
 	u64 old_blkno = 0;
 	u64 new_blkno = 0;
+	u64 seq;
 	int ret;
 	int err;
-	int i;
 
-	set_bmap_key(&key, inode, iblock);
-	scoutfs_btree_init_val(&val, &bmap, sizeof(bmap));
+	scoutfs_set_key(&first, scoutfs_ino(inode), SCOUTFS_EXTENT_KEY, 0);
+	scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_EXTENT_KEY, iblock);
+	scoutfs_btree_init_val(&val, &extent, sizeof(extent));
 	val.check_size_eq = 1;
 
 	/* see if there's an existing mapping */
-	ret = scoutfs_btree_lookup(sb, meta, &key, &val);
+	ret = scoutfs_btree_prev(sb, meta, &first, &key, &key, &seq, &val);
+	if (ret == 0 && ((le64_to_cpu(key.offset) +
+			  le64_to_cpu(extent.len)) <= iblock))
+		ret = -ENOENT;
 	if (ret < 0 && ret != -ENOENT)
 		goto out;
 
-	/* make sure that updating the bmap item won't fail */
+	/* make sure that updating the extent item won't fail */
 	if (ret == -ENOENT) {
-		memset(&bmap, 0, sizeof(bmap));
+		memset(&extent, 0, sizeof(extent));
 		ret = scoutfs_btree_insert(sb, meta, &key, &val);
 		if (ret)
 			goto out;
 		inserted = true;
-
 	} else {
 		ret = scoutfs_btree_dirty(sb, meta, &key);
 		if (ret)
 			goto out;
 	}
 
-	i = iblock & SCOUTFS_BLOCK_MAP_MASK;
-	old_blkno = le64_to_cpu(bmap.blkno[i]);
+	old_blkno = le64_to_cpu(extent.blkno);
 
 	/* If the existing block is dirty then we can use it */
-	if (old_blkno && (bmap.seq[i] == super->hdr.seq)) {
+	if (old_blkno && cpu_to_le64(seq) == super->hdr.seq) {
 		*blkno_ret = old_blkno;
 		ret = 0;
 		goto out;
@@ -408,13 +370,13 @@ static int map_writable_block(struct inode *inode, u64 iblock, u64 *blkno_ret)
 		goto out;
 
 	if (old_blkno) {
-		ret = scoutfs_buddy_free(sb, bmap.seq[i], old_blkno, 0);
+		ret = scoutfs_buddy_free(sb, cpu_to_le64(seq), old_blkno, 0);
 		if (ret)
 			goto out;
 	}
 
-	bmap.blkno[i] = cpu_to_le64(new_blkno);
-	bmap.seq[i] = super->hdr.seq;
+	extent.blkno = cpu_to_le64(new_blkno);
+	extent.len = cpu_to_le64(1);
 
 	/* dirtying guarantees success */
 	err = scoutfs_btree_update(sb, meta, &key, &val);
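map_writable_block() above decides between overwriting in place and
copy-on-write: an extent whose btree sequence number matches the
current super sequence was written in this transaction and is still
dirty, so its block can be reused, while anything older gets a freshly
allocated block and the old one is freed back to the allocator. A
small userspace sketch of that decision, under this patch's assumption
that every extent is a single block; the helper names are stand-ins:

  /* Userspace sketch of the "can we overwrite in place?" test. */
  #include <stdint.h>
  #include <stdio.h>

  struct extent { uint64_t blkno, len, seq; };

  static uint64_t map_writable(struct extent *e, uint64_t cur_seq,
                               uint64_t next_free)
  {
          if (e->blkno && e->seq == cur_seq)
                  return e->blkno;        /* dirty in this transaction */

          /* CoW: point the extent at a newly allocated free block */
          e->blkno = next_free;
          e->seq = cur_seq;
          e->len = 1;
          return e->blkno;
  }

  int main(void)
  {
          struct extent e = { .blkno = 100, .len = 1, .seq = 7 };

          /* stale seq 7: reallocates to block 200 */
          printf("write to %llu\n",
                 (unsigned long long)map_writable(&e, 8, 200));
          /* already dirty at seq 8: stays on block 200 */
          printf("write to %llu\n",
                 (unsigned long long)map_writable(&e, 8, 300));
          return 0;
  }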
@@ -448,7 +410,8 @@ static int scoutfs_readpage_get_block(struct inode *inode, sector_t iblock,
 	ret = contig_mapped_blocks(inode, iblock, &blkno);
 	if (ret > 0) {
 		map_bh(bh, inode->i_sb, blkno);
-		bh->b_size = min_t(int, bh->b_size, ret << inode->i_blkbits);
+		bh->b_size = min_t(u64, bh->b_size,
+				   (u64)ret << inode->i_blkbits);
 		ret = 0;
 	}
 
@@ -486,7 +449,7 @@ static int scoutfs_writepage_get_block(struct inode *inode, sector_t iblock,
 }
 
 /*
- * Dirty file blocks can be written to their newly allocated free blocks
+ * Dirty file pages can be written to their newly allocated free extents
  * at any time. They won't be referenced by metadata until the current
  * transaction is committed. They can be re-read and re-dirtied at
  * their free block number in this transaction.
@@ -507,8 +470,8 @@ static int scoutfs_writepages(struct address_space *mapping,
 }
 
 /*
- * Block allocation during buffered writes needs to make sure that the
- * dirty block will be written to free space.
+ * Extent allocation during buffered writes needs to make sure that the
+ * dirty blocks will be written to free space.
  */
 static int scoutfs_write_begin_get_block(struct inode *inode, sector_t iblock,
 					 struct buffer_head *bh, int create)
diff --git a/kmod/src/filerw.h b/kmod/src/filerw.h
index ba2bb81f..de4972d3 100644
--- a/kmod/src/filerw.h
+++ b/kmod/src/filerw.h
@@ -5,6 +5,6 @@ extern const struct address_space_operations scoutfs_file_aops;
 extern const struct file_operations scoutfs_file_fops;
 
 void scoutfs_filerw_free_alloc(struct super_block *sb);
-int scoutfs_truncate_block_items(struct super_block *sb, u64 ino, u64 size);
+int scoutfs_truncate_extent_items(struct super_block *sb, u64 ino, u64 size);
 
 #endif
diff --git a/kmod/src/format.h b/kmod/src/format.h
index fafe802a..2b2b31b0 100644
--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -113,7 +113,7 @@ struct scoutfs_key {
 #define SCOUTFS_DIRENT_KEY 5
 #define SCOUTFS_LINK_BACKREF_KEY 6
 #define SCOUTFS_SYMLINK_KEY 7
-#define SCOUTFS_BMAP_KEY 8
+#define SCOUTFS_EXTENT_KEY 8
 #define SCOUTFS_ORPHAN_KEY 9
 
 #define SCOUTFS_MAX_ITEM_LEN 512
@@ -288,23 +288,11 @@ struct scoutfs_xattr {
 	__u8 name[0];
 } __packed;
 
-/*
- * We use simple block map items to map a aligned fixed group of logical
- * block offsets to physical blocks. We make them a decent size to
- * reduce the item storage overhead per block referenced, but we don't
- * want them so large that they start to take up an extraordinary amount
- * of space for small files. 8 block items ranges from around 3% to .3%
- * overhead for files that use only one or all of the blocks in the
- * mapping item.
- */
-#define SCOUTFS_BLOCK_MAP_SHIFT 3
-#define SCOUTFS_BLOCK_MAP_COUNT (1 << SCOUTFS_BLOCK_MAP_SHIFT)
-#define SCOUTFS_BLOCK_MAP_MASK (SCOUTFS_BLOCK_MAP_COUNT - 1)
-
-struct scoutfs_block_map {
-	__le64 blkno[SCOUTFS_BLOCK_MAP_COUNT];
-	__le64 seq[SCOUTFS_BLOCK_MAP_COUNT];
-};
+struct scoutfs_extent {
+	__le64 blkno;
+	__le64 len;
+	__u8 flags;
+} __packed;
 
 /*
  * link backrefs give us a way to find all the hard links that refer
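On-disk cost of the new item: struct scoutfs_block_map was sixteen
__le64s, 128 bytes per aligned group of eight blocks, while the packed
struct scoutfs_extent is 17 bytes per mapped block. The per-block seq
array also disappears from the value; the btree item's own sequence
number takes over that role, which is why the filerw.c hunks above read
seq through scoutfs_btree_prev(). A userspace check of those value
sizes, using stand-in copies of the two structs:

  /* Userspace copies of the old and new on-disk items from format.h,
   * just to check the value sizes discussed above. */
  #include <stdint.h>
  #include <stdio.h>

  struct old_block_map {
          uint64_t blkno[8];
          uint64_t seq[8];
  } __attribute__((packed));

  struct new_extent {
          uint64_t blkno;
          uint64_t len;
          uint8_t flags;
  } __attribute__((packed));

  int main(void)
  {
          /* 128 bytes per 8-block group vs 17 bytes per mapped block */
          printf("block_map %zu bytes, extent %zu bytes\n",
                 sizeof(struct old_block_map), sizeof(struct new_extent));
          return 0;
  }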
diff --git a/kmod/src/inode.c b/kmod/src/inode.c
index 98a005ca..81a18a85 100644
--- a/kmod/src/inode.c
+++ b/kmod/src/inode.c
@@ -425,7 +425,7 @@ static int __delete_inode(struct super_block *sb, struct scoutfs_key *key,
 	if (S_ISLNK(mode))
 		ret = scoutfs_symlink_drop(sb, ino);
 	else if (S_ISREG(mode))
-		ret = scoutfs_truncate_block_items(sb, ino, 0);
+		ret = scoutfs_truncate_extent_items(sb, ino, 0);
 	if (ret)
 		goto out;
 
diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c
index 14c54448..47082869 100644
--- a/kmod/src/ioctl.c
+++ b/kmod/src/ioctl.c
@@ -286,7 +286,7 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case SCOUTFS_IOC_FIND_XATTR_VAL:
 		return scoutfs_ioc_find_xattr(file, arg, false);
 	case SCOUTFS_IOC_INODE_DATA_SINCE:
-		return scoutfs_ioc_inodes_since(file, arg, SCOUTFS_BMAP_KEY);
+		return scoutfs_ioc_inodes_since(file, arg, SCOUTFS_EXTENT_KEY);
 	}
 
 	return -ENOTTY;
diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h
index a1ca45d8..a9a18118 100644
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -39,7 +39,7 @@ struct scoutfs_sb_info;
 	{ SCOUTFS_DIRENT_KEY, "DIRENT" }, \
 	{ SCOUTFS_LINK_BACKREF_KEY, "LINK_BACKREF"}, \
 	{ SCOUTFS_SYMLINK_KEY, "SYMLINK" }, \
-	{ SCOUTFS_BMAP_KEY, "BMAP" })
+	{ SCOUTFS_EXTENT_KEY, "EXTENT" })
 
 #define TRACE_KEYF "%llu.%s.%llu"
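The inode.c hunk shows that deleting a regular inode reduces to the
same truncate path with a size of zero: DIV_ROUND_UP(0,
SCOUTFS_BLOCK_SIZE) is block 0, so the reverse walk in
scoutfs_truncate_extent_items() frees every extent the inode owns,
while on an ordinary truncate the rounding keeps any block that still
holds bytes below the new size. A tiny illustrative demo of that
boundary arithmetic, assuming 4KB blocks:

  /* DIV_ROUND_UP() keeps any block that still holds bytes below the
   * new size; truncating to 0 therefore frees every extent. */
  #include <stdint.h>
  #include <stdio.h>

  #define BLOCK_SIZE 4096
  #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

  int main(void)
  {
          uint64_t sizes[] = { 0, 1, 4096, 4097 };
          int i;

          for (i = 0; i < 4; i++)
                  printf("size %llu: first freed block %llu\n",
                         (unsigned long long)sizes[i],
                         (unsigned long long)DIV_ROUND_UP(sizes[i],
                                                          BLOCK_SIZE));
          return 0;
  }

This prints first freed blocks 0, 1, 1 and 2: a one-byte file keeps
block 0, and a 4097-byte file keeps blocks 0 and 1.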