scoutfs: use extents for file data

We're very basic here at this stage and simply put a single-block extent
item where we would have previously had a multi-block bmap item.
Multi-block extents will come in future patches.

Signed-off-by: Mark Fasheh <mfasheh@versity.com>
Signed-off-by: Zach Brown <zab@versity.com>
Author: Mark Fasheh
Date: 2016-10-27 14:03:41 -05:00
Committed by: Zach Brown
parent f1b29c8372
commit 467801de73
6 changed files with 74 additions and 123 deletions
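
The gist of the change, as a standalone sketch (not code from the patch: the key struct, helper, and printing below are simplified stand-ins, and endianness handling is ignored): each mapped block now gets its own item, keyed by (inode, SCOUTFS_EXTENT_KEY, logical block), whose value is a single-block extent, where the old scheme packed eight blkno/seq slots into one aligned block map item.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t __le64;	/* stand-in: endianness ignored in this sketch */
typedef uint8_t __u8;

#define SCOUTFS_EXTENT_KEY 8

/* on-disk extent item value, as introduced by this patch */
struct scoutfs_extent {
	__le64 blkno;
	__le64 len;
	__u8 flags;
} __attribute__((packed));

/* simplified stand-in for struct scoutfs_key / scoutfs_set_key() */
struct key { uint64_t ino; uint8_t type; uint64_t offset; };

/* build the item mapping logical block iblock of inode ino to blkno */
static void make_extent_item(uint64_t ino, uint64_t iblock, uint64_t blkno,
			     struct key *key, struct scoutfs_extent *ext)
{
	key->ino = ino;
	key->type = SCOUTFS_EXTENT_KEY;
	key->offset = iblock;	/* extents are keyed by logical block */

	ext->blkno = blkno;
	ext->len = 1;		/* this patch only creates 1-block extents */
	ext->flags = 0;
}

int main(void)
{
	struct scoutfs_extent ext;
	struct key key;

	make_extent_item(1000, 37, 123456, &key, &ext);
	printf("key %llu.%u.%llu -> blkno %llu len %llu\n",
	       (unsigned long long)key.ino, key.type,
	       (unsigned long long)key.offset,
	       (unsigned long long)ext.blkno,
	       (unsigned long long)ext.len);
	return 0;
}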


@@ -27,8 +27,8 @@
 #include "ioctl.h"
 
 /*
- * scoutfs uses simple fixed size block mapping items to map aligned
- * groups of logical file data blocks to physical block locations.
+ * scoutfs uses an extent item to map logical file data blocks to
+ * physical block locations.
  *
  * The small block size is set to the smallest supported page size.
  * This means that our file IO code never has to worry about the
@@ -40,7 +40,7 @@
 * is, and have a 1:1 relationship between block writes and block
 * mapping item entries.
 *
- * Dirty blocks are only written to free space. The first time a block
+ * Dirty extents are only written to free space. The first time a block
 * hits write_page in a transaction it gets a newly allocated block. We
 * get decent contiguous allocations by having per-task preallocation
 * streams. These are trimmed back as the transaction is committed. We
@@ -64,7 +64,7 @@
* - need to wire up dirty inode?
* - enforce writing to free blknos
* - per-task allocation regions
* - tear down dirty blocks left by write errors on unmount
* - tear down dirty extents left by write errors on unmount
* - should invalidate dirty blocks if freed
* - data block checksumming (stable pages)
* - mmap creating dirty unmapped pages at writepage
@@ -177,96 +177,72 @@ static void return_file_block(struct super_block *sb, u64 blkno)
 	spin_unlock(&sbi->file_alloc_lock);
 }
 
-static bool bmap_has_blocks(struct scoutfs_block_map *bmap)
-{
-	int i;
-
-	for (i = 0; i < SCOUTFS_BLOCK_MAP_COUNT; i++) {
-		if (bmap->blkno[i])
-			return true;
-	}
-
-	return false;
-}
-
 /*
- * Free mapped blocks whose entire contents are past the new specified
- * size. The caller holds a transaction. If we truncate all the blocks
- * in a mapping item then we remove the item.
+ * Free mapped extents whose entire contents are past the new
+ * specified size. The caller holds a transaction.
  *
- * This is the low level block allocation and bmap item manipulation.
+ * This is the low level extent item truncate code.
  * Callers manage higher order truncation and orphan cleanup.
  *
- * XXX what to do about leaving items past i_size?
+ * XXX probably should be a range
  */
-int scoutfs_truncate_block_items(struct super_block *sb, u64 ino, u64 size)
+int scoutfs_truncate_extent_items(struct super_block *sb, u64 ino, u64 size)
 {
 	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
-	struct scoutfs_block_map bmap;
+	struct scoutfs_extent extent;
 	struct scoutfs_btree_val val;
-	struct scoutfs_key last;
 	struct scoutfs_key key;
-	bool modified;
+	struct scoutfs_key first;
 	u64 iblock;
-	u64 blkno;
+	u64 len;
+	u64 loff;
+	u64 seq;
 	int ret;
-	int i;
 
 	iblock = DIV_ROUND_UP(size, SCOUTFS_BLOCK_SIZE);
-	i = iblock & SCOUTFS_BLOCK_MAP_MASK;
-	scoutfs_set_key(&key, ino, SCOUTFS_BMAP_KEY,
-			iblock & ~(u64)SCOUTFS_BLOCK_MAP_MASK);
-	scoutfs_set_key(&last, ino, SCOUTFS_BMAP_KEY, ~0ULL);
+	scoutfs_set_key(&first, ino, SCOUTFS_EXTENT_KEY, 0);
+	scoutfs_set_key(&key, ino, SCOUTFS_EXTENT_KEY, ~0ULL);
 
-	trace_printk("iblock %llu i %d\n", iblock, i);
+	trace_printk("iblock %llu\n", iblock);
 
-	scoutfs_btree_init_val(&val, &bmap, sizeof(bmap));
+	scoutfs_btree_init_val(&val, &extent, sizeof(extent));
 	val.check_size_eq = 1;
 
 	for (;;) {
-		ret = scoutfs_btree_next(sb, meta, &key, &last, &key, &val);
+		ret = scoutfs_btree_prev(sb, meta, &first, &key, &key, &seq,
+					 &val);
		if (ret < 0) {
 			if (ret == -ENOENT)
 				ret = 0;
 			break;
 		}
 
-		/* XXX check bmap sanity */
+		loff = le64_to_cpu(key.offset);
+		len = le64_to_cpu(extent.len);
 
-		/* make sure we can update bmap after freeing */
+		if (WARN_ON_ONCE(len != 1)) {
+			ret = -EIO;
+			break;
+		}
+
+		if ((loff + len) <= iblock)
+			break;
+
+		/* make sure we can delete the extent after freeing */
 		ret = scoutfs_btree_dirty(sb, meta, &key);
 		if (ret)
 			break;
 
-		modified = false;
-		for (; i < SCOUTFS_BLOCK_MAP_COUNT; i++) {
-			blkno = le64_to_cpu(bmap.blkno[i]);
-			if (blkno == 0)
-				continue;
-
-			ret = scoutfs_buddy_free(sb, bmap.seq[i], blkno, 0);
-			if (ret)
-				break;
-
-			bmap.blkno[i] = 0;
-			bmap.seq[i] = 0;
-			modified = true;
-		}
-		i = 0;
 
 		/* dirtying should have prevented these from failing */
-		if (!bmap_has_blocks(&bmap))
-			scoutfs_btree_delete(sb, meta, &key);
-		else if (modified)
-			scoutfs_btree_update(sb, meta, &key, &val);
+		ret = scoutfs_buddy_free(sb, cpu_to_le64(seq),
+					 le64_to_cpu(extent.blkno), 0);
+		if (ret)
+			break;
+
+		scoutfs_btree_delete(sb, meta, &key);
 
 		/* XXX sync transaction if it's enormous */
 
-		scoutfs_inc_key(&key);
+		scoutfs_dec_key(&key);
 	}
 
 	return ret;
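
Restated outside the kernel: the new truncate loop walks extent items from the end of the file backwards, freeing and deleting each one until it reaches an extent that ends at or below the new size. This toy version with an in-memory item array (a sketch, not the patch's btree API) shows the same control flow:

#include <stdint.h>
#include <stdio.h>

#define BLOCK_SIZE 4096ULL

struct extent { uint64_t iblock, blkno, len; int live; };

/* toy "btree": extents sorted by iblock, one block each */
static struct extent items[] = {
	{ 0, 100, 1, 1 }, { 1, 101, 1, 1 }, { 2, 102, 1, 1 }, { 3, 200, 1, 1 },
};
#define NR (sizeof(items) / sizeof(items[0]))

/* free extents whose contents lie entirely past the new size,
 * walking from the last item backwards like the patched loop */
static void truncate_extent_items(uint64_t size)
{
	uint64_t iblock = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
	int i;

	for (i = NR - 1; i >= 0; i--) {
		if (!items[i].live)
			continue;
		/* stop once an extent ends at or below the new size */
		if (items[i].iblock + items[i].len <= iblock)
			break;
		/* the kernel code frees the block, then deletes the item */
		printf("free blkno %llu, delete item at iblock %llu\n",
		       (unsigned long long)items[i].blkno,
		       (unsigned long long)items[i].iblock);
		items[i].live = 0;
	}
}

int main(void)
{
	truncate_extent_items(2 * BLOCK_SIZE);	/* keeps iblocks 0 and 1 */
	return 0;
}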
@@ -291,44 +267,27 @@ void scoutfs_filerw_free_alloc(struct super_block *sb)
 	sbi->file_alloc_count = 0;
 }
 
-static void set_bmap_key(struct scoutfs_key *key, struct inode *inode,
-			 u64 iblock)
-{
-	scoutfs_set_key(key, scoutfs_ino(inode), SCOUTFS_BMAP_KEY,
-			iblock >> SCOUTFS_BLOCK_MAP_SHIFT);
-}
-
 /*
  * Return the number of contiguously mapped blocks starting from the
- * given logical block in the inode. We only return the number
- * contained in one block map item. We walk through more items if it
- * makes a difference.
+ * given logical block in the inode.
  */
 static int contig_mapped_blocks(struct inode *inode, u64 iblock, u64 *blkno)
 {
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
 	struct scoutfs_btree_val val;
-	struct scoutfs_block_map bmap;
+	struct scoutfs_extent extent;
 	struct scoutfs_key key;
 	int ret;
-	int i;
 
 	*blkno = 0;
 
-	set_bmap_key(&key, inode, iblock);
-	scoutfs_btree_init_val(&val, &bmap, sizeof(bmap));
+	scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_EXTENT_KEY, iblock);
+	scoutfs_btree_init_val(&val, &extent, sizeof(extent));
 
 	ret = scoutfs_btree_lookup(sb, meta, &key, &val);
-	if (ret == sizeof(bmap)) {
-		i = iblock & SCOUTFS_BLOCK_MAP_MASK;
-		*blkno = le64_to_cpu(bmap.blkno[i]);
-		ret = 0;
-		while (i < SCOUTFS_BLOCK_MAP_COUNT && bmap.blkno[i]) {
-			ret++;
-			i++;
-		}
+	if (ret == sizeof(extent)) {
+		*blkno = le64_to_cpu(extent.blkno);
+		ret = min_t(u64, le64_to_cpu(extent.len), INT_MAX);
 	} else if (ret >= 0) {
 		/* XXX corruption */
 		ret = -EIO;
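
Since extents are single-block for now, contig_mapped_blocks() becomes an exact-key lookup, and the u64 extent length is clamped to INT_MAX because the contiguous count is returned as an int. A hypothetical condensed version of that logic, with a stubbed lookup in place of the btree call:

#include <stdint.h>
#include <stdio.h>
#include <limits.h>

struct ext_val { uint64_t blkno, len; };

/* hypothetical lookup stub: returns 0 on a hit and fills *ext */
static int lookup_extent(uint64_t ino, uint64_t iblock, struct ext_val *ext)
{
	(void)ino;
	ext->blkno = 5000 + iblock;
	ext->len = 1;
	return 0;
}

/* number of contiguously mapped blocks at iblock, clamped to int range */
static int contig_mapped(uint64_t ino, uint64_t iblock, uint64_t *blkno)
{
	struct ext_val ext;
	int ret = lookup_extent(ino, iblock, &ext);

	*blkno = 0;
	if (ret)
		return ret;
	*blkno = ext.blkno;
	return ext.len > INT_MAX ? INT_MAX : (int)ext.len;
}

int main(void)
{
	uint64_t blkno;
	int n = contig_mapped(1, 0, &blkno);

	printf("%d block(s) at blkno %llu\n", n, (unsigned long long)blkno);
	return 0;
}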
@@ -360,44 +319,47 @@ static int map_writable_block(struct inode *inode, u64 iblock, u64 *blkno_ret)
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->stable_super;
 	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
-	struct scoutfs_block_map bmap;
+	struct scoutfs_extent extent;
 	struct scoutfs_btree_val val;
+	struct scoutfs_key first;
 	struct scoutfs_key key;
 	bool inserted = false;
 	u64 old_blkno = 0;
 	u64 new_blkno = 0;
+	u64 seq;
 	int ret;
 	int err;
-	int i;
 
-	set_bmap_key(&key, inode, iblock);
-	scoutfs_btree_init_val(&val, &bmap, sizeof(bmap));
+	scoutfs_set_key(&first, scoutfs_ino(inode), SCOUTFS_EXTENT_KEY, 0);
+	scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_EXTENT_KEY, iblock);
+	scoutfs_btree_init_val(&val, &extent, sizeof(extent));
 	val.check_size_eq = 1;
 
 	/* see if there's an existing mapping */
-	ret = scoutfs_btree_lookup(sb, meta, &key, &val);
+	ret = scoutfs_btree_prev(sb, meta, &first, &key, &key, &seq, &val);
+	if (ret == 0 && ((le64_to_cpu(key.offset) +
+			  le64_to_cpu(extent.len)) <= iblock))
+		ret = -ENOENT;
 	if (ret < 0 && ret != -ENOENT)
 		goto out;
 
-	/* make sure that updating the bmap item won't fail */
+	/* make sure that updating the extent item won't fail */
 	if (ret == -ENOENT) {
-		memset(&bmap, 0, sizeof(bmap));
+		memset(&extent, 0, sizeof(extent));
 		ret = scoutfs_btree_insert(sb, meta, &key, &val);
 		if (ret)
 			goto out;
 		inserted = true;
 	} else {
 		ret = scoutfs_btree_dirty(sb, meta, &key);
 		if (ret)
 			goto out;
 	}
 
-	i = iblock & SCOUTFS_BLOCK_MAP_MASK;
-	old_blkno = le64_to_cpu(bmap.blkno[i]);
+	old_blkno = le64_to_cpu(extent.blkno);
 
 	/* If the existing block is dirty then we can use it */
-	if (old_blkno && (bmap.seq[i] == super->hdr.seq)) {
+	if (old_blkno && cpu_to_le64(seq) == super->hdr.seq) {
 		*blkno_ret = old_blkno;
 		ret = 0;
 		goto out;
@@ -408,13 +370,13 @@ static int map_writable_block(struct inode *inode, u64 iblock, u64 *blkno_ret)
 		goto out;
 
 	if (old_blkno) {
-		ret = scoutfs_buddy_free(sb, bmap.seq[i], old_blkno, 0);
+		ret = scoutfs_buddy_free(sb, cpu_to_le64(seq), old_blkno, 0);
 		if (ret)
 			goto out;
 	}
 
-	bmap.blkno[i] = cpu_to_le64(new_blkno);
-	bmap.seq[i] = super->hdr.seq;
+	extent.blkno = cpu_to_le64(new_blkno);
+	extent.len = cpu_to_le64(1);
 
 	/* dirtying guarantees success */
 	err = scoutfs_btree_update(sb, meta, &key, &val);
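
The rewrite preserves the old rule for dirty blocks: if the extent's sequence number matches the current transaction's, the block was allocated in this transaction and can be rewritten in place; otherwise a fresh block is allocated and the old one freed. A toy model of that decision (the allocator and seq handling are stand-ins, not the kernel code):

#include <stdint.h>
#include <stdio.h>

struct ext { uint64_t blkno, len, seq; int present; };

static uint64_t cur_seq = 7;		/* current transaction's seq */
static uint64_t next_free = 9000;	/* toy allocator cursor */

/* return a writable blkno for the extent, reusing it if already dirty */
static uint64_t map_writable(struct ext *e)
{
	if (e->present && e->seq == cur_seq)
		return e->blkno;	/* already dirty in this transaction */

	if (e->present)
		printf("free old blkno %llu\n", (unsigned long long)e->blkno);

	e->blkno = next_free++;		/* write only to free space */
	e->len = 1;
	e->seq = cur_seq;
	e->present = 1;
	return e->blkno;
}

int main(void)
{
	struct ext e = { .blkno = 42, .len = 1, .seq = 6, .present = 1 };

	/* stale extent from a committed transaction: reallocated */
	printf("got %llu\n", (unsigned long long)map_writable(&e));
	/* now dirty in this transaction: reused in place */
	printf("got %llu\n", (unsigned long long)map_writable(&e));
	return 0;
}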
@@ -448,7 +410,8 @@ static int scoutfs_readpage_get_block(struct inode *inode, sector_t iblock,
 	ret = contig_mapped_blocks(inode, iblock, &blkno);
 	if (ret > 0) {
 		map_bh(bh, inode->i_sb, blkno);
-		bh->b_size = min_t(int, bh->b_size, ret << inode->i_blkbits);
+		bh->b_size = min_t(u64, bh->b_size,
+				   (u64)ret << inode->i_blkbits);
 		ret = 0;
 	}
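
The min_t() change above is an overflow fix: with multi-block extents on the way, ret << inode->i_blkbits can exceed what an int holds, so the shift is widened to u64. For example, with 4KB blocks (i_blkbits = 12), a run of 2^20 contiguous blocks shifts to 2^32:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int blkbits = 12;	/* 4KB blocks */
	int ret = 1 << 20;	/* 2^20 contiguous blocks */

	/* old expression: the int shift overflows (undefined behavior) */
	/* int bad = ret << blkbits; */

	/* new expression: widen to u64 before shifting */
	uint64_t bytes = (uint64_t)ret << blkbits;

	printf("%llu bytes\n", (unsigned long long)bytes);	/* 4294967296 */
	return 0;
}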
@@ -486,7 +449,7 @@ static int scoutfs_writepage_get_block(struct inode *inode, sector_t iblock,
 }
 
 /*
- * Dirty file blocks can be written to their newly allocated free blocks
+ * Dirty file pages can be written to their newly allocated free extents
 * at any time. They won't be referenced by metadata until the current
 * transaction is committed. They can be re-read and re-dirtied at
 * their free block number in this transaction.
@@ -507,8 +470,8 @@ static int scoutfs_writepages(struct address_space *mapping,
 }
 
 /*
- * Block allocation during buffered writes needs to make sure that the
- * dirty block will be written to free space.
+ * Extent allocation during buffered writes needs to make sure that the
+ * dirty blocks will be written to free space.
  */
 static int scoutfs_write_begin_get_block(struct inode *inode, sector_t iblock,
 					 struct buffer_head *bh, int create)


@@ -5,6 +5,6 @@ extern const struct address_space_operations scoutfs_file_aops;
 extern const struct file_operations scoutfs_file_fops;
 
 void scoutfs_filerw_free_alloc(struct super_block *sb);
-int scoutfs_truncate_block_items(struct super_block *sb, u64 ino, u64 size);
+int scoutfs_truncate_extent_items(struct super_block *sb, u64 ino, u64 size);
 
 #endif


@@ -113,7 +113,7 @@ struct scoutfs_key {
 #define SCOUTFS_DIRENT_KEY 5
 #define SCOUTFS_LINK_BACKREF_KEY 6
 #define SCOUTFS_SYMLINK_KEY 7
-#define SCOUTFS_BMAP_KEY 8
+#define SCOUTFS_EXTENT_KEY 8
 #define SCOUTFS_ORPHAN_KEY 9
 
 #define SCOUTFS_MAX_ITEM_LEN 512
@@ -288,23 +288,11 @@ struct scoutfs_xattr {
__u8 name[0];
} __packed;
/*
* We use simple block map items to map a aligned fixed group of logical
* block offsets to physical blocks. We make them a decent size to
* reduce the item storage overhead per block referenced, but we don't
* want them so large that they start to take up an extraordinary amount
* of space for small files. 8 block items ranges from around 3% to .3%
* overhead for files that use only one or all of the blocks in the
* mapping item.
*/
#define SCOUTFS_BLOCK_MAP_SHIFT 3
#define SCOUTFS_BLOCK_MAP_COUNT (1 << SCOUTFS_BLOCK_MAP_SHIFT)
#define SCOUTFS_BLOCK_MAP_MASK (SCOUTFS_BLOCK_MAP_COUNT - 1)
struct scoutfs_block_map {
__le64 blkno[SCOUTFS_BLOCK_MAP_COUNT];
__le64 seq[SCOUTFS_BLOCK_MAP_COUNT];
};
struct scoutfs_extent {
__le64 blkno;
__le64 len;
__u8 flags;
} __packed;
/*
* link backrefs give us a way to find all the hard links that refer
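
The format tradeoff is visible from the sizes alone: the packed extent value is 17 bytes and describes at least one block, while a block map item was a fixed 128 bytes whether it mapped one block or all eight (the 3% to .3% overhead range mentioned in the deleted comment). A quick standalone check, with userspace typedefs standing in for the kernel's:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t __le64;
typedef uint8_t __u8;

#define SCOUTFS_BLOCK_MAP_COUNT 8

struct scoutfs_block_map {
	__le64 blkno[SCOUTFS_BLOCK_MAP_COUNT];
	__le64 seq[SCOUTFS_BLOCK_MAP_COUNT];
};

struct scoutfs_extent {
	__le64 blkno;
	__le64 len;
	__u8 flags;
} __attribute__((packed));

int main(void)
{
	printf("block_map: %zu bytes for up to 8 blocks\n",
	       sizeof(struct scoutfs_block_map));	/* 128 */
	printf("extent:    %zu bytes per extent\n",
	       sizeof(struct scoutfs_extent));		/* 17 */
	return 0;
}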


@@ -425,7 +425,7 @@ static int __delete_inode(struct super_block *sb, struct scoutfs_key *key,
 	if (S_ISLNK(mode))
 		ret = scoutfs_symlink_drop(sb, ino);
 	else if (S_ISREG(mode))
-		ret = scoutfs_truncate_block_items(sb, ino, 0);
+		ret = scoutfs_truncate_extent_items(sb, ino, 0);
 	if (ret)
 		goto out;


@@ -286,7 +286,7 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case SCOUTFS_IOC_FIND_XATTR_VAL:
 		return scoutfs_ioc_find_xattr(file, arg, false);
 	case SCOUTFS_IOC_INODE_DATA_SINCE:
-		return scoutfs_ioc_inodes_since(file, arg, SCOUTFS_BMAP_KEY);
+		return scoutfs_ioc_inodes_since(file, arg, SCOUTFS_EXTENT_KEY);
 	}
 
 	return -ENOTTY;


@@ -39,7 +39,7 @@ struct scoutfs_sb_info;
 	{ SCOUTFS_DIRENT_KEY, "DIRENT" },		\
 	{ SCOUTFS_LINK_BACKREF_KEY, "LINK_BACKREF"},	\
 	{ SCOUTFS_SYMLINK_KEY, "SYMLINK" },		\
-	{ SCOUTFS_BMAP_KEY, "BMAP" })
+	{ SCOUTFS_EXTENT_KEY, "EXTENT" })
 
 #define TRACE_KEYF "%llu.%s.%llu"