From 467801de73dd1847c5e7ca8739b7e97aca88500a Mon Sep 17 00:00:00 2001
From: Mark Fasheh
Date: Thu, 27 Oct 2016 14:03:41 -0500
Subject: [PATCH] scoutfs: use extents for file data

We're very basic here at this stage and simply put a single-block
extent item where we would have previously had a multi-block bmap
item. Multi-block extents will come in future patches.

Signed-off-by: Mark Fasheh
Signed-off-by: Zach Brown
---
 kmod/src/filerw.c        | 165 +++++++++++++++------------------
 kmod/src/filerw.h        |   2 +-
 kmod/src/format.h        |  24 ++----
 kmod/src/inode.c         |   2 +-
 kmod/src/ioctl.c         |   2 +-
 kmod/src/scoutfs_trace.h |   2 +-
 6 files changed, 74 insertions(+), 123 deletions(-)
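The shape of the change: the old code kept one fixed-size bmap item per
aligned group of eight blocks, keyed by the group (the removed
set_bmap_key() used iblock >> SCOUTFS_BLOCK_MAP_SHIFT, with
iblock & SCOUTFS_BLOCK_MAP_MASK picking the slot), while the new code
keys one extent item per mapped block directly by its logical block
number. A small userspace sketch of the two keying schemes; the
constants mirror format.h below, but everything else here is an
illustrative stand-in, not the kernel module's real code:

  /* Userspace sketch of the key offsets used before and after this
   * patch; illustrative only. */
  #include <stdint.h>
  #include <stdio.h>

  #define SCOUTFS_BLOCK_MAP_SHIFT 3
  #define SCOUTFS_BLOCK_MAP_MASK  ((1 << SCOUTFS_BLOCK_MAP_SHIFT) - 1)

  int main(void)
  {
          uint64_t iblock = 13;

          /* old: one bmap item covers an aligned group of 8 blocks */
          printf("bmap item offset %llu, slot %llu\n",
                 (unsigned long long)(iblock >> SCOUTFS_BLOCK_MAP_SHIFT),
                 (unsigned long long)(iblock & SCOUTFS_BLOCK_MAP_MASK));

          /* new: one extent item per block, keyed by the block itself */
          printf("extent item offset %llu, len 1\n",
                 (unsigned long long)iblock);
          return 0;
  }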
diff --git a/kmod/src/filerw.c b/kmod/src/filerw.c
index 091f2333..841b5a01 100644
--- a/kmod/src/filerw.c
+++ b/kmod/src/filerw.c
@@ -27,8 +27,8 @@
 #include "ioctl.h"
 
 /*
- * scoutfs uses simple fixed size block mapping items to map aligned
- * groups of logical file data blocks to physical block locations.
+ * scoutfs uses an extent item to map logical file data blocks to
+ * physical block locations.
  *
  * The small block size is set to the smallest supported page size.
  * This means that our file IO code never has to worry about the
@@ -40,7 +40,7 @@
  * is, and have a 1:1 relationship between block writes and block
  * mapping item entries.
  *
- * Dirty blocks are only written to free space. The first time a block
+ * Dirty extents are only written to free space. The first time a block
  * hits write_page in a transaction it gets a newly allocated block. We
  * get decent contiguous allocations by having per-task preallocation
  * streams. These are trimmed back as the transaction is committed. We
@@ -64,7 +64,7 @@
  *  - need to wire up dirty inode?
  *  - enforce writing to free blknos
  *  - per-task allocation regions
- *  - tear down dirty blocks left by write errors on unmount
+ *  - tear down dirty extents left by write errors on unmount
  *  - should invalidate dirty blocks if freed
  *  - data block checksumming (stable pages)
  *  - mmap creating dirty unmapped pages at writepage
@@ -177,96 +177,72 @@ static void return_file_block(struct super_block *sb, u64 blkno)
 	spin_unlock(&sbi->file_alloc_lock);
 }
 
-static bool bmap_has_blocks(struct scoutfs_block_map *bmap)
-{
-	int i;
-
-	for (i = 0; i < SCOUTFS_BLOCK_MAP_COUNT; i++) {
-		if (bmap->blkno[i])
-			return true;
-	}
-
-	return false;
-}
-
 /*
- * Free mapped blocks whose entire contents are past the new specified
- * size. The caller holds a transaction. If we truncate all the blocks
- * in a mapping item then we remove the item.
+ * Free mapped extents whose entire contents are past the new
+ * specified size. The caller holds a transaction.
  *
- * This is the low level block allocation and bmap item manipulation.
+ * This is the low level extent item truncate code.
  * Callers manage higher order truncation and orphan cleanup.
  *
- * XXX what to do about leaving items past i_size?
- * XXX probably should be a range
  */
-int scoutfs_truncate_block_items(struct super_block *sb, u64 ino, u64 size)
+int scoutfs_truncate_extent_items(struct super_block *sb, u64 ino, u64 size)
 {
 	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
-	struct scoutfs_block_map bmap;
+	struct scoutfs_extent extent;
 	struct scoutfs_btree_val val;
-	struct scoutfs_key last;
 	struct scoutfs_key key;
-	bool modified;
+	struct scoutfs_key first;
 	u64 iblock;
-	u64 blkno;
+	u64 len;
+	u64 loff;
+	u64 seq;
 	int ret;
-	int i;
 
 	iblock = DIV_ROUND_UP(size, SCOUTFS_BLOCK_SIZE);
-	i = iblock & SCOUTFS_BLOCK_MAP_MASK;
-	scoutfs_set_key(&key, ino, SCOUTFS_BMAP_KEY,
-			iblock & ~(u64)SCOUTFS_BLOCK_MAP_MASK);
-	scoutfs_set_key(&last, ino, SCOUTFS_BMAP_KEY, ~0ULL);
+	scoutfs_set_key(&first, ino, SCOUTFS_EXTENT_KEY, 0);
+	scoutfs_set_key(&key, ino, SCOUTFS_EXTENT_KEY, ~0ULL);
 
-	trace_printk("iblock %llu i %d\n", iblock, i);
+	trace_printk("iblock %llu\n", iblock);
 
-	scoutfs_btree_init_val(&val, &bmap, sizeof(bmap));
+	scoutfs_btree_init_val(&val, &extent, sizeof(extent));
 	val.check_size_eq = 1;
 
 	for (;;) {
-		ret = scoutfs_btree_next(sb, meta, &key, &last, &key, &val);
+		ret = scoutfs_btree_prev(sb, meta, &first, &key, &key, &seq,
+					 &val);
 		if (ret < 0) {
 			if (ret == -ENOENT)
 				ret = 0;
 			break;
 		}
 
-		/* XXX check bmap sanity */
+		loff = le64_to_cpu(key.offset);
+		len = le64_to_cpu(extent.len);
 
-		/* make sure we can update bmap after freeing */
+		if (WARN_ON_ONCE(len != 1)) {
+			ret = -EIO;
+			break;
+		}
+
+		if ((loff + len) <= iblock)
+			break;
+
+		/* make sure we can delete the extent after freeing */
 		ret = scoutfs_btree_dirty(sb, meta, &key);
 		if (ret)
 			break;
 
-		modified = false;
-		for (; i < SCOUTFS_BLOCK_MAP_COUNT; i++) {
-			blkno = le64_to_cpu(bmap.blkno[i]);
-			if (blkno == 0)
-				continue;
-
-			ret = scoutfs_buddy_free(sb, bmap.seq[i], blkno, 0);
-			if (ret)
-				break;
-
-			bmap.blkno[i] = 0;
-			bmap.seq[i] = 0;
-			modified = true;
-		}
-		i = 0;
-
-		/* dirtying should have prevented these from failing */
-		if (!bmap_has_blocks(&bmap))
-			scoutfs_btree_delete(sb, meta, &key);
-		else if (modified)
-			scoutfs_btree_update(sb, meta, &key, &val);
-
+		ret = scoutfs_buddy_free(sb, cpu_to_le64(seq),
+					 le64_to_cpu(extent.blkno), 0);
 		if (ret)
 			break;
 
+		scoutfs_btree_delete(sb, meta, &key);
+
 		/* XXX sync transaction if it's enormous */
-		scoutfs_inc_key(&key);
+		scoutfs_dec_key(&key);
 	}
 
 	return ret;
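scoutfs_truncate_extent_items() above walks the inode's extent key
space backwards with scoutfs_btree_prev(), from offset ~0 down toward
the new size, freeing and deleting whole extents until it meets one
that ends at or inside the new block count; DIV_ROUND_UP() is what
keeps a partially populated final block alive. A minimal userspace
model of that reverse walk, with a sorted array standing in for btree
items; the names and values here are illustrative assumptions:

  /* Userspace model of the reverse truncate walk: single-block
   * extents sorted by logical offset, walked from the top down. */
  #include <stdint.h>
  #include <stdio.h>

  #define BLOCK_SIZE 4096

  struct ext { uint64_t loff, len; };

  int main(void)
  {
          struct ext items[] = { {0, 1}, {1, 1}, {2, 1}, {7, 1} };
          int n = 4;
          uint64_t size = 5000;   /* new i_size */
          uint64_t iblock = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;

          /* walk backwards, as scoutfs_btree_prev() does */
          while (n-- > 0) {
                  struct ext *e = &items[n];

                  if (e->loff + e->len <= iblock)
                          break;  /* rest of the file is within size */
                  printf("free and delete extent at %llu\n",
                         (unsigned long long)e->loff);
          }
          return 0;
  }

With size 5000 this frees the extents at blocks 7 and 2 but keeps
blocks 0 and 1, since byte 4999 still lives in block 1.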
@@ -291,44 +267,27 @@ void scoutfs_filerw_free_alloc(struct super_block *sb)
 	sbi->file_alloc_count = 0;
 }
 
-static void set_bmap_key(struct scoutfs_key *key, struct inode *inode,
-			 u64 iblock)
-{
-	scoutfs_set_key(key, scoutfs_ino(inode), SCOUTFS_BMAP_KEY,
-			iblock >> SCOUTFS_BLOCK_MAP_SHIFT);
-}
-
 /*
  * Return the number of contiguously mapped blocks starting from the
- * given logical block in the inode. We only return the number
- * contained in one block map item. We walk through more items if it
- * makes a difference.
+ * given logical block in the inode.
  */
 static int contig_mapped_blocks(struct inode *inode, u64 iblock, u64 *blkno)
 {
 	struct super_block *sb = inode->i_sb;
 	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
 	struct scoutfs_btree_val val;
-	struct scoutfs_block_map bmap;
+	struct scoutfs_extent extent;
 	struct scoutfs_key key;
 	int ret;
-	int i;
 
 	*blkno = 0;
-
-	set_bmap_key(&key, inode, iblock);
-	scoutfs_btree_init_val(&val, &bmap, sizeof(bmap));
+	scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_EXTENT_KEY, iblock);
+	scoutfs_btree_init_val(&val, &extent, sizeof(extent));
 
 	ret = scoutfs_btree_lookup(sb, meta, &key, &val);
-	if (ret == sizeof(bmap)) {
-		i = iblock & SCOUTFS_BLOCK_MAP_MASK;
-		*blkno = le64_to_cpu(bmap.blkno[i]);
-
-		ret = 0;
-		while (i < SCOUTFS_BLOCK_MAP_COUNT && bmap.blkno[i]) {
-			ret++;
-			i++;
-		}
+	if (ret == sizeof(extent)) {
+		*blkno = le64_to_cpu(extent.blkno);
+		ret = min_t(u64, le64_to_cpu(extent.len), INT_MAX);
 	} else if (ret >= 0) {
 		/* XXX corruption */
 		ret = -EIO;
@@ -360,44 +319,47 @@ static int map_writable_block(struct inode *inode, u64 iblock, u64 *blkno_ret)
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->stable_super;
 	struct scoutfs_btree_root *meta = SCOUTFS_META(sb);
-	struct scoutfs_block_map bmap;
+	struct scoutfs_extent extent;
 	struct scoutfs_btree_val val;
+	struct scoutfs_key first;
 	struct scoutfs_key key;
 	bool inserted = false;
 	u64 old_blkno = 0;
 	u64 new_blkno = 0;
+	u64 seq;
 	int ret;
 	int err;
-	int i;
 
-	set_bmap_key(&key, inode, iblock);
-	scoutfs_btree_init_val(&val, &bmap, sizeof(bmap));
+	scoutfs_set_key(&first, scoutfs_ino(inode), SCOUTFS_EXTENT_KEY, 0);
+	scoutfs_set_key(&key, scoutfs_ino(inode), SCOUTFS_EXTENT_KEY, iblock);
+	scoutfs_btree_init_val(&val, &extent, sizeof(extent));
 	val.check_size_eq = 1;
 
 	/* see if there's an existing mapping */
-	ret = scoutfs_btree_lookup(sb, meta, &key, &val);
+	ret = scoutfs_btree_prev(sb, meta, &first, &key, &key, &seq, &val);
+	if (ret == 0 && ((le64_to_cpu(key.offset) +
+			  le64_to_cpu(extent.len)) <= iblock))
+		ret = -ENOENT;
 	if (ret < 0 && ret != -ENOENT)
 		goto out;
 
-	/* make sure that updating the bmap item won't fail */
+	/* make sure that updating the extent item won't fail */
 	if (ret == -ENOENT) {
-		memset(&bmap, 0, sizeof(bmap));
+		memset(&extent, 0, sizeof(extent));
 		ret = scoutfs_btree_insert(sb, meta, &key, &val);
 		if (ret)
 			goto out;
 		inserted = true;
-
 	} else {
 		ret = scoutfs_btree_dirty(sb, meta, &key);
 		if (ret)
 			goto out;
 	}
 
-	i = iblock & SCOUTFS_BLOCK_MAP_MASK;
-	old_blkno = le64_to_cpu(bmap.blkno[i]);
+	old_blkno = le64_to_cpu(extent.blkno);
 
 	/* If the existing block is dirty then we can use it */
-	if (old_blkno && (bmap.seq[i] == super->hdr.seq)) {
+	if (old_blkno && cpu_to_le64(seq) == super->hdr.seq) {
 		*blkno_ret = old_blkno;
 		ret = 0;
 		goto out;
@@ -408,13 +370,13 @@ static int map_writable_block(struct inode *inode, u64 iblock, u64 *blkno_ret)
 		goto out;
 
 	if (old_blkno) {
-		ret = scoutfs_buddy_free(sb, bmap.seq[i], old_blkno, 0);
+		ret = scoutfs_buddy_free(sb, cpu_to_le64(seq), old_blkno, 0);
 		if (ret)
 			goto out;
 	}
 
-	bmap.blkno[i] = cpu_to_le64(new_blkno);
-	bmap.seq[i] = super->hdr.seq;
+	extent.blkno = cpu_to_le64(new_blkno);
+	extent.len = cpu_to_le64(1);
 
 	/* dirtying guarantees success */
 	err = scoutfs_btree_update(sb, meta, &key, &val);
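map_writable_block() above decides between overwriting in place and
copy-on-write: an extent whose btree sequence number matches the
current super sequence was written in this transaction and is still
dirty, so its block can be reused, while anything older gets a freshly
allocated block and the old one is freed back to the allocator. A
small userspace sketch of that decision, under this patch's assumption
that every extent is a single block; the helper names are stand-ins:

  /* Userspace sketch of the "can we overwrite in place?" test. */
  #include <stdint.h>
  #include <stdio.h>

  struct extent { uint64_t blkno, len, seq; };

  static uint64_t map_writable(struct extent *e, uint64_t cur_seq,
                               uint64_t next_free)
  {
          if (e->blkno && e->seq == cur_seq)
                  return e->blkno;        /* dirty in this transaction */

          /* CoW: point the extent at a newly allocated free block */
          e->blkno = next_free;
          e->seq = cur_seq;
          e->len = 1;
          return e->blkno;
  }

  int main(void)
  {
          struct extent e = { .blkno = 100, .len = 1, .seq = 7 };

          /* stale seq 7: reallocates to block 200 */
          printf("write to %llu\n",
                 (unsigned long long)map_writable(&e, 8, 200));
          /* already dirty at seq 8: stays on block 200 */
          printf("write to %llu\n",
                 (unsigned long long)map_writable(&e, 8, 300));
          return 0;
  }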
@@ -448,7 +410,8 @@ static int scoutfs_readpage_get_block(struct inode *inode, sector_t iblock,
 	ret = contig_mapped_blocks(inode, iblock, &blkno);
 	if (ret > 0) {
 		map_bh(bh, inode->i_sb, blkno);
-		bh->b_size = min_t(int, bh->b_size, ret << inode->i_blkbits);
+		bh->b_size = min_t(u64, bh->b_size,
+				   (u64)ret << inode->i_blkbits);
 		ret = 0;
 	}
 
@@ -486,7 +449,7 @@ static int scoutfs_writepage_get_block(struct inode *inode, sector_t iblock,
 }
 
 /*
- * Dirty file blocks can be written to their newly allocated free blocks
+ * Dirty file pages can be written to their newly allocated free extents
  * at any time. They won't be referenced by metadata until the current
  * transaction is committed. They can be re-read and re-dirtied at
  * their free block number in this transaction.
@@ -507,8 +470,8 @@ static int scoutfs_writepages(struct address_space *mapping,
 }
 
 /*
- * Block allocation during buffered writes needs to make sure that the
- * dirty block will be written to free space.
+ * Extent allocation during buffered writes needs to make sure that the
+ * dirty blocks will be written to free space.
  */
 static int scoutfs_write_begin_get_block(struct inode *inode, sector_t iblock,
 					 struct buffer_head *bh, int create)
diff --git a/kmod/src/filerw.h b/kmod/src/filerw.h
index ba2bb81f..de4972d3 100644
--- a/kmod/src/filerw.h
+++ b/kmod/src/filerw.h
@@ -5,6 +5,6 @@ extern const struct address_space_operations scoutfs_file_aops;
 extern const struct file_operations scoutfs_file_fops;
 
 void scoutfs_filerw_free_alloc(struct super_block *sb);
-int scoutfs_truncate_block_items(struct super_block *sb, u64 ino, u64 size);
+int scoutfs_truncate_extent_items(struct super_block *sb, u64 ino, u64 size);
 
 #endif
diff --git a/kmod/src/format.h b/kmod/src/format.h
index fafe802a..2b2b31b0 100644
--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -113,7 +113,7 @@ struct scoutfs_key {
 #define SCOUTFS_DIRENT_KEY 5
 #define SCOUTFS_LINK_BACKREF_KEY 6
 #define SCOUTFS_SYMLINK_KEY 7
-#define SCOUTFS_BMAP_KEY 8
+#define SCOUTFS_EXTENT_KEY 8
 #define SCOUTFS_ORPHAN_KEY 9
 
 #define SCOUTFS_MAX_ITEM_LEN 512
@@ -288,23 +288,11 @@ struct scoutfs_xattr {
 	__u8 name[0];
 } __packed;
 
-/*
- * We use simple block map items to map a aligned fixed group of logical
- * block offsets to physical blocks. We make them a decent size to
- * reduce the item storage overhead per block referenced, but we don't
- * want them so large that they start to take up an extraordinary amount
- * of space for small files. 8 block items ranges from around 3% to .3%
- * overhead for files that use only one or all of the blocks in the
- * mapping item.
- */
-#define SCOUTFS_BLOCK_MAP_SHIFT 3
-#define SCOUTFS_BLOCK_MAP_COUNT (1 << SCOUTFS_BLOCK_MAP_SHIFT)
-#define SCOUTFS_BLOCK_MAP_MASK (SCOUTFS_BLOCK_MAP_COUNT - 1)
-
-struct scoutfs_block_map {
-	__le64 blkno[SCOUTFS_BLOCK_MAP_COUNT];
-	__le64 seq[SCOUTFS_BLOCK_MAP_COUNT];
-};
+struct scoutfs_extent {
+	__le64 blkno;
+	__le64 len;
+	__u8 flags;
+} __packed;
 
 /*
  * link backrefs give us a way to find all the hard links that refer
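On-disk cost of the new item: struct scoutfs_block_map was sixteen
__le64s, 128 bytes per aligned group of eight blocks, while the packed
struct scoutfs_extent is 17 bytes per mapped block. The per-block seq
array also disappears from the value; the btree item's own sequence
number takes over that role, which is why the filerw.c hunks above read
seq through scoutfs_btree_prev(). A userspace check of those value
sizes, using stand-in copies of the two structs:

  /* Userspace copies of the old and new on-disk items from format.h,
   * just to check the value sizes discussed above. */
  #include <stdint.h>
  #include <stdio.h>

  struct old_block_map {
          uint64_t blkno[8];
          uint64_t seq[8];
  } __attribute__((packed));

  struct new_extent {
          uint64_t blkno;
          uint64_t len;
          uint8_t flags;
  } __attribute__((packed));

  int main(void)
  {
          /* 128 bytes per 8-block group vs 17 bytes per mapped block */
          printf("block_map %zu bytes, extent %zu bytes\n",
                 sizeof(struct old_block_map), sizeof(struct new_extent));
          return 0;
  }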
diff --git a/kmod/src/inode.c b/kmod/src/inode.c
index 98a005ca..81a18a85 100644
--- a/kmod/src/inode.c
+++ b/kmod/src/inode.c
@@ -425,7 +425,7 @@ static int __delete_inode(struct super_block *sb, struct scoutfs_key *key,
 	if (S_ISLNK(mode))
 		ret = scoutfs_symlink_drop(sb, ino);
 	else if (S_ISREG(mode))
-		ret = scoutfs_truncate_block_items(sb, ino, 0);
+		ret = scoutfs_truncate_extent_items(sb, ino, 0);
 	if (ret)
 		goto out;
 
diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c
index 14c54448..47082869 100644
--- a/kmod/src/ioctl.c
+++ b/kmod/src/ioctl.c
@@ -286,7 +286,7 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case SCOUTFS_IOC_FIND_XATTR_VAL:
 		return scoutfs_ioc_find_xattr(file, arg, false);
 	case SCOUTFS_IOC_INODE_DATA_SINCE:
-		return scoutfs_ioc_inodes_since(file, arg, SCOUTFS_BMAP_KEY);
+		return scoutfs_ioc_inodes_since(file, arg, SCOUTFS_EXTENT_KEY);
 	}
 
 	return -ENOTTY;
diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h
index a1ca45d8..a9a18118 100644
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -39,7 +39,7 @@ struct scoutfs_sb_info;
 	{ SCOUTFS_DIRENT_KEY, "DIRENT" }, \
 	{ SCOUTFS_LINK_BACKREF_KEY, "LINK_BACKREF"}, \
 	{ SCOUTFS_SYMLINK_KEY, "SYMLINK" }, \
-	{ SCOUTFS_BMAP_KEY, "BMAP" })
+	{ SCOUTFS_EXTENT_KEY, "EXTENT" })
 
 #define TRACE_KEYF "%llu.%s.%llu"
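The inode.c hunk shows that deleting a regular inode reduces to the
same truncate path with a size of zero: DIV_ROUND_UP(0,
SCOUTFS_BLOCK_SIZE) is block 0, so the reverse walk in
scoutfs_truncate_extent_items() frees every extent the inode owns,
while on an ordinary truncate the rounding keeps any block that still
holds bytes below the new size. A tiny illustrative demo of that
boundary arithmetic, assuming 4KB blocks:

  /* DIV_ROUND_UP() keeps any block that still holds bytes below the
   * new size; truncating to 0 therefore frees every extent. */
  #include <stdint.h>
  #include <stdio.h>

  #define BLOCK_SIZE 4096
  #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

  int main(void)
  {
          uint64_t sizes[] = { 0, 1, 4096, 4097 };
          int i;

          for (i = 0; i < 4; i++)
                  printf("size %llu: first freed block %llu\n",
                         (unsigned long long)sizes[i],
                         (unsigned long long)DIV_ROUND_UP(sizes[i],
                                                          BLOCK_SIZE));
          return 0;
  }

This prints first freed blocks 0, 1, 1 and 2: a one-byte file keeps
block 0, and a 4097-byte file keeps blocks 0 and 1.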