From fe94eb7363bcd54d8487fedc7d31d9ce5146058a Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 19 Apr 2018 14:31:16 -0700 Subject: [PATCH] scoutfs: add unwritten extents Now that we have extents we can address the fragmentation of concurrent writes with large preallocated unwritten extents instead of trying to allocate from disjoint free space with cursors. First we add support for unwritten extents. Truncate needs to make sure it doesn't treat truncated unwritten blocks as online just because they're not offline. If we try to write into them we convert them to written extents. And fiemap needs to flag them as unwritten and be sure to check for extents past i_size. Then we allocate unwritten extents only if we're extending a contiguous file. We try to preallocate the size of the file and cap it to a meg. This ends up with a power of two progression of preallocation sizes, which nicely balances extent sizes and wasted allocation as file sizes increase. We need to be careful to truncate the preallocated regions if the entire file is released. We take that as an indication that the user doesn't want the file consuming any more space. This removes most of the use of the cursor code. It will be completely removed in a further patch. Signed-off-by: Zach Brown --- kmod/src/data.c | 259 +++++++++++++++++++++++---------------- kmod/src/format.h | 1 + kmod/src/ioctl.c | 20 +++ kmod/src/scoutfs_trace.h | 67 +++++----- 4 files changed, 207 insertions(+), 140 deletions(-) diff --git a/kmod/src/data.c b/kmod/src/data.c index 12a39425..1fc9bb20 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -40,17 +40,12 @@ * scoutfs uses extent items to track file data block mappings and free * blocks. * - * Block allocation maintains a fixed number of allocation cursors that - * remember the position of tasks within free regions. This is very - * simple and maintains contiguous allocations for simple streaming - * writes. 
It eventually won't be good enough and we'll spend - * complexity on delalloc but we want to put that off as long as - * possible. + * Typically we'll allocate a single block in get_block if a mapping + * isn't found. * - * There's no unwritten extents. As we dirty file data pages we track - * their inodes. Before we commit dirty metadata we write out all - * tracked inodes. This ensures that data is persistent before the - * metadata that references it is visible. + * We special case extending contiguous files. In that case we'll preallocate + * an unwritten extent at the end of the file. The size of the preallocation + * is based on the file size and is capped. * * XXX * - truncate @@ -253,6 +248,8 @@ static s64 truncate_one_extent(struct super_block *sb, struct inode *inode, struct scoutfs_extent ofl; bool rem_fr = false; bool add_rem = false; + s64 offline_delta = 0; + s64 online_delta = 0; s64 ret; int err; @@ -309,9 +306,15 @@ static s64 truncate_one_extent(struct super_block *sb, struct inode *inode, goto out; } - scoutfs_inode_add_onoff(inode, rem.map ? -rem.len : 0, - (rem.flags & SEF_OFFLINE ? -rem.len : 0) + - (offline ? ofl.len : 0)); + if (rem.map && !(rem.flags & SEF_UNWRITTEN)) + online_delta += -rem.len; + if (rem.flags & SEF_OFFLINE) + offline_delta += -rem.len; + if (offline) + offline_delta += ofl.len; + + scoutfs_inode_add_onoff(inode, online_delta, offline_delta); + ret = 1; out: if (ret < 0) { @@ -396,6 +399,7 @@ static inline struct hlist_head *cursor_head(struct data_info *datinf, return &datinf->cursor_hash[h]; } +#if 0 static struct task_cursor *search_head(struct hlist_head *head, struct task_struct *task, pid_t pid) { @@ -455,6 +459,7 @@ static struct task_cursor *get_cursor(struct data_info *datinf) return curs; } +#endif static int get_server_extent(struct super_block *sb, u64 len) { @@ -482,96 +487,86 @@ out: * The caller tells us if the block was offline or not. 
We modify the * extent items and the caller will search for the resulting extent. * - * We try to encourage contiguous allocation by having per-task cursors - * that track large extents. Each new allocating task will get a new - * extent. + * If we're writing to the final block of the file then we try to + * preallocate unwritten blocks past i_size for future extending writes + * to use. We only base this decision on the file size. Truncating + * down the size, unlink, or releasing all blocks in the file will + * remove these preallocated blocks. Truncating past them will preserve + * them and treat them as 0. + * + * This assumes that there can't be existing unwritten extents in the + * inode that would overlap with our allocations. Writes are serialized + * and the caller only calls us if an extent doesn't exist. Unwritten + * extents are only created adjacent to i_size extensions. The only way + * to pull i_size back behind unwritten extents is to truncate and it + * frees them. Corrupt disk images could have fragmented unwritten + * extents past i_size in inodes and that'd manifest as errors inserting + * overlapping new allocations. 
*/ -#define CURSOR_BLOCKS (1 * 1024 * 1024 / BLOCK_SIZE) -#define CURSOR_BLOCKS_MASK (CURSOR_BLOCKS - 1) -#define CURSOR_BLOCKS_SEARCH (CURSOR_BLOCKS + CURSOR_BLOCKS - 1) -#define CURSOR_BLOCKS_ALLOC (CURSOR_BLOCKS * 64) -static int find_alloc_block(struct super_block *sb, struct inode *inode, - u64 iblock, bool was_offline, - struct scoutfs_lock *lock) +#define MAX_UNWRITTEN_BLOCKS ((u64)SCOUTFS_SEGMENT_BLOCKS) +#define SERVER_ALLOC_BLOCKS (MAX_UNWRITTEN_BLOCKS * 32) +static int alloc_block(struct super_block *sb, struct inode *inode, u64 iblock, + bool was_offline, struct scoutfs_lock *lock) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); DECLARE_DATA_INFO(sb, datinf); const u64 ino = scoutfs_ino(inode); + struct scoutfs_extent unwr; struct scoutfs_extent ext; struct scoutfs_extent ofl; + struct scoutfs_extent blk; struct scoutfs_extent fr; - struct task_cursor *curs; bool add_ofl = false; bool add_fr = false; + bool rem_blk = false; + u64 offline; + u64 online; + u64 len; int err; int ret; down_write(&datinf->alloc_rwsem); - curs = get_cursor(datinf); + scoutfs_inode_get_onoff(inode, &online, &offline); - trace_scoutfs_data_find_alloc_block_curs(sb, curs, curs->blkno); + /* exponentially prealloc unwritten extents to a limit */ + if (iblock > 1 && iblock == (online + offline)) + len = min(iblock, MAX_UNWRITTEN_BLOCKS); + else + len = 1; - /* see if our cursor is still free */ - if (curs->blkno) { - /* look for the extent that overlaps our iblock */ - scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLKNO_TYPE, - sbi->node_id, curs->blkno, 1, 0, 0); - ret = scoutfs_extent_next(sb, data_extent_io, &ext, - sbi->node_id_lock); - if (ret && ret != -ENOENT) - goto out; + trace_scoutfs_data_alloc_block(sb, inode, iblock, was_offline, + online, offline, len); + scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLOCKS_TYPE, + sbi->node_id, 0, len, 0, 0); + ret = scoutfs_extent_next(sb, data_extent_io, &ext, + sbi->node_id_lock); + if (ret == -ENOENT) { + /* try to get 
allocation from the server if we're out */ + ret = get_server_extent(sb, SERVER_ALLOC_BLOCKS); if (ret == 0) - trace_scoutfs_data_alloc_block_cursor(sb, &ext); - - /* find a new large extent if our cursor isn't free */ - if (ret < 0 || ext.start > curs->blkno) - curs->blkno = 0; + ret = scoutfs_extent_next(sb, data_extent_io, &ext, + sbi->node_id_lock); + } + if (ret) { + /* XXX should try to look for smaller free extents :/ */ + if (ret == -ENOENT) + ret = -ENOSPC; + goto out; } - /* try to find a new large extent, possibly asking for more */ - if (curs->blkno == 0) { - scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLOCKS_TYPE, - sbi->node_id, 0, CURSOR_BLOCKS_SEARCH, - 0, 0); - ret = scoutfs_extent_next(sb, data_extent_io, &ext, - sbi->node_id_lock); - if (ret == -ENOENT) { - /* try to get allocation from the server if we're out */ - ret = get_server_extent(sb, CURSOR_BLOCKS_ALLOC); - if (ret == 0) - ret = scoutfs_extent_next(sb, data_extent_io, - &ext, - sbi->node_id_lock); - } - if (ret) { - /* XXX should try to look for smaller free extents :/ */ - if (ret == -ENOENT) - ret = -ENOSPC; - goto out; - } + trace_scoutfs_data_alloc_block_next(sb, &ext); - /* - * set our cursor to the aligned start of a large extent - * We'll then remove it and the next aligned free large - * extent will start much later. This stops us from - * constantly setting cursors to the start of a large - * free extent that keeps have its start allocated. 
- */ - trace_scoutfs_data_alloc_block_free(sb, &ext); - curs->blkno = ALIGN(ext.start, CURSOR_BLOCKS); - } - - /* remove the free block we're using */ + /* remove the free extent we're using */ scoutfs_extent_init(&fr, SCOUTFS_FREE_EXTENT_BLKNO_TYPE, - sbi->node_id, curs->blkno, 1, 0, 0); + sbi->node_id, ext.start, len, 0, 0); ret = scoutfs_extent_remove(sb, data_extent_io, &fr, sbi->node_id_lock); if (ret) goto out; add_fr = true; - /* remove an offline file extent */ + /* remove an offline block extent */ if (was_offline) { scoutfs_extent_init(&ofl, SCOUTFS_FILE_EXTENT_TYPE, ino, iblock, 1, 0, SEF_OFFLINE); @@ -581,26 +576,32 @@ static int find_alloc_block(struct super_block *sb, struct inode *inode, add_ofl = true; } - /* add (and hopefully merge!) the new allocation */ - scoutfs_extent_init(&ext, SCOUTFS_FILE_EXTENT_TYPE, ino, - iblock, 1, curs->blkno, 0); - trace_scoutfs_data_alloc_block(sb, &ext); - ret = scoutfs_extent_add(sb, data_extent_io, &ext, lock); + /* add the block that the caller is writing */ + scoutfs_extent_init(&blk, SCOUTFS_FILE_EXTENT_TYPE, ino, + iblock, 1, ext.start, 0); + ret = scoutfs_extent_add(sb, data_extent_io, &blk, lock); if (ret) goto out; + rem_blk = true; + + /* and maybe add the remaining unwritten extent */ + if (len > 1) { + scoutfs_extent_init(&unwr, SCOUTFS_FILE_EXTENT_TYPE, ino, + iblock + 1, len - 1, ext.start + 1, + SEF_UNWRITTEN); + ret = scoutfs_extent_add(sb, data_extent_io, &unwr, lock); + if (ret) + goto out; + } scoutfs_inode_add_onoff(inode, 1, was_offline ? 
-1ULL : 0); - - /* set cursor to next block, clearing if we finish a large extent */ - BUILD_BUG_ON(!is_power_of_2(CURSOR_BLOCKS)); - curs->blkno++; - if ((curs->blkno & CURSOR_BLOCKS_MASK) == 0) - curs->blkno = 0; - ret = 0; out: if (ret) { err = 0; + if (rem_blk) + err |= scoutfs_extent_remove(sb, data_extent_io, &blk, + lock); if (add_ofl) err |= scoutfs_extent_add(sb, data_extent_io, &ofl, lock); @@ -612,7 +613,52 @@ out: up_write(&datinf->alloc_rwsem); - trace_scoutfs_data_find_alloc_block_ret(sb, ret); + trace_scoutfs_data_alloc_block_ret(sb, ret); + return ret; +} + +/* + * Remove the unwritten flag from an existing extent. We don't have to + * wait for dirty block IO to complete before clearing the unwritten + * flag in metadata because we have strict synchronization between data + * and metadata. All dirty data in the current transaction is written + * before the metadata in the transaction that references it is + * committed. + * + * The extent is unwritten so it can't be offline nor online. We remove + * the unwritten flag, possibly splitting and merging. We record the + * extent as online now as initial block allocation would. 
+ */ +static int convert_unwritten(struct super_block *sb, struct inode *inode, + struct scoutfs_extent *ext, u64 start, u64 len, + struct scoutfs_lock *lock) +{ + struct scoutfs_extent conv; + int err; + int ret; + + if (WARN_ON_ONCE(!ext->map) || + WARN_ON_ONCE(!(ext->flags & SEF_UNWRITTEN))) + return -EINVAL; + + scoutfs_extent_init(&conv, ext->type, ext->owner, start, len, + ext->map + (start - ext->start), ext->flags); + ret = scoutfs_extent_remove(sb, data_extent_io, &conv, lock); + if (ret) + goto out; + + conv.flags &= ~SEF_UNWRITTEN; + ret = scoutfs_extent_add(sb, data_extent_io, &conv, lock); + if (ret) { + conv.flags |= SEF_UNWRITTEN; + err = scoutfs_extent_add(sb, data_extent_io, &conv, lock); + BUG_ON(err); + goto out; + } + /* only count blocks as online once the conversion succeeded */ + scoutfs_inode_add_onoff(inode, len, 0); + ret = 0; +out: return ret; } @@ -656,15 +702,18 @@ restart: goto out; } + /* convert unwritten to written */ + if (create && (ext.flags & SEF_UNWRITTEN)) { + ret = convert_unwritten(sb, inode, &ext, iblock, 1, lock); + if (ret) + goto out; + goto restart; + } + /* try to allocate if we're writing */ if (create && !ext.map) { - /* - * XXX can blow the transaction here.. need to back off - * and try again if we've already done a bulk alloc in - * our transaction. - */ - ret = find_alloc_block(sb, inode, iblock, - ext.flags & SEF_OFFLINE, lock); + ret = alloc_block(sb, inode, iblock, ext.flags & SEF_OFFLINE, + lock); if (ret) goto out; set_buffer_new(bh); @@ -853,7 +902,6 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct super_block *sb = inode->i_sb; struct scoutfs_lock *inode_lock = NULL; struct scoutfs_extent ext; - loff_t i_size; u64 blk_off; u64 logical = 0; u64 phys = 0; @@ -868,13 +916,6 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* XXX overkill? 
*/ mutex_lock(&inode->i_mutex); - /* stop at i_size, we don't allocate outside i_size */ - i_size = i_size_read(inode); - if (i_size == 0) { - ret = 0; - goto out; - } - ret = scoutfs_lock_inode(sb, DLM_LOCK_PR, 0, inode, &inode_lock); if (ret) goto out; @@ -907,7 +948,11 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, logical = ext.start << SCOUTFS_BLOCK_SHIFT; phys = ext.map << SCOUTFS_BLOCK_SHIFT; size = ext.len << SCOUTFS_BLOCK_SHIFT; - flags = (ext.flags & SEF_OFFLINE) ? FIEMAP_EXTENT_UNKNOWN : 0; + flags = 0; + if (ext.flags & SEF_OFFLINE) + flags |= FIEMAP_EXTENT_UNKNOWN; + if (ext.flags & SEF_UNWRITTEN) + flags |= FIEMAP_EXTENT_UNWRITTEN; blk_off = ext.start + ext.len; } @@ -961,7 +1006,6 @@ int scoutfs_data_setup(struct super_block *sb) for (i = 0; i < NR_CURSORS; i++) { curs = kzalloc(sizeof(struct task_cursor), GFP_KERNEL); if (!curs) { - destroy_cursors(datinf); kfree(datinf); return -ENOMEM; } @@ -984,8 +1028,5 @@ void scoutfs_data_destroy(struct super_block *sb) struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct data_info *datinf = sbi->data_info; - if (datinf) { - destroy_cursors(datinf); - kfree(datinf); - } + kfree(datinf); } diff --git a/kmod/src/format.h b/kmod/src/format.h index 87655716..cecd5c8b 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -330,6 +330,7 @@ struct scoutfs_file_extent { } __packed; #define SEF_OFFLINE 0x1 +#define SEF_UNWRITTEN 0x2 /* * The first xattr part item has a header that describes the xattr. The diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index 206a054d..31a4a612 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -262,6 +262,10 @@ out: * offline. Attempts to use the blocks in the future will trigger * recall from the archive. * + * If the file's online blocks drop to 0 then we also truncate any + * blocks beyond i_size. This honors the intent of fully releasing a file + * without the user needing to know to release past i_size or truncate. 
+ * * XXX permissions? * XXX a lot of this could be generic file write prep */ @@ -273,6 +277,9 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg) struct scoutfs_lock *lock = NULL; loff_t start; loff_t end_inc; + u64 online; + u64 offline; + u64 isize; int ret; if (copy_from_user(&args, (void __user *)arg, sizeof(args))) @@ -323,6 +330,19 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg) args.block, args.block + args.count - 1, true, lock); + if (ret == 0) { + scoutfs_inode_get_onoff(inode, &online, &offline); + isize = i_size_read(inode); + if (online == 0 && isize) { + start = (isize + SCOUTFS_BLOCK_SIZE - 1) + >> SCOUTFS_BLOCK_SHIFT; + ret = scoutfs_data_truncate_items(sb, inode, + scoutfs_ino(inode), + start, U64_MAX, + false, lock); + } + } + out: scoutfs_unlock(sb, lock, DLM_LOCK_EX); mutex_unlock(&inode->i_mutex); diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 5c3178ff..29c3b3bf 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -421,7 +421,41 @@ TRACE_EVENT(scoutfs_get_block, __entry->create, __entry->ret, __entry->blkno, __entry->size) ); -TRACE_EVENT(scoutfs_data_find_alloc_block_ret, +TRACE_EVENT(scoutfs_data_alloc_block, + TP_PROTO(struct super_block *sb, struct inode *inode, u64 iblock, + bool was_offline, u64 online_blocks, u64 offline_blocks, + u64 len), + + TP_ARGS(sb, inode, iblock, was_offline, online_blocks, offline_blocks, + len), + + TP_STRUCT__entry( + __field(__u64, fsid) + __field(__u64, ino) + __field(__u64, iblock) + __field(__u8, was_offline) + __field(__u64, online_blocks) + __field(__u64, offline_blocks) + __field(__u64, len) + ), + + TP_fast_assign( + __entry->fsid = FSID_ARG(sb); + __entry->ino = scoutfs_ino(inode); + __entry->iblock = iblock; + __entry->was_offline = was_offline; + __entry->online_blocks = online_blocks; + __entry->offline_blocks = offline_blocks; + __entry->len = len; + ), + + TP_printk("fsid "FSID_FMT" ino %llu iblock %llu 
was_offline %u online_blocks %llu offline_blocks %llu len %llu", + __entry->fsid, __entry->ino, __entry->iblock, + __entry->was_offline, __entry->online_blocks, + __entry->offline_blocks, __entry->len) +); + +TRACE_EVENT(scoutfs_data_alloc_block_ret, TP_PROTO(struct super_block *sb, int ret), TP_ARGS(sb, ret), @@ -439,27 +473,6 @@ TRACE_EVENT(scoutfs_data_find_alloc_block_ret, TP_printk(FSID_FMT" ret %d", __entry->fsid, __entry->ret) ); -TRACE_EVENT(scoutfs_data_find_alloc_block_found_seg, - TP_PROTO(struct super_block *sb, __u64 segno, __u64 blkno), - - TP_ARGS(sb, segno, blkno), - - TP_STRUCT__entry( - __field(__u64, fsid) - __field(__u64, segno) - __field(__u64, blkno) - ), - - TP_fast_assign( - __entry->fsid = FSID_ARG(sb); - __entry->segno = segno; - __entry->blkno = blkno; - ), - - TP_printk(FSID_FMT" found free segno %llu blkno %llu", __entry->fsid, - __entry->segno, __entry->blkno) -); - TRACE_EVENT(scoutfs_data_find_alloc_block_curs, TP_PROTO(struct super_block *sb, void *curs, __u64 blkno), @@ -2106,15 +2119,7 @@ DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_get_server_extent, TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), TP_ARGS(sb, ext) ); -DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_alloc_block_cursor, - TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), - TP_ARGS(sb, ext) -); -DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_alloc_block_free, - TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), - TP_ARGS(sb, ext) -); -DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_alloc_block, +DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_alloc_block_next, TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), TP_ARGS(sb, ext) );