From 1fca13b092f127d05f5836bdf1dabfcc9730d0df Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 15 Jun 2018 15:16:48 -0700 Subject: [PATCH] scoutfs: add fallocate Add an fallocate operation. This changes the possible combinations of flags in extents and makes it possible to create extents beyond i_size. This will confuse the rest of the code in a few places and that will be fixed up next. Signed-off-by: Zach Brown --- kmod/src/count.h | 21 ++++ kmod/src/counters.h | 1 + kmod/src/data.c | 226 +++++++++++++++++++++++++++++++++++++++ kmod/src/data.h | 1 + kmod/src/format.h | 1 + kmod/src/scoutfs_trace.h | 29 +++++ 6 files changed, 279 insertions(+) diff --git a/kmod/src/count.h b/kmod/src/count.h index ae0c98ec..fc2f993e 100644 --- a/kmod/src/count.h +++ b/kmod/src/count.h @@ -247,4 +247,25 @@ SIC_TRUNC_EXTENT(struct inode *inode) return cnt; } +/* + * Fallocating an extent can, at most: + * - allocate from the server: delete two free and insert merged + * - free an allocated extent: delete one and create two split + * - remove an unallocated file extent: delete one and create two split + * - add an fallocated file extent: delete two and insert one merged + */ +static inline const struct scoutfs_item_count SIC_FALLOCATE_ONE(void) +{ + struct scoutfs_item_count cnt = {0,}; + unsigned int nr_free = ((1 + 2) * 2) * 2; + unsigned int nr_file = (1 + 2) * 2; + + __count_dirty_inode(&cnt); + + cnt.items += nr_free + nr_file; + cnt.vals += nr_file * sizeof(struct scoutfs_file_extent); + + return cnt; +} + #endif diff --git a/kmod/src/counters.h b/kmod/src/counters.h index cc4fc7a8..b8432d56 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -27,6 +27,7 @@ EXPAND_COUNTER(corrupt_btree_no_child_ref) \ EXPAND_COUNTER(corrupt_data_extent_trunc_cleanup) \ EXPAND_COUNTER(corrupt_data_extent_alloc_cleanup) \ + EXPAND_COUNTER(corrupt_data_extent_fallocate_cleanup) \ EXPAND_COUNTER(corrupt_dirent_backref_name_len) \ EXPAND_COUNTER(corrupt_dirent_name_len) \ 
EXPAND_COUNTER(corrupt_dirent_readdir_name_len) \ diff --git a/kmod/src/data.c b/kmod/src/data.c index 83c3bee8..706a9382 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "format.h" #include "super.h" @@ -820,6 +821,230 @@ static int scoutfs_write_end(struct file *file, struct address_space *mapping, return ret; } +/* + * Update one extent on behalf of fallocate. + * + * A free extent of up to len blocks is taken from this node's free + * extents, asking the server for more if none is found, and is mapped + * at start in the inode's file extents with the given flags. If + * rem_flags is set the region is first removed from the existing extent. + * + * The caller has held transactions and acquired locks. We only ever + * make one extent modification here. + * + * Returns 0 on success or a negative errno, with -ENOSPC when no free + * extent could be found. On failure any extents that were removed + * here are added back. + */ +static int fallocate_one_extent(struct super_block *sb, u64 ino, u64 start, + u64 len, u8 flags, u8 rem_flags, + struct scoutfs_lock *lock) +{ + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_extent fal; + struct scoutfs_extent rem; + struct scoutfs_extent fr; + bool add_rem = false; + bool add_fr = false; + int ret; + + if (WARN_ON_ONCE(len == 0) || + WARN_ON_ONCE(start + len < start)) { + ret = -EINVAL; + goto out; + } + + /* find a sufficiently large free extent */ + scoutfs_extent_init(&fr, SCOUTFS_FREE_EXTENT_BLOCKS_TYPE, + sbi->node_id, 0, len, 0, 0); + ret = scoutfs_extent_next(sb, data_extent_io, &fr, + sbi->node_id_lock); + if (ret == -ENOENT) { + /* try to get allocation from the server if we're out */ + ret = get_server_extent(sb, SERVER_ALLOC_BLOCKS); + if (ret == 0) + ret = scoutfs_extent_next(sb, data_extent_io, &fr, + sbi->node_id_lock); + /* XXX try to find smaller free extents */ + } + if (ret < 0) { + if (ret == -ENOENT) + ret = -ENOSPC; + goto out; + } + + /* trim our 
allocation from the length indexed extent */ + scoutfs_extent_init(&fr, SCOUTFS_FREE_EXTENT_BLKNO_TYPE, + sbi->node_id, fr.start, min(fr.len, len), 0, 0); + + ret = scoutfs_extent_init(&fal, SCOUTFS_FILE_EXTENT_TYPE, ino, + start, fr.len, fr.start, flags); + if (WARN_ON_ONCE(ret)) + goto out; + + ret = scoutfs_extent_remove(sb, data_extent_io, &fr, sbi->node_id_lock); + if (ret) + goto out; + add_fr = true; + + /* remove a region of the existing extent */ + if (rem_flags) { + scoutfs_extent_init(&rem, SCOUTFS_FILE_EXTENT_TYPE, ino, + fal.start, fal.len, 0, rem_flags); + ret = scoutfs_extent_remove(sb, data_extent_io, &rem, lock); + if (ret) + goto out; + add_rem = true; + } + + ret = scoutfs_extent_add(sb, data_extent_io, &fal, lock); +out: + scoutfs_extent_cleanup(ret < 0 && add_rem, scoutfs_extent_add, sb, + data_extent_io, &rem, lock, + SC_DATA_EXTENT_FALLOCATE_CLEANUP, + corrupt_data_extent_fallocate_cleanup, &fal); + scoutfs_extent_cleanup(ret < 0 && add_fr, scoutfs_extent_add, sb, + data_extent_io, &fr, sbi->node_id_lock, + SC_DATA_EXTENT_FALLOCATE_CLEANUP, + corrupt_data_extent_fallocate_cleanup, &fal); + return ret; +} + +/* + * Modify the extents that map the blocks that store the len byte region + * starting at offset. + * + * The caller has only prevented freezing by entering a fs write + * context. We're responsible for all other locking and consistency. 
+ */ +long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + const u64 ino = scoutfs_ino(inode); + struct scoutfs_lock *lock = NULL; + DECLARE_DATA_INFO(sb, datinf); + struct scoutfs_extent ext; + LIST_HEAD(ind_locks); + u64 last_block; + u64 iblock; + u64 blocks; + loff_t end; + u8 rem_flags; + u8 flags; + int ret; + + mutex_lock(&inode->i_mutex); + + /* XXX support more flags */ + if (mode & ~(FALLOC_FL_KEEP_SIZE)) { + ret = -EOPNOTSUPP; + goto out; + } + + /* catch wrapping */ + if (offset + len < offset) { + ret = -EINVAL; + goto out; + } + + if (len == 0) { + ret = 0; + goto out; + } + + ret = scoutfs_lock_inode(sb, DLM_LOCK_EX, SCOUTFS_LKF_REFRESH_INODE, + inode, &lock); + if (ret) + goto out; + + inode_dio_wait(inode); + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (offset + len > i_size_read(inode))) { + ret = inode_newsize_ok(inode, offset + len); + if (ret) + goto out; + } + + iblock = offset >> SCOUTFS_BLOCK_SHIFT; + last_block = (offset + len - 1) >> SCOUTFS_BLOCK_SHIFT; + + while (iblock <= last_block) { /* each pass advances iblock itself */ + + scoutfs_extent_init(&ext, SCOUTFS_FILE_EXTENT_TYPE, + ino, iblock, 1, 0, 0); + ret = scoutfs_extent_next(sb, data_extent_io, &ext, lock); + if (ret < 0 && ret != -ENOENT) + goto out; + + blocks = last_block - iblock + 1; + flags = SEF_UNWRITTEN; + rem_flags = 0; + + if (ret == -ENOENT || ext.start > last_block) { + /* no next extent or past us, all remaining blocks */ + + } else if (iblock < ext.start) { + /* sparse region until next extent */ + blocks = min(blocks, ext.start - iblock); + + } else if (ext.map > 0) { + /* skip past an allocated extent */ + blocks = min(blocks, (ext.start + ext.len) - iblock); + iblock += blocks; + blocks = 0; + + } else { + /* allocating a portion of an unallocated extent */ + blocks = min(blocks, (ext.start + ext.len) - iblock); + flags |= ext.flags; + rem_flags = ext.flags; + /* 
XXX corruption; why'd we store map == flags == 0? */ + if (rem_flags == 0) { + ret = -EIO; + goto out; + } + } + + ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, + SIC_FALLOCATE_ONE()); + if (ret) + goto out; + + if (blocks > 0) { + down_write(&datinf->alloc_rwsem); + ret = fallocate_one_extent(sb, ino, iblock, blocks, + flags, rem_flags, lock); + up_write(&datinf->alloc_rwsem); + } + + if (ret == 0 && !(mode & FALLOC_FL_KEEP_SIZE)) { + end = (iblock + blocks) << SCOUTFS_BLOCK_SHIFT; + if (end == 0 || end > offset + len) + end = offset + len; + if (end > i_size_read(inode)) + i_size_write(inode, end); + scoutfs_update_inode_item(inode, lock, &ind_locks); + } + scoutfs_release_trans(sb); + scoutfs_inode_index_unlock(sb, &ind_locks); + + if (ret) + goto out; + + iblock += blocks; + } + ret = 0; +out: + scoutfs_unlock(sb, lock, DLM_LOCK_EX); + mutex_unlock(&inode->i_mutex); + + trace_scoutfs_data_fallocate(sb, ino, mode, offset, len, ret); + return ret; +} + + /* * Return all the file's extents whose blocks overlap with the caller's * byte region. 
We set _LAST on the last extent and _UNKNOWN on offline @@ -910,6 +1135,7 @@ const struct file_operations scoutfs_file_fops = { .unlocked_ioctl = scoutfs_ioctl, .fsync = scoutfs_file_fsync, .llseek = scoutfs_file_llseek, + .fallocate = scoutfs_fallocate, }; diff --git a/kmod/src/data.h b/kmod/src/data.h index c9214c5c..bd9f84fa 100644 --- a/kmod/src/data.h +++ b/kmod/src/data.h @@ -9,6 +9,7 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, struct scoutfs_lock *lock); int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); +long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); int scoutfs_data_setup(struct super_block *sb); void scoutfs_data_destroy(struct super_block *sb); diff --git a/kmod/src/format.h b/kmod/src/format.h index f9a33b83..e3aa67d7 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -622,6 +622,7 @@ enum { SC_DATA_EXTENT_TRUNC_CLEANUP, SC_DATA_EXTENT_ALLOC_CLEANUP, SC_SERVER_EXTENT_CLEANUP, + SC_DATA_EXTENT_FALLOCATE_CLEANUP, SC_NR_SOURCES, }; diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 0f88e423..6fa4863b 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -370,6 +370,35 @@ TRACE_EVENT(scoutfs_erase_item, TP_printk(FSID_FMT" erasing item %p", __entry->fsid, __entry->item) ); +TRACE_EVENT(scoutfs_data_fallocate, /* emitted as scoutfs_fallocate() returns */ + TP_PROTO(struct super_block *sb, u64 ino, int mode, loff_t offset, + loff_t len, int ret), + + TP_ARGS(sb, ino, mode, offset, len, ret), + + TP_STRUCT__entry( + __field(__u64, fsid) + __field(__u64, ino) + __field(int, mode) + __field(__u64, offset) + __field(__u64, len) + __field(int, ret) + ), + + TP_fast_assign( + __entry->fsid = FSID_ARG(sb); + __entry->ino = ino; + __entry->mode = mode; + __entry->offset = offset; + __entry->len = len; + __entry->ret = ret; + ), + + TP_printk("fsid "FSID_FMT" ino %llu mode 0x%x offset %llu len %llu ret %d", + __entry->fsid, __entry->ino, 
__entry->mode, __entry->offset, + __entry->len, __entry->ret) +); + TRACE_EVENT(scoutfs_data_fiemap, TP_PROTO(struct super_block *sb, __u64 off, int i, __u64 blkno),