diff --git a/kmod/src/data.c b/kmod/src/data.c index a8bca721..c518e16f 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -93,6 +93,16 @@ static void ext_from_item(struct scoutfs_extent *ext, ext->flags = dv->flags; } +static void data_ext_op_warn(struct inode *inode) +{ + struct scoutfs_inode_info *si; + + if (inode) { + si = SCOUTFS_I(inode); + WARN_ON_ONCE(!rwsem_is_locked(&si->extent_sem)); + } +} + static int data_ext_next(struct super_block *sb, void *arg, u64 start, u64 len, struct scoutfs_extent *ext) { @@ -102,6 +112,8 @@ static int data_ext_next(struct super_block *sb, void *arg, u64 start, u64 len, struct scoutfs_key last; int ret; + data_ext_op_warn(args->inode); + item_from_extent(&last, &dv, args->ino, U64_MAX, 1, 0, 0); item_from_extent(&key, &dv, args->ino, start, len, 0, 0); @@ -139,6 +151,8 @@ static int data_ext_insert(struct super_block *sb, void *arg, u64 start, struct scoutfs_key key; int ret; + data_ext_op_warn(args->inode); + item_from_extent(&key, &dv, args->ino, start, len, map, flags); ret = scoutfs_item_create(sb, &key, &dv, sizeof(dv), args->lock); if (ret == 0 && args->inode) @@ -154,6 +168,8 @@ static int data_ext_remove(struct super_block *sb, void *arg, u64 start, struct scoutfs_key key; int ret; + data_ext_op_warn(args->inode); + item_from_extent(&key, &dv, args->ino, start, len, map, flags); ret = scoutfs_item_delete(sb, &key, args->lock); if (ret == 0 && args->inode) @@ -276,6 +292,7 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, struct scoutfs_lock *lock) { struct scoutfs_item_count cnt = SIC_TRUNC_EXTENT(inode); + struct scoutfs_inode_info *si = NULL; LIST_HEAD(ind_locks); s64 ret = 0; @@ -290,6 +307,11 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, if (WARN_ON_ONCE(last < iblock)) return -EINVAL; + if (inode) { + si = SCOUTFS_I(inode); + down_write(&si->extent_sem); + } + while (iblock <= last) { if (inode) ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, @@ -321,6 +343,9 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, ret = 0; } + if (si) + up_write(&si->extent_sem); + return ret; } @@ -533,6 +558,38 @@ out: return ret; } +/* + * Typically extent item users are serialized by i_mutex. But page + * readers only hold the page lock and need to be protected from writers + * in other pages which can be manipulating neighbouring extents as + * they split and merge. + */ +static int scoutfs_get_block_read(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + int ret; + + down_read(&si->extent_sem); + ret = scoutfs_get_block(inode, iblock, bh, create); + up_read(&si->extent_sem); + + return ret; +} + +static int scoutfs_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + int ret; + + down_write(&si->extent_sem); + ret = scoutfs_get_block(inode, iblock, bh, create); + up_write(&si->extent_sem); + + return ret; +} + /* * This is almost never used. We can't block on a cluster lock while * holding the page lock because lock invalidation gets the page lock @@ -598,7 +655,7 @@ static int scoutfs_readpage(struct file *file, struct page *page) return ret; } - ret = mpage_readpage(page, scoutfs_get_block); + ret = mpage_readpage(page, scoutfs_get_block_read); scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ); scoutfs_per_task_del(&si->pt_data_lock, &pt_ent); @@ -646,7 +703,7 @@ static int scoutfs_readpages(struct file *file, struct address_space *mapping, } } - ret = mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block); + ret = mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block_read); out: scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ); BUG_ON(!list_empty(pages)); @@ -655,13 +712,13 @@ out: static int scoutfs_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page, scoutfs_get_block, wbc); + return block_write_full_page(page, scoutfs_get_block_write, wbc); } static int scoutfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return mpage_writepages(mapping, wbc, scoutfs_get_block); + return mpage_writepages(mapping, wbc, scoutfs_get_block_write); } /* fsdata allocated in write_begin and freed in write_end */ @@ -715,7 +772,7 @@ static int scoutfs_write_begin(struct file *file, ret = scoutfs_dirty_inode_item(inode, wbd->lock); if (ret == 0) ret = block_write_begin(mapping, pos, len, flags, pagep, - scoutfs_get_block); + scoutfs_get_block_write); if (ret) scoutfs_release_trans(sb); out: @@ -903,6 +960,7 @@ static s64 fallocate_extents(struct super_block *sb, struct inode *inode, long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); + struct scoutfs_inode_info *si = SCOUTFS_I(inode); struct super_block *sb = inode->i_sb; const u64 ino = scoutfs_ino(inode); struct scoutfs_lock *lock = NULL; @@ -913,6 +971,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) s64 ret; mutex_lock(&inode->i_mutex); + down_write(&si->extent_sem); /* XXX support more flags */ if (mode & ~(FALLOC_FL_KEEP_SIZE)) { @@ -978,6 +1037,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) out: scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE); + up_write(&si->extent_sem); mutex_unlock(&inode->i_mutex); trace_scoutfs_data_fallocate(sb, ino, mode, offset, len, ret); @@ -998,6 +1058,7 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size, struct scoutfs_lock *lock) { + struct scoutfs_inode_info *si = SCOUTFS_I(inode); struct super_block *sb = inode->i_sb; struct data_ext_args args = { .ino = scoutfs_ino(inode), @@ -1028,8 +1089,10 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size, if (ret < 0) goto unlock; + down_write(&si->extent_sem); ret = scoutfs_ext_insert(sb, &data_ext_ops, &args, 0, count, 0, SEF_OFFLINE); + up_write(&si->extent_sem); if (ret < 0) goto unlock; @@ -1075,6 +1138,7 @@ static int fill_extent(struct fiemap_extent_info *fieinfo, int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { + struct scoutfs_inode_info *si = SCOUTFS_I(inode); struct super_block *sb = inode->i_sb; const u64 ino = scoutfs_ino(inode); struct scoutfs_lock *lock = NULL; @@ -1095,8 +1159,8 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) goto out; - /* XXX overkill? */ mutex_lock(&inode->i_mutex); + down_read(&si->extent_sem); ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock); if (ret) @@ -1148,6 +1212,7 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ret = fill_extent(fieinfo, &cur, last_flags); unlock: scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); + up_read(&si->extent_sem); mutex_unlock(&inode->i_mutex); out: @@ -1227,8 +1292,9 @@ static struct scoutfs_data_wait *dw_next(struct scoutfs_data_wait *dw) * Check if we should wait by looking for extents whose flags match. * Returns 0 if no extents were found or any error encountered. * - * The caller must have locked the extents before calling, both across - * mounts and within this mount. + * The caller must have acquired a cluster lock that covers the extent + * items. We acquire the extent_sem to protect our read from writers in + * other tasks. * * Returns 1 if any file extents in the caller's region matched. If the * wait struct is provided then it is initialized to be woken when the @@ -1240,6 +1306,7 @@ int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len, u8 sef, u8 op, struct scoutfs_data_wait *dw, struct scoutfs_lock *lock) { + struct scoutfs_inode_info *si = SCOUTFS_I(inode); struct super_block *sb = inode->i_sb; const u64 ino = scoutfs_ino(inode); struct data_ext_args args = { @@ -1272,6 +1339,8 @@ int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len, } } + down_read(&si->extent_sem); + iblock = pos >> SCOUTFS_BLOCK_SM_SHIFT; last_block = (pos + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT; @@ -1308,6 +1377,8 @@ int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len, iblock = ext.start + ext.len; } + up_read(&si->extent_sem); + out: trace_scoutfs_data_wait_check(sb, ino, pos, len, sef, op, &ext, ret); diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 63aa70b9..c5fbbb00 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -71,29 +71,30 @@ static struct kmem_cache *scoutfs_inode_cachep; */ static void scoutfs_inode_ctor(void *obj) { - struct scoutfs_inode_info *ci = obj; + struct scoutfs_inode_info *si = obj; - mutex_init(&ci->item_mutex); - seqcount_init(&ci->seqcount); - ci->staging = false; - scoutfs_per_task_init(&ci->pt_data_lock); - atomic64_set(&ci->data_waitq.changed, 0); - init_waitqueue_head(&ci->data_waitq.waitq); - init_rwsem(&ci->xattr_rwsem); - RB_CLEAR_NODE(&ci->writeback_node); + init_rwsem(&si->extent_sem); + mutex_init(&si->item_mutex); + seqcount_init(&si->seqcount); + si->staging = false; + scoutfs_per_task_init(&si->pt_data_lock); + atomic64_set(&si->data_waitq.changed, 0); + init_waitqueue_head(&si->data_waitq.waitq); + init_rwsem(&si->xattr_rwsem); + RB_CLEAR_NODE(&si->writeback_node); - inode_init_once(&ci->inode); + inode_init_once(&si->inode); } struct inode *scoutfs_alloc_inode(struct super_block *sb) { - struct scoutfs_inode_info *ci; + struct scoutfs_inode_info *si; - ci = kmem_cache_alloc(scoutfs_inode_cachep, GFP_NOFS); - if (!ci) + si = kmem_cache_alloc(scoutfs_inode_cachep, GFP_NOFS); + if (!si) return NULL; - return &ci->inode; + return &si->inode; } static void scoutfs_i_callback(struct rcu_head *head) @@ -221,7 +222,7 @@ static void set_item_info(struct scoutfs_inode_info *si, static void load_inode(struct inode *inode, struct scoutfs_inode *cinode) { - struct scoutfs_inode_info *ci = SCOUTFS_I(inode); + struct scoutfs_inode_info *si = SCOUTFS_I(inode); i_size_write(inode, le64_to_cpu(cinode->size)); set_nlink(inode, le32_to_cpu(cinode->nlink)); @@ -236,23 +237,23 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode) inode->i_ctime.tv_sec = le64_to_cpu(cinode->ctime.sec); inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec); - ci->meta_seq = le64_to_cpu(cinode->meta_seq); - ci->data_seq = le64_to_cpu(cinode->data_seq); - ci->data_version = le64_to_cpu(cinode->data_version); - ci->online_blocks = le64_to_cpu(cinode->online_blocks); - ci->offline_blocks = le64_to_cpu(cinode->offline_blocks); - ci->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos); - ci->next_xattr_id = le64_to_cpu(cinode->next_xattr_id); - ci->flags = le32_to_cpu(cinode->flags); + si->meta_seq = le64_to_cpu(cinode->meta_seq); + si->data_seq = le64_to_cpu(cinode->data_seq); + si->data_version = le64_to_cpu(cinode->data_version); + si->online_blocks = le64_to_cpu(cinode->online_blocks); + si->offline_blocks = le64_to_cpu(cinode->offline_blocks); + si->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos); + si->next_xattr_id = le64_to_cpu(cinode->next_xattr_id); + si->flags = le32_to_cpu(cinode->flags); /* * i_blocks is initialized from online and offline and is then * maintained as blocks come and go. */ - inode->i_blocks = (ci->online_blocks + ci->offline_blocks) + inode->i_blocks = (si->online_blocks + si->offline_blocks) << SCOUTFS_BLOCK_SM_SECTOR_SHIFT; - set_item_info(ci, cinode); + set_item_info(si, cinode); } static void init_inode_key(struct scoutfs_key *key, u64 ino) @@ -334,7 +335,7 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry, static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock, u64 new_size, bool truncate) { - struct scoutfs_inode_info *ci = SCOUTFS_I(inode); + struct scoutfs_inode_info *si = SCOUTFS_I(inode); struct super_block *sb = inode->i_sb; LIST_HEAD(ind_locks); int ret; @@ -353,7 +354,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock, truncate_setsize(inode, new_size); inode->i_ctime = inode->i_mtime = CURRENT_TIME; if (truncate) - ci->flags |= SCOUTFS_INO_FLAG_TRUNCATE; + si->flags |= SCOUTFS_INO_FLAG_TRUNCATE; scoutfs_inode_set_data_seq(inode); scoutfs_update_inode_item(inode, lock, &ind_locks); @@ -365,7 +366,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock, static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock) { - struct scoutfs_inode_info *ci = SCOUTFS_I(inode); + struct scoutfs_inode_info *si = SCOUTFS_I(inode); struct super_block *sb = inode->i_sb; LIST_HEAD(ind_locks); int ret; @@ -375,7 +376,7 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock) if (ret) return ret; - ci->flags &= ~SCOUTFS_INO_FLAG_TRUNCATE; + si->flags &= ~SCOUTFS_INO_FLAG_TRUNCATE; scoutfs_update_inode_item(inode, lock, &ind_locks); scoutfs_release_trans(sb); @@ -386,13 +387,13 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock) int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock) { - struct scoutfs_inode_info *ci = SCOUTFS_I(inode); + struct scoutfs_inode_info *si = SCOUTFS_I(inode); u64 start; int ret, err; - trace_scoutfs_complete_truncate(inode, ci->flags); + trace_scoutfs_complete_truncate(inode, si->flags); - if (!(ci->flags & SCOUTFS_INO_FLAG_TRUNCATE)) + if (!(si->flags & SCOUTFS_INO_FLAG_TRUNCATE)) return 0; start = (i_size_read(inode) + SCOUTFS_BLOCK_SM_SIZE - 1) >> @@ -643,19 +644,19 @@ void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off) static int scoutfs_iget_test(struct inode *inode, void *arg) { - struct scoutfs_inode_info *ci = SCOUTFS_I(inode); + struct scoutfs_inode_info *si = SCOUTFS_I(inode); u64 *ino = arg; - return ci->ino == *ino; + return si->ino == *ino; } static int scoutfs_iget_set(struct inode *inode, void *arg) { - struct scoutfs_inode_info *ci = SCOUTFS_I(inode); + struct scoutfs_inode_info *si = SCOUTFS_I(inode); u64 *ino = arg; inode->i_ino = *ino; - ci->ino = *ino; + si->ino = *ino; return 0; } @@ -705,7 +706,7 @@ out: static void store_inode(struct scoutfs_inode *cinode, struct inode *inode) { - struct scoutfs_inode_info *ci = SCOUTFS_I(inode); + struct scoutfs_inode_info *si = SCOUTFS_I(inode); u64 online_blocks; u64 offline_blocks; @@ -732,9 +733,9 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode *inode) cinode->data_version = cpu_to_le64(scoutfs_inode_data_version(inode)); cinode->online_blocks = cpu_to_le64(online_blocks); cinode->offline_blocks = cpu_to_le64(offline_blocks); - cinode->next_readdir_pos = cpu_to_le64(ci->next_readdir_pos); - cinode->next_xattr_id = cpu_to_le64(ci->next_xattr_id); - cinode->flags = cpu_to_le32(ci->flags); + cinode->next_readdir_pos = cpu_to_le64(si->next_readdir_pos); + cinode->next_xattr_id = cpu_to_le64(si->next_xattr_id); + cinode->flags = cpu_to_le32(si->flags); } /* @@ -1368,7 +1369,7 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t rdev, u64 ino, struct scoutfs_lock *lock) { - struct scoutfs_inode_info *ci; + struct scoutfs_inode_info *si; struct scoutfs_key key; struct scoutfs_inode sinode; struct inode *inode; @@ -1378,16 +1379,16 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, if (!inode) return ERR_PTR(-ENOMEM); - ci = SCOUTFS_I(inode); - ci->ino = ino; - ci->data_version = 0; - ci->online_blocks = 0; - ci->offline_blocks = 0; - ci->next_readdir_pos = SCOUTFS_DIRENT_FIRST_POS; - ci->next_xattr_id = 0; - ci->have_item = false; - atomic64_set(&ci->last_refreshed, lock->refresh_gen); - ci->flags = 0; + si = SCOUTFS_I(inode); + si->ino = ino; + si->data_version = 0; + si->online_blocks = 0; + si->offline_blocks = 0; + si->next_readdir_pos = SCOUTFS_DIRENT_FIRST_POS; + si->next_xattr_id = 0; + si->have_item = false; + atomic64_set(&si->last_refreshed, lock->refresh_gen); + si->flags = 0; scoutfs_inode_set_meta_seq(inode); scoutfs_inode_set_data_seq(inode); diff --git a/kmod/src/inode.h b/kmod/src/inode.h index 9034aef4..a281180c 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -22,6 +22,14 @@ struct scoutfs_inode_info { u64 offline_blocks; u32 flags; + /* + * Protects per-inode extent items, most particularly readers + * who want to serialize writers without holding i_mutex. (only + * used in data.c, it's the only place that understands file + * extent items) + */ + struct rw_semaphore extent_sem; + /* * The in-memory item info caches the current index item values * so that we can decide to update them with comparisons instead diff --git a/tests/golden/stage-multi-part b/tests/golden/stage-multi-part new file mode 100644 index 00000000..e69de29b diff --git a/tests/sequence b/tests/sequence index 6b142b4a..4c7d0cdb 100644 --- a/tests/sequence +++ b/tests/sequence @@ -16,6 +16,7 @@ createmany-parallel.sh createmany-large-names.sh createmany-rename-large-dir.sh stage-release-race-alloc.sh +stage-multi-part.sh basic-posix-consistency.sh dirent-consistency.sh lock-ex-race-processes.sh diff --git a/tests/tests/stage-multi-part.sh b/tests/tests/stage-multi-part.sh new file mode 100644 index 00000000..594a4f06 --- /dev/null +++ b/tests/tests/stage-multi-part.sh @@ -0,0 +1,69 @@ +# +# Stage a large file in multiple parts and have a reader read it while +# it's being staged. This has found problems with extent access +# locking. +# + +t_require_commands scoutfs perl cmp rm + +FILE_BYTES=$((4 * 1024 * 1024 * 1024)) +FILE_BLOCKS=$((FILE_BYTES / 4096)) +FRAG_BYTES=$((128 * 1024 * 1024)) +FRAG_BLOCKS=$((FRAG_BYTES / 4096)) +NR_FRAGS=$((FILE_BLOCKS / FRAG_BLOCKS)) + +# +# high bandwidth way to generate file contents with predictable +# contents. We use ascii lines with the block identity, padded to 4KB +# with spaces. +# +# $1 is number of 4k blocks to write, and each block gets its block +# number in the line. $2, $3, and $4 are fields that are put in every +# block. +# +gen() { + perl -e 'for (my $i = 0; $i < '$1'; $i++) { printf("mount %020u process %020u file %020u blkno %020u%s\n", '$2', '$3', '$4', $i, " " x 3987); }' +} + +release_file() { + local path="$1" + local vers=$(scoutfs stat -s data_version "$path") + + scoutfs release "$path" "$vers" 0 $FILE_BLOCKS +} + +stage_file() { + local path="$1" + local vers=$(scoutfs stat -s data_version "$path") + local off=0 + + for a in $(seq 1 $NR_FRAGS); do + scoutfs stage "$path" "$vers" $off $FRAG_BYTES \ + <(gen $FRAG_BLOCKS $a $a $a) + ((off+=$FRAG_BYTES)) + done +} + +FILE="$T_D0/file" + +whole_file() { + for a in $(seq 1 $NR_FRAGS); do + gen $FRAG_BLOCKS $a $a $a + done +} + +# +# just one pass through the file. +# + +whole_file > "$FILE" +release_file "$FILE" + +cmp "$FILE" <(whole_file) & +pid=$! + +stage_file "$FILE" + +wait $pid || t_fail "comparison failed" + +t_pass