diff --git a/kmod/src/data.c b/kmod/src/data.c index 7903e8d7..13c02391 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -296,28 +296,39 @@ static s64 truncate_extents(struct super_block *sb, struct inode *inode, * and offline blocks. If it's not provided then the inode is being * destroyed and isn't reachable, we don't need to update it. * + * If 'pause' is set, then we are destroying the inode and we should take + * breaks occasionally to allow other nodes access to this inode lock shard. + * * The caller is in charge of locking the inode and data, but we may * have to modify far more items than fit in a transaction so we're in * charge of batching updates into transactions. If the inode is * provided then we're responsible for updating its item as we go. */ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, - u64 ino, u64 iblock, u64 last, bool offline, - struct scoutfs_lock *lock) + u64 ino, u64 *iblock, u64 last, bool offline, + struct scoutfs_lock *lock, bool pause) { struct scoutfs_inode_info *si = NULL; LIST_HEAD(ind_locks); + u64 cur_seq; s64 ret = 0; WARN_ON_ONCE(inode && !inode_is_locked(inode)); + /* + * If the inode is provided, then we aren't destroying it. So it's not + * safe to pause while removing items- it needs to be done in one chunk. + */ + if (WARN_ON_ONCE(pause && inode)) + return -EINVAL; + /* clamp last to the last possible block? */ if (last > SCOUTFS_BLOCK_SM_MAX) last = SCOUTFS_BLOCK_SM_MAX; - trace_scoutfs_data_truncate_items(sb, iblock, last, offline); + trace_scoutfs_data_truncate_items(sb, *iblock, last, offline, pause); - if (WARN_ON_ONCE(last < iblock)) + if (WARN_ON_ONCE(last < *iblock)) return -EINVAL; if (inode) { @@ -325,7 +336,9 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, down_write(&si->extent_sem); } - while (iblock <= last) { + cur_seq = scoutfs_trans_sample_seq(sb); + + while (*iblock <= last) { if (inode) ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false); else @@ -339,7 +352,7 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, ret = 0; if (ret == 0) - ret = truncate_extents(sb, inode, ino, iblock, last, + ret = truncate_extents(sb, inode, ino, *iblock, last, offline, lock); if (inode) @@ -351,8 +364,19 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, if (ret <= 0) break; - iblock = ret; + *iblock = ret; ret = 0; + + /* + * We know there's more to do because truncate_extents() + * pauses every EXTENTS_PER_HOLD extents and it returned the + * next starting block. Our caller might also want us to pause, + * which we will do whenever we cross a transaction boundary. + */ + if (pause && (cur_seq != scoutfs_trans_sample_seq(sb))) { + ret = -EINPROGRESS; + break; + } } if (si) diff --git a/kmod/src/data.h b/kmod/src/data.h index a34854eb..340e3da4 100644 --- a/kmod/src/data.h +++ b/kmod/src/data.h @@ -47,8 +47,8 @@ int scoutfs_get_block_write(struct inode *inode, sector_t iblock, struct buffer_ int create); int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, - u64 ino, u64 iblock, u64 last, bool offline, - struct scoutfs_lock *lock); + u64 ino, u64 *iblock, u64 last, bool offline, + struct scoutfs_lock *lock, bool pause); int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 5e3f724f..c975e7cf 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -468,8 +468,8 @@ int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock) start = (i_size_read(inode) + SCOUTFS_BLOCK_SM_SIZE - 1) >> SCOUTFS_BLOCK_SM_SHIFT; ret = scoutfs_data_truncate_items(inode->i_sb, inode, - scoutfs_ino(inode), start, ~0ULL, - false, lock); + scoutfs_ino(inode), &start, ~0ULL, + false, lock, false); err = clear_truncate_flag(inode, lock); return ret ? ret : err; @@ -1635,7 +1635,8 @@ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_ * partial deletion until all deletion is complete and the orphan item * is removed. */ -static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_inode *sinode, +static int delete_inode_items(struct super_block *sb, u64 ino, + struct scoutfs_inode *sinode, u64 *start, struct scoutfs_lock *lock, struct scoutfs_lock *orph_lock) { struct scoutfs_key key; @@ -1654,8 +1655,8 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_in /* remove data items in their own transactions */ if (S_ISREG(mode)) { - ret = scoutfs_data_truncate_items(sb, NULL, ino, 0, ~0ULL, - false, lock); + ret = scoutfs_data_truncate_items(sb, NULL, ino, start, ~0ULL, + false, lock, true); if (ret) goto out; } @@ -1803,16 +1804,23 @@ out: */ static int try_delete_inode_items(struct super_block *sb, u64 ino) { - struct inode_deletion_lock_data *ldata = NULL; - struct scoutfs_lock *orph_lock = NULL; - struct scoutfs_lock *lock = NULL; + struct inode_deletion_lock_data *ldata; + struct scoutfs_lock *orph_lock; + struct scoutfs_lock *lock; struct scoutfs_inode sinode; struct scoutfs_key key; bool clear_trying = false; + bool more = false; u64 group_nr; + u64 start = 0; int bit_nr; int ret; +again: + ldata = NULL; + orph_lock = NULL; + lock = NULL; + ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &lock); if (ret < 0) goto out; @@ -1824,11 +1832,12 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino) goto out; /* only one local attempt per inode at a time */ - if (test_and_set_bit(bit_nr, ldata->trying)) { + if (!more && test_and_set_bit(bit_nr, ldata->trying)) { ret = 0; goto out; } clear_trying = true; + more = false; /* can't delete if it's cached in local or remote mounts */ if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) { @@ -1853,7 +1862,15 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino) if (ret < 0) goto out; - ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock); + ret = delete_inode_items(sb, ino, &sinode, &start, lock, orph_lock); + + if (ret == -EINPROGRESS) { + more = true; + clear_trying = false; + } else { + more = false; + } + out: if (clear_trying) clear_bit(bit_nr, ldata->trying); @@ -1861,6 +1878,9 @@ out: scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY); + if (more) + goto again; + return ret; } diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index fea7aae3..01f9ed13 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -372,9 +372,9 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg) sblock = args.offset >> SCOUTFS_BLOCK_SM_SHIFT; eblock = (args.offset + args.length - 1) >> SCOUTFS_BLOCK_SM_SHIFT; ret = scoutfs_data_truncate_items(sb, inode, scoutfs_ino(inode), - sblock, + &sblock, eblock, true, - lock); + lock, false); if (ret == 0) { scoutfs_inode_get_onoff(inode, &online, &offline); isize = i_size_read(inode); @@ -383,8 +383,8 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg) >> SCOUTFS_BLOCK_SM_SHIFT; ret = scoutfs_data_truncate_items(sb, inode, scoutfs_ino(inode), - sblock, U64_MAX, - false, lock); + &sblock, U64_MAX, + false, lock, false); } } diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 930a275a..07ddbc77 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -378,15 +378,16 @@ DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_fiemap_extent, ); TRACE_EVENT(scoutfs_data_truncate_items, - TP_PROTO(struct super_block *sb, __u64 iblock, __u64 last, int offline), + TP_PROTO(struct super_block *sb, __u64 iblock, __u64 last, int offline, bool pause), - TP_ARGS(sb, iblock, last, offline), + TP_ARGS(sb, iblock, last, offline, pause), TP_STRUCT__entry( SCSB_TRACE_FIELDS __field(__u64, iblock) __field(__u64, last) __field(int, offline) + __field(bool, pause) ), TP_fast_assign( @@ -394,10 +395,12 @@ TRACE_EVENT(scoutfs_data_truncate_items, __entry->iblock = iblock; __entry->last = last; __entry->offline = offline; + __entry->pause = pause; ), - TP_printk(SCSBF" iblock %llu last %llu offline %u", SCSB_TRACE_ARGS, - __entry->iblock, __entry->last, __entry->offline) + TP_printk(SCSBF" iblock %llu last %llu offline %u pause %d", + SCSB_TRACE_ARGS, __entry->iblock, __entry->last, + __entry->offline, __entry->pause) ); TRACE_EVENT(scoutfs_data_wait_check, diff --git a/tests/golden/large-fragmented-free b/tests/golden/large-fragmented-free index 7740849c..58aea5f7 100644 --- a/tests/golden/large-fragmented-free +++ b/tests/golden/large-fragmented-free @@ -1,4 +1,3 @@ -== setting longer hung task timeout == creating fragmented extents == unlink file with moved extents to free extents per block == cleanup diff --git a/tests/tests/large-fragmented-free.sh b/tests/tests/large-fragmented-free.sh index 33f3bef7..e27d1bbe 100644 --- a/tests/tests/large-fragmented-free.sh +++ b/tests/tests/large-fragmented-free.sh @@ -10,30 +10,6 @@ EXTENTS_PER_BTREE_BLOCK=600 EXTENTS_PER_LIST_BLOCK=8192 FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK)) -# -# This test specifically creates a pathologically sparse file that will -# be as expensive as possible to free. This is usually fine on -# dedicated or reasonable hardware, but trying to run this in -# virtualized debug kernels can take a very long time. This test is -# about making sure that the server doesn't fail, not that the platform -# can handle the scale of work that our btree formats happen to require -# while execution is bogged down with use-after-free memory reference -# tracking. So we give the test a lot more breathing room before -# deciding that its hung. -# -echo "== setting longer hung task timeout" -if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then - secs=$(cat /proc/sys/kernel/hung_task_timeout_secs) - test "$secs" -gt 0 || \ - t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs" - restore_hung_task_timeout() - { - echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs - } - trap restore_hung_task_timeout EXIT - echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs -fi - echo "== creating fragmented extents" fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"