Compare commits

..

1 Commits

Author SHA1 Message Date
Chris Kirby
417261a051 Do orphaned inode extent freeing in chunks
Fix an issue where the final freeing of extents for unlinked files
could trigger spurious hung task timeout warnings. Instead of holding the
scoutfs inode lock for the entire duration of extent freeing, release and
reacquire the lock every time the transaction sequence number changes.

This is only used in the evict and orphan inode cleanup paths; truncate
and release continue to free during a single lock hold.

Remove the hung_task_timeout_secs workaround from the large-fragmented-free
test script.

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-06-11 13:24:11 -05:00
13 changed files with 75 additions and 137 deletions

View File

@@ -287,14 +287,6 @@ ifneq (,$(shell grep 'int ..mknod. .struct user_namespace' include/linux/fs.h))
ccflags-y += -DKC_VFS_METHOD_USER_NAMESPACE_ARG
endif
#
# v6.2-rc1-2-gabf08576afe3
#
# fs: vfs methods use struct mnt_idmap instead of struct user_namespace
ifneq (,$(shell grep 'int vfs_mknod.struct mnt_idmap' include/linux/fs.h))
ccflags-y += -DKC_VFS_METHOD_MNT_IDMAP_ARG
endif
#
# v5.17-rc2-21-g07888c665b40
#
@@ -442,12 +434,3 @@ endif
ifneq (,$(shell grep 'int ..remap_pages..struct vm_area_struct' include/linux/mm.h))
ccflags-y += -DKC_MM_REMAP_PAGES
endif
#
# v6.1-rc1-4-g7420332a6ff4
#
# .get_acl() method now has dentry arg (and mnt_idmap). The old get_acl has been renamed
# to get_inode_acl() and is still available as well, but has an extra rcu param.
ifneq (,$(shell grep 'struct posix_acl ...get_acl..struct mnt_idmap ., struct dentry' include/linux/fs.h))
ccflags-y += -DKC_GET_ACL_DENTRY
endif

View File

@@ -107,15 +107,8 @@ struct posix_acl *scoutfs_get_acl_locked(struct inode *inode, int type, struct s
return acl;
}
#ifdef KC_GET_ACL_DENTRY
struct posix_acl *scoutfs_get_acl(KC_VFS_NS_DEF
struct dentry *dentry, int type)
{
struct inode *inode = dentry->d_inode;
#else
struct posix_acl *scoutfs_get_acl(struct inode *inode, int type)
{
#endif
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *lock = NULL;
struct posix_acl *acl;
@@ -208,15 +201,8 @@ out:
return ret;
}
#ifdef KC_GET_ACL_DENTRY
int scoutfs_set_acl(KC_VFS_NS_DEF
struct dentry *dentry, struct posix_acl *acl, int type)
{
struct inode *inode = dentry->d_inode;
#else
int scoutfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
#endif
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *lock = NULL;
LIST_HEAD(ind_locks);
@@ -254,12 +240,7 @@ int scoutfs_acl_get_xattr(struct dentry *dentry, const char *name, void *value,
if (!IS_POSIXACL(dentry->d_inode))
return -EOPNOTSUPP;
#ifdef KC_GET_ACL_DENTRY
acl = scoutfs_get_acl(KC_VFS_INIT_NS
dentry, type);
#else
acl = scoutfs_get_acl(dentry->d_inode, type);
#endif
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl == NULL)
@@ -305,11 +286,7 @@ int scoutfs_acl_set_xattr(struct dentry *dentry, const char *name, const void *v
}
}
#ifdef KC_GET_ACL_DENTRY
ret = scoutfs_set_acl(KC_VFS_INIT_NS dentry, acl, type);
#else
ret = scoutfs_set_acl(dentry->d_inode, acl, type);
#endif
out:
posix_acl_release(acl);

View File

@@ -1,14 +1,9 @@
#ifndef _SCOUTFS_ACL_H_
#define _SCOUTFS_ACL_H_
#ifdef KC_GET_ACL_DENTRY
struct posix_acl *scoutfs_get_acl(KC_VFS_NS_DEF struct dentry *dentry, int type);
int scoutfs_set_acl(KC_VFS_NS_DEF struct dentry *dentry, struct posix_acl *acl, int type);
#else
struct posix_acl *scoutfs_get_acl(struct inode *inode, int type);
int scoutfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
#endif
struct posix_acl *scoutfs_get_acl_locked(struct inode *inode, int type, struct scoutfs_lock *lock);
int scoutfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
int scoutfs_set_acl_locked(struct inode *inode, struct posix_acl *acl, int type,
struct scoutfs_lock *lock, struct list_head *ind_locks);
#ifdef KC_XATTR_STRUCT_XATTR_HANDLER

View File

@@ -296,28 +296,39 @@ static s64 truncate_extents(struct super_block *sb, struct inode *inode,
* and offline blocks. If it's not provided then the inode is being
* destroyed and isn't reachable, we don't need to update it.
*
* If 'pause' is set, then we are destroying the inode and we should take
* breaks occasionally to allow other nodes access to this inode lock shard.
*
* The caller is in charge of locking the inode and data, but we may
* have to modify far more items than fit in a transaction so we're in
* charge of batching updates into transactions. If the inode is
* provided then we're responsible for updating its item as we go.
*/
int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
u64 ino, u64 iblock, u64 last, bool offline,
struct scoutfs_lock *lock)
u64 ino, u64 *iblock, u64 last, bool offline,
struct scoutfs_lock *lock, bool pause)
{
struct scoutfs_inode_info *si = NULL;
LIST_HEAD(ind_locks);
u64 cur_seq;
s64 ret = 0;
WARN_ON_ONCE(inode && !inode_is_locked(inode));
/*
* If the inode is provided, then we aren't destroying it. So it's not
* safe to pause while removing items - it needs to be done in one chunk.
*/
if (WARN_ON_ONCE(pause && inode))
return -EINVAL;
/* clamp last to the last possible block? */
if (last > SCOUTFS_BLOCK_SM_MAX)
last = SCOUTFS_BLOCK_SM_MAX;
trace_scoutfs_data_truncate_items(sb, iblock, last, offline);
trace_scoutfs_data_truncate_items(sb, *iblock, last, offline, pause);
if (WARN_ON_ONCE(last < iblock))
if (WARN_ON_ONCE(last < *iblock))
return -EINVAL;
if (inode) {
@@ -325,7 +336,9 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
down_write(&si->extent_sem);
}
while (iblock <= last) {
cur_seq = scoutfs_trans_sample_seq(sb);
while (*iblock <= last) {
if (inode)
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
else
@@ -339,7 +352,7 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
ret = 0;
if (ret == 0)
ret = truncate_extents(sb, inode, ino, iblock, last,
ret = truncate_extents(sb, inode, ino, *iblock, last,
offline, lock);
if (inode)
@@ -351,8 +364,19 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
if (ret <= 0)
break;
iblock = ret;
*iblock = ret;
ret = 0;
/*
* We know there's more to do because truncate_extents()
* pauses every EXTENTS_PER_HOLD extents and it returned the
* next starting block. Our caller might also want us to pause,
* which we will do whenever we cross a transaction boundary.
*/
if (pause && (cur_seq != scoutfs_trans_sample_seq(sb))) {
ret = -EINPROGRESS;
break;
}
}
if (si)

View File

@@ -47,8 +47,8 @@ int scoutfs_get_block_write(struct inode *inode, sector_t iblock, struct buffer_
int create);
int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
u64 ino, u64 iblock, u64 last, bool offline,
struct scoutfs_lock *lock);
u64 ino, u64 *iblock, u64 last, bool offline,
struct scoutfs_lock *lock, bool pause);
int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len);

View File

@@ -2053,9 +2053,6 @@ const struct inode_operations scoutfs_dir_iops = {
#endif
.listxattr = scoutfs_listxattr,
.get_acl = scoutfs_get_acl,
#ifdef KC_GET_ACL_DENTRY
.set_acl = scoutfs_set_acl,
#endif
.symlink = scoutfs_symlink,
.permission = scoutfs_permission,
#ifdef KC_LINUX_HAVE_RHEL_IOPS_WRAPPER

View File

@@ -150,9 +150,6 @@ static const struct inode_operations scoutfs_file_iops = {
#endif
.listxattr = scoutfs_listxattr,
.get_acl = scoutfs_get_acl,
#ifdef KC_GET_ACL_DENTRY
.set_acl = scoutfs_set_acl,
#endif
.fiemap = scoutfs_data_fiemap,
};
@@ -166,9 +163,6 @@ static const struct inode_operations scoutfs_special_iops = {
#endif
.listxattr = scoutfs_listxattr,
.get_acl = scoutfs_get_acl,
#ifdef KC_GET_ACL_DENTRY
.set_acl = scoutfs_set_acl,
#endif
};
/*
@@ -474,8 +468,8 @@ int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock)
start = (i_size_read(inode) + SCOUTFS_BLOCK_SM_SIZE - 1) >>
SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_truncate_items(inode->i_sb, inode,
scoutfs_ino(inode), start, ~0ULL,
false, lock);
scoutfs_ino(inode), &start, ~0ULL,
false, lock, false);
err = clear_truncate_flag(inode, lock);
return ret ? ret : err;
@@ -1641,7 +1635,8 @@ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_
* partial deletion until all deletion is complete and the orphan item
* is removed.
*/
static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_inode *sinode,
static int delete_inode_items(struct super_block *sb, u64 ino,
struct scoutfs_inode *sinode, u64 *start,
struct scoutfs_lock *lock, struct scoutfs_lock *orph_lock)
{
struct scoutfs_key key;
@@ -1660,8 +1655,8 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_in
/* remove data items in their own transactions */
if (S_ISREG(mode)) {
ret = scoutfs_data_truncate_items(sb, NULL, ino, 0, ~0ULL,
false, lock);
ret = scoutfs_data_truncate_items(sb, NULL, ino, start, ~0ULL,
false, lock, true);
if (ret)
goto out;
}
@@ -1809,16 +1804,23 @@ out:
*/
static int try_delete_inode_items(struct super_block *sb, u64 ino)
{
struct inode_deletion_lock_data *ldata = NULL;
struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_lock *lock = NULL;
struct inode_deletion_lock_data *ldata;
struct scoutfs_lock *orph_lock;
struct scoutfs_lock *lock;
struct scoutfs_inode sinode;
struct scoutfs_key key;
bool clear_trying = false;
bool more = false;
u64 group_nr;
u64 start = 0;
int bit_nr;
int ret;
again:
ldata = NULL;
orph_lock = NULL;
lock = NULL;
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &lock);
if (ret < 0)
goto out;
@@ -1830,11 +1832,12 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
goto out;
/* only one local attempt per inode at a time */
if (test_and_set_bit(bit_nr, ldata->trying)) {
if (!more && test_and_set_bit(bit_nr, ldata->trying)) {
ret = 0;
goto out;
}
clear_trying = true;
more = false;
/* can't delete if it's cached in local or remote mounts */
if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) {
@@ -1859,7 +1862,15 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
if (ret < 0)
goto out;
ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
ret = delete_inode_items(sb, ino, &sinode, &start, lock, orph_lock);
if (ret == -EINPROGRESS) {
more = true;
clear_trying = false;
} else {
more = false;
}
out:
if (clear_trying)
clear_bit(bit_nr, ldata->trying);
@@ -1867,6 +1878,9 @@ out:
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);
if (more)
goto again;
return ret;
}

View File

@@ -372,9 +372,9 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
sblock = args.offset >> SCOUTFS_BLOCK_SM_SHIFT;
eblock = (args.offset + args.length - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_truncate_items(sb, inode, scoutfs_ino(inode),
sblock,
&sblock,
eblock, true,
lock);
lock, false);
if (ret == 0) {
scoutfs_inode_get_onoff(inode, &online, &offline);
isize = i_size_read(inode);
@@ -383,8 +383,8 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
>> SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_truncate_items(sb, inode,
scoutfs_ino(inode),
sblock, U64_MAX,
false, lock);
&sblock, U64_MAX,
false, lock, false);
}
}

View File

@@ -263,11 +263,6 @@ typedef unsigned int blk_opf_t;
#define kc__vmalloc __vmalloc
#endif
#ifdef KC_VFS_METHOD_MNT_IDMAP_ARG
#define KC_VFS_NS_DEF struct mnt_idmap *mnt_idmap,
#define KC_VFS_NS mnt_idmap,
#define KC_VFS_INIT_NS &nop_mnt_idmap,
#else
#ifdef KC_VFS_METHOD_USER_NAMESPACE_ARG
#define KC_VFS_NS_DEF struct user_namespace *mnt_user_ns,
#define KC_VFS_NS mnt_user_ns,
@@ -277,7 +272,6 @@ typedef unsigned int blk_opf_t;
#define KC_VFS_NS
#define KC_VFS_INIT_NS
#endif
#endif /* KC_VFS_METHOD_MNT_IDMAP_ARG */
#ifdef KC_BIO_ALLOC_DEV_OPF_ARGS
#define kc_bio_alloc bio_alloc

View File

@@ -378,15 +378,16 @@ DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_fiemap_extent,
);
TRACE_EVENT(scoutfs_data_truncate_items,
TP_PROTO(struct super_block *sb, __u64 iblock, __u64 last, int offline),
TP_PROTO(struct super_block *sb, __u64 iblock, __u64 last, int offline, bool pause),
TP_ARGS(sb, iblock, last, offline),
TP_ARGS(sb, iblock, last, offline, pause),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, iblock)
__field(__u64, last)
__field(int, offline)
__field(bool, pause)
),
TP_fast_assign(
@@ -394,10 +395,12 @@ TRACE_EVENT(scoutfs_data_truncate_items,
__entry->iblock = iblock;
__entry->last = last;
__entry->offline = offline;
__entry->pause = pause;
),
TP_printk(SCSBF" iblock %llu last %llu offline %u", SCSB_TRACE_ARGS,
__entry->iblock, __entry->last, __entry->offline)
TP_printk(SCSBF" iblock %llu last %llu offline %u pause %d",
SCSB_TRACE_ARGS, __entry->iblock, __entry->last,
__entry->offline, __entry->pause)
);
TRACE_EVENT(scoutfs_data_wait_check,

View File

@@ -1,4 +1,3 @@
== setting longer hung task timeout
== creating fragmented extents
== unlink file with moved extents to free extents per block
== cleanup

View File

@@ -60,8 +60,6 @@ $(basename $0) options:
| the file system to be tested. Will be clobbered by -m mkfs.
-m | Run mkfs on the device before mounting and running
| tests. Implies unmounting existing mounts first.
-l | Enable kmemleak scan during each test. Requires "kmemleak=on" in
| kernel cmdline boot args.
-n <nr> | The number of devices and mounts to test.
-o <opts> | Add option string to all mounts during all tests.
-P | Enable trace_printk.
@@ -131,12 +129,6 @@ while true; do
-i)
T_INSMOD="1"
;;
-l)
echo "stack=off" > /sys/kernel/debug/kmemleak &&
echo "scan=off" > /sys/kernel/debug/kmemleak ||
die "kmemleak disabled or missing"
T_KMEMLEAK="1"
;;
-M)
test -n "$2" || die "-z must have meta device file argument"
T_META_DEVICE="$2"
@@ -577,11 +569,6 @@ for t in $tests; do
# mark in dmesg as to what test we are running
echo "run scoutfs test $test_name" > /dev/kmsg
# clean kmemleak scan
if [ -n "$T_KMEMLEAK" ]; then
echo "clear" > /sys/kernel/debug/kmemleak
fi
# record dmesg before
dmesg | t_filter_dmesg > "$T_TMPDIR/dmesg.before"
@@ -629,17 +616,6 @@ for t in $tests; do
fi
fi
# record kmemleak scan
if [ -n "$T_KMEMLEAK" ]; then
echo scan > /sys/kernel/debug/kmemleak
cp /sys/kernel/debug/kmemleak "$T_TMPDIR/kmemleak.scan"
if [ -s "$T_TMPDIR/kmemleak.scan" ]; then
message="kmemleak detected memory leak"
sts=$T_FAIL_STATUS
cat "$T_TMPDIR/kmemleak.scan" >> "$T_RESULTS/fail.log"
fi
fi
# record unknown exit status
if [ "$sts" -lt "$T_FIRST_STATUS" -o "$sts" -gt "$T_LAST_STATUS" ]; then
message="unknown status: $sts"

View File

@@ -10,30 +10,6 @@ EXTENTS_PER_BTREE_BLOCK=600
EXTENTS_PER_LIST_BLOCK=8192
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
#
# This test specifically creates a pathologically sparse file that will
# be as expensive as possible to free. This is usually fine on
# dedicated or reasonable hardware, but trying to run this in
# virtualized debug kernels can take a very long time. This test is
# about making sure that the server doesn't fail, not that the platform
# can handle the scale of work that our btree formats happen to require
# while execution is bogged down with use-after-free memory reference
# tracking. So we give the test a lot more breathing room before
# deciding that it's hung.
#
echo "== setting longer hung task timeout"
if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
test "$secs" -gt 0 || \
t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
restore_hung_task_timeout()
{
echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
}
trap restore_hung_task_timeout EXIT
echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
fi
echo "== creating fragmented extents"
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"