Merge pull request #38 from versity/zab/read_xattr_deadlocks

Zab/read xattr deadlocks
2026-07-27 18:43:13 +00:00 · 2021-05-03 09:44:57 -07:00
parent f631058265 a5ca5ee36d
commit 973dc4fd1c
7 changed files with 64 additions and 36 deletions
@@ -29,7 +29,12 @@
 #include "per_task.h"
 #include "omap.h"

-/* TODO: Direct I/O, AIO */
+/*
+ * Start a high level file read.  We check for offline extents in the
+ * read region here so that we only check the extents once.  We use the
+ * dio count to prevent releasing while we're reading after we've
+ * checked the extents.
+ */
 ssize_t scoutfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			      unsigned long nr_segs, loff_t pos)
 {
@@ -43,30 +48,32 @@ ssize_t scoutfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	int ret;

 retry:
+	/* protect checked extents from release */
+	mutex_lock(&inode->i_mutex);
+	atomic_inc(&inode->i_dio_count);
+	mutex_unlock(&inode->i_mutex);
+
 	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
 				 SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
 	if (ret)
 		goto out;

 	if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
-		/* protect checked extents from stage/release */
-		mutex_lock(&inode->i_mutex);
-		atomic_inc(&inode->i_dio_count);
-		mutex_unlock(&inode->i_mutex);
-
 		ret = scoutfs_data_wait_check_iov(inode, iov, nr_segs, pos,
 						  SEF_OFFLINE,
 						  SCOUTFS_IOC_DWO_READ,
 						  &dw, inode_lock);
 		if (ret != 0)
 			goto out;
+	} else {
+		WARN_ON_ONCE(true);
 	}

 	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);

 out:
-	if (scoutfs_per_task_del(&si->pt_data_lock, &pt_ent))
-		inode_dio_done(inode);
+	inode_dio_done(inode);
+	scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);

 	if (scoutfs_data_wait_found(&dw)) {
@@ -648,12 +648,22 @@ void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off)
 	} while (read_seqcount_retry(&si->seqcount, seq));
 }

+/*
+ * We have inversions between getting cluster locks while performing
+ * final deletion on a freeing inode and waiting on a freeing inode
+ * while holding a cluster lock.
+ *
+ * We can avoid these deadlocks by hiding freeing inodes in our hash
+ * lookup function.  We're fine with either returning null or populating
+ * a new inode overlapping with eviction freeing a previous instance of
+ * the inode.
+ */
 static int scoutfs_iget_test(struct inode *inode, void *arg)
 {
 	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	u64 *ino = arg;

-	return si->ino == *ino;
+	return (si->ino == *ino) && !(inode->i_state & I_FREEING);
 }

 static int scoutfs_iget_set(struct inode *inode, void *arg)
@@ -672,28 +682,6 @@ struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino)
 	return ilookup5(sb, ino, scoutfs_iget_test, &ino);
 }

-static int iget_test_nofreeing(struct inode *inode, void *arg)
-{
-	return !(inode->i_state & I_FREEING) && scoutfs_iget_test(inode, arg);
-}
-
-/*
- * There's a natural risk of a deadlock between lock invalidation and
- * eviction.  Invalidation blocks locks while looking up inodes and
- * invalidating local caches.  Inode eviction gets a lock to check final
- * inode deletion while the inode is marked FREEING which blocks
- * lookups.
- *
- * We have a lookup variant which doesn't return I_FREEING inodes
- * instead of waiting on them.  If an inode has made it to I_FREEING
- * then it doesn't have any local caches that are reachable and the lock
- * invalidation promise is kept.
- */
-struct inode *scoutfs_ilookup_nofreeing(struct super_block *sb, u64 ino)
-{
-	return ilookup5(sb, ino, iget_test_nofreeing, &ino);
-}
-
 struct inode *scoutfs_iget(struct super_block *sb, u64 ino)
 {
 	struct scoutfs_lock *lock = NULL;
@@ -79,7 +79,6 @@ int scoutfs_orphan_inode(struct inode *inode);

 struct inode *scoutfs_iget(struct super_block *sb, u64 ino);
 struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino);
-struct inode *scoutfs_ilookup_nofreeing(struct super_block *sb, u64 ino);

 void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major,
 				  u32 minor, u64 ino);
@@ -156,9 +156,7 @@ static void lock_inv_iput_worker(struct work_struct *work)

 /*
 * Invalidate cached data associated with an inode whose lock is going
- * away.  We ignore indoes with I_FREEING instead of waiting on them to
- * avoid a deadlock, if they're freeing then they won't be visible to
- * future lock users and we don't need to invalidate them.
+ * away.
 *
 * We try to drop cached dentries and inodes covered by the lock if they
 * aren't referenced.  This removes them from the mount's open map and
@@ -178,7 +176,7 @@ static void invalidate_inode(struct super_block *sb, u64 ino)
 	struct scoutfs_inode_info *si;
 	struct inode *inode;

-	inode = scoutfs_ilookup_nofreeing(sb, ino);
+	inode = scoutfs_ilookup(sb, ino);
 	if (inode) {
 		si = SCOUTFS_I(inode);

@@ -896,6 +894,9 @@ static void lock_invalidate_worker(struct work_struct *work)
 			list_del_init(&lock->inv_head);
 			lock->invalidate_pending = 0;
 			wake_up(&lock->waitq);
+		} else {
+			/* another request filled nl/net_id, put it back on the list */
+			list_move_tail(&lock->inv_head, &linfo->inv_list);
 		}
 		put_lock(linfo, lock);
 	}
@@ -13,6 +13,7 @@ lock-refleak.sh
 lock-shrink-consistency.sh
 lock-pr-cw-conflict.sh
 lock-revoke-getcwd.sh
+export-lookup-evict-race.sh
 createmany-parallel.sh
 createmany-large-names.sh
 createmany-rename-large-dir.sh
@@ -0,0 +1,32 @@
+#
+# test racing fh_to_dentry with evict from lock invalidation.   We've
+# had deadlocks between the ordering of iget and evict when they acquire
+# cluster locks.
+# 
+
+t_require_commands touch stat handle_cat
+t_require_mounts 2
+
+CPUS=$(getconf _NPROCESSORS_ONLN)
+NR=$((CPUS * 4))
+END=$((SECONDS + 30))
+
+touch "$T_D0/file"
+ino=$(stat -c "%i" "$T_D0/file")
+
+while test $SECONDS -lt $END; do
+	for i in $(seq 1 $NR); do
+		fs=$((RANDOM % T_NR_MOUNTS))
+		eval dir="\$T_D${fs}"
+		write=$((RANDOM & 1))
+
+		if [ "$write" == 1 ]; then
+			touch "$dir/file" &
+		else
+			handle_cat "$dir" "$ino" &
+		fi
+	done
+	wait
+done
+
+t_pass