From 5231cf4034cb69e8b69daa5413f5c5381e510bd3 Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@versity.com>
Date: Mon, 26 Apr 2021 10:59:34 -0700
Subject: [PATCH 1/4] Add export-lookup-evict-race test

Add a test that creates races between fh_to_dentry and eviction
triggered by lock invalidation.

Signed-off-by: Zach Brown <zab@versity.com>
---
 tests/golden/export-lookup-evict-race   |  0
 tests/sequence                          |  1 +
 tests/tests/export-lookup-evict-race.sh | 32 +++++++++++++++++++++++++
 3 files changed, 33 insertions(+)
 create mode 100644 tests/golden/export-lookup-evict-race
 create mode 100644 tests/tests/export-lookup-evict-race.sh

diff --git a/tests/golden/export-lookup-evict-race b/tests/golden/export-lookup-evict-race
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/sequence b/tests/sequence
index 9e51c19a..972bebe2 100644
--- a/tests/sequence
+++ b/tests/sequence
@@ -13,6 +13,7 @@ lock-refleak.sh
 lock-shrink-consistency.sh
 lock-pr-cw-conflict.sh
 lock-revoke-getcwd.sh
+export-lookup-evict-race.sh
 createmany-parallel.sh
 createmany-large-names.sh
 createmany-rename-large-dir.sh
diff --git a/tests/tests/export-lookup-evict-race.sh b/tests/tests/export-lookup-evict-race.sh
new file mode 100644
index 00000000..ebddf6c3
--- /dev/null
+++ b/tests/tests/export-lookup-evict-race.sh
@@ -0,0 +1,32 @@
+#
+# test racing fh_to_dentry with evict from lock invalidation.   We've
+# had deadlocks between the ordering of iget and evict when they acquire
+# cluster locks.
+# 
+
+t_require_commands touch stat handle_cat
+t_require_mounts 2
+
+CPUS=$(getconf _NPROCESSORS_ONLN)
+NR=$((CPUS * 4))
+END=$((SECONDS + 30))
+
+touch "$T_D0/file"
+ino=$(stat -c "%i" "$T_D0/file")
+
+while test $SECONDS -lt $END; do
+	for i in $(seq 1 $NR); do
+		fs=$((RANDOM % T_NR_MOUNTS))
+		eval dir="\$T_D${fs}"
+		write=$((RANDOM & 1))
+
+		if [ "$write" == 1 ]; then
+			touch "$dir/file" &
+		else
+			handle_cat "$dir" "$ino" &
+		fi
+	done
+	wait
+done
+
+t_pass

From ca320d02cb1666b5fd40e9c01c6e9a8f0f5b3eca Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@versity.com>
Date: Mon, 26 Apr 2021 16:09:13 -0700
Subject: [PATCH 2/4] Get i_mutex before cluster lock in file aio_read

The vfs often calls filesystem methods with i_mutex held.  This creates
a natural ordering of i_mutex outside of cluster locks.  The file
aio_read method acquired i_mutex after its cluster lock, creating a
deadlock with other vfs methods like setattr.

The acquisition of i_mutex after the cluster lock was due to using the
pattern where we use the per-task lock to discover if we're the first
user of the lock in a call chain.  Readpage has to do this, but file
aio_read doesn't.  It should never be called recursively.  So we can
acquire the i_mutex outside of the cluster lock and warn if we ever are
called recursively.

Signed-off-by: Zach Brown <zab@versity.com>
---
 kmod/src/file.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/kmod/src/file.c b/kmod/src/file.c
index b1555b08..586d77fd 100644
--- a/kmod/src/file.c
+++ b/kmod/src/file.c
@@ -29,7 +29,12 @@
 #include "per_task.h"
 #include "omap.h"
 
-/* TODO: Direct I/O, AIO */
+/*
+ * Start a high level file read.  We check for offline extents in the
+ * read region here so that we only check the extents once.  We use the
+ * dio count to prevent releasing while we're reading after we've
+ * checked the extents.
+ */
 ssize_t scoutfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			      unsigned long nr_segs, loff_t pos)
 {
@@ -43,30 +48,32 @@ ssize_t scoutfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	int ret;
 
 retry:
+	/* protect checked extents from release */
+	mutex_lock(&inode->i_mutex);
+	atomic_inc(&inode->i_dio_count);
+	mutex_unlock(&inode->i_mutex);
+
 	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ,
 				 SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock);
 	if (ret)
 		goto out;
 
 	if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
-		/* protect checked extents from stage/release */
-		mutex_lock(&inode->i_mutex);
-		atomic_inc(&inode->i_dio_count);
-		mutex_unlock(&inode->i_mutex);
-
 		ret = scoutfs_data_wait_check_iov(inode, iov, nr_segs, pos,
 						  SEF_OFFLINE,
 						  SCOUTFS_IOC_DWO_READ,
 						  &dw, inode_lock);
 		if (ret != 0)
 			goto out;
+	} else {
+		WARN_ON_ONCE(true);
 	}
 
 	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
 
 out:
-	if (scoutfs_per_task_del(&si->pt_data_lock, &pt_ent))
-		inode_dio_done(inode);
+	inode_dio_done(inode);
+	scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
 	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
 
 	if (scoutfs_data_wait_found(&dw)) {

From 603af327acf0f0ecd730283a556da76a2fa6d07f Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@versity.com>
Date: Tue, 27 Apr 2021 14:36:03 -0700
Subject: [PATCH 3/4] Ignore I_FREEING in all inode hash lookups

Previously we added a ilookup variant that ignored I_FREEING inodes
to avoid a deadlock between lock invalidation (lock->I_FREEING) and
eviction (I_FREEING->lock);

Now we're seeing similar deadlocks between eviction (I_FREEING->lock)
and fh_to_dentry's iget (lock->I_FREEING).

I think it's reasonable to ignore all inodes with I_FREEING set when
we're using our _test callback in ilookup or iget.  We can remove the
_nofreeing ilookup variant and move its I_FREEING test into the
iget_test callback provided to both ilookup and iget.

Callers will get the same result, it will just happen without waiting
for a previously I_FREEING inode to leave.  They'll get NULL instead of
waiting from ilookup.  They'll allocate and start to initialize a newer
instance of the inode and insert it along side the previous instance.

We don't have inode number re-use so we don't have the problem where a
newly allocated inode number is relying on inode cache serialization to
not find a previously allocated inode that is being evicted.

This change does allow for concurrent iget of an inode number that is
being deleted on a local node.  This could happen in fh_to_dentry with a
raw inode number.  But this was already a problem between mounts because
they don't have a shared inode cache to serialize them.  Once we fix
that between nodes, we fix it on a single node as well.

Signed-off-by: Zach Brown <zab@versity.com>
---
 kmod/src/inode.c | 34 +++++++++++-----------------------
 kmod/src/inode.h |  1 -
 kmod/src/lock.c  |  6 ++----
 3 files changed, 13 insertions(+), 28 deletions(-)

diff --git a/kmod/src/inode.c b/kmod/src/inode.c
index cdade67d..3c78dc21 100644
--- a/kmod/src/inode.c
+++ b/kmod/src/inode.c
@@ -648,12 +648,22 @@ void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off)
 	} while (read_seqcount_retry(&si->seqcount, seq));
 }
 
+/*
+ * We have inversions between getting cluster locks while performing
+ * final deletion on a freeing inode and waiting on a freeing inode
+ * while holding a cluster lock.
+ *
+ * We can avoid these deadlocks by hiding freeing inodes in our hash
+ * lookup function.  We're fine with either returning null or populating
+ * a new inode overlapping with eviction freeing a previous instance of
+ * the inode.
+ */
 static int scoutfs_iget_test(struct inode *inode, void *arg)
 {
 	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
 	u64 *ino = arg;
 
-	return si->ino == *ino;
+	return (si->ino == *ino) && !(inode->i_state & I_FREEING);
 }
 
 static int scoutfs_iget_set(struct inode *inode, void *arg)
@@ -672,28 +682,6 @@ struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino)
 	return ilookup5(sb, ino, scoutfs_iget_test, &ino);
 }
 
-static int iget_test_nofreeing(struct inode *inode, void *arg)
-{
-	return !(inode->i_state & I_FREEING) && scoutfs_iget_test(inode, arg);
-}
-
-/*
- * There's a natural risk of a deadlock between lock invalidation and
- * eviction.  Invalidation blocks locks while looking up inodes and
- * invalidating local caches.  Inode eviction gets a lock to check final
- * inode deletion while the inode is marked FREEING which blocks
- * lookups.
- *
- * We have a lookup variant which doesn't return I_FREEING inodes
- * instead of waiting on them.  If an inode has made it to I_FREEING
- * then it doesn't have any local caches that are reachable and the lock
- * invalidation promise is kept.
- */
-struct inode *scoutfs_ilookup_nofreeing(struct super_block *sb, u64 ino)
-{
-	return ilookup5(sb, ino, iget_test_nofreeing, &ino);
-}
-
 struct inode *scoutfs_iget(struct super_block *sb, u64 ino)
 {
 	struct scoutfs_lock *lock = NULL;
diff --git a/kmod/src/inode.h b/kmod/src/inode.h
index f1884ad6..070d6492 100644
--- a/kmod/src/inode.h
+++ b/kmod/src/inode.h
@@ -79,7 +79,6 @@ int scoutfs_orphan_inode(struct inode *inode);
 
 struct inode *scoutfs_iget(struct super_block *sb, u64 ino);
 struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino);
-struct inode *scoutfs_ilookup_nofreeing(struct super_block *sb, u64 ino);
 
 void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major,
 				  u32 minor, u64 ino);
diff --git a/kmod/src/lock.c b/kmod/src/lock.c
index f149bb13..a091ca68 100644
--- a/kmod/src/lock.c
+++ b/kmod/src/lock.c
@@ -156,9 +156,7 @@ static void lock_inv_iput_worker(struct work_struct *work)
 
 /*
  * Invalidate cached data associated with an inode whose lock is going
- * away.  We ignore indoes with I_FREEING instead of waiting on them to
- * avoid a deadlock, if they're freeing then they won't be visible to
- * future lock users and we don't need to invalidate them.
+ * away.
  *
  * We try to drop cached dentries and inodes covered by the lock if they
  * aren't referenced.  This removes them from the mount's open map and
@@ -178,7 +176,7 @@ static void invalidate_inode(struct super_block *sb, u64 ino)
 	struct scoutfs_inode_info *si;
 	struct inode *inode;
 
-	inode = scoutfs_ilookup_nofreeing(sb, ino);
+	inode = scoutfs_ilookup(sb, ino);
 	if (inode) {
 		si = SCOUTFS_I(inode);
 

From a5ca5ee36da6234e43baed93821e69cd6f3f3314 Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@versity.com>
Date: Thu, 29 Apr 2021 14:22:30 -0700
Subject: [PATCH 4/4] Put back-to-back invalidated locks back on list

A lock that is undergoing invalidation is put on a list of locks in the
super block.  Invalidation requests put locks on the list.  While locks
are invalidated they're temporarily put on a private list.

To support a request arriving while the lock is being processed we
carefully manage the invalidation fields in the lock between the
invalidation worker and the incoming request.  The worker correctly
noticed that a new invalidation request had arrived but it left the lock
on its private list instead of putting it back on the invalidation list
for further processing.  The lock was unreachable, wouldn't get
invalidated, and caused everyone trying to use the lock to block
indefinitely.

When the worker sees another request arrive for an invalidating lock it
needs to move the lock from the private list back to the invalidation
list.

Signed-off-by: Zach Brown <zab@versity.com>
---
 kmod/src/lock.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kmod/src/lock.c b/kmod/src/lock.c
index a091ca68..50a33d26 100644
--- a/kmod/src/lock.c
+++ b/kmod/src/lock.c
@@ -894,6 +894,9 @@ static void lock_invalidate_worker(struct work_struct *work)
 			list_del_init(&lock->inv_head);
 			lock->invalidate_pending = 0;
 			wake_up(&lock->waitq);
+		} else {
+			/* another request filled nl/net_id, put it back on the list */
+			list_move_tail(&lock->inv_head, &linfo->inv_list);
 		}
 		put_lock(linfo, lock);
 	}