diff --git a/kmod/src/file.c b/kmod/src/file.c index b1555b08..586d77fd 100644 --- a/kmod/src/file.c +++ b/kmod/src/file.c @@ -29,7 +29,12 @@ #include "per_task.h" #include "omap.h" -/* TODO: Direct I/O, AIO */ +/* + * Start a high level file read. We check for offline extents in the + * read region here so that we only check the extents once. We use the + * dio count to prevent releasing while we're reading after we've + * checked the extents. + */ ssize_t scoutfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { @@ -43,30 +48,32 @@ ssize_t scoutfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov, int ret; retry: + /* protect checked extents from release */ + mutex_lock(&inode->i_mutex); + atomic_inc(&inode->i_dio_count); + mutex_unlock(&inode->i_mutex); + ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, SCOUTFS_LKF_REFRESH_INODE, inode, &inode_lock); if (ret) goto out; if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) { - /* protect checked extents from stage/release */ - mutex_lock(&inode->i_mutex); - atomic_inc(&inode->i_dio_count); - mutex_unlock(&inode->i_mutex); - ret = scoutfs_data_wait_check_iov(inode, iov, nr_segs, pos, SEF_OFFLINE, SCOUTFS_IOC_DWO_READ, &dw, inode_lock); if (ret != 0) goto out; + } else { + WARN_ON_ONCE(true); } ret = generic_file_aio_read(iocb, iov, nr_segs, pos); out: - if (scoutfs_per_task_del(&si->pt_data_lock, &pt_ent)) - inode_dio_done(inode); + inode_dio_done(inode); + scoutfs_per_task_del(&si->pt_data_lock, &pt_ent); scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ); if (scoutfs_data_wait_found(&dw)) { diff --git a/kmod/src/inode.c b/kmod/src/inode.c index cdade67d..3c78dc21 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -648,12 +648,22 @@ void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off) } while (read_seqcount_retry(&si->seqcount, seq)); } +/* + * We have inversions between getting cluster locks while performing + * final deletion on a freeing inode and waiting on a freeing inode + * while holding a cluster lock. + * + * We can avoid these deadlocks by hiding freeing inodes in our hash + * lookup function. We're fine with either returning null or populating + * a new inode overlapping with eviction freeing a previous instance of + * the inode. + */ static int scoutfs_iget_test(struct inode *inode, void *arg) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); u64 *ino = arg; - return si->ino == *ino; + return (si->ino == *ino) && !(inode->i_state & I_FREEING); } static int scoutfs_iget_set(struct inode *inode, void *arg) @@ -672,28 +682,6 @@ struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino) return ilookup5(sb, ino, scoutfs_iget_test, &ino); } -static int iget_test_nofreeing(struct inode *inode, void *arg) -{ - return !(inode->i_state & I_FREEING) && scoutfs_iget_test(inode, arg); -} - -/* - * There's a natural risk of a deadlock between lock invalidation and - * eviction. Invalidation blocks locks while looking up inodes and - * invalidating local caches. Inode eviction gets a lock to check final - * inode deletion while the inode is marked FREEING which blocks - * lookups. - * - * We have a lookup variant which doesn't return I_FREEING inodes - * instead of waiting on them. If an inode has made it to I_FREEING - * then it doesn't have any local caches that are reachable and the lock - * invalidation promise is kept. - */ -struct inode *scoutfs_ilookup_nofreeing(struct super_block *sb, u64 ino) -{ - return ilookup5(sb, ino, iget_test_nofreeing, &ino); -} - struct inode *scoutfs_iget(struct super_block *sb, u64 ino) { struct scoutfs_lock *lock = NULL; diff --git a/kmod/src/inode.h b/kmod/src/inode.h index f1884ad6..070d6492 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -79,7 +79,6 @@ int scoutfs_orphan_inode(struct inode *inode); struct inode *scoutfs_iget(struct super_block *sb, u64 ino); struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino); -struct inode *scoutfs_ilookup_nofreeing(struct super_block *sb, u64 ino); void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major, u32 minor, u64 ino); diff --git a/kmod/src/lock.c b/kmod/src/lock.c index f149bb13..50a33d26 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -156,9 +156,7 @@ static void lock_inv_iput_worker(struct work_struct *work) /* * Invalidate cached data associated with an inode whose lock is going - * away. We ignore indoes with I_FREEING instead of waiting on them to - * avoid a deadlock, if they're freeing then they won't be visible to - * future lock users and we don't need to invalidate them. + * away. * * We try to drop cached dentries and inodes covered by the lock if they * aren't referenced. This removes them from the mount's open map and @@ -178,7 +176,7 @@ static void invalidate_inode(struct super_block *sb, u64 ino) struct scoutfs_inode_info *si; struct inode *inode; - inode = scoutfs_ilookup_nofreeing(sb, ino); + inode = scoutfs_ilookup(sb, ino); if (inode) { si = SCOUTFS_I(inode); @@ -896,6 +894,9 @@ static void lock_invalidate_worker(struct work_struct *work) list_del_init(&lock->inv_head); lock->invalidate_pending = 0; wake_up(&lock->waitq); + } else { + /* another request filled nl/net_id, put it back on the list */ + list_move_tail(&lock->inv_head, &linfo->inv_list); } put_lock(linfo, lock); } diff --git a/tests/golden/export-lookup-evict-race b/tests/golden/export-lookup-evict-race new file mode 100644 index 00000000..e69de29b diff --git a/tests/sequence b/tests/sequence index 9e51c19a..972bebe2 100644 --- a/tests/sequence +++ b/tests/sequence @@ -13,6 +13,7 @@ lock-refleak.sh lock-shrink-consistency.sh lock-pr-cw-conflict.sh lock-revoke-getcwd.sh +export-lookup-evict-race.sh createmany-parallel.sh createmany-large-names.sh createmany-rename-large-dir.sh diff --git a/tests/tests/export-lookup-evict-race.sh b/tests/tests/export-lookup-evict-race.sh new file mode 100644 index 00000000..ebddf6c3 --- /dev/null +++ b/tests/tests/export-lookup-evict-race.sh @@ -0,0 +1,32 @@ +# +# test racing fh_to_dentry with evict from lock invalidation. We've +# had deadlocks between the ordering of iget and evict when they acquire +# cluster locks. +# + +t_require_commands touch stat handle_cat +t_require_mounts 2 + +CPUS=$(getconf _NPROCESSORS_ONLN) +NR=$((CPUS * 4)) +END=$((SECONDS + 30)) + +touch "$T_D0/file" +ino=$(stat -c "%i" "$T_D0/file") + +while test $SECONDS -lt $END; do + for i in $(seq 1 $NR); do + fs=$((RANDOM % T_NR_MOUNTS)) + eval dir="\$T_D${fs}" + write=$((RANDOM & 1)) + + if [ "$write" == 1 ]; then + touch "$dir/file" & + else + handle_cat "$dir" "$ino" & + fi + done + wait +done + +t_pass