mirror of https://github.com/versity/scoutfs.git
Avoid dput in lock invalidation
The d_prune_aliases call in lock invalidation was thought to be safe because the caller held an inode reference, so surely it couldn't reach iput_final. That missed the fundamental dcache pattern that dput can ascend through parents and end up in inode eviction for entirely unrelated inodes. It's very easy for this to deadlock: imagine, if nothing else, that the lock the inode eviction blocks on in dput->iput->evict->delete->lock is itself in the list of locks the caller is invalidating.

We fix this by always kicking the d_prune and dput off into async work. This increases the chance that inodes will still be referenced after invalidation, which prevents inline deletion, so more deletions can be deferred until the orphan scanner finds them. That should be rare, though: we're still likely to put and drop invalidated inodes before a writer gets around to removing the final link and asking us for the omap that describes our cached inodes.

To perform the d_prune in work context we make it a behavioural flag and make our queued iputs a little more robust. We use much safer and more understandable locking to cover the count and the new flags, and we run the re-entrant work items on their own workqueue instead of a single work instance on the system_wq.

Signed-off-by: Zach Brown <zab@versity.com>
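For illustration, here is a minimal sketch of the deferred-iput pattern the patch below adopts. The names (deferred_iput, queue_deferred_iput, pending_iputs, iput_wq) are simplified stand-ins, not the scoutfs identifiers: puts and a prune request are recorded under a spinlock and drained later by a workqueue, so d_prune_aliases and iput never run in the lock-invalidation path itself.

/*
 * Sketch only: assumes iput_wq was created with alloc_workqueue() and
 * that each deferred_iput's list head was INIT_LIST_HEAD()ed at setup.
 */
#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct deferred_iput {
        struct list_head head;  /* entry on the pending list, empty when idle */
        struct inode *inode;
        unsigned long count;    /* iputs queued since the worker last ran */
        bool prune;             /* also d_prune_aliases() from the worker */
};

static LIST_HEAD(pending_iputs);
static DEFINE_SPINLOCK(pending_lock);
static struct workqueue_struct *iput_wq;

static void iput_worker(struct work_struct *work)
{
        struct deferred_iput *di;
        struct inode *inode;
        unsigned long count;
        bool prune;

        spin_lock(&pending_lock);
        while ((di = list_first_entry_or_null(&pending_iputs,
                                              struct deferred_iput, head))) {
                list_del_init(&di->head);
                inode = di->inode;
                count = di->count;
                prune = di->prune;
                di->count = 0;
                di->prune = false;
                spin_unlock(&pending_lock);

                /* the dputs here can ascend to parents and evict unrelated
                 * inodes, which is why this runs in work context */
                if (prune)
                        d_prune_aliases(inode);
                while (count-- > 0)
                        iput(inode);
                /* after the final iput the inode must not be touched */

                spin_lock(&pending_lock);
        }
        spin_unlock(&pending_lock);
}

static DECLARE_WORK(iput_work, iput_worker);

/* called from invalidation instead of calling d_prune_aliases()/iput() inline */
static void queue_deferred_iput(struct deferred_iput *di, bool prune)
{
        bool queue;

        spin_lock(&pending_lock);
        di->count++;
        di->prune |= prune;
        queue = list_empty(&di->head);
        if (queue)
                list_add_tail(&di->head, &pending_iputs);
        spin_unlock(&pending_lock);

        if (queue)
                queue_work(iput_wq, &iput_work);
}

The actual patch embeds the list head, count, and flags in struct scoutfs_inode_info and uses an SI_IPUT_FLAG_PRUNE flag rather than a bool, as the diff below shows.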
kmod/src/inode.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/sched.h>
 #include <linux/list_sort.h>
+#include <linux/workqueue.h>
 
 #include "format.h"
 #include "super.h"
@@ -67,8 +68,10 @@ struct inode_sb_info {
 
         struct delayed_work orphan_scan_dwork;
 
+        struct workqueue_struct *iput_workq;
         struct work_struct iput_work;
-        struct llist_head iput_llist;
+        spinlock_t iput_lock;
+        struct list_head iput_list;
 };
 
 #define DECLARE_INODE_SB_INFO(sb, name) \
@@ -95,7 +98,9 @@ static void scoutfs_inode_ctor(void *obj)
         init_rwsem(&si->xattr_rwsem);
         INIT_LIST_HEAD(&si->writeback_entry);
         scoutfs_lock_init_coverage(&si->ino_lock_cov);
-        atomic_set(&si->iput_count, 0);
+        INIT_LIST_HEAD(&si->iput_head);
+        si->iput_count = 0;
+        si->iput_flags = 0;
 
         inode_init_once(&si->inode);
 }
@@ -325,7 +330,6 @@ int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock)
                         load_inode(inode, &sinode);
                         atomic64_set(&si->last_refreshed, refresh_gen);
                         scoutfs_lock_add_coverage(sb, lock, &si->ino_lock_cov);
-                        si->drop_invalidated = false;
                 }
         } else {
                 ret = 0;
@@ -1453,7 +1457,6 @@ int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, d
         si->have_item = false;
         atomic64_set(&si->last_refreshed, lock->refresh_gen);
         scoutfs_lock_add_coverage(sb, lock, &si->ino_lock_cov);
-        si->drop_invalidated = false;
         si->flags = 0;
 
         scoutfs_inode_set_meta_seq(inode);
@@ -1755,18 +1758,18 @@ out:
 }
 
 /*
- * As we drop an inode we need to decide to try and delete its items or
- * not, which is expensive.  The two common cases we want to get right
- * both have cluster lock coverage and don't want to delete.  Dropping
- * unused inodes during read lock invalidation has the current lock and
- * sees a nonzero nlink and knows not to delete.  Final iput after a
- * local unlink also has a lock, sees a zero nlink, and tries to perform
- * item deletion in the task that dropped the last link, as users
- * expect.
+ * As we evicted an inode we need to decide to try and delete its items
+ * or not, which is expensive.  We only try when we have lock coverage
+ * and the inode has been unlinked.  This catches the common case of
+ * regular deletion so deletion will be performed in the final unlink
+ * task.  It also catches open-unlink or o_tmpfile that aren't cached on
+ * other nodes.
  *
- * Evicting an inode outside of cluster locking is the odd slow path
- * that involves lock contention during use the worst cross-mount
- * open-unlink/delete case.
+ * Inodes being evicted outside of lock coverage, by referenced dentries
+ * or inodes that survived the attempt to drop them as their lock was
+ * invalidated, will not try to delete.  This means that cross-mount
+ * open/unlink will almost certainly fall back to the orphan scanner to
+ * perform final deletion.
  */
 void scoutfs_evict_inode(struct inode *inode)
 {
@@ -1782,7 +1785,7 @@ void scoutfs_evict_inode(struct inode *inode)
         /* clear before trying to delete tests */
         scoutfs_omap_clear(sb, ino);
 
-        if (!scoutfs_lock_is_covered(sb, &si->ino_lock_cov) || inode->i_nlink == 0)
+        if (scoutfs_lock_is_covered(sb, &si->ino_lock_cov) && inode->i_nlink == 0)
                 try_delete_inode_items(sb, scoutfs_ino(inode));
 }
 
@@ -1807,30 +1810,56 @@ int scoutfs_drop_inode(struct inode *inode)
 {
         struct scoutfs_inode_info *si = SCOUTFS_I(inode);
         struct super_block *sb = inode->i_sb;
+        const bool covered = scoutfs_lock_is_covered(sb, &si->ino_lock_cov);
 
         trace_scoutfs_drop_inode(sb, scoutfs_ino(inode), inode->i_nlink, inode_unhashed(inode),
-                                 si->drop_invalidated);
+                                 covered);
 
-        return si->drop_invalidated || !scoutfs_lock_is_covered(sb, &si->ino_lock_cov) ||
-               generic_drop_inode(inode);
+        return !covered || generic_drop_inode(inode);
 }
 
+/*
+ * These iput workers can be concurrent amongst cpus.  This lets us get
+ * some concurrency when these async final iputs end up performing very
+ * expensive inode deletion.  Typically they're dropping linked inodes
+ * that lost lock coverage and the iput will evict without deleting.
+ *
+ * Keep in mind that the dputs in d_prune can ascend into parents and
+ * end up performing the final iput->evict deletion on other inodes.
+ */
 static void iput_worker(struct work_struct *work)
 {
         struct inode_sb_info *inf = container_of(work, struct inode_sb_info, iput_work);
         struct scoutfs_inode_info *si;
-        struct scoutfs_inode_info *tmp;
-        struct llist_node *inodes;
-        bool more;
+        struct inode *inode;
+        unsigned long count;
+        unsigned long flags;
 
-        inodes = llist_del_all(&inf->iput_llist);
-
-        llist_for_each_entry_safe(si, tmp, inodes, iput_llnode) {
-                do {
-                        more = atomic_dec_return(&si->iput_count) > 0;
-                        iput(&si->inode);
-                } while (more);
+        spin_lock(&inf->iput_lock);
+        while ((si = list_first_entry_or_null(&inf->iput_list, struct scoutfs_inode_info,
+                                              iput_head))) {
+                list_del_init(&si->iput_head);
+                count = si->iput_count;
+                flags = si->iput_flags;
+                si->iput_count = 0;
+                si->iput_flags = 0;
+                spin_unlock(&inf->iput_lock);
+
+                inode = &si->inode;
+
+                /* can't touch during unmount, dcache destroys w/o locks */
+                if ((flags & SI_IPUT_FLAG_PRUNE) && !inf->stopped)
+                        d_prune_aliases(inode);
+
+                while (count-- > 0)
+                        iput(inode);
+
+                /* can't touch inode after final iput */
+
+                spin_lock(&inf->iput_lock);
         }
+        spin_unlock(&inf->iput_lock);
 }
 
 /*
@@ -1847,15 +1876,21 @@ static void iput_worker(struct work_struct *work)
  * Nothing stops multiple puts of an inode before the work runs so we
  * can track multiple puts in flight.
  */
-void scoutfs_inode_queue_iput(struct inode *inode)
+void scoutfs_inode_queue_iput(struct inode *inode, unsigned long flags)
 {
         DECLARE_INODE_SB_INFO(inode->i_sb, inf);
         struct scoutfs_inode_info *si = SCOUTFS_I(inode);
+        bool should_queue;
 
-        if (atomic_inc_return(&si->iput_count) == 1)
-                llist_add(&si->iput_llnode, &inf->iput_llist);
-        smp_wmb(); /* count and list visible before work executes */
-        schedule_work(&inf->iput_work);
+        spin_lock(&inf->iput_lock);
+        si->iput_count++;
+        si->iput_flags |= flags;
+        if ((should_queue = list_empty(&si->iput_head)))
+                list_add_tail(&si->iput_head, &inf->iput_list);
+        spin_unlock(&inf->iput_lock);
+
+        if (should_queue)
+                queue_work(inf->iput_workq, &inf->iput_work);
 }
 
 /*
@@ -2059,7 +2094,7 @@ int scoutfs_inode_walk_writeback(struct super_block *sb, bool write)
                 trace_scoutfs_inode_walk_writeback(sb, scoutfs_ino(inode),
                                                    write, ret);
                 if (ret) {
-                        scoutfs_inode_queue_iput(inode);
+                        scoutfs_inode_queue_iput(inode, 0);
                         goto out;
                 }
 
@@ -2075,7 +2110,7 @@ int scoutfs_inode_walk_writeback(struct super_block *sb, bool write)
                 if (!write)
                         list_del_init(&si->writeback_entry);
 
-                scoutfs_inode_queue_iput(inode);
+                scoutfs_inode_queue_iput(inode, 0);
         }
 
         spin_unlock(&inf->writeback_lock);
@@ -2100,7 +2135,15 @@ int scoutfs_inode_setup(struct super_block *sb)
         spin_lock_init(&inf->ino_alloc.lock);
         INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker);
         INIT_WORK(&inf->iput_work, iput_worker);
-        init_llist_head(&inf->iput_llist);
+        spin_lock_init(&inf->iput_lock);
+        INIT_LIST_HEAD(&inf->iput_list);
+
+        /* re-entrant, worker locks with itself and queueing */
+        inf->iput_workq = alloc_workqueue("scoutfs_inode_iput", WQ_UNBOUND, 0);
+        if (!inf->iput_workq) {
+                kfree(inf);
+                return -ENOMEM;
+        }
 
         sbi->inode_sb_info = inf;
 
@@ -2136,14 +2179,18 @@ void scoutfs_inode_flush_iput(struct super_block *sb)
         DECLARE_INODE_SB_INFO(sb, inf);
 
         if (inf)
-                flush_work(&inf->iput_work);
+                flush_workqueue(inf->iput_workq);
 }
 
 void scoutfs_inode_destroy(struct super_block *sb)
 {
         struct inode_sb_info *inf = SCOUTFS_SB(sb)->inode_sb_info;
 
-        kfree(inf);
+        if (inf) {
+                if (inf->iput_workq)
+                        destroy_workqueue(inf->iput_workq);
+                kfree(inf);
+        }
 }
 
 void scoutfs_inode_exit(void)
@@ -56,14 +56,16 @@ struct scoutfs_inode_info {
 
         struct scoutfs_lock_coverage ino_lock_cov;
 
-        /* drop if i_count hits 0, allows drop while invalidate holds coverage */
-        bool drop_invalidated;
-        struct llist_node iput_llnode;
-        atomic_t iput_count;
+        struct list_head iput_head;
+        unsigned long iput_count;
+        unsigned long iput_flags;
 
         struct inode inode;
 };
 
+/* try to prune dcache aliases with queued iput */
+#define SI_IPUT_FLAG_PRUNE (1 << 0)
+
 static inline struct scoutfs_inode_info *SCOUTFS_I(struct inode *inode)
 {
         return container_of(inode, struct scoutfs_inode_info, inode);
@@ -78,7 +80,7 @@ struct inode *scoutfs_alloc_inode(struct super_block *sb);
 void scoutfs_destroy_inode(struct inode *inode);
 int scoutfs_drop_inode(struct inode *inode);
 void scoutfs_evict_inode(struct inode *inode);
-void scoutfs_inode_queue_iput(struct inode *inode);
+void scoutfs_inode_queue_iput(struct inode *inode, unsigned long flags);
 
 #define SCOUTFS_IGF_LINKED (1 << 0) /* enoent if nlink == 0 */
 struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf);
@@ -130,16 +130,13 @@ static bool lock_modes_match(int granted, int requested)
  * allows deletions to be performed by unlink without having to wait for
  * remote cached inodes to be dropped.
  *
- * If the cached inode was already deferring final inode deletion then
- * we can't perform that inline in invalidation.  The locking alone
- * deadlock, and it might also take multiple transactions to fully
- * delete an inode with significant metadata.  We only perform the iput
- * inline if we know that possible eviction can't perform the final
- * deletion, otherwise we kick it off to async work.
+ * We kick the d_prune and iput off to async work because they can end
+ * up in final iput and inode eviction item deletion which would
+ * deadlock.  d_prune->dput can end up in iput on parents in different
+ * locks entirely.
  */
 static void invalidate_inode(struct super_block *sb, u64 ino)
 {
-        DECLARE_LOCK_INFO(sb, linfo);
         struct scoutfs_inode_info *si;
         struct inode *inode;
 
@@ -153,19 +150,9 @@ static void invalidate_inode(struct super_block *sb, u64 ino)
                         scoutfs_data_wait_changed(inode);
                 }
 
-                /* can't touch during unmount, dcache destroys w/o locks */
-                if (!linfo->unmounting)
-                        d_prune_aliases(inode);
-
                 forget_all_cached_acls(inode);
 
-                si->drop_invalidated = true;
-                if (scoutfs_lock_is_covered(sb, &si->ino_lock_cov) && inode->i_nlink > 0) {
-                        iput(inode);
-                } else {
-                        /* defer iput to work context so we don't evict inodes from invalidation */
-                        scoutfs_inode_queue_iput(inode);
-                }
+                scoutfs_inode_queue_iput(inode, SI_IPUT_FLAG_PRUNE);
         }
 }
 
@@ -201,16 +188,6 @@ static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,
         /* have to invalidate if we're not in the only usable case */
         if (!(prev == SCOUTFS_LOCK_WRITE && mode == SCOUTFS_LOCK_READ)) {
 retry:
-                /* invalidate inodes before removing coverage */
-                if (lock->start.sk_zone == SCOUTFS_FS_ZONE) {
-                        ino = le64_to_cpu(lock->start.ski_ino);
-                        last = le64_to_cpu(lock->end.ski_ino);
-                        while (ino <= last) {
-                                invalidate_inode(sb, ino);
-                                ino++;
-                        }
-                }
-
                 /* remove cov items to tell users that their cache is stale */
                 spin_lock(&lock->cov_list_lock);
                 list_for_each_entry_safe(cov, tmp, &lock->cov_list, head) {
@@ -226,6 +203,16 @@ retry:
                 }
                 spin_unlock(&lock->cov_list_lock);
 
+                /* invalidate inodes after removing coverage so drop/evict aren't covered */
+                if (lock->start.sk_zone == SCOUTFS_FS_ZONE) {
+                        ino = le64_to_cpu(lock->start.ski_ino);
+                        last = le64_to_cpu(lock->end.ski_ino);
+                        while (ino <= last) {
+                                invalidate_inode(sb, ino);
+                                ino++;
+                        }
+                }
+
                 scoutfs_item_invalidate(sb, &lock->start, &lock->end);
         }
 
@@ -691,16 +691,16 @@ TRACE_EVENT(scoutfs_evict_inode,
 
 TRACE_EVENT(scoutfs_drop_inode,
         TP_PROTO(struct super_block *sb, __u64 ino, unsigned int nlink,
-                 unsigned int unhashed, bool drop_invalidated),
+                 unsigned int unhashed, bool lock_covered),
 
-        TP_ARGS(sb, ino, nlink, unhashed, drop_invalidated),
+        TP_ARGS(sb, ino, nlink, unhashed, lock_covered),
 
         TP_STRUCT__entry(
                 SCSB_TRACE_FIELDS
                 __field(__u64, ino)
                 __field(unsigned int, nlink)
                 __field(unsigned int, unhashed)
-                __field(unsigned int, drop_invalidated)
+                __field(unsigned int, lock_covered)
         ),
 
         TP_fast_assign(
@@ -708,12 +708,12 @@ TRACE_EVENT(scoutfs_drop_inode,
                 __entry->ino = ino;
                 __entry->nlink = nlink;
                 __entry->unhashed = unhashed;
-                __entry->drop_invalidated = !!drop_invalidated;
+                __entry->lock_covered = !!lock_covered;
         ),
 
-        TP_printk(SCSBF" ino %llu nlink %u unhashed %d drop_invalidated %u", SCSB_TRACE_ARGS,
+        TP_printk(SCSBF" ino %llu nlink %u unhashed %d lock_covered %u", SCSB_TRACE_ARGS,
                   __entry->ino, __entry->nlink, __entry->unhashed,
-                  __entry->drop_invalidated)
+                  __entry->lock_covered)
 );
 
 TRACE_EVENT(scoutfs_inode_walk_writeback,