diff --git a/kmod/src/inode.c b/kmod/src/inode.c
index 6d775515..6951a3de 100644
--- a/kmod/src/inode.c
+++ b/kmod/src/inode.c
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 
 #include "format.h"
 #include "super.h"
@@ -67,8 +68,10 @@ struct inode_sb_info {
 
         struct delayed_work orphan_scan_dwork;
 
+        struct workqueue_struct *iput_workq;
         struct work_struct iput_work;
-        struct llist_head iput_llist;
+        spinlock_t iput_lock;
+        struct list_head iput_list;
 };
 
 #define DECLARE_INODE_SB_INFO(sb, name) \
@@ -95,7 +98,9 @@ static void scoutfs_inode_ctor(void *obj)
         init_rwsem(&si->xattr_rwsem);
         INIT_LIST_HEAD(&si->writeback_entry);
         scoutfs_lock_init_coverage(&si->ino_lock_cov);
-        atomic_set(&si->iput_count, 0);
+        INIT_LIST_HEAD(&si->iput_head);
+        si->iput_count = 0;
+        si->iput_flags = 0;
         inode_init_once(&si->inode);
 }
@@ -325,7 +330,6 @@ int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock)
                         load_inode(inode, &sinode);
                         atomic64_set(&si->last_refreshed, refresh_gen);
                         scoutfs_lock_add_coverage(sb, lock, &si->ino_lock_cov);
-                        si->drop_invalidated = false;
                 }
         } else {
                 ret = 0;
@@ -1453,7 +1457,6 @@ int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, d
         si->have_item = false;
         atomic64_set(&si->last_refreshed, lock->refresh_gen);
         scoutfs_lock_add_coverage(sb, lock, &si->ino_lock_cov);
-        si->drop_invalidated = false;
         si->flags = 0;
         scoutfs_inode_set_meta_seq(inode);
@@ -1755,18 +1758,18 @@ out:
 }
 
 /*
- * As we drop an inode we need to decide to try and delete its items or
- * not, which is expensive. The two common cases we want to get right
- * both have cluster lock coverage and don't want to delete. Dropping
- * unused inodes during read lock invalidation has the current lock and
- * sees a nonzero nlink and knows not to delete. Final iput after a
- * local unlink also has a lock, sees a zero nlink, and tries to perform
- * item deletion in the task that dropped the last link, as users
- * expect.
+ * As we evict an inode we need to decide whether to try to delete its
+ * items, which is expensive. We only try when we have lock coverage
+ * and the inode has been unlinked. This catches the common case of
+ * regular deletion so that deletion is performed in the final unlink
+ * task. It also catches open-unlink or O_TMPFILE inodes that aren't
+ * cached on other nodes.
  *
- * Evicting an inode outside of cluster locking is the odd slow path
- * that involves lock contention during use the worst cross-mount
- * open-unlink/delete case.
+ * Inodes evicted outside of lock coverage, either held by referenced
+ * dentries or having survived the attempt to drop them as their lock
+ * was invalidated, do not try to delete. This means that cross-mount
+ * open/unlink will almost certainly fall back to the orphan scanner to
+ * perform final deletion.
  */
 void scoutfs_evict_inode(struct inode *inode)
 {
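
To make the policy described in that comment easier to check, here is a small standalone sketch of the two decision points the following hunks implement; the drop_now() and try_delete() helper names are hypothetical, and generic_drop_inode() is reduced to its nlink check for the illustration:

#include <stdbool.h>
#include <stdio.h>

/* simplified scoutfs_drop_inode(): nonzero means drop the inode from the cache now */
static int drop_now(bool covered, unsigned int nlink)
{
        /* generic_drop_inode() reduced to its nlink check for this sketch */
        return !covered || nlink == 0;
}

/* simplified scoutfs_evict_inode(): only covered, unlinked inodes try item deletion */
static bool try_delete(bool covered, unsigned int nlink)
{
        return covered && nlink == 0;
}

int main(void)
{
        static const bool covered[] = { true, true, false, false };
        static const unsigned int nlink[] = { 1, 0, 1, 0 };

        for (int i = 0; i < 4; i++)
                printf("covered=%d nlink=%u -> drop=%d delete_items=%d\n",
                       covered[i], nlink[i],
                       drop_now(covered[i], nlink[i]),
                       try_delete(covered[i], nlink[i]));
        return 0;
}

Only the covered, unlinked row deletes items inline; the uncovered rows are the ones the comment expects to fall back to the orphan scanner.
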
@@ -1782,7 +1785,7 @@ void scoutfs_evict_inode(struct inode *inode)
 
         /* clear before trying to delete tests */
         scoutfs_omap_clear(sb, ino);
 
-        if (!scoutfs_lock_is_covered(sb, &si->ino_lock_cov) || inode->i_nlink == 0)
+        if (scoutfs_lock_is_covered(sb, &si->ino_lock_cov) && inode->i_nlink == 0)
                 try_delete_inode_items(sb, scoutfs_ino(inode));
 }
@@ -1807,30 +1810,56 @@ int scoutfs_drop_inode(struct inode *inode)
 {
         struct scoutfs_inode_info *si = SCOUTFS_I(inode);
         struct super_block *sb = inode->i_sb;
+        const bool covered = scoutfs_lock_is_covered(sb, &si->ino_lock_cov);
 
         trace_scoutfs_drop_inode(sb, scoutfs_ino(inode), inode->i_nlink, inode_unhashed(inode),
-                                 si->drop_invalidated);
+                                 covered);
 
-        return si->drop_invalidated || !scoutfs_lock_is_covered(sb, &si->ino_lock_cov) ||
-               generic_drop_inode(inode);
+        return !covered || generic_drop_inode(inode);
 }
+
+/*
+ * These iput workers can run concurrently across CPUs.  This lets us
+ * get some concurrency when these async final iputs end up performing
+ * very expensive inode deletion.  Typically they're dropping linked
+ * inodes that lost lock coverage, and the iput will evict without
+ * deleting.
+ *
+ * Keep in mind that the dputs in d_prune can ascend into parents and
+ * end up performing the final iput->evict deletion on other inodes.
+ */
 static void iput_worker(struct work_struct *work)
 {
         struct inode_sb_info *inf = container_of(work, struct inode_sb_info, iput_work);
         struct scoutfs_inode_info *si;
-        struct scoutfs_inode_info *tmp;
-        struct llist_node *inodes;
-        bool more;
+        struct inode *inode;
+        unsigned long count;
+        unsigned long flags;
 
-        inodes = llist_del_all(&inf->iput_llist);
+        spin_lock(&inf->iput_lock);
+        while ((si = list_first_entry_or_null(&inf->iput_list, struct scoutfs_inode_info,
+                                              iput_head))) {
+                list_del_init(&si->iput_head);
+                count = si->iput_count;
+                flags = si->iput_flags;
+                si->iput_count = 0;
+                si->iput_flags = 0;
+                spin_unlock(&inf->iput_lock);
 
-        llist_for_each_entry_safe(si, tmp, inodes, iput_llnode) {
-                do {
-                        more = atomic_dec_return(&si->iput_count) > 0;
-                        iput(&si->inode);
-                } while (more);
+                inode = &si->inode;
+
+                /* can't touch during unmount, dcache destroys w/o locks */
+                if ((flags & SI_IPUT_FLAG_PRUNE) && !inf->stopped)
+                        d_prune_aliases(inode);
+
+                while (count-- > 0)
+                        iput(inode);
+
+                /* can't touch inode after final iput */
+
+                spin_lock(&inf->iput_lock);
         }
+        spin_unlock(&inf->iput_lock);
 }
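
A minimal userspace model of the accumulate-and-drain scheme that iput_worker() and scoutfs_inode_queue_iput() now share may help when reading the hunks above and below; the pending/queue_put/drain names are illustrative only, and a pthread mutex stands in for iput_lock:

#include <pthread.h>
#include <stdio.h>

/* stands in for the per-inode bookkeeping (iput_head, iput_count, iput_flags) */
struct pending {
        struct pending *next;   /* singly linked here for brevity */
        unsigned long count;    /* puts accumulated while queued */
        unsigned long flags;    /* requests OR-ed in, e.g. a "prune" bit */
        const char *name;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;  /* stands in for iput_lock */
static struct pending *head;

/* record one put; only the first put while unqueued adds the entry to the list */
static void queue_put(struct pending *p, unsigned long flags)
{
        pthread_mutex_lock(&lock);
        p->count++;
        p->flags |= flags;
        if (p->count == 1) {    /* the kernel code tests list_empty(&si->iput_head) instead */
                p->next = head;
                head = p;
        }
        pthread_mutex_unlock(&lock);
        /* the real code only calls queue_work() when the entry was newly added */
}

/* the worker empties the list, dropping the lock around the expensive releases */
static void drain(void)
{
        pthread_mutex_lock(&lock);
        while (head) {
                struct pending *p = head;
                unsigned long count = p->count;
                unsigned long flags = p->flags;

                head = p->next;
                p->count = 0;
                p->flags = 0;
                pthread_mutex_unlock(&lock);

                printf("%s: releasing %lu refs, flags %#lx\n", p->name, count, flags);

                pthread_mutex_lock(&lock);
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        struct pending a = { .name = "a" };

        queue_put(&a, 0);
        queue_put(&a, 1);       /* a second put merges into the already queued entry */
        drain();
        return 0;
}

The point of the per-entry count and flags is that any number of puts and prune requests arriving before the worker runs collapse into one queued entry and one drain pass.
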
 
 /*
@@ -1847,15 +1876,21 @@ static void iput_worker(struct work_struct *work)
  * Nothing stops multiple puts of an inode before the work runs so we
  * can track multiple puts in flight.
  */
-void scoutfs_inode_queue_iput(struct inode *inode)
+void scoutfs_inode_queue_iput(struct inode *inode, unsigned long flags)
 {
         DECLARE_INODE_SB_INFO(inode->i_sb, inf);
         struct scoutfs_inode_info *si = SCOUTFS_I(inode);
+        bool should_queue;
 
-        if (atomic_inc_return(&si->iput_count) == 1)
-                llist_add(&si->iput_llnode, &inf->iput_llist);
-        smp_wmb(); /* count and list visible before work executes */
-        schedule_work(&inf->iput_work);
+        spin_lock(&inf->iput_lock);
+        si->iput_count++;
+        si->iput_flags |= flags;
+        if ((should_queue = list_empty(&si->iput_head)))
+                list_add_tail(&si->iput_head, &inf->iput_list);
+        spin_unlock(&inf->iput_lock);
+
+        if (should_queue)
+                queue_work(inf->iput_workq, &inf->iput_work);
 }
 
 /*
@@ -2059,7 +2094,7 @@ int scoutfs_inode_walk_writeback(struct super_block *sb, bool write)
                         trace_scoutfs_inode_walk_writeback(sb, scoutfs_ino(inode), write, ret);
                         if (ret) {
-                                scoutfs_inode_queue_iput(inode);
+                                scoutfs_inode_queue_iput(inode, 0);
                                 goto out;
                         }
 
@@ -2075,7 +2110,7 @@ int scoutfs_inode_walk_writeback(struct super_block *sb, bool write)
                 if (!write)
                         list_del_init(&si->writeback_entry);
 
-                scoutfs_inode_queue_iput(inode);
+                scoutfs_inode_queue_iput(inode, 0);
         }
         spin_unlock(&inf->writeback_lock);
 
@@ -2100,7 +2135,15 @@ int scoutfs_inode_setup(struct super_block *sb)
         spin_lock_init(&inf->ino_alloc.lock);
         INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker);
         INIT_WORK(&inf->iput_work, iput_worker);
-        init_llist_head(&inf->iput_llist);
+        spin_lock_init(&inf->iput_lock);
+        INIT_LIST_HEAD(&inf->iput_list);
+
+        /* re-entrant: the worker serializes with itself and with queueing via iput_lock */
+        inf->iput_workq = alloc_workqueue("scoutfs_inode_iput", WQ_UNBOUND, 0);
+        if (!inf->iput_workq) {
+                kfree(inf);
+                return -ENOMEM;
+        }
 
         sbi->inode_sb_info = inf;
 
@@ -2136,14 +2179,18 @@ void scoutfs_inode_flush_iput(struct super_block *sb)
         DECLARE_INODE_SB_INFO(sb, inf);
 
         if (inf)
-                flush_work(&inf->iput_work);
+                flush_workqueue(inf->iput_workq);
 }
 
 void scoutfs_inode_destroy(struct super_block *sb)
 {
         struct inode_sb_info *inf = SCOUTFS_SB(sb)->inode_sb_info;
 
-        kfree(inf);
+        if (inf) {
+                if (inf->iput_workq)
+                        destroy_workqueue(inf->iput_workq);
+                kfree(inf);
+        }
 }
 
 void scoutfs_inode_exit(void)
diff --git a/kmod/src/inode.h b/kmod/src/inode.h
index 6feb3f9e..79eacde0 100644
--- a/kmod/src/inode.h
+++ b/kmod/src/inode.h
@@ -56,14 +56,16 @@ struct scoutfs_inode_info {
 
         struct scoutfs_lock_coverage ino_lock_cov;
 
-        /* drop if i_count hits 0, allows drop while invalidate holds coverage */
-        bool drop_invalidated;
-        struct llist_node iput_llnode;
-        atomic_t iput_count;
+        struct list_head iput_head;
+        unsigned long iput_count;
+        unsigned long iput_flags;
 
         struct inode inode;
 };
 
+/* try to prune dcache aliases with queued iput */
+#define SI_IPUT_FLAG_PRUNE (1 << 0)
+
 static inline struct scoutfs_inode_info *SCOUTFS_I(struct inode *inode)
 {
         return container_of(inode, struct scoutfs_inode_info, inode);
@@ -78,7 +80,7 @@ struct inode *scoutfs_alloc_inode(struct super_block *sb);
 void scoutfs_destroy_inode(struct inode *inode);
 int scoutfs_drop_inode(struct inode *inode);
 void scoutfs_evict_inode(struct inode *inode);
-void scoutfs_inode_queue_iput(struct inode *inode);
+void scoutfs_inode_queue_iput(struct inode *inode, unsigned long flags);
 
 #define SCOUTFS_IGF_LINKED (1 << 0) /* enoent if nlink == 0 */
 struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf);
diff --git a/kmod/src/lock.c b/kmod/src/lock.c
index 00e04248..bd726cec 100644
--- a/kmod/src/lock.c
+++ b/kmod/src/lock.c
@@ -130,16 +130,13 @@ static bool lock_modes_match(int granted, int requested)
  * allows deletions to be performed by unlink without having to wait for
  * remote cached inodes to be dropped.
  *
- * If the cached inode was already deferring final inode deletion then
- * we can't perform that inline in invalidation. The locking alone
- * deadlock, and it might also take multiple transactions to fully
- * delete an inode with significant metadata. We only perform the iput
- * inline if we know that possible eviction can't perform the final
- * deletion, otherwise we kick it off to async work.
+ * We kick the d_prune and iput off to async work because they can end
+ * up in the final iput and inode eviction item deletion, which would
+ * deadlock. d_prune->dput can also end up performing iput on parents
+ * covered by entirely different locks.
  */
 static void invalidate_inode(struct super_block *sb, u64 ino)
 {
-        DECLARE_LOCK_INFO(sb, linfo);
         struct scoutfs_inode_info *si;
         struct inode *inode;
@@ -153,19 +150,9 @@ static void invalidate_inode(struct super_block *sb, u64 ino)
                         scoutfs_data_wait_changed(inode);
                 }
 
-                /* can't touch during unmount, dcache destroys w/o locks */
-                if (!linfo->unmounting)
-                        d_prune_aliases(inode);
-                forget_all_cached_acls(inode);
-                si->drop_invalidated = true;
-                if (scoutfs_lock_is_covered(sb, &si->ino_lock_cov) && inode->i_nlink > 0) {
-                        iput(inode);
-                } else {
-                        /* defer iput to work context so we don't evict inodes from invalidation */
-                        scoutfs_inode_queue_iput(inode);
-                }
+                scoutfs_inode_queue_iput(inode, SI_IPUT_FLAG_PRUNE);
         }
 }
@@ -201,16 +188,6 @@ static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,
         /* have to invalidate if we're not in the only usable case */
         if (!(prev == SCOUTFS_LOCK_WRITE && mode == SCOUTFS_LOCK_READ)) {
 retry:
-                /* invalidate inodes before removing coverage */
-                if (lock->start.sk_zone == SCOUTFS_FS_ZONE) {
-                        ino = le64_to_cpu(lock->start.ski_ino);
-                        last = le64_to_cpu(lock->end.ski_ino);
-                        while (ino <= last) {
-                                invalidate_inode(sb, ino);
-                                ino++;
-                        }
-                }
-
                 /* remove cov items to tell users that their cache is stale */
                 spin_lock(&lock->cov_list_lock);
                 list_for_each_entry_safe(cov, tmp, &lock->cov_list, head) {
@@ -226,6 +203,16 @@ retry:
                 }
                 spin_unlock(&lock->cov_list_lock);
 
+                /* invalidate inodes after removing coverage so drop/evict aren't covered */
+                if (lock->start.sk_zone == SCOUTFS_FS_ZONE) {
+                        ino = le64_to_cpu(lock->start.ski_ino);
+                        last = le64_to_cpu(lock->end.ski_ino);
+                        while (ino <= last) {
+                                invalidate_inode(sb, ino);
+                                ino++;
+                        }
+                }
+
                 scoutfs_item_invalidate(sb, &lock->start, &lock->end);
         }
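
The reordering in lock_invalidate() above (coverage removed before invalidate_inode() queues the deferred iput) is what lets drop and evict see an uncovered inode. Here is a small standalone sketch of that ordering argument, assuming scoutfs_drop_inode() reduces to the covered/nlink test; the model_inode, drop_inode, and invalidate names are hypothetical:

#include <stdbool.h>
#include <stdio.h>

/* just the state the ordering argument needs */
struct model_inode {
        bool covered;           /* cov item still attached to the lock */
        unsigned int nlink;
};

/* simplified scoutfs_drop_inode(): nonzero means the icache drops the inode now */
static int drop_inode(const struct model_inode *ino)
{
        return !ino->covered || ino->nlink == 0;
}

/* the deferred iput eventually asks drop_inode(); report what it would decide */
static void invalidate(struct model_inode *ino, bool remove_coverage_first)
{
        if (remove_coverage_first)
                ino->covered = false;   /* cov items removed before invalidate_inode() */

        printf("remove_coverage_first=%d -> drop=%d\n",
               remove_coverage_first, drop_inode(ino));

        ino->covered = false;           /* coverage goes away afterwards either way */
}

int main(void)
{
        struct model_inode linked_a = { .covered = true, .nlink = 1 };
        struct model_inode linked_b = { .covered = true, .nlink = 1 };

        invalidate(&linked_a, true);    /* drop=1: the unused linked inode leaves the cache */
        invalidate(&linked_b, false);   /* drop=0: a covered linked inode would stay cached */
        return 0;
}

If invalidation ran before coverage removal, the new drop test would still see coverage on a linked inode and keep it cached, which is what the removed drop_invalidated flag used to work around.
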
diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h
index 75a2f2bb..17bb9a03 100644
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -691,16 +691,16 @@ TRACE_EVENT(scoutfs_evict_inode,
 
 TRACE_EVENT(scoutfs_drop_inode,
         TP_PROTO(struct super_block *sb, __u64 ino, unsigned int nlink,
-                 unsigned int unhashed, bool drop_invalidated),
+                 unsigned int unhashed, bool lock_covered),
 
-        TP_ARGS(sb, ino, nlink, unhashed, drop_invalidated),
+        TP_ARGS(sb, ino, nlink, unhashed, lock_covered),
 
         TP_STRUCT__entry(
                 SCSB_TRACE_FIELDS
                 __field(__u64, ino)
                 __field(unsigned int, nlink)
                 __field(unsigned int, unhashed)
-                __field(unsigned int, drop_invalidated)
+                __field(unsigned int, lock_covered)
         ),
 
         TP_fast_assign(
@@ -708,12 +708,12 @@ TRACE_EVENT(scoutfs_drop_inode,
                 __entry->ino = ino;
                 __entry->nlink = nlink;
                 __entry->unhashed = unhashed;
-                __entry->drop_invalidated = !!drop_invalidated;
+                __entry->lock_covered = !!lock_covered;
         ),
 
-        TP_printk(SCSBF" ino %llu nlink %u unhashed %d drop_invalidated %u", SCSB_TRACE_ARGS,
+        TP_printk(SCSBF" ino %llu nlink %u unhashed %d lock_covered %u", SCSB_TRACE_ARGS,
                   __entry->ino, __entry->nlink, __entry->unhashed,
-                  __entry->drop_invalidated)
+                  __entry->lock_covered)
 );
 
 TRACE_EVENT(scoutfs_inode_walk_writeback,