diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 97a68aeb..87f270c3 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -84,6 +84,7 @@ static void scoutfs_inode_ctor(void *obj) init_rwsem(&si->xattr_rwsem); RB_CLEAR_NODE(&si->writeback_node); scoutfs_lock_init_coverage(&si->ino_lock_cov); + atomic_set(&si->inv_iput_count, 0); inode_init_once(&si->inode); } @@ -313,6 +314,7 @@ int scoutfs_inode_refresh(struct inode *inode, struct scoutfs_lock *lock, load_inode(inode, &sinode); atomic64_set(&si->last_refreshed, refresh_gen); scoutfs_lock_add_coverage(sb, lock, &si->ino_lock_cov); + si->drop_invalidated = false; } } else { ret = 0; @@ -1393,6 +1395,7 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, si->have_item = false; atomic64_set(&si->last_refreshed, lock->refresh_gen); scoutfs_lock_add_coverage(sb, lock, &si->ino_lock_cov); + si->drop_invalidated = false; si->flags = 0; scoutfs_inode_set_meta_seq(inode); @@ -1586,13 +1589,30 @@ clear: clear_inode(inode); } +/* + * We want to remove inodes from the cache as their count goes to 0 if + * they're no longer covered by a cluster lock or if while locked they + * were unlinked. + * + * We don't want unused cached inodes to linger outside of cluster + * locking so that they don't prevent final inode deletion on other + * nodes. We don't have specific per-inode or per-dentry locks which + * would otherwise remove the stale caches as they're invalidated. + * Stale cached inodes provide little value because they're going to be + * refreshed the next time they're locked. Populating the item cache + * and loading the inode item is a lot more expensive than initializing + * and inserting a newly allocated vfs inode. 
+ */ int scoutfs_drop_inode(struct inode *inode) { - int ret = generic_drop_inode(inode); + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + struct super_block *sb = inode->i_sb; - trace_scoutfs_drop_inode(inode->i_sb, scoutfs_ino(inode), - inode->i_nlink, inode_unhashed(inode)); - return ret; + trace_scoutfs_drop_inode(sb, scoutfs_ino(inode), inode->i_nlink, inode_unhashed(inode), + si->drop_invalidated); + + return si->drop_invalidated || !scoutfs_lock_is_covered(sb, &si->ino_lock_cov) || + generic_drop_inode(inode); } /* diff --git a/kmod/src/inode.h b/kmod/src/inode.h index 60213d73..070d6492 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -53,6 +53,11 @@ struct scoutfs_inode_info { struct scoutfs_lock_coverage ino_lock_cov; + /* drop if i_count hits 0, allows drop while invalidate holds coverage */ + bool drop_invalidated; + struct llist_node inv_iput_llnode; + atomic_t inv_iput_count; + struct inode inode; }; diff --git a/kmod/src/lock.c b/kmod/src/lock.c index f761ba49..ba893bfe 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -75,6 +75,7 @@ struct lock_info { struct super_block *sb; spinlock_t lock; bool shutdown; + bool unmounting; struct rb_root lock_tree; struct rb_root lock_range_tree; struct shrinker shrinker; @@ -88,6 +89,9 @@ struct lock_info { struct work_struct shrink_work; struct list_head shrink_list; atomic64_t next_refresh_gen; + struct work_struct inv_iput_work; + struct llist_head inv_iput_llist; + struct dentry *tseq_dentry; struct scoutfs_tseq_tree tseq_tree; }; @@ -122,12 +126,53 @@ static bool lock_modes_match(int granted, int requested) requested == SCOUTFS_LOCK_READ); } +/* + * Final iput can get into evict and perform final inode deletion which + * can delete a lot of items under locks and transactions. We really + * don't want to be doing all that in an iput during invalidation. When + * invalidation sees that iput might perform final deletion it puts them + * on a list and queues this work. 
+ * + * Nothing stops multiple puts for multiple invalidations of an inode + * before the work runs so we can track multiple puts in flight. + */ +static void lock_inv_iput_worker(struct work_struct *work) +{ + struct lock_info *linfo = container_of(work, struct lock_info, inv_iput_work); + struct scoutfs_inode_info *si; + struct scoutfs_inode_info *tmp; + struct llist_node *inodes; + bool more; + + inodes = llist_del_all(&linfo->inv_iput_llist); + + llist_for_each_entry_safe(si, tmp, inodes, inv_iput_llnode) { + do { + more = atomic_dec_return(&si->inv_iput_count) > 0; + iput(&si->inode); + } while (more); + } +} + /* * invalidate cached data associated with an inode whose lock is going * away. + * + * We try to drop cached dentries and inodes covered by the lock if they + * aren't referenced. This removes them from the mount's open map and + * allows deletions to be performed by unlink without having to wait for + * remote cached inodes to be dropped. + * + * If the cached inode was already deferring final inode deletion then + * we can't perform that inline in invalidation. The locking alone + * could deadlock, and it might also take multiple transactions to fully + * delete an inode with significant metadata. We only perform the iput + * inline if we know that possible eviction can't perform the final + * deletion, otherwise we kick it off to async work. 
*/ static void invalidate_inode(struct super_block *sb, u64 ino) { + DECLARE_LOCK_INFO(sb, linfo); struct scoutfs_inode_info *si; struct inode *inode; @@ -141,7 +186,20 @@ static void invalidate_inode(struct super_block *sb, u64 ino) scoutfs_data_wait_changed(inode); } - iput(inode); + /* can't touch during unmount, dcache destroys w/o locks */ + if (!linfo->unmounting) + d_prune_aliases(inode); + + si->drop_invalidated = true; + if (scoutfs_lock_is_covered(sb, &si->ino_lock_cov) && inode->i_nlink > 0) { + iput(inode); + } else { + /* defer iput to work context so we don't evict inodes from invalidation */ + if (atomic_inc_return(&si->inv_iput_count) == 1) + llist_add(&si->inv_iput_llnode, &linfo->inv_iput_llist); + smp_wmb(); /* count and list visible before work executes */ + queue_work(linfo->workq, &linfo->inv_iput_work); + } } } @@ -1536,11 +1594,21 @@ static void lock_tseq_show(struct seq_file *m, struct scoutfs_tseq_entry *ent) } /* - * The caller is going to be calling _destroy soon and, critically, is - * about to shutdown networking before calling us so that we don't get - * any callbacks while we're destroying. We have to ensure that we - * won't call networking after this returns. - * + * shrink_dcache_for_umount() tears down dentries with no locking. We + * need to make sure that our invalidation won't touch dentries before + * we return and the caller calls the generic vfs unmount path. + */ +void scoutfs_lock_unmount_begin(struct super_block *sb) +{ + DECLARE_LOCK_INFO(sb, linfo); + + if (linfo) { + linfo->unmounting = true; + flush_delayed_work(&linfo->inv_dwork); + } +} + +/* * Internal fs threads can be using locking, and locking can have async * work pending. 
We use ->shutdown to force callers to return * -ESHUTDOWN and to prevent the future queueing of work that could call @@ -1682,6 +1750,8 @@ int scoutfs_lock_setup(struct super_block *sb) INIT_WORK(&linfo->shrink_work, lock_shrink_worker); INIT_LIST_HEAD(&linfo->shrink_list); atomic64_set(&linfo->next_refresh_gen, 0); + INIT_WORK(&linfo->inv_iput_work, lock_inv_iput_worker); + init_llist_head(&linfo->inv_iput_llist); scoutfs_tseq_tree_init(&linfo->tseq_tree, lock_tseq_show); sbi->lock_info = linfo; diff --git a/kmod/src/lock.h b/kmod/src/lock.h index 5485bb35..bf9a9b8a 100644 --- a/kmod/src/lock.h +++ b/kmod/src/lock.h @@ -104,6 +104,7 @@ bool scoutfs_lock_protected(struct scoutfs_lock *lock, struct scoutfs_key *key, void scoutfs_free_unused_locks(struct super_block *sb, unsigned long nr); int scoutfs_lock_setup(struct super_block *sb); +void scoutfs_lock_unmount_begin(struct super_block *sb); void scoutfs_lock_shutdown(struct super_block *sb); void scoutfs_lock_destroy(struct super_block *sb); diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 8d1209e7..7dce85f0 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -690,15 +690,16 @@ TRACE_EVENT(scoutfs_evict_inode, TRACE_EVENT(scoutfs_drop_inode, TP_PROTO(struct super_block *sb, __u64 ino, unsigned int nlink, - unsigned int unhashed), + unsigned int unhashed, bool drop_invalidated), - TP_ARGS(sb, ino, nlink, unhashed), + TP_ARGS(sb, ino, nlink, unhashed, drop_invalidated), TP_STRUCT__entry( SCSB_TRACE_FIELDS __field(__u64, ino) __field(unsigned int, nlink) __field(unsigned int, unhashed) + __field(unsigned int, drop_invalidated) ), TP_fast_assign( @@ -706,10 +707,12 @@ TRACE_EVENT(scoutfs_drop_inode, __entry->ino = ino; __entry->nlink = nlink; __entry->unhashed = unhashed; + __entry->drop_invalidated = !!drop_invalidated; ), - TP_printk(SCSBF" ino %llu nlink %u unhashed %d", SCSB_TRACE_ARGS, - __entry->ino, __entry->nlink, __entry->unhashed) + TP_printk(SCSBF" ino %llu nlink %u 
unhashed %d drop_invalidated %u", SCSB_TRACE_ARGS, + __entry->ino, __entry->nlink, __entry->unhashed, + __entry->drop_invalidated) ); TRACE_EVENT(scoutfs_inode_walk_writeback, diff --git a/kmod/src/super.c b/kmod/src/super.c index 27b2f832..5338ec65 100644 --- a/kmod/src/super.c +++ b/kmod/src/super.c @@ -649,6 +649,9 @@ static void scoutfs_kill_sb(struct super_block *sb) { trace_scoutfs_kill_sb(sb); + if (SCOUTFS_HAS_SBI(sb)) + scoutfs_lock_unmount_begin(sb); + kill_block_super(sb); }