Defer dirty inode data writeback (and use list)

iput() can only be used in contexts that could perform final inode
deletion, which requires cluster locks and transactions.  This is
absolutely true for the transaction committing worker.  We can't have
deletion during transaction commit trying to get locks and dirty *more*
items in the transaction.

Now that we're properly getting locks in final inode deletion and
O_TMPFILE support has put pressure on deletion, we're seeing deadlocks
between inode eviction during transaction commit getting an index lock
and index lock invalidation trying to commit.

We use the newly added queued iput to defer the iputs issued while
walking our dirty inodes.  The transaction commit will be able to
proceed while the iput worker is off waiting for a lock.

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2021-07-13 14:11:08 -07:00
parent 52107424dd
commit c51f0c37da
2 changed files with 30 additions and 70 deletions

View File

@@ -59,7 +59,7 @@ struct inode_sb_info {
bool stopped;
spinlock_t writeback_lock;
struct rb_root writeback_inodes;
struct list_head writeback_list;
struct inode_allocator dir_ino_alloc;
struct inode_allocator ino_alloc;
@@ -95,7 +95,7 @@ static void scoutfs_inode_ctor(void *obj)
atomic64_set(&si->data_waitq.changed, 0);
init_waitqueue_head(&si->data_waitq.waitq);
init_rwsem(&si->xattr_rwsem);
RB_CLEAR_NODE(&si->writeback_node);
INIT_LIST_HEAD(&si->writeback_entry);
scoutfs_lock_init_coverage(&si->ino_lock_cov);
atomic_set(&si->iput_count, 0);
@@ -121,47 +121,14 @@ static void scoutfs_i_callback(struct rcu_head *head)
kmem_cache_free(scoutfs_inode_cachep, SCOUTFS_I(inode));
}
/*
 * Insert @ins into the per-superblock rbtree of inodes with dirty
 * data, keyed by inode number.
 *
 * Duplicate inode numbers BUG(): the caller checks RB_EMPTY_NODE()
 * under writeback_lock before inserting, so hitting an equal key here
 * would indicate tree corruption.
 *
 * NOTE(review): these are the removed lines of this commit's diff —
 * the rbtree is being replaced by a simple list of dirty inodes.
 */
static void insert_writeback_inode(struct inode_sb_info *inf,
struct scoutfs_inode_info *ins)
{
struct rb_root *root = &inf->writeback_inodes;
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct scoutfs_inode_info *si;
/* standard rbtree descent ordered on ->ino */
while (*node) {
parent = *node;
si = container_of(*node, struct scoutfs_inode_info,
writeback_node);
if (ins->ino < si->ino)
node = &(*node)->rb_left;
else if (ins->ino > si->ino)
node = &(*node)->rb_right;
else
BUG();
}
rb_link_node(&ins->writeback_node, parent, node);
rb_insert_color(&ins->writeback_node, root);
}
/*
 * Remove @si from the writeback rbtree if it was ever inserted.
 * RB_EMPTY_NODE()/RB_CLEAR_NODE() make this safe to call repeatedly
 * and on inodes that were never queued for writeback.
 *
 * NOTE(review): removed by this commit along with the rbtree it
 * operates on; list_del_init() takes over this role.
 */
static void remove_writeback_inode(struct inode_sb_info *inf,
struct scoutfs_inode_info *si)
{
if (!RB_EMPTY_NODE(&si->writeback_node)) {
rb_erase(&si->writeback_node, &inf->writeback_inodes);
RB_CLEAR_NODE(&si->writeback_node);
}
}
void scoutfs_destroy_inode(struct inode *inode)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
DECLARE_INODE_SB_INFO(inode->i_sb, inf);
spin_lock(&inf->writeback_lock);
remove_writeback_inode(inf, SCOUTFS_I(inode));
if (!list_empty(&si->writeback_entry))
list_del_init(&si->writeback_entry);
spin_unlock(&inf->writeback_lock);
scoutfs_lock_del_coverage(inode->i_sb, &si->ino_lock_cov);
@@ -1889,30 +1856,33 @@ out:
* ourselves in knots trying to call through the high level vfs sync
* methods.
*
* File data block allocations tend to advance through free space so we
* add the inode to the end of the list to roughly encourage sequential
* IO.
*
* This is called by writers who hold the inode and transaction. The
* inode's presence in the rbtree is removed by destroy_inode, prevented
* by the inode hold, and by committing the transaction, which is
* prevented by holding the transaction. The inode can only go from
* empty to on the rbtree while we're here.
* inode is removed from the list by evict->destroy if it's unlinked
* during the transaction or by committing the transaction. Pruning the
* icache won't try to evict the inode as long as it has dirty buffers.
*/
void scoutfs_inode_queue_writeback(struct inode *inode)
{
DECLARE_INODE_SB_INFO(inode->i_sb, inf);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
if (RB_EMPTY_NODE(&si->writeback_node)) {
if (list_empty(&si->writeback_entry)) {
spin_lock(&inf->writeback_lock);
if (RB_EMPTY_NODE(&si->writeback_node))
insert_writeback_inode(inf, si);
if (list_empty(&si->writeback_entry))
list_add_tail(&si->writeback_entry, &inf->writeback_list);
spin_unlock(&inf->writeback_lock);
}
}
/*
* Walk our dirty inodes in ino order and either start dirty page
* writeback or wait for writeback to complete.
* Walk our dirty inodes and either start dirty page writeback or wait
* for writeback to complete.
*
* This is called by transaction commiting so other writers are
* This is called by transaction committing so other writers are
* excluded. We're still very careful to iterate over the tree while it
* and the inodes could be changing.
*
@@ -1925,29 +1895,19 @@ int scoutfs_inode_walk_writeback(struct super_block *sb, bool write)
{
DECLARE_INODE_SB_INFO(sb, inf);
struct scoutfs_inode_info *si;
struct rb_node *node;
struct scoutfs_inode_info *tmp;
struct inode *inode;
struct inode *defer_iput = NULL;
int ret;
spin_lock(&inf->writeback_lock);
node = rb_first(&inf->writeback_inodes);
while (node) {
si = container_of(node, struct scoutfs_inode_info,
writeback_node);
node = rb_next(node);
list_for_each_entry_safe(si, tmp, &inf->writeback_list, writeback_entry) {
inode = igrab(&si->inode);
if (!inode)
continue;
spin_unlock(&inf->writeback_lock);
if (defer_iput) {
iput(defer_iput);
defer_iput = NULL;
}
if (write)
ret = filemap_fdatawrite(inode->i_mapping);
else
@@ -1955,28 +1915,28 @@ int scoutfs_inode_walk_writeback(struct super_block *sb, bool write)
trace_scoutfs_inode_walk_writeback(sb, scoutfs_ino(inode),
write, ret);
if (ret) {
iput(inode);
scoutfs_inode_queue_iput(inode);
goto out;
}
spin_lock(&inf->writeback_lock);
if (WARN_ON_ONCE(RB_EMPTY_NODE(&si->writeback_node)))
node = rb_first(&inf->writeback_inodes);
/* restore tmp after reacquiring lock */
if (WARN_ON_ONCE(list_empty(&si->writeback_entry)))
tmp = list_first_entry(&inf->writeback_list, struct scoutfs_inode_info,
writeback_entry);
else
node = rb_next(&si->writeback_node);
tmp = list_next_entry(si, writeback_entry);
if (!write)
remove_writeback_inode(inf, si);
list_del_init(&si->writeback_entry);
/* avoid iput->destroy lock deadlock */
defer_iput = inode;
scoutfs_inode_queue_iput(inode);
}
spin_unlock(&inf->writeback_lock);
out:
if (defer_iput)
iput(defer_iput);
return ret;
}
@@ -1991,7 +1951,7 @@ int scoutfs_inode_setup(struct super_block *sb)
inf->sb = sb;
spin_lock_init(&inf->writeback_lock);
inf->writeback_inodes = RB_ROOT;
INIT_LIST_HEAD(&inf->writeback_list);
spin_lock_init(&inf->dir_ino_alloc.lock);
spin_lock_init(&inf->ino_alloc.lock);
INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker);

View File

@@ -49,7 +49,7 @@ struct scoutfs_inode_info {
struct scoutfs_per_task pt_data_lock;
struct scoutfs_data_waitq data_waitq;
struct rw_semaphore xattr_rwsem;
struct rb_node writeback_node;
struct list_head writeback_entry;
struct scoutfs_lock_coverage ino_lock_cov;