diff --git a/kmod/src/counters.h b/kmod/src/counters.h index b3e68bd4..bb53e7a4 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -152,11 +152,11 @@ EXPAND_COUNTER(net_recv_messages) \ EXPAND_COUNTER(net_unknown_request) \ EXPAND_COUNTER(orphan_scan) \ + EXPAND_COUNTER(orphan_scan_attempts) \ EXPAND_COUNTER(orphan_scan_cached) \ EXPAND_COUNTER(orphan_scan_error) \ EXPAND_COUNTER(orphan_scan_item) \ EXPAND_COUNTER(orphan_scan_omap_set) \ - EXPAND_COUNTER(orphan_scan_read) \ EXPAND_COUNTER(quorum_elected) \ EXPAND_COUNTER(quorum_fence_error) \ EXPAND_COUNTER(quorum_fence_leader) \ diff --git a/kmod/src/inode.c b/kmod/src/inode.c index b71dffdd..737c8a6c 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -66,10 +66,6 @@ struct inode_sb_info { struct delayed_work orphan_scan_dwork; - /* serialize multiple inode ->evict trying to delete same ino's items */ - spinlock_t deleting_items_lock; - struct list_head deleting_items_list; - struct work_struct iput_work; struct llist_head iput_llist; }; @@ -662,22 +658,12 @@ void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off) } while (read_seqcount_retry(&si->seqcount, seq)); } -/* - * We have inversions between getting cluster locks while performing - * final deletion on a freeing inode and waiting on a freeing inode - * while holding a cluster lock. - * - * We can avoid these deadlocks by hiding freeing inodes in our hash - * lookup function. We're fine with either returning null or populating - * a new inode overlapping with eviction freeing a previous instance of - * the inode. - */ static int scoutfs_iget_test(struct inode *inode, void *arg) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); u64 *ino = arg; - return (si->ino == *ino) && !(inode->i_state & I_FREEING); + return si->ino == *ino; } static int scoutfs_iget_set(struct inode *inode, void *arg) @@ -691,11 +677,35 @@ static int scoutfs_iget_set(struct inode *inode, void *arg) return 0; } -struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino) +/* + * There's a risk of a deadlock between lock invalidation and eviction. + * Invalidation blocks locks while looking up inodes. Eviction blocks + * inode lookups while trying to get a lock. + * + * We have an inode lookup variant which will never block waiting for an + * inode. This is more aggressive than base ilookup5_nowait() which + * will, you know, wait for inodes that are being freed. We have our + * test function hide those inodes from find_inode so that it won't wait + * on them. + * + * These semantics are sufficiently weird that we use a big giant scary + * looking function name to deter use. + */ +static int ilookup_test_nonewfree(struct inode *inode, void *arg) { - return ilookup5(sb, ino, scoutfs_iget_test, &ino); + return scoutfs_iget_test(inode, arg) && + !(inode->i_state & (I_NEW | I_WILL_FREE | I_FREEING)); +} +struct inode *scoutfs_ilookup_nowait_nonewfree(struct super_block *sb, u64 ino) +{ + return ilookup5_nowait(sb, ino, ilookup_test_nonewfree, &ino); } +/* + * Final iput can delete an unused inode's items which can take multiple + * locked transactions. iget (which can call iput in error cases) and + * iput must not be called with locks or transactions held. 
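+ *
+ * An illustrative calling pattern, with both calls made outside of any
+ * held cluster locks or transactions (SCOUTFS_IGF_LINKED is shown only
+ * as an example of the igf flags):
+ *
+ *	inode = scoutfs_iget(sb, ino, 0, SCOUTFS_IGF_LINKED);
+ *	if (IS_ERR(inode))
+ *		return PTR_ERR(inode);
+ *	... acquire and release locks/transactions as needed ...
+ *	iput(inode);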
+ */ struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf) { struct scoutfs_lock *lock = NULL; @@ -703,32 +713,36 @@ struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf) struct inode *inode = NULL; int ret; - ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, lkf, ino, &lock); - if (ret < 0) - goto out; - + /* wait for vfs inode (I_FREEING in particular) before acquiring cluster lock */ inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set, &ino); if (!inode) { ret = -ENOMEM; goto out; } + ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, lkf, ino, &lock); + if (ret < 0) + goto out; + if (inode->i_state & I_NEW) { /* XXX ensure refresh, instead clear in drop_inode? */ si = SCOUTFS_I(inode); atomic64_set(&si->last_refreshed, 0); inode->i_version = 0; + } - ret = scoutfs_inode_refresh(inode, lock); - if (ret < 0) - goto out; + ret = scoutfs_inode_refresh(inode, lock); + if (ret < 0) + goto out; - if ((igf & SCOUTFS_IGF_LINKED) && inode->i_nlink == 0) { - ret = -ENOENT; - goto out; - } + /* check nlink both for new and after refreshing */ + if ((igf & SCOUTFS_IGF_LINKED) && inode->i_nlink == 0) { + ret = -ENOENT; + goto out; + } - ret = scoutfs_omap_inc(sb, ino); + if (inode->i_state & I_NEW) { + ret = scoutfs_omap_set(sb, ino); if (ret < 0) goto out; @@ -741,8 +755,12 @@ out: scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); if (ret < 0) { - if (inode) - iget_failed(inode); + if (inode) { + if (inode->i_state & I_NEW) + iget_failed(inode); + else + iput(inode); + } inode = ERR_PTR(ret); } @@ -1434,13 +1452,13 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, store_inode(&sinode, inode); scoutfs_inode_init_key(&key, scoutfs_ino(inode)); - ret = scoutfs_omap_inc(sb, ino); + ret = scoutfs_omap_set(sb, ino); if (ret < 0) goto out; ret = scoutfs_item_create(sb, &key, &sinode, sizeof(sinode), lock); if (ret < 0) - scoutfs_omap_dec(sb, ino); + scoutfs_omap_clear(sb, ino); out: if (ret) { iput(inode); @@ -1482,44 +1500,6 @@ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_ return scoutfs_item_delete_force(sb, &key, lock); } -struct deleting_ino_entry { - struct list_head head; - u64 ino; -}; - -static bool added_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del, u64 ino) -{ - struct deleting_ino_entry *tmp; - bool added = true; - - spin_lock(&inf->deleting_items_lock); - - list_for_each_entry(tmp, &inf->deleting_items_list, head) { - if (tmp->ino == ino) { - added = false; - break; - } - } - - if (added) { - del->ino = ino; - list_add_tail(&del->head, &inf->deleting_items_list); - } - - spin_unlock(&inf->deleting_items_lock); - - return added; -} - -static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del) -{ - if (del->ino) { - spin_lock(&inf->deleting_items_lock); - list_del_init(&del->head); - spin_unlock(&inf->deleting_items_lock); - } -} - /* * Remove all the items associated with a given inode. This is only * called once nlink has dropped to zero and nothing has the inode open @@ -1528,22 +1508,10 @@ static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entr * orphan item will continue triggering attempts to finish previous * partial deletion until all deletion is complete and the orphan item * is removed. - * - * Currently this can be called multiple times for multiple cached - * inodes for a given ino number (ilookup avoids freeing inodes to avoid - * cluster lock<->inode flag waiting inversions). 
Some items are not - * safe to delete concurrently, for example concurrent data truncation - * could free extents multiple times. We use a very silly list of inos - * being deleted. Duplicates just return success. If the first - * deletion ends up failing orphan deletion will come back around later - * and retry. */ -static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock, - struct scoutfs_lock *orph_lock) +static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_inode *sinode, + struct scoutfs_lock *lock, struct scoutfs_lock *orph_lock) { - DECLARE_INODE_SB_INFO(sb, inf); - struct deleting_ino_entry del = {{NULL, }}; - struct scoutfs_inode sinode; struct scoutfs_key key; LIST_HEAD(ind_locks); bool release = false; @@ -1552,30 +1520,10 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo u64 size; int ret; - if (!added_deleting_ino(inf, &del, ino)) { - ret = 0; - goto out; - } - scoutfs_inode_init_key(&key, ino); - ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode), - lock); - if (ret < 0) { - if (ret == -ENOENT) - ret = 0; - goto out; - } - - /* XXX corruption, inode probably won't be freed without repair */ - if (le32_to_cpu(sinode.nlink)) { - scoutfs_warn(sb, "Dangling orphan item for inode %llu.", ino); - ret = -EIO; - goto out; - } - - mode = le32_to_cpu(sinode.mode); - size = le64_to_cpu(sinode.size); + mode = le32_to_cpu(sinode->mode); + size = le64_to_cpu(sinode->size); trace_scoutfs_delete_inode(sb, ino, mode, size); /* remove data items in their own transactions */ @@ -1593,7 +1541,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo /* then delete the small known number of remaining inode items */ retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: - prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?: + prepare_index_deletion(sb, &ind_locks, ino, mode, sinode) ?: scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false); if (ret > 0) goto retry; @@ -1602,7 +1550,7 @@ retry: release = true; - ret = remove_index_items(sb, ino, &sinode, &ind_locks); + ret = remove_index_items(sb, ino, sinode, &ind_locks); if (ret) goto out; @@ -1612,15 +1560,21 @@ retry: goto out; } - ret = scoutfs_item_delete(sb, &key, lock); - if (ret) + /* make sure inode item and orphan are deleted together */ + ret = scoutfs_item_dirty(sb, &key, lock); + if (ret < 0) goto out; ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock); - if (ret == 0) - scoutfs_forest_dec_inode_count(sb); + if (ret < 0) + goto out; + + ret = scoutfs_item_delete(sb, &key, lock); + BUG_ON(ret != 0); /* dirtying should have guaranteed success */ + + scoutfs_forest_dec_inode_count(sb); + out: - del_deleting_ino(inf, &del); if (release) scoutfs_release_trans(sb); scoutfs_inode_index_unlock(sb, &ind_locks); @@ -1628,48 +1582,192 @@ out: return ret; } +struct inode_deletion_lock_data { + wait_queue_head_t waitq; + atomic64_t seq; + struct scoutfs_open_ino_map map; + unsigned long trying[DIV_ROUND_UP(SCOUTFS_OPEN_INO_MAP_BITS, BITS_PER_LONG)]; +}; + /* - * iput_final has already written out the dirty pages to the inode - * before we get here. We're left with a clean inode that we have to - * tear down. We use locking and open inode number bitmaps to decide if - * we should finally destroy an inode that is no longer open nor - * reachable through directory entries. + * Get a lock data struct that has the current omap from this hold of + * the lock. 
The lock data is saved on the lock so it can be used + * multiple times until the lock is refreshed. Only one task will send + * an omap request at a time, and errors are only returned by each task + * as it gets a response to its send. + */ +static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lock, + struct inode_deletion_lock_data **ldata_ret, u64 group_nr) +{ + struct inode_deletion_lock_data *ldata; + u64 seq; + int ret; + + /* we're storing omap maps in locks, they need to cover the same number of inodes */ + BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR); + + /* allocate a new lock data struct as needed */ + while ((ldata = cmpxchg(&lock->inode_deletion_data, NULL, NULL)) == NULL) { + ldata = kzalloc(sizeof(struct inode_deletion_lock_data), GFP_NOFS); + if (!ldata) { + ret = -ENOMEM; + goto out; + } + + atomic64_set(&ldata->seq, lock->write_seq - 1); /* ensure refresh */ + init_waitqueue_head(&ldata->waitq); + + /* the lock kfrees the inode_deletion_data pointer along with the lock */ + if (cmpxchg(&lock->inode_deletion_data, NULL, ldata) == NULL) + break; + else + kfree(ldata); + } + + /* make sure that the lock's data is current */ + while ((seq = atomic64_read(&ldata->seq)) != lock->write_seq) { + if (seq != U64_MAX && atomic64_cmpxchg(&ldata->seq, seq, U64_MAX) == seq) { + /* ask the server for current omap */ + ret = scoutfs_client_open_ino_map(sb, group_nr, &ldata->map); + if (ret == 0) + atomic64_set(&ldata->seq, lock->write_seq); + else + atomic64_set(&ldata->seq, lock->write_seq - 1); + wake_up(&ldata->waitq); + if (ret < 0) + goto out; + } else { + /* wait for someone else who's sent a request */ + wait_event(ldata->waitq, atomic64_read(&ldata->seq) != U64_MAX); + } + } + + ret = 0; +out: + if (ret < 0) + ldata = NULL; + *ldata_ret = ldata; + return ret; +} + +/* + * Try to delete all the items for an unused inode number. This is the + * relatively slow path that uses cluster locks, network requests, and + * IO to ensure correctness. Callers should try hard to avoid calling + * when there's no work to do. * - * Because lookup ignores freeing inodes we can get here from multiple - * instances of an inode that is being deleted. Orphan scanning in - * particular can race with deletion. delete_inode_items() resolves - * concurrent attempts. + * Inode references are added under cluster locks. In-memory vfs cache + * references are added under read cluster locks and are visible in omap + * bitmaps. Directory entry references are added under write cluster + * locks and are visible in the inode's nlink. Orphan items exist + * whenever nlink == 0 and are maintained under write cluster locks. + * Directory entries can be added to an inode with nlink == 0 to + * instantiate tmpfile inodes into the name space. Cached inodes will + * not be created for inodes with an nlink of 0. + * + * Combining all this we know that it's safe to delete an inode's items + * when we hold an exclusive write cluster lock, the inode has nlink == + * 0, and an omap request protected by the lock doesn't have the inode's + * bit set. + * + * This is called by orphan scanning and vfs inode cache eviction after + * they've checked that the inode could really be deleted. We serialize + * on a bit in the lock data so that we only have one deletion attempt + * per inode under this mount's cluster lock. 
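+ *
+ * Roughly, the deletion-safety condition checked below amounts to
+ * (illustrative, not literal code):
+ *
+ *	can_delete = holding the ino group's write cluster lock &&
+ *		     sinode.nlink == 0 &&
+ *		     !scoutfs_omap_test(sb, ino) &&
+ *		     !test_bit_le(bit_nr, ldata->map.bits)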
+ */ +static int try_delete_inode_items(struct super_block *sb, u64 ino) +{ + struct inode_deletion_lock_data *ldata = NULL; + struct scoutfs_lock *orph_lock = NULL; + struct scoutfs_lock *lock = NULL; + struct scoutfs_inode sinode; + struct scoutfs_key key; + u64 group_nr; + int bit_nr; + int ret; + + ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &lock); + if (ret < 0) + goto out; + + scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr); + + ret = get_current_lock_data(sb, lock, &ldata, group_nr); + if (ret < 0) + goto out; + + /* only one local attempt per inode at a time */ + if (test_and_set_bit(bit_nr, ldata->trying)) { + ret = 0; + goto out; + } + + /* can't delete if it's cached in local or remote mounts */ + if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) { + ret = 0; + goto out; + } + + scoutfs_inode_init_key(&key, ino); + ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode), lock); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } + + if (le32_to_cpu(sinode.nlink) > 0) { + ret = 0; + goto out; + } + + ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock); + if (ret < 0) + goto out; + + ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock); +out: + if (ldata) + clear_bit(bit_nr, ldata->trying); + + scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE); + scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY); + + return ret; +} + +/* + * As we drop an inode we need to decide to try and delete its items or + * not, which is expensive. The two common cases we want to get right + * both have cluster lock coverage and don't want to delete. Dropping + * unused inodes during read lock invalidation has the current lock and + * sees a nonzero nlink and knows not to delete. Final iput after a + * local unlink also has a lock, sees a zero nlink, and tries to perform + * item deletion in the task that dropped the last link, as users + * expect. + * + * Evicting an inode outside of cluster locking is the odd slow path + * that involves lock contention during use the worst cross-mount + * open-unlink/delete case. 
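+ *
+ * Roughly, the check in scoutfs_evict_inode() below distinguishes:
+ *
+ *	lock covered, nlink > 0		skip deletion (invalidation drop)
+ *	lock covered, nlink == 0	try deletion (final iput after local unlink)
+ *	lock not covered		try deletion (slow path, acquires cluster locks)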
*/ void scoutfs_evict_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; + struct scoutfs_inode_info *si = SCOUTFS_I(inode); const u64 ino = scoutfs_ino(inode); - struct scoutfs_lock *orph_lock; - struct scoutfs_lock *lock; - int ret; - trace_scoutfs_evict_inode(inode->i_sb, scoutfs_ino(inode), - inode->i_nlink, is_bad_inode(inode)); + trace_scoutfs_evict_inode(sb, ino, inode->i_nlink, is_bad_inode(inode)); - if (is_bad_inode(inode)) - goto clear; + if (!is_bad_inode(inode)) { + truncate_inode_pages_final(&inode->i_data); - truncate_inode_pages_final(&inode->i_data); + /* clear before trying to delete tests */ + scoutfs_omap_clear(sb, ino); - ret = scoutfs_omap_should_delete(sb, inode, &lock, &orph_lock); - if (ret > 0) { - ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock, orph_lock); - scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE); - scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY); - } - if (ret < 0) { - scoutfs_err(sb, "error %d while checking to delete inode nr %llu, it might linger.", - ret, ino); + if (!scoutfs_lock_is_covered(sb, &si->ino_lock_cov) || inode->i_nlink == 0) + try_delete_inode_items(sb, scoutfs_ino(inode)); } - scoutfs_omap_dec(sb, ino); - -clear: clear_inode(inode); } @@ -1772,11 +1870,10 @@ void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb) * Find and delete inodes whose only remaining reference is the * persistent orphan item that was created as they were unlinked. * - * Orphan items are created as the final directory entry referring to an - * inode is deleted. They're deleted as the final cached inode is - * evicted and the inode items are destroyed. They can linger if all - * the cached inodes pinning the inode fail to delete as they are - * evicted from the cache -- either through crashing or errors. + * Orphan items are maintained for inodes that have an nlink of 0. + * Typically this is from unlink, but tmpfiles are created with orphans. + * They're deleted as the final cached inode is evicted and the inode + * items are destroyed. * * This work runs in all mounts in the background looking for those * orphaned inodes that weren't fully deleted. @@ -1785,20 +1882,16 @@ void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb) * only find orphan items that made it to the fs root after being merged * from a mount's log btree. This naturally avoids orphan items that * exist while inodes have been unlinked but are still cached, including - * O_TMPFILE inodes that are actively used during normal operations. + * tmpfile inodes that are actively used during normal operations. * Scanning the read-only persistent fs root uses cached blocks and * avoids the lock contention we'd cause if we tried to use the * consistent item cache. The downside is that it adds a bit of - * latency. If an orphan was created in error it'll take until the - * mount's log btree is finalized and merged. A crash will have the log - * btree merged after it is fenced. + * latency. * - * Once we find candidate orphan items we can first check our local - * inode cache for inodes that are already on their way to eviction and - * can be skipped. Then we ask the server for the open map containing - * the inode. Only if we don't have it cached, and no one else does, do - * we try and read it into our cache and evict it to trigger the final - * inode deletion process. + * Once we find candidate orphan items we first check our local omap for + * a locally cached inode. Then we ask the server for the open map + * containing the inode. 
Only if we don't see any cached users do we do + * the expensive work of acquiring locks to try and delete the items. */ static void inode_orphan_scan_worker(struct work_struct *work) { @@ -1810,7 +1903,6 @@ static void inode_orphan_scan_worker(struct work_struct *work) SCOUTFS_BTREE_ITEM_REF(iref); struct scoutfs_key last; struct scoutfs_key key; - struct inode *inode; u64 group_nr; int bit_nr; u64 ino; @@ -1849,11 +1941,9 @@ static void inode_orphan_scan_worker(struct work_struct *work) scoutfs_inc_counter(sb, orphan_scan_item); ino = le64_to_cpu(key.sko_ino); - /* locally cached inodes will already be deleted */ - inode = scoutfs_ilookup(sb, ino); - if (inode) { + /* locally cached inodes will try to delete as they evict */ + if (scoutfs_omap_test(sb, ino)) { scoutfs_inc_counter(sb, orphan_scan_cached); - iput(inode); continue; } @@ -1866,25 +1956,15 @@ static void inode_orphan_scan_worker(struct work_struct *work) goto out; } - /* don't need to evict if someone else has it open (cached) */ + /* remote cached inodes will also try to delete */ if (test_bit_le(bit_nr, omap.bits)) { scoutfs_inc_counter(sb, orphan_scan_omap_set); continue; } - /* try to cached and evict unused inode to delete, can be racing */ - inode = scoutfs_iget(sb, ino, 0, 0); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - if (ret == -ENOENT) - continue; - else - goto out; - } - - scoutfs_inc_counter(sb, orphan_scan_read); - SCOUTFS_I(inode)->drop_invalidated = true; - iput(inode); + /* seemingly orphaned and unused, get locks and check for sure */ + scoutfs_inc_counter(sb, orphan_scan_attempts); + ret = try_delete_inode_items(sb, ino); } ret = 0; @@ -2001,8 +2081,6 @@ int scoutfs_inode_setup(struct super_block *sb) spin_lock_init(&inf->dir_ino_alloc.lock); spin_lock_init(&inf->ino_alloc.lock); INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker); - spin_lock_init(&inf->deleting_items_lock); - INIT_LIST_HEAD(&inf->deleting_items_list); INIT_WORK(&inf->iput_work, iput_worker); init_llist_head(&inf->iput_llist); diff --git a/kmod/src/inode.h b/kmod/src/inode.h index e855e8f1..4e650007 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -82,7 +82,9 @@ void scoutfs_inode_queue_iput(struct inode *inode); #define SCOUTFS_IGF_LINKED (1 << 0) /* enoent if nlink == 0 */ struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf); -struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino); +struct inode *scoutfs_ilookup_nowait(struct super_block *sb, u64 ino); +struct inode *scoutfs_ilookup_nowait_nonewfree(struct super_block *sb, u64 ino); + void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino); void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major, diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index d30a5021..1ae9da1f 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -387,7 +387,7 @@ static long scoutfs_ioc_data_wait_err(struct file *file, unsigned long arg) if (sblock > eblock) return -EINVAL; - inode = scoutfs_ilookup(sb, args.ino); + inode = scoutfs_ilookup_nowait_nonewfree(sb, args.ino); if (!inode) { ret = -ESTALE; goto out; diff --git a/kmod/src/lock.c b/kmod/src/lock.c index f0d0238d..b04e18ee 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -142,7 +142,7 @@ static void invalidate_inode(struct super_block *sb, u64 ino) struct scoutfs_inode_info *si; struct inode *inode; - inode = scoutfs_ilookup(sb, ino); + inode = scoutfs_ilookup_nowait_nonewfree(sb, ino); if (inode) { si = SCOUTFS_I(inode); @@ -255,7 +255,7 @@ static void 
lock_free(struct lock_info *linfo, struct scoutfs_lock *lock) BUG_ON(!list_empty(&lock->shrink_head)); BUG_ON(!list_empty(&lock->cov_list)); - scoutfs_omap_free_lock_data(lock->omap_data); + kfree(lock->inode_deletion_data); kfree(lock); } @@ -291,7 +291,6 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb, lock->mode = SCOUTFS_LOCK_NULL; atomic64_set(&lock->forest_bloom_nr, 0); - spin_lock_init(&lock->omap_spinlock); trace_scoutfs_lock_alloc(sb, lock); diff --git a/kmod/src/lock.h b/kmod/src/lock.h index 5d7a3ce7..a1d9b903 100644 --- a/kmod/src/lock.h +++ b/kmod/src/lock.h @@ -11,7 +11,7 @@ #define SCOUTFS_LOCK_NR_MODES SCOUTFS_LOCK_INVALID -struct scoutfs_omap_lock; +struct inode_deletion_lock_data; /* * A few fields (start, end, refresh_gen, write_seq, granted_mode) @@ -47,9 +47,8 @@ struct scoutfs_lock { /* the forest tracks which log tree last saw bloom bit updates */ atomic64_t forest_bloom_nr; - /* open ino mapping has a valid map for a held write lock */ - spinlock_t omap_spinlock; - struct scoutfs_omap_lock_data *omap_data; + /* inode deletion tracks some state per lock */ + struct inode_deletion_lock_data *inode_deletion_data; }; struct scoutfs_lock_coverage { diff --git a/kmod/src/omap.c b/kmod/src/omap.c index a441897b..604c397e 100644 --- a/kmod/src/omap.c +++ b/kmod/src/omap.c @@ -30,27 +30,22 @@ /* * As a client removes an inode from its cache with an nlink of 0 it * needs to decide if it is the last client using the inode and should - * fully delete all its items. It needs to know if other mounts still - * have the inode in use. + * fully delete all the inode's items. It needs to know if other mounts + * still have the inode in use. * - * We need a way to communicate between mounts that an inode is open. + * We need a way to communicate between mounts that an inode is in use. * We don't want to pay the synchronous per-file locking round trip * costs associated with per-inode open locks that you'd typically see - * in systems to solve this problem. + * in systems to solve this problem. The first prototypes of this + * tracked open file handles so this was coined the open map, though it + * now tracks cached inodes. * - * Instead clients maintain open bitmaps that cover groups of inodes. - * As inodes enter the cache their bit is set, and as the inode is - * evicted the bit is cleared. As an inode is evicted messages are sent - * around the cluster to get the current bitmaps for that inode's group - * from all active mounts. If the inode's bit is clear then it can be - * deleted. - * - * We associate the open bitmaps with our cluster locking of inode - * groups to cache these open bitmaps. As long as we have the lock then - * nlink can't be changed on any remote mounts. Specifically, it can't - * increase from 0 so any clear bits can gain references on remote - * mounts. As long as we have the lock, all clear bits in the group for - * inodes with 0 nlink can be deleted. + * Clients maintain bitmaps that cover groups of inodes. As inodes + * enter the cache their bit is set and as the inode is evicted the bit + * is cleared. As deletion is attempted, either by scanning orphans or + * evicting an inode with an nlink of 0, messages are sent around the + * cluster to get the current bitmaps for that inode's group from all + * active mounts. If the inode's bit is clear then it can be deleted. * * This layer maintains a list of client rids to send messages to. The * server calls us as clients enter and leave the cluster. 
We can't @@ -85,14 +80,12 @@ struct omap_info { struct omap_info *name = SCOUTFS_SB(sb)->omap_info /* - * The presence of an inode in the inode cache increases the count of - * its inode number's position within its lock group. These structs - * track the counts for all the inodes in a lock group and maintain a - * bitmap whose bits are set for each non-zero count. + * The presence of an inode in the inode sets its bit in the lock + * group's bitmap. * * We don't want to add additional global synchronization of inode cache * maintenance so these are tracked in an rcu hash table. Once their - * total count reaches zero they're removed from the hash and queued for + * total reaches zero they're removed from the hash and queued for * freeing and readers should ignore them. */ struct omap_group { @@ -102,7 +95,6 @@ struct omap_group { u64 nr; spinlock_t lock; unsigned int total; - unsigned int *counts; __le64 bits[SCOUTFS_OPEN_INO_MAP_LE64S]; }; @@ -111,8 +103,7 @@ do { \ __typeof__(group) _grp = (group); \ __typeof__(bit_nr) _nr = (bit_nr); \ \ - trace_scoutfs_omap_group_##which(sb, _grp, _grp->nr, _grp->total, _nr, \ - _nr < 0 ? -1 : _grp->counts[_nr]); \ + trace_scoutfs_omap_group_##which(sb, _grp, _grp->nr, _grp->total, _nr); \ } while (0) /* @@ -134,18 +125,6 @@ struct omap_request { struct scoutfs_open_ino_map map; }; -/* - * In each inode group cluster lock we store data to track the open ino - * map which tracks all the inodes that the cluster lock covers. When - * the seq shows that the map is stale we send a request to update it. - */ -struct scoutfs_omap_lock_data { - u64 seq; - bool req_in_flight; - wait_queue_head_t waitq; - struct scoutfs_open_ino_map map; -}; - static inline void init_rid_list(struct omap_rid_list *list) { INIT_LIST_HEAD(&list->head); @@ -242,21 +221,13 @@ static struct omap_group *alloc_group(struct super_block *sb, u64 group_nr) { struct omap_group *group; - BUILD_BUG_ON((sizeof(group->counts[0]) * SCOUTFS_OPEN_INO_MAP_BITS) > PAGE_SIZE); - group = kzalloc(sizeof(struct omap_group), GFP_NOFS); if (group) { group->sb = sb; group->nr = group_nr; spin_lock_init(&group->lock); - group->counts = (void *)get_zeroed_page(GFP_NOFS); - if (!group->counts) { - kfree(group); - group = NULL; - } else { - trace_group(sb, alloc, group, -1); - } + trace_group(sb, alloc, group, -1); } return group; @@ -265,7 +236,6 @@ static struct omap_group *alloc_group(struct super_block *sb, u64 group_nr) static void free_group(struct super_block *sb, struct omap_group *group) { trace_group(sb, free, group, -1); - free_page((unsigned long)group->counts); kfree(group); } @@ -283,13 +253,16 @@ static const struct rhashtable_params group_ht_params = { }; /* - * Track an cached inode in its group. Our increment can be racing with - * a final decrement that removes the group from the hash, sets total to + * Track an cached inode in its group. Our set can be racing with a + * final clear that removes the group from the hash, sets total to * UINT_MAX, and calls rcu free. We can retry until the dead group is * no longer visible in the hash table and we can insert a new allocated * group. + * + * The caller must ensure that the bit is clear, -EEXIST will be + * returned otherwise. 
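+ *
+ * For illustration, the expected lifecycle of a group bit is:
+ *
+ *	scoutfs_omap_set(sb, ino)	as an inode enters the cache
+ *	scoutfs_omap_test(sb, ino)	while deciding whether deletion is safe
+ *	scoutfs_omap_clear(sb, ino)	as the cached inode is evicted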
*/ -int scoutfs_omap_inc(struct super_block *sb, u64 ino) +int scoutfs_omap_set(struct super_block *sb, u64 ino) { DECLARE_OMAP_INFO(sb, ominf); struct omap_group *group; @@ -308,10 +281,10 @@ retry: spin_lock(&group->lock); if (group->total < UINT_MAX) { found = true; - if (group->counts[bit_nr]++ == 0) { - set_bit_le(bit_nr, group->bits); + if (WARN_ON_ONCE(test_and_set_bit_le(bit_nr, group->bits))) + ret = -EEXIST; + else group->total++; - } } trace_group(sb, inc, group, bit_nr); spin_unlock(&group->lock); @@ -342,12 +315,34 @@ retry: return ret; } +bool scoutfs_omap_test(struct super_block *sb, u64 ino) +{ + DECLARE_OMAP_INFO(sb, ominf); + struct omap_group *group; + bool ret = false; + u64 group_nr; + int bit_nr; + + scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr); + + rcu_read_lock(); + group = rhashtable_lookup(&ominf->group_ht, &group_nr, group_ht_params); + if (group) { + spin_lock(&group->lock); + ret = !!test_bit_le(bit_nr, group->bits); + spin_unlock(&group->lock); + } + rcu_read_unlock(); + + return ret; +} + /* - * Decrement a previously incremented ino count. Not finding a count - * implies imbalanced inc/dec or bugs freeing groups. We only free - * groups here as the last dec drops the group's total count to 0. + * Clear a previously set ino bit. Trying to clear a bit that's already + * clear implies imbalanced set/clear or bugs freeing groups. We only + * free groups here as the last clear drops the group's total to 0. */ -void scoutfs_omap_dec(struct super_block *sb, u64 ino) +void scoutfs_omap_clear(struct super_block *sb, u64 ino) { DECLARE_OMAP_INFO(sb, ominf); struct omap_group *group; @@ -360,11 +355,10 @@ void scoutfs_omap_dec(struct super_block *sb, u64 ino) group = rhashtable_lookup(&ominf->group_ht, &group_nr, group_ht_params); if (group) { spin_lock(&group->lock); - WARN_ON_ONCE(group->counts[bit_nr] == 0); + WARN_ON_ONCE(!test_bit_le(bit_nr, group->bits)); WARN_ON_ONCE(group->total == 0); WARN_ON_ONCE(group->total == UINT_MAX); - if (--group->counts[bit_nr] == 0) { - clear_bit_le(bit_nr, group->bits); + if (test_and_clear_bit_le(bit_nr, group->bits)) { if (--group->total == 0) { group->total = UINT_MAX; rhashtable_remove_fast(&ominf->group_ht, &group->ht_head, @@ -664,8 +658,7 @@ int scoutfs_omap_server_handle_request(struct super_block *sb, u64 rid, u64 id, /* * The client is receiving a request from the server for its map for the - * given group. Look up the group and copy the bits to the map for - * non-zero open counts. + * given group. Look up the group and copy the bits to the map. * * The mount originating the request for this bitmap has the inode group * write locked. We can't be adding links to any inodes in the group @@ -814,179 +807,6 @@ void scoutfs_omap_server_shutdown(struct super_block *sb) synchronize_rcu(); } -static bool omap_req_in_flight(struct scoutfs_lock *lock, struct scoutfs_omap_lock_data *ldata) -{ - bool in_flight; - - spin_lock(&lock->omap_spinlock); - in_flight = ldata->req_in_flight; - spin_unlock(&lock->omap_spinlock); - - return in_flight; -} - -/* - * Make sure the map covered by the cluster lock is current. The caller - * holds the cluster lock so once we store lock_data on the cluster lock - * it won't be freed and the write_seq in the cluster lock won't change. - * - * The omap_spinlock protects the omap_data in the cluster lock. We - * have to drop it if we have to block to allocate lock_data, send a - * request for a new map, or wait for a request in flight to finish. 
- */ -static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lock, - struct scoutfs_omap_lock_data **ldata_ret, u64 group_nr) -{ - struct scoutfs_omap_lock_data *ldata; - bool send_req; - int ret = 0; - - spin_lock(&lock->omap_spinlock); - - ldata = lock->omap_data; - if (ldata == NULL) { - spin_unlock(&lock->omap_spinlock); - ldata = kzalloc(sizeof(struct scoutfs_omap_lock_data), GFP_NOFS); - spin_lock(&lock->omap_spinlock); - - if (!ldata) { - ret = -ENOMEM; - goto out; - } - - if (lock->omap_data == NULL) { - ldata->seq = lock->write_seq - 1; /* ensure refresh */ - init_waitqueue_head(&ldata->waitq); - - lock->omap_data = ldata; - } else { - kfree(ldata); - ldata = lock->omap_data; - } - } - - while (ldata->seq != lock->write_seq) { - /* only one waiter sends a request at a time */ - if (!ldata->req_in_flight) { - ldata->req_in_flight = true; - send_req = true; - } else { - send_req = false; - } - - spin_unlock(&lock->omap_spinlock); - if (send_req) - ret = scoutfs_client_open_ino_map(sb, group_nr, &ldata->map); - else - wait_event(ldata->waitq, !omap_req_in_flight(lock, ldata)); - spin_lock(&lock->omap_spinlock); - - /* only sender can return error, other waiters retry */ - if (send_req) { - ldata->req_in_flight = false; - if (ret == 0) - ldata->seq = lock->write_seq; - wake_up(&ldata->waitq); - if (ret < 0) - goto out; - } - } - -out: - spin_unlock(&lock->omap_spinlock); - - if (ret == 0) - *ldata_ret = ldata; - else - *ldata_ret = NULL; - - return ret; -} - -/* - * Return 1 and give the caller their locks when they should delete the - * inode items. It's safe to delete the inode items when it is no - * longer reachable and nothing is referencing it. - * - * The inode is unreachable when nlink hits zero. Cluster locks protect - * modification and testing of nlink. We use the ino_lock_cov covrage - * to short circuit the common case of having a locked inode that hasn't - * been deleted. If it isn't locked, we have to acquire the lock to - * refresh the inode to see its current nlink. - * - * Then we use an open inode bitmap that covers all the inodes in the - * lock group to determine if the inode is present in any other mount's - * caches. We refresh it by asking the server for all clients' maps and - * then store it in the lock. As long as we hold the lock nothing can - * increase nlink from zero and let people get a reference to the inode. 
- */ -int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode, - struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret) -{ - struct scoutfs_inode_info *si = SCOUTFS_I(inode); - struct scoutfs_lock *orph_lock = NULL; - struct scoutfs_lock *lock = NULL; - const u64 ino = scoutfs_ino(inode); - struct scoutfs_omap_lock_data *ldata; - u64 group_nr; - int bit_nr; - int ret; - int err; - - /* lock group and omap constants are defined independently */ - BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR); - - if (scoutfs_lock_is_covered(sb, &si->ino_lock_cov) && inode->i_nlink > 0) { - ret = 0; - goto out; - } - - ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, inode, &lock); - if (ret < 0) - goto out; - - if (inode->i_nlink > 0) { - ret = 0; - goto out; - } - - scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr); - - /* only one request to refresh the map at a time */ - ret = get_current_lock_data(sb, lock, &ldata, group_nr); - if (ret < 0) - goto out; - - /* can delete caller's zero nlink inode if it's not cached in other mounts */ - ret = !test_bit_le(bit_nr, ldata->map.bits); -out: - trace_scoutfs_omap_should_delete(sb, ino, inode->i_nlink, ret); - - if (ret > 0) { - err = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock); - if (err < 0) - ret = err; - } - - if (ret <= 0) { - scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE); - lock = NULL; - } - - *lock_ret = lock; - *orph_lock_ret = orph_lock; - return ret; -} - -void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata) -{ - if (ldata) { - WARN_ON_ONCE(ldata->req_in_flight); - WARN_ON_ONCE(waitqueue_active(&ldata->waitq)); - kfree(ldata); - } -} - int scoutfs_omap_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); diff --git a/kmod/src/omap.h b/kmod/src/omap.h index 4c6787a2..3f09d8a9 100644 --- a/kmod/src/omap.h +++ b/kmod/src/omap.h @@ -1,11 +1,9 @@ #ifndef _SCOUTFS_OMAP_H_ #define _SCOUTFS_OMAP_H_ -int scoutfs_omap_inc(struct super_block *sb, u64 ino); -void scoutfs_omap_dec(struct super_block *sb, u64 ino); -int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode, - struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret); -void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata); +int scoutfs_omap_set(struct super_block *sb, u64 ino); +bool scoutfs_omap_test(struct super_block *sb, u64 ino); +void scoutfs_omap_clear(struct super_block *sb, u64 ino); int scoutfs_omap_client_handle_request(struct super_block *sb, u64 id, struct scoutfs_open_ino_map_args *args); void scoutfs_omap_calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr); diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 80db3247..22a892c6 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -2620,9 +2620,9 @@ TRACE_EVENT(scoutfs_item_invalidate_page, DECLARE_EVENT_CLASS(scoutfs_omap_group_class, TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total, - int bit_nr, int bit_count), + int bit_nr), - TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count), + TP_ARGS(sb, grp, group_nr, group_total, bit_nr), TP_STRUCT__entry( SCSB_TRACE_FIELDS @@ -2630,7 +2630,6 @@ DECLARE_EVENT_CLASS(scoutfs_omap_group_class, __field(__u64, group_nr) __field(unsigned int, group_total) __field(int, bit_nr) - __field(int, bit_count) ), TP_fast_assign( @@ -2639,43 +2638,42 @@ DECLARE_EVENT_CLASS(scoutfs_omap_group_class, __entry->group_nr = group_nr; 
__entry->group_total = group_total; __entry->bit_nr = bit_nr; - __entry->bit_count = bit_count; ), - TP_printk(SCSBF" grp %p group_nr %llu group_total %u bit_nr %d bit_count %d", + TP_printk(SCSBF" grp %p group_nr %llu group_total %u bit_nr %d", SCSB_TRACE_ARGS, __entry->grp, __entry->group_nr, __entry->group_total, - __entry->bit_nr, __entry->bit_count) + __entry->bit_nr) ); DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_alloc, TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total, - int bit_nr, int bit_count), - TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count) + int bit_nr), + TP_ARGS(sb, grp, group_nr, group_total, bit_nr) ); DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_free, TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total, - int bit_nr, int bit_count), - TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count) + int bit_nr), + TP_ARGS(sb, grp, group_nr, group_total, bit_nr) ); DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_inc, TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total, - int bit_nr, int bit_count), - TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count) + int bit_nr), + TP_ARGS(sb, grp, group_nr, group_total, bit_nr) ); DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_dec, TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total, - int bit_nr, int bit_count), - TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count) + int bit_nr), + TP_ARGS(sb, grp, group_nr, group_total, bit_nr) ); DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_request, TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total, - int bit_nr, int bit_count), - TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count) + int bit_nr), + TP_ARGS(sb, grp, group_nr, group_total, bit_nr) ); DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_destroy, TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total, - int bit_nr, int bit_count), - TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count) + int bit_nr), + TP_ARGS(sb, grp, group_nr, group_total, bit_nr) ); TRACE_EVENT(scoutfs_omap_should_delete,