diff --git a/kmod/src/client.c b/kmod/src/client.c index 4799fcc5..0f1006d0 100644 --- a/kmod/src/client.c +++ b/kmod/src/client.c @@ -477,12 +477,15 @@ static void scoutfs_client_connect_worker(struct work_struct *work) struct super_block *sb = client->sb; struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_super_block *super = &sbi->super; - struct mount_options *opts = &sbi->opts; - const bool am_quorum = opts->quorum_slot_nr >= 0; + struct scoutfs_mount_options opts; struct scoutfs_net_greeting greet; struct sockaddr_in sin; + bool am_quorum; int ret; + scoutfs_options_read(sb, &opts); + am_quorum = opts.quorum_slot_nr >= 0; + /* can unmount once server farewell handling removes our item */ if (client->sending_farewell && lookup_mounted_client_item(sb, sbi->rid) == 0) { diff --git a/kmod/src/counters.h b/kmod/src/counters.h index b3e68bd4..bb53e7a4 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -152,11 +152,11 @@ EXPAND_COUNTER(net_recv_messages) \ EXPAND_COUNTER(net_unknown_request) \ EXPAND_COUNTER(orphan_scan) \ + EXPAND_COUNTER(orphan_scan_attempts) \ EXPAND_COUNTER(orphan_scan_cached) \ EXPAND_COUNTER(orphan_scan_error) \ EXPAND_COUNTER(orphan_scan_item) \ EXPAND_COUNTER(orphan_scan_omap_set) \ - EXPAND_COUNTER(orphan_scan_read) \ EXPAND_COUNTER(quorum_elected) \ EXPAND_COUNTER(quorum_fence_error) \ EXPAND_COUNTER(quorum_fence_leader) \ diff --git a/kmod/src/dir.c b/kmod/src/dir.c index a4b6dc09..00734909 100644 --- a/kmod/src/dir.c +++ b/kmod/src/dir.c @@ -720,7 +720,7 @@ static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry, struct list_head *ind_locks) { struct super_block *sb = dir->i_sb; - struct inode *inode; + struct inode *inode = NULL; u64 ind_seq; int ret = 0; u64 ino; @@ -765,11 +765,9 @@ retry: if (ret) goto out_unlock; - inode = scoutfs_new_inode(sb, dir, mode, rdev, ino, *inode_lock); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); + ret = scoutfs_new_inode(sb, dir, mode, rdev, ino, *inode_lock, &inode); + if (ret < 0) goto out; - } ret = scoutfs_dirty_inode_item(dir, *dir_lock); out: @@ -787,6 +785,8 @@ out_unlock: *orph_lock = NULL; } + if (!IS_ERR_OR_NULL(inode)) + iput(inode); inode = ERR_PTR(ret); } @@ -1319,11 +1319,11 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry, insert_inode_hash(inode); /* XXX need to set i_op/fop before here for sec callbacks */ d_instantiate(dentry, inode); + inode = NULL; + ret = 0; out: if (ret < 0) { /* XXX remove inode items */ - if (!IS_ERR_OR_NULL(inode)) - iput(inode); symlink_item_ops(sb, SYM_DELETE, scoutfs_ino(inode), inode_lock, NULL, name_len); @@ -1334,6 +1334,9 @@ out: scoutfs_unlock(sb, dir_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE); + if (!IS_ERR_OR_NULL(inode)) + iput(inode); + return ret; } @@ -1923,10 +1926,8 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod si = SCOUTFS_I(inode); ret = scoutfs_inode_orphan_create(sb, scoutfs_ino(inode), orph_lock); - if (ret < 0) { - iput(inode); + if (ret < 0) goto out; /* XXX returning error but items created */ - } inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; si->crtime = inode->i_mtime; @@ -1939,7 +1940,6 @@ static int scoutfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mod scoutfs_update_inode_item(inode, inode_lock, &ind_locks); scoutfs_update_inode_item(dir, dir_lock, &ind_locks); scoutfs_inode_index_unlock(sb, &ind_locks); - iput(inode); out: scoutfs_release_trans(sb); @@ -1948,6 +1948,9 @@ out: 
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE); scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY); + if (!IS_ERR_OR_NULL(inode)) + iput(inode); + return ret; } diff --git a/kmod/src/fence.c b/kmod/src/fence.c index 7f989e74..ff617634 100644 --- a/kmod/src/fence.c +++ b/kmod/src/fence.c @@ -395,12 +395,13 @@ int scoutfs_fence_wait_fenced(struct super_block *sb, long timeout_jiffies) int scoutfs_fence_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct mount_options *opts = &sbi->opts; + struct scoutfs_mount_options opts; struct fence_info *fi; int ret; /* can only fence if we can be elected by quorum */ - if (opts->quorum_slot_nr == -1) { + scoutfs_options_read(sb, &opts); + if (opts.quorum_slot_nr == -1) { ret = 0; goto out; } diff --git a/kmod/src/inode.c b/kmod/src/inode.c index ac711ac8..9d22f52a 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -66,10 +66,6 @@ struct inode_sb_info { struct delayed_work orphan_scan_dwork; - /* serialize multiple inode ->evict trying to delete same ino's items */ - spinlock_t deleting_items_lock; - struct list_head deleting_items_list; - struct work_struct iput_work; struct llist_head iput_llist; }; @@ -662,22 +658,12 @@ void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off) } while (read_seqcount_retry(&si->seqcount, seq)); } -/* - * We have inversions between getting cluster locks while performing - * final deletion on a freeing inode and waiting on a freeing inode - * while holding a cluster lock. - * - * We can avoid these deadlocks by hiding freeing inodes in our hash - * lookup function. We're fine with either returning null or populating - * a new inode overlapping with eviction freeing a previous instance of - * the inode. - */ static int scoutfs_iget_test(struct inode *inode, void *arg) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); u64 *ino = arg; - return (si->ino == *ino) && !(inode->i_state & I_FREEING); + return si->ino == *ino; } static int scoutfs_iget_set(struct inode *inode, void *arg) @@ -691,11 +677,35 @@ static int scoutfs_iget_set(struct inode *inode, void *arg) return 0; } -struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino) +/* + * There's a risk of a deadlock between lock invalidation and eviction. + * Invalidation blocks locks while looking up inodes. Eviction blocks + * inode lookups while trying to get a lock. + * + * We have an inode lookup variant which will never block waiting for an + * inode. This is more aggressive than base ilookup5_nowait() which + * will, you know, wait for inodes that are being freed. We have our + * test function hide those inodes from find_inode so that it won't wait + * on them. + * + * These semantics are sufficiently weird that we use a big giant scary + * looking function name to deter use. + */ +static int ilookup_test_nonewfree(struct inode *inode, void *arg) { - return ilookup5(sb, ino, scoutfs_iget_test, &ino); + return scoutfs_iget_test(inode, arg) && + !(inode->i_state & (I_NEW | I_WILL_FREE | I_FREEING)); +} +struct inode *scoutfs_ilookup_nowait_nonewfree(struct super_block *sb, u64 ino) +{ + return ilookup5_nowait(sb, ino, ilookup_test_nonewfree, &ino); } +/* + * Final iput can delete an unused inode's items which can take multiple + * locked transactions. iget (which can call iput in error cases) and + * iput must not be called with locks or transactions held. 
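A minimal sketch of the calling pattern the dir.c hunks above converge on, assuming a caller shaped roughly like scoutfs_tmpfile() in this patch (local names and the out: label are simplified): the inode pointer is carried through the error path and only put after the transaction and cluster locks have been released.

	struct inode *inode = NULL;
	int ret;

	ret = scoutfs_new_inode(sb, dir, mode, rdev, ino, inode_lock, &inode);
	if (ret < 0)
		goto out;	/* keep the inode pointer, iput() comes after unlock */
	/* ... link the inode and update items under the transaction ... */
out:
	scoutfs_release_trans(sb);
	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_WRITE);
	if (!IS_ERR_OR_NULL(inode))
		iput(inode);	/* safe here: no locks or transaction held */
	return ret;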
+ */ struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf) { struct scoutfs_lock *lock = NULL; @@ -703,32 +713,36 @@ struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf) struct inode *inode = NULL; int ret; - ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, lkf, ino, &lock); - if (ret < 0) - goto out; - + /* wait for vfs inode (I_FREEING in particular) before acquiring cluster lock */ inode = iget5_locked(sb, ino, scoutfs_iget_test, scoutfs_iget_set, &ino); if (!inode) { ret = -ENOMEM; goto out; } + ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, lkf, ino, &lock); + if (ret < 0) + goto out; + if (inode->i_state & I_NEW) { /* XXX ensure refresh, instead clear in drop_inode? */ si = SCOUTFS_I(inode); atomic64_set(&si->last_refreshed, 0); inode->i_version = 0; + } - ret = scoutfs_inode_refresh(inode, lock); - if (ret < 0) - goto out; + ret = scoutfs_inode_refresh(inode, lock); + if (ret < 0) + goto out; - if ((igf & SCOUTFS_IGF_LINKED) && inode->i_nlink == 0) { - ret = -ENOENT; - goto out; - } + /* check nlink both for new and after refreshing */ + if ((igf & SCOUTFS_IGF_LINKED) && inode->i_nlink == 0) { + ret = -ENOENT; + goto out; + } - ret = scoutfs_omap_inc(sb, ino); + if (inode->i_state & I_NEW) { + ret = scoutfs_omap_set(sb, ino); if (ret < 0) goto out; @@ -741,8 +755,12 @@ out: scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ); if (ret < 0) { - if (inode) - iget_failed(inode); + if (inode) { + if (inode->i_state & I_NEW) + iget_failed(inode); + else + iput(inode); + } inode = ERR_PTR(ret); } @@ -1393,10 +1411,14 @@ out: /* * Allocate and initialize a new inode. The caller is responsible for * creating links to it and updating it. @dir can be null. + * + * This is called with locks and a transaction because it creates the + * inode item. We can't call iput on the new inode on error. We + * return the inode to the caller *including on error* for them to put + * once they've released the transaction. 
*/ -struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, - umode_t mode, dev_t rdev, u64 ino, - struct scoutfs_lock *lock) +int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t rdev, + u64 ino, struct scoutfs_lock *lock, struct inode **inode_ret) { struct scoutfs_inode_info *si; struct scoutfs_key key; @@ -1405,8 +1427,10 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, int ret; inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); + if (!inode) { + ret = -ENOMEM; + goto out; + } si = SCOUTFS_I(inode); si->ino = ino; @@ -1434,20 +1458,17 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, store_inode(&sinode, inode); scoutfs_inode_init_key(&key, scoutfs_ino(inode)); - ret = scoutfs_omap_inc(sb, ino); + ret = scoutfs_omap_set(sb, ino); if (ret < 0) goto out; ret = scoutfs_item_create(sb, &key, &sinode, sizeof(sinode), lock); if (ret < 0) - scoutfs_omap_dec(sb, ino); + scoutfs_omap_clear(sb, ino); out: - if (ret) { - iput(inode); - inode = ERR_PTR(ret); - } + *inode_ret = inode; - return inode; + return ret; } static void init_orphan_key(struct scoutfs_key *key, u64 ino) @@ -1482,44 +1503,6 @@ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_ return scoutfs_item_delete_force(sb, &key, lock); } -struct deleting_ino_entry { - struct list_head head; - u64 ino; -}; - -static bool added_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del, u64 ino) -{ - struct deleting_ino_entry *tmp; - bool added = true; - - spin_lock(&inf->deleting_items_lock); - - list_for_each_entry(tmp, &inf->deleting_items_list, head) { - if (tmp->ino == ino) { - added = false; - break; - } - } - - if (added) { - del->ino = ino; - list_add_tail(&del->head, &inf->deleting_items_list); - } - - spin_unlock(&inf->deleting_items_lock); - - return added; -} - -static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entry *del) -{ - if (del->ino) { - spin_lock(&inf->deleting_items_lock); - list_del_init(&del->head); - spin_unlock(&inf->deleting_items_lock); - } -} - /* * Remove all the items associated with a given inode. This is only * called once nlink has dropped to zero and nothing has the inode open @@ -1528,22 +1511,10 @@ static void del_deleting_ino(struct inode_sb_info *inf, struct deleting_ino_entr * orphan item will continue triggering attempts to finish previous * partial deletion until all deletion is complete and the orphan item * is removed. - * - * Currently this can be called multiple times for multiple cached - * inodes for a given ino number (ilookup avoids freeing inodes to avoid - * cluster lock<->inode flag waiting inversions). Some items are not - * safe to delete concurrently, for example concurrent data truncation - * could free extents multiple times. We use a very silly list of inos - * being deleted. Duplicates just return success. If the first - * deletion ends up failing orphan deletion will come back around later - * and retry. 
*/ -static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lock *lock, - struct scoutfs_lock *orph_lock) +static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_inode *sinode, + struct scoutfs_lock *lock, struct scoutfs_lock *orph_lock) { - DECLARE_INODE_SB_INFO(sb, inf); - struct deleting_ino_entry del = {{NULL, }}; - struct scoutfs_inode sinode; struct scoutfs_key key; LIST_HEAD(ind_locks); bool release = false; @@ -1552,30 +1523,10 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo u64 size; int ret; - if (!added_deleting_ino(inf, &del, ino)) { - ret = 0; - goto out; - } - scoutfs_inode_init_key(&key, ino); - ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode), - lock); - if (ret < 0) { - if (ret == -ENOENT) - ret = 0; - goto out; - } - - /* XXX corruption, inode probably won't be freed without repair */ - if (le32_to_cpu(sinode.nlink)) { - scoutfs_warn(sb, "Dangling orphan item for inode %llu.", ino); - ret = -EIO; - goto out; - } - - mode = le32_to_cpu(sinode.mode); - size = le64_to_cpu(sinode.size); + mode = le32_to_cpu(sinode->mode); + size = le64_to_cpu(sinode->size); trace_scoutfs_delete_inode(sb, ino, mode, size); /* remove data items in their own transactions */ @@ -1593,7 +1544,7 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_lo /* then delete the small known number of remaining inode items */ retry: ret = scoutfs_inode_index_start(sb, &ind_seq) ?: - prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?: + prepare_index_deletion(sb, &ind_locks, ino, mode, sinode) ?: scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq, false); if (ret > 0) goto retry; @@ -1602,7 +1553,7 @@ retry: release = true; - ret = remove_index_items(sb, ino, &sinode, &ind_locks); + ret = remove_index_items(sb, ino, sinode, &ind_locks); if (ret) goto out; @@ -1612,15 +1563,21 @@ retry: goto out; } - ret = scoutfs_item_delete(sb, &key, lock); - if (ret) + /* make sure inode item and orphan are deleted together */ + ret = scoutfs_item_dirty(sb, &key, lock); + if (ret < 0) goto out; ret = scoutfs_inode_orphan_delete(sb, ino, orph_lock); - if (ret == 0) - scoutfs_forest_dec_inode_count(sb); + if (ret < 0) + goto out; + + ret = scoutfs_item_delete(sb, &key, lock); + BUG_ON(ret != 0); /* dirtying should have guaranteed success */ + + scoutfs_forest_dec_inode_count(sb); + out: - del_deleting_ino(inf, &del); if (release) scoutfs_release_trans(sb); scoutfs_inode_index_unlock(sb, &ind_locks); @@ -1628,48 +1585,192 @@ out: return ret; } +struct inode_deletion_lock_data { + wait_queue_head_t waitq; + atomic64_t seq; + struct scoutfs_open_ino_map map; + unsigned long trying[DIV_ROUND_UP(SCOUTFS_OPEN_INO_MAP_BITS, BITS_PER_LONG)]; +}; + /* - * iput_final has already written out the dirty pages to the inode - * before we get here. We're left with a clean inode that we have to - * tear down. We use locking and open inode number bitmaps to decide if - * we should finally destroy an inode that is no longer open nor - * reachable through directory entries. + * Get a lock data struct that has the current omap from this hold of + * the lock. The lock data is saved on the lock so it can be used + * multiple times until the lock is refreshed. Only one task will send + * an omap request at a time, and errors are only returned by each task + * as it gets a response to its send. 
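As a reading aid for get_current_lock_data() below, a sketch of the values ldata->seq moves through, where W stands for lock->write_seq during one hold of the lock (W is only notation here, not a symbol from the patch):

	/*
	 * W - 1 (or any stale value) : map needs refresh; one task may
	 *                              cmpxchg seq to U64_MAX
	 * U64_MAX                    : that task has an open_ino_map request
	 *                              in flight, others wait on ldata->waitq
	 * W                          : ldata->map is current for this hold
	 *
	 * A failed request sets seq back to W - 1 so a later caller retries.
	 */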
+ */ +static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lock, + struct inode_deletion_lock_data **ldata_ret, u64 group_nr) +{ + struct inode_deletion_lock_data *ldata; + u64 seq; + int ret; + + /* we're storing omap maps in locks, they need to cover the same number of inodes */ + BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR); + + /* allocate a new lock data struct as needed */ + while ((ldata = cmpxchg(&lock->inode_deletion_data, NULL, NULL)) == NULL) { + ldata = kzalloc(sizeof(struct inode_deletion_lock_data), GFP_NOFS); + if (!ldata) { + ret = -ENOMEM; + goto out; + } + + atomic64_set(&ldata->seq, lock->write_seq - 1); /* ensure refresh */ + init_waitqueue_head(&ldata->waitq); + + /* the lock kfrees the inode_deletion_data pointer along with the lock */ + if (cmpxchg(&lock->inode_deletion_data, NULL, ldata) == NULL) + break; + else + kfree(ldata); + } + + /* make sure that the lock's data is current */ + while ((seq = atomic64_read(&ldata->seq)) != lock->write_seq) { + if (seq != U64_MAX && atomic64_cmpxchg(&ldata->seq, seq, U64_MAX) == seq) { + /* ask the server for current omap */ + ret = scoutfs_client_open_ino_map(sb, group_nr, &ldata->map); + if (ret == 0) + atomic64_set(&ldata->seq, lock->write_seq); + else + atomic64_set(&ldata->seq, lock->write_seq - 1); + wake_up(&ldata->waitq); + if (ret < 0) + goto out; + } else { + /* wait for someone else who's sent a request */ + wait_event(ldata->waitq, atomic64_read(&ldata->seq) != U64_MAX); + } + } + + ret = 0; +out: + if (ret < 0) + ldata = NULL; + *ldata_ret = ldata; + return ret; +} + +/* + * Try to delete all the items for an unused inode number. This is the + * relatively slow path that uses cluster locks, network requests, and + * IO to ensure correctness. Callers should try hard to avoid calling + * when there's no work to do. * - * Because lookup ignores freeing inodes we can get here from multiple - * instances of an inode that is being deleted. Orphan scanning in - * particular can race with deletion. delete_inode_items() resolves - * concurrent attempts. + * Inode references are added under cluster locks. In-memory vfs cache + * references are added under read cluster locks and are visible in omap + * bitmaps. Directory entry references are added under write cluster + * locks and are visible in the inode's nlink. Orphan items exist + * whenever nlink == 0 and are maintained under write cluster locks. + * Directory entries can be added to an inode with nlink == 0 to + * instantiate tmpfile inodes into the name space. Cached inodes will + * not be created for inodes with an nlink of 0. + * + * Combining all this we know that it's safe to delete an inode's items + * when we hold an exclusive write cluster lock, the inode has nlink == + * 0, and an omap request protected by the lock doesn't have the inode's + * bit set. + * + * This is called by orphan scanning and vfs inode cache eviction after + * they've checked that the inode could really be deleted. We serialize + * on a bit in the lock data so that we only have one deletion attempt + * per inode under this mount's cluster lock. 
+ */ +static int try_delete_inode_items(struct super_block *sb, u64 ino) +{ + struct inode_deletion_lock_data *ldata = NULL; + struct scoutfs_lock *orph_lock = NULL; + struct scoutfs_lock *lock = NULL; + struct scoutfs_inode sinode; + struct scoutfs_key key; + u64 group_nr; + int bit_nr; + int ret; + + ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &lock); + if (ret < 0) + goto out; + + scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr); + + ret = get_current_lock_data(sb, lock, &ldata, group_nr); + if (ret < 0) + goto out; + + /* only one local attempt per inode at a time */ + if (test_and_set_bit(bit_nr, ldata->trying)) { + ret = 0; + goto out; + } + + /* can't delete if it's cached in local or remote mounts */ + if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) { + ret = 0; + goto out; + } + + scoutfs_inode_init_key(&key, ino); + ret = scoutfs_item_lookup_exact(sb, &key, &sinode, sizeof(sinode), lock); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } + + if (le32_to_cpu(sinode.nlink) > 0) { + ret = 0; + goto out; + } + + ret = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock); + if (ret < 0) + goto out; + + ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock); +out: + if (ldata) + clear_bit(bit_nr, ldata->trying); + + scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE); + scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY); + + return ret; +} + +/* + * As we drop an inode we need to decide to try and delete its items or + * not, which is expensive. The two common cases we want to get right + * both have cluster lock coverage and don't want to delete. Dropping + * unused inodes during read lock invalidation has the current lock and + * sees a nonzero nlink and knows not to delete. Final iput after a + * local unlink also has a lock, sees a zero nlink, and tries to perform + * item deletion in the task that dropped the last link, as users + * expect. + * + * Evicting an inode outside of cluster locking is the odd slow path + * that involves lock contention during use the worst cross-mount + * open-unlink/delete case. */ void scoutfs_evict_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; + struct scoutfs_inode_info *si = SCOUTFS_I(inode); const u64 ino = scoutfs_ino(inode); - struct scoutfs_lock *orph_lock; - struct scoutfs_lock *lock; - int ret; - trace_scoutfs_evict_inode(inode->i_sb, scoutfs_ino(inode), - inode->i_nlink, is_bad_inode(inode)); + trace_scoutfs_evict_inode(sb, ino, inode->i_nlink, is_bad_inode(inode)); - if (is_bad_inode(inode)) - goto clear; + if (!is_bad_inode(inode)) { + truncate_inode_pages_final(&inode->i_data); - truncate_inode_pages_final(&inode->i_data); + /* clear before trying to delete tests */ + scoutfs_omap_clear(sb, ino); - ret = scoutfs_omap_should_delete(sb, inode, &lock, &orph_lock); - if (ret > 0) { - ret = delete_inode_items(inode->i_sb, scoutfs_ino(inode), lock, orph_lock); - scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE); - scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY); - } - if (ret < 0) { - scoutfs_err(sb, "error %d while checking to delete inode nr %llu, it might linger.", - ret, ino); + if (!scoutfs_lock_is_covered(sb, &si->ino_lock_cov) || inode->i_nlink == 0) + try_delete_inode_items(sb, scoutfs_ino(inode)); } - scoutfs_omap_dec(sb, ino); - -clear: clear_inode(inode); } @@ -1745,18 +1846,26 @@ void scoutfs_inode_queue_iput(struct inode *inode) /* * All mounts are performing this work concurrently. 
We introduce * significant jitter between them to try and keep them from all - * bunching up and working on the same inodes. + * bunching up and working on the same inodes. We always try to delay + * for at least one jiffy if precision tricks us into calculating no + * delay. */ -static void schedule_orphan_dwork(struct inode_sb_info *inf) +void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb) { -#define ORPHAN_SCAN_MIN_MS (10 * MSEC_PER_SEC) -#define ORPHAN_SCAN_JITTER_MS (40 * MSEC_PER_SEC) - unsigned long delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS + - prandom_u32_max(ORPHAN_SCAN_JITTER_MS)); + DECLARE_INODE_SB_INFO(sb, inf); + struct scoutfs_mount_options opts; + unsigned long low; + unsigned long high; + unsigned long delay; + if (!inf->stopped) { - delay = msecs_to_jiffies(ORPHAN_SCAN_MIN_MS + - prandom_u32_max(ORPHAN_SCAN_JITTER_MS)); - schedule_delayed_work(&inf->orphan_scan_dwork, delay); + scoutfs_options_read(sb, &opts); + + low = (opts.orphan_scan_delay_ms * 80) / 100; + high = (opts.orphan_scan_delay_ms * 120) / 100; + delay = msecs_to_jiffies(low + prandom_u32_max(high - low)) ?: 1; + + mod_delayed_work(system_wq, &inf->orphan_scan_dwork, delay); } } @@ -1764,11 +1873,10 @@ static void schedule_orphan_dwork(struct inode_sb_info *inf) * Find and delete inodes whose only remaining reference is the * persistent orphan item that was created as they were unlinked. * - * Orphan items are created as the final directory entry referring to an - * inode is deleted. They're deleted as the final cached inode is - * evicted and the inode items are destroyed. They can linger if all - * the cached inodes pinning the inode fail to delete as they are - * evicted from the cache -- either through crashing or errors. + * Orphan items are maintained for inodes that have an nlink of 0. + * Typically this is from unlink, but tmpfiles are created with orphans. + * They're deleted as the final cached inode is evicted and the inode + * items are destroyed. * * This work runs in all mounts in the background looking for those * orphaned inodes that weren't fully deleted. @@ -1777,20 +1885,16 @@ static void schedule_orphan_dwork(struct inode_sb_info *inf) * only find orphan items that made it to the fs root after being merged * from a mount's log btree. This naturally avoids orphan items that * exist while inodes have been unlinked but are still cached, including - * O_TMPFILE inodes that are actively used during normal operations. + * tmpfile inodes that are actively used during normal operations. * Scanning the read-only persistent fs root uses cached blocks and * avoids the lock contention we'd cause if we tried to use the * consistent item cache. The downside is that it adds a bit of - * latency. If an orphan was created in error it'll take until the - * mount's log btree is finalized and merged. A crash will have the log - * btree merged after it is fenced. + * latency. * - * Once we find candidate orphan items we can first check our local - * inode cache for inodes that are already on their way to eviction and - * can be skipped. Then we ask the server for the open map containing - * the inode. Only if we don't have it cached, and no one else does, do - * we try and read it into our cache and evict it to trigger the final - * inode deletion process. + * Once we find candidate orphan items we first check our local omap for + * a locally cached inode. Then we ask the server for the open map + * containing the inode. 
Only if we don't see any cached users do we do + * the expensive work of acquiring locks to try and delete the items. */ static void inode_orphan_scan_worker(struct work_struct *work) { @@ -1802,7 +1906,6 @@ static void inode_orphan_scan_worker(struct work_struct *work) SCOUTFS_BTREE_ITEM_REF(iref); struct scoutfs_key last; struct scoutfs_key key; - struct inode *inode; u64 group_nr; int bit_nr; u64 ino; @@ -1841,17 +1944,14 @@ static void inode_orphan_scan_worker(struct work_struct *work) scoutfs_inc_counter(sb, orphan_scan_item); ino = le64_to_cpu(key.sko_ino); - /* locally cached inodes will already be deleted */ - inode = scoutfs_ilookup(sb, ino); - if (inode) { + /* locally cached inodes will try to delete as they evict */ + if (scoutfs_omap_test(sb, ino)) { scoutfs_inc_counter(sb, orphan_scan_cached); - iput(inode); continue; } /* get an omap that covers the orphaned ino */ - group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT; - bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK; + scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr); if (le64_to_cpu(omap.args.group_nr) != group_nr) { ret = scoutfs_client_open_ino_map(sb, group_nr, &omap); @@ -1859,25 +1959,15 @@ static void inode_orphan_scan_worker(struct work_struct *work) goto out; } - /* don't need to evict if someone else has it open (cached) */ + /* remote cached inodes will also try to delete */ if (test_bit_le(bit_nr, omap.bits)) { scoutfs_inc_counter(sb, orphan_scan_omap_set); continue; } - /* try to cached and evict unused inode to delete, can be racing */ - inode = scoutfs_iget(sb, ino, 0, 0); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - if (ret == -ENOENT) - continue; - else - goto out; - } - - scoutfs_inc_counter(sb, orphan_scan_read); - SCOUTFS_I(inode)->drop_invalidated = true; - iput(inode); + /* seemingly orphaned and unused, get locks and check for sure */ + scoutfs_inc_counter(sb, orphan_scan_attempts); + ret = try_delete_inode_items(sb, ino); } ret = 0; @@ -1886,7 +1976,7 @@ out: if (ret < 0) scoutfs_inc_counter(sb, orphan_scan_error); - schedule_orphan_dwork(inf); + scoutfs_inode_schedule_orphan_dwork(sb); } /* @@ -1994,8 +2084,6 @@ int scoutfs_inode_setup(struct super_block *sb) spin_lock_init(&inf->dir_ino_alloc.lock); spin_lock_init(&inf->ino_alloc.lock); INIT_DELAYED_WORK(&inf->orphan_scan_dwork, inode_orphan_scan_worker); - spin_lock_init(&inf->deleting_items_lock); - INIT_LIST_HEAD(&inf->deleting_items_list); INIT_WORK(&inf->iput_work, iput_worker); init_llist_head(&inf->iput_llist); @@ -2011,9 +2099,7 @@ int scoutfs_inode_setup(struct super_block *sb) */ void scoutfs_inode_start(struct super_block *sb) { - DECLARE_INODE_SB_INFO(sb, inf); - - schedule_orphan_dwork(inf); + scoutfs_inode_schedule_orphan_dwork(sb); } /* diff --git a/kmod/src/inode.h b/kmod/src/inode.h index eab303ee..88058117 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -82,7 +82,9 @@ void scoutfs_inode_queue_iput(struct inode *inode); #define SCOUTFS_IGF_LINKED (1 << 0) /* enoent if nlink == 0 */ struct inode *scoutfs_iget(struct super_block *sb, u64 ino, int lkf, int igf); -struct inode *scoutfs_ilookup(struct super_block *sb, u64 ino); +struct inode *scoutfs_ilookup_nowait(struct super_block *sb, u64 ino); +struct inode *scoutfs_ilookup_nowait_nonewfree(struct super_block *sb, u64 ino); + void scoutfs_inode_init_key(struct scoutfs_key *key, u64 ino); void scoutfs_inode_init_index_key(struct scoutfs_key *key, u8 type, u64 major, @@ -104,9 +106,8 @@ void scoutfs_update_inode_item(struct inode *inode, struct scoutfs_lock *lock, struct list_head 
*ind_locks); int scoutfs_alloc_ino(struct super_block *sb, bool is_dir, u64 *ino_ret); -struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, - umode_t mode, dev_t rdev, u64 ino, - struct scoutfs_lock *lock); +int scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t rdev, + u64 ino, struct scoutfs_lock *lock, struct inode **inode_ret); void scoutfs_inode_set_meta_seq(struct inode *inode); void scoutfs_inode_set_data_seq(struct inode *inode); @@ -126,6 +127,7 @@ int scoutfs_setattr(struct dentry *dentry, struct iattr *attr); int scoutfs_inode_orphan_create(struct super_block *sb, u64 ino, struct scoutfs_lock *lock); int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_lock *lock); +void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb); void scoutfs_inode_queue_writeback(struct inode *inode); int scoutfs_inode_walk_writeback(struct super_block *sb, bool write); diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index d30a5021..1ae9da1f 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -387,7 +387,7 @@ static long scoutfs_ioc_data_wait_err(struct file *file, unsigned long arg) if (sblock > eblock) return -EINVAL; - inode = scoutfs_ilookup(sb, args.ino); + inode = scoutfs_ilookup_nowait_nonewfree(sb, args.ino); if (!inode) { ret = -ESTALE; goto out; diff --git a/kmod/src/lock.c b/kmod/src/lock.c index f0d0238d..b04e18ee 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -142,7 +142,7 @@ static void invalidate_inode(struct super_block *sb, u64 ino) struct scoutfs_inode_info *si; struct inode *inode; - inode = scoutfs_ilookup(sb, ino); + inode = scoutfs_ilookup_nowait_nonewfree(sb, ino); if (inode) { si = SCOUTFS_I(inode); @@ -255,7 +255,7 @@ static void lock_free(struct lock_info *linfo, struct scoutfs_lock *lock) BUG_ON(!list_empty(&lock->shrink_head)); BUG_ON(!list_empty(&lock->cov_list)); - scoutfs_omap_free_lock_data(lock->omap_data); + kfree(lock->inode_deletion_data); kfree(lock); } @@ -291,7 +291,6 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb, lock->mode = SCOUTFS_LOCK_NULL; atomic64_set(&lock->forest_bloom_nr, 0); - spin_lock_init(&lock->omap_spinlock); trace_scoutfs_lock_alloc(sb, lock); diff --git a/kmod/src/lock.h b/kmod/src/lock.h index 5d7a3ce7..a1d9b903 100644 --- a/kmod/src/lock.h +++ b/kmod/src/lock.h @@ -11,7 +11,7 @@ #define SCOUTFS_LOCK_NR_MODES SCOUTFS_LOCK_INVALID -struct scoutfs_omap_lock; +struct inode_deletion_lock_data; /* * A few fields (start, end, refresh_gen, write_seq, granted_mode) @@ -47,9 +47,8 @@ struct scoutfs_lock { /* the forest tracks which log tree last saw bloom bit updates */ atomic64_t forest_bloom_nr; - /* open ino mapping has a valid map for a held write lock */ - spinlock_t omap_spinlock; - struct scoutfs_omap_lock_data *omap_data; + /* inode deletion tracks some state per lock */ + struct inode_deletion_lock_data *inode_deletion_data; }; struct scoutfs_lock_coverage { diff --git a/kmod/src/omap.c b/kmod/src/omap.c index 529cfda4..604c397e 100644 --- a/kmod/src/omap.c +++ b/kmod/src/omap.c @@ -30,27 +30,22 @@ /* * As a client removes an inode from its cache with an nlink of 0 it * needs to decide if it is the last client using the inode and should - * fully delete all its items. It needs to know if other mounts still - * have the inode in use. + * fully delete all the inode's items. It needs to know if other mounts + * still have the inode in use. * - * We need a way to communicate between mounts that an inode is open. 
+ * We need a way to communicate between mounts that an inode is in use. * We don't want to pay the synchronous per-file locking round trip * costs associated with per-inode open locks that you'd typically see - * in systems to solve this problem. + * in systems to solve this problem. The first prototypes of this + * tracked open file handles so this was coined the open map, though it + * now tracks cached inodes. - * - * Instead clients maintain open bitmaps that cover groups of inodes. - * As inodes enter the cache their bit is set, and as the inode is - * evicted the bit is cleared. As an inode is evicted messages are sent - * around the cluster to get the current bitmaps for that inode's group - * from all active mounts. If the inode's bit is clear then it can be - * deleted. - * - * We associate the open bitmaps with our cluster locking of inode - * groups to cache these open bitmaps. As long as we have the lock then - * nlink can't be changed on any remote mounts. Specifically, it can't - * increase from 0 so any clear bits can gain references on remote - * mounts. As long as we have the lock, all clear bits in the group for - * inodes with 0 nlink can be deleted. + * Clients maintain bitmaps that cover groups of inodes. As inodes + * enter the cache their bit is set and as the inode is evicted the bit + * is cleared. As deletion is attempted, either by scanning orphans or + * evicting an inode with an nlink of 0, messages are sent around the + * cluster to get the current bitmaps for that inode's group from all + * active mounts. If the inode's bit is clear then it can be deleted. * * This layer maintains a list of client rids to send messages to. The * server calls us as clients enter and leave the cluster. We can't @@ -85,14 +80,12 @@ struct omap_info { struct omap_info *name = SCOUTFS_SB(sb)->omap_info /* - * The presence of an inode in the inode cache increases the count of - * its inode number's position within its lock group. These structs - * track the counts for all the inodes in a lock group and maintain a - * bitmap whose bits are set for each non-zero count. + * The presence of an inode in the inode cache sets its bit in the lock + * group's bitmap. * * We don't want to add additional global synchronization of inode cache * maintenance so these are tracked in an rcu hash table. Once their - * total count reaches zero they're removed from the hash and queued for + * total reaches zero they're removed from the hash and queued for * freeing and readers should ignore them. */ struct omap_group { @@ -102,7 +95,6 @@ struct omap_group { u64 nr; spinlock_t lock; unsigned int total; - unsigned int *counts; __le64 bits[SCOUTFS_OPEN_INO_MAP_LE64S]; }; @@ -111,8 +103,7 @@ do { \ __typeof__(group) _grp = (group); \ __typeof__(bit_nr) _nr = (bit_nr); \ \ - trace_scoutfs_omap_group_##which(sb, _grp, _grp->nr, _grp->total, _nr, \ - _nr < 0 ? -1 : _grp->counts[_nr]); \ + trace_scoutfs_omap_group_##which(sb, _grp, _grp->nr, _grp->total, _nr); \ } while (0) /* @@ -134,18 +125,6 @@ struct omap_request { struct scoutfs_open_ino_map map; }; -/* - * In each inode group cluster lock we store data to track the open ino - * map which tracks all the inodes that the cluster lock covers. When - * the seq shows that the map is stale we send a request to update it.
- */ -struct scoutfs_omap_lock_data { - u64 seq; - bool req_in_flight; - wait_queue_head_t waitq; - struct scoutfs_open_ino_map map; -}; - static inline void init_rid_list(struct omap_rid_list *list) { INIT_LIST_HEAD(&list->head); @@ -232,7 +211,7 @@ static void free_rids(struct omap_rid_list *list) } } -static void calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr) +void scoutfs_omap_calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr) { *group_nr = ino >> SCOUTFS_OPEN_INO_MAP_SHIFT; *bit_nr = ino & SCOUTFS_OPEN_INO_MAP_MASK; @@ -242,21 +221,13 @@ static struct omap_group *alloc_group(struct super_block *sb, u64 group_nr) { struct omap_group *group; - BUILD_BUG_ON((sizeof(group->counts[0]) * SCOUTFS_OPEN_INO_MAP_BITS) > PAGE_SIZE); - group = kzalloc(sizeof(struct omap_group), GFP_NOFS); if (group) { group->sb = sb; group->nr = group_nr; spin_lock_init(&group->lock); - group->counts = (void *)get_zeroed_page(GFP_NOFS); - if (!group->counts) { - kfree(group); - group = NULL; - } else { - trace_group(sb, alloc, group, -1); - } + trace_group(sb, alloc, group, -1); } return group; @@ -265,7 +236,6 @@ static struct omap_group *alloc_group(struct super_block *sb, u64 group_nr) static void free_group(struct super_block *sb, struct omap_group *group) { trace_group(sb, free, group, -1); - free_page((unsigned long)group->counts); kfree(group); } @@ -283,13 +253,16 @@ static const struct rhashtable_params group_ht_params = { }; /* - * Track an cached inode in its group. Our increment can be racing with - * a final decrement that removes the group from the hash, sets total to + * Track an cached inode in its group. Our set can be racing with a + * final clear that removes the group from the hash, sets total to * UINT_MAX, and calls rcu free. We can retry until the dead group is * no longer visible in the hash table and we can insert a new allocated * group. + * + * The caller must ensure that the bit is clear, -EEXIST will be + * returned otherwise. */ -int scoutfs_omap_inc(struct super_block *sb, u64 ino) +int scoutfs_omap_set(struct super_block *sb, u64 ino) { DECLARE_OMAP_INFO(sb, ominf); struct omap_group *group; @@ -298,7 +271,7 @@ int scoutfs_omap_inc(struct super_block *sb, u64 ino) bool found; int ret = 0; - calc_group_nrs(ino, &group_nr, &bit_nr); + scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr); retry: found = false; @@ -308,10 +281,10 @@ retry: spin_lock(&group->lock); if (group->total < UINT_MAX) { found = true; - if (group->counts[bit_nr]++ == 0) { - set_bit_le(bit_nr, group->bits); + if (WARN_ON_ONCE(test_and_set_bit_le(bit_nr, group->bits))) + ret = -EEXIST; + else group->total++; - } } trace_group(sb, inc, group, bit_nr); spin_unlock(&group->lock); @@ -342,29 +315,50 @@ retry: return ret; } +bool scoutfs_omap_test(struct super_block *sb, u64 ino) +{ + DECLARE_OMAP_INFO(sb, ominf); + struct omap_group *group; + bool ret = false; + u64 group_nr; + int bit_nr; + + scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr); + + rcu_read_lock(); + group = rhashtable_lookup(&ominf->group_ht, &group_nr, group_ht_params); + if (group) { + spin_lock(&group->lock); + ret = !!test_bit_le(bit_nr, group->bits); + spin_unlock(&group->lock); + } + rcu_read_unlock(); + + return ret; +} + /* - * Decrement a previously incremented ino count. Not finding a count - * implies imbalanced inc/dec or bugs freeing groups. We only free - * groups here as the last dec drops the group's total count to 0. + * Clear a previously set ino bit. 
Trying to clear a bit that's already + * clear implies imbalanced set/clear or bugs freeing groups. We only + * free groups here as the last clear drops the group's total to 0. */ -void scoutfs_omap_dec(struct super_block *sb, u64 ino) +void scoutfs_omap_clear(struct super_block *sb, u64 ino) { DECLARE_OMAP_INFO(sb, ominf); struct omap_group *group; u64 group_nr; int bit_nr; - calc_group_nrs(ino, &group_nr, &bit_nr); + scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr); rcu_read_lock(); group = rhashtable_lookup(&ominf->group_ht, &group_nr, group_ht_params); if (group) { spin_lock(&group->lock); - WARN_ON_ONCE(group->counts[bit_nr] == 0); + WARN_ON_ONCE(!test_bit_le(bit_nr, group->bits)); WARN_ON_ONCE(group->total == 0); WARN_ON_ONCE(group->total == UINT_MAX); - if (--group->counts[bit_nr] == 0) { - clear_bit_le(bit_nr, group->bits); + if (test_and_clear_bit_le(bit_nr, group->bits)) { if (--group->total == 0) { group->total = UINT_MAX; rhashtable_remove_fast(&ominf->group_ht, &group->ht_head, @@ -664,8 +658,7 @@ int scoutfs_omap_server_handle_request(struct super_block *sb, u64 rid, u64 id, /* * The client is receiving a request from the server for its map for the - * given group. Look up the group and copy the bits to the map for - * non-zero open counts. + * given group. Look up the group and copy the bits to the map. * * The mount originating the request for this bitmap has the inode group * write locked. We can't be adding links to any inodes in the group @@ -814,179 +807,6 @@ void scoutfs_omap_server_shutdown(struct super_block *sb) synchronize_rcu(); } -static bool omap_req_in_flight(struct scoutfs_lock *lock, struct scoutfs_omap_lock_data *ldata) -{ - bool in_flight; - - spin_lock(&lock->omap_spinlock); - in_flight = ldata->req_in_flight; - spin_unlock(&lock->omap_spinlock); - - return in_flight; -} - -/* - * Make sure the map covered by the cluster lock is current. The caller - * holds the cluster lock so once we store lock_data on the cluster lock - * it won't be freed and the write_seq in the cluster lock won't change. - * - * The omap_spinlock protects the omap_data in the cluster lock. We - * have to drop it if we have to block to allocate lock_data, send a - * request for a new map, or wait for a request in flight to finish. 
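Pulling the omap call sites from the rest of this patch together, a rough sketch of the bit lifecycle the set/test/clear API implements (all function names are as added elsewhere in this patch):

	/*
	 * scoutfs_iget() / scoutfs_new_inode()  ->  scoutfs_omap_set(sb, ino)
	 * inode_orphan_scan_worker()            ->  scoutfs_omap_test(sb, ino)
	 *                                           skips inos still cached locally
	 * scoutfs_evict_inode()                 ->  scoutfs_omap_clear(sb, ino),
	 *                                           then try_delete_inode_items()
	 */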
- */ -static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lock, - struct scoutfs_omap_lock_data **ldata_ret, u64 group_nr) -{ - struct scoutfs_omap_lock_data *ldata; - bool send_req; - int ret = 0; - - spin_lock(&lock->omap_spinlock); - - ldata = lock->omap_data; - if (ldata == NULL) { - spin_unlock(&lock->omap_spinlock); - ldata = kzalloc(sizeof(struct scoutfs_omap_lock_data), GFP_NOFS); - spin_lock(&lock->omap_spinlock); - - if (!ldata) { - ret = -ENOMEM; - goto out; - } - - if (lock->omap_data == NULL) { - ldata->seq = lock->write_seq - 1; /* ensure refresh */ - init_waitqueue_head(&ldata->waitq); - - lock->omap_data = ldata; - } else { - kfree(ldata); - ldata = lock->omap_data; - } - } - - while (ldata->seq != lock->write_seq) { - /* only one waiter sends a request at a time */ - if (!ldata->req_in_flight) { - ldata->req_in_flight = true; - send_req = true; - } else { - send_req = false; - } - - spin_unlock(&lock->omap_spinlock); - if (send_req) - ret = scoutfs_client_open_ino_map(sb, group_nr, &ldata->map); - else - wait_event(ldata->waitq, !omap_req_in_flight(lock, ldata)); - spin_lock(&lock->omap_spinlock); - - /* only sender can return error, other waiters retry */ - if (send_req) { - ldata->req_in_flight = false; - if (ret == 0) - ldata->seq = lock->write_seq; - wake_up(&ldata->waitq); - if (ret < 0) - goto out; - } - } - -out: - spin_unlock(&lock->omap_spinlock); - - if (ret == 0) - *ldata_ret = ldata; - else - *ldata_ret = NULL; - - return ret; -} - -/* - * Return 1 and give the caller their locks when they should delete the - * inode items. It's safe to delete the inode items when it is no - * longer reachable and nothing is referencing it. - * - * The inode is unreachable when nlink hits zero. Cluster locks protect - * modification and testing of nlink. We use the ino_lock_cov covrage - * to short circuit the common case of having a locked inode that hasn't - * been deleted. If it isn't locked, we have to acquire the lock to - * refresh the inode to see its current nlink. - * - * Then we use an open inode bitmap that covers all the inodes in the - * lock group to determine if the inode is present in any other mount's - * caches. We refresh it by asking the server for all clients' maps and - * then store it in the lock. As long as we hold the lock nothing can - * increase nlink from zero and let people get a reference to the inode. 
- */ -int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode, - struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret) -{ - struct scoutfs_inode_info *si = SCOUTFS_I(inode); - struct scoutfs_lock *orph_lock = NULL; - struct scoutfs_lock *lock = NULL; - const u64 ino = scoutfs_ino(inode); - struct scoutfs_omap_lock_data *ldata; - u64 group_nr; - int bit_nr; - int ret; - int err; - - /* lock group and omap constants are defined independently */ - BUILD_BUG_ON(SCOUTFS_OPEN_INO_MAP_BITS != SCOUTFS_LOCK_INODE_GROUP_NR); - - if (scoutfs_lock_is_covered(sb, &si->ino_lock_cov) && inode->i_nlink > 0) { - ret = 0; - goto out; - } - - ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE, inode, &lock); - if (ret < 0) - goto out; - - if (inode->i_nlink > 0) { - ret = 0; - goto out; - } - - calc_group_nrs(ino, &group_nr, &bit_nr); - - /* only one request to refresh the map at a time */ - ret = get_current_lock_data(sb, lock, &ldata, group_nr); - if (ret < 0) - goto out; - - /* can delete caller's zero nlink inode if it's not cached in other mounts */ - ret = !test_bit_le(bit_nr, ldata->map.bits); -out: - trace_scoutfs_omap_should_delete(sb, ino, inode->i_nlink, ret); - - if (ret > 0) { - err = scoutfs_lock_orphan(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, ino, &orph_lock); - if (err < 0) - ret = err; - } - - if (ret <= 0) { - scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE); - lock = NULL; - } - - *lock_ret = lock; - *orph_lock_ret = orph_lock; - return ret; -} - -void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata) -{ - if (ldata) { - WARN_ON_ONCE(ldata->req_in_flight); - WARN_ON_ONCE(waitqueue_active(&ldata->waitq)); - kfree(ldata); - } -} - int scoutfs_omap_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); diff --git a/kmod/src/omap.h b/kmod/src/omap.h index 9a2d1e0b..3f09d8a9 100644 --- a/kmod/src/omap.h +++ b/kmod/src/omap.h @@ -1,13 +1,12 @@ #ifndef _SCOUTFS_OMAP_H_ #define _SCOUTFS_OMAP_H_ -int scoutfs_omap_inc(struct super_block *sb, u64 ino); -void scoutfs_omap_dec(struct super_block *sb, u64 ino); -int scoutfs_omap_should_delete(struct super_block *sb, struct inode *inode, - struct scoutfs_lock **lock_ret, struct scoutfs_lock **orph_lock_ret); -void scoutfs_omap_free_lock_data(struct scoutfs_omap_lock_data *ldata); +int scoutfs_omap_set(struct super_block *sb, u64 ino); +bool scoutfs_omap_test(struct super_block *sb, u64 ino); +void scoutfs_omap_clear(struct super_block *sb, u64 ino); int scoutfs_omap_client_handle_request(struct super_block *sb, u64 id, struct scoutfs_open_ino_map_args *args); +void scoutfs_omap_calc_group_nrs(u64 ino, u64 *group_nr, int *bit_nr); int scoutfs_omap_add_rid(struct super_block *sb, u64 rid); int scoutfs_omap_remove_rid(struct super_block *sb, u64 rid); diff --git a/kmod/src/options.c b/kmod/src/options.c index 74c9819c..a447a931 100644 --- a/kmod/src/options.c +++ b/kmod/src/options.c @@ -26,22 +26,30 @@ #include "msg.h" #include "options.h" #include "super.h" +#include "inode.h" + +enum { + Opt_metadev_path, + Opt_orphan_scan_delay_ms, + Opt_quorum_slot_nr, + Opt_err, +}; static const match_table_t tokens = { - {Opt_quorum_slot_nr, "quorum_slot_nr=%s"}, {Opt_metadev_path, "metadev_path=%s"}, + {Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"}, + {Opt_quorum_slot_nr, "quorum_slot_nr=%s"}, {Opt_err, NULL} }; -struct options_sb_info { - struct dentry *debugfs_dir; +struct options_info { + seqlock_t seqlock; + struct scoutfs_mount_options opts; + struct scoutfs_sysfs_attrs 
sysfs_attrs; }; -u32 scoutfs_option_u32(struct super_block *sb, int token) -{ - WARN_ON_ONCE(1); - return 0; -} +#define DECLARE_OPTIONS_INFO(sb, name) \ + struct options_info *name = SCOUTFS_SB(sb)->options_info static int parse_bdev_path(struct super_block *sb, substring_t *substr, char **bdev_path_ret) @@ -89,8 +97,29 @@ out: return ret; } -int scoutfs_parse_options(struct super_block *sb, char *options, - struct mount_options *parsed) +static void free_options(struct scoutfs_mount_options *opts) +{ + kfree(opts->metadev_path); +} + +#define MIN_ORPHAN_SCAN_DELAY_MS 100UL +#define DEFAULT_ORPHAN_SCAN_DELAY_MS (10 * MSEC_PER_SEC) +#define MAX_ORPHAN_SCAN_DELAY_MS (60 * MSEC_PER_SEC) + +static void init_default_options(struct scoutfs_mount_options *opts) +{ + memset(opts, 0, sizeof(*opts)); + opts->quorum_slot_nr = -1; + opts->orphan_scan_delay_ms = DEFAULT_ORPHAN_SCAN_DELAY_MS; +} + +/* + * Parse the option string into our options struct. This can allocate + * memory in the struct. The caller is responsible for always calling + * free_options() when the struct is destroyed, including when we return + * an error. + */ +static int parse_options(struct super_block *sb, char *options, struct scoutfs_mount_options *opts) { substring_t args[MAX_OPT_ARGS]; int nr; @@ -98,49 +127,61 @@ int scoutfs_parse_options(struct super_block *sb, char *options, char *p; int ret; - /* Set defaults */ - memset(parsed, 0, sizeof(*parsed)); - parsed->quorum_slot_nr = -1; - while ((p = strsep(&options, ",")) != NULL) { if (!*p) continue; token = match_token(p, tokens, args); switch (token) { - case Opt_quorum_slot_nr: - if (parsed->quorum_slot_nr != -1) { + case Opt_metadev_path: + ret = parse_bdev_path(sb, &args[0], &opts->metadev_path); + if (ret < 0) + return ret; + break; + + case Opt_orphan_scan_delay_ms: + if (opts->orphan_scan_delay_ms != -1) { + scoutfs_err(sb, "multiple orphan_scan_delay_ms options provided, only provide one."); + return -EINVAL; + } + + ret = match_int(args, &nr); + if (ret < 0 || + nr < MIN_ORPHAN_SCAN_DELAY_MS || nr > MAX_ORPHAN_SCAN_DELAY_MS) { + scoutfs_err(sb, "invalid orphan_scan_delay_ms option, must be between %lu and %lu", + MIN_ORPHAN_SCAN_DELAY_MS, MAX_ORPHAN_SCAN_DELAY_MS); + if (ret == 0) + ret = -EINVAL; + return ret; + } + opts->orphan_scan_delay_ms = nr; + break; + + case Opt_quorum_slot_nr: + if (opts->quorum_slot_nr != -1) { scoutfs_err(sb, "multiple quorum_slot_nr options provided, only provide one."); return -EINVAL; } ret = match_int(args, &nr); - if (ret < 0 || nr < 0 || - nr >= SCOUTFS_QUORUM_MAX_SLOTS) { + if (ret < 0 || nr < 0 || nr >= SCOUTFS_QUORUM_MAX_SLOTS) { scoutfs_err(sb, "invalid quorum_slot_nr option, must be between 0 and %u", SCOUTFS_QUORUM_MAX_SLOTS - 1); if (ret == 0) ret = -EINVAL; return ret; } - parsed->quorum_slot_nr = nr; + opts->quorum_slot_nr = nr; break; - case Opt_metadev_path: - ret = parse_bdev_path(sb, &args[0], - &parsed->metadev_path); - if (ret < 0) - return ret; - break; default: scoutfs_err(sb, "Unknown or malformed option, \"%s\"", p); return -EINVAL; - break; } } - if (!parsed->metadev_path) { + if (!opts->metadev_path) { scoutfs_err(sb, "Required mount option \"metadev_path\" not found"); return -EINVAL; } @@ -148,40 +189,181 @@ int scoutfs_parse_options(struct super_block *sb, char *options, return 0; } -int scoutfs_options_setup(struct super_block *sb) +void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts) +{ + DECLARE_OPTIONS_INFO(sb, optinf); + unsigned int seq; + + if (WARN_ON_ONCE(optinf == 
NULL)) { + /* trying to use options before early setup or after destroy */ + init_default_options(opts); + return; + } + + do { + seq = read_seqbegin(&optinf->seqlock); + memcpy(opts, &optinf->opts, sizeof(struct scoutfs_mount_options)); + } while (read_seqretry(&optinf->seqlock, seq)); +} + +/* + * Early setup that parses and stores the options so that the rest of + * setup can use them. Full options setup that relies on other + * components will be done later. + */ +int scoutfs_options_early_setup(struct super_block *sb, char *options) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct options_sb_info *osi; + struct scoutfs_mount_options opts; + struct options_info *optinf; int ret; - osi = kzalloc(sizeof(struct options_sb_info), GFP_KERNEL); - if (!osi) - return -ENOMEM; + init_default_options(&opts); - sbi->options = osi; + ret = parse_options(sb, options, &opts); + if (ret < 0) + goto out; - osi->debugfs_dir = debugfs_create_dir("options", sbi->debug_root); - if (!osi->debugfs_dir) { + optinf = kzalloc(sizeof(struct options_info), GFP_KERNEL); + if (!optinf) { ret = -ENOMEM; goto out; } + seqlock_init(&optinf->seqlock); + scoutfs_sysfs_init_attrs(sb, &optinf->sysfs_attrs); + + write_seqlock(&optinf->seqlock); + optinf->opts = opts; + write_sequnlock(&optinf->seqlock); + + sbi->options_info = optinf; ret = 0; out: - if (ret) + if (ret < 0) + free_options(&opts); + + return ret; +} + +int scoutfs_options_show(struct seq_file *seq, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct scoutfs_mount_options opts; + + scoutfs_options_read(sb, &opts); + + seq_printf(seq, ",metadev_path=%s", opts.metadev_path); + seq_printf(seq, ",orphan_scan_delay_ms=%u", opts.orphan_scan_delay_ms); + if (opts.quorum_slot_nr >= 0) + seq_printf(seq, ",quorum_slot_nr=%d", opts.quorum_slot_nr); + + return 0; +} + +static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj); + struct scoutfs_mount_options opts; + + scoutfs_options_read(sb, &opts); + + return snprintf(buf, PAGE_SIZE, "%s", opts.metadev_path); +} +SCOUTFS_ATTR_RO(metadev_path); + +static ssize_t orphan_scan_delay_ms_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj); + struct scoutfs_mount_options opts; + + scoutfs_options_read(sb, &opts); + + return snprintf(buf, PAGE_SIZE, "%u", opts.orphan_scan_delay_ms); +} +static ssize_t orphan_scan_delay_ms_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj); + DECLARE_OPTIONS_INFO(sb, optinf); + char nullterm[20]; /* more than enough for octal -U32_MAX */ + long val; + int len; + int ret; + + len = min(count, sizeof(nullterm) - 1); + memcpy(nullterm, buf, len); + nullterm[len] = '\0'; + + ret = kstrtol(nullterm, 0, &val); + if (ret < 0 || val < MIN_ORPHAN_SCAN_DELAY_MS || val > MAX_ORPHAN_SCAN_DELAY_MS) { + scoutfs_err(sb, "invalid orphan_scan_delay_ms value written to options sysfs file, must be between %lu and %lu", + MIN_ORPHAN_SCAN_DELAY_MS, MAX_ORPHAN_SCAN_DELAY_MS); + return -EINVAL; + } + + write_seqlock(&optinf->seqlock); + optinf->opts.orphan_scan_delay_ms = val; + write_sequnlock(&optinf->seqlock); + + scoutfs_inode_schedule_orphan_dwork(sb); + + return count; +} +SCOUTFS_ATTR_RW(orphan_scan_delay_ms); + +static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + 
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj); + struct scoutfs_mount_options opts; + + scoutfs_options_read(sb, &opts); + + return snprintf(buf, PAGE_SIZE, "%d\n", opts.quorum_slot_nr); +} +SCOUTFS_ATTR_RO(quorum_slot_nr); + +static struct attribute *options_attrs[] = { + SCOUTFS_ATTR_PTR(metadev_path), + SCOUTFS_ATTR_PTR(orphan_scan_delay_ms), + SCOUTFS_ATTR_PTR(quorum_slot_nr), + NULL, +}; + +int scoutfs_options_setup(struct super_block *sb) +{ + DECLARE_OPTIONS_INFO(sb, optinf); + int ret; + + ret = scoutfs_sysfs_create_attrs(sb, &optinf->sysfs_attrs, options_attrs, "mount_options"); + if (ret < 0) scoutfs_options_destroy(sb); return ret; } +/* + * We remove the sysfs files early in unmount so that they can't try to call other subsystems + * as they're being destroyed. + */ +void scoutfs_options_stop(struct super_block *sb) +{ + DECLARE_OPTIONS_INFO(sb, optinf); + + if (optinf) + scoutfs_sysfs_destroy_attrs(sb, &optinf->sysfs_attrs); +} + void scoutfs_options_destroy(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct options_sb_info *osi = sbi->options; + DECLARE_OPTIONS_INFO(sb, optinf); - if (osi) { - if (osi->debugfs_dir) - debugfs_remove_recursive(osi->debugfs_dir); - kfree(osi); - sbi->options = NULL; + scoutfs_options_stop(sb); + + if (optinf) { + free_options(&optinf->opts); + kfree(optinf); + sbi->options_info = NULL; } } diff --git a/kmod/src/options.h b/kmod/src/options.h index d948b5b7..26d1eb1e 100644 --- a/kmod/src/options.h +++ b/kmod/src/options.h @@ -5,23 +5,19 @@ #include #include "format.h" -enum scoutfs_mount_options { - Opt_quorum_slot_nr, - Opt_metadev_path, - Opt_err, -}; - -struct mount_options { - int quorum_slot_nr; +struct scoutfs_mount_options { char *metadev_path; + unsigned int orphan_scan_delay_ms; + int quorum_slot_nr; + }; -int scoutfs_parse_options(struct super_block *sb, char *options, - struct mount_options *parsed); +void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts); +int scoutfs_options_show(struct seq_file *seq, struct dentry *root); + +int scoutfs_options_early_setup(struct super_block *sb, char *options); int scoutfs_options_setup(struct super_block *sb); +void scoutfs_options_stop(struct super_block *sb); void scoutfs_options_destroy(struct super_block *sb); -u32 scoutfs_option_u32(struct super_block *sb, int token); -#define scoutfs_option_bool scoutfs_option_u32 - #endif /* _SCOUTFS_OPTIONS_H_ */ diff --git a/kmod/src/quorum.c b/kmod/src/quorum.c index f8b547d2..91f5b492 100644 --- a/kmod/src/quorum.c +++ b/kmod/src/quorum.c @@ -116,6 +116,7 @@ struct quorum_info { struct socket *sock; bool shutdown; + int our_quorum_slot_nr; unsigned long flags; int votes_needed; @@ -160,9 +161,7 @@ static ktime_t heartbeat_timeout(void) static int create_socket(struct super_block *sb) { DECLARE_QUORUM_INFO(sb, qinf); - struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct mount_options *opts = &sbi->opts; - struct scoutfs_super_block *super = &sbi->super; + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; struct socket *sock = NULL; struct sockaddr_in sin; int addrlen; @@ -176,7 +175,7 @@ static int create_socket(struct super_block *sb) sock->sk->sk_allocation = GFP_NOFS; - scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin); + scoutfs_quorum_slot_sin(super, qinf->our_quorum_slot_nr, &sin); addrlen = sizeof(sin); ret = kernel_bind(sock, (struct sockaddr *)&sin, addrlen); @@ -207,7 +206,6 @@ static void send_msg_members(struct super_block *sb, int type, u64 term, int 
only) { DECLARE_QUORUM_INFO(sb, qinf); - struct mount_options *opts = &SCOUTFS_SB(sb)->opts; struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; ktime_t now; int i; @@ -216,7 +214,7 @@ static void send_msg_members(struct super_block *sb, int type, u64 term, .fsid = super->hdr.fsid, .term = cpu_to_le64(term), .type = type, - .from = opts->quorum_slot_nr, + .from = qinf->our_quorum_slot_nr, }; struct kvec kv = { .iov_base = &qmes, @@ -238,7 +236,7 @@ static void send_msg_members(struct super_block *sb, int type, u64 term, for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { if (!quorum_slot_present(super, i) || - (only >= 0 && i != only) || i == opts->quorum_slot_nr) + (only >= 0 && i != only) || i == qinf->our_quorum_slot_nr) continue; scoutfs_quorum_slot_sin(super, i, &sin); @@ -476,8 +474,8 @@ static int write_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_ */ static int update_quorum_block(struct super_block *sb, int event, u64 term, bool check_rid) { - struct mount_options *opts = &SCOUTFS_SB(sb)->opts; - u64 blkno = SCOUTFS_QUORUM_BLKNO + opts->quorum_slot_nr; + DECLARE_QUORUM_INFO(sb, qinf); + u64 blkno = SCOUTFS_QUORUM_BLKNO + qinf->our_quorum_slot_nr; struct scoutfs_quorum_block blk; int ret; @@ -622,7 +620,6 @@ static void scoutfs_quorum_worker(struct work_struct *work) { struct quorum_info *qinf = container_of(work, struct quorum_info, work); struct super_block *sb = qinf->sb; - struct mount_options *opts = &SCOUTFS_SB(sb)->opts; struct sockaddr_in unused; struct quorum_host_msg msg; struct quorum_status qst; @@ -724,7 +721,7 @@ static void scoutfs_quorum_worker(struct work_struct *work) qst.term++; qst.vote_for = -1; qst.vote_bits = 0; - set_bit(opts->quorum_slot_nr, &qst.vote_bits); + set_bit(qinf->our_quorum_slot_nr, &qst.vote_bits); send_msg_others(sb, SCOUTFS_QUORUM_MSG_REQUEST_VOTE, qst.term); qst.timeout = election_timeout(); @@ -954,7 +951,6 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { DECLARE_QUORUM_INFO_KOBJ(kobj, qinf); - struct mount_options *opts = &SCOUTFS_SB(qinf->sb)->opts; struct quorum_status qst; struct last_msg last; struct timespec64 ts; @@ -971,7 +967,7 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr, ret = 0; snprintf_ret(buf, size, &ret, "quorum_slot_nr %u\n", - opts->quorum_slot_nr); + qinf->our_quorum_slot_nr); snprintf_ret(buf, size, &ret, "term %llu\n", qst.term); snprintf_ret(buf, size, &ret, "role %d (%s)\n", @@ -1048,7 +1044,6 @@ static inline bool valid_ipv4_port(__be16 port) static int verify_quorum_slots(struct super_block *sb) { struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; - struct mount_options *opts = &SCOUTFS_SB(sb)->opts; char slots[(SCOUTFS_QUORUM_MAX_SLOTS * 3) + 1]; DECLARE_QUORUM_INFO(sb, qinf); struct sockaddr_in other; @@ -1099,7 +1094,7 @@ static int verify_quorum_slots(struct super_block *sb) return -EINVAL; } - if (!quorum_slot_present(super, opts->quorum_slot_nr)) { + if (!quorum_slot_present(super, qinf->our_quorum_slot_nr)) { char *str = slots; *str = '\0'; for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) { @@ -1114,7 +1109,7 @@ static int verify_quorum_slots(struct super_block *sb) } } scoutfs_err(sb, "quorum_slot_nr=%u option references unused slot, must be one of the following configured slots:%s", - opts->quorum_slot_nr, slots); + qinf->our_quorum_slot_nr, slots); return -EINVAL; } @@ -1137,11 +1132,12 @@ static int verify_quorum_slots(struct super_block *sb) int scoutfs_quorum_setup(struct super_block *sb) { 
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - struct mount_options *opts = &sbi->opts; + struct scoutfs_mount_options opts; struct quorum_info *qinf; int ret; - if (opts->quorum_slot_nr < 0) + scoutfs_options_read(sb, &opts); + if (opts.quorum_slot_nr < 0) return 0; qinf = kzalloc(sizeof(struct quorum_info), GFP_KERNEL); @@ -1153,6 +1149,8 @@ int scoutfs_quorum_setup(struct super_block *sb) spin_lock_init(&qinf->show_lock); INIT_WORK(&qinf->work, scoutfs_quorum_worker); scoutfs_sysfs_init_attrs(sb, &qinf->ssa); + /* static for the lifetime of the mount */ + qinf->our_quorum_slot_nr = opts.quorum_slot_nr; sbi->quorum_info = qinf; qinf->sb = sb; diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 80db3247..22a892c6 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -2620,9 +2620,9 @@ TRACE_EVENT(scoutfs_item_invalidate_page, DECLARE_EVENT_CLASS(scoutfs_omap_group_class, TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total, - int bit_nr, int bit_count), + int bit_nr), - TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count), + TP_ARGS(sb, grp, group_nr, group_total, bit_nr), TP_STRUCT__entry( SCSB_TRACE_FIELDS @@ -2630,7 +2630,6 @@ DECLARE_EVENT_CLASS(scoutfs_omap_group_class, __field(__u64, group_nr) __field(unsigned int, group_total) __field(int, bit_nr) - __field(int, bit_count) ), TP_fast_assign( @@ -2639,43 +2638,42 @@ DECLARE_EVENT_CLASS(scoutfs_omap_group_class, __entry->group_nr = group_nr; __entry->group_total = group_total; __entry->bit_nr = bit_nr; - __entry->bit_count = bit_count; ), - TP_printk(SCSBF" grp %p group_nr %llu group_total %u bit_nr %d bit_count %d", + TP_printk(SCSBF" grp %p group_nr %llu group_total %u bit_nr %d", SCSB_TRACE_ARGS, __entry->grp, __entry->group_nr, __entry->group_total, - __entry->bit_nr, __entry->bit_count) + __entry->bit_nr) ); DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_alloc, TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total, - int bit_nr, int bit_count), - TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count) + int bit_nr), + TP_ARGS(sb, grp, group_nr, group_total, bit_nr) ); DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_free, TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total, - int bit_nr, int bit_count), - TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count) + int bit_nr), + TP_ARGS(sb, grp, group_nr, group_total, bit_nr) ); DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_inc, TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total, - int bit_nr, int bit_count), - TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count) + int bit_nr), + TP_ARGS(sb, grp, group_nr, group_total, bit_nr) ); DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_dec, TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total, - int bit_nr, int bit_count), - TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count) + int bit_nr), + TP_ARGS(sb, grp, group_nr, group_total, bit_nr) ); DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_request, TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int group_total, - int bit_nr, int bit_count), - TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count) + int bit_nr), + TP_ARGS(sb, grp, group_nr, group_total, bit_nr) ); DEFINE_EVENT(scoutfs_omap_group_class, scoutfs_omap_group_destroy, TP_PROTO(struct super_block *sb, void *grp, u64 group_nr, unsigned int 
group_total, - int bit_nr, int bit_count), - TP_ARGS(sb, grp, group_nr, group_total, bit_nr, bit_count) + int bit_nr), + TP_ARGS(sb, grp, group_nr, group_total, bit_nr) ); TRACE_EVENT(scoutfs_omap_should_delete, diff --git a/kmod/src/server.c b/kmod/src/server.c index 2b8221d7..2fcd316b 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -3842,8 +3842,8 @@ static void scoutfs_server_worker(struct work_struct *work) struct super_block *sb = server->sb; struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_super_block *super = &sbi->super; - struct mount_options *opts = &sbi->opts; struct scoutfs_net_connection *conn = NULL; + struct scoutfs_mount_options opts; DECLARE_WAIT_QUEUE_HEAD(waitq); struct sockaddr_in sin; bool alloc_init = false; @@ -3852,7 +3852,8 @@ static void scoutfs_server_worker(struct work_struct *work) trace_scoutfs_server_work_enter(sb, 0, 0); - scoutfs_quorum_slot_sin(super, opts->quorum_slot_nr, &sin); + scoutfs_options_read(sb, &opts); + scoutfs_quorum_slot_sin(super, opts.quorum_slot_nr, &sin); scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin)); scoutfs_block_writer_init(sb, &server->wri); diff --git a/kmod/src/super.c b/kmod/src/super.c index 3302ad4f..5cac5148 100644 --- a/kmod/src/super.c +++ b/kmod/src/super.c @@ -132,44 +132,6 @@ out: return ret; } -static int scoutfs_show_options(struct seq_file *seq, struct dentry *root) -{ - struct super_block *sb = root->d_sb; - struct mount_options *opts = &SCOUTFS_SB(sb)->opts; - - if (opts->quorum_slot_nr >= 0) - seq_printf(seq, ",quorum_slot_nr=%d", opts->quorum_slot_nr); - seq_printf(seq, ",metadev_path=%s", opts->metadev_path); - - return 0; -} - -static ssize_t metadev_path_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj); - struct mount_options *opts = &SCOUTFS_SB(sb)->opts; - - return snprintf(buf, PAGE_SIZE, "%s", opts->metadev_path); -} -SCOUTFS_ATTR_RO(metadev_path); - -static ssize_t quorum_server_nr_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj); - struct mount_options *opts = &SCOUTFS_SB(sb)->opts; - - return snprintf(buf, PAGE_SIZE, "%d\n", opts->quorum_slot_nr); -} -SCOUTFS_ATTR_RO(quorum_server_nr); - -static struct attribute *mount_options_attrs[] = { - SCOUTFS_ATTR_PTR(metadev_path), - SCOUTFS_ATTR_PTR(quorum_server_nr), - NULL, -}; - static int scoutfs_sync_fs(struct super_block *sb, int wait) { trace_scoutfs_sync_fs(sb, wait); @@ -246,13 +208,11 @@ static void scoutfs_put_super(struct super_block *sb) scoutfs_destroy_triggers(sb); scoutfs_fence_destroy(sb); scoutfs_options_destroy(sb); - scoutfs_sysfs_destroy_attrs(sb, &sbi->mopts_ssa); debugfs_remove(sbi->debug_root); scoutfs_destroy_counters(sb); scoutfs_destroy_sysfs(sb); scoutfs_metadev_close(sb); - kfree(sbi->opts.metadev_path); kfree(sbi); sb->s_fs_info = NULL; @@ -282,7 +242,7 @@ static const struct super_operations scoutfs_super_ops = { .destroy_inode = scoutfs_destroy_inode, .sync_fs = scoutfs_sync_fs, .statfs = scoutfs_statfs, - .show_options = scoutfs_show_options, + .show_options = scoutfs_options_show, .put_super = scoutfs_put_super, .umount_begin = scoutfs_umount_begin, }; @@ -511,9 +471,9 @@ out: static int scoutfs_fill_super(struct super_block *sb, void *data, int silent) { - struct scoutfs_sb_info *sbi; - struct mount_options opts; + struct scoutfs_mount_options opts; struct block_device *meta_bdev; + struct scoutfs_sb_info *sbi; struct inode *inode; int 
ret; @@ -541,13 +501,12 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent) spin_lock_init(&sbi->next_ino_lock); spin_lock_init(&sbi->data_wait_root.lock); sbi->data_wait_root.root = RB_ROOT; - scoutfs_sysfs_init_attrs(sb, &sbi->mopts_ssa); - ret = scoutfs_parse_options(sb, data, &opts); - if (ret) - goto out; - - sbi->opts = opts; + /* parse options early for use during setup */ + ret = scoutfs_options_early_setup(sb, data); + if (ret < 0) + return ret; + scoutfs_options_read(sb, &opts); ret = sb_set_blocksize(sb, SCOUTFS_BLOCK_SM_SIZE); if (ret != SCOUTFS_BLOCK_SM_SIZE) { @@ -556,9 +515,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent) goto out; } - meta_bdev = - blkdev_get_by_path(sbi->opts.metadev_path, - SCOUTFS_META_BDEV_MODE, sb); + meta_bdev = blkdev_get_by_path(opts.metadev_path, SCOUTFS_META_BDEV_MODE, sb); if (IS_ERR(meta_bdev)) { scoutfs_err(sb, "could not open metadev: error %ld", PTR_ERR(meta_bdev)); @@ -578,8 +535,6 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent) scoutfs_setup_sysfs(sb) ?: scoutfs_setup_counters(sb) ?: scoutfs_options_setup(sb) ?: - scoutfs_sysfs_create_attrs(sb, &sbi->mopts_ssa, - mount_options_attrs, "mount_options") ?: scoutfs_setup_triggers(sb) ?: scoutfs_fence_setup(sb) ?: scoutfs_block_setup(sb) ?: @@ -652,6 +607,7 @@ static void scoutfs_kill_sb(struct super_block *sb) } if (SCOUTFS_HAS_SBI(sb)) { + scoutfs_options_stop(sb); scoutfs_inode_orphan_stop(sb); scoutfs_lock_unmount_begin(sb); } diff --git a/kmod/src/super.h b/kmod/src/super.h index 51c11722..3d3cd3d2 100644 --- a/kmod/src/super.h +++ b/kmod/src/super.h @@ -44,6 +44,7 @@ struct scoutfs_sb_info { spinlock_t next_ino_lock; + struct options_info *options_info; struct data_info *data_info; struct inode_sb_info *inode_sb_info; struct btree_info *btree_info; @@ -74,10 +75,6 @@ struct scoutfs_sb_info { struct scoutfs_counters *counters; struct scoutfs_triggers *triggers; - struct mount_options opts; - struct options_sb_info *options; - struct scoutfs_sysfs_attrs mopts_ssa; - struct dentry *debug_root; bool forced_unmount; diff --git a/kmod/src/trans.c b/kmod/src/trans.c index 14e45c15..bc07071b 100644 --- a/kmod/src/trans.c +++ b/kmod/src/trans.c @@ -640,6 +640,7 @@ void scoutfs_shutdown_trans(struct super_block *sb) tri->write_workq = NULL; } + scoutfs_alloc_prepare_commit(sb, &tri->alloc, &tri->wri); scoutfs_block_writer_forget_all(sb, &tri->wri); kfree(tri); diff --git a/tests/.gitignore b/tests/.gitignore index 7603c209..9da79900 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -3,6 +3,7 @@ src/createmany src/dumb_renameat2 src/dumb_setxattr src/handle_cat +src/handle_fsetxattr src/bulk_create_paths src/find_xattrs src/stage_tmpfile diff --git a/tests/Makefile b/tests/Makefile index ec507401..60eea516 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -6,6 +6,7 @@ BIN := src/createmany \ src/dumb_renameat2 \ src/dumb_setxattr \ src/handle_cat \ + src/handle_fsetxattr \ src/bulk_create_paths \ src/stage_tmpfile \ src/find_xattrs \ diff --git a/tests/funcs/fs.sh b/tests/funcs/fs.sh index b68bcfe7..3345688a 100644 --- a/tests/funcs/fs.sh +++ b/tests/funcs/fs.sh @@ -362,3 +362,49 @@ t_wait_for_leader() { done done } + +t_set_sysfs_mount_option() { + local nr="$1" + local name="$2" + local val="$3" + local opt="$(t_sysfs_path $nr)/mount_options/$name" + + echo "$val" > "$opt" +} + +t_set_all_sysfs_mount_options() { + local name="$1" + local val="$2" + local i + + for i in $(t_fs_nrs); do + 
t_set_sysfs_mount_option $i $name $val + done +} + +declare -A _saved_opts +t_save_all_sysfs_mount_options() { + local name="$1" + local ind + local opt + local i + + for i in $(t_fs_nrs); do + opt="$(t_sysfs_path $i)/mount_options/$name" + ind="${name}_$i" + + _saved_opts[$ind]="$(cat $opt)" + done +} + +t_restore_all_sysfs_mount_options() { + local name="$1" + local ind + local i + + for i in $(t_fs_nrs); do + ind="${name}_$i" + + t_set_sysfs_mount_option $i $name "${_saved_opts[$ind]}" + done +} diff --git a/tests/golden/orphan-inodes b/tests/golden/orphan-inodes index cb79e12d..8f54a624 100644 --- a/tests/golden/orphan-inodes +++ b/tests/golden/orphan-inodes @@ -2,3 +2,4 @@ == unlinked and opened inodes still exist == orphan from failed evict deletion is picked up == orphaned inos in all mounts all deleted +== 30s of racing evict deletion, orphan scanning, and open by handle diff --git a/tests/src/handle_fsetxattr.c b/tests/src/handle_fsetxattr.c new file mode 100644 index 00000000..5d063552 --- /dev/null +++ b/tests/src/handle_fsetxattr.c @@ -0,0 +1,189 @@ +/* + * Copyright (C) 2022 Versity Software, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define FILEID_SCOUTFS 0x81 +#define FILEID_SCOUTFS_WITH_PARENT 0x82 +
+struct our_handle { + struct file_handle handle; + /* + * scoutfs file handle can be ino or ino/parent. The + * handle_type field of struct file_handle denotes which + * version is in use. We only use the ino variant here. + */ + __le64 scoutfs_ino; +}; + +#define DEFAULT_NAME "user.handle_fsetxattr" +#define DEFAULT_VALUE "value" + +static void exit_usage(void) +{ + printf(" -h/-? 
output this usage message and exit\n" + " -e keep trying on enoent, consider success an error\n" + " -i 64bit inode number for handle open, can be multiple\n" + " -m scoutfs mount path string for ioctl fd\n" + " -n optional xattr name string, defaults to \""DEFAULT_NAME"\"\n" + " -s loop for num seconds, defaults to 0 for one iteration\n" + " -v optional xattr value string, defaults to \""DEFAULT_VALUE"\"\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + struct our_handle handle; + struct timespec ts; + bool enoent_success_err = false; + uint64_t seconds = 0; + char *value = NULL; + char *name = NULL; + char *mnt = NULL; + int nr_inos = 0; + uint64_t *inos; + uint64_t i; + int *fds; + int mntfd; + int fd; + int ret; + int c; + int j; + + /* can't have more inos than args */ + inos = calloc(argc, sizeof(inos[0])); + fds = calloc(argc, sizeof(fds[0])); + if (!inos || !fds) { + perror("calloc"); + exit(1); + } + for (i = 0; i < argc; i++) + fds[i] = -1; + + while ((c = getopt(argc, argv, "+ei:m:n:s:v:")) != -1) { + switch (c) { + case 'e': + enoent_success_err = true; + break; + case 'i': + inos[nr_inos] = strtoll(optarg, NULL, 0); + nr_inos++; + break; + case 'm': + mnt = strdup(optarg); + break; + case 'n': + name = strdup(optarg); + break; + case 's': + seconds = strtoll(optarg, NULL, 0); + break; + case 'v': + value = strdup(optarg); + break; + case '?': + printf("unknown argument: %c\n", optopt); + case 'h': + exit_usage(); + } + } + + if (nr_inos == 0) { + printf("specify non-zero inode number with -i\n"); + exit(1); + } + + if (!mnt) { + printf("specify scoutfs mount path for ioctl with -m\n"); + exit(1); + } + + if (name == NULL) + name = DEFAULT_NAME; + if (value == NULL) + value = DEFAULT_VALUE; + + mntfd = open(mnt, O_RDONLY); + if (mntfd == -1) { + perror("opening mountpoint"); + return 1; + } + + clock_gettime(CLOCK_REALTIME, &ts); + seconds += ts.tv_sec; + + for (i = 0; ; i++) { + for (j = 0; j < nr_inos; j++) { + fd = fds[j]; + + if (fd < 0) { + handle.handle.handle_bytes = sizeof(struct our_handle); + handle.handle.handle_type = FILEID_SCOUTFS; + handle.scoutfs_ino = htole64(inos[j]); + + fd = open_by_handle_at(mntfd, &handle.handle, O_RDWR); + if (fd == -1) { + if (!enoent_success_err || errno != ENOENT) { + perror("open_by_handle_at"); + return 1; + } + continue; + } + fds[j] = fd; + } + + ret = fsetxattr(fd, name, value, strlen(value), 0); + if (ret < 0) { + perror("fsetxattr"); + return 1; + } + } + + if ((i % 10) == 0) { + clock_gettime(CLOCK_REALTIME, &ts); + if (ts.tv_sec >= seconds) + break; + } + } + + if (enoent_success_err) { + bool able = false; + for (i = 0; i < nr_inos; i++) { + if (fds[i] >= 0) { + printf("was able to open ino %"PRIu64"\n", inos[i]); + able = true; + } + } + if (able) + exit(1); + } + + /* not bothering to close or free */ + return 0; +} diff --git a/tests/tests/orphan-inodes.sh b/tests/tests/orphan-inodes.sh index ffd9f498..9f2b6e3f 100644 --- a/tests/tests/orphan-inodes.sh +++ b/tests/tests/orphan-inodes.sh @@ -30,6 +30,13 @@ inode_exists() test "$?" == 0 -a "$(head -1 $T_TMP.inos.log)" == "$ino" } +t_save_all_sysfs_mount_options orphan_scan_delay_ms +restore_delays() +{ + t_restore_all_sysfs_mount_options orphan_scan_delay_ms +} +trap restore_delays EXIT + echo "== test our inode existance function" path="$T_D0/file" touch "$path" @@ -38,6 +45,7 @@ inode_exists $ino || echo "$ino didn't exist" echo "== unlinked and opened inodes still exist" sleep 1000000 < "$path" & +sleep .1 # wait for background sleep to run and open stdin pid="$!"
rm -f "$path" inode_exists $ino || echo "$ino didn't exist" @@ -45,7 +53,8 @@ inode_exists $ino || echo "$ino didn't exist" echo "== orphan from failed evict deletion is picked up" # pending kill signal stops evict from getting locks and deleting silent_kill $pid -sleep 55 +t_set_sysfs_mount_option 0 orphan_scan_delay_ms 1000 +sleep 5 inode_exists $ino && echo "$ino still exists" echo "== orphaned inos in all mounts all deleted" @@ -56,6 +65,7 @@ for nr in $(t_fs_nrs); do touch "$path" inos="$inos $(stat -c %i $path)" sleep 1000000 < "$path" & + sleep .1 # wait for background sleep to run and open stdin pids="$pids $!" rm -f "$path" done @@ -70,9 +80,63 @@ while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do sleep .5 done # wait for orphan scans to run -sleep 55 +t_set_all_sysfs_mount_options orphan_scan_delay_ms 1000 +# also have to wait for delayed log merge work from mount +sleep 15 for ino in $inos; do inode_exists $ino && echo "$ino still exists" done +RUNTIME=30 +echo "== ${RUNTIME}s of racing evict deletion, orphan scanning, and open by handle" + +# exclude last client mount +last="" +for nr in $(t_fs_nrs); do + last=$nr +done + +END=$((SECONDS + RUNTIME)) +while [ $SECONDS -lt $END ]; do + # hold open per-mount unlinked files + pids="" + ino_args="" + for nr in $(t_fs_nrs); do + test $nr == $last && continue + + eval path="\$T_D${nr}/racing-$nr" + touch "$path" + ino_args="$ino_args -i $(stat -c %i $path)" + + sleep 1000000 < "$path" & + sleep .1 # wait for sleep to start and open input :/ + pids="$pids $!" + rm -f "$path" + done + + # remount excluded last client to force log merging and make orphan visible + sync + t_umount $last + t_mount $last + + # get all mounts scanning orphans at high frequency + t_set_all_sysfs_mount_options orphan_scan_delay_ms 100 + + # spin having tasks in each mount trying to open/fsetxattr all inos + for nr in $(t_fs_nrs); do + test $nr == $last && continue + + eval path="\$T_M${nr}" + handle_fsetxattr -e $ino_args -m "$path" -s 2 & + done + + # trigger eviction deletion of each file in each mount + silent_kill $pids + + wait || t_fail "handle_fsetxattr failed" + + # slow down orphan scanning for the next iteration + t_set_all_sysfs_mount_options orphan_scan_delay_ms $(((RUNTIME * 2) * 1000)) +done + t_pass diff --git a/utils/man/scoutfs.5 b/utils/man/scoutfs.5 index a9303c9e..f6cbe193 100644 --- a/utils/man/scoutfs.5 +++ b/utils/man/scoutfs.5 @@ -21,6 +21,21 @@ contains the filesystem's metadata. .sp This option is required. .TP +.B orphan_scan_delay_ms= +This option sets the average expected delay, in milliseconds, between +each mount's scan of the global orphaned inode list. Jitter is added to +avoid contention, so each individual delay between scans is a random +value up to 20% less than or greater than this average expected delay. +.sp +The minimum value for this option is 100ms, which is very short and +only reasonable for testing or experiments. The default is 10000ms (10 +seconds) and the maximum is 60000ms (1 minute). +.sp +This option can be changed in an active mount by writing to its file in +the options directory in the mount's sysfs directory. Writing a new +value will cause the next pending orphan scan to be rescheduled +with the newly written delay time. +.TP .B quorum_slot_nr= The quorum_slot_nr option assigns a quorum member slot to the mount. The mount will use the slot assignment to claim exclusive ownership of